def _run_task(self):
    self._init_mapping()
    self._init_db()
    self._expand_mapping()

    start_step = self.options.get("start_step")
    started = False
    for name, mapping in self.mapping.items():
        # Skip steps until start_step
        if not started and start_step and name != start_step:
            self.logger.info(f"Skipping step: {name}")
            continue

        started = True

        self.logger.info(f"Running Job: {name}")
        result = self._load_mapping(mapping)
        if result not in ("Completed", "CompletedWithFailures"):
            raise BulkDataException(f"Job {name} did not complete successfully")

        if name in self.after_steps:
            for after_name, after_step in self.after_steps[name].items():
                self.logger.info(f"Running post-load step: {after_name}")
                result = self._load_mapping(after_step)
                if result not in ("Completed", "CompletedWithFailures"):
                    raise BulkDataException(
                        f"Job {after_name} did not complete successfully"
                    )
def _run_task(self):
    self._init_mapping()
    self._init_db()
    self._expand_mapping()

    start_step = self.options.get("start_step")
    started = False
    for name, mapping in self.mapping.items():
        # Skip steps until start_step
        if not started and start_step and name != start_step:
            self.logger.info(f"Skipping step: {name}")
            continue

        started = True

        self.logger.info(f"Running step: {name}")
        result = self._execute_step(mapping)
        if result.status is DataOperationStatus.JOB_FAILURE:
            raise BulkDataException(
                f"Step {name} did not complete successfully: {','.join(result.job_errors)}"
            )

        if name in self.after_steps:
            for after_name, after_step in self.after_steps[name].items():
                self.logger.info(f"Running post-load step: {after_name}")
                result = self._execute_step(after_step)
                if result.status is DataOperationStatus.JOB_FAILURE:
                    raise BulkDataException(
                        f"Step {after_name} did not complete successfully: {','.join(result.job_errors)}"
                    )
def _run_task(self):
    self._validate_and_inject_namespace()

    for obj in self.sobjects:
        query = f"SELECT Id FROM {obj}"
        if self.options["where"]:
            query += f" WHERE {self.options['where']}"

        qs = get_query_operation(
            sobject=obj,
            fields=["Id"],
            api_options={},
            context=self,
            query=query,
            api=self.options["api"],
        )

        self.logger.info(f"Querying for {obj} objects")
        qs.query()
        if qs.job_result.status is not DataOperationStatus.SUCCESS:
            raise BulkDataException(
                f"Unable to query records for {obj}: {','.join(qs.job_result.job_errors)}"
            )
        if not qs.job_result.records_processed:
            self.logger.info(f"No records found, skipping delete operation for {obj}")
            continue

        self.logger.info(f"Deleting {self._object_description(obj)}")
        ds = get_dml_operation(
            sobject=obj,
            operation=(
                DataOperationType.HARD_DELETE
                if self.options["hardDelete"]
                else DataOperationType.DELETE
            ),
            fields=["Id"],
            api_options={},
            context=self,
            api=self.options["api"],
            volume=qs.job_result.records_processed,
        )
        ds.start()
        ds.load_records(qs.get_results())
        ds.end()

        if ds.job_result.status not in [
            DataOperationStatus.SUCCESS,
            DataOperationStatus.ROW_FAILURE,
        ]:
            raise BulkDataException(
                f"Unable to delete records for {obj}: {','.join(ds.job_result.job_errors)}"
            )

        error_checker = RowErrorChecker(
            self.logger, self.options["ignore_row_errors"], self.row_warning_limit
        )
        for result in ds.get_results():
            error_checker.check_for_row_error(result, result.id)
def validate_and_inject_mapping(
    *,
    mapping: Dict,
    org_config: OrgConfig,
    namespace: str,
    data_operation: DataOperationType,
    inject_namespaces: bool,
    drop_missing: bool,
    org_has_person_accounts_enabled: bool = False,
):
    should_continue = [
        m.validate_and_inject_namespace(
            org_config, namespace, data_operation, inject_namespaces, drop_missing
        )
        for m in mapping.values()
    ]

    if not drop_missing and not all(should_continue):
        raise BulkDataException(
            "One or more schema or permissions errors blocked the operation.\n"
            "If you would like to attempt the load regardless, you can specify "
            "'-o drop_missing_schema True' on the command."
        )

    if drop_missing:
        # Drop any steps with sObjects that are not present.
        for (include, step_name) in zip(should_continue, list(mapping.keys())):
            if not include:
                del mapping[step_name]

        # Remove any remaining lookups to dropped objects.
        for m in mapping.values():
            describe = getattr(org_config.salesforce_client, m.sf_object).describe()
            describe = {entry["name"]: entry for entry in describe["fields"]}

            for field in list(m.lookups.keys()):
                lookup = m.lookups[field]
                if lookup.table not in [step.table for step in mapping.values()]:
                    del m.lookups[field]

                    # Make sure this didn't cause the operation to be invalid
                    # by dropping a required field.
                    if not describe[field]["nillable"]:
                        raise BulkDataException(
                            f"{m.sf_object}.{field} is a required field, but the target object "
                            f"{describe[field]['referenceTo']} was removed from the operation "
                            "due to missing permissions."
                        )

    # If the org has person accounts enabled, add a field mapping to track "IsPersonAccount".
    # IsPersonAccount field values are used to properly load person account records.
    if org_has_person_accounts_enabled and data_operation == DataOperationType.QUERY:
        for step in mapping.values():
            if step["sf_object"] in ("Account", "Contact"):
                step["fields"]["IsPersonAccount"] = "IsPersonAccount"
def _wait_for_job(self, job_id, error_behaviour: str = "raise") -> str:
    """Wait for job_id to finish.

    If there are any row errors, error_behaviour says what happens.
    If it's "raise" (the default), then throw an exception.
    If it's "return", then return the failure result and set an instance
    variable: self.error_messages
    """
    assert error_behaviour in ("raise", "return")
    while True:
        job_status = self.bulk.job_status(job_id)
        self.logger.info(
            f" Waiting for job {job_id} ({job_status['numberBatchesCompleted']}/{job_status['numberBatchesTotal']})"
        )
        result, messages = self._job_state_from_batches(job_id)
        if result != "InProgress":
            break
        time.sleep(10)

    self.logger.info(f"Job {job_id} finished with result: {result}")
    if "Fail" in result:
        for state_message in messages:
            self.logger.error(f"Batch failure message: {state_message}")
        if error_behaviour == "raise":
            raise BulkDataException("Job Error", messages)
        else:
            self.error_messages = messages
    return result
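# A minimal, self-contained sketch of the poll-until-terminal-state pattern used by
# _wait_for_job above. The `fetch_state` callable stands in for the Bulk API status
# calls (self.bulk.job_status / self._job_state_from_batches); the names and the
# canned state sequence below are illustrative, not part of the real task.
import time


def wait_for_state(fetch_state, poll_seconds=10, in_progress="InProgress"):
    """Poll fetch_state() until it reports something other than in_progress."""
    while True:
        state, messages = fetch_state()
        if state != in_progress:
            return state, messages
        time.sleep(poll_seconds)


# Example: a canned sequence of states, as a real job might report them.
_states = iter([("InProgress", []), ("InProgress", []), ("Completed", [])])
print(wait_for_state(lambda: next(_states), poll_seconds=0))  # ('Completed', [])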
def _validate_org_has_person_accounts_enabled_if_person_account_data_exists(self):
    """
    To ensure data is loaded from the dataset as expected and to avoid partial
    failures, raise a BulkDataException if Account or Contact records exist with
    IsPersonAccount set to 'true' but the org does not have person accounts enabled.
    """
    for mapping in self.mapping.values():
        if mapping.sf_object in [
            "Account",
            "Contact",
        ] and self._db_has_person_accounts_column(mapping):
            table = self.models[mapping.table].__table__
            if (
                self.session.query(table)
                .filter(table.columns.get("IsPersonAccount") == "true")
                .first()
                and not self.org_config.is_person_accounts_enabled
            ):
                raise BulkDataException(
                    "Your dataset contains Person Account data but Person Accounts "
                    "is not enabled for your org."
                )
def get_results(self):
    for batch_id in self.batch_ids:
        try:
            results_url = (
                f"{self.bulk.endpoint}/job/{self.job_id}/batch/{batch_id}/result"
            )
            # Download entire result file to a temporary file first
            # to avoid the server dropping connections
            with download_file(results_url, self.bulk) as f:
                self.logger.info(f"Downloaded results for batch {batch_id}")

                reader = csv.reader(f)
                next(reader)  # skip header
                for row in reader:
                    success = process_bool_arg(row[1])
                    yield DataOperationResult(
                        row[0] if success else None,
                        success,
                        row[3] if not success else None,
                    )
        except Exception as e:
            raise BulkDataException(
                f"Failed to download results for batch {batch_id} ({str(e)})"
            )
def _run_query(self, soql, mapping):
    """Execute a Bulk or REST API query job and store the results."""
    step = get_query_operation(
        sobject=mapping.sf_object,
        api=mapping.api,
        fields=list(mapping.get_complete_field_map(include_id=True).keys()),
        api_options={},
        context=self,
        query=soql,
    )

    self.logger.info(f"Extracting data for sObject {mapping['sf_object']}")
    step.query()

    if step.job_result.status is DataOperationStatus.SUCCESS:
        if step.job_result.records_processed:
            self.logger.info("Downloading and importing records")
            self._import_results(mapping, step)
        else:
            self.logger.info(f"No records found for sObject {mapping['sf_object']}")
    else:
        raise BulkDataException(
            f"Unable to execute query: {','.join(step.job_result.job_errors)}"
        )
def _store_inserted_ids(self, mapping, job_id, local_ids_for_batch):
    """Get the job results and store inserted SF Ids in a new table"""
    id_table_name = self._reset_id_table(mapping)
    conn = self.session.connection()
    for batch_id, local_ids in local_ids_for_batch.items():
        try:
            results_url = "{}/job/{}/batch/{}/result".format(
                self.bulk.endpoint, job_id, batch_id
            )
            # Download entire result file to a temporary file first
            # to avoid the server dropping connections
            with download_file(results_url, self.bulk) as f:
                self.logger.info(
                    "  Downloaded results for batch {}".format(batch_id)
                )
                self._store_inserted_ids_for_batch(
                    f, local_ids, id_table_name, conn
                )
            self.logger.info(
                "  Updated {} for batch {}".format(id_table_name, batch_id)
            )
        except BulkDataException:
            raise
        except Exception as e:
            raise BulkDataException(
                "Failed to download results for batch {} ({})".format(
                    batch_id, str(e)
                )
            )
    self.session.commit()
def check_for_row_error(self, result, row_id):
    if not result.success:
        msg = f"Error on record with id {row_id}: {result.error}"
        if self.ignore_row_errors:
            if self.row_error_count < self.row_warning_limit:
                self.logger.warning(msg)
            elif self.row_error_count == self.row_warning_limit:
                self.logger.warning("Further warnings suppressed")

            self.row_error_count += 1
            return self.row_error_count
        else:
            raise BulkDataException(msg)
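# A self-contained sketch of the warning-limit behavior above, written as a plain
# function so it runs without the task machinery. `Result` is a hypothetical
# stand-in for the real result objects, which expose at least `success` and
# `error`; the counter/limit arguments mirror the checker's state.
import logging
from collections import namedtuple

Result = namedtuple("Result", ["id", "success", "error"])
log = logging.getLogger("row_errors")
logging.basicConfig(level=logging.WARNING)


def check_row(result, row_id, error_count, ignore_row_errors=True, warning_limit=2):
    if result.success:
        return error_count
    msg = f"Error on record with id {row_id}: {result.error}"
    if not ignore_row_errors:
        raise RuntimeError(msg)  # the real checker raises BulkDataException
    if error_count < warning_limit:
        log.warning(msg)
    elif error_count == warning_limit:
        log.warning("Further warnings suppressed")
    return error_count + 1


errors = 0
for r in [Result(None, False, "DUPLICATE_VALUE")] * 4:
    errors = check_row(r, r.id, errors)  # warns twice, then suppresses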
def _generate_results_id_map(self, result_file, local_ids):
    """Iterate over job results and prepare rows for id table"""
    reader = unicodecsv.reader(result_file)
    next(reader)  # skip header
    i = 0
    for row, local_id in zip(reader, local_ids):
        if row[1] == "true":  # Success
            sf_id = row[0]
            yield f"{local_id},{sf_id}\n".encode("utf-8")
        else:
            if self.options["ignore_row_errors"]:
                self.logger.warning(f"  Error on row {i}: {row[3]}")
            else:
                raise BulkDataException(f"Error on row {i}: {row[3]}")
        i += 1
def produce_csv():
    """Iterate over job results and prepare rows for id table"""
    reader = unicodecsv.reader(result_file)
    next(reader)  # skip header
    i = 0
    for row, local_id in zip(reader, local_ids):
        if row[1] == "true":  # Success
            sf_id = row[0]
            yield "{},{}\n".format(local_id, sf_id).encode("utf-8")
        else:
            if self.options["ignore_row_errors"]:
                self.logger.warning("  Error on row {}: {}".format(i, row[3]))
            else:
                raise BulkDataException("Error on row {}: {}".format(i, row[3]))
        i += 1
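# Stand-alone sketch of the id-map generation above: read a Bulk API batch result
# CSV (Id, Success, Created, Error) and pair each row with its local database id.
# The sample rows are invented; the real code streams these pairs into the
# sf_id table rather than printing them.
import csv
import io

result_file = io.StringIO(
    "Id,Success,Created,Error\n"
    "001000000000001AAA,true,true,\n"
    ",false,false,REQUIRED_FIELD_MISSING\n"
)
local_ids = ["1", "2"]

reader = csv.reader(result_file)
next(reader)  # skip header
for i, (row, local_id) in enumerate(zip(reader, local_ids)):
    if row[1] == "true":
        print(f"{local_id},{row[0]}")         # row destined for the id table
    else:
        print(f"Error on row {i}: {row[3]}")  # or raise, if not ignoring errors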
def _get_statics(self, mapping):
    """Return the static values (not column names) to be appended to
    records for this mapping."""
    statics = list(mapping.get("static", {}).values())
    if mapping.get("record_type"):
        query = (
            f"SELECT Id FROM RecordType WHERE SObjectType='{mapping.get('sf_object')}' "
            f"AND DeveloperName = '{mapping['record_type']}' LIMIT 1"
        )
        records = self.sf.query(query)["records"]
        if records:
            record_type_id = records[0]["Id"]
        else:
            raise BulkDataException(f"Cannot find RecordType with query `{query}`")
        statics.append(record_type_id)

    return statics
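# Sketch of the record-type lookup above run against a canned query response, so
# the "no match" branch is visible without an org. The response shape matches a
# SOQL query result dict; the object, developer name, and Id are illustrative.
sf_object, developer_name = "Account", "PersonAccount"
query = (
    f"SELECT Id FROM RecordType WHERE SObjectType='{sf_object}' "
    f"AND DeveloperName = '{developer_name}' LIMIT 1"
)
response = {"totalSize": 1, "records": [{"Id": "012000000000001AAA"}]}
records = response["records"]
if records:
    record_type_id = records[0]["Id"]   # appended to the statics list
else:
    raise RuntimeError(f"Cannot find RecordType with query `{query}`")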
def _create_table(self, mapping):
    model_name = "{}Model".format(mapping["table"])
    mapper_kwargs = {}
    table_kwargs = {}
    self.models[mapping["table"]] = type(model_name, (object,), {})

    # Provide support for legacy mappings which used the OID as the pk but
    # default to using an autoincrementing int pk and a separate sf_id column
    fields = []
    mapping["oid_as_pk"] = bool(mapping.get("fields", {}).get("Id"))
    if mapping["oid_as_pk"]:
        id_column = mapping["fields"]["Id"]
        fields.append(Column(id_column, Unicode(255), primary_key=True))
    else:
        fields.append(Column("id", Integer(), primary_key=True, autoincrement=True))
    for field in self._fields_for_mapping(mapping):
        if mapping["oid_as_pk"] and field["sf"] == "Id":
            continue
        fields.append(Column(field["db"], Unicode(255)))
    if "record_type" in mapping:
        fields.append(Column("record_type", Unicode(255)))
    t = Table(mapping["table"], self.metadata, *fields, **table_kwargs)
    if t.exists():
        raise BulkDataException("Table already exists: {}".format(mapping["table"]))

    if not mapping["oid_as_pk"]:
        mapping["sf_id_table"] = mapping["table"] + "_sf_id"
        # If multiple mappings point to the same table, don't recreate the table
        if mapping["sf_id_table"] not in self.models:
            sf_id_model_name = "{}Model".format(mapping["sf_id_table"])
            self.models[mapping["sf_id_table"]] = type(sf_id_model_name, (object,), {})
            sf_id_fields = [
                Column("id", Integer(), primary_key=True, autoincrement=True),
                Column("sf_id", Unicode(24)),
            ]
            id_t = Table(mapping["sf_id_table"], self.metadata, *sf_id_fields)
            mapper(self.models[mapping["sf_id_table"]], id_t)

    mapper(self.models[mapping["table"]], t, **mapper_kwargs)
def _validate_and_inject_namespace(self):
    """Perform namespace injection and ensure that we can successfully
    delete all of the selected objects."""
    global_describe = {
        entry["name"]: entry
        for entry in self.org_config.salesforce_client.describe()["sobjects"]
    }

    # Namespace injection
    if (
        self.options["inject_namespaces"]
        and self.project_config.project__package__namespace
    ):

        def inject(element: str):
            return f"{self.project_config.project__package__namespace}__{element}"

        self.sobjects = []
        for sobject in self.options["objects"]:
            if self._is_injectable(sobject):
                injected = inject(sobject)
                if sobject in global_describe and injected in global_describe:
                    self.logger.warning(
                        f"Both {sobject} and {injected} are present in the target org. Using {sobject}."
                    )

                if sobject not in global_describe and injected in global_describe:
                    self.sobjects.append(injected)
                else:
                    self.sobjects.append(sobject)
            else:
                self.sobjects.append(sobject)
    else:
        self.sobjects = self.options["objects"]

    # Validate CRUD
    non_deletable_objects = [
        s
        for s in self.sobjects
        if not (s in global_describe and global_describe[s]["deletable"])
    ]
    if non_deletable_objects:
        raise BulkDataException(
            f"The objects {', '.join(non_deletable_objects)} are not present or cannot be deleted."
        )
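# Self-contained illustration of the namespace-injection choice above: prefer the
# namespaced name only when the bare name is absent and the namespaced one exists.
# The describe map, namespace, and object names are invented for the demo.
namespace = "npsp"
global_describe = {"npsp__Affiliation__c": {}, "Contact": {}}


def inject(name):
    return f"{namespace}__{name}"


selected = []
for sobject in ["Affiliation__c", "Contact"]:
    injected = inject(sobject)
    if sobject in global_describe and injected in global_describe:
        print(f"Both {sobject} and {injected} present; using {sobject}")
    if sobject not in global_describe and injected in global_describe:
        selected.append(injected)
    else:
        selected.append(sobject)

print(selected)  # ['npsp__Affiliation__c', 'Contact']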
def _split_dependencies(self, objs, dependencies):
    """Attempt to flatten the object network into a sequence of load operations.
    May throw BulkDataException if reference cycles exist in the network"""
    stack = []
    objs_remaining = objs.copy()

    # The structure of `dependencies` is:
    # key = object, value = set of objects it references.

    # Iterate through our list of objects.
    # For each object, if it is not dependent on any other objects, place it at the end of the stack.
    # Once an object is placed in the stack, remove dependencies to it (they're satisfied).
    while objs_remaining:
        objs_without_deps = [
            obj
            for obj in objs_remaining
            if obj not in dependencies or not dependencies[obj]
        ]

        if not objs_without_deps:
            self.logger.error(
                "Unable to complete mapping; the schema contains reference cycles or unresolved dependencies."
            )
            self.logger.info("Mapped objects: {}".format(", ".join(stack)))
            self.logger.info("Remaining objects:")
            for obj in objs_remaining:
                self.logger.info(obj)
                for other_obj in dependencies[obj]:
                    self.logger.info(
                        "    references {} via: {}".format(
                            other_obj, ", ".join(dependencies[obj][other_obj])
                        )
                    )
            raise BulkDataException("Cannot complete mapping")

        for obj in objs_without_deps:
            stack.append(obj)

            # Remove all dependencies on this object (they're satisfied)
            for other_obj in dependencies:
                if obj in dependencies.get(other_obj):
                    del dependencies[other_obj][obj]

            # Remove this object from our remaining set.
            objs_remaining.remove(obj)

    return stack
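# Standalone illustration of the dependency-flattening loop above, using plain
# dicts of sets instead of the mapping machinery. Object and field names are
# invented; the point is the "emit objects with no unmet references, then drop
# the satisfied references" cycle.
objs = {"Account", "Contact", "Opportunity"}
dependencies = {
    "Contact": {"Account": {"AccountId"}},
    "Opportunity": {"Account": {"AccountId"}, "Contact": {"ContactId"}},
}

stack, remaining = [], set(objs)
while remaining:
    ready = [o for o in remaining if not dependencies.get(o)]
    if not ready:
        raise RuntimeError("Cannot complete mapping")  # reference cycle detected
    for obj in ready:
        stack.append(obj)
        for deps in dependencies.values():
            deps.pop(obj, None)  # this dependency is now satisfied
        remaining.remove(obj)

print(stack)  # ['Account', 'Contact', 'Opportunity'] -- a valid load order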
def _run_task(self):
    self._init_mapping()
    self._init_db()

    start_step = self.options.get("start_step")
    started = False
    for name, mapping in self.mapping.items():
        # Skip steps until start_step
        if not started and start_step and name != start_step:
            self.logger.info("Skipping step: {}".format(name))
            continue

        started = True

        self.logger.info("Running Job: {}".format(name))
        result = self._load_mapping(mapping)
        if result != "Completed":
            raise BulkDataException(
                "Job {} did not complete successfully".format(name)
            )
def _run_query(self, soql, mapping):
    """Execute a Bulk API query job and store the results."""
    step = BulkApiQueryOperation(
        sobject=mapping["sf_object"], api_options={}, context=self, query=soql
    )
    self.logger.info(f"Extracting data for sObject {mapping['sf_object']}")
    step.query()

    if step.job_result.status is DataOperationStatus.SUCCESS:
        if step.job_result.records_processed:
            self._import_results(mapping, step)
        else:
            self.logger.info(f"No records found for sObject {mapping['sf_object']}")
    else:
        raise BulkDataException(
            f"Unable to execute query: {','.join(step.job_result.job_errors)}"
        )
def _process_job_results(self, mapping, job_id, local_ids_for_batch):
    """Get the job results and process the results. If we're raising for
    row-level errors, do so; if we're inserting, store the new Ids."""
    if mapping["action"] == "insert":
        id_table_name = self._reset_id_table(mapping)
        conn = self.session.connection()

    for batch_id, local_ids in local_ids_for_batch.items():
        try:
            results_url = f"{self.bulk.endpoint}/job/{job_id}/batch/{batch_id}/result"
            # Download entire result file to a temporary file first
            # to avoid the server dropping connections
            with download_file(results_url, self.bulk) as f:
                self.logger.info(f"  Downloaded results for batch {batch_id}")

                results_generator = self._generate_results_id_map(f, local_ids)
                if mapping["action"] == "insert":
                    self._sql_bulk_insert_from_csv(
                        conn,
                        id_table_name,
                        ("id", "sf_id"),
                        IteratorBytesIO(results_generator),
                    )
                    self.logger.info(f"  Updated {id_table_name} for batch {batch_id}")
                else:
                    for r in results_generator:
                        pass  # Drain generator to validate results
        except BulkDataException:
            raise
        except Exception as e:
            raise BulkDataException(
                f"Failed to download results for batch {batch_id} ({str(e)})"
            )

    if mapping["action"] == "insert":
        self.session.commit()
def _create_table(self, mapping):
    model_name = "{}Model".format(mapping["table"])
    mapper_kwargs = {}
    table_kwargs = {}

    if mapping["table"] in self.models:
        raise BulkDataException("Table already exists: {}".format(mapping["table"]))

    self.models[mapping["table"]] = type(model_name, (object,), {})

    id_column = mapping["fields"].get("Id") or "id"
    fields = []
    fields.append(Column(id_column, Unicode(255), primary_key=True))
    for field in self._fields_for_mapping(mapping):
        if field["sf"] == "Id":
            continue
        fields.append(Column(field["db"], Unicode(255)))
    if "record_type" in mapping:
        fields.append(Column("record_type", Unicode(255)))
    t = Table(mapping["table"], self.metadata, *fields, **table_kwargs)
    mapper(self.models[mapping["table"]], t, **mapper_kwargs)
def _convert(rec):
    result = dict(zip(self.fields, rec))
    for boolean_field in self.boolean_fields:
        try:
            result[boolean_field] = process_bool_arg(result[boolean_field] or False)
        except TypeError as e:
            raise BulkDataException(e)

    # Remove empty fields (different semantics in REST API)
    # We do this for insert only - on update, any fields set to `null`
    # are meant to be blanked out.
    if self.operation is DataOperationType.INSERT:
        result = {
            k: result[k]
            for k in result
            if result[k] is not None and result[k] != ""
        }
    result["attributes"] = {"type": self.sobject}

    return result
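# Self-contained walk-through of the conversion above: booleans coerced, empty
# values dropped for inserts, and the REST "attributes" key added. Field names,
# the record values, and the sObject are illustrative; the inline boolean
# coercion stands in for process_bool_arg.
fields = ["Name", "IsActive", "Description"]
boolean_fields = ["IsActive"]
rec = ["Acme", "true", ""]

result = dict(zip(fields, rec))
for f in boolean_fields:
    result[f] = str(result[f]).lower() in ("true", "1", "yes")

# Insert semantics: drop empty fields instead of sending explicit nulls.
result = {k: v for k, v in result.items() if v is not None and v != ""}
result["attributes"] = {"type": "Account"}
print(result)  # {'Name': 'Acme', 'IsActive': True, 'attributes': {'type': 'Account'}}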
def create_table(mapping, metadata):
    """Given a mapping data structure (from mapping.yml) and SQLAlchemy
    metadata, create a table matching the mapping.

    Mapping should be a MappingStep instance"""

    fields = []
    _handle_primary_key(mapping, fields)

    # make a field list to create
    for field, db in mapping.get_complete_field_map().items():
        if field == "Id":
            continue
        fields.append(Column(db, Unicode(255)))

    if mapping.record_type:
        fields.append(Column("record_type", Unicode(255)))

    t = Table(mapping.table, metadata, *fields)
    if t.exists():
        raise BulkDataException(f"Table already exists: {mapping.table}")
    return t
def create_table(mapping, metadata):
    """Given a mapping data structure (from mapping.yml) and SQLAlchemy
    metadata, create a table matching the mapping.

    Mapping should be a dict-like with keys "fields", "table" and
    optionally "oid_as_pk" and "record_type" """

    fields = []
    _handle_primary_key(mapping, fields)

    # make a field list to create
    for field in fields_for_mapping(mapping):
        if mapping["oid_as_pk"] and field["sf"] == "Id":
            continue
        fields.append(Column(field["db"], Unicode(255)))

    if "record_type" in mapping:
        fields.append(Column("record_type", Unicode(255)))

    t = Table(mapping["table"], metadata, *fields)
    if t.exists():
        raise BulkDataException(f"Table already exists: {mapping['table']}")
    return t
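# A minimal, runnable sketch of the table-building pattern above, using a plain
# dict in place of a mapping step and an in-memory SQLite database. Field and
# table names are illustrative only.
from sqlalchemy import Column, MetaData, Table, Unicode, create_engine

field_map = {"Id": "sf_id", "Name": "name", "AccountNumber": "account_number"}

metadata = MetaData()
columns = [Column("id", Unicode(255), primary_key=True)]
columns += [Column(db, Unicode(255)) for sf, db in field_map.items() if sf != "Id"]
table = Table("accounts", metadata, *columns)

engine = create_engine("sqlite://")  # in-memory database for the demo
metadata.create_all(engine)          # create_all skips tables that already exist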
def _run_task(self):
    # This demonstration supports only one object at a time, but accepts lists.
    # Gather permutable fields for the object:
    # picklists, checkboxes, and Record Type (if present).
    object_name = self.options["objects"][0]
    object_details = getattr(self.sf, object_name).describe()
    field_list = {field["name"]: field for field in object_details["fields"]}

    permutable_values = {}
    for name, f in field_list.items():
        if name == "RecordTypeId":
            # Query Record Types and add their Ids as permutable values
            rt_ids = {
                rt["Id"]
                for rt in self.sf.query(
                    "SELECT Id FROM RecordType WHERE SobjectType = '{}'".format(
                        object_name
                    )
                )["records"]
            }
            permutable_values["RecordTypeId"] = rt_ids
        elif f["type"] == "picklist" and f["custom"]:
            permutable_values[name] = {
                pl["value"] for pl in f["picklistValues"] if pl["active"]
            }
        elif f["type"] == "boolean" and f["custom"]:
            permutable_values[name] = {"True", "False"}

    populate_name = field_list["Name"]["updateable"]

    def generate_random_name(object_name):
        i = 0
        while True:
            i = i + 1
            yield f"{object_name} {i}"

    def generate_permutations(
        perms,
        template=None,
        populate_name=False,
        name_generator=generate_random_name(object_name),
    ):
        if template is None:
            template = {}

        f = list(perms.keys())[0]
        for v in perms[f]:
            template[f] = v
            next_perms = perms.copy()
            del next_perms[f]
            if next_perms:
                yield from generate_permutations(
                    next_perms,
                    template,
                    populate_name,
                    name_generator=name_generator,
                )
            else:
                if populate_name:
                    template["Name"] = next(name_generator)
                yield template

    file_name = object_details["labelPlural"]
    with open(f"{file_name}.csv", mode="w") as output_file:
        field_names = list(permutable_values.keys())
        field_names.append("Name")
        writer = csv.DictWriter(output_file, field_names)
        writer.writeheader()
        for row in generate_permutations(
            permutable_values, template=None, populate_name=populate_name
        ):
            writer.writerow(row)

    job_id = self.bulk.create_insert_job(object_name, contentType="CSV")
    with open(f"{file_name}.csv", mode="rb") as input_file:
        batch_id = self.bulk.post_batch(job_id, input_file)
    self.bulk.close_job(job_id)
    result = self._wait_for_job(job_id)
    if result != "Completed":
        raise BulkDataException("Job {} did not complete successfully".format(job_id))
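# The recursive generate_permutations above walks the Cartesian product of the
# permutable value sets; this is a sketch of the same idea with itertools.product.
# The field names and values below are invented for illustration.
import itertools

permutable_values = {
    "Status__c": {"New", "Closed"},
    "Active__c": {"True", "False"},
}
field_names = list(permutable_values.keys())
rows = []
for i, combo in enumerate(itertools.product(*permutable_values.values()), start=1):
    row = dict(zip(field_names, combo))
    row["Name"] = f"Demo Object {i}"
    rows.append(row)
# len(rows) == 4: one row per combination of the picklist and checkbox values.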