def resume_syncing_bulk_query(sf, catalog_entry, job_id, state, counter):
    """Resume consuming results of a previously-started Salesforce Bulk API job.

    Reads the stored job ID's remaining batch IDs from ``state`` and emits a
    Singer RECORD message for every row in each batch, advancing the
    'JobHighestBookmarkSeen' bookmark as qualifying replication-key values are
    seen and writing STATE after each completed batch.

    :param sf: Salesforce client instance (provides get_start_date and is
        passed to Bulk for API access).
    :param catalog_entry: Singer catalog entry dict for the stream being synced.
    :param job_id: ID of the existing Bulk API job to resume.
    :param state: mutable Singer state dict; bookmarks are read from and
        written back to it.
    :param counter: metrics counter; incremented once per emitted record.
    :returns: the same ``counter``, after all remaining batches are processed.
    """
    bulk = Bulk(sf)
    # Bookmark to advance: the highest replication-key value seen so far for
    # this job, falling back to the configured start date.
    current_bookmark = singer.get_bookmark(
        state, catalog_entry['tap_stream_id'], 'JobHighestBookmarkSeen') or sf.get_start_date(state, catalog_entry)
    current_bookmark = singer_utils.strptime_with_tz(current_bookmark)

    # NOTE: this is the live list stored in state — batch_ids.remove() below
    # mutates the state entry in place before write_state is called.
    batch_ids = singer.get_bookmark(state, catalog_entry['tap_stream_id'], 'BatchIDs')

    start_time = singer_utils.now()
    stream = catalog_entry['stream']
    stream_id = catalog_entry['tap_stream_id']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry.get('metadata'))
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    schema = catalog_entry['schema']

    if not bulk.job_exists(job_id):
        # NOTE(review): the message says the bookmark is reset and the JobID
        # removed, but this function only returns — presumably the caller
        # performs the actual reset; confirm against the call site.
        LOGGER.info("Found stored Job ID that no longer exists, resetting bookmark and removing JobID from state.")
        return counter

    # Iterate over the remaining batches, removing them once they are synced.
    # Iterates over a copy (batch_ids[:]) so removal during the loop is safe.
    for batch_id in batch_ids[:]:
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            for rec in bulk.get_batch_results(job_id, batch_id, catalog_entry):
                counter.increment()
                rec = transformer.transform(rec, schema)
                rec = fix_record_anytype(rec, schema)
                # Stream name precedence: tap_stream_id, then alias, then
                # the catalog 'stream' value.
                singer.write_message(
                    singer.RecordMessage(stream=(stream_id or stream_alias or stream),
                                         record=rec,
                                         version=stream_version,
                                         time_extracted=start_time))

                # Update bookmark if necessary: only values no newer than the
                # sync start time and strictly greater than the current
                # bookmark advance it.
                replication_key_value = replication_key and singer_utils.strptime_with_tz(
                    rec[replication_key])

                if replication_key_value and replication_key_value <= start_time and replication_key_value > current_bookmark:
                    current_bookmark = singer_utils.strptime_with_tz(
                        rec[replication_key])
                    state = singer.write_bookmark(state,
                                                  catalog_entry['tap_stream_id'],
                                                  'JobHighestBookmarkSeen',
                                                  singer_utils.strftime(current_bookmark))

        # Batch fully consumed: drop it from the (state-shared) list and
        # persist progress so an interrupted run resumes after this batch.
        batch_ids.remove(batch_id)
        LOGGER.info("Finished syncing batch %s. Removing batch from state.", batch_id)
        LOGGER.info("Batches to go: %d", len(batch_ids))
        singer.write_state(state)

    return counter
def query(self, catalog_entry, state):
    """Execute a query for the given catalog entry, dispatching on api_type.

    Returns the result iterator produced by the Bulk or REST backend.
    Raises TapSalesforceException for an unrecognized api_type.
    """
    if self.api_type == BULK_API_TYPE:
        return Bulk(self).query(catalog_entry, state)
    if self.api_type == REST_API_TYPE:
        return Rest(self).query(catalog_entry, state)
    raise TapSalesforceException(
        "api_type should be REST or BULK was: {}".format(self.api_type))
def do_discover(sf):
    """Describes a Salesforce instance's objects and generates a JSON schema for each field.

    Iterates every sobject returned by the global describe call, builds a
    Singer catalog entry (JSON schema + metadata) per object, and writes the
    resulting {'streams': [...]} catalog to stdout as JSON.

    :param sf: Salesforce client; must provide describe(), api_type,
        get_blacklisted_objects(), get_blacklisted_fields(), and
        select_fields_by_default.
    :returns: None — the catalog is written to sys.stdout.
    """
    global_description = sf.describe()

    objects_to_discover = {o['name'] for o in global_description['sobjects']}
    key_properties = ['Id']

    sf_custom_setting_objects = []
    object_to_tag_references = {}

    # For each SF Object describe it, loop its fields and build a schema
    entries = []

    # Check if the user has BULK API enabled.
    # NOTE(review): compares against the string literal 'BULK' here but uses
    # tap_salesforce.salesforce.BULK_API_TYPE below — presumably the constant
    # equals 'BULK'; confirm and consider unifying.
    if sf.api_type == 'BULK' and not Bulk(sf).has_permissions():
        raise TapSalesforceBulkAPIDisabledException(
            'This client does not have Bulk API permissions, received "API_DISABLED_FOR_ORG" error code'
        )

    for sobject_name in objects_to_discover:

        # Skip blacklisted SF objects depending on the api_type in use
        # ChangeEvent objects are not queryable via Bulk or REST (undocumented)
        if sobject_name in sf.get_blacklisted_objects() \
                or sobject_name.endswith("ChangeEvent"):
            continue

        sobject_description = sf.describe(sobject_name)

        # Cache customSetting and Tag objects to check for blacklisting after
        # all objects have been described
        if sobject_description.get("customSetting"):
            sf_custom_setting_objects.append(sobject_name)
        elif sobject_name.endswith("__Tag"):
            relationship_field = next(
                (f for f in sobject_description["fields"]
                 if f.get("relationshipName") == "Item"), None)
            if relationship_field:
                # Map {"Object":"Object__Tag"}
                object_to_tag_references[relationship_field["referenceTo"][0]] = sobject_name

        fields = sobject_description['fields']
        replication_key = get_replication_key(sobject_name, fields)

        # (field_name, reason) pairs for fields that cannot be synced.
        unsupported_fields = set()
        properties = {}
        mdata = metadata.new()

        found_id_field = False

        # Loop over the object's fields
        for f in fields:
            field_name = f['name']

            if field_name == "Id":
                found_id_field = True

            property_schema, mdata = create_property_schema(f, mdata)

            # Compound Address fields cannot be queried by the Bulk API
            if f['type'] == "address" and sf.api_type == tap_salesforce.salesforce.BULK_API_TYPE:
                unsupported_fields.add(
                    (field_name, 'cannot query compound address fields with bulk API'))

            # we haven't been able to observe any records with a json field, so we
            # are marking it as unavailable until we have an example to work with
            if f['type'] == "json":
                unsupported_fields.add((
                    field_name,
                    'do not currently support json fields - please contact support'
                ))

            # Blacklisted fields are dependent on the api_type being used
            field_pair = (sobject_name, field_name)
            if field_pair in sf.get_blacklisted_fields():
                unsupported_fields.add(
                    (field_name, sf.get_blacklisted_fields()[field_pair]))

            inclusion = metadata.get(mdata, ('properties', field_name), 'inclusion')

            if sf.select_fields_by_default and inclusion != 'unsupported':
                mdata = metadata.write(mdata, ('properties', field_name),
                                       'selected-by-default', True)

            properties[field_name] = property_schema

        if replication_key:
            mdata = metadata.write(mdata, ('properties', replication_key),
                                   'inclusion', 'automatic')

        # There are cases where compound fields are referenced by the associated
        # subfields but are not actually present in the field list
        field_name_set = {f['name'] for f in fields}
        filtered_unsupported_fields = [
            f for f in unsupported_fields if f[0] in field_name_set
        ]
        missing_unsupported_field_names = [
            f[0] for f in unsupported_fields if f[0] not in field_name_set
        ]

        if missing_unsupported_field_names:
            LOGGER.info(
                "Ignoring the following unsupported fields for object %s as they are missing from the field list: %s",
                sobject_name,
                ', '.join(sorted(missing_unsupported_field_names)))

        if filtered_unsupported_fields:
            LOGGER.info(
                "Not syncing the following unsupported fields for object %s: %s",
                sobject_name,
                ', '.join(sorted([k for k, _ in filtered_unsupported_fields])))

        # Salesforce Objects are skipped when they do not have an Id field
        if not found_id_field:
            LOGGER.info("Skipping Salesforce Object %s, as it has no Id field",
                        sobject_name)
            continue

        # Any property added to unsupported_fields has metadata generated and
        # removed
        for prop, description in filtered_unsupported_fields:
            # NOTE(review): metadata.delete's return value is discarded here —
            # presumably it mutates mdata in place; confirm against the
            # singer-python metadata API.
            if metadata.get(mdata, ('properties', prop), 'selected-by-default'):
                metadata.delete(mdata, ('properties', prop), 'selected-by-default')

            mdata = metadata.write(mdata, ('properties', prop),
                                   'unsupported-description', description)
            mdata = metadata.write(mdata, ('properties', prop),
                                   'inclusion', 'unsupported')

        if replication_key:
            mdata = metadata.write(mdata, (), 'valid-replication-keys',
                                   [replication_key])
        else:
            # No replication key: force FULL_TABLE replication for the stream.
            mdata = metadata.write(
                mdata, (), 'forced-replication-method', {
                    'replication-method': 'FULL_TABLE',
                    'reason': 'No replication keys found from the Salesforce API'
                })

        mdata = metadata.write(mdata, (), 'table-key-properties', key_properties)

        schema = {
            'type': 'object',
            'additionalProperties': False,
            'properties': properties
        }

        entry = {
            'stream': sobject_name,
            'tap_stream_id': sobject_name,
            'schema': schema,
            'metadata': metadata.to_list(mdata)
        }

        entries.append(entry)

    # For each custom setting field, remove its associated tag from entries
    # See Blacklisting.md for more information
    unsupported_tag_objects = [
        object_to_tag_references[f] for f in sf_custom_setting_objects
        if f in object_to_tag_references
    ]
    if unsupported_tag_objects:
        LOGGER.info(  #pylint:disable=logging-not-lazy
            "Skipping the following Tag objects, Tags on Custom Settings Salesforce objects " +
            "are not supported by the Bulk API:")
        LOGGER.info(unsupported_tag_objects)
        entries = [
            e for e in entries if e['stream'] not in unsupported_tag_objects
        ]

    result = {'streams': entries}
    json.dump(result, sys.stdout, indent=4)
def do_discover(sf):
    """Describes a Salesforce instance's objects and generates a JSON schema for each field.

    Variant of discovery restricted to a hard-coded whitelist of objects.
    Unlike the stdout-dumping version, this one also writes ``selected: True``
    on every stream's metadata and RETURNS the catalog dict instead of
    printing it.

    NOTE(review): this file appears to define do_discover twice; in Python the
    later definition shadows the earlier one at import time — confirm which
    is intended.

    :param sf: Salesforce client; must provide describe(), api_type,
        get_blacklisted_objects(), get_blacklisted_fields(), and
        select_fields_by_default.
    :returns: dict of the form {"streams": [catalog entries]}.
    """
    global_description = sf.describe()

    # Objects that actually exist in this org (from the global describe).
    objects_set = {o["name"] for o in global_description["sobjects"]}
    # Hard-coded whitelist of objects to discover; anything not in the org
    # (e.g. the custom Invoice__c) is skipped by the objects_set check below.
    objects_to_discover = [
        "Account", "Contact", "Lead", "Opportunity", "Campaign",
        "AccountContactRelation", "AccountContactRole",
        "OpportunityContactRole", "CampaignMember", "Task", "Invoice__c",
        "OpportunityHistory", "AccountHistory", "LeadHistory", "User"
    ]
    key_properties = ["Id"]

    sf_custom_setting_objects = []
    object_to_tag_references = {}

    # For each SF Object describe it, loop its fields and build a schema
    entries = []

    # Check if the user has BULK API enabled.
    # NOTE(review): compares against the string literal "BULK" here but uses
    # tap_salesforce.salesforce.BULK_API_TYPE below — presumably the constant
    # equals "BULK"; confirm and consider unifying.
    if sf.api_type == "BULK" and not Bulk(sf).has_permissions():
        raise TapSalesforceBulkAPIDisabledException(
            'This client does not have Bulk API permissions, received "API_DISABLED_FOR_ORG" error code'
        )

    for sobject_name in objects_to_discover:

        # Skip blacklisted SF objects depending on the api_type in use
        # ChangeEvent objects are not queryable via Bulk or REST (undocumented)
        if sobject_name in sf.get_blacklisted_objects(
        ) or sobject_name.endswith("ChangeEvent"):
            continue

        # Skip whitelisted objects that are absent from this org.
        if sobject_name not in objects_set:
            continue

        sobject_description = sf.describe(sobject_name)

        # Cache customSetting and Tag objects to check for blacklisting after
        # all objects have been described
        if sobject_description.get("customSetting"):
            sf_custom_setting_objects.append(sobject_name)
        elif sobject_name.endswith("__Tag"):
            relationship_field = next(
                (f for f in sobject_description["fields"]
                 if f.get("relationshipName") == "Item"),
                None,
            )
            if relationship_field:
                # Map {"Object":"Object__Tag"}
                object_to_tag_references[relationship_field["referenceTo"][0]] = sobject_name

        fields = sobject_description["fields"]
        replication_key = get_replication_key(sobject_name, fields)

        # (field_name, reason) pairs for fields that cannot be synced.
        unsupported_fields = set()
        properties = {}
        mdata = metadata.new()

        found_id_field = False

        # Loop over the object's fields
        for f in fields:
            field_name = f["name"]

            if field_name == "Id":
                found_id_field = True

            property_schema, mdata = create_property_schema(f, mdata)

            # Compound Address fields cannot be queried by the Bulk API
            if (f["type"] == "address"
                    and sf.api_type == tap_salesforce.salesforce.BULK_API_TYPE):
                unsupported_fields.add(
                    (field_name,
                     "cannot query compound address fields with bulk API"))

            # we haven't been able to observe any records with a json field, so we
            # are marking it as unavailable until we have an example to work with
            if f["type"] == "json":
                unsupported_fields.add((
                    field_name,
                    "do not currently support json fields - please contact support",
                ))

            # Blacklisted fields are dependent on the api_type being used
            field_pair = (sobject_name, field_name)
            if field_pair in sf.get_blacklisted_fields():
                unsupported_fields.add(
                    (field_name, sf.get_blacklisted_fields()[field_pair]))

            inclusion = metadata.get(mdata, ("properties", field_name),
                                     "inclusion")

            if sf.select_fields_by_default and inclusion != "unsupported":
                mdata = metadata.write(mdata, ("properties", field_name),
                                       "selected-by-default", True)

            properties[field_name] = property_schema

        if replication_key:
            mdata = metadata.write(mdata, ("properties", replication_key),
                                   "inclusion", "automatic")

        # There are cases where compound fields are referenced by the associated
        # subfields but are not actually present in the field list
        field_name_set = {f["name"] for f in fields}
        filtered_unsupported_fields = [
            f for f in unsupported_fields if f[0] in field_name_set
        ]
        missing_unsupported_field_names = [
            f[0] for f in unsupported_fields if f[0] not in field_name_set
        ]

        if missing_unsupported_field_names:
            LOGGER.info(
                "Ignoring the following unsupported fields for object %s as they are missing from the field list: %s",
                sobject_name,
                ", ".join(sorted(missing_unsupported_field_names)),
            )

        if filtered_unsupported_fields:
            LOGGER.info(
                "Not syncing the following unsupported fields for object %s: %s",
                sobject_name,
                ", ".join(sorted([k for k, _ in filtered_unsupported_fields])),
            )

        # Salesforce Objects are skipped when they do not have an Id field
        if not found_id_field:
            LOGGER.info("Skipping Salesforce Object %s, as it has no Id field",
                        sobject_name)
            continue

        # Any property added to unsupported_fields has metadata generated and
        # removed
        for prop, description in filtered_unsupported_fields:
            # NOTE(review): metadata.delete's return value is discarded here —
            # presumably it mutates mdata in place; confirm against the
            # singer-python metadata API.
            if metadata.get(mdata, ("properties", prop), "selected-by-default"):
                metadata.delete(mdata, ("properties", prop),
                                "selected-by-default")

            mdata = metadata.write(mdata, ("properties", prop),
                                   "unsupported-description", description)
            mdata = metadata.write(mdata, ("properties", prop), "inclusion",
                                   "unsupported")

        if replication_key:
            mdata = metadata.write(mdata, (), "valid-replication-keys",
                                   [replication_key])
        else:
            # No replication key: force FULL_TABLE replication for the stream.
            mdata = metadata.write(
                mdata, (), "forced-replication-method", {
                    "replication-method": "FULL_TABLE",
                    "reason": "No replication keys found from the Salesforce API",
                },
            )

        mdata = metadata.write(mdata, (), "table-key-properties", key_properties)
        # Every discovered stream is marked selected up front (differs from
        # the stdout-based discovery variant).
        mdata = metadata.write(mdata, (), "selected", True)

        schema = {
            "type": "object",
            "additionalProperties": False,
            "properties": properties,
        }

        entry = {
            "stream": sobject_name,
            "tap_stream_id": sobject_name,
            "schema": schema,
            "metadata": metadata.to_list(mdata),
        }

        entries.append(entry)

    # For each custom setting field, remove its associated tag from entries
    # See Blacklisting.md for more information
    unsupported_tag_objects = [
        object_to_tag_references[f] for f in sf_custom_setting_objects
        if f in object_to_tag_references
    ]
    if unsupported_tag_objects:
        LOGGER.info(  # pylint:disable=logging-not-lazy
            "Skipping the following Tag objects, Tags on Custom Settings Salesforce objects " +
            "are not supported by the Bulk API:")
        LOGGER.info(unsupported_tag_objects)
        entries = [
            e for e in entries if e["stream"] not in unsupported_tag_objects
        ]

    result = {"streams": entries}
    return result