Exemplo n.º 1
0
def resume_syncing_bulk_query(sf, catalog_entry, job_id, state, counter):
    """Resume a stored Bulk API job and emit the records of its remaining batches.

    Reads the ``JobHighestBookmarkSeen`` and ``BatchIDs`` bookmarks of the
    stream from ``state``. For every remaining batch, each result row is
    transformed and written as a RECORD message; once the batch is done the
    highest replication-key value seen so far is written back to
    ``JobHighestBookmarkSeen``, the batch ID is removed from the pending list
    and the state is emitted.

    Args:
        sf: Salesforce client; handed to ``Bulk`` and asked for the start date
            when no ``JobHighestBookmarkSeen`` bookmark is stored.
        catalog_entry: Dict-like catalog entry of the stream being synced.
        job_id: ID of the stored Bulk job to resume.
        state: Singer state; updated and emitted after every batch.
        counter: Record counter, incremented once per emitted record.

    Returns:
        The ``counter`` object that was passed in. It is returned unchanged
        when the job no longer exists.
    """
    bulk = Bulk(sf)
    stream_id = catalog_entry['tap_stream_id']
    current_bookmark = singer.get_bookmark(
        state, stream_id,
        'JobHighestBookmarkSeen') or sf.get_start_date(state, catalog_entry)
    current_bookmark = singer_utils.strptime_with_tz(current_bookmark)
    # A missing BatchIDs bookmark is treated as "no batches left" rather than
    # failing on the slice below.
    batch_ids = singer.get_bookmark(state, stream_id, 'BatchIDs') or []

    start_time = singer_utils.now()
    stream = catalog_entry['stream']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry.get('metadata'))
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    schema = catalog_entry['schema']

    if not bulk.job_exists(job_id):
        # Only logs and returns here; nothing in this function touches the
        # state on this path.
        LOGGER.info(
            "Found stored Job ID that no longer exists, resetting bookmark and removing JobID from state."
        )
        return counter

    # Iterate over the remaining batches, removing them once they are synced.
    # A snapshot copy is iterated because batch_ids is mutated inside the loop.
    for batch_id in batch_ids[:]:
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            for rec in bulk.get_batch_results(job_id, batch_id, catalog_entry):
                counter.increment()
                rec = transformer.transform(rec, schema)
                rec = fix_record_anytype(rec, schema)
                singer.write_message(
                    singer.RecordMessage(stream=(stream_id or stream_alias
                                                 or stream),
                                         record=rec,
                                         version=stream_version,
                                         time_extracted=start_time))

                # Update bookmark if necessary: only advance to values that
                # are newer than the current bookmark but not later than the
                # moment this resume started.
                if replication_key:
                    replication_key_value = singer_utils.strptime_with_tz(
                        rec[replication_key])
                    if current_bookmark < replication_key_value <= start_time:
                        # Reuse the parsed value instead of parsing it again.
                        current_bookmark = replication_key_value

        state = singer.write_bookmark(state, stream_id,
                                      'JobHighestBookmarkSeen',
                                      singer_utils.strftime(current_bookmark))
        batch_ids.remove(batch_id)
        LOGGER.info("Finished syncing batch %s. Removing batch from state.",
                    batch_id)
        LOGGER.info("Batches to go: %d", len(batch_ids))
        singer.write_state(state)

    return counter
Exemplo n.º 2
0
 def query(self, catalog_entry, state):
     """Run the query for ``catalog_entry`` through the configured API client.

     Delegates to ``Bulk(self).query`` or ``Rest(self).query`` depending on
     ``self.api_type`` and returns whatever that call returns.

     Raises:
         TapSalesforceException: if ``self.api_type`` is neither
             ``BULK_API_TYPE`` nor ``REST_API_TYPE``.
     """
     api_type = self.api_type

     if api_type == BULK_API_TYPE:
         return Bulk(self).query(catalog_entry, state)

     if api_type == REST_API_TYPE:
         return Rest(self).query(catalog_entry, state)

     # Anything else is a configuration error.
     message = "api_type should be REST or BULK was: {}".format(api_type)
     raise TapSalesforceException(message)
Exemplo n.º 3
0
def do_discover(sf):
    """Describe a Salesforce instance's objects and write a Singer catalog to stdout.

    Every object named by the global describe call is described individually,
    except blacklisted objects and objects whose name ends in ``ChangeEvent``.
    A JSON schema and Singer metadata are built from each object's fields.
    Objects without an ``Id`` field are skipped, and ``__Tag`` objects that
    reference a custom-setting object are removed from the final list. The
    resulting ``{'streams': [...]}`` document is dumped as JSON to
    ``sys.stdout``.

    Args:
        sf: Salesforce client exposing ``describe``, ``api_type``,
            ``select_fields_by_default``, ``get_blacklisted_objects`` and
            ``get_blacklisted_fields``.

    Raises:
        TapSalesforceBulkAPIDisabledException: if ``sf.api_type`` is ``'BULK'``
            and ``Bulk(sf).has_permissions()`` is falsy.
    """
    global_description = sf.describe()

    # Iterate in sorted order so the describe calls and the emitted catalog
    # are reproducible; a plain set of names has no stable iteration order.
    objects_to_discover = sorted(
        {o['name'] for o in global_description['sobjects']})
    key_properties = ['Id']

    sf_custom_setting_objects = []
    object_to_tag_references = {}

    # For each SF Object describe it, loop its fields and build a schema
    entries = []

    # Check if the user has BULK API enabled
    if sf.api_type == 'BULK' and not Bulk(sf).has_permissions():
        raise TapSalesforceBulkAPIDisabledException(
            'This client does not have Bulk API permissions, received "API_DISABLED_FOR_ORG" error code'
        )

    # Blacklists depend on the api_type in use, which does not change during
    # discovery, so fetch them once instead of once per object / per field.
    # NOTE(review): assumes both getters are side-effect free - confirm.
    blacklisted_objects = sf.get_blacklisted_objects()
    blacklisted_fields = sf.get_blacklisted_fields()

    for sobject_name in objects_to_discover:

        # Skip blacklisted SF objects depending on the api_type in use
        # ChangeEvent objects are not queryable via Bulk or REST (undocumented)
        if sobject_name in blacklisted_objects \
           or sobject_name.endswith("ChangeEvent"):
            continue

        sobject_description = sf.describe(sobject_name)

        # Cache customSetting and Tag objects to check for blacklisting after
        # all objects have been described
        if sobject_description.get("customSetting"):
            sf_custom_setting_objects.append(sobject_name)
        elif sobject_name.endswith("__Tag"):
            relationship_field = next((f for f in sobject_description["fields"]
                                       if f.get("relationshipName") == "Item"),
                                      None)
            if relationship_field:
                # Map {"Object":"Object__Tag"}
                object_to_tag_references[relationship_field["referenceTo"]
                                         [0]] = sobject_name

        fields = sobject_description['fields']
        replication_key = get_replication_key(sobject_name, fields)

        # Set of (field_name, reason) pairs collected while looping the fields
        unsupported_fields = set()
        properties = {}
        mdata = metadata.new()

        found_id_field = False

        # Loop over the object's fields
        for f in fields:
            field_name = f['name']

            if field_name == "Id":
                found_id_field = True

            property_schema, mdata = create_property_schema(f, mdata)

            # Compound Address fields cannot be queried by the Bulk API
            if f['type'] == "address" and sf.api_type == tap_salesforce.salesforce.BULK_API_TYPE:
                unsupported_fields.add(
                    (field_name,
                     'cannot query compound address fields with bulk API'))

            # we haven't been able to observe any records with a json field, so we
            # are marking it as unavailable until we have an example to work with
            if f['type'] == "json":
                unsupported_fields.add((
                    field_name,
                    'do not currently support json fields - please contact support'
                ))

            # Blacklisted fields are dependent on the api_type being used
            field_pair = (sobject_name, field_name)
            if field_pair in blacklisted_fields:
                unsupported_fields.add(
                    (field_name, blacklisted_fields[field_pair]))

            inclusion = metadata.get(mdata, ('properties', field_name),
                                     'inclusion')

            if sf.select_fields_by_default and inclusion != 'unsupported':
                mdata = metadata.write(mdata, ('properties', field_name),
                                       'selected-by-default', True)

            properties[field_name] = property_schema

        if replication_key:
            mdata = metadata.write(mdata, ('properties', replication_key),
                                   'inclusion', 'automatic')

        # There are cases where compound fields are referenced by the associated
        # subfields but are not actually present in the field list
        field_name_set = {f['name'] for f in fields}
        filtered_unsupported_fields = [
            f for f in unsupported_fields if f[0] in field_name_set
        ]
        missing_unsupported_field_names = [
            f[0] for f in unsupported_fields if f[0] not in field_name_set
        ]

        if missing_unsupported_field_names:
            LOGGER.info(
                "Ignoring the following unsupported fields for object %s as they are missing from the field list: %s",
                sobject_name,
                ', '.join(sorted(missing_unsupported_field_names)))

        if filtered_unsupported_fields:
            LOGGER.info(
                "Not syncing the following unsupported fields for object %s: %s",
                sobject_name,
                ', '.join(sorted(k for k, _ in filtered_unsupported_fields)))

        # Salesforce Objects are skipped when they do not have an Id field
        if not found_id_field:
            LOGGER.info("Skipping Salesforce Object %s, as it has no Id field",
                        sobject_name)
            continue

        # Any property added to unsupported_fields has its selected-by-default
        # metadata removed and is marked as unsupported
        for prop, description in filtered_unsupported_fields:
            if metadata.get(mdata, ('properties', prop),
                            'selected-by-default'):
                # NOTE(review): unlike metadata.write, the result is not
                # reassigned here - presumably delete mutates in place; confirm.
                metadata.delete(mdata, ('properties', prop),
                                'selected-by-default')

            mdata = metadata.write(mdata, ('properties', prop),
                                   'unsupported-description', description)
            mdata = metadata.write(mdata, ('properties', prop), 'inclusion',
                                   'unsupported')

        if replication_key:
            mdata = metadata.write(mdata, (), 'valid-replication-keys',
                                   [replication_key])
        else:
            mdata = metadata.write(
                mdata, (), 'forced-replication-method', {
                    'replication-method': 'FULL_TABLE',
                    'reason':
                    'No replication keys found from the Salesforce API'
                })

        mdata = metadata.write(mdata, (), 'table-key-properties',
                               key_properties)

        schema = {
            'type': 'object',
            'additionalProperties': False,
            'properties': properties
        }

        entry = {
            'stream': sobject_name,
            'tap_stream_id': sobject_name,
            'schema': schema,
            'metadata': metadata.to_list(mdata)
        }

        entries.append(entry)

    # For each custom setting field, remove its associated tag from entries
    # See Blacklisting.md for more information
    unsupported_tag_objects = [
        object_to_tag_references[f] for f in sf_custom_setting_objects
        if f in object_to_tag_references
    ]
    if unsupported_tag_objects:
        LOGGER.info(
            "Skipping the following Tag objects, Tags on Custom Settings Salesforce objects "
            "are not supported by the Bulk API:")
        LOGGER.info(unsupported_tag_objects)
        entries = [
            e for e in entries if e['stream'] not in unsupported_tag_objects
        ]

    result = {'streams': entries}
    json.dump(result, sys.stdout, indent=4)
Exemplo n.º 4
0
def do_discover(sf):
    """Describe a fixed list of Salesforce objects and return a Singer catalog.

    Only the hard-coded objects in ``objects_to_discover`` are considered. An
    object is skipped when it is blacklisted, when its name ends in
    ``ChangeEvent``, or when it is absent from the global describe result.
    A JSON schema and Singer metadata are built from each remaining object's
    fields, and every stream is marked ``selected``. Objects without an ``Id``
    field are skipped, and ``__Tag`` objects that reference a custom-setting
    object are removed from the final list.

    Args:
        sf: Salesforce client exposing ``describe``, ``api_type``,
            ``select_fields_by_default``, ``get_blacklisted_objects`` and
            ``get_blacklisted_fields``.

    Returns:
        A dict of the form ``{"streams": [entry, ...]}``.

    Raises:
        TapSalesforceBulkAPIDisabledException: if ``sf.api_type`` is ``"BULK"``
            and ``Bulk(sf).has_permissions()`` is falsy.
    """
    global_description = sf.describe()

    # Names that exist in this Salesforce instance
    objects_set = {o["name"] for o in global_description["sobjects"]}
    objects_to_discover = [
        "Account", "Contact", "Lead", "Opportunity", "Campaign",
        "AccountContactRelation", "AccountContactRole",
        "OpportunityContactRole", "CampaignMember", "Task", "Invoice__c",
        "OpportunityHistory", "AccountHistory", "LeadHistory", "User"
    ]
    key_properties = ["Id"]

    sf_custom_setting_objects = []
    object_to_tag_references = {}

    # For each SF Object describe it, loop its fields and build a schema
    entries = []

    # Check if the user has BULK API enabled
    if sf.api_type == "BULK" and not Bulk(sf).has_permissions():
        raise TapSalesforceBulkAPIDisabledException(
            'This client does not have Bulk API permissions, received "API_DISABLED_FOR_ORG" error code'
        )

    # Blacklists depend on the api_type in use, which does not change during
    # discovery, so fetch them once instead of once per object / per field.
    # NOTE(review): assumes both getters are side-effect free - confirm.
    blacklisted_objects = sf.get_blacklisted_objects()
    blacklisted_fields = sf.get_blacklisted_fields()

    for sobject_name in objects_to_discover:

        # Skip blacklisted SF objects depending on the api_type in use
        # ChangeEvent objects are not queryable via Bulk or REST (undocumented)
        if sobject_name in blacklisted_objects or sobject_name.endswith(
                "ChangeEvent"):
            continue
        # Skip wanted objects that this instance does not have
        if sobject_name not in objects_set:
            continue

        sobject_description = sf.describe(sobject_name)

        # Cache customSetting and Tag objects to check for blacklisting after
        # all objects have been described
        if sobject_description.get("customSetting"):
            sf_custom_setting_objects.append(sobject_name)
        elif sobject_name.endswith("__Tag"):
            relationship_field = next(
                (f for f in sobject_description["fields"]
                 if f.get("relationshipName") == "Item"),
                None,
            )
            if relationship_field:
                # Map {"Object":"Object__Tag"}
                object_to_tag_references[relationship_field["referenceTo"]
                                         [0]] = sobject_name

        fields = sobject_description["fields"]
        replication_key = get_replication_key(sobject_name, fields)

        # Set of (field_name, reason) pairs collected while looping the fields
        unsupported_fields = set()
        properties = {}
        mdata = metadata.new()

        found_id_field = False

        # Loop over the object's fields
        for f in fields:
            field_name = f["name"]

            if field_name == "Id":
                found_id_field = True

            property_schema, mdata = create_property_schema(f, mdata)

            # Compound Address fields cannot be queried by the Bulk API
            if (f["type"] == "address" and sf.api_type
                    == tap_salesforce.salesforce.BULK_API_TYPE):
                unsupported_fields.add(
                    (field_name,
                     "cannot query compound address fields with bulk API"))

            # we haven't been able to observe any records with a json field, so we
            # are marking it as unavailable until we have an example to work with
            if f["type"] == "json":
                unsupported_fields.add((
                    field_name,
                    "do not currently support json fields - please contact support",
                ))

            # Blacklisted fields are dependent on the api_type being used
            field_pair = (sobject_name, field_name)
            if field_pair in blacklisted_fields:
                unsupported_fields.add(
                    (field_name, blacklisted_fields[field_pair]))

            inclusion = metadata.get(mdata, ("properties", field_name),
                                     "inclusion")

            if sf.select_fields_by_default and inclusion != "unsupported":
                mdata = metadata.write(mdata, ("properties", field_name),
                                       "selected-by-default", True)

            properties[field_name] = property_schema

        if replication_key:
            mdata = metadata.write(mdata, ("properties", replication_key),
                                   "inclusion", "automatic")

        # There are cases where compound fields are referenced by the associated
        # subfields but are not actually present in the field list
        field_name_set = {f["name"] for f in fields}
        filtered_unsupported_fields = [
            f for f in unsupported_fields if f[0] in field_name_set
        ]
        missing_unsupported_field_names = [
            f[0] for f in unsupported_fields if f[0] not in field_name_set
        ]

        if missing_unsupported_field_names:
            LOGGER.info(
                "Ignoring the following unsupported fields for object %s as they are missing from the field list: %s",
                sobject_name,
                ", ".join(sorted(missing_unsupported_field_names)),
            )

        if filtered_unsupported_fields:
            LOGGER.info(
                "Not syncing the following unsupported fields for object %s: %s",
                sobject_name,
                ", ".join(sorted(k for k, _ in filtered_unsupported_fields)),
            )

        # Salesforce Objects are skipped when they do not have an Id field
        if not found_id_field:
            LOGGER.info("Skipping Salesforce Object %s, as it has no Id field",
                        sobject_name)
            continue

        # Any property added to unsupported_fields has its selected-by-default
        # metadata removed and is marked as unsupported
        for prop, description in filtered_unsupported_fields:
            if metadata.get(mdata, ("properties", prop),
                            "selected-by-default"):
                # NOTE(review): unlike metadata.write, the result is not
                # reassigned here - presumably delete mutates in place; confirm.
                metadata.delete(mdata, ("properties", prop),
                                "selected-by-default")

            mdata = metadata.write(mdata, ("properties", prop),
                                   "unsupported-description", description)
            mdata = metadata.write(mdata, ("properties", prop), "inclusion",
                                   "unsupported")

        if replication_key:
            mdata = metadata.write(mdata, (), "valid-replication-keys",
                                   [replication_key])
        else:
            mdata = metadata.write(
                mdata,
                (),
                "forced-replication-method",
                {
                    "replication-method": "FULL_TABLE",
                    "reason":
                    "No replication keys found from the Salesforce API",
                },
            )

        mdata = metadata.write(mdata, (), "table-key-properties",
                               key_properties)
        # Every discovered stream is pre-selected in the catalog
        mdata = metadata.write(mdata, (), "selected", True)

        schema = {
            "type": "object",
            "additionalProperties": False,
            "properties": properties,
        }

        entry = {
            "stream": sobject_name,
            "tap_stream_id": sobject_name,
            "schema": schema,
            "metadata": metadata.to_list(mdata),
        }

        entries.append(entry)

    # For each custom setting field, remove its associated tag from entries
    # See Blacklisting.md for more information
    unsupported_tag_objects = [
        object_to_tag_references[f] for f in sf_custom_setting_objects
        if f in object_to_tag_references
    ]
    if unsupported_tag_objects:
        LOGGER.info(
            "Skipping the following Tag objects, Tags on Custom Settings Salesforce objects "
            "are not supported by the Bulk API:")
        LOGGER.info(unsupported_tag_objects)
        entries = [
            e for e in entries if e["stream"] not in unsupported_tag_objects
        ]

    result = {"streams": entries}
    return result