Example #1
def sync_companies(STATE, catalog):
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    start = utils.strptime_with_tz(
        get_start(STATE, "companies", 'hs_lastmodifieddate'))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"],
                        catalog.get('stream_alias'))
    metadata = catalog.get('metadata', {})

    url = get_url("companies_all")
    max_bk_value = start

    with bumble_bee:
        # Copy the defaults so repeated syncs don't mutate the shared dict.
        params = dict(default_company_params)
        if metadata.get('additional_properties'):
            params['properties'] = params['properties'] + metadata[
                'additional_properties']
        for row in gen_request(STATE, 'companies', url, params, 'companies',
                               'has-more', ['offset'], ['offset']):
            row_properties = row['properties']
            modified_time = None
            if 'hs_lastmodifieddate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['hs_lastmodifieddate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                # 'additional_properties' implies detail requests should be skipped
                if metadata.get('additional_properties'):
                    record = bumble_bee.transform(row, schema)
                else:
                    response = request(
                        get_url("companies_detail",
                                company_id=row['companyId'])).json()
                    record = bumble_bee.transform(response, schema)

                singer.write_record("companies", record,
                                    catalog.get('stream_alias'))

                # Syncing contacts by company is enabled by default
                if metadata.get('sync_contacts_by_company', True):
                    STATE = _sync_contacts_by_company(STATE,
                                                      record['companyId'])

    STATE = singer.write_bookmark(STATE, 'companies', 'hs_lastmodifieddate',
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
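A note on the timestamp handling above: HubSpot reports property timestamps as millisecond epochs, and the tap compares them as aware UTC datetimes. A minimal, self-contained sketch of that conversion (the function name is illustrative, not from the tap):

import datetime

def millis_to_utc(timestamp_millis):
    """Convert a HubSpot-style millisecond epoch into an aware UTC
    datetime suitable for bookmark comparisons."""
    return datetime.datetime.fromtimestamp(timestamp_millis / 1000.0,
                                           datetime.timezone.utc)

# 1609459200000 ms -> 2021-01-01 00:00:00+00:00
print(millis_to_utc(1609459200000))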
Example #2
    def sync(
            self,
            state: dict,
            stream_schema: dict,
            stream_metadata: dict,
            config: dict,
            transformer: Transformer) -> dict:
        """
        The sync logic for a full table stream.

        :param state: A dictionary representing singer state
        :param stream_schema: A dictionary containing the stream schema
        :param stream_metadata: A dictionary containing stream metadata
        :param config: A dictionary containing tap config data
        :param transformer: A singer Transformer object
        :return: State data in the form of a dictionary
        """
        with metrics.record_counter(self.tap_stream_id) as counter:
            for record in self.get_records(config):
                transformed_record = transformer.transform(
                    record,
                    stream_schema,
                    stream_metadata)
                singer.write_record(self.tap_stream_id, transformed_record)
                counter.increment()

        singer.write_state(state)

        return state
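For context, here is a hedged sketch of how a full-table stream like this might be driven end to end. The `users` stream, its records, and the schema are all made up for illustration, but the singer calls mirror the ones in the method above:

import singer
from singer import Transformer, metrics

# Hypothetical stand-ins for the attributes the sync() above expects.
TAP_STREAM_ID = 'users'
SCHEMA = {'type': 'object',
          'properties': {'id': {'type': 'integer'},
                         'name': {'type': 'string'}}}

def get_records(config):
    # A real tap would page through the source API here.
    yield {'id': 1, 'name': 'Ada'}
    yield {'id': 2, 'name': 'Grace'}

singer.write_schema(TAP_STREAM_ID, SCHEMA, ['id'])
with Transformer() as transformer, \
        metrics.record_counter(TAP_STREAM_ID) as counter:
    for record in get_records({}):
        singer.write_record(TAP_STREAM_ID,
                            transformer.transform(record, SCHEMA, {}))
        counter.increment()
singer.write_state({})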
Example #3
    def sync(self, state: dict, stream_schema: dict, stream_metadata: dict,
             config: dict, transformer: Transformer) -> dict:
        """
        The sync logic for an incremental stream.

        :param state: A dictionary representing singer state
        :param stream_schema: A dictionary containing the stream schema
        :param stream_metadata: A dictionary containing stream metadata
        :param config: A dictionary containing tap config data
        :param transformer: A singer Transformer object
        :return: State data in the form of a dictionary
        """
        start_date = singer.get_bookmark(state, self.tap_stream_id,
                                         self.replication_key,
                                         config['start_date'])

        bookmark_datetime = singer.utils.strptime_to_utc(start_date)
        max_datetime = bookmark_datetime

        with metrics.record_counter(self.tap_stream_id) as counter:
            for record in self.get_records(bookmark_datetime):
                record_datetime = rfc2822_to_datetime(
                    record[self.replication_key])
                if record_datetime >= bookmark_datetime:
                    transform_keys_to_snake_case(record)
                    transformed_record = transformer.transform(
                        record, stream_schema, stream_metadata)
                    singer.write_record(self.tap_stream_id, transformed_record)
                    counter.increment()
                    max_datetime = max(record_datetime, max_datetime)
            bookmark_date = singer.utils.strftime(max_datetime)

        state = singer.write_bookmark(state, self.tap_stream_id,
                                      self.replication_key, bookmark_date)
        return state
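`rfc2822_to_datetime` and `transform_keys_to_snake_case` are helpers defined elsewhere in this tap. One plausible implementation of the former, built on the standard library (an assumption, not the tap's actual code):

from datetime import timezone
from email.utils import parsedate_to_datetime

def rfc2822_to_datetime(value):
    """Parse an RFC 2822 date string into an aware UTC datetime."""
    return parsedate_to_datetime(value).astimezone(timezone.utc)

# 'Fri, 01 Jan 2021 12:00:00 -0500' -> 2021-01-01 17:00:00+00:00
print(rfc2822_to_datetime('Fri, 01 Jan 2021 12:00:00 -0500'))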
Example #4
    def sync(self, state: dict, stream_schema: dict, stream_metadata: dict, config: dict, transformer: Transformer) -> dict:
        """
        The sync logic for an incremental stream.

        :param state: A dictionary representing singer state
        :param stream_schema: A dictionary containing the stream schema
        :param stream_metadata: A dictionary containing stream metadata
        :param config: A dictionary containing tap config data
        :param transformer: A singer Transformer object
        :return: State data in the form of a dictionary
        """
        start_time = singer.get_bookmark(state,
                                         self.tap_stream_id,
                                         self.replication_key,
                                         config['start_date'])
        max_record_value = start_time

        with metrics.record_counter(self.tap_stream_id) as counter:
            for record in self.get_records(config.get('max_pagesize'), max_record_value):
                transformed_record = transformer.transform(record, stream_schema, stream_metadata)
                record_replication_value = singer.utils.strptime_to_utc(transformed_record[self.replication_key])
                if record_replication_value >= singer.utils.strptime_to_utc(max_record_value):
                    singer.write_record(self.tap_stream_id, transformed_record)
                    counter.increment()
                    max_record_value = record_replication_value.isoformat()

        state = singer.write_bookmark(state, self.tap_stream_id, self.replication_key, max_record_value)
        singer.write_state(state)
        return state
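One quirk here: `max_record_value` is kept as a string, so the bookmark is re-parsed with `strptime_to_utc` on every comparison. A sketch of the same loop tracking the maximum as a datetime instead, so the string is parsed once and formatted once at the end (the `updated_at` replication key and function name are hypothetical):

import singer
from singer import metrics, utils

def incremental_sync(stream_id, records, schema, mdata, bookmark, transformer):
    """Sketch: same filter-and-bookmark loop as above, with the running
    maximum held as a datetime. `records` stands in for
    self.get_records(...)."""
    max_dt = utils.strptime_to_utc(bookmark)
    with metrics.record_counter(stream_id) as counter:
        for record in records:
            transformed = transformer.transform(record, schema, mdata)
            record_dt = utils.strptime_to_utc(transformed['updated_at'])
            if record_dt >= max_dt:
                singer.write_record(stream_id, transformed)
                counter.increment()
                max_dt = record_dt
    return utils.strftime(max_dt)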
Example #5
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"], [bookmark_key],
                        catalog.get('stream_alias'))

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema,
                            ["company-id", "contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params,
                               'companies', 'has-more', ['offset'],
                               ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    get_url("companies_detail",
                            company_id=row['companyId'])).json()
                record = bumble_bee.transform(record, schema, mdata)
                singer.write_record("companies",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())
                if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
                    STATE = _sync_contacts_by_company(STATE,
                                                      record['companyId'])

    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
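`get_start` is a tap-level helper not shown here. A plausible sketch of it (hedged; the real helper likely reads the configured start date from the tap config rather than a hard-coded default):

import singer

def get_start(state, tap_stream_id, bookmark_key,
              default_start_date='2017-01-01T00:00:00Z'):
    """Hypothetical get_start(): return the saved bookmark for the
    stream, falling back to a default start date on the first sync."""
    return singer.get_bookmark(state, tap_stream_id, bookmark_key) \
        or default_start_date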
Example #6
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"], [bookmark_key], catalog.get('stream_alias'))

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the full data set on every run. That leaves a race
    # condition: records may be updated between the start of this
    # table's sync and the end, so some updates would never be captured.
    # To combat this, we store the current sync's start in the state and
    # never move the bookmark past that value.
    current_sync_start = get_current_sync_start(STATE, "companies") or utils.now()
    STATE = write_current_sync_start(STATE, "companies", current_sync_start)
    singer.write_state(STATE)

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema, ["contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params, 'companies', 'has-more', ['offset'],
                               ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    url=get_url("companies_detail", company_id=row['companyId']),
                    params={"includePropertyVersions": "true"}
                ).json()
                record = bumble_bee.transform(lift_properties_and_versions(record), schema, mdata)
                singer.write_record("companies", record, catalog.get('stream_alias'), time_extracted=utils.now())

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(max_bk_value, current_sync_start)
    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key, utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'companies', None)
    singer.write_state(STATE)
    return STATE
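The sync-start bookkeeping leans on two helpers that aren't shown. One plausible implementation, stashing the value under its own bookmark key (an assumption about their shape, not the tap's verbatim code):

import singer
from singer import utils

def get_current_sync_start(state, tap_stream_id):
    # Return the stashed sync start as an aware datetime, or None.
    value = singer.get_bookmark(state, tap_stream_id, 'current_sync_start')
    return utils.strptime_to_utc(value) if value else None

def write_current_sync_start(state, tap_stream_id, start):
    # Stash (or clear, when start is None) the sync's start time.
    value = utils.strftime(start) if start else None
    return singer.write_bookmark(state, tap_stream_id,
                                 'current_sync_start', value)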
Example #7
    def sync(self, state: dict, stream_schema: dict, stream_metadata: dict,
             config: dict, transformer: Transformer) -> dict:
        """
        The sync logic for an incremental stream.

        :param state: A dictionary representing singer state
        :param stream_schema: A dictionary containing the stream schema
        :param stream_metadata: A dictionary containing stream metadata
        :param config: A dictionary containing tap config data
        :param transformer: A singer Transformer object
        :return: State data in the form of a dictionary
        """
        start_date = get_recharge_bookmark(state, self.tap_stream_id,
                                           config['start_date'])
        bookmark_datetime = utils.strptime_to_utc(start_date)
        max_datetime = bookmark_datetime

        with metrics.record_counter(self.tap_stream_id) as counter:
            for record in self.get_records(config, bookmark_datetime):
                transformed_record = transformer.transform(
                    record, stream_schema, stream_metadata)
                replication_value = transformed_record.get(
                    self.replication_key)

                # Records with a replication value are filtered against
                # the bookmark; records without one are always written.
                if replication_value:
                    record_datetime = utils.strptime_to_utc(replication_value)

                    # Write the record only if it is at or past the
                    # bookmark (or the configured start date).
                    if record_datetime >= bookmark_datetime:
                        singer.write_record(self.tap_stream_id,
                                            transformed_record)
                        counter.increment()
                        max_datetime = max(record_datetime, max_datetime)
                else:
                    singer.write_record(self.tap_stream_id, transformed_record)
                    counter.increment()

            bookmark_date = utils.strftime(max_datetime)

        state = write_recharge_bookmark(state, self.tap_stream_id,
                                        bookmark_date)

        singer.write_state(state)

        return state
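`get_recharge_bookmark` and `write_recharge_bookmark` are thin wrappers defined elsewhere in the tap; a minimal, hypothetical sketch of them over a plain `bookmarks` dict:

def get_recharge_bookmark(state, tap_stream_id, default=None):
    # Read the stream's saved bookmark, falling back to the start date.
    return state.get('bookmarks', {}).get(tap_stream_id, default)

def write_recharge_bookmark(state, tap_stream_id, value):
    # Save the stream's bookmark and hand the state back for chaining.
    state.setdefault('bookmarks', {})[tap_stream_id] = value
    return state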
Example #8
def build_record(record: dict, schema: Schema) -> dict:
    # Flatten HubSpot's nested properties, then apply the standard
    # singer transform with millisecond-epoch datetime parsing.
    lift_properties_and_pop_versions(record)
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as transformer:
        record = transformer.transform(record, schema)
    return record
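`lift_properties_and_pop_versions` comes from the surrounding tap. A hedged sketch of what it plausibly does, lifting each HubSpot property to the top level and dropping the bulky per-property version history (key naming is an assumption):

def lift_properties_and_pop_versions(record):
    """Hypothetical helper: promote each entry under record['properties']
    to a top-level 'property_<name>' key and drop its 'versions' list,
    mutating the record in place."""
    for name, prop in record.get('properties', {}).items():
        if isinstance(prop, dict):
            prop.pop('versions', None)
        record['property_' + name] = prop
    return record

# e.g. {'properties': {'name': {'value': 'Acme', 'versions': [...]}}}
# becomes {..., 'property_name': {'value': 'Acme'}}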