def sync_companies(STATE, catalog):
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    start = utils.strptime_with_tz(get_start(STATE, "companies", 'hs_lastmodifieddate'))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"], catalog.get('stream_alias'))
    metadata = catalog.get('metadata', {})
    url = get_url("companies_all")
    max_bk_value = start

    with bumble_bee:
        params = default_company_params
        if metadata.get('additional_properties'):
            params['properties'] = params['properties'] + metadata['additional_properties']

        for row in gen_request(STATE, 'companies', url, params, 'companies',
                               'has-more', ['offset'], ['offset']):
            row_properties = row['properties']
            modified_time = None
            if 'hs_lastmodifieddate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['hs_lastmodifieddate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                # 'additional_properties' implies detail requests should be skipped
                if metadata.get('additional_properties'):
                    record = bumble_bee.transform(row, schema)
                else:
                    response = request(get_url("companies_detail",
                                               company_id=row['companyId'])).json()
                    record = bumble_bee.transform(response, schema)
                singer.write_record("companies", record, catalog.get('stream_alias'))
                # Syncing contacts by company is enabled by default
                if metadata.get('sync_contacts_by_company', True):
                    STATE = _sync_contacts_by_company(STATE, record['companyId'])

    STATE = singer.write_bookmark(STATE, 'companies', 'hs_lastmodifieddate',
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
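`get_start` above is a tap-level helper not shown in this section. A minimal sketch of what it might look like, assuming the bookmark lives in standard singer state and a module-level `CONFIG` dict supplies the fallback `start_date` (both are assumptions here):

import singer

def get_start(state, tap_stream_id, bookmark_key):
    # Prefer the saved bookmark; otherwise fall back to the tap's
    # configured start date so the first sync has a lower bound.
    current_bookmark = singer.get_bookmark(state, tap_stream_id, bookmark_key)
    if current_bookmark is None:
        return CONFIG['start_date']  # hypothetical module-level config dict
    return current_bookmark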
def sync(self, state: dict, stream_schema: dict, stream_metadata: dict, config: dict,
         transformer: Transformer) -> dict:
    """
    The sync logic for a full table stream.

    :param state: A dictionary representing singer state
    :param stream_schema: A dictionary containing the stream schema
    :param stream_metadata: A dictionary containing stream metadata
    :param config: A dictionary containing tap config data
    :param transformer: A singer Transformer object
    :return: State data in the form of a dictionary
    """
    with metrics.record_counter(self.tap_stream_id) as counter:
        for record in self.get_records(config):
            transformed_record = transformer.transform(record, stream_schema, stream_metadata)
            singer.write_record(self.tap_stream_id, transformed_record)
            counter.increment()

    singer.write_state(state)
    return state
def sync(self, state: dict, stream_schema: dict, stream_metadata: dict, config: dict,
         transformer: Transformer) -> dict:
    """
    The sync logic for an incremental stream.

    :param state: A dictionary representing singer state
    :param stream_schema: A dictionary containing the stream schema
    :param stream_metadata: A dictionary containing stream metadata
    :param config: A dictionary containing tap config data
    :param transformer: A singer Transformer object
    :return: State data in the form of a dictionary
    """
    start_date = singer.get_bookmark(state, self.tap_stream_id, self.replication_key,
                                     config['start_date'])
    bookmark_datetime = singer.utils.strptime_to_utc(start_date)
    max_datetime = bookmark_datetime

    with metrics.record_counter(self.tap_stream_id) as counter:
        for record in self.get_records(bookmark_datetime):
            record_datetime = rfc2822_to_datetime(record[self.replication_key])
            if record_datetime >= bookmark_datetime:
                transform_keys_to_snake_case(record)
                transformed_record = transformer.transform(record, stream_schema,
                                                           stream_metadata)
                singer.write_record(self.tap_stream_id, transformed_record)
                counter.increment()
                max_datetime = max(record_datetime, max_datetime)

    bookmark_date = singer.utils.strftime(max_datetime)
    state = singer.write_bookmark(state, self.tap_stream_id, self.replication_key,
                                  bookmark_date)
    return state
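`rfc2822_to_datetime` is referenced but not defined here. A minimal sketch, assuming the replication key holds an RFC 2822 timestamp such as "Mon, 02 Jan 2006 15:04:05 +0000" (the helper name and exact format are assumptions from the call site):

from datetime import datetime, timezone
from email.utils import parsedate_to_datetime

def rfc2822_to_datetime(value: str) -> datetime:
    # parsedate_to_datetime handles RFC 2822 natively; normalize to UTC
    # so comparisons against the UTC bookmark are safe.
    parsed = parsedate_to_datetime(value)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)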
def sync(self, state: dict, stream_schema: dict, stream_metadata: dict, config: dict,
         transformer: Transformer) -> dict:
    """
    The sync logic for an incremental stream.

    :param state: A dictionary representing singer state
    :param stream_schema: A dictionary containing the stream schema
    :param stream_metadata: A dictionary containing stream metadata
    :param config: A dictionary containing tap config data
    :param transformer: A singer Transformer object
    :return: State data in the form of a dictionary
    """
    start_time = singer.get_bookmark(state, self.tap_stream_id, self.replication_key,
                                     config['start_date'])
    max_record_value = start_time

    with metrics.record_counter(self.tap_stream_id) as counter:
        for record in self.get_records(config.get('max_pagesize'), max_record_value):
            transformed_record = transformer.transform(record, stream_schema, stream_metadata)
            record_replication_value = singer.utils.strptime_to_utc(
                transformed_record[self.replication_key])
            if record_replication_value >= singer.utils.strptime_to_utc(max_record_value):
                singer.write_record(self.tap_stream_id, transformed_record)
                counter.increment()
                max_record_value = record_replication_value.isoformat()

    state = singer.write_bookmark(state, self.tap_stream_id, self.replication_key,
                                  max_record_value)
    singer.write_state(state)
    return state
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"], [bookmark_key],
                        catalog.get('stream_alias'))
    url = get_url("companies_all")
    max_bk_value = start

    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema,
                            ["company-id", "contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params,
                               'companies', 'has-more', ['offset'], ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(get_url("companies_detail",
                                         company_id=row['companyId'])).json()
                record = bumble_bee.transform(record, schema, mdata)
                singer.write_record("companies", record, catalog.get('stream_alias'),
                                    time_extracted=utils.now())
                if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
                    STATE = _sync_contacts_by_company(STATE, record['companyId'])

    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Because this stream doesn't query by `lastUpdated`, it cycles through
    # the entire data set on every run. That opens a race condition: records
    # may be updated between the start of this table's sync and the end,
    # causing some updates to go uncaptured. To combat this, we store the
    # current sync's start in the state and never move the bookmark past
    # that value.
    current_sync_start = get_current_sync_start(STATE, "companies") or utils.now()
    STATE = write_current_sync_start(STATE, "companies", current_sync_start)
    singer.write_state(STATE)

    url = get_url("companies_all")
    max_bk_value = start

    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema,
                            ["contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params,
                               'companies', 'has-more', ['offset'], ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    url=get_url("companies_detail", company_id=row['companyId']),
                    params={"includePropertyVersions": "true"}
                ).json()
                record = bumble_bee.transform(lift_properties_and_versions(record),
                                              schema, mdata)
                singer.write_record("companies", record, catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    # Don't bookmark past the start of this sync, to account for records
    # updated while the sync was running.
    new_bookmark = min(max_bk_value, current_sync_start)
    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key,
                                  utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'companies', None)
    singer.write_state(STATE)
    return STATE
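The `get_current_sync_start` / `write_current_sync_start` helpers are not defined in this section. A plausible sketch, assuming the sync start is stashed under a `current_sync_start` bookmark key next to the stream's other bookmarks (the key name is an assumption):

import singer
from singer import utils

def get_current_sync_start(state, tap_stream_id):
    # Returns the stored sync-start as a datetime, or None on a fresh sync.
    value = singer.get_bookmark(state, tap_stream_id, 'current_sync_start')
    return utils.strptime_to_utc(value) if value else None

def write_current_sync_start(state, tap_stream_id, start):
    # Persist the sync-start marker, or clear it when `start` is None.
    value = utils.strftime(start) if start is not None else None
    return singer.write_bookmark(state, tap_stream_id, 'current_sync_start', value)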
def sync(self, state: dict, stream_schema: dict, stream_metadata: dict, config: dict,
         transformer: Transformer) -> dict:
    """
    The sync logic for an incremental stream.

    :param state: A dictionary representing singer state
    :param stream_schema: A dictionary containing the stream schema
    :param stream_metadata: A dictionary containing stream metadata
    :param config: A dictionary containing tap config data
    :param transformer: A singer Transformer object
    :return: State data in the form of a dictionary
    """
    start_date = get_recharge_bookmark(state, self.tap_stream_id, config['start_date'])
    bookmark_datetime = utils.strptime_to_utc(start_date)
    max_datetime = bookmark_datetime

    with metrics.record_counter(self.tap_stream_id) as counter:
        for record in self.get_records(config, bookmark_datetime):
            transformed_record = transformer.transform(record, stream_schema,
                                                       stream_metadata)
            replication_value = transformed_record.get(self.replication_key)
            if replication_value:
                # Write the record only if it is at or past the bookmark
                # (or start date), and advance the running maximum.
                record_datetime = utils.strptime_to_utc(replication_value)
                if record_datetime >= bookmark_datetime:
                    singer.write_record(self.tap_stream_id, transformed_record)
                    counter.increment()
                    max_datetime = max(record_datetime, max_datetime)
            else:
                # If the record has no replication value, write it unconditionally.
                singer.write_record(self.tap_stream_id, transformed_record)
                counter.increment()

    bookmark_date = utils.strftime(max_datetime)
    state = write_recharge_bookmark(state, self.tap_stream_id, bookmark_date)
    singer.write_state(state)
    return state
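`get_recharge_bookmark` / `write_recharge_bookmark` are referenced but not shown. A minimal sketch, assuming this tap keeps one bookmark value per stream under a top-level `bookmarks` map (the state layout is an assumption; the real tap may nest by replication key):

def get_recharge_bookmark(state: dict, tap_stream_id: str, default: str = None) -> str:
    # Look up the stream's saved bookmark, falling back to the default
    # (typically the configured start_date).
    return state.get('bookmarks', {}).get(tap_stream_id, default)

def write_recharge_bookmark(state: dict, tap_stream_id: str, value: str) -> dict:
    # Record the new bookmark value for the stream and return the state.
    state.setdefault('bookmarks', {})[tap_stream_id] = value
    return state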
def build_record(record: dict, schema: Schema) -> dict:
    lift_properties_and_pop_versions(record)
    transformer = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    record = transformer.transform(record, schema)
    return record
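`lift_properties_and_pop_versions` is referenced but not defined here. A plausible sketch, assuming HubSpot-style records where each entry under `properties` should be promoted to a top-level `property_<name>` key and its `versions` history discarded (the flattening scheme is an assumption from the helper's name):

def lift_properties_and_pop_versions(record: dict) -> dict:
    # Promote each nested property to a top-level `property_<name>` key,
    # dropping its `versions` history, and mutate the record in place.
    for name, prop in record.get('properties', {}).items():
        prop.pop('versions', None)
        record[f'property_{name}'] = prop
    return record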