def sync_generic_basic_endpoint(sdk_client, stream, stream_metadata):
    discovered_schema = load_schema(stream)
    field_list = get_field_list(discovered_schema, stream, stream_metadata)

    discovered_schema['properties']['_sdc_customer_id'] = {
        'description': 'Profile ID',
        'type': 'string',
        'field': "customer_id"
    }
    primary_keys = GENERIC_ENDPOINT_MAPPINGS[stream]['primary_keys']
    write_schema(stream, discovered_schema, primary_keys)

    LOGGER.info("Syncing %s for customer %s", stream, sdk_client.client_customer_id)

    start_index = 0
    selector = {
        'fields': field_list,
        'paging': {
            'startIndex': str(start_index),
            'numberResults': str(PAGE_SIZE)
        }
    }

    while True:
        page = get_page(sdk_client, selector, stream, start_index)
        if page['totalNumEntries'] > GOOGLE_MAX_START_INDEX:
            # Exception does not interpolate logger-style %s args, so build
            # the message with str.format (and put the operands in the same
            # order as the comparison).
            raise Exception("Too many {} ({} > {}) for customer {}".format(
                stream,
                page['totalNumEntries'],
                GOOGLE_MAX_START_INDEX,
                sdk_client.client_customer_id))
        if 'entries' in page:
            with metrics.record_counter(stream) as counter:
                time_extracted = utils.now()
                for entry in page['entries']:
                    obj = suds_to_dict(entry)
                    obj['_sdc_customer_id'] = sdk_client.client_customer_id
                    with Transformer(singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                        bumble_bee.pre_hook = transform_pre_hook
                        record = bumble_bee.transform(obj, discovered_schema)
                        singer.write_record(stream, record, time_extracted=time_extracted)
                        counter.increment()
        start_index += PAGE_SIZE
        if start_index > int(page['totalNumEntries']):
            break
    LOGGER.info("Done syncing %s for customer_id %s", stream, sdk_client.client_customer_id)
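A minimal, self-contained sketch of the offset-paging loop above, with a hypothetical fetch_page standing in for get_page (the response keys mirror the AdWords-style payload):

PAGE_SIZE = 3

def fetch_page(start_index, page_size):
    # Stand-in for get_page: slices a local list the way the API pages results.
    data = list(range(8))
    return {
        'entries': data[start_index:start_index + page_size],
        'totalNumEntries': len(data),
    }

start_index = 0
while True:
    page = fetch_page(start_index, PAGE_SIZE)
    for entry in page.get('entries', []):
        print(entry)
    start_index += PAGE_SIZE
    if start_index > int(page['totalNumEntries']):
        break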
def sync_branches(project):
    url = get_url("branches", project['id'])
    with Transformer(pre_hook=transform_row) as transformer:
        for row in gen_request(url):
            row['project_id'] = project['id']
            flatten_id(row, "commit")
            transformed_row = transformer.transform(row, RESOURCES["branches"]["schema"])
            singer.write_record("branches", transformed_row, time_extracted=utils.now())
def sync_issues(project):
    url = get_url("issues", project['id'])
    with Transformer(pre_hook=format_timestamp) as transformer:
        for row in gen_request(url):
            flatten_id(row, "author")
            flatten_id(row, "assignee")
            flatten_id(row, "milestone")
            transformed_row = transformer.transform(row, RESOURCES["issues"]["schema"])
            if row["updated_at"] >= get_start("project_{}".format(project["id"])):
                singer.write_record("issues", transformed_row, time_extracted=utils.now())
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)

    params = {'count': 250, 'includeAssociations': False, 'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key], catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if assoc_mdata.get('selected') is True:
                params['includeAssociations'] = True

    if mdata.get(('properties', 'properties'), {}).get('selected') or has_selected_custom_field(mdata):
        # On 2/12/20, HubSpot added a lot of additional properties for
        # deals, and appending all of them to requests ended up leading to
        # 414 (url-too-long) errors. HubSpot recommended we use the
        # `includeAllProperties` and `allPropertiesFetchMode` params
        # instead.
        params['includeAllProperties'] = True
        params['allPropertiesFetchMode'] = 'latest_version'

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals', "hasMore", ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # HubSpot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # HubSpot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)
                singer.write_record("deals", record, catalog.get('stream_alias'), time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key, utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
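The millisecond-epoch handling above is easy to get wrong, so here is a sketch of just that conversion (the property payload is illustrative):

import datetime

row_properties = {'hs_lastmodifieddate': {'timestamp': 1581465600000}}  # millis, per HubSpot
timestamp_millis = row_properties['hs_lastmodifieddate']['timestamp'] / 1000.0
modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)
print(modified_time.isoformat())  # 2020-02-12T00:00:00+00:00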
def process_records(catalog, #pylint: disable=too-many-branches
                    stream_name,
                    records,
                    time_extracted,
                    bookmark_field=None,
                    max_bookmark_value=None,
                    last_datetime=None,
                    parent=None,
                    parent_id=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # If child object, add parent_id to record
            if parent_id and parent:
                record[parent + '_id'] = parent_id

            # Transform record for Singer.io
            with Transformer() as transformer:
                try:
                    transformed_record = transformer.transform(record, schema, stream_metadata)
                except Exception as err:
                    LOGGER.error('Transformer Error: {}'.format(err))
                    LOGGER.error('Stream: {}, record: {}'.format(stream_name, record))
                    raise err

                # Reset max_bookmark_value to new value if higher
                if transformed_record.get(bookmark_field):
                    if max_bookmark_value is None or \
                            transformed_record[bookmark_field] > transform_datetime(max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    last_dttm = transform_datetime(last_datetime)
                    bookmark_dttm = transform_datetime(transformed_record[bookmark_field])
                    # Keep only records whose bookmark is after the last_datetime
                    if bookmark_dttm and bookmark_dttm >= last_dttm:
                        write_record(stream_name, transformed_record, time_extracted=time_extracted)
                        counter.increment()
                else:
                    write_record(stream_name, transformed_record, time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value
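A stand-alone sketch of the bookmark filter in process_records, with a toy transform_datetime in place of the tap's helper (all names and values are illustrative):

from datetime import datetime, timezone

def transform_datetime(value):
    # Toy normalizer: parse ISO-8601 into an aware datetime for comparison.
    return datetime.fromisoformat(value).replace(tzinfo=timezone.utc)

records = [{'updated_at': '2023-01-02T00:00:00'},
           {'updated_at': '2022-12-31T00:00:00'}]
last_datetime = '2023-01-01T00:00:00'
max_bookmark_value = None

for record in records:
    bookmark = record['updated_at']
    if max_bookmark_value is None or transform_datetime(bookmark) > transform_datetime(max_bookmark_value):
        max_bookmark_value = bookmark
    # Keep only records at or after the previous bookmark.
    if transform_datetime(bookmark) >= transform_datetime(last_datetime):
        print('emit', record)

print('new bookmark:', max_bookmark_value)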
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties, [bookmark_key], catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + CHUNK_SIZES[entity_name]
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if bool(our_offset) and our_offset.get('offset') is not None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(row, schema, mdata)
                        singer.write_record(entity_name, record, catalog.get('stream_alias'),
                                            time_extracted=time_extracted)

                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset', data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break

            STATE = singer.write_bookmark(
                STATE, entity_name, 'startTimestamp',
                utils.strftime(datetime.datetime.fromtimestamp(start_ts / 1000, datetime.timezone.utc)))
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE
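The windowing in sync_entity_chunked reduces to a fixed-size walk over epoch-millisecond ranges; a sketch with an illustrative one-day chunk size:

import datetime

CHUNK_SIZE = 24 * 60 * 60 * 1000  # one day in millis (CHUNK_SIZES[entity_name] in the tap)
now_ts = int(datetime.datetime.now(datetime.timezone.utc).timestamp() * 1000)
start_ts = now_ts - 3 * CHUNK_SIZE  # pretend the bookmark is three days back

while start_ts < now_ts:
    end_ts = start_ts + CHUNK_SIZE
    # The tap would call request(url, {'startTimestamp': start_ts, 'endTimestamp': end_ts, ...}) here.
    print('window', start_ts, '->', end_ts)
    start_ts = end_ts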
def sync_stream(service, catalog, state, start_date, stream, mdata):
    stream_name = stream.tap_stream_id
    last_datetime = get_bookmark(state, stream_name, start_date)
    write_schema(stream)

    max_modified = last_datetime

    ## TODO: add metrics?
    entitycls = service.entities[stream_name]
    query = service.query(entitycls)

    if hasattr(entitycls, MODIFIED_DATE_FIELD):
        LOGGER.info('{} - Syncing data since {}'.format(stream.tap_stream_id, last_datetime))
        query = (query
                 .filter(getattr(entitycls, MODIFIED_DATE_FIELD) >=
                         singer.utils.strptime_with_tz(last_datetime))
                 .order_by(getattr(entitycls, MODIFIED_DATE_FIELD).asc()))
    else:
        LOGGER.info('{} - Syncing using full replication'.format(stream.tap_stream_id))

    schema = stream.schema.to_dict()

    optionset_map = get_optionset_metadata(service, stream.tap_stream_id)

    with metrics.record_counter(stream.tap_stream_id) as counter:
        for record in query:
            dict_record = {}
            for odata_prop in entitycls.__odata_schema__['properties']:
                prop_name = odata_prop['name']
                value = getattr(record, prop_name)
                if isinstance(value, datetime):
                    value = singer.utils.strftime(value)
                dict_record[prop_name] = value

                if prop_name in optionset_map:
                    label_prop_name = get_optionset_fieldname(prop_name)
                    if value is None:
                        dict_record[label_prop_name] = None
                    else:
                        dict_record[label_prop_name] = optionset_map[prop_name][value]

            if MODIFIED_DATE_FIELD in dict_record and dict_record[MODIFIED_DATE_FIELD] > max_modified:
                max_modified = dict_record[MODIFIED_DATE_FIELD]

            with Transformer() as transformer:
                dict_record_t = transformer.transform(dict_record, schema, mdata)
                singer.write_record(stream.tap_stream_id, dict_record_t)
                counter.increment()

    write_bookmark(state, stream_name, max_modified)
def sync_users(project):
    url = get_url("users", project['id'])
    project["users"] = []
    with Transformer(pre_hook=transform_row) as transformer:
        for row in gen_request(url):
            transformed_row = transformer.transform(row, RESOURCES["users"]["schema"])
            project["users"].append(row["id"])
            singer.write_record("users", transformed_row, time_extracted=utils.now())
def sync_stream(service, catalog, state, start_date, stream, mdata):
    stream_name = stream.tap_stream_id
    last_datetime = get_bookmark(state, stream_name, start_date)
    write_schema(stream)

    max_modified = last_datetime

    ## TODO: add metrics?
    entitycls = service.entities[stream_name]
    query = service.query(entitycls)

    if hasattr(entitycls, MODIFIED_DATE_FIELD):
        LOGGER.info("{} - Syncing data since {}".format(stream.tap_stream_id, last_datetime))
        query = (query
                 .filter(getattr(entitycls, MODIFIED_DATE_FIELD) >=
                         singer.utils.strptime_with_tz(last_datetime))
                 .order_by(getattr(entitycls, MODIFIED_DATE_FIELD).asc()))
    else:
        LOGGER.info("{} - Syncing using full replication".format(stream.tap_stream_id))

    schema = stream.schema.to_dict()

    count = 0
    with metrics.http_request_timer(stream.tap_stream_id):
        with metrics.record_counter(stream.tap_stream_id) as counter:
            for record in query:
                dict_record = {}
                for odata_prop in entitycls.__odata_schema__["properties"]:
                    prop_name = odata_prop["name"]
                    value = getattr(record, prop_name)
                    if isinstance(value, datetime):
                        value = singer.utils.strftime(value)
                    dict_record[prop_name] = value

                if MODIFIED_DATE_FIELD in dict_record:
                    if dict_record[MODIFIED_DATE_FIELD] > max_modified:
                        max_modified = dict_record[MODIFIED_DATE_FIELD]
                    else:
                        continue

                with Transformer() as transformer:
                    dict_record = transformer.transform(dict_record, schema, mdata)

                singer.write_record(stream.tap_stream_id, dict_record)
                counter.increment()
                count += 1
                if count % 5000 == 0:
                    write_bookmark(state, stream_name, max_modified)

    write_bookmark(state, stream_name, max_modified)
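The `count % 5000` flush above is a checkpointing pattern worth isolating: persist the bookmark periodically so an interrupted run loses little progress. In miniature, with a print standing in for write_bookmark:

count = 0
for _ in range(12_000):  # stand-in for the OData query results
    count += 1
    if count % 5000 == 0:
        print('checkpoint: persisting bookmark at record', count)
print('final bookmark write after', count, 'records')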
def sync_milestones(project):
    url = get_url("milestones", project['id'])
    with Transformer(pre_hook=transform_row) as transformer:
        for row in gen_request(url):
            transformed_row = transformer.transform(row, RESOURCES["milestones"]["schema"])
            if row["updated_at"] >= get_start("project_{}".format(project["id"])):
                singer.write_record("milestones", transformed_row, time_extracted=utils.now())
def sync(client, config, catalog, state):
    LOGGER.info('Starting Sync..')
    selected_streams = catalog.get_selected_streams(state)
    streams = []
    stream_keys = []
    with Transformer() as transformer:
        for catalog_entry in selected_streams:
            streams.append(catalog_entry)
            stream_keys.append(catalog_entry.stream)

        for catalog_entry in streams:
            stream = AVAILABLE_STREAMS[catalog_entry.stream](client=client,
                                                             config=config,
                                                             catalog=catalog,
                                                             state=state)
            LOGGER.info('Syncing stream: %s', catalog_entry.stream)
            stream.update_currently_syncing()
            stream.write_state()
            bookmark_date = stream.get_bookmark(stream.name, config['start_date'])
            bookmark_dttm = strptime_to_utc(bookmark_date)
            stream_schema = catalog_entry.schema.to_dict()
            stream.write_schema()
            stream_metadata = metadata.to_map(catalog_entry.metadata)
            new_bookmark = bookmark_dttm

            with singer.metrics.job_timer(job_type=stream.name):
                with singer.metrics.record_counter(endpoint=stream.name) as counter:
                    for page in stream.sync(bookmark_dttm):
                        transformed_records = transform(page)
                        for transformed in transformed_records:
                            record_bookmark = datetime.datetime.fromtimestamp(
                                transformed.get('last_modified') / 1000.0,
                                tz=timezone.utc)
                            new_bookmark = max(new_bookmark, record_bookmark)
                            if record_bookmark > bookmark_dttm:
                                singer.write_record(
                                    catalog_entry.stream,
                                    transformer.transform(transformed, stream_schema, stream_metadata))
                                counter.increment()

            stream.update_bookmark(stream.name, strftime(new_bookmark))
            stream.write_state()

    LOGGER.info('Finished Sync..')
def persist_records(catalog, stream_id, records):
    stream = catalog.get_stream(stream_id)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)
    with metrics.record_counter(stream_id) as counter:
        for record in records:
            with Transformer(integer_datetime_fmt=UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
                record = transformer.transform(record, schema, stream_metadata)
                singer.write_record(stream_id, record)
                counter.increment()
def sync():
    initialize_shopify_client()

    # Emit all schemas first so we have them for child streams
    for stream in Context.catalog["streams"]:
        if Context.is_selected(stream["tap_stream_id"]):
            singer.write_schema(stream["tap_stream_id"],
                                stream["schema"],
                                stream["key_properties"],
                                bookmark_properties=stream["replication_key"])
            Context.counts[stream["tap_stream_id"]] = 0

    # If there is a currently syncing stream bookmark, shuffle the
    # stream order so it gets synced first
    currently_sync_stream_name = Context.state.get('bookmarks', {}).get('currently_sync_stream')
    if currently_sync_stream_name:
        shuffle_streams(currently_sync_stream_name)

    # Loop over streams in catalog
    for catalog_entry in Context.catalog['streams']:
        stream_id = catalog_entry['tap_stream_id']
        stream = Context.stream_objects[stream_id]()

        if not Context.is_selected(stream_id):
            LOGGER.info('Skipping stream: %s', stream_id)
            continue

        LOGGER.info('Syncing stream: %s', stream_id)

        if not Context.state.get('bookmarks'):
            Context.state['bookmarks'] = {}
        Context.state['bookmarks']['currently_sync_stream'] = stream_id

        with Transformer() as transformer:
            for rec in stream.sync():
                extraction_time = singer.utils.now()
                record_schema = catalog_entry['schema']
                record_metadata = metadata.to_map(catalog_entry['metadata'])
                rec = transformer.transform(rec, record_schema, record_metadata)
                singer.write_record(stream_id, rec, time_extracted=extraction_time)
                Context.counts[stream_id] += 1

        Context.state['bookmarks'].pop('currently_sync_stream')
        singer.write_state(Context.state)

    LOGGER.info('----------------------')
    for stream_id, stream_count in Context.counts.items():
        LOGGER.info('%s: %d', stream_id, stream_count)
    LOGGER.info('----------------------')
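shuffle_streams itself is not shown here; one plausible reading of the resume behaviour is a rotation that puts the interrupted stream first, sketched with illustrative stream names:

streams = ['orders', 'customers', 'products']
currently_syncing = 'customers'  # from state['bookmarks']['currently_sync_stream']

if currently_syncing in streams:
    idx = streams.index(currently_syncing)
    streams = streams[idx:] + streams[:idx]  # rotate so the interrupted stream is synced first
print(streams)  # ['customers', 'products', 'orders']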
def process_records(catalog, #pylint: disable=too-many-branches
                    stream_name,
                    records,
                    time_extracted,
                    bookmark_field=None,
                    bookmark_type=None,
                    max_bookmark_value=None,
                    last_datetime=None,
                    last_integer=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # Transform record for Singer.io
            with Transformer() as transformer:
                transformed_record = transformer.transform(record, schema, stream_metadata)

                # Reset max_bookmark_value to new value if higher
                if bookmark_field and (bookmark_field in transformed_record):
                    if (max_bookmark_value is None) or \
                            (transformed_record[bookmark_field] > transform_datetime(max_bookmark_value)):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    if bookmark_type == 'integer':
                        # Keep only records whose bookmark is after the last_integer
                        if transformed_record[bookmark_field] >= last_integer:
                            write_record(stream_name, transformed_record, time_extracted=time_extracted)
                            counter.increment()
                    elif bookmark_type == 'datetime':
                        last_dttm = transform_datetime(last_datetime)
                        bookmark_dttm = transform_datetime(transformed_record[bookmark_field])
                        # Keep only records whose bookmark is after the last_datetime
                        if bookmark_dttm >= last_dttm:
                            write_record(stream_name, transformed_record, time_extracted=time_extracted)
                            counter.increment()
                else:
                    write_record(stream_name, transformed_record, time_extracted=time_extracted)
                    counter.increment()

        LOGGER.info('Stream: {}, Processed {} records'.format(stream_name, counter.value))
        return max_bookmark_value
def resume_syncing_bulk_query(sf, catalog_entry, job_id, state, counter):
    bulk = Bulk(sf)
    current_bookmark = singer.get_bookmark(state, catalog_entry['tap_stream_id'], 'JobHighestBookmarkSeen') or \
        sf.get_start_date(state, catalog_entry)
    current_bookmark = singer_utils.strptime_with_tz(current_bookmark)
    batch_ids = singer.get_bookmark(state, catalog_entry['tap_stream_id'], 'BatchIDs')

    start_time = singer_utils.now()
    stream = catalog_entry['stream']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry.get('metadata'))
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    schema = catalog_entry['schema']

    if not bulk.job_exists(job_id):
        LOGGER.info("Found stored Job ID that no longer exists, resetting bookmark and removing JobID from state.")
        return counter

    # Iterate over the remaining batches, removing them once they are synced
    for batch_id in batch_ids[:]:
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            for rec in bulk.get_batch_results(job_id, batch_id, catalog_entry):
                counter.increment()
                rec = transformer.transform(rec, schema)
                rec = fix_record_anytype(rec, schema)
                singer.write_message(
                    singer.RecordMessage(stream=(stream_alias or stream),
                                         record=rec,
                                         version=stream_version,
                                         time_extracted=start_time))

                # Update bookmark if necessary
                replication_key_value = replication_key and \
                    singer_utils.strptime_with_tz(rec[replication_key])
                if replication_key_value and \
                        start_time >= replication_key_value > current_bookmark:
                    current_bookmark = replication_key_value

        state = singer.write_bookmark(state,
                                      catalog_entry['tap_stream_id'],
                                      'JobHighestBookmarkSeen',
                                      singer_utils.strftime(current_bookmark))
        batch_ids.remove(batch_id)
        LOGGER.info("Finished syncing batch %s. Removing batch from state.", batch_id)
        LOGGER.info("Batches to go: %d", len(batch_ids))
        singer.write_state(state)

    return counter
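Note the `batch_ids[:]` copy above: iterating a copy is what makes in-loop removal safe. In miniature:

batch_ids = ['batch-1', 'batch-2', 'batch-3']
for batch_id in batch_ids[:]:  # iterate a copy so .remove() does not skip elements
    # ... sync the batch, then drop it from the resumable state ...
    batch_ids.remove(batch_id)
    print('finished', batch_id, '| batches to go:', len(batch_ids))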
def persist_records(catalog, stream_id, records):
    if records:  # check for empty array
        stream = catalog.get_stream(stream_id)
        schema = stream.schema.to_dict()
        stream_metadata = metadata.to_map(stream.metadata)
        with metrics.record_counter(stream_id) as counter:
            for record in records:
                with Transformer() as transformer:
                    record = transformer.transform(record, schema, stream_metadata)
                    singer.write_record(stream_id, record)
                    counter.increment()
def sync_deal_pipelines(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    schema = load_schema('deal_pipelines')
    singer.write_schema('deal_pipelines', schema, ['pipelineId'], catalog.get('stream_alias'))
    LOGGER.info('sync_deal_pipelines')
    data = request(get_url('deal_pipelines')).json()
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(row, schema)
            singer.write_record("deal_pipelines", record, catalog.get('stream_alias'),
                                time_extracted=utils.now())
    singer.write_state(STATE)
    return STATE
def process_records(catalog, stream_name, records):
    if records:
        stream = catalog.get_stream(stream_name)
        schema = stream.schema.to_dict()
        stream_metadata = metadata.to_map(stream.metadata)
        with metrics.record_counter(stream_name) as counter:
            for record in records:
                with Transformer() as transformer:
                    record = transformer.transform(record, schema, stream_metadata)
                    singer.write_record(stream_name, record)
                    counter.increment()
def sync_sub_stream(sub_stream_name, parent_obj, updates=False):
    """
    Given a parent object, retrieve its values for the specified substream.
    """
    extraction_time = singer.utils.now()

    if sub_stream_name == "invoice_line_items":
        object_list = parent_obj.lines
    elif sub_stream_name == "subscription_items":
        # parent_obj.items is a function that returns a dict iterator, so use the attribute
        object_list = parent_obj.get("items")
    elif sub_stream_name == "payout_transactions":
        payout_id = parent_obj['id']
        acct_id = Context.config.get('account_id')
        # Balance transaction history with a payout id param
        # provides the link of transactions to payouts
        object_list = stripe.BalanceTransaction.list(limit=100,
                                                     stripe_account=acct_id,
                                                     payout=payout_id)
    else:
        raise Exception("Attempted to sync substream that is not implemented: {}"
                        .format(sub_stream_name))

    with Transformer(singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
        iterator = get_object_list_iterator(object_list)
        for sub_stream_obj in iterator:
            obj_ad_dict = sub_stream_obj.to_dict_recursive()

            if sub_stream_name == "invoice_line_items":
                # Synthetic addition of a key to the record we sync
                obj_ad_dict["invoice"] = parent_obj.id
            elif sub_stream_name == "payout_transactions":
                # payout_transactions is a join table
                obj_ad_dict = {"id": obj_ad_dict['id'], "payout_id": parent_obj['id']}

            rec = transformer.transform(
                unwrap_data_objects(obj_ad_dict),
                Context.get_catalog_entry(sub_stream_name)['schema'],
                metadata.to_map(Context.get_catalog_entry(sub_stream_name)['metadata']))

            # NB: Older structures (such as invoice_line_items) may not have had their ID present.
            #     Skip these if they don't match the structure we expect.
            if "id" in rec:
                singer.write_record(sub_stream_name, rec, time_extracted=extraction_time)
                if updates:
                    Context.updated_counts[sub_stream_name] += 1
                else:
                    Context.new_counts[sub_stream_name] += 1
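A tiny sketch of the payout_transactions join-table shaping above (the values are illustrative): the emitted record keeps only the child id plus the parent's id.

parent_obj = {'id': 'po_123'}
obj_ad_dict = {'id': 'txn_456', 'amount': 100, 'currency': 'usd'}
# Keep only the child id plus the parent's id; everything else is dropped.
obj_ad_dict = {'id': obj_ad_dict['id'], 'payout_id': parent_obj['id']}
print(obj_ad_dict)  # {'id': 'txn_456', 'payout_id': 'po_123'}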
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)

    params = {'count': 250, 'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key], catalog.get('stream_alias'))

    # Append all the properties fields for deals to the request
    additional_properties = schema.get("properties").get("properties").get("properties")
    for key in additional_properties.keys():
        params['properties'].append(key)

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals', "hasMore", ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # HubSpot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # HubSpot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(row, schema, mdata)
                singer.write_record("deals", record, catalog.get('stream_alias'), time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key, utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict, stream: Dict) -> int:
    """
    Sync a given CSV file found in S3.
    :param config: tap configuration
    :param s3_path: file path given by S3
    :param table_spec: table specs
    :param stream: stream data
    :return: number of streamed records
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    sync_one_one = config.get('sync_one_one', "True")
    # Membership checks, not bare truthiness: the string "False" is truthy,
    # so `if sync_one_one or ...` would wrongly enable one-one sync.
    if sync_one_one in (True, "True", "true"):
        sync_one_one = True
    elif sync_one_one in (False, "False", "false"):
        sync_one_one = False
    else:
        raise Exception("Don't understand sync_one_one param in config, must be boolean")
    table_name = table_spec['table_name']

    s3_file_handle, tags = s3.get_file_handle_custom(config, s3_path)
    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than raising this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption, but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)
    iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        if not sync_one_one:
            custom_columns = {
                s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
                s3.SDC_SOURCE_FILE_COLUMN: s3_path,
                # index zero, +1 for header row
                s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
            }
            rec = {**row, **custom_columns}
            with Transformer() as transformer:
                to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))
            write_record(table_name, to_write)
        if sync_one_one:
            write_message(OneOneMessage(table_name, row,
                                        TagSet=tags,
                                        sync_one_one=sync_one_one,
                                        _sdc_source_file=s3_path))
        records_synced += 1

    return records_synced
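A sketch of the _sdc_* source-metadata merge above; the column-name strings here are assumptions standing in for the s3.SDC_* constants:

row = {'name': 'widget', 'qty': '3'}
records_synced = 0
custom_columns = {
    '_sdc_source_bucket': 'my-bucket',          # s3.SDC_SOURCE_BUCKET_COLUMN (assumed value)
    '_sdc_source_file': 'exports/widgets.csv',  # s3.SDC_SOURCE_FILE_COLUMN (assumed value)
    '_sdc_source_lineno': records_synced + 2,   # +2: zero-based index plus the header row
}
rec = {**row, **custom_columns}
print(rec)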
def sync_engagements(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("engagements")
    bookmark_key = 'lastUpdated'
    singer.write_schema("engagements", schema, ["engagement_id"], [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "engagements", bookmark_key)

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. In order to combat this, we must save a lookback window
    # that handles the duration of time that this stream was last syncing,
    # and look back by that amount on the next sync.
    last_sync_duration = get_previous_time_window(STATE, "engagements")
    current_sync_start = utils.now()
    if has_bookmark(STATE, "engagements", bookmark_key) and last_sync_duration is not None:
        LOGGER.info(("Last sync of engagements lasted {} seconds. Adjusting bookmark by this "
                     "amount to account for race conditions with record updates.").format(last_sync_duration))
        start = utils.strptime_to_utc(start) - datetime.timedelta(seconds=last_sync_duration)
        start = utils.strftime(start)

    max_bk_value = start
    LOGGER.info("sync_engagements from %s", start)

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, start)
    singer.write_state(STATE)

    url = get_url("engagements_all")
    params = {'limit': 250}
    top_level_key = "results"
    engagements = gen_request(STATE, 'engagements', url, params, top_level_key, "hasMore", ["offset"], ["offset"])

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for engagement in engagements:
            record = bumble_bee.transform(engagement, schema, mdata)
            if record['engagement'][bookmark_key] >= start:
                # Hoist PK and bookmark field to the top-level record
                record['engagement_id'] = record['engagement']['id']
                record[bookmark_key] = record['engagement'][bookmark_key]
                singer.write_record("engagements", record, catalog.get('stream_alias'),
                                    time_extracted=time_extracted)
                if record['engagement'][bookmark_key] >= max_bk_value:
                    max_bk_value = record['engagement'][bookmark_key]

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, max_bk_value)
    # Write duration for next sync's lookback window
    STATE = write_stream_duration(STATE, 'engagements', current_sync_start, utils.now())
    singer.write_state(STATE)
    return STATE
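The lookback adjustment itself is plain timedelta arithmetic; a sketch with illustrative values:

import datetime

start = datetime.datetime(2023, 1, 1, tzinfo=datetime.timezone.utc)  # previous bookmark
last_sync_duration = 3600  # seconds the previous engagements sync took
adjusted = start - datetime.timedelta(seconds=last_sync_duration)
print(adjusted.isoformat())  # 2022-12-31T23:00:00+00:00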
def sync_sub_stream(sub_stream_name, parent_obj, parent_replication_key,
                    save_bookmarks=True, updates=False):
    """
    Given a parent object, retrieve its values for the specified substream.
    """
    extraction_time = singer.utils.now()

    if sub_stream_name == "invoice_line_items":
        object_list = parent_obj.lines
    elif sub_stream_name == "subscription_items":
        # parent_obj.items is a function that returns a dict iterator, so use the attribute
        object_list = parent_obj.get("items")
    else:
        raise Exception("Attempted to sync substream that is not implemented: {}"
                        .format(sub_stream_name))

    with Transformer(singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
        iterator = get_object_list_iterator(object_list)
        for sub_stream_obj in iterator:
            obj_ad_dict = sub_stream_obj.to_dict_recursive()

            if sub_stream_name == "invoice_line_items":
                # Synthetic addition of a key to the record we sync
                obj_ad_dict["invoice"] = parent_obj.id

            rec = transformer.transform(
                unwrap_data_objects(obj_ad_dict),
                Context.get_catalog_entry(sub_stream_name)['schema'],
                metadata.to_map(Context.get_catalog_entry(sub_stream_name)['metadata']))

            # NB: Older structures (such as invoice_line_items) may not have had their ID present.
            #     Skip these if they don't match the structure we expect.
            if "id" in rec:
                singer.write_record(sub_stream_name, rec, time_extracted=extraction_time)
                if updates:
                    Context.updated_counts[sub_stream_name] += 1
                else:
                    Context.new_counts[sub_stream_name] += 1

            sub_stream_bookmark = parent_obj.get(parent_replication_key)
            if save_bookmarks:
                singer.write_bookmark(Context.state, sub_stream_name,
                                      parent_replication_key, sub_stream_bookmark)
def process_records(catalog, #pylint: disable=too-many-branches
                    stream_name,
                    records,
                    time_extracted,
                    bookmark_field=None,
                    bookmark_type=None,
                    max_bookmark_value=None,
                    last_datetime=None,
                    last_integer=None,
                    parent=None,
                    parent_id=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # If child object, add parent_id to record
            if parent_id and parent:
                record[parent + '_id'] = parent_id

            # Transform record for Singer.io
            with Transformer() as transformer:
                transformed_record = transformer.transform(record, schema, stream_metadata)

                # Reset max_bookmark_value to new value if higher
                if bookmark_field and (bookmark_field in transformed_record):
                    if (max_bookmark_value is None) or \
                            (transformed_record[bookmark_field] > max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    if bookmark_type == 'integer':
                        # Keep only records whose bookmark is after the last_integer
                        if transformed_record[bookmark_field] >= last_integer:
                            write_record(stream_name, transformed_record, time_extracted=time_extracted)
                            counter.increment()
                    elif bookmark_type == 'datetime':
                        last_dttm = transformer._transform_datetime(last_datetime)
                        bookmark_dttm = transformer._transform_datetime(transformed_record[bookmark_field])
                        # Keep only records whose bookmark is after the last_datetime
                        if bookmark_dttm >= last_dttm:
                            write_record(stream_name, transformed_record, time_extracted=time_extracted)
                            counter.increment()
                else:
                    write_record(stream_name, transformed_record, time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value
def sync_campaign_ids_endpoint(sdk_client, campaign_ids, stream_schema, stream, stream_metadata):
    discovered_schema = load_schema(stream)
    field_list = get_field_list(discovered_schema, stream, stream_metadata)

    discovered_schema['properties']['_sdc_customer_id'] = {
        'description': 'Profile ID',
        'type': 'string',
        'field': "customer_id"
    }
    primary_keys = GENERIC_ENDPOINT_MAPPINGS[stream]['primary_keys']
    write_schema(stream, discovered_schema, primary_keys)

    LOGGER.info("Syncing %s for customer %s", stream, sdk_client.client_customer_id)

    for safe_selector in get_campaign_ids_safe_selectors(sdk_client, campaign_ids, stream):
        start_index = 0
        while True:
            page = get_campaign_ids_filtered_page(sdk_client, field_list, safe_selector, stream, start_index)
            if page['totalNumEntries'] > GOOGLE_MAX_RESULTSET_SIZE:
                # Operands ordered to match the comparison: total > max.
                raise Exception("Too many {} ({} > {}) for customer {}, campaigns {}".format(
                    stream,
                    page['totalNumEntries'],
                    GOOGLE_MAX_RESULTSET_SIZE,
                    sdk_client.client_customer_id,
                    campaign_ids))
            if 'entries' in page:
                with metrics.record_counter(stream) as counter:
                    for entry in page['entries']:
                        obj = suds_to_dict(entry)
                        obj['_sdc_customer_id'] = sdk_client.client_customer_id
                        with Transformer(singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                            bumble_bee.pre_hook = transform_pre_hook
                            record = bumble_bee.transform(obj, discovered_schema)
                            singer.write_record(stream, record)
                            counter.increment()
            start_index += PAGE_SIZE
            if start_index > int(page['totalNumEntries']):
                break
    LOGGER.info("Done syncing %s for customer_id %s", stream, sdk_client.client_customer_id)
def sync_deal_pipelines(STATE, catalog):
    schema = load_schema('deal_pipelines')
    singer.write_schema('deal_pipelines', schema, ['pipelineId'], catalog.get('stream_alias'))
    LOGGER.info('sync_deal_pipelines')
    data = request(get_url('deal_pipelines')).json()
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(row, schema)
            singer.write_record("deal_pipelines", record, catalog.get('stream_alias'))
    singer.write_state(STATE)
    return STATE
def tidy_response(stream, raw_row, needs_coalescing, replication_key):
    if needs_coalescing:
        bookmark_value = raw_row[replication_key[0]] or raw_row[replication_key[1]]
    else:
        bookmark_value = raw_row[replication_key]
    with Transformer() as transformer:
        tidy_row = transformer.transform(raw_row,
                                         stream.schema.to_dict(),
                                         metadata.to_map(stream.metadata))
    return tidy_row, bookmark_value
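The coalescing branch relies on Python's `or` returning the first truthy operand, like SQL COALESCE for non-null values. In miniature, with illustrative field names:

raw_row = {'updated_at': None, 'created_at': '2023-01-01T00:00:00Z'}
replication_key = ('updated_at', 'created_at')
# First non-null of the paired keys wins.
bookmark_value = raw_row[replication_key[0]] or raw_row[replication_key[1]]
print(bookmark_value)  # 2023-01-01T00:00:00Z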
def write_page(self, page):
    stream = Context.get_catalog_entry(self.tap_stream_id)
    stream_metadata = metadata.to_map(stream.metadata)
    extraction_time = singer.utils.now()
    for rec in page:
        with Transformer() as transformer:
            rec = transformer.transform(rec, stream.schema.to_dict(), stream_metadata)
        singer.write_record(self.tap_stream_id, rec, time_extracted=extraction_time)
    with metrics.record_counter(self.tap_stream_id) as counter:
        counter.increment(len(page))
def write_members(members):
    """Process members array and output SCHEMA and RECORD messages."""
    members_stream = CATALOG.get_stream("members")
    write_catalog_schema(members_stream)
    for member in members:
        user = member.pop("user")
        member.update(user)
        with Transformer(pre_hook=transform_datetimes_hook) as xform:
            member = xform.transform(member, members_stream.schema.to_dict())
        singer.write_record("members", member)
def write_page(self, ctx, page):
    """Formats a list of records in place and outputs the data to stdout."""
    stream = ctx.catalog.get_stream(self.tap_stream_id)
    with Transformer() as transformer:
        for rec in page:
            singer.write_record(
                self.tap_stream_id,
                transformer.transform(rec,
                                      stream.schema.to_dict(),
                                      metadata.to_map(stream.metadata)))
    self.metrics(page)