def sync(client, config, catalog, state):
    start_date = config.get('start_date')

    # Get selected_streams from catalog, based on state last_stream
    #   last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # Get current datetime (now_dt_str) for query parameters
    now_dttm = utils.now()
    now_dt_str = strftime(now_dttm)[0:10]
    # Reference: https://support.google.com/webmasters/answer/96568?hl=en
    #   There is some delay/lag in Google Search Console results reconciliation
    attribution_start_dttm = now_dttm - timedelta(days=ATTRIBUTION_DAYS)

    # Loop through selected_streams
    for stream_name in selected_streams:
        LOGGER.info('STARTED Syncing: {}'.format(stream_name))
        update_currently_syncing(state, stream_name)
        write_schema(catalog, stream_name)
        endpoint_config = STREAMS[stream_name]
        bookmark_field = next(iter(endpoint_config.get('replication_keys', [])), None)
        body_params = endpoint_config.get('body', {})
        endpoint_total = 0
        # Initialize body
        body = endpoint_config.get('body', {})

        # Loop through sites from config site_urls
        site_list = []
        if 'site_urls' in config:
            site_list = config['site_urls'].replace(" ", "").split(",")
        for site in site_list:
            site_total = 0
            # Skip/ignore sitemaps for domain property sites
            # Reference issue: https://github.com/googleapis/google-api-php-client/issues/1607
            #   "...sitemaps API does not support domain property urls at this time."
            if stream_name == 'sitemaps' and site[0:9] == 'sc-domain':
                LOGGER.info('Skipping Site: {}'.format(site))
                LOGGER.info('  Sitemaps API does not support domain property urls at this time.')
            else:  # Not sitemaps w/ sc-domain site
                LOGGER.info('STARTED Syncing: {}, Site: {}'.format(stream_name, site))
                site_encoded = quote(site, safe='')
                path = endpoint_config.get('path').format(site_encoded)

                # Set dimensions_list for performance reports
                if stream_name == 'performance_report_custom':
                    dimensions_list = []
                    # Create dimensions_list from catalog breadcrumb
                    stream = catalog.get_stream(stream_name)
                    mdata = metadata.to_map(stream.metadata)
                    dimensions_all = ['date', 'country', 'device', 'page', 'query']
                    for dim in dimensions_all:
                        if singer.should_sync_field(
                                singer.metadata.get(mdata, ('properties', dim), 'inclusion'),
                                singer.metadata.get(mdata, ('properties', dim), 'selected')):
                            # metadata is selected for the dimension
                            dimensions_list.append(dim)
                    body_params['dimensions'] = dimensions_list
                dimensions_list = body_params.get('dimensions')
                LOGGER.info('stream: {}, dimensions_list: {}'.format(stream_name, dimensions_list))

                # Loop through each sub type
                sub_types = endpoint_config.get('sub_types', ['self'])
                for sub_type in sub_types:
                    sub_type_total = 0

                    # Initialize date window
                    if stream_name.startswith('performance_report'):
                        reports_dttm_str = get_bookmark(
                            state, stream_name, site, sub_type, start_date)
                        reports_dttm = strptime_to_utc(reports_dttm_str)
                        if reports_dttm < attribution_start_dttm:
                            start_dttm = reports_dttm
                        else:
                            start_dttm = attribution_start_dttm
                        end_dttm = start_dttm + timedelta(days=DATE_WINDOW_SIZE)
                        if end_dttm > now_dttm:
                            end_dttm = now_dttm
                    else:
                        start_dttm = strptime_to_utc(start_date)
                        end_dttm = now_dttm

                    # Date window loop
                    while start_dttm < now_dttm:
                        start_str = strftime(start_dttm)[0:10]
                        end_str = strftime(end_dttm)[0:10]
                        if stream_name.startswith('performance_report'):
                            body = {
                                'searchType': sub_type,
                                'startDate': start_str,
                                'endDate': end_str,
                                **body_params
                            }
                        else:
                            body = None

                        LOGGER.info('START Syncing Stream: {}, Site: {}, Type: {}, {} to {}'.format(
                            stream_name, site, sub_type, start_str, end_str))
                        total_records = sync_endpoint(
                            client=client,
                            catalog=catalog,
                            state=state,
                            start_date=start_date,
                            stream_name=stream_name,
                            site=site,
                            sub_type=sub_type,
                            dimensions_list=dimensions_list,
                            path=path,
                            endpoint_config=endpoint_config,
                            api_method=endpoint_config.get('api_method', 'GET'),
                            pagination=endpoint_config.get('pagination', 'none'),
                            static_params=endpoint_config.get('params', {}),
                            bookmark_field=bookmark_field,
                            data_key=endpoint_config.get('data_key', None),
                            body_params=body,
                            id_fields=endpoint_config.get('key_properties'))

                        # Increment totals
                        endpoint_total = endpoint_total + total_records
                        site_total = site_total + total_records
                        sub_type_total = sub_type_total + total_records

                        LOGGER.info('FINISHED Syncing Stream: {}, Site: {}, Type: {}, {} to {}'.format(
                            stream_name, site, sub_type, start_str, end_str))
                        LOGGER.info('  Records Synced for Date Window: {}'.format(total_records))

                        # Set next date window
                        start_dttm = end_dttm
                        end_dttm = start_dttm + timedelta(days=DATE_WINDOW_SIZE)
                        if end_dttm > now_dttm:
                            end_dttm = now_dttm
                    # End date window loop

                    LOGGER.info('FINISHED Syncing Stream: {}, Site: {}, Type: {}'.format(
                        stream_name, site, sub_type))
                    LOGGER.info('  Records Synced for Type: {}'.format(sub_type_total))
                # End sub-type loop
            # End else: Not sitemaps w/ sc-domain site

            LOGGER.info('FINISHED Syncing Stream: {}, Site: {}'.format(stream_name, site))
            LOGGER.info('  Records Synced for Site: {}'.format(site_total))
        # End site loop

        LOGGER.info('FINISHED Syncing Stream: {}'.format(stream_name))
        LOGGER.info('  Records Synced for Stream: {}'.format(endpoint_total))
        update_currently_syncing(state, None)
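# The sync above brackets each stream with update_currently_syncing(), which is not defined in
# this section. A minimal sketch of the usual Singer pattern, assuming the standard
# singer-python helpers (illustrative only; the tap's own helper may differ slightly):
def update_currently_syncing(state, stream_name):
    # Clear the marker when a stream finishes (stream_name is None); otherwise record which
    # stream is in flight so an interrupted run can resume from it.
    if (stream_name is None) and ('currently_syncing' in state):
        del state['currently_syncing']
    else:
        singer.set_currently_syncing(state, stream_name)
    singer.write_state(state)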
def is_bookmark_old(self, value):
    bookmark = self.get_bookmark()
    return utils.strptime_to_utc(value) >= bookmark
def get_current_sync_start(state, tap_stream_id):
    current_sync_start_value = singer.get_bookmark(state, tap_stream_id, "current_sync_start")
    if current_sync_start_value is None:
        return current_sync_start_value
    return utils.strptime_to_utc(current_sync_start_value)
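# get_current_sync_start() has a writer counterpart, write_current_sync_start(), which the
# sync_companies functions later in this section rely on but which is not shown here. A minimal
# sketch under that assumption (illustrative only):
def write_current_sync_start(state, tap_stream_id, start):
    # Persist (or clear, when start is None) the wall-clock start of the current sync so the
    # replication bookmark is never advanced past it.
    value = start
    if start is not None:
        value = utils.strftime(start)
    return singer.write_bookmark(state, tap_stream_id, "current_sync_start", value)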
def string_to_datetime(value):
    try:
        return strftime(strptime_to_utc(value))
    except Exception as ex:
        LOGGER.warning("%s, (%s)", ex, value)
        return None
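# Example behavior (assuming singer-python's strftime/strptime_to_utc): a parseable timestamp
# is normalized to the canonical UTC string, bad input degrades to None with a warning.
#   string_to_datetime("2021-01-01T00:00:00Z")  ->  "2021-01-01T00:00:00.000000Z"
#   string_to_datetime("not-a-date")            ->  None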
def sync_event_updates(stream_name):
    '''
    Get updates via events endpoint

    Look at 'events update' bookmark and pull events after that
    '''
    LOGGER.info("Started syncing event based updates")

    bookmark_value = singer.get_bookmark(Context.state, stream_name + '_events', 'updates_created') or \
        int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())
    max_created = bookmark_value
    date_window_start = max_created
    date_window_end = max_created + 604800  # Number of seconds in a week

    stop_paging = False

    # Create a map to relate event object ids to timestamps
    updated_object_timestamps = {}

    while not stop_paging:
        extraction_time = singer.utils.now()

        response = STREAM_SDK_OBJECTS['events']['sdk_object'].list(**{
            "limit": 100,
            "type": STREAM_TO_TYPE_FILTER[stream_name]['type'],
            "stripe_account": Context.config.get('account_id'),
            # None passed to starting_after appears to retrieve
            # all of them so this should always be safe.
            "created[gte]": date_window_start,
            "created[lt]": date_window_end,
        })

        # If no results, and we are not up to current time
        if not len(response) and date_window_end > extraction_time.timestamp():  # pylint: disable=len-as-condition
            stop_paging = True

        for events_obj in response.auto_paging_iter():
            event_resource_obj = events_obj.data.object
            sub_stream_name = SUB_STREAMS.get(stream_name)

            # Check whether we should sync the event based on its created time
            if not should_sync_event(events_obj,
                                     STREAM_TO_TYPE_FILTER[stream_name]['object'],
                                     updated_object_timestamps):
                continue

            # Sync the event, as it's the first time we've seen it or it's the most recent version
            with Transformer(singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
                event_resource_metadata = metadata.to_map(
                    Context.get_catalog_entry(stream_name)['metadata'])

                # Filter out line items with null ids
                if isinstance(events_obj.get('data').get('object'), stripe.Invoice):
                    invoice_obj = events_obj.get('data', {}).get('object', {})
                    line_items = invoice_obj.get('lines', {}).get('data')

                    if line_items:
                        filtered_line_items = [line_item for line_item in line_items
                                               if line_item.get('id')]
                        invoice_obj['lines']['data'] = filtered_line_items

                rec = recursive_to_dict(event_resource_obj)
                rec = unwrap_data_objects(rec)
                rec = reduce_foreign_keys(rec, stream_name)
                rec["updated"] = events_obj.created
                rec = transformer.transform(
                    rec,
                    Context.get_catalog_entry(stream_name)['schema'],
                    event_resource_metadata)

                if events_obj.created >= bookmark_value:
                    if rec.get('id') is not None:
                        singer.write_record(stream_name, rec, time_extracted=extraction_time)
                        Context.updated_counts[stream_name] += 1

                        # Delete events should be synced but not their subobjects
                        if events_obj.get('type', '').endswith('.deleted'):
                            continue

                        if sub_stream_name and Context.is_selected(sub_stream_name):
                            if event_resource_obj:
                                sync_sub_stream(sub_stream_name, event_resource_obj, updates=True)

                if events_obj.created > max_created:
                    max_created = events_obj.created

        date_window_start = date_window_end
        date_window_end = date_window_end + 604800
        singer.write_bookmark(Context.state,
                              stream_name + '_events',
                              'updates_created',
                              max_created)
        singer.write_state(Context.state)

    singer.write_state(Context.state)
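# should_sync_event() (called above but not shown) guards against emitting stale versions of an
# object when multiple events reference it. A rough sketch of that dedupe idea, assuming the map
# holds object id -> newest event timestamp seen so far (illustrative only; not the tap's exact
# implementation):
def should_sync_event(events_obj, object_type, updated_object_timestamps):
    event_object = events_obj.data.object
    # Only consider events for the object type this stream cares about.
    if event_object.object != object_type:
        return False
    newest_seen = updated_object_timestamps.get(event_object.id)
    if newest_seen is None or events_obj.created > newest_seen:
        # First (or newer) event for this object: record it and sync.
        updated_object_timestamps[event_object.id] = events_obj.created
        return True
    return False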
def sync(client, config, catalog, state): if 'start_date' in config: start_date = config['start_date'] # LOGGER.info('start_date = {}'.format(start_date)) # Get datetimes for endpoint parameters communications_dttm_str = get_bookmark(state, 'communications', 'self', start_date) communications_dt_str = transform_datetime(communications_dttm_str)[:10] # LOGGER.info('communications bookmark_date = {}'.format(communications_dt_str)) deposit_transactions_dttm_str = get_bookmark(state, 'deposit_transactions', 'self', start_date) deposit_transactions_dt_str = transform_datetime( deposit_transactions_dttm_str)[:10] # LOGGER.info('deposit_transactions bookmark_date = {}'.format(deposit_transactions_dt_str)) loan_transactions_dttm_str = get_bookmark(state, 'loan_transactions', 'self', start_date) loan_transactions_dt_str = transform_datetime( loan_transactions_dttm_str)[:10] loan_transactions_dttm = strptime_to_utc(loan_transactions_dt_str) clients_dttm_str = get_bookmark(state, 'clients', 'self', start_date) clients_dt_str = transform_datetime(clients_dttm_str)[:10] groups_dttm_str = get_bookmark(state, 'groups', 'self', start_date) groups_dt_str = transform_datetime(groups_dttm_str)[:10] lookback_days = int(config.get('lookback_window', LOOKBACK_DEFAULT)) lookback_date = utils.now() - timedelta(lookback_days) if loan_transactions_dttm > lookback_date: loan_transactions_dt_str = transform_datetime( strftime(lookback_date))[:10] # LOGGER.info('loan_transactions bookmark_date = {}'.format(loan_transactions_dt_str)) # endpoints: API URL endpoints to be called # properties: # <root node>: Plural stream name for the endpoint # path: API endpoint relative path, when added to the base URL, creates the full path # api_version: v1 or v2 (default v2). # api_method: GET or POST (default GET). # params: Query, sort, and other endpoint specific parameters # data_key: JSON element containing the records for the endpoint # bookmark_query_field: Typically a date-time field used for filtering the query # bookmark_field: Replication key field, typically a date-time, used for filtering the results # and setting the state # bookmark_type: Data type for bookmark, integer or datetime # id_fields: Primary key (and other IDs) from the Parent stored when store_ids is true. 
# children: A collection of child endpoints (where the endpoint path includes the parent id) # parent: On each of the children, the singular stream name for parent element # Details Level: https://api.mambu.com/?http#detail-level, FULL includes custom fields endpoints = { 'branches': { 'path': 'branches', 'api_version': 'v2', 'api_method': 'GET', 'params': { 'sortBy': 'lastModifiedDate:ASC', 'detailsLevel': 'FULL', 'paginationDetails': 'ON' }, 'bookmark_field': 'last_modified_date', 'bookmark_type': 'datetime', 'id_fields': ['id'] }, 'communications': { 'path': 'communications/messages:search', 'api_version': 'v2', 'api_method': 'POST', 'params': { 'detailsLevel': 'FULL' }, 'body': [{ 'field': 'state', 'operator': 'EQUALS', 'value': 'SENT' }, { 'field': 'creationDate', 'operator': 'AFTER', 'value': communications_dt_str }], 'bookmark_field': 'creation_date', 'bookmark_type': 'datetime', 'id_fields': ['encoded_key'] }, 'centres': { 'path': 'centres', 'api_version': 'v2', 'api_method': 'GET', 'params': { 'sortBy': 'lastModifiedDate:ASC', 'detailsLevel': 'FULL', 'paginationDetails': 'ON' }, 'bookmark_field': 'last_modified_date', 'bookmark_type': 'datetime', 'id_fields': ['id'] }, 'clients': { 'path': 'clients:search', 'api_version': 'v2', 'api_method': 'POST', 'params': { 'detailsLevel': 'FULL' }, 'body': { "sortingCriteria": { "field": "lastModifiedDate", "order": "ASC" }, "filterCriteria": [{ "field": "lastModifiedDate", "operator": "AFTER", "value": clients_dt_str }] }, 'bookmark_field': 'last_modified_date', 'bookmark_type': 'datetime', 'id_fields': ['id'] }, 'credit_arrangements': { 'path': 'creditarrangements', 'api_version': 'v2', 'api_method': 'GET', 'params': { 'sortBy': 'creationDate:ASC', 'detailsLevel': 'FULL', 'paginationDetails': 'ON' }, 'bookmark_field': 'last_modified_date', 'bookmark_type': 'datetime', 'id_fields': ['id'] }, 'custom_field_sets': { 'path': 'customfieldsets', 'api_version': 'v1', 'api_method': 'GET', 'params': {}, 'id_fields': ['id'] }, 'deposit_accounts': { 'path': 'deposits', 'api_version': 'v2', 'api_method': 'GET', 'params': { 'sortBy': 'lastModifiedDate:ASC', 'detailsLevel': 'FULL' }, 'bookmark_field': 'last_modified_date', 'bookmark_type': 'datetime', 'id_fields': ['id'], 'store_ids': True, 'children': { 'cards': { 'path': 'deposits/{}/cards', 'api_version': 'v2', 'api_method': 'GET', 'params': { 'detailsLevel': 'FULL' }, 'id_fields': ['deposit_id', 'reference_token'], 'parent': 'deposit' } } }, 'deposit_products': { 'path': 'savingsproducts', 'api_version': 'v1', 'api_method': 'GET', 'params': { "fullDetails": True }, 'bookmark_field': 'last_modified_date', 'bookmark_type': 'datetime', 'id_fields': ['id'] }, 'deposit_transactions': { 'path': 'deposits/transactions:search', 'api_version': 'v2', 'api_method': 'POST', 'params': { 'detailsLevel': 'FULL' }, 'body': { "sortingCriteria": { "field": "creationDate", "order": "ASC" }, "filterCriteria": [{ "field": "creationDate", "operator": "AFTER", "value": deposit_transactions_dt_str }] }, 'bookmark_field': 'creation_date', 'bookmark_type': 'datetime', 'id_fields': ['encoded_key'] }, 'groups': { 'path': 'groups:search', 'api_version': 'v2', 'api_method': 'POST', 'params': { 'detailsLevel': 'FULL' }, 'body': { "sortingCriteria": { "field": "lastModifiedDate", "order": "ASC" }, "filterCriteria": [{ "field": "lastModifiedDate", "operator": "AFTER", "value": groups_dt_str }] }, 'bookmark_field': 'last_modified_date', 'bookmark_type': 'datetime', 'id_fields': ['id'] }, 'loan_accounts': { 'path': 'loans', 
'api_version': 'v2', 'api_method': 'GET', 'params': { 'sortBy': 'lastModifiedDate:ASC', 'detailsLevel': 'FULL', 'paginationDetails': 'ON' }, 'bookmark_field': 'last_modified_date', 'bookmark_type': 'datetime', 'id_fields': ['id'], 'children': { 'loan_repayments': { 'path': 'loans/{}/repayments', 'api_version': 'v1', 'api_method': 'GET', 'params': { 'detailsLevel': 'FULL', 'paginationDetails': 'ON' }, 'id_fields': ['encoded_key'], 'parent': 'loan_accounts' } } }, 'loan_products': { 'path': 'loanproducts', 'api_version': 'v1', 'api_method': 'GET', 'params': { "fullDetails": True }, 'bookmark_field': 'last_modified_date', 'bookmark_type': 'datetime', 'id_fields': ['id'] }, 'loan_transactions': { 'path': 'loans/transactions:search', 'api_version': 'v2', 'api_method': 'POST', 'params': { 'detailsLevel': 'FULL' }, 'body': { "sortingCriteria": { "field": "creationDate", "order": "ASC" }, "filterCriteria": [{ "field": "creationDate", "operator": "AFTER", "value": loan_transactions_dt_str }] }, 'bookmark_field': 'creation_date', 'bookmark_type': 'datetime', 'id_fields': ['encoded_key'] }, 'tasks': { 'path': 'tasks', 'api_version': 'v2', 'api_method': 'GET', 'params': { 'sortBy': 'lastModifiedDate:ASC', 'detailsLevel': 'FULL', 'paginationDetails': 'ON' }, 'bookmark_field': 'last_modified_date', 'bookmark_type': 'datetime', 'id_fields': ['id'] }, 'users': { 'path': 'users', 'api_version': 'v2', 'api_method': 'GET', 'params': { 'sortBy': 'lastModifiedDate:ASC', 'detailsLevel': 'FULL', 'paginationDetails': 'ON' }, 'bookmark_field': 'last_modified_date', 'bookmark_type': 'datetime', 'id_fields': ['id'] }, 'gl_accounts': { 'path': 'glaccounts', 'api_version': 'v1', 'api_method': 'GET', 'params': { 'type': '{sub_type}' }, 'id_fields': ['gl_code'], 'bookmark_field': 'last_modified_date', 'bookmark_type': 'datetime', 'sub_types': ['ASSET', 'LIABILITY', 'EQUITY', 'INCOME', 'EXPENSE'] }, 'gl_journal_entries': { 'path': 'gljournalentries/search', 'api_version': 'v1', 'api_method': 'POST', 'body': { "filterConstraints": [{ "filterSelection": "CREATION_DATE", "filterElement": "BETWEEN", "value": '{gl_journal_entries_from_dt_str}', "secondValue": "{now_date_str}" }] }, 'id_fields': ['entry_id'], 'bookmark_field': 'booking_date', 'bookmark_type': 'datetime' }, 'activities': { 'path': 'activities', 'api_version': 'v1', 'api_method': 'GET', 'params': { 'from': '{activities_from_dt_str}', 'to': '{now_date_str}' }, 'id_fields': ['encoded_key'], 'bookmark_field': 'timestamp', 'bookmark_type': 'datetime' }, 'index_rate_sources': { 'path': 'indexratesources', 'api_version': 'v2', 'api_method': 'GET', 'id_fields': ['encoded_key'], 'params': {} }, 'installments': { 'path': 'installments', 'api_version': 'v2', 'api_method': 'GET', 'id_fields': ['encoded_key'], 'params': { 'dueFrom': '{installments_from_dt_str}', 'dueTo': '{now_date_str}' }, 'bookmark_field': 'last_paid_date', 'bookmark_type': 'datetime' } } selected_streams = get_selected_streams(catalog) LOGGER.info('selected_streams: {}'.format(selected_streams)) if not selected_streams: return # last_stream = Previous currently synced stream, if the load was interrupted last_stream = singer.get_currently_syncing(state) LOGGER.info('last/currently syncing stream: {}'.format(last_stream)) # For each endpoint (above), determine if the stream should be streamed # (based on the catalog and last_stream), then sync those streams. 
for stream_name, endpoint_config in endpoints.items(): should_stream, last_stream = should_sync_stream( selected_streams, last_stream, stream_name) if should_stream: # loop through each sub type sub_types = endpoint_config.get('sub_types', ['self']) for sub_type in sub_types: LOGGER.info('START Syncing: {}, Type: {}'.format( stream_name, sub_type)) # Now date if stream_name == 'gl_journal_entries': now_date_str = strftime(utils.now())[:10] gl_journal_entries_from_dttm_str = get_bookmark( state, 'gl_journal_entries', sub_type, start_date) gl_journal_entries_from_dt_str = transform_datetime( gl_journal_entries_from_dttm_str)[:10] gl_journal_entries_from_param = endpoint_config.get( 'body', {}).get('filterConstraints', {})[0].get('value') if gl_journal_entries_from_param: endpoint_config['body']['filterConstraints'][0][ 'value'] = gl_journal_entries_from_dt_str gl_journal_entries_to_param = endpoint_config.get( 'body', {}).get('filterConstraints', {})[0].get('secondValue') if gl_journal_entries_to_param: endpoint_config['body']['filterConstraints'][0][ 'secondValue'] = now_date_str if stream_name == 'activities': now_date_str = strftime(utils.now())[:10] activities_from_dttm_str = get_bookmark( state, 'activities', sub_type, start_date) activities_from_dt_str = transform_datetime( activities_from_dttm_str)[:10] activities_from_param = endpoint_config.get('params', {}).get('from') if activities_from_param: endpoint_config['params'][ 'from'] = activities_from_dt_str activities_to_param = endpoint_config.get('params', {}).get('to') if activities_to_param: endpoint_config['params']['to'] = now_date_str if stream_name == 'installments': now_date_str = strftime(utils.now())[:10] installments_from_dttm_str = get_bookmark( state, 'installments', sub_type, start_date) installments_from_dt_str = transform_datetime( installments_from_dttm_str)[:10] installments_from_param = endpoint_config.get( 'params', {}).get('dueFrom') if installments_from_param: endpoint_config['params'][ 'dueFrom'] = installments_from_dt_str installments_to_param = endpoint_config.get( 'params', {}).get('dueTo') if installments_to_param: endpoint_config['params']['dueTo'] = now_date_str update_currently_syncing(state, stream_name) path = endpoint_config.get('path') sub_type_param = endpoint_config.get('params', {}).get('type') if sub_type_param: endpoint_config['params']['type'] = sub_type total_records = sync_endpoint( client=client, catalog=catalog, state=state, start_date=start_date, stream_name=stream_name, path=path, endpoint_config=endpoint_config, api_version=endpoint_config.get('api_version', 'v2'), api_method=endpoint_config.get('api_method', 'GET'), static_params=endpoint_config.get('params', {}), sub_type=sub_type, bookmark_query_field=endpoint_config.get( 'bookmark_query_field'), bookmark_field=endpoint_config.get('bookmark_field'), bookmark_type=endpoint_config.get('bookmark_type'), data_key=endpoint_config.get('data_key', None), body=endpoint_config.get('body', None), id_fields=endpoint_config.get('id_fields')) update_currently_syncing(state, None) LOGGER.info('Synced: {}, total_records: {}'.format( stream_name, total_records)) LOGGER.info('FINISHED Syncing: {}'.format(stream_name))
def get_bookmark(self, state):
    bookmark = (get_bookmark(state, self.name, self.replication_key)
                or self.start_date)
    return utils.strptime_to_utc(bookmark)
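# This class-level get_bookmark() is paired with an update_bookmark() writer that
# sync_substream() below calls with keyword arguments. Sketch of the assumed shape
# (illustrative only; the real method may also guard against moving the bookmark backwards):
def update_bookmark(self, state, stream, bookmark_value, bookmark_key):
    # Store the value under the stream's bookmark key and emit the updated state.
    singer.write_bookmark(state, stream, bookmark_key, bookmark_value)
    singer.write_state(state)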
def sync_substream(self, state, parent, sub_stream, parent_response):
    bookmark_date = self.get_bookmark(state, sub_stream.name,
                                      self.config.get('start_date'),
                                      sub_stream.replication_key)
    # If last sync was interrupted, get last processed parent record
    last_processed = self.get_bookmark(state, sub_stream.name, None,
                                       key="last_processed")
    bookmark_dttm = strptime_to_utc(bookmark_date)
    new_bookmark = bookmark_dttm

    singer.write_schema(sub_stream.name,
                        sub_stream.stream.schema.to_dict(),
                        sub_stream.key_properties)

    # Slice response for >= last processed
    if last_processed:
        for i, e in enumerate(parent_response):
            if e.get(parent.key_properties[0]) == last_processed:
                LOGGER.info("Resuming %s sync with %s", sub_stream.name,
                            e.get(parent.key_properties[0]))
                parent_response = parent_response[i:len(parent_response)]
                continue

    next_log_progress_percentage = 0
    for index, record in enumerate(parent_response):
        try:
            with metrics.record_counter(sub_stream.name) as counter, \
                    Transformer(integer_datetime_fmt="unix-milliseconds-integer-datetime-parsing") as transformer:
                stream_events = sub_stream.sync(state, new_bookmark,
                                                record.get(parent.key_properties[0]))
                for event in stream_events:
                    counter.increment()

                    schema_dict = sub_stream.stream.schema.to_dict()
                    stream_metadata = metadata.to_map(sub_stream.stream.metadata)

                    transformed_event = sub_stream.transform(event)
                    try:
                        transformed_record = transformer.transform(
                            transformed_event, schema_dict, stream_metadata)
                    except Exception as err:
                        LOGGER.error('Error: %s', err)
                        LOGGER.error(' for schema: %s',
                                     json.dumps(schema_dict, sort_keys=True, indent=2))
                        raise err

                    event_time = strptime_to_utc(
                        transformed_record.get(sub_stream.replication_key))
                    new_bookmark = max(new_bookmark, event_time)
                    singer.write_record(sub_stream.stream.tap_stream_id,
                                        transformed_record)
        except HTTPError:
            LOGGER.warning("Unable to retrieve %s Event for Stream (ID: %s)",
                           sub_stream.name, record[parent.key_properties[0]])

        # All events for this parent processed; record it so an interrupted run can resume
        self.update_bookmark(state=state,
                             stream=sub_stream.name,
                             bookmark_value=record.get(parent.key_properties[0]),
                             bookmark_key="last_processed")
        self.update_bookmark(state=state,
                             stream=sub_stream.name,
                             bookmark_value=strftime(new_bookmark),
                             bookmark_key=sub_stream.replication_key)

        progress_percentage = float(index) / len(parent_response) * 100
        if progress_percentage > next_log_progress_percentage:
            LOGGER.info(
                "Finished syncing %s percent of parent %s's sub_stream %s data",
                progress_percentage, parent.name, sub_stream.name)
            next_log_progress_percentage += self.LOG_PROGRESS_PERCENTAGE_INTERVAL

    # After processing all parent ids we can remove our resumption state
    state.get('bookmarks').get(sub_stream.name).pop('last_processed')
    update_currently_syncing(state, None)
def sync_endpoint(client, # pylint: disable=too-many-branches catalog, state, start_date, stream_name, path, endpoint_config, static_params, bookmark_query_field=None, bookmark_field=None, bookmark_type=None, data_key=None, id_fields=None, selected_streams=None, replication_ind=None, parent=None, parent_id=None): # Get the latest bookmark for the stream and set the last_integer/datetime last_datetime = None last_integer = None max_bookmark_value = None if bookmark_type == 'integer': last_integer = get_bookmark(state, stream_name, 0) max_bookmark_value = last_integer else: last_datetime = get_bookmark(state, stream_name, start_date) max_bookmark_value = last_datetime LOGGER.info('{}, initial max_bookmark_value {}'.format(stream_name, max_bookmark_value)) max_bookmark_dttm = strptime_to_utc(last_datetime) max_bookmark_int = int(time.mktime(max_bookmark_dttm.timetuple())) now_int = int(time.time()) updated_since_sec = now_int - max_bookmark_int updated_since_days = math.ceil(updated_since_sec/(24 * 60 * 60)) # pagination: loop thru all pages of data using next_url (if not None) page = 1 offset = 0 # Default per_page limit is 50, max is 60 limit = endpoint_config.get('batch_size', 60) total_records = 0 # Check scroll_type to determine if to use Scroll API # scroll_types: always, never. # Endpoints: # always: customers # never: all others # Scroll API: https://developers.intercom.io/reference?_ga=2.237132992.1579857338.1569387987-1032864292.1569297580#iterating-over-all-users scroll_type = endpoint_config.get('scroll_type', 'never') # Check whether the endpoint supports a cursor # https://developers.intercom.com/intercom-api-reference/reference#pagination-cursor cursor = endpoint_config.get('cursor', False) search = endpoint_config.get('search', False) # Scroll for always re-syncs if scroll_type == 'always': LOGGER.info('Stream: {}, Historical Sync, Using Scoll API'.format(stream_name)) is_scrolling = True next_url = '{}/{}/scroll'.format(client.base_url, path) params = {} else: is_scrolling = False next_url = '{}/{}'.format(client.base_url, path) # INTERPOLATE PAGE: # Endpoints: conversations and leads # Pre-requisites: Endpoint allows SORT ASC by bookmark and PAGING, but does not provide query filtering params. # Interpolate Page: Find start page based on sorting results, bookmark datetime, and binary search algorithm. # Algorithm tries to estimate start page based on bookmark, and then splits the difference if it # exceeds the start page or falls short of the start page based on the page's 1st and last record bookmarks. 
interpolate_page = endpoint_config.get('interpolate_page', False) if interpolate_page: # Interpolate based on current page, total_pages, and updated_at to get first page min_page = 1 max_page = 4 # initial value, reset to total_pages on 1st API call i = 1 while (max_page - min_page) > 2: params = { 'page': page, 'per_page': limit, **static_params # adds in endpoint specific, sort, filter params } querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]) # API request data data = {} data = client.get( path=path, params=querystring, endpoint=stream_name) page = int(data.get('pages', {}).get('page')) # per_page = int(data.get('pages', {}).get('per_page')) total_pages = int(data.get('pages', {}).get('total_pages')) if i == 1: max_page = total_pages list_len = len(data.get(data_key, [])) LOGGER.info('Interpolate start page: i = {}, page = {}, min_page = {}, max_page = {}'.format(i, page, min_page, max_page)) first_record_updated_int = data.get(data_key, [])[0].get('updated_at') last_record_updated_int = data.get(data_key, [])[list_len - 1].get('updated_at') if i == 1: # FIRST GUESS - based on TOTAL PAGES, last bookmark, and % of time difference: (bookmark - 1st Record) / (NOW - 1st Record) # Get next_page based on proportional ratio of time integers # If bookmark datetime in at 90% of (NOW - 1st Record) and there are 100 pages TOTAL, then try page 90 # NOTE: It is better for NEXT GUESSES to slightly under-shoot - so that splitting the difference is smaller. # If you start at 1, then over-shoot to 90, but the 1st Page is 89, the next_page guess will be 45. # This is why next_page is 95% of pct_time x total_pages pct_time = ((max_bookmark_int - first_record_updated_int)/(now_int - first_record_updated_int)) LOGGER.info('Interpolate percent based on time diff: {}%'.format(math.floor(pct_time * 100))) next_page = math.floor(0.95 * pct_time * total_pages) # Adjust 1st GUESS to lower by 5% to under-shoot LOGGER.info(' next_page = {}'.format(next_page)) elif first_record_updated_int <= max_bookmark_int and last_record_updated_int >= max_bookmark_int: # First page found, stop looping min_page = page LOGGER.info('First page found. page = {}'.format(page)) break elif last_record_updated_int < max_bookmark_int: # Increase page by half min_page = page next_page = page + math.ceil((1 + max_page - min_page) / 2) LOGGER.info('Increase page. next_page = {}'.format(next_page)) elif first_record_updated_int > max_bookmark_int: # Decrease the page by half max_page = page next_page = page - math.floor((1 + max_page - min_page) / 2) LOGGER.info('Decrease page. 
next_page = {}'.format(next_page)) else: # Break out of loop break page = next_page i = i + 1 # Set params to interpolated page params = { 'page': min_page, 'per_page': limit, **static_params # adds in endpoint specific, sort, filter params } # FINISH INTERPOLATION elif cursor: params = { 'per_page': limit, **static_params } # NORMAL SYNC - Not SCROLLING, Not INTERPOLATION # Standard INCREMENTAL or FULL TABLE else: params = { 'page': page, 'per_page': limit, **static_params # adds in endpoint specific, sort, filter params } request_body = None # Initial search query contains only a starting_time if search: search_query = endpoint_config.get('search_query') request_body = build_query(search_query, max_bookmark_int) i = 1 while next_url is not None: # Need URL querystring for 1st page; subsequent pages provided by next_url # querystring: Squash query params into string if i == 1 and not is_scrolling: if bookmark_query_field: if bookmark_type == 'datetime': params[bookmark_query_field] = updated_since_days elif bookmark_type == 'integer': params[bookmark_query_field] = last_integer if params != {}: querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]) else: querystring = None LOGGER.info('URL for Stream {}: {}{}'.format( stream_name, next_url, '?{}'.format(querystring) if querystring else '')) # API request data data = {} data = client.perform( method=endpoint_config.get('method'), url=next_url, path=path, params=querystring, endpoint=stream_name, json=request_body) # LOGGER.info('data = {}'.format(data)) # TESTING, comment out # time_extracted: datetime when the data was extracted from the API time_extracted = utils.now() if not data or data is None or data == {}: break # No data results # Transform data with transform_json from transform.py # The data_key identifies the array/list of records below the <root> element. # SINGLE RECORD data results appear as dictionary. # MULTIPLE RECORD data results appear as an array-list under the data_key. # The following code converts ALL results to an array-list and transforms data. 
transformed_data = [] # initialize the record list data_list = [] data_dict = {} if isinstance(data, list) and data_key not in data: data_list = data data_dict[data_key] = data_list transformed_data = transform_json(data_dict, stream_name, data_key) elif isinstance(data, dict) and data_key not in data: data_list.append(data) data_dict[data_key] = data_list transformed_data = transform_json(data_dict, stream_name, data_key) else: transformed_data = transform_json(data, stream_name, data_key) # LOGGER.info('transformed_data = {}'.format(transformed_data)) # TESTING, comment out if not transformed_data or transformed_data is None: if parent_id is None: LOGGER.info('Stream: {}, No transformed data for data = {}'.format( stream_name, data)) break # No data results # Verify key id_fields are present rec_count = 0 for record in transformed_data: for key in id_fields: if not record.get(key): LOGGER.info('Stream: {}, Missing key {} in record: {}'.format( stream_name, key, record)) raise RuntimeError rec_count = rec_count + 1 # Process records and get the max_bookmark_value and record_count for the set of records if replication_ind: max_bookmark_value, record_count = process_records( catalog=catalog, stream_name=stream_name, records=transformed_data, time_extracted=time_extracted, bookmark_field=bookmark_field, bookmark_type=bookmark_type, max_bookmark_value=max_bookmark_value, last_datetime=last_datetime, last_integer=last_integer, parent=parent, parent_id=parent_id) LOGGER.info('Stream {}, batch processed {} records'.format( stream_name, record_count)) else: record_count = 0 # Loop thru parent batch records for each children objects (if should stream) children = endpoint_config.get('children') if children: for child_stream_name, child_endpoint_config in children.items(): if child_stream_name in selected_streams: child_replication_ind = child_endpoint_config.get('replication_ind', True) if child_replication_ind: write_schema(catalog, child_stream_name) child_selected_fields = get_selected_fields(catalog, child_stream_name) LOGGER.info('Stream: {}, selected_fields: {}'.format( child_stream_name, child_selected_fields)) total_child_records = 0 # For each parent record for record in transformed_data: i = 0 # Set parent_id for id_field in id_fields: if i == 0: parent_id_field = id_field if id_field == 'id': parent_id_field = id_field i = i + 1 parent_id = record.get(parent_id_field) # sync_endpoint for child LOGGER.info('Syncing: {}, parent_stream: {}, parent_id: {}'.format( child_stream_name, stream_name, parent_id)) child_path = child_endpoint_config.get('path', child_stream_name).format( str(parent_id)) child_bookmark_field = next(iter(child_endpoint_config.get( 'replication_keys', [])), None) child_total_records = sync_endpoint( client=client, catalog=catalog, state=state, start_date=start_date, stream_name=child_stream_name, path=child_path, endpoint_config=child_endpoint_config, static_params=child_endpoint_config.get('params', {}), bookmark_query_field=child_endpoint_config.get( 'bookmark_query_field', None), bookmark_field=child_bookmark_field, bookmark_type=child_endpoint_config.get('bookmark_type', None), data_key=child_endpoint_config.get('data_key', child_stream_name), id_fields=child_endpoint_config.get('key_properties'), selected_streams=selected_streams, replication_ind=child_replication_ind, parent=child_endpoint_config.get('parent'), parent_id=parent_id) LOGGER.info('Synced: {}, parent_id: {}, records: {}'.format( child_stream_name, parent_id, child_total_records)) total_child_records 
= total_child_records + child_total_records LOGGER.info('Parent Stream: {}, Child Stream: {}, FINISHED PARENT BATCH'.format( stream_name, child_stream_name)) LOGGER.info('Synced: {}, total_records: {}'.format( child_stream_name, total_child_records)) # set total_records and next_url for pagination total_records = total_records + record_count if is_scrolling: scroll_param = data.get('scroll_param') if not scroll_param: break next_url = '{}/{}/scroll?scroll_param={}'.format(client.base_url, path, scroll_param) elif cursor: pagination = data.get('pages', {}).get('next', {}) starting_after = pagination.get('starting_after', None) next_url = '{}/{}?starting_after={}'.format(client.base_url, path, starting_after) elif search: pagination = data.get('pages', {}).get('next', {}) starting_after = pagination.get('starting_after', None) # Subsequent search queries require starting_after if starting_after: request_body = build_query(search_query, max_bookmark_int, starting_after) else: next_url = None else: next_url = data.get('pages', {}).get('next', None) # Update the state with the max_bookmark_value for non-scrolling if bookmark_field and not is_scrolling: write_bookmark(state, stream_name, max_bookmark_value) # to_rec: to record; ending record for the batch page to_rec = offset + rec_count LOGGER.info('Synced Stream: {}, page: {}, records: {} to {}'.format( stream_name, page, offset, to_rec)) # Pagination: increment the offset by the limit (batch-size) and page offset = offset + rec_count page = page + 1 i = i + 1 # Return total_records across all pages LOGGER.info('Synced Stream: {}, pages: {}, total records: {}'.format( stream_name, page - 1, total_records)) # Update the state with the max_bookmark_value for non-scrolling if bookmark_field and is_scrolling: write_bookmark(state, stream_name, max_bookmark_value) return total_records
def sync_endpoint(client,  # pylint: disable=too-many-branches
                  catalog,
                  state,
                  start_date,
                  stream_name,
                  path,
                  endpoint_config,
                  static_params,
                  bookmark_query_field=None,
                  bookmark_field=None,
                  bookmark_type=None,
                  data_key=None,
                  id_fields=None,
                  selected_streams=None,
                  parent=None,
                  parent_id=None):

    # Get the latest bookmark for the stream and set the last_integer/datetime
    last_datetime = None
    last_integer = None
    max_bookmark_value = None
    if bookmark_type == 'integer':
        last_integer = get_bookmark(state, stream_name, 0)
        max_bookmark_value = last_integer
    else:
        last_datetime = get_bookmark(state, stream_name, start_date)
        max_bookmark_value = last_datetime

    max_bookmark_dttm = strptime_to_utc(last_datetime)
    max_bookmark_int = int(time.mktime(max_bookmark_dttm.timetuple()))
    now_int = int(time.time())
    updated_since_sec = now_int - max_bookmark_int
    updated_since_days = math.ceil(updated_since_sec / (24 * 60 * 60))

    # pagination: loop thru all pages of data using next_url (if not None)
    page = 1
    offset = 0
    limit = 100  # Default per_page limit is 100
    total_endpoint_records = 0
    next_url = '{}/{}'.format(client.base_url, path)
    params = {
        'page': page,
        'per': limit,
        **static_params  # adds in endpoint specific, sort, filter params
    }
    total_processed_records = 0

    while next_url is not None:
        # Need URL querystring for 1st page; subsequent pages provided by next_url
        # querystring: Squash query params into string
        if page == 1:
            if bookmark_query_field:
                if bookmark_type == 'datetime':
                    params[bookmark_query_field] = start_date
                elif bookmark_type == 'integer':
                    params[bookmark_query_field] = last_integer

        if params != {}:
            querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
        else:
            querystring = None
        LOGGER.info('URL for Stream {}: {}{}'.format(
            stream_name, next_url, '?{}'.format(querystring) if querystring else ''))

        # API request data
        # total_endpoint_records: API response for all pages
        data = {}
        data, total_endpoint_records, next_url = client.get(
            url=next_url,
            path=path,
            params=querystring,
            endpoint=stream_name)

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        if not data or data is None or data == {}:
            return total_endpoint_records  # No data results

        # Transform data with transform_json from transform.py
        # The data_key identifies the array/list of records below the <root> element
        transformed_data = []  # initialize the record list
        data_list = []
        # data_dict = {}
        if isinstance(data, list) and data_key not in data:
            data_list = data
            transformed_data = transform_json(data, stream_name, data_key)
        if not transformed_data or transformed_data is None:
            LOGGER.info('No transformed data for data = {}'.format(data))
            return total_endpoint_records  # No data results

        total_submitted_records = len(transformed_data)
        rec_count = 0

        # Process records and get the max_bookmark_value and record_count for the set of records
        max_bookmark_value, record_count = process_records(
            catalog=catalog,
            stream_name=stream_name,
            records=transformed_data,
            time_extracted=time_extracted,
            bookmark_field=bookmark_field,
            bookmark_type=bookmark_type,
            max_bookmark_value=max_bookmark_value,
            last_datetime=last_datetime,
            last_integer=last_integer,
            parent=parent,
            parent_id=parent_id)
        total_processed_records = total_processed_records + record_count
        LOGGER.info('Stream {}, batch processed {} records, total processed records {}'.format(
            stream_name, record_count, total_processed_records))

        # Update the state with the max_bookmark_value for the stream
        if bookmark_field:
            write_bookmark(state, stream_name, max_bookmark_value)

        # to_rec: to record; ending record for the batch page
        to_rec = offset + rec_count
        LOGGER.info('Synced Stream: {}, page: {}, records: {} to {}'.format(
            stream_name, page, offset, to_rec))

        # Pagination: increment the offset by the limit (batch-size) and page
        offset = offset + rec_count
        page = page + 1

    # Return total_endpoint_records across all pages
    LOGGER.info('Synced Stream: {}, pages: {}, total records: {}'.format(
        stream_name, page - 1, total_endpoint_records))
    return total_endpoint_records
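# Note: write_bookmark(state, stream, value) is called from several sync functions in this
# section but is not defined here. A minimal sketch of the usual helper, assuming the
# conventional Singer state layout (illustrative only; the taps' own helpers may differ):
def write_bookmark(state, stream, value):
    # Store the max replication-key value for the stream and emit a STATE message.
    if 'bookmarks' not in state:
        state['bookmarks'] = {}
    state['bookmarks'][stream] = value
    LOGGER.info('Write state for stream: {}, value: {}'.format(stream, value))
    singer.write_state(state)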
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. To combat this, we store the current sync's start in
    # the state and do not move the bookmark past this value.
    current_sync_start = get_current_sync_start(STATE, "companies") or utils.now()
    STATE = write_current_sync_start(STATE, "companies", current_sync_start)
    singer.write_state(STATE)

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema,
                            ["company-id", "contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params,
                               'companies', 'has-more', ['offset'], ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(get_url("companies_detail",
                                         company_id=row['companyId'])).json()
                record = bumble_bee.transform(lift_properties_and_versions(record),
                                              schema, mdata)
                singer.write_record("companies", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())
                if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
                    STATE = _sync_contacts_by_company(STATE, ctx, record['companyId'])

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(max_bk_value, current_sync_start)
    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key,
                                  utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'companies', None)
    singer.write_state(STATE)
    return STATE
def sync_endpoint( client, #pylint: disable=too-many-branches catalog, state, start_date, stream_name, search_path, endpoint_config, bookmark_field=None, selected_streams=None): # Endpoint parameters bookmark_query_field = endpoint_config.get('bookmark_query_field', None) data_key = endpoint_config.get('data_key', stream_name) LOGGER.info('data_key = {}'.format(data_key)) # Get the latest bookmark for the stream and set the last_datetime last_datetime = get_bookmark(state, stream_name, start_date) file_max_bookmark_value = last_datetime # Convert to GitHub date format, example: Sun, 13 Oct 2019 22:40:01 GMT last_dttm = strptime_to_utc(last_datetime) last_modified = last_dttm.strftime("%a, %d %b %Y %H:%M:%S %Z'") LOGGER.info('HEADER If-Modified-Since: {}'.format(last_modified)) # Write schema and log selected fields for file stream and child csv stream(s) write_schema(catalog, stream_name) selected_fields = get_selected_fields(catalog, stream_name) LOGGER.info('Stream: {}, selected_fields: {}'.format( stream_name, selected_fields)) children = endpoint_config.get('children') if children: for child_stream_name, child_endpoint_config in children.items(): if child_stream_name in selected_streams: write_schema(catalog, child_stream_name) child_selected_fields = get_selected_fields( catalog, child_stream_name) LOGGER.info('Stream: {}, selected_fields: {}'.format( child_stream_name, child_selected_fields)) # pagination: loop thru all pages of data using next_url (if not None) page = 1 offset = 0 file_total_records = 0 csv_total_records = 0 next_url = '{}/{}'.format(client.base_url, search_path) i = 1 while next_url is not None: LOGGER.info('Search URL for Stream {}: {}'.format( stream_name, next_url)) # API request search_data search_data = {} search_data, next_url = client.get(url=next_url, endpoint=stream_name) LOGGER.info('next_url = {}'.format(next_url)) # LOGGER.info('search_data = {}'.format(search_data)) # COMMENT OUT # time_extracted: datetime when the data was extracted from the API time_extracted = utils.now() search_items = search_data.get(data_key, []) if not search_items: LOGGER.info('Stream: {}, no files found'.format(stream_name)) break # No data results file_count = 0 file_records = [] csv_records = [] for item in search_items: file_count = file_count + 1 file_url = item.get('url') LOGGER.info('File URL for Stream {}: {}'.format( stream_name, file_url)) file_data = {} headers = {} if bookmark_query_field: headers[bookmark_query_field] = last_modified # API request file_data for item, single-file (ignore file_next_url) file_data, file_next_url = client.get(url=file_url, headers=headers, endpoint=stream_name) # LOGGER.info('file_data: {}'.format(file_data)) # TESTING ONLY - COMMENT OUT if file_data: content = file_data.get('content') content_list = [] if content: content_b64 = base64.b64decode(content) content_str = content_b64.decode('utf-8') file_like_object = io.StringIO(content_str) with file_like_object as f: reader = csv.DictReader(f) content_list = [r for r in reader] file_modified = file_data.get('last_modified') file_sha = file_data.get('sha') file_path = file_data.get('path') file_name = file_data.get('name') # Remove _links, content nodes file_data.pop('_links', None) file_data.pop('content', None) # LOGGER.info('file_data: {}'.format(file_data)) # TESTING ONLY - COMMENT OUT file_records.append(file_data) # Loop thru each child object and append csv records if children: for child_stream_name, child_endpoint_config in children.items( ): if child_stream_name in 
selected_streams: i = 1 for record in content_list: record['git_path'] = file_path record['git_sha'] = file_sha record['git_last_modified'] = file_modified record['git_file_name'] = file_name record['row_number'] = i # Transform record and append transformed_csv_record = {} transformed_csv_record = transform_record( child_stream_name, record) csv_records.append(transformed_csv_record) i = i + 1 # Process file_records and get the max_bookmark_value and record_count file_max_bookmark_value, file_record_count = process_records( catalog=catalog, stream_name=stream_name, records=file_records, time_extracted=time_extracted, bookmark_field=bookmark_field, max_bookmark_value=file_max_bookmark_value, last_datetime=last_datetime) LOGGER.info('Stream {}, batch processed {} records'.format( stream_name, file_record_count)) file_total_records = file_total_records + file_record_count # Loop thru each child object to process csv records if children: for child_stream_name, child_endpoint_config in children.items(): if child_stream_name in selected_streams: csv_max_bookmark_value, csv_record_count = process_records( catalog=catalog, stream_name=child_stream_name, records=csv_records, time_extracted=time_extracted, bookmark_field=None, max_bookmark_value=None, last_datetime=last_datetime) LOGGER.info('Stream {}, batch processed {} records'.format( child_stream_name, csv_record_count)) csv_total_records = csv_total_records + csv_record_count # to_rec: to record; ending record for the batch page to_rec = offset + file_count LOGGER.info('Synced Stream: {}, page: {}, records: {} to {}'.format( stream_name, page, offset, to_rec)) # Pagination: increment the offset by the limit (batch-size) and page offset = offset + file_count page = page + 1 i = i + 1 # Update the state with the max_bookmark_value for the stream if bookmark_field: write_bookmark(state, stream_name, file_max_bookmark_value) # Return total_records across all pages LOGGER.info( 'Synced Stream: {}, TOTAL pages: {}, file records: {}, csv records: {}' .format(stream_name, page - 1, file_total_records, csv_total_records)) return file_total_records
def sync_in_app_events():
    schema = load_schema("raw_data/in_app_events")
    singer.write_schema("in_app_events", schema,
                        ["event_time", "event_name", "appsflyer_id"])

    # This order matters
    fieldnames = (
        "attributed_touch_type", "attributed_touch_time", "install_time", "event_time",
        "event_name", "event_value", "event_revenue", "event_revenue_currency",
        "event_revenue_usd", "event_source", "is_receipt_validated", "af_prt",
        "media_source", "af_channel", "af_keywords", "campaign", "af_c_id",
        "af_adset", "af_adset_id", "af_ad", "af_ad_id", "af_ad_type",
        "af_siteid", "af_sub_siteid", "af_sub1", "af_sub2", "af_sub3",
        "af_sub4", "af_sub5", "af_cost_model", "af_cost_value", "af_cost_currency",
        "contributor1_af_prt", "contributor1_media_source", "contributor1_campaign",
        "contributor1_touch_type", "contributor1_touch_time",
        "contributor2_af_prt", "contributor2_media_source", "contributor2_campaign",
        "contributor2_touch_type", "contributor2_touch_time",
        "contributor3_af_prt", "contributor3_media_source", "contributor3_campaign",
        "contributor3_touch_type", "contributor3_touch_time",
        "region", "country_code", "state", "city", "postal_code", "dma",
        "ip", "wifi", "operator", "carrier", "language",
        "appsflyer_id", "advertising_id", "idfa", "android_id", "customer_user_id",
        "imei", "idfv", "platform", "device_type", "os_version", "app_version",
        "sdk_version", "app_id", "app_name", "bundle_id", "is_retargeting",
        "retargeting_conversion_type", "af_attribution_lookback", "af_reengagement_window",
        "is_primary_attribution", "user_agent", "http_referrer", "original_url",
    )

    stop_time = datetime.datetime.now(pytz.utc)
    from_datetime = get_start("in_app_events")
    to_datetime = get_stop(from_datetime, stop_time, 10)

    while from_datetime < stop_time:
        LOGGER.info("Syncing data from %s to %s", from_datetime, to_datetime)

        params = dict()
        params["from"] = from_datetime.strftime("%Y-%m-%d %H:%M")
        params["to"] = to_datetime.strftime("%Y-%m-%d %H:%M")
        params["api_token"] = CONFIG["api_token"]

        url = get_url("in_app_events", app_id=CONFIG["app_id"])
        request_data = request(url, params)

        csv_data = RequestToCsvAdapter(request_data)
        reader = csv.DictReader(csv_data, fieldnames)

        next(reader)  # Skip the heading row

        bookmark = from_datetime
        for i, row in enumerate(reader):
            record = xform(row, schema)
            singer.write_record("in_app_events", record)

            # AppsFlyer returns records in order of most recent first.
            if utils.strptime_to_utc(record["event_time"]) > bookmark:
                bookmark = utils.strptime_to_utc(record["event_time"])

        # Write out state
        utils.update_state(STATE, "in_app_events", bookmark)
        singer.write_state(STATE)

        # Move the timings forward
        from_datetime = to_datetime
        to_datetime = get_stop(from_datetime, stop_time, 10)
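# get_stop() (referenced above but not shown) produces the end of each pull window. The call
# sites suggest it caps "from + N days" at the overall stop_time; a sketch under that
# assumption (the helper's real signature and default may differ):
def get_stop(from_datetime, stop_time, days=10):
    # Advance at most `days` days, never past stop_time.
    return min(from_datetime + datetime.timedelta(days=days), stop_time)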
def sync_companies(state: State):
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(state, "companies", bookmark_key))
    logger.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("hubspot_companies", schema, ["companyId"], [bookmark_key])

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. To combat this, we store the current sync's start in
    # the state and do not move the bookmark past this value.
    current_sync_start = get_current_sync_start(state, "companies") or utils.now()
    state = write_current_sync_start(state, "companies", current_sync_start)
    singer.write_state(state)

    url = get_url("companies_all")
    max_bk_value = start
    contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
    singer.write_schema("hubspot_contacts_by_company", contacts_by_company_schema,
                        ["company-id", "contact-id"])

    for row in gen_request(state, 'companies', url, default_company_params,
                           'companies', 'has-more', ['offset'], ['offset']):
        row_properties = row['properties']
        modified_time = None
        if bookmark_key in row_properties:
            # Hubspot returns timestamps in millis
            timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
            modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                            datetime.timezone.utc)
        elif 'createdate' in row_properties:
            # Hubspot returns timestamps in millis
            timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
            modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                            datetime.timezone.utc)

        if modified_time and modified_time >= max_bk_value:
            max_bk_value = modified_time

        if not modified_time or modified_time >= start:
            record = request(get_url("companies_detail",
                                     company_id=row['companyId'])).json()
            record = build_record(record, schema)
            write_record('hubspot_companies', record)
            state = _sync_contacts_by_company(state, record['companyId'])

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(max_bk_value, current_sync_start)
    state = singer.write_bookmark(state, 'hubspot_companies', bookmark_key,
                                  utils.strftime(new_bookmark))
    state = write_current_sync_start(state, 'companies', None)
    singer.write_state(state)
    return state
def sync_endpoint(client, config, catalog, state, stream_name, endpoint_config, sync_streams, selected_streams, parent_id=None): # endpoint_config variables base_path = endpoint_config.get('path', stream_name) bookmark_field = next(iter(endpoint_config.get('replication_keys', [])), None) params = endpoint_config.get('params', {}) bookmark_query_field_from = endpoint_config.get( 'bookmark_query_field_from') bookmark_query_field_to = endpoint_config.get('bookmark_query_field_to') data_key_array = endpoint_config.get('data_key_array') id_fields = endpoint_config.get('key_properties') parent = endpoint_config.get('parent') date_window_size = int(endpoint_config.get('date_window_size', '1')) # tap config variabless start_date = config.get('start_date') attribution_window = config.get('attribution_window', 30) last_datetime = get_bookmark(state, stream_name, start_date, bookmark_field, parent, parent_id) max_bookmark_value = last_datetime # Convert to datetimes in local/ad account timezone now_datetime = utils.now() last_dttm = strptime_to_utc(last_datetime) if bookmark_query_field_from and bookmark_query_field_to: # date_window_size: Number of days in each date window # Set start window start_window = now_datetime - timedelta(days=attribution_window) if last_dttm < start_window: start_window = last_dttm + timedelta( days=1) # makes sure that we don't have duplicated data # Set end window end_window = start_window + timedelta(days=date_window_size) if end_window > now_datetime: end_window = now_datetime else: start_window = last_dttm end_window = now_datetime diff_sec = (end_window - start_window).seconds date_window_size = math.ceil( diff_sec / (3600 * 24)) # round-up difference to days endpoint_total = 0 total_records = 0 while start_window < now_datetime: LOGGER.info('START Sync for Stream: {}{}'.format( stream_name, ', Date window from: {} to {}'.format(start_window.date(), end_window.date()) \ if bookmark_query_field_from else '')) if bookmark_query_field_from and bookmark_query_field_to: # Query parameter startDate and endDate must be in Eastern time zone # API will error if future dates are requested # DAY based window_start_dt_str = start_window.date().strftime( '%Y-%m-%dT00:00:00') window_end_dt_str = end_window.date().strftime('%Y-%m-%dT23:59:59') params[bookmark_query_field_from] = window_start_dt_str params[bookmark_query_field_to] = window_end_dt_str path = base_path.format(parent_id=parent_id) total_records = 0 # concate params querystring = '&'.join( ['%s=%s' % (key, value) for (key, value) in params.items()]) # initialize url url = '{}/{}?{}'.format(client.base_url, path, querystring) # API request data data = {} try: data = client.get(url=url, endpoint=stream_name) except Exception as err: LOGGER.error('{}'.format(err)) LOGGER.error('URL for Stream {}: {}'.format(stream_name, url)) raise Exception(err) # time_extracted: datetime when the data was extracted from the API time_extracted = utils.now() if not data or data is None or data == {}: LOGGER.info('No data results returned') else: # Transform data with transform_json from transform.py # The data_key_array identifies the array/list of records below the <root> element # LOGGER.info('data = {}'.format(data)) # TESTING, comment out transformed_data = [] # initialize the record list if data_key_array: data_records = data.get(data_key_array, []) else: data_records = data for record in data_records: # Add parent id field/value if parent and parent_id and parent not in record: record[parent] = parent_id # transform record (remove 
inconsistent use of CamelCase) try: transformed_record = humps.decamelize(record) except Exception as err: LOGGER.error('{}'.format(err)) LOGGER.error('error record: {}'.format(record)) raise Exception(err) transformed_data.append(transformed_record) # End for record in array # End non-stats stream # LOGGER.info('transformed_data = {}'.format(transformed_data)) # COMMENT OUT if not transformed_data or transformed_data is None: LOGGER.info('No transformed data for data = {}'.format(data)) else: # Process records and get the max_bookmark_value and record_count if stream_name in sync_streams: max_bookmark_value, record_count = process_records( catalog=catalog, stream_name=stream_name, records=transformed_data, time_extracted=time_extracted, bookmark_field=bookmark_field, max_bookmark_value=max_bookmark_value, last_datetime=last_datetime) LOGGER.info('Stream {}, batch processed {} records'.format( stream_name, record_count)) # Loop thru parent batch records for each children objects (if should stream) children = endpoint_config.get('children') if children: for child_stream_name, child_endpoint_config in children.items( ): if child_stream_name in sync_streams: LOGGER.info( 'START Syncing: {}'.format(child_stream_name)) write_schema(catalog, child_stream_name) # For each parent record for record in transformed_data: i = 0 # Set parent_id for id_field in id_fields: if i == 0: parent_id_field = id_field if id_field == 'id': parent_id_field = id_field i = i + 1 parent_id = record.get(parent_id_field) # sync_endpoint for child LOGGER.info( 'START Sync for Stream: {}, parent_stream: {}, parent_id: {}'\ .format(child_stream_name, stream_name, parent_id)) child_total_records = sync_endpoint( client=client, config=config, catalog=catalog, state=state, stream_name=child_stream_name, endpoint_config=child_endpoint_config, sync_streams=sync_streams, selected_streams=selected_streams, parent_id=parent_id) LOGGER.info( 'FINISHED Sync for Stream: {}, parent_id: {}, total_records: {}'\ .format(child_stream_name, parent_id, child_total_records)) # End transformed data record loop # End if child in sync_streams # End child streams for parent # End if children # Parent record batch total_records = total_records + record_count endpoint_total = endpoint_total + record_count LOGGER.info('Synced Stream: {}, records: {}'.format( stream_name, total_records)) # Update the state with the max_bookmark_value for the stream date window # Snapchat Ads API does not allow page/batch sorting; bookmark written for date window if bookmark_field and stream_name in selected_streams: write_bookmark(state, stream_name, max_bookmark_value, bookmark_field, parent, parent_id) # Increment date window and sum endpoint_total start_window = end_window + timedelta(days=1) next_end_window = end_window + timedelta(days=date_window_size) if next_end_window > now_datetime: end_window = now_datetime else: end_window = next_end_window # End date window # Return total_records (for date windows) return endpoint_total
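# NOTE: A hedged sketch of the date-window loop in the sync_endpoint above: windows of
# date_window_size days are walked from the bookmark (plus one day, per the "don't have duplicated
# data" comment) or from now minus the attribution window, up to now. This generator is illustrative
# only and is not the tap's own helper.
from datetime import timedelta
from singer import utils
from singer.utils import strptime_to_utc

def date_windows(last_datetime, attribution_window=30, window_size=1):
    """Yield (start, end) datetime windows from the bookmark up to now."""
    now_dttm = utils.now()
    last_dttm = strptime_to_utc(last_datetime)
    start = now_dttm - timedelta(days=attribution_window)
    if last_dttm < start:
        # Bookmark is older than the attribution window; resume one day
        # after it so the last bookmarked day is not re-emitted.
        start = last_dttm + timedelta(days=1)
    while start < now_dttm:
        end = min(start + timedelta(days=window_size), now_dttm)
        yield start, end
        start = end + timedelta(days=1)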
def transform_report(report_name, report_data, account_id): time_series_length = int(report_data.get('time_series_length', 1)) # Default = 1 to loop once request = report_data.get('request', {}) # request params params = request.get('params', {}) entity = params.get('entity') granularity = params.get('granularity') placement = params.get('placement') segmentation_type = params.get('segmentation_type') country = params.get('country') platform = params.get('platform') start_time = params.get('start_time') end_time = params.get('end_time') LOGGER.info( 'Report: {} - transform_report, absolute start_time: {}'.format( report_name, start_time)) LOGGER.info('Report: {} - transform_report, absoluted end_time: {}'.format( report_name, end_time)) LOGGER.info('Report: {} - transform_report, time_series_length: {}'.format( report_name, time_series_length)) report_records = [] if granularity == 'DAY': interval = timedelta(days=1) elif granularity == 'HOUR': interval = timedelta(hours=1) elif granularity == 'TOTAL': interval = timedelta(days=0) # 0 days for TOTAL # Loop through entity_id records w/ data for id_record in report_data.get('data'): # LOGGER.info('id_record = {}'.format(id_record)) # COMMENT OUT entity_id = id_record.get('id') # LOGGER.info('entity_id = {}'.format(entity_id)) # COMMENT OUT id_data = [] id_data = id_record.get('id_data') # Loop through id_data records for datum in id_data: # Loop through time intervals start_dttm = strptime_to_utc(start_time) end_dttm = start_dttm + interval i = 0 while i <= (time_series_length - 1): series_start = strftime(start_dttm) series_end = strftime(end_dttm) append_record = False # Initialize; only append records w/ metric data segment = datum.get('segment') segment_name = None segment_value = None if segment: segment_name = segment.get('segment_name') segment_value = segment.get('segment_value') dimensions = { 'report_name': report_name, 'account_id': account_id, 'entity': entity, 'entity_id': entity_id, 'granularity': granularity, 'placement': placement, 'start_time': series_start, 'end_time': series_end, 'segmentation_type': segmentation_type, 'segment_name': segment_name, 'segment_value': segment_value, 'country': country, 'platform': platform } # Create MD5 hash key of sorted json dimesions (above) dims_md5 = str( hash_data(json.dumps(dimensions, sort_keys=True))) record = { '__sdc_dimensions_hash_key': dims_md5, 'start_time': series_start, 'end_time': series_end, 'dimensions': dimensions } # LOGGER.info('dimensions_hash_key = {}'.format(dims_md5)) # COMMENT OUT # Get time interval value from metrics value arrays metrics = datum.get('metrics', {}) for key, val in list(metrics.items()): # Determine nested object group for each measure if key[0:7] == 'billed_': group = 'billing' elif key[0:6] == 'media_': group = 'media' elif key[0:6] == 'video_': group = 'video' elif key[0:11] == 'conversion_': group = 'web_conversion' elif key[0:18] == 'mobile_conversion_': group = 'mobile_conversion' else: group = 'engagement' # Create group node if not exists if not record.get(group): record[group] = {} if isinstance(val, list): index_val = None try: index_val = val[i] record[group][key] = index_val append_record = True except IndexError: index_val = None elif isinstance(val, dict): new_dict = {} for key2, val2 in list(val.items()): idx_val = None if isinstance(val2, list): try: idx_val = val2[i] new_dict[key2] = idx_val append_record = True except IndexError: idx_val = None if new_dict != {}: record[group][key] = new_dict # End for key, val in metrics # 
LOGGER.info('record = {}'.format(record)) # COMMENT OUT # LOGGER.info('append_record = {}'.format(append_record)) # COMMENT OUT if append_record: report_records.append(record) i = i + 1 start_dttm = end_dttm end_dttm = start_dttm + interval # End: while i < time_series_length # End: for datum in id_data # End: for id_record in report_data return report_records
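# NOTE: Minimal sketch of the __sdc_dimensions_hash_key idea in transform_report above: the sorted
# JSON of the dimension values is hashed to produce a deterministic surrogate key for report rows.
# hash_data in the tap lives in its transform module; this standalone version is illustrative only.
import hashlib
import json

def dimensions_hash_key(dimensions):
    # Sorting keys guarantees the same dimension values always hash identically
    dims_json = json.dumps(dimensions, sort_keys=True)
    return hashlib.md5(dims_json.encode('utf-8')).hexdigest()

# Example: identical dimensions always collapse to the same key
key = dimensions_hash_key({'entity_id': 'abc',
                           'start_time': '2020-01-01T00:00:00Z',
                           'placement': 'ALL_ON_TWITTER'})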
def sync_endpoint( client, #pylint: disable=too-many-branches catalog, state, start_date, stream_name, search_path, endpoint_config, git_owner, git_repository, bookmark_query_field=None, bookmark_field=None, data_key=None, id_fields=None, selected_streams=None): # Get the latest bookmark for the stream and set the last_datetime last_datetime = get_bookmark(state, stream_name, start_date) file_max_bookmark_value = last_datetime # Convert to GitHub date format, example: Sun, 13 Oct 2019 22:40:01 GMT last_dttm = strptime_to_utc(last_datetime) last_modified = last_dttm.strftime("%a, %d %b %Y %H:%M:%S %Z'") LOGGER.info('HEADER If-Modified-Since: {}'.format(last_modified)) # Write schema and log selected fields for file stream and child lkml stream(s) write_schema(catalog, stream_name) selected_fields = get_selected_fields(catalog, stream_name) LOGGER.info('Stream: {}, selected_fields: {}'.format( stream_name, selected_fields)) children = endpoint_config.get('children') if children: for child_stream_name, child_endpoint_config in children.items(): if child_stream_name in selected_streams: write_schema(catalog, child_stream_name) child_selected_fields = get_selected_fields( catalog, child_stream_name) LOGGER.info('Stream: {}, selected_fields: {}'.format( child_stream_name, child_selected_fields)) # pagination: loop thru all pages of data using next_url (if not None) page = 1 offset = 0 file_total_records = 0 lkml_total_records = 0 next_url = '{}/{}'.format(client.base_url, search_path) i = 1 while next_url is not None: LOGGER.info('Search URL for Stream {}: {}'.format( stream_name, next_url)) # API request search_data search_data = {} search_data, next_url = client.get(url=next_url, endpoint=stream_name) # time_extracted: datetime when the data was extracted from the API time_extracted = utils.now() search_items = search_data.get(data_key) if not search_items: break # No data results file_count = 0 file_records = [] lkml_records = [] for item in search_items: file_count = file_count + 1 file_url = item.get('url') LOGGER.info('File URL for Stream {}: {}'.format( stream_name, file_url)) file_data = {} headers = {} if bookmark_query_field: headers[bookmark_query_field] = last_modified # API request file_data for item, single-file (ignore file_next_url) file_data, file_next_url = client.get(url=file_url, headers=headers, endpoint=stream_name) # LOGGER.info('file_data: {}'.format(file_data)) # TESTING ONLY - COMMENT OUT if file_data: content = file_data.get('content') content_dict = {} if content: content_b64 = base64.b64decode(content) content_str = content_b64.decode('utf-8') content_dict = lkml.load(content_str) file_modified = file_data.get('last_modified') file_sha = file_data.get('sha') file_path = file_data.get('path') # Remove _links, content nodes, add git info file_data.pop('_links', None) file_data.pop('content', None) file_data['git_owner'] = git_owner file_data['git_repository'] = git_repository # LOGGER.info('file_data: {}'.format(file_data)) # TESTING ONLY - COMMENT OUT file_records.append(file_data) # Loop thru each child object and append lkml records if children: for child_stream_name, child_endpoint_config in children.items( ): if child_stream_name in selected_streams: child_data_key = child_endpoint_config.get( 'data_key') if child_data_key and child_data_key in content_dict: for record in content_dict.get( child_data_key, []): record['path'] = file_path record['sha'] = file_sha record['last_modified'] = file_modified record['git_owner'] = git_owner record['git_repository'] = 
git_repository lkml_records.append(record) else: content_dict['path'] = file_path content_dict['sha'] = file_sha content_dict['last_modified'] = file_modified content_dict['git_owner'] = git_owner content_dict['git_repository'] = git_repository lkml_records.append(content_dict) # Process file_records and get the max_bookmark_value and record_count file_max_bookmark_value, file_record_count = process_records( catalog=catalog, stream_name=stream_name, records=file_records, time_extracted=time_extracted, bookmark_field=bookmark_field, max_bookmark_value=file_max_bookmark_value, last_datetime=last_datetime) LOGGER.info('Stream {}, batch processed {} records'.format( stream_name, file_record_count)) file_total_records = file_total_records + file_record_count # Loop thru each child object to process lkml records if children: for child_stream_name, child_endpoint_config in children.items(): if child_stream_name in selected_streams: lkml_max_bookmark_value, lkml_record_count = process_records( catalog=catalog, stream_name=child_stream_name, records=lkml_records, time_extracted=time_extracted, bookmark_field=None, max_bookmark_value=None, last_datetime=last_datetime) LOGGER.info('Stream {}, batch processed {} records'.format( child_stream_name, lkml_record_count)) lkml_total_records = lkml_total_records + lkml_record_count # Update the state with the max_bookmark_value for the stream if bookmark_field: write_bookmark(state, stream_name, file_max_bookmark_value) # to_rec: to record; ending record for the batch page to_rec = offset + file_count LOGGER.info('Synced Stream: {}, page: {}, records: {} to {}'.format( stream_name, page, offset, to_rec)) # Pagination: increment the offset by the limit (batch-size) and page offset = offset + file_count page = page + 1 i = i + 1 # Return total_records across all pages LOGGER.info( 'Synced Stream: {}, TOTAL pages: {}, file records: {}, lookml records: {}' .format(stream_name, page - 1, file_total_records, lkml_total_records)) return file_total_records
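# NOTE: Simplified sketch of the decode/parse step in the file sync above: the GitHub content blob is
# base64-decoded and parsed with the lkml LookML parser. Error handling, bookmarks, and the fan-out
# of child records are omitted; this assumes lkml.load accepts the decoded string.
import base64
import lkml

def parse_lookml_content(file_data):
    """Decode a GitHub content blob and parse it into a LookML dict."""
    content_b64 = file_data.get('content', '')
    content_str = base64.b64decode(content_b64).decode('utf-8')
    return lkml.load(content_str)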
def sync_endpoint( client, #pylint: disable=too-many-branches catalog, state, start_date, stream_name, path, endpoint_config, bookmark_field=None, project_timezone=None, days_interval=None, attribution_window=None, export_events=None, denest_properties_flag=None): # Get endpoint_config fields url = endpoint_config.get('url') data_key = endpoint_config.get('data_key', 'results') api_method = endpoint_config.get('api_method') parent_path = endpoint_config.get('parent_path') parent_id_field = endpoint_config.get('parent_id_field') static_params = endpoint_config.get('params', {}) bookmark_query_field_from = endpoint_config.get( 'bookmark_query_field_from') bookmark_query_field_to = endpoint_config.get('bookmark_query_field_to') id_fields = endpoint_config.get('key_properties') date_dictionary = endpoint_config.get('date_dictionary', False) pagination = endpoint_config.get('pagination', False) # Get the latest bookmark for the stream and set the last_integer/datetime last_datetime = None max_bookmark_value = None last_datetime = get_bookmark(state, stream_name, start_date) max_bookmark_value = last_datetime write_schema(catalog, stream_name) # windowing: loop through date days_interval date windows from last_datetime to now_datetime tzone = pytz.timezone(project_timezone) now_datetime = datetime.now(tzone) if bookmark_query_field_from and bookmark_query_field_to: # days_interval from config date_window_size, default = 60; passed to function from sync if not days_interval: days_interval = 30 last_dttm = strptime_to_utc(last_datetime) delta_days = (now_datetime - last_dttm).days if delta_days <= attribution_window: delta_days = attribution_window LOGGER.info( "Start bookmark less than {} day attribution window.".format( attribution_window)) elif delta_days >= 365: delta_days = 365 LOGGER.warning( "WARNING: Start date or bookmark greater than 1 year maxiumum." 
) LOGGER.warning("WARNING: Setting bookmark start to 1 year ago.") start_window = now_datetime - timedelta(days=delta_days) end_window = start_window + timedelta(days=days_interval) if end_window > now_datetime: end_window = now_datetime else: start_window = strptime_to_utc(last_datetime) end_window = now_datetime diff_sec = (end_window - start_window).seconds days_interval = math.ceil(diff_sec / (3600 * 24)) # round-up difference to days # LOOP order: Date Windows, Parent IDs, Page # Initialize counter endpoint_total = 0 # Total for ALL: parents, date windows, and pages # Begin date windowing loop while start_window < now_datetime: # Initialize counters date_total = 0 # Total records for a date window parent_total = 0 # Total records for parent ID total_records = 0 # Total records for all pages record_count = 0 # Total processed for page params = static_params # adds in endpoint specific, sort, filter params if bookmark_query_field_from and bookmark_query_field_to: # Request dates need to be normalized to project timezone or else errors may occur # Errors occur when from_date is > 365 days ago # and when to_date > today (in project timezone) from_date = '{}'.format(start_window.astimezone(tzone))[0:10] to_date = '{}'.format(end_window.astimezone(tzone))[0:10] LOGGER.info('START Sync for Stream: {}{}'.format( stream_name, ', Date window from: {} to {}'.format(from_date, to_date) \ if bookmark_query_field_from else '')) params[bookmark_query_field_from] = from_date params[bookmark_query_field_to] = to_date # funnels and cohorts have a parent endpoint with parent_data and parent_id_field if parent_path and parent_id_field: # API request data LOGGER.info('URL for Parent Stream {}: {}/{}'.format( stream_name, url, parent_path)) parent_data = client.request(method='GET', url=url, path=parent_path, endpoint='parent_data') # Other endpoints (not funnels, cohorts): Simulate parent_data with single record else: parent_data = [{'id': 'none'}] parent_id_field = 'id' for parent_record in parent_data: parent_id = parent_record.get(parent_id_field) LOGGER.info('START: Stream: {}, parent_id: {}'.format( stream_name, parent_id)) # pagination: loop thru all pages of data using next (if not None) page = 0 # First page is page=0, second page is page=1, ... 
offset = 0 limit = 250 # Default page_size # Initialize counters parent_total = 0 # Total records for parent ID total_records = 0 # Total records for all pages record_count = 0 # Total processed for page session_id = 'initial' if pagination: params['page_size'] = limit while offset <= total_records and session_id is not None: if pagination and page != 0: params['session_id'] = session_id params['page'] = page # querystring: Squash query params into string and replace [parent_id] querystring = '&'.join(['%s=%s' % (key, value) for (key, value) \ in params.items()]).replace( '[parent_id]', str(parent_id)) if stream_name == 'export' and export_events: event = json.dumps([export_events] if isinstance( export_events, str) else export_events) url_encoded = urllib.parse.quote(event) querystring += f'&event={url_encoded}' full_url = '{}/{}{}'.format( url, path, '?{}'.format(querystring) if querystring else '') LOGGER.info('URL for Stream {}: {}'.format( stream_name, full_url)) # API request data data = {} # Export has a streaming api call if stream_name == 'export': data = client.request_export(method=api_method, url=url, path=path, params=querystring, endpoint=stream_name) # time_extracted: datetime when the data was extracted from the API time_extracted = utils.now() transformed_data = [] for record in data: if record and str(record) != '': # transform reocord and append to transformed_data array transformed_record = transform_record(record, stream_name, \ project_timezone, denest_properties_flag) transformed_data.append(transformed_record) # Check for missing keys for key in id_fields: val = transformed_record.get(key) if val == '' or not val: LOGGER.error('Error: Missing Key') raise 'Missing Key' if len(transformed_data) == limit: # Process full batch (limit = 250) records # and get the max_bookmark_value and record_count max_bookmark_value, record_count = process_records( catalog=catalog, stream_name=stream_name, records=transformed_data, time_extracted=time_extracted, bookmark_field=bookmark_field, max_bookmark_value=max_bookmark_value, last_datetime=last_datetime) total_records = total_records + record_count parent_total = parent_total + record_count date_total = date_total + record_count endpoint_total = endpoint_total + record_count transformed_data = [] LOGGER.info( 'Stream {}, batch processed {} records, total {}, max bookmark {}' .format(stream_name, record_count, endpoint_total, max_bookmark_value)) # End if (batch = limit 250) # End if record # End has export_data records loop # Process remaining, partial batch if len(transformed_data) > 0: max_bookmark_value, record_count = process_records( catalog=catalog, stream_name=stream_name, records=transformed_data, time_extracted=time_extracted, bookmark_field=bookmark_field, max_bookmark_value=max_bookmark_value, last_datetime=last_datetime) LOGGER.info( 'Stream {}, batch processed {} records'.format( stream_name, record_count)) total_records = total_records + record_count parent_total = parent_total + record_count date_total = date_total + record_count endpoint_total = endpoint_total + record_count # End if transformed_data # Export does not provide pagination; session_id = None breaks out of loop. 
session_id = None # End export stream API call else: # stream_name != 'export` data = client.request(method=api_method, url=url, path=path, params=querystring, endpoint=stream_name) # time_extracted: datetime when the data was extracted from the API time_extracted = utils.now() if not data or data is None or data == {} or data == []: LOGGER.info('No data for URL: {}'.format(full_url)) # No data results else: # has data # Transform data with transform_json from transform.py # The data_key identifies the array/list of records below the <root> element # LOGGER.info('data = {}'.format(data)) # TESTING, comment out transformed_data = [] # initialize the record list # Endpoints: funnels, revenue return results as dictionary for each date # Standardize results to a list/array if date_dictionary and data_key in data: results = {} results_list = [] for key, val in data[data_key].items(): # skip $overall summary if key != '$overall': val['date'] = key val['datetime'] = '{}T00:00:00Z'.format( key) results_list.append(val) results[data_key] = results_list data = results # Cohorts endpoint returns results as a list/array (no data_key) # All other endpoints have a data_key if data_key is None or data_key == '.': data_key = 'results' new_data = {'results': data} data = new_data transformed_data = [] # Loop through result records for record in data[data_key]: # transform reocord and append to transformed_data array transformed_record = transform_record( record, stream_name, project_timezone, parent_record) transformed_data.append(transformed_record) # Check for missing keys for key in id_fields: val = transformed_record.get(key) if val == '' or not val: LOGGER.error('Error: Missing Key') raise 'Missing Key' # End data record loop if not transformed_data or transformed_data is None or \ transformed_data == []: LOGGER.info( 'No transformed data for data = {}'.format( data)) # No transformed data results else: # has transformed data # Process records and get the max_bookmark_value and record_count max_bookmark_value, record_count = process_records( catalog=catalog, stream_name=stream_name, records=transformed_data, time_extracted=time_extracted, bookmark_field=bookmark_field, max_bookmark_value=max_bookmark_value, last_datetime=last_datetime) LOGGER.info( 'Stream {}, batch processed {} records'.format( stream_name, record_count)) # set total_records and pagination fields if page == 0: if isinstance(data, dict): total_records = data.get( 'total', record_count) else: total_records = record_count parent_total = parent_total + record_count date_total = date_total + record_count endpoint_total = endpoint_total + record_count if isinstance(data, dict): session_id = data.get('session_id', None) # to_rec: to record; ending record for the batch page if pagination: to_rec = offset + limit if to_rec > total_records: to_rec = total_records else: to_rec = record_count LOGGER.info( 'Synced Stream: {}, page: {}, {} to {} of total: {}' .format(stream_name, page, offset, to_rec, total_records)) # End has transformed data # End has data results # Pagination: increment the offset by the limit (batch-size) and page offset = offset + limit page = page + 1 # End page/batch loop # End stream != 'export' LOGGER.info('FINISHED: Stream: {}, parent_id: {}'.format( stream_name, parent_id)) LOGGER.info(' Total records for parent: {}'.format(parent_total)) # End parent record loop LOGGER.info('FINISHED Sync for Stream: {}{}'.format( stream_name, ', Date window from: {} to {}'.format(from_date, to_date) \ if bookmark_query_field_from else 
'')) LOGGER.info(' Total records for date window: {}'.format(date_total)) # Increment date window start_window = end_window next_end_window = end_window + timedelta(days=days_interval) if next_end_window > now_datetime: end_window = now_datetime else: end_window = next_end_window # Update the state with the max_bookmark_value for the stream if bookmark_field: write_bookmark(state, stream_name, max_bookmark_value) # End date window loop # Return endpoint_total across all batches return endpoint_total
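# NOTE: Sketch of the date_dictionary normalization in the Mixpanel sync_endpoint above: funnels and
# revenue endpoints return a dict keyed by date, which is flattened into a list of records carrying
# date/datetime fields, with the $overall summary dropped. Illustrative only.
def standardize_date_results(data, data_key='results'):
    """Convert {'results': {'2020-01-01': {...}, '$overall': {...}}}
    into {'results': [{'date': '2020-01-01', 'datetime': '...', ...}]}."""
    results_list = []
    for key, val in data.get(data_key, {}).items():
        if key == '$overall':  # skip the summary record
            continue
        val['date'] = key
        val['datetime'] = '{}T00:00:00Z'.format(key)
        results_list.append(val)
    return {data_key: results_list}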
def process_records( catalog, #pylint: disable=too-many-branches stream_name, records, time_extracted, bookmark_field=None, bookmark_type=None, max_bookmark_value=None, last_datetime=None, last_integer=None, parent=None, parent_id=None): stream = catalog.get_stream(stream_name) schema = stream.schema.to_dict() stream_metadata = metadata.to_map(stream.metadata) with metrics.record_counter(stream_name) as counter: for record in records: # If child object, add parent_id to record if parent_id and parent: record[parent + '_id'] = parent_id # Transform record for Singer.io with Transformer() as transformer: transformed_record = transformer.transform( record, schema, stream_metadata) # Reset max_bookmark_value to new value if higher if bookmark_field and (bookmark_field in transformed_record): bookmark_dttm = strptime_to_utc( transformed_record[bookmark_field]) if max_bookmark_value: max_bookmark_value_dttm = strptime_to_utc( max_bookmark_value) if bookmark_dttm > max_bookmark_value_dttm: max_bookmark_value = transformed_record[ bookmark_field] else: max_bookmark_value = transformed_record[bookmark_field] if bookmark_field and (bookmark_field in transformed_record): if bookmark_type == 'integer': # Keep only records whose bookmark is after the last_integer if transformed_record[bookmark_field] >= last_integer: write_record(stream_name, transformed_record, time_extracted=time_extracted) counter.increment() elif bookmark_type == 'datetime': last_dttm = transform_datetime(last_datetime) bookmark_dttm = transform_datetime( transformed_record[bookmark_field]) # Keep only records whose bookmark is after the last_datetime if bookmark_dttm >= last_dttm: write_record(stream_name, transformed_record, time_extracted=time_extracted) counter.increment() else: write_record(stream_name, transformed_record, time_extracted=time_extracted) counter.increment() return max_bookmark_value, len(records)
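# NOTE: Hypothetical usage of process_records above inside a paging loop: the returned
# max_bookmark_value is fed back in on each batch so the state is only advanced after all pages are
# processed. sync_pages and its pages argument are assumptions for illustration, not tap code.
from singer import utils

def sync_pages(catalog, stream_name, pages, bookmark_field, last_datetime):
    max_bookmark_value = last_datetime
    total = 0
    for page in pages:  # each page is a list of already-transformed records
        max_bookmark_value, count = process_records(
            catalog=catalog,
            stream_name=stream_name,
            records=page,
            time_extracted=utils.now(),
            bookmark_field=bookmark_field,
            bookmark_type='datetime',
            max_bookmark_value=max_bookmark_value,
            last_datetime=last_datetime)
        total += count
    return max_bookmark_value, total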
def advance_bookmark(worklogs):
    raise_if_bookmark_cannot_advance(worklogs)
    new_last_updated = max(
        utils.strptime_to_utc(w["updated"]) for w in worklogs)
    return new_last_updated
def update_bookmark(self, state, value):
    current_bookmark = self.get_bookmark(state)
    if value and utils.strptime_to_utc(value) > current_bookmark:
        write_bookmark(state, self.name, self.replication_key, value)
def sync_endpoint( client, #pylint: disable=too-many-branches catalog, state, start_date, stream_name, search_path, endpoint_config, bookmark_field=None, selected_streams=None): # Endpoint parameters bookmark_query_field = endpoint_config.get('bookmark_query_field', None) data_key = endpoint_config.get('data_key', stream_name) exclude_files = endpoint_config.get('exclude_files', []) csv_delimiter = endpoint_config.get('csv_delimiter', ',') skip_header_rows = endpoint_config.get('skip_header_rows', 0) activate_version_ind = endpoint_config.get('activate_version', False) alt_character_set = endpoint_config.get('alt_character_set', 'utf-8') # LOGGER.info('data_key = {}'.format(data_key)) # Get the latest bookmark for the stream and set the last_datetime last_datetime = get_bookmark(state, stream_name, start_date) last_dttm = strptime_to_utc(last_datetime) timezone = pytz.timezone('UTC') bookmark_dttm = utils.now() # Initialize bookmark_dttn max_bookmark_value = None # Convert to GitHub date format, example: Sun, 13 Oct 2019 22:40:01 GMT last_modified = last_dttm.strftime("%a, %d %b %Y %H:%M:%S %Z'") LOGGER.info('HEADER If-Modified-Since: {}'.format(last_modified)) # Write schema and log selected fields for stream write_schema(catalog, stream_name) selected_fields = get_selected_fields(catalog, stream_name) LOGGER.info('Stream: {}, selected_fields: {}'.format( stream_name, selected_fields)) # pagination: loop thru all pages of data using next_url (if not None) page = 1 offset = 0 file_count = 0 total_records = 0 next_url = '{}/{}'.format(client.base_url, search_path) # Loop through all search items pages (while there are more pages, next_url) # and until bookmark_dttm < last_dttm first_record = True while next_url is not None and bookmark_dttm >= last_dttm: LOGGER.info('Search URL for Stream {}: {}'.format( stream_name, next_url)) # API request search_data search_data = {} search_data, next_url, search_last_modified = client.get( url=next_url, endpoint=stream_name) LOGGER.info('next_url = {}'.format(next_url)) # LOGGER.info('search_data = {}'.format(search_data)) # COMMENT OUT # time_extracted: datetime when the data was extracted from the API time_extracted = utils.now() search_items = search_data.get(data_key, []) if not search_items: LOGGER.info('Stream: {}, no files found'.format(stream_name)) break # No data results i = 0 # i = search item number item_total = len(search_items) # Loop through all search items until bookmark_dttm < last_dttm while i <= (item_total - 1) and bookmark_dttm >= last_dttm: item = search_items[i] file_name = item.get('name') # Skip excluded files if file_name in exclude_files: i = i + 1 if i > (item_total - 1): break else: item = search_items[i] csv_records = [] file_count = file_count + 1 # url (content url) is preferable to git_url (blob url) b/c it provides # last-modified header for bookmark # However, git_url allows for up to 100 MB files; url allows for up to 1 MB files # Therefore, we use the git_url (blob) endpoint # And make another call to the commits endpoint to get last-modified file_url = item.get('git_url') git_repository = item.get('repository', {}).get('name') git_owner = item.get('repository', {}).get('owner', {}).get('login') file_path = item.get('path') file_sha = item.get('sha') file_name = item.get('name') file_html_url = item.get('html_url') headers = {} if bookmark_query_field: headers[bookmark_query_field] = last_modified # API request commits_data for single-file, to get file last_modified commit_url = 
'{}/repos/{}/{}/commits?path={}'.format( client.base_url, git_owner, git_repository, file_path) LOGGER.info('Commit URL for Stream {}: {}'.format( stream_name, commit_url)) commit_data, commits_next_url, commit_last_modified = client.get( url=commit_url, headers=headers, endpoint='{}_commits'.format(stream_name)) # Bookmarking: search data (and commit data) sorted by last-modified desc # 1st item on 1st page sets max_bookmark_value = last-modified bookmark_dttm = strptime_to_utc(commit_last_modified) if first_record and bookmark_dttm > last_dttm: max_bookmark_value = commit_last_modified max_bookmark_dttm = bookmark_dttm max_bookmark_epoch = int( (max_bookmark_dttm - timezone.localize(datetime(1970, 1, 1))).total_seconds()) # For some streams (activate_version = True): # Emit a Singer ACTIVATE_VERSION message before initial sync (but not subsequent syncs) # everytime after each sheet sync is complete. # This forces hard deletes on the data downstream if fewer records are sent. # https://github.com/singer-io/singer-python/blob/master/singer/messages.py#L137 if activate_version_ind: if last_datetime == start_date: activate_version = 0 else: activate_version = max_bookmark_epoch activate_version_message = singer.ActivateVersionMessage( stream=stream_name, version=activate_version) if last_datetime == start_date: # initial load, send activate_version before AND after data sync singer.write_message(activate_version_message) LOGGER.info( 'INITIAL SYNC, Stream: {}, Activate Version: {}'. format(stream_name, activate_version)) else: activate_version = None # End: if first_record and bookmark_dttm > last_dttm if commit_data and bookmark_dttm >= last_dttm: # API request file_data for item, single-file (ignore file_next_url) file_data = {} headers = {} LOGGER.info('File URL for Stream {}: {}'.format( stream_name, file_url)) file_data, file_next_url, file_last_modified = client.get( url=file_url, headers=headers, endpoint=stream_name) # LOGGER.info('file_data: {}'.format(file_data)) # TESTING ONLY - COMMENT OUT if file_data: # Read, decode, and parse content blob to json content = file_data.get('content') content_list = [] if content: content_b64 = base64.b64decode(content) # Italian files typically use character_set: utf-8 # However, some newer files use character_set: latin_1 # All other files use character_set: utf-8 (default) try: content_str = content_b64.decode('utf-8') except UnicodeDecodeError as err: LOGGER.warning( 'UTF-8 UNICODE DECODE ERROR: {}'.format(err)) # Try decoding with Alternate Character Set (from streams.py) content_str = content_b64.decode(alt_character_set) content_array = content_str.splitlines() content_array_sliced = content_array[skip_header_rows:] reader = csv.DictReader(content_array_sliced, delimiter=csv_delimiter) content_list = [r for r in reader] LOGGER.info('Retrieved file_name: {}'.format(file_name)) # LOGGER.info('file_data: {}'.format(file_data)) # TESTING ONLY - COMMENT OUT # Loop thru and append csv records row_number = 1 for record in content_list: record['git_owner'] = git_owner record['git_repository'] = git_repository record['git_url'] = file_url record['git_html_url'] = file_html_url record['git_path'] = file_path record['git_sha'] = file_sha record['git_file_name'] = file_name record['git_last_modified'] = commit_last_modified record['__sdc_row_number'] = row_number # Transform record and append transformed_csv_record = {} try: transformed_csv_record = transform_record( stream_name, record) except Exception as err: LOGGER.error( 'Transform Record error: 
{}, Stream: {}'. format(err, stream_name)) LOGGER.error('record: {}'.format(record)) raise err # Bad records and totals if transformed_csv_record is None: continue csv_records.append(transformed_csv_record) row_number = row_number + 1 # End if file_data record_count = process_records(catalog=catalog, stream_name=stream_name, records=csv_records, time_extracted=time_extracted, version=activate_version) LOGGER.info('Stream {}, batch processed {} records'.format( stream_name, record_count)) total_records = total_records + record_count # End if commit_data first_record = False i = i + 1 # Next search item record # End: while i <= (item_total - 1) and bookmark_dttm >= last_dttm # to_rec: to record; ending record for the batch page to_rec = offset + file_count LOGGER.info('Synced Stream: {}, page: {}, records: {} to {}'.format( stream_name, page, offset, to_rec)) # Pagination: increment the offset by the limit (batch-size) and page offset = offset + file_count page = page + 1 # End: next_url is not None and bookmark_dttm >= last_dttm if file_count > 0 and max_bookmark_value: # End of Stream: Send Activate Version (if needed) and update State if activate_version_ind: singer.write_message(activate_version_message) write_bookmark(state, stream_name, max_bookmark_value) else: LOGGER.warning('NO NEW DATA FOR STREAM: {}'.format(stream_name)) write_bookmark(state, stream_name, last_datetime) # Return total_records across all pages LOGGER.info( 'Synced Stream: {}, TOTAL pages: {}, file count: {}, total records: {}' .format(stream_name, page - 1, file_count, total_records)) return total_records
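# NOTE: Minimal sketch of the blob-to-CSV step in the sync above: decode the base64 content, fall
# back to an alternate character set on UnicodeDecodeError, skip leading header rows, and parse with
# csv.DictReader. Illustrative only; the real tap also stamps git metadata onto each row.
import base64
import csv

def csv_records_from_blob(content_b64, csv_delimiter=',',
                          skip_header_rows=0, alt_character_set='latin_1'):
    raw = base64.b64decode(content_b64)
    try:
        content_str = raw.decode('utf-8')
    except UnicodeDecodeError:
        # Some files are not UTF-8; fall back to the configured character set
        content_str = raw.decode(alt_character_set)
    lines = content_str.splitlines()[skip_header_rows:]
    reader = csv.DictReader(lines, delimiter=csv_delimiter)
    return list(reader)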
def sync_stream(stream_name): """ Sync each stream, looking for newly created records. Updates are captured by events stream. """ LOGGER.info("Started syncing stream %s", stream_name) stream_metadata = metadata.to_map( Context.get_catalog_entry(stream_name)['metadata']) stream_field_whitelist = json.loads( Context.config.get('whitelist_map', '{}')).get(stream_name) extraction_time = singer.utils.now() replication_key = metadata.get(stream_metadata, (), 'valid-replication-keys')[0] # Invoice Items bookmarks on `date`, but queries on `created` filter_key = 'created' if stream_name == 'invoice_items' else replication_key stream_bookmark = singer.get_bookmark(Context.state, stream_name, replication_key) or \ int(utils.strptime_to_utc(Context.config["start_date"]).timestamp()) bookmark = stream_bookmark # if this stream has a sub_stream, compare the bookmark sub_stream_name = SUB_STREAMS.get(stream_name) # If there is a sub-stream and its selected, get its bookmark (or the start date if no bookmark) should_sync_sub_stream = sub_stream_name and Context.is_selected( sub_stream_name) if should_sync_sub_stream: sub_stream_bookmark = singer.get_bookmark(Context.state, sub_stream_name, replication_key) \ or int(utils.strptime_to_utc(Context.config["start_date"]).timestamp()) # if there is a sub stream, set bookmark to sub stream's bookmark # since we know it must be earlier than the stream's bookmark if sub_stream_bookmark != stream_bookmark: bookmark = sub_stream_bookmark else: sub_stream_bookmark = None with Transformer( singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer: end_time = dt_to_epoch(utils.now()) window_size = int( Context.config.get('date_window_size', DEFAULT_DATE_WINDOW_SIZE)) if DEFAULT_DATE_WINDOW_SIZE != window_size: LOGGER.info('Using non-default date window size of %d', window_size) start_window = bookmark # NB: Immutable streams are never synced for updates. We've # observed a short lag period between when records are created and # when they are available via the API, so these streams will need # a short lookback window. if stream_name in IMMUTABLE_STREAMS: # pylint:disable=fixme # TODO: This may be an issue for other streams' created_at # entries, but to keep the surface small, doing this only for # immutable streams at first to confirm the suspicion. start_window -= IMMUTABLE_STREAM_LOOKBACK # NB: We observed records coming through newest->oldest and so # date-windowing was added and the tap only bookmarks after it has # gotten through a date window while start_window < end_time: stop_window = dt_to_epoch( epoch_to_dt(start_window) + timedelta(days=window_size)) # cut off the last window at the end time if stop_window > end_time: stop_window = end_time for stream_obj in paginate( STREAM_SDK_OBJECTS[stream_name]['sdk_object'], filter_key, start_window, stop_window): # get the replication key value from the object rec = unwrap_data_objects(stream_obj.to_dict_recursive()) rec = reduce_foreign_keys(rec, stream_name) stream_obj_created = rec[replication_key] rec['updated'] = stream_obj_created # sync stream if object is greater than or equal to the bookmark if stream_obj_created >= stream_bookmark: rec = transformer.transform( rec, Context.get_catalog_entry(stream_name)['schema'], stream_metadata) # At this point, the record has been transformed and so # any de-selected fields have been pruned. Now, prune off # any fields that aren't present in the whitelist. 
if stream_field_whitelist: rec = apply_whitelist(rec, stream_field_whitelist) singer.write_record(stream_name, rec, time_extracted=extraction_time) Context.new_counts[stream_name] += 1 # sync sub streams if it is selected and the parent object # is greater than its bookmark if should_sync_sub_stream and stream_obj_created > sub_stream_bookmark: sync_sub_stream(sub_stream_name, stream_obj) # Update stream/sub-stream bookmarks to the stop window if stop_window > stream_bookmark: stream_bookmark = stop_window singer.write_bookmark(Context.state, stream_name, replication_key, stream_bookmark) # the sub stream bookmarks on its parent if should_sync_sub_stream and stop_window > sub_stream_bookmark: sub_stream_bookmark = stop_window singer.write_bookmark(Context.state, sub_stream_name, replication_key, sub_stream_bookmark) singer.write_state(Context.state) # update window for next iteration start_window = stop_window singer.write_state(Context.state)
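# NOTE: The windowing in sync_stream above works on Unix epoch seconds. A hedged sketch of the
# epoch/datetime helpers it relies on; the tap's own dt_to_epoch/epoch_to_dt live elsewhere and may
# differ in detail.
from datetime import datetime, timedelta, timezone

def dt_to_epoch(dt):
    # Stripe list filters accept Unix timestamps in seconds
    return int(dt.timestamp())

def epoch_to_dt(epoch_ts):
    return datetime.fromtimestamp(epoch_ts, timezone.utc)

# Example: compute the stop of a 30-day window starting at a bookmark
start_window = dt_to_epoch(datetime(2021, 1, 1, tzinfo=timezone.utc))
stop_window = dt_to_epoch(epoch_to_dt(start_window) + timedelta(days=30))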
def sync_transactions():
    schema = load_schema("transactions")

    singer.write_schema("transactions",
                        schema, ["id"],
                        bookmark_properties=['created_at'])

    latest_updated_at = utils.strptime_to_utc(
        STATE.get('latest_updated_at', DEFAULT_TIMESTAMP))
    run_maximum_updated_at = latest_updated_at

    latest_disbursement_date = utils.strptime_to_utc(
        STATE.get('latest_disbursement_date', DEFAULT_TIMESTAMP))
    run_maximum_disbursement_date = latest_disbursement_date

    latest_start_date = utils.strptime_to_utc(get_start("transactions"))

    period_start = latest_start_date - TRAILING_DAYS
    period_end = utils.now()

    logger.info("transactions: Syncing from {}".format(period_start))
    logger.info(
        "transactions: latest_updated_at from {}, disbursement_date from {}".
        format(latest_updated_at, latest_disbursement_date))
    logger.info(
        "transactions: latest_start_date from {}".format(latest_start_date))

    # increment through each day (20k results max from api)
    for start, end in daterange(period_start, period_end):
        end = min(end, period_end)
        data = braintree.Transaction.search(
            braintree.TransactionSearch.created_at.between(start, end))
        time_extracted = utils.now()

        logger.info("transactions: Fetched {} records from {} - {}".format(
            data.maximum_size, start, end))

        row_written_count = 0
        row_skipped_count = 0

        for row in data:
            # Ensure updated_at consistency
            if not getattr(row, 'updated_at'):
                row.updated_at = row.created_at

            transformed = transform_row(row, schema)
            updated_at = to_utc(row.updated_at)

            # if disbursement is successful, get disbursement date
            # set disbursement datetime to min if not found
            if row.disbursement_details is None:
                disbursement_date = datetime.min
            else:
                if row.disbursement_details.disbursement_date is None:
                    row.disbursement_details.disbursement_date = datetime.min

                disbursement_date = to_utc(
                    datetime.combine(
                        row.disbursement_details.disbursement_date,
                        datetime.min.time()))

            # Is this more recent than our past stored value of updated_at?
            # Is this more recent than our past stored value of disbursement_date?
            # Use >= for updated_at due to non-monotonic updated_at values
            # Use >= for disbursement_date to capture all transactions
            # disbursed at the same time
            # Update our high water mark for updated_at and disbursement_date
            # in this run
            if (updated_at >= latest_updated_at) or (
                    disbursement_date >= latest_disbursement_date):
                if updated_at > run_maximum_updated_at:
                    run_maximum_updated_at = updated_at

                if disbursement_date > run_maximum_disbursement_date:
                    run_maximum_disbursement_date = disbursement_date

                singer.write_record("transactions",
                                    transformed,
                                    time_extracted=time_extracted)
                row_written_count += 1
            else:
                row_skipped_count += 1

        logger.info("transactions: Written {} records from {} - {}".format(
            row_written_count, start, end))
        logger.info("transactions: Skipped {} records from {} - {}".format(
            row_skipped_count, start, end))
    # End day loop

    logger.info("transactions: Complete. Last updated record: {}".format(
        run_maximum_updated_at))
    logger.info("transactions: Complete. Last disbursement date: {}".format(
        run_maximum_disbursement_date))

    latest_updated_at = run_maximum_updated_at
    latest_disbursement_date = run_maximum_disbursement_date

    STATE['latest_updated_at'] = utils.strftime(latest_updated_at)
    STATE['latest_disbursement_date'] = utils.strftime(
        latest_disbursement_date)

    utils.update_state(STATE, "transactions", utils.strftime(end))
    singer.write_state(STATE)
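# NOTE: A plausible daterange generator consistent with the per-day loop in sync_transactions above
# (the API caps searches at roughly 20k results, hence the day-sized steps). The tap's own helper may
# differ; this version is illustrative only.
from datetime import timedelta

def daterange(start_date, end_date, step_days=1):
    """Yield (start, end) pairs that walk from start_date to end_date one
    step at a time, mirroring the per-day search loop above."""
    current = start_date
    while current < end_date:
        yield current, current + timedelta(days=step_days)
        current = current + timedelta(days=step_days)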
def sync(client, config, catalog, state): LOGGER.info('Starting Sync..') selected_streams = catalog.get_selected_streams(state) streams = [] stream_keys = [] with Transformer() as transformer: for catalog_entry in selected_streams: streams.append(catalog_entry) stream_keys.append(catalog_entry.stream) for catalog_entry in streams: stream = AVAILABLE_STREAMS[catalog_entry.stream](client=client, config=config, catalog=catalog, state=state) LOGGER.info('Syncing stream: %s', catalog_entry.stream) stream.update_currently_syncing(stream.name) stream.write_state() stream_schema = catalog_entry.schema.to_dict() stream.write_schema() stream_metadata = metadata.to_map(catalog_entry.metadata) bookmark_date = stream.get_bookmark(stream.name, config['start_date']) bookmark_dttm = strptime_to_utc(bookmark_date) max_bookmark_value = None with singer.metrics.record_counter( endpoint=stream.name) as counter: if stream.replication_method == 'FULL_TABLE': for page in stream.sync(client): for record in page: singer.write_record( catalog_entry.stream, transformer.transform( record, stream_schema, stream_metadata, )) counter.increment() else: for page in stream.sync(client, bookmark_date): for record in page: if not max_bookmark_value: max_bookmark_value = bookmark_date max_bookmark_dttm = strptime_to_utc( max_bookmark_value) record_timestamp = stream.max_from_replication_dates( record) if record_timestamp > max_bookmark_dttm: max_bookmark_value = strftime(record_timestamp) if record_timestamp >= bookmark_dttm: singer.write_record( catalog_entry.stream, transformer.transform( record, stream_schema, stream_metadata, )) counter.increment() stream.update_bookmark(stream.name, max_bookmark_value) stream.write_state() stream.update_currently_syncing(None) stream.write_state() LOGGER.info('Finished Sync..')
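# NOTE: The incremental branch above compares each record against the bookmark using
# stream.max_from_replication_dates. A plausible standalone version of that helper is sketched
# below, assuming it simply takes the latest of the record's replication-date values; the stream
# class's actual implementation may differ.
from singer.utils import strptime_to_utc

def max_from_replication_dates(record, replication_keys):
    """Return the latest of the record's replication-date values as a datetime."""
    values = [
        strptime_to_utc(record[key])
        for key in replication_keys
        if record.get(key)
    ]
    return max(values)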
def subtract_day(self, bookmark):
    bookmark_dt = strptime_to_utc(bookmark)
    adjusted_bookmark = bookmark_dt - timedelta(days=1)
    return strftime(adjusted_bookmark)
def process_args(): # Parse command line arguments args = utils.parse_args(REQUIRED_CONFIG_KEYS) # Check for errors on the provided config # params that utils.parse_args is letting through if not args.config.get('start_date'): LOGGER.critical( "tap-google-analytics: a valid start_date must be provided.") sys.exit(1) if not (args.config.get('reports') or args.catalog): LOGGER.critical( "tap-google-analytics: a catalog or report must be provided.") sys.exit(1) if not args.config.get('key_file_location') and \ not args.config.get('oauth_credentials'): LOGGER.critical( "tap-google-analytics: a valid key_file_location string or \ oauth_credentials object must be provided.") sys.exit(1) # Remove optional args that have empty strings as values if 'reports' in args.config and not args.config.get('reports'): del args.config['reports'] if 'end_date' in args.config and not args.config.get('end_date'): del args.config['end_date'] # Process the [start_date, end_date) so that they define an open date # window that ends yesterday if end_date is not defined start_date = utils.strptime_to_utc(args.config['start_date']) args.config['start_date'] = utils.strftime(start_date, '%Y-%m-%d') end_date = args.config.get('end_date', utils.strftime(utils.now())) end_date = utils.strptime_to_utc(end_date) - datetime.timedelta(days=1) args.config['end_date'] = utils.strftime(end_date, '%Y-%m-%d') if end_date < start_date: LOGGER.critical( "tap-google-analytics: start_date '{}' > end_date '{}'".format( start_date, end_date)) sys.exit(1) # If using a service account, validate that the client_secrets.json file # exists and load it if args.config.get('key_file_location'): if Path(args.config['key_file_location']).is_file(): try: args.config['client_secrets'] = load_json( args.config['key_file_location']) except ValueError: LOGGER.critical( "tap-google-analytics: The JSON definition in '{}' has \ errors".format(args.config['key_file_location'])) sys.exit(1) else: LOGGER.critical("tap-google-analytics: '{}' file not found".format( args.config['key_file_location'])) sys.exit(1) else: # If using oauth credentials, verify that all required keys are present credentials = args.config['oauth_credentials'] for key in [ 'access_token', 'refresh_token', 'client_id', 'client_secret' ]: if not credentials.get(key): LOGGER.critical(f"tap-google-analytics: a valid {key} for the \ oauth_credentials must be provided.") sys.exit(1) return args
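# NOTE: Small sketch of the [start_date, end_date) handling in process_args above: when no end_date
# is configured, the window ends yesterday. This re-derives the same computation with singer utils
# for illustration; it is not the tap's own helper.
import datetime
from singer import utils

def default_report_window(start_date_str, end_date_str=None):
    """Return (start, end) date strings for an open window [start, end)
    that ends yesterday when no end_date is configured."""
    start = utils.strptime_to_utc(start_date_str)
    end_str = end_date_str or utils.strftime(utils.now())
    end = utils.strptime_to_utc(end_str) - datetime.timedelta(days=1)
    return utils.strftime(start, '%Y-%m-%d'), utils.strftime(end, '%Y-%m-%d')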
def test_run(self): """ Verify that we can get multiple pages of data for each stream """ conn_id = connections.ensure_connection(self) self.run_and_verify_check_mode(conn_id) self.select_and_verify_fields(conn_id) first_sync_record_count = self.run_and_verify_sync(conn_id) first_sync_bookmarks = menagerie.get_state(conn_id) first_sync_records = runner.get_records_from_target_output() new_bookmarks = {} for stream_name, current_bookmark in first_sync_bookmarks[ 'bookmarks'].items(): if stream_name == 'gl_accounts': new_gl_bookmarks = { sub_stream: self.subtract_day(sub_bookmark) for sub_stream, sub_bookmark in current_bookmark.items() } new_bookmarks[stream_name] = new_gl_bookmarks else: new_bookmarks[stream_name] = self.subtract_day( current_bookmark) new_state = {"bookmarks": new_bookmarks} # Ensure the test is not the first to post a state poll_state_version(conn_id) menagerie.set_state(conn_id, new_state) # Run a sync job using orchestrator second_sync_record_count = self.run_and_verify_sync(conn_id) second_sync_bookmarks = menagerie.get_state(conn_id) second_sync_records = runner.get_records_from_target_output() for stream in self.expected_sync_streams(): with self.subTest(stream=stream): replication_method = self.expected_replication_method().get( stream) first_sync_count = first_sync_record_count.get(stream, 0) second_sync_count = second_sync_record_count.get(stream, 0) first_sync_messages = first_sync_records.get(stream, {}).get( 'messages', []) second_sync_messages = second_sync_records.get(stream, {}).get( 'messages', []) if replication_method == self.INCREMENTAL: replication_key = self.expected_replication_keys().get( stream).pop() first_sync_bookmark_value = first_sync_bookmarks[ 'bookmarks'][stream] second_sync_bookmark_value = second_sync_bookmarks[ 'bookmarks'][stream] simulated_bookmark_value = new_state['bookmarks'][stream] # Verify the both syncs end on the same bookmark self.assertEqual(first_sync_bookmark_value, second_sync_bookmark_value) # Verify that first sync records fall betwen the start date and the final # bookmark value for message in first_sync_messages: lower_bound = strptime_to_utc( self.get_properties()['start_date']) actual_value = strptime_to_utc( message.get('data').get(replication_key)) upper_bound = strptime_to_utc( first_sync_bookmark_value) self.assertTrue( lower_bound <= actual_value <= upper_bound, msg= "First sync records fall outside of expected sync window" ) # Verify the second sync records fall between simulated bookmark value and the # final bookmark value for message in second_sync_messages: lower_bound = strptime_to_utc(simulated_bookmark_value) actual_value = strptime_to_utc( message.get('data', {}).get(replication_key)) upper_bound = strptime_to_utc( second_sync_bookmark_value) self.assertTrue( lower_bound <= actual_value <= upper_bound, msg= "Second sync records fall outside of expected sync window" ) # Verify the number of records in the 2nd sync is less then the first self.assertLess(second_sync_count, first_sync_count) # Verify at least 1 record was replicated in the second sync self.assertGreater( second_sync_count, 0, msg="We are not fully testing bookmarking for {}". format(stream)) elif replication_method == self.FULL_TABLE: # Verify no bookmark exists self.assertNotIn(stream, first_sync_bookmarks['bookmarks']) self.assertNotIn(stream, second_sync_bookmarks['bookmarks']) else: raise NotImplementedError( "invalid replication method: {}".format( replication_method))
def sync_engagements(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) mdata = metadata.to_map(catalog.get('metadata')) if "schema" in catalog: schema = catalog["schema"] else: schema = load_schema('engagements') bookmark_key = 'lastUpdated' singer.write_schema("engagements", schema, ["engagement_id"], [bookmark_key], catalog.get('stream_alias')) start = get_start(STATE, "engagements", bookmark_key) # Because this stream doesn't query by `lastUpdated`, it cycles # through the data set every time. The issue with this is that there # is a race condition by which records may be updated between the # start of this table's sync and the end, causing some updates to not # be captured, in order to combat this, we must store the current # sync's start in the state and not move the bookmark past this value. current_sync_start = get_current_sync_start(STATE, "engagements") or utils.now() STATE = write_current_sync_start(STATE, "engagements", current_sync_start) singer.write_state(STATE) max_bk_value = start LOGGER.info("sync_engagements from %s", start) STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, start) singer.write_state(STATE) url = get_url("engagements_all") params = {'limit': 250} top_level_key = "results" engagements = gen_request(STATE, 'engagements', url, params, top_level_key, "hasMore", ["offset"], ["offset"]) time_extracted = utils.now() with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: for engagement in engagements: record = bumble_bee.transform( lift_properties_and_versions(engagement), schema, mdata) if record['engagement'][bookmark_key] >= start: # hoist PK and bookmark field to top-level record record['engagement_id'] = record['engagement']['id'] record[bookmark_key] = record['engagement'][bookmark_key] singer.write_record("engagements", record, catalog.get('stream_alias'), time_extracted=time_extracted) if record['engagement'][bookmark_key] >= max_bk_value: max_bk_value = record['engagement'][bookmark_key] # Don't bookmark past the start of this sync to account for updated records during the sync. new_bookmark = min(utils.strptime_to_utc(max_bk_value), current_sync_start) STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, utils.strftime(new_bookmark)) STATE = write_current_sync_start(STATE, 'engagements', None) singer.write_state(STATE) return STATE
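# NOTE: Small sketch of the hoisting step in sync_engagements above: the primary key and bookmark
# live under the nested 'engagement' object, so they are copied to the top level before the record
# is written. Illustrative only.
def hoist_engagement_fields(record, bookmark_key='lastUpdated'):
    # Copy the nested PK and bookmark up so targets can key and filter on them
    record['engagement_id'] = record['engagement']['id']
    record[bookmark_key] = record['engagement'][bookmark_key]
    return record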
def sync_endpoint(client, config, catalog, state, stream_name, endpoint_config, sync_streams, selected_streams, timezone_desc=None, parent_id=None): # endpoint_config variables base_path = endpoint_config.get('path', stream_name) bookmark_field = next(iter(endpoint_config.get('replication_keys', [])), None) params = endpoint_config.get('params', {}) paging = endpoint_config.get('paging', False) bookmark_query_field_from = endpoint_config.get( 'bookmark_query_field_from') bookmark_query_field_to = endpoint_config.get('bookmark_query_field_to') targeting_group = endpoint_config.get('targeting_group') targeting_type = endpoint_config.get('targeting_type') targeting_country_ind = endpoint_config.get('targeting_country_ind', False) data_key_array = endpoint_config.get('data_key_array') data_key_record = endpoint_config.get('data_key_record').format( targeting_type=targeting_type) id_fields = endpoint_config.get('key_properties') parent = endpoint_config.get('parent') date_window_size = int(endpoint_config.get('date_window_size', '1')) # tap config variabless start_date = config.get('start_date') swipe_up_attribution_window = config.get('swipe_up_attribution_window', '28_DAY') view_attribution_window = config.get('view_attribution_window', '7_DAY') swipe_up_attr = int(swipe_up_attribution_window.replace('_DAY', '')) if view_attribution_window in ( '1_HOUR', '3_HOUR', '6_HOUR', ): view_attr = 1 else: view_attr = int(view_attribution_window.replace('_DAY', '')) attribution_window = max(1, swipe_up_attr, view_attr) omit_empty = config.get('omit_empty', 'true') if '_stats_' in stream_name: params['omit_empty'] = omit_empty country_codes = config.get('targeting_country_codes', 'us').replace(' ', '').lower() if targeting_country_ind: country_code_list = country_codes.split(',') else: country_code_list = ['none'] # Get the timezone and latest bookmark for the stream if not timezone_desc: timezone = pytz.timezone('UTC') else: timezone = pytz.timezone(timezone_desc) LOGGER.info('timezone = {}'.format(timezone)) last_datetime = get_bookmark(state, stream_name, start_date, bookmark_field, parent, parent_id) max_bookmark_value = last_datetime # Convert to datetimes in local/ad account timezone now_datetime = utils.now() last_dttm = strptime_to_utc(last_datetime) report_granularity = params.get('granularity', 'HOUR') if '_stats_' in stream_name: LOGGER.info('report_granularity: {}'.format(report_granularity)) if bookmark_query_field_from and bookmark_query_field_to: # date_window_size: Number of days in each date window # Set start window start_window = now_datetime - timedelta(days=attribution_window) if last_dttm < start_window: start_window = last_dttm # Set end window end_window = start_window + timedelta(days=date_window_size) else: start_window = last_dttm end_window = now_datetime diff_sec = (end_window - start_window).seconds date_window_size = math.ceil( diff_sec / (3600 * 24)) # round-up difference to days endpoint_total = 0 total_records = 0 while start_window < now_datetime: LOGGER.info('START Sync for Stream: {}{}'.format( stream_name, ', Date window from: {} to {}'.format(start_window.date(), end_window.date()) \ if bookmark_query_field_from else '')) if bookmark_query_field_from and bookmark_query_field_to: # Query parameter startDate and endDate must be in Eastern time zone # API will error if future dates are requested if report_granularity == 'DAY': window_start_dt_str = remove_hours_local( start_window, timezone) window_end_dt_str = remove_hours_local(end_window, timezone) if 
window_start_dt_str == window_end_dt_str: window_end_dt_str = remove_hours_local( end_window + timedelta(days=1), timezone) else: window_start_dt_str = remove_minutes_local( start_window, timezone) window_end_dt_str = remove_minutes_local(end_window, timezone) if window_start_dt_str == window_end_dt_str: window_end_dt_str = remove_hours_local( end_window + timedelta(hours=1), timezone) params[bookmark_query_field_from] = window_start_dt_str params[bookmark_query_field_to] = window_end_dt_str # This loop will run once for non-country_code endpoints # and one or more times (for each country) for country_code endpoints for country_code in country_code_list: # Path if stream_name.startswith('targeting_'): path = base_path.format(targeting_group=targeting_group, targeting_type=targeting_type, country_code=country_code, parent_id=parent_id) else: path = base_path.format(country_code=country_code, parent_id=parent_id) # pagination: loop thru all pages of data using next (if not None) # Reference: https://developers.snapchat.com/api/docs/#pagination total_records = 0 offset = 1 page = 1 if paging: limit = 500 # Allowed values: 50 - 1000 params['limit'] = limit else: limit = None for key, val in params.items(): # Replace variables in params new_val = str(val).format( swipe_up_attribution_window=swipe_up_attribution_window, view_attribution_window=view_attribution_window) params[key] = new_val # concate params querystring = '&'.join( ['%s=%s' % (key, value) for (key, value) in params.items()]) # initialize next_url next_url = '{}/{}?{}'.format(client.base_url, path, querystring) # pagination loop while next_url is not None: # API request data data = {} try: data = client.get(url=next_url, endpoint=stream_name) except Exception as err: LOGGER.error('{}'.format(err)) LOGGER.error('URL for Stream {}: {}'.format( stream_name, next_url)) raise Exception(err) # time_extracted: datetime when the data was extracted from the API time_extracted = utils.now() if not data or data is None or data == {}: LOGGER.info('No data results returned') total_records = 0 break # No data results request_status = data.get('request_status') if request_status != 'SUCCESS': raise RuntimeError(data) # Get pagination next_url next_url = data.get('paging', {}).get('next_link', None) # Transform data with transform_json from transform.py # The data_key_array identifies the array/list of records below the <root> element # LOGGER.info('data = {}'.format(data)) # TESTING, comment out transformed_data = [] # initialize the record list # Reports stats streams de-nesting if '_stats_' in stream_name: for data_record in data.get(data_key_array, []): base_record = data_record.get(data_key_record, {}) records = base_record.get('timeseries', []) for record in records: # Add parent base_record fields to record for key, val in base_record.items(): if key not in ('start_time', 'end_time', 'timeseries'): record[key] = val # De-nest stats stats = record.get('stats', {}) for key, val in stats.items(): record[key] = val record.pop('stats', None) # transform record try: transformed_record = humps.decamelize(record) except Exception as err: LOGGER.error('{}'.format(err)) # LOGGER.error('error record: {}'.format(record)) # COMMENT OUT raise Exception(err) # verify primary_keys are in tansformed_record if 'id' not in transformed_record or 'start_time' not in transformed_record: LOGGER.error( 'Stream: {}, Missing key (id or start_time)' .format(stream_name)) LOGGER.error('transformed_record: {}'.format( transformed_record)) raise RuntimeError 
                            transformed_data.append(transformed_record)
                        # End for record in records
                    # End for data_record in array
                # End stats stream

                # Other streams de-nesting
                else:  # Not a stats stream
                    for data_record in data.get(data_key_array, []):
                        sub_request_status = data_record.get('sub_request_status')
                        if sub_request_status != 'SUCCESS':
                            raise RuntimeError(data_record)
                        record = data_record.get(data_key_record, {})

                        # Transforms to align schemas for targeting streams
                        if stream_name.startswith('targeting_'):
                            record['targeting_group'] = targeting_group
                            record['targeting_type'] = targeting_type
                            if country_code != 'none':
                                record['country_code'] = country_code
                            if targeting_group == 'geo':
                                record_id = record.get(targeting_type, {}).get('id')
                                record_name = record.get(targeting_type, {}).get('name')
                                record['id'] = record_id
                                record['name'] = record_name
                            if targeting_type == 'postal_code':
                                record_id = record.get('postalCode')
                                record['id'] = record_id
                                record['name'] = record_id
                                record.pop('postalCode')

                        # Add parent id field/value
                        if parent and parent_id:
                            parent_key = '{}_id'.format(parent)
                            record[parent_key] = parent_id

                        # Transform record (remove inconsistent use of CamelCase)
                        try:
                            transformed_record = humps.decamelize(record)
                        except Exception as err:
                            LOGGER.error('{}'.format(err))
                            LOGGER.error('error record: {}'.format(record))
                            raise Exception(err)

                        # Verify primary keys are in transformed_record
                        for key in id_fields:
                            if not transformed_record.get(key):
                                LOGGER.error('Stream: {}, Missing key {}'.format(
                                    stream_name, key))
                                LOGGER.info('transformed_record: {}'.format(transformed_record))
                                raise RuntimeError('Missing primary key {}'.format(key))

                        transformed_data.append(transformed_record)
                    # End for data_record in array
                # End non-stats stream

                # LOGGER.info('transformed_data = {}'.format(transformed_data))  # COMMENT OUT
                if not transformed_data:
                    LOGGER.info('No transformed data for data = {}'.format(data))
                    total_records = 0
                    break  # No transformed_data results

                # Process records and get the max_bookmark_value and record_count
                record_count = 0  # default when the parent stream itself is not in sync_streams
                if stream_name in sync_streams:
                    max_bookmark_value, record_count = process_records(
                        catalog=catalog,
                        stream_name=stream_name,
                        records=transformed_data,
                        time_extracted=time_extracted,
                        bookmark_field=bookmark_field,
                        max_bookmark_value=max_bookmark_value,
                        last_datetime=last_datetime)
                    LOGGER.info('Stream {}, batch processed {} records'.format(
                        stream_name, record_count))

                # Loop through parent batch records for each child stream (if selected for sync)
                children = endpoint_config.get('children')
                if children:
                    for child_stream_name, child_endpoint_config in children.items():
                        if child_stream_name in sync_streams:
                            LOGGER.info('START Syncing: {}'.format(child_stream_name))
                            write_schema(catalog, child_stream_name)
                            # For each parent record
                            for record in transformed_data:
                                # Set parent_id: prefer 'id' if present, else the first key property
                                i = 0
                                for id_field in id_fields:
                                    if i == 0:
                                        parent_id_field = id_field
                                    if id_field == 'id':
                                        parent_id_field = id_field
                                    i = i + 1
                                parent_id = record.get(parent_id_field)

                                # Pass the ad account timezone down to child streams
                                if stream_name == 'ad_accounts':
                                    timezone_desc = record.get('timezone', timezone_desc)

                                # sync_endpoint for child
                                LOGGER.info(
                                    'START Sync for Stream: {}, parent_stream: {}, parent_id: {}'.format(
                                        child_stream_name, stream_name, parent_id))
                                child_total_records = sync_endpoint(
                                    client=client,
                                    config=config,
                                    catalog=catalog,
                                    state=state,
                                    stream_name=child_stream_name,
                                    endpoint_config=child_endpoint_config,
                                    sync_streams=sync_streams,
                                    selected_streams=selected_streams,
                                    timezone_desc=timezone_desc,
                                    parent_id=parent_id)
                                LOGGER.info(
                                    'FINISHED Sync for Stream: {}, parent_id: {}, total_records: {}'.format(
                                        child_stream_name, parent_id, child_total_records))
                            # End transformed data record loop
                        # End if child in sync_streams
                    # End child streams for parent
                # End if children

                # Parent record batch
                total_records = total_records + record_count
                endpoint_total = endpoint_total + record_count
                LOGGER.info('Synced Stream: {}, page: {}, records: {} to {}'.format(
                    stream_name, page, offset, total_records))
                # Pagination: increment the offset by the limit (batch size) and the page
                if limit:
                    offset = offset + limit
                page = page + 1
            # End page/batch - while next URL loop
        # End country_code loop

        # Update the state with the max_bookmark_value for the stream date window
        # Snapchat Ads API does not allow page/batch sorting; bookmark written per date window
        if bookmark_field and stream_name in selected_streams:
            write_bookmark(state, stream_name, max_bookmark_value, bookmark_field,
                           parent, parent_id)

        # Increment date window
        start_window = end_window
        next_end_window = end_window + timedelta(days=date_window_size)
        if next_end_window > now_datetime:
            end_window = now_datetime
        else:
            end_window = next_end_window
    # End date window loop

    # Return endpoint_total (all records across all pages and date windows)
    return endpoint_total
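
# Illustrative sketch only (not called by the tap): the '_stats_' branch of
# sync_endpoint de-nests a stats payload by copying parent fields onto each
# 'timeseries' entry, flattening the nested 'stats' dict, and decamelizing
# keys with pyhumps. The record layout and the 'total_stat' key below are
# hypothetical sample data, shaped only to mirror that logic.
def _example_denest_stats_record():
    import humps  # pyhumps, the same library used by sync_endpoint

    data_record = {
        'total_stat': {  # hypothetical data_key_record
            'id': 'abc-123',
            'granularity': 'HOUR',
            'start_time': '2020-01-01T00:00:00.000-08:00',
            'end_time': '2020-01-01T01:00:00.000-08:00',
            'timeseries': [{
                'start_time': '2020-01-01T00:00:00.000-08:00',
                'end_time': '2020-01-01T01:00:00.000-08:00',
                'stats': {'impressions': 10, 'swipeUps': 2},
            }],
        }
    }
    base_record = data_record['total_stat']
    record = base_record['timeseries'][0]
    # Copy parent fields (except the window bounds and the timeseries itself)
    for key, val in base_record.items():
        if key not in ('start_time', 'end_time', 'timeseries'):
            record[key] = val
    # Flatten the nested stats dict onto the record
    record.update(record.pop('stats'))
    # Decamelize keys: 'swipeUps' -> 'swipe_ups'
    return humps.decamelize(record)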