def get_tweet_entity_ids(client, account_id, window_start, window_end):
    """Return the IDs of PUBLISHED ORGANIC tweets for *account_id* whose
    created_at falls within [window_start, window_end].

    Iteration breaks at the first tweet older than window_start, which
    assumes the cursor yields tweets newest-first — TODO confirm ordering.
    """
    flat_streams = flatten_streams()
    stream_config = flat_streams.get("tweets")
    path = stream_config.get("path").replace("{account_id}", account_id)
    dttm_format = stream_config.get("datetime_format")

    # Restrict the query to published, organic, non-deleted tweets.
    params = stream_config.get("params")
    params["tweet_type"] = "PUBLISHED"
    params["timeline_type"] = "ORGANIC"
    params["with_deleted"] = "false"
    # NOTE(review): '******' looks like a redacted placeholder; the endpoint
    # presumably expects a boolean-like string here — confirm before relying on it.
    params["trim_user"] = "******"

    cursor = get_resource("tweets", client, path, params)

    # Collect IDs for tweets inside the window; stop once tweets predate it.
    entity_ids = []
    for tweet in cursor:
        created_dttm = datetime.strptime(tweet["created_at"], dttm_format)
        if window_start <= created_dttm <= window_end:
            entity_ids.append(tweet["id"])
        elif created_dttm < window_start:
            break
    return entity_ids
def get_tweet_entity_ids(client, account_id, window_start, window_end):
    """Return entity IDs of PUBLISHED ORGANIC tweets created within
    [window_start, window_end] for the given ad account.

    NOTE(review): this is a duplicate of the preceding get_tweet_entity_ids
    definition; this later copy shadows the earlier one — one should be removed.
    """
    entity_ids = []
    flat_streams = flatten_streams()
    tweet_config = flat_streams.get('tweets')
    tweet_path = tweet_config.get('path').replace('{account_id}', account_id)
    datetime_format = tweet_config.get('datetime_format')
    # Set params for PUBLISHED ORGANIC_TWEETs
    tweet_params = tweet_config.get('params')
    tweet_params['tweet_type'] = 'PUBLISHED'
    tweet_params['timeline_type'] = 'ORGANIC'
    tweet_params['with_deleted'] = 'false'
    # NOTE(review): '******' looks like a redacted placeholder; the endpoint
    # presumably expects 'true'/'false' here — confirm against the API docs.
    tweet_params['trim_user'] = '******'
    tweet_cursor = get_resource('tweets', client, tweet_path, tweet_params)
    # Loop thru organic tweets to get entity_ids (if in date range).
    # The early break assumes the cursor yields tweets newest-first —
    # TODO confirm ordering guarantee of the tweets endpoint.
    for tweet in tweet_cursor:
        entity_id = tweet['id']
        created_at = tweet['created_at']
        created_dttm = datetime.strptime(created_at, datetime_format)
        if created_dttm <= window_end and created_dttm >= window_start:
            entity_ids.append(entity_id)
        elif created_dttm < window_start:
            break
    return entity_ids
def get_schemas(reports):
    """Load JSON schemas and build Singer metadata for every stream endpoint
    and every configured report.

    Parameters:
        reports: list of report config dicts with 'name', 'entity',
            'segment' and 'granularity' keys.

    Returns:
        (schemas, field_metadata): two dicts keyed by stream/report name.

    Raises:
        RuntimeError: when a report has an invalid entity, segment,
            granularity, or a disallowed combination of them.
    """
    schemas = {}
    field_metadata = {}
    refs = load_shared_schema_refs()
    flat_streams = flatten_streams()
    # JSON schemas for each stream endpoint
    for stream_name, stream_metadata in flat_streams.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        # Replace $ref nodes with the shared reference definitions
        resolve_schema_references(schema, refs)
        mdata = metadata.new()
        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None))
        field_metadata[stream_name] = mdata

    # JSON schemas for each report
    for report in reports:
        report_name = report.get('name')
        report_entity = report.get('entity')
        report_segment = report.get('segment')
        report_granularity = report.get('granularity')

        # Metrics & Segmentation: https://developer.twitter.com/en/docs/ads/analytics/overview/metrics-and-segmentation
        # Google Sheet summary: https://docs.google.com/spreadsheets/d/1Cn3B1TPZOjg9QhnnF44Myrs3W8hNOSyFRH6qn8SCc7E/edit?usp=sharing
        # Validate entity/segment/granularity. Every failure is concatenated
        # into running_error so the single RuntimeError below reports all
        # problems at once; err just records whether ANY check failed.
        err = None
        running_error = ''
        if report_entity not in ENTITY_TYPES:
            err = 'Report: {}, Entity: {}: INVALID ENTITY'.format(
                report_name, report_entity)
            running_error = '{}; {}'.format(running_error, err)
        if report_segment not in SEGMENTS:
            err = 'Report: {}, Segment: {}: INVALID SEGMENT'.format(
                report_name, report_segment)
            running_error = '{}; {}'.format(running_error, err)
        if report_granularity not in GRANULARITIES:
            err = 'Report: {}, Granularity: {}: INVALID GRANULARITY'.format(
                report_name, report_granularity)
            running_error = '{}; {}'.format(running_error, err)
        # MEDIA_CREATIVE and ORGANIC_TWEET entities do not support segmentation
        if report_entity in ('MEDIA_CREATIVE', 'ORGANIC_TWEET') and \
            not report_segment == 'NO_SEGMENT':
            err = 'Report: {}, Segment: {}, SEGMENTATION NOT ALLOWED for Entity: {}'.format(
                report_name, report_segment, report_entity)
            running_error = '{}; {}'.format(running_error, err)
        # Undocumented rule: CONVERSION_TAGS report segment only allowed for certain entities
        if report_segment == 'CONVERSION_TAGS' and report_entity in \
            ['FUNDING_INSTRUMENT', 'PROMOTED_ACCOUNT']:  # 'ACCOUNT',
            err = 'Report: {}, Entity: {}, Segment: CONVERSION_TAGS, INVALID COMBINATION'.format(
                report_name, report_entity)
            running_error = '{}; {}'.format(running_error, err)
        # Undocumented rule: LANGUAGES report segment only allowed for certain report_entities
        if report_segment == 'LANGUAGES' and report_entity in \
            ['ACCOUNT', 'FUNDING_INSTRUMENT', 'MEDIA_CREATIVE']:
            err = 'Report: {}, Entity: {}, Segment: LANGUAGES, INVALID COMBINATION'.format(
                report_name, report_entity)
            running_error = '{}; {}'.format(running_error, err)
        if err:
            LOGGER.error('ERROR: {}'.format(running_error))
            raise RuntimeError(running_error)

        # Pick the report schema file based on the entity/segment rules:
        # Undocumented rule: CONVERSION_TAGS report segment ONLY allows WEB_CONVERSION metric group
        if report_segment == 'CONVERSION_TAGS' and report_entity in \
            ['ACCOUNT', 'CAMPAIGN', 'LINE_ITEM', 'PROMOTED_TWEET']:
            report_path = get_abs_path(
                'schemas/shared/report_web_conversion.json')
        # ACCOUNT, FUNDING_INSTRUMENT, ORGANIC_TWEET only permit a subset of METRIC_GROUPS
        elif report_entity in ('ACCOUNT', 'FUNDING_INSTRUMENT', 'ORGANIC_TWEET'):
            report_path = get_abs_path('schemas/shared/report_{}.json'.format(
                report_entity.lower()))
        else:
            report_path = get_abs_path('schemas/shared/report_other.json')

        with open(report_path) as file:
            schema = json.load(file)
        # Replace $ref nodes with reference nodes in schema
        resolve_schema_references(schema, refs)

        # If NO_SEGMENT, then remove Segment fields
        if report_segment == 'NO_SEGMENT':
            schema['properties']['dimensions'].pop('segmentation_type', None)
            schema['properties']['dimensions'].pop('segment_name', None)
            schema['properties']['dimensions'].pop('segment_value', None)

        # Web Conversion ONLY valid for NO_SEGMENT, PLATFORM, and CONVERSION_TAGS segment
        # Reference: https://developer.twitter.com/en/docs/ads/analytics/overview/metrics-and-segmentation#WEB_CONVERSION
        # Docs ^^ say 'PLATFORMS Only' Segmentation; but CONVERSION_TAGS segment only allow WEB_CONVERSION metrics
        if report_segment not in ('NO_SEGMENT', 'PLATFORMS', 'CONVERSION_TAGS'):
            schema['properties'].pop('web_conversion', None)

        schemas[report_name] = schema
        mdata = metadata.new()
        # Reports replicate incrementally on end_time, keyed by the
        # synthetic dimensions hash.
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=['__sdc_dimensions_hash_key'],
            valid_replication_keys=['end_time'],
            replication_method='INCREMENTAL')
        field_metadata[report_name] = mdata

    return schemas, field_metadata
def sync(client, config, state):
    """Sync every parent, child and report stream for each configured account.

    This variant takes no catalog: all streams from streams.py are synced and
    the single report definition is hard-coded below. State is advanced via
    update_currently_syncing; extraction is delegated to sync_endpoint and
    sync_report.
    """
    # Pull settings out of the tap config.
    account_ids = config.get("account_ids").replace(" ", "").split(",")
    country_codes = config.get("country_codes", "US").replace(" ", "").split(",")
    start_date = config.get("start_date")

    # Hard-coded report definition (this variant does not read reports from config).
    reports = [{
        "name": "campaign_events",
        "entity": "CAMPAIGN",
        "segment": "NO_SEGMENT",
        "granularity": "DAY",
    }]

    # last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info("Last/Currently Syncing Stream: {}".format(last_stream))

    # Split streams.py definitions: a stream carrying a parent_stream entry
    # is a child; everything else is a parent.
    flat_streams = flatten_streams()
    parent_streams = [name for name, meta in flat_streams.items()
                      if not meta.get("parent_stream")]
    child_streams = [name for name, meta in flat_streams.items()
                     if meta.get("parent_stream")]
    LOGGER.info("Sync Parent Streams: {}".format(parent_streams))
    LOGGER.info("Sync Child Streams: {}".format(child_streams))

    report_streams = [report.get("name") for report in reports]
    LOGGER.info("Sync Report Streams: {}".format(report_streams))

    # ACCOUNT_ID OUTER LOOP
    for account_id in account_ids:
        LOGGER.info("Account ID: {} - START Syncing".format(account_id))

        # PARENT STREAM LOOP (children are synced by their parents)
        for stream_name in parent_streams:
            update_currently_syncing(state, stream_name)
            LOGGER.info("Stream: {} - START Syncing, Account ID: {}".format(
                stream_name, account_id))
            total_records = sync_endpoint(
                client=client,
                state=state,
                start_date=start_date,
                stream_name=stream_name,
                endpoint_config=flat_streams.get(stream_name),
                tap_config=config,
                account_id=account_id,
                child_streams=child_streams,
            )
            LOGGER.info(
                "Stream: {} - FINISHED Syncing, Account ID: {}, Total Records: {}"
                .format(stream_name, account_id, total_records))
            update_currently_syncing(state, None)

        # Targeting values (country_ids / platform_ids) are only needed by reports.
        if report_streams != []:
            # Country targeting IDs — one lookup per configured country code.
            country_ids = []
            country_path = "targeting_criteria/locations"
            for country_code in country_codes:
                country_params = {
                    "count": 1000,
                    "cursor": None,
                    "location_type": "COUNTRIES",
                    "country_code": country_code,
                }
                for country in get_resource("countries", client, country_path,
                                            country_params):
                    country_ids.append(country["targeting_value"])
            LOGGER.info(
                "Countries - Country Codes: {}, Country Targeting IDs: {}".
                format(country_codes, country_ids))

            # Platform targeting IDs.
            platforms_path = "targeting_criteria/platforms"
            platforms_params = {"count": 1000, "cursor": None}
            platform_ids = [
                platform["targeting_value"]
                for platform in get_resource("platforms", client,
                                             platforms_path, platforms_params)
            ]
            LOGGER.info(
                "Platforms - Platform Targeting IDs: {}".format(platform_ids))

            # REPORT STREAMS LOOP
            for report in reports:
                report_name = report.get("name")
                if report_name not in report_streams:
                    continue
                update_currently_syncing(state, report_name)
                LOGGER.info(
                    "Report: {} - START Syncing for Account ID: {}".format(
                        report_name, account_id))
                total_records = sync_report(
                    client=client,
                    state=state,
                    start_date=start_date,
                    report_name=report_name,
                    report_config=report,
                    tap_config=config,
                    account_id=account_id,
                    country_ids=country_ids,
                    platform_ids=platform_ids,
                )
                # pylint: disable=line-too-long
                LOGGER.info(
                    "Report: {} - FINISHED Syncing for Account ID: {}, Total Records: {}"
                    .format(report_name, account_id, total_records))
                # pylint: enable=line-too-long
                update_currently_syncing(state, None)

        LOGGER.info("Account ID: {} - FINISHED Syncing".format(account_id))
def sync(client, config, catalog, state):
    """Catalog-driven sync of all selected streams (parents, children and
    reports) for each configured account ID.

    Parameters:
        client: authenticated API client passed to get_resource /
            sync_endpoint / sync_report.
        config: tap config dict ('account_ids', 'country_codes',
            'start_date', 'reports').
        catalog: Singer catalog; only streams selected in it are synced.
        state: Singer state dict, advanced via update_currently_syncing.

    Returns early (None) when no streams are selected.
    """
    # Get config parameters
    account_list = config.get('account_ids').replace(' ', '').split(',')
    country_code_list = config.get('country_codes', 'US').replace(' ', '').split(',')
    start_date = config.get('start_date')
    reports = config.get('reports', [])

    # last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('Last/Currently Syncing Stream: {}'.format(last_stream))

    # Get ALL selected streams from catalog
    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('Sync Selected Streams: {}'.format(selected_streams))
    if not selected_streams:
        return

    # Get lists of parent and child streams to sync (from streams.py and catalog)
    # For children, ensure that dependent parent_stream is included
    parent_streams = []
    child_streams = []
    flat_streams = flatten_streams()
    for stream_name, stream_metadata in flat_streams.items():
        parent_stream = stream_metadata.get('parent_stream')
        # Append selected parent streams
        if not parent_stream and stream_name in selected_streams:
            parent_streams.append(stream_name)
        # Append selected child streams
        elif parent_stream and stream_name in selected_streams:
            child_streams.append(stream_name)
            # Append un-selected parent streams of selected children
            if parent_stream not in selected_streams:
                parent_streams.append(parent_stream)
    # BUG FIX: multiple selected children sharing one un-selected parent
    # (or, depending on dict iteration order, a parent appended both by its
    # own selected branch and by a child's branch) previously left
    # duplicates in parent_streams, syncing that parent more than once per
    # account. De-duplicate while preserving first-seen order.
    parent_streams = list(dict.fromkeys(parent_streams))
    LOGGER.info('Sync Parent Streams: {}'.format(parent_streams))
    LOGGER.info('Sync Child Streams: {}'.format(child_streams))

    # Get list of report streams to sync (from config and catalog)
    report_streams = []
    for report in reports:
        report_name = report.get('report_name')
        if report_name in selected_streams:
            report_streams.append(report_name)
    LOGGER.info('Sync Report Streams: {}'.format(report_streams))

    # ACCOUNT_ID OUTER LOOP
    for account_id in account_list:
        LOGGER.info('Account ID: {} - START Syncing'.format(account_id))

        # PARENT STREAM LOOP (children are synced by their parents)
        for stream_name in parent_streams:
            update_currently_syncing(state, stream_name)
            endpoint_config = flat_streams.get(stream_name)
            LOGGER.info('Stream: {} - START Syncing, Account ID: {}'.format(
                stream_name, account_id))
            # Write schema and log selected fields for stream
            write_schema(catalog, stream_name)
            selected_fields = get_selected_fields(catalog, stream_name)
            LOGGER.info('Stream: {} - selected_fields: {}'.format(stream_name, selected_fields))
            total_records = sync_endpoint(client=client,
                                          catalog=catalog,
                                          state=state,
                                          start_date=start_date,
                                          stream_name=stream_name,
                                          endpoint_config=endpoint_config,
                                          tap_config=config,
                                          account_id=account_id,
                                          child_streams=child_streams)
            LOGGER.info('Stream: {} - FINISHED Syncing, Account ID: {}, Total Records: {}'.format(
                stream_name, account_id, total_records))
            update_currently_syncing(state, None)

        # GET country_ids and platform_ids (targeting values) - only if reports exist
        if report_streams != []:
            # GET country_ids (targeting_values) based on config country_codes
            country_ids = []
            country_path = 'targeting_criteria/locations'
            for country_code in country_code_list:
                country_params = {
                    'count': 1000,
                    'cursor': None,
                    'location_type': 'COUNTRIES',
                    'country_code': country_code
                }
                country_cursor = get_resource('countries', client, country_path,
                                              country_params)
                for country in country_cursor:
                    country_id = country['targeting_value']
                    country_ids.append(country_id)
            LOGGER.info('Countries - Country Codes: {}, Country Targeting IDs: {}'.format(
                country_code_list, country_ids))

            # GET platform_ids (targeting_values)
            platform_ids = []
            platforms_path = 'targeting_criteria/platforms'
            platforms_params = {
                'count': 1000,
                'cursor': None
            }
            platforms_cursor = get_resource('platforms', client, platforms_path,
                                            platforms_params)
            for platform in platforms_cursor:
                platform_id = platform['targeting_value']
                platform_ids.append(platform_id)
            LOGGER.info('Platforms - Platform Targeting IDs: {}'.format(platform_ids))

            # REPORT STREAMS LOOP
            for report in reports:
                report_name = report.get('report_name')
                if report_name in report_streams:
                    update_currently_syncing(state, report_name)
                    LOGGER.info('Report: {} - START Syncing for Account ID: {}'.format(
                        report_name, account_id))
                    # Write schema and log selected fields for stream
                    write_schema(catalog, report_name)
                    selected_fields = get_selected_fields(catalog, report_name)
                    LOGGER.info('Report: {} - selected_fields: {}'.format(
                        report_name, selected_fields))
                    total_records = sync_report(client=client,
                                                catalog=catalog,
                                                state=state,
                                                start_date=start_date,
                                                report_name=report_name,
                                                report_config=report,
                                                tap_config=config,
                                                account_id=account_id,
                                                country_ids=country_ids,
                                                platform_ids=platform_ids)
                    # pylint: disable=line-too-long
                    LOGGER.info('Report: {} - FINISHED Syncing for Account ID: {}, Total Records: {}'.format(
                        report_name, account_id, total_records))
                    # pylint: enable=line-too-long
                    update_currently_syncing(state, None)

        LOGGER.info('Account ID: {} - FINISHED Syncing'.format(account_id))