Example #1
def get_tweet_entity_ids(client, account_id, window_start, window_end):
    entity_ids = []
    flat_streams = flatten_streams()
    tweet_config = flat_streams.get("tweets")
    tweet_path = tweet_config.get("path").replace("{account_id}", account_id)
    datetime_format = tweet_config.get("datetime_format")

    # Set params to select PUBLISHED tweets from the ORGANIC timeline
    tweet_params = tweet_config.get("params")
    tweet_params["tweet_type"] = "PUBLISHED"
    tweet_params["timeline_type"] = "ORGANIC"
    tweet_params["with_deleted"] = "false"
    tweet_params["trim_user"] = "******"

    tweet_cursor = get_resource("tweets", client, tweet_path, tweet_params)
    # Loop through organic tweets, collecting entity_ids inside the date window.
    # NOTE: the early break assumes the API returns tweets newest-first.
    for tweet in tweet_cursor:
        entity_id = tweet["id"]
        created_at = tweet["created_at"]
        created_dttm = datetime.strptime(created_at, datetime_format)
        if window_start <= created_dttm <= window_end:
            entity_ids.append(entity_id)
        elif created_dttm < window_start:
            break

    return entity_ids
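
This function (and the sync examples below) leans on a get_resource generator that hides the Twitter Ads cursor-based paging; its implementation is not shown here. A minimal sketch, assuming client.get(path, params) returns a parsed JSON dict with data and next_cursor keys (these names are assumptions, not the tap's actual API):

def get_resource(stream_name, client, path, params):
    # Hypothetical paging helper: yield records page by page until the API
    # stops returning a next_cursor. stream_name is kept for parity with the
    # call sites (useful for logging); client.get is an assumed method.
    cursor = None
    while True:
        params["cursor"] = cursor
        response = client.get(path, params=params)
        for record in response.get("data", []):
            yield record
        cursor = response.get("next_cursor")
        if not cursor:
            break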
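
Every example also calls flatten_streams(), which is likewise not included. The call sites imply a dict of per-stream config keyed by stream name, with entries like path, params, datetime_format, and (for child streams) parent_stream. A sketch of one plausible shape; field values here are illustrative, not the tap's real config:

STREAMS = {
    "tweets": {
        "path": "accounts/{account_id}/tweets",
        "datetime_format": "%Y-%m-%dT%H:%M:%SZ",  # assumed; use the API's real format
        "params": {"count": 1000, "cursor": None},
        "children": {},  # nested child streams, each with its own path/params
    },
}

def flatten_streams():
    # Hoist nested child streams to the top level, tagging each child with
    # a parent_stream back-reference so sync() can tell the two apart.
    flat = {}
    for stream_name, config in STREAMS.items():
        config = dict(config)
        children = config.pop("children", {})
        flat[stream_name] = config
        for child_name, child_config in children.items():
            flat[child_name] = dict(child_config, parent_stream=stream_name)
    return flat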
Example #2
def get_schemas(reports):
    schemas = {}
    field_metadata = {}

    refs = load_shared_schema_refs()
    flat_streams = flatten_streams()

    # JSON schemas for each stream endpoint
    for stream_name, stream_metadata in flat_streams.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)

        schemas[stream_name] = schema
        resolve_schema_references(schema, refs)

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys',
                                                       None),
            replication_method=stream_metadata.get('replication_method', None))
        field_metadata[stream_name] = mdata

    # JSON schemas for each report
    for report in reports:
        report_name = report.get('name')
        report_entity = report.get('entity')
        report_segment = report.get('segment')
        report_granularity = report.get('granularity')

        # Metrics & Segmentation: https://developer.twitter.com/en/docs/ads/analytics/overview/metrics-and-segmentation
        # Google Sheet summary: https://docs.google.com/spreadsheets/d/1Cn3B1TPZOjg9QhnnF44Myrs3W8hNOSyFRH6qn8SCc7E/edit?usp=sharing
        # Validate entity/segment/granularity combinations, accumulating
        # all problems before raising
        errors = []
        if report_entity not in ENTITY_TYPES:
            errors.append('Report: {}, Entity: {}: INVALID ENTITY'.format(
                report_name, report_entity))
        if report_segment not in SEGMENTS:
            errors.append('Report: {}, Segment: {}: INVALID SEGMENT'.format(
                report_name, report_segment))
        if report_granularity not in GRANULARITIES:
            errors.append('Report: {}, Granularity: {}: INVALID GRANULARITY'.format(
                report_name, report_granularity))

        if (report_entity in ('MEDIA_CREATIVE', 'ORGANIC_TWEET')
                and report_segment != 'NO_SEGMENT'):
            errors.append(
                'Report: {}, Segment: {}, SEGMENTATION NOT ALLOWED for Entity: {}'.format(
                    report_name, report_segment, report_entity))

        # Undocumented rule: the CONVERSION_TAGS segment is NOT allowed for these entities
        if (report_segment == 'CONVERSION_TAGS'
                and report_entity in ('FUNDING_INSTRUMENT', 'PROMOTED_ACCOUNT')):  # 'ACCOUNT',
            errors.append(
                'Report: {}, Entity: {}, Segment: CONVERSION_TAGS, INVALID COMBINATION'.format(
                    report_name, report_entity))

        # Undocumented rule: the LANGUAGES segment is NOT allowed for these entities
        if (report_segment == 'LANGUAGES'
                and report_entity in ('ACCOUNT', 'FUNDING_INSTRUMENT', 'MEDIA_CREATIVE')):
            errors.append(
                'Report: {}, Entity: {}, Segment: LANGUAGES, INVALID COMBINATION'.format(
                    report_name, report_entity))

        if errors:
            running_error = '; '.join(errors)
            LOGGER.error('ERROR: {}'.format(running_error))
            raise RuntimeError(running_error)

        # Undocumented rule: CONVERSION_TAGS report segment ONLY allows WEB_CONVERSION metric group
        if report_segment == 'CONVERSION_TAGS' and report_entity in \
            ['ACCOUNT', 'CAMPAIGN', 'LINE_ITEM', 'PROMOTED_TWEET']:
            report_path = get_abs_path(
                'schemas/shared/report_web_conversion.json')

        # ACCOUNT, FUNDING_INSTRUMENT, ORGANIC_TWEET only permit a subset of METRIC_GROUPS
        elif report_entity in ('ACCOUNT', 'FUNDING_INSTRUMENT',
                               'ORGANIC_TWEET'):
            report_path = get_abs_path('schemas/shared/report_{}.json'.format(
                report_entity.lower()))
        else:
            report_path = get_abs_path('schemas/shared/report_other.json')

        with open(report_path) as file:
            schema = json.load(file)

        # Replace $ref nodes with reference nodes in schema
        resolve_schema_references(schema, refs)

        # If NO_SEGMENT, then remove Segment fields
        if report_segment == 'NO_SEGMENT':
            schema['properties']['dimensions'].pop('segmentation_type', None)
            schema['properties']['dimensions'].pop('segment_name', None)
            schema['properties']['dimensions'].pop('segment_value', None)

        # web_conversion metrics are ONLY valid for the NO_SEGMENT, PLATFORMS, and CONVERSION_TAGS segments
        # Reference: https://developer.twitter.com/en/docs/ads/analytics/overview/metrics-and-segmentation#WEB_CONVERSION
        #   Docs ^^ say 'PLATFORMS Only' segmentation; but the CONVERSION_TAGS segment only allows WEB_CONVERSION metrics
        if report_segment not in ('NO_SEGMENT', 'PLATFORMS',
                                  'CONVERSION_TAGS'):
            schema['properties'].pop('web_conversion', None)

        schemas[report_name] = schema

        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=['__sdc_dimensions_hash_key'],
            valid_replication_keys=['end_time'],
            replication_method='INCREMENTAL')
        field_metadata[report_name] = mdata

    return schemas, field_metadata
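
In a Singer tap, the schemas/field_metadata pair returned here typically feeds discovery mode. A sketch of that wiring using singer-python's catalog classes (the tap's actual discover code may differ):

from singer.catalog import Catalog, CatalogEntry
from singer.schema import Schema

def discover(reports):
    # Hypothetical wrapper: turn get_schemas() output into a Singer catalog.
    schemas, field_metadata = get_schemas(reports)
    catalog = Catalog([])
    for stream_name, schema_dict in schemas.items():
        catalog.streams.append(CatalogEntry(
            stream=stream_name,
            tap_stream_id=stream_name,
            schema=Schema.from_dict(schema_dict),
            metadata=field_metadata[stream_name]))
    return catalog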
Example #3
def sync(client, config, state):
    # Get config parameters
    account_list = config.get("account_ids").replace(" ", "").split(",")
    country_code_list = config.get("country_codes",
                                   "US").replace(" ", "").split(",")
    start_date = config.get("start_date")
    reports = [{
        "name": "campaign_events",
        "entity": "CAMPAIGN",
        "segment": "NO_SEGMENT",
        "granularity": "DAY",
    }]

    #   last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info("Last/Currently Syncing Stream: {}".format(last_stream))

    # For children, ensure that dependent parent_stream is included
    parent_streams = []
    child_streams = []
    # Get all streams (parent + child) from streams.py
    flat_streams = flatten_streams()
    # Loop through all streams
    for stream_name, stream_metadata in flat_streams.items():
        # If stream has a parent_stream, then it is a child stream
        parent_stream = stream_metadata.get("parent_stream")
        if parent_stream:
            child_streams.append(stream_name)
        else:
            parent_streams.append(stream_name)

    LOGGER.info("Sync Parent Streams: {}".format(parent_streams))
    LOGGER.info("Sync Child Streams: {}".format(child_streams))

    report_streams = []
    for report in reports:
        report_name = report.get("name")
        report_streams.append(report_name)
    LOGGER.info("Sync Report Streams: {}".format(report_streams))

    # ACCOUNT_ID OUTER LOOP
    for account_id in account_list:
        LOGGER.info("Account ID: {} - START Syncing".format(account_id))

        # PARENT STREAM LOOP
        for stream_name in parent_streams:
            update_currently_syncing(state, stream_name)
            endpoint_config = flat_streams.get(stream_name)

            LOGGER.info("Stream: {} - START Syncing, Account ID: {}".format(
                stream_name, account_id))

            total_records = sync_endpoint(
                client=client,
                state=state,
                start_date=start_date,
                stream_name=stream_name,
                endpoint_config=endpoint_config,
                tap_config=config,
                account_id=account_id,
                child_streams=child_streams,
            )

            LOGGER.info(
                "Stream: {} - FINISHED Syncing, Account ID: {}, Total Records: {}"
                .format(stream_name, account_id, total_records))

            update_currently_syncing(state, None)

        # GET country_ids and platform_ids (targeting values) - only if reports exist
        if report_streams:
            # GET country_ids (targeting_values) based on config country_codes
            country_ids = []
            country_path = "targeting_criteria/locations"
            for country_code in country_code_list:
                country_params = {
                    "count": 1000,
                    "cursor": None,
                    "location_type": "COUNTRIES",
                    "country_code": country_code,
                }
                country_cursor = get_resource("countries", client,
                                              country_path, country_params)
                for country in country_cursor:
                    country_id = country["targeting_value"]
                    country_ids.append(country_id)
            LOGGER.info(
                "Countries - Country Codes: {}, Country Targeting IDs: {}".
                format(country_code_list, country_ids))

            # GET platform_ids (targeting_values)
            platform_ids = []
            platforms_path = "targeting_criteria/platforms"
            platforms_params = {"count": 1000, "cursor": None}
            platforms_cursor = get_resource("platforms", client,
                                            platforms_path, platforms_params)
            for platform in platforms_cursor:
                platform_id = platform["targeting_value"]
                platform_ids.append(platform_id)
            LOGGER.info(
                "Platforms - Platform Targeting IDs: {}".format(platform_ids))

        # REPORT STREAMS LOOP
        for report in reports:
            report_name = report.get("name")
            if report_name in report_streams:
                update_currently_syncing(state, report_name)

                LOGGER.info(
                    "Report: {} - START Syncing for Account ID: {}".format(
                        report_name, account_id))

                total_records = sync_report(
                    client=client,
                    state=state,
                    start_date=start_date,
                    report_name=report_name,
                    report_config=report,
                    tap_config=config,
                    account_id=account_id,
                    country_ids=country_ids,
                    platform_ids=platform_ids,
                )

                # pylint: disable=line-too-long
                LOGGER.info(
                    "Report: {} - FINISHED Syncing for Account ID: {}, Total Records: {}"
                    .format(report_name, account_id, total_records))
                # pylint: enable=line-too-long
                update_currently_syncing(state, None)

        LOGGER.info("Account ID: {} - FINISHED Syncing".format(account_id))
Example #4
def sync(client, config, catalog, state):
    # Get config parameters
    account_list = config.get('account_ids').replace(' ', '').split(',')
    country_code_list = config.get('country_codes', 'US').replace(' ', '').split(',')
    start_date = config.get('start_date')
    reports = config.get('reports', [])

    # Get selected_streams from catalog, based on state last_stream
    #   last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('Last/Currently Syncing Stream: {}'.format(last_stream))

    # Get ALL selected streams from catalog
    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('Sync Selected Streams: {}'.format(selected_streams))
    if not selected_streams:
        return

    # Get lists of parent and child streams to sync (from streams.py and catalog)
    # For children, ensure that dependent parent_stream is included
    parent_streams = []
    child_streams = []
    # Get all streams (parent + child) from streams.py
    flat_streams = flatten_streams()
    # Loop through all streams
    for stream_name, stream_metadata in flat_streams.items():
        # If stream has a parent_stream, then it is a child stream
        parent_stream = stream_metadata.get('parent_stream')
        # Append selected parent streams
        if not parent_stream and stream_name in selected_streams:
            parent_streams.append(stream_name)
        # Append selected child streams
        elif parent_stream and stream_name in selected_streams:
            child_streams.append(stream_name)
            # Append un-selected parent streams of selected children
            if parent_stream not in selected_streams:
                parent_streams.append(parent_stream)
    LOGGER.info('Sync Parent Streams: {}'.format(parent_streams))
    LOGGER.info('Sync Child Streams: {}'.format(child_streams))

    # Get list of report streams to sync (from config and catalog)
    report_streams = []
    for report in reports:
        report_name = report.get('report_name')
        if report_name in selected_streams:
            report_streams.append(report_name)
    LOGGER.info('Sync Report Streams: {}'.format(report_streams))

    # ACCOUNT_ID OUTER LOOP
    for account_id in account_list:
        LOGGER.info('Account ID: {} - START Syncing'.format(account_id))

        # PARENT STREAM LOOP
        for stream_name in parent_streams:
            update_currently_syncing(state, stream_name)
            endpoint_config = flat_streams.get(stream_name)

            LOGGER.info('Stream: {} - START Syncing, Account ID: {}'.format(
                stream_name, account_id))

            # Write schema and log selected fields for stream
            write_schema(catalog, stream_name)

            selected_fields = get_selected_fields(catalog, stream_name)
            LOGGER.info('Stream: {} - selected_fields: {}'.format(stream_name, selected_fields))

            total_records = sync_endpoint(client=client,
                                          catalog=catalog,
                                          state=state,
                                          start_date=start_date,
                                          stream_name=stream_name,
                                          endpoint_config=endpoint_config,
                                          tap_config=config,
                                          account_id=account_id,
                                          child_streams=child_streams)

            LOGGER.info('Stream: {} - FINISHED Syncing, Account ID: {}, Total Records: {}'.format(
                stream_name, account_id, total_records))

            update_currently_syncing(state, None)

        # GET country_ids and platform_ids (targeting values) - only if reports exist
        if report_streams:
            # GET country_ids (targeting_values) based on config country_codes
            country_ids = []
            country_path = 'targeting_criteria/locations'
            for country_code in country_code_list:
                country_params = {
                    'count': 1000,
                    'cursor': None,
                    'location_type': 'COUNTRIES',
                    'country_code': country_code
                }
                country_cursor = get_resource('countries', client, country_path, country_params)
                for country in country_cursor:
                    country_id = country['targeting_value']
                    country_ids.append(country_id)
            LOGGER.info('Countries - Country Codes: {}, Country Targeting IDs: {}'.format(
                country_code_list, country_ids))

            # GET platform_ids (targeting_values)
            platform_ids = []
            platforms_path = 'targeting_criteria/platforms'
            platforms_params = {
                'count': 1000,
                'cursor': None
            }
            platforms_cursor = get_resource('platforms', client, platforms_path, platforms_params)
            for platform in platforms_cursor:
                platform_id = platform['targeting_value']
                platform_ids.append(platform_id)
            LOGGER.info('Platforms - Platform Targeting IDs: {}'.format(platform_ids))

        # REPORT STREAMS LOOP
        for report in reports:
            report_name = report.get('report_name')
            if report_name in report_streams:
                update_currently_syncing(state, report_name)

                LOGGER.info('Report: {} - START Syncing for Account ID: {}'.format(
                    report_name, account_id))

                # Write schema and log selected fields for stream
                write_schema(catalog, report_name)

                selected_fields = get_selected_fields(catalog, report_name)
                LOGGER.info('Report: {} - selected_fields: {}'.format(
                    report_name, selected_fields))

                total_records = sync_report(client=client,
                                            catalog=catalog,
                                            state=state,
                                            start_date=start_date,
                                            report_name=report_name,
                                            report_config=report,
                                            tap_config=config,
                                            account_id=account_id,
                                            country_ids=country_ids,
                                            platform_ids=platform_ids)

                # pylint: disable=line-too-long
                LOGGER.info('Report: {} - FINISHED Syncing for Account ID: {}, Total Records: {}'.format(
                    report_name, account_id, total_records))
                # pylint: enable=line-too-long
                update_currently_syncing(state, None)

        LOGGER.info('Account ID: {} - FINISHED Syncing'.format(account_id))
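
For completeness, here is a sketch of how this catalog-aware sync might be invoked from the tap's entry point, using singer-python's argument parsing. REQUIRED_CONFIG_KEYS, build_client, and the discover wiring are illustrative assumptions, not the tap's actual code:

import singer
from singer import utils

LOGGER = singer.get_logger()
REQUIRED_CONFIG_KEYS = ['account_ids', 'start_date']  # illustrative subset

@utils.handle_top_exception(LOGGER)
def main():
    parsed_args = utils.parse_args(REQUIRED_CONFIG_KEYS)
    # build_client stands in for however the tap constructs its API client
    client = build_client(parsed_args.config)

    if parsed_args.discover:
        catalog = discover(parsed_args.config.get('reports', []))
        catalog.dump()
    else:
        sync(client=client,
             config=parsed_args.config,
             catalog=parsed_args.catalog,
             state=parsed_args.state or {})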