Example #1
def sync_endpoint(
        client,  #pylint: disable=too-many-branches
        catalog,
        state,
        start_date,
        stream_name,
        path,
        endpoint_config,
        data_key,
        static_params,
        bookmark_query_field=None,
        bookmark_field=None,
        id_fields=None,
        parent=None,
        parent_id=None):

    # Get the latest bookmark for the stream and set the last_datetime
    last_datetime = get_bookmark(state, stream_name, start_date)
    max_bookmark_value = last_datetime
    LOGGER.info('{}: bookmark last_datetime = {}'.format(
        stream_name, max_bookmark_value))

    write_schema(catalog, stream_name)

    # Initialize child_max_bookmarks
    child_max_bookmarks = {}
    children = endpoint_config.get('children')
    if children:
        for child_stream_name, child_endpoint_config in children.items():
            should_stream, last_stream_child = should_sync_stream(
                get_selected_streams(catalog), None, child_stream_name)

            if should_stream:
                child_bookmark_field = child_endpoint_config.get(
                    'bookmark_field')
                if child_bookmark_field:
                    child_last_datetime = get_bookmark(state, stream_name,
                                                       start_date)
                    child_max_bookmarks[
                        child_stream_name] = child_last_datetime

    # Pagination reference:
    # https://docs.microsoft.com/en-us/linkedin/shared/api-guide/concepts/pagination?context=linkedin/marketing/context
    # Each page has a "start" (offset value) and a "count" (batch size, number of records)
    # Increase the "start" by the "count" for each batch.
    # Continue until the "start" exceeds the total_records.
    start = 0  # Starting offset value for each batch API call
    count = endpoint_config.get(
        'count',
        100)  # Batch size; Number of records per API call, default = 100
    total_records = 0
    page = 1
    params = {
        'start': start,
        'count': count,
        **static_params  # adds in endpoint specific, sort, filter params
    }
    if bookmark_query_field:
        params[bookmark_query_field] = last_datetime

    querystring = '&'.join(
        ['%s=%s' % (key, value) for (key, value) in params.items()])
    next_url = 'https://api.linkedin.com/v2/{}?{}'.format(path, querystring)

    while next_url:
        LOGGER.info('URL for {}: {}'.format(stream_name, next_url))

        # Get data, API request
        data = client.get(url=next_url, endpoint=stream_name)
        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        # LOGGER.info('stream_name = , data = {}'.format(stream_name, data))  # TESTING, comment out

        # Transform data with transform_json from transform.py
        #  This function converts unix datetimes, de-nests audit fields,
        #  transforms URNs to IDs, transforms/abstracts variably named fields,
        #  and converts camelCase to snake_case for field name keys.
        # For the LinkedIn Ads API, 'elements' is always the root data_key for records.
        # The data_key identifies the collection of records below the <root> element
        transformed_data = []  # initialize the record list
        if data_key in data:
            transformed_data = transform_json(data, stream_name)[data_key]
        # LOGGER.info('stream_name = , transformed_data = {}'.format(stream_name, transformed_data))  # TESTING, comment out
        if not transformed_data:
            LOGGER.info('No transformed_data')
            # LOGGER.info('data_key = {}, data = {}'.format(data_key, data))
            break  # No data results

        # Process records and get the max_bookmark_value and record_count for the set of records
        max_bookmark_value, record_count = process_records(
            catalog=catalog,
            stream_name=stream_name,
            records=transformed_data,
            time_extracted=time_extracted,
            bookmark_field=bookmark_field,
            max_bookmark_value=max_bookmark_value,
            last_datetime=last_datetime,
            parent=parent,
            parent_id=parent_id)
        LOGGER.info('{}, records processed: {}'.format(stream_name,
                                                       record_count))
        total_records = total_records + record_count

        # Loop through this batch of parent records for each child stream (if that stream is selected)
        if children:
            for child_stream_name, child_endpoint_config in children.items():
                should_stream, last_stream_child = should_sync_stream(
                    get_selected_streams(catalog), None, child_stream_name)
                if should_stream:
                    # For each parent record
                    for record in transformed_data:
                        # Determine the parent ID field: use 'id' if present,
                        # otherwise fall back to the first field in id_fields
                        parent_id_field = 'id' if 'id' in id_fields else id_fields[0]
                        parent_id = record.get(parent_id_field)
                        # Add children filter params based on parent IDs
                        if stream_name == 'accounts':
                            account = 'urn:li:sponsoredAccount:{}'.format(
                                parent_id)
                            owner_id = record.get('reference_organization_id',
                                                  None)
                            owner = 'urn:li:organization:{}'.format(owner_id)
                            if child_stream_name == 'video_ads' and owner_id is not None:
                                child_endpoint_config['params'][
                                    'account'] = account
                                child_endpoint_config['params'][
                                    'owner'] = owner
                        elif stream_name == 'campaigns':
                            campaign = 'urn:li:sponsoredCampaign:{}'.format(
                                parent_id)
                            if child_stream_name == 'creatives':
                                child_endpoint_config['params'][
                                    'search.campaign.values[0]'] = campaign
                            elif child_stream_name in (
                                    'ad_analytics_by_campaign',
                                    'ad_analytics_by_creative'):
                                child_endpoint_config['params'][
                                    'campaigns[0]'] = campaign

                        LOGGER.info(
                            'Syncing: {}, parent_stream: {}, parent_id: {}'.
                            format(child_stream_name, stream_name, parent_id))
                        child_path = child_endpoint_config.get('path')
                        child_total_records, child_batch_bookmark_value = sync_endpoint(
                            client=client,
                            catalog=catalog,
                            state=state,
                            start_date=start_date,
                            stream_name=child_stream_name,
                            path=child_path,
                            endpoint_config=child_endpoint_config,
                            data_key=child_endpoint_config.get(
                                'data_key', 'elements'),
                            static_params=child_endpoint_config.get(
                                'params', {}),
                            bookmark_query_field=child_endpoint_config.get(
                                'bookmark_query_field'),
                            bookmark_field=child_endpoint_config.get(
                                'bookmark_field'),
                            id_fields=child_endpoint_config.get('id_fields'),
                            parent=child_endpoint_config.get('parent'),
                            parent_id=parent_id)

                        child_batch_bookmark_dttm = strptime_to_utc(
                            child_batch_bookmark_value)
                        child_max_bookmark = child_max_bookmarks.get(
                            child_stream_name)
                        child_max_bookmark_dttm = strptime_to_utc(
                            child_max_bookmark)
                        if child_batch_bookmark_dttm > child_max_bookmark_dttm:
                            child_max_bookmarks[child_stream_name] = strftime(
                                child_batch_bookmark_dttm)

                        LOGGER.info(
                            'Synced: {}, parent_id: {}, total_records: {}'.
                            format(child_stream_name, parent_id,
                                   child_total_records))

        # Pagination: Get next_url
        next_url = None
        links = data.get('paging', {}).get('links', [])
        for link in links:
            rel = link.get('rel')
            if rel == 'next':
                href = link.get('href')
                if href:
                    next_url = 'https://api.linkedin.com{}'.format(
                        urllib.parse.unquote(href))

        LOGGER.info(
            '{}: Synced page {}, this page: {}. Total records processed: {}'.
            format(stream_name, page, record_count, total_records))
        page = page + 1

    # Write child bookmarks
    for key, val in list(child_max_bookmarks.items()):
        write_bookmark(state, key, val)

    return total_records, max_bookmark_value
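
The heart of Example #1 is the paging loop: it walks the collection with start/count offsets and then follows the rel="next" link from the paging.links metadata. A minimal standalone sketch of that pattern is shown below, using plain requests in place of the tap's client wrapper; the function name, the access-token handling, and the hard-coded 'elements' key are illustrative assumptions, not part of the tap.

import urllib.parse

import requests  # assumption: plain requests stands in for the tap's client wrapper


def fetch_all_pages(path, params, access_token):
    """Sketch of the offset + rel="next" pagination pattern used in sync_endpoint.

    Illustration only: the real tap routes requests through its own client,
    which handles auth, retries, and rate limiting.
    """
    headers = {'Authorization': 'Bearer {}'.format(access_token)}
    querystring = '&'.join('{}={}'.format(key, value) for key, value in params.items())
    next_url = 'https://api.linkedin.com/v2/{}?{}'.format(path, querystring)
    records = []
    while next_url:
        data = requests.get(next_url, headers=headers).json()
        records.extend(data.get('elements', []))
        # Follow the rel="next" link if the API provides one; otherwise stop
        next_url = None
        for link in data.get('paging', {}).get('links', []):
            if link.get('rel') == 'next' and link.get('href'):
                next_url = 'https://api.linkedin.com{}'.format(
                    urllib.parse.unquote(link['href']))
    return records

A call such as fetch_all_pages('adAccountsV2', {'start': 0, 'count': 100, 'q': 'search'}, token) would collect every record across pages; the path and params here are purely illustrative.
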
Example #2
def sync_ad_analytics(client,
                      catalog,
                      state,
                      last_datetime,
                      stream_name,
                      path,
                      endpoint_config,
                      data_key,
                      static_params,
                      bookmark_query_field=None,
                      bookmark_field=None,
                      id_fields=None,
                      parent=None,
                      parent_id=None):
    # pylint: disable=too-many-branches,too-many-statements,unused-argument

    # LinkedIn has a max of 20 fields per request. We cap the chunks at 17
    # to make sure there's always room for us to append `dateRange`,
    # `pivot`, and `pivotValue`
    MAX_CHUNK_LENGTH = 17

    max_bookmark_value = last_datetime
    # Look back 7 days before the last bookmark when building the first date window
    last_datetime_dt = strptime_to_utc(last_datetime) - timedelta(days=7)

    window_start_date = last_datetime_dt.date()
    window_end_date = window_start_date + timedelta(days=DATE_WINDOW_SIZE)
    today = datetime.date.today()

    if window_end_date > today:
        window_end_date = today

    # Override the default start and end dates
    static_params = {
        **static_params,
        'dateRange.start.day': window_start_date.day,
        'dateRange.start.month': window_start_date.month,
        'dateRange.start.year': window_start_date.year,
        'dateRange.end.day': window_end_date.day,
        'dateRange.end.month': window_end_date.month,
        'dateRange.end.year': window_end_date.year,
    }

    valid_selected_fields = [
        snake_case_to_camel_case(field)
        for field in selected_fields(catalog.get_stream(stream_name)) if
        snake_case_to_camel_case(field) in FIELDS_AVAILABLE_FOR_AD_ANALYTICS_V2
    ]

    # When testing the API, if every requested field returns `0`, then
    # the API returns an empty response.

    # However, the API distinguishes between a day with non-null values
    # (even if those values are all `0`) and a day with null values. We
    # found that requesting these fields gives you the days with
    # non-null values
    first_chunk = [['dateRange', 'pivot', 'pivotValue']]

    chunks = first_chunk + list(
        split_into_chunks(valid_selected_fields, MAX_CHUNK_LENGTH))

    # We have to append these fields to ensure they come back in every
    # response, so that we can build the composite primary key for each
    # record and merge the multiple responses on that key
    for chunk in chunks:
        for field in ['dateRange', 'pivot', 'pivotValue']:
            if field not in chunk:
                chunk.append(field)

    ############### PAGINATION (for these 2 streams) ###############
    # The tap queries LinkedIn with one campaign ID at a time.
    # One campaign permits up to 100 ads.
    # With 1 active ad, the tap's existing 30-day window size and
    #   timeGranularity=DAILY (results grouped by day) yield 30 records per API response.
    # With the maximum permitted number of ads, at most 3,000 records are returned per response.
    # If 'count' equaled the number of records returned (e.g. count=100 and records=100),
    #   the API would return a next URL, and requesting that URL would return a 400 error.
    # That case is unreachable here because 'count' is 10000 and at most 3,000 records
    #   are returned in an API response.

    total_records = 0
    while window_end_date <= today:
        responses = []
        for chunk in chunks:
            static_params['fields'] = ','.join(chunk)
            params = {
                "start": 0,
                "count": endpoint_config.get('count', 100),
                **static_params
            }
            query_string = '&'.join(
                ['%s=%s' % (key, value) for (key, value) in params.items()])
            LOGGER.info('Syncing %s from %s to %s', parent_id,
                        window_start_date, window_end_date)
            for page in sync_analytics_endpoint(client, stream_name,
                                                endpoint_config.get('path'),
                                                query_string):
                if page.get(data_key):
                    responses.append(page.get(data_key))
        raw_records = merge_responses(responses)
        time_extracted = utils.now()

        # While we broke the ad_analytics streams out from
        # `sync_endpoint()`, we want to process them the same. And
        # transform_json() expects a dictionary with a key equal to
        # `data_key` and its value is the response from the API

        # Note that `transform_json()` returns the same structure we pass
        # in. `sync_endpoint()` grabs `data_key` from the return value, so
        # we mirror that here
        transformed_data = transform_json(
            {data_key: list(raw_records.values())}, stream_name)[data_key]
        if not transformed_data:
            LOGGER.info('No transformed_data')
        else:
            max_bookmark_value, record_count = process_records(
                catalog=catalog,
                stream_name=stream_name,
                records=transformed_data,
                time_extracted=time_extracted,
                bookmark_field=bookmark_field,
                max_bookmark_value=last_datetime,
                last_datetime=strftime(last_datetime_dt),
                parent=parent,
                parent_id=parent_id)
            LOGGER.info('%s, records processed: %s', stream_name, record_count)
            LOGGER.info('%s: max_bookmark: %s', stream_name,
                        max_bookmark_value)
            total_records += record_count

        window_start_date, window_end_date, static_params = shift_sync_window(
            static_params, today)

        if window_start_date == window_end_date:
            break

    return total_records, max_bookmark_value
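
Example #2 relies on a few helpers that are not shown here (split_into_chunks, merge_responses, shift_sync_window). As an illustration of the field-chunking step only, a generic chunker compatible with the call in the code might look like the sketch below; this is an assumption about the helper's behavior inferred from how it is called, not the tap's actual implementation.

def split_into_chunks(fields, chunk_length):
    """Yield successive lists of at most chunk_length fields.

    Assumed behavior only, inferred from the call site above.
    """
    for start in range(0, len(fields), chunk_length):
        yield list(fields[start:start + chunk_length])


# For example, 40 selected fields with MAX_CHUNK_LENGTH = 17 would yield chunks of
# 17, 17, and 6 fields, leaving room in every request to append dateRange, pivot,
# and pivotValue while staying under LinkedIn's 20-field-per-request cap.
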