Example #1
0
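# Both excerpts below come from a Singer-style tap for Google Search Console.
# A minimal sketch of the module-level context they assume; these imports and
# the BASE_URL value are assumptions, and get_bookmark, write_bookmark,
# transform_json and process_records are helpers defined elsewhere in the tap.
import json

import singer
from singer import utils

LOGGER = singer.get_logger()
BASE_URL = 'https://www.googleapis.com/webmasters/v3'  # assumed; only used for logging in the second excerpt
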
def sync_endpoint(
        client,  #pylint: disable=too-many-branches
        catalog,
        state,
        start_date,
        stream_name,
        site,
        sub_type,
        dimensions_list,
        path,
        endpoint_config,
        api_method,
        pagination,
        static_params,
        bookmark_field=None,
        data_key=None,
        body_params=None,
        id_fields=None,
        resource_name=None):

    # Get the latest bookmark for the stream and set the last_datetime
    last_datetime = get_bookmark(state, stream_name, site, sub_type,
                                 start_date)
    max_bookmark_value = last_datetime

    # Pagination: loop thru all pages of data
    # Pagination types: none, body, params
    # Each page has an offset (starting value) and a limit (batch size, number of records)
    # Increase the "offset" by the "limit" for each batch.
    # Continue until the "offset" exceeds the total_records.
    offset = 0  # Starting offset value for each batch API call
    limit = endpoint_config.get(
        'row_limit', 1000)  # Batch size; Number of records per API call
    total_records = 0
    batch_count = limit
    page = 1
    path_params = {'siteUrl': site}

    while limit == batch_count:
        if pagination == 'body':
            body = {
                'startRow': offset,
                'rowLimit': limit,
                **body_params  # merges in endpoint-specific sort and filter body params
            }
            params = static_params
        elif pagination == 'params':
            params = {
                'startRow': offset,
                'rowLimit': limit,
                **static_params  # merges in endpoint-specific sort and filter query params
            }
            body = body_params
        else:
            params = static_params
            body = body_params

        LOGGER.info(
            'Stream: {}, Site: {}, Type: {} - Batch Sync start, Offset: {}'.
            format(stream_name, site, sub_type, offset))

        LOGGER.info('URL for Stream: {}, Site: {}, Search type: {}'.format(
            stream_name, site, sub_type))
        if body:
            LOGGER.info('body = {}'.format(body))

        # API request data, endpoint = stream_name passed to client for metrics logging
        if body:
            path_params['body'] = body
        data = client.get(method_name=api_method,
                          resource_name=resource_name,
                          params=path_params)

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        if not data:  # None or empty response
            LOGGER.info('xxx NO DATA xxx')

            if bookmark_field:
                write_bookmark(state, stream_name, site, sub_type,
                               max_bookmark_value)
            return 0  # No data results

        # Transform data with transform_json from transform.py
        transformed_data = []  # initialize the record list

        # Sites endpoint returns a single record dictionary (not a list)
        if stream_name == 'sites':
            data_list = []
            data_list.append(data)
            data_dict = {}
            data_dict[data_key] = data_list
            data = data_dict
        if data_key in data:
            transformed_data = transform_json(data, stream_name, data_key,
                                              site, sub_type,
                                              dimensions_list)[data_key]
        else:
            LOGGER.info('Number of raw data records: 0')
        if not transformed_data:
            LOGGER.info('xxx NO TRANSFORMED DATA xxx')

            if bookmark_field:
                write_bookmark(state, stream_name, site, sub_type,
                               max_bookmark_value)
            return 0  # No data results
        for record in transformed_data:
            for key in id_fields:
                if not record.get(key):
                    primary_keys_only = {
                        id_field: record.get(id_field)
                        for id_field in id_fields
                    }
                    raise ValueError(
                        'Missing key {} in record with primary keys {}'.format(
                            key, primary_keys_only))
        batch_count = len(transformed_data)

        # Process records and get the max_bookmark_value and record_count for the set of records
        max_bookmark_value = process_records(
            catalog=catalog,
            stream_name=stream_name,
            records=transformed_data,
            time_extracted=time_extracted,
            bookmark_field=bookmark_field,
            max_bookmark_value=max_bookmark_value,
            last_datetime=last_datetime)

        # Update the running total of records before computing the batch range
        total_records = total_records + batch_count

        # to_rec: to record; ending record for the batch
        to_rec = offset + limit
        if to_rec > total_records:
            to_rec = total_records

        LOGGER.info(
            'Stream: {}, Site: {}, Type: {}, Page: {}, Batch records: {} to {}'
            .format(stream_name, site, sub_type, page, offset, to_rec))
        # Pagination: increment the offset by the limit (batch-size)
        offset = offset + limit
        page = page + 1

    # Update the state with the max_bookmark_value for the stream, site, sub_type
    # Reference: https://developers.google.com/webmaster-tools/search-console-api-original/v3/searchanalytics/query
    # NOTE: Results are sorted by click count descending.
    #       If two rows have the same click count, they are sorted in an arbitrary way.
    #       Records are NOT sorted in DATE order.
    # THEREFORE: State is updated after ALL pages of data for stream, site, sub_type, date window
    if bookmark_field:
        write_bookmark(state, stream_name, site, sub_type, max_bookmark_value)

    # Return total_records across all batches
    return total_records
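
# A minimal, self-contained sketch of the termination rule used by the function
# above: keep requesting pages while each page comes back full
# (len(batch) == limit). `fetch_page` is a hypothetical callable standing in
# for the client request plus transform step; it is not part of the original code.
def paginate_by_batch_count(fetch_page, limit=1000):
    offset = 0
    total_records = 0
    batch_count = limit  # primed so the first iteration always runs
    while batch_count == limit:
        batch = fetch_page(offset, limit)  # expected to return a list of records
        batch_count = len(batch)
        total_records += batch_count
        offset += limit
    return total_records

# Example: paginate_by_batch_count(lambda offset, limit: rows[offset:offset + limit])
# walks an in-memory list `rows` the same way the function above walks API pages.
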
def sync_endpoint(
        client,  #pylint: disable=too-many-branches
        catalog,
        state,
        start_date,
        stream_name,
        site,
        sub_type,
        dimensions_list,
        path,
        endpoint_config,
        api_method,
        pagination,
        static_params,
        bookmark_field=None,
        bookmark_type=None,
        data_key=None,
        body_params=None,
        id_fields=None):

    # Get the latest bookmark for the stream and set the last_integer/datetime
    last_datetime = None
    last_integer = None
    max_bookmark_value = None
    if bookmark_type == 'integer':
        last_integer = get_bookmark(state, stream_name, site, sub_type, 0)
        max_bookmark_value = last_integer
    else:
        last_datetime = get_bookmark(state, stream_name, site, sub_type,
                                     start_date)
        max_bookmark_value = last_datetime

    # Pagination: loop thru all pages of data
    # Pagination types: none, body, params
    # Each page has an offset (starting value) and a limit (batch size, number of records)
    # Increase the "offset" by the "limit" for each batch.
    # Continue until the "offset" exceeds the total_records.
    offset = 0  # Starting offset value for each batch API call
    limit = 25000  # Batch size; Number of records per API call
    total_records = limit  # Initialize total; set to actual total on first API call

    while offset <= total_records:
        if pagination == 'body':
            body = {
                'startRow': offset,
                'rowLimit': limit,
                **body_params  # merges in endpoint-specific sort and filter body params
            }
            params = static_params
        elif pagination == 'params':
            params = {
                'startRow': offset,
                'rowLimit': limit,
                **static_params  # merges in endpoint-specific sort and filter query params
            }
            body = body_params
        else:
            params = static_params
            body = body_params

        LOGGER.info(
            'Stream: {}, Site: {}, Type: {} - Batch Sync start, Offset: {}'.
            format(stream_name, site, sub_type, offset))

        # Squash params to query-string params
        querystring = None
        if params.items():
            querystring = '&'.join(
                ['%s=%s' % (key, value) for (key, value) in params.items()])
        LOGGER.info('URL for Stream: {}, Site: {} ({}): {}/{}{}'.format(
            stream_name, site, api_method, BASE_URL, path,
            '?{}'.format(querystring) if querystring else ''))
        if body:
            LOGGER.info('body = {}'.format(body))

        # API request data, endpoint = stream_name passed to client for metrics logging
        data = {}
        if api_method == 'GET':
            data = client.get(path=path,
                              params=querystring,
                              endpoint=stream_name)
        elif api_method == 'POST':
            data = client.post(path=path,
                               params=querystring,
                               endpoint=stream_name,
                               data=json.dumps(body))

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        if not data:  # None or empty response
            LOGGER.info('xxx NO DATA xxx')
            return 0  # No data results

        # Transform data with transform_json from transform.py
        transformed_data = []  # initialize the record list

        # Sites endpoint returns a single record dictionary (not a list)
        if stream_name == 'sites':
            data_list = []
            data_list.append(data)
            data_dict = {}
            data_dict[data_key] = data_list
            data = data_dict
        # LOGGER.info('data = {}'.format(data)) # TESTING, comment out
        if data_key in data:
            LOGGER.info('Number of raw data records: {}'.format(
                len(data[data_key])))
            transformed_data = transform_json(data, stream_name, data_key,
                                              site, sub_type,
                                              dimensions_list)[data_key]
            LOGGER.info('Number of transformed_data records: {}'.format(
                len(transformed_data)))
        else:
            LOGGER.info('Number of raw data records: 0')
        # LOGGER.info('transformed_data = {}'.format(transformed_data))  # TESTING, comment out
        if not transformed_data:
            LOGGER.info('xxx NO TRANSFORMED DATA xxx')
            return 0  # No data results
        for record in transformed_data:
            for key in id_fields:
                if not record.get(key):
                    LOGGER.info('xxx Missing key {} in record: {}'.format(
                        key, record))

        # Process records and get the max_bookmark_value and record_count for the set of records
        max_bookmark_value = process_records(
            catalog=catalog,
            stream_name=stream_name,
            records=transformed_data,
            time_extracted=time_extracted,
            bookmark_field=bookmark_field,
            bookmark_type=bookmark_type,
            max_bookmark_value=max_bookmark_value,
            last_datetime=last_datetime,
            last_integer=last_integer)

        # set total_records for pagination
        total_records = offset + len(transformed_data)
        LOGGER.info('total_records: {}, offset: {}, length: {}'.format(
            total_records, offset, len(transformed_data)))

        # Update the state with the max_bookmark_value for the stream, site, sub_type
        if bookmark_field:
            write_bookmark(state, stream_name, site, sub_type,
                           max_bookmark_value)

        # to_rec: to record; ending record for the batch
        to_rec = offset + limit
        if to_rec > total_records:
            to_rec = total_records

        LOGGER.info(
            'Stream: {}, Site: {}, Type: {} - Synced batch records - {} to {}'.
            format(stream_name, site, sub_type, offset, to_rec))
        # Pagination: increment the offset by the limit (batch-size)
        offset = offset + limit

    # Return total_records across all batches
    return total_records
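
# The second variant terminates differently: total_records is re-estimated on
# every page as offset + len(batch), and the loop runs while
# offset <= total_records. A minimal sketch of that rule; `fetch_page` is again
# a hypothetical stand-in for the HTTP request plus transform step.
def paginate_by_offset_estimate(fetch_page, limit=25000):
    offset = 0
    total_records = limit  # initial guess; corrected after the first page
    while offset <= total_records:
        batch = fetch_page(offset, limit)
        if not batch:
            return 0  # mirrors the original: an empty page returns 0, not the running total
        total_records = offset + len(batch)
        offset += limit
    return total_records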