Example #1
    async def sync_trace_search(self, schema):
        """Incidents."""
        stream = "trace_search"
        loop = asyncio.get_event_loop()

        singer.write_schema(stream, schema.to_dict(), ["hour"])
        trace_search = await loop.run_in_executor(None,
                                                  self.client.hourly_request,
                                                  self.state, self.config,
                                                  f"traces", stream)
        if trace_search and trace_search.get('usage'):
            for trace in trace_search['usage']:
                singer.write_record(stream, trace)
            # Bookmark the hour of the last record so the next run resumes there.
            self.state = write_bookmark(self.state, stream, "since",
                                        trace_search['usage'][-1]['hour'])
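The bookmark handling in these examples is easier to follow with the state layout in view. Below is a minimal sketch of what `write_bookmark` and `get_bookmark` from singer-python's `singer.bookmarks` module do to the state dict; the `usage` payload and stream name are illustrative only.

from singer.bookmarks import get_bookmark, write_bookmark

state = {}
usage = [{"hour": "2021-01-01T00:00:00Z"}, {"hour": "2021-01-01T01:00:00Z"}]  # invented payload

# Persist the hour of the last record under the stream's "since" key.
state = write_bookmark(state, "trace_search", "since", usage[-1]["hour"])
# state == {'bookmarks': {'trace_search': {'since': '2021-01-01T01:00:00Z'}}}

# A later run reads the same key back to resume the hourly request.
since = get_bookmark(state, "trace_search", "since")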
Example #2
    async def sync_custom_usage(self, schema):
        """Get hourly usage for custom metric."""
        stream = "custom_usage"
        loop = asyncio.get_event_loop()

        singer.write_schema(stream, schema.to_dict(), ["hour"])
        custom_usage = await loop.run_in_executor(None,
                                                  self.client.hourly_request,
                                                  self.state, self.config,
                                                  f"timeseries", stream)
        if custom_usage and custom_usage.get('usage'):
            for c in custom_usage['usage']:
                singer.write_record(stream, c)
            # Bookmark the hour of the last record so the next run resumes there.
            self.state = write_bookmark(self.state, stream, "since",
                                        custom_usage['usage'][-1]['hour'])
Example #3
def append_data(response, state, config, stream, tokens, query):
    """Collect records across all pages, bookmarking the current page if a request fails."""
    ref = get_ref()
    key = "currentpage"
    tap_data = []
    if not response:
        return tap_data, state
    tap_data = response.json()[ref[stream.tap_stream_id]]
    if "pagecount" in response.json():
        for i in range(0, response.json()["pagecount"]):
            query["pageoffset"] = i
            response = make_request(config, stream, tokens, query)
            if not response:
                state = write_bookmark(state, stream.tap_stream_id, key, i)
                return tap_data, state
            table = response.json()[ref[stream.tap_stream_id]]
            tap_data = tap_data + table
    
    return tap_data, state
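`append_data` records the failing page under the `currentpage` key but never reads it back itself; that is left to the caller on the next run. A rough sketch of how the saved page could seed the next query, assuming singer-python's `get_bookmark`; the `build_query` helper and its parameters are hypothetical, not part of the original tap.

from singer.bookmarks import get_bookmark

def build_query(state, stream, page_size=100):
    # Resume from the page append_data bookmarked when a request failed,
    # falling back to page 0 after a clean previous run.
    last_page = get_bookmark(state, stream.tap_stream_id, "currentpage", 0)
    return {"pagesize": page_size, "pageoffset": last_page}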
Example #4
    async def sync_bills(self, schema, period: pendulum.period = None):
        """Output the `bills` in the period."""
        stream = "bills"
        loop = asyncio.get_event_loop()

        if not period:
            # build a default period from the last bookmark
            bookmark = get_bookmark(self.state, stream, "start_time")
            start = pendulum.parse(bookmark)
            end = pendulum.now()
            period = pendulum.period(start, end)

        singer.write_schema(stream, schema.to_dict(), ["invoice_id"])

        for at in period.range("months"):
            result = await loop.run_in_executor(None, self.client.bill, at)
            if result:
                singer.write_record(stream, result)
                end = datetime.datetime.strptime(
                    result["end_time"], "%Y-%m-%dT%H:%M:%SZ").isoformat()
                self.state = write_bookmark(self.state, stream, "start_time",
                                            end)
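The default period built above drives a month-by-month loop via pendulum. A standalone sketch of that iteration, assuming pendulum 2.x and invented dates:

import pendulum

start = pendulum.parse("2021-01-15T00:00:00Z")
end = pendulum.parse("2021-04-01T00:00:00Z")

# range("months") yields the start plus each whole month up to the end,
# which is what sync_bills walks to request one bill per billing month.
for at in pendulum.period(start, end).range("months"):
    print(at.to_date_string())
# 2021-01-15
# 2021-02-15
# 2021-03-15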
Example #5
    async def sync_stats(self, schema, period: pendulum.period = None):
        """Output the stats in the period."""
        stream = "stats"
        loop = asyncio.get_event_loop()

        singer.write_schema(stream, schema.to_dict(),
                            ["service_id", "start_time"])
        bookmark = get_bookmark(self.state, stream, "from")
        if bookmark is not None:
            if "UTC" in bookmark:
                bookmark = datetime.datetime.strptime(
                    bookmark, '%Y-%m-%d %H:%M:%S UTC').isoformat()
            start_date = pendulum.parse(bookmark).int_timestamp
        else:
            start_date = pendulum.parse(
                self._config['start_date']).int_timestamp
        end_date = pendulum.now().int_timestamp
        result = await loop.run_in_executor(None, self.client.stats,
                                            start_date, end_date)
        if result:
            for n in result['data']:
                service_result = await loop.run_in_executor(
                    None, self.client.service, n)
                for i in result['data'][n]:
                    i['service_name'] = service_result['name']
                    i['service_versions'] = json.dumps(
                        service_result['versions'])
                    i['service_customer_id'] = service_result['customer_id']
                    i['service_publish_key'] = service_result['publish_key']
                    i['service_comment'] = service_result['comment']
                    i['service_deleted_at'] = service_result['deleted_at']
                    i['service_updated_at'] = service_result['updated_at']
                    i['service_created_at'] = service_result['created_at']
                    singer.write_record(stream, i)
                    end = datetime.datetime.strptime(
                        result['meta']["to"],
                        '%Y-%m-%d %H:%M:%S UTC').isoformat()
                    self.state = write_bookmark(self.state, stream, "from",
                                                end)
Example #6
def validate_state(config, catalog, state):
    for stream in catalog["streams"]:
        if not stream["schema"].get("selected"):
            # If a stream is deselected while it's the current stream, unset the
            # current stream.
            if stream["tap_stream_id"] == get_currently_syncing(state):
                set_currently_syncing(state, None)
            continue

        replication_key = determine_replication_key(stream['tap_stream_id'])
        if not replication_key:
            continue

        # If there's no bookmark for a stream (new integration, newly selected,
        # reset, etc) we need to use the default start date from the config.
        bookmark = get_bookmark(state, stream["tap_stream_id"],
                                replication_key)
        if bookmark is None:
            state = write_bookmark(state, stream["tap_stream_id"],
                                   replication_key, config["start_date"])

    singer.write_state(state)
    return state
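`validate_state` writes the configured start date only when a stream has no bookmark yet. On the read side, singer-python's `get_bookmark` accepts a default that expresses the same fallback; a minimal sketch with invented stream names:

from singer.bookmarks import get_bookmark

config = {"start_date": "2020-01-01T00:00:00Z"}
state = {"bookmarks": {"orders": {"updated_at": "2021-06-01T00:00:00Z"}}}

# Stream with a bookmark: the stored value wins.
get_bookmark(state, "orders", "updated_at", default=config["start_date"])
# -> '2021-06-01T00:00:00Z'

# Newly selected stream with no bookmark: fall back to the configured start date.
get_bookmark(state, "customers", "updated_at", default=config["start_date"])
# -> '2020-01-01T00:00:00Z'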
Example #7
def sync_a_day(stream_id, path, params, start_ymd, end_ymd, func_get_records,
               client, catalog, state, window_end):
    """Sync a single day's worth of data, paginating through as many times as
    necessary.

    Loop Guide
    1. Make the API call
    2. Process the response to get a list of records
    3. Persist the records
    4. Update the call parameters

    When we get back an empty result set, log a message that we are done,
    reset the page bookmark to page 1, and update the datetime bookmark to
    the day we just finished syncing.

    The length of the response can vary, but once we get back an empty
    response, it seems that we have requested all objects. Do not trust
    the `total` field on the response, it has varied from the actual total
    by O(10). The path to that field is response.json()['orders']['total'],
    where `response` is the object returned by the request function in `client.py`.

    """
    # Page until we're done
    while True:
        page_bookmark = bookmarks.get_bookmark(state, stream_id, 'page', 1)
        # Set the page to start paginating on
        params['page'] = page_bookmark
        LOGGER.info('Syncing %s from: %s to: %s - page %s', stream_id,
                    start_ymd, end_ymd, params['page'])
        data = client.get(path, params=params, endpoint=stream_id)
        records = func_get_records(data)
        if not records:
            LOGGER.info('Done daily pagination for endpoint %s at page %d',
                        stream_id, params['page'])
            bookmarks.write_bookmark(state, stream_id, 'page', 1)
            bookmarks.write_bookmark(state, stream_id, 'datetime',
                                     utils.strftime(window_end))
            singer.write_state(state)
            break
        else:
            persist_records(catalog, stream_id, records)
            singer.write_state(state)
            bookmarks.write_bookmark(state, stream_id, 'page',
                                     params['page'] + 1)
Example #8
def sync_paginated(client, state, stream):
    # http://developers.marketo.com/rest-api/endpoint-reference/lead-database-endpoint-reference/#!/Campaigns/getCampaignsUsingGET
    # http://developers.marketo.com/rest-api/endpoint-reference/lead-database-endpoint-reference/#!/Static_Lists/getListsUsingGET
    #
    # Campaigns and Static Lists are paginated with a max return of 300
    # items per page. There are no filters that can be used to only
    # return updated records.
    replication_key = determine_replication_key(stream['tap_stream_id'])

    singer.write_schema(stream["tap_stream_id"],
                        stream["schema"],
                        stream["key_properties"],
                        bookmark_properties=[replication_key])
    start_date = bookmarks.get_bookmark(state, stream["tap_stream_id"],
                                        replication_key)
    params = {"batchSize": 300}
    endpoint = "rest/v1/{}.json".format(stream["tap_stream_id"])

    # Paginated requests use paging tokens for retrieving the next page
    # of results. These tokens are stored in the state for resuming
    # syncs. If a paging token exists in state, use it.
    next_page_token = bookmarks.get_bookmark(state, stream["tap_stream_id"],
                                             "next_page_token")
    if next_page_token:
        params["nextPageToken"] = next_page_token

    # Keep querying pages of data until no next page token.
    record_count = 0
    job_started = pendulum.utcnow().isoformat()
    while True:
        data = client.request("GET",
                              endpoint,
                              endpoint_name=stream["tap_stream_id"],
                              params=params)

        time_extracted = utils.now()

        # Each row just needs the values formatted. If the record is
        # newer than the original start date, stream the record. Finally,
        # update the bookmark if newer than the existing bookmark.
        for row in data["result"]:
            record = format_values(stream, row)
            if record[replication_key] >= start_date:
                record_count += 1

                singer.write_record(stream["tap_stream_id"],
                                    record,
                                    time_extracted=time_extracted)

        # No next page, results are exhausted.
        if "nextPageToken" not in data:
            break

        # Store the next page token in state and continue.
        params["nextPageToken"] = data["nextPageToken"]
        state = bookmarks.write_bookmark(state, stream["tap_stream_id"],
                                         "next_page_token",
                                         data["nextPageToken"])
        singer.write_state(state)

    # Once all results are exhausted, unset the next page token bookmark
    # so the subsequent sync starts from the beginning.
    state = bookmarks.write_bookmark(state, stream["tap_stream_id"],
                                     "next_page_token", None)
    state = bookmarks.write_bookmark(state, stream["tap_stream_id"],
                                     replication_key, job_started)
    singer.write_state(state)
    return state, record_count
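The token bookkeeping in `sync_paginated` reads as a short lifecycle in state: store each page token so an interrupted sync can resume, then clear it and advance the replication key once the pages are exhausted. An illustrative trace, assuming singer-python's bookmarks helpers; the stream name, token, and replication key are invented.

from singer import bookmarks

state = {}
stream_id = "campaigns"

# Mid-sync: the next page token is stored before requesting the next page.
state = bookmarks.write_bookmark(state, stream_id, "next_page_token", "abc123")
# state == {'bookmarks': {'campaigns': {'next_page_token': 'abc123'}}}

# After the last page: unset the token and move the replication key to the
# time the job started, so the next run starts from the top with a fresh window.
state = bookmarks.write_bookmark(state, stream_id, "next_page_token", None)
state = bookmarks.write_bookmark(state, stream_id, "updatedAt", "2021-07-01T00:00:00Z")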
Example #9
def write_metrics_state(atx, metric, date_to_resume):
    write_bookmark(atx.state, metric, 'date_to_resume',
                   date_to_resume.to_datetime_string())
    atx.write_state()
Example #10
 def set_bookmark(self, path, val):
     if isinstance(val, date):
         val = val.isoformat()
     bks_.write_bookmark(self.state, path[0], path[1], val)
Example #11
 def set_bookmark(self, path, val):
     if isinstance(val, date):
         val = val.isoformat()
     if isinstance(val, str):
         val = pendulum.parse(val).to_iso8601_string()
     bks_.write_bookmark(self.state, path[0], path[1], val)
Example #12
def sync_report(stream_name, stream_metadata, sdk_client):
    """Sync an AdWords report stream across its date range, one report window at a time."""

    report_window_days = CONFIG.get("MAX_REPORT_TIME_WINDOW", 365)

    is_incremental = False
    if metadata.get(stream_metadata, (),
                    "replication-method") == "INCREMENTAL":
        is_incremental = True

    customer_id = sdk_client.client_customer_id

    stream_schema, _ = create_schema_for_report(stream_name, sdk_client)
    stream_schema = add_synthetic_keys_to_stream_schema(stream_schema)

    xml_attribute_list = get_fields_to_sync(stream_schema, stream_metadata)

    primary_keys = metadata.get(stream_metadata,
                                (), 'tap-adwords.report-key-properties') or []
    LOGGER.info("{} primary keys are {}".format(stream_name, primary_keys))

    write_schema(stream_name,
                 stream_schema,
                 primary_keys,
                 bookmark_properties=['day'])

    field_list = []
    for field in xml_attribute_list:
        field_list.append(stream_metadata[('properties',
                                           field)]['adwords.fieldName'])

    check_selected_fields(stream_name, field_list, sdk_client)
    # If an attribution window sync is interrupted, start where it left off
    start_date = get_attribution_window_bookmark(customer_id, stream_name)
    if start_date is not None:
        start_date = start_date + relativedelta(days=1)

    if start_date is None:
        start_date = apply_conversion_window(
            get_start_for_stream(customer_id, stream_name))

    if stream_name in REPORTS_WITH_90_DAY_MAX:
        cutoff = utils.now() + relativedelta(days=-90)
        if start_date < cutoff:
            LOGGER.warning(
                "report only supports up to 90 days, will start at {}".format(
                    cutoff))
            start_date = cutoff

    start_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0)

    LOGGER.info('Selected fields: %s', field_list)

    max_end_date = utils.now() - relativedelta(days=1)
    required_end_date = get_end_date()

    report_end_date = min(max_end_date, required_end_date)
    report_end_date = report_end_date.replace(hour=23,
                                              minute=59,
                                              second=59,
                                              microsecond=0)

    next_start_date = start_date

    is_single_day_report = stream_name in REPORTS_REQUIRING_DAILY_REPORTS
    start_plus_window = next_start_date
    if not is_single_day_report:
        start_plus_window += relativedelta(days=report_window_days)
    end_date = min(start_plus_window, report_end_date)

    while next_start_date <= report_end_date:
        singer.log_info("syncing %s for %s - %s", stream_name,
                        next_start_date.strftime("%Y-%m-%d"),
                        end_date.strftime("%Y-%m-%d"))
        actual_end_date = min(end_date, report_end_date)
        sync_report_for_day(stream_name, stream_schema, sdk_client,
                            next_start_date, field_list, actual_end_date)
        next_start_date = end_date + relativedelta(days=1)

        start_plus_window = next_start_date
        if not is_single_day_report:
            start_plus_window += relativedelta(days=report_window_days)

        end_date = start_plus_window

        bookmarks.write_bookmark(STATE, state_key_name(customer_id,
                                                       stream_name),
                                 'last_attribution_window_date',
                                 actual_end_date.strftime(utils.DATETIME_FMT))
        singer.write_state(STATE)
    if not is_incremental:
        bookmarks.clear_bookmark(STATE, state_key_name(customer_id,
                                                       stream_name),
                                 'last_attribution_window_date')
    singer.write_state(STATE)
    LOGGER.info("Done syncing the %s report for customer_id %s", stream_name,
                customer_id)
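The loop above advances a sliding window of `report_window_days` (single-day reports collapse the window to one day). A simplified sketch of just the window arithmetic, assuming dateutil's relativedelta, with the daily-report special case omitted and invented dates:

from datetime import datetime
from dateutil.relativedelta import relativedelta

report_window_days = 30
next_start_date = datetime(2021, 1, 1)
report_end_date = datetime(2021, 2, 15)
end_date = min(next_start_date + relativedelta(days=report_window_days), report_end_date)

while next_start_date <= report_end_date:
    actual_end_date = min(end_date, report_end_date)
    print(next_start_date.date(), "->", actual_end_date.date())
    # The next window starts the day after the one just requested.
    next_start_date = end_date + relativedelta(days=1)
    end_date = next_start_date + relativedelta(days=report_window_days)
# 2021-01-01 -> 2021-01-31
# 2021-02-01 -> 2021-02-15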
Example #13
def sync_report_for_day(stream_name, stream_schema, sdk_client, start,
                        field_list):  # pylint: disable=too-many-locals
    """Download, transform, and emit one day of an AdWords report, advancing the bookmark when the day is newer."""
    report_downloader = sdk_client.GetReportDownloader(version=VERSION)
    customer_id = sdk_client.client_customer_id
    report = {
        'reportName': 'Seems this is required',
        'dateRangeType': 'CUSTOM_DATE',
        'reportType': stream_name,
        'downloadFormat': 'CSV',
        'selector': {
            'fields': field_list,
            'dateRange': {
                'min': start.strftime('%Y%m%d'),
                'max': start.strftime('%Y%m%d')
            }
        }
    }

    # Fetch the report as a csv string
    with metrics.http_request_timer(stream_name):
        result = attempt_download_report(report_downloader, report)

    headers, csv_reader = parse_csv_stream(result)
    with metrics.record_counter(stream_name) as counter:
        time_extracted = utils.now()

        with Transformer(singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING
                         ) as bumble_bee:
            for row in csv_reader:
                obj = dict(
                    zip(get_xml_attribute_headers(stream_schema, headers),
                        row))
                obj['_sdc_customer_id'] = customer_id
                obj['_sdc_report_datetime'] = REPORT_RUN_DATETIME

                bumble_bee.pre_hook = transform_pre_hook
                obj = bumble_bee.transform(obj, stream_schema)

                singer.write_record(stream_name,
                                    obj,
                                    time_extracted=time_extracted)
                counter.increment()

        if start > get_start_for_stream(sdk_client.client_customer_id,
                                        stream_name):
            LOGGER.info(
                'updating bookmark: %s > %s', start,
                get_start_for_stream(sdk_client.client_customer_id,
                                     stream_name))
            bookmarks.write_bookmark(
                STATE,
                state_key_name(sdk_client.client_customer_id, stream_name),
                'date', utils.strftime(start))
            singer.write_state(STATE)
        else:
            LOGGER.info(
                'not updating bookmark: %s <= %s', start,
                get_start_for_stream(sdk_client.client_customer_id,
                                     stream_name))

        LOGGER.info(
            "Done syncing %s records for the %s report for customer_id %s on %s",
            counter.value, stream_name, customer_id, start)
Example #14
 def set_bookmark(self, path, val):
     bks_.write_bookmark(self.state, path[0], path[1], val)
def sync(config, state, catalog):
    """ Sync data from tap source """
    access_token = config['access_token']
    client = SurveyMonkeyClient(access_token)

    # Loop over selected streams in catalog
    for stream in catalog.get_selected_streams(state):
        stream_object = STREAMS[stream.tap_stream_id]
        mdata = metadata.to_map(stream.metadata)
        raw_schema = stream.schema.to_dict()
        LOGGER.info("Syncing stream: " + stream.tap_stream_id)

        bookmark_column = stream_object.replication_key
        is_sorted = stream_object.is_sorted  # indicate whether data is sorted ascending on bookmark value

        # Publish schema to singer.
        singer.write_schema(
            stream_name=stream.tap_stream_id,
            schema=raw_schema,
            key_properties=stream.key_properties,
        )

        with metrics.record_counter(stream.tap_stream_id) as counter:
            max_bookmark = None
            bookmark_value = None
            bookmark_key_prefix = ''
            with Transformer() as transformer:
                if stream_object.replication_key_from_parent:
                    if bookmarks.get_bookmark(state, stream_object.stream_id,
                                              'page_sync'):
                        bookmark_value = bookmarks.get_bookmark(
                            state, stream_object.stream_id, 'page_sync')

                    if bookmarks.get_bookmark(state, stream_object.stream_id,
                                              'full_sync'):
                        bookmark_value = bookmarks.get_bookmark(
                            state, stream_object.stream_id, 'full_sync')

                for parent_row in stream_object.parent_stream.fetch_data(
                        client, None, config, state,
                        bookmark_value=bookmark_value
                ) if stream_object.parent_stream else [{}]:
                    bookmark_value = None

                    if not stream_object.replication_key_from_parent:
                        bookmark_key_prefix = build_bookmark_key_prefix(
                            parent_row, stream_object.parent_stream)

                        if bookmarks.get_bookmark(state,
                                                  stream_object.stream_id,
                                                  'page_sync'):
                            bookmark_value = bookmarks.get_bookmark(
                                state, stream_object.stream_id,
                                f'{bookmark_key_prefix}page_sync')

                        if bookmarks.get_bookmark(state,
                                                  stream_object.stream_id,
                                                  'full_sync'):
                            bookmark_value = bookmarks.get_bookmark(
                                state, stream_object.stream_id,
                                f'{bookmark_key_prefix}full_sync')

                    for row in stream_object.fetch_data(
                            client,
                            stream,
                            config,
                            state,
                            parent_row=parent_row,
                            bookmark_value=bookmark_value):
                        # write one or more rows to the stream:
                        singer.write_record(
                            stream.tap_stream_id,
                            transformer.transform(row, raw_schema, mdata))

                        if bookmark_column and not stream_object.replication_key_from_parent:
                            if is_sorted:
                                # update bookmark to latest value
                                state = bookmarks.write_bookmark(
                                    state, stream.tap_stream_id,
                                    f'{bookmark_key_prefix}page_sync',
                                    row[bookmark_column])
                                singer.write_state(state)
                            else:
                                # if data unsorted, save max value until end of writes
                                if max_bookmark is None or row[bookmark_column] > max_bookmark:
                                    max_bookmark = row[bookmark_column]

                        counter.increment()

                    if bookmark_column and parent_row and stream_object.replication_key_from_parent:
                        if is_sorted:
                            # update bookmark to latest value
                            state = bookmarks.write_bookmark(
                                state, stream.tap_stream_id,
                                f'{bookmark_key_prefix}page_sync',
                                parent_row[bookmark_column])
                            singer.write_state(state)
                        else:
                            # if data unsorted, save max value until end of writes
                            if max_bookmark is None or parent_row[bookmark_column] > max_bookmark:
                                max_bookmark = parent_row[bookmark_column]
                if bookmark_column and not is_sorted:
                    state = bookmarks.write_bookmark(
                        state, stream.tap_stream_id,
                        f'{bookmark_key_prefix}full_sync',
                        row[bookmark_column])
                    singer.write_state(state)

            if bookmark_column and not is_sorted and parent_row and stream_object.replication_key_from_parent:
                state = bookmarks.write_bookmark(
                    state, stream.tap_stream_id,
                    f'{bookmark_key_prefix}full_sync',
                    parent_row[bookmark_column])
                singer.write_state(state)

        LOGGER.info('Stream: {}, Processed {} records'.format(
            stream.tap_stream_id, counter.value))
Example #16
def write_forms_state(atx, form, date_to_resume):
    write_bookmark(atx.state, form, 'date_to_resume',
                   date_to_resume.to_datetime_string())
    atx.write_state()
Example #17
def sync_daily(client, catalog, state, start_date, end_date, stream_id,
               stream_config):
    """Syncs a given date range, bookmarking after each day.

    Argument Types:
      client: [ShipHeroClient]
      catalog: [Dictionary]
      state: [Dictionary]
      start_date: [String, UTC datetime]
      end_date: Optional, non-inclusive day [String, UTC datetime]
      stream_id: [String]
      stream_config: [Dictionary]
    """
    write_schema(catalog, stream_id)

    #######################################################################
    ### Set up datetime versions of the start_date, end_date
    #######################################################################

    # Rip this out once all bookmarks are converted
    if isinstance(state.get('bookmarks', {}).get(stream_id), str):
        # Old style bookmark found. Use it and delete it
        old_style_bookmark = state['bookmarks'].pop(stream_id)

        # Write this bookmark in the new style
        bookmarks.write_bookmark(state, stream_id, 'datetime',
                                 old_style_bookmark)

    start_date_bookmark = bookmarks.get_bookmark(state, stream_id, 'datetime',
                                                 start_date)

    start_date_dt = strptime_to_utc(start_date_bookmark)

    # Since end_date is optional in the top level sync
    if end_date:
        end_date_dt = strptime_to_utc(end_date)
    else:
        end_date_dt = utils.now()

    if start_date_dt > end_date_dt:
        raise Exception(
            '{} start_date is greater than end_date'.format(stream_id))

    #######################################################################
    ### Sync data by day
    #######################################################################

    # Extract params from config
    path = stream_config['path']
    params = stream_config.get('params', {})
    from_col = stream_config['from_col']
    to_col = stream_config['to_col']
    records_fn = stream_config['get_records']

    page_bookmark = bookmarks.get_bookmark(state, stream_id, 'page', 1)

    # Set the page to start paginating on
    params['page'] = page_bookmark

    # Loop over all the days
    while start_date_dt != end_date_dt:
        window_end = start_date_dt + timedelta(days=1)
        if window_end > utils.now() or window_end > end_date_dt:
            window_end = end_date_dt

        # The API expects the dates in %Y-%m-%d
        start_ymd = start_date_dt.strftime('%Y-%m-%d')
        end_ymd = window_end.strftime('%Y-%m-%d')
        if start_ymd == end_ymd:
            # NB: A range of 0 days will return 0 records.
            end_ymd = (utils.strptime_to_utc(end_ymd) +
                       timedelta(days=1)).strftime('%Y-%m-%d')

        params.update({from_col: start_ymd, to_col: end_ymd})

        sync_a_day(stream_id, path, params, start_ymd, end_ymd, records_fn,
                   client, catalog, state, window_end)

        start_date_dt = window_end
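The "rip this out" block near the top migrates a flat, old-style bookmark (a bare datetime string per stream) into the nested layout keyed by 'datetime' and 'page'. An illustration of the two shapes, assuming singer-python's bookmarks module and an invented stream name:

from singer import bookmarks

# Old style: the stream's bookmark is a bare datetime string.
state = {"bookmarks": {"orders": "2021-05-01T00:00:00Z"}}

if isinstance(state.get("bookmarks", {}).get("orders"), str):
    old_style_bookmark = state["bookmarks"].pop("orders")
    # New style: a per-stream dict, so 'datetime' and 'page' can coexist.
    bookmarks.write_bookmark(state, "orders", "datetime", old_style_bookmark)

# state == {'bookmarks': {'orders': {'datetime': '2021-05-01T00:00:00Z'}}}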
Example #18
def sync_statistics_for_day(
    config,
    state,
    stream,
    sdk_client,
    token,
    start,
    report_metrics,
    report_dimensions,
):  # pylint: disable=too-many-locals
    """Sync and output Criteo Statistics endpoint for one day."""
    mdata = metadata.to_map(stream.metadata)
    stats_query = {
        "report_type": stream.tap_stream_id,
        "dimensions": report_dimensions,
        "metrics": report_metrics,
        "start_date": start.strftime("%Y-%m-%d"),
        "end_date": start.strftime("%Y-%m-%d"),
        "currency": metadata.get(mdata, (), "tap-criteo.currency"),
    }
    # Filter advertiser_ids if defined in config
    advertiser_ids = config.get("advertiser_ids")
    if advertiser_ids:
        stats_query["advertiserId"] = advertiser_ids
    # Add ignore_x_device if defined in metadata
    ignore_x_device = metadata.get(mdata, (), "tap-criteo.ignoreXDevice")
    if ignore_x_device:
        stats_query["tap-criteo.ignoreXDevice"] = ignore_x_device

    # Fetch the report as a csv string
    with metrics.http_request_timer(stream.tap_stream_id):
        result = get_statistics_report(sdk_client, stats_query, token=token)

    csv_reader = parse_csv_string(mdata, result)
    with metrics.record_counter(stream.tap_stream_id) as counter:
        time_extracted = utils.now()

        with Transformer() as bumble_bee:
            for row in csv_reader:
                row["_sdc_report_datetime"] = REPORT_RUN_DATETIME
                row["_sdc_report_currency"] = metadata.get(
                    mdata, (), "tap-criteo.currency")
                row = bumble_bee.transform(row, stream.schema.to_dict())

                singer.write_record(stream.stream,
                                    row,
                                    time_extracted=time_extracted)
                counter.increment()

        if start > get_start_for_stream(config, state, advertiser_ids,
                                        stream.stream):
            LOGGER.info(
                "updating bookmark: %s > %s",
                start,
                get_start_for_stream(config, state, advertiser_ids,
                                     stream.stream),
            )
            bookmarks.write_bookmark(
                state,
                state_key_name(advertiser_ids, stream.stream),
                "date",
                utils.strftime(start),
            )
            singer.write_state(state)
        else:
            LOGGER.info(
                "not updating bookmark: %s <= %s",
                start,
                get_start_for_stream(config, state, advertiser_ids,
                                     stream.stream),
            )

        LOGGER.info(
            "Done syncing %s records for the %s report for " +
            "advertiser_ids %s on %s",
            counter.value,
            stream.stream,
            advertiser_ids,
            start,
        )
Example #19
def sync_statistics_report(config, state, stream, sdk_client, token):
    """Sync a stream which is backed by the Criteo Statistics endpoint."""
    advertiser_ids = config.get("advertiser_ids", "")
    mdata = metadata.to_map(stream.metadata)

    stream = add_synthetic_keys_to_stream_schema(stream)

    field_list = get_field_list(stream)

    primary_keys = []
    LOGGER.info("{} primary keys are {}".format(stream.stream, primary_keys))
    singer.write_schema(
        stream.stream,
        stream.schema.to_dict(),
        primary_keys,
        bookmark_properties=["Day"],
    )

    # If an attribution window sync is interrupted, start where it left off
    start_date = get_attribution_window_bookmark(state, advertiser_ids,
                                                 stream.stream)
    if start_date is None:
        start_date = apply_conversion_window(
            config,
            get_start_for_stream(config, state, advertiser_ids, stream.stream),
        )

    # According to Criteo's documentation the StatisticsApi only supports
    # between one and three dimensions and at least one metric.
    report_dimensions = [
        field for field in field_list
        if metadata.get(mdata, ("properties",
                                field), "tap-criteo.behaviour") == "dimension"
    ]
    LOGGER.info("Selected dimensions: %s", report_dimensions)
    if not 0 <= len(report_dimensions) <= 3:
        raise ValueError(
            "%s stream only supports up to 3 selected dimensions" %
            stream.stream)
    report_metrics = [
        field for field in field_list
        if metadata.get(mdata, ("properties",
                                field), "tap-criteo.behaviour") == "metric"
    ]
    LOGGER.info("Selected metrics: %s", report_metrics)
    if not len(report_metrics) >= 1:
        raise ValueError("%s stream must have at least 1 selected metric" %
                         stream.stream)

    while start_date <= get_end_date(config):
        token = refresh_auth_token(sdk_client, token)
        sync_statistics_for_day(
            config,
            state,
            stream,
            sdk_client,
            token,
            start_date,
            report_metrics,
            report_dimensions,
        )
        start_date = start_date + relativedelta(days=1)
        bookmarks.write_bookmark(
            state,
            state_key_name(advertiser_ids, stream.stream),
            "last_attribution_window_date",
            utils.strftime(start_date),
        )
        singer.write_state(state)
    bookmarks.clear_bookmark(
        state,
        state_key_name(advertiser_ids, stream.stream),
        "last_attribution_window_date",
    )
    singer.write_state(state)
    LOGGER.info(
        "Done syncing the %s report for advertiser_ids %s",
        stream.stream,
        advertiser_ids,
    )
Example #20
def prepare_stream(
    tap_stream_id: str,
    stream_defs: _STREAM_DEFS,
    stream_versions: _STREAM_VERSIONS,
    catalog: Catalog,
    config: Dict[str, Any],
    state: Dict[str, Any],
) -> datetime:
    """Prepares a stream and any of its substreams by instantiating them and
    handling their preliminary Singer messages
    """

    # mypy isn't properly considering is_substream
    stream_def: "Stream" = AVAILABLE_STREAMS[tap_stream_id](
        catalog, config, filter_record)  # type: ignore
    stream_defs[stream_def.tap_stream_id] = stream_def

    if stream_def.has_substreams:
        stream_def.instantiate_substreams(catalog, filter_record)

        for substream_def in stream_def.substreams:
            if not substream_def.is_selected:
                LOGGER.info('Skipping sub-stream "%s"',
                            substream_def.tap_stream_id)

                continue

            # ignored type errors below seem to be caused by same issue as
            # https://github.com/python/mypy/issues/8993
            stream_defs[substream_def.tap_stream_id] = substream_def
            substream_version = get_full_table_version()
            stream_versions[substream_def.tap_stream_id] = substream_version

            write_schema(
                stream_name=substream_def.tap_stream_id,
                schema=substream_def.schema_dict,
                key_properties=substream_def.key_properties,
            )

            # All substreams are necessarily FULL_TABLE, so no need to
            # check if they're INCREMENTAL
            if is_first_run(substream_def.tap_stream_id, state):
                write_activate_version(
                    substream_def.tap_stream_id,
                    substream_version,
                )

                write_bookmark(state, substream_def.tap_stream_id,
                               "wrote_initial_activate_version", True)
                write_state(state)

    write_schema(
        stream_name=stream_def.tap_stream_id,
        schema=stream_def.schema_dict,
        key_properties=stream_def.key_properties,
    )

    filter_datetime = get_filter_datetime(stream_def, config["start_date"],
                                          state)
    stream_version = (None if stream_def.is_valid_incremental else
                      get_full_table_version())
    stream_versions[stream_def.tap_stream_id] = stream_version

    if not stream_def.is_valid_incremental and is_first_run(
            stream_def.tap_stream_id, state):
        write_activate_version(
            stream_def.tap_stream_id,
            stream_version,
        )

        write_bookmark(state, stream_def.tap_stream_id,
                       "wrote_initial_activate_version", True)
        write_state(state)

    return filter_datetime
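`prepare_stream` gates the initial activate-version message and the `wrote_initial_activate_version` bookmark on `is_first_run`, which is the tap's own helper and not shown here. A plausible sketch of that check, assuming it simply looks for the bookmark written above; this is an assumption, not the tap's actual implementation.

from singer.bookmarks import get_bookmark

def is_first_run(tap_stream_id: str, state: dict) -> bool:
    # Assumed behaviour: the stream counts as already initialised once the
    # 'wrote_initial_activate_version' bookmark has been set to True.
    return not get_bookmark(state, tap_stream_id,
                            "wrote_initial_activate_version", False)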