Example 1
def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)
    log_file = None
    log_pos = None

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)

            else:
                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.info(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # Update log_file and log_pos after every processed binlog event
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == log_file and log_pos >= current_log_pos:
            break

        # Update singer bookmark and send STATE message periodically
        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos)
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    # Update singer bookmark one last time to point to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos)
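
The periodic STATE emission above (every UPDATE_BOOKMARK_PERIOD saved rows or skipped events) is a pattern most of these examples share. A minimal, self-contained sketch of just that throttling logic, with a stand-in emit callable instead of singer.write_message:

import copy

UPDATE_BOOKMARK_PERIOD = 1000  # assumption: mirrors the constant the tap defines elsewhere


def maybe_emit_state(state, rows_saved, events_skipped, emit):
    # Emit a deep copy of the state every UPDATE_BOOKMARK_PERIOD saved rows or skipped events.
    if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
        emit(copy.deepcopy(state))


# Usage: `emit` is any callable; in a real tap it would wrap
# singer.write_message(singer.StateMessage(value=...)).
maybe_emit_state({"bookmarks": {}}, rows_saved=2000, events_skipped=0, emit=print)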
Example 2
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'count': 250, 'includeAssociations': False, 'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if (assoc_mdata.get('selected')
                    and assoc_mdata.get('selected') == True):
                params['includeAssociations'] = True

    if mdata.get(('properties', 'properties'),
                 {}).get('selected') or has_selected_custom_field(mdata):
        # On 2/12/20, hubspot added a lot of additional properties for
        # deals, and appending all of them to requests ended up leading to
        # 414 (url-too-long) errors. Hubspot recommended we use the
        # `includeAllProperties` and `allpropertiesFetchMode` params
        # instead.
        params['includeAllProperties'] = True
        params['allPropertiesFetchMode'] = 'latest_version'

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals', "hasMore",
                               ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(
                    lift_properties_and_versions(row), schema, mdata)
                singer.write_record("deals",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
Example 3
def sync_endpoint(
        schema_name,
        endpoint=None,
        path=None,
        date_fields=None,
        with_updated_since=True,  #pylint: disable=too-many-arguments
        for_each_handler=None,
        map_handler=None,
        object_to_id=None):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'

    singer.write_schema(schema_name,
                        schema, ["id"],
                        bookmark_properties=[bookmark_property])

    start = get_start(schema_name)
    start_dt = pendulum.parse(start)
    updated_since = start_dt.strftime("%Y-%m-%dT%H:%M:%SZ")

    with Transformer() as transformer:
        page = 1
        while page is not None:
            url = get_url(endpoint or schema_name)
            params = {
                "updated_since": updated_since
            } if with_updated_since else {}
            params['page'] = page
            response = request(url, params)
            path = path or schema_name
            data = response[path]
            time_extracted = utils.now()

            for row in data:
                if map_handler is not None:
                    row = map_handler(row)

                if object_to_id is not None:
                    for key in object_to_id:
                        if row[key] is not None:
                            row[key + '_id'] = row[key]['id']
                        else:
                            row[key + '_id'] = None

                remove_empty_date_times(row, schema)

                item = transformer.transform(row, schema)

                append_times_to_dates(item, date_fields)

                if item[bookmark_property] >= start:
                    singer.write_record(schema_name,
                                        item,
                                        time_extracted=time_extracted)

                    # take any additional actions required for the currently loaded endpoint
                    if for_each_handler is not None:
                        for_each_handler(row, time_extracted=time_extracted)

                    utils.update_state(STATE, schema_name,
                                       item[bookmark_property])
            page = response['next_page']

    singer.write_state(STATE)
Example 4
def sync_records(sf, catalog_entry, state, counter):
    chunked_bookmark = singer_utils.strptime_with_tz(
        sf.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry['metadata'])
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=(stream_alias or stream), version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing Salesforce data for stream %s', stream)

    for rec in sf.query(catalog_entry, state):
        counter.increment()
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            rec = transformer.transform(rec, schema)
        rec = fix_record_anytype(rec, schema)
        singer.write_message(
            singer.RecordMessage(stream=(stream_alias or stream),
                                 record=rec,
                                 version=stream_version,
                                 time_extracted=start_time))

        replication_key_value = replication_key and singer_utils.strptime_with_tz(
            rec[replication_key])

        if sf.pk_chunking:
            if replication_key_value and replication_key_value <= start_time and replication_key_value > chunked_bookmark:
                # Replace the highest seen bookmark and save the state in case we need to resume later
                chunked_bookmark = singer_utils.strptime_with_tz(
                    rec[replication_key])
                state = singer.write_bookmark(
                    state, catalog_entry['tap_stream_id'],
                    'JobHighestBookmarkSeen',
                    singer_utils.strftime(chunked_bookmark))
                singer.write_state(state)
        # Before writing a bookmark, make sure Salesforce has not given us a
        # record with one outside our range
        elif replication_key_value and replication_key_value <= start_time:
            state = singer.write_bookmark(state,
                                          catalog_entry['tap_stream_id'],
                                          replication_key,
                                          rec[replication_key])
            singer.write_state(state)

        # Tables with no replication_key will send an
        # activate_version message for the next sync
    if not replication_key:
        singer.write_message(activate_version_message)
        state = singer.write_bookmark(state, catalog_entry['tap_stream_id'],
                                      'version', None)

    # If pk_chunking is set, only write a bookmark at the end
    if sf.pk_chunking:
        # Write a bookmark with the highest value we've seen
        state = singer.write_bookmark(state, catalog_entry['tap_stream_id'],
                                      replication_key,
                                      singer_utils.strftime(chunked_bookmark))
Example 5
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    first_run = singer.get_bookmark(state, stream.tap_stream_id,
                                    'version') is None
    stream_version = singer.get_bookmark(state, stream.tap_stream_id,
                                         'version')
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream, version=stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    replication_key = md_map.get((), {}).get('replication-key')
    replication_key_value = singer.get_bookmark(state, stream.tap_stream_id,
                                                'replication_key_value')
    replication_key_sql_datatype = md_map.get(
        ('properties', replication_key)).get('sql-datatype')

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name='stitch_cursor') as cur:
                cur.itersize = post_db.cursor_iter_size
                LOGGER.info("Beginning new incremental replication sync %s",
                            stream_version)
                if replication_key_value:
                    select_sql = """SELECT {}
                                    FROM {}
                                    WHERE {} >= '{}'::{}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream.table),
                        post_db.prepare_columns_sql(replication_key),
                        replication_key_value, replication_key_sql_datatype,
                        post_db.prepare_columns_sql(replication_key))
                else:
                    #if not replication_key_value
                    select_sql = """SELECT {}
                                    FROM {}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream.table),
                        post_db.prepare_columns_sql(replication_key))

                LOGGER.info("SELECT STATEMENT: %s", select_sql)
                cur.execute(select_sql)

                rows_saved = 0

                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1

                    # Picking a replication_key with NULL values will result in it ALWAYS being synced, which is not great;
                    # even worse would be allowing the NULL value to enter into the state
                    if record_message.record[replication_key] is not None:
                        state = singer.write_bookmark(
                            state, stream.tap_stream_id,
                            'replication_key_value',
                            record_message.record[replication_key])

                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    return state
Example 6
def sync_endpoint(
        client,  #pylint: disable=too-many-branches
        catalog,
        state,
        start_date,
        stream_name,
        site,
        sub_type,
        dimensions_list,
        path,
        endpoint_config,
        api_method,
        pagination,
        static_params,
        bookmark_field=None,
        data_key=None,
        body_params=None,
        id_fields=None):

    # Get the latest bookmark for the stream and set the last_datetime
    last_datetime = None
    max_bookmark_value = None

    last_datetime = get_bookmark(state, stream_name, site, sub_type,
                                 start_date)
    max_bookmark_value = last_datetime

    # Pagination: loop thru all pages of data
    # Pagination types: none, body, params
    # Each page has an offset (starting value) and a limit (batch size, number of records)
    # Increase the "offset" by the "limit" for each batch.
    # Continue until the "offset" exceeds the total_records.
    offset = 0  # Starting offset value for each batch API call
    limit = endpoint_config.get(
        'row_limit', 1000)  # Batch size; Number of records per API call
    total_records = 0
    batch_count = limit
    page = 1

    while limit == batch_count:
        if pagination == 'body':
            body = {
                'startRow': offset,
                'rowLimit': limit,
                **body_params  # adds in endpoint specific, sort, filter body params
            }
            params = static_params
        elif pagination == 'params':
            params = {
                'startRow': offset,
                'rowLimit': limit,
                **static_params  # adds in endpoint-specific sort/filter query params
            }
            body = body_params
        else:
            params = static_params
            body = body_params

        LOGGER.info(
            'Stream: {}, Site: {}, Type: {} - Batch Sync start, Offset: {}'.
            format(stream_name, site, sub_type, offset))

        # Squash params to query-string params
        querystring = None
        if params.items():
            querystring = '&'.join(
                ['%s=%s' % (key, value) for (key, value) in params.items()])
        LOGGER.info('URL for Stream: {}, Site: {} ({}): {}/{}{}'.format(
            stream_name, site, api_method, BASE_URL, path,
            '?{}'.format(querystring) if querystring else ''))
        if body:
            LOGGER.info('body = {}'.format(body))

        # API request data, endpoint = stream_name passed to client for metrics logging
        data = {}
        fetch_state = "running"
        wait_time = 4

        while fetch_state != "success":
            try:
                if api_method == 'GET':
                    data = client.get(path=path,
                                      params=querystring,
                                      endpoint=stream_name)
                elif api_method == 'POST':
                    data = client.post(path=path,
                                       params=querystring,
                                       endpoint=stream_name,
                                       data=json.dumps(body))
                fetch_state = "success"
            except GoogleError as err:
                LOGGER.info('API quota exceeded, waiting... ' +
                            str(wait_time) + ' seconds')
                time.sleep(wait_time)
                wait_time *= 2

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        if not data:
            LOGGER.info('xxx NO DATA xxx')
            return 0  # No data results

        # Transform data with transform_json from transform.py
        transformed_data = []  # initialize the record list

        # Sites endpoint returns a single record dictionary (not a list)
        if stream_name == 'sites':
            data_list = []
            data_list.append(data)
            data_dict = {}
            data_dict[data_key] = data_list
            data = data_dict
        if data_key in data:
            transformed_data = transform_json(data, stream_name, data_key,
                                              site, sub_type,
                                              dimensions_list)[data_key]
        else:
            LOGGER.info('Number of raw data records: 0')
        if not transformed_data:
            LOGGER.info('xxx NO TRANSFORMED DATA xxx')
            return 0  # No data results
        for record in transformed_data:
            for key in id_fields:
                if not record.get(key):
                    primary_keys_only = {
                        id_field: record.get(id_field)
                        for id_field in id_fields
                    }
                    raise ValueError(
                        'Missing key {} in record with primary keys {}'.format(
                            key, primary_keys_only))
        batch_count = len(transformed_data)

        # Process records and get the max_bookmark_value and record_count for the set of records
        max_bookmark_value = process_records(
            catalog=catalog,
            stream_name=stream_name,
            records=transformed_data,
            time_extracted=time_extracted,
            bookmark_field=bookmark_field,
            max_bookmark_value=max_bookmark_value,
            last_datetime=last_datetime)

        # to_rec: to record; ending record for the batch
        to_rec = offset + limit
        if to_rec > total_records:
            to_rec = total_records

        LOGGER.info(
            'Stream: {}, Site: {}, Type: {}, Page: {}, Batch records: {} to {}'
            .format(stream_name, site, sub_type, page, offset, to_rec))
        # Pagination: increment the offset by the limit (batch-size)
        offset = offset + limit
        total_records = total_records + batch_count
        page = page + 1

    # Update the state with the max_bookmark_value for the stream, site, sub_type
    # Reference: https://developers.google.com/webmaster-tools/search-console-api-original/v3/searchanalytics/query
    # NOTE: Results are sorted by click count descending.
    #       If two rows have the same click count, they are sorted in an arbitrary way.
    #       Records are NOT sorted in DATE order.
    # THEREFORE: State is updated after ALL pages of data for stream, site, sub_type, date window
    if bookmark_field:
        write_bookmark(state, stream_name, site, sub_type, max_bookmark_value)

    # Return total_records across all batches
    return total_records
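
The pagination comments above describe an offset/limit loop that stops once a batch comes back smaller than the limit. A standalone sketch of that loop, assuming only a fetch_page(offset, limit) callable that stands in for the GET/POST request:

def paginate(fetch_page, limit=1000):
    # Fetch batches of `limit` records until a short (or empty) batch signals the last page.
    offset = 0
    total_records = 0
    batch_count = limit  # prime the loop: pretend the previous batch was full

    while batch_count == limit:
        batch = fetch_page(offset, limit)
        batch_count = len(batch)
        total_records += batch_count
        offset += limit  # the next page starts where this one ended

    return total_records


# Usage against a fake data source holding 2,500 records (3 batches: 1000 + 1000 + 500):
records = list(range(2500))
print(paginate(lambda off, lim: records[off:off + lim], limit=1000))  # -> 2500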
Example 7
def write_record(entity_type: str, record: dict):
    time_extracted = utils.now()
    singer.write_record(entity_type, record, time_extracted=time_extracted)
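
Every example on this page feeds utils.now() into time_extracted. singer-python's utils.now() returns the current time as a timezone-aware UTC datetime; a rough stand-alone equivalent for illustration only (approx_singer_now is a hypothetical name, not part of the library):

import datetime


def approx_singer_now():
    # Rough equivalent of singer.utils.now(): the current time as a timezone-aware UTC datetime.
    return datetime.datetime.now(datetime.timezone.utc)


print(approx_singer_now().isoformat())  # e.g. 2024-05-01T12:34:56.789012+00:00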
Example 8
def sync_statistics_for_day(
    config,
    state,
    stream,
    sdk_client,
    token,
    start,
    report_metrics,
    report_dimensions,
):  # pylint: disable=too-many-locals
    """Sync and output Criteo Statistics endpoint for one day."""
    mdata = metadata.to_map(stream.metadata)
    stats_query = {
        "report_type": stream.tap_stream_id,
        "dimensions": report_dimensions,
        "metrics": report_metrics,
        "start_date": start.strftime("%Y-%m-%d"),
        "end_date": start.strftime("%Y-%m-%d"),
        "currency": metadata.get(mdata, (), "tap-criteo.currency"),
    }
    # Filter advertiser_ids if defined in config
    advertiser_ids = config.get("advertiser_ids")
    if advertiser_ids:
        stats_query["advertiserId"] = advertiser_ids
    # Add ignore_x_device if defined in metadata
    ignore_x_device = metadata.get(mdata, (), "tap-criteo.ignoreXDevice")
    if ignore_x_device:
        stats_query["tap-criteo.ignoreXDevice"] = ignore_x_device

    # Fetch the report as a csv string
    with metrics.http_request_timer(stream.tap_stream_id):
        result = get_statistics_report(sdk_client, stats_query, token=token)

    csv_reader = parse_csv_string(mdata, result)
    with metrics.record_counter(stream.tap_stream_id) as counter:
        time_extracted = utils.now()

        with Transformer() as bumble_bee:
            for row in csv_reader:
                row["_sdc_report_datetime"] = REPORT_RUN_DATETIME
                row["_sdc_report_currency"] = metadata.get(
                    mdata, (), "tap-criteo.currency")
                row = bumble_bee.transform(row, stream.schema.to_dict())

                singer.write_record(stream.stream,
                                    row,
                                    time_extracted=time_extracted)
                counter.increment()

        if start > get_start_for_stream(config, state, advertiser_ids,
                                        stream.stream):
            LOGGER.info(
                "updating bookmark: %s > %s",
                start,
                get_start_for_stream(config, state, advertiser_ids,
                                     stream.stream),
            )
            bookmarks.write_bookmark(
                state,
                state_key_name(advertiser_ids, stream.stream),
                "date",
                utils.strftime(start),
            )
            singer.write_state(state)
        else:
            LOGGER.info(
                "not updating bookmark: %s <= %s",
                start,
                get_start_for_stream(config, state, advertiser_ids,
                                     stream.stream),
            )

        LOGGER.info(
            "Done syncing %s records for the %s report for " +
            "advertiser_ids %s on %s",
            counter.value,
            stream.stream,
            advertiser_ids,
            start,
        )
Example 9
def sync_generic_endpoint(config, state, stream, sdk_client, token):
    """Sync a stream which is backed by a generic Criteo endpoint."""
    stream = add_synthetic_keys_to_stream_schema(stream)
    stream = add_synthetic_keys_to_stream_metadata(stream)
    mdata = metadata.to_map(stream.metadata)
    primary_keys = metadata.get(mdata, (), "table-key-properties") or []
    LOGGER.info("{} primary keys are {}".format(stream.stream, primary_keys))
    singer.write_schema(stream.stream, stream.schema.to_dict(), primary_keys)

    advertiser_ids = config.get("advertiser_ids", None)
    if stream.tap_stream_id == "Audiences":
        if not advertiser_ids:
            LOGGER.warning(
                "%s stream needs at least one advertiser_id defined in config"
                % stream.stream)
        for advertiser_id in advertiser_ids.split(","):
            token = refresh_auth_token(sdk_client, token)
            with metrics.http_request_timer(stream.tap_stream_id):
                result = get_audiences_endpoint(sdk_client,
                                                advertiser_id,
                                                token=token)
    else:
        module = GENERIC_ENDPOINT_MAPPINGS[stream.tap_stream_id]["module"]
        method = GENERIC_ENDPOINT_MAPPINGS[stream.tap_stream_id]["method"]
        if stream.tap_stream_id in (
                "Portfolio",
                "AdvertiserInfo",
                "Sellers",
                "SellerBudgets",
                "SellerCampaigns",
        ):
            result = call_generic_endpoint(stream,
                                           sdk_client,
                                           module,
                                           method,
                                           token=token)
        else:
            result = call_generic_endpoint(
                stream,
                sdk_client,
                module,
                method,
                advertiser_ids=advertiser_ids,
                token=token,
            )

    result = convert_keys_snake_to_camel([_.to_dict() for _ in result])

    with metrics.record_counter(stream.tap_stream_id) as counter:
        time_extracted = utils.now()

        with Transformer() as bumble_bee:
            for row in result:
                row["_sdc_report_datetime"] = REPORT_RUN_DATETIME
                row = bumble_bee.transform(row, stream.schema.to_dict())

                singer.write_record(stream.stream,
                                    row,
                                    time_extracted=time_extracted)
                counter.increment()

    LOGGER.info(
        "Done syncing %s records for the %s report for advertiser_ids %s",
        counter.value,
        stream.stream,
        advertiser_ids,
    )
Example 10
    def _query_recur(self,
                     query,
                     catalog_entry,
                     start_date_str,
                     end_date=None,
                     retries=MAX_RETRIES):
        params = {"q": query}
        url = "{}/services/data/v41.0/queryAll".format(self.sf.instance_url)
        headers = self.sf._get_standard_headers()

        sync_start = singer_utils.now()
        if end_date is None:
            end_date = sync_start

        if retries == 0:
            raise TapSalesforceException(
                "Ran out of retries attempting to query Salesforce Object {}".
                format(catalog_entry['stream']))

        retryable = False
        try:
            for rec in self._sync_records(url, headers, params):
                yield rec

            # If the date range was chunked (an end_date was passed), sync
            # from the end_date -> now
            if end_date < sync_start:
                next_start_date_str = singer_utils.strftime(end_date)
                query = self.sf._build_query_string(catalog_entry,
                                                    next_start_date_str)
                for record in self._query_recur(query,
                                                catalog_entry,
                                                next_start_date_str,
                                                retries=retries):
                    yield record

        except HTTPError as ex:
            response = ex.response.json()
            if isinstance(
                    response,
                    list) and response[0].get("errorCode") == "QUERY_TIMEOUT":
                start_date = singer_utils.strptime_with_tz(start_date_str)
                day_range = (end_date - start_date).days
                LOGGER.info(
                    "Salesforce returned QUERY_TIMEOUT querying %d days of %s",
                    day_range, catalog_entry['stream'])
                retryable = True
            else:
                raise ex

        if retryable:
            start_date = singer_utils.strptime_with_tz(start_date_str)
            half_day_range = (end_date - start_date) // 2
            end_date = end_date - half_day_range

            if half_day_range.days == 0:
                raise TapSalesforceException(
                    "Attempting to query by 0 day range, this would cause infinite looping."
                )

            query = self.sf._build_query_string(
                catalog_entry, singer_utils.strftime(start_date),
                singer_utils.strftime(end_date))
            for record in self._query_recur(query, catalog_entry,
                                            start_date_str, end_date,
                                            retries - 1):
                yield record
Example 11
    create_sdk_client,
    get_audiences_endpoint,
    get_generic_endpoint,
    get_statistics_report,
    refresh_auth_token,
)
from tap_criteo.endpoints import (
    GENERIC_ENDPOINT_MAPPINGS,
    SELLER_STATS_REPORT_TYPES,
    STATISTICS_REPORT_TYPES,
)

CSV_DELIMITER = ";"
LOGGER = singer.get_logger()

REPORT_RUN_DATETIME = utils.strftime(utils.now())


def get_attribution_window_bookmark(state, advertiser_ids, stream_name):
    """Get attribution window for stream from Singer State."""
    mid_bk_value = bookmarks.get_bookmark(
        state,
        state_key_name(advertiser_ids, stream_name),
        "last_attribution_window_date",
    )
    return utils.strptime_with_tz(mid_bk_value) if mid_bk_value else None


def get_start_for_stream(config, state, advertiser_ids, stream_name):
    """Get start date for stream sync."""
    bk_value = bookmarks.get_bookmark(
Example 12
def _run_binlog_sync(mysql_conn: MySQLConnection, reader: BinLogStreamReader,
                     binlog_streams_map: Dict, state: Dict, config: Dict,
                     end_log_file: str, end_log_pos: int):

    processed_rows_events = 0
    events_skipped = 0

    log_file = None
    log_pos = None
    gtid_pos = reader.auto_position  # initial gtid, we set this when we created the reader's instance

    # A set to hold all columns that are detected as we sync but should be ignored because they are unsupported types.
    # Saving them here avoids re-checking whether a column should be ignored over and over again.
    ignored_columns = set()

    # Exit the loop when the reader either runs out of events to return or we reach
    # the end position (which is the master's)
    for binlog_event in reader:

        # get reader current binlog file and position
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if (log_file > end_log_file) or (end_log_file == log_file
                                         and log_pos >= end_log_pos):
            LOGGER.info(
                'BinLog reader (file: %s, pos:%s) has reached or exceeded end position, exiting!',
                log_file, log_pos)

            # There are cases when a mass operation (inserts, updates, deletes) starts right after we fetch the master's
            # binlog file and position above, leaving that position behind the stream reader. This causes data loss in
            # the next run by skipping everything between end_log_file/end_log_pos and the reader's position,
            # so we reset log_file and log_pos back to the master's position.
            log_file = end_log_file
            log_pos = end_log_pos

            break

        if isinstance(binlog_event, RotateEvent):
            LOGGER.debug('RotateEvent: log_file=%s, log_pos=%d',
                         binlog_event.next_binlog, binlog_event.position)

            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position, gtid_pos)

        elif isinstance(binlog_event, (MariadbGtidEvent, GtidEvent)):
            gtid_pos = binlog_event.gtid

            LOGGER.debug('%s: gtid=%s', binlog_event.__class__.__name__,
                         gtid_pos)

            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos, gtid_pos)

            # There is strange behavior happening when using GTID in the pymysqlreplication lib,
            # explained here: https://github.com/noplay/python-mysql-replication/issues/367
            # Fix: Updating the reader's auto-position to the newly encountered gtid means we won't have to restart
            # consuming binlog from old GTID pos when connection to server is lost.
            reader.auto_position = gtid_pos

        else:
            time_extracted = utils.now()

            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped += 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, processed_rows_events)
            else:
                # Compare event's columns to the schema properties
                diff = __get_diff_in_columns_list(
                    binlog_event, catalog_entry.schema.properties.keys(),
                    ignored_columns)

                # If there are additional cols in the event then run discovery if needed and update the catalog
                if diff:

                    LOGGER.info(
                        'Stream `%s`: Difference detected between event and schema: %s',
                        tap_stream_id, diff)

                    md_map = metadata.to_map(catalog_entry.metadata)

                    if not should_run_discovery(diff, md_map):
                        LOGGER.info(
                            'Stream `%s`: Not running discovery. Ignoring all detected columns in %s',
                            tap_stream_id, diff)
                        ignored_columns = ignored_columns.union(diff)

                    else:
                        LOGGER.info('Stream `%s`: Running discovery ... ',
                                    tap_stream_id)

                        # run discovery for the current table only
                        new_catalog_entry = discover_catalog(
                            mysql_conn, config.get('filter_dbs'),
                            catalog_entry.table).streams[0]

                        selected = {
                            k
                            for k, v in
                            new_catalog_entry.schema.properties.items()
                            if common.property_is_selected(
                                new_catalog_entry, k)
                        }

                        # the new catalog has "stream" property = table name, we need to update that to make it the
                        # same as the result of the "resolve_catalog" function
                        new_catalog_entry.stream = tap_stream_id

                        # These are the columns we need to select
                        new_columns = desired_columns(selected,
                                                      new_catalog_entry.schema)

                        cols = set(new_catalog_entry.schema.properties.keys())

                        # drop unsupported properties from schema
                        for col in cols:
                            if col not in new_columns:
                                new_catalog_entry.schema.properties.pop(
                                    col, None)

                        # Add the _sdc_deleted_at col
                        new_columns = add_automatic_properties(
                            new_catalog_entry, list(new_columns))

                        # send the new schema to the target if it has changed
                        if new_catalog_entry.schema.properties != catalog_entry.schema.properties:
                            write_schema_message(
                                catalog_entry=new_catalog_entry)
                            catalog_entry = new_catalog_entry

                            # update this dictionary while we're at it
                            binlog_streams_map[tap_stream_id][
                                'catalog_entry'] = new_catalog_entry
                            binlog_streams_map[tap_stream_id][
                                'desired_columns'] = new_columns
                            columns = new_columns

                if isinstance(binlog_event, WriteRowsEvent):
                    processed_rows_events = handle_write_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        processed_rows_events, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    processed_rows_events = handle_update_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        processed_rows_events, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    processed_rows_events = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, columns,
                        processed_rows_events, time_extracted)
                else:
                    LOGGER.debug(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # Update singer bookmark and send STATE message periodically
        if ((processed_rows_events
             and processed_rows_events % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos, gtid_pos)
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    LOGGER.info('Processed %s rows', processed_rows_events)

    # Update singer bookmark one last time to point to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos,
                                 gtid_pos)
Example 13
def sync_query(cursor, catalog_entry, state, select_sql, columns,
               stream_version, params):
    replication_key = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                          'replication_key')

    query_string = cursor.mogrify(select_sql, params)

    time_extracted = utils.now()

    LOGGER.info('Running %s', query_string)
    cursor.execute(select_sql, params)

    row = cursor.fetchone()
    rows_saved = 0

    database_name = get_database_name(catalog_entry)

    with metrics.record_counter(None) as counter:
        counter.tags['database'] = database_name
        counter.tags['table'] = catalog_entry.table

        while row:
            counter.increment()
            rows_saved += 1
            record_message = row_to_singer_record(catalog_entry,
                                                  stream_version, row, columns,
                                                  time_extracted)
            singer.write_message(record_message)

            md_map = metadata.to_map(catalog_entry.metadata)
            stream_metadata = md_map.get((), {})
            replication_method = stream_metadata.get('replication-method')

            if replication_method in {'FULL_TABLE', 'LOG_BASED'}:
                key_properties = get_key_properties(catalog_entry)

                max_pk_values = singer.get_bookmark(
                    state, catalog_entry.tap_stream_id, 'max_pk_values')

                if max_pk_values:
                    last_pk_fetched = {
                        k: v
                        for k, v in record_message.record.items()
                        if k in key_properties
                    }

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'last_pk_fetched',
                                                  last_pk_fetched)

            elif replication_method == 'INCREMENTAL':
                if replication_key is not None:
                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'replication_key',
                                                  replication_key)

                    state = singer.write_bookmark(
                        state, catalog_entry.tap_stream_id,
                        'replication_key_value',
                        record_message.record[replication_key])
            if rows_saved % 1000 == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

            row = cursor.fetchone()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example 14
def sync_milestones(entity, element="project"):
    url = get_url(element + "_milestones", entity['id'])

    with Transformer(pre_hook=format_timestamp) as transformer:
        for row in gen_request(url):
            transformed_row = transformer.transform(row, RESOURCES[element + "_milestones"]["schema"])

            if row["updated_at"] >= get_start(element + "_{}".format(entity["id"])):
                singer.write_record(element + "_milestones", transformed_row, time_extracted=utils.now())
Example 15
def sync_contact_lists(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("contact_lists")
    bookmark_key = 'updatedAt'
    singer.write_schema("contact_lists", schema, ["listId"], [bookmark_key], catalog.get('stream_alias'))

    start = get_start(STATE, "contact_lists", bookmark_key)
    max_bk_value = start

    LOGGER.info("sync_contact_lists from %s", start)

    url = get_url("contact_lists")
    params = {'count': 250}
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'contact_lists', url, params, "lists", "has-more", ["offset"], ["offset"]):
            record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)

            if record[bookmark_key] >= start:
                singer.write_record("contact_lists", record, catalog.get('stream_alias'), time_extracted=utils.now())
            if record[bookmark_key] >= max_bk_value:
                max_bk_value = record[bookmark_key]

    STATE = singer.write_bookmark(STATE, 'contact_lists', bookmark_key, max_bk_value)
    singer.write_state(STATE)

    return STATE
Example 16
def get_end_date(config):
    """Get end date from config file."""
    if config.get("end_date"):
        return utils.strptime_with_tz(config.get("end_date"))

    return utils.now()
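
A quick usage sketch for the helper above, assuming get_end_date and singer-python's utils are in scope exactly as defined here: with end_date set, the configured value is parsed; otherwise utils.now() supplies the end of the sync window.

# With an explicit end_date, the configured value is parsed into a tz-aware datetime.
print(get_end_date({"end_date": "2023-06-30T00:00:00Z"}))  # -> 2023-06-30 00:00:00+00:00
# Without one, the sync window runs up to the current UTC time.
print(get_end_date({}))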
Example 17
def sync_deal_pipelines(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema('deal_pipelines')
    singer.write_schema('deal_pipelines', schema, ['pipelineId'], catalog.get('stream_alias'))
    LOGGER.info('sync_deal_pipelines')
    data = request(get_url('deal_pipelines')).json()
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)
            singer.write_record("deal_pipelines", record, catalog.get('stream_alias'), time_extracted=utils.now())
    singer.write_state(STATE)
    return STATE
Example 18
def sync_records(qb, catalog_entry, state, counter, state_passed):
    chunked_bookmark = singer_utils.strptime_with_tz(qb.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry['metadata'])
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(stream=(stream_alias or stream),
                                                             version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing Quickbooks data for stream %s', stream)

    previous_max_replication_key = None

    query_func = qb.query
    if stream.endswith("Report"):
        query_func = qb.query_report

    for rec in query_func(catalog_entry, state, state_passed):

        counter.increment()
        with Transformer(pre_hook=transform_data_hook) as transformer:
            rec = transformer.transform(rec, schema)

        singer.write_message(
            singer.RecordMessage(
                stream=(
                        stream_alias or stream),
                record=rec,
                version=stream_version,
                time_extracted=start_time))

        if replication_key:
            jsonpath_expression = parse(f"$.{replication_key}")
            _rec = {'MetaData': json.loads(rec.get('MetaData', '{}'))}
            match = jsonpath_expression.find(_rec)
            original_replication_key_value = ""
            if replication_key and len(match) > 0:
                original_replication_key_value = match[0].value
                replication_key_value = singer_utils.strptime_with_tz(original_replication_key_value)

            # Before writing a bookmark, make sure Quickbooks has not given us a
            # record with one outside our range
            if previous_max_replication_key is None or (
                    replication_key_value and replication_key_value <= start_time and replication_key_value > previous_max_replication_key
            ):
                state = singer.write_bookmark(
                    state,
                    catalog_entry['tap_stream_id'],
                    replication_key,
                    original_replication_key_value)
                previous_max_replication_key = replication_key_value

            # Tables with no replication_key will send an
            # activate_version message for the next sync

    if not replication_key:
        singer.write_message(activate_version_message)
        state = singer.write_bookmark(
            state, catalog_entry['tap_stream_id'], 'version', None)
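
The bookmark extraction above relies on a JSONPath lookup into the record's MetaData blob. A small, self-contained illustration of that lookup, assuming the parse used above comes from the jsonpath-ng package and using MetaData.LastUpdatedTime as a hypothetical replication_key:

import json
from jsonpath_ng import parse

# A record shaped like the ones the loop above sees: MetaData arrives as a JSON string.
rec = {"MetaData": json.dumps({"CreateTime": "2023-01-01T00:00:00-08:00",
                               "LastUpdatedTime": "2023-02-01T10:30:00-08:00"})}
replication_key = "MetaData.LastUpdatedTime"  # hypothetical replication key

_rec = {"MetaData": json.loads(rec.get("MetaData", "{}"))}
match = parse(f"$.{replication_key}").find(_rec)
if match:
    print(match[0].value)  # -> 2023-02-01T10:30:00-08:00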
Example 19
def sync(client, config, catalog, state):
    start_date = config.get('start_date')

    # Get selected_streams from catalog, based on state last_stream
    #   last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # Get current datetime (now_dt_str) for query parameters
    now_dttm = utils.now()
    now_dt_str = strftime(now_dttm)[0:10]
    # Reference: https://support.google.com/webmasters/answer/96568?hl=en
    # There is some delay/lag in Google Search Console results reconciliation
    attribution_start_dttm = now_dttm - timedelta(days=ATTRIBUTION_DAYS)

    # Loop through selected_streams
    for stream_name in selected_streams:
        LOGGER.info('STARTED Syncing: {}'.format(stream_name))
        update_currently_syncing(state, stream_name)
        write_schema(catalog, stream_name)
        endpoint_config = STREAMS[stream_name]
        bookmark_field = next(
            iter(endpoint_config.get('replication_keys', [])), None)
        body_params = endpoint_config.get('body', {})
        endpoint_total = 0
        # Initialize body
        body = endpoint_config.get('body', {})
        # Loop through sites from config site_urls
        site_list = []
        if 'site_urls' in config:
            site_list = config['site_urls'].replace(" ", "").split(",")
        for site in site_list:
            # Skip/ignore sitemaps for domain property sites
            # Reference issue: https://github.com/googleapis/google-api-php-client/issues/1607
            #   "...sitemaps API does not support domain property urls at this time."
            if stream_name == 'sitemaps' and site[0:9] == 'sc-domain':
                LOGGER.info('Skipping Site: {}'.format(site))
                LOGGER.info(
                    '  Sitemaps API does not support domain property urls at this time.'
                )

            else:  # Not a sitemaps stream for a domain property site
                LOGGER.info('STARTED Syncing: {}, Site: {}'.format(
                    stream_name, site))
                site_total = 0
                site_encoded = quote(site, safe='')
                path = endpoint_config.get('path').format(site_encoded)

                # Set dimension_list for performance_reports
                if stream_name == 'performance_report_custom':
                    dimensions_list = []
                    # Create dimensions_list from catalog breadcrumb
                    stream = catalog.get_stream(stream_name)
                    mdata = metadata.to_map(stream.metadata)
                    dimensions_all = [
                        'date', 'country', 'device', 'page', 'query'
                    ]
                    for dim in dimensions_all:
                        if singer.should_sync_field(
                                singer.metadata.get(mdata, ('properties', dim),
                                                    'inclusion'),
                                singer.metadata.get(mdata, ('properties', dim),
                                                    'selected')):
                            # metadata is selected for the dimension
                            dimensions_list.append(dim)
                    body_params['dimensions'] = dimensions_list
                dimensions_list = body_params.get('dimensions')
                LOGGER.info('stream: {}, dimensions_list: {}'.format(
                    stream_name, dimensions_list))

                # loop through each sub type
                sub_types = endpoint_config.get('sub_types', ['self'])
                for sub_type in sub_types:
                    sub_type_total = 0

                    # Initialize date window
                    if stream_name.startswith('performance_report'):
                        reports_dttm_str = get_bookmark(
                            state, stream_name, site, sub_type, start_date)

                        reports_dttm = strptime_to_utc(reports_dttm_str)
                        if reports_dttm < attribution_start_dttm:
                            start_dttm = reports_dttm
                        else:
                            start_dttm = attribution_start_dttm
                        end_dttm = start_dttm + timedelta(
                            days=DATE_WINDOW_SIZE)
                        if end_dttm > now_dttm:
                            end_dttm = now_dttm

                    else:
                        start_dttm = strptime_to_utc(start_date)
                        end_dttm = now_dttm

                    # Date window loop
                    while start_dttm < now_dttm:
                        start_str = strftime(start_dttm)[0:10]
                        end_str = strftime(end_dttm)[0:10]
                        if stream_name.startswith('performance_report'):
                            body = {
                                'searchType': sub_type,
                                'startDate': start_str,
                                'endDate': end_str,
                                **body_params
                            }
                        else:
                            body = None

                        LOGGER.info(
                            'START Syncing Stream: {}, Site: {}, Type: {}, {} to {}'
                            .format(stream_name, site, sub_type, start_str,
                                    end_str))
                        total_records = sync_endpoint(
                            client=client,
                            catalog=catalog,
                            state=state,
                            start_date=start_date,
                            stream_name=stream_name,
                            site=site,
                            sub_type=sub_type,
                            dimensions_list=dimensions_list,
                            path=path,
                            endpoint_config=endpoint_config,
                            api_method=endpoint_config.get(
                                'api_method', 'GET'),
                            pagination=endpoint_config.get(
                                'pagination', 'none'),
                            static_params=endpoint_config.get('params', {}),
                            bookmark_field=bookmark_field,
                            data_key=endpoint_config.get('data_key', None),
                            body_params=body,
                            id_fields=endpoint_config.get('key_properties'))

                        # Increment totals
                        endpoint_total = endpoint_total + total_records
                        site_total = site_total + total_records
                        sub_type_total = sub_type_total + total_records

                        LOGGER.info(
                            'FINISHED Syncing Stream: {}, Site: {}, Type: {}, {} to {}'
                            .format(stream_name, site, sub_type, start_str,
                                    end_str))
                        LOGGER.info(
                            '  Records Synced for Date Window: {}'.format(
                                total_records))

                        # Set next date window
                        start_dttm = end_dttm
                        end_dttm = start_dttm + timedelta(
                            days=DATE_WINDOW_SIZE)
                        if end_dttm > now_dttm:
                            end_dttm = now_dttm
                        # End date window loop

                    LOGGER.info(
                        'FINISHED Syncing Stream: {}, Site: {}, Type: {}'.
                        format(stream_name, site, sub_type))
                    LOGGER.info(
                        '  Records Synced for Type: {}'.format(sub_type_total))
                    # End sub-type loop
                # End else: Not sitemaps and sites = sc-domain

                LOGGER.info('FINISHED Syncing Stream: {}, Site: {}'.format(
                    stream_name, site))
                LOGGER.info('  Records Synced for Site: {}'.format(site_total))
                # End site loop

        LOGGER.info('FINISHED Syncing Stream: {}'.format(stream_name))
        LOGGER.info('  Records Synced for Stream: {}'.format(endpoint_total))
        update_currently_syncing(state, None)
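The date-window advance at the bottom of the loop above (move the start to the previous end, cap the end at now) is the core of this pattern. A minimal standalone sketch of just that windowing, with an illustrative iter_date_windows helper that is not part of the tap:

from datetime import datetime, timedelta, timezone

def iter_date_windows(start_dttm, now_dttm, window_days):
    # Mirror the loop above: each window starts at the previous end,
    # and the final window is capped at now_dttm.
    end_dttm = min(start_dttm + timedelta(days=window_days), now_dttm)
    while start_dttm < now_dttm:
        yield start_dttm, end_dttm
        start_dttm = end_dttm
        end_dttm = min(start_dttm + timedelta(days=window_days), now_dttm)

# Example: 30-day windows between two fixed dates
for window_start, window_end in iter_date_windows(
        datetime(2021, 1, 1, tzinfo=timezone.utc),
        datetime(2021, 3, 15, tzinfo=timezone.utc), 30):
    print(window_start.date(), '->', window_end.date())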
Exemplo n.º 20
0
def batch_record_success(response, stream=None, transformer=None, schema=None):
    '''A success callback for the FB Batch endpoint used when syncing AdCreatives. Needs the stream
    to resolve schema refs and transform the successful response object.'''
    rec = response.json()
    record = transformer.transform(rec, schema)
    singer.write_record(stream.name, record, stream.stream_alias, utils.now())
Exemplo n.º 21
0
def sync_companies(state: State):
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(state, "companies", bookmark_key))
    logger.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("hubspot_companies", schema, ["companyId"],
                        [bookmark_key])

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. To combat this, we must store the current
    # sync's start in the state and not move the bookmark past this value.
    current_sync_start = get_current_sync_start(state,
                                                "companies") or utils.now()
    state = write_current_sync_start(state, "companies", current_sync_start)
    singer.write_state(state)

    url = get_url("companies_all")
    max_bk_value = start
    contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
    singer.write_schema("hubspot_contacts_by_company",
                        contacts_by_company_schema,
                        ["company-id", "contact-id"])

    for row in gen_request(state, 'companies', url, default_company_params,
                           'companies', 'has-more', ['offset'], ['offset']):
        row_properties = row['properties']
        modified_time = None
        if bookmark_key in row_properties:
            # Hubspot returns timestamps in millis
            timestamp_millis = row_properties[bookmark_key][
                'timestamp'] / 1000.0
            modified_time = datetime.datetime.fromtimestamp(
                timestamp_millis, datetime.timezone.utc)
        elif 'createdate' in row_properties:
            # Hubspot returns timestamps in millis
            timestamp_millis = row_properties['createdate'][
                'timestamp'] / 1000.0
            modified_time = datetime.datetime.fromtimestamp(
                timestamp_millis, datetime.timezone.utc)

        if modified_time and modified_time >= max_bk_value:
            max_bk_value = modified_time

        if not modified_time or modified_time >= start:
            record = request(
                get_url("companies_detail",
                        company_id=row['companyId'])).json()
            record = build_record(record, schema)
            write_record('hubspot_companies', record)
            state = _sync_contacts_by_company(state, record['companyId'])

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(max_bk_value, current_sync_start)
    state = singer.write_bookmark(state, 'hubspot_companies', bookmark_key,
                                  utils.strftime(new_bookmark))
    state = write_current_sync_start(state, 'companies', None)
    singer.write_state(state)
    return state
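The min(max_bk_value, current_sync_start) clamp above keeps the bookmark from jumping past records that were modified while this full-table scan was still running. A minimal sketch of the same idea, with hypothetical names and timezone-aware datetimes:

from datetime import datetime, timezone

def clamped_bookmark(max_seen, sync_start):
    # Never bookmark past the moment this sync began: a record modified after
    # sync_start may not have been visited yet, so it must be re-read next run.
    return min(max_seen, sync_start)

sync_start = datetime(2021, 6, 1, 12, 0, tzinfo=timezone.utc)
max_seen = datetime(2021, 6, 1, 13, 30, tzinfo=timezone.utc)  # record updated mid-sync
print(clamped_bookmark(max_seen, sync_start))  # 2021-06-01 12:00:00+00:00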
Exemplo n.º 22
0
def sync_endpoint(client,
                  config,
                  catalog,
                  state,
                  stream_name,
                  endpoint_config,
                  sync_streams,
                  selected_streams,
                  timezone_desc=None,
                  parent_id=None):

    # endpoint_config variables
    base_path = endpoint_config.get('path', stream_name)
    bookmark_field = next(iter(endpoint_config.get('replication_keys', [])),
                          None)
    params = endpoint_config.get('params', {})
    paging = endpoint_config.get('paging', False)
    bookmark_query_field_from = endpoint_config.get(
        'bookmark_query_field_from')
    bookmark_query_field_to = endpoint_config.get('bookmark_query_field_to')
    targeting_group = endpoint_config.get('targeting_group')
    targeting_type = endpoint_config.get('targeting_type')
    targeting_country_ind = endpoint_config.get('targeting_country_ind', False)
    data_key_array = endpoint_config.get('data_key_array')
    data_key_record = endpoint_config.get('data_key_record').format(
        targeting_type=targeting_type)
    id_fields = endpoint_config.get('key_properties')
    parent = endpoint_config.get('parent')
    date_window_size = int(endpoint_config.get('date_window_size', '1'))

    # tap config variables
    start_date = config.get('start_date')
    swipe_up_attribution_window = config.get('swipe_up_attribution_window',
                                             '28_DAY')
    view_attribution_window = config.get('view_attribution_window', '7_DAY')

    swipe_up_attr = int(swipe_up_attribution_window.replace('_DAY', ''))

    if view_attribution_window in (
            '1_HOUR',
            '3_HOUR',
            '6_HOUR',
    ):
        view_attr = 1
    else:
        view_attr = int(view_attribution_window.replace('_DAY', ''))

    attribution_window = max(1, swipe_up_attr, view_attr)

    omit_empty = config.get('omit_empty', 'true')
    if '_stats_' in stream_name:
        params['omit_empty'] = omit_empty

    country_codes = config.get('targeting_country_codes',
                               'us').replace(' ', '').lower()
    if targeting_country_ind:
        country_code_list = country_codes.split(',')
    else:
        country_code_list = ['none']

    # Get the timezone and latest bookmark for the stream
    if not timezone_desc:
        timezone = pytz.timezone('UTC')
    else:
        timezone = pytz.timezone(timezone_desc)
    LOGGER.info('timezone = {}'.format(timezone))

    last_datetime = get_bookmark(state, stream_name, start_date,
                                 bookmark_field, parent, parent_id)
    max_bookmark_value = last_datetime

    # Convert to datetimes in local/ad account timezone
    now_datetime = utils.now()
    last_dttm = strptime_to_utc(last_datetime)

    report_granularity = params.get('granularity', 'HOUR')
    if '_stats_' in stream_name:
        LOGGER.info('report_granularity: {}'.format(report_granularity))

    if bookmark_query_field_from and bookmark_query_field_to:
        # date_window_size: Number of days in each date window
        # Set start window
        start_window = now_datetime - timedelta(days=attribution_window)
        if last_dttm < start_window:
            start_window = last_dttm
        # Set end window
        end_window = start_window + timedelta(days=date_window_size)

    else:
        start_window = last_dttm
        end_window = now_datetime
        diff_sec = (end_window - start_window).total_seconds()
        date_window_size = math.ceil(
            diff_sec / (3600 * 24))  # round up the difference to whole days

    endpoint_total = 0
    total_records = 0

    while start_window < now_datetime:
        LOGGER.info('START Sync for Stream: {}{}'.format(
            stream_name,
            ', Date window from: {} to {}'.format(start_window.date(), end_window.date()) \
                if bookmark_query_field_from else ''))

        if bookmark_query_field_from and bookmark_query_field_to:
            # Query parameter startDate and endDate must be in Eastern time zone
            # API will error if future dates are requested
            if report_granularity == 'DAY':
                window_start_dt_str = remove_hours_local(
                    start_window, timezone)
                window_end_dt_str = remove_hours_local(end_window, timezone)
                if window_start_dt_str == window_end_dt_str:
                    window_end_dt_str = remove_hours_local(
                        end_window + timedelta(days=1), timezone)
            else:
                window_start_dt_str = remove_minutes_local(
                    start_window, timezone)
                window_end_dt_str = remove_minutes_local(end_window, timezone)
                if window_start_dt_str == window_end_dt_str:
                    window_end_dt_str = remove_hours_local(
                        end_window + timedelta(hours=1), timezone)

            params[bookmark_query_field_from] = window_start_dt_str
            params[bookmark_query_field_to] = window_end_dt_str

        # This loop will run once for non-country_code endpoints
        #   and one or more times (for each country) for country_code endpoints
        for country_code in country_code_list:
            # Path
            if stream_name.startswith('targeting_'):
                path = base_path.format(targeting_group=targeting_group,
                                        targeting_type=targeting_type,
                                        country_code=country_code,
                                        parent_id=parent_id)
            else:
                path = base_path.format(country_code=country_code,
                                        parent_id=parent_id)

            # pagination: loop thru all pages of data using next (if not None)
            #   Reference: https://developers.snapchat.com/api/docs/#pagination
            total_records = 0
            offset = 1
            page = 1
            if paging:
                limit = 500  # Allowed values: 50 - 1000
                params['limit'] = limit
            else:
                limit = None

            for key, val in params.items():
                # Replace variables in params
                new_val = str(val).format(
                    swipe_up_attribution_window=swipe_up_attribution_window,
                    view_attribution_window=view_attribution_window)
                params[key] = new_val
            # concatenate params into a querystring
            querystring = '&'.join(
                ['%s=%s' % (key, value) for (key, value) in params.items()])

            # initialize next_url
            next_url = '{}/{}?{}'.format(client.base_url, path, querystring)

            # pagination loop
            while next_url is not None:

                # API request data
                data = {}
                try:
                    data = client.get(url=next_url, endpoint=stream_name)
                except Exception as err:
                    LOGGER.error('{}'.format(err))
                    LOGGER.error('URL for Stream {}: {}'.format(
                        stream_name, next_url))
                    raise Exception(err)

                # time_extracted: datetime when the data was extracted from the API
                time_extracted = utils.now()
                if not data or data is None or data == {}:
                    LOGGER.info('No data results returned')
                    total_records = 0
                    break  # No data results

                request_status = data.get('request_status')
                if request_status != 'SUCCESS':
                    raise RuntimeError(data)

                # Get pagination next_url
                next_url = data.get('paging', {}).get('next_link', None)

                # Transform data with transform_json from transform.py
                # The data_key_array identifies the array/list of records below the <root> element
                # LOGGER.info('data = {}'.format(data)) # TESTING, comment out
                transformed_data = []  # initialize the record list

                # Reports stats streams de-nesting
                if '_stats_' in stream_name:
                    for data_record in data.get(data_key_array, []):
                        base_record = data_record.get(data_key_record, {})
                        records = base_record.get('timeseries', [])
                        for record in records:
                            # Add parent base_record fields to record
                            for key, val in base_record.items():
                                if key not in ('start_time', 'end_time',
                                               'timeseries'):
                                    record[key] = val

                            # De-nest stats
                            stats = record.get('stats', {})
                            for key, val in stats.items():
                                record[key] = val
                            record.pop('stats', None)

                            # transform record
                            try:
                                transformed_record = humps.decamelize(record)
                            except Exception as err:
                                LOGGER.error('{}'.format(err))
                                # LOGGER.error('error record: {}'.format(record)) # COMMENT OUT
                                raise Exception(err)

                            # verify primary_keys are in transformed_record
                            if 'id' not in transformed_record or 'start_time' not in transformed_record:
                                LOGGER.error(
                                    'Stream: {}, Missing key (id or start_time)'
                                    .format(stream_name))
                                LOGGER.error('transformed_record: {}'.format(
                                    transformed_record))
                                raise RuntimeError

                            transformed_data.append(transformed_record)
                            # End for record in records
                        # End for data_record in array
                    # End stats stream

                # Other streams de-nesting
                else:  # Not stats stream
                    for data_record in data.get(data_key_array, []):
                        sub_request_status = data_record.get(
                            'sub_request_status')
                        if sub_request_status != 'SUCCESS':
                            raise RuntimeError(data_record)

                        record = data_record.get(data_key_record, {})

                        # Transforms to align schemas for targeting streams
                        if stream_name.startswith('targeting_'):
                            record['targeting_group'] = targeting_group
                            record['targeting_type'] = targeting_type
                            if country_code != 'none':
                                record['country_code'] = country_code
                            if targeting_group == 'geo':
                                record_id = record.get(targeting_type,
                                                       {}).get('id')
                                record_name = record.get(targeting_type,
                                                         {}).get('name')
                                record['id'] = record_id
                                record['name'] = record_name
                            if targeting_type == 'postal_code':
                                record_id = record.get('postalCode')
                                record['id'] = record_id
                                record['name'] = record_id
                                record.pop('postalCode')

                        # Add parent id field/value
                        if parent and parent_id:
                            parent_key = '{}_id'.format(parent)
                            record[parent_key] = parent_id

                        # transform record (remove inconsistent use of CamelCase)
                        try:
                            transformed_record = humps.decamelize(record)
                        except Exception as err:
                            LOGGER.error('{}'.format(err))
                            LOGGER.error('error record: {}'.format(record))
                            raise Exception(err)

                        # verify primary_keys are in transformed_record
                        for key in id_fields:
                            if not transformed_record.get(key):
                                LOGGER.error(
                                    'Stream: {}, Missing key {}'.format(
                                        stream_name, key))
                                LOGGER.info('transformed_record: {}'.format(
                                    transformed_record))
                                raise RuntimeError

                        transformed_data.append(transformed_record)
                        # End for data_record in array
                    # End non-stats stream

                # LOGGER.info('transformed_data = {}'.format(transformed_data)) # COMMENT OUT
                if not transformed_data or transformed_data is None:
                    LOGGER.info(
                        'No transformed data for data = {}'.format(data))
                    total_records = 0
                    break  # No transformed_data results

                # Process records and get the max_bookmark_value and record_count
                if stream_name in sync_streams:
                    max_bookmark_value, record_count = process_records(
                        catalog=catalog,
                        stream_name=stream_name,
                        records=transformed_data,
                        time_extracted=time_extracted,
                        bookmark_field=bookmark_field,
                        max_bookmark_value=max_bookmark_value,
                        last_datetime=last_datetime)
                    LOGGER.info('Stream {}, batch processed {} records'.format(
                        stream_name, record_count))

                # Loop through parent batch records for each child object (if it should be streamed)
                children = endpoint_config.get('children')
                if children:
                    for child_stream_name, child_endpoint_config in children.items(
                    ):
                        if child_stream_name in sync_streams:
                            LOGGER.info(
                                'START Syncing: {}'.format(child_stream_name))
                            write_schema(catalog, child_stream_name)
                            # For each parent record
                            for record in transformed_data:
                                i = 0
                                # Set parent_id
                                for id_field in id_fields:
                                    if i == 0:
                                        parent_id_field = id_field
                                    if id_field == 'id':
                                        parent_id_field = id_field
                                    i = i + 1
                                parent_id = record.get(parent_id_field)

                                if stream_name == 'ad_accounts':
                                    timezone_desc = record.get(
                                        'timezone', timezone_desc)

                                # sync_endpoint for child
                                LOGGER.info(
                                    'START Sync for Stream: {}, parent_stream: {}, parent_id: {}'\
                                        .format(child_stream_name, stream_name, parent_id))

                                child_total_records = sync_endpoint(
                                    client=client,
                                    config=config,
                                    catalog=catalog,
                                    state=state,
                                    stream_name=child_stream_name,
                                    endpoint_config=child_endpoint_config,
                                    sync_streams=sync_streams,
                                    selected_streams=selected_streams,
                                    timezone_desc=timezone_desc,
                                    parent_id=parent_id)

                                LOGGER.info(
                                    'FINISHED Sync for Stream: {}, parent_id: {}, total_records: {}'\
                                        .format(child_stream_name, parent_id, child_total_records))
                                # End transformed data record loop
                            # End if child in sync_streams
                        # End child streams for parent
                    # End if children

                # Parent record batch
                total_records = total_records + record_count
                endpoint_total = endpoint_total + record_count

                LOGGER.info(
                    'Synced Stream: {}, page: {}, records: {} to {}'.format(
                        stream_name, page, offset, total_records))
                # Pagination: increment the offset by the limit (batch-size) and page
                if limit:
                    offset = offset + limit
                page = page + 1
                # End page/batch - while next URL loop
            # End country_code loop

        # Update the state with the max_bookmark_value for the stream date window
        # Snapchat Ads API does not allow page/batch sorting; bookmark written for date window
        if bookmark_field and stream_name in selected_streams:
            write_bookmark(state, stream_name, max_bookmark_value,
                           bookmark_field, parent, parent_id)

        # Increment date window and sum endpoint_total
        start_window = end_window
        next_end_window = end_window + timedelta(days=date_window_size)
        if next_end_window > now_datetime:
            end_window = now_datetime
        else:
            end_window = next_end_window
        # End date window

    # Return endpoint_total (total records across all pages and date windows)
    return endpoint_total
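The stats-stream branch above copies parent fields onto each timeseries row and then flattens the nested stats dict. A self-contained sketch of that de-nesting, using made-up sample data rather than a real Snapchat response:

def flatten_timeseries(base_record):
    # Copy parent fields onto each timeseries row and hoist the nested
    # stats dict to the top level, as the stats-stream branch does.
    rows = []
    for row in base_record.get('timeseries', []):
        flat = {k: v for k, v in base_record.items()
                if k not in ('start_time', 'end_time', 'timeseries')}
        flat.update({k: v for k, v in row.items() if k != 'stats'})
        flat.update(row.get('stats', {}))
        rows.append(flat)
    return rows

sample = {'id': 'ad-1', 'granularity': 'DAY',
          'start_time': '2021-01-01T00:00:00Z', 'end_time': '2021-01-03T00:00:00Z',
          'timeseries': [{'start_time': '2021-01-01T00:00:00Z',
                          'end_time': '2021-01-02T00:00:00Z',
                          'stats': {'impressions': 10, 'swipes': 2}}]}
print(flatten_timeseries(sample))
# [{'id': 'ad-1', 'granularity': 'DAY', 'start_time': '2021-01-01T00:00:00Z',
#   'end_time': '2021-01-02T00:00:00Z', 'impressions': 10, 'swipes': 2}]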
Exemplo n.º 23
0
def sync_endpoint(
        client,  #pylint: disable=too-many-branches
        catalog,
        state,
        start_date,
        stream_name,
        path,
        static_params,
        bookmark_query_field=None,
        bookmark_field=None,
        bookmark_type=None,
        parent=None,
        parent_id=None):

    # Get the latest bookmark for the stream and set the last_integer/datetime
    last_datetime = None
    last_integer = None
    max_bookmark_value = None
    if bookmark_type == 'integer':
        last_integer = get_bookmark(state, stream_name, 0)
        max_bookmark_value = last_integer
    else:
        last_datetime = get_bookmark(state, stream_name, start_date)
        max_bookmark_value = last_datetime

    # pagination: loop thru all pages of data using next_url (if not None)
    page = 1
    offset = 0
    to_rec = 0
    limit = 100  # Default per_page limit is 100
    total_endpoint_records = 0
    next_url = '{}/{}'.format(client.base_url, path)
    params = {
        'page': page,
        'per': limit,
        **static_params  # adds in endpoint specific, sort, filter params
    }

    total_processed_records = 0

    while next_url is not None:
        # Need URL querystring for 1st page; subsequent pages provided by next_url
        # querystring: Squash query params into string
        if page == 1:
            if bookmark_query_field:
                if bookmark_type == 'datetime':
                    params[bookmark_query_field] = start_date
                elif bookmark_type == 'integer':
                    params[bookmark_query_field] = last_integer
            if params != {}:
                querystring = '&'.join([
                    '%s=%s' % (key, value) for (key, value) in params.items()
                ])
        else:
            querystring = None
        LOGGER.info('URL for Stream {}: {}{}'.format(
            stream_name, next_url,
            '?{}'.format(querystring) if querystring else ''))

        # API request data
        # total_endpoint_records: API response for all pages
        data = {}
        data, total_endpoint_records, next_url = client.get(
            url=next_url, path=path, params=querystring, endpoint=stream_name)

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        if not data or data is None or data == {}:
            return total_endpoint_records  # No data results

        # Transform data with transform_json from transform.py
        # The data_key identifies the array/list of records below the <root> element
        transformed_data = []  # initialize the record list
        if isinstance(data, list):
            transformed_data = transform_json(data, stream_name)

        if not transformed_data or transformed_data is None:
            LOGGER.info('No transformed data for data = {}'.format(data))
            return total_endpoint_records  # No data results

        # Process records and get the max_bookmark_value and record_count for the set of records
        max_bookmark_value, record_count = process_records(
            catalog=catalog,
            stream_name=stream_name,
            records=transformed_data,
            time_extracted=time_extracted,
            bookmark_field=bookmark_field,
            bookmark_type=bookmark_type,
            max_bookmark_value=max_bookmark_value,
            last_datetime=last_datetime,
            last_integer=last_integer,
            parent=parent,
            parent_id=parent_id)

        total_processed_records = total_processed_records + record_count
        LOGGER.info(
            'Stream {}, batch processed {} records, total processed records {}'
            .format(stream_name, record_count, total_processed_records))

        # Update the state with the max_bookmark_value for the stream
        if bookmark_field:
            write_bookmark(state, stream_name, max_bookmark_value)

        # to_rec: to record; ending record for the batch page
        to_rec = offset + len(data)
        LOGGER.info(
            'Synced Stream: {}, page: {}, records: {} to {} of {}'.format(
                stream_name, page, offset, to_rec, total_endpoint_records))
        # Pagination: increment the offset by the limit (batch-size) and page
        if not next_url:
            offset = offset + len(data)
        else:
            offset = offset + limit
        page = page + 1

    # Return total_endpoint_records across all pages
    LOGGER.info('Synced Stream: {}, pages: {}, total records: {}'.format(
        stream_name, page - 1, total_endpoint_records))
    return total_endpoint_records
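One design note on the querystring built above: a plain '&'.join does not URL-encode values, so parameters containing spaces or special characters would need urllib.parse.urlencode (or requests' params= handling) instead. A small comparison with made-up params:

from urllib.parse import urlencode

params = {'page': 1, 'per': 100, 'sort': 'updated_at', 'q': 'name eq "my org"'}
print('&'.join('%s=%s' % (key, value) for (key, value) in params.items()))
# page=1&per=100&sort=updated_at&q=name eq "my org"   (spaces and quotes unescaped)
print(urlencode(params))
# page=1&per=100&sort=updated_at&q=name+eq+%22my+org%22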
Exemplo n.º 24
0
def sync_records(ns, catalog_entry, state, counter):
    chunked_bookmark = singer_utils.strptime_with_tz(
        ns.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry['metadata'])
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=(stream_alias or stream), version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing NetSuite data for stream %s', stream)

    previous_max_replication_key = None

    query_func = ns.query

    for rec in query_func(ns, catalog_entry, state):

        counter.increment()
        with Transformer(
                pre_hook=transform_data_hook(ns, stream)) as transformer:
            rec = transformer.transform(rec, schema)

        singer.write_message(
            singer.RecordMessage(stream=(stream_alias or stream),
                                 record=rec,
                                 version=stream_version,
                                 time_extracted=start_time))

        if replication_key:
            _rec = rec.get(replication_key, None)
            original_replication_key_value = ""
            replication_key_value = None
            if replication_key and _rec is not None:
                original_replication_key_value = _rec
                replication_key_value = singer_utils.strptime_with_tz(
                    original_replication_key_value)

            # Before writing a bookmark, make sure NetSuite has not given us a
            # record whose replication-key value is outside our range
            if previous_max_replication_key is None or (
                    replication_key_value
                    and replication_key_value <= start_time
                    and replication_key_value > previous_max_replication_key):
                state = singer.write_bookmark(state,
                                              catalog_entry['tap_stream_id'],
                                              replication_key,
                                              original_replication_key_value)
                previous_max_replication_key = replication_key_value

            # Tables with no replication_key will send an
            # activate_version message for the next sync

    if not replication_key:
        singer.write_message(activate_version_message)
        state = singer.write_bookmark(state, catalog_entry['tap_stream_id'],
                                      'version', None)
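The bookmark guard above advances the bookmark only when the record's replication-key value is not in the future relative to start_time and is strictly greater than the previous maximum; the first qualifying record always sets it. A minimal sketch of just that condition, with hypothetical names:

from datetime import datetime, timezone

def should_advance_bookmark(value, start_time, previous_max):
    # Advance only for in-range values that move the bookmark forward.
    if previous_max is None:
        return True
    return value is not None and value <= start_time and value > previous_max

start_time = datetime(2021, 6, 1, tzinfo=timezone.utc)
prev = datetime(2021, 5, 1, tzinfo=timezone.utc)
print(should_advance_bookmark(datetime(2021, 5, 15, tzinfo=timezone.utc), start_time, prev))  # True
print(should_advance_bookmark(datetime(2021, 7, 1, tzinfo=timezone.utc), start_time, prev))   # False (future-dated)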
Exemplo n.º 25
0
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. To combat this, we must store the current
    # sync's start in the state and not move the bookmark past this value.
    current_sync_start = get_current_sync_start(STATE,
                                                "companies") or utils.now()
    STATE = write_current_sync_start(STATE, "companies", current_sync_start)
    singer.write_state(STATE)

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema,
                            ["company-id", "contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params,
                               'companies', 'has-more', ['offset'],
                               ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    get_url("companies_detail",
                            company_id=row['companyId'])).json()
                record = bumble_bee.transform(
                    lift_properties_and_versions(record), schema, mdata)
                singer.write_record("companies",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())
                if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
                    STATE = _sync_contacts_by_company(STATE, ctx,
                                                      record['companyId'])

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(max_bk_value, current_sync_start)
    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key,
                                  utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'companies', None)
    singer.write_state(STATE)
    return STATE
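HubSpot property timestamps arrive as epoch milliseconds; dividing by 1000 and attaching UTC, as above, yields a timezone-aware datetime. For example:

import datetime

millis = 1622548800000  # HubSpot-style epoch milliseconds: 2021-06-01T12:00:00Z
print(datetime.datetime.fromtimestamp(millis / 1000.0, datetime.timezone.utc))
# 2021-06-01 12:00:00+00:00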
Exemplo n.º 26
0
def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state,
                     config: Dict):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)
    log_file = None
    log_pos = None

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)

            else:

                # Compare event's columns to the schema properties
                diff = set(get_db_column_types(binlog_event).keys()).\
                    difference(catalog_entry.schema.properties.keys())

                # If there are additional columns in the event, run discovery and update the catalog
                if diff:
                    # run discovery for the current table only
                    catalog_entry = discover_catalog(
                        mysql_conn, config.get('filter_dbs'),
                        catalog_entry.table).streams[0]

                    # The new catalog's "stream" property is the table name; update it to
                    # match the result of the "resolve_catalog" function
                    catalog_entry.stream = tap_stream_id
                    desired_columns = list(
                        catalog_entry.schema.properties.keys())

                    # Add the _sdc_deleted_at col
                    add_automatic_properties(catalog_entry, desired_columns)

                    # update this dictionary while we're at it
                    binlog_streams_map[tap_stream_id][
                        'catalog_entry'] = catalog_entry
                    binlog_streams_map[tap_stream_id][
                        'desired_columns'] = desired_columns

                    # send the new schema to the target
                    write_schema_message(catalog_entry=catalog_entry)

                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.debug(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # Update log_file and log_pos after every processed binlog event
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == log_file and log_pos >= current_log_pos:
            break

        # Update singer bookmark and send STATE message periodically
        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos)
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    # Update the singer bookmark one last time to point it to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos)
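The set difference above is what triggers re-discovery: any column present in the binlog event but missing from the cached catalog schema means the table changed mid-replication. A standalone sketch of that check, with hypothetical column maps:

def columns_missing_from_schema(event_columns, schema_properties):
    # Return the event columns that the cached catalog schema does not know about.
    return set(event_columns) - set(schema_properties)

event_columns = {'id': 'int(11)', 'name': 'varchar(255)', 'new_col': 'varchar(255)'}
schema_properties = {'id': {}, 'name': {}}
diff = columns_missing_from_schema(event_columns.keys(), schema_properties.keys())
if diff:
    print('Schema drift detected, re-discover the table:', diff)  # {'new_col'}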
Exemplo n.º 27
0
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties, [bookmark_key],
                        catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))

    if entity_name == 'email_events':
        window_size = int(CONFIG['email_chunk_size'])
    elif entity_name == 'subscription_changes':
        window_size = int(CONFIG['subscription_chunk_size'])

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + window_size
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(
                    UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if bool(our_offset) and our_offset.get('offset') != None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(
                            lift_properties_and_versions(row), schema, mdata)
                        singer.write_record(entity_name,
                                            record,
                                            catalog.get('stream_alias'),
                                            time_extracted=time_extracted)
                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset',
                                                  data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break
            STATE = singer.write_bookmark(STATE, entity_name, 'startTimestamp', utils.strftime(datetime.datetime.fromtimestamp((start_ts / 1000), datetime.timezone.utc)))  # pylint: disable=line-too-long
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE
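The chunked sync above walks forward in fixed windows of epoch milliseconds and bookmarks the start of each completed window. The window arithmetic on its own, as a minimal sketch:

from datetime import datetime, timezone

def epoch_millis_windows(start_ts, now_ts, window_size):
    # Yield (start_ts, end_ts) millisecond windows covering [start_ts, now_ts).
    while start_ts < now_ts:
        yield start_ts, start_ts + window_size
        start_ts += window_size

day_ms = 24 * 60 * 60 * 1000
start = int(datetime(2021, 1, 1, tzinfo=timezone.utc).timestamp() * 1000)
now = int(datetime(2021, 1, 4, tzinfo=timezone.utc).timestamp() * 1000)
for window_start, window_end in epoch_millis_windows(start, now, day_ms):
    print(window_start, '->', window_end)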
Exemplo n.º 28
0
def sync_campaigns(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("campaigns")
    singer.write_schema("campaigns", schema, ["id"], catalog.get('stream_alias'))
    LOGGER.info("sync_campaigns(NO bookmarks)")
    url = get_url("campaigns_all")
    params = {'limit': 500}

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'campaigns', url, params, "campaigns", "hasMore", ["offset"], ["offset"]):
            record = request(get_url("campaigns_detail", campaign_id=row['id'])).json()
            record = bumble_bee.transform(lift_properties_and_versions(record), schema, mdata)
            singer.write_record("campaigns", record, catalog.get('stream_alias'), time_extracted=utils.now())

    return STATE
Exemplo n.º 29
0
def sync_endpoint(
        client,  # pylint: disable=too-many-branches,too-many-nested-blocks
        catalog,
        state,
        start_date,
        stream_name,
        path,
        static_params,
        endpoint_config,
        bookmark_query_field=None,
        bookmark_field=None,
        bookmark_type=None,
        id_fields=None,
        selected_streams=None,
        parent=None,
        parent_id=None):
    # Get the latest bookmark for the stream and set the last_integer/datetime
    last_datetime = None
    last_integer = None
    data_key = endpoint_config.get('data_key')
    if bookmark_type == 'integer':
        last_integer = get_bookmark(state, stream_name, 0)
        max_bookmark_value = last_integer
    else:
        last_datetime = get_bookmark(state, stream_name, start_date)
        max_bookmark_value = last_datetime

    # pagination: loop thru all pages of data using next_url (if not None)
    page = 1
    offset = 0
    limit = 100  # Default per_page limit is 100
    total_endpoint_records = 0
    url = '{}/{}'.format(client.base_url, path)
    next_token = None
    params = {
        'perPage': limit,
        **static_params  # adds in endpoint specific, sort, filter params
    }

    total_processed_records = 0

    while url is not None:
        # Need URL querystring for 1st page; subsequent pages provided by next_url
        # querystring: Squash query params into string
        if page == 1:
            if bookmark_query_field:
                if bookmark_type == 'datetime':
                    params[bookmark_query_field] = start_date
                elif bookmark_type == 'integer':
                    params[bookmark_query_field] = last_integer
        else:
            if next_token:
                params['nextToken'] = next_token

        if params != {}:
            querystring = '&'.join(
                ['%s=%s' % (key, value) for (key, value) in params.items()])
            querystring = querystring.replace('<parent_id>', str(parent_id))

        LOGGER.info('URL for Stream {}: {}{}'.format(
            stream_name, url,
            '?{}'.format(querystring) if querystring else ''))

        if stream_name == 'recipients' and parent_id is None:
            break

        # API request data
        # total_endpoint_records: API response for all pages
        data, next_token = client.get(url=url,
                                      path=path,
                                      params=querystring,
                                      endpoint=stream_name)

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        if not data or data is None or data == {}:
            return total_endpoint_records  # No data results

        if stream_name == 'recipients':
            if not data.get(data_key):
                break

        # Transform data with transform_data from transform.py
        # The data_key identifies the array/list of records below the <root> element
        transformed_data = transform_data(data.get(data_key), stream_name,
                                          parent_id)

        record_count = 0

        # Process records and get the max_bookmark_value and record_count for the set of records
        max_bookmark_value, record_count = process_records(
            catalog=catalog,
            stream_name=stream_name,
            records=transformed_data,
            time_extracted=time_extracted,
            bookmark_field=bookmark_field,
            bookmark_type=bookmark_type,
            max_bookmark_value=max_bookmark_value,
            last_datetime=last_datetime,
            last_integer=last_integer,
            parent=parent,
            parent_id=parent_id)

        total_processed_records = total_processed_records + record_count
        LOGGER.info(
            'Stream {}, batch processed {} records, total processed records {}'
            .format(stream_name, record_count, total_processed_records))

        # Loop through parent batch records for each child object (if it should be streamed)
        children = endpoint_config.get('children')
        if children:
            for child_stream_name, child_endpoint_config in children.items():
                LOGGER.info('Child stream: %s, config: %s', child_stream_name, child_endpoint_config)
                if child_stream_name in selected_streams:
                    write_schema(catalog, child_stream_name)
                    # For each parent record
                    for record in transformed_data:
                        i = 0
                        # Set parent_id
                        for id_field in id_fields:
                            if i == 0:
                                parent_id_field = id_field
                            if id_field == 'id':
                                parent_id_field = id_field
                            i = i + 1
                        parent_id = record.get(parent_id_field)

                        # sync_endpoint for child
                        LOGGER.info(
                            'START Sync for Stream: {}, parent_stream: {}, parent_id: {}'
                            .format(child_stream_name, stream_name, parent_id))
                        child_path = child_endpoint_config.get(
                            'path', child_stream_name).format(str(parent_id))
                        child_bookmark_field = next(
                            iter(
                                child_endpoint_config.get(
                                    'replication_keys', [])), None)
                        child_total_records = sync_endpoint(
                            client=client,
                            catalog=catalog,
                            state=state,
                            start_date=start_date,
                            stream_name=child_stream_name,
                            path=child_path,
                            endpoint_config=child_endpoint_config,
                            static_params=child_endpoint_config.get(
                                'params', {}),
                            bookmark_query_field=child_endpoint_config.get(
                                'bookmark_query_field'),
                            bookmark_field=child_bookmark_field,
                            bookmark_type=child_endpoint_config.get(
                                'bookmark_type'),
                            id_fields=child_endpoint_config.get(
                                'key_properties'),
                            selected_streams=selected_streams,
                            parent=child_endpoint_config.get('parent'),
                            parent_id=parent_id)
                        LOGGER.info(
                            'FINISHED Sync for Stream: {}, parent_id: {}, total_records: {}'
                            .format(child_stream_name, parent_id,
                                    child_total_records))

        # to_rec: to record; ending record for the batch page
        to_rec = offset + record_count
        total_processed_records = to_rec
        LOGGER.info('Synced Stream: {}, page: {}, records: {} to {}'.format(
            stream_name, page, offset, to_rec))
        # Pagination: increment the offset by the limit (batch-size) and page
        offset = offset + record_count
        page = page + 1

        # If the API doesn't return a next token then that was the last page of results
        if not next_token:
            # Update the state with the max_bookmark_value for the stream
            if bookmark_field:
                write_bookmark(state, stream_name, max_bookmark_value)
            url = None

    # Return total_processed_records across all pages
    LOGGER.info('Synced Stream: {}, pages: {}, total records: {}'.format(
        stream_name, page - 1, total_processed_records))
    return total_processed_records
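Pagination above is token-based: keep requesting until the API stops returning a next token, and only then write the bookmark. A hedged sketch of that control flow against a stubbed fetch_page callable (hypothetical, not the tap's client):

def paginate(fetch_page):
    # Call fetch_page(next_token) until no token comes back; collect all records.
    records, next_token = [], None
    while True:
        batch, next_token = fetch_page(next_token)
        records.extend(batch)
        if not next_token:  # last page: safe to write the bookmark after this
            break
    return records

# Stubbed three-page response
pages = {None: (['a', 'b'], 't1'), 't1': (['c'], 't2'), 't2': (['d'], None)}
print(paginate(lambda token: pages[token]))  # ['a', 'b', 'c', 'd']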
Exemplo n.º 30
0
def get_absolute_start_end_time(self, last_dttm, lookback=0):
    now_dttm = now()
    abs_start, abs_end = self.round_times(
        last_dttm - timedelta(days=lookback), now_dttm)
    return abs_start, abs_end
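A hedged, standalone re-sketch of the lookback logic above: round_times is not defined in this excerpt, so it is stubbed out here, and only the lookback offset is illustrated.

from datetime import datetime, timedelta, timezone

def get_absolute_start_end_time(last_dttm, lookback=0, now_dttm=None):
    # round_times is stubbed out for this sketch; the point is that the
    # absolute start is pushed back by `lookback` days from the last bookmark.
    now_dttm = now_dttm or datetime.now(timezone.utc)
    abs_start = last_dttm - timedelta(days=lookback)
    return abs_start, now_dttm

print(get_absolute_start_end_time(
    datetime(2021, 6, 10, tzinfo=timezone.utc), lookback=7,
    now_dttm=datetime(2021, 6, 15, tzinfo=timezone.utc)))
# (datetime(2021, 6, 3, ...), datetime(2021, 6, 15, ...))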