Example #1
def sync_stream(stream_name):
    """
    Sync each stream, looking for newly created records. Updates are captured by events stream.
    """
    LOGGER.info("Started syncing stream %s", stream_name)

    stream_metadata = metadata.to_map(Context.get_catalog_entry(stream_name)['metadata'])
    stream_field_whitelist = json.loads(Context.config.get('whitelist_map', '{}')).get(stream_name)

    extraction_time = singer.utils.now()
    replication_key = metadata.get(stream_metadata, (), 'valid-replication-keys')[0]
    # Invoice Items bookmarks on `date`, but queries on `created`
    filter_key = 'created' if stream_name == 'invoice_items' else replication_key
    stream_bookmark = singer.get_bookmark(Context.state, stream_name, replication_key) or \
        int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())
    bookmark = stream_bookmark

    # if this stream has a sub_stream, compare the bookmark
    sub_stream_name = SUB_STREAMS.get(stream_name)

    # If there is a sub-stream and it's selected, get its bookmark (or the start date if no bookmark)
    should_sync_sub_stream = sub_stream_name and Context.is_selected(sub_stream_name)
    if should_sync_sub_stream:
        sub_stream_bookmark = singer.get_bookmark(Context.state, sub_stream_name, replication_key) \
            or int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())

        # if there is a sub stream, set bookmark to sub stream's bookmark
        # since we know it must be earlier than the stream's bookmark
        if sub_stream_bookmark != stream_bookmark:
            bookmark = sub_stream_bookmark
    else:
        sub_stream_bookmark = None

    with Transformer(singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
        end_time = dt_to_epoch(utils.now())

        window_size = float(Context.config.get('date_window_size', DEFAULT_DATE_WINDOW_SIZE))

        if DEFAULT_DATE_WINDOW_SIZE != window_size:
            LOGGER.info('Using non-default date window size of %.2f', window_size)
        start_window = bookmark

        # NB: Immutable streams are never synced for updates. We've
        # observed a short lag period between when records are created and
        # when they are available via the API, so these streams will need
        # a short lookback window.
        if stream_name in IMMUTABLE_STREAMS:
            # pylint:disable=fixme
            # TODO: This may be an issue for other streams' created_at
            # entries, but to keep the surface small, doing this only for
            # immutable streams at first to confirm the suspicion.
            start_window -= IMMUTABLE_STREAM_LOOKBACK

        # NB: We observed records coming through newest->oldest and so
        # date-windowing was added and the tap only bookmarks after it has
        # gotten through a date window
        while start_window < end_time:
            stop_window = dt_to_epoch(epoch_to_dt(start_window) + timedelta(days=window_size))
            # cut off the last window at the end time
            if stop_window > end_time:
                stop_window = end_time

            for stream_obj in paginate(
                    STREAM_SDK_OBJECTS[stream_name]['sdk_object'],
                    filter_key,
                    start_window,
                    stop_window,
                    STREAM_SDK_OBJECTS[stream_name].get('request_args')
            ):

                # get the replication key value from the object
                rec = unwrap_data_objects(stream_obj.to_dict_recursive())
                rec = reduce_foreign_keys(rec, stream_name)
                stream_obj_created = rec[replication_key]
                rec['updated'] = stream_obj_created

                # sync stream if object is greater than or equal to the bookmark
                if stream_obj_created >= stream_bookmark:
                    rec = transformer.transform(rec,
                                                Context.get_catalog_entry(stream_name)['schema'],
                                                stream_metadata)

                    # At this point, the record has been transformed and so
                    # any de-selected fields have been pruned. Now, prune off
                    # any fields that aren't present in the whitelist.
                    if stream_field_whitelist:
                        rec = apply_whitelist(rec, stream_field_whitelist)

                    singer.write_record(stream_name,
                                        rec,
                                        time_extracted=extraction_time)

                    Context.new_counts[stream_name] += 1

                # sync sub-streams if it's selected and the parent object
                # is greater than its bookmark
                if should_sync_sub_stream and stream_obj_created > sub_stream_bookmark:
                    sync_sub_stream(sub_stream_name, stream_obj)

            # Update the stream/sub-stream bookmarks to the stop window
            if stop_window > stream_bookmark:
                stream_bookmark = stop_window
                singer.write_bookmark(Context.state,
                                      stream_name,
                                      replication_key,
                                      stream_bookmark)

            # the sub stream bookmarks on its parent
            if should_sync_sub_stream and stop_window > sub_stream_bookmark:
                sub_stream_bookmark = stop_window
                singer.write_bookmark(Context.state,
                                      sub_stream_name,
                                      replication_key,
                                      sub_stream_bookmark)

            singer.write_state(Context.state)

            # update window for next iteration
            start_window = stop_window

    singer.write_state(Context.state)
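The windowing loop above converts between UNIX-second bookmarks and datetimes via dt_to_epoch/epoch_to_dt, whose implementations are not shown; a minimal sketch consistent with that usage (an assumption, not the tap's actual code):

# Minimal sketch of the epoch/datetime helpers assumed by the windowing loop.
import datetime


def epoch_to_dt(epoch_ts):
    # Interpret a UNIX timestamp in whole seconds as a UTC datetime.
    return datetime.datetime.fromtimestamp(epoch_ts, tz=datetime.timezone.utc)


def dt_to_epoch(dt):
    # Convert a timezone-aware datetime back to whole UNIX seconds.
    return int(dt.timestamp())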
Example #2
def get_start(state, tap_stream_id, bookmark_key):
    current_bookmark = singer.get_bookmark(state, tap_stream_id, bookmark_key)
    if current_bookmark is None:
        return CONFIG['start_date']
    return current_bookmark
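For reference, a hypothetical state dictionary in the shape singer.get_bookmark expects: with it, get_start(example_state, "orders", "updated_at") returns the stored bookmark, while an unseen stream falls back to CONFIG['start_date'] (stream name and bookmark key here are illustrative).

# Hypothetical example state; stream name and bookmark key are illustrative.
example_state = {
    "bookmarks": {
        "orders": {"updated_at": "2023-01-15T00:00:00Z"}
    }
}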
Example #3
def do_sync(config, state, stream):
    singer.set_currently_syncing(state, stream.tap_stream_id)
    singer.write_state(state)

    client = bigquery.Client()
    metadata = stream.metadata[0]["metadata"]
    tap_stream_id = stream.tap_stream_id

    inclusive_start = True
    start_datetime = singer.get_bookmark(state, tap_stream_id,
                                         BOOKMARK_KEY_NAME)
    if start_datetime:
        if not config.get("start_always_inclusive"):
            inclusive_start = False
    else:
        start_datetime = config.get("start_datetime")
    start_datetime = dateutil.parser.parse(start_datetime).strftime(
        "%Y-%m-%d %H:%M:%S.%f")

    if config.get("end_datetime"):
        end_datetime = dateutil.parser.parse(
            config.get("end_datetime")).strftime("%Y-%m-%d %H:%M:%S.%f")

    singer.write_schema(tap_stream_id, stream.schema.to_dict(),
                        stream.key_properties)

    keys = {
        "table": metadata["table"],
        "columns": metadata["columns"],
        "datetime_key": metadata.get("datetime_key"),
        "start_datetime": start_datetime,
        "end_datetime": end_datetime
    }

    limit = config.get("limit", None)
    query = _build_query(keys,
                         metadata.get("filters", []),
                         inclusive_start,
                         limit=limit)
    query_job = client.query(query)

    properties = stream.schema.properties
    last_update = start_datetime

    LOGGER.info("Running query:\n    %s" % query)

    extract_tstamp = datetime.datetime.utcnow()
    extract_tstamp = extract_tstamp.replace(tzinfo=datetime.timezone.utc)

    with metrics.record_counter(tap_stream_id) as counter:
        for row in query_job:
            record = {}
            for key in properties.keys():
                prop = properties[key]

                if key in [
                        LEGACY_TIMESTAMP, EXTRACT_TIMESTAMP, BATCH_TIMESTAMP
                ]:
                    continue

                if row[key] is None:
                    if prop.type[0] != "null":
                        raise ValueError(
                            "NULL value not allowed by the schema")
                    else:
                        record[key] = None
                elif prop.format == "date-time":
                    if type(row[key]) == str:
                        r = dateutil.parser.parse(row[key])
                    elif type(row[key]) == datetime.date:
                        r = datetime.datetime(year=row[key].year,
                                              month=row[key].month,
                                              day=row[key].day)
                    elif type(row[key]) == datetime.datetime:
                        r = row[key]
                    record[key] = r.isoformat()
                elif prop.type[1] == "string":
                    record[key] = str(row[key])
                elif prop.type[1] == "number":
                    record[key] = Decimal(row[key])
                elif prop.type[1] == "integer":
                    record[key] = int(row[key])
                else:
                    record[key] = row[key]

            if LEGACY_TIMESTAMP in properties.keys():
                record[LEGACY_TIMESTAMP] = int(round(time.time() * 1000))
            if EXTRACT_TIMESTAMP in properties.keys():
                record[EXTRACT_TIMESTAMP] = extract_tstamp.isoformat()

            singer.write_record(stream.stream, record)

            last_update = record[keys["datetime_key"]]
            counter.increment()

    state = singer.write_bookmark(state, tap_stream_id, BOOKMARK_KEY_NAME,
                                  last_update)

    singer.write_state(state)
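_build_query is not shown above; the following is a hypothetical sketch of the kind of SQL it might assemble from the keys, filters, inclusive_start, and limit arguments (assumed behavior, not the tap's actual implementation):

# Hypothetical sketch of a query builder matching the arguments used in do_sync.
def _build_query_sketch(keys, filters=None, inclusive_start=True, limit=None):
    columns = ",".join(keys["columns"])
    op = ">=" if inclusive_start else ">"
    conditions = list(filters or [])
    conditions.append("{key} {op} '{start}'".format(
        key=keys["datetime_key"], op=op, start=keys["start_datetime"]))
    if keys.get("end_datetime"):
        conditions.append("{key} < '{end}'".format(
            key=keys["datetime_key"], end=keys["end_datetime"]))
    query = "SELECT {} FROM {} WHERE {} ORDER BY {}".format(
        columns, keys["table"], " AND ".join(conditions), keys["datetime_key"])
    if limit is not None:
        query += " LIMIT {}".format(int(limit))
    return query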
Example #4
def sync_query(cursor, catalog_entry, state, select_sql, columns,
               stream_version, params):
    replication_key = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                          'replication_key')

    query_string = cursor.mogrify(select_sql, params)

    time_extracted = utils.now()

    LOGGER.info('Running %s', query_string)
    cursor.execute(select_sql, params)

    row = cursor.fetchone()
    rows_saved = 0

    database_name = get_database_name(catalog_entry)

    with metrics.record_counter(None) as counter:
        counter.tags['database'] = database_name
        counter.tags['table'] = catalog_entry.table

        while row:
            counter.increment()
            rows_saved += 1
            record_message = row_to_singer_record(catalog_entry,
                                                  stream_version, row, columns,
                                                  time_extracted)
            singer.write_message(record_message)

            md_map = metadata.to_map(catalog_entry.metadata)
            stream_metadata = md_map.get((), {})
            replication_method = stream_metadata.get('replication-method')

            if replication_method in {'FULL_TABLE', 'LOG_BASED'}:
                key_properties = get_key_properties(catalog_entry)

                max_pk_values = singer.get_bookmark(
                    state, catalog_entry.tap_stream_id, 'max_pk_values')

                if max_pk_values:
                    last_pk_fetched = {
                        k: v
                        for k, v in record_message.record.items()
                        if k in key_properties
                    }

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'last_pk_fetched',
                                                  last_pk_fetched)

            elif replication_method == 'INCREMENTAL':
                if replication_key is not None:
                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'replication_key',
                                                  replication_key)

                    state = singer.write_bookmark(
                        state, catalog_entry.tap_stream_id,
                        'replication_key_value',
                        record_message.record[replication_key])
            if rows_saved % 1000 == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

            row = cursor.fetchone()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
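For reference, a hypothetical shape of the bookmarks this loop maintains for an interrupted FULL_TABLE or LOG_BASED sync; the stream name and values are illustrative placeholders.

example_state = {
    "bookmarks": {
        "example_db-widgets": {
            # Upper bound of the initial scan, captured when the sync started.
            "max_pk_values": {"id": 500000},
            # Last primary-key values emitted, used to resume the scan.
            "last_pk_fetched": {"id": 123456},
        }
    }
}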
Example #5
def sync_rows(config,
              state,
              tap_stream_id,
              key_properties=[],
              auth_method=None,
              max_page=None,
              assume_sorted=True,
              filter_by_schema=True,
              raw_output=False):
    """
    - max_page: Force sync to end after max_page. Mostly used for debugging.
    - assume_sorted: Trust the data to be presorted by the
                     index/timestamp/datetime keys
                     so it is safe to finish the replication once the last
                     update index/timestamp/datetime passes the end.
    """
    schema = load_schema(config["schema_dir"], tap_stream_id)
    params = get_init_endpoint_params(config, state, tap_stream_id)
    bookmark_type = get_bookmark_type(config)
    start = get_start(config, state, tap_stream_id, "last_update")
    end = get_end(config)

    headers = get_http_headers(config)

    if start is None:
        LOGGER.warning("None of timestamp_key, datetime_key, and index_key" +
                       " are set in conifg. Bookmarking is not available.")

    start_str = human_readable(bookmark_type, start)
    end_str = human_readable(bookmark_type, end)
    # Log the conditions
    LOGGER.info("Stream %s has %s set starting %s and ending %s." %
                (tap_stream_id, bookmark_type, start_str, end_str))
    # This assumes the URL format set in config contains those params. The
    # behavior depends on the data source API's spec.
    # Records outside the boundary are not filtered out: every record
    # received will be written out.

    LOGGER.info("assume_sorted is set to %s" % assume_sorted)
    # I trust the data to be sorted by the index/timestamp/datetime keys.
    # So it is safe to finish the replication once the last
    # update index/timestamp/datetime passes the end.
    # When in doubt, set this to False. Always perform post-replication dedup.

    LOGGER.info("filter_by_schema is set to %s." % filter_by_schema)
    # When False, fields undefined in or not conforming to the schema are still written out.

    LOGGER.info("auth_method is set to %s" % auth_method)

    # Initialize the counters
    last_update = start

    # Offset is the number of records (vs. page)
    offset_number = params.get("current_offset", 0)
    page_number = params.get("current_page", 0)

    # When we rely on index/datetime/timestamp to build the next GET URL,
    # we may receive records we have already seen in the current process.
    # When last_record_extracted is read from the state file, we can also
    # compare against the previous run to further avoid writing duplicate
    # records to the target data store.
    prev_written_record = None
    last_record_extracted = singer.get_bookmark(state, tap_stream_id,
                                                "last_record_extracted")
    if last_record_extracted:
        prev_written_record = json.loads(last_record_extracted)

    # First write out the schema
    if raw_output is False:
        singer.write_schema(tap_stream_id, schema, key_properties)

    # Fetch and iterate over to write the records
    with metrics.record_counter(tap_stream_id) as counter:
        while True:
            params.update({"current_page": page_number})
            params.update({"current_page_one_base": page_number + 1})
            params.update({"current_offset": offset_number})
            params.update({"last_update": last_update})

            endpoint = get_endpoint(config["url"], tap_stream_id, params)
            LOGGER.info("GET %s", endpoint)

            rows = generate_request(tap_stream_id, endpoint, auth_method,
                                    headers, config.get("username"),
                                    config.get("password"))
            rows = get_record_list(rows, config.get("record_list_level"))

            LOGGER.info("Current page %d" % page_number)
            LOGGER.info("Current offset %d" % offset_number)

            for row in rows:
                record = get_record(row, config.get("record_level"))
                if filter_by_schema:
                    record = filter_record(record, schema)

                # It's important to compare the record before adding
                # EXTRACT_TIMESTAMP
                if record == prev_written_record:
                    LOGGER.debug("Skipping the duplicated row %s" % record)
                    continue

                if EXTRACT_TIMESTAMP in schema["properties"].keys():
                    extract_tstamp = datetime.datetime.utcnow()
                    extract_tstamp = extract_tstamp.replace(
                        tzinfo=datetime.timezone.utc)
                    record[EXTRACT_TIMESTAMP] = extract_tstamp.isoformat()

                next_last_update = get_last_update(config, record, last_update)

                if not end or next_last_update < end:
                    if raw_output:
                        sys.stdout.write(json.dumps(record) + "\n")
                    else:
                        singer.write_record(tap_stream_id, record)

                    counter.increment()  # Increment only when we write
                    last_update = next_last_update

                    # prev_written_record may be persisted for the next run.
                    # EXTRACT_TIMESTAMP will be different. So popping it out
                    # before storing.
                    record.pop(EXTRACT_TIMESTAMP)
                    prev_written_record = record

            # Exit conditions
            if len(rows) < config["items_per_page"]:
                LOGGER.info(
                    ("Response is less than set item per page (%d)." +
                     "Finishing the extraction") % config["items_per_page"])
                break
            if max_page and page_number + 1 >= max_page:
                LOGGER.info("Max page %d reached. Finishing the extraction.")
                break
            if assume_sorted and end and next_last_update >= end:
                LOGGER.info(("Record greater than %s and assume_sorted is" +
                             " set. Finishing the extraction.") % end)
                break

            page_number += 1
            offset_number += len(rows)

    state = singer.write_bookmark(state, tap_stream_id, "last_update",
                                  last_update)
    if prev_written_record:
        state = singer.write_bookmark(state, tap_stream_id,
                                      "last_record_extracted",
                                      json.dumps(prev_written_record))

    if not raw_output:
        singer.write_state(state)

    return state
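get_last_update is not shown; a hypothetical sketch, assuming the new bookmark candidate is read from the record via the configured timestamp/datetime/index key (helper name and logic are illustrative):

def get_last_update_sketch(config, record, current_last_update):
    # Pick whichever bookmark key the config defines (hypothetical helper).
    key = (config.get("timestamp_key") or config.get("datetime_key")
           or config.get("index_key"))
    if not key or record.get(key) is None:
        return current_last_update
    value = record[key]
    if current_last_update is None:
        return value
    return max(current_last_update, value)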
Example #6
def do_sync(sf, catalog, state):
    starting_stream = state.get("current_stream")

    if starting_stream:
        LOGGER.info("Resuming sync from %s", starting_stream)
    else:
        LOGGER.info("Starting sync")

    for catalog_entry in catalog["streams"]:
        stream_version = get_stream_version(catalog_entry, state)
        stream = catalog_entry['stream']
        stream_alias = catalog_entry.get('stream_alias')
        stream_name = catalog_entry["tap_stream_id"]
        activate_version_message = singer.ActivateVersionMessage(
            stream=(stream_alias or stream), version=stream_version)

        catalog_metadata = metadata.to_map(catalog_entry['metadata'])
        replication_key = catalog_metadata.get((), {}).get('replication-key')

        mdata = metadata.to_map(catalog_entry['metadata'])

        if not stream_is_selected(mdata):
            LOGGER.info("%s: Skipping - not selected", stream_name)
            continue

        if starting_stream:
            if starting_stream == stream_name:
                LOGGER.info("%s: Resuming", stream_name)
                starting_stream = None
            else:
                LOGGER.info("%s: Skipping - already synced", stream_name)
                continue
        else:
            LOGGER.info("%s: Starting", stream_name)

        state["current_stream"] = stream_name
        singer.write_state(state)
        key_properties = metadata.to_map(catalog_entry['metadata']).get(
            (), {}).get('table-key-properties')
        singer.write_schema(stream, catalog_entry['schema'], key_properties,
                            replication_key, stream_alias)

        job_id = singer.get_bookmark(state, catalog_entry['tap_stream_id'],
                                     'JobID')
        if job_id:
            with metrics.record_counter(stream) as counter:
                LOGGER.info(
                    "Found JobID from previous Bulk Query. Resuming sync for job: %s",
                    job_id)
                # Resuming a sync should clear out the remaining state once finished
                counter = resume_syncing_bulk_query(sf, catalog_entry, job_id,
                                                    state, counter)
                LOGGER.info("%s: Completed sync (%s rows)", stream_name,
                            counter.value)
                state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'],
                                               {}).pop('JobID', None)
                state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'],
                                               {}).pop('BatchIDs', None)
                bookmark = state.get('bookmarks',
                                     {}).get(catalog_entry['tap_stream_id'],
                                             {}).pop('JobHighestBookmarkSeen',
                                                     None)
                state = singer.write_bookmark(state,
                                              catalog_entry['tap_stream_id'],
                                              replication_key, bookmark)
                singer.write_state(state)
        else:
            # Tables with a replication_key or an empty bookmark will emit an
            # activate_version at the beginning of their sync
            bookmark_is_empty = state.get('bookmarks', {}).get(
                catalog_entry['tap_stream_id']) is None

            if replication_key or bookmark_is_empty:
                singer.write_message(activate_version_message)
                state = singer.write_bookmark(state,
                                              catalog_entry['tap_stream_id'],
                                              'version', stream_version)
            counter = sync_stream(sf, catalog_entry, state)
            LOGGER.info("%s: Completed sync (%s rows)", stream_name,
                        counter.value)

    state["current_stream"] = None
    singer.write_state(state)
    LOGGER.info("Finished sync")
Example #7
def sync_tables(conn_info, logical_streams, state, end_lsn, state_file):
    lsn_committed = min([get_bookmark(state, s['tap_stream_id'], 'lsn') for s in logical_streams])
    start_lsn = lsn_committed
    lsn_to_flush = None
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    lsn_last_processed = None
    lsn_currently_processing = None
    lsn_received_timestamp = None
    lsn_processed_count = 0
    logical_poll_total_seconds = conn_info['logical_poll_total_seconds'] or 300
    poll_interval = 10
    poll_timestamp = None

    selected_tables = []
    for s in logical_streams:
        selected_tables.append("{}.{}".format(s['metadata'][0]['metadata']['schema-name'], s['table_name']))

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    # Create replication connection and cursor
    conn = post_db.open_connection(conn_info, True)
    cur = conn.cursor()

    try:
        LOGGER.info("{} : Starting log streaming at {} to {} (slot {})".format(datetime.datetime.utcnow(), int_to_lsn(start_lsn), int_to_lsn(end_lsn), slot))
        cur.start_replication(slot_name=slot, decode=True, start_lsn=start_lsn, options={'write-in-chunks': 1, 'add-tables': ','.join(selected_tables)})
    except psycopg2.ProgrammingError:
        raise Exception("Unable to start replication with logical replication (slot {})".format(slot))

    # Emulate some behaviour of pg_recvlogical
    LOGGER.info("{} : Confirming write up to 0/0, flush to 0/0".format(datetime.datetime.utcnow()))
    cur.send_feedback(write_lsn=0, flush_lsn=0, reply=True)
    time.sleep(1)

    lsn_received_timestamp = datetime.datetime.utcnow()
    poll_timestamp = datetime.datetime.utcnow()

    while True:
        # Disconnect when no data received for logical_poll_total_seconds
        # needs to be long enough to wait for the largest single wal payload to avoid unplanned timeouts
        poll_duration = (datetime.datetime.utcnow() - lsn_received_timestamp).total_seconds()
        if poll_duration > logical_poll_total_seconds:
            LOGGER.info("{} : Breaking - {} seconds of polling with no data".format(datetime.datetime.utcnow(), poll_duration))
            break

        try:
            msg = cur.read_message()
        except Exception as e:
            LOGGER.error("{} : {}".format(datetime.datetime.utcnow(), e))
            raise

        if msg:
            if msg.data_start > end_lsn:
                LOGGER.info("{} : Breaking - current {} is past end_lsn {}".format(datetime.datetime.utcnow(), int_to_lsn(msg.data_start), int_to_lsn(end_lsn)))
                break

            state = consume_message(logical_streams, state, msg, time_extracted, conn_info, end_lsn)

            # When using wal2json with write-in-chunks, multiple messages can have the same lsn
            # This is to ensure we only flush to lsn that has completed entirely
            if lsn_currently_processing is None:
                lsn_currently_processing = msg.data_start
                LOGGER.info("{} : First message received is {} at {}".format(datetime.datetime.utcnow(), int_to_lsn(lsn_currently_processing), datetime.datetime.utcnow()))

                # Flush Postgres WAL up to the LSN committed in the previous run, or the first LSN received in this run
                lsn_to_flush = lsn_committed
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info("{} : Confirming write up to {}, flush to {}".format(datetime.datetime.utcnow(), int_to_lsn(lsn_to_flush), int_to_lsn(lsn_to_flush)))
                cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush, reply=True)

            elif (int(msg.data_start) > lsn_currently_processing):
                lsn_last_processed = lsn_currently_processing
                lsn_currently_processing = msg.data_start
                lsn_received_timestamp = datetime.datetime.utcnow()
                lsn_processed_count = lsn_processed_count + 1
                if lsn_processed_count >= UPDATE_BOOKMARK_PERIOD:
                    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                    lsn_processed_count = 0

        # When data is received, and when data is not received, a keep-alive poll needs to be returned to PostgreSQL
        if datetime.datetime.utcnow() >= (poll_timestamp + datetime.timedelta(seconds=poll_interval)):
            if lsn_currently_processing is None:
                LOGGER.info("{} : Sending keep-alive message to source server (last message received was {} at {})".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_last_processed), lsn_received_timestamp))
                cur.send_feedback()
            elif state_file is None:
                LOGGER.info("{} : Sending keep-alive message to source server (last message received was {} at {})".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_last_processed), lsn_received_timestamp))
                cur.send_feedback()
            else:
                # Read the lsn_committed currently captured in the state file on disk
                lsn_committed = min([get_bookmark(utils.load_json(state_file), s['tap_stream_id'], 'lsn') for s in logical_streams])
                lsn_to_flush = lsn_committed
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info("{} : Confirming write up to {}, flush to {} (last message received was {} at {})".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_to_flush), int_to_lsn(lsn_to_flush), int_to_lsn(lsn_last_processed), lsn_received_timestamp))
                cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush, reply=True)
            poll_timestamp = datetime.datetime.utcnow()

    # Close replication connection and cursor
    cur.close()
    conn.close()

    if lsn_last_processed:
        if lsn_committed > lsn_last_processed:
            LOGGER.info("Current lsn_last_processed {} is older than lsn_committed {}".format(int_to_lsn(lsn_last_processed), int_to_lsn(lsn_committed)))
            lsn_last_processed = lsn_committed
        for s in logical_streams:
            LOGGER.info("updating bookmark for stream {} to lsn = {} ({})".format(s['tap_stream_id'], lsn_last_processed, int_to_lsn(lsn_last_processed)))
            state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn', lsn_last_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
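int_to_lsn is only used for logging above; a minimal sketch of the usual integer-to-"X/Y" Postgres LSN formatting (an assumption, not necessarily the tap's implementation):

def int_to_lsn_sketch(lsn):
    if lsn is None:
        return None
    # High 32 bits / low 32 bits, both rendered as hex, e.g. "16/B374D848".
    return "{:X}/{:X}".format(lsn >> 32, lsn & 0xFFFFFFFF)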
Example #8
def sync_report(client, account_id, report_stream):
    report_name = stringcase.pascalcase(report_stream.stream)

    report_schema = get_report_schema(client, report_name)
    singer.write_schema(report_stream.stream, report_schema, [])

    state_key = '{}_{}'.format(account_id, report_stream.stream)
    config_start_date = CONFIG.get('start_date')
    bookmark = singer.get_bookmark(STATE, state_key, 'date')
    conversion_window = int(CONFIG.get('conversion_window', '-30'))
    start_date = arrow.get(bookmark
                           or config_start_date).shift(days=conversion_window)
    end_date = arrow.get(CONFIG.get('end_date'))  # defaults to now

    LOGGER.info('Syncing report: {} - from {} to {}'.format(
        report_name, start_date, end_date))

    report_request = client.factory.create('{}Request'.format(report_name))
    report_request.Format = 'Csv'
    report_request.Aggregation = 'Daily'
    report_request.Language = 'English'
    report_request.ExcludeReportHeader = True
    report_request.ExcludeReportFooter = True

    scope = client.factory.create('AccountThroughAdGroupReportScope')
    scope.AccountIds = {'long': [account_id]}
    report_request.Scope = scope

    excluded_fields = ['GregorianDate', '_sdc_report_datetime']
    if report_name in reports.EXTRA_FIELDS:
        excluded_fields += reports.EXTRA_FIELDS[report_name]

    selected_fields = get_selected_fields(report_stream,
                                          exclude=excluded_fields)
    selected_fields.append('TimePeriod')

    report_columns = client.factory.create(
        'ArrayOf{}Column'.format(report_name))
    getattr(report_columns,
            '{}Column'.format(report_name)).append(selected_fields)
    report_request.Columns = report_columns

    request_start_date = client.factory.create('Date')
    request_start_date.Day = start_date.day
    request_start_date.Month = start_date.month
    request_start_date.Year = start_date.year

    request_end_date = client.factory.create('Date')
    request_end_date.Day = end_date.day
    request_end_date.Month = end_date.month
    request_end_date.Year = end_date.year

    report_time = client.factory.create('ReportTime')
    report_time.CustomDateRangeStart = request_start_date
    report_time.CustomDateRangeEnd = request_end_date
    report_time.PredefinedTime = None
    report_request.Time = report_time

    report_time = arrow.get().isoformat()

    request_id = client.SubmitGenerateReport(report_request)

    download_url = poll_report(client, report_name, start_date, end_date,
                               request_id)

    if download_url:
        stream_report(report_stream.stream, report_name, download_url,
                      report_time)

    singer.write_bookmark(STATE, state_key, 'date', end_date.isoformat())
    singer.write_state(STATE)
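To illustrate the conversion_window logic above: shifting the bookmark back re-extracts a trailing window so late-attributed conversions are captured. Dates are illustrative; this assumes the arrow package already used by the tap.

import arrow  # assumed available, as in the tap above

bookmark = "2023-03-01"
conversion_window = -30
start_date = arrow.get(bookmark).shift(days=conversion_window)
print(start_date.date())  # 2023-01-30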
Example #9
def do_sync(sf, catalog, state):
    starting_stream = state.get("current_stream")

    if starting_stream:
        LOGGER.info("Resuming sync from %s", starting_stream)
    else:
        LOGGER.info("Starting sync")

    for catalog_entry in catalog["streams"]:
        stream_version = get_stream_version(catalog_entry, state)
        stream = catalog_entry['stream']
        stream_alias = catalog_entry.get('stream_alias')
        stream_name = catalog_entry["tap_stream_id"]
        activate_version_message = singer.ActivateVersionMessage(
            stream=(stream_alias or stream), version=stream_version)

        catalog_metadata = metadata.to_map(catalog_entry['metadata'])
        replication_key = catalog_metadata.get((), {}).get('replication-key')

        mdata = metadata.to_map(catalog_entry['metadata'])

        if not stream_is_selected(mdata):
            LOGGER.info("%s: Skipping - not selected", stream_name)
            continue

        if starting_stream:
            if starting_stream == stream_name:
                LOGGER.info("%s: Resuming", stream_name)
                starting_stream = None
            else:
                LOGGER.info("%s: Skipping - already synced", stream_name)
                continue
        else:
            LOGGER.info("%s: Starting", stream_name)

        state["current_stream"] = stream_name
        singer.write_state(state)
        key_properties = metadata.to_map(catalog_entry['metadata']).get(
            (), {}).get('table-key-properties')
        singer.write_schema(stream, catalog_entry['schema'], key_properties,
                            replication_key, stream_alias)

        job_id = singer.get_bookmark(state, catalog_entry['tap_stream_id'],
                                     'JobID')
        batch_ids = singer.get_bookmark(state, catalog_entry['tap_stream_id'],
                                        'BatchIDs')
        # Checking whether job_id list is not empty and batches list is not empty
        if job_id and batch_ids:
            with metrics.record_counter(stream) as counter:
                LOGGER.info(
                    "Found JobID from previous Bulk Query. Resuming sync for job: %s",
                    job_id)
                # Resuming a sync should clear out the remaining state once finished
                counter = resume_syncing_bulk_query(sf, catalog_entry, job_id,
                                                    state, counter)
                LOGGER.info("%s: Completed sync (%s rows)", stream_name,
                            counter.value)
                # Remove Job info from state once we complete this resumed query. One of a few cases could have occurred:
                # 1. The job succeeded, in which case make JobHighestBookmarkSeen the new bookmark
                # 2. The job partially completed, in which case make JobHighestBookmarkSeen the new bookmark, or
                #    existing bookmark if no bookmark exists for the Job.
                # 3. The job completely failed, in which case maintain the existing bookmark, or None if no bookmark
                state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'],
                                               {}).pop('JobID', None)
                state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'],
                                               {}).pop('BatchIDs', None)
                bookmark = state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}) \
                                                     .pop('JobHighestBookmarkSeen', None)
                existing_bookmark = state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}) \
                                                              .pop(replication_key, None)
                state = singer.write_bookmark(
                    state, catalog_entry['tap_stream_id'], replication_key,
                    bookmark or existing_bookmark
                )  # If job is removed, reset to existing bookmark or None
                singer.write_state(state)
        else:
            # Tables with a replication_key or an empty bookmark will emit an
            # activate_version at the beginning of their sync
            bookmark_is_empty = state.get('bookmarks', {}).get(
                catalog_entry['tap_stream_id']) is None

            if replication_key or bookmark_is_empty:
                singer.write_message(activate_version_message)
                state = singer.write_bookmark(state,
                                              catalog_entry['tap_stream_id'],
                                              'version', stream_version)
            counter = sync_stream(sf, catalog_entry, state)
            LOGGER.info("%s: Completed sync (%s rows)", stream_name,
                        counter.value)

    state["current_stream"] = None
    singer.write_state(state)
    LOGGER.info("Finished sync")
Example #10
def sync_table(conn_config, stream, state, desired_columns):
   connection = orc_db.open_connection(conn_config)
   connection.outputtypehandler = common.OutputTypeHandler

   cur = connection.cursor()
   cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
   cur.execute("""ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'""")
   cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'""")
   cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT  = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'""")
   time_extracted = utils.now()

   #before writing the table version to state, check if we had one to begin with
   first_run = singer.get_bookmark(state, stream.tap_stream_id, 'version') is None

   #pick a new table version IFF we do not have an ORA_ROWSCN in our state
   #the presence of an ORA_ROWSCN indicates that we were interrupted last time through
   if singer.get_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN') is None:
      nascent_stream_version = int(time.time() * 1000)
   else:
      nascent_stream_version = singer.get_bookmark(state, stream.tap_stream_id, 'version')

   state = singer.write_bookmark(state,
                                 stream.tap_stream_id,
                                 'version',
                                 nascent_stream_version)
   singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

   # cur = connection.cursor()
   md = metadata.to_map(stream.metadata)
   schema_name = md.get(()).get('schema-name')

   escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c), desired_columns)
   escaped_schema  = schema_name
   escaped_table   = stream.table
   activate_version_message = singer.ActivateVersionMessage(
      stream=stream.stream,
      version=nascent_stream_version)

   if first_run:
      singer.write_message(activate_version_message)

   with metrics.record_counter(None) as counter:
      ora_rowscn = singer.get_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN')
      if ora_rowscn:
         LOGGER.info("Resuming Full Table replication %s from ORA_ROWSCN %s", nascent_stream_version, ora_rowscn)
         select_sql      = """SELECT {}, ORA_ROWSCN
                                FROM {}.{}
                               WHERE ORA_ROWSCN >= {}
                               ORDER BY ORA_ROWSCN ASC
                                """.format(','.join(escaped_columns),
                                           escaped_schema,
                                           escaped_table,
                                           ora_rowscn)
      else:
         select_sql      = """SELECT {}, ORA_ROWSCN
                                FROM {}.{}
                               ORDER BY ORA_ROWSCN ASC""".format(','.join(escaped_columns),
                                                                    escaped_schema,
                                                                    escaped_table)

      rows_saved = 0
      LOGGER.info("select %s", select_sql)
      for row in cur.execute(select_sql):
         ora_rowscn = row[-1]
         row = row[:-1]
         record_message = common.row_to_singer_message(stream,
                                                       row,
                                                       nascent_stream_version,
                                                       desired_columns,
                                                       time_extracted)

         singer.write_message(record_message)
         state = singer.write_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN', ora_rowscn)
         rows_saved = rows_saved + 1
         if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

         counter.increment()


   state = singer.write_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN', None)
   #always send the activate version whether first run or subsequent
   singer.write_message(activate_version_message)
   cur.close()
   connection.close()
   return state
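For reference, a hypothetical mid-sync state for this resumable full-table pattern: the presence of ORA_ROWSCN signals that the previous run was interrupted, so the stored version is reused and the scan resumes from that SCN (stream name and values are placeholders).

example_state = {
    "bookmarks": {
        "ORCL-WIDGETS": {"version": 1673740800000, "ORA_ROWSCN": 1234567}
    }
}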
Example #11
def sync_table(client, stream, state, stream_version, blacklist):
    common.whitelist_bookmark_keys(generate_bookmark_keys(stream),
                                   stream['tap_stream_id'], state)

    mdata = metadata.to_map(stream['metadata'])
    stream_metadata = mdata.get(())

    database_name = stream_metadata['database-name']

    db = client[database_name]
    collection = db[stream['stream']]

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream['stream'], version=stream_version)

    initial_full_table_complete = singer.get_bookmark(
        state, stream['tap_stream_id'], 'initial_full_table_complete')

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete:
        singer.write_message(activate_version_message)

    max_id_value = singer.get_bookmark(
        state, stream['tap_stream_id'],
        'max_id_value') or get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched')

    state = singer.write_bookmark(state, stream['tap_stream_id'],
                                  'max_id_value', max_id_value)

    find_filter = {'$lte': objectid.ObjectId(max_id_value)}

    if last_id_fetched:
        find_filter['$gt'] = objectid.ObjectId(last_id_fetched)

    LOGGER.info("Starting full table replication for table {}.{}".format(
        database_name, stream['stream']))

    with metrics.record_counter(None) as counter:
        with collection.find({'_id': find_filter},
                             sort=[("_id", pymongo.DESCENDING)]) as cursor:
            rows_saved = 0

            time_extracted = utils.now()

            while cursor.alive:
                try:
                    row = next(cursor)
                    rows_saved += 1

                    whitelisted_row = {
                        k: v
                        for k, v in row.items() if k not in blacklist
                    }
                    record_message = common.row_to_singer_record(
                        stream, whitelisted_row, stream_version,
                        time_extracted)

                    singer.write_message(record_message)

                    state = singer.write_bookmark(state,
                                                  stream['tap_stream_id'],
                                                  'last_id_fetched',
                                                  str(row['_id']))

                    if rows_saved % 1000 == 0:
                        singer.write_state(state)
                except InvalidBSON as e:
                    LOGGER.info(e)
                    continue

    # clear max _id value and last _id fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')

    singer.write_message(activate_version_message)
    singer.write_state(state)
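get_max_id_value is not shown; a hypothetical sketch of the usual approach, reading the largest _id before the scan starts (assumes a pymongo collection, as above):

def get_max_id_value_sketch(collection):
    # Largest _id currently in the collection, or None if it is empty.
    row = collection.find_one(sort=[("_id", -1)])
    return str(row["_id"]) if row else None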
Example #12
def get_bookmark(name):
    bookmark = singer.get_bookmark(Context.state, name, 'start_date')
    if bookmark is None:
        bookmark = Context.config['start_date']
    return bookmark
Example #13
def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    binlog_streams_map = generate_streams_map(binlog_streams)

    for tap_stream_id in binlog_streams_map.keys():
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map,
                                           state)

    verify_log_file_exists(mysql_conn, log_file, log_pos)

    server_id = fetch_server_id(mysql_conn)

    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(connection_settings={},
                                server_id=server_id,
                                log_file=log_file,
                                log_pos=log_pos,
                                resume_stream=True,
                                only_events=[
                                    RotateEvent, WriteRowsEvent,
                                    UpdateRowsEvent, DeleteRowsEvent
                                ],
                                pymysql_wrapper=connection_wrapper)

    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s",
                log_file, log_pos)

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)

            elif catalog_entry:
                initial_binlog_complete = singer.get_bookmark(
                    state, catalog_entry.tap_stream_id,
                    'initial_binlog_complete')

                if (initial_binlog_complete and reader.log_file == log_file
                        and reader.log_pos == log_pos):
                    LOGGER.info(
                        "Skipping event for stream(%s) log_file=%s and log_pos=%s as it was processed last sync",
                        catalog_entry.tap_stream_id, reader.log_file,
                        reader.log_pos)
                    continue

                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.info(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        state = update_bookmarks(state, binlog_streams_map, reader.log_file,
                                 reader.log_pos)

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == reader.log_file and reader.log_pos >= current_log_pos:
            break

        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
            (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    state = update_initial_binlog_complete(binlog_streams_map, state)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
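update_bookmarks is not shown; a minimal sketch, assuming it records the binlog coordinates on every selected stream's bookmark (an assumption, not necessarily the tap's exact code):

import singer


def update_bookmarks_sketch(state, binlog_streams_map, log_file, log_pos):
    # Store the current binlog file/position for each selected stream.
    for tap_stream_id in binlog_streams_map:
        state = singer.write_bookmark(state, tap_stream_id, "log_file", log_file)
        state = singer.write_bookmark(state, tap_stream_id, "log_pos", log_pos)
    return state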
Example #14
def sync_event_updates(stream_name):
    '''
    Get updates via events endpoint

    look at 'events update' bookmark and pull events after that
    '''
    LOGGER.info("Started syncing event based updates")

    date_window_size = 60 * 60 * 24 # Seconds in a day

    bookmark_value = singer.get_bookmark(Context.state,
                                         stream_name + '_events',
                                         'updates_created') or \
                     int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())
    max_created = bookmark_value
    date_window_start = max_created
    date_window_end = max_created + date_window_size

    stop_paging = False

    # Create a map relating event object ids to timestamps
    updated_object_timestamps = {}

    while not stop_paging:
        extraction_time = singer.utils.now()

        response = STREAM_SDK_OBJECTS['events']['sdk_object'].list(**{
            "limit": 100,
            "type": STREAM_TO_TYPE_FILTER[stream_name]['type'],
            "stripe_account" : Context.config.get('account_id'),
            # None passed to starting_after appears to retrieve
            # all of them so this should always be safe.
            "created[gte]": date_window_start,
            "created[lt]": date_window_end,
        })

        # If there are no results and the date window has passed the current time, stop paging
        if not len(response) and date_window_end > extraction_time.timestamp(): # pylint: disable=len-as-condition
            stop_paging = True

        for events_obj in response.auto_paging_iter():
            event_resource_obj = events_obj.data.object
            sub_stream_name = SUB_STREAMS.get(stream_name)


            # Check whether we should sync the event based on its created time
            if not should_sync_event(events_obj,
                                     STREAM_TO_TYPE_FILTER[stream_name]['object'],
                                     updated_object_timestamps):
                continue

            # Syncing an event as it's the first time we've seen it or it's the most recent version
            with Transformer(singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
                event_resource_metadata = metadata.to_map(
                    Context.get_catalog_entry(stream_name)['metadata']
                )

                # Filter out line items with null ids
                if isinstance(events_obj.get('data').get('object'), stripe.Invoice):
                    invoice_obj = events_obj.get('data', {}).get('object', {})
                    line_items = invoice_obj.get('lines', {}).get('data')

                    if line_items:
                        filtered_line_items = [line_item for line_item in line_items
                                               if line_item.get('id')]

                        invoice_obj['lines']['data'] = filtered_line_items

                rec = recursive_to_dict(event_resource_obj)
                rec = unwrap_data_objects(rec)
                rec = reduce_foreign_keys(rec, stream_name)
                rec["updated"] = events_obj.created
                rec = transformer.transform(
                    rec,
                    Context.get_catalog_entry(stream_name)['schema'],
                    event_resource_metadata
                )

                if events_obj.created >= bookmark_value:
                    if rec.get('id') is not None:
                        singer.write_record(stream_name,
                                            rec,
                                            time_extracted=extraction_time)
                        Context.updated_counts[stream_name] += 1

                        # Delete events should be synced but not their subobjects
                        if events_obj.get('type', '').endswith('.deleted'):
                            continue

                        if sub_stream_name and Context.is_selected(sub_stream_name):
                            if event_resource_obj:
                                sync_sub_stream(sub_stream_name,
                                                event_resource_obj,
                                                updates=True)
            if events_obj.created > max_created:
                max_created = events_obj.created

        # The events stream returns results in descending order, so we
        # cannot bookmark until the entire page is processed
        date_window_start = date_window_end
        date_window_end = date_window_end + date_window_size
        singer.write_bookmark(Context.state,
                              stream_name + '_events',
                              'updates_created',
                              max_created)
        singer.write_state(Context.state)

    singer.write_state(Context.state)
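For reference, a hypothetical shape of the bookmark written above: the events-based updates cursor lives under "<stream>_events" as epoch seconds (stream name and value are illustrative).

example_state = {
    "bookmarks": {
        "charges_events": {"updates_created": 1673740800}
    }
}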
Example #15
def sync_table(config, mysql_conn, catalog_entry, state, columns,
               stream_version):
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry),
                                   catalog_entry.tap_stream_id, state)

    bookmark = state.get('bookmarks', {}).get(catalog_entry.tap_stream_id, {})
    version_exists = 'version' in bookmark

    initial_full_table_complete = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, 'initial_full_table_complete')

    state_version = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                        'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete and not (version_exists
                                                and state_version is None):
        singer.write_message(activate_version_message)

    key_props_are_auto_incrementing = pks_are_auto_incrementing(
        mysql_conn, catalog_entry)

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)

            if key_props_are_auto_incrementing:
                LOGGER.info(
                    "Detected auto-incrementing primary key(s) - will replicate incrementally"
                )
                max_pk_values = singer.get_bookmark(
                    state, catalog_entry.tap_stream_id,
                    'max_pk_values') or get_max_pk_values(cur, catalog_entry)

                if not max_pk_values:
                    LOGGER.info(
                        "No max value for auto-incrementing PK found for table %s",
                        catalog_entry.table)
                else:
                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'max_pk_values',
                                                  max_pk_values)

                    pk_clause = generate_pk_clause(catalog_entry, state)

                    select_sql += pk_clause

            params = {}

            common.sync_query(config, cur, catalog_entry, state, select_sql,
                              columns, stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')
    singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                          'last_pk_fetched')

    singer.write_message(activate_version_message)
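Note on the example above: the resumable full-table pattern rests on two bookmarks, max_pk_values (the highest primary-key values the sync intends to cover) and last_pk_fetched (the last keys actually emitted), which generate_pk_clause turns into a WHERE clause so an interrupted run can resume. The real generate_pk_clause is not shown here; the following is only a minimal, hypothetical single-column sketch of that idea, not tap-mysql's implementation.

def generate_pk_clause_sketch(pk_column, state, tap_stream_id):
    # Hypothetical, single-column simplification of generate_pk_clause.
    bookmark = state.get('bookmarks', {}).get(tap_stream_id, {})
    max_pk = bookmark['max_pk_values'][pk_column]
    last_fetched = bookmark.get('last_pk_fetched', {}).get(pk_column)

    if last_fetched is not None:
        # Resume after the last emitted key, but never past the recorded max.
        return " WHERE `{0}` > {1} AND `{0}` <= {2} ORDER BY `{0}`".format(
            pk_column, last_fetched, max_pk)

    return " WHERE `{0}` <= {1} ORDER BY `{0}`".format(pk_column, max_pk)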
Example #16
 def get_bookmark(self, state):
     return singer.get_bookmark(state, self.name, self.replication_key)
Example #17
def get_start(STATE, tap_stream_id, bookmark_key):
    current_bookmark = singer.get_bookmark(STATE, tap_stream_id, bookmark_key)
    if current_bookmark is None:
        return CONFIG["start_date"]

    return current_bookmark
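A minimal usage sketch for the helper above, assuming a hypothetical stream with an 'updated_at' replication key: get_start supplies either the saved bookmark or the configured start date, and the bookmark is advanced with singer.write_bookmark as records are emitted.

def sync_records_sketch(STATE, tap_stream_id, records):
    # Hypothetical illustration only; 'updated_at' and the records iterable
    # are assumptions, not part of the original example.
    start = get_start(STATE, tap_stream_id, 'updated_at')
    for record in records:
        if record['updated_at'] >= start:
            singer.write_record(tap_stream_id, record)
            STATE = singer.write_bookmark(STATE, tap_stream_id,
                                          'updated_at', record['updated_at'])
    singer.write_state(STATE)
    return STATE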
Example #18
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream["tap_stream_id"],
                                         "version")
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream["tap_stream_id"], "version",
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get("schema-name")

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=stream_version,
    )

    singer.write_message(activate_version_message)

    replication_key = md_map.get((), {}).get("replication-key")
    replication_key_value = singer.get_bookmark(state, stream["tap_stream_id"],
                                                "replication_key_value")
    replication_key_sql_datatype = md_map.get(
        ("properties", replication_key)).get("sql-datatype")

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:

            # Client side character encoding defaults to client_encoding in postgresql.conf.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name="stitch_cursor") as cur:
                cur.itersize = post_db.cursor_iter_size
                LOGGER.info("Beginning new incremental replication sync %s",
                            stream_version)
                if replication_key_value:
                    select_sql = """SELECT {}
                                    FROM {}
                                    WHERE {} > '{}'::{}
                                    ORDER BY {} ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream["table_name"]),
                        post_db.prepare_columns_sql(replication_key),
                        replication_key_value,
                        replication_key_sql_datatype,
                        post_db.prepare_columns_sql(replication_key),
                    )
                else:
                    # if not replication_key_value
                    select_sql = """SELECT {}
                                    FROM {}
                                    ORDER BY {} ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream["table_name"]),
                        post_db.prepare_columns_sql(replication_key),
                    )

                LOGGER.info("select statement: %s with itersize %s",
                            select_sql, cur.itersize)
                cur.execute(select_sql)
                LOGGER.info("Query returned - processing results")

                rows_saved = 0

                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream,
                        rec,
                        stream_version,
                        desired_columns,
                        time_extracted,
                        md_map,
                    )
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1

                    # Picking a replication_key with NULL values will result in it ALWAYS being
                    # synced which is not great. Even worse would be allowing the NULL value to
                    # enter into the state.
                    if record_message.record[replication_key] is not None:
                        state = singer.write_bookmark(
                            state,
                            stream["tap_stream_id"],
                            "replication_key_value",
                            record_message.record[replication_key],
                        )

                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    LOGGER.info("Incremental table tap complete")
    return state
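The SELECT above interpolates replication_key_value (and the column names) directly into the SQL string, mirroring the original tap. A hedged alternative sketch, not part of the example, builds the same incremental query with psycopg2's sql module (assuming psycopg2 2.8+ for the two-part Identifier) and binds the bookmark value as a parameter at execute() time:

from psycopg2 import sql

def build_incremental_select(schema_name, table_name, columns,
                             replication_key, sql_datatype):
    # Sketch only: compose identifiers safely and leave %s for the bookmark
    # value, e.g. cur.execute(query, (replication_key_value,)).
    return sql.SQL(
        "SELECT {cols} FROM {table} "
        "WHERE {rep_key} > %s::{datatype} ORDER BY {rep_key} ASC"
    ).format(
        cols=sql.SQL(", ").join(sql.Identifier(c) for c in columns),
        table=sql.Identifier(schema_name, table_name),
        rep_key=sql.Identifier(replication_key),
        datatype=sql.SQL(sql_datatype),
    )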
Example #19
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream["tap_stream_id"],
                                    "version") is None

    # pick a new table version IFF we do not have an xmin in our state
    # the presence of an xmin indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream["tap_stream_id"], "xmin") is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state,
                                                     stream["tap_stream_id"],
                                                     "version")

    state = singer.write_bookmark(state, stream["tap_stream_id"], "version",
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get("schema-name")

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version,
    )

    if first_run:
        singer.write_message(activate_version_message)

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:

            # Client side character encoding defaults to client_encoding in postgresql.conf.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name="stitch_cursor") as cur:
                cur.itersize = post_db.cursor_iter_size

                xmin = singer.get_bookmark(state, stream["tap_stream_id"],
                                           "xmin")
                if xmin:
                    LOGGER.info(
                        "Resuming Full Table replication %s from xmin %s",
                        nascent_stream_version,
                        xmin,
                    )
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {} where age(xmin::xid) <= age('{}'::xid)
                                     ORDER BY xmin::text::bigint ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream["table_name"]),
                        xmin,
                    )
                else:
                    LOGGER.info("Beginning new Full Table replication %s",
                                nascent_stream_version)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     ORDER BY xmin::text::bigint ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream["table_name"]),
                    )

                LOGGER.info("select %s with itersize %s", select_sql,
                            cur.itersize)
                cur.execute(select_sql)
                LOGGER.info("Query returned - processing results")

                rows_saved = 0
                for rec in cur:
                    xmin = rec["xmin"]
                    rec = rec[:-1]
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    state = singer.write_bookmark(state,
                                                  stream["tap_stream_id"],
                                                  "xmin", xmin)
                    rows_saved = rows_saved + 1
                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

                LOGGER.info(
                    "Processing complete - saved {} rows".format(rows_saved))

    # once we have completed the full table replication, discard the xmin bookmark.
    # the xmin bookmark only comes into play when a full table replication is interrupted
    LOGGER.info("Writing bookmark")
    state = singer.write_bookmark(state, stream["tap_stream_id"], "xmin", None)

    # always send the activate version whether first run or subsequent
    LOGGER.info("Sending activate version message")
    singer.write_message(activate_version_message)

    LOGGER.info("Full table tap complete")
    return state
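For orientation, the full-table sync above leaves two bookmarks in state: 'version' is written up front and reused if the run is interrupted, while 'xmin' records the last transaction id emitted and is cleared once the table completes. A rough sketch of the state shape (the stream id and values below are hypothetical):

# While a full-table sync is interrupted and can be resumed:
interrupted_state = {
    "bookmarks": {
        "public-orders": {
            "version": 1617000000000,   # reused on resume
            "xmin": 735421,             # resume point for the age(xmin) filter
        }
    }
}

# After the table completes, the xmin bookmark is discarded:
completed_state = {
    "bookmarks": {
        "public-orders": {
            "version": 1617000000000,
            "xmin": None,
        }
    }
}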
Example #20
def get_start(STATE, tap_stream_id, bookmark_key):
    current_bookmark = singer.get_bookmark(STATE, tap_stream_id, bookmark_key)
    if current_bookmark is None:
        return 0
    return current_bookmark
Example #21
 def get_bookmark(self, state):
     return utils.strptime_with_tz(
         singer.get_bookmark(state, self.name, self.replication_key))
Example #22
File: sync.py  Project: jazsmi/tap-square
def sync(config, state, catalog):
    client = SquareClient(config)

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client, state)
            replication_key = stream_obj.replication_key
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info('Starting sync for stream: %s', tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(tap_stream_id, stream_schema,
                                stream_obj.key_properties,
                                stream.replication_key)

            start_time = singer.get_bookmark(state, tap_stream_id,
                                             replication_key,
                                             config['start_date'])
            bookmarked_cursor = singer.get_bookmark(state, tap_stream_id,
                                                    'cursor')

            if tap_stream_id == 'shifts':
                replication_key = stream_obj.replication_key

                sync_start_bookmark = singer.get_bookmark(
                    state, tap_stream_id, 'sync_start',
                    singer.utils.strftime(
                        singer.utils.now(),
                        format_str=singer.utils.DATETIME_PARSE))
                state = singer.write_bookmark(
                    state,
                    tap_stream_id,
                    'sync_start',
                    sync_start_bookmark,
                )
                for page, cursor in stream_obj.sync(start_time,
                                                    bookmarked_cursor):
                    for record in page:
                        if record[replication_key] >= start_time:
                            transformed_record = transformer.transform(
                                record,
                                stream_schema,
                                stream_metadata,
                            )
                            singer.write_record(
                                tap_stream_id,
                                transformed_record,
                            )
                    state = singer.write_bookmark(state, tap_stream_id,
                                                  'cursor', cursor)
                    singer.write_state(state)

                state = singer.clear_bookmark(state, tap_stream_id,
                                              'sync_start')
                state = singer.write_bookmark(
                    state,
                    tap_stream_id,
                    replication_key,
                    sync_start_bookmark,
                )
                singer.write_state(state)

            elif stream_obj.replication_method == 'INCREMENTAL':
                replication_key = stream_obj.replication_key
                max_record_value = start_time
                for page, cursor in stream_obj.sync(start_time,
                                                    bookmarked_cursor):
                    for record in page:
                        transformed_record = transformer.transform(
                            record, stream_schema, stream_metadata)
                        singer.write_record(
                            tap_stream_id,
                            transformed_record,
                        )
                        if record[replication_key] > max_record_value:
                            max_record_value = transformed_record[
                                replication_key]

                    state = singer.write_bookmark(state, tap_stream_id,
                                                  'cursor', cursor)
                    state = singer.write_bookmark(state, tap_stream_id,
                                                  replication_key,
                                                  max_record_value)
                    singer.write_state(state)

            else:
                for record in stream_obj.sync(start_time, bookmarked_cursor):
                    transformed_record = transformer.transform(
                        record, stream_schema, stream_metadata)
                    singer.write_record(
                        tap_stream_id,
                        transformed_record,
                    )
            state = singer.clear_bookmark(state, tap_stream_id, 'cursor')
            singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
Example #23
def do_sync_historical_binlog(mysql_conn, config, catalog_entry, state,
                              columns):
    binlog.verify_binlog_config(mysql_conn)

    is_view = common.get_is_view(catalog_entry)
    key_properties = common.get_key_properties(catalog_entry)

    if is_view:
        raise Exception(
            "Unable to replicate stream({}) with binlog because it is a view.".
            format(catalog_entry.stream))

    log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                   'log_file')

    log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                  'log_pos')

    max_pk_values = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                        'max_pk_values')

    last_pk_fetched = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                          'last_pk_fetched')

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)

    if log_file and log_pos and max_pk_values:
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s",
                    catalog_entry.tap_stream_id)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns,
                              stream_version)

    else:
        LOGGER.info(
            "Performing initial full table sync for LOG_BASED stream %s",
            catalog_entry.tap_stream_id)

        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      'initial_binlog_complete', False)

        current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(
            mysql_conn)
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      'version', stream_version)

        if full_table.pks_are_auto_incrementing(mysql_conn, catalog_entry):
            # We must save log_file and log_pos across FULL_TABLE syncs when using
            # an incrementing PK
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_file', current_log_file)

            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_pos', current_log_pos)

            full_table.sync_table(mysql_conn, catalog_entry, state, columns,
                                  stream_version)

        else:
            full_table.sync_table(mysql_conn, catalog_entry, state, columns,
                                  stream_version)
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_file', current_log_file)

            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_pos', current_log_pos)
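fetch_current_log_file_and_pos is referenced above but not shown. As a hedged sketch only (an assumption about what such a helper typically does, not tap-mysql's actual code), it can be read as a SHOW MASTER STATUS query that returns the current binlog file name and position, reusing connect_with_backoff from the earlier example:

def fetch_current_log_file_and_pos_sketch(mysql_conn):
    # Hypothetical helper: read the server's current binlog coordinates.
    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute("SHOW MASTER STATUS")
            result = cur.fetchone()
            if result is None:
                raise Exception("MySQL binary logging appears to be disabled.")
            # Columns: File, Position, ...
            return result[0], result[1]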
Example #24
 def get_since_id(self):
     return singer.get_bookmark(Context.state,
                                # name is overridden by some substreams
                                self.name,
                                'since_id')
Example #25
def get_current_sync_start(state, tap_stream_id):
    current_sync_start_value = singer.get_bookmark(state, tap_stream_id,
                                                   "current_sync_start")
    if current_sync_start_value is None:
        return current_sync_start_value
    return utils.strptime_to_utc(current_sync_start_value)

Example #26
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # pick a new table version IFF we do not have an xmin in our state
    # the presence of an xmin indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream['tap_stream_id'], 'xmin') is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state,
                                                     stream['tap_stream_id'],
                                                     'version')

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(
        partial(post_db.prepare_columns_for_select_sql, md_map=md_map),
        desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream),
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:

            # Client side character encoding defaults to the value in postgresql.conf under client_encoding.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name='stitch_cursor') as cur:
                cur.itersize = post_db.CURSOR_ITER_SIZE

                fq_table_name = post_db.fully_qualified_table_name(
                    schema_name, stream['table_name'])
                xmin = singer.get_bookmark(state, stream['tap_stream_id'],
                                           'xmin')
                if xmin:
                    LOGGER.info(
                        "Resuming Full Table replication %s from xmin %s",
                        nascent_stream_version, xmin)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {} where age(xmin::xid) <= age('{}'::xid)
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns), fq_table_name, xmin)
                else:
                    LOGGER.info("Beginning new Full Table replication %s",
                                nascent_stream_version)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns), fq_table_name)

                LOGGER.info("select %s with itersize %s", select_sql,
                            cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
                    xmin = rec['xmin']
                    rec = rec[:-1]
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    state = singer.write_bookmark(state,
                                                  stream['tap_stream_id'],
                                                  'xmin', xmin)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    # once we have completed the full table replication, discard the xmin bookmark.
    # the xmin bookmark only comes into play when a full table replication is interrupted
    state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', None)

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
Example #27
def sync_stream(client, stream, state):
    tap_stream_id = stream['tap_stream_id']

    common.COUNTS[tap_stream_id] = 0
    common.TIMES[tap_stream_id] = 0
    common.SCHEMA_COUNT[tap_stream_id] = 0
    common.SCHEMA_TIMES[tap_stream_id] = 0


    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')
    database_name = metadata.get(md_map, (), 'database-name')

    stream_projection = load_stream_projection(stream)

    # Emit a state message to indicate that we've started this stream
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, stream['tap_stream_id'])
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    write_schema_message(stream)
    common.SCHEMA_COUNT[tap_stream_id] += 1

    with metrics.job_timer('sync_table') as timer:
        timer.tags['database'] = database_name
        timer.tags['table'] = stream['table_name']

        if replication_method == 'LOG_BASED':
            if oplog.oplog_has_aged_out(client, state, tap_stream_id):
                # remove all state for stream
                # then it will do a full sync and start oplog again.
                LOGGER.info("Clearing state because Oplog has aged out")
                state.get('bookmarks', {}).pop(tap_stream_id)

            # make sure initial full table sync has been completed
            if not singer.get_bookmark(state, tap_stream_id, 'initial_full_table_complete'):
                msg = 'Must complete full table sync before starting oplog replication for %s'
                LOGGER.info(msg, tap_stream_id)

                # only mark current ts in oplog on first sync so tap has a
                # starting point after the full table sync
                if singer.get_bookmark(state, tap_stream_id, 'version') is None:
                    collection_oplog_ts = oplog.get_latest_ts(client)
                    oplog.update_bookmarks(state, tap_stream_id, collection_oplog_ts)

                full_table.sync_collection(client, stream, state, stream_projection)

            oplog.sync_collection(client, stream, state, stream_projection)

        elif replication_method == 'FULL_TABLE':
            full_table.sync_collection(client, stream, state, stream_projection)

        elif replication_method == 'INCREMENTAL':
            incremental.sync_collection(client, stream, state, stream_projection)
        else:
            raise Exception(
                "only FULL_TABLE, LOG_BASED, and INCREMENTAL replication "
                "methods are supported (you passed {})".format(replication_method))

    state = singer.set_currently_syncing(state, None)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
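clear_state_on_replication_change is called above but not shown. The sketch below is a hypothetical reading of what such a helper might do (the 'last_replication_method' bookmark key is an assumption): if the replication method recorded in state no longer matches the one selected in the catalog, the stream's bookmarks are dropped so the new method starts from scratch.

def clear_state_on_replication_change_sketch(stream, state):
    # Hypothetical sketch, not the tap's actual implementation.
    tap_stream_id = stream['tap_stream_id']
    md_map = metadata.to_map(stream['metadata'])
    selected_method = metadata.get(md_map, (), 'replication-method')

    previous_method = singer.get_bookmark(state, tap_stream_id,
                                          'last_replication_method')
    if previous_method is not None and previous_method != selected_method:
        state.get('bookmarks', {}).pop(tap_stream_id, None)

    return singer.write_bookmark(state, tap_stream_id,
                                 'last_replication_method', selected_method)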
Example #28
 def get_bookmark(self, state, name=None):
     name = self.name if not name else name
     return (singer.get_bookmark(
         state, name, self.replication_key)) or Context.config["start_date"]
Example #29
async def sync_catalog_entry(sf, catalog_entry, state):
    stream_version = get_stream_version(catalog_entry, state)
    stream = catalog_entry['stream']
    stream_alias = catalog_entry.get('stream_alias')
    stream_name = catalog_entry["tap_stream_id"]
    activate_version_message = singer.ActivateVersionMessage(
        stream=(stream_alias or stream), version=stream_version)

    catalog_metadata = metadata.to_map(catalog_entry['metadata'])
    replication_key = catalog_metadata.get((), {}).get('replication-key')

    mdata = metadata.to_map(catalog_entry['metadata'])

    if not stream_is_selected(mdata):
        LOGGER.info("%s: Skipping - not selected", stream_name)
        return

    LOGGER.info("%s: Starting", stream_name)

    singer.write_state(state)
    key_properties = metadata.to_map(catalog_entry['metadata']).get(
        (), {}).get('table-key-properties')
    singer.write_schema(stream, catalog_entry['schema'], key_properties,
                        replication_key, stream_alias)

    loop = asyncio.get_event_loop()

    job_id = singer.get_bookmark(state, catalog_entry['tap_stream_id'],
                                 'JobID')
    if job_id:
        with metrics.record_counter(stream) as counter:
            LOGGER.info(
                "Found JobID from previous Bulk Query. Resuming sync for job: %s",
                job_id)
            # Resuming a sync should clear out the remaining state once finished
            await loop.run_in_executor(None, resume_syncing_bulk_query, sf,
                                       catalog_entry, job_id, state, counter)
            LOGGER.info("Completed sync for %s", stream_name)
            state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'],
                                           {}).pop('JobID', None)
            state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'],
                                           {}).pop('BatchIDs', None)
            bookmark = state.get('bookmarks',
                                 {}).get(catalog_entry['tap_stream_id'],
                                         {}).pop('JobHighestBookmarkSeen',
                                                 None)
            state = singer.write_bookmark(state,
                                          catalog_entry['tap_stream_id'],
                                          replication_key, bookmark)
            singer.write_state(state)
    else:
        state_msg_threshold = CONFIG.get('state_message_threshold', 1000)

        # Tables with a replication_key or an empty bookmark will emit an
        # activate_version at the beginning of their sync
        bookmark_is_empty = state.get('bookmarks', {}).get(
            catalog_entry['tap_stream_id']) is None

        if replication_key or bookmark_is_empty:
            singer.write_message(activate_version_message)
            state = singer.write_bookmark(state,
                                          catalog_entry['tap_stream_id'],
                                          'version', stream_version)
        await loop.run_in_executor(None, sync_stream, sf, catalog_entry, state,
                                   state_msg_threshold)
        LOGGER.info("Completed sync for %s", stream_name)
Example #30
def sync_query(cursor, catalog_entry, state, select_sql, columns, stream_version, params, original_state_file=''):
    replication_key = singer.get_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'replication_key')

    query_string = cursor.mogrify(select_sql, params)

    time_extracted = utils.now()

    # Retry the query once if it fails (e.g. a MySQL 2013 "Lost connection" timeout).
    try:
        cursor.execute(select_sql, params)
    except Exception:
        LOGGER.info('Running %s', query_string)
        LOGGER.exception("An exception occurred; retrying the query once.")
        cursor.execute(select_sql, params)

    row = cursor.fetchone()
    rows_saved = 0

    database_name = get_database_name(catalog_entry)

    with metrics.record_counter(None) as counter:
        # Rename the table and stream to include the database name so it is included when sent to Stitch.
        catalog_entry.table = str(database_name) + '_' + str(catalog_entry.table)
        catalog_entry.stream = str(database_name) + '_' + str(catalog_entry.stream)

        counter.tags['database'] = database_name
        counter.tags['table'] = catalog_entry.table

        while row:
            counter.increment()
            rows_saved += 1
            record_message = row_to_singer_record(catalog_entry,
                                                  stream_version,
                                                  row,
                                                  columns,
                                                  time_extracted)
            singer.write_message(record_message)

            md_map = metadata.to_map(catalog_entry.metadata)
            stream_metadata = md_map.get((), {})
            replication_method = stream_metadata.get('replication-method')

            if replication_method in {'FULL_TABLE', 'LOG_BASED'}:
                key_properties = get_key_properties(catalog_entry)

                max_pk_values = singer.get_bookmark(state,
                                                    catalog_entry.tap_stream_id,
                                                    'max_pk_values')

                if max_pk_values:
                    last_pk_fetched = {k: v for k, v in record_message.record.items()
                                       if k in key_properties}

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'last_pk_fetched',
                                                  last_pk_fetched)

            elif replication_method == 'INCREMENTAL':
                if replication_key is not None:
                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'replication_key',
                                                  replication_key)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'replication_key_value',
                                                  record_message.record[replication_key])
            if rows_saved % 1000 == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

            row = cursor.fetchone()

    # Update and back up the state file so the current increment survives an error.
    if original_state_file != '':
        os.rename(original_state_file, original_state_file + '_backup')
        with open(original_state_file, 'w') as state_file:
            json.dump(copy.deepcopy(state), state_file)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
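The state-file backup above renames the live file and then rewrites it in place, so a crash between the rename and the dump leaves no current state file. A hedged alternative sketch (hypothetical helper, same file-path argument) writes to a temporary file in the same directory and swaps it in atomically with os.replace:

import json
import os
import tempfile

def write_state_file_atomically(state, state_path):
    # Sketch: dump to a temp file, then atomically replace the target so the
    # state file is never missing or half-written.
    directory = os.path.dirname(os.path.abspath(state_path))
    fd, tmp_path = tempfile.mkstemp(dir=directory, suffix='.tmp')
    try:
        with os.fdopen(fd, 'w') as tmp_file:
            json.dump(state, tmp_file)
        os.replace(tmp_path, state_path)
    except Exception:
        os.unlink(tmp_path)
        raise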