def do_sync_incremental(conn_config, stream, state, desired_columns, md_map):
    """Runs incremental sync"""
    replication_key = md_map.get((), {}).get('replication-key')
    LOGGER.info("Stream %s is using incremental replication with replication key %s",
                stream['tap_stream_id'], replication_key)

    stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'])
    illegal_bk_keys = set(stream_state.keys()).difference(
        {'replication_key', 'replication_key_value', 'version', 'last_replication_method'})
    if len(illegal_bk_keys) != 0:
        raise Exception(f"invalid keys found in state: {illegal_bk_keys}")

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'replication_key',
                                  replication_key)

    sync_common.send_schema_message(stream, [replication_key])
    state = incremental.sync_table(conn_config, stream, state, desired_columns, md_map)

    return state
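# For reference: metadata.to_map (singer-python) keys stream metadata by
# breadcrumb tuple, with stream-level entries under the empty tuple (), which
# is what md_map.get((), {}) reads above. A minimal sketch of that shape --
# the concrete keys come from this module, the values are hypothetical:
md_map_example = {
    (): {
        'replication-key': 'updated_at',     # read by do_sync_incremental
        'database-name': 'mydb',             # read by sync_traditional_stream
        'is-view': False,                    # read by do_sync_full_table
        'table-key-properties': ['id'],
    },
    ('properties', 'updated_at'): {'inclusion': 'automatic'},
    ('properties', 'name'): {'inclusion': 'available', 'selected': True},
}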
def do_sync_incremental(conn_config, stream, state, desired_columns, md_map):
    replication_key = md_map.get((), {}).get('replication-key')
    LOGGER.info("Stream %s is using incremental replication with replication key %s",
                stream['tap_stream_id'], replication_key)

    stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'])
    illegal_bk_keys = set(stream_state.keys()).difference(
        {'replication_key', 'replication_key_value', 'version', 'last_replication_method'})
    if len(illegal_bk_keys) != 0:
        raise Exception("invalid keys found in state: {}".format(illegal_bk_keys))

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'replication_key',
                                  replication_key)

    # Both branches of the original is-view check read the same metadata key,
    # so the conditional was redundant and has been collapsed.
    pks = md_map.get(()).get('table-key-properties')

    sync_common.send_schema_message(stream, pks)
    state = incremental.sync_table(conn_config, stream, state, desired_columns, md_map)

    return state
def do_sync_incremental(conn_config, stream, state, desired_columns, md_map):
    replication_key = md_map.get((), {}).get("replication-key")
    LOGGER.info(
        "Stream %s is using incremental replication with replication key %s",
        stream["tap_stream_id"],
        replication_key,
    )

    stream_state = state.get("bookmarks", {}).get(stream["tap_stream_id"])
    illegal_bk_keys = set(stream_state.keys()).difference(
        {"replication_key", "replication_key_value", "version", "last_replication_method"})
    if len(illegal_bk_keys) != 0:
        raise Exception("invalid keys found in state: {}".format(illegal_bk_keys))

    state = singer.write_bookmark(state, stream["tap_stream_id"], "replication_key",
                                  replication_key)

    sync_common.send_schema_message(stream, [replication_key])
    state = incremental.sync_table(conn_config, stream, state, desired_columns, md_map)

    return state
def consume_message(streams, state, msg, time_extracted, conn_info):
    payload = json.loads(msg.payload)
    lsn = msg.data_start

    streams_lookup = {}
    for s in streams:
        streams_lookup[s['tap_stream_id']] = s

    for c in payload['change']:
        tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'],
                                                      c['schema'], c['table'])
        if streams_lookup.get(tap_stream_id) is None:
            continue

        target_stream = streams_lookup[tap_stream_id]
        stream_version = get_stream_version(target_stream['tap_stream_id'], state)
        stream_md_map = metadata.to_map(target_stream['metadata'])

        # The insert and update branches were identical, so they are merged here.
        if c['kind'] in {'insert', 'update'}:
            col_vals = c['columnvalues'] + [None]
            col_names = c['columnnames'] + ['_sdc_deleted_at']
        elif c['kind'] == 'delete':
            col_names = c['oldkeys']['keynames'] + ['_sdc_deleted_at']
            col_vals = c['oldkeys']['keyvalues'] + [singer.utils.strftime(time_extracted)]
        else:
            raise Exception("unrecognized replication operation: {}".format(c['kind']))

        record_message = row_to_singer_message(target_stream, col_vals, stream_version,
                                               col_names, time_extracted, stream_md_map,
                                               conn_info)

        sync_common.send_schema_message(target_stream, ['lsn'])
        singer.write_message(record_message)
        state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn', lsn)

    LOGGER.debug("sending feedback to server with NO flush_lsn. just a keep-alive")
    msg.cursor.send_feedback()

    LOGGER.debug("sending feedback to server. flush_lsn = %s", msg.data_start)
    msg.cursor.send_feedback(flush_lsn=msg.data_start)

    return state
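# For context: the msg.payload parsed above is wal2json format-version 1
# output, a single JSON object with a "change" array. A representative
# payload -- the table and values are hypothetical:
payload_v1_example = {
    "change": [
        {
            "kind": "insert",
            "schema": "public",
            "table": "users",
            "columnnames": ["id", "name"],
            "columnvalues": [1, "alice"],
        },
        {
            "kind": "delete",
            "schema": "public",
            "table": "users",
            "oldkeys": {"keynames": ["id"], "keyvalues": [1]},
        },
    ]
}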
def do_sync_full_table(conn_config, stream, state, desired_columns, md_map):
    LOGGER.info("Stream %s is using full_table replication", stream['tap_stream_id'])
    sync_common.send_schema_message(stream, [])

    if md_map.get((), {}).get('is-view'):
        state = full_table.sync_view(conn_config, stream, state, desired_columns, md_map)
    else:
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)

    return state
def sync_traditional_stream(conn_config, stream, state, sync_method, end_lsn):
    """Sync INCREMENTAL and FULL_TABLE streams"""
    LOGGER.info("Beginning sync of stream(%s) with sync method(%s)",
                stream['tap_stream_id'], sync_method)

    md_map = metadata.to_map(stream['metadata'])
    conn_config['dbname'] = md_map.get(()).get('database-name')
    desired_columns = [
        c for c in stream['schema']['properties'].keys()
        if sync_common.should_sync_column(md_map, c)
    ]
    desired_columns.sort()

    if len(desired_columns) == 0:
        LOGGER.warning('There are no columns selected for stream %s, skipping it',
                       stream['tap_stream_id'])
        return state

    register_type_adapters(conn_config)

    if sync_method == 'full':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        state = do_sync_full_table(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == 'incremental':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        state = do_sync_incremental(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == 'logical_initial':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        LOGGER.info("Performing initial full table sync")
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'lsn', end_lsn)

        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', None)
    elif sync_method == 'logical_initial_interrupted':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        LOGGER.info("Initial stage of full table sync was interrupted. resuming...")
        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
    else:
        raise Exception(
            f"unknown sync method {sync_method} for stream {stream['tap_stream_id']}")

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    return state
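# The bookmark keys validated in do_sync_incremental and written by the sync
# methods above give the emitted STATE message roughly this shape. A sketch
# with hypothetical stream ids and values:
state_example = {
    "currently_syncing": "mydb-public-users",
    "bookmarks": {
        "mydb-public-users": {
            "last_replication_method": "INCREMENTAL",
            "replication_key": "updated_at",
            "replication_key_value": "2021-01-01T00:00:00+00:00",
            "version": 1612345678901,
        },
        "mydb-public-orders": {
            "last_replication_method": "LOG_BASED",
            "lsn": 108240943373024,  # integer LSN, as written by the logical sync
            "version": 1612345678902,
        },
    },
}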
def sync_traditional_stream(conn_config, stream, state, sync_method, end_lsn):
    LOGGER.info("Beginning sync of stream(%s) with sync method(%s)",
                stream["tap_stream_id"], sync_method)

    md_map = metadata.to_map(stream["metadata"])
    conn_config["dbname"] = md_map.get(()).get("database-name")
    desired_columns = [
        c for c in stream["schema"]["properties"].keys()
        if sync_common.should_sync_column(md_map, c)
    ]
    desired_columns.sort()

    if len(desired_columns) == 0:
        LOGGER.warning("There are no columns selected for stream %s, skipping it",
                       stream["tap_stream_id"])
        return state

    register_type_adapters(conn_config)

    if sync_method == "full":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        state = do_sync_full_table(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == "incremental":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        state = do_sync_incremental(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == "logical_initial":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        LOGGER.info("Performing initial full table sync")
        state = singer.write_bookmark(state, stream["tap_stream_id"], "lsn", end_lsn)

        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
        state = singer.write_bookmark(state, stream["tap_stream_id"], "xmin", None)
    elif sync_method == "logical_initial_interrupted":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        LOGGER.info("Initial stage of full table sync was interrupted. resuming...")
        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
    else:
        raise Exception("unknown sync method {} for stream {}".format(
            sync_method, stream["tap_stream_id"]))

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    return state
def sync_tables(conn_info, logical_streams, state, end_lsn):
    start_lsn = min([get_bookmark(state, s['tap_stream_id'], 'lsn')
                     for s in logical_streams])
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    last_lsn_processed = None
    # we are willing to poll for a total of 30 minutes without finding a record
    poll_total_seconds = conn_info['logical_poll_total_seconds'] or 60 * 30
    keep_alive_time = 10.0
    begin_ts = datetime.datetime.now()

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    with post_db.open_connection(conn_info, True) as conn:
        with conn.cursor() as cur:
            LOGGER.info("Starting Logical Replication for %s(%s): %s -> %s. "
                        "poll_total_seconds: %s",
                        list(map(lambda s: s['tap_stream_id'], logical_streams)),
                        slot, start_lsn, end_lsn, poll_total_seconds)
            try:
                cur.start_replication(slot_name=slot, decode=True, start_lsn=start_lsn)
            except psycopg2.ProgrammingError:
                raise Exception(
                    "unable to start replication with logical replication slot {}".format(slot))

            rows_saved = 0
            while True:
                poll_duration = (datetime.datetime.now() - begin_ts).total_seconds()
                if poll_duration > poll_total_seconds:
                    LOGGER.info("breaking after %s seconds of polling with no data",
                                poll_duration)
                    break

                msg = cur.read_message()
                if msg:
                    begin_ts = datetime.datetime.now()
                    if msg.data_start > end_lsn:
                        LOGGER.info("gone past end_lsn %s for run. breaking", end_lsn)
                        break

                    state = consume_message(logical_streams, state, msg, time_extracted,
                                            conn_info, end_lsn)

                    # msg has been consumed. it has been processed
                    last_lsn_processed = msg.data_start
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                else:
                    now = datetime.datetime.now()
                    timeout = keep_alive_time - (now - cur.io_timestamp).total_seconds()
                    try:
                        sel = select([cur], [], [], max(0, timeout))
                        if not any(sel):
                            LOGGER.info("no data for %s seconds. sending feedback to server "
                                        "with NO flush_lsn. just a keep-alive", timeout)
                            cur.send_feedback()
                    except InterruptedError:
                        pass  # recalculate timeout and continue

    if last_lsn_processed:
        for s in logical_streams:
            LOGGER.info("updating bookmark for stream %s to last_lsn_processed %s",
                        s['tap_stream_id'], last_lsn_processed)
            state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn',
                                          last_lsn_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
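# The idle branch above waits on the replication socket with the standard
# library's select() and uses psycopg2's ReplicationCursor.io_timestamp to
# size the timeout. This snippet assumes a module-level import along the
# lines of:
from select import select  # select([cur], [], [], timeout) blocks until the socket is readable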
def sync_tables(conn_info, logical_streams, state, end_lsn, state_file):
    state_comitted = state
    lsn_comitted = min([get_bookmark(state_comitted, s['tap_stream_id'], 'lsn')
                        for s in logical_streams])
    start_lsn = lsn_comitted
    lsn_to_flush = None
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    lsn_last_processed = None
    lsn_currently_processing = None
    lsn_received_timestamp = None
    lsn_processed_count = 0
    start_run_timestamp = datetime.datetime.utcnow()
    max_run_seconds = conn_info['max_run_seconds']
    break_at_end_lsn = conn_info['break_at_end_lsn']
    logical_poll_total_seconds = conn_info['logical_poll_total_seconds'] or 10800  # 3 hours
    poll_interval = 10
    poll_timestamp = None

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    version = get_pg_version(conn_info)

    # Create replication connection and cursor
    conn = post_db.open_connection(conn_info, True, True)
    cur = conn.cursor()

    # Set session wal_sender_timeout for PG12 and above
    if version >= 120000:
        wal_sender_timeout = 10800000  # 10800000ms = 3 hours
        LOGGER.info('Set session wal_sender_timeout = %i milliseconds', wal_sender_timeout)
        cur.execute(f"SET SESSION wal_sender_timeout = {wal_sender_timeout}")

    try:
        LOGGER.info('Request wal streaming from %s to %s (slot %s)',
                    int_to_lsn(start_lsn), int_to_lsn(end_lsn), slot)
        # psycopg2 2.8.4 will send a keep-alive message to postgres every status_interval
        cur.start_replication(slot_name=slot,
                              decode=True,
                              start_lsn=start_lsn,
                              status_interval=poll_interval,
                              options={
                                  'write-in-chunks': 1,
                                  'add-tables': streams_to_wal2json_tables(logical_streams)
                              })
    except psycopg2.ProgrammingError as ex:
        raise Exception(
            f"Unable to start replication with logical replication (slot {ex})") from ex

    lsn_received_timestamp = datetime.datetime.utcnow()
    poll_timestamp = datetime.datetime.utcnow()

    try:
        while True:
            # Disconnect when no data received for logical_poll_total_seconds
            # needs to be long enough to wait for the largest single wal payload
            # to avoid unplanned timeouts
            poll_duration = (datetime.datetime.utcnow() -
                             lsn_received_timestamp).total_seconds()
            if poll_duration > logical_poll_total_seconds:
                LOGGER.info('Breaking - %i seconds of polling with no data', poll_duration)
                break

            if datetime.datetime.utcnow() >= (start_run_timestamp +
                                              datetime.timedelta(seconds=max_run_seconds)):
                LOGGER.info('Breaking - reached max_run_seconds of %i', max_run_seconds)
                break

            try:
                msg = cur.read_message()
            except Exception as e:
                LOGGER.error(e)
                raise

            if msg:
                if break_at_end_lsn and msg.data_start > end_lsn:
                    LOGGER.info('Breaking - latest wal message %s is past end_lsn %s',
                                int_to_lsn(msg.data_start), int_to_lsn(end_lsn))
                    break

                state = consume_message(logical_streams, state, msg, time_extracted,
                                        conn_info)

                # When using wal2json with write-in-chunks, multiple messages can have
                # the same lsn. This is to ensure we only flush to lsn that has
                # completed entirely.
                if lsn_currently_processing is None:
                    lsn_currently_processing = msg.data_start
                    LOGGER.info('First wal message received is %s',
                                int_to_lsn(lsn_currently_processing))

                    # Flush Postgres wal up to lsn comitted in previous run,
                    # or first lsn received in this run
                    lsn_to_flush = lsn_comitted
                    if lsn_currently_processing < lsn_to_flush:
                        lsn_to_flush = lsn_currently_processing
                    LOGGER.info('Confirming write up to %s, flush to %s',
                                int_to_lsn(lsn_to_flush), int_to_lsn(lsn_to_flush))
                    cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush,
                                      reply=True, force=True)

                elif int(msg.data_start) > lsn_currently_processing:
                    lsn_last_processed = lsn_currently_processing
                    lsn_currently_processing = msg.data_start

                lsn_received_timestamp = datetime.datetime.utcnow()
                lsn_processed_count = lsn_processed_count + 1
                if lsn_processed_count >= UPDATE_BOOKMARK_PERIOD:
                    LOGGER.debug('Updating bookmarks for all streams to lsn = %s (%s)',
                                 lsn_last_processed, int_to_lsn(lsn_last_processed))
                    for s in logical_streams:
                        state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn',
                                                      lsn_last_processed)
                    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                    lsn_processed_count = 0
            else:
                try:
                    # Wait for a second unless a message arrives
                    select([cur], [], [], 1)
                except InterruptedError:
                    pass

            # Every poll_interval, update latest comitted lsn position from the state_file
            if datetime.datetime.utcnow() >= (poll_timestamp +
                                              datetime.timedelta(seconds=poll_interval)):
                if lsn_currently_processing is None:
                    LOGGER.info('Waiting for first wal message')
                else:
                    LOGGER.info('Latest wal message received was %s',
                                int_to_lsn(lsn_last_processed))
                    try:
                        with open(state_file, mode="r", encoding="utf-8") as fh:
                            state_comitted = json.load(fh)
                    except Exception:
                        LOGGER.debug('Unable to open and parse %s', state_file)
                    finally:
                        lsn_comitted = min([
                            get_bookmark(state_comitted, s['tap_stream_id'], 'lsn')
                            for s in logical_streams
                        ])
                        if (lsn_currently_processing > lsn_comitted) and \
                                (lsn_comitted > lsn_to_flush):
                            lsn_to_flush = lsn_comitted
                            LOGGER.info('Confirming write up to %s, flush to %s',
                                        int_to_lsn(lsn_to_flush), int_to_lsn(lsn_to_flush))
                            cur.send_feedback(write_lsn=lsn_to_flush,
                                              flush_lsn=lsn_to_flush,
                                              reply=True, force=True)

                poll_timestamp = datetime.datetime.utcnow()

        # Close replication connection and cursor
        cur.close()
        conn.close()
    finally:
        if lsn_last_processed:
            if lsn_comitted > lsn_last_processed:
                lsn_last_processed = lsn_comitted
                LOGGER.info('Current lsn_last_processed %s is older than lsn_comitted %s',
                            int_to_lsn(lsn_last_processed), int_to_lsn(lsn_comitted))

            LOGGER.info('Updating bookmarks for all streams to lsn = %s (%s)',
                        lsn_last_processed, int_to_lsn(lsn_last_processed))

            for s in logical_streams:
                state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn',
                                              lsn_last_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
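# Two helpers used above, int_to_lsn and streams_to_wal2json_tables, are not
# part of this excerpt. Plausible sketches follow, assuming the usual Postgres
# X/Y LSN notation and wal2json's comma-separated add-tables filter; the real
# streams_to_wal2json_tables may additionally escape special characters in
# identifiers.
def int_to_lsn(lsni):
    """Render an integer LSN in Postgres' X/Y hex notation (sketch)."""
    if lsni is None:
        return None
    # High 32 bits are the wal file part, low 32 bits the offset within it
    return f"{lsni >> 32:X}/{lsni & 0xFFFFFFFF:X}"


def streams_to_wal2json_tables(streams):
    """Build the comma-separated schema.table list wal2json expects (sketch)."""
    return ','.join(
        f"{s['metadata'][0]['metadata']['schema-name']}.{s['table_name']}"
        for s in streams)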
def consume_message(streams, state, msg, time_extracted, conn_info):
    # Strip leading comma generated by write-in-chunks and parse valid JSON
    try:
        payload = json.loads(msg.payload.lstrip(','))
    except Exception:
        return state

    lsn = msg.data_start

    streams_lookup = {s['tap_stream_id']: s for s in streams}

    tap_stream_id = post_db.compute_tap_stream_id(payload['schema'], payload['table'])
    if streams_lookup.get(tap_stream_id) is None:
        return state

    target_stream = streams_lookup[tap_stream_id]

    if payload['kind'] not in {'insert', 'update', 'delete'}:
        raise UnsupportedPayloadKindError(
            f"unrecognized replication operation: {payload['kind']}")

    # Get the additional fields in payload that are not in schema properties:
    # only inserts and updates have the list of columns that can be used to
    # detect any difference in columns
    diff = set()
    if payload['kind'] in {'insert', 'update'}:
        diff = set(payload['columnnames']).difference(
            target_stream['schema']['properties'].keys())

    # if there are new columns in the payload that are not in the schema
    # properties then refresh the stream schema
    if diff:
        LOGGER.info('Detected new columns "%s", refreshing schema of stream %s',
                    diff, target_stream['stream'])
        # encountered a column that is not in the schema
        # refresh the stream schema and metadata by running discovery
        refresh_streams_schema(conn_info, [target_stream])

        # add the automatic properties back to the stream
        add_automatic_properties(target_stream, conn_info.get('debug_lsn', False))

        # publish new schema
        sync_common.send_schema_message(target_stream, ['lsn'])

    stream_version = get_stream_version(target_stream['tap_stream_id'], state)
    stream_md_map = metadata.to_map(target_stream['metadata'])

    desired_columns = {
        c for c in target_stream['schema']['properties'].keys()
        if sync_common.should_sync_column(stream_md_map, c)
    }

    if payload['kind'] in {'insert', 'update'}:
        col_names = []
        col_vals = []

        for idx, col in enumerate(payload['columnnames']):
            if col in desired_columns:
                col_names.append(col)
                col_vals.append(payload['columnvalues'][idx])

        col_names.append('_sdc_deleted_at')
        col_vals.append(None)

        if conn_info.get('debug_lsn'):
            col_names.append('_sdc_lsn')
            col_vals.append(str(lsn))

        record_message = row_to_singer_message(target_stream, col_vals, stream_version,
                                               col_names, time_extracted, stream_md_map,
                                               conn_info)
    elif payload['kind'] == 'delete':
        col_names = []
        col_vals = []

        for idx, col in enumerate(payload['oldkeys']['keynames']):
            if col in desired_columns:
                col_names.append(col)
                col_vals.append(payload['oldkeys']['keyvalues'][idx])

        col_names.append('_sdc_deleted_at')
        col_vals.append(singer.utils.strftime(time_extracted))

        if conn_info.get('debug_lsn'):
            col_names.append('_sdc_lsn')
            col_vals.append(str(lsn))

        record_message = row_to_singer_message(target_stream, col_vals, stream_version,
                                               col_names, time_extracted, stream_md_map,
                                               conn_info)

    singer.write_message(record_message)
    state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn', lsn)

    return state
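# Unlike the format-version 1 variant earlier, this consume_message handles
# wal2json's write-in-chunks output, where each message carries a single
# change object with kind/schema/table at the top level (chunks that are only
# array delimiters fail to parse and are skipped by the try/except above).
# A representative payload with hypothetical values:
payload_chunked_example = {
    "kind": "update",
    "schema": "public",
    "table": "users",
    "columnnames": ["id", "name", "updated_at"],
    "columnvalues": [1, "alice", "2021-01-01 00:00:00+00"],
}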
def sync_tables(conn_info, logical_streams, state, end_lsn):
    start_lsn = 0
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    last_lsn_processed = None
    # we are willing to poll for a total of 5 minutes without finding a record
    poll_total_seconds = conn_info['logical_poll_total_seconds'] or 60 * 5
    keep_alive_time = 0.1
    begin_ts = datetime.datetime.now()
    add_tables = []

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    for s in logical_streams:
        add_tables.append("{}.{}".format(
            s["metadata"][0]["metadata"]["schema-name"], s["table_name"]))

    if "lsn_to_flush" in state:
        start_lsn = state["lsn_to_flush"]

    conn = post_db.open_connection(conn_info, True)
    with conn.cursor() as cur:
        LOGGER.info("Starting Logical Replication from slot %s: %s -> %s. "
                    "poll_total_seconds: %s",
                    slot, start_lsn, end_lsn, poll_total_seconds)
        LOGGER.info("Starting at bookmark: %s",
                    list(map(lambda s: s['tap_stream_id'], logical_streams)))

        replication_params = {
            "slot_name": slot,
            "decode": True,
            "start_lsn": start_lsn,
            "options": {
                "add-tables": ", ".join(add_tables)
            }
        }
        message_format = conn_info.get("wal2json_message_format") or "1"
        if message_format == "2":
            LOGGER.info("Using wal2json format-version 2")
            replication_params["options"]["format-version"] = 2
            replication_params["options"]["include-timestamp"] = True

        try:
            cur.start_replication(**replication_params)
        except psycopg2.ProgrammingError:
            raise Exception(
                "unable to start replication with logical replication slot {}".format(slot))

        # initial flush lsn from the previous run
        if "lsn_to_flush" in state:
            LOGGER.info("Flushing lsn %s from previous job run!",
                        str(state["lsn_to_flush"]))
            cur.send_feedback(flush_lsn=state["lsn_to_flush"])

        rows_saved = 0
        idle_count = 0
        while True:
            poll_duration = (datetime.datetime.now() - begin_ts).total_seconds()
            if poll_duration > poll_total_seconds:
                LOGGER.info("breaking after %s seconds of polling with no data",
                            poll_duration)
                if not last_lsn_processed:
                    state["lsn_to_flush"] = end_lsn
                break

            read_message_start = datetime.datetime.now()
            msg = cur.read_message()
            COUNTER['read_message'] += (datetime.datetime.now() -
                                        read_message_start).total_seconds()
            if msg:
                idle_count = 0
                begin_ts = datetime.datetime.now()
                if msg.data_start > end_lsn:
                    LOGGER.info("gone past end_lsn %s for run. breaking", end_lsn)
                    break

                state = consume_message(logical_streams, state, msg, time_extracted,
                                        conn_info, end_lsn,
                                        message_format=message_format)

                # msg has been consumed. it has been processed
                last_lsn_processed = msg.data_start
                rows_saved = rows_saved + 1
                if rows_saved % COUNTER_PRINT_PERIOD == 0:
                    LOGGER.info("Rows saved = %s, Processed messages counter: %s",
                                str(rows_saved), str(COUNTER))
                if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug("Sending state to loader: %s", str(state))
                    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
            else:
                idle_count += 1
                if idle_count > 100:
                    idle_count = 0
                    tmp_poll_duration = (datetime.datetime.now() -
                                         begin_ts).total_seconds()
                    LOGGER.info("No data for ~10 seconds (%s seconds from start). "
                                "sending feedback to server with NO flush_lsn. "
                                "just a keep-alive", tmp_poll_duration)
                    cur.send_feedback()
                else:
                    time.sleep(keep_alive_time)

    bookmark_lsn = last_lsn_processed if last_lsn_processed else end_lsn
    LOGGER.info("Finished processing messages - counter: %s", str(COUNTER))
    for s in logical_streams:
        LOGGER.info("updating bookmark for stream %s to last_lsn_processed %s",
                    s['tap_stream_id'], bookmark_lsn)
        state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn', bookmark_lsn)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
def sync_tables(conn_info, logical_streams, state, end_lsn, state_file):
    lsn_comitted = min([get_bookmark(state, s['tap_stream_id'], 'lsn')
                        for s in logical_streams])
    start_lsn = lsn_comitted
    lsn_to_flush = None
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    lsn_last_processed = None
    lsn_currently_processing = None
    lsn_received_timestamp = None
    lsn_processed_count = 0
    logical_poll_total_seconds = conn_info['logical_poll_total_seconds'] or 300
    poll_interval = 10
    poll_timestamp = None

    selected_tables = []
    for s in logical_streams:
        selected_tables.append("{}.{}".format(
            s['metadata'][0]['metadata']['schema-name'], s['table_name']))

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    # Create replication connection and cursor
    conn = post_db.open_connection(conn_info, True)
    cur = conn.cursor()

    try:
        LOGGER.info("{} : Starting log streaming at {} to {} (slot {})".format(
            datetime.datetime.utcnow(), int_to_lsn(start_lsn),
            int_to_lsn(end_lsn), slot))
        cur.start_replication(slot_name=slot,
                              decode=True,
                              start_lsn=start_lsn,
                              options={
                                  'write-in-chunks': 1,
                                  'add-tables': ','.join(selected_tables)
                              })
    except psycopg2.ProgrammingError:
        raise Exception(
            "Unable to start replication with logical replication (slot {})".format(slot))

    # Emulate some behaviour of pg_recvlogical
    LOGGER.info("{} : Confirming write up to 0/0, flush to 0/0".format(
        datetime.datetime.utcnow()))
    cur.send_feedback(write_lsn=0, flush_lsn=0, reply=True)
    time.sleep(1)

    lsn_received_timestamp = datetime.datetime.utcnow()
    poll_timestamp = datetime.datetime.utcnow()

    while True:
        # Disconnect when no data received for logical_poll_total_seconds
        # needs to be long enough to wait for the largest single wal payload
        # to avoid unplanned timeouts
        poll_duration = (datetime.datetime.utcnow() -
                         lsn_received_timestamp).total_seconds()
        if poll_duration > logical_poll_total_seconds:
            LOGGER.info("{} : Breaking - {} seconds of polling with no data".format(
                datetime.datetime.utcnow(), poll_duration))
            break

        try:
            msg = cur.read_message()
        except Exception as e:
            LOGGER.error("{} : {}".format(datetime.datetime.utcnow(), e))
            raise

        if msg:
            if msg.data_start > end_lsn:
                LOGGER.info("{} : Breaking - current {} is past end_lsn {}".format(
                    datetime.datetime.utcnow(), int_to_lsn(msg.data_start),
                    int_to_lsn(end_lsn)))
                break

            state = consume_message(logical_streams, state, msg, time_extracted,
                                    conn_info, end_lsn)

            # When using wal2json with write-in-chunks, multiple messages can have
            # the same lsn. This is to ensure we only flush to lsn that has
            # completed entirely.
            if lsn_currently_processing is None:
                lsn_currently_processing = msg.data_start
                LOGGER.info("{} : First message received is {} at {}".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_currently_processing),
                    datetime.datetime.utcnow()))

                # Flush Postgres wal up to lsn comitted in previous run, or first
                # lsn received in this run
                lsn_to_flush = lsn_comitted
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info("{} : Confirming write up to {}, flush to {}".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_to_flush),
                    int_to_lsn(lsn_to_flush)))
                cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush,
                                  reply=True)
            elif int(msg.data_start) > lsn_currently_processing:
                lsn_last_processed = lsn_currently_processing
                lsn_currently_processing = msg.data_start

            lsn_received_timestamp = datetime.datetime.utcnow()
            lsn_processed_count = lsn_processed_count + 1
            if lsn_processed_count >= UPDATE_BOOKMARK_PERIOD:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                lsn_processed_count = 0

        # When data is received, and when data is not received, a keep-alive poll
        # needs to be returned to PostgreSQL
        if datetime.datetime.utcnow() >= (poll_timestamp +
                                          datetime.timedelta(seconds=poll_interval)):
            if lsn_currently_processing is None:
                LOGGER.info("{} : Sending keep-alive message to source server "
                            "(last message received was {} at {})".format(
                                datetime.datetime.utcnow(),
                                int_to_lsn(lsn_last_processed),
                                lsn_received_timestamp))
                cur.send_feedback()
            elif state_file is None:
                LOGGER.info("{} : Sending keep-alive message to source server "
                            "(last message received was {} at {})".format(
                                datetime.datetime.utcnow(),
                                int_to_lsn(lsn_last_processed),
                                lsn_received_timestamp))
                cur.send_feedback()
            else:
                # Read lsn_comitted currently captured in state file on disk
                lsn_comitted = min([
                    get_bookmark(utils.load_json(state_file), s['tap_stream_id'], 'lsn')
                    for s in logical_streams
                ])
                lsn_to_flush = lsn_comitted
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info("{} : Confirming write up to {}, flush to {} "
                            "(last message received was {} at {})".format(
                                datetime.datetime.utcnow(), int_to_lsn(lsn_to_flush),
                                int_to_lsn(lsn_to_flush),
                                int_to_lsn(lsn_last_processed),
                                lsn_received_timestamp))
                cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush,
                                  reply=True)

            poll_timestamp = datetime.datetime.utcnow()

    # Close replication connection and cursor
    cur.close()
    conn.close()

    if lsn_last_processed:
        if lsn_comitted > lsn_last_processed:
            lsn_last_processed = lsn_comitted
            LOGGER.info("Current lsn_last_processed {} is older than lsn_comitted {}".format(
                int_to_lsn(lsn_last_processed), int_to_lsn(lsn_comitted)))

        for s in logical_streams:
            LOGGER.info("updating bookmark for stream {} to lsn = {} ({})".format(
                s['tap_stream_id'], lsn_last_processed,
                int_to_lsn(lsn_last_processed)))
            state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn',
                                          lsn_last_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state