def do_sync_incremental(conn_config, stream, state, desired_columns, md_map):
    """Runs incremental sync"""
    replication_key = md_map.get((), {}).get('replication-key')
    LOGGER.info("Stream %s is using incremental replication with replication key %s",
                stream['tap_stream_id'], replication_key)

    stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'])
    illegal_bk_keys = set(stream_state.keys()).difference(
        {'replication_key', 'replication_key_value', 'version', 'last_replication_method'})
    if len(illegal_bk_keys) != 0:
        raise Exception(f"invalid keys found in state: {illegal_bk_keys}")

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'replication_key',
                                  replication_key)

    sync_common.send_schema_message(stream, [replication_key])
    state = incremental.sync_table(conn_config, stream, state, desired_columns, md_map)

    return state
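# For reference: metadata.to_map (singer-python) keys stream metadata by
# breadcrumb tuple, with stream-level entries under the empty tuple (), which
# is what md_map.get((), {}) reads above. A minimal sketch of that shape --
# the concrete keys come from this module, the values are hypothetical:
md_map_example = {
    (): {
        'replication-key': 'updated_at',     # read by do_sync_incremental
        'database-name': 'mydb',             # read by sync_traditional_stream
        'is-view': False,                    # read by do_sync_full_table
        'table-key-properties': ['id'],
    },
    ('properties', 'updated_at'): {'inclusion': 'automatic'},
    ('properties', 'name'): {'inclusion': 'available', 'selected': True},
}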
def do_sync_incremental(conn_config, stream, state, desired_columns, md_map):
    replication_key = md_map.get((), {}).get('replication-key')
    LOGGER.info("Stream %s is using incremental replication with replication key %s",
                stream['tap_stream_id'], replication_key)

    stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'])
    illegal_bk_keys = set(stream_state.keys()).difference(
        {'replication_key', 'replication_key_value', 'version', 'last_replication_method'})
    if len(illegal_bk_keys) != 0:
        raise Exception("invalid keys found in state: {}".format(illegal_bk_keys))

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'replication_key',
                                  replication_key)

    # Both branches of the original is-view check read the same metadata key,
    # so the conditional was redundant and has been collapsed.
    pks = md_map.get(()).get('table-key-properties')

    sync_common.send_schema_message(stream, pks)
    state = incremental.sync_table(conn_config, stream, state, desired_columns, md_map)

    return state
def do_sync_incremental(conn_config, stream, state, desired_columns, md_map):
    replication_key = md_map.get((), {}).get("replication-key")
    LOGGER.info(
        "Stream %s is using incremental replication with replication key %s",
        stream["tap_stream_id"],
        replication_key,
    )

    stream_state = state.get("bookmarks", {}).get(stream["tap_stream_id"])
    illegal_bk_keys = set(stream_state.keys()).difference(
        {"replication_key", "replication_key_value", "version", "last_replication_method"})
    if len(illegal_bk_keys) != 0:
        raise Exception("invalid keys found in state: {}".format(illegal_bk_keys))

    state = singer.write_bookmark(state, stream["tap_stream_id"], "replication_key",
                                  replication_key)

    sync_common.send_schema_message(stream, [replication_key])
    state = incremental.sync_table(conn_config, stream, state, desired_columns, md_map)

    return state
def consume_message(streams, state, msg, time_extracted, conn_info):
    payload = json.loads(msg.payload)
    lsn = msg.data_start

    streams_lookup = {}
    for s in streams:
        streams_lookup[s['tap_stream_id']] = s

    for c in payload['change']:
        tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'],
                                                      c['schema'], c['table'])
        if streams_lookup.get(tap_stream_id) is None:
            continue

        target_stream = streams_lookup[tap_stream_id]
        stream_version = get_stream_version(target_stream['tap_stream_id'], state)
        stream_md_map = metadata.to_map(target_stream['metadata'])

        # The insert and update branches were identical, so they are merged here.
        if c['kind'] in {'insert', 'update'}:
            col_vals = c['columnvalues'] + [None]
            col_names = c['columnnames'] + ['_sdc_deleted_at']
        elif c['kind'] == 'delete':
            col_names = c['oldkeys']['keynames'] + ['_sdc_deleted_at']
            col_vals = c['oldkeys']['keyvalues'] + [singer.utils.strftime(time_extracted)]
        else:
            raise Exception("unrecognized replication operation: {}".format(c['kind']))

        record_message = row_to_singer_message(target_stream, col_vals, stream_version,
                                               col_names, time_extracted, stream_md_map,
                                               conn_info)

        sync_common.send_schema_message(target_stream, ['lsn'])
        singer.write_message(record_message)
        state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn', lsn)

    LOGGER.debug("sending feedback to server with NO flush_lsn. just a keep-alive")
    msg.cursor.send_feedback()

    LOGGER.debug("sending feedback to server. flush_lsn = %s", msg.data_start)
    msg.cursor.send_feedback(flush_lsn=msg.data_start)

    return state
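# For context: the msg.payload parsed above is wal2json format-version 1
# output, a single JSON object with a "change" array. A representative
# payload -- the table and values are hypothetical:
payload_v1_example = {
    "change": [
        {
            "kind": "insert",
            "schema": "public",
            "table": "users",
            "columnnames": ["id", "name"],
            "columnvalues": [1, "alice"],
        },
        {
            "kind": "delete",
            "schema": "public",
            "table": "users",
            "oldkeys": {"keynames": ["id"], "keyvalues": [1]},
        },
    ]
}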
def do_sync_full_table(conn_config, stream, state, desired_columns, md_map):
    LOGGER.info("Stream %s is using full_table replication", stream['tap_stream_id'])
    sync_common.send_schema_message(stream, [])

    if md_map.get((), {}).get('is-view'):
        state = full_table.sync_view(conn_config, stream, state, desired_columns, md_map)
    else:
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)

    return state
def sync_traditional_stream(conn_config, stream, state, sync_method, end_lsn):
    """Sync INCREMENTAL and FULL_TABLE streams"""
    LOGGER.info("Beginning sync of stream(%s) with sync method(%s)",
                stream['tap_stream_id'], sync_method)

    md_map = metadata.to_map(stream['metadata'])
    conn_config['dbname'] = md_map.get(()).get('database-name')
    desired_columns = [
        c for c in stream['schema']['properties'].keys()
        if sync_common.should_sync_column(md_map, c)
    ]
    desired_columns.sort()

    if len(desired_columns) == 0:
        LOGGER.warning('There are no columns selected for stream %s, skipping it',
                       stream['tap_stream_id'])
        return state

    register_type_adapters(conn_config)

    if sync_method == 'full':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        state = do_sync_full_table(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == 'incremental':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        state = do_sync_incremental(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == 'logical_initial':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        LOGGER.info("Performing initial full table sync")
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'lsn', end_lsn)

        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', None)
    elif sync_method == 'logical_initial_interrupted':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        LOGGER.info("Initial stage of full table sync was interrupted. resuming...")
        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
    else:
        raise Exception(
            f"unknown sync method {sync_method} for stream {stream['tap_stream_id']}")

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    return state
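# The bookmark keys validated in do_sync_incremental and written by the sync
# methods above give the emitted STATE message roughly this shape. A sketch
# with hypothetical stream ids and values:
state_example = {
    "currently_syncing": "mydb-public-users",
    "bookmarks": {
        "mydb-public-users": {
            "last_replication_method": "INCREMENTAL",
            "replication_key": "updated_at",
            "replication_key_value": "2021-01-01T00:00:00+00:00",
            "version": 1612345678901,
        },
        "mydb-public-orders": {
            "last_replication_method": "LOG_BASED",
            "lsn": 108240943373024,  # integer LSN, as written by the logical sync
            "version": 1612345678902,
        },
    },
}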
def sync_traditional_stream(conn_config, stream, state, sync_method, end_lsn):
    LOGGER.info("Beginning sync of stream(%s) with sync method(%s)",
                stream["tap_stream_id"], sync_method)

    md_map = metadata.to_map(stream["metadata"])
    conn_config["dbname"] = md_map.get(()).get("database-name")
    desired_columns = [
        c for c in stream["schema"]["properties"].keys()
        if sync_common.should_sync_column(md_map, c)
    ]
    desired_columns.sort()

    if len(desired_columns) == 0:
        LOGGER.warning("There are no columns selected for stream %s, skipping it",
                       stream["tap_stream_id"])
        return state

    register_type_adapters(conn_config)

    if sync_method == "full":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        state = do_sync_full_table(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == "incremental":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        state = do_sync_incremental(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == "logical_initial":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        LOGGER.info("Performing initial full table sync")
        state = singer.write_bookmark(state, stream["tap_stream_id"], "lsn", end_lsn)

        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
        state = singer.write_bookmark(state, stream["tap_stream_id"], "xmin", None)
    elif sync_method == "logical_initial_interrupted":
        state = singer.set_currently_syncing(state, stream["tap_stream_id"])
        LOGGER.info("Initial stage of full table sync was interrupted. resuming...")
        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
    else:
        raise Exception("unknown sync method {} for stream {}".format(
            sync_method, stream["tap_stream_id"]))

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    return state
def sync_tables(conn_info, logical_streams, state, end_lsn):
    start_lsn = min([get_bookmark(state, s['tap_stream_id'], 'lsn')
                     for s in logical_streams])
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    last_lsn_processed = None
    # we are willing to poll for a total of 30 minutes without finding a record
    poll_total_seconds = conn_info['logical_poll_total_seconds'] or 60 * 30
    keep_alive_time = 10.0
    begin_ts = datetime.datetime.now()

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    with post_db.open_connection(conn_info, True) as conn:
        with conn.cursor() as cur:
            LOGGER.info("Starting Logical Replication for %s(%s): %s -> %s. "
                        "poll_total_seconds: %s",
                        list(map(lambda s: s['tap_stream_id'], logical_streams)),
                        slot, start_lsn, end_lsn, poll_total_seconds)
            try:
                cur.start_replication(slot_name=slot, decode=True, start_lsn=start_lsn)
            except psycopg2.ProgrammingError:
                raise Exception(
                    "unable to start replication with logical replication slot {}".format(slot))

            rows_saved = 0
            while True:
                poll_duration = (datetime.datetime.now() - begin_ts).total_seconds()
                if poll_duration > poll_total_seconds:
                    LOGGER.info("breaking after %s seconds of polling with no data",
                                poll_duration)
                    break

                msg = cur.read_message()
                if msg:
                    begin_ts = datetime.datetime.now()
                    if msg.data_start > end_lsn:
                        LOGGER.info("gone past end_lsn %s for run. breaking", end_lsn)
                        break

                    state = consume_message(logical_streams, state, msg, time_extracted,
                                            conn_info, end_lsn)

                    # msg has been consumed. it has been processed
                    last_lsn_processed = msg.data_start
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                else:
                    now = datetime.datetime.now()
                    timeout = keep_alive_time - (now - cur.io_timestamp).total_seconds()
                    try:
                        sel = select([cur], [], [], max(0, timeout))
                        if not any(sel):
                            LOGGER.info("no data for %s seconds. sending feedback to server "
                                        "with NO flush_lsn. just a keep-alive", timeout)
                            cur.send_feedback()
                    except InterruptedError:
                        pass  # recalculate timeout and continue

    if last_lsn_processed:
        for s in logical_streams:
            LOGGER.info("updating bookmark for stream %s to last_lsn_processed %s",
                        s['tap_stream_id'], last_lsn_processed)
            state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn',
                                          last_lsn_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
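# The idle branch above waits on the replication socket with the standard
# library's select() and uses psycopg2's ReplicationCursor.io_timestamp to
# size the timeout. This snippet assumes a module-level import along the
# lines of:
from select import select  # select([cur], [], [], timeout) blocks until the socket is readable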
def sync_tables(conn_info, logical_streams, state, end_lsn, state_file):
    state_comitted = state
    lsn_comitted = min([get_bookmark(state_comitted, s['tap_stream_id'], 'lsn')
                        for s in logical_streams])
    start_lsn = lsn_comitted
    lsn_to_flush = None
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    lsn_last_processed = None
    lsn_currently_processing = None
    lsn_received_timestamp = None
    lsn_processed_count = 0
    start_run_timestamp = datetime.datetime.utcnow()
    max_run_seconds = conn_info['max_run_seconds']
    break_at_end_lsn = conn_info['break_at_end_lsn']
    logical_poll_total_seconds = conn_info['logical_poll_total_seconds'] or 10800  # 3 hours
    poll_interval = 10
    poll_timestamp = None

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    version = get_pg_version(conn_info)

    # Create replication connection and cursor
    conn = post_db.open_connection(conn_info, True, True)
    cur = conn.cursor()

    # Set session wal_sender_timeout for PG12 and above
    if version >= 120000:
        wal_sender_timeout = 10800000  # 10800000ms = 3 hours
        LOGGER.info('Set session wal_sender_timeout = %i milliseconds', wal_sender_timeout)
        cur.execute(f"SET SESSION wal_sender_timeout = {wal_sender_timeout}")

    try:
        LOGGER.info('Request wal streaming from %s to %s (slot %s)',
                    int_to_lsn(start_lsn), int_to_lsn(end_lsn), slot)
        # psycopg2 2.8.4 will send a keep-alive message to postgres every status_interval
        cur.start_replication(slot_name=slot,
                              decode=True,
                              start_lsn=start_lsn,
                              status_interval=poll_interval,
                              options={
                                  'write-in-chunks': 1,
                                  'add-tables': streams_to_wal2json_tables(logical_streams)
                              })
    except psycopg2.ProgrammingError as ex:
        raise Exception(
            f"Unable to start replication with logical replication (slot {ex})") from ex

    lsn_received_timestamp = datetime.datetime.utcnow()
    poll_timestamp = datetime.datetime.utcnow()

    try:
        while True:
            # Disconnect when no data received for logical_poll_total_seconds
            # needs to be long enough to wait for the largest single wal payload
            # to avoid unplanned timeouts
            poll_duration = (datetime.datetime.utcnow() -
                             lsn_received_timestamp).total_seconds()
            if poll_duration > logical_poll_total_seconds:
                LOGGER.info('Breaking - %i seconds of polling with no data', poll_duration)
                break

            if datetime.datetime.utcnow() >= (start_run_timestamp +
                                              datetime.timedelta(seconds=max_run_seconds)):
                LOGGER.info('Breaking - reached max_run_seconds of %i', max_run_seconds)
                break

            try:
                msg = cur.read_message()
            except Exception as e:
                LOGGER.error(e)
                raise

            if msg:
                if break_at_end_lsn and msg.data_start > end_lsn:
                    LOGGER.info('Breaking - latest wal message %s is past end_lsn %s',
                                int_to_lsn(msg.data_start), int_to_lsn(end_lsn))
                    break

                state = consume_message(logical_streams, state, msg, time_extracted,
                                        conn_info)

                # When using wal2json with write-in-chunks, multiple messages can have
                # the same lsn. This is to ensure we only flush to lsn that has
                # completed entirely.
                if lsn_currently_processing is None:
                    lsn_currently_processing = msg.data_start
                    LOGGER.info('First wal message received is %s',
                                int_to_lsn(lsn_currently_processing))

                    # Flush Postgres wal up to lsn comitted in previous run,
                    # or first lsn received in this run
                    lsn_to_flush = lsn_comitted
                    if lsn_currently_processing < lsn_to_flush:
                        lsn_to_flush = lsn_currently_processing
                    LOGGER.info('Confirming write up to %s, flush to %s',
                                int_to_lsn(lsn_to_flush), int_to_lsn(lsn_to_flush))
                    cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush,
                                      reply=True, force=True)

                elif int(msg.data_start) > lsn_currently_processing:
                    lsn_last_processed = lsn_currently_processing
                    lsn_currently_processing = msg.data_start

                lsn_received_timestamp = datetime.datetime.utcnow()
                lsn_processed_count = lsn_processed_count + 1
                if lsn_processed_count >= UPDATE_BOOKMARK_PERIOD:
                    LOGGER.debug('Updating bookmarks for all streams to lsn = %s (%s)',
                                 lsn_last_processed, int_to_lsn(lsn_last_processed))
                    for s in logical_streams:
                        state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn',
                                                      lsn_last_processed)
                    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                    lsn_processed_count = 0
            else:
                try:
                    # Wait for a second unless a message arrives
                    select([cur], [], [], 1)
                except InterruptedError:
                    pass

            # Every poll_interval, update latest comitted lsn position from the state_file
            if datetime.datetime.utcnow() >= (poll_timestamp +
                                              datetime.timedelta(seconds=poll_interval)):
                if lsn_currently_processing is None:
                    LOGGER.info('Waiting for first wal message')
                else:
                    LOGGER.info('Latest wal message received was %s',
                                int_to_lsn(lsn_last_processed))
                    try:
                        with open(state_file, mode="r", encoding="utf-8") as fh:
                            state_comitted = json.load(fh)
                    except Exception:
                        LOGGER.debug('Unable to open and parse %s', state_file)
                    finally:
                        lsn_comitted = min([
                            get_bookmark(state_comitted, s['tap_stream_id'], 'lsn')
                            for s in logical_streams
                        ])
                        if (lsn_currently_processing > lsn_comitted) and \
                                (lsn_comitted > lsn_to_flush):
                            lsn_to_flush = lsn_comitted
                            LOGGER.info('Confirming write up to %s, flush to %s',
                                        int_to_lsn(lsn_to_flush), int_to_lsn(lsn_to_flush))
                            cur.send_feedback(write_lsn=lsn_to_flush,
                                              flush_lsn=lsn_to_flush,
                                              reply=True, force=True)

                poll_timestamp = datetime.datetime.utcnow()

        # Close replication connection and cursor
        cur.close()
        conn.close()
    finally:
        if lsn_last_processed:
            if lsn_comitted > lsn_last_processed:
                lsn_last_processed = lsn_comitted
                LOGGER.info('Current lsn_last_processed %s is older than lsn_comitted %s',
                            int_to_lsn(lsn_last_processed), int_to_lsn(lsn_comitted))

            LOGGER.info('Updating bookmarks for all streams to lsn = %s (%s)',
                        lsn_last_processed, int_to_lsn(lsn_last_processed))

            for s in logical_streams:
                state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn',
                                              lsn_last_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
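# Two helpers used above, int_to_lsn and streams_to_wal2json_tables, are not
# part of this excerpt. Plausible sketches follow, assuming the usual Postgres
# X/Y LSN notation and wal2json's comma-separated add-tables filter; the real
# streams_to_wal2json_tables may additionally escape special characters in
# identifiers.
def int_to_lsn(lsni):
    """Render an integer LSN in Postgres' X/Y hex notation (sketch)."""
    if lsni is None:
        return None
    # High 32 bits are the wal file part, low 32 bits the offset within it
    return f"{lsni >> 32:X}/{lsni & 0xFFFFFFFF:X}"


def streams_to_wal2json_tables(streams):
    """Build the comma-separated schema.table list wal2json expects (sketch)."""
    return ','.join(
        f"{s['metadata'][0]['metadata']['schema-name']}.{s['table_name']}"
        for s in streams)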
def consume_message(streams, state, msg, time_extracted, conn_info):
    # Strip leading comma generated by write-in-chunks and parse valid JSON
    try:
        payload = json.loads(msg.payload.lstrip(','))
    except Exception:
        return state

    lsn = msg.data_start

    streams_lookup = {s['tap_stream_id']: s for s in streams}

    tap_stream_id = post_db.compute_tap_stream_id(payload['schema'], payload['table'])
    if streams_lookup.get(tap_stream_id) is None:
        return state

    target_stream = streams_lookup[tap_stream_id]

    if payload['kind'] not in {'insert', 'update', 'delete'}:
        raise UnsupportedPayloadKindError(
            f"unrecognized replication operation: {payload['kind']}")

    # Get the additional fields in payload that are not in schema properties:
    # only inserts and updates have the list of columns that can be used to
    # detect any difference in columns
    diff = set()
    if payload['kind'] in {'insert', 'update'}:
        diff = set(payload['columnnames']).difference(
            target_stream['schema']['properties'].keys())

    # if there are new columns in the payload that are not in the schema
    # properties then refresh the stream schema
    if diff:
        LOGGER.info('Detected new columns "%s", refreshing schema of stream %s',
                    diff, target_stream['stream'])
        # encountered a column that is not in the schema
        # refresh the stream schema and metadata by running discovery
        refresh_streams_schema(conn_info, [target_stream])

        # add the automatic properties back to the stream
        add_automatic_properties(target_stream, conn_info.get('debug_lsn', False))

        # publish new schema
        sync_common.send_schema_message(target_stream, ['lsn'])

    stream_version = get_stream_version(target_stream['tap_stream_id'], state)
    stream_md_map = metadata.to_map(target_stream['metadata'])

    desired_columns = {
        c for c in target_stream['schema']['properties'].keys()
        if sync_common.should_sync_column(stream_md_map, c)
    }

    if payload['kind'] in {'insert', 'update'}:
        col_names = []
        col_vals = []

        for idx, col in enumerate(payload['columnnames']):
            if col in desired_columns:
                col_names.append(col)
                col_vals.append(payload['columnvalues'][idx])

        col_names.append('_sdc_deleted_at')
        col_vals.append(None)

        if conn_info.get('debug_lsn'):
            col_names.append('_sdc_lsn')
            col_vals.append(str(lsn))

        record_message = row_to_singer_message(target_stream, col_vals, stream_version,
                                               col_names, time_extracted, stream_md_map,
                                               conn_info)
    elif payload['kind'] == 'delete':
        col_names = []
        col_vals = []

        for idx, col in enumerate(payload['oldkeys']['keynames']):
            if col in desired_columns:
                col_names.append(col)
                col_vals.append(payload['oldkeys']['keyvalues'][idx])

        col_names.append('_sdc_deleted_at')
        col_vals.append(singer.utils.strftime(time_extracted))

        if conn_info.get('debug_lsn'):
            col_names.append('_sdc_lsn')
            col_vals.append(str(lsn))

        record_message = row_to_singer_message(target_stream, col_vals, stream_version,
                                               col_names, time_extracted, stream_md_map,
                                               conn_info)

    singer.write_message(record_message)
    state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn', lsn)

    return state
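# Unlike the format-version 1 variant earlier, this consume_message handles
# wal2json's write-in-chunks output, where each message carries a single
# change object with kind/schema/table at the top level (chunks that are only
# array delimiters fail to parse and are skipped by the try/except above).
# A representative payload with hypothetical values:
payload_chunked_example = {
    "kind": "update",
    "schema": "public",
    "table": "users",
    "columnnames": ["id", "name", "updated_at"],
    "columnvalues": [1, "alice", "2021-01-01 00:00:00+00"],
}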
def sync_tables(conn_info, logical_streams, state, end_lsn):
    start_lsn = 0
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    last_lsn_processed = None
    # we are willing to poll for a total of 5 minutes without finding a record
    poll_total_seconds = conn_info['logical_poll_total_seconds'] or 60 * 5
    keep_alive_time = 0.1
    begin_ts = datetime.datetime.now()
    add_tables = []

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    for s in logical_streams:
        add_tables.append("{}.{}".format(
            s["metadata"][0]["metadata"]["schema-name"], s["table_name"]))

    if "lsn_to_flush" in state:
        start_lsn = state["lsn_to_flush"]

    conn = post_db.open_connection(conn_info, True)
    with conn.cursor() as cur:
        LOGGER.info("Starting Logical Replication from slot %s: %s -> %s. "
                    "poll_total_seconds: %s",
                    slot, start_lsn, end_lsn, poll_total_seconds)
        LOGGER.info("Starting at bookmark: %s",
                    list(map(lambda s: s['tap_stream_id'], logical_streams)))

        replication_params = {
            "slot_name": slot,
            "decode": True,
            "start_lsn": start_lsn,
            "options": {
                "add-tables": ", ".join(add_tables)
            }
        }
        message_format = conn_info.get("wal2json_message_format") or "1"
        if message_format == "2":
            LOGGER.info("Using wal2json format-version 2")
            replication_params["options"]["format-version"] = 2
            replication_params["options"]["include-timestamp"] = True

        try:
            cur.start_replication(**replication_params)
        except psycopg2.ProgrammingError:
            raise Exception(
                "unable to start replication with logical replication slot {}".format(slot))

        # initial flush lsn from the previous run
        if "lsn_to_flush" in state:
            LOGGER.info("Flushing lsn %s from previous job run!",
                        str(state["lsn_to_flush"]))
            cur.send_feedback(flush_lsn=state["lsn_to_flush"])

        rows_saved = 0
        idle_count = 0
        while True:
            poll_duration = (datetime.datetime.now() - begin_ts).total_seconds()
            if poll_duration > poll_total_seconds:
                LOGGER.info("breaking after %s seconds of polling with no data",
                            poll_duration)
                if not last_lsn_processed:
                    state["lsn_to_flush"] = end_lsn
                break

            read_message_start = datetime.datetime.now()
            msg = cur.read_message()
            COUNTER['read_message'] += (datetime.datetime.now() -
                                        read_message_start).total_seconds()
            if msg:
                idle_count = 0
                begin_ts = datetime.datetime.now()
                if msg.data_start > end_lsn:
                    LOGGER.info("gone past end_lsn %s for run. breaking", end_lsn)
                    break

                state = consume_message(logical_streams, state, msg, time_extracted,
                                        conn_info, end_lsn,
                                        message_format=message_format)

                # msg has been consumed. it has been processed
                last_lsn_processed = msg.data_start
                rows_saved = rows_saved + 1
                if rows_saved % COUNTER_PRINT_PERIOD == 0:
                    LOGGER.info("Rows saved = %s, Processed messages counter: %s",
                                str(rows_saved), str(COUNTER))
                if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug("Sending state to loader: %s", str(state))
                    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
            else:
                idle_count += 1
                if idle_count > 100:
                    idle_count = 0
                    tmp_poll_duration = (datetime.datetime.now() -
                                         begin_ts).total_seconds()
                    LOGGER.info("No data for ~10 seconds (%s seconds from start). "
                                "sending feedback to server with NO flush_lsn. "
                                "just a keep-alive", tmp_poll_duration)
                    cur.send_feedback()
                else:
                    time.sleep(keep_alive_time)

    bookmark_lsn = last_lsn_processed if last_lsn_processed else end_lsn
    LOGGER.info("Finished processing messages - counter: %s", str(COUNTER))
    for s in logical_streams:
        LOGGER.info("updating bookmark for stream %s to last_lsn_processed %s",
                    s['tap_stream_id'], bookmark_lsn)
        state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn', bookmark_lsn)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
def sync_tables(conn_info, logical_streams, state, end_lsn, state_file):
    lsn_comitted = min([get_bookmark(state, s['tap_stream_id'], 'lsn')
                        for s in logical_streams])
    start_lsn = lsn_comitted
    lsn_to_flush = None
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    lsn_last_processed = None
    lsn_currently_processing = None
    lsn_received_timestamp = None
    lsn_processed_count = 0
    logical_poll_total_seconds = conn_info['logical_poll_total_seconds'] or 300
    poll_interval = 10
    poll_timestamp = None

    selected_tables = []
    for s in logical_streams:
        selected_tables.append("{}.{}".format(
            s['metadata'][0]['metadata']['schema-name'], s['table_name']))

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    # Create replication connection and cursor
    conn = post_db.open_connection(conn_info, True)
    cur = conn.cursor()

    try:
        LOGGER.info("{} : Starting log streaming at {} to {} (slot {})".format(
            datetime.datetime.utcnow(), int_to_lsn(start_lsn),
            int_to_lsn(end_lsn), slot))
        cur.start_replication(slot_name=slot,
                              decode=True,
                              start_lsn=start_lsn,
                              options={
                                  'write-in-chunks': 1,
                                  'add-tables': ','.join(selected_tables)
                              })
    except psycopg2.ProgrammingError:
        raise Exception(
            "Unable to start replication with logical replication (slot {})".format(slot))

    # Emulate some behaviour of pg_recvlogical
    LOGGER.info("{} : Confirming write up to 0/0, flush to 0/0".format(
        datetime.datetime.utcnow()))
    cur.send_feedback(write_lsn=0, flush_lsn=0, reply=True)
    time.sleep(1)

    lsn_received_timestamp = datetime.datetime.utcnow()
    poll_timestamp = datetime.datetime.utcnow()

    while True:
        # Disconnect when no data received for logical_poll_total_seconds
        # needs to be long enough to wait for the largest single wal payload
        # to avoid unplanned timeouts
        poll_duration = (datetime.datetime.utcnow() -
                         lsn_received_timestamp).total_seconds()
        if poll_duration > logical_poll_total_seconds:
            LOGGER.info("{} : Breaking - {} seconds of polling with no data".format(
                datetime.datetime.utcnow(), poll_duration))
            break

        try:
            msg = cur.read_message()
        except Exception as e:
            LOGGER.error("{} : {}".format(datetime.datetime.utcnow(), e))
            raise

        if msg:
            if msg.data_start > end_lsn:
                LOGGER.info("{} : Breaking - current {} is past end_lsn {}".format(
                    datetime.datetime.utcnow(), int_to_lsn(msg.data_start),
                    int_to_lsn(end_lsn)))
                break

            state = consume_message(logical_streams, state, msg, time_extracted,
                                    conn_info, end_lsn)

            # When using wal2json with write-in-chunks, multiple messages can have
            # the same lsn. This is to ensure we only flush to lsn that has
            # completed entirely.
            if lsn_currently_processing is None:
                lsn_currently_processing = msg.data_start
                LOGGER.info("{} : First message received is {} at {}".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_currently_processing),
                    datetime.datetime.utcnow()))

                # Flush Postgres wal up to lsn comitted in previous run, or first
                # lsn received in this run
                lsn_to_flush = lsn_comitted
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info("{} : Confirming write up to {}, flush to {}".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_to_flush),
                    int_to_lsn(lsn_to_flush)))
                cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush,
                                  reply=True)
            elif int(msg.data_start) > lsn_currently_processing:
                lsn_last_processed = lsn_currently_processing
                lsn_currently_processing = msg.data_start

            lsn_received_timestamp = datetime.datetime.utcnow()
            lsn_processed_count = lsn_processed_count + 1
            if lsn_processed_count >= UPDATE_BOOKMARK_PERIOD:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                lsn_processed_count = 0

        # When data is received, and when data is not received, a keep-alive poll
        # needs to be returned to PostgreSQL
        if datetime.datetime.utcnow() >= (poll_timestamp +
                                          datetime.timedelta(seconds=poll_interval)):
            if lsn_currently_processing is None:
                LOGGER.info("{} : Sending keep-alive message to source server "
                            "(last message received was {} at {})".format(
                                datetime.datetime.utcnow(),
                                int_to_lsn(lsn_last_processed),
                                lsn_received_timestamp))
                cur.send_feedback()
            elif state_file is None:
                LOGGER.info("{} : Sending keep-alive message to source server "
                            "(last message received was {} at {})".format(
                                datetime.datetime.utcnow(),
                                int_to_lsn(lsn_last_processed),
                                lsn_received_timestamp))
                cur.send_feedback()
            else:
                # Read lsn_comitted currently captured in state file on disk
                lsn_comitted = min([
                    get_bookmark(utils.load_json(state_file), s['tap_stream_id'], 'lsn')
                    for s in logical_streams
                ])
                lsn_to_flush = lsn_comitted
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info("{} : Confirming write up to {}, flush to {} "
                            "(last message received was {} at {})".format(
                                datetime.datetime.utcnow(), int_to_lsn(lsn_to_flush),
                                int_to_lsn(lsn_to_flush),
                                int_to_lsn(lsn_last_processed),
                                lsn_received_timestamp))
                cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush,
                                  reply=True)

            poll_timestamp = datetime.datetime.utcnow()

    # Close replication connection and cursor
    cur.close()
    conn.close()

    if lsn_last_processed:
        if lsn_comitted > lsn_last_processed:
            lsn_last_processed = lsn_comitted
            LOGGER.info("Current lsn_last_processed {} is older than lsn_comitted {}".format(
                int_to_lsn(lsn_last_processed), int_to_lsn(lsn_comitted)))

        for s in logical_streams:
            LOGGER.info("updating bookmark for stream {} to lsn = {} ({})".format(
                s['tap_stream_id'], lsn_last_processed,
                int_to_lsn(lsn_last_processed)))
            state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn',
                                          lsn_last_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state