def do_sync_full_table(mysql_conn, catalog_entry, state, columns):
    LOGGER.info("Stream %s is using full table replication", catalog_entry.stream)

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

    # Prefer initial_full_table_complete going forward
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'version')

    state = singer.write_bookmark(state,
                                  catalog_entry.tap_stream_id,
                                  'initial_full_table_complete',
                                  True)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    binlog_streams_map = generate_streams_map(binlog_streams)

    for tap_stream_id in binlog_streams_map.keys():
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map, state)

    verify_log_file_exists(mysql_conn, log_file, log_pos)

    if config.get("server_id"):
        server_id = int(config.get("server_id"))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        server_id = fetch_server_id(mysql_conn)
        LOGGER.info("No server_id provided, will use global server_id=%s", server_id)

    connection_wrapper = make_connection_wrapper(config)

    # Initialize to None so the finally block doesn't raise NameError if the
    # reader fails to construct.
    reader = None
    try:
        reader = BinLogStreamReader(
            connection_settings={},
            server_id=server_id,
            slave_uuid="stitch-slave-{}".format(server_id),
            log_file=log_file,
            log_pos=log_pos,
            resume_stream=True,
            only_events=[RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
            pymysql_wrapper=connection_wrapper,
        )
        LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s", log_file, log_pos)
        _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state)
    finally:
        # BinLogStreamReader doesn't implement the `with` methods,
        # so try/finally is used to make sure the reader is closed.
        if reader:
            reader.close()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_traditional_stream(conn_config, stream, state, sync_method, end_scn):
    LOGGER.info("Beginning sync of stream(%s) with sync method(%s)", stream.tap_stream_id, sync_method)
    md_map = metadata.to_map(stream.metadata)
    desired_columns = [c for c in stream.schema.properties.keys() if common.should_sync_column(md_map, c)]
    desired_columns.sort()

    if len(desired_columns) == 0:
        LOGGER.warning('There are no columns selected for stream %s, skipping it', stream.tap_stream_id)
        return state

    if sync_method == 'full':
        LOGGER.info("Stream %s is using full_table replication", stream.tap_stream_id)
        state = singer.set_currently_syncing(state, stream.tap_stream_id)
        common.send_schema_message(stream, [])
        if md_map.get((), {}).get('is-view'):
            state = full_table.sync_view(conn_config, stream, state, desired_columns)
        else:
            state = full_table.sync_table(conn_config, stream, state, desired_columns)
    elif sync_method == 'log_initial':
        # start off with full-table replication
        state = singer.set_currently_syncing(state, stream.tap_stream_id)
        LOGGER.info("stream %s is using log_miner. will use full table for first run", stream.tap_stream_id)

        state = singer.write_bookmark(state, stream.tap_stream_id, 'scn', end_scn)

        common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns)
    elif sync_method == 'log_initial_interrupted':
        LOGGER.info("Initial stage of full table sync was interrupted. resuming...")
        state = singer.set_currently_syncing(state, stream.tap_stream_id)
        common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns)
    elif sync_method == 'incremental':
        state = singer.set_currently_syncing(state, stream.tap_stream_id)
        state = do_sync_incremental(conn_config, stream, state, desired_columns)
    else:
        raise Exception("unknown sync method {} for stream {}".format(sync_method, stream.tap_stream_id))

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
def test_round_trip(self):
    record_message = singer.RecordMessage(
        record={'name': 'foo'},
        stream='users')

    schema_message = singer.SchemaMessage(
        stream='users',
        key_properties=['name'],
        schema={'type': 'object',
                'properties': {
                    'name': {'type': 'string'}}})

    state_message = singer.StateMessage(value={'seq': 1})

    self.assertEqual(record_message,
                     singer.parse_message(singer.format_message(record_message)))
    self.assertEqual(schema_message,
                     singer.parse_message(singer.format_message(schema_message)))
    self.assertEqual(state_message,
                     singer.parse_message(singer.format_message(state_message)))
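# A minimal wire-level sketch of the same round trip (not part of the test suite
# above; it only uses the public singer-python helpers already exercised there):
# format_message serializes a Message to a JSON string carrying a "type" field,
# and parse_message rebuilds an equivalent Message object from that string.
import singer

state_message = singer.StateMessage(value={'seq': 1})
wire_format = singer.format_message(state_message)  # e.g. '{"type": "STATE", "value": {"seq": 1}}'
assert singer.parse_message(wire_format) == state_message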
def sync_traditional_stream(conn_config, stream, state, sync_method):
    LOGGER.info("Beginning sync of stream(%s) with sync method(%s)", stream['tap_stream_id'], sync_method)
    md_map = metadata.to_map(stream['metadata'])
    conn_config['dbname'] = md_map.get(()).get('database-name')
    desired_columns = [c for c in stream['schema']['properties'].keys() if should_sync_column(md_map, c)]
    desired_columns.sort()

    if len(desired_columns) == 0:
        LOGGER.warning('There are no columns selected for stream %s, skipping it', stream['tap_stream_id'])
        return state

    register_type_adapters(conn_config)

    if sync_method == 'full':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        state = do_sync_full_table(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == 'incremental':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        state = do_sync_incremental(conn_config, stream, state, desired_columns, md_map)
    elif sync_method == 'logical_initial':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        end_lsn = logical_replication.fetch_current_lsn(conn_config)
        LOGGER.info("Performing initial full table sync")
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'lsn', end_lsn)

        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', None)
    elif sync_method == 'logical_initial_interrupted':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        LOGGER.info("Initial stage of full table sync was interrupted. resuming...")
        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state, desired_columns, md_map)
    else:
        raise Exception("unknown sync method {} for stream {}".format(sync_method, stream['tap_stream_id']))

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
def sync_table(conn, catalog_entry, state):
    metadata = singer_metadata.to_map(catalog_entry.metadata)

    LOGGER.info("Beginning sync for {}.".format(catalog_entry.stream))

    entity = catalog_entry.tap_stream_id
    if not entity:
        return

    start = get_start(entity, state)
    formatted_start = dateutil.parser.parse(start).strftime(DATETIME_FMT)
    params = {
        'start': formatted_start,
    }

    with metrics.record_counter(None) as counter:
        counter.tags['app'] = singer_metadata.get(metadata, tuple(), "tap-quickbase.app_id")
        counter.tags['table'] = catalog_entry.table

        extraction_time = singer_utils.now()
        for rows_saved, row in enumerate(gen_request(conn, catalog_entry, params)):
            counter.increment()
            schema_dict = catalog_entry.schema.to_dict()
            rec = transform_bools(row, schema_dict)
            rec = transform_datetimes(rec, schema_dict, catalog_entry.stream)
            rec = singer.transform(rec,
                                   schema_dict,
                                   singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)

            yield singer.RecordMessage(stream=catalog_entry.stream,
                                       record=rec,
                                       time_extracted=extraction_time)

            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          REPLICATION_KEY,
                                          rec[REPLICATION_KEY])
            if (rows_saved + 1) % 1000 == 0:
                yield singer.StateMessage(value=copy.deepcopy(state))
def sync_table(conn, catalog_entry, state):
    LOGGER.info("Beginning sync for {}.{} table.".format(
        catalog_entry.database, catalog_entry.table
    ))

    entity = catalog_entry.tap_stream_id
    if not entity:
        return

    start = get_start(entity)
    formatted_start = dateutil.parser.parse(start).strftime(DATETIME_FMT)
    params = {
        'start': formatted_start,
    }

    with metrics.record_counter(None) as counter:
        counter.tags['database'] = catalog_entry.database
        counter.tags['table'] = catalog_entry.table

        extraction_time = singer_utils.now()
        for rows_saved, row in enumerate(gen_request(conn, catalog_entry, params)):
            counter.increment()
            transform_data(row, catalog_entry.schema)

            yield singer.RecordMessage(
                stream=catalog_entry.stream,
                record=row,
                time_extracted=extraction_time
            )

            state = singer.write_bookmark(
                state,
                catalog_entry.tap_stream_id,
                REPLICATION_KEY,
                row[REPLICATION_KEY]
            )
            if (rows_saved + 1) % 1000 == 0:
                yield singer.StateMessage(value=copy.deepcopy(state))
def overloaded_parse_message(msg):
    """Parse a message string into a Message object."""
    # Parse with simplejson and use_decimal=True so numeric values are read as
    # Decimals rather than floats, avoiding lossy float conversions.
    obj = simplejson.loads(msg, use_decimal=True)
    msg_type = _required_key(obj, 'type')

    if msg_type == 'RECORD':
        time_extracted = obj.get('time_extracted')
        if time_extracted:
            try:
                time_extracted = ciso8601.parse_datetime(time_extracted)
            except Exception:
                time_extracted = None

        return singer.RecordMessage(stream=_required_key(obj, 'stream'),
                                    record=_required_key(obj, 'record'),
                                    version=obj.get('version'),
                                    time_extracted=time_extracted)

    if msg_type == 'SCHEMA':
        return singer.SchemaMessage(
            stream=_required_key(obj, 'stream'),
            schema=_required_key(obj, 'schema'),
            key_properties=_required_key(obj, 'key_properties'),
            bookmark_properties=obj.get('bookmark_properties'))

    if msg_type == 'STATE':
        return singer.StateMessage(value=_required_key(obj, 'value'))

    if msg_type == 'ACTIVATE_VERSION':
        return singer.ActivateVersionMessage(
            stream=_required_key(obj, 'stream'),
            version=_required_key(obj, 'version'))

    return None
def generate_messages(conn, catalog, state):
    for catalog_entry in catalog.streams:
        if not catalog_entry.is_selected():
            continue

        # Emit a SCHEMA message before we sync any records
        yield singer.SchemaMessage(
            stream=catalog_entry.stream,
            schema=catalog_entry.schema.to_dict(),
            key_properties=catalog_entry.key_properties,
            bookmark_properties=[REPLICATION_KEY]
        )

        # Emit a RECORD message for each record in the result set
        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table
            for message in sync_table(conn, catalog_entry, state):
                yield message

        # Emit a state message
        yield singer.StateMessage(value=copy.deepcopy(state))
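# Hedged usage sketch (assumed, not taken from the tap): a do_sync-style driver
# for the generator above would normally just serialize every generated message
# to stdout via singer.write_message. The conn/catalog/state arguments are
# whatever was passed to generate_messages.
def do_sync(conn, catalog, state):
    for message in generate_messages(conn, catalog, state):
        singer.write_message(message)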
def do_sync_incremental(mysql_conn, catalog_entry, state, columns, original_state_file=''):
    LOGGER.info("Stream %s is using incremental replication", catalog_entry.stream)

    md_map = metadata.to_map(catalog_entry.metadata)
    replication_key = md_map.get((), {}).get('replication-key')

    if not replication_key:
        raise Exception(
            "Cannot use INCREMENTAL replication for table ({}) without a replication key."
            .format(catalog_entry.stream))

    write_schema_message(catalog_entry=catalog_entry,
                         bookmark_properties=[replication_key])

    incremental.sync_table(mysql_conn, catalog_entry, state, columns, original_state_file)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_tables(conn_info, logical_streams, state, end_lsn):
    start_lsn = min([get_bookmark(state, s['tap_stream_id'], 'lsn') for s in logical_streams])
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    last_lsn_processed = None
    poll_total_seconds = conn_info['logical_poll_total_seconds'] or 60 * 30
    # we are willing to poll for a total of 30 minutes without finding a record
    keep_alive_time = 10.0
    begin_ts = datetime.datetime.now()

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    with post_db.open_connection(conn_info, True) as conn:
        with conn.cursor() as cur:
            LOGGER.info("Starting Logical Replication for %s(%s): %s -> %s. poll_total_seconds: %s",
                        list(map(lambda s: s['tap_stream_id'], logical_streams)),
                        slot,
                        start_lsn,
                        end_lsn,
                        poll_total_seconds)
            try:
                cur.start_replication(slot_name=slot, decode=True, start_lsn=start_lsn)
            except psycopg2.ProgrammingError:
                raise Exception("unable to start replication with logical replication slot {}".format(slot))

            rows_saved = 0
            while True:
                poll_duration = (datetime.datetime.now() - begin_ts).total_seconds()
                if poll_duration > poll_total_seconds:
                    LOGGER.info("breaking after %s seconds of polling with no data", poll_duration)
                    break

                msg = cur.read_message()
                if msg:
                    begin_ts = datetime.datetime.now()
                    if msg.data_start > end_lsn:
                        LOGGER.info("gone past end_lsn %s for run. breaking", end_lsn)
                        break

                    state = consume_message(logical_streams, state, msg, time_extracted, conn_info, end_lsn)

                    # msg has been consumed. it has been processed
                    last_lsn_processed = msg.data_start
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                else:
                    now = datetime.datetime.now()
                    timeout = keep_alive_time - (now - cur.io_timestamp).total_seconds()
                    try:
                        sel = select([cur], [], [], max(0, timeout))
                        if not any(sel):
                            LOGGER.info("no data for %s seconds. sending feedback to server with NO flush_lsn. just a keep-alive",
                                        timeout)
                            cur.send_feedback()
                    except InterruptedError:
                        pass  # recalculate timeout and continue

    if last_lsn_processed:
        for s in logical_streams:
            LOGGER.info("updating bookmark for stream %s to last_lsn_processed %s",
                        s['tap_stream_id'], last_lsn_processed)
            state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn', last_lsn_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting full table sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')

    db = client[database_name]
    collection = db[stream['stream']]

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    was_interrupted = singer.get_bookmark(state, stream['tap_stream_id'], 'last_id_fetched') is not None

    # pick a new table version if last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version', stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark
        max_id_value = singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value')
        max_id_type = singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_type')
        max_id_value = common.string_to_class(max_id_value, max_id_type)
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined
        state = singer.write_bookmark(state,
                                      stream['tap_stream_id'],
                                      'max_id_value',
                                      common.class_to_string(max_id_value,
                                                             max_id_value.__class__.__name__))
        state = singer.write_bookmark(state,
                                      stream['tap_stream_id'],
                                      'max_id_type',
                                      max_id_value.__class__.__name__)

    find_filter = {'$lte': max_id_value}

    if last_id_fetched:
        last_id_fetched_type = singer.get_bookmark(state, stream['tap_stream_id'], 'last_id_fetched_type')
        find_filter['$gte'] = common.string_to_class(last_id_fetched, last_id_fetched_type)

    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(stream['tap_stream_id'], find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    # pylint: disable=logging-format-interpolation
    LOGGER.info(query_message)

    with collection.find({'_id': find_filter},
                         projection,
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        schema = stream['schema'] or {"type": "object", "properties": {}}
        for row in cursor:
            rows_saved += 1

            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(singer.SchemaMessage(
                    stream=common.calculate_destination_stream_name(stream),
                    schema=schema,
                    key_properties=['_id']))
                common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
            common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time() - schema_build_start_time

            record_message = common.row_to_singer_record(stream, row, stream_version, time_extracted)
            singer.write_message(record_message)

            state = singer.write_bookmark(state,
                                          stream['tap_stream_id'],
                                          'last_id_fetched',
                                          common.class_to_string(row['_id'],
                                                                 row['_id'].__class__.__name__))
            state = singer.write_bookmark(state,
                                          stream['tap_stream_id'],
                                          'last_id_fetched_type',
                                          row['_id'].__class__.__name__)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched_type')

    state = singer.write_bookmark(state,
                                  stream['tap_stream_id'],
                                  'initial_full_table_complete',
                                  True)

    singer.write_message(activate_version_message)

    LOGGER.info('Synced {} records for {}'.format(rows_saved, tap_stream_id))
def sync_stream(client, stream, state):
    tap_stream_id = stream['tap_stream_id']

    common.COUNTS[tap_stream_id] = 0
    common.TIMES[tap_stream_id] = 0
    common.SCHEMA_COUNT[tap_stream_id] = 0
    common.SCHEMA_TIMES[tap_stream_id] = 0

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')
    database_name = metadata.get(md_map, (), 'database-name')

    stream_projection = load_stream_projection(stream)

    # Emit a state message to indicate that we've started this stream
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, stream['tap_stream_id'])
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    write_schema_message(stream)
    common.SCHEMA_COUNT[tap_stream_id] += 1

    with metrics.job_timer('sync_table') as timer:
        timer.tags['database'] = database_name
        timer.tags['table'] = stream['table_name']

        if replication_method == 'LOG_BASED':
            if oplog.oplog_has_aged_out(client, state, tap_stream_id):
                # remove all state for stream
                # then it will do a full sync and start oplog again.
                LOGGER.info("Clearing state because Oplog has aged out")
                state.get('bookmarks', {}).pop(tap_stream_id)

            # make sure initial full table sync has been completed
            if not singer.get_bookmark(state, tap_stream_id, 'initial_full_table_complete'):
                msg = 'Must complete full table sync before starting oplog replication for %s'
                LOGGER.info(msg, tap_stream_id)

                # only mark current ts in oplog on first sync so tap has a
                # starting point after the full table sync
                if singer.get_bookmark(state, tap_stream_id, 'version') is None:
                    collection_oplog_ts = oplog.get_latest_ts(client)
                    oplog.update_bookmarks(state, tap_stream_id, collection_oplog_ts)

                full_table.sync_collection(client, stream, state, stream_projection)

            oplog.sync_collection(client, stream, state, stream_projection)

        elif replication_method == 'FULL_TABLE':
            full_table.sync_collection(client, stream, state, stream_projection)

        elif replication_method == 'INCREMENTAL':
            incremental.sync_collection(client, stream, state, stream_projection)
        else:
            raise Exception("only FULL_TABLE, LOG_BASED, and INCREMENTAL replication methods "
                            "are supported (you passed {})".format(replication_method))

    state = singer.set_currently_syncing(state, None)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(mysql_conn)

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info("Skipped %s events so far as they were not for selected tables; %s rows extracted",
                                events_skipped,
                                rows_saved)
            elif catalog_entry:
                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(binlog_event, catalog_entry, state,
                                                         desired_columns, rows_saved, time_extracted)
                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(binlog_event, catalog_entry, state,
                                                          desired_columns, rows_saved, time_extracted)
                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(binlog_event, catalog_entry, state,
                                                          desired_columns, rows_saved, time_extracted)
                else:
                    LOGGER.info("Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                                binlog_event.schema,
                                binlog_event.table)

        # NB: Flag 0x1 indicates that the binlog has been closed successfully, so we can rely on this being a complete log.
        if hasattr(binlog_event, 'flags') and binlog_event.flags & LOG_EVENT_BINLOG_IN_USE_F:
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     reader.log_file,
                                     reader.log_pos)

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == reader.log_file and reader.log_pos >= current_log_pos:
            break

        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
                (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_table(connection, config, catalog_entry, state, columns):
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id, state)

    log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_file')
    log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_pos')

    verify_binlog_config(connection, catalog_entry)
    verify_log_file_exists(connection, catalog_entry, log_file, log_pos)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'version', stream_version)

    server_id = fetch_server_id(connection)

    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(connection_settings={},
                                server_id=server_id,
                                log_file=log_file,
                                log_pos=log_pos,
                                resume_stream=True,
                                only_events=[RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
                                pymysql_wrapper=connection_wrapper)

    table_path = (catalog_entry.database, catalog_entry.stream)

    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s", log_file, log_pos)

    rows_saved = 0

    for binlog_event in reader:
        if reader.log_file == log_file and reader.log_pos == log_pos:
            LOGGER.info("Skipping event for log_file=%s and log_pos=%s as it was processed last sync",
                        reader.log_file,
                        reader.log_pos)
            continue

        if isinstance(binlog_event, RotateEvent):
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'log_file', binlog_event.next_binlog)
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'log_pos', binlog_event.position)

        elif (binlog_event.schema, binlog_event.table) == table_path:
            db_column_types = {c.name: c.type for c in binlog_event.columns}

            if isinstance(binlog_event, WriteRowsEvent):
                for row in binlog_event.rows:
                    filtered_vals = {k: v for k, v in row['values'].items() if k in columns}

                    yield row_to_singer_record(catalog_entry,
                                               stream_version,
                                               db_column_types,
                                               filtered_vals,
                                               time_extracted)
                    rows_saved = rows_saved + 1

            elif isinstance(binlog_event, UpdateRowsEvent):
                for row in binlog_event.rows:
                    filtered_vals = {k: v for k, v in row['after_values'].items() if k in columns}

                    yield row_to_singer_record(catalog_entry,
                                               stream_version,
                                               db_column_types,
                                               filtered_vals,
                                               time_extracted)
                    rows_saved = rows_saved + 1

            elif isinstance(binlog_event, DeleteRowsEvent):
                for row in binlog_event.rows:
                    event_ts = datetime.datetime.utcfromtimestamp(binlog_event.timestamp).replace(tzinfo=pytz.UTC)

                    vals = row['values']
                    vals[SDC_DELETED_AT] = event_ts

                    filtered_vals = {k: v for k, v in vals.items() if k in columns}

                    yield row_to_singer_record(catalog_entry,
                                               stream_version,
                                               db_column_types,
                                               filtered_vals,
                                               time_extracted)
                    rows_saved = rows_saved + 1

        state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'log_file', reader.log_file)
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'log_pos', reader.log_pos)

        if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
            yield singer.StateMessage(value=copy.deepcopy(state))

    yield singer.StateMessage(value=copy.deepcopy(state))
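# Hedged consumer sketch (assumed, not from the tap itself): the binlog
# sync_table generator above yields RecordMessage and StateMessage objects,
# so a caller would typically just emit each one in order.
for message in sync_table(connection, config, catalog_entry, state, columns):
    singer.write_message(message)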
def sync_tables(conn_info, logical_streams, state, end_lsn, state_file):
    state_comitted = state
    lsn_comitted = min([get_bookmark(state_comitted, s['tap_stream_id'], 'lsn') for s in logical_streams])
    start_lsn = lsn_comitted
    lsn_to_flush = None
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    lsn_last_processed = None
    lsn_currently_processing = None
    lsn_received_timestamp = None
    lsn_processed_count = 0
    start_run_timestamp = datetime.datetime.utcnow()
    max_run_seconds = conn_info['max_run_seconds']
    break_at_end_lsn = conn_info['break_at_end_lsn']
    logical_poll_total_seconds = conn_info['logical_poll_total_seconds'] or 10800  # 3 hours
    poll_interval = 10
    poll_timestamp = None

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    version = get_pg_version(conn_info)

    # Create replication connection and cursor
    conn = post_db.open_connection(conn_info, True, True)
    cur = conn.cursor()

    # Set session wal_sender_timeout for PG12 and above
    if version >= 120000:
        wal_sender_timeout = 10800000  # 10800000ms = 3 hours
        LOGGER.info('Set session wal_sender_timeout = %i milliseconds', wal_sender_timeout)
        cur.execute(f"SET SESSION wal_sender_timeout = {wal_sender_timeout}")

    try:
        LOGGER.info('Request wal streaming from %s to %s (slot %s)',
                    int_to_lsn(start_lsn),
                    int_to_lsn(end_lsn),
                    slot)
        # psycopg2 2.8.4 will send a keep-alive message to postgres every status_interval
        cur.start_replication(slot_name=slot,
                              decode=True,
                              start_lsn=start_lsn,
                              status_interval=poll_interval,
                              options={
                                  'write-in-chunks': 1,
                                  'add-tables': streams_to_wal2json_tables(logical_streams)
                              })
    except psycopg2.ProgrammingError as ex:
        raise Exception(f"Unable to start replication with logical replication (slot {ex})") from ex

    lsn_received_timestamp = datetime.datetime.utcnow()
    poll_timestamp = datetime.datetime.utcnow()

    try:
        while True:
            # Disconnect when no data received for logical_poll_total_seconds
            # needs to be long enough to wait for the largest single wal payload to avoid unplanned timeouts
            poll_duration = (datetime.datetime.utcnow() - lsn_received_timestamp).total_seconds()
            if poll_duration > logical_poll_total_seconds:
                LOGGER.info('Breaking - %i seconds of polling with no data', poll_duration)
                break

            if datetime.datetime.utcnow() >= (start_run_timestamp + datetime.timedelta(seconds=max_run_seconds)):
                LOGGER.info('Breaking - reached max_run_seconds of %i', max_run_seconds)
                break

            try:
                msg = cur.read_message()
            except Exception as e:
                LOGGER.error(e)
                raise

            if msg:
                if (break_at_end_lsn) and (msg.data_start > end_lsn):
                    LOGGER.info('Breaking - latest wal message %s is past end_lsn %s',
                                int_to_lsn(msg.data_start),
                                int_to_lsn(end_lsn))
                    break

                state = consume_message(logical_streams, state, msg, time_extracted, conn_info)

                # When using wal2json with write-in-chunks, multiple messages can have the same lsn
                # This is to ensure we only flush to lsn that has completed entirely
                if lsn_currently_processing is None:
                    lsn_currently_processing = msg.data_start
                    LOGGER.info('First wal message received is %s', int_to_lsn(lsn_currently_processing))

                    # Flush Postgres wal up to lsn comitted in previous run, or first lsn received in this run
                    lsn_to_flush = lsn_comitted
                    if lsn_currently_processing < lsn_to_flush:
                        lsn_to_flush = lsn_currently_processing
                    LOGGER.info('Confirming write up to %s, flush to %s',
                                int_to_lsn(lsn_to_flush),
                                int_to_lsn(lsn_to_flush))
                    cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush, reply=True, force=True)

                elif int(msg.data_start) > lsn_currently_processing:
                    lsn_last_processed = lsn_currently_processing
                    lsn_currently_processing = msg.data_start
                    lsn_received_timestamp = datetime.datetime.utcnow()
                    lsn_processed_count = lsn_processed_count + 1
                    if lsn_processed_count >= UPDATE_BOOKMARK_PERIOD:
                        LOGGER.debug('Updating bookmarks for all streams to lsn = %s (%s)',
                                     lsn_last_processed,
                                     int_to_lsn(lsn_last_processed))
                        for s in logical_streams:
                            state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn', lsn_last_processed)
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                        lsn_processed_count = 0
            else:
                try:
                    # Wait for a second unless a message arrives
                    select([cur], [], [], 1)
                except InterruptedError:
                    pass

            # Every poll_interval, update latest comitted lsn position from the state_file
            if datetime.datetime.utcnow() >= (poll_timestamp + datetime.timedelta(seconds=poll_interval)):
                if lsn_currently_processing is None:
                    LOGGER.info('Waiting for first wal message')
                else:
                    LOGGER.info('Latest wal message received was %s', int_to_lsn(lsn_last_processed))
                    try:
                        with open(state_file, mode="r", encoding="utf-8") as fh:
                            state_comitted = json.load(fh)
                    except Exception:
                        LOGGER.debug('Unable to open and parse %s', state_file)
                    finally:
                        lsn_comitted = min([get_bookmark(state_comitted, s['tap_stream_id'], 'lsn')
                                            for s in logical_streams])
                        if (lsn_currently_processing > lsn_comitted) and (lsn_comitted > lsn_to_flush):
                            lsn_to_flush = lsn_comitted
                            LOGGER.info('Confirming write up to %s, flush to %s',
                                        int_to_lsn(lsn_to_flush),
                                        int_to_lsn(lsn_to_flush))
                            cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush, reply=True, force=True)

                poll_timestamp = datetime.datetime.utcnow()

        # Close replication connection and cursor
        cur.close()
        conn.close()
    finally:
        if lsn_last_processed:
            if lsn_comitted > lsn_last_processed:
                lsn_last_processed = lsn_comitted
                LOGGER.info('Current lsn_last_processed %s is older than lsn_comitted %s',
                            int_to_lsn(lsn_last_processed),
                            int_to_lsn(lsn_comitted))

            LOGGER.info('Updating bookmarks for all streams to lsn = %s (%s)',
                        lsn_last_processed,
                        int_to_lsn(lsn_last_processed))

            for s in logical_streams:
                state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn', lsn_last_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    return state
def do_sync(conn_config, catalog, default_replication_method, state):
    currently_syncing = singer.get_currently_syncing(state)
    streams = list(filter(is_selected_via_metadata, catalog.streams))
    streams.sort(key=lambda s: s.tap_stream_id)

    if currently_syncing:
        LOGGER.info("currently_syncing: %s", currently_syncing)
        currently_syncing_stream = list(filter(lambda s: s.tap_stream_id == currently_syncing, streams))
        other_streams = list(filter(lambda s: s.tap_stream_id != currently_syncing, streams))
        streams = currently_syncing_stream + other_streams
    else:
        LOGGER.info("NO currently_syncing")

    LOGGER.info("will sync: %s", list(map(lambda s: s.tap_stream_id, streams)))

    for stream in streams:
        md_map = metadata.to_map(stream.metadata)
        conn_config['dbname'] = md_map.get(()).get('database-name')
        state = singer.set_currently_syncing(state, stream.tap_stream_id)

        desired_columns = [c for c in stream.schema.properties.keys() if should_sync_column(md_map, c)]
        desired_columns.sort()

        if len(desired_columns) == 0:
            LOGGER.warning('There are no columns selected for stream %s, skipping it', stream.tap_stream_id)
            continue

        replication_method = md_map.get((), {}).get('replication-method', default_replication_method)
        if replication_method not in set(['LOG_BASED', 'FULL_TABLE', 'INCREMENTAL']):
            raise Exception("Unrecognized replication_method {}".format(replication_method))

        replication_key = md_map.get((), {}).get('replication-key')

        state = clear_state_on_replication_change(state, stream.tap_stream_id, replication_key, replication_method)

        if replication_method == 'LOG_BASED' and md_map.get((), {}).get('is-view'):
            LOGGER.warning('Logical Replication is NOT supported for views. skipping stream %s', stream.tap_stream_id)
            continue

        if replication_method == 'LOG_BASED':
            state = do_sync_logical_replication(conn_config, stream, state, desired_columns, md_map)
        elif replication_method == 'FULL_TABLE':
            state = do_sync_full_table(conn_config, stream, state, desired_columns, md_map)
        elif replication_method == 'INCREMENTAL':
            state = do_sync_incremental(conn_config, stream, state, desired_columns, md_map)

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_collection(collection: Collection,
                    stream: Dict,
                    state: Optional[Dict],
                    ) -> None:
    """
    Syncs the stream records incrementally
    Args:
        collection: MongoDB collection instance
        stream: stream dictionary
        state: state dictionary if exists
    """
    LOGGER.info('Starting incremental sync for %s', stream['tap_stream_id'])

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # pick a new table version on the first run, otherwise reuse the bookmarked version
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version', stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'], {})

    replication_key_name = metadata.to_map(stream['metadata']).get(()).get('replication-key')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query
    find_filter = {}

    if stream_state.get('replication_key_value'):
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gt'] = common.string_to_class(
            stream_state.get('replication_key_value'),
            stream_state.get('replication_key_type'))

    # log query
    LOGGER.info('Querying %s with: %s', stream['tap_stream_id'], dict(find=find_filter))

    with collection.find(find_filter,
                         sort=[(replication_key_name, pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        start_time = time.time()

        for row in cursor:
            singer.write_message(common.row_to_singer_record(stream=stream,
                                                             row=row,
                                                             time_extracted=utils.now(),
                                                             time_deleted=None,
                                                             version=stream_version))
            rows_saved += 1

            update_bookmark(row, state, stream['tap_stream_id'], replication_key_name)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[stream['tap_stream_id']] += rows_saved
        common.TIMES[stream['tap_stream_id']] += time.time() - start_time

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, stream['tap_stream_id'])
def sync_query(cursor, catalog_entry, state, select_sql, columns, stream_version, params):
    """..."""
    replication_key = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'replication_key')

    time_extracted = utils.now()

    LOGGER.info('Running %s', select_sql)
    cursor.execute(select_sql, params)

    row = cursor.fetchone()
    rows_saved = 0

    database_name = get_database_name(catalog_entry)

    with metrics.record_counter(None) as counter:
        counter.tags['database'] = database_name
        counter.tags['table'] = catalog_entry.table

        while row:
            counter.increment()
            rows_saved += 1
            record_message = row_to_singer_record(catalog_entry,
                                                  stream_version,
                                                  row,
                                                  columns,
                                                  time_extracted)
            singer.write_message(record_message)

            md_map = metadata.to_map(catalog_entry.metadata)
            stream_metadata = md_map.get((), {})
            replication_method = stream_metadata.get('replication-method')

            if replication_method == 'FULL_TABLE':
                key_properties = get_key_properties(catalog_entry)

                max_pk_values = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')

                if max_pk_values:
                    last_pk_fetched = {k: v for k, v in record_message.record.items()
                                       if k in key_properties}

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'last_pk_fetched',
                                                  last_pk_fetched)

            elif replication_method == 'INCREMENTAL':
                if replication_key is not None:
                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'replication_key',
                                                  replication_key)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'replication_key_value',
                                                  record_message.record[replication_key])

            if rows_saved % 1000 == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

            row = cursor.fetchone()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_table(connection, catalog_entry, state):
    columns = list(catalog_entry.schema.properties.keys())
    start_date = CONFIG.get('start_date')
    formatted_start_date = None

    if not columns:
        LOGGER.warning('There are no columns selected for table {}, skipping it'.format(catalog_entry.table))
        return

    tap_stream_id = catalog_entry.tap_stream_id
    LOGGER.info('Beginning sync for {} table'.format(tap_stream_id))

    with connection.cursor() as cursor:
        schema, table = catalog_entry.table.split('.')
        select = 'SELECT {} FROM {}.{}'.format(
            ','.join('"{}"'.format(c) for c in columns),
            '"{}"'.format(schema),
            '"{}"'.format(table))
        params = {}

        if start_date is not None:
            formatted_start_date = datetime.datetime.strptime(
                start_date, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=pytz.UTC)

        replication_key = metadata.to_map(catalog_entry.metadata).get((), {}).get('replication-key')
        replication_key_value = None
        bookmark_is_empty = state.get('bookmarks', {}).get(tap_stream_id) is None
        stream_version = get_stream_version(tap_stream_id, state)
        state = singer.write_bookmark(state, tap_stream_id, 'version', stream_version)

        activate_version_message = singer.ActivateVersionMessage(
            stream=catalog_entry.stream,
            version=stream_version)

        # If there's a replication key, we want to emit an ACTIVATE_VERSION
        # message at the beginning so the records show up right away. If
        # there's no bookmark at all for this stream, assume it's the very
        # first replication. That is, clients have never seen rows for this
        # stream before, so they can immediately acknowledge the present
        # version.
        if replication_key or bookmark_is_empty:
            yield activate_version_message

        if replication_key:
            replication_key_value = singer.get_bookmark(
                state, tap_stream_id, 'replication_key_value') or formatted_start_date.isoformat()

        if replication_key_value is not None:
            entry_schema = catalog_entry.schema

            if entry_schema.properties[replication_key].format == 'date-time':
                replication_key_value = pendulum.parse(replication_key_value)

            select += ' WHERE {} >= %(replication_key_value)s ORDER BY {} ASC'.format(
                replication_key, replication_key)
            params['replication_key_value'] = replication_key_value
        elif replication_key is not None:
            select += ' ORDER BY {} ASC'.format(replication_key)

        time_extracted = utils.now()
        query_string = cursor.mogrify(select, params)
        LOGGER.info('Running {}'.format(query_string))
        cursor.execute(select, params)
        row = cursor.fetchone()
        rows_saved = 0

        with metrics.record_counter(None) as counter:
            counter.tags['database'] = catalog_entry.database
            counter.tags['table'] = catalog_entry.table
            while row:
                counter.increment()
                rows_saved += 1
                record_message = row_to_record(catalog_entry,
                                               stream_version,
                                               row,
                                               columns,
                                               time_extracted)
                yield record_message

                if replication_key is not None:
                    state = singer.write_bookmark(state,
                                                  tap_stream_id,
                                                  'replication_key_value',
                                                  record_message.record[replication_key])
                if rows_saved % 1000 == 0:
                    yield singer.StateMessage(value=copy.deepcopy(state))
                row = cursor.fetchone()

        if not replication_key:
            yield activate_version_message
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'version', None)

        yield singer.StateMessage(value=copy.deepcopy(state))
def sync_tables_logminer(cur, streams, state, start_scn, end_scn):
    time_extracted = utils.now()

    start_logmnr_sql = """BEGIN
                          DBMS_LOGMNR.START_LOGMNR(
                                  startScn => {},
                                  endScn => {},
                                  OPTIONS => DBMS_LOGMNR.DICT_FROM_ONLINE_CATALOG +
                                             DBMS_LOGMNR.COMMITTED_DATA_ONLY +
                                             DBMS_LOGMNR.CONTINUOUS_MINE);
                          END;""".format(start_scn, end_scn)

    LOGGER.info("Starting LogMiner for %s: %s -> %s",
                list(map(lambda s: s.tap_stream_id, streams)), start_scn, end_scn)
    LOGGER.info("%s", start_logmnr_sql)
    cur.execute(start_logmnr_sql)

    # mine changes
    for stream in streams:
        md_map = metadata.to_map(stream.metadata)
        desired_columns = [c for c in stream.schema.properties.keys() if common.should_sync_column(md_map, c)]

        redo_value_sql_clause = ",\n ".join(
            ["""DBMS_LOGMNR.MINE_VALUE(REDO_VALUE, :{})""".format(idx + 1)
             for idx, c in enumerate(desired_columns)])
        undo_value_sql_clause = ",\n ".join(
            ["""DBMS_LOGMNR.MINE_VALUE(UNDO_VALUE, :{})""".format(idx + 1)
             for idx, c in enumerate(desired_columns)])

        schema_name = md_map.get(()).get('schema-name')
        stream_version = get_stream_version(stream.tap_stream_id, state)

        mine_sql = """
        SELECT OPERATION, SQL_REDO, SCN, CSCN, COMMIT_TIMESTAMP, {}, {} from v$logmnr_contents
        where table_name = :table_name AND seg_owner = :seg_owner AND operation in ('INSERT', 'UPDATE', 'DELETE')
        """.format(redo_value_sql_clause, undo_value_sql_clause)

        binds = [orc_db.fully_qualified_column_name(schema_name, stream.table, c) for c in desired_columns] + \
                [orc_db.fully_qualified_column_name(schema_name, stream.table, c) for c in desired_columns] + \
                [stream.table] + [schema_name]

        rows_saved = 0
        columns_for_record = desired_columns + ['scn', '_sdc_deleted_at']
        with metrics.record_counter(None) as counter:
            LOGGER.info("Examining log for table %s", stream.tap_stream_id)
            common.send_schema_message(stream, ['lsn'])
            LOGGER.info("mine_sql=%s", mine_sql)
            for op, redo, scn, cscn, commit_ts, *col_vals in cur.execute(mine_sql, binds):
                redo_vals = col_vals[0:len(desired_columns)]
                undo_vals = col_vals[len(desired_columns):]
                if op == 'INSERT' or op == 'UPDATE':
                    redo_vals += [cscn, None]
                    record_message = row_to_singer_message(stream, redo_vals, stream_version,
                                                           columns_for_record, time_extracted)
                elif op == 'DELETE':
                    undo_vals += [cscn, singer.utils.strftime(commit_ts.replace(tzinfo=pytz.UTC))]
                    record_message = row_to_singer_message(stream, undo_vals, stream_version,
                                                           columns_for_record, time_extracted)
                else:
                    raise Exception("unrecognized logminer operation: {}".format(op))

                singer.write_message(record_message)
                rows_saved = rows_saved + 1
                counter.increment()
                state = singer.write_bookmark(state, stream.tap_stream_id, 'scn', int(cscn))

                if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    for s in streams:
        LOGGER.info("updating bookmark for stream %s to end_scn %s", s.tap_stream_id, end_scn)
        state = singer.write_bookmark(state, s.tap_stream_id, 'scn', end_scn)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    return state
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # pick a new table version IFF we do not have an xmin in our state
    # the presence of an xmin indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream['tap_stream_id'], 'xmin') is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version', nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(partial(post_db.prepare_columns_for_select_sql, md_map=md_map), desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:

            # Client side character encoding defaults to the value in postgresql.conf under client_encoding.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='stitch_cursor') as cur:
                cur.itersize = post_db.CURSOR_ITER_SIZE

                fq_table_name = post_db.fully_qualified_table_name(schema_name, stream['table_name'])
                xmin = singer.get_bookmark(state, stream['tap_stream_id'], 'xmin')
                if xmin:
                    LOGGER.info("Resuming Full Table replication %s from xmin %s", nascent_stream_version, xmin)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {} where age(xmin::xid) <= age('{}'::xid)
                                     ORDER BY xmin::text ASC""".format(','.join(escaped_columns),
                                                                       fq_table_name,
                                                                       xmin)
                else:
                    LOGGER.info("Beginning new Full Table replication %s", nascent_stream_version)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     ORDER BY xmin::text ASC""".format(','.join(escaped_columns),
                                                                       fq_table_name)

                LOGGER.info("select %s with itersize %s", select_sql, cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
                    xmin = rec['xmin']
                    rec = rec[:-1]
                    record_message = post_db.selected_row_to_singer_message(stream,
                                                                            rec,
                                                                            nascent_stream_version,
                                                                            desired_columns,
                                                                            time_extracted,
                                                                            md_map)
                    singer.write_message(record_message)
                    state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', xmin)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    # once we have completed the full table replication, discard the xmin bookmark.
    # the xmin bookmark only comes into play when a full table replication is interrupted
    state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', None)

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
def sync_query(cursor, catalog_entry, state, select_sql, columns, stream_version, params):
    replication_key = singer.get_bookmark(state, catalog_entry.tap_stream_id, "replication_key")

    # query_string = cursor.mogrify(select_sql, params)

    time_extracted = utils.now()
    cursor.execute(select_sql, params)

    row = cursor.fetchone()
    rows_saved = 0

    database_name = get_database_name(catalog_entry)

    with metrics.record_counter(None) as counter:
        counter.tags["database"] = database_name
        counter.tags["table"] = catalog_entry.table

        while row:
            counter.increment()
            rows_saved += 1
            record_message = row_to_singer_record(catalog_entry,
                                                  stream_version,
                                                  row,
                                                  columns,
                                                  time_extracted)
            singer.write_message(record_message)

            md_map = metadata.to_map(catalog_entry.metadata)
            stream_metadata = md_map.get((), {})
            replication_method = stream_metadata.get("replication-method")

            if replication_method in {"FULL_TABLE", "LOG_BASED"}:
                key_properties = get_key_properties(catalog_entry)

                max_pk_values = singer.get_bookmark(state, catalog_entry.tap_stream_id, "max_pk_values")

                if max_pk_values:
                    last_pk_fetched = {k: v for k, v in record_message.record.items()
                                       if k in key_properties}

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  "last_pk_fetched",
                                                  last_pk_fetched)

            elif replication_method == "INCREMENTAL":
                if replication_key is not None:
                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  "replication_key",
                                                  replication_key)

                    state = singer.write_bookmark(
                        state,
                        catalog_entry.tap_stream_id,
                        "replication_key_value",
                        record_message.record[replication_key],
                    )

            if rows_saved % 1000 == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

            row = cursor.fetchone()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version', stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(partial(post_db.prepare_columns_for_select_sql, md_map=md_map), desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=stream_version)

    singer.write_message(activate_version_message)

    replication_key = md_map.get((), {}).get('replication-key')
    replication_key_value = singer.get_bookmark(state, stream['tap_stream_id'], 'replication_key_value')
    replication_key_sql_datatype = md_map.get(('properties', replication_key)).get('sql-datatype')

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:

            # Client side character encoding defaults to the value in postgresql.conf under client_encoding.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='pipelinewise') as cur:
                cur.itersize = post_db.CURSOR_ITER_SIZE
                LOGGER.info("Beginning new incremental replication sync %s", stream_version)
                if replication_key_value:
                    select_sql = """SELECT {}
                                    FROM {}
                                    WHERE {} >= '{}'::{}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream['table_name']),
                        post_db.prepare_columns_sql(replication_key),
                        replication_key_value,
                        replication_key_sql_datatype,
                        post_db.prepare_columns_sql(replication_key))
                else:
                    # if not replication_key_value
                    select_sql = """SELECT {}
                                    FROM {}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream['table_name']),
                        post_db.prepare_columns_sql(replication_key))

                LOGGER.info('select statement: %s with itersize %s', select_sql, cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0

                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(stream,
                                                                            rec,
                                                                            stream_version,
                                                                            desired_columns,
                                                                            time_extracted,
                                                                            md_map)

                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1

                    # Picking a replication_key with NULL values will result in it ALWAYS being synced, which is not great.
                    # Even worse would be allowing the NULL value to enter into the state.
                    if record_message.record[replication_key] is not None:
                        state = singer.write_bookmark(state,
                                                      stream['tap_stream_id'],
                                                      'replication_key_value',
                                                      record_message.record[replication_key])

                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    return state
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting incremental sync for %s', tap_stream_id)

    stream_metadata = metadata.to_map(stream['metadata']).get(())
    collection = client[stream_metadata['database-name']][stream['stream']]

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # pick a new table version on the first run, otherwise reuse the bookmarked version
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')

    state = singer.write_bookmark(state,
                                  stream['tap_stream_id'],
                                  'version',
                                  stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version
    )

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(tap_stream_id, {})

    replication_key_name = stream_metadata.get('replication-key')
    replication_key_value_bookmark = stream_state.get('replication_key_value')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query
    find_filter = {}
    if replication_key_value_bookmark:
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gte'] = \
            common.string_to_class(replication_key_value_bookmark,
                                   stream_state.get('replication_key_type'))

    # log query
    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(tap_stream_id, find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    LOGGER.info(query_message)

    # query collection
    schema = {"type": "object", "properties": {}}
    with collection.find(find_filter,
                         projection,
                         sort=[(replication_key_name, pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        for row in cursor:
            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(singer.SchemaMessage(
                    stream=common.calculate_destination_stream_name(stream),
                    schema=schema,
                    key_properties=['_id']))
                common.SCHEMA_COUNT[tap_stream_id] += 1
            common.SCHEMA_TIMES[tap_stream_id] += time.time() - schema_build_start_time

            record_message = common.row_to_singer_record(stream, row, stream_version, time_extracted)

            # gen_schema = common.row_to_schema_message(schema, record_message.record, row)
            # if DeepDiff(schema, gen_schema, ignore_order=True) != {}:
            #   emit gen_schema
            #   schema = gen_schema
            singer.write_message(record_message)
            rows_saved += 1

            update_bookmark(row, state, tap_stream_id, replication_key_name)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
def sync_tables(conn_info, logical_streams, state, end_lsn):
    start_lsn = 0
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    last_lsn_processed = None
    # we are willing to poll for a total of 5 minutes without finding a record
    poll_total_seconds = conn_info['logical_poll_total_seconds'] or 60 * 5
    keep_alive_time = 0.1
    begin_ts = datetime.datetime.now()
    add_tables = []

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    for s in logical_streams:
        add_tables.append("{}.{}".format(
            s["metadata"][0]["metadata"]["schema-name"],
            s["table_name"]))

    if "lsn_to_flush" in state:
        start_lsn = state["lsn_to_flush"]

    conn = post_db.open_connection(conn_info, True)
    with conn.cursor() as cur:
        LOGGER.info(
            "Starting Logical Replication from slot %s: %s -> %s. poll_total_seconds: %s",
            slot, start_lsn, end_lsn, poll_total_seconds)
        LOGGER.info("Starting at bookmark: %s",
                    list(map(lambda s: s['tap_stream_id'], logical_streams)))

        replication_params = {
            "slot_name": slot,
            "decode": True,
            "start_lsn": start_lsn,
            "options": {
                "add-tables": ", ".join(add_tables)
            }
        }
        message_format = conn_info.get("wal2json_message_format") or "1"
        if message_format == "2":
            LOGGER.info("Using wal2json format-version 2")
            replication_params["options"]["format-version"] = 2
            replication_params["options"]["include-timestamp"] = True

        try:
            cur.start_replication(**replication_params)
        except psycopg2.ProgrammingError:
            raise Exception(
                "unable to start replication with logical replication slot {}".format(slot))

        # initial flush lsn from the previous run
        if "lsn_to_flush" in state:
            LOGGER.info("Flushing lsn %s from previous job run!", str(state["lsn_to_flush"]))
            cur.send_feedback(flush_lsn=state["lsn_to_flush"])

        rows_saved = 0
        idle_count = 0
        while True:
            poll_duration = (datetime.datetime.now() - begin_ts).total_seconds()
            if poll_duration > poll_total_seconds:
                LOGGER.info("breaking after %s seconds of polling with no data", poll_duration)
                if not last_lsn_processed:
                    state["lsn_to_flush"] = end_lsn
                break

            read_message_start = datetime.datetime.now()
            msg = cur.read_message()
            COUNTER['read_message'] += (datetime.datetime.now() - read_message_start).total_seconds()
            if msg:
                idle_count = 0
                begin_ts = datetime.datetime.now()
                if msg.data_start > end_lsn:
                    LOGGER.info("gone past end_lsn %s for run. breaking", end_lsn)
                    break

                state = consume_message(logical_streams, state, msg, time_extracted,
                                        conn_info, end_lsn, message_format=message_format)
                # msg has been consumed. it has been processed
                last_lsn_processed = msg.data_start
                rows_saved = rows_saved + 1
                if rows_saved % COUNTER_PRINT_PERIOD == 0:
                    LOGGER.info("Rows saved = %s, Processed messages counter: %s",
                                str(rows_saved), str(COUNTER))
                if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug("Sending state to loader: %s", str(state))
                    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
            else:
                idle_count += 1
                if idle_count > 100:
                    idle_count = 0
                    tmp_poll_duration = (datetime.datetime.now() - begin_ts).total_seconds()
                    LOGGER.info(
                        "No data for ~10 seconds (%s seconds from start). "
                        "sending feedback to server with NO flush_lsn. just a keep-alive",
                        tmp_poll_duration)
                    cur.send_feedback()
                else:
                    time.sleep(keep_alive_time)

    bookmark_lsn = last_lsn_processed if last_lsn_processed else end_lsn
    LOGGER.info("Finished processing messages - counter: %s", str(COUNTER))
    for s in logical_streams:
        LOGGER.info("updating bookmark for stream %s to last_lsn_processed %s",
                    s['tap_stream_id'], bookmark_lsn)
        state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn', bookmark_lsn)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
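# Hedged sketch of how an end_lsn for a run like the one above might be obtained.
# The helper name fetch_current_wal_lsn and the use of a plain (non-replication)
# connection are assumptions; the tap's actual entry point may differ.
def fetch_current_wal_lsn(conn_info):
    with post_db.open_connection(conn_info) as conn:
        with conn.cursor() as cur:
            # pg_current_wal_lsn() exists on PostgreSQL 10+;
            # older servers expose pg_current_xlog_location() instead.
            cur.execute("SELECT pg_current_wal_lsn()")
            file_part, index_part = cur.fetchone()[0].split('/')
            # convert the textual X/Y form into the integer form used by start_lsn/end_lsn
            return (int(file_part, 16) << 32) + int(index_part, 16)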
def sync_table(conn_config, stream, state, desired_columns):
    connection = orc_db.open_connection(conn_config)
    connection.outputtypehandler = common.OutputTypeHandler

    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute("""ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'""")
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream.tap_stream_id, 'version')
    # If there was no bookmark for stream_version, it is the first time
    # this table is being sync'd, so get a new version and write it to state
    if stream_version is None:
        stream_version = int(time.time() * 1000)
        state = singer.write_bookmark(state, stream.tap_stream_id, 'version', stream_version)
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream,
        version=stream_version)
    singer.write_message(activate_version_message)

    md = metadata.to_map(stream.metadata)
    schema_name = md.get(()).get('schema-name')

    escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c), desired_columns)
    escaped_schema = schema_name
    escaped_table = stream.table

    replication_key = md.get((), {}).get('replication-key')
    # escaped_replication_key = common.prepare_columns_sql(stream, replication_key)
    replication_key_value = singer.get_bookmark(state, stream.tap_stream_id, 'replication_key_value')
    replication_key_sql_datatype = md.get(('properties', replication_key)).get('sql-datatype')

    with metrics.record_counter(None) as counter:
        if replication_key_value:
            LOGGER.info("Resuming Incremental replication from %s = %s",
                        replication_key, replication_key_value)
            casted_where_clause_arg = common.prepare_where_clause_arg(
                replication_key_value, replication_key_sql_datatype)

            select_sql = """SELECT {}
                              FROM {}.{}
                             WHERE {} >= {}
                             ORDER BY {} ASC""".format(','.join(escaped_columns),
                                                       escaped_schema, escaped_table,
                                                       replication_key, casted_where_clause_arg,
                                                       replication_key)
        else:
            select_sql = """SELECT {}
                              FROM {}.{}
                             ORDER BY {} ASC""".format(','.join(escaped_columns),
                                                       escaped_schema, escaped_table,
                                                       replication_key)

        rows_saved = 0
        LOGGER.info("select %s", select_sql)
        for row in cur.execute(select_sql):
            record_message = common.row_to_singer_message(stream, row, stream_version,
                                                          desired_columns, time_extracted)
            singer.write_message(record_message)
            rows_saved = rows_saved + 1

            # Picking a replication_key with NULL values will result in it ALWAYS being synced, which is not great.
            # Even worse would be allowing the NULL value to enter into the state.
            if record_message.record[replication_key] is not None:
                state = singer.write_bookmark(state,
                                              stream.tap_stream_id,
                                              'replication_key_value',
                                              record_message.record[replication_key])

            if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

            counter.increment()

    cur.close()
    connection.close()
    return state
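# Illustrative only: for a hypothetical stream SALES.ORDERS with replication key
# UPDATED_AT and an existing bookmark, the query built above would look roughly
# like the following. The exact literal on the right-hand side of ">=" depends on
# common.prepare_where_clause_arg and the column's sql-datatype, so the
# TO_TIMESTAMP_TZ cast shown here is an assumption.
#
#   SELECT "ID","UPDATED_AT"
#     FROM SALES.ORDERS
#    WHERE UPDATED_AT >= TO_TIMESTAMP_TZ('2021-01-01T00:00:00.000000+00:00')
#    ORDER BY UPDATED_AT ASC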
def sync_collection(client, stream, state, stream_projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting oplog sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')
    collection_name = stream.get("table_name")
    stream_state = state.get('bookmarks', {}).get(tap_stream_id)

    oplog_ts = timestamp.Timestamp(stream_state['oplog_ts_time'],
                                   stream_state['oplog_ts_inc'])

    # Write activate version message
    version = common.get_stream_version(tap_stream_id, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=version)
    singer.write_message(activate_version_message)

    time_extracted = utils.now()
    rows_saved = 0
    start_time = time.time()

    oplog_query = {'ts': {'$gte': oplog_ts}}

    projection = transform_projection(stream_projection)

    oplog_replay = stream_projection is None

    LOGGER.info('Querying %s with:\n\tFind Parameters: %s\n\tProjection: %s\n\toplog_replay: %s',
                tap_stream_id, oplog_query, projection, oplog_replay)

    update_buffer = set()

    # consider adding oplog_replay, but this would require removing the projection
    # default behavior is a non_tailable cursor but we might want a tailable one
    # regardless of whether it's long-lived or not.
    with client.local.oplog.rs.find(oplog_query,
                                    projection,
                                    sort=[('$natural', pymongo.ASCENDING)],
                                    oplog_replay=oplog_replay) as cursor:
        for row in cursor:
            # assertions that mongo is respecting the ts query and sort order
            if row.get('ts') and row.get('ts') < oplog_ts:
                raise common.MongoAssertionException(
                    "Mongo is not honoring the query param")
            if row.get('ts') and row.get('ts') < timestamp.Timestamp(stream_state['oplog_ts_time'],
                                                                     stream_state['oplog_ts_inc']):
                raise common.MongoAssertionException(
                    "Mongo is not honoring the sort ascending param")

            if row.get('ns') != '{}.{}'.format(database_name, collection_name):
                if row.get('ts'):
                    state = update_bookmarks(state, tap_stream_id, row['ts'])
                continue

            row_op = row['op']
            if row_op == 'i':
                record_message = common.row_to_singer_record(stream, row['o'],
                                                             version, time_extracted)
                singer.write_message(record_message)
                rows_saved += 1
            elif row_op == 'u':
                update_buffer.add(row['o2']['_id'])
            elif row_op == 'd':
                # remove update from buffer if that document has been deleted
                if row['o']['_id'] in update_buffer:
                    update_buffer.remove(row['o']['_id'])

                # Delete ops only contain the _id of the row deleted
                row['o'][SDC_DELETED_AT] = row['ts']

                record_message = common.row_to_singer_record(stream, row['o'],
                                                             version, time_extracted)
                singer.write_message(record_message)
                rows_saved += 1

            state = update_bookmarks(state, tap_stream_id, row['ts'])

            # flush buffer if it has filled up
            if len(update_buffer) >= MAX_UPDATE_BUFFER_LENGTH:
                for buffered_row in flush_buffer(client, update_buffer, stream_projection,
                                                 database_name, collection_name):
                    record_message = common.row_to_singer_record(stream, buffered_row,
                                                                 version, time_extracted)
                    singer.write_message(record_message)
                    rows_saved += 1
                update_buffer = set()

            # write state every UPDATE_BOOKMARK_PERIOD messages
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                # flush buffer before writing state
                for buffered_row in flush_buffer(client, update_buffer, stream_projection,
                                                 database_name, collection_name):
                    record_message = common.row_to_singer_record(stream, buffered_row,
                                                                 version, time_extracted)
                    singer.write_message(record_message)
                    rows_saved += 1
                update_buffer = set()

                # write state
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        # flush buffer if finished with oplog
        for buffered_row in flush_buffer(client, update_buffer, stream_projection,
                                         database_name, collection_name):
            record_message = common.row_to_singer_record(stream, buffered_row,
                                                         version, time_extracted)
            singer.write_message(record_message)
            rows_saved += 1

    common.COUNTS[tap_stream_id] += rows_saved
    common.TIMES[tap_stream_id] += time.time() - start_time
    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
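# Hedged sketch of the update_bookmarks helper used throughout the oplog loop above.
# It is written to be consistent with how the bookmark is read back at the top of
# sync_collection (oplog_ts_time / oplog_ts_inc); the tap's real implementation may differ.
def update_bookmarks(state, tap_stream_id, ts):
    # ts is a bson.timestamp.Timestamp; persist its two components separately
    state = singer.write_bookmark(state, tap_stream_id, 'oplog_ts_time', ts.time)
    state = singer.write_bookmark(state, tap_stream_id, 'oplog_ts_inc', ts.inc)
    return state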
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream.tap_stream_id, 'version') is None

    # pick a new table version IFF we do not have an xmin in our state
    # the presence of an xmin indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream.tap_stream_id, 'xmin') is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state, stream.tap_stream_id, 'version')

    state = singer.write_bookmark(state, stream.tap_stream_id, 'version', nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream,
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                xmin = singer.get_bookmark(state, stream.tap_stream_id, 'xmin')
                if xmin:
                    LOGGER.info("Resuming Full Table replication %s from xmin %s",
                                nascent_stream_version, xmin)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     WHERE age(xmin::xid) < age('{}'::xid)
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream.table),
                        xmin)
                else:
                    LOGGER.info("Beginning new Full Table replication %s", nascent_stream_version)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream.table))

                LOGGER.info("select %s", select_sql)
                cur.execute(select_sql)

                rows_saved = 0
                rec = cur.fetchone()
                while rec is not None:
                    xmin = rec['xmin']
                    rec = rec[:-1]
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    state = singer.write_bookmark(state, stream.tap_stream_id, 'xmin', xmin)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                    counter.increment()
                    rec = cur.fetchone()

    # once we have completed the full table replication, discard the xmin bookmark.
    # the xmin bookmark only comes into play when a full table replication is interrupted
    state = singer.write_bookmark(state, stream.tap_stream_id, 'xmin', None)

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
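# Illustrative state shape (stream id and values are hypothetical) while an
# interrupted full table sync is being resumed by the function above; the xmin
# bookmark written in the fetch loop is what allows resumption, and it is
# cleared once the table completes:
#
# {
#   "bookmarks": {
#     "public-orders": {
#       "version": 1612345678901,
#       "xmin": 567890
#     }
#   }
# }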
def sync_tables(conn_info, logical_streams, state, end_lsn, state_file):
    lsn_committed = min([get_bookmark(state, s['tap_stream_id'], 'lsn') for s in logical_streams])
    start_lsn = lsn_committed
    lsn_to_flush = None
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    lsn_last_processed = None
    lsn_currently_processing = None
    lsn_received_timestamp = None
    lsn_processed_count = 0
    logical_poll_total_seconds = conn_info['logical_poll_total_seconds'] or 300
    poll_interval = 10
    poll_timestamp = None

    selected_tables = []
    for s in logical_streams:
        selected_tables.append("{}.{}".format(s['metadata'][0]['metadata']['schema-name'],
                                              s['table_name']))

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    # Create replication connection and cursor
    conn = post_db.open_connection(conn_info, True)
    cur = conn.cursor()

    try:
        LOGGER.info("{} : Starting log streaming at {} to {} (slot {})".format(
            datetime.datetime.utcnow(), int_to_lsn(start_lsn), int_to_lsn(end_lsn), slot))
        cur.start_replication(slot_name=slot,
                              decode=True,
                              start_lsn=start_lsn,
                              options={
                                  'write-in-chunks': 1,
                                  'add-tables': ','.join(selected_tables)
                              })
    except psycopg2.ProgrammingError:
        raise Exception("Unable to start replication with logical replication (slot {})".format(slot))

    # Emulate some behaviour of pg_recvlogical
    LOGGER.info("{} : Confirming write up to 0/0, flush to 0/0".format(datetime.datetime.utcnow()))
    cur.send_feedback(write_lsn=0, flush_lsn=0, reply=True)
    time.sleep(1)

    lsn_received_timestamp = datetime.datetime.utcnow()
    poll_timestamp = datetime.datetime.utcnow()

    while True:
        # Disconnect when no data received for logical_poll_total_seconds
        # needs to be long enough to wait for the largest single wal payload to avoid unplanned timeouts
        poll_duration = (datetime.datetime.utcnow() - lsn_received_timestamp).total_seconds()
        if poll_duration > logical_poll_total_seconds:
            LOGGER.info("{} : Breaking - {} seconds of polling with no data".format(
                datetime.datetime.utcnow(), poll_duration))
            break

        try:
            msg = cur.read_message()
        except Exception as e:
            LOGGER.error("{} : {}".format(datetime.datetime.utcnow(), e))
            raise

        if msg:
            if msg.data_start > end_lsn:
                LOGGER.info("{} : Breaking - current {} is past end_lsn {}".format(
                    datetime.datetime.utcnow(), int_to_lsn(msg.data_start), int_to_lsn(end_lsn)))
                break

            state = consume_message(logical_streams, state, msg, time_extracted, conn_info, end_lsn)

            # When using wal2json with write-in-chunks, multiple messages can have the same lsn
            # This is to ensure we only flush to lsn that has completed entirely
            if lsn_currently_processing is None:
                lsn_currently_processing = msg.data_start
                LOGGER.info("{} : First message received is {} at {}".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_currently_processing),
                    datetime.datetime.utcnow()))

                # Flush Postgres wal up to lsn committed in previous run, or first lsn received in this run
                lsn_to_flush = lsn_committed
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info("{} : Confirming write up to {}, flush to {}".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_to_flush), int_to_lsn(lsn_to_flush)))
                cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush, reply=True)

            elif int(msg.data_start) > lsn_currently_processing:
                lsn_last_processed = lsn_currently_processing
                lsn_currently_processing = msg.data_start
                lsn_received_timestamp = datetime.datetime.utcnow()
                lsn_processed_count = lsn_processed_count + 1
                if lsn_processed_count >= UPDATE_BOOKMARK_PERIOD:
                    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                    lsn_processed_count = 0

        # When data is received, and when data is not received, a keep-alive poll needs to be returned to PostgreSQL
        if datetime.datetime.utcnow() >= (poll_timestamp + datetime.timedelta(seconds=poll_interval)):
            if lsn_currently_processing is None:
                LOGGER.info("{} : Sending keep-alive message to source server "
                            "(last message received was {} at {})".format(
                                datetime.datetime.utcnow(), int_to_lsn(lsn_last_processed),
                                lsn_received_timestamp))
                cur.send_feedback()
            elif state_file is None:
                LOGGER.info("{} : Sending keep-alive message to source server "
                            "(last message received was {} at {})".format(
                                datetime.datetime.utcnow(), int_to_lsn(lsn_last_processed),
                                lsn_received_timestamp))
                cur.send_feedback()
            else:
                # Read lsn_committed currently captured in state file on disk
                lsn_committed = min([get_bookmark(utils.load_json(state_file), s['tap_stream_id'], 'lsn')
                                     for s in logical_streams])
                lsn_to_flush = lsn_committed
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info("{} : Confirming write up to {}, flush to {} "
                            "(last message received was {} at {})".format(
                                datetime.datetime.utcnow(), int_to_lsn(lsn_to_flush),
                                int_to_lsn(lsn_to_flush), int_to_lsn(lsn_last_processed),
                                lsn_received_timestamp))
                cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush, reply=True)

            poll_timestamp = datetime.datetime.utcnow()

    # Close replication connection and cursor
    cur.close()
    conn.close()

    if lsn_last_processed:
        if lsn_committed > lsn_last_processed:
            lsn_last_processed = lsn_committed
            LOGGER.info("Current lsn_last_processed {} is older than lsn_committed {}".format(
                int_to_lsn(lsn_last_processed), int_to_lsn(lsn_committed)))

        for s in logical_streams:
            LOGGER.info("updating bookmark for stream {} to lsn = {} ({})".format(
                s['tap_stream_id'], lsn_last_processed, int_to_lsn(lsn_last_processed)))
            state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn', lsn_last_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
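# Hedged sketch of an int_to_lsn helper matching the calls above: it renders the
# integer LSN used internally in the standard PostgreSQL X/Y hex notation. This is
# a plausible implementation consistent with that notation, not necessarily the tap's own.
def int_to_lsn(lsn):
    if lsn is None:
        return None
    # high 32 bits identify the WAL segment file, low 32 bits the byte offset within it
    return "{:X}/{:X}".format(lsn >> 32, lsn & 0xFFFFFFFF)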