def sync_binlog_streams(mysql_conn, binlog_catalog, config, state):
    """Emit a SCHEMA message per binlog stream, then replicate them all.

    Nothing happens when ``binlog_catalog`` has no streams. Otherwise the
    stream list is passed as-is to ``binlog.sync_binlog_stream`` and the
    whole replication is measured by the ``sync_binlog`` job timer.
    """
    streams = binlog_catalog.streams
    if not streams:
        return

    for catalog_entry in streams:
        write_schema_message(catalog_entry)

    with metrics.job_timer("sync_binlog"):
        binlog.sync_binlog_stream(mysql_conn, config, streams, state)
def sync_binlog_streams(mysql_conn, binlog_catalog, config, state):
    """Emit a SCHEMA message per binlog stream, then replicate them all.

    Nothing happens when ``binlog_catalog`` has no streams. Otherwise a
    streams map is generated from the stream list inside the ``sync_binlog``
    job timer and passed to ``binlog.sync_binlog_stream``.
    """
    streams = binlog_catalog.streams
    if not streams:
        return

    for catalog_entry in streams:
        write_schema_message(catalog_entry)

    with metrics.job_timer('sync_binlog'):
        streams_map = binlog.generate_streams_map(streams)
        binlog.sync_binlog_stream(mysql_conn, config, streams_map, state)
def do_sync_incremental(mysql_conn, catalog_entry, state, columns):
    """Sync one stream with INCREMENTAL replication and emit a STATE message.

    The replication key is read from the stream-level (``()``) entry of the
    catalog metadata and is passed as the bookmark property of the SCHEMA
    message.

    Raises:
        Exception: if the stream metadata carries no ``replication-key``.
    """
    LOGGER.info("Stream %s is using incremental replication", catalog_entry.stream)

    metadata_by_breadcrumb = metadata.to_map(catalog_entry.metadata)
    stream_level_metadata = metadata_by_breadcrumb.get((), {})
    replication_key = stream_level_metadata.get('replication-key')

    if not replication_key:
        error_message = (
            f"Cannot use INCREMENTAL replication for table ({catalog_entry.stream}) "
            f"without a replication key."
        )
        raise Exception(error_message)

    write_schema_message(
        catalog_entry=catalog_entry,
        bookmark_properties=[replication_key],
    )

    incremental.sync_table(mysql_conn, catalog_entry, state, columns)

    # Emit a copy so later mutations of ``state`` cannot alter the message.
    state_snapshot = copy.deepcopy(state)
    singer.write_message(singer.StateMessage(value=state_snapshot))
def do_sync_full_table(mysql_conn, catalog_entry, state, columns):
    """Sync one stream with FULL_TABLE replication and emit a STATE message.

    After the table sync the ``version`` bookmark is cleared and the
    ``initial_full_table_complete`` bookmark is set to ``True``.
    """
    LOGGER.info("Stream %s is using full table replication", catalog_entry.stream)

    write_schema_message(catalog_entry)

    stream_id = catalog_entry.tap_stream_id
    version = common.get_stream_version(stream_id, state)

    full_table.sync_table(mysql_conn, catalog_entry, state, columns, version)

    # ``initial_full_table_complete`` is the preferred marker going forward,
    # so the older ``version`` bookmark is dropped.
    singer.clear_bookmark(state, stream_id, "version")
    state = singer.write_bookmark(state, stream_id, "initial_full_table_complete", True)

    state_snapshot = copy.deepcopy(state)
    singer.write_message(singer.StateMessage(value=state_snapshot))
def do_sync_historical_binlog(mysql_conn, catalog_entry, state, columns, use_gtid: bool, engine: str):
    """Run (or resume) the initial full-table sync of a LOG_BASED stream.

    The binlog configuration is verified first; the GTID configuration is
    verified as well when ``use_gtid`` is set and ``engine`` equals
    ``MYSQL_ENGINE``.

    A sync is resumed when the state already holds ``max_pk_values`` together
    with a binlog position (a ``gtid`` bookmark when ``use_gtid`` is set, or
    both ``log_file`` and ``log_pos``). Otherwise a fresh sync starts: the
    current binlog position is fetched up front and stored in the bookmarks
    either before the table sync (auto-incrementing primary keys) or after it.

    Raises:
        Exception: if the stream is a view.
    """
    binlog.verify_binlog_config(mysql_conn)

    if use_gtid and engine == MYSQL_ENGINE:
        binlog.verify_gtid_config(mysql_conn)

    if common.get_is_view(catalog_entry):
        raise Exception(f"Unable to replicate stream({catalog_entry.stream}) with binlog because it is a view.")

    stream_id = catalog_entry.tap_stream_id

    log_file = singer.get_bookmark(state, stream_id, 'log_file')
    log_pos = singer.get_bookmark(state, stream_id, 'log_pos')
    gtid = singer.get_bookmark(state, stream_id, 'gtid') if use_gtid else None
    max_pk_values = singer.get_bookmark(state, stream_id, 'max_pk_values')

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(stream_id, state)

    has_position = (use_gtid and gtid) or (log_file and log_pos)
    if max_pk_values and has_position:
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s", stream_id)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
        return

    LOGGER.info("Performing initial full table sync for LOG_BASED stream %s", stream_id)

    state = singer.write_bookmark(state, stream_id, 'initial_binlog_complete', False)

    current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(mysql_conn)
    current_gtid = binlog.fetch_current_gtid_pos(mysql_conn, engine) if use_gtid else None

    state = singer.write_bookmark(state, stream_id, 'version', stream_version)

    # Bookmarks describing the binlog position captured above; the gtid is
    # only recorded when one was actually fetched.
    position_bookmarks = [('log_file', current_log_file), ('log_pos', current_log_pos)]
    if current_gtid:
        position_bookmarks.append(('gtid', current_gtid))

    # With an incrementing PK the position must be saved across FULL_TABLE
    # syncs, so it is written before the table sync; otherwise it is written
    # once the table sync has finished.
    save_position_first = full_table.pks_are_auto_incrementing(mysql_conn, catalog_entry)

    if save_position_first:
        for bookmark_key, bookmark_value in position_bookmarks:
            state = singer.write_bookmark(state, stream_id, bookmark_key, bookmark_value)

    full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

    if not save_position_first:
        for bookmark_key, bookmark_value in position_bookmarks:
            state = singer.write_bookmark(state, stream_id, bookmark_key, bookmark_value)
def do_sync_historical_binlog(mysql_conn, catalog_entry, state, columns):
    """Run (or resume) the initial full-table sync of a LOG_BASED stream.

    A sync is resumed when the state already holds ``log_file``, ``log_pos``
    and ``max_pk_values`` bookmarks. Otherwise a fresh sync starts: the
    current binlog file and position are fetched up front and stored in the
    bookmarks either before the table sync (auto-incrementing primary keys)
    or after it.

    Raises:
        Exception: if the stream is a view.
    """
    binlog.verify_binlog_config(mysql_conn)

    if common.get_is_view(catalog_entry):
        raise Exception(
            f"Unable to replicate stream({catalog_entry.stream}) with binlog because it is a view."
        )

    stream_id = catalog_entry.tap_stream_id

    log_file = singer.get_bookmark(state, stream_id, "log_file")
    log_pos = singer.get_bookmark(state, stream_id, "log_pos")
    max_pk_values = singer.get_bookmark(state, stream_id, "max_pk_values")

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(stream_id, state)

    if log_file and log_pos and max_pk_values:
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s", stream_id)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
        return

    LOGGER.info("Performing initial full table sync for LOG_BASED stream %s", stream_id)

    state = singer.write_bookmark(state, stream_id, "initial_binlog_complete", False)

    current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(mysql_conn)

    state = singer.write_bookmark(state, stream_id, "version", stream_version)

    position_bookmarks = (("log_file", current_log_file), ("log_pos", current_log_pos))

    # With an incrementing PK, log_file and log_pos must be saved across
    # FULL_TABLE syncs, so they are written before the table sync; otherwise
    # they are written once the table sync has finished.
    save_position_first = full_table.pks_are_auto_incrementing(mysql_conn, catalog_entry)

    if save_position_first:
        for bookmark_key, bookmark_value in position_bookmarks:
            state = singer.write_bookmark(state, stream_id, bookmark_key, bookmark_value)

    full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

    if not save_position_first:
        for bookmark_key, bookmark_value in position_bookmarks:
            state = singer.write_bookmark(state, stream_id, bookmark_key, bookmark_value)
def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config: Dict):
    """Consume binlog events from ``reader`` and dispatch row events per stream.

    Rotate events move the bookmarks to the next binlog file and position.
    Events for tables missing from ``binlog_streams_map`` are only counted.
    For known tables, columns present in the event but absent from the
    catalog schema trigger a discovery of that table; the stream map is
    refreshed when the discovered schema differs. Bookmarks are updated and a
    STATE message is emitted every ``UPDATE_BOOKMARK_PERIOD`` saved rows or
    skipped events, and the bookmarks are updated once more after the loop.
    """
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    # Position of the server when the sync starts; reading stops once the
    # reader reaches it in the same file.
    current_log_file, current_log_pos = fetch_current_log_file_and_pos(mysql_conn)

    log_file = None
    log_pos = None

    # Row-event classes paired with their handlers, tried in this order.
    row_event_handlers = (
        (WriteRowsEvent, handle_write_rows_event),
        (UpdateRowsEvent, handle_update_rows_event),
        (DeleteRowsEvent, handle_delete_rows_event),
    )

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema, binlog_event.table)
            map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = map_entry.get('catalog_entry')
            columns = map_entry.get('desired_columns')

            if catalog_entry:
                # Columns of the event that the catalog schema does not know.
                # A dropped column shows up in the event under a placeholder
                # such as __dropped_col_XY__; those are left out of the diff.
                event_columns = {
                    name
                    for name in get_db_column_types(binlog_event).keys()
                    if not re.match(r'__dropped_col_\d+__', name)
                }
                diff = event_columns.difference(catalog_entry.schema.properties.keys())

                # Extra columns in the event: rediscover the table and
                # refresh the catalog entry.
                if diff:
                    LOGGER.debug('Difference between event and schema: %s', diff)
                    LOGGER.info('Running discovery ... ')

                    # Discovery is limited to the table of this event.
                    discovered = discover_catalog(mysql_conn,
                                                  config.get('filter_dbs'),
                                                  catalog_entry.table)
                    new_catalog_entry = discovered.streams[0]

                    selected = {
                        name
                        for name in new_catalog_entry.schema.properties
                        if common.property_is_selected(new_catalog_entry, name)
                    }

                    # Discovery names the stream after the table; align it
                    # with the naming produced by "resolve_catalog".
                    new_catalog_entry.stream = tap_stream_id

                    # Columns that have to be selected.
                    new_columns = desired_columns(selected, new_catalog_entry.schema)

                    # Remove unsupported properties from the schema.
                    for name in set(new_catalog_entry.schema.properties.keys()):
                        if name not in new_columns:
                            new_catalog_entry.schema.properties.pop(name, None)

                    # Adds the _sdc_deleted_at column.
                    new_columns = add_automatic_properties(new_catalog_entry, list(new_columns))

                    # Only a changed schema is sent to the target and stored
                    # in the stream map.
                    if new_catalog_entry.schema.properties != catalog_entry.schema.properties:
                        write_schema_message(catalog_entry=new_catalog_entry)
                        catalog_entry = new_catalog_entry

                        refreshed_entry = binlog_streams_map[tap_stream_id]
                        refreshed_entry['catalog_entry'] = new_catalog_entry
                        refreshed_entry = binlog_streams_map[tap_stream_id]
                        refreshed_entry['desired_columns'] = new_columns
                        columns = new_columns

                for event_type, handle_rows in row_event_handlers:
                    if isinstance(binlog_event, event_type):
                        rows_saved = handle_rows(binlog_event,
                                                 catalog_entry,
                                                 state,
                                                 columns,
                                                 rows_saved,
                                                 time_extracted)
                        break
                else:
                    LOGGER.debug(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema,
                        binlog_event.table)
            else:
                events_skipped += 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped,
                        rows_saved)

        # Track the reader position after every processed binlog event.
        log_file = reader.log_file
        log_pos = reader.log_pos

        # python-mysql-replication's iterator should end on an EOF packet,
        # but some MySQL servers seem not to send one, which would hang the
        # replication; stop explicitly at the starting server position.
        if current_log_file == log_file and log_pos >= current_log_pos:
            break

        # Periodically update the bookmarks and emit a STATE message.
        checkpoint_due = any(
            counter and counter % UPDATE_BOOKMARK_PERIOD == 0
            for counter in (rows_saved, events_skipped)
        )
        if checkpoint_due:
            state = update_bookmarks(state, binlog_streams_map, log_file, log_pos)
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # Final bookmark update, pointing at the last processed binlog event.
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos)
def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config: Dict):
    """Consume binlog events from ``reader`` and dispatch row events per stream.

    Rotate events move the bookmarks to the next binlog file and position.
    Events for tables missing from ``binlog_streams_map`` are only counted.
    For known tables, columns present in the event but absent from the
    catalog schema trigger a discovery of that table, after which the stream
    map is refreshed and a new SCHEMA message is sent. Bookmarks are updated
    and a STATE message is emitted every ``UPDATE_BOOKMARK_PERIOD`` saved rows
    or skipped events, and the bookmarks are updated once more after the loop.
    """
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    # Position of the server when the sync starts; reading stops once the
    # reader reaches it in the same file.
    current_log_file, current_log_pos = fetch_current_log_file_and_pos(mysql_conn)

    log_file = None
    log_pos = None

    # Row-event classes paired with their handlers, tried in this order.
    row_event_handlers = (
        (WriteRowsEvent, handle_write_rows_event),
        (UpdateRowsEvent, handle_update_rows_event),
        (DeleteRowsEvent, handle_delete_rows_event),
    )

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema, binlog_event.table)
            map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = map_entry.get('catalog_entry')
            columns = map_entry.get('desired_columns')

            if catalog_entry:
                # Columns of the event that the catalog schema does not know.
                event_columns = set(get_db_column_types(binlog_event).keys())
                diff = event_columns.difference(catalog_entry.schema.properties.keys())

                # Extra columns in the event: rediscover the table and
                # refresh the catalog entry.
                if diff:
                    # Discovery is limited to the table of this event.
                    discovered = discover_catalog(mysql_conn,
                                                  config.get('filter_dbs'),
                                                  catalog_entry.table)
                    catalog_entry = discovered.streams[0]

                    # Discovery names the stream after the table; align it
                    # with the naming produced by "resolve_catalog".
                    catalog_entry.stream = tap_stream_id
                    columns = list(catalog_entry.schema.properties.keys())

                    # Adds the _sdc_deleted_at column.
                    add_automatic_properties(catalog_entry, columns)

                    # Keep the stream map in step with the new catalog entry.
                    binlog_streams_map[tap_stream_id]['catalog_entry'] = catalog_entry
                    binlog_streams_map[tap_stream_id]['desired_columns'] = columns

                    # Send the new schema to the target.
                    write_schema_message(catalog_entry=catalog_entry)

                for event_type, handle_rows in row_event_handlers:
                    if isinstance(binlog_event, event_type):
                        rows_saved = handle_rows(binlog_event,
                                                 catalog_entry,
                                                 state,
                                                 columns,
                                                 rows_saved,
                                                 time_extracted)
                        break
                else:
                    LOGGER.debug(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema,
                        binlog_event.table)
            else:
                events_skipped += 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped,
                        rows_saved)

        # Track the reader position after every processed binlog event.
        log_file = reader.log_file
        log_pos = reader.log_pos

        # python-mysql-replication's iterator should end on an EOF packet,
        # but some MySQL servers seem not to send one, which would hang the
        # replication; stop explicitly at the starting server position.
        if current_log_file == log_file and log_pos >= current_log_pos:
            break

        # Periodically update the bookmarks and emit a STATE message.
        checkpoint_due = any(
            counter and counter % UPDATE_BOOKMARK_PERIOD == 0
            for counter in (rows_saved, events_skipped)
        )
        if checkpoint_due:
            state = update_bookmarks(state, binlog_streams_map, log_file, log_pos)
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # Final bookmark update, pointing at the last processed binlog event.
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos)
def _run_binlog_sync(mysql_conn: MySQLConnection,
                     reader: BinLogStreamReader,
                     binlog_streams_map: Dict,
                     state: Dict,
                     config: Dict,
                     end_log_file: str,
                     end_log_pos: int):
    """Consume binlog events from ``reader`` until ``end_log_file``/``end_log_pos``.

    Rotate events move the bookmarks to the next binlog file and position;
    GTID events update the tracked gtid, the bookmarks and the reader's
    ``auto_position``. Events for tables missing from ``binlog_streams_map``
    are only counted. For known tables, columns present in the event but
    absent from the catalog schema either get remembered as ignored or
    trigger a discovery of that table, depending on ``should_run_discovery``.
    Bookmarks are updated and a STATE message is emitted every
    ``UPDATE_BOOKMARK_PERIOD`` processed row events or skipped events, and the
    bookmarks are updated once more after the loop.
    """
    processed_rows_events = 0
    events_skipped = 0

    log_file = None
    log_pos = None

    # Initial gtid; it was set when the reader instance was created.
    gtid_pos = reader.auto_position

    # Columns detected during the sync that must be ignored because their
    # types are unsupported. Remembering them avoids re-checking the same
    # column for every event.
    ignored_columns = set()

    # Row-event classes paired with their handlers, tried in this order.
    row_event_handlers = (
        (WriteRowsEvent, handle_write_rows_event),
        (UpdateRowsEvent, handle_update_rows_event),
        (DeleteRowsEvent, handle_delete_rows_event),
    )

    # The loop ends when the reader has no more events or when the end
    # position (the master's) is reached.
    for binlog_event in reader:
        # Current binlog file and position of the reader.
        log_file = reader.log_file
        log_pos = reader.log_pos

        # python-mysql-replication's iterator should end on an EOF packet,
        # but some MySQL servers seem not to send one, which would hang the
        # replication; stop explicitly at the end position.
        past_end_file = log_file > end_log_file
        at_end_pos = end_log_file == log_file and log_pos >= end_log_pos
        if past_end_file or at_end_pos:
            LOGGER.info(
                'BinLog reader (file: %s, pos:%s) has reached or exceeded end position, exiting!',
                log_file,
                log_pos)

            # A mass operation (inserts, updates, deletes) may start right
            # after the master position was captured, leaving that position
            # behind the reader; the next run would then skip everything
            # between end_log_file and log_pos. Falling back to the master's
            # position prevents that data loss.
            log_file = end_log_file
            log_pos = end_log_pos
            break

        if isinstance(binlog_event, RotateEvent):
            LOGGER.debug('RotateEvent: log_file=%s, log_pos=%d',
                         binlog_event.next_binlog,
                         binlog_event.position)

            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position,
                                     gtid_pos)

        elif isinstance(binlog_event, (MariadbGtidEvent, GtidEvent)):
            gtid_pos = binlog_event.gtid

            LOGGER.debug('%s: gtid=%s', binlog_event.__class__.__name__, gtid_pos)

            state = update_bookmarks(state, binlog_streams_map, log_file, log_pos, gtid_pos)

            # pymysqlreplication behaves oddly with GTID, see
            # https://github.com/noplay/python-mysql-replication/issues/367
            # Moving the reader's auto-position to the newest gtid means the
            # binlog is not consumed again from the old GTID position when
            # the connection to the server is lost.
            reader.auto_position = gtid_pos

        else:
            time_extracted = utils.now()

            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema, binlog_event.table)
            map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = map_entry.get('catalog_entry')
            columns = map_entry.get('desired_columns')

            if catalog_entry:
                # Columns of the event that the catalog schema does not know.
                diff = __get_diff_in_columns_list(binlog_event,
                                                  catalog_entry.schema.properties.keys(),
                                                  ignored_columns)

                # Extra columns in the event: run discovery if needed and
                # refresh the catalog entry.
                if diff:
                    LOGGER.info('Stream `%s`: Difference detected between event and schema: %s',
                                tap_stream_id,
                                diff)

                    md_map = metadata.to_map(catalog_entry.metadata)

                    if should_run_discovery(diff, md_map):
                        LOGGER.info('Stream `%s`: Running discovery ... ', tap_stream_id)

                        # Discovery is limited to the table of this event.
                        discovered = discover_catalog(mysql_conn,
                                                      config.get('filter_dbs'),
                                                      catalog_entry.table)
                        new_catalog_entry = discovered.streams[0]

                        selected = {
                            name
                            for name in new_catalog_entry.schema.properties
                            if common.property_is_selected(new_catalog_entry, name)
                        }

                        # Discovery names the stream after the table; align
                        # it with the naming produced by "resolve_catalog".
                        new_catalog_entry.stream = tap_stream_id

                        # Columns that have to be selected.
                        new_columns = desired_columns(selected, new_catalog_entry.schema)

                        # Remove unsupported properties from the schema.
                        for name in set(new_catalog_entry.schema.properties.keys()):
                            if name not in new_columns:
                                new_catalog_entry.schema.properties.pop(name, None)

                        # Adds the _sdc_deleted_at column.
                        new_columns = add_automatic_properties(new_catalog_entry, list(new_columns))

                        # Only a changed schema is sent to the target and
                        # stored in the stream map.
                        if new_catalog_entry.schema.properties != catalog_entry.schema.properties:
                            write_schema_message(catalog_entry=new_catalog_entry)
                            catalog_entry = new_catalog_entry

                            binlog_streams_map[tap_stream_id]['catalog_entry'] = new_catalog_entry
                            binlog_streams_map[tap_stream_id]['desired_columns'] = new_columns
                            columns = new_columns
                    else:
                        LOGGER.info('Stream `%s`: Not running discovery. Ignoring all detected columns in %s',
                                    tap_stream_id,
                                    diff)
                        ignored_columns = ignored_columns.union(diff)

                for event_type, handle_rows in row_event_handlers:
                    if isinstance(binlog_event, event_type):
                        processed_rows_events = handle_rows(binlog_event,
                                                            catalog_entry,
                                                            state,
                                                            columns,
                                                            processed_rows_events,
                                                            time_extracted)
                        break
                else:
                    LOGGER.debug(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema,
                        binlog_event.table)
            else:
                events_skipped += 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped,
                        processed_rows_events)

        # Periodically update the bookmarks and emit a STATE message.
        checkpoint_due = any(
            counter and counter % UPDATE_BOOKMARK_PERIOD == 0
            for counter in (processed_rows_events, events_skipped)
        )
        if checkpoint_due:
            state = update_bookmarks(state, binlog_streams_map, log_file, log_pos, gtid_pos)
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    LOGGER.info('Processed %s rows', processed_rows_events)

    # Final bookmark update, pointing at the last processed binlog event.
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos, gtid_pos)