def resolve_catalog(discovered_catalog, streams_to_sync):
    result = Catalog(streams=[])

    # Iterate over the streams in the input catalog and match each one up
    # with the same stream in the discovered catalog.
    for catalog_entry in streams_to_sync:
        catalog_metadata = metadata.to_map(catalog_entry.metadata)
        replication_key = catalog_metadata.get((), {}).get("replication-key")

        discovered_table = discovered_catalog.get_stream(catalog_entry.tap_stream_id)
        database_name = common.get_database_name(catalog_entry)

        if not discovered_table:
            LOGGER.warning("Database %s table %s was selected but does not exist",
                           database_name, catalog_entry.table)
            continue

        selected = {k for k, v in catalog_entry.schema.properties.items()
                    if common.property_is_selected(catalog_entry, k) or k == replication_key}

        # These are the columns we need to select
        columns = desired_columns(selected, discovered_table.schema)

        result.streams.append(CatalogEntry(
            tap_stream_id=catalog_entry.tap_stream_id,
            metadata=catalog_entry.metadata,
            stream=catalog_entry.tap_stream_id,
            table=catalog_entry.table,
            schema=Schema(
                type="object",
                properties={col: discovered_table.schema.properties[col]
                            for col in columns},
            ),
        ))

    return result
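
# --- Illustrative usage sketch (assumption, not part of the original module) ---
# A minimal example of how resolve_catalog might be driven: re-discover the schemas currently
# present on the server, then keep only the selected streams and columns that still exist.
# The two-argument discover_catalog() call and the _example_* name are assumptions made for
# illustration; only resolve_catalog() itself comes from the code above.
def _example_build_resolved_catalog(mysql_conn, config, selected_streams):
    discovered = discover_catalog(mysql_conn, config.get('filter_dbs'))  # assumed: no table filter
    return resolve_catalog(discovered, selected_streams)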
def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config: Dict):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(mysql_conn)
    log_file = None
    log_pos = None

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug("Skipped %s events so far as they were not for selected tables; %s rows extracted",
                                 events_skipped,
                                 rows_saved)
            else:
                # Compare the event's columns to the schema properties.
                # If a column no longer exists, the event refers to it with a placeholder like
                # __dropped_col_XY__; we don't want these columns to be included in the difference.
                diff = set(filter(lambda k: not re.match(r'__dropped_col_\d+__', k),
                                  get_db_column_types(binlog_event).keys())) \
                    .difference(catalog_entry.schema.properties.keys())

                # If there are additional cols in the event then run discovery and update the catalog
                if diff:
                    LOGGER.debug('Difference between event and schema: %s', diff)
                    LOGGER.info('Running discovery ... ')

                    # run discovery for the current table only
                    new_catalog_entry = discover_catalog(mysql_conn,
                                                         config.get('filter_dbs'),
                                                         catalog_entry.table).streams[0]

                    selected = {k for k, v in new_catalog_entry.schema.properties.items()
                                if common.property_is_selected(new_catalog_entry, k)}

                    # The new catalog entry's "stream" property is the table name; update it to match
                    # the result of the "resolve_catalog" function.
                    new_catalog_entry.stream = tap_stream_id

                    # These are the columns we need to select
                    new_columns = desired_columns(selected, new_catalog_entry.schema)

                    cols = set(new_catalog_entry.schema.properties.keys())

                    # drop unsupported properties from schema
                    for col in cols:
                        if col not in new_columns:
                            new_catalog_entry.schema.properties.pop(col, None)

                    # Add the _sdc_deleted_at col
                    new_columns = add_automatic_properties(new_catalog_entry, list(new_columns))

                    # Send the new schema to the target if it differs from the old one
                    if new_catalog_entry.schema.properties != catalog_entry.schema.properties:
                        write_schema_message(catalog_entry=new_catalog_entry)

                        catalog_entry = new_catalog_entry

                        # update this dictionary while we're at it
                        binlog_streams_map[tap_stream_id]['catalog_entry'] = new_catalog_entry
                        binlog_streams_map[tap_stream_id]['desired_columns'] = new_columns

                        columns = new_columns

                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(binlog_event, catalog_entry, state, columns,
                                                         rows_saved, time_extracted)
                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(binlog_event, catalog_entry, state, columns,
                                                          rows_saved, time_extracted)
                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(binlog_event, catalog_entry, state, columns,
                                                          rows_saved, time_extracted)
                else:
                    LOGGER.debug("Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                                 binlog_event.schema,
                                 binlog_event.table)

        # Update log_file and log_pos after every processed binlog event
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one, causing binlog replication to hang.
        if current_log_file == log_file and log_pos >= current_log_pos:
            break

        # Update singer bookmark and send STATE message periodically
        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
                (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     log_file,
                                     log_pos)
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # Update the singer bookmark one last time to point it at the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state,
                                 binlog_streams_map,
                                 log_file,
                                 log_pos)
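
# --- Illustrative sketch (assumption, not the tap's actual implementation) ---
# fetch_current_log_file_and_pos() is used above to capture the Master's current binlog
# coordinates, which bound how far a run will read. Something along these lines would do it;
# connect_with_backoff() is assumed to yield an open connection to the source database.
def _example_fetch_current_log_file_and_pos(mysql_conn):
    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute('SHOW MASTER STATUS')
            result = cur.fetchone()
            if result is None:
                raise Exception('MySQL binary logging is not enabled.')
            # The first two columns of SHOW MASTER STATUS are File and Position
            return result[0], result[1]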
def _run_binlog_sync(mysql_conn: MySQLConnection, reader: BinLogStreamReader, binlog_streams_map: Dict,
                     state: Dict, config: Dict, end_log_file: str, end_log_pos: int):
    processed_rows_events = 0
    events_skipped = 0

    log_file = None
    log_pos = None
    gtid_pos = reader.auto_position  # initial gtid, we set this when we created the reader's instance

    # A set to hold all columns that are detected as we sync but should be ignored because they are
    # of unsupported types. Saving them here avoids re-checking whether to ignore a column over and over.
    ignored_columns = set()

    # Exit from the loop when the reader either runs out of streams to return or we reach
    # the end position (which is the Master's)
    for binlog_event in reader:
        # get reader current binlog file and position
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one, causing binlog replication to hang.
        if (log_file > end_log_file) or (end_log_file == log_file and log_pos >= end_log_pos):
            LOGGER.info('BinLog reader (file: %s, pos:%s) has reached or exceeded end position, exiting!',
                        log_file,
                        log_pos)

            # There are cases when a mass operation (inserts, updates, deletes) starts right after we get
            # the Master's binlog file and position above, leaving that position behind the stream reader
            # and causing data loss in the next run by skipping everything between end_log_file and log_pos,
            # so we need to set the bookmark back to the Master's position.
            log_file = end_log_file
            log_pos = end_log_pos

            break

        if isinstance(binlog_event, RotateEvent):
            LOGGER.debug('RotateEvent: log_file=%s, log_pos=%d', binlog_event.next_binlog, binlog_event.position)
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position,
                                     gtid_pos)

        elif isinstance(binlog_event, (MariadbGtidEvent, GtidEvent)):
            gtid_pos = binlog_event.gtid
            LOGGER.debug('%s: gtid=%s', binlog_event.__class__.__name__, gtid_pos)

            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     log_file,
                                     log_pos,
                                     gtid_pos)

            # There is strange behavior happening when using GTID in the pymysqlreplication lib,
            # explained here: https://github.com/noplay/python-mysql-replication/issues/367
            # Fix: Updating the reader's auto-position to the newly encountered gtid means we won't have to
            # restart consuming the binlog from an old GTID pos when the connection to the server is lost.
            reader.auto_position = gtid_pos
        else:
            time_extracted = utils.now()

            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped += 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug("Skipped %s events so far as they were not for selected tables; %s rows extracted",
                                 events_skipped,
                                 processed_rows_events)
            else:
                # Compare event's columns to the schema properties
                diff = __get_diff_in_columns_list(binlog_event,
                                                  catalog_entry.schema.properties.keys(),
                                                  ignored_columns)

                # If there are additional cols in the event then run discovery if needed and update the catalog
                if diff:
                    LOGGER.info('Stream `%s`: Difference detected between event and schema: %s',
                                tap_stream_id, diff)

                    md_map = metadata.to_map(catalog_entry.metadata)

                    if not should_run_discovery(diff, md_map):
                        LOGGER.info('Stream `%s`: Not running discovery. Ignoring all detected columns in %s',
                                    tap_stream_id, diff)
                        ignored_columns = ignored_columns.union(diff)
                    else:
                        LOGGER.info('Stream `%s`: Running discovery ... ', tap_stream_id)

                        # run discovery for the current table only
                        new_catalog_entry = discover_catalog(mysql_conn,
                                                             config.get('filter_dbs'),
                                                             catalog_entry.table).streams[0]

                        selected = {k for k, v in new_catalog_entry.schema.properties.items()
                                    if common.property_is_selected(new_catalog_entry, k)}

                        # The new catalog entry's "stream" property is the table name; update it to match
                        # the result of the "resolve_catalog" function.
                        new_catalog_entry.stream = tap_stream_id

                        # These are the columns we need to select
                        new_columns = desired_columns(selected, new_catalog_entry.schema)

                        cols = set(new_catalog_entry.schema.properties.keys())

                        # drop unsupported properties from schema
                        for col in cols:
                            if col not in new_columns:
                                new_catalog_entry.schema.properties.pop(col, None)

                        # Add the _sdc_deleted_at col
                        new_columns = add_automatic_properties(new_catalog_entry, list(new_columns))

                        # Send the new schema to the target if it differs from the old one
                        if new_catalog_entry.schema.properties != catalog_entry.schema.properties:
                            write_schema_message(catalog_entry=new_catalog_entry)

                            catalog_entry = new_catalog_entry

                            # update this dictionary while we're at it
                            binlog_streams_map[tap_stream_id]['catalog_entry'] = new_catalog_entry
                            binlog_streams_map[tap_stream_id]['desired_columns'] = new_columns

                            columns = new_columns

                if isinstance(binlog_event, WriteRowsEvent):
                    processed_rows_events = handle_write_rows_event(binlog_event, catalog_entry, state, columns,
                                                                    processed_rows_events, time_extracted)
                elif isinstance(binlog_event, UpdateRowsEvent):
                    processed_rows_events = handle_update_rows_event(binlog_event, catalog_entry, state, columns,
                                                                     processed_rows_events, time_extracted)
                elif isinstance(binlog_event, DeleteRowsEvent):
                    processed_rows_events = handle_delete_rows_event(binlog_event, catalog_entry, state, columns,
                                                                     processed_rows_events, time_extracted)
                else:
                    LOGGER.debug("Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                                 binlog_event.schema,
                                 binlog_event.table)

        # Update singer bookmark and send STATE message periodically
        if ((processed_rows_events and processed_rows_events % UPDATE_BOOKMARK_PERIOD == 0) or
                (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state,
                                     binlog_streams_map,
                                     log_file,
                                     log_pos,
                                     gtid_pos)
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    LOGGER.info('Processed %s rows', processed_rows_events)

    # Update the singer bookmark one last time to point it at the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state,
                                 binlog_streams_map,
                                 log_file,
                                 log_pos,
                                 gtid_pos)
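
# --- Illustrative driver sketch (assumption, not part of the original module) ---
# Shows how the newer _run_binlog_sync might be invoked: capture the Master's position first so
# the reader has a fixed end point, build a python-mysql-replication BinLogStreamReader, then
# stream events up to that point. connection_settings, server_id, and the resume coordinates are
# placeholders supplied by the caller, not values taken from this module; a GTID-based setup
# would also subscribe to the GTID events handled above.
def _example_sync_binlog_once(mysql_conn, binlog_streams_map, state, config,
                              connection_settings, resume_log_file, resume_log_pos):
    end_log_file, end_log_pos = fetch_current_log_file_and_pos(mysql_conn)

    reader = BinLogStreamReader(
        connection_settings=connection_settings,  # host/port/user/passwd dict (placeholder)
        server_id=int(config['server_id']),       # assumed config key
        only_events=[RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
        log_file=resume_log_file,                  # resume point, e.g. from the Singer state bookmark
        log_pos=resume_log_pos,
        resume_stream=True,
    )
    try:
        _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config,
                         end_log_file, end_log_pos)
    finally:
        reader.close()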