def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    binlog_streams_map = generate_streams_map(binlog_streams)

    for tap_stream_id in binlog_streams_map:
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map, state)

    verify_log_file_exists(mysql_conn, log_file, log_pos)

    if config.get('server_id'):
        server_id = int(config.get('server_id'))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        server_id = fetch_server_id(mysql_conn)
        LOGGER.info("No server_id provided, will use global server_id=%s", server_id)

    connection_wrapper = make_connection_wrapper(config)

    reader = None
    try:
        # Random suffix keeps the slave_uuid unique across concurrent readers
        slave_uuid = "bi-reader-%04x" % random.getrandbits(64)
        reader = BinLogStreamReader(
            connection_settings={},
            server_id=server_id,
            slave_uuid=slave_uuid,
            log_file=log_file,
            log_pos=log_pos,
            resume_stream=True,
            only_events=[RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
            pymysql_wrapper=connection_wrapper,
        )

        LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s", log_file, log_pos)

        _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config)
    finally:
        # BinLogStreamReader doesn't implement the `with` protocol,
        # so try/finally closes the stream from the top. Guard against the
        # constructor raising before `reader` is assigned.
        if reader:
            reader.close()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
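
# _run_binlog_sync is called above but not defined in this excerpt. A minimal,
# hypothetical sketch of its presumed shape, assuming it reuses the event loop
# of the sync_binlog_stream variant further below (dispatch row events to the
# per-event handlers, then advance bookmarks); the `_sketch` suffix marks it as
# illustrative rather than the original implementation.
def _run_binlog_sync_sketch(mysql_conn, reader, binlog_streams_map, state, config):
    time_extracted = utils.now()
    rows_saved = 0

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog, binlog_event.position)
        elif isinstance(binlog_event, WriteRowsEvent):
            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema, binlog_event.table)
            entry = binlog_streams_map.get(tap_stream_id, {})
            if entry.get('catalog_entry'):
                rows_saved = handle_write_rows_event(binlog_event, entry['catalog_entry'], state,
                                                     entry['desired_columns'], rows_saved, time_extracted)
        # UpdateRowsEvent and DeleteRowsEvent would be dispatched analogously.

        state = update_bookmarks(state, binlog_streams_map, reader.log_file, reader.log_pos)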
def sync_table(connection, config, catalog_entry, state, columns):
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id, state)

    log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_file')
    log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_pos')

    verify_binlog_config(connection, catalog_entry)
    verify_log_file_exists(connection, catalog_entry, log_file, log_pos)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'version', stream_version)

    server_id = fetch_server_id(connection)
    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(
        connection_settings={},
        server_id=server_id,
        log_file=log_file,
        log_pos=log_pos,
        resume_stream=True,
        only_events=[RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
        pymysql_wrapper=connection_wrapper,
    )

    table_path = (catalog_entry.database, catalog_entry.stream)
    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s", log_file, log_pos)

    rows_saved = 0

    for binlog_event in reader:
        # The bookmarked position was already processed during the last sync
        if reader.log_file == log_file and reader.log_pos == log_pos:
            LOGGER.info("Skipping event for log_file=%s and log_pos=%s as it was processed last sync",
                        reader.log_file, reader.log_pos)
            continue

        if isinstance(binlog_event, RotateEvent):
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_file', binlog_event.next_binlog)
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_pos', binlog_event.position)
        elif (binlog_event.schema, binlog_event.table) == table_path:
            db_column_types = {c.name: c.type for c in binlog_event.columns}

            if isinstance(binlog_event, WriteRowsEvent):
                for row in binlog_event.rows:
                    filtered_vals = {k: v for k, v in row['values'].items() if k in columns}
                    yield row_to_singer_record(catalog_entry, stream_version, db_column_types,
                                               filtered_vals, time_extracted)
                    rows_saved += 1
            elif isinstance(binlog_event, UpdateRowsEvent):
                for row in binlog_event.rows:
                    filtered_vals = {k: v for k, v in row['after_values'].items() if k in columns}
                    yield row_to_singer_record(catalog_entry, stream_version, db_column_types,
                                               filtered_vals, time_extracted)
                    rows_saved += 1
            elif isinstance(binlog_event, DeleteRowsEvent):
                for row in binlog_event.rows:
                    # Deletes are emitted as records stamped with the event's timestamp
                    event_ts = datetime.datetime.utcfromtimestamp(binlog_event.timestamp).replace(tzinfo=pytz.UTC)
                    vals = row['values']
                    vals[SDC_DELETED_AT] = event_ts

                    filtered_vals = {k: v for k, v in vals.items() if k in columns}
                    yield row_to_singer_record(catalog_entry, stream_version, db_column_types,
                                               filtered_vals, time_extracted)
                    rows_saved += 1

        state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'log_file', reader.log_file)
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'log_pos', reader.log_pos)

        if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
            yield singer.StateMessage(value=copy.deepcopy(state))

    yield singer.StateMessage(value=copy.deepcopy(state))
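
# A hedged usage sketch, not part of the original module: sync_table is a
# generator that yields Singer records and state messages, so a caller must
# drain it and emit each message. All arguments are assumed to be prepared
# elsewhere in the tap; `drain_sync_table` is a hypothetical helper name.
def drain_sync_table(connection, config, catalog_entry, state, columns):
    for message in sync_table(connection, config, catalog_entry, state, columns):
        singer.write_message(message)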
def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    binlog_streams_map = generate_streams_map(binlog_streams)

    for tap_stream_id in binlog_streams_map:
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map, state)

    verify_log_file_exists(mysql_conn, log_file, log_pos)

    if config.get('server_id'):
        server_id = int(config.get('server_id'))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        server_id = fetch_server_id(mysql_conn)
        LOGGER.info("No server_id provided, will use global server_id=%s", server_id)

    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(
        connection_settings={},
        server_id=server_id,
        log_file=log_file,
        log_pos=log_pos,
        resume_stream=True,
        only_events=[RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
        pymysql_wrapper=connection_wrapper,
    )

    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s", log_file, log_pos)

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(mysql_conn)

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog, binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped += 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info("Skipped %s events so far as they were not for selected tables; %s rows extracted",
                                events_skipped, rows_saved)
            else:
                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(binlog_event, catalog_entry, state,
                                                         desired_columns, rows_saved, time_extracted)
                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(binlog_event, catalog_entry, state,
                                                          desired_columns, rows_saved, time_extracted)
                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(binlog_event, catalog_entry, state,
                                                          desired_columns, rows_saved, time_extracted)
                else:
                    LOGGER.info("Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                                binlog_event.schema, binlog_event.table)

        state = update_bookmarks(state, binlog_streams_map, reader.log_file, reader.log_pos)

        # The iterator over python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not
        # send one, causing binlog replication to hang.
        if current_log_file == reader.log_file and reader.log_pos >= current_log_pos:
            break

        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
                (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
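
# update_bookmarks is used above but not defined in this excerpt. A minimal
# sketch of what it plausibly does, given how it is called: write the same
# log_file/log_pos bookmark for every selected stream so they all resume from
# one position. The `_sketch` suffix marks this as an assumption, not the
# original implementation.
def update_bookmarks_sketch(state, binlog_streams_map, log_file, log_pos):
    for tap_stream_id in binlog_streams_map:
        state = singer.write_bookmark(state, tap_stream_id, 'log_file', log_file)
        state = singer.write_bookmark(state, tap_stream_id, 'log_pos', log_pos)
    return state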
def create_binlog_stream_reader(config: Dict, log_file: Optional[str], log_pos: Optional[int],
                                gtid_pos: Optional[str]) -> BinLogStreamReader:
    """
    Create an instance of BinLogStreamReader with the right config.

    Args:
        config: dictionary of the content of tap config.json
        log_file: binlog file name to start replication from (optional if using GTID)
        log_pos: binlog position to start replication from (optional if using GTID)
        gtid_pos: GTID position to start replication from (optional if using log_file & log_pos)

    Returns:
        Instance of BinLogStreamReader
    """
    if config.get('server_id'):
        server_id = int(config.get('server_id'))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        # generate a random server id for this slave, within the unsigned 32-bit range
        server_id = random.randint(1, 2 ** 32 - 1)
        LOGGER.info("Using randomly generated server_id=%s", server_id)

    engine = config['engine']

    kwargs = {
        'connection_settings': {},
        'pymysql_wrapper': make_connection_wrapper(config),
        'is_mariadb': connection.MARIADB_ENGINE == engine,
        'server_id': server_id,  # slave server ID
        'report_slave': socket.gethostname() or 'pipelinewise',  # so this slave appears in SHOW SLAVE HOSTS
        'only_events': [WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
    }

    # Only fetch events pertaining to the schemas in filter_db
    if config.get('filter_db'):
        kwargs['only_schemas'] = config['filter_db'].split(',')

    if config['use_gtid']:
        if not gtid_pos:
            raise ValueError(f'gtid_pos is empty "{gtid_pos}"! Cannot start logical replication from empty gtid.')

        LOGGER.info("Starting logical replication from GTID '%s' on engine '%s'", gtid_pos, engine)

        # When using GTID, we want to listen for GTID events and start from the given GTID position
        kwargs['only_events'].extend([GtidEvent, MariadbGtidEvent])
        kwargs['auto_position'] = gtid_pos
    else:
        if not log_file or not log_pos or log_pos < 0:
            raise ValueError(f'log file or pos is empty ("{log_file}", "{log_pos}")! '
                             f'Cannot start logical replication from invalid log file/pos.')

        LOGGER.info("Starting logical replication from binlog file ['%s', %d]", log_file, log_pos)

        # When not using GTID, we want to listen for rotate events and start
        # from the given log file and position
        kwargs['only_events'].append(RotateEvent)
        kwargs['log_file'] = log_file
        kwargs['log_pos'] = log_pos
        kwargs['resume_stream'] = True

    return BinLogStreamReader(**kwargs)
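
# A hedged usage sketch of create_binlog_stream_reader, not part of the original
# module. The config keys mirror the ones the function reads above ('engine',
# 'use_gtid', 'filter_db', optionally 'server_id'); any connection credentials
# consumed by make_connection_wrapper are assumed to live in the same dict and
# are omitted here. The file name and position are placeholder values.
def example_binlog_reader(config):
    # File/pos mode: use_gtid is falsy, so log_file and log_pos must be valid
    reader = create_binlog_stream_reader(config,
                                         log_file='mysql-bin.000003',
                                         log_pos=4,
                                         gtid_pos=None)
    try:
        for event in reader:
            ...  # each event is a WriteRows/UpdateRows/DeleteRows/RotateEvent
    finally:
        reader.close()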