def do_sync_full_table(mysql_conn, config, catalog_entry, state, columns):
    """Run a FULL_TABLE replication pass for a single stream.

    Emits the stream's SCHEMA message, syncs every row through
    full_table.sync_table, then marks the table complete in state and
    emits a STATE message.

    :param mysql_conn: open MySQL connection
    :param config: tap configuration dict (unused here; kept for a
        uniform do_sync_* signature)
    :param catalog_entry: catalog entry for the stream being synced
    :param state: mutable Singer state dict (bookmarks are written here)
    :param columns: list of selected column names
    """
    LOGGER.info("Stream %s is using full table replication", catalog_entry.stream)

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

    # Prefer initial_full_table_complete going forward
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'version')

    state = singer.write_bookmark(state,
                                  catalog_entry.tap_stream_id,
                                  'initial_full_table_complete',
                                  True)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def handle_write_rows_event(event, catalog_entry, state, columns, rows_saved, time_extracted):
    """Convert each inserted row in a binlog WriteRowsEvent into a Singer
    RECORD message and emit it.

    Returns the running count of rows emitted (rows_saved plus one per row
    in the event).
    """
    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)
    db_column_types = get_db_column_types(event)

    for row in event.rows:
        values = row['values']
        record = row_to_singer_record(
            catalog_entry,
            stream_version,
            db_column_types,
            {col: val for col, val in values.items() if col in columns},
            time_extracted,
        )
        singer.write_message(record)
        rows_saved += 1

    return rows_saved
def handle_delete_rows_event(event, catalog_entry, state, columns, rows_saved, time_extracted):
    """Convert each deleted row in a binlog DeleteRowsEvent into a Singer
    RECORD message carrying an SDC_DELETED_AT timestamp, and emit it.

    :returns: running count of rows emitted (rows_saved plus one per row)
    """
    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)
    db_column_types = get_db_column_types(event)

    # The deletion timestamp comes from the event header, so it is identical
    # for every row in the event: compute it once instead of per row.
    event_ts = datetime.datetime.utcfromtimestamp(
        event.timestamp).replace(tzinfo=pytz.UTC)

    for row in event.rows:
        vals = row['values']
        # NOTE: mutates the event's row dict in place (matches prior behavior).
        vals[SDC_DELETED_AT] = event_ts

        filtered_vals = {k: v for k, v in vals.items() if k in columns}

        record_message = row_to_singer_record(catalog_entry,
                                              stream_version,
                                              db_column_types,
                                              filtered_vals,
                                              time_extracted)

        singer.write_message(record_message)
        rows_saved = rows_saved + 1

    return rows_saved
def do_sync_historical_binlog(mysql_conn, catalog_entry, state, columns, use_gtid: bool, engine: str):
    """Run (or resume) the initial full-table sync that precedes LOG_BASED
    replication for this stream.

    Verifies the server's binlog configuration (and GTID configuration when
    use_gtid is set for a MySQL engine), then either resumes an interrupted
    initial sync or starts a fresh one, recording the binlog/GTID
    coordinates in state so binlog replication can continue from here.

    :param mysql_conn: open MySQL connection
    :param catalog_entry: catalog entry for the stream being synced
    :param state: mutable Singer state dict (bookmarks are written here)
    :param columns: list of selected column names
    :param use_gtid: whether to bookmark GTID positions alongside file/pos
    :param engine: server flavor; GTID config is only verified for MYSQL_ENGINE
    :raises Exception: if the stream is a view (views have no binlog rows)
    """
    binlog.verify_binlog_config(mysql_conn)

    if use_gtid and engine == MYSQL_ENGINE:
        binlog.verify_gtid_config(mysql_conn)

    is_view = common.get_is_view(catalog_entry)

    if is_view:
        raise Exception(f"Unable to replicate stream({catalog_entry.stream}) with binlog because it is a view.")

    # Bookmarks left over from a previous (possibly interrupted) sync.
    log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_file')
    log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_pos')

    gtid = None
    if use_gtid:
        gtid = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'gtid')

    max_pk_values = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    # A saved max_pk_values plus valid binlog (or GTID) coordinates means an
    # earlier initial sync was interrupted part-way through: resume it.
    if max_pk_values and ((use_gtid and gtid) or (log_file and log_pos)):
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s", catalog_entry.tap_stream_id)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
    else:
        LOGGER.info("Performing initial full table sync for LOG_BASED stream %s", catalog_entry.tap_stream_id)

        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'initial_binlog_complete',
                                      False)

        # Capture the server's current position BEFORE reading the table so
        # binlog replication can later replay anything written during the read.
        current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(mysql_conn)

        current_gtid = None
        if use_gtid:
            current_gtid = binlog.fetch_current_gtid_pos(mysql_conn, engine)

        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'version',
                                      stream_version)

        if full_table.pks_are_auto_incrementing(mysql_conn, catalog_entry):
            # We must save log_file, log_pos, gtid across FULL_TABLE syncs when using
            # an incrementing PK
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_file',
                                          current_log_file)

            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_pos',
                                          current_log_pos)

            if current_gtid:
                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'gtid',
                                              current_gtid)

            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
        else:
            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

            # Non-resumable sync: only record coordinates once the table is done.
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_file',
                                          current_log_file)

            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_pos',
                                          current_log_pos)

            if current_gtid:
                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'gtid',
                                              current_gtid)
def sync_table(connection, config, catalog_entry, state, columns):
    """Stream binlog events for one table, yielding Singer messages.

    Resumes reading from the bookmarked (log_file, log_pos), converts
    write/update/delete row events for this table into RECORD messages,
    tracks the reader's position in state, and periodically yields STATE
    messages. The final STATE message is yielded when the reader is
    exhausted.

    NOTE(review): this generator never explicitly closes `reader`;
    presumably the BinLogStreamReader connection is released on garbage
    collection / process exit — confirm.
    """
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id, state)

    log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_file')
    log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_pos')

    verify_binlog_config(connection, catalog_entry)
    verify_log_file_exists(connection, catalog_entry, log_file, log_pos)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'version', stream_version)

    server_id = fetch_server_id(connection)

    connection_wrapper = make_connection_wrapper(config)

    # connection_settings is empty because pymysql_wrapper supplies the
    # actual connection parameters.
    reader = BinLogStreamReader(connection_settings={},
                                server_id=server_id,
                                log_file=log_file,
                                log_pos=log_pos,
                                resume_stream=True,
                                only_events=[RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
                                pymysql_wrapper=connection_wrapper)

    table_path = (catalog_entry.database, catalog_entry.stream)

    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s", log_file, log_pos)

    rows_saved = 0

    for binlog_event in reader:
        # The event at the bookmarked position was emitted by the last run.
        if reader.log_file == log_file and reader.log_pos == log_pos:
            LOGGER.info("Skipping event for log_file=%s and log_pos=%s as it was processed last sync",
                        reader.log_file,
                        reader.log_pos)
            continue

        if isinstance(binlog_event, RotateEvent):
            # Server rolled over to a new binlog file; bookmark its start.
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_file',
                                          binlog_event.next_binlog)
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_pos',
                                          binlog_event.position)
        elif (binlog_event.schema, binlog_event.table) == table_path:
            db_column_types = {c.name: c.type for c in binlog_event.columns}

            if isinstance(binlog_event, WriteRowsEvent):
                for row in binlog_event.rows:
                    filtered_vals = {k: v for k, v in row['values'].items()
                                     if k in columns}

                    yield row_to_singer_record(catalog_entry,
                                               stream_version,
                                               db_column_types,
                                               filtered_vals,
                                               time_extracted)
                    rows_saved = rows_saved + 1

            elif isinstance(binlog_event, UpdateRowsEvent):
                for row in binlog_event.rows:
                    # Only the post-update row image is emitted.
                    filtered_vals = {k: v for k, v in row['after_values'].items()
                                     if k in columns}

                    yield row_to_singer_record(catalog_entry,
                                               stream_version,
                                               db_column_types,
                                               filtered_vals,
                                               time_extracted)
                    rows_saved = rows_saved + 1

            elif isinstance(binlog_event, DeleteRowsEvent):
                for row in binlog_event.rows:
                    # Deleted rows become records tagged with the event's
                    # timestamp in SDC_DELETED_AT.
                    event_ts = datetime.datetime.utcfromtimestamp(
                        binlog_event.timestamp).replace(tzinfo=pytz.UTC)

                    vals = row['values']
                    vals[SDC_DELETED_AT] = event_ts

                    filtered_vals = {k: v for k, v in vals.items()
                                     if k in columns}

                    yield row_to_singer_record(catalog_entry,
                                               stream_version,
                                               db_column_types,
                                               filtered_vals,
                                               time_extracted)
                    rows_saved = rows_saved + 1

        # Record the reader's current position after every event.
        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'log_file',
                                      reader.log_file)
        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'log_pos',
                                      reader.log_pos)

        # NOTE(review): 0 % UPDATE_BOOKMARK_PERIOD == 0, so a STATE message is
        # also yielded on events before any rows are saved — confirm intended.
        if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
            yield singer.StateMessage(value=copy.deepcopy(state))

    yield singer.StateMessage(value=copy.deepcopy(state))
def do_sync_historical_binlog(mysql_conn, config, catalog_entry, state, columns):
    """Run (or resume) the initial full-table sync that precedes LOG_BASED
    replication for this stream.

    Verifies the server's binlog configuration, then either resumes an
    interrupted initial sync or starts a fresh one, recording the binlog
    coordinates in state so binlog replication can continue from here.

    :param mysql_conn: open MySQL connection
    :param config: tap configuration dict (unused here; kept for a uniform
        do_sync_* signature)
    :param catalog_entry: catalog entry for the stream being synced
    :param state: mutable Singer state dict (bookmarks are written here)
    :param columns: list of selected column names
    :raises Exception: if the stream is a view (views have no binlog rows)
    """
    binlog.verify_binlog_config(mysql_conn)

    is_view = common.get_is_view(catalog_entry)

    if is_view:
        raise Exception("Unable to replicate stream({}) with binlog because it is a view.".format(catalog_entry.stream))

    # Bookmarks left over from a previous (possibly interrupted) sync.
    log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_file')
    log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_pos')
    max_pk_values = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    # Saved max_pk_values plus binlog coordinates means an earlier initial
    # sync was interrupted part-way through: resume it.
    if log_file and log_pos and max_pk_values:
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s", catalog_entry.tap_stream_id)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
    else:
        LOGGER.info("Performing initial full table sync for LOG_BASED stream %s", catalog_entry.tap_stream_id)

        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'initial_binlog_complete',
                                      False)

        # Capture the server's current position BEFORE reading the table so
        # binlog replication can later replay anything written during the read.
        current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(mysql_conn)

        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'version',
                                      stream_version)

        if full_table.sync_is_resumable(mysql_conn, catalog_entry):
            # We must save log_file and log_pos across FULL_TABLE syncs when performing
            # a resumable full table sync
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_file',
                                          current_log_file)
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_pos',
                                          current_log_pos)

            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
        else:
            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

            # Non-resumable sync: only record coordinates once the table is done.
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_file',
                                          current_log_file)
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_pos',
                                          current_log_pos)
def do_sync_historical_binlog(mysql_conn, catalog_entry, state, columns):
    """Run (or resume) the initial full-table sync that precedes LOG_BASED
    replication for this stream, recording binlog coordinates in state so
    binlog replication can continue from this point afterwards.

    Raises if the stream is a view, since views have no binlog rows.
    """
    binlog.verify_binlog_config(mysql_conn)

    if common.get_is_view(catalog_entry):
        raise Exception(
            f"Unable to replicate stream({catalog_entry.stream}) with binlog because it is a view."
        )

    tap_stream_id = catalog_entry.tap_stream_id

    # Bookmarks left over from a previous (possibly interrupted) sync.
    log_file = singer.get_bookmark(state, tap_stream_id, "log_file")
    log_pos = singer.get_bookmark(state, tap_stream_id, "log_pos")
    max_pk_values = singer.get_bookmark(state, tap_stream_id, "max_pk_values")

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(tap_stream_id, state)

    if log_file and log_pos and max_pk_values:
        # An earlier initial sync was interrupted part-way through: resume it.
        LOGGER.info(
            "Resuming initial full table sync for LOG_BASED stream %s",
            tap_stream_id,
        )
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
        return

    LOGGER.info(
        "Performing initial full table sync for LOG_BASED stream %s",
        tap_stream_id,
    )

    state = singer.write_bookmark(state, tap_stream_id, "initial_binlog_complete", False)

    # Capture the server's current position BEFORE reading the table so binlog
    # replication can later replay anything written during the read.
    current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(mysql_conn)

    state = singer.write_bookmark(state, tap_stream_id, "version", stream_version)

    if full_table.pks_are_auto_incrementing(mysql_conn, catalog_entry):
        # We must save log_file and log_pos across FULL_TABLE syncs when using
        # an incrementing PK.
        state = singer.write_bookmark(state, tap_stream_id, "log_file", current_log_file)
        state = singer.write_bookmark(state, tap_stream_id, "log_pos", current_log_pos)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
    else:
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
        # Non-resumable sync: only record coordinates once the table is done.
        state = singer.write_bookmark(state, tap_stream_id, "log_file", current_log_file)
        state = singer.write_bookmark(state, tap_stream_id, "log_pos", current_log_pos)
def sync_table(mysql_conn, catalog_entry, state, columns, limit=None):
    """Incrementally sync a table ordered by its replication key.

    Runs SELECT batches (LIMIT-ed when ``limit`` is given) until a batch
    comes back short of the limit, bookmarking the replication key value
    through common.sync_query so the sync can resume where it left off.
    """
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id, state)

    stream_metadata = metadata.to_map(catalog_entry.metadata).get((), {})

    while True:
        replication_key_metadata = stream_metadata.get('replication-key')
        replication_key_state = singer.get_bookmark(state,
                                                    catalog_entry.tap_stream_id,
                                                    'replication_key')

        if replication_key_metadata == replication_key_state:
            replication_key_value = singer.get_bookmark(state,
                                                        catalog_entry.tap_stream_id,
                                                        'replication_key_value')
        else:
            # Replication key changed in the catalog: drop the stale value
            # and start over from the beginning of the table.
            replication_key_value = None
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'replication_key',
                                          replication_key_metadata)
            state = singer.clear_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'replication_key_value')

        stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)
        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'version',
                                      stream_version)

        singer.write_message(singer.ActivateVersionMessage(
            stream=catalog_entry.stream,
            version=stream_version
        ))

        with connect_with_backoff(mysql_conn) as open_conn, open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            if replication_key_value is not None:
                if catalog_entry.schema.properties[replication_key_metadata].format == 'date-time':
                    replication_key_value = pendulum.parse(replication_key_value)

                select_sql += ' WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC'.format(
                    replication_key_metadata, replication_key_metadata)
                params['replication_key_value'] = replication_key_value
            elif replication_key_metadata is not None:
                select_sql += ' ORDER BY `{}` ASC'.format(replication_key_metadata)

            if limit:
                select_sql += ' LIMIT {}'.format(limit)

            num_rows = common.sync_query(cur,
                                         catalog_entry,
                                         state,
                                         select_sql,
                                         columns,
                                         stream_version,
                                         params)

        # A short (or un-limited) batch means the table is exhausted.
        if limit is None or num_rows < limit:
            break
def generate_messages(con, config, catalog, state):
    """Generate the full Singer message stream for every selected stream.

    For each catalog stream with selected columns, yields a STATE message
    marking it as currently syncing, a SCHEMA message, the RECORD/STATE
    messages produced by the appropriate replication strategy
    (INCREMENTAL, LOG_BASED, or FULL_TABLE), and finally a STATE message
    clearing currently_syncing.

    :param con: open database connection
    :param config: tap configuration dict (passed through to binlog sync)
    :param catalog: Singer catalog; resolved against state before use
    :param state: mutable Singer state dict (bookmarks are written here)
    :raises Exception: for unsupported replication methods, or LOG_BASED
        on a view
    """
    catalog = resolve_catalog(con, catalog, state)

    for catalog_entry in catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning('There are no columns selected for stream %s, skipping it.',
                           catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        md_map = metadata.to_map(catalog_entry.metadata)
        replication_method = md_map.get((), {}).get('replication-method')
        replication_key = md_map.get((), {}).get('replication-key')

        # Views carry their key properties under a different metadata key.
        if catalog_entry.is_view:
            key_properties = md_map.get((), {}).get('view-key-properties')
        else:
            key_properties = md_map.get((), {}).get('table-key-properties')

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table

            log_engine(con, catalog_entry)

            if replication_method == 'INCREMENTAL':
                LOGGER.info("Stream %s is using incremental replication", catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties, [replication_key])

                for message in incremental.sync_table(con, catalog_entry, state, columns):
                    yield message
            elif replication_method == 'LOG_BASED':
                if catalog_entry.is_view:
                    raise Exception("Unable to replicate stream({}) with binlog because it is a view."
                                    .format(catalog_entry.stream))

                LOGGER.info("Stream %s is using binlog replication", catalog_entry.stream)

                log_file = singer.get_bookmark(state,
                                               catalog_entry.tap_stream_id,
                                               'log_file')

                log_pos = singer.get_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'log_pos')

                yield generate_schema_message(catalog_entry, key_properties, [])

                if log_file and log_pos:
                    # Binlog coordinates exist: continue log-based replication
                    # from the bookmarked position.
                    columns = binlog.add_automatic_properties(catalog_entry, columns)

                    for message in binlog.sync_table(con, config, catalog_entry, state, columns):
                        yield message
                else:
                    LOGGER.info("Performing initial full table sync")

                    # Capture the current position BEFORE the full-table read
                    # so binlog replication can replay anything written during it.
                    log_file, log_pos = binlog.fetch_current_log_file_and_pos(con)

                    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                                               state)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'version',
                                                  stream_version)

                    for message in full_table.sync_table(con, catalog_entry, state, columns, stream_version):
                        yield message

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_file',
                                                  log_file)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_pos',
                                                  log_pos)

                    yield singer.StateMessage(value=copy.deepcopy(state))
            elif replication_method == 'FULL_TABLE':
                LOGGER.info("Stream %s is using full table replication", catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties, [])

                stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                                           state)

                for message in full_table.sync_table(con, catalog_entry, state, columns, stream_version):
                    yield message

                # Prefer initial_full_table_complete going forward
                singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'version')

                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'initial_full_table_complete',
                                              True)

                yield singer.StateMessage(value=copy.deepcopy(state))
            else:
                raise Exception("only INCREMENTAL, LOG_BASED, and FULL TABLE replication methods are supported")

    # if we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))