def test_initial_full_table(self):
    """Run a first sync of both LOG_BASED streams starting from an empty state.

    Verifies the exact sequence of emitted message types, that binlog
    coordinates were bookmarked for each stream in the state handed to
    ``do_sync``, and that each stream's bookmarked version equals the version
    carried by its first ACTIVATE_VERSION message.
    """
    state = {}
    # NOTE(review): the returned coordinates are not used by this test --
    # confirm whether this call is still needed.
    binlog.fetch_current_log_file_and_pos(self.conn)

    SINGER_MESSAGES.clear()
    tap_mysql.do_sync(self.conn, {}, self.catalog, state)

    self.assertEqual(
        [type(m) for m in SINGER_MESSAGES],
        [
            singer.StateMessage,
            singer.SchemaMessage,
            singer.ActivateVersionMessage,
            singer.RecordMessage,
            singer.RecordMessage,
            singer.StateMessage,
            singer.ActivateVersionMessage,
            singer.StateMessage,
            singer.SchemaMessage,
            singer.ActivateVersionMessage,
            singer.RecordMessage,
            singer.RecordMessage,
            singer.StateMessage,
            singer.ActivateVersionMessage,
            singer.StateMessage,
        ])

    def first_activate_version(stream_name):
        # First ACTIVATE_VERSION message emitted for the given stream.
        return [
            m for m in SINGER_MESSAGES
            if isinstance(m, singer.ActivateVersionMessage) and m.stream == stream_name
        ][0]

    for stream_id in ('tap_mysql_test-binlog_1', 'tap_mysql_test-binlog_2'):
        # Inspect the state that was passed to do_sync. self.state is
        # pre-populated by setUp and never handed to do_sync, so asserting on
        # it could not detect a sync that failed to bookmark its position.
        for key in ('log_file', 'log_pos'):
            self.assertIsNotNone(singer.get_bookmark(state, stream_id, key))

    for stream_id in ('tap_mysql_test-binlog_1', 'tap_mysql_test-binlog_2'):
        self.assertEqual(
            singer.get_bookmark(state, stream_id, 'version'),
            first_activate_version(stream_id).version)
def test_initial_full_table(self):
    """Run a first sync of the LOG_BASED stream starting from an empty state.

    Verifies the exact sequence of message types produced by
    ``generate_messages`` and that the resulting state bookmarks the binlog
    coordinates captured before the sync, plus the version carried by the
    first ACTIVATE_VERSION message.
    """
    state = {}
    expected_log_file, expected_log_pos = binlog.fetch_current_log_file_and_pos(self.con)

    messages = list(tap_mysql.generate_messages(self.con, {}, self.catalog, state))

    self.assertEqual(
        [type(m) for m in messages],
        [
            singer.StateMessage,
            singer.SchemaMessage,
            singer.ActivateVersionMessage,
            singer.RecordMessage,
            singer.RecordMessage,
            singer.StateMessage,
            singer.ActivateVersionMessage,
            singer.StateMessage,
            singer.StateMessage,
        ])

    activate_version_message = next(
        m for m in messages if isinstance(m, singer.ActivateVersionMessage))

    stream_id = 'tap_mysql_test-binlog'
    self.assertEqual(singer.get_bookmark(state, stream_id, 'log_file'), expected_log_file)
    self.assertEqual(singer.get_bookmark(state, stream_id, 'log_pos'), expected_log_pos)
    self.assertEqual(singer.get_bookmark(state, stream_id, 'version'),
                     activate_version_message.version)
def setUp(self):
    """Create two tables with insert/update/delete activity and select them for LOG_BASED sync.

    Also seeds ``self.state`` with, for every discovered stream, the binlog
    coordinates captured before the fixture statements ran and a fresh
    version.
    """
    self.maxDiff = None
    self.state = {}
    self.conn = test_utils.get_test_connection()

    # Captured before any fixture statement runs; stored below as each
    # stream's starting bookmark.
    log_file, log_pos = binlog.fetch_current_log_file_and_pos(self.conn)

    fixture_statements = (
        "CREATE TABLE binlog_1 (id int, updated datetime)",
        "CREATE TABLE binlog_2 (id int, updated datetime)",
        "INSERT INTO binlog_1 (id, updated) VALUES (1, '2017-06-01')",
        "INSERT INTO binlog_1 (id, updated) VALUES (2, '2017-06-20')",
        "INSERT INTO binlog_1 (id, updated) VALUES (3, '2017-09-22')",
        "INSERT INTO binlog_2 (id, updated) VALUES (1, '2017-10-22')",
        "INSERT INTO binlog_2 (id, updated) VALUES (2, '2017-11-10')",
        "INSERT INTO binlog_2 (id, updated) VALUES (3, '2017-12-10')",
        "UPDATE binlog_1 set updated='2018-06-18' WHERE id = 3",
        "UPDATE binlog_2 set updated='2018-06-18' WHERE id = 2",
        "DELETE FROM binlog_1 WHERE id = 2",
        "DELETE FROM binlog_2 WHERE id = 1",
    )

    with connect_with_backoff(self.conn) as open_conn:
        with open_conn.cursor() as cursor:
            for statement in fixture_statements:
                cursor.execute(statement)

        open_conn.commit()

    self.catalog = test_utils.discover_catalog(self.conn, {})

    for stream in self.catalog.streams:
        stream.stream = stream.table

        stream.metadata = [
            {'breadcrumb': (),
             'metadata': {
                 'selected': True,
                 'database-name': 'tap_mysql_test',
                 # Was misspelled 'table-key-propertes', which no reader
                 # looks up, so the key column was silently ignored.
                 'table-key-properties': ['id'],
             }},
            {'breadcrumb': ('properties', 'id'), 'metadata': {'selected': True}},
            {'breadcrumb': ('properties', 'updated'), 'metadata': {'selected': True}},
        ]

        test_utils.set_replication_method_and_key(stream, 'LOG_BASED', None)

        self.state = singer.write_bookmark(self.state, stream.tap_stream_id, 'log_file', log_file)
        self.state = singer.write_bookmark(self.state, stream.tap_stream_id, 'log_pos', log_pos)
        self.state = singer.write_bookmark(self.state, stream.tap_stream_id, 'version',
                                           singer.utils.now())
def setUp(self):
    """Reset the binary logs, build the ``binlog`` fixture table and select it for LOG_BASED sync.

    ``self.state`` ends up holding, for every discovered stream, the binlog
    coordinates read right after the reset and a fresh version.
    """
    self.state = {}
    self.con = get_test_connection()

    fixture_sql = (
        "CREATE TABLE binlog (id int, updated datetime)",
        "INSERT INTO binlog (id, updated) VALUES (1, '2017-06-01')",
        "INSERT INTO binlog (id, updated) VALUES (2, '2017-06-20')",
        "INSERT INTO binlog (id, updated) VALUES (3, '2017-09-22')",
        "UPDATE binlog set updated='2018-04-20' WHERE id = 3",
        "DELETE FROM binlog WHERE id = 2",
    )

    with self.con.cursor() as cursor:
        # Purge all binary logs
        cursor.execute("RESET MASTER")

        # Read the coordinates after the purge but before the fixture
        # statements are executed.
        log_file, log_pos = binlog.fetch_current_log_file_and_pos(self.con)

        for sql in fixture_sql:
            cursor.execute(sql)

    self.con.commit()

    self.catalog = discover_catalog(self.con)

    for entry in self.catalog.streams:
        entry.schema.selected = True
        entry.key_properties = []
        entry.schema.properties['id'].selected = True
        entry.schema.properties['updated'].selected = True
        entry.stream = entry.table

        set_replication_method_and_key(entry, 'LOG_BASED', None)

        for bookmark_key, bookmark_value in (('log_file', log_file), ('log_pos', log_pos)):
            self.state = singer.write_bookmark(
                self.state, entry.tap_stream_id, bookmark_key, bookmark_value)

        self.state = singer.write_bookmark(
            self.state, entry.tap_stream_id, 'version', singer.utils.now())
def test_initial_full_table(self):
    """Run a first sync of the datetime_types stream and check the emitted records.

    Verifies the exact sequence of message types and that the two fixture
    rows are emitted with the expected values for the zero-valued and NULL
    date/time columns.
    """
    state = {}
    # NOTE(review): the returned coordinates are not used by this test --
    # confirm whether this call is still needed.
    binlog.fetch_current_log_file_and_pos(self.conn)

    SINGER_MESSAGES.clear()
    tap_mysql.do_sync(self.conn, {}, self.catalog, state)

    self.assertEqual(
        [type(m) for m in SINGER_MESSAGES],
        [
            singer.StateMessage,
            singer.SchemaMessage,
            singer.ActivateVersionMessage,
            singer.RecordMessage,
            singer.RecordMessage,
            singer.StateMessage,
            singer.ActivateVersionMessage,
            singer.StateMessage,
        ])

    record_messages = [m for m in SINGER_MESSAGES if isinstance(m, singer.RecordMessage)]

    # Expected from 0.7.11
    expected_records = [
        {
            'datetime_col': None,
            'id': 1,
            'timestamp_col': None,
            'time_col': '1970-01-01T00:00:00.000000Z',
            'date_col': None,
        },
        {
            'datetime_col': None,
            'id': 2,
            'timestamp_col': None,
            'time_col': None,
            'date_col': None,
        },
    ]

    self.assertEqual(expected_records, [x.asdict()['record'] for x in record_messages])
def setUp(self):
    """Create the ``datetime_types`` table and select all of its columns for LOG_BASED sync.

    The table receives one row of zero-valued date/time literals and one row
    of NULLs. ``self.state`` is seeded, for every discovered stream, with the
    binlog coordinates captured before the fixture statements ran and a fresh
    version.
    """
    self.conn = test_utils.get_test_connection()
    self.state = {}

    # Captured before any fixture statement runs; stored below as each
    # stream's starting bookmark.
    log_file, log_pos = binlog.fetch_current_log_file_and_pos(self.conn)

    fixture_statements = (
        'CREATE TABLE datetime_types (id int, datetime_col datetime, timestamp_col timestamp, time_col time, date_col date)',
        "INSERT INTO datetime_types (id, datetime_col, timestamp_col, time_col, date_col) VALUES (1, '0000-00-00', '0000-00-00 00:00:00', '00:00:00', '0000-00-00' )",
        'INSERT INTO datetime_types (id, datetime_col, timestamp_col, time_col, date_col) VALUES (2, NULL, NULL, NULL, NULL)',
    )

    with connect_with_backoff(self.conn) as open_conn:
        with open_conn.cursor() as cursor:
            for statement in fixture_statements:
                cursor.execute(statement)

        open_conn.commit()

    self.catalog = test_utils.discover_catalog(self.conn, {})

    selected_columns = ('id', 'datetime_col', 'timestamp_col', 'time_col', 'date_col')

    for stream in self.catalog.streams:
        stream.stream = stream.table

        table_level = {
            'breadcrumb': (),
            'metadata': {
                'selected': True,
                'database-name': 'tap_mysql_test',
                # Was misspelled 'table-key-propertes', which no reader
                # looks up, so the key column was silently ignored.
                'table-key-properties': ['id'],
            },
        }
        column_level = [
            {'breadcrumb': ('properties', column), 'metadata': {'selected': True}}
            for column in selected_columns
        ]
        stream.metadata = [table_level] + column_level

        test_utils.set_replication_method_and_key(stream, 'LOG_BASED', None)

        self.state = singer.write_bookmark(self.state, stream.tap_stream_id, 'log_file', log_file)
        self.state = singer.write_bookmark(self.state, stream.tap_stream_id, 'log_pos', log_pos)
        self.state = singer.write_bookmark(self.state, stream.tap_stream_id, 'version',
                                           singer.utils.now())
def do_sync_historical_binlog(mysql_conn, catalog_entry, state, columns, use_gtid: bool, engine: str):
    """Run, or resume, the initial full-table load of a LOG_BASED stream.

    The load is treated as a resume when the state holds ``max_pk_values``
    together with a replication position (a GTID when ``use_gtid`` is set,
    otherwise a log file and log position). Otherwise the current position is
    read from the server, the full table is synced, and the position is
    written to the stream's bookmarks.

    Raises:
        Exception: if the stream is a view.
    """
    binlog.verify_binlog_config(mysql_conn)

    if use_gtid and engine == MYSQL_ENGINE:
        binlog.verify_gtid_config(mysql_conn)

    if common.get_is_view(catalog_entry):
        raise Exception(f"Unable to replicate stream({catalog_entry.stream}) with binlog because it is a view.")

    stream_id = catalog_entry.tap_stream_id

    log_file = singer.get_bookmark(state, stream_id, 'log_file')
    log_pos = singer.get_bookmark(state, stream_id, 'log_pos')
    gtid = singer.get_bookmark(state, stream_id, 'gtid') if use_gtid else None
    max_pk_values = singer.get_bookmark(state, stream_id, 'max_pk_values')

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(stream_id, state)

    has_position = (use_gtid and gtid) or (log_file and log_pos)

    if max_pk_values and has_position:
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s", stream_id)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
        return

    LOGGER.info("Performing initial full table sync for LOG_BASED stream %s", stream_id)

    state = singer.write_bookmark(state, stream_id, 'initial_binlog_complete', False)

    current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(mysql_conn)
    current_gtid = binlog.fetch_current_gtid_pos(mysql_conn, engine) if use_gtid else None

    state = singer.write_bookmark(state, stream_id, 'version', stream_version)

    # Bookmarks describing the position read above; gtid only when one was fetched.
    position = [('log_file', current_log_file), ('log_pos', current_log_pos)]
    if current_gtid:
        position.append(('gtid', current_gtid))

    # We must save log_file, log_pos, gtid across FULL_TABLE syncs when using
    # an incrementing PK, so in that case they are written before the table
    # sync; otherwise they are written once it has finished.
    save_before_sync = full_table.pks_are_auto_incrementing(mysql_conn, catalog_entry)

    if save_before_sync:
        for key, value in position:
            state = singer.write_bookmark(state, stream_id, key, value)

    full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

    if not save_before_sync:
        for key, value in position:
            state = singer.write_bookmark(state, stream_id, key, value)
def do_sync_historical_binlog(mysql_conn, config, catalog_entry, state, columns):
    """Run, or resume, the initial full-table load of a LOG_BASED stream.

    The load is treated as a resume when the state already holds a log file,
    a log position and ``max_pk_values`` for the stream. Otherwise the
    current binlog coordinates are read from the server, the full table is
    synced, and the coordinates are written to the stream's bookmarks.

    Args:
        mysql_conn: connection handed to the binlog and full-table helpers.
        config: accepted for interface compatibility; not read here.
        catalog_entry: the stream to sync.
        state: Singer state; bookmarks for the stream are read and written.
        columns: columns passed through to ``full_table.sync_table``.

    Raises:
        Exception: if the stream is a view.
    """
    binlog.verify_binlog_config(mysql_conn)

    if common.get_is_view(catalog_entry):
        raise Exception("Unable to replicate stream({}) with binlog because it is a view.".format(catalog_entry.stream))

    stream_id = catalog_entry.tap_stream_id

    log_file = singer.get_bookmark(state, stream_id, 'log_file')
    log_pos = singer.get_bookmark(state, stream_id, 'log_pos')
    max_pk_values = singer.get_bookmark(state, stream_id, 'max_pk_values')

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(stream_id, state)

    if log_file and log_pos and max_pk_values:
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s", stream_id)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
        return

    LOGGER.info("Performing initial full table sync for LOG_BASED stream %s", stream_id)

    state = singer.write_bookmark(state, stream_id, 'initial_binlog_complete', False)

    current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(mysql_conn)

    state = singer.write_bookmark(state, stream_id, 'version', stream_version)

    # We must save log_file and log_pos across FULL_TABLE syncs when performing
    # a resumable full table sync, so in that case they are written before the
    # table sync; otherwise they are written once it has finished.
    resumable = full_table.sync_is_resumable(mysql_conn, catalog_entry)

    if resumable:
        state = singer.write_bookmark(state, stream_id, 'log_file', current_log_file)
        state = singer.write_bookmark(state, stream_id, 'log_pos', current_log_pos)

    full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

    if not resumable:
        state = singer.write_bookmark(state, stream_id, 'log_file', current_log_file)
        state = singer.write_bookmark(state, stream_id, 'log_pos', current_log_pos)
def do_sync_historical_binlog(mysql_conn, catalog_entry, state, columns):
    """Run, or resume, the initial full-table load of a LOG_BASED stream.

    A resume happens when the state already holds a log file, a log position
    and ``max_pk_values`` for the stream. Otherwise the current binlog
    coordinates are read from the server, the full table is synced, and the
    coordinates are written to the stream's bookmarks.

    Raises:
        Exception: if the stream is a view.
    """
    binlog.verify_binlog_config(mysql_conn)

    if common.get_is_view(catalog_entry):
        raise Exception(
            f"Unable to replicate stream({catalog_entry.stream}) with binlog because it is a view."
        )

    stream_id = catalog_entry.tap_stream_id

    bookmarked_file = singer.get_bookmark(state, stream_id, "log_file")
    bookmarked_pos = singer.get_bookmark(state, stream_id, "log_pos")
    bookmarked_max_pks = singer.get_bookmark(state, stream_id, "max_pk_values")

    write_schema_message(catalog_entry)

    version = common.get_stream_version(stream_id, state)

    if bookmarked_file and bookmarked_pos and bookmarked_max_pks:
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s", stream_id)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, version)
        return

    LOGGER.info("Performing initial full table sync for LOG_BASED stream %s", stream_id)

    state = singer.write_bookmark(state, stream_id, "initial_binlog_complete", False)

    current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(mysql_conn)

    state = singer.write_bookmark(state, stream_id, "version", version)

    def remember_position(current_state):
        # Store the coordinates read above in the stream's bookmarks.
        current_state = singer.write_bookmark(
            current_state, stream_id, "log_file", current_log_file
        )
        return singer.write_bookmark(current_state, stream_id, "log_pos", current_log_pos)

    if full_table.pks_are_auto_incrementing(mysql_conn, catalog_entry):
        # We must save log_file and log_pos across FULL_TABLE syncs when using
        # an incrementing PK
        state = remember_position(state)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, version)
    else:
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, version)
        state = remember_position(state)
def test_initial_full_table(self):
    """Run a first sync of both LOG_BASED streams starting from an empty state.

    Verifies the exact sequence of emitted message types, that binlog
    coordinates were bookmarked for each stream in the state handed to
    ``do_sync``, and that each stream's bookmarked version equals the version
    carried by its first ACTIVATE_VERSION message.
    """
    state = {}
    # NOTE(review): the returned coordinates are not used by this test --
    # confirm whether this call is still needed.
    binlog.fetch_current_log_file_and_pos(self.conn)

    SINGER_MESSAGES.clear()
    tap_mysql.do_sync(self.conn, {}, self.catalog, state)

    # Each of the two streams is expected to produce the same seven
    # messages, followed by one final STATE message.
    per_stream = [
        singer.StateMessage,
        singer.SchemaMessage,
        singer.ActivateVersionMessage,
        singer.RecordMessage,
        singer.RecordMessage,
        singer.StateMessage,
        singer.ActivateVersionMessage,
    ]
    self.assertEqual(
        [type(m) for m in SINGER_MESSAGES],
        per_stream * 2 + [singer.StateMessage],
    )

    def first_activate_version(stream_name):
        # First ACTIVATE_VERSION message emitted for the given stream.
        return [
            m for m in SINGER_MESSAGES
            if isinstance(m, singer.ActivateVersionMessage) and m.stream == stream_name
        ][0]

    activate_version_message_1 = first_activate_version("binlog_1")
    activate_version_message_2 = first_activate_version("binlog_2")

    for stream_id in ("tap_mysql_test-binlog_1", "tap_mysql_test-binlog_2"):
        # Inspect the state that was passed to do_sync. self.state is
        # pre-populated by setUp and never handed to do_sync, so asserting on
        # it could not detect a sync that failed to bookmark its position.
        for key in ("log_file", "log_pos"):
            self.assertIsNotNone(singer.get_bookmark(state, stream_id, key))

    self.assertEqual(
        singer.get_bookmark(state, "tap_mysql_test-binlog_1", "version"),
        activate_version_message_1.version,
    )
    self.assertEqual(
        singer.get_bookmark(state, "tap_mysql_test-binlog_2", "version"),
        activate_version_message_2.version,
    )
def setUp(self):
    """Create two tables with insert/update/delete activity and select them for LOG_BASED sync.

    Also seeds ``self.state`` with, for every discovered stream, the binlog
    coordinates captured before the fixture statements ran and a fresh
    version.
    """
    self.maxDiff = None
    self.state = {}
    self.conn = test_utils.get_test_connection()

    # Captured before any fixture statement runs; stored below as each
    # stream's starting bookmark.
    log_file, log_pos = binlog.fetch_current_log_file_and_pos(self.conn)

    with connect_with_backoff(self.conn) as open_conn:
        with open_conn.cursor() as cursor:
            for statement in (
                "CREATE TABLE binlog_1 (id int, updated datetime)",
                "CREATE TABLE binlog_2 (id int, updated datetime)",
                "INSERT INTO binlog_1 (id, updated) VALUES (1, '2017-06-01')",
                "INSERT INTO binlog_1 (id, updated) VALUES (2, '2017-06-20')",
                "INSERT INTO binlog_1 (id, updated) VALUES (3, '2017-09-22')",
                "INSERT INTO binlog_2 (id, updated) VALUES (1, '2017-10-22')",
                "INSERT INTO binlog_2 (id, updated) VALUES (2, '2017-11-10')",
                "INSERT INTO binlog_2 (id, updated) VALUES (3, '2017-12-10')",
                "UPDATE binlog_1 set updated='2018-06-18' WHERE id = 3",
                "UPDATE binlog_2 set updated='2018-06-18' WHERE id = 2",
                "DELETE FROM binlog_1 WHERE id = 2",
                "DELETE FROM binlog_2 WHERE id = 1",
            ):
                cursor.execute(statement)

        open_conn.commit()

    self.catalog = test_utils.discover_catalog(self.conn, {})

    for stream in self.catalog.streams:
        stream.stream = stream.table

        stream.metadata = [
            {
                "breadcrumb": (),
                "metadata": {
                    "selected": True,
                    "database-name": "tap_mysql_test",
                    # Was misspelled "table-key-propertes", which no reader
                    # looks up, so the key column was silently ignored.
                    "table-key-properties": ["id"],
                },
            },
            {
                "breadcrumb": ("properties", "id"),
                "metadata": {"selected": True},
            },
            {
                "breadcrumb": ("properties", "updated"),
                "metadata": {"selected": True},
            },
        ]

        test_utils.set_replication_method_and_key(stream, "LOG_BASED", None)

        self.state = singer.write_bookmark(self.state, stream.tap_stream_id, "log_file", log_file)
        self.state = singer.write_bookmark(self.state, stream.tap_stream_id, "log_pos", log_pos)
        self.state = singer.write_bookmark(self.state, stream.tap_stream_id, "version",
                                           singer.utils.now())
def generate_messages(con, config, catalog, state):
    """Yield the Singer messages produced by syncing every stream of the resolved catalog.

    For each stream that has columns, a STATE message marking it as currently
    syncing is yielded first, then a SCHEMA message and the messages of the
    sync routine chosen by the stream's ``replication-method`` metadata
    (INCREMENTAL, LOG_BASED or FULL_TABLE). After the last stream, a STATE
    message with ``currently_syncing`` cleared is yielded.

    Raises:
        Exception: if a LOG_BASED stream is a view, or if the replication
            method is not one of the three supported values.
    """
    resolved = resolve_catalog(con, catalog, state)

    for entry in resolved.streams:
        columns = list(entry.schema.properties.keys())

        if not columns:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it.',
                entry.stream)
            continue

        stream_id = entry.tap_stream_id

        state = singer.set_currently_syncing(state, stream_id)

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        # Stream-level metadata lives under the empty breadcrumb.
        stream_md = metadata.to_map(entry.metadata).get((), {})

        replication_method = stream_md.get('replication-method')
        replication_key = stream_md.get('replication-key')

        key_properties_field = 'view-key-properties' if entry.is_view else 'table-key-properties'
        key_properties = stream_md.get(key_properties_field)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = entry.database
            timer.tags['table'] = entry.table

            log_engine(con, entry)

            if replication_method == 'INCREMENTAL':
                LOGGER.info("Stream %s is using incremental replication", entry.stream)

                yield generate_schema_message(entry, key_properties, [replication_key])

                yield from incremental.sync_table(con, entry, state, columns)

            elif replication_method == 'LOG_BASED':
                if entry.is_view:
                    raise Exception(
                        "Unable to replicate stream({}) with binlog because it is a view."
                        .format(entry.stream))

                LOGGER.info("Stream %s is using binlog replication", entry.stream)

                log_file = singer.get_bookmark(state, stream_id, 'log_file')
                log_pos = singer.get_bookmark(state, stream_id, 'log_pos')

                yield generate_schema_message(entry, key_properties, [])

                if log_file and log_pos:
                    # A position is already bookmarked: read from the binlog.
                    columns = binlog.add_automatic_properties(entry, columns)

                    yield from binlog.sync_table(con, config, entry, state, columns)
                else:
                    LOGGER.info("Performing initial full table sync")

                    # Read the coordinates before the table sync; they are
                    # bookmarked only after it has finished.
                    log_file, log_pos = binlog.fetch_current_log_file_and_pos(con)

                    stream_version = common.get_stream_version(stream_id, state)

                    state = singer.write_bookmark(state, stream_id, 'version', stream_version)

                    yield from full_table.sync_table(
                        con, entry, state, columns, stream_version)

                    state = singer.write_bookmark(state, stream_id, 'log_file', log_file)
                    state = singer.write_bookmark(state, stream_id, 'log_pos', log_pos)

                    yield singer.StateMessage(value=copy.deepcopy(state))

            elif replication_method == 'FULL_TABLE':
                LOGGER.info("Stream %s is using full table replication", entry.stream)

                yield generate_schema_message(entry, key_properties, [])

                stream_version = common.get_stream_version(stream_id, state)

                yield from full_table.sync_table(
                    con, entry, state, columns, stream_version)

                # Prefer initial_full_table_complete going forward
                singer.clear_bookmark(state, stream_id, 'version')

                state = singer.write_bookmark(
                    state, stream_id, 'initial_full_table_complete', True)

                yield singer.StateMessage(value=copy.deepcopy(state))

            else:
                raise Exception(
                    "only INCREMENTAL, LOG_BASED, and FULL TABLE replication methods are supported"
                )

    # if we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))