def run_and_verify_sync(self, conn_id, second_sync=False):
    """
    Run a sync job and make sure it exited properly.
    Return a dictionary with keys of streams synced
    and values of records synced for each stream.
    """
    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    sync_record_count = runner.examine_target_output_file(
        self, conn_id,
        self.expected_first_sync_streams() if not second_sync else self.expected_second_sync_streams(),
        self.expected_pks())
    self.assertGreater(
        sum(sync_record_count.values()), 0,
        msg="failed to replicate any data: {}".format(sync_record_count))
    print("total replicated row count: {}".format(sum(sync_record_count.values())))

    return sync_record_count
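# Hypothetical usage sketch (not part of the suite above): shows how a two-sync
# test might compose run_and_verify_sync with the menagerie state calls already
# used in this file. The method name and the "second sync replicates no more
# rows" expectation are illustrative assumptions, not the tap's documented behavior.
def example_two_sync_flow(self, conn_id):
    # first sync from an empty state
    menagerie.set_state(conn_id, {})
    first_counts = self.run_and_verify_sync(conn_id)

    # second sync resumes from the bookmarks written by the first sync
    second_counts = self.run_and_verify_sync(conn_id, second_sync=True)

    # for bookmarked streams, the second sync is expected to replicate at most
    # as many rows as the first
    for stream, first_count in first_counts.items():
        self.assertLessEqual(second_counts.get(stream, 0), first_count)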
def do_test(self, conn_id):
    # Select our catalogs
    our_catalogs = [c for c in self.found_catalogs
                    if c.get('tap_stream_id') in self.expected_sync_streams()]
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        c_metadata = metadata.to_map(c_annotated['metadata'])
        connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])

    # Clear state before our run
    menagerie.set_state(conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Ensure all records have a value for PK(s)
    records = runner.get_records_from_target_output()
    for stream in self.expected_sync_streams():
        messages = records.get(stream, {}).get('messages', [])
        if stream in ['tickets', 'groups', 'users']:
            self.assertGreater(len(messages), 100,
                               msg="Stream {} has fewer than 100 records synced".format(stream))
        for m in messages:
            pk_set = self.expected_pks()[stream]
            for pk in pk_set:
                self.assertIsNotNone(m.get('data', {}).get(pk),
                                     msg="Missing primary-key for message {}".format(m))
def run_test(self):
    conn_id = connections.ensure_connection(self)

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    catalog = menagerie.get_catalogs(conn_id)
    found_catalog_names = set(map(lambda c: c['tap_stream_id'], catalog))

    # assert we find the correct streams
    self.assertEqual(self.expected_check_streams(), found_catalog_names)

    for tap_stream_id in self.expected_check_streams():
        found_stream = [c for c in catalog if c['tap_stream_id'] == tap_stream_id][0]
        schema_and_metadata = menagerie.get_annotated_schema(conn_id, found_stream['stream_id'])
        main_metadata = schema_and_metadata["metadata"]
        stream_metadata = [mdata for mdata in main_metadata if mdata["breadcrumb"] == []]

        # assert that the pks are correct
        self.assertEqual(
            self.expected_pks()[tap_stream_id],
            set(stream_metadata[0]['metadata']['table-key-properties']))

    for stream_catalog in catalog:
        annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema['annotated-schema'], [])

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    messages_by_stream = runner.get_records_from_target_output()

    # assert that the streams we synced are the ones we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_first_sync_streams(), self.expected_pks())

    # Verify that the full table was synced
    for tap_stream_id in self.expected_first_sync_streams():
        self.assertEqual(
            self.expected_first_sync_row_counts()[tap_stream_id],
            record_count_by_stream[tap_stream_id])
def run_sync_and_get_record_count(self, conn_id):
    sync_job_name = runner.run_sync_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    return runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
def test_run(self):
    # Select our catalogs
    # found_catalogs = menagerie.get_catalogs(conn_id)
    # our_catalogs = [c for c in found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams()]
    # for c in our_catalogs:
    #     c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
    #     c_metadata = metadata.to_map(c_annotated['metadata'])
    #     connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])
    conn_id = self.create_connection()

    # Clear state before our run
    menagerie.set_state(conn_id, {})

    # Select a stream
    found_catalogs = menagerie.get_catalogs(conn_id)
    our_catalogs = [catalog for catalog in found_catalogs
                    if catalog.get('tap_stream_id') in self.expected_sync_streams()]
    self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=False)

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = sum(record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Ensure all records have a value for PK(s)
    records = runner.get_records_from_target_output()
    for stream in self.expected_sync_streams():
        messages = records.get(stream, {}).get('messages', [])
        for m in messages:
            pk_set = self.expected_pks()[stream]
            for pk in pk_set:
                self.assertIsNotNone(m.get('data', {}).get(pk),
                                     msg="missing primary key in record: {}".format(m))

    bookmarks = menagerie.get_state(conn_id)['bookmarks']
    replication_methods = self.expected_replication_method()
    for stream in self.expected_sync_streams():
        with self.subTest(stream=stream):
            replication_method = replication_methods.get(stream)
            if replication_method is self.INCREMENTAL:
                self.assertTrue(stream in bookmarks)
            elif replication_method is self.FULL_TABLE:
                self.assertTrue(stream not in bookmarks)
            else:
                raise NotImplementedError(
                    "stream {} has an invalid replication method {}".format(stream, replication_method))
def first_sync_test(self, table_configs, conn_id): # run first full table sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() expected_pks = {} for config in table_configs: key = {config['HashKey']} if config.get('SortKey'): key |= {config.get('SortKey')} expected_pks[config['TableName']] = key # assert that each of the streams that we synced are the ones that we expect to see record_count_by_stream = runner.examine_target_output_file( self, conn_id, {x['TableName'] for x in table_configs}, expected_pks) state = menagerie.get_state(conn_id) state_version = menagerie.get_state_version(conn_id) first_versions = {} # assert that we get the correct number of records for each stream for config in table_configs: table_name = config['TableName'] self.assertEqual(config['num_rows'], record_count_by_stream[table_name]) # assert that an activate_version_message is first and last message sent for each stream self.assertEqual( 'activate_version', records_by_stream[table_name]['messages'][0]['action']) self.assertEqual( 'activate_version', records_by_stream[table_name]['messages'][-1]['action']) # assert that the state has an initial_full_table_complete == True self.assertTrue( state['bookmarks'][table_name]['initial_full_table_complete']) # assert that there is a version bookmark in state first_versions[table_name] = state['bookmarks'][table_name][ 'version'] self.assertIsNotNone(first_versions[table_name]) # Write state with missing finished_shards so it # re-reads data from all shards # This should result in the next sync having same number of records # as the full table sync state['bookmarks'][table_name].pop('finished_shards') menagerie.set_state(conn_id, state, version=state_version)
def test_catalog_without_properties(self): self.setUpTestEnvironment() runner.run_check_job_and_check_status(self) found_catalogs = menagerie.get_catalogs(self.conn_id) self.assertEqual(len(found_catalogs), 1, msg="unable to locate schemas for connection {}".format(self.conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) subset = self.expected_streams().issubset(found_catalog_names) self.assertTrue( subset, msg="Expected check streams are not subset of discovered catalog") our_catalogs = [c for c in found_catalogs if c.get( 'tap_stream_id') in self.expected_streams()] # Select our catalogs for c in our_catalogs: c_annotated = menagerie.get_annotated_schema( self.conn_id, c['stream_id']) connections.select_catalog_and_fields_via_metadata( self.conn_id, c, c_annotated, [], []) # Clear state before our run menagerie.set_state(self.conn_id, {}) # Run a sync job using orchestrator sync_job_name = runner.run_sync_mode(self, self.conn_id) # Verify tap and target exit codes exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) synced_records = runner.get_records_from_target_output() upsert_messages = [m for m in synced_records.get( 'csv_with_empty_lines').get('messages') if m['action'] == 'upsert'] records = [message.get('data') for message in upsert_messages] #Empty line should be ignored in emitted records. expected_records = [ {'id': 1, 'name': 'John', '_sdc_extra': [{'name': 'carl'}], '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets', '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 2}, {'id': 2, 'name': 'Bob', '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets', '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 3}, {'id': 3, '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets', '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 4}, {'id': 4, 'name': 'Alice', '_sdc_extra': [{'no_headers': ['Ben', '5']}, { 'name': 'Barak'}], '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets', '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 5} ] self.assertListEqual(expected_records, records)
def test_run(self): conn_id = connections.ensure_connection(self) #run in check mode check_job_name = runner.run_check_mode(self, conn_id) #verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names ) self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) print("discovered schemas are OK") #select all catalogs for c in found_catalogs: catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id']) if c['stream_name'] in self.expected_sync_streams().keys(): stream = c['stream_name'] pks = self.expected_sync_streams()[stream] for pk in pks: mdata = next((m for m in catalog_entry['metadata'] if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == pk), None) print("Validating inclusion on {}: {}".format(c['stream_name'], mdata)) self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic') connections.select_catalog_and_fields_via_metadata(conn_id, c, catalog_entry) #clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) #verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) first_record_count_by_stream = runner.examine_target_output_file(self, conn_id, set(self.expected_sync_streams().keys()), self.expected_sync_streams()) replicated_row_count = reduce(lambda accum,c : accum + c, first_record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(first_record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) # Verify that automatic fields are all emitted with records synced_records = runner.get_records_from_target_output() for stream_name, data in synced_records.items(): record_messages = [set(row['data'].keys()) for row in data['messages']] self.assertGreater(len(record_messages), 0, msg="stream {} did not sync any records.".format(stream_name)) for record_keys in record_messages: self.assertEqual(self.expected_sync_streams().get(stream_name, set()) - record_keys, set())
def test_run(self): conn_id = connections.ensure_connection(self, payload_hook=None) # Run the tap in check mode check_job_name = runner.run_check_mode(self, conn_id) # Verify the check's exit status exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # Verify that there are catalogs found found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) subset = self.expected_check_streams().issubset(found_catalog_names) self.assertTrue( subset, msg="Expected check streams are not subset of discovered catalog") # # # Select some catalogs our_catalogs = [ c for c in found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams() ] for catalog in our_catalogs: schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id']) connections.select_catalog_and_fields_via_metadata( conn_id, catalog, schema, [], []) # # Verify that all streams sync at least one row for initial sync # # This test is also verifying access token expiration handling. If test fails with # # authentication error, refresh token was not replaced after expiring. menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # # Verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) zero_count_streams = { k for k, v in record_count_by_stream.items() if v == 0 } self.assertFalse( zero_count_streams, msg="The following streams did not sync any rows {}".format( zero_count_streams))
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    all_excluded_fields = {}
    # select all catalogs, deselecting up to five non-automatic fields per stream
    for c in found_catalogs:
        if c['stream_name'] == 'ads':
            continue

        discovered_schema = menagerie.get_annotated_schema(conn_id, c['stream_id'])['annotated-schema']
        all_excluded_fields[c['stream_name']] = list(
            set(discovered_schema.keys()) -
            self.expected_automatic_fields().get(c['stream_name'], set()))[:5]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, c, discovered_schema,
            non_selected_fields=all_excluded_fields[c['stream_name']])

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify that the PKs are written in each record
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    synced_records = runner.get_records_from_target_output()
    self.assertTrue('ads' not in synced_records.keys())

    for stream_name, data in synced_records.items():
        record_messages = [set(row['data'].keys()) for row in data['messages']]
        for record_keys in record_messages:
            # The intersection with the excluded fields should be empty
            self.assertFalse(record_keys.intersection(all_excluded_fields[stream_name]))
def starter(self): """ Instantiate connection, run discovery, and initial sync. This entire process needs to retry if we get rate limited so that we are using a fresh connection and can test the activate version messages. """ ########################################################################## ### Instantiate connection ########################################################################## self.conn_id = connections.ensure_connection(self) ########################################################################## ### Discovery without the backoff ########################################################################## check_job_name = runner.run_check_mode(self, self.conn_id) exit_status = menagerie.get_exit_status(self.conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) found_catalogs = menagerie.get_catalogs(self.conn_id) self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(self.conn_id)) found_catalog_names = set(map(lambda c: c['stream_name'], found_catalogs)) self.assertSetEqual(self.expected_streams(), found_catalog_names, msg="discovered schemas do not match") print("discovered schemas are OK") # table and field selection test_catalogs = [catalog for catalog in found_catalogs if catalog.get('stream_name') in self.expected_test_streams] self.perform_and_verify_table_and_field_selection( self.conn_id, test_catalogs, select_all_fields=True, ) ########################################################################## ### Initial sync without the backoff ########################################################################## sync_job_name = runner.run_sync_mode(self, self.conn_id) # Verify tap and target exit codes exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) self.record_count_by_stream_1 = runner.examine_target_output_file( self, self.conn_id, self.expected_streams(), self.expected_primary_keys()) self.assertGreater( sum(self.record_count_by_stream_1.values()), 0, msg="failed to replicate any data: {}".format(self.record_count_by_stream_1) ) print("total replicated row count: {}".format(sum(self.record_count_by_stream_1.values())))
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs), 0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(
        len(diff), 0,
        msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    # select all catalogs
    for c in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        connections.select_catalog_via_metadata(conn_id, c, catalog_entry)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify that the PKs are written in each record
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # Run the tap in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify the check's exit status
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Verify that there are catalogs found
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs), 0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_check_streams().issubset(found_catalog_names)
    self.assertTrue(subset,
                    msg="Expected check streams are not subset of discovered catalog")

    # Select some catalogs
    our_catalogs = [c for c in found_catalogs
                    if c.get('tap_stream_id') in self.expected_sync_streams()]
    for catalog in our_catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        connections.select_catalog_and_fields_via_metadata(conn_id, catalog, schema)

    # Clear state and run sync
    menagerie.set_state(conn_id, {})
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))
def test_run(self):
    conn_id = connections.ensure_connection(self)

    runner.run_check_mode(self, conn_id)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.select_found_catalogs(conn_id, found_catalogs,
                               only_streams=self.streams_to_select())

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
def test_run(self):
    conn_id = self.create_connection()

    # Select our catalogs
    our_catalogs = [c for c in self.found_catalogs
                    if c.get('tap_stream_id') in self.expected_sync_streams()]
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        c_metadata = metadata.to_map(c_annotated['metadata'])
        connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])

    # Clear state before our run
    menagerie.set_state(conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Ensure all records have a value for PK(s)
    records = runner.get_records_from_target_output()
    for stream in self.expected_sync_streams():
        messages = records.get(stream, {}).get('messages', [])
        for m in messages:
            pk_set = self.expected_pks()[stream]
            for pk in pk_set:
                self.assertIsNotNone(m.get('data', {}).get(pk),
                                     msg="missing primary key in record: {}".format(m))

    bookmarks = menagerie.get_state(conn_id)['bookmarks']
    self.assertTrue('orders' in bookmarks)
def test_run(self): conn_id = connections.ensure_connection(self) #run in check mode check_job_name = runner.run_check_mode(self, conn_id) #verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names ) self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) print("discovered schemas are kosher") #select all catalogs #selected_catalogs = list(map(lambda catalog: self.perform_field_selection(conn_id, catalog), found_catalogs)) #menagerie.post_annotated_catalogs(conn_id, selected_catalogs) for c in found_catalogs: connections.select_catalog_and_fields_via_metadata(conn_id, c, menagerie.get_annotated_schema(conn_id, c['stream_id'])) #clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) #verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum,c : accum + c, record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) # bookmarks for the 4 streams should be 2015-03-16 states = menagerie.get_state(conn_id)["bookmarks"] end_date = self.get_properties()["end_date"].split()[0] for k, v in states.items(): if "insights" in k: bm_date = v.get("date_start") self.assertEqual(end_date, bm_date) print("bookmarks match end_date of {}".format(end_date))
def run_sync(self, conn_id):
    """
    Run a sync job and make sure it exited properly.
    Return a dictionary with keys of streams synced
    and values of records synced for each stream.
    """
    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    sync_record_count = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    return sync_record_count
def run_and_verify_sync(self, conn_id):
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    sync_record_count = runner.examine_target_output_file(
        self, conn_id, self.expected_check_streams(), self.expected_primary_keys())
    self.assertGreater(
        sum(sync_record_count.values()), 0,
        msg="failed to replicate any data: {}".format(sync_record_count))
    print("total replicated row count: {}".format(sum(sync_record_count.values())))

    return sync_record_count
def run_sync(self, expected_streams):
    conn_id = connections.ensure_connection(self)

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # table and field selection
    test_catalogs = [catalog for catalog in found_catalogs
                     if catalog.get('stream_name') in expected_streams]
    self.perform_and_verify_table_and_field_selection(conn_id, test_catalogs)

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    return runner.get_upserts_from_target_output()
def run_and_verify_sync(self, conn_id, state):
    """
    Run a sync job and make sure it exited properly.
    Return a dictionary with keys of streams synced
    and values of records synced for each stream.
    """
    # reset state to the state at the start of the sync in case we got interrupted
    menagerie.set_state(conn_id, state)

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    try:
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
    except AssertionError as e:
        if exit_status['discovery_error_message'] or exit_status['tap_error_message']:
            print("*******************RETRYING SYNC FOR TAP/DISCOVERY FAILURE*******************")
            raise RetryableTapError(e)

        raise

    # Verify actual rows were synced
    sync_record_count = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(), self.expected_primary_keys())
    self.assertGreater(
        sum(sync_record_count.values()), 0,
        msg="failed to replicate any data: {}".format(sync_record_count))
    print("total replicated row count: {}".format(sum(sync_record_count.values())))

    return sync_record_count
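# Hypothetical usage sketch: run_and_verify_sync above raises RetryableTapError on
# tap/discovery failures, so a caller might wrap it in a bounded retry loop like
# this. The helper name and the retry count are illustrative assumptions.
def sync_with_retries(self, conn_id, state, max_attempts=3):
    for attempt in range(1, max_attempts + 1):
        try:
            return self.run_and_verify_sync(conn_id, state)
        except RetryableTapError:
            if attempt == max_attempts:
                raise
            print("retrying sync, attempt {} of {}".format(attempt + 1, max_attempts))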
def run_and_verify_sync(self, conn_id):
    """
    Clear the connection's state in menagerie and run a sync.
    Verify the exit code following the sync.
    Return the record count by stream.
    """
    # clear state
    menagerie.set_state(conn_id, {})

    # run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # read target output
    first_record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(), self.expected_primary_keys())

    return first_record_count_by_stream
def test_run(self):
    conn_id = self.ensure_connection()

    # Run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs), 0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    # Select all tables and fields
    self.select_all_streams_and_fields(conn_id, found_catalogs)

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    sync_record_count = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(), self.expected_primary_keys())

    # Examine target output
    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            # Each stream should have 1 or more records returned
            self.assertGreaterEqual(sync_record_count[stream], 1)
def test_run(self): conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify the tap discovered the right streams found_catalogs = [ fc for fc in menagerie.get_catalogs(conn_id) if fc['tap_stream_id'] in self.expected_check_streams() ] self.assertEqual( len(found_catalogs), 1, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) # verify that persisted streams have the correct properties chicken_catalog = found_catalogs[0] self.assertEqual('chicken_view', chicken_catalog['stream_name']) print("discovered streams are correct") print('checking discoverd metadata for ROOT-CHICKEN_VIEW') md = menagerie.get_annotated_schema( conn_id, chicken_catalog['stream_id'])['metadata'] self.assertEqual( { (): { 'database-name': 'postgres', 'is-view': True, 'row-count': 0, 'schema-name': 'public', 'table-key-properties': [] }, ('properties', 'fk_id'): { 'inclusion': 'available', 'sql-datatype': 'bigint', 'selected-by-default': True }, ('properties', 'name'): { 'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': True }, ('properties', 'age'): { 'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True }, ('properties', 'size'): { 'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': True }, ('properties', 'id'): { 'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True } }, metadata.to_map(md)) # 'ID' selected as view-key-properties replication_md = [{ "breadcrumb": [], "metadata": { 'replication-key': None, "replication-method": "FULL_TABLE", 'view-key-properties': ["id"] } }] connections.select_catalog_and_fields_via_metadata( conn_id, chicken_catalog, menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id']), replication_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) self.assertEqual(record_count_by_stream, {'chicken_view': 1}) records_by_stream = runner.get_records_from_target_output() table_version = records_by_stream['chicken_view']['table_version'] self.assertEqual( records_by_stream['chicken_view']['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream['chicken_view']['messages'][1]['action'], 'upsert') self.assertEqual( records_by_stream['chicken_view']['messages'][2]['action'], 'activate_version') # verifications about individual records for stream, recs in records_by_stream.items(): # verify the persisted schema was correct self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." 
.format(stream)) actual_chicken_record = records_by_stream['chicken_view']['messages'][ 1]['data'] expected_chicken_record = { 'id': 1, 'fk_id': 1, 'name': 'fred', 'age': 99, 'size': 'big' } self.assertEqual( actual_chicken_record, expected_chicken_record, msg= "Expected `various_types` upsert record data to be {}, but target output {}" .format(expected_chicken_record, actual_chicken_record)) print("records are correct") # verify state and bookmarks state = menagerie.get_state(conn_id) chicken_bookmark = state['bookmarks']['postgres-public-chicken_view'] self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") self.assertEqual( chicken_bookmark['version'], table_version, msg="expected bookmark for stream ROOT-CHICKEN to match version")
def test_run(self): conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify the tap discovered the right streams found_catalogs = [ fc for fc in menagerie.get_catalogs(conn_id) if fc['tap_stream_id'] in self.expected_check_streams() ] self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) # verify that persisted streams have the correct properties for c in found_catalogs: catalog_props_to_check = ['stream_name', 'tap_stream_id'] stream = c['stream_name'] for prop in catalog_props_to_check: self.assertEqual( c[prop], expected_catalogs[stream][prop], msg= "unexpected stream catalog property `{}` for stream `{}`: `{}` != `{}`" .format(prop, stream, expected_catalogs[stream][prop], c[prop])) print("discovered streams are correct") print('checking discoverd metadata for tap_tester_mysql_0-incremental') incremental_catalog = [ c for c in found_catalogs if c['tap_stream_id'] == 'tap_tester_mysql_0-incremental' ][0] md = menagerie.get_annotated_schema( conn_id, incremental_catalog['stream_id'])['metadata'] incremental_stream_metadata = { 'database-name': 'tap_tester_mysql_0', 'row-count': 3, 'is-view': False, 'selected-by-default': False, 'table-key-properties': ['c_pk'] } self.assertEqual( sorted(md, key=lambda x: x['breadcrumb']), [{ 'breadcrumb': [], 'metadata': incremental_stream_metadata }, { 'breadcrumb': ['properties', 'c_dt'], 'metadata': { 'selected-by-default': True, 'sql-datatype': 'datetime' } }, { 'breadcrumb': ['properties', 'c_pk'], 'metadata': { 'selected-by-default': True, 'sql-datatype': 'int(11)' } }, { 'breadcrumb': ['properties', 'c_varchar'], 'metadata': { 'selected-by-default': True, 'sql-datatype': 'varchar(255)' } }, { 'breadcrumb': ['properties', 'c_varchar_to_deselect'], 'metadata': { 'selected-by-default': True, 'sql-datatype': 'varchar(255)' } }]) print('checking discovered metadata for tap_tester_mysql_1-view') view_catalog = [ c for c in found_catalogs if c['tap_stream_id'] == 'tap_tester_mysql_1-view' ][0] view_catalog_key_properties_md = [{ 'breadcrumb': [], 'metadata': { 'view-key-properties': ['c_pk'] } }] connections.set_non_discoverable_metadata( conn_id, view_catalog, menagerie.get_annotated_schema(conn_id, view_catalog['stream_id']), view_catalog_key_properties_md) md = menagerie.get_annotated_schema( conn_id, view_catalog['stream_id'])['metadata'] view_stream_metadata = { 'database-name': 'tap_tester_mysql_1', 'is-view': True, 'selected-by-default': False, 'view-key-properties': ['c_pk'] } self.assertEqual(sorted(md, key=lambda x: x['breadcrumb']), [{ 'breadcrumb': [], 'metadata': view_stream_metadata }, { 'breadcrumb': ['properties', 'c_pk'], 'metadata': { 'selected-by-default': True, 'sql-datatype': 'int(11)' } }, { 'breadcrumb': ['properties', 'c_varchar'], 'metadata': { 'selected-by-default': True, 'sql-datatype': 'varchar(255)' } }]) #No selected-by-default MD for c_year because it is an unsupported type various_types_catalog = [ c for c in found_catalogs if c['tap_stream_id'] == 'tap_tester_mysql_0-various_types' ][0] md = 
menagerie.get_annotated_schema( conn_id, various_types_catalog['stream_id'])['metadata'] c_year_md = [ x for x in md if x['breadcrumb'] == ['properties', 'c_year'] ] self.assertEqual(c_year_md, [{ 'breadcrumb': ['properties', 'c_year'], 'metadata': { 'selected-by-default': False, 'sql-datatype': 'year(4)' } }]) ##select_simple_example catalogs_to_select = [ c for c in found_catalogs if c['tap_stream_id'] != 'tap_tester_mysql_0-simple_example' ] for a_catalog in catalogs_to_select: additional_md = [] unselected_fields = [] if a_catalog['tap_stream_id'] == 'tap_tester_mysql_0-incremental': additional_md = [{ "breadcrumb": [], "metadata": { 'replication-key': 'c_dt', 'replication-method': 'INCREMENTAL' } }] unselected_fields = ['c_varchar_to_deselect'] elif a_catalog['tap_stream_id'] == 'tap_tester_mysql_1-view': additional_md = [{ "breadcrumb": [], "metadata": { 'view-key-properties': ['c_pk'], 'replication-method': 'FULL_TABLE' } }] else: additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'FULL_TABLE' } }] selected_metadata = connections.select_catalog_and_fields_via_metadata( conn_id, a_catalog, menagerie.get_annotated_schema(conn_id, a_catalog['stream_id']), additional_md, unselected_fields) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values()) expected_row_count = 8 # {'my_isam': 1, 'various_types': 3, 'incremental': 3, 'view': 1} self.assertEqual( replicated_row_count, expected_row_count, msg="failed to replicate correct number of rows: {}".format( record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) records_by_stream = runner.get_records_from_target_output() # verifications about individual records for stream, recs in records_by_stream.items(): # verify that activate version messages were sent in the proper position self.assertEqual( recs['messages'][0]['action'], 'activate_version', msg= "Expected first message sent for stream `{}` to have action `activate_version`" .format(stream)) # verify the persisted schema was correct self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." 
.format(stream)) # verify that the target output the proper numeric and date representations expected_various_types_records = [{ 'c_time': '1970-01-01T12:34:56.000000Z', 'c_mediumint': 8388607, 'c_smallint': 32767, 'c_tinyint': 127, 'c_date': '2017-09-13T00:00:00.000000Z', 'c_bigint': 9223372036854775807, 'c_decimal': -1, 'c_int': 2147483647, 'c_bit': True, 'c_decimal_2': Decimal('123456789.0'), 'c_pk': 1, 'c_double': Decimal("1.234"), 'c_float': Decimal("1.234"), 'c_decimal_2_unsigned': Decimal("1.23"), 'c_tinyint_1': True }, { 'c_time': '1970-01-01T12:34:57.000000Z', 'c_mediumint': -8388608, 'c_smallint': -32768, 'c_tinyint': -128, 'c_date': '2017-09-14T00:00:00.000000Z', 'c_bigint': -9223372036854775808, 'c_decimal': 0, 'c_int': -2147483648, 'c_bit': False, 'c_decimal_2': Decimal("123456790.0"), 'c_pk': 2, 'c_double': Decimal("2.234"), 'c_float': Decimal("2.234"), 'c_decimal_2_unsigned': Decimal("0.23"), 'c_tinyint_1': False }, { 'c_time': '1970-01-01T12:34:57.000000Z', 'c_mediumint': -8388608, 'c_smallint': -32768, 'c_tinyint': -128, 'c_date': '2017-09-14T00:00:00.000000Z', 'c_bigint': -9223372036854775808, 'c_decimal': 0, 'c_int': -2147483648, 'c_bit': None, 'c_decimal_2': Decimal("123456790.0"), 'c_pk': 3, 'c_double': Decimal("2.234"), 'c_float': Decimal("2.234"), 'c_decimal_2_unsigned': Decimal("0.23"), 'c_tinyint_1': None }] actual_various_types_records = [ r['data'] for r in records_by_stream['various_types']['messages'][1:4] ] self.assertEqual( actual_various_types_records, expected_various_types_records, msg= "Expected `various_types` upsert record data to be {}, but target output {}" .format(expected_various_types_records, actual_various_types_records)) # verify that deselected property was not output expected_incremental_record = { 'c_pk': 1, 'c_dt': '2017-01-01T00:00:00.000000Z', 'c_varchar': 'a' } actual_incremental_record = records_by_stream['incremental'][ 'messages'][1]['data'] self.assertEqual( actual_incremental_record, expected_incremental_record, msg= "Expected first `incremental` upsert record data to be {}, but target output {}" .format(expected_incremental_record, actual_incremental_record)) print("records are correct") # verify state and bookmarks state = menagerie.get_state(conn_id) bookmarks = state['bookmarks'] self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") for k, v in bookmarks.items(): if k == 'tap_tester_mysql_0-incremental': self.assertIsNotNone( v['version'], msg="expected bookmark for stream `{}` to have a version set" .format(k)) self.assertEqual( v['replication_key_value'], '2017-01-01T00:00:02.000000Z', msg= "incorrect replication_key_value in bookmark for stream `{}`" .format(k)) self.assertEqual( v['replication_key'], 'c_dt', msg= "incorrect replication_key specified in bookmark for stream `{}`" .format(k)) else: self.assertFalse( 'version' in v, msg= "expected bookmark for stream `{}` to not have a version key" .format(k)) self.assertTrue( 'initial_full_table_complete' in v, msg= "expected bookmark for stream `{}` to have a true initial_full_table_complete key" .format(k)) print("state and bookmarks are correct") incremental_table_initial_table_version = bookmarks[ 'tap_tester_mysql_0-incremental']['version'] #---------------------------------------------------------------------- # invoke the sync job again after some modifications #---------------------------------------------------------------------- print("adding a column to an existing table in the source db") connection = 
db_utils.get_db_connection(self.get_properties(), self.get_credentials()) with connection.cursor() as cursor: add_column_sql = ''' ALTER TABLE tap_tester_mysql_0.incremental ADD COLUMN favorite_number INTEGER; INSERT INTO tap_tester_mysql_0.incremental VALUES (4, '4', '2017-01-01 00:00:03', 'yeehaw', 999); ''' cursor.execute(add_column_sql) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify the tap discovered the right streams found_catalogs = [ fc for fc in menagerie.get_catalogs(conn_id) if fc['tap_stream_id'] in self.expected_check_streams() ] self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values()) expected_row_count = 7 # {'my_isam': 1, 'various_types': 3, 'incremental': 2, 'view': 1} self.assertEqual( replicated_row_count, expected_row_count, msg="failed to replicate correct number of rows: {}".format( record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) records_by_stream = runner.get_records_from_target_output() expected_schema_of_new_column = { 'maximum': 2147483647, 'selected': True, 'inclusion': 'available', 'type': ['null', 'integer'], 'minimum': -2147483648 } # verifications about individual records for stream, recs in records_by_stream.items(): # verify that a activate version messages were sent in the proper position if stream == 'incremental': self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version', msg= "Expected first message sent for stream `{}` not to have action `activate_version`" .format(stream)) expected_schema_of_new_column = { 'maximum': 2147483647, 'inclusion': 'available', 'type': ['null', 'integer'], 'minimum': -2147483648 } self.assertEqual( records_by_stream[stream]['schema']['properties'] ['favorite_number'], expected_schema_of_new_column, msg= "Expected newly-added column to be present in schema for stream `{}`, but it was not." 
.format(stream)) else: self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'upsert', msg= "Expected first message sent for stream `{}` to have action `upsert`" .format(stream)) self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version', msg= "Expected last message sent for stream `{}` to have action `activate_version`" .format(stream)) state = menagerie.get_state(conn_id) bookmarks = state['bookmarks'] self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") for k, v in bookmarks.items(): if k == 'tap_tester_mysql_0-incremental': self.assertIsNotNone( v['version'], msg="expected bookmark for stream `{}` to have a version set" .format(k)) self.assertEqual( v['replication_key_value'], '2017-01-01T00:00:03.000000Z', msg= "incorrect replication_key_value in bookmark for stream `{}`" .format(k)) self.assertEqual( v['replication_key'], 'c_dt', msg= "incorrect replication_key specified in bookmark for stream `{}`" .format(k)) else: self.assertFalse( 'version' in v, msg= "expected bookmark for stream `{}` to not have a version key" .format(k)) self.assertTrue( 'initial_full_table_complete' in v, msg= "expected bookmark for stream `{}` to have a true initial_full_table_complete key" .format(k)) print("state and bookmarks are correct") # verify incremental table_version didn't change incremental_table_new_table_version = bookmarks[ 'tap_tester_mysql_0-incremental']['version'] self.assertEqual( incremental_table_initial_table_version, incremental_table_new_table_version, msg= "Expected incrementally-replicated table's table_version to remain unchanged over multiple invocations." )
def test_run(self): conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify the tap discovered the right streams found_catalogs = [ fc for fc in menagerie.get_catalogs(conn_id) if fc['tap_stream_id'] in self.expected_check_streams() ] self.assertGreaterEqual( len(found_catalogs), 2, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) # verify that persisted streams have the correct properties test_catalog_cows = list( filter( lambda c: c['stream_name'] == 'postgres_logical_replication_test_cows', found_catalogs))[0] self.assertEqual('postgres_logical_replication_test_cows', test_catalog_cows['stream_name']) test_catalog_chickens = list( filter( lambda c: c['stream_name' ] == 'postgres_logical_replication_test_chickens', found_catalogs))[0] self.assertEqual('postgres_logical_replication_test_chickens', test_catalog_chickens['stream_name']) print("discovered streams are correct") additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] connections.select_catalog_and_fields_via_metadata( conn_id, test_catalog_cows, menagerie.get_annotated_schema(conn_id, test_catalog_cows['stream_id']), additional_md) connections.select_catalog_and_fields_via_metadata( conn_id, test_catalog_chickens, menagerie.get_annotated_schema(conn_id, test_catalog_chickens['stream_id']), additional_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) self.assertEqual( record_count_by_stream, { 'postgres_logical_replication_test_cows': 1, 'postgres_logical_replication_test_chickens': 1 }) records_by_stream = runner.get_records_from_target_output() table_version_cows = records_by_stream[ 'postgres_logical_replication_test_cows']['table_version'] self.assertEqual( records_by_stream['postgres_logical_replication_test_cows'] ['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream['postgres_logical_replication_test_cows'] ['messages'][1]['action'], 'upsert') self.assertEqual( records_by_stream['postgres_logical_replication_test_cows'] ['messages'][2]['action'], 'activate_version') table_version_chickens = records_by_stream[ 'postgres_logical_replication_test_chickens']['table_version'] self.assertEqual( records_by_stream['postgres_logical_replication_test_chickens'] ['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream['postgres_logical_replication_test_chickens'] ['messages'][1]['action'], 'upsert') self.assertEqual( records_by_stream['postgres_logical_replication_test_chickens'] ['messages'][2]['action'], 'activate_version') # verify state and bookmarks state = menagerie.get_state(conn_id) self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") bookmark_cows = 
state['bookmarks'][ 'dev-public-postgres_logical_replication_test_cows'] self.assertIsNotNone(bookmark_cows['lsn'], msg="expected bookmark for stream to have an lsn") lsn_cows_1 = bookmark_cows['lsn'] self.assertEqual(bookmark_cows['version'], table_version_cows, msg="expected bookmark for stream to match version") bookmark_chickens = state['bookmarks'][ 'dev-public-postgres_logical_replication_test_chickens'] self.assertIsNotNone(bookmark_chickens['lsn'], msg="expected bookmark for stream to have an lsn") lsn_chickens_1 = bookmark_chickens['lsn'] self.assertEqual(bookmark_chickens['version'], table_version_chickens, msg="expected bookmark for stream to match version") #---------------------------------------------------------------------- # invoke the sync job again after adding records #---------------------------------------------------------------------- print("inserting 2 more cows and 2 more chickens") with db_utils.get_test_connection('dev') as conn: conn.autocommit = True with conn.cursor() as cur: # insert another cow self.cows_rec_2 = {'cow_name': "betty cow", 'cow_age': 21} insert_record(cur, test_table_name_cows, self.cows_rec_2) # update that cow's expected values self.cows_rec_2['id'] = 2 self.cows_rec_2['_sdc_deleted_at'] = None # insert another chicken self.chicken_rec_2 = { 'chicken_name': "burt chicken", 'chicken_age': 14 } insert_record(cur, test_table_name_chickens, self.chicken_rec_2) # update that cow's expected values self.chicken_rec_2['id'] = 2 self.chicken_rec_2['_sdc_deleted_at'] = None # and repeat... self.cows_rec_3 = {'cow_name': "cindy cow", 'cow_age': 10} insert_record(cur, test_table_name_cows, self.cows_rec_3) self.cows_rec_3['id'] = 3 self.cows_rec_3['_sdc_deleted_at'] = None self.chicken_rec_3 = { 'chicken_name': "carl chicken", 'chicken_age': 4 } insert_record(cur, test_table_name_chickens, self.chicken_rec_3) self.chicken_rec_3['id'] = 3 self.chicken_rec_3['_sdc_deleted_at'] = None sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) self.assertEqual( record_count_by_stream, { 'postgres_logical_replication_test_cows': 2, 'postgres_logical_replication_test_chickens': 2 }) records_by_stream = runner.get_records_from_target_output() chicken_messages = records_by_stream[ "postgres_logical_replication_test_chickens"]['messages'] cow_messages = records_by_stream[ "postgres_logical_replication_test_cows"]['messages'] self.assertDictEqual(self.cows_rec_2, cow_messages[0]['data']) self.assertDictEqual(self.chicken_rec_2, chicken_messages[0]['data']) self.assertDictEqual(self.cows_rec_3, cow_messages[1]['data']) self.assertDictEqual(self.chicken_rec_3, chicken_messages[1]['data']) print("inserted record is correct") state = menagerie.get_state(conn_id) self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") cows_bookmark = state['bookmarks'][ 'dev-public-postgres_logical_replication_test_cows'] self.assertIsNotNone( cows_bookmark['lsn'], msg= "expected bookmark for stream public-postgres_logical_replication_test to have an scn" ) lsn_cows_2 = cows_bookmark['lsn'] self.assertTrue(lsn_cows_2 >= lsn_cows_1) chickens_bookmark = state['bookmarks'][ 'dev-public-postgres_logical_replication_test_chickens'] self.assertIsNotNone( 
chickens_bookmark['lsn'], msg= "expected bookmark for stream public-postgres_logical_replication_test_chickens to have an lsn" ) lsn_chickens_2 = chickens_bookmark['lsn'] self.assertTrue(lsn_chickens_2 >= lsn_chickens_1) # table_version does NOT change self.assertEqual( chickens_bookmark['version'], table_version_chickens, msg= "expected bookmark for stream public-postgres_logical_replication_test_chickens to match version" ) # table_version does NOT change self.assertEqual( cows_bookmark['version'], table_version_cows, msg= "expected bookmark for stream public-postgres_logical_replication_test_cows to match version" )
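The assertions above repeat one invariant per stream: across LOG_BASED syncs the LSN bookmark may only move forward and the table version must stay the same. A minimal standalone sketch of that invariant, assuming state dictionaries shaped like the ones returned by menagerie.get_state above (the helper name and the two-snapshot usage are illustrative, not part of the harness):

def assert_logical_bookmark_advanced(prev_state, curr_state, tap_stream_id):
    # Compare the bookmark for one stream between two captured states.
    prev = prev_state['bookmarks'][tap_stream_id]
    curr = curr_state['bookmarks'][tap_stream_id]
    assert curr['lsn'] is not None, "bookmark is missing an lsn"
    assert curr['lsn'] >= prev['lsn'], "lsn moved backwards between syncs"
    assert curr['version'] == prev['version'], "table version changed unexpectedly"

# Example usage with two snapshots taken around the second sync:
# assert_logical_bookmark_advanced(
#     state_after_sync_1, state_after_sync_2,
#     'dev-public-postgres_logical_replication_test_cows')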
def test_run(self): conn_id = connections.ensure_connection(self) # ------------------------------- # ----------- Discovery ---------- # ------------------------------- # run in discovery mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify the tap discovered the right streams found_catalogs = menagerie.get_catalogs(conn_id) # assert we find the correct streams self.assertEqual(self.expected_check_streams(), {c['tap_stream_id'] for c in found_catalogs}) for tap_stream_id in self.expected_check_streams(): found_stream = [ c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id ][0] # assert that the pks are correct self.assertEqual( self.expected_pks()[found_stream['stream_name']], set( found_stream.get('metadata', {}).get('table-key-properties'))) # assert that the row counts are correct self.assertEqual( self.expected_row_counts()[found_stream['stream_name']], found_stream.get('metadata', {}).get('row-count')) # ----------------------------------- # ----------- Initial Full Table --------- # ----------------------------------- # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata for stream_catalog in found_catalogs: annotated_schema = menagerie.get_annotated_schema( conn_id, stream_catalog['stream_id']) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] selected_metadata = connections.select_catalog_and_fields_via_metadata( conn_id, stream_catalog, annotated_schema, additional_md) # Run sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() # assert that each of the streams that we synced are the ones that we expect to see record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) # Verify that the full table was syncd for tap_stream_id in self.expected_sync_streams(): self.assertGreaterEqual(record_count_by_stream[tap_stream_id], self.expected_row_counts()[tap_stream_id]) # Verify that we have 'initial_full_table_complete' bookmark state = menagerie.get_state(conn_id) first_versions = {} for tap_stream_id in self.expected_check_streams(): # assert that the state has an initial_full_table_complete == True self.assertTrue(state['bookmarks'][tap_stream_id] ['initial_full_table_complete']) # assert that there is a version bookmark in state first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id][ 'version'] self.assertIsNotNone(first_versions[tap_stream_id]) # Verify that we have a oplog_ts_time and oplog_ts_inc bookmark self.assertIsNotNone( state['bookmarks'][tap_stream_id]['oplog_ts_time']) self.assertIsNotNone( state['bookmarks'][tap_stream_id]['oplog_ts_inc']) changed_ids = set() with get_test_connection() as client: # Delete two documents for each collection changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 0})[0]['_id']) client["simple_db"]["simple_coll_1"].delete_one({'int_field': 0}) changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 1})[0]['_id']) client["simple_db"]["simple_coll_1"].delete_one({'int_field': 1}) changed_ids.add(client['simple_db']['simple_coll_2'].find( 
{'int_field': 0})[0]['_id']) client["simple_db"]["simple_coll_2"].delete_one({'int_field': 0}) changed_ids.add(client['simple_db']['simple_coll_2'].find( {'int_field': 1})[0]['_id']) client["simple_db"]["simple_coll_2"].delete_one({'int_field': 1}) # Update two documents for each collection changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 48})[0]['_id']) client["simple_db"]["simple_coll_1"].update_one( {'int_field': 48}, {'$set': { 'int_field': -1 }}) changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 49})[0]['_id']) client["simple_db"]["simple_coll_1"].update_one( {'int_field': 49}, {'$set': { 'int_field': -1 }}) changed_ids.add(client['simple_db']['simple_coll_2'].find( {'int_field': 98})[0]['_id']) client["simple_db"]["simple_coll_2"].update_one( {'int_field': 98}, {'$set': { 'int_field': -1 }}) changed_ids.add(client['simple_db']['simple_coll_2'].find( {'int_field': 99})[0]['_id']) client["simple_db"]["simple_coll_2"].update_one( {'int_field': 99}, {'$set': { 'int_field': -1 }}) # Insert two documents for each collection client["simple_db"]["simple_coll_1"].insert_one({ "int_field": 50, "string_field": random_string_generator() }) changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 50})[0]['_id']) client["simple_db"]["simple_coll_1"].insert_one({ "int_field": 51, "string_field": random_string_generator() }) changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 51})[0]['_id']) client["simple_db"]["simple_coll_2"].insert_one({ "int_field": 100, "string_field": random_string_generator() }) changed_ids.add(client['simple_db']['simple_coll_2'].find( {'int_field': 100})[0]['_id']) client["simple_db"]["simple_coll_2"].insert_one({ "int_field": 101, "string_field": random_string_generator() }) changed_ids.add(client['simple_db']['simple_coll_2'].find( {'int_field': 101})[0]['_id']) # ----------------------------------- # ----------- Subsequent Oplog Sync --------- # ----------------------------------- # Run sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify the persisted schema was correct messages_by_stream = runner.get_records_from_target_output() records_by_stream = {} for stream_name in self.expected_sync_streams(): records_by_stream[stream_name] = [ x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert' ] # assert that each of the streams that we synced are the ones that we expect to see record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) # Verify that we got at least 6 records due to changes # (could be more due to overlap in gte oplog clause) for k, v in record_count_by_stream.items(): self.assertGreaterEqual(v, 6) # Verify that we got 2 records with _SDC_DELETED_AT self.assertEqual( 2, len([ x['data'] for x in records_by_stream['simple_coll_1'] if x['data'].get('_sdc_deleted_at') ])) self.assertEqual( 2, len([ x['data'] for x in records_by_stream['simple_coll_2'] if x['data'].get('_sdc_deleted_at') ])) # Verify that the _id of the records sent are the same set as the # _ids of the documents changed actual = set([ ObjectId(x['data']['_id']) for x in records_by_stream['simple_coll_1'] ]).union( set([ ObjectId(x['data']['_id']) for x in records_by_stream['simple_coll_2'] ])) self.assertEqual(changed_ids, actual)
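The oplog verification above hinges on a set comparison: every _id touched in the source (deleted, updated, or inserted) should appear exactly once among the replicated upserts. A small sketch of that comparison, assuming the filtered records_by_stream built above and that bson (bundled with pymongo, which the test already uses) is available; the helper name is illustrative:

from bson import ObjectId

def replicated_object_ids(records_by_stream, streams):
    # Collect the ObjectIds of every upsert record the target received.
    ids = set()
    for stream in streams:
        ids.update(ObjectId(msg['data']['_id']) for msg in records_by_stream[stream])
    return ids

# Example usage:
# assert replicated_object_ids(records_by_stream,
#                              ['simple_coll_1', 'simple_coll_2']) == changed_ids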
def test_run(self): """ Verify that a full sync captures all data and sends it in the correct format for integer and boolean (bit) data. Verify that the first sync sends an activate_version message immediately. Verify that the table version is incremented. """ print("running test {}".format(self.name())) conn_id = self.create_connection() # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # get the catalog information of discovery found_catalogs = menagerie.get_catalogs(conn_id) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'FULL_TABLE' } }] BaseTapTest.select_all_streams_and_fields(conn_id, found_catalogs, additional_md=additional_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify record counts of streams record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) # verify records match on the first sync records_by_stream = runner.get_records_from_target_output() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] # TODO - test schema matches expectations based on data type, nullable, not nullable, datetimes as string +, etc # This needs to be consistent based on replication method so you can change replication methods table_version = records_by_stream[stream]['table_version'] # verify on the first sync you get activate version message before and after all data self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version') column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list( zip(column_names, stream_expected_data[self.VALUES] [row])) } } for row in range(len(stream_expected_data[self.VALUES]))] # remove sequences from actual values for comparison [ message.pop("sequence") for message in records_by_stream[stream]['messages'][1:-1] ] # Verify all data is correct for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:-1])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") self.assertEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") # TODO - change this to something for mssql once binlog (cdc) is
finalized and we know what it is self.assertIsNone( bookmark.get('lsn'), msg= "expected bookmark for stream to have NO lsn because we are using full-table replication" ) self.assertEqual( bookmark['version'], table_version, msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema']))
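The expected_messages construction above pairs a list of column names with each row of expected values to produce Singer-style upsert messages. A compact sketch of the same zip-based technique with illustrative sample data (the function name is an assumption, not part of the test base class):

def build_expected_upserts(column_names, rows):
    # One upsert message per row, mapping column names onto that row's values.
    return [{"action": "upsert", "data": dict(zip(column_names, row))} for row in rows]

# Example usage:
# build_expected_upserts(["id", "our_bit"], [(1, True), (2, False)])
# -> [{'action': 'upsert', 'data': {'id': 1, 'our_bit': True}},
#     {'action': 'upsert', 'data': {'id': 2, 'our_bit': False}}]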
def binlog_json_test(self): print("RUNNING {}\n\n".format(self.name())) conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) expected_check_streams = {self.tap_stream_id()} expected_sync_streams = {self.table_name()} expected_pks = {self.table_name(): {'id'}} # verify the tap discovered the right streams found_catalogs = [ catalog for catalog in menagerie.get_catalogs(conn_id) if catalog['tap_stream_id'] in expected_check_streams ] self.assertGreaterEqual( len(found_catalogs), 1, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = expected_check_streams.symmetric_difference(found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) # verify that persisted streams have the correct properties test_catalog = found_catalogs[0] self.assertEqual(self.table_name(), test_catalog['stream_name']) print("discovered streams are correct") additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] selected_metadata = connections.select_catalog_and_fields_via_metadata( conn_id, test_catalog, menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']), additional_md) # clear state menagerie.set_state(conn_id, {}) # run initial full table sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() self.maxDiff = None for stream, recs in records_by_stream.items(): self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." 
.format(stream)) record_count_by_stream = runner.examine_target_output_file( self, conn_id, expected_sync_streams, expected_pks) self.assertEqual(record_count_by_stream, {self.table_name(): 1}) records_for_stream = runner.get_records_from_target_output()[ self.table_name()] messages_for_stream = records_for_stream['messages'] message_actions = [rec['action'] for rec in messages_for_stream] self.assertEqual(message_actions, ['activate_version', 'upsert', 'activate_version']) # ensure some log_file and log_pos state was persisted state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id()] self.assertIsNotNone(bookmark['log_file']) self.assertIsNotNone(bookmark['log_pos']) expected_log_file = bookmark['log_file'] expected_log_pos = bookmark['log_pos'] # grab version, log_file and log_pos from state to check later expected_table_version = records_for_stream['table_version'] self.assertEqual(expected_table_version, bookmark['version']) # check for expected records upsert_records = [ m['data'] for m in messages_for_stream if m['action'] == 'upsert' ] self.assertEqual([expected_rec_1], upsert_records) # run binlog sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # check that version state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id()] self.assertEqual(expected_table_version, bookmark['version']) # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() for stream, recs in records_by_stream.items(): self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." 
.format(stream)) # record count should be empty as we did not persist anything to the gate record_count_by_stream = runner.examine_target_output_file( self, conn_id, expected_sync_streams, expected_pks) self.assertEqual(record_count_by_stream, {}) # insert a new huge row data = dict([('foooo%i' % i, 'baaaaar%i' % i) for i in range(2560)], literal=True) rec = {'id': 2, 'our_json': json.dumps(data)} with db_utils.get_db_connection( self.get_properties(), self.get_credentials()).cursor() as cur: self.insert_record(cur, rec) # run binlog sync sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # check that version from state is unchanged state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id()] self.assertEqual(expected_table_version, bookmark['version']) # Either the log_file is the same but the log_pos has increased or the log_file # has rotated and the numeric suffix has increased if expected_log_file == bookmark['log_file']: self.assertGreater(bookmark['log_pos'], expected_log_pos) else: expected_log_file_suffix = re.search('^.*\.(\d+)$', expected_log_file).groups()[0] updated_log_file_suffix = re.search( '^.*\.(\d+)$', bookmark['log_file']).groups()[0] self.assertGreater(int(updated_log_file_suffix), int(expected_log_file_suffix)) expected_log_file = bookmark['log_file'] expected_log_pos = bookmark['log_pos'] expected_rec_2 = copy.deepcopy(rec) # check for expected records records_for_stream = runner.get_records_from_target_output()[ self.table_name()] messages_for_stream = records_for_stream['messages'] message_actions = [rec['action'] for rec in messages_for_stream] self.assertEqual(message_actions, ['upsert']) upsert_records = [ m['data'] for m in messages_for_stream if m['action'] == 'upsert' ] del upsert_records[0]['_sdc_deleted_at'] expected_json = json.loads(expected_rec_2.get('our_json', {})) actual_json = json.loads(upsert_records[0].get('our_json', {})) self.assertTrue(len(actual_json.keys()) > 0) self.assertEqual(expected_json, actual_json)
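The bookmark check above encodes how MySQL binlog coordinates advance: either the position grows within the same log file, or the file rotates to one with a higher numeric suffix. A minimal sketch of that comparison as a standalone predicate (the function name and sample file names are illustrative):

import re

def binlog_coordinates_advanced(old_file, old_pos, new_file, new_pos):
    # Same file: the position must have grown.
    if old_file == new_file:
        return new_pos > old_pos
    # Rotated file: compare the numeric suffix, e.g. mysql-bin.000002 -> 000003.
    old_suffix = int(re.search(r'^.*\.(\d+)$', old_file).group(1))
    new_suffix = int(re.search(r'^.*\.(\d+)$', new_file).group(1))
    return new_suffix > old_suffix

# Example usage:
# assert binlog_coordinates_advanced('mysql-bin.000002', 154, 'mysql-bin.000003', 4)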
def test_run(self): conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify discovery produced (at least) 1 expected catalog found_catalogs = [ found_catalog for found_catalog in menagerie.get_catalogs(conn_id) if found_catalog['tap_stream_id'] in self.expected_check_streams() ] self.assertGreaterEqual(len(found_catalogs), 1) # verify the tap discovered the expected streams found_catalog_names = { catalog['tap_stream_id'] for catalog in found_catalogs } self.assertSetEqual(self.expected_check_streams(), found_catalog_names) # verify that persisted streams have the correct properties test_catalog = found_catalogs[0] self.assertEqual(test_table_name, test_catalog['stream_name']) print("discovered streams are correct") # perform table selection print('selecting {} and all fields within the table'.format( test_table_name)) schema_and_metadata = menagerie.get_annotated_schema( conn_id, test_catalog['stream_id']) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'FULL_TABLE' } }] _ = connections.select_catalog_and_fields_via_metadata( conn_id, test_catalog, schema_and_metadata, additional_md) # clear state menagerie.set_state(conn_id, {}) # run sync job 1 and verify exit codes sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # get records record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) records_by_stream = runner.get_records_from_target_output() table_version_1 = records_by_stream[test_table_name]['table_version'] messages = records_by_stream[test_table_name]['messages'] # verify the expected number of records were replicated self.assertEqual(3, record_count_by_stream[test_table_name]) # verify the message actions match expectations self.assertEqual(5, len(messages)) self.assertEqual('activate_version', messages[0]['action']) self.assertEqual('upsert', messages[1]['action']) self.assertEqual('upsert', messages[2]['action']) self.assertEqual('upsert', messages[3]['action']) self.assertEqual('activate_version', messages[4]['action']) # verify the persisted schema matches expectations self.assertEqual(expected_schemas[test_table_name], records_by_stream[test_table_name]['schema']) # verify replicated records match expectations self.assertDictEqual(self.expected_records[0], messages[1]['data']) self.assertDictEqual(self.expected_records[1], messages[2]['data']) self.assertDictEqual(self.expected_records[2], messages[3]['data']) print("records are correct") # grab bookmarked state state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'dev-public-postgres_full_table_replication_test'] # verify state and bookmarks meet expectations self.assertIsNone(state['currently_syncing']) self.assertIsNone(bookmark.get('lsn')) self.assertIsNone(bookmark.get('replication_key')) self.assertIsNone(bookmark.get('replication_key_value')) self.assertEqual(table_version_1, bookmark['version']) #---------------------------------------------------------------------- # invoke the sync job AGAIN and get the same 3 records #---------------------------------------------------------------------- # run sync job 2 and verify exit codes sync_job_name = 
runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # get records record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) records_by_stream = runner.get_records_from_target_output() table_version_2 = records_by_stream[test_table_name]['table_version'] messages = records_by_stream[test_table_name]['messages'] # verify the expected number of records were replicated self.assertEqual(3, record_count_by_stream[test_table_name]) # verify the message actions match expectations self.assertEqual(4, len(messages)) self.assertEqual('upsert', messages[0]['action']) self.assertEqual('upsert', messages[1]['action']) self.assertEqual('upsert', messages[2]['action']) self.assertEqual('activate_version', messages[3]['action']) # verify the new table version increased on the second sync self.assertGreater(table_version_2, table_version_1) # verify the persisted schema still matches expectations self.assertEqual(expected_schemas[test_table_name], records_by_stream[test_table_name]['schema']) # verify replicated records still match expectations self.assertDictEqual(self.expected_records[0], messages[0]['data']) self.assertDictEqual(self.expected_records[1], messages[1]['data']) self.assertDictEqual(self.expected_records[2], messages[2]['data']) # grab bookmarked state state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'dev-public-postgres_full_table_replication_test'] # verify state and bookmarks meet expectations self.assertIsNone(state['currently_syncing']) self.assertIsNone(bookmark.get('lsn')) self.assertIsNone(bookmark.get('replication_key')) self.assertIsNone(bookmark.get('replication_key_value')) self.assertEqual(table_version_2, bookmark['version']) #---------------------------------------------------------------------- # invoke the sync job AGAIN following various manipulations to the data #---------------------------------------------------------------------- with db_utils.get_test_connection('dev') as conn: conn.autocommit = True with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: # NB | We will perform the following actions prior to the next sync: # [Action (EXPECTED RESULT)] # Insert a record # Insert a record to be updated prior to sync # Insert a record to be deleted prior to sync (NOT REPLICATED) # Update an existing record # Update a newly inserted record # Delete an existing record # Delete a newly inserted record # inserting... 
# a new record nyc_tz = pytz.timezone('America/New_York') our_time_offset = "-04:00" our_ts = datetime.datetime(1996, 4, 4, 4, 4, 4, 733184) our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(6, 6, 6) our_time_tz = our_time.isoformat() + our_time_offset our_date = datetime.date(1970, 7, 1) my_uuid = str(uuid.uuid1()) self.inserted_records.append({ 'our_varchar': "our_varchar 2", 'our_varchar_10': "varchar_10", 'our_text': "some text 2", 'our_integer': 44101, 'our_smallint': 2, 'our_bigint': 1000001, 'our_decimal': decimal.Decimal('9876543210.02'), quote_ident('OUR TS', cur): our_ts, quote_ident('OUR TS TZ', cur): our_ts_tz, quote_ident('OUR TIME', cur): our_time, quote_ident('OUR TIME TZ', cur): our_time_tz, quote_ident('OUR DATE', cur): our_date, 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': '1', 'our_json': json.dumps({'nymn': 77}), 'our_jsonb': json.dumps({'burgers': 'good++'}), 'our_uuid': my_uuid, 'our_citext': 'cyclops 2', 'our_store': 'dances=>"floor",name=>"betty"', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': '$0.98789' }) self.expected_records.append({ 'id': 4, 'our_varchar': "our_varchar 2", 'our_varchar_10': "varchar_10", 'our_text': "some text 2", 'our_integer': 44101, 'our_smallint': 2, 'our_bigint': 1000001, 'our_decimal': decimal.Decimal('9876543210.02'), 'OUR TS': self.expected_ts(our_ts), 'OUR TS TZ': self.expected_ts_tz(our_ts_tz), 'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz), 'OUR DATE': '1970-07-01T00:00:00+00:00', 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': True, 'our_json': '{"nymn": 77}', 'our_jsonb': '{"burgers": "good++"}', 'our_uuid': self.inserted_records[-1]['our_uuid'], 'our_citext': self.inserted_records[-1]['our_citext'], 'our_store': { "name": "betty", "dances": "floor" }, 'our_cidr': self.inserted_records[-1]['our_cidr'], 'our_inet': self.inserted_records[-1]['our_inet'], 'our_mac': self.inserted_records[-1]['our_mac'], 'our_money': '$0.99', 'our_alignment_enum': None, }) # a new record which we will then update prior to sync our_ts = datetime.datetime(2007, 1, 1, 12, 12, 12, 222111) nyc_tz = pytz.timezone('America/New_York') our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(12, 11, 10) our_time_tz = our_time.isoformat() + "-04:00" our_date = datetime.date(1999, 9, 9) my_uuid = str(uuid.uuid1()) self.inserted_records.append({ 'our_varchar': "our_varchar 4", 'our_varchar_10': "varchar_3", 'our_text': "some text 4", 'our_integer': 55200, 'our_smallint': 1, 'our_bigint': 100000, 'our_decimal': decimal.Decimal('1234567899.99'), quote_ident('OUR TS', cur): our_ts, quote_ident('OUR TS TZ', cur): our_ts_tz, quote_ident('OUR TIME', cur): our_time, quote_ident('OUR TIME TZ', cur): our_time_tz, quote_ident('OUR DATE', cur): our_date, 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': '0', 'our_json': json.dumps('some string'), 'our_jsonb': json.dumps(['burgers are good']), 'our_uuid': my_uuid, 'our_store': 'size=>"small",name=>"betty"', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, }) self.expected_records.append({ 'our_decimal': decimal.Decimal('1234567899.99'), 'our_text': 'some text 4', 'our_bit': False, 'our_integer': 55200, 'our_double': decimal.Decimal('1.1'), 'id': 5, 'our_json': 
self.inserted_records[-1]['our_json'], 'our_boolean': True, 'our_jsonb': self.inserted_records[-1]['our_jsonb'], 'our_bigint': 100000, 'OUR TS': self.expected_ts(our_ts), 'OUR TS TZ': self.expected_ts_tz(our_ts_tz), 'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz), 'our_store': { "name": "betty", "size": "small" }, 'our_smallint': 1, 'OUR DATE': '1999-09-09T00:00:00+00:00', 'our_varchar': 'our_varchar 4', 'our_uuid': self.inserted_records[-1]['our_uuid'], 'our_real': decimal.Decimal('1.2'), 'our_varchar_10': 'varchar_3', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, 'our_alignment_enum': None, }) # a new record to be deleted prior to sync our_ts = datetime.datetime(2111, 1, 1, 12, 12, 12, 222111) nyc_tz = pytz.timezone('America/New_York') our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(12, 11, 10) our_time_tz = our_time.isoformat() + "-04:00" our_date = datetime.date(1999, 9, 9) my_uuid = str(uuid.uuid1()) self.inserted_records.append({ 'our_varchar': "our_varchar 4", 'our_varchar_10': "varchar_3", 'our_text': "some text 4", 'our_integer': 55200, 'our_smallint': 1, 'our_bigint': 100000, 'our_decimal': decimal.Decimal('1234567899.99'), quote_ident('OUR TS', cur): our_ts, quote_ident('OUR TS TZ', cur): our_ts_tz, quote_ident('OUR TIME', cur): our_time, quote_ident('OUR TIME TZ', cur): our_time_tz, quote_ident('OUR DATE', cur): our_date, 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': '0', 'our_json': json.dumps('some string'), 'our_jsonb': json.dumps(['burgers are good']), 'our_uuid': my_uuid, 'our_store': 'size=>"small",name=>"betty"', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, }) self.expected_records.append({ 'our_decimal': decimal.Decimal('1234567899.99'), 'our_text': 'some text 4', 'our_bit': False, 'our_integer': 55200, 'our_double': decimal.Decimal('1.1'), 'id': 6, 'our_json': self.inserted_records[-1]['our_json'], 'our_boolean': True, 'our_jsonb': self.inserted_records[-1]['our_jsonb'], 'our_bigint': 100000, 'OUR TS': self.expected_ts(our_ts), 'OUR TS TZ': self.expected_ts_tz(our_ts_tz), 'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz), 'our_store': { "name": "betty", "size": "small" }, 'our_smallint': 1, 'OUR DATE': '1999-09-09T00:00:00+00:00', 'our_varchar': 'our_varchar 4', 'our_uuid': self.inserted_records[-1]['our_uuid'], 'our_real': decimal.Decimal('1.2'), 'our_varchar_10': 'varchar_3', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, 'our_alignment_enum': None, }) db_utils.insert_record(cur, test_table_name, self.inserted_records[3]) db_utils.insert_record(cur, test_table_name, self.inserted_records[4]) db_utils.insert_record(cur, test_table_name, self.inserted_records[5]) # updating ... 
# an existing record canon_table_name = db_utils.canonicalized_table_name( cur, test_schema_name, test_table_name) record_pk = 1 our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184) our_ts_tz = nyc_tz.localize(our_ts) updated_data = { "OUR TS TZ": our_ts_tz, "our_double": decimal.Decimal("6.6"), "our_money": "$0.00" } self.expected_records[0]["OUR TS TZ"] = self.expected_ts_tz( our_ts_tz) self.expected_records[0]["our_double"] = decimal.Decimal("6.6") self.expected_records[0]["our_money"] = "$0.00" db_utils.update_record(cur, canon_table_name, record_pk, updated_data) # a newly inserted record canon_table_name = db_utils.canonicalized_table_name( cur, test_schema_name, test_table_name) record_pk = 5 our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184) our_ts_tz = nyc_tz.localize(our_ts) updated_data = { "OUR TS TZ": our_ts_tz, "our_double": decimal.Decimal("6.6"), "our_money": "$0.00" } self.expected_records[4]["OUR TS TZ"] = self.expected_ts_tz( our_ts_tz) self.expected_records[4]["our_double"] = decimal.Decimal("6.6") self.expected_records[4]["our_money"] = "$0.00" db_utils.update_record(cur, canon_table_name, record_pk, updated_data) # deleting # an existing record record_pk = 2 db_utils.delete_record(cur, canon_table_name, record_pk) # a newly inserted record record_pk = 6 db_utils.delete_record(cur, canon_table_name, record_pk) #---------------------------------------------------------------------- # invoke the sync job AGAIN after various manipulations #---------------------------------------------------------------------- # run sync job 3 and verify exit codes sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # get records record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) records_by_stream = runner.get_records_from_target_output() table_version_3 = records_by_stream[test_table_name]['table_version'] messages = records_by_stream[test_table_name]['messages'] # verify the expected number of records were replicated self.assertEqual(4, record_count_by_stream[test_table_name]) # verify the message actions match expectations self.assertEqual(5, len(messages)) self.assertEqual('upsert', messages[0]['action']) self.assertEqual('upsert', messages[1]['action']) self.assertEqual('upsert', messages[2]['action']) self.assertEqual('upsert', messages[3]['action']) self.assertEqual('activate_version', messages[4]['action']) # verify the new table version increased on the third sync self.assertGreater(table_version_3, table_version_2) # verify the persisted schema still matches expectations self.assertEqual(expected_schemas[test_table_name], records_by_stream[test_table_name]['schema']) # NB | This is a little tough to track mentally so here's a breakdown of # the order of operations by expected records indexes: # Prior to Sync 1 # insert 0, 1, 2 # Prior to Sync 2 # No db changes # Prior to Sync 3 # insert 3, 4, 5 # update 0, 4 # delete 1, 5 # Resulting Synced Records: 2, 3, 0, 4 # verify replicated records still match expectations self.assertDictEqual(self.expected_records[2], messages[0]['data']) # existing insert self.assertDictEqual(self.expected_records[3], messages[1]['data']) # new insert self.assertDictEqual(self.expected_records[0], messages[2]['data']) # existing update self.assertDictEqual(self.expected_records[4], messages[3]['data']) # new insert / update # grab bookmarked 
state state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'dev-public-postgres_full_table_replication_test'] # verify state and bookmarks meet expectations self.assertIsNone(state['currently_syncing']) self.assertIsNone(bookmark.get('lsn')) self.assertIsNone(bookmark.get('replication_key')) self.assertIsNone(bookmark.get('replication_key_value')) self.assertEqual(table_version_3, bookmark['version'])
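Across the three syncs above, FULL_TABLE replication produces a predictable message shape: the first sync brackets its upserts with activate_version messages, and every later sync emits only the trailing activate_version. A small sketch that captures that expectation (the helper name is illustrative):

def expected_full_table_actions(record_count, first_sync):
    # Upserts followed by the closing activate_version; the first sync also
    # opens with an activate_version message.
    actions = ['upsert'] * record_count + ['activate_version']
    return ['activate_version'] + actions if first_sync else actions

# Example usage against the syncs above:
# expected_full_table_actions(3, first_sync=True)
# -> ['activate_version', 'upsert', 'upsert', 'upsert', 'activate_version']
# expected_full_table_actions(4, first_sync=False)
# -> ['upsert', 'upsert', 'upsert', 'upsert', 'activate_version']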
def binlog_edge_test(self, expected_records=[]): """ Test binlog replication edge cases • Verify an initial sync returns expected records of various datatypes • Verify we bookmark correctly when a transaction spans multiple files • Insert and delete a record prior to sync. Verify both events are replicated • Insert and update a record prior to sync. Verify both events are replicated • Verify a valid log_file and log_pos state are persisted after each sync """ conn_id = connections.ensure_connection(self) # prior to first sync update a record... updated_timestamp = datetime.datetime.now() updated_id = 1 expected_records[1]['our_timestamp_2'] = datetime.datetime.strftime( updated_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ") # insert a record and... inserted_record = self.generate_record_n(len(expected_records)) expected_records += [inserted_record] # TODO need to format # delete a record deleted_id = 2 with db_utils.get_db_connection( self.get_properties(), self.get_credentials()).cursor() as cur: cur.execute( "UPDATE {}.{} SET our_timestamp_2 = '{}' WHERE id = {}".format( self.database_name(), self.table_name_1(), updated_timestamp, updated_id)) self.insert_record(cur, inserted_record, self.table_name_1()) delete_time = datetime.datetime.now() cur.execute("DELETE FROM {}.{} WHERE id = {}".format( self.database_name(), self.table_name_1(), deleted_id)) print( "\n\nMySQL DB Actions." + \ "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_1()) + \ "\nEVENTS: {} records updated".format(1) + \ "\n {} records deleted\n\n".format(1) ) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) t1 = self.table_name_1() t2 = self.table_name_2() expected_check_streams = { self.tap_stream_id(t1), self.tap_stream_id(t2) } expected_sync_streams = {t1, t2} expected_pks = {t1: {'id'}, t2: {'id'}} # verify the tap discovered the right streams found_catalogs = [ catalog for catalog in menagerie.get_catalogs(conn_id) if catalog['tap_stream_id'] in expected_check_streams ] self.assertGreaterEqual( len(found_catalogs), 1, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = expected_check_streams.symmetric_difference(found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) # verify that persisted streams have the correct properties self.assertEqual(self.table_name_1(), found_catalogs[0]['stream_name']) self.assertEqual(self.table_name_2(), found_catalogs[1]['stream_name']) print("discovered streams are correct") additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] for catalog in found_catalogs: schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id']) _ = connections.select_catalog_and_fields_via_metadata( conn_id, catalog, catalog, additional_md) # clear state menagerie.set_state(conn_id, {}) # run initial full table sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() self.maxDiff = None for stream, recs in records_by_stream.items(): self.assertEqual( recs['schema'], expected_schemas[stream], msg= 
"Persisted schema did not match expected schema for stream `{}`." .format(stream)) record_count_by_stream = runner.examine_target_output_file( self, conn_id, expected_sync_streams, expected_pks) # BUG missing deleted record | https://stitchdata.atlassian.net/browse/SRCE-4258 # self.assertEqual({self.table_name_1(): len(expected_records)}, record_count_by_stream) records_for_stream = runner.get_records_from_target_output()[ self.table_name_1()] messages_for_stream = records_for_stream['messages'] message_actions = [rec['action'] for rec in messages_for_stream] # verify activate version messages are present self.assertEqual('activate_version', message_actions[0]) self.assertEqual('activate_version', message_actions[-1]) # ensure some log_file and log_pos state was persisted state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id(t1)] self.assertIsNotNone(bookmark['log_file']) self.assertIsNotNone(bookmark['log_pos']) expected_log_file = bookmark['log_file'] expected_log_pos = bookmark['log_pos'] # grab version, log_file and log_pos from state to check later expected_table_version = records_for_stream['table_version'] self.assertEqual(expected_table_version, bookmark['version']) # check for expected records upsert_records = [ m['data'] for m in messages_for_stream if m['action'] == 'upsert' ] # we need to compare record by record since there are so many. # a failure comparing expected_records to upsert_records would result in # an output message greater in length than a standard tmux buffer # BUG missing datetime precision | https://stitchdata.atlassian.net/browse/SRCE-4257 # for expected_record in expected_records: # upsert_record = [rec for rec in upsert_records # if rec['id'] == expected_record['id']] # self.assertEqual(1, len(upsert_record), # msg="multiple upsert_recs with same pk: {}".format(upsert_record)) # self.assertEqual(expected_record, upsert_record.pop()) # TODO add check for _sdc_delete_at for deleted record once bug addressed # run binlog sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # check that version state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id(t1)] self.assertEqual(expected_table_version, bookmark['version']) # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() for stream, recs in records_by_stream.items(): self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." .format(stream)) # record count should be empty as we did not persist anything to the gate record_count_by_stream = runner.examine_target_output_file( self, conn_id, expected_sync_streams, expected_pks) self.assertEqual(record_count_by_stream, {}) # Create 1 more record prior to 2nd sync new_record = self.generate_record_n(len(expected_records)) with db_utils.get_db_connection( self.get_properties(), self.get_credentials()).cursor() as cur: self.insert_record(cur, new_record, self.table_name_1()) print( "\n\nMySQL DB Actions." 
+ \ "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_1()) + \ "\nEVENTS: {} records inserted".format(1) ) # run binlog sync sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # check that version from state is unchanged state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id(t1)] self.assertEqual(expected_table_version, bookmark['version']) # Either the log_file is the same but the log_pos has increased or the log_file # has rotated and the numeric suffix has increased if expected_log_file == bookmark['log_file']: print("PATH A") self.assertGreater(bookmark['log_pos'], expected_log_pos) else: expected_log_file_suffix = re.search('^.*\.(\d+)$', expected_log_file).groups()[0] updated_log_file_suffix = re.search( '^.*\.(\d+)$', bookmark['log_file']).groups()[0] print("PATH B") self.assertGreater(int(updated_log_file_suffix), int(expected_log_file_suffix)) # Execute delete across tables using join prior to 3rd sync deleted_id = 4 with db_utils.get_db_connection( self.get_properties(), self.get_credentials()).cursor() as cur: delete_time = datetime.datetime.now() # DELETE T1, T2 # FROM T1 # INNER JOIN T2 ON T1.key = T2.key # WHERE condition; db = self.database_name() db_t1 = db + "." + t1 db_t2 = db + "." + t2 t1_key = db_t1 + ".id" t2_key = db_t2 + ".id" statement = "DELETE {}, {} ".format(db_t1, db_t2) + \ "FROM {} ".format(t1) + \ "INNER JOIN {} ON {} = {} ".format(db_t2, t1_key, t2_key) + \ "WHERE {} = {}".format(t1_key, deleted_id) cur.execute(statement) print( "\n\nMySQL DB Actions." + \ "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_2()) + \ "\nTABLE: {}".format(self.table_name_2()) + \ "\nEVENTS: {} records deleted\n\n".format(1) ) # run binlog sync sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # check that version from state is unchanged state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id(t1)] self.assertEqual(expected_table_version, bookmark['version']) target_records = runner.get_records_from_target_output() records_stream_1 = target_records[self.table_name_1()] upsert_records_1 = [ m['data'] for m in records_stream_1['messages'] if m['action'] == 'upsert' ] records_stream_2 = target_records[self.table_name_2()] upsert_records_2 = [ m['data'] for m in records_stream_2['messages'] if m['action'] == 'upsert' ] # make sure the record is in the target for both tables with a delete time deleted_at_t1 = upsert_records_1[0].get('_sdc_deleted_at') deleted_at_t1_timestamp = utils.strptime_to_utc( deleted_at_t1).timestamp() self.assertIsNotNone(deleted_at_t1) deleted_at_t2 = upsert_records_2[0].get('_sdc_deleted_at') deleted_at_t2_timestamp = utils.strptime_to_utc( deleted_at_t2).timestamp() self.assertIsNotNone(deleted_at_t2) # the delete times should be equal since it was a single transaction self.assertEqual(deleted_at_t1_timestamp, deleted_at_t2_timestamp) time_delta = delete_time.timestamp() - deleted_at_t1_timestamp print("Delete time vs record: difference in seconds", time_delta) self.assertLess(time_delta, 3) # time delta less than 3 seconds in magnitude
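The final assertions above check that a delete spanning two tables in one transaction yields matching _sdc_deleted_at timestamps that land close to the client-side delete time. A minimal sketch of that check, using datetime.fromisoformat as a stand-in for the singer-python strptime_to_utc helper the test relies on (the function name and tolerance are illustrative):

from datetime import datetime, timezone

def assert_transactional_delete_times(deleted_at_t1, deleted_at_t2, delete_time, tolerance_seconds=3):
    # Parse the replicated delete timestamps (ISO 8601 with a 'Z' suffix) as UTC.
    ts1 = datetime.fromisoformat(deleted_at_t1.replace('Z', '+00:00'))
    ts2 = datetime.fromisoformat(deleted_at_t2.replace('Z', '+00:00'))
    # A single transaction should stamp both tables with the same time...
    assert ts1 == ts2, "delete events from one transaction should share a timestamp"
    # ...and that time should sit within a small window of the client-side delete.
    assert abs(delete_time.astimezone(timezone.utc) - ts1).total_seconds() < tolerance_seconds

# Example usage:
# assert_transactional_delete_times('2021-01-01T00:00:01.000000Z',
#                                   '2021-01-01T00:00:01.000000Z',
#                                   datetime.now(timezone.utc))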