def do_test(self, conn_id):
    # Select our catalogs
    our_catalogs = [c for c in self.found_catalogs
                    if c.get('tap_stream_id') in self.expected_sync_streams()]
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        c_metadata = metadata.to_map(c_annotated['metadata'])
        connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])

    # Clear state before our run
    menagerie.set_state(conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = sum(record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Ensure all records have a value for PK(s)
    records = runner.get_records_from_target_output()
    for stream in self.expected_sync_streams():
        messages = records.get(stream, {}).get('messages', [])
        if stream in ['tickets', 'groups', 'users']:
            self.assertGreater(len(messages), 100,
                               msg="Stream {} has fewer than 100 records synced".format(stream))
        for m in messages:
            pk_set = self.expected_pks()[stream]
            for pk in pk_set:
                self.assertIsNotNone(m.get('data', {}).get(pk),
                                     msg="Missing primary-key for message {}".format(m))
def test_run(self):
    self.setUpTestEnvironment(COMPRESSION_FOLDER_PATH)

    runner.run_check_job_and_check_status(self)

    found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.assertEqual(len(found_catalogs), 1,
                     msg="unable to locate schemas for connection {}".format(self.conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_check_streams().issubset(found_catalog_names)
    self.assertTrue(subset,
                    msg="Expected check streams are not a subset of the discovered catalog")

    # Clear state before our run
    menagerie.set_state(self.conn_id, {})

    self.select_specific_catalog(found_catalogs, "gz_file_having_empty_csv")

    runner.run_sync_job_and_check_status(self)

    # Verify no rows were synced: the gzipped file contains an empty CSV
    expected_records = 0
    records = runner.get_upserts_from_target_output()
    self.assertEqual(expected_records, len(records))
def test_run(self):
    conn_id = self.create_connection()

    # Clear state before our run
    menagerie.set_state(conn_id, {})

    # Select our streams without selecting any non-automatic fields
    found_catalogs = menagerie.get_catalogs(conn_id)
    our_catalogs = [catalog for catalog in found_catalogs
                    if catalog.get('tap_stream_id') in self.expected_sync_streams()]
    self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=False)

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = sum(record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Ensure all records have a value for the primary key(s)
    records = runner.get_records_from_target_output()
    for stream in self.expected_sync_streams():
        messages = records.get(stream, {}).get('messages', [])
        for m in messages:
            pk_set = self.expected_pks()[stream]
            for pk in pk_set:
                self.assertIsNotNone(m.get('data', {}).get(pk),
                                     msg="Record is missing primary key {}: {}".format(pk, m))

    # Verify bookmarks are written only for INCREMENTAL streams
    bookmarks = menagerie.get_state(conn_id)['bookmarks']
    replication_methods = self.expected_replication_method()
    for stream in self.expected_sync_streams():
        with self.subTest(stream=stream):
            replication_method = replication_methods.get(stream)
            if replication_method is self.INCREMENTAL:
                self.assertTrue(stream in bookmarks)
            elif replication_method is self.FULL_TABLE:
                self.assertTrue(stream not in bookmarks)
            else:
                raise NotImplementedError(
                    "stream {} has an invalid replication method {}".format(stream, replication_method)
                )
def first_sync_test(self, table_configs, conn_id):
    # run first full table sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    records_by_stream = runner.get_records_from_target_output()

    expected_pks = {}
    for config in table_configs:
        key = {config['HashKey']}
        if config.get('SortKey'):
            key |= {config.get('SortKey')}
        expected_pks[config['TableName']] = key

    # assert that each of the streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, {x['TableName'] for x in table_configs}, expected_pks)

    state = menagerie.get_state(conn_id)
    state_version = menagerie.get_state_version(conn_id)

    first_versions = {}

    # assert that we get the correct number of records for each stream
    for config in table_configs:
        table_name = config['TableName']

        self.assertEqual(config['num_rows'], record_count_by_stream[table_name])

        # assert that an activate_version message is first and last message sent for each stream
        self.assertEqual('activate_version',
                         records_by_stream[table_name]['messages'][0]['action'])
        self.assertEqual('activate_version',
                         records_by_stream[table_name]['messages'][-1]['action'])

        # assert that the state has an initial_full_table_complete == True
        self.assertTrue(state['bookmarks'][table_name]['initial_full_table_complete'])

        # assert that there is a version bookmark in state
        first_versions[table_name] = state['bookmarks'][table_name]['version']
        self.assertIsNotNone(first_versions[table_name])

        # Write state with missing finished_shards so it re-reads data from all shards.
        # This should result in the next sync having the same number of records
        # as the full table sync.
        state['bookmarks'][table_name].pop('finished_shards')
        menagerie.set_state(conn_id, state, version=state_version)
def test_run(self):
    conn_id = connections.ensure_connection(self)
    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # select all catalogs
    for catalog in found_catalogs:
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

    future_time = "2050-01-01T00:00:00.000000Z"

    # set the bookmark for every stream into the future
    future_bookmarks = {"currently_syncing": None,
                        "bookmarks": {"contacts": {"offset": {}, "versionTimestamp": future_time},
                                      "subscription_changes": {"startTimestamp": future_time, "offset": {}},
                                      "campaigns": {"offset": {}},
                                      "forms": {"updatedAt": future_time},
                                      "deals": {"offset": {}, "hs_lastmodifieddate": future_time},
                                      "workflows": {"updatedAt": future_time},
                                      "owners": {"updatedAt": future_time},
                                      "contact_lists": {"updatedAt": future_time, "offset": {}},
                                      "email_events": {"startTimestamp": future_time, "offset": {}},
                                      "companies": {"offset": {}, "hs_lastmodifieddate": future_time},
                                      "engagements": {"lastUpdated": future_time, "offset": {}}}}

    menagerie.set_state(conn_id, future_bookmarks)

    record_count_by_stream = self.run_and_verify_sync(conn_id)

    # Because the bookmarks were set into the future, we should NOT actually replicate any data,
    # excluding campaigns and deal_pipelines because those endpoints do NOT support bookmarks.
    streams_with_bookmarks = self.expected_sync_streams()
    streams_with_bookmarks.remove('campaigns')
    streams_with_bookmarks.remove('deal_pipelines')
    bad_streams = streams_with_bookmarks.intersection(record_count_by_stream.keys())
    self.assertEqual(len(bad_streams), 0,
                     msg="still pulled down records from {} despite future bookmarks".format(bad_streams))

    state = menagerie.get_state(conn_id)

    # NB: Companies and engagements won't set a bookmark in the future.
    state["bookmarks"].pop("companies")
    state["bookmarks"].pop("engagements")
    future_bookmarks["bookmarks"].pop("companies")
    future_bookmarks["bookmarks"].pop("engagements")

    self.assertEqual(state, future_bookmarks,
                     msg="state should not have been modified because we didn't replicate any data")

    bookmarks = state.get('bookmarks')
    bookmark_streams = set(state.get('bookmarks').keys())
def test_run(self): conn_id = connections.ensure_connection(self) #run in check mode check_job_name = runner.run_check_mode(self, conn_id) #verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names ) self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) print("discovered schemas are OK") #select all catalogs for c in found_catalogs: catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id']) if c['stream_name'] in self.expected_sync_streams().keys(): stream = c['stream_name'] pks = self.expected_sync_streams()[stream] for pk in pks: mdata = next((m for m in catalog_entry['metadata'] if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == pk), None) print("Validating inclusion on {}: {}".format(c['stream_name'], mdata)) self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic') connections.select_catalog_and_fields_via_metadata(conn_id, c, catalog_entry) #clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) #verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) first_record_count_by_stream = runner.examine_target_output_file(self, conn_id, set(self.expected_sync_streams().keys()), self.expected_sync_streams()) replicated_row_count = reduce(lambda accum,c : accum + c, first_record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(first_record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) # Verify that automatic fields are all emitted with records synced_records = runner.get_records_from_target_output() for stream_name, data in synced_records.items(): record_messages = [set(row['data'].keys()) for row in data['messages']] self.assertGreater(len(record_messages), 0, msg="stream {} did not sync any records.".format(stream_name)) for record_keys in record_messages: self.assertEqual(self.expected_sync_streams().get(stream_name, set()) - record_keys, set())
def test_catalog_without_properties(self): self.setUpTestEnvironment() runner.run_check_job_and_check_status(self) found_catalogs = menagerie.get_catalogs(self.conn_id) self.assertEqual(len(found_catalogs), 1, msg="unable to locate schemas for connection {}".format(self.conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) subset = self.expected_streams().issubset(found_catalog_names) self.assertTrue( subset, msg="Expected check streams are not subset of discovered catalog") our_catalogs = [c for c in found_catalogs if c.get( 'tap_stream_id') in self.expected_streams()] # Select our catalogs for c in our_catalogs: c_annotated = menagerie.get_annotated_schema( self.conn_id, c['stream_id']) connections.select_catalog_and_fields_via_metadata( self.conn_id, c, c_annotated, [], []) # Clear state before our run menagerie.set_state(self.conn_id, {}) # Run a sync job using orchestrator sync_job_name = runner.run_sync_mode(self, self.conn_id) # Verify tap and target exit codes exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) synced_records = runner.get_records_from_target_output() upsert_messages = [m for m in synced_records.get( 'csv_with_empty_lines').get('messages') if m['action'] == 'upsert'] records = [message.get('data') for message in upsert_messages] #Empty line should be ignored in emitted records. expected_records = [ {'id': 1, 'name': 'John', '_sdc_extra': [{'name': 'carl'}], '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets', '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 2}, {'id': 2, 'name': 'Bob', '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets', '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 3}, {'id': 3, '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets', '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 4}, {'id': 4, 'name': 'Alice', '_sdc_extra': [{'no_headers': ['Ben', '5']}, { 'name': 'Barak'}], '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets', '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 5} ] self.assertListEqual(expected_records, records)
def test_run(self): conn_id = connections.ensure_connection(self, payload_hook=None) # Run the tap in check mode check_job_name = runner.run_check_mode(self, conn_id) # Verify the check's exit status exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # Verify that there are catalogs found found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) subset = self.expected_check_streams().issubset(found_catalog_names) self.assertTrue( subset, msg="Expected check streams are not subset of discovered catalog") # # # Select some catalogs our_catalogs = [ c for c in found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams() ] for catalog in our_catalogs: schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id']) connections.select_catalog_and_fields_via_metadata( conn_id, catalog, schema, [], []) # # Verify that all streams sync at least one row for initial sync # # This test is also verifying access token expiration handling. If test fails with # # authentication error, refresh token was not replaced after expiring. menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # # Verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) zero_count_streams = { k for k, v in record_count_by_stream.items() if v == 0 } self.assertFalse( zero_count_streams, msg="The following streams did not sync any rows {}".format( zero_count_streams))
def test_run(self): conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names ) self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) print("discovered schemas are kosher") all_excluded_fields = {} # select all catalogs for c in found_catalogs: if c['stream_name'] == 'ads': continue discovered_schema = menagerie.get_annotated_schema(conn_id, c['stream_id'])['annotated-schema'] all_excluded_fields[c['stream_name']] = list(set(discovered_schema.keys()) - self.expected_automatic_fields().get(c['stream_name'], set()))[:5] connections.select_catalog_and_fields_via_metadata( conn_id, c, discovered_schema, non_selected_fields=all_excluded_fields[c['stream_name']]) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # This should be validating the the PKs are written in each record record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum,c : accum + c, record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) synced_records = runner.get_records_from_target_output() self.assertTrue('ads' not in synced_records.keys()) for stream_name, data in synced_records.items(): record_messages = [set(row['data'].keys()) for row in data['messages']] for record_keys in record_messages: # The intersection should be empty self.assertFalse(record_keys.intersection(all_excluded_fields[stream_name]))
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    # select all catalogs
    for c in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        connections.select_catalog_via_metadata(conn_id, c, catalog_entry)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify that the PKs are written in each record
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = sum(record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # Run the tap in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify the check's exit status
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Verify that there are catalogs found
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_check_streams().issubset(found_catalog_names)
    self.assertTrue(subset,
                    msg="Expected check streams are not subset of discovered catalog")

    # Select some catalogs
    our_catalogs = [c for c in found_catalogs
                    if c.get('tap_stream_id') in self.expected_sync_streams()]
    for catalog in our_catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        connections.select_catalog_and_fields_via_metadata(conn_id, catalog, schema)

    # Clear State and run sync
    menagerie.set_state(conn_id, {})
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = sum(record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))
def test_run(self):
    # sync 1
    conn_id = connections.ensure_connection(self)
    found_catalogs_1 = self.run_and_verify_check_mode(conn_id)
    self.perform_and_verify_table_and_field_selection(conn_id, found_catalogs_1)
    record_count_by_stream_1 = self.run_and_verify_sync(conn_id)

    # check that we got data from sync 1
    self.assertGreater(sum(record_count_by_stream_1.values()), 0)
    for tap_stream_id in self.expected_first_sync_streams():
        self.assertEqual(self.expected_first_sync_row_counts()[tap_stream_id],
                         record_count_by_stream_1[tap_stream_id])

    # get state
    state = menagerie.get_state(conn_id)

    # create file "table_1_fileB"
    with self.get_test_connection() as client:
        root_dir = os.getenv('TAP_SFTP_ROOT_DIR')
        client.chdir(root_dir + '/tap_tester/folderA')

        file_group = self.get_files()[0]
        with client.open('table_1_fileB.csv', 'w') as f:
            writer = csv.writer(f)
            lines = [file_group['headers']] + file_group['generator'](file_group['num_rows'])
            writer.writerows(lines)

    # add some data to files "table_1_fileA" and "table_3_fileA"
    self.append_to_files()

    # set state
    menagerie.set_state(conn_id, state)

    # sync 2
    record_count_by_stream_2 = self.run_and_verify_sync(conn_id, second_sync=True)

    # check that we got data from sync 2
    self.assertGreater(sum(record_count_by_stream_2.values()), 0)

    # Verify the data from the 2nd sync is as expected: since state from the first sync
    # was set, we should receive only the modified data, i.e. the appended rows and the
    # newly created file.
    for tap_stream_id in self.expected_second_sync_streams():
        self.assertEqual(self.expected_second_sync_row_counts()[tap_stream_id],
                         record_count_by_stream_2[tap_stream_id])
def test_run(self): conn_id = connections.ensure_connection(self) #run in check mode check_job_name = runner.run_check_mode(self, conn_id) #verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names ) self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) print("discovered schemas are kosher") #select all catalogs #selected_catalogs = list(map(lambda catalog: self.perform_field_selection(conn_id, catalog), found_catalogs)) #menagerie.post_annotated_catalogs(conn_id, selected_catalogs) for c in found_catalogs: connections.select_catalog_and_fields_via_metadata(conn_id, c, menagerie.get_annotated_schema(conn_id, c['stream_id'])) #clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) #verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum,c : accum + c, record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) # bookmarks for the 4 streams should be 2015-03-16 states = menagerie.get_state(conn_id)["bookmarks"] end_date = self.get_properties()["end_date"].split()[0] for k, v in states.items(): if "insights" in k: bm_date = v.get("date_start") self.assertEqual(end_date, bm_date) print("bookmarks match end_date of {}".format(end_date))
def test_run(self):
    conn_id = self.create_connection()

    # Select our catalogs
    our_catalogs = [c for c in self.found_catalogs
                    if c.get('tap_stream_id') in self.expected_sync_streams()]
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        c_metadata = metadata.to_map(c_annotated['metadata'])
        connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])

    # Clear state before our run
    menagerie.set_state(conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = sum(record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Ensure all records have a value for the primary key(s)
    records = runner.get_records_from_target_output()
    for stream in self.expected_sync_streams():
        messages = records.get(stream, {}).get('messages', [])
        for m in messages:
            pk_set = self.expected_pks()[stream]
            for pk in pk_set:
                self.assertIsNotNone(m.get('data', {}).get(pk),
                                     msg="Record is missing primary key {}: {}".format(pk, m))

    # Verify a bookmark was written for the orders stream
    bookmarks = menagerie.get_state(conn_id)['bookmarks']
    self.assertTrue('orders' in bookmarks)
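# The two tests above call self.create_connection() and then read self.found_catalogs.
# A minimal sketch of what such a base-class helper could look like, reusing the
# tap_tester modules already imported in this file (connections, runner, menagerie);
# the actual implementation in the shared test base class may differ.
def create_connection(self, original_properties=True):
    """Create a connection, run check mode, and cache the discovered catalogs."""
    conn_id = connections.ensure_connection(self, original_properties=original_properties)

    # run check mode and verify its exit status before discovery results are used
    check_job_name = runner.run_check_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # cache catalogs so tests can filter them by tap_stream_id
    self.found_catalogs = menagerie.get_catalogs(conn_id)
    return conn_id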
def test_run(self): conn_id = connections.ensure_connection(self) found_catalogs = self.run_and_verify_check_mode(conn_id) # Select all Catalogs for catalog in found_catalogs: if catalog['tap_stream_id'] in self.expected_sync_streams(): connections.select_catalog_and_fields_via_metadata(conn_id, catalog, menagerie.get_annotated_schema(conn_id, catalog['stream_id'])) #clear state menagerie.set_state(conn_id, {}) record_count_by_stream = self.run_and_verify_sync(conn_id) max_bookmarks_from_records = runner.get_most_recent_records_from_target(self, self.expected_bookmarks(), self.get_properties()['start_date']) start_of_today = utils.strftime(datetime.datetime(datetime.datetime.utcnow().year, datetime.datetime.utcnow().month, datetime.datetime.utcnow().day, 0, 0, 0, 0, datetime.timezone.utc)) max_bookmarks_from_records['subscription_changes'] = start_of_today max_bookmarks_from_records['email_events'] = start_of_today #if we didn't replicate data, the bookmark should be the start_date for k in self.expected_bookmarks().keys(): if max_bookmarks_from_records.get(k) is None: max_bookmarks_from_records[k] = utils.strftime(datetime.datetime(2017, 5, 1, 0, 0, 0, 0, datetime.timezone.utc)) state = menagerie.get_state(conn_id) bookmarks = state.get('bookmarks') bookmark_streams = set(state.get('bookmarks').keys()) #verify bookmarks and offsets for k,v in sorted(list(self.expected_bookmarks().items())): for w in v: bk_value = bookmarks.get(k,{}).get(w) self.assertEqual(utils.strptime_with_tz(bk_value), utils.strptime_with_tz(max_bookmarks_from_records[k]), "Bookmark {} ({}) for stream {} should have been updated to {}".format(bk_value, w, k, max_bookmarks_from_records[k])) print("bookmark {}({}) updated to {} from max record value {}".format(k, w, bk_value, max_bookmarks_from_records[k])) for k,v in self.expected_offsets().items(): self.assertEqual(bookmarks.get(k,{}).get('offset', {}), v, msg="unexpected offset found for stream {} {}. state: {}".format(k, v, state)) print("offsets {} cleared".format(k)) diff = bookmark_streams.difference(self.acceptable_bookmarks()) self.assertEqual(len(diff), 0, msg="Unexpected bookmarks: {} Expected: {} Actual: {}".format(diff, self.acceptable_bookmarks(), bookmarks)) self.assertEqual(state.get('currently_syncing'), None,"Unexpected `currently_syncing` bookmark value: {} Expected: None".format(state.get('currently_syncing')))
def test_run(self):
    runner.run_check_job_and_check_status(self)

    found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.check_all_streams_in_catalogs(found_catalogs)
    self.select_found_catalogs(found_catalogs)

    # clear state and run the actual sync
    menagerie.set_state(self.conn_id, {})
    runner.run_sync_job_and_check_status(self)

    self.check_output_record_counts()

    max_bookmarks_from_records = runner.get_max_bookmarks_from_target(self)
    state = menagerie.get_state(self.conn_id)
    bookmarks = state.get("bookmarks", {})
    self.check_bookmarks(bookmarks, max_bookmarks_from_records)
    self.check_offsets(bookmarks)
    self.look_for_unexpected_bookmarks(bookmarks)
    self.assertIsNone(state.get("currently_syncing"))
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # select only the expected streams that are supported by the Bulk API
    expected_streams = self.expected_sync_streams()
    allowed_catalogs = [catalog for catalog in found_catalogs
                        if not self.is_unsupported_by_bulk_api(catalog['stream_name'])
                        and catalog['stream_name'] in expected_streams]
    self.select_all_streams_and_fields(conn_id, allowed_catalogs)

    # Run sync
    menagerie.set_state(conn_id, {})
    _ = self.run_and_verify_sync(conn_id)
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # select only the expected streams that are supported by the Bulk API
    # TODO: This might need to exclude Datacloud objects so we don't blow up on permissions issues
    expected_streams = self.expected_sync_streams()
    allowed_catalogs = [catalog for catalog in found_catalogs
                        if not self.is_unsupported_by_bulk_api(catalog['stream_name'])
                        and catalog['stream_name'] in expected_streams]
    self.select_all_streams_and_fields(conn_id, allowed_catalogs)

    # Run sync
    menagerie.set_state(conn_id, {})
    _ = self.run_and_verify_sync(conn_id)
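# Several of these tests rely on a select_all_streams_and_fields helper. A minimal
# sketch of that helper, assuming the usual tap_tester base-class pattern of deselecting
# non-automatic fields via the annotated schema; the real helper may differ in details.
@staticmethod
def select_all_streams_and_fields(conn_id, catalogs, select_all_fields=True):
    """Select the given catalogs, optionally deselecting all non-automatic fields."""
    for catalog in catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])

        non_selected_properties = []
        if not select_all_fields:
            # deselect every top-level property; automatic fields are still emitted
            non_selected_properties = schema.get('annotated-schema', {}).get('properties', {}).keys()

        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema, [], non_selected_properties)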
def run_and_verify_sync(self, conn_id, state):
    """
    Run a sync job and make sure it exited properly.
    Return a dictionary with keys of streams synced
    and values of records synced for each stream.
    """
    # reset state to the state at the start of the sync in case we got interrupted
    menagerie.set_state(conn_id, state)

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    try:
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
    except AssertionError as e:
        if exit_status['discovery_error_message'] or exit_status['tap_error_message']:
            print("*******************RETRYING SYNC FOR TAP/DISCOVERY FAILURE*******************")
            raise RetryableTapError(e)
        raise

    # Verify actual rows were synced
    sync_record_count = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(), self.expected_primary_keys())
    self.assertGreater(sum(sync_record_count.values()), 0,
                       msg="failed to replicate any data: {}".format(sync_record_count))
    print("total replicated row count: {}".format(sum(sync_record_count.values())))

    return sync_record_count
def run_and_verify_sync(self, conn_id):
    """
    Clear the connection's state in menagerie and run a sync.
    Verify the exit code following the sync.
    Return the record count by stream.
    """
    # clear state
    menagerie.set_state(conn_id, {})

    # run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # read target output
    first_record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(), self.expected_primary_keys())

    return first_record_count_by_stream
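# Many of the tests above also call self.run_and_verify_check_mode(conn_id). A minimal
# sketch of that counterpart helper, assuming expected_check_streams() describes the
# discoverable catalog; the base-class version may assert slightly different conditions.
def run_and_verify_check_mode(self, conn_id):
    """Run check mode, verify its exit status, and return the discovered catalogs."""
    check_job_name = runner.run_check_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = {catalog['tap_stream_id'] for catalog in found_catalogs}
    self.assertTrue(self.expected_check_streams().issubset(found_catalog_names),
                    msg="discovered schemas do not include all expected check streams")

    return found_catalogs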
def test_future_date_in_state(self):
    conn_id = connections.ensure_connection(self)
    expected_streams = self.streams_to_select()
    future_date = datetime.datetime.strftime(
        datetime.datetime.today() + datetime.timedelta(days=1), "%Y-%m-%dT00:00:00Z")

    state = {'bookmarks': dict()}
    replication_keys = self.expected_replication_keys()
    for stream in expected_streams:
        if self.is_incremental(stream):
            state['bookmarks'][stream] = dict()
            state['bookmarks'][stream]['field'] = next(iter(replication_keys[stream]))
            state['bookmarks'][stream]['last_record'] = future_date

    # set state for running sync mode
    menagerie.set_state(conn_id, state)

    runner.run_check_mode(self, conn_id)
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.select_found_catalogs(conn_id, found_catalogs, only_streams=expected_streams)

    # run sync mode
    self.run_and_verify_sync(conn_id)

    # get the state after running sync mode
    latest_state = menagerie.get_state(conn_id)

    # verify that the state passed before the sync
    # and the state we got after the sync are the same
    self.assertEqual(latest_state, state)
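# The future-date test above filters streams with self.is_incremental(). A one-line
# sketch of that predicate, assuming INCREMENTAL is the same constant returned by
# expected_replication_method(); the base class may implement it differently.
def is_incremental(self, stream):
    return self.expected_replication_method().get(stream) == self.INCREMENTAL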
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # select all catalogs
    for c in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        for k in self.expected_primary_keys()[c['stream_name']]:
            mdata = next((m for m in catalog_entry['metadata']
                          if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k), None)
            print("Validating inclusion on {}: {}".format(c['stream_name'], mdata))
            self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')
        connections.select_catalog_via_metadata(conn_id, c, catalog_entry)

    # clear state
    menagerie.set_state(conn_id, {})

    # run a sync
    _ = self.run_and_verify_sync(conn_id)

    synced_records = runner.get_records_from_target_output()
    for stream_name, data in synced_records.items():
        record_messages = [set(row['data'].keys()) for row in data['messages']]
        for record_keys in record_messages:
            # The symmetric difference should be empty
            self.assertEqual(record_keys,
                             self.expected_automatic_fields().get(stream_name, set()))
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify discovery produced (at least) 1 expected catalog
    found_catalogs = [found_catalog for found_catalog in menagerie.get_catalogs(conn_id)
                      if found_catalog['tap_stream_id'] in self.expected_check_streams()]
    self.assertGreaterEqual(len(found_catalogs), 1)

    # verify the tap discovered the expected streams
    found_catalog_names = {catalog['tap_stream_id'] for catalog in found_catalogs}
    self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

    # verify that persisted streams have the correct properties
    test_catalog = found_catalogs[0]
    self.assertEqual(test_table_name, test_catalog['stream_name'])
    print("discovered streams are correct")

    # perform table selection
    print('selecting {} and all fields within the table'.format(test_table_name))
    schema_and_metadata = menagerie.get_annotated_schema(conn_id, test_catalog['stream_id'])
    additional_md = [{"breadcrumb": [], "metadata": {'replication-method': 'FULL_TABLE'}}]
    _ = connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog, schema_and_metadata, additional_md)

    # clear state
    menagerie.set_state(conn_id, {})

    # run sync job 1 and verify exit codes
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # get records
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    records_by_stream = runner.get_records_from_target_output()
    table_version_1 = records_by_stream[test_table_name]['table_version']
    messages = records_by_stream[test_table_name]['messages']

    # verify the expected number of records were replicated
    self.assertEqual(3, record_count_by_stream[test_table_name])

    # verify the message actions match expectations
    self.assertEqual(5, len(messages))
    self.assertEqual('activate_version', messages[0]['action'])
    self.assertEqual('upsert', messages[1]['action'])
    self.assertEqual('upsert', messages[2]['action'])
    self.assertEqual('upsert', messages[3]['action'])
    self.assertEqual('activate_version', messages[4]['action'])

    # verify the persisted schema matches expectations
    self.assertEqual(expected_schemas[test_table_name],
                     records_by_stream[test_table_name]['schema'])

    # verify replicated records match expectations
    self.assertDictEqual(self.expected_records[0], messages[1]['data'])
    self.assertDictEqual(self.expected_records[1], messages[2]['data'])
    self.assertDictEqual(self.expected_records[2], messages[3]['data'])
    print("records are correct")

    # grab bookmarked state
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks']['dev-public-postgres_full_table_replication_test']

    # verify state and bookmarks meet expectations
    self.assertIsNone(state['currently_syncing'])
    self.assertIsNone(bookmark.get('lsn'))
    self.assertIsNone(bookmark.get('replication_key'))
    self.assertIsNone(bookmark.get('replication_key_value'))
    self.assertEqual(table_version_1, bookmark['version'])

    #----------------------------------------------------------------------
    # invoke the sync job AGAIN and get the same 3 records
    #----------------------------------------------------------------------

    # run sync job 2 and verify exit codes
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # get records
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    records_by_stream = runner.get_records_from_target_output()
    table_version_2 = records_by_stream[test_table_name]['table_version']
    messages = records_by_stream[test_table_name]['messages']

    # verify the expected number of records were replicated
    self.assertEqual(3, record_count_by_stream[test_table_name])

    # verify the message actions match expectations
    self.assertEqual(4, len(messages))
    self.assertEqual('upsert', messages[0]['action'])
    self.assertEqual('upsert', messages[1]['action'])
    self.assertEqual('upsert', messages[2]['action'])
    self.assertEqual('activate_version', messages[3]['action'])

    # verify the new table version increased on the second sync
    self.assertGreater(table_version_2, table_version_1)

    # verify the persisted schema still matches expectations
    self.assertEqual(expected_schemas[test_table_name],
                     records_by_stream[test_table_name]['schema'])

    # verify replicated records still match expectations
    self.assertDictEqual(self.expected_records[0], messages[0]['data'])
    self.assertDictEqual(self.expected_records[1], messages[1]['data'])
    self.assertDictEqual(self.expected_records[2], messages[2]['data'])

    # grab bookmarked state
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks']['dev-public-postgres_full_table_replication_test']

    # verify state and bookmarks meet expectations
    self.assertIsNone(state['currently_syncing'])
    self.assertIsNone(bookmark.get('lsn'))
    self.assertIsNone(bookmark.get('replication_key'))
    self.assertIsNone(bookmark.get('replication_key_value'))
    self.assertEqual(table_version_2, bookmark['version'])

    #----------------------------------------------------------------------
    # invoke the sync job AGAIN following various manipulations to the data
    #----------------------------------------------------------------------

    with db_utils.get_test_connection('dev') as conn:
        conn.autocommit = True
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:

            # NB | We will perform the following actions prior to the next sync:
            #      [Action (EXPECTED RESULT)]
            #
            #      Insert a record
            #      Insert a record to be updated prior to sync
            #      Insert a record to be deleted prior to sync (NOT REPLICATED)
            #
            #      Update an existing record
            #      Update a newly inserted record
            #
            #      Delete an existing record
            #      Delete a newly inserted record

            # inserting...
# a new record nyc_tz = pytz.timezone('America/New_York') our_time_offset = "-04:00" our_ts = datetime.datetime(1996, 4, 4, 4, 4, 4, 733184) our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(6, 6, 6) our_time_tz = our_time.isoformat() + our_time_offset our_date = datetime.date(1970, 7, 1) my_uuid = str(uuid.uuid1()) self.inserted_records.append({ 'our_varchar': "our_varchar 2", 'our_varchar_10': "varchar_10", 'our_text': "some text 2", 'our_integer': 44101, 'our_smallint': 2, 'our_bigint': 1000001, 'our_decimal': decimal.Decimal('9876543210.02'), quote_ident('OUR TS', cur): our_ts, quote_ident('OUR TS TZ', cur): our_ts_tz, quote_ident('OUR TIME', cur): our_time, quote_ident('OUR TIME TZ', cur): our_time_tz, quote_ident('OUR DATE', cur): our_date, 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': '1', 'our_json': json.dumps({'nymn': 77}), 'our_jsonb': json.dumps({'burgers': 'good++'}), 'our_uuid': my_uuid, 'our_citext': 'cyclops 2', 'our_store': 'dances=>"floor",name=>"betty"', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': '$0.98789' }) self.expected_records.append({ 'id': 4, 'our_varchar': "our_varchar 2", 'our_varchar_10': "varchar_10", 'our_text': "some text 2", 'our_integer': 44101, 'our_smallint': 2, 'our_bigint': 1000001, 'our_decimal': decimal.Decimal('9876543210.02'), 'OUR TS': self.expected_ts(our_ts), 'OUR TS TZ': self.expected_ts_tz(our_ts_tz), 'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz), 'OUR DATE': '1970-07-01T00:00:00+00:00', 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': True, 'our_json': '{"nymn": 77}', 'our_jsonb': '{"burgers": "good++"}', 'our_uuid': self.inserted_records[-1]['our_uuid'], 'our_citext': self.inserted_records[-1]['our_citext'], 'our_store': { "name": "betty", "dances": "floor" }, 'our_cidr': self.inserted_records[-1]['our_cidr'], 'our_inet': self.inserted_records[-1]['our_inet'], 'our_mac': self.inserted_records[-1]['our_mac'], 'our_money': '$0.99', 'our_alignment_enum': None, }) # a new record which we will then update prior to sync our_ts = datetime.datetime(2007, 1, 1, 12, 12, 12, 222111) nyc_tz = pytz.timezone('America/New_York') our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(12, 11, 10) our_time_tz = our_time.isoformat() + "-04:00" our_date = datetime.date(1999, 9, 9) my_uuid = str(uuid.uuid1()) self.inserted_records.append({ 'our_varchar': "our_varchar 4", 'our_varchar_10': "varchar_3", 'our_text': "some text 4", 'our_integer': 55200, 'our_smallint': 1, 'our_bigint': 100000, 'our_decimal': decimal.Decimal('1234567899.99'), quote_ident('OUR TS', cur): our_ts, quote_ident('OUR TS TZ', cur): our_ts_tz, quote_ident('OUR TIME', cur): our_time, quote_ident('OUR TIME TZ', cur): our_time_tz, quote_ident('OUR DATE', cur): our_date, 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': '0', 'our_json': json.dumps('some string'), 'our_jsonb': json.dumps(['burgers are good']), 'our_uuid': my_uuid, 'our_store': 'size=>"small",name=>"betty"', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, }) self.expected_records.append({ 'our_decimal': decimal.Decimal('1234567899.99'), 'our_text': 'some text 4', 'our_bit': False, 'our_integer': 55200, 'our_double': decimal.Decimal('1.1'), 'id': 5, 'our_json': 
self.inserted_records[-1]['our_json'], 'our_boolean': True, 'our_jsonb': self.inserted_records[-1]['our_jsonb'], 'our_bigint': 100000, 'OUR TS': self.expected_ts(our_ts), 'OUR TS TZ': self.expected_ts_tz(our_ts_tz), 'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz), 'our_store': { "name": "betty", "size": "small" }, 'our_smallint': 1, 'OUR DATE': '1999-09-09T00:00:00+00:00', 'our_varchar': 'our_varchar 4', 'our_uuid': self.inserted_records[-1]['our_uuid'], 'our_real': decimal.Decimal('1.2'), 'our_varchar_10': 'varchar_3', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, 'our_alignment_enum': None, }) # a new record to be deleted prior to sync our_ts = datetime.datetime(2111, 1, 1, 12, 12, 12, 222111) nyc_tz = pytz.timezone('America/New_York') our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(12, 11, 10) our_time_tz = our_time.isoformat() + "-04:00" our_date = datetime.date(1999, 9, 9) my_uuid = str(uuid.uuid1()) self.inserted_records.append({ 'our_varchar': "our_varchar 4", 'our_varchar_10': "varchar_3", 'our_text': "some text 4", 'our_integer': 55200, 'our_smallint': 1, 'our_bigint': 100000, 'our_decimal': decimal.Decimal('1234567899.99'), quote_ident('OUR TS', cur): our_ts, quote_ident('OUR TS TZ', cur): our_ts_tz, quote_ident('OUR TIME', cur): our_time, quote_ident('OUR TIME TZ', cur): our_time_tz, quote_ident('OUR DATE', cur): our_date, 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': '0', 'our_json': json.dumps('some string'), 'our_jsonb': json.dumps(['burgers are good']), 'our_uuid': my_uuid, 'our_store': 'size=>"small",name=>"betty"', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, }) self.expected_records.append({ 'our_decimal': decimal.Decimal('1234567899.99'), 'our_text': 'some text 4', 'our_bit': False, 'our_integer': 55200, 'our_double': decimal.Decimal('1.1'), 'id': 6, 'our_json': self.inserted_records[-1]['our_json'], 'our_boolean': True, 'our_jsonb': self.inserted_records[-1]['our_jsonb'], 'our_bigint': 100000, 'OUR TS': self.expected_ts(our_ts), 'OUR TS TZ': self.expected_ts_tz(our_ts_tz), 'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz), 'our_store': { "name": "betty", "size": "small" }, 'our_smallint': 1, 'OUR DATE': '1999-09-09T00:00:00+00:00', 'our_varchar': 'our_varchar 4', 'our_uuid': self.inserted_records[-1]['our_uuid'], 'our_real': decimal.Decimal('1.2'), 'our_varchar_10': 'varchar_3', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, 'our_alignment_enum': None, }) db_utils.insert_record(cur, test_table_name, self.inserted_records[3]) db_utils.insert_record(cur, test_table_name, self.inserted_records[4]) db_utils.insert_record(cur, test_table_name, self.inserted_records[5]) # updating ... 
# an existing record canon_table_name = db_utils.canonicalized_table_name( cur, test_schema_name, test_table_name) record_pk = 1 our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184) our_ts_tz = nyc_tz.localize(our_ts) updated_data = { "OUR TS TZ": our_ts_tz, "our_double": decimal.Decimal("6.6"), "our_money": "$0.00" } self.expected_records[0]["OUR TS TZ"] = self.expected_ts_tz( our_ts_tz) self.expected_records[0]["our_double"] = decimal.Decimal("6.6") self.expected_records[0]["our_money"] = "$0.00" db_utils.update_record(cur, canon_table_name, record_pk, updated_data) # a newly inserted record canon_table_name = db_utils.canonicalized_table_name( cur, test_schema_name, test_table_name) record_pk = 5 our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184) our_ts_tz = nyc_tz.localize(our_ts) updated_data = { "OUR TS TZ": our_ts_tz, "our_double": decimal.Decimal("6.6"), "our_money": "$0.00" } self.expected_records[4]["OUR TS TZ"] = self.expected_ts_tz( our_ts_tz) self.expected_records[4]["our_double"] = decimal.Decimal("6.6") self.expected_records[4]["our_money"] = "$0.00" db_utils.update_record(cur, canon_table_name, record_pk, updated_data) # deleting # an existing record record_pk = 2 db_utils.delete_record(cur, canon_table_name, record_pk) # a newly inserted record record_pk = 6 db_utils.delete_record(cur, canon_table_name, record_pk) #---------------------------------------------------------------------- # invoke the sync job AGAIN after vairous manipulations #---------------------------------------------------------------------- # run sync job 3 and verify exit codes sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # get records record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) records_by_stream = runner.get_records_from_target_output() table_version_3 = records_by_stream[test_table_name]['table_version'] messages = records_by_stream[test_table_name]['messages'] # verify the execpted number of records were replicated self.assertEqual(4, record_count_by_stream[test_table_name]) # verify the message actions match expectations self.assertEqual(5, len(messages)) self.assertEqual('upsert', messages[0]['action']) self.assertEqual('upsert', messages[1]['action']) self.assertEqual('upsert', messages[2]['action']) self.assertEqual('upsert', messages[3]['action']) self.assertEqual('activate_version', messages[4]['action']) # verify the new table version increased on the second sync self.assertGreater(table_version_3, table_version_2) # verify the persisted schema still matches expectations self.assertEqual(expected_schemas[test_table_name], records_by_stream[test_table_name]['schema']) # NB | This is a little tough to track mentally so here's a breakdown of # the order of operations by expected records indexes: # Prior to Sync 1 # insert 0, 1, 2 # Prior to Sync 2 # No db changes # Prior to Sync 3 # insert 3, 4, 5 # update 0, 4 # delete 1, 5 # Resulting Synced Records: 2, 3, 0, 4 # verify replicated records still match expectations self.assertDictEqual(self.expected_records[2], messages[0]['data']) # existing insert self.assertDictEqual(self.expected_records[3], messages[1]['data']) # new insert self.assertDictEqual(self.expected_records[0], messages[2]['data']) # existing update self.assertDictEqual(self.expected_records[4], messages[3]['data']) # new insert / update # grab bookmarked 
state

    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks']['dev-public-postgres_full_table_replication_test']

    # verify state and bookmarks meet expectations
    self.assertIsNone(state['currently_syncing'])
    self.assertIsNone(bookmark.get('lsn'))
    self.assertIsNone(bookmark.get('replication_key'))
    self.assertIsNone(bookmark.get('replication_key_value'))
    self.assertEqual(table_version_3, bookmark['version'])
def bookmarks_test(self, conn_id, testable_streams): # Select all streams and no fields within streams found_catalogs = menagerie.get_catalogs(conn_id) incremental_streams = { key for key, value in self.expected_replication_method().items() if value == self.INCREMENTAL and key in testable_streams } # Our test data sets for Shopify do not have any abandoned_checkouts our_catalogs = [ catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in incremental_streams ] self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=False) ################################# # Run first sync ################################# first_sync_record_count = self.run_sync(conn_id) # verify that the sync only sent records to the target for selected streams (catalogs) self.assertEqual(set(first_sync_record_count.keys()), incremental_streams) first_sync_bookmark = menagerie.get_state(conn_id) first_sync_records = runner.get_records_from_target_output() # BUG:TDL-17087 : State has additional values which are not streams # Need to remove additional values from bookmark value extra_stuff = { 'transaction_orders', 'metafield_products', 'refund_orders', 'product_variants' } for keys in list(first_sync_bookmark['bookmarks'].keys()): if keys in extra_stuff: first_sync_bookmark['bookmarks'].pop(keys) ####################### # Update State between Syncs ####################### new_state = {'bookmarks': dict()} #simulated_states = self.calculated_states_by_stream(first_sync_bookmark) # We are hardcoding the updated state to ensure that we get atleast 1 record in second sync. These values have been provided after reviewing the max bookmark value for each of the streams simulated_states = { 'products': { 'updated_at': '2021-12-20T05:10:05.000000Z' }, 'collects': { 'updated_at': '2021-09-01T09:08:28.000000Z' }, 'abandoned_checkouts': { 'updated_at': '2022-02-02T16:00:00.000000Z' }, 'inventory_levels': { 'updated_at': '2021-12-20T05:09:34.000000Z' }, 'locations': { 'updated_at': '2021-07-20T09:00:22.000000Z' }, 'events': { 'created_at': '2021-12-20T05:09:01.000000Z' }, 'inventory_items': { 'updated_at': '2021-09-15T19:44:11.000000Z' }, 'transactions': { 'created_at': '2021-12-20T00:08:52-05:00' }, 'metafields': { 'updated_at': '2021-09-07T21:18:05.000000Z' }, 'order_refunds': { 'created_at': '2021-05-01T17:41:18.000000Z' }, 'customers': { 'updated_at': '2021-12-20T05:08:17.000000Z' }, 'orders': { 'updated_at': '2021-12-20T05:09:01.000000Z' }, 'custom_collections': { 'updated_at': '2021-12-20T17:41:18.000000Z' } } for stream, updated_state in simulated_states.items(): new_state['bookmarks'][stream] = updated_state menagerie.set_state(conn_id, new_state) ############################### # Run Second Sync ############################### second_sync_record_count = self.run_sync(conn_id) second_sync_records = runner.get_records_from_target_output() second_sync_bookmark = menagerie.get_state(conn_id) for stream in testable_streams: with self.subTest(stream=stream): # expected values expected_replication_method = self.expected_replication_method( ) expected_replication_keys = self.expected_replication_keys() # information required for assertions from sync 1 and 2 based on expected values first_sync_count = first_sync_record_count.get(stream, 0) second_sync_count = second_sync_record_count.get(stream, 0) first_sync_messages = [ record.get('data') for record in first_sync_records.get( stream, {}).get('messages', []) if record.get('action') == 'upsert' ] second_sync_messages = [ record.get('data') for record 
in second_sync_records.get( stream, {}).get('messages', []) if record.get('action') == 'upsert' ] first_bookmark_value = first_sync_bookmark.get( 'bookmarks', { stream: None }).get(stream) first_bookmark_value = list(first_bookmark_value.values())[0] second_bookmark_value = second_sync_bookmark.get( 'bookmarks', { stream: None }).get(stream) second_bookmark_value = list(second_bookmark_value.values())[0] replication_key = next(iter(expected_replication_keys[stream])) first_bookmark_value_utc = self.convert_state_to_utc( first_bookmark_value) second_bookmark_value_utc = self.convert_state_to_utc( second_bookmark_value) simulated_bookmark = new_state['bookmarks'][stream] simulated_bookmark_value = list(simulated_bookmark.values())[0] # verify the syncs sets a bookmark of the expected form self.assertIsNotNone(first_bookmark_value) self.assertTrue( self.is_expected_date_format(first_bookmark_value)) self.assertIsNotNone(second_bookmark_value) self.assertTrue( self.is_expected_date_format(second_bookmark_value)) # verify the 2nd bookmark is equal to 1st sync bookmark #NOT A BUG (IS the expected behaviour for shopify as they are using date windowing : TDL-17096 : 2nd bookmark value is getting assigned from the execution time rather than the actual bookmark time. This is an invalid assertion for shopify #self.assertEqual(first_bookmark_value, second_bookmark_value) for record in first_sync_messages: replication_key_value = record.get(replication_key) # verify 1st sync bookmark value is the max replication key value for a given stream self.assertLessEqual( replication_key_value, first_bookmark_value_utc, msg= "First sync bookmark was set incorrectly, a record with a greater replication key value was synced" ) for record in second_sync_messages: replication_key_value = record.get(replication_key) # verify the 2nd sync replication key value is greater or equal to the 1st sync bookmarks self.assertGreaterEqual( replication_key_value, simulated_bookmark_value, msg= "Second sync records do not respect the previous bookmark" ) # verify the 2nd sync bookmark value is the max replication key value for a given stream self.assertLessEqual( replication_key_value, second_bookmark_value_utc, msg= "Second sync bookmark was set incorrectly, a record with a greater replication key value was synced" ) # verify that we get less data in the 2nd sync # collects has all the records with the same value of replication key, so we are removing from this assertion if stream not in ('collects'): self.assertLess( second_sync_count, first_sync_count, msg= "Second sync does not have less records, bookmark usage not verified" ) # verify that we get atleast 1 record in the second sync if stream not in ('collects'): self.assertGreater( second_sync_count, 0, msg="Second sync did not yield any records")
def bookmarks_test(self, expected_streams): """A Parametrized Bookmarks Test""" expected_replication_keys = self.expected_replication_keys() expected_replication_methods = self.expected_replication_method() expected_insights_buffer = -1 * int( self.get_properties()['insights_buffer_days']) # lookback window ########################################################################## ### First Sync ########################################################################## conn_id = connections.ensure_connection(self, original_properties=False) # Run in check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # Select only the expected streams tables catalog_entries = [ ce for ce in found_catalogs if ce['tap_stream_id'] in expected_streams ] self.perform_and_verify_table_and_field_selection( conn_id, catalog_entries, select_all_fields=True) # Run a sync job using orchestrator first_sync_record_count = self.run_and_verify_sync(conn_id) first_sync_records = runner.get_records_from_target_output() first_sync_bookmarks = menagerie.get_state(conn_id) ########################################################################## ### Update State Between Syncs ########################################################################## new_states = {'bookmarks': dict()} simulated_states = self.calculated_states_by_stream( first_sync_bookmarks) for stream, new_state in simulated_states.items(): new_states['bookmarks'][stream] = new_state menagerie.set_state(conn_id, new_states) ########################################################################## ### Second Sync ########################################################################## second_sync_record_count = self.run_and_verify_sync(conn_id) second_sync_records = runner.get_records_from_target_output() second_sync_bookmarks = menagerie.get_state(conn_id) ########################################################################## ### Test By Stream ########################################################################## for stream in expected_streams: with self.subTest(stream=stream): # expected values expected_replication_method = expected_replication_methods[ stream] # collect information for assertions from syncs 1 & 2 base on expected values first_sync_count = first_sync_record_count.get(stream, 0) second_sync_count = second_sync_record_count.get(stream, 0) first_sync_messages = [ record.get('data') for record in first_sync_records.get( stream).get('messages') if record.get('action') == 'upsert' ] second_sync_messages = [ record.get('data') for record in second_sync_records.get( stream).get('messages') if record.get('action') == 'upsert' ] first_bookmark_key_value = first_sync_bookmarks.get( 'bookmarks', { stream: None }).get(stream) second_bookmark_key_value = second_sync_bookmarks.get( 'bookmarks', { stream: None }).get(stream) if expected_replication_method == self.INCREMENTAL: # collect information specific to incremental streams from syncs 1 & 2 replication_key = next( iter(expected_replication_keys[stream])) first_bookmark_value = first_bookmark_key_value.get( replication_key) second_bookmark_value = second_bookmark_key_value.get( replication_key) first_bookmark_value_utc = self.convert_state_to_utc( first_bookmark_value) second_bookmark_value_utc = self.convert_state_to_utc( second_bookmark_value) simulated_bookmark_value = new_states['bookmarks'][stream][ replication_key] simulated_bookmark_minus_lookback = self.timedelta_formatted( simulated_bookmark_value, days=expected_insights_buffer) if self.is_insight( stream) else 
simulated_bookmark_value # Verify the first sync sets a bookmark of the expected form self.assertIsNotNone(first_bookmark_key_value) self.assertIsNotNone( first_bookmark_key_value.get(replication_key)) # Verify the second sync sets a bookmark of the expected form self.assertIsNotNone(second_bookmark_key_value) self.assertIsNotNone( second_bookmark_key_value.get(replication_key)) # Verify the second sync bookmark is Equal to the first sync bookmark self.assertEqual( second_bookmark_value, first_bookmark_value ) # assumes no changes to data during test for record in second_sync_messages: # Verify the second sync records respect the previous (simulated) bookmark value replication_key_value = record.get(replication_key) if stream == 'ads_insights_age_and_gender': # BUG | https://stitchdata.atlassian.net/browse/SRCE-4873 replication_key_value = datetime.datetime.strftime( dateutil.parser.parse(replication_key_value), self.BOOKMARK_COMPARISON_FORMAT) self.assertGreaterEqual( replication_key_value, simulated_bookmark_minus_lookback, msg= "Second sync records do not repect the previous bookmark." ) # Verify the second sync bookmark value is the max replication key value for a given stream self.assertLessEqual( replication_key_value, second_bookmark_value_utc, msg= "Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced." ) for record in first_sync_messages: # Verify the first sync bookmark value is the max replication key value for a given stream replication_key_value = record.get(replication_key) self.assertLessEqual( replication_key_value, first_bookmark_value_utc, msg= "First sync bookmark was set incorrectly, a record with a greater replication-key value was synced." ) # Verify the number of records in the 2nd sync is less then the first self.assertLess(second_sync_count, first_sync_count) elif expected_replication_method == self.FULL_TABLE: # Verify the syncs do not set a bookmark for full table streams self.assertIsNone(first_bookmark_key_value) self.assertIsNone(second_bookmark_key_value) # Verify the number of records in the second sync is the same as the first self.assertEqual(second_sync_count, first_sync_count) else: raise NotImplementedError( "INVALID EXPECTATIONS\t\tSTREAM: {} REPLICATION_METHOD: {}" .format(stream, expected_replication_method)) # Verify at least 1 record was replicated in the second sync self.assertGreater( second_sync_count, 0, msg="We are not fully testing bookmarking for {}".format( stream))
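The insights lookback above shifts the simulated bookmark by a negative number of days through self.timedelta_formatted(...). That helper is not shown here; the sketch below is an assumed, self-contained equivalent, and the name, format string, and example date are illustrative only.

# Hedged sketch of a date-shifting helper: move an ISO-8601 string by N days
# (negative N looks back) and return it in the same string format.
from datetime import datetime, timedelta

BOOKMARK_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ'  # assumed comparison format

def timedelta_formatted_sketch(dtime, days=0, fmt=BOOKMARK_FORMAT):
    """Shift an ISO-8601 string by `days` and keep its original format."""
    parsed = datetime.strptime(dtime, fmt)
    return datetime.strftime(parsed + timedelta(days=days), fmt)

# e.g. a 28-day insights buffer applied to a simulated bookmark:
print(timedelta_formatted_sketch('2021-12-20T05:09:01.000000Z', days=-28))  # 2021-11-22T05:09:01.000000Z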
def test_run(self): """ Verify that for each stream you can do a sync which records bookmarks. That the bookmark is the maximum value sent to the target for the replication key. That a second sync respects the bookmark All data of the second sync is >= the bookmark from the first sync The number of records in the 2nd sync is less then the first (This assumes that new data added to the stream is done at a rate slow enough that you haven't doubled the amount of data from the start date to the first sync between the first sync and second sync run in this test) Verify that for full table stream, all data replicated in sync 1 is replicated again in sync 2. PREREQUISITE For EACH stream that is incrementally replicated there are multiple rows of data with different values for the replication key """ expected_streams = self.expected_check_streams() expected_replication_keys = self.expected_replication_keys() expected_replication_methods = self.expected_replication_method() ########################################################################## # First Sync ########################################################################## conn_id = connections.ensure_connection(self) # Run in check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # table and field selection catalog_entries = [ catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in expected_streams ] self.perform_and_verify_table_and_field_selection( conn_id, catalog_entries) # Run a first sync job using orchestrator first_sync_record_count = self.run_and_verify_sync(conn_id) first_sync_records = runner.get_records_from_target_output() first_sync_bookmarks = menagerie.get_state(conn_id) ########################################################################## # Update State Between Syncs ########################################################################## new_states = {'bookmarks': dict()} simulated_states = self.calculated_states_by_stream( first_sync_bookmarks) for stream, new_state in simulated_states.items(): new_states['bookmarks'][stream] = new_state menagerie.set_state(conn_id, new_states) ########################################################################## # Second Sync ########################################################################## second_sync_record_count = self.run_and_verify_sync(conn_id) second_sync_records = runner.get_records_from_target_output() second_sync_bookmarks = menagerie.get_state(conn_id) ########################################################################## # Test By Stream ########################################################################## for stream in expected_streams: with self.subTest(stream=stream): # expected values expected_replication_method = expected_replication_methods[ stream] # collect information for assertions from syncs 1 & 2 base on expected values first_sync_count = first_sync_record_count.get(stream, 0) second_sync_count = second_sync_record_count.get(stream, 0) first_sync_messages = [ record.get('data') for record in first_sync_records.get( stream, {}).get('messages', []) if record.get('action') == 'upsert' ] second_sync_messages = [ record.get('data') for record in second_sync_records.get( stream, {}).get('messages', []) if record.get('action') == 'upsert' ] first_bookmark_key_value = first_sync_bookmarks.get( 'bookmarks', { stream: None }).get(stream) second_bookmark_key_value = second_sync_bookmarks.get( 'bookmarks', { stream: None }).get(stream) if expected_replication_method == self.INCREMENTAL: # collect information 
specific to incremental streams from syncs 1 & 2 replication_key = next( iter(expected_replication_keys[stream])) first_bookmark_value = first_bookmark_key_value.get( replication_key) second_bookmark_value = second_bookmark_key_value.get( replication_key) first_bookmark_value_utc = self.convert_state_to_utc( first_bookmark_value) second_bookmark_value_utc = self.convert_state_to_utc( second_bookmark_value) simulated_bookmark_value = self.convert_state_to_utc( new_states['bookmarks'][stream][replication_key]) # Verify the first sync sets a bookmark of the expected form self.assertIsNotNone(first_bookmark_key_value) self.assertIsNotNone(first_bookmark_value) # Verify the second sync sets a bookmark of the expected form self.assertIsNotNone(second_bookmark_key_value) self.assertIsNotNone(second_bookmark_value) # Verify the second sync bookmark is Equal to the first sync bookmark # assumes no changes to data during test if stream != "users": self.assertEqual(second_bookmark_value, first_bookmark_value) else: # The `users` stream stores its bookmark as 1 minute before the current time whenever the `updated_at` of # the last record is older than that. So, if there is no data change, second_bookmark_value will be # 1 minute before the current time and will therefore always be # greater than or equal to first_bookmark_value self.assertGreaterEqual(second_bookmark_value, first_bookmark_value) for record in first_sync_messages: # Verify the first sync bookmark value is the max replication key value for a given stream replication_key_value = record.get(replication_key) # The `tickets` stream stores its bookmark as an integer timestamp, so convert it to a string. if stream == "tickets": replication_key_value = datetime.utcfromtimestamp( replication_key_value).strftime( '%Y-%m-%dT%H:%M:%SZ') self.assertLessEqual( replication_key_value, first_bookmark_value_utc, msg= "First sync bookmark was set incorrectly, a record with a greater replication-key value was synced." ) for record in second_sync_messages: # Verify the second sync replication key value is Greater or Equal to the first sync bookmark replication_key_value = record.get(replication_key) if stream == "tickets": replication_key_value = datetime.utcfromtimestamp( replication_key_value).strftime( '%Y-%m-%dT%H:%M:%SZ') self.assertGreaterEqual( replication_key_value, simulated_bookmark_value, msg= "Second sync records do not respect the previous bookmark." ) # Verify the second sync bookmark value is the max replication key value for a given stream self.assertLessEqual( replication_key_value, second_bookmark_value_utc, msg= "Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced." ) elif expected_replication_method == self.FULL_TABLE: # Verify the syncs do not set a bookmark for full table streams self.assertIsNone(first_bookmark_key_value) self.assertIsNone(second_bookmark_key_value) # Verify the number of records in the second sync is the same as the first # The streams below are child streams of the incremental parent stream `tickets`. # Child streams also behave like incremental streams but do not save their own state, so they do not # necessarily have the same number of records in the second sync as in the first.
if stream not in [ "ticket_comments", "ticket_audits", "ticket_metrics" ]: self.assertEqual(second_sync_count, first_sync_count) else: raise NotImplementedError( "INVALID EXPECTATIONS\t\tSTREAM: {} REPLICATION_METHOD: {}" .format(stream, expected_replication_method)) # Verify at least 1 record was replicated in the second sync self.assertGreater( second_sync_count, 0, msg="We are not fully testing bookmarking for {}".format( stream))
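The tickets branch above converts an integer replication-key value with datetime.utcfromtimestamp before comparing it to the saved bookmark. A standalone illustration of that conversion, using the timezone-aware equivalent and a made-up epoch value:

# Convert a Unix timestamp into the same string form the bookmark comparison uses.
from datetime import datetime, timezone

def epoch_to_bookmark_string(epoch_seconds):
    """Render an integer epoch value as a UTC bookmark-style string."""
    return datetime.fromtimestamp(epoch_seconds, tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

print(epoch_to_bookmark_string(1640000000))  # 2021-12-20T11:33:20Z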
def test_run(self): expected_streams = self.expected_streams() expected_replication_keys = self.expected_replication_keys() expected_replication_methods = self.expected_replication_method() ########################################################################## ### First Sync ########################################################################## self.start_date_1 = self.get_properties().get("start_date") self.start_date_2 = self.timedelta_formatted(self.start_date_1, days=3) self.start_date = self.start_date_1 conn_id = connections.ensure_connection(self, original_properties=False) # Run in check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # Select only the expected streams tables catalog_entries = [ ce for ce in found_catalogs if ce['tap_stream_id'] in expected_streams ] self.perform_and_verify_table_and_field_selection( conn_id, catalog_entries, select_all_fields=True) # Run a sync job using orchestrator first_sync_record_count = self.run_and_verify_sync(conn_id) first_sync_records = runner.get_records_from_target_output() first_sync_bookmarks = menagerie.get_state(conn_id) ########################################################################## ### Update State Between Syncs ########################################################################## new_states = {'bookmarks': dict()} simulated_states = self.calculated_states_by_stream( first_sync_bookmarks) for stream, new_state in simulated_states.items(): new_states['bookmarks'][stream] = new_state menagerie.set_state(conn_id, new_states) for stream in simulated_states.keys(): for state_key, state_value in simulated_states[stream].items(): if stream not in new_states['bookmarks']: new_states['bookmarks'][stream] = {} if state_key not in new_states['bookmarks'][stream]: new_states['bookmarks'][stream][state_key] = state_value ########################################################################## ### Second Sync ########################################################################## self.start_date = self.start_date_2 # run check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # table and field selection test_catalogs_2_all_fields = [ catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in expected_streams ] self.perform_and_verify_table_and_field_selection( conn_id, test_catalogs_2_all_fields, select_all_fields=True) second_sync_record_count = self.run_and_verify_sync(conn_id) second_sync_records = runner.get_records_from_target_output() second_sync_bookmarks = menagerie.get_state(conn_id) ########################################################################## ### Test By Stream ########################################################################## for stream in expected_streams: with self.subTest(stream=stream): expected_replication_method = expected_replication_methods[ stream] first_bookmark_key_value = first_sync_bookmarks.get( 'bookmarks', { stream: None }).get(stream) second_bookmark_key_value = second_sync_bookmarks.get( 'bookmarks', { stream: None }).get(stream) # expected values first_sync_count = first_sync_record_count.get(stream, 0) second_sync_count = second_sync_record_count.get(stream, 0) # collect information for assertions from syncs 1 & 2 base on expected values first_sync_messages = [ record.get('data') for record in first_sync_records.get( stream).get('messages') if record.get('action') == 'upsert' ] second_sync_messages = [ record.get('data') for record in second_sync_records.get( stream).get('messages') if record.get('action') == 'upsert' ] if 
expected_replication_method == self.INCREMENTAL: replication_key = next( iter(expected_replication_keys[stream])) if stream != 'forms': for form_key in self.get_forms(): first_bookmark_value = first_bookmark_key_value.get( form_key, {}).get(replication_key) second_bookmark_value = second_bookmark_key_value.get( form_key, {}).get(replication_key) first_bookmark_value_utc = self.convert_state_to_utc( first_bookmark_value) second_bookmark_value_utc = self.convert_state_to_utc( second_bookmark_value) simulated_bookmark_value = new_states['bookmarks'][ stream][form_key] simulated_bookmark_minus_lookback = simulated_bookmark_value # Verify the first sync sets a bookmark of the expected form self.assertIsNotNone(first_bookmark_key_value) # Verify the second sync sets a bookmark of the expected form self.assertIsNotNone(second_bookmark_key_value) # Verify the second sync bookmark is Greater or Equal to the first sync bookmark self.assertGreaterEqual( second_bookmark_value, first_bookmark_value ) # new responses could be picked up for the form in the second sync for record in second_sync_messages: # Verify the second sync records respect the previous (simulated) bookmark value replication_key_value = record.get( replication_key) self.assertGreaterEqual( replication_key_value, simulated_bookmark_minus_lookback, msg= "Second sync records do not repect the previous bookmark." ) # Verify the second sync bookmark value is the max replication key value for a given stream self.assertLessEqual( replication_key_value, second_bookmark_value_utc, msg= "Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced." ) for record in first_sync_messages: # Verify the first sync bookmark value is the max replication key value for a given stream replication_key_value = record.get( replication_key) self.assertLessEqual( replication_key_value, first_bookmark_value_utc, msg= "First sync bookmark was set incorrectly, a record with a greater replication-key value was synced." ) # Verify the number of records in the 2nd sync is less then the first self.assertLess(second_sync_count, first_sync_count) else: # collect information specific to incremental streams from syncs 1 & 2 first_bookmark_value = first_bookmark_key_value.get( replication_key) second_bookmark_value = second_bookmark_key_value.get( replication_key) first_bookmark_value_utc = self.convert_state_to_utc( first_bookmark_value) second_bookmark_value_utc = self.convert_state_to_utc( second_bookmark_value) simulated_bookmark_value = new_states['bookmarks'][ stream][replication_key] simulated_bookmark_minus_lookback = simulated_bookmark_value # Verify the first sync sets a bookmark of the expected form self.assertIsNotNone(first_bookmark_key_value) # Verify the second sync sets a bookmark of the expected form self.assertIsNotNone(second_bookmark_key_value) # Verify the second sync bookmark is Greater or Equal to the first sync bookmark self.assertGreaterEqual( second_bookmark_value, first_bookmark_value ) # new responses could be picked up for the form in the second sync for record in second_sync_messages: # Verify the second sync records respect the previous (simulated) bookmark value replication_key_value = record.get(replication_key) self.assertGreaterEqual( replication_key_value, simulated_bookmark_minus_lookback, msg= "Second sync records do not repect the previous bookmark." 
) # Verify the second sync bookmark value is the max replication key value for a given stream self.assertLessEqual( replication_key_value, second_bookmark_value_utc, msg= "Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced." ) for record in first_sync_messages: # Verify the first sync bookmark value is the max replication key value for a given stream replication_key_value = record.get(replication_key) self.assertLessEqual( replication_key_value, first_bookmark_value_utc, msg= "First sync bookmark was set incorrectly, a record with a greater replication-key value was synced." ) # Verify the number of records in the 2nd sync is less than the first self.assertLess(second_sync_count, first_sync_count) elif expected_replication_method == self.FULL_TABLE: # Verify the syncs do not set a bookmark for full table streams self.assertIsNone(first_bookmark_key_value) self.assertIsNone(second_bookmark_key_value) # Verify the number of records in the second sync is the same as the first self.assertEqual(second_sync_count, first_sync_count) else: raise NotImplementedError( "INVALID EXPECTATIONS\t\tSTREAM: {} REPLICATION_METHOD: {}" .format(stream, expected_replication_method)) # Verify at least 1 record was replicated in the second sync self.assertGreater( second_sync_count, 0, msg="We are not fully testing bookmarking for {}".format( stream))
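Because this tap keeps one bookmark per form (note the form_key lookups above), the state written back between syncs nests an extra level under each stream. The snippet below sketches that shape; the stream name, form ids, and date are made-up examples, not values from the test suite.

def build_simulated_state(stream, form_ids, rewound_value):
    """Return a state dict that rewinds every form's bookmark to one value."""
    return {'bookmarks': {stream: {form_id: rewound_value for form_id in form_ids}}}

# Hypothetical example of what menagerie.set_state(conn_id, ...) would receive:
state = build_simulated_state('responses', ['abc123', 'def456'], '2021-06-01T00:00:00Z')
print(state['bookmarks']['responses']['abc123'])  # 2021-06-01T00:00:00Z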
def binlog_json_test(self): print("RUNNING {}\n\n".format(self.name())) conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) expected_check_streams = {self.tap_stream_id()} expected_sync_streams = {self.table_name()} expected_pks = {self.table_name(): {'id'}} # verify the tap discovered the right streams found_catalogs = [ catalog for catalog in menagerie.get_catalogs(conn_id) if catalog['tap_stream_id'] in expected_check_streams ] self.assertGreaterEqual( len(found_catalogs), 1, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = expected_check_streams.symmetric_difference(found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) # verify that persisted streams have the correct properties test_catalog = found_catalogs[0] self.assertEqual(self.table_name(), test_catalog['stream_name']) print("discovered streams are correct") additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] selected_metadata = connections.select_catalog_and_fields_via_metadata( conn_id, test_catalog, menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']), additional_md) # clear state menagerie.set_state(conn_id, {}) # run initial full table sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() self.maxDiff = None for stream, recs in records_by_stream.items(): self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." 
.format(stream)) record_count_by_stream = runner.examine_target_output_file( self, conn_id, expected_sync_streams, expected_pks) self.assertEqual(record_count_by_stream, {self.table_name(): 1}) records_for_stream = runner.get_records_from_target_output()[ self.table_name()] messages_for_stream = records_for_stream['messages'] message_actions = [rec['action'] for rec in messages_for_stream] self.assertEqual(message_actions, ['activate_version', 'upsert', 'activate_version']) # ensure some log_file and log_pos state was persisted state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id()] self.assertIsNotNone(bookmark['log_file']) self.assertIsNotNone(bookmark['log_pos']) expected_log_file = bookmark['log_file'] expected_log_pos = bookmark['log_pos'] # grab version, log_file and log_pos from state to check later expected_table_version = records_for_stream['table_version'] self.assertEqual(expected_table_version, bookmark['version']) # check for expected records upsert_records = [ m['data'] for m in messages_for_stream if m['action'] == 'upsert' ] self.assertEqual([expected_rec_1], upsert_records) # run binlog sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # check that version state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id()] self.assertEqual(expected_table_version, bookmark['version']) # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() for stream, recs in records_by_stream.items(): self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." 
.format(stream)) # record count should be empty as we did not persist anything to the gate record_count_by_stream = runner.examine_target_output_file( self, conn_id, expected_sync_streams, expected_pks) self.assertEqual(record_count_by_stream, {}) # insert a new huge row data = dict([('foooo%i' % i, 'baaaaar%i' % i) for i in range(2560)], literal=True) rec = {'id': 2, 'our_json': json.dumps(data)} with db_utils.get_db_connection( self.get_properties(), self.get_credentials()).cursor() as cur: self.insert_record(cur, rec) # run binlog sync sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # check that version from state is unchanged state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id()] self.assertEqual(expected_table_version, bookmark['version']) # Either the log_file is the same but the log_pos has increased or the log_file # has rotated and the numeric suffix has increased if expected_log_file == bookmark['log_file']: self.assertGreater(bookmark['log_pos'], expected_log_pos) else: expected_log_file_suffix = re.search('^.*\.(\d+)$', expected_log_file).groups()[0] updated_log_file_suffix = re.search( '^.*\.(\d+)$', bookmark['log_file']).groups()[0] self.assertGreater(int(updated_log_file_suffix), int(expected_log_file_suffix)) expected_log_file = bookmark['log_file'] expected_log_pos = bookmark['log_pos'] expected_rec_2 = copy.deepcopy(rec) # check for expected records records_for_stream = runner.get_records_from_target_output()[ self.table_name()] messages_for_stream = records_for_stream['messages'] message_actions = [rec['action'] for rec in messages_for_stream] self.assertEqual(message_actions, ['upsert']) upsert_records = [ m['data'] for m in messages_for_stream if m['action'] == 'upsert' ] del upsert_records[0]['_sdc_deleted_at'] expected_json = json.loads(expected_rec_2.get('our_json', {})) actual_json = json.loads(upsert_records[0].get('our_json', {})) self.assertTrue(len(actual_json.keys()) > 0) self.assertEqual(expected_json, actual_json)
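The end of the binlog test accepts either an advanced log_pos within the same log_file, or a rotated log_file with a larger numeric suffix. The same check is restated below as a standalone function (the file names are invented), using raw-string regexes to avoid escape warnings.

import re

def binlog_position_advanced(old_file, old_pos, new_file, new_pos):
    """True if (new_file, new_pos) is strictly ahead of (old_file, old_pos)."""
    if old_file == new_file:
        return new_pos > old_pos
    # The file rotated: compare the numeric suffixes, e.g. mysql-bin.000003 -> 3.
    old_suffix = int(re.search(r'^.*\.(\d+)$', old_file).group(1))
    new_suffix = int(re.search(r'^.*\.(\d+)$', new_file).group(1))
    return new_suffix > old_suffix

assert binlog_position_advanced('mysql-bin.000003', 154, 'mysql-bin.000003', 520)
assert binlog_position_advanced('mysql-bin.000003', 154, 'mysql-bin.000004', 4)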
def test_run(self): conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify the tap discovered the right streams found_catalogs = [ fc for fc in menagerie.get_catalogs(conn_id) if fc['tap_stream_id'] in self.expected_check_streams() ] self.assertEqual( len(found_catalogs), 1, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) # verify that persisted streams have the correct properties chicken_catalog = found_catalogs[0] self.assertEqual('chicken_view', chicken_catalog['stream_name']) print("discovered streams are correct") print('checking discoverd metadata for ROOT-CHICKEN_VIEW') md = menagerie.get_annotated_schema( conn_id, chicken_catalog['stream_id'])['metadata'] self.assertEqual( { (): { 'database-name': 'postgres', 'is-view': True, 'row-count': 0, 'schema-name': 'public', 'table-key-properties': [] }, ('properties', 'fk_id'): { 'inclusion': 'available', 'sql-datatype': 'bigint', 'selected-by-default': True }, ('properties', 'name'): { 'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': True }, ('properties', 'age'): { 'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True }, ('properties', 'size'): { 'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': True }, ('properties', 'id'): { 'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True } }, metadata.to_map(md)) # 'ID' selected as view-key-properties replication_md = [{ "breadcrumb": [], "metadata": { 'replication-key': None, "replication-method": "FULL_TABLE", 'view-key-properties': ["id"] } }] connections.select_catalog_and_fields_via_metadata( conn_id, chicken_catalog, menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id']), replication_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) self.assertEqual(record_count_by_stream, {'chicken_view': 1}) records_by_stream = runner.get_records_from_target_output() table_version = records_by_stream['chicken_view']['table_version'] self.assertEqual( records_by_stream['chicken_view']['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream['chicken_view']['messages'][1]['action'], 'upsert') self.assertEqual( records_by_stream['chicken_view']['messages'][2]['action'], 'activate_version') # verifications about individual records for stream, recs in records_by_stream.items(): # verify the persisted schema was correct self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." 
.format(stream)) actual_chicken_record = records_by_stream['chicken_view']['messages'][ 1]['data'] expected_chicken_record = { 'id': 1, 'fk_id': 1, 'name': 'fred', 'age': 99, 'size': 'big' } self.assertEqual( actual_chicken_record, expected_chicken_record, msg= "Expected `chicken_view` upsert record data to be {}, but target output was {}" .format(expected_chicken_record, actual_chicken_record)) print("records are correct") # verify state and bookmarks state = menagerie.get_state(conn_id) chicken_bookmark = state['bookmarks']['postgres-public-chicken_view'] self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") self.assertEqual( chicken_bookmark['version'], table_version, msg="expected bookmark for stream postgres-public-chicken_view to match the table version")
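The large metadata assertion in this view test compares against metadata.to_map(md), which keys each discovered entry by its breadcrumb. A minimal sketch of that transformation follows; it is not the singer-python implementation, and the sample entries are invented.

def metadata_to_map_sketch(metadata_entries):
    """Key each {breadcrumb, metadata} entry by its breadcrumb as a tuple."""
    return {tuple(entry['breadcrumb']): entry['metadata'] for entry in metadata_entries}

discovered = [
    {'breadcrumb': [], 'metadata': {'is-view': True, 'table-key-properties': []}},
    {'breadcrumb': ['properties', 'id'], 'metadata': {'sql-datatype': 'integer'}},
]
print(metadata_to_map_sketch(discovered)[('properties', 'id')])  # {'sql-datatype': 'integer'}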
def test_run(self): conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify the tap discovered the right streams found_catalogs = [ fc for fc in menagerie.get_catalogs(conn_id) if fc['tap_stream_id'] in self.expected_check_streams() ] self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) # verify that persisted streams have the correct properties for c in found_catalogs: catalog_props_to_check = ['stream_name', 'tap_stream_id'] stream = c['stream_name'] for prop in catalog_props_to_check: self.assertEqual( c[prop], expected_catalogs[stream][prop], msg= "unexpected stream catalog property `{}` for stream `{}`: `{}` != `{}`" .format(prop, stream, expected_catalogs[stream][prop], c[prop])) print("discovered streams are correct") print('checking discoverd metadata for tap_tester_mysql_0-incremental') incremental_catalog = [ c for c in found_catalogs if c['tap_stream_id'] == 'tap_tester_mysql_0-incremental' ][0] md = menagerie.get_annotated_schema( conn_id, incremental_catalog['stream_id'])['metadata'] incremental_stream_metadata = { 'database-name': 'tap_tester_mysql_0', 'row-count': 3, 'is-view': False, 'selected-by-default': False, 'table-key-properties': ['c_pk'] } self.assertEqual( sorted(md, key=lambda x: x['breadcrumb']), [{ 'breadcrumb': [], 'metadata': incremental_stream_metadata }, { 'breadcrumb': ['properties', 'c_dt'], 'metadata': { 'selected-by-default': True, 'sql-datatype': 'datetime' } }, { 'breadcrumb': ['properties', 'c_pk'], 'metadata': { 'selected-by-default': True, 'sql-datatype': 'int(11)' } }, { 'breadcrumb': ['properties', 'c_varchar'], 'metadata': { 'selected-by-default': True, 'sql-datatype': 'varchar(255)' } }, { 'breadcrumb': ['properties', 'c_varchar_to_deselect'], 'metadata': { 'selected-by-default': True, 'sql-datatype': 'varchar(255)' } }]) print('checking discovered metadata for tap_tester_mysql_1-view') view_catalog = [ c for c in found_catalogs if c['tap_stream_id'] == 'tap_tester_mysql_1-view' ][0] view_catalog_key_properties_md = [{ 'breadcrumb': [], 'metadata': { 'view-key-properties': ['c_pk'] } }] connections.set_non_discoverable_metadata( conn_id, view_catalog, menagerie.get_annotated_schema(conn_id, view_catalog['stream_id']), view_catalog_key_properties_md) md = menagerie.get_annotated_schema( conn_id, view_catalog['stream_id'])['metadata'] view_stream_metadata = { 'database-name': 'tap_tester_mysql_1', 'is-view': True, 'selected-by-default': False, 'view-key-properties': ['c_pk'] } self.assertEqual(sorted(md, key=lambda x: x['breadcrumb']), [{ 'breadcrumb': [], 'metadata': view_stream_metadata }, { 'breadcrumb': ['properties', 'c_pk'], 'metadata': { 'selected-by-default': True, 'sql-datatype': 'int(11)' } }, { 'breadcrumb': ['properties', 'c_varchar'], 'metadata': { 'selected-by-default': True, 'sql-datatype': 'varchar(255)' } }]) #No selected-by-default MD for c_year because it is an unsupported type various_types_catalog = [ c for c in found_catalogs if c['tap_stream_id'] == 'tap_tester_mysql_0-various_types' ][0] md = 
menagerie.get_annotated_schema( conn_id, various_types_catalog['stream_id'])['metadata'] c_year_md = [ x for x in md if x['breadcrumb'] == ['properties', 'c_year'] ] self.assertEqual(c_year_md, [{ 'breadcrumb': ['properties', 'c_year'], 'metadata': { 'selected-by-default': False, 'sql-datatype': 'year(4)' } }]) ##select_simple_example catalogs_to_select = [ c for c in found_catalogs if c['tap_stream_id'] != 'tap_tester_mysql_0-simple_example' ] for a_catalog in catalogs_to_select: additional_md = [] unselected_fields = [] if a_catalog['tap_stream_id'] == 'tap_tester_mysql_0-incremental': additional_md = [{ "breadcrumb": [], "metadata": { 'replication-key': 'c_dt', 'replication-method': 'INCREMENTAL' } }] unselected_fields = ['c_varchar_to_deselect'] elif a_catalog['tap_stream_id'] == 'tap_tester_mysql_1-view': additional_md = [{ "breadcrumb": [], "metadata": { 'view-key-properties': ['c_pk'], 'replication-method': 'FULL_TABLE' } }] else: additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'FULL_TABLE' } }] selected_metadata = connections.select_catalog_and_fields_via_metadata( conn_id, a_catalog, menagerie.get_annotated_schema(conn_id, a_catalog['stream_id']), additional_md, unselected_fields) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values()) expected_row_count = 8 # {'my_isam': 1, 'various_types': 3, 'incremental': 3, 'view': 1} self.assertEqual( replicated_row_count, expected_row_count, msg="failed to replicate correct number of rows: {}".format( record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) records_by_stream = runner.get_records_from_target_output() # verifications about individual records for stream, recs in records_by_stream.items(): # verify that activate version messages were sent in the proper position self.assertEqual( recs['messages'][0]['action'], 'activate_version', msg= "Expected first message sent for stream `{}` to have action `activate_version`" .format(stream)) # verify the persisted schema was correct self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." 
.format(stream)) # verify that the target output the proper numeric and date representations expected_various_types_records = [{ 'c_time': '1970-01-01T12:34:56.000000Z', 'c_mediumint': 8388607, 'c_smallint': 32767, 'c_tinyint': 127, 'c_date': '2017-09-13T00:00:00.000000Z', 'c_bigint': 9223372036854775807, 'c_decimal': -1, 'c_int': 2147483647, 'c_bit': True, 'c_decimal_2': Decimal('123456789.0'), 'c_pk': 1, 'c_double': Decimal("1.234"), 'c_float': Decimal("1.234"), 'c_decimal_2_unsigned': Decimal("1.23"), 'c_tinyint_1': True }, { 'c_time': '1970-01-01T12:34:57.000000Z', 'c_mediumint': -8388608, 'c_smallint': -32768, 'c_tinyint': -128, 'c_date': '2017-09-14T00:00:00.000000Z', 'c_bigint': -9223372036854775808, 'c_decimal': 0, 'c_int': -2147483648, 'c_bit': False, 'c_decimal_2': Decimal("123456790.0"), 'c_pk': 2, 'c_double': Decimal("2.234"), 'c_float': Decimal("2.234"), 'c_decimal_2_unsigned': Decimal("0.23"), 'c_tinyint_1': False }, { 'c_time': '1970-01-01T12:34:57.000000Z', 'c_mediumint': -8388608, 'c_smallint': -32768, 'c_tinyint': -128, 'c_date': '2017-09-14T00:00:00.000000Z', 'c_bigint': -9223372036854775808, 'c_decimal': 0, 'c_int': -2147483648, 'c_bit': None, 'c_decimal_2': Decimal("123456790.0"), 'c_pk': 3, 'c_double': Decimal("2.234"), 'c_float': Decimal("2.234"), 'c_decimal_2_unsigned': Decimal("0.23"), 'c_tinyint_1': None }] actual_various_types_records = [ r['data'] for r in records_by_stream['various_types']['messages'][1:4] ] self.assertEqual( actual_various_types_records, expected_various_types_records, msg= "Expected `various_types` upsert record data to be {}, but target output {}" .format(expected_various_types_records, actual_various_types_records)) # verify that deselected property was not output expected_incremental_record = { 'c_pk': 1, 'c_dt': '2017-01-01T00:00:00.000000Z', 'c_varchar': 'a' } actual_incremental_record = records_by_stream['incremental'][ 'messages'][1]['data'] self.assertEqual( actual_incremental_record, expected_incremental_record, msg= "Expected first `incremental` upsert record data to be {}, but target output {}" .format(expected_incremental_record, actual_incremental_record)) print("records are correct") # verify state and bookmarks state = menagerie.get_state(conn_id) bookmarks = state['bookmarks'] self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") for k, v in bookmarks.items(): if k == 'tap_tester_mysql_0-incremental': self.assertIsNotNone( v['version'], msg="expected bookmark for stream `{}` to have a version set" .format(k)) self.assertEqual( v['replication_key_value'], '2017-01-01T00:00:02.000000Z', msg= "incorrect replication_key_value in bookmark for stream `{}`" .format(k)) self.assertEqual( v['replication_key'], 'c_dt', msg= "incorrect replication_key specified in bookmark for stream `{}`" .format(k)) else: self.assertFalse( 'version' in v, msg= "expected bookmark for stream `{}` to not have a version key" .format(k)) self.assertTrue( 'initial_full_table_complete' in v, msg= "expected bookmark for stream `{}` to have a true initial_full_table_complete key" .format(k)) print("state and bookmarks are correct") incremental_table_initial_table_version = bookmarks[ 'tap_tester_mysql_0-incremental']['version'] #---------------------------------------------------------------------- # invoke the sync job again after some modifications #---------------------------------------------------------------------- print("adding a column to an existing table in the source db") connection = 
db_utils.get_db_connection(self.get_properties(), self.get_credentials()) with connection.cursor() as cursor: add_column_sql = ''' ALTER TABLE tap_tester_mysql_0.incremental ADD COLUMN favorite_number INTEGER; INSERT INTO tap_tester_mysql_0.incremental VALUES (4, '4', '2017-01-01 00:00:03', 'yeehaw', 999); ''' cursor.execute(add_column_sql) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify the tap discovered the right streams found_catalogs = [ fc for fc in menagerie.get_catalogs(conn_id) if fc['tap_stream_id'] in self.expected_check_streams() ] self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values()) expected_row_count = 7 # {'my_isam': 1, 'various_types': 3, 'incremental': 2, 'view': 1} self.assertEqual( replicated_row_count, expected_row_count, msg="failed to replicate correct number of rows: {}".format( record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) records_by_stream = runner.get_records_from_target_output() # verifications about individual records for stream, recs in records_by_stream.items(): # verify that activate_version messages were sent in the proper position if stream == 'incremental': self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version', msg= "Expected first message sent for stream `{}` to have action `activate_version`" .format(stream)) expected_schema_of_new_column = { 'maximum': 2147483647, 'inclusion': 'available', 'type': ['null', 'integer'], 'minimum': -2147483648 } self.assertEqual( records_by_stream[stream]['schema']['properties'] ['favorite_number'], expected_schema_of_new_column, msg= "Expected newly-added column to be present in schema for stream `{}`, but it was not."
.format(stream)) else: self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'upsert', msg= "Expected first message sent for stream `{}` to have action `upsert`" .format(stream)) self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version', msg= "Expected last message sent for stream `{}` to have action `activate_version`" .format(stream)) state = menagerie.get_state(conn_id) bookmarks = state['bookmarks'] self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") for k, v in bookmarks.items(): if k == 'tap_tester_mysql_0-incremental': self.assertIsNotNone( v['version'], msg="expected bookmark for stream `{}` to have a version set" .format(k)) self.assertEqual( v['replication_key_value'], '2017-01-01T00:00:03.000000Z', msg= "incorrect replication_key_value in bookmark for stream `{}`" .format(k)) self.assertEqual( v['replication_key'], 'c_dt', msg= "incorrect replication_key specified in bookmark for stream `{}`" .format(k)) else: self.assertFalse( 'version' in v, msg= "expected bookmark for stream `{}` to not have a version key" .format(k)) self.assertTrue( 'initial_full_table_complete' in v, msg= "expected bookmark for stream `{}` to have a true initial_full_table_complete key" .format(k)) print("state and bookmarks are correct") # verify incremental table_version didn't change incremental_table_new_table_version = bookmarks[ 'tap_tester_mysql_0-incremental']['version'] self.assertEqual( incremental_table_initial_table_version, incremental_table_new_table_version, msg= "Expected incrementally-replicated table's table_version to remain unchanged over multiple invocations." )
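The bookmark assertions in this MySQL test boil down to a shape rule: incremental bookmarks carry replication_key, replication_key_value, and version, while full-table bookmarks carry only initial_full_table_complete and no version. That rule is restated below as a standalone checker; the sample state is illustrative, not captured from a test run.

def bookmark_shape_ok(stream_id, bookmark, incremental_streams):
    """Validate a bookmark dict against the rules asserted above."""
    if stream_id in incremental_streams:
        return all(key in bookmark for key in ('replication_key', 'replication_key_value', 'version'))
    return 'initial_full_table_complete' in bookmark and 'version' not in bookmark

bookmarks = {
    'tap_tester_mysql_0-incremental': {'replication_key': 'c_dt', 'replication_key_value': '2017-01-01T00:00:03.000000Z', 'version': 1},
    'tap_tester_mysql_1-view': {'initial_full_table_complete': True},
}
assert all(bookmark_shape_ok(s, b, {'tap_tester_mysql_0-incremental'}) for s, b in bookmarks.items())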