def file_modified_test(self): # sync 1 conn_id_1 = connections.ensure_connection(self) found_catalogs_1 = self.run_and_verify_check_mode(conn_id_1) self.perform_and_verify_table_and_field_selection(conn_id_1,found_catalogs_1) record_count_by_stream_1 = self.run_and_verify_sync(conn_id_1) synced_records_1 = runner.get_records_from_target_output() # checking if we got any records self.assertGreater(sum(record_count_by_stream_1.values()), 0) # changing start date to "utcnow" self.START_DATE = dt.strftime(dt.utcnow(), "%Y-%m-%dT00:00:00Z") # adding some data to the file self.append_to_files() # sync 2 conn_id_2 = connections.ensure_connection(self, original_properties = False) found_catalogs_2 = self.run_and_verify_check_mode(conn_id_2) self.perform_and_verify_table_and_field_selection(conn_id_2,found_catalogs_2) record_count_by_stream_2 = self.run_and_verify_sync(conn_id_2) synced_records_2 = runner.get_records_from_target_output() # checking if we got any data self.assertGreater(sum(record_count_by_stream_2.values()), 0) # verifying if we got more data in sync 2 than sync 1 self.assertGreater(sum(record_count_by_stream_2.values()), sum(record_count_by_stream_1.values())) for stream in self.expected_check_streams(): expected_primary_keys = self.expected_pks() record_count_sync_1 = record_count_by_stream_1.get(stream, 0) record_count_sync_2 = record_count_by_stream_2.get(stream, 0) primary_keys_list_1 = [tuple(message.get('data').get(expected_pk) for expected_pk in expected_primary_keys) for message in synced_records_1.get(stream).get('messages') if message.get('action') == 'upsert'] primary_keys_list_2 = [tuple(message.get('data').get(expected_pk) for expected_pk in expected_primary_keys) for message in synced_records_2.get(stream).get('messages') if message.get('action') == 'upsert'] primary_keys_sync_1 = set(primary_keys_list_1) primary_keys_sync_2 = set(primary_keys_list_2) # Verify the number of records replicated in sync 2 is greater than the number # of records replicated in sync 1 for stream self.assertGreater(record_count_sync_2, record_count_sync_1) # Verify the records replicated in sync 1 were also replicated in sync 2 self.assertTrue(primary_keys_sync_1.issubset(primary_keys_sync_2))
def do_test(self, conn_id): # Select our catalogs our_catalogs = [c for c in self.found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams()] for c in our_catalogs: c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id']) c_metadata = metadata.to_map(c_annotated['metadata']) connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], []) # Clear state before our run menagerie.set_state(conn_id, {}) # Run a sync job using orchestrator sync_job_name = runner.run_sync_mode(self, conn_id) # Verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # Verify actual rows were synced record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum,c : accum + c, record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) # Ensure all records have a value for PK(s) records = runner.get_records_from_target_output() for stream in self.expected_sync_streams(): messages = records.get(stream,{}).get('messages',[]) if stream in ['tickets', 'groups', 'users']: self.assertGreater(len(messages), 100, msg="Stream {} has fewer than 100 records synced".format(stream)) for m in messages: pk_set = self.expected_pks()[stream] for pk in pk_set: self.assertIsNotNone(m.get('data', {}).get(pk), msg="Missing primary-key for message {}".format(m))
def test_run(self): """ Verify that we can get multiple pages of unique records for each stream """ conn_id = connections.ensure_connection(self) self.run_and_verify_check_mode(conn_id) self.select_and_verify_fields(conn_id) record_count_by_stream = self.run_and_verify_sync(conn_id) all_records_by_stream = runner.get_records_from_target_output() page_size = int(self.get_properties()['page_size']) for stream in self.expected_sync_streams(): with self.subTest(stream=stream): # Assert all expected streams synced at least a full pages of records self.assertGreater( record_count_by_stream.get(stream, 0), page_size, msg="{} did not sync more than a page of records".format(stream) ) records = [ x['data'] for x in all_records_by_stream[stream]['messages']] unique_records = self.get_unique_records(stream, records) self.assertGreater(len(unique_records), page_size)
def test_run(self): """ Testing that all the automatic fields are replicated despite de-selecting them - Verify that only the automatic fields are sent to the target. - Verify that all replicated records have unique primary key values. """ conn_id = connections.ensure_connection(self) # we are getting duplicate records for 'id' fields for this stream # when asked support about this, but this is known behavior from the API side # Please refer card: https://jira.talendforge.org/browse/TDL-18686 for more details known_failing_streams = {"targeting_android_versions"} expected_streams = self.expected_streams( ) - known_failing_streams - self.stats_streams # run check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # de-select all the fields self.select_found_catalogs(conn_id, found_catalogs, only_streams=expected_streams, deselect_all_fields=True) # run sync record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() for stream in expected_streams: with self.subTest(stream=stream): # expected values expected_primary_keys = self.expected_primary_keys()[stream] expected_keys = expected_primary_keys | self.expected_replication_keys( )[stream] # collect actual values messages = synced_records.get(stream) record_messages_keys = [ set(row['data'].keys()) for row in messages['messages'] ] # check if the stream has collected some records self.assertGreater(record_count_by_stream.get(stream, 0), 0) # Verify that only the automatic fields are sent to the target for actual_keys in record_messages_keys: self.assertSetEqual(expected_keys, actual_keys) # Verify we did not duplicate any records across pages records_pks_list = [ tuple([ message.get('data').get(primary_key) for primary_key in expected_primary_keys ]) for message in messages.get('messages') ] self.assertCountEqual( records_pks_list, set(records_pks_list), msg="We have duplicate records for {}".format(stream))
def run_test(self):
    conn_id = connections.ensure_connection(self)

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    catalog = menagerie.get_catalogs(conn_id)
    found_catalog_names = set(map(lambda c: c['tap_stream_id'], catalog))

    # assert we find the correct streams
    self.assertEqual(self.expected_check_streams(), found_catalog_names)

    for tap_stream_id in self.expected_check_streams():
        found_stream = [c for c in catalog if c['tap_stream_id'] == tap_stream_id][0]
        schema_and_metadata = menagerie.get_annotated_schema(conn_id, found_stream['stream_id'])
        main_metadata = schema_and_metadata["metadata"]
        stream_metadata = [mdata for mdata in main_metadata if mdata["breadcrumb"] == []]

        # assert that the primary keys are correct
        self.assertEqual(
            self.expected_pks()[tap_stream_id],
            set(stream_metadata[0]['metadata']['table-key-properties']))

    for stream_catalog in catalog:
        annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema['annotated-schema'], [])

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    messages_by_stream = runner.get_records_from_target_output()

    # assert that each of the streams we synced are the ones we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_first_sync_streams(), self.expected_pks())

    # Verify that the full table was synced
    for tap_stream_id in self.expected_first_sync_streams():
        self.assertEqual(
            self.expected_first_sync_row_counts()[tap_stream_id],
            record_count_by_stream[tap_stream_id])
def run_test(self, only_automatic_fields=False): expected_streams = self.streams_to_select() conn_id = connections.ensure_connection(self) runner.run_check_mode(self, conn_id) expected_stream_fields = dict() found_catalogs = menagerie.get_catalogs(conn_id) for catalog in found_catalogs: stream_name = catalog['stream_name'] catalog_entry = menagerie.get_annotated_schema(conn_id, catalog['stream_id']) if not stream_name in expected_streams: continue # select catalog fields self.select_found_catalogs(conn_id, [catalog], only_streams=[stream_name], deselect_all_fields=True if only_automatic_fields else False, non_selected_props=[] if only_automatic_fields else self.non_selected_fields[stream_name]) # add expected fields for assertion fields_from_field_level_md = [md_entry['breadcrumb'][1] for md_entry in catalog_entry['metadata'] if md_entry['breadcrumb'] != []] if only_automatic_fields: expected_stream_fields[stream_name] = self.expected_primary_keys()[stream_name] | self.expected_replication_keys()[stream_name] else: expected_stream_fields[stream_name] = set(fields_from_field_level_md) - set(self.non_selected_fields[stream_name]) self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() for stream in expected_streams: with self.subTest(stream=stream): expected_primary_keys = self.expected_primary_keys()[stream] # get expected keys expected_keys = expected_stream_fields[stream] # collect all actual values messages = synced_records.get(stream) # collect actual synced fields actual_keys = [set(message['data'].keys()) for message in messages['messages'] if message['action'] == 'upsert'][0] fields = self.fields_to_remove.get(stream) or [] expected_keys = expected_keys - set(fields) # verify expected and actual fields self.assertEqual(expected_keys, actual_keys, msg='Selected keys in catalog is not as expected') # Verify we did not duplicate any records across pages records_pks_set = {tuple([message.get('data').get(primary_key) for primary_key in expected_primary_keys]) for message in messages.get('messages')} records_pks_list = [tuple([message.get('data').get(primary_key) for primary_key in expected_primary_keys]) for message in messages.get('messages')] self.assertCountEqual(records_pks_set, records_pks_list, msg="We have duplicate records for {}".format(stream))
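# The field-selection test above reads two class-level dictionaries, non_selected_fields
# and fields_to_remove, that are defined elsewhere on the test class. A minimal sketch of
# how they might be declared is shown below; the stream and field names are placeholders
# for illustration, not the tap's actual fields.
non_selected_fields = {
    # fields explicitly de-selected from the catalog for this stream
    "some_stream": {"optional_field_1", "optional_field_2"},
}

fields_to_remove = {
    # fields present in the catalog but never emitted by the API, so they are
    # dropped from the expected-keys assertion
    "some_stream": ["deprecated_field"],
}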
def test_run(self): """ Verify that for each stream you can get multiple pages of data when no fields are selected and only the automatic fields are replicated. PREREQUISITE For EACH stream add enough data that you surpass the limit of a single fetch of data. For instance if you have a limit of 250 records ensure that 251 (or more) records have been posted for that stream. """ expected_streams = self.expected_streams() # instantiate connection conn_id = connections.ensure_connection(self) # run check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # table and field selection test_catalogs_automatic_fields = [ catalog for catalog in found_catalogs if catalog.get('stream_name') in expected_streams ] self.perform_and_verify_table_and_field_selection( conn_id, test_catalogs_automatic_fields, select_all_fields=False, ) # run initial sync record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() for stream in expected_streams: with self.subTest(stream=stream): # expected values expected_keys = self.expected_automatic_fields().get(stream) # collect actual values data = synced_records.get(stream, {}) record_messages_keys = [ set(row.get('data').keys()) for row in data.get('messages', {}) ] # Verify that you get some records for each stream self.assertGreater( record_count_by_stream.get(stream, -1), 0, msg= "The number of records is not over the stream max limit for the {} stream" .format(stream)) # Verify that only the automatic fields are sent to the target for actual_keys in record_messages_keys: self.assertSetEqual(expected_keys, actual_keys)
def test_run(self):
    page_size = 250
    conn_id = connections.ensure_connection(self)

    # Check pagination only for streams with enough data
    expected_streams = [
        "addresses",
        "customers",
        "discounts",
        "metafields_subscription",
        "onetimes",
    ]

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # table and field selection
    test_catalogs = [catalog for catalog in found_catalogs
                     if catalog.get('stream_name') in expected_streams]
    self.perform_and_verify_table_and_field_selection(conn_id, test_catalogs)

    record_count_by_stream = self.run_and_verify_sync(conn_id)
    synced_records = runner.get_records_from_target_output()

    for stream in expected_streams:
        with self.subTest(stream=stream):

            # expected values
            expected_primary_keys = self.expected_primary_keys()

            # collect information for assertions based on the expected values
            record_count_sync = record_count_by_stream.get(stream, 0)
            primary_keys_list = [
                tuple(message.get('data').get(expected_pk)
                      for expected_pk in expected_primary_keys[stream])
                for message in synced_records.get(stream).get('messages')
                if message.get('action') == 'upsert'
            ]

            # verify more records than the page size were replicated, so pagination was exercised
            self.assertGreater(record_count_sync, page_size)

            primary_keys_list_1 = primary_keys_list[:page_size]
            primary_keys_list_2 = primary_keys_list[page_size:2 * page_size]
            primary_keys_page_1 = set(primary_keys_list_1)
            primary_keys_page_2 = set(primary_keys_list_2)

            # Verify by primary keys that data is unique within a page
            self.assertEqual(len(primary_keys_page_1), page_size)  # no duplicates on a page

            # Verify there are no duplicates between pages
            self.assertTrue(primary_keys_page_1.isdisjoint(primary_keys_page_2))
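# The page-slicing check above recurs in several of these pagination tests. A minimal,
# hypothetical helper the tests could share is sketched below; the method name
# assert_pages_are_unique and its placement on the test base class are assumptions for
# illustration, not part of the existing tap-tester framework.
def assert_pages_are_unique(self, primary_keys_list, page_size):
    """Assert there are no duplicate primary keys within or across the first two pages."""
    page_1 = primary_keys_list[:page_size]
    page_2 = primary_keys_list[page_size:2 * page_size]

    # within-page uniqueness: converting to a set must not lose any entries
    self.assertEqual(len(set(page_1)), len(page_1))
    self.assertEqual(len(set(page_2)), len(page_2))

    # across-page uniqueness: the two pages must not share any primary keys
    self.assertTrue(set(page_1).isdisjoint(set(page_2)))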
def verify_day_column(self):
    synced_records = runner.get_records_from_target_output()
    for stream in self.expected_sync_streams():
        for message in synced_records[stream]['messages']:
            if message['action'] == 'upsert' and stream not in {'accounts', 'ads', 'campaigns', 'ad_groups'}:
                self.assertIsNotNone(message['data'].get('day'))
def test_run(self): # Select our catalogs # found_catalogs = menagerie.get_catalogs(conn_id) # our_catalogs = [c for c in found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams()] # for c in our_catalogs: # c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id']) # c_metadata = metadata.to_map(c_annotated['metadata']) # connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], []) conn_id = self.create_connection() # Clear state before our run menagerie.set_state(conn_id, {}) # Select a stream found_catalogs = menagerie.get_catalogs(conn_id) our_catalogs = [catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in self.expected_sync_streams()] self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=False) # Run a sync job using orchestrator sync_job_name = runner.run_sync_mode(self, conn_id) # Verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # Verify actual rows were synced record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = sum(record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) # Ensure all records have a value for PK(s) records = runner.get_records_from_target_output() for stream in self.expected_sync_streams(): messages = records.get(stream, {}).get('messages') for m in messages: pk_set = self.expected_pks()[stream] for pk in pk_set: self.assertIsNotNone(m.get('data', {}).get(pk), msg="oh no! {}".format(m)) bookmarks = menagerie.get_state(conn_id)['bookmarks'] replication_methods = self.expected_replication_method() for stream in self.expected_sync_streams(): with self.subTest(stream=stream): replication_method = replication_methods.get(stream) if replication_method is self.INCREMENTAL: self.assertTrue(stream in bookmarks) elif replication_method is self.FULL_TABLE: self.assertTrue(stream not in bookmarks) else: raise NotImplementedError( "stream {} has an invalid replication method {}".format(stream, replication_method) )
def first_sync_test(self, table_configs, conn_id): # run first full table sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() expected_pks = {} for config in table_configs: key = {config['HashKey']} if config.get('SortKey'): key |= {config.get('SortKey')} expected_pks[config['TableName']] = key # assert that each of the streams that we synced are the ones that we expect to see record_count_by_stream = runner.examine_target_output_file( self, conn_id, {x['TableName'] for x in table_configs}, expected_pks) state = menagerie.get_state(conn_id) state_version = menagerie.get_state_version(conn_id) first_versions = {} # assert that we get the correct number of records for each stream for config in table_configs: table_name = config['TableName'] self.assertEqual(config['num_rows'], record_count_by_stream[table_name]) # assert that an activate_version_message is first and last message sent for each stream self.assertEqual( 'activate_version', records_by_stream[table_name]['messages'][0]['action']) self.assertEqual( 'activate_version', records_by_stream[table_name]['messages'][-1]['action']) # assert that the state has an initial_full_table_complete == True self.assertTrue( state['bookmarks'][table_name]['initial_full_table_complete']) # assert that there is a version bookmark in state first_versions[table_name] = state['bookmarks'][table_name][ 'version'] self.assertIsNotNone(first_versions[table_name]) # Write state with missing finished_shards so it # re-reads data from all shards # This should result in the next sync having same number of records # as the full table sync state['bookmarks'][table_name].pop('finished_shards') menagerie.set_state(conn_id, state, version=state_version)
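# For reference, a sketch of the bookmark shape the block above manipulates, based on the
# keys it reads and removes. The table name and values are illustrative only, not actual
# tap output.
example_state = {
    "bookmarks": {
        "some_table": {
            "initial_full_table_complete": True,
            "version": 1234567890123,
            # popped before the next sync so every shard is re-read
            "finished_shards": ["shardId-000000000000"],
        }
    }
}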
def test_run(self): """ • Verify we can deselect all fields except when inclusion=automatic, which is handled by base.py methods • Verify that only the automatic fields are sent to the target. • Verify that all replicated records have unique primary key values. """ # We are not able to generate test data so skipping two streams(mark_as_spam, dropped_email) expected_streams = self.expected_streams() - {"mark_as_spam", "dropped_email"} conn_id = connections.ensure_connection(self) # Run in check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # table and field selection test_catalogs = [catalog for catalog in found_catalogs if catalog.get('stream_name') in expected_streams] # Select all streams and no fields within streams self.perform_and_verify_table_and_field_selection( conn_id, test_catalogs, select_all_fields=False) record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() for stream in expected_streams: with self.subTest(stream=stream): # expected values expected_keys = self.expected_automatic_fields().get(stream) expected_primary_keys = self.expected_primary_keys()[stream] # collect actual values data = synced_records.get(stream, {}) record_messages_keys = [set(row['data'].keys()) for row in data.get('messages', [])] primary_keys_list = [tuple(message.get('data').get(expected_pk) for expected_pk in expected_primary_keys) for message in data.get('messages') if message.get('action') == 'upsert'] unique_primary_keys_list = set(primary_keys_list) # Verify that you get some records for each stream self.assertGreater( record_count_by_stream.get(stream, -1), 0, msg="The number of records is not over the stream min limit") # Verify that only the automatic fields are sent to the target for actual_keys in record_messages_keys: self.assertSetEqual(expected_keys, actual_keys, msg="The fields sent to the target are not the automatic fields") #Verify that all replicated records have unique primary key values. self.assertEqual(len(primary_keys_list), len(unique_primary_keys_list), msg="Replicated record does not have unique primary key values.")
def test_catalog_without_properties(self): self.setUpTestEnvironment() runner.run_check_job_and_check_status(self) found_catalogs = menagerie.get_catalogs(self.conn_id) self.assertEqual(len(found_catalogs), 1, msg="unable to locate schemas for connection {}".format(self.conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) subset = self.expected_streams().issubset(found_catalog_names) self.assertTrue( subset, msg="Expected check streams are not subset of discovered catalog") our_catalogs = [c for c in found_catalogs if c.get( 'tap_stream_id') in self.expected_streams()] # Select our catalogs for c in our_catalogs: c_annotated = menagerie.get_annotated_schema( self.conn_id, c['stream_id']) connections.select_catalog_and_fields_via_metadata( self.conn_id, c, c_annotated, [], []) # Clear state before our run menagerie.set_state(self.conn_id, {}) # Run a sync job using orchestrator sync_job_name = runner.run_sync_mode(self, self.conn_id) # Verify tap and target exit codes exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) synced_records = runner.get_records_from_target_output() upsert_messages = [m for m in synced_records.get( 'csv_with_empty_lines').get('messages') if m['action'] == 'upsert'] records = [message.get('data') for message in upsert_messages] #Empty line should be ignored in emitted records. expected_records = [ {'id': 1, 'name': 'John', '_sdc_extra': [{'name': 'carl'}], '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets', '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 2}, {'id': 2, 'name': 'Bob', '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets', '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 3}, {'id': 3, '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets', '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 4}, {'id': 4, 'name': 'Alice', '_sdc_extra': [{'no_headers': ['Ben', '5']}, { 'name': 'Barak'}], '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets', '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 5} ] self.assertListEqual(expected_records, records)
def test_run(self): conn_id = connections.ensure_connection(self) #run in check mode check_job_name = runner.run_check_mode(self, conn_id) #verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names ) self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) print("discovered schemas are OK") #select all catalogs for c in found_catalogs: catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id']) if c['stream_name'] in self.expected_sync_streams().keys(): stream = c['stream_name'] pks = self.expected_sync_streams()[stream] for pk in pks: mdata = next((m for m in catalog_entry['metadata'] if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == pk), None) print("Validating inclusion on {}: {}".format(c['stream_name'], mdata)) self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic') connections.select_catalog_and_fields_via_metadata(conn_id, c, catalog_entry) #clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) #verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) first_record_count_by_stream = runner.examine_target_output_file(self, conn_id, set(self.expected_sync_streams().keys()), self.expected_sync_streams()) replicated_row_count = reduce(lambda accum,c : accum + c, first_record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(first_record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) # Verify that automatic fields are all emitted with records synced_records = runner.get_records_from_target_output() for stream_name, data in synced_records.items(): record_messages = [set(row['data'].keys()) for row in data['messages']] self.assertGreater(len(record_messages), 0, msg="stream {} did not sync any records.".format(stream_name)) for record_keys in record_messages: self.assertEqual(self.expected_sync_streams().get(stream_name, set()) - record_keys, set())
def test_run(self): conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names ) self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) print("discovered schemas are kosher") all_excluded_fields = {} # select all catalogs for c in found_catalogs: if c['stream_name'] == 'ads': continue discovered_schema = menagerie.get_annotated_schema(conn_id, c['stream_id'])['annotated-schema'] all_excluded_fields[c['stream_name']] = list(set(discovered_schema.keys()) - self.expected_automatic_fields().get(c['stream_name'], set()))[:5] connections.select_catalog_and_fields_via_metadata( conn_id, c, discovered_schema, non_selected_fields=all_excluded_fields[c['stream_name']]) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # This should be validating the the PKs are written in each record record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum,c : accum + c, record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) synced_records = runner.get_records_from_target_output() self.assertTrue('ads' not in synced_records.keys()) for stream_name, data in synced_records.items(): record_messages = [set(row['data'].keys()) for row in data['messages']] for record_keys in record_messages: # The intersection should be empty self.assertFalse(record_keys.intersection(all_excluded_fields[stream_name]))
def test_run(self): """ Ensure running the tap with all streams selected and all fields deselected results in the replication of just the primary keys and replication keys (automatic fields). - Verify we can deselect all fields except when inclusion=automatic (SaaS Taps). - Verify that only the automatic fields are sent to the target. """ expected_streams = self.expected_sync_streams() # instantiate connection conn_id = connections.ensure_connection(self) # run check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # table and field selection test_catalogs_automatic_fields = [ catalog for catalog in found_catalogs if catalog.get('stream_name') in expected_streams ] self.perform_and_verify_table_and_field_selection( conn_id, test_catalogs_automatic_fields, select_all_fields=False, ) # run initial sync record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() for stream in expected_streams: with self.subTest(stream=stream): # expected values expected_keys = self.expected_automatic_fields().get(stream) # collect actual values messages = synced_records.get(stream) record_messages_keys = [ set(message['data'].keys()) for message in messages['messages'] if message['action'] == 'upsert' ] # Verify that you get some records for each stream self.assertGreater(record_count_by_stream.get(stream, -1), 0) # Verify that only the automatic fields are sent to the target # BUG TDL-14241 | Replication keys are not automatic if stream == "file_metadata": expected_keys.remove('modifiedTime') for actual_keys in record_messages_keys: self.assertSetEqual(expected_keys, actual_keys)
def pagination_test_run(self): """ Testing that sync creates the appropriate catalog with valid metadata. • Verify that all fields and all streams have selected set to True in the metadata """ page_size = 100 # Page size for events conn_id = connections.ensure_connection(self) # Expected stream is only events expected_streams = ["events"] found_catalogs = self.run_and_verify_check_mode(conn_id) # table and field selection test_catalogs = [ catalog for catalog in found_catalogs if catalog.get('stream_name') in expected_streams ] self.perform_and_verify_table_and_field_selection( conn_id, test_catalogs) record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() for stream in expected_streams: with self.subTest(stream=stream): # expected values expected_primary_keys = self.expected_primary_keys()[stream] # collect information for assertions from syncs 1 & 2 base on expected values record_count_sync = record_count_by_stream.get(stream, 0) primary_keys_list = [ tuple( message.get('data').get(expected_pk) for expected_pk in expected_primary_keys) for message in synced_records.get(stream).get('messages') if message.get('action') == 'upsert' ] # verify records are more than page size so multiple page is working self.assertGreater(record_count_sync, page_size) if record_count_sync > page_size: primary_keys_list_1 = primary_keys_list[:page_size] primary_keys_list_2 = primary_keys_list[page_size:2 * page_size] primary_keys_page_1 = set(primary_keys_list_1) primary_keys_page_2 = set(primary_keys_list_2) #Verify by private keys that data is unique for page self.assertTrue( primary_keys_page_1.isdisjoint(primary_keys_page_2))
def test_run(self): conn_id = connections.ensure_connection(self) found_catalogs = self.run_and_verify_check_mode(conn_id) # Select only the expected streams tables expected_streams = self.expected_streams() catalog_entries = [ce for ce in found_catalogs if ce['tap_stream_id'] in expected_streams] self.select_all_streams_and_fields(conn_id, catalog_entries, select_all_fields=False) # Verify our selection worked as expected catalogs_selection = menagerie.get_catalogs(conn_id) for cat in catalogs_selection: catalog_entry = menagerie.get_annotated_schema(conn_id, cat['stream_id']) # Verify the expected stream tables are selected selected = catalog_entry.get('annotated-schema').get('selected') print("Validating selection on {}: {}".format(cat['stream_name'], selected)) if cat['stream_name'] not in expected_streams: self.assertFalse(selected, msg="Stream selected, but not testable.") continue # Skip remaining assertions if we aren't selecting this stream self.assertTrue(selected, msg="Stream not selected.") # Verify only automatic fields are selected expected_automatic_fields = self.expected_automatic_fields().get(cat['tap_stream_id']) selected_fields = self.get_selected_fields_from_metadata(catalog_entry['metadata']) self.assertEqual(expected_automatic_fields, selected_fields, msg='for stream {}, expected: {} actual: {}'.format(cat['stream_name'], expected_automatic_fields, selected_fields)) # Run a sync job using orchestrator sync_record_count = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() # Assert the records for each stream for stream in self.expected_streams(): with self.subTest(stream=stream): data = synced_records.get(stream) if not data: print('WARNING: Add data for {}'.format(stream)) continue record_messages_keys = [set(row['data'].keys()) for row in data['messages']] expected_keys = self.expected_automatic_fields().get(stream) # Verify that only the automatic fields are sent to the target for actual_keys in record_messages_keys: self.assertEqual( actual_keys.symmetric_difference(expected_keys), set(), msg="Expected automatic fields and nothing else.") # Verify the sync meets or exceeds the default record count record_count = sync_record_count.get(stream, 0) self.assertLessEqual(1, record_count)
def test_run(self):
    page_size = 1
    conn_id = connections.ensure_connection(self)

    # "ad_analytics_by_creative" and "ad_analytics_by_campaign" do not support pagination
    # Documentation: https://docs.microsoft.com/en-us/linkedin/marketing/integrations/ads-reporting/ads-reporting?tabs=http
    expected_streams = self.expected_streams() - {"ad_analytics_by_campaign", "ad_analytics_by_creative"}

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # table and field selection
    test_catalogs = [catalog for catalog in found_catalogs
                     if catalog.get('stream_name') in expected_streams]
    self.perform_and_verify_table_and_field_selection(conn_id, test_catalogs)

    record_count_by_stream = self.run_and_verify_sync(conn_id)
    synced_records = runner.get_records_from_target_output()

    for stream in expected_streams:
        with self.subTest(stream=stream):

            # expected values
            expected_primary_keys = self.expected_primary_keys()[stream]

            # collect information for assertions from the sync based on the expected values
            record_count_sync = record_count_by_stream.get(stream, 0)
            primary_keys_list = [
                tuple(message.get('data').get(expected_pk)
                      for expected_pk in expected_primary_keys)
                for message in synced_records.get(stream).get('messages')
                if message.get('action') == 'upsert'
            ]

            # verify more records than the page size were replicated, so pagination was exercised
            self.assertGreater(record_count_sync, page_size)

            if record_count_sync > page_size:
                primary_keys_list_1 = primary_keys_list[:page_size]
                primary_keys_list_2 = primary_keys_list[page_size:2 * page_size]
                primary_keys_page_1 = set(primary_keys_list_1)
                primary_keys_page_2 = set(primary_keys_list_2)

                # Verify by primary keys that data is unique across pages
                self.assertTrue(primary_keys_page_1.isdisjoint(primary_keys_page_2))
def test_run(self): # page size for "deals" page_size = 100 conn_id = connections.ensure_connection(self) # Checking pagination for "deals" stream expected_streams = ["deals"] found_catalogs = self.run_and_verify_check_mode(conn_id) # table and field selection test_catalogs = [ catalog for catalog in found_catalogs if catalog.get('stream_name') in expected_streams ] self.perform_and_verify_table_and_field_selection( conn_id, test_catalogs) record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() for stream in expected_streams: with self.subTest(stream=stream): # expected values expected_primary_keys = self.expected_primary_keys() # collect information for assertions from syncs 1 & 2 base on expected values record_count_sync = record_count_by_stream.get(stream, 0) primary_keys_list = [ (message.get('data').get(expected_pk) for expected_pk in expected_primary_keys) for message in synced_records.get(stream).get('messages') if message.get('action') == 'upsert' ] # verify records are more than page size so multiple page is working self.assertGreater(record_count_sync, page_size) if record_count_sync > page_size: primary_keys_list_1 = primary_keys_list[:page_size] primary_keys_list_2 = primary_keys_list[page_size:2 * page_size] primary_keys_page_1 = set(primary_keys_list_1) primary_keys_page_2 = set(primary_keys_list_2) # Verify by private keys that data is unique for page self.assertTrue( primary_keys_page_1.isdisjoint(primary_keys_page_2))
def verify_synthetic_columns(self):
    our_ccids = set(os.getenv('TAP_ADWORDS_CUSTOMER_IDS').split(","))
    synced_records = runner.get_records_from_target_output()

    for stream in self.expected_sync_streams():
        for message in synced_records[stream]['messages']:
            if message['action'] == 'upsert':
                self.assertIn(message.get('data').get('_sdc_customer_id'), our_ccids)

                if stream in {'accounts', 'ads', 'campaigns', 'ad_groups'}:
                    self.assertIsNone(message.get('data').get('_sdc_report_datetime'))
                else:
                    self.assertIsNotNone(message.get('data').get('_sdc_report_datetime'))
def test_run(self): conn_id = self.create_connection() # Select our catalogs our_catalogs = [ c for c in self.found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams() ] for c in our_catalogs: c_annotated = menagerie.get_annotated_schema( conn_id, c['stream_id']) c_metadata = metadata.to_map(c_annotated['metadata']) connections.select_catalog_and_fields_via_metadata( conn_id, c, c_annotated, [], []) # Clear state before our run menagerie.set_state(conn_id, {}) # Run a sync job using orchestrator sync_job_name = runner.run_sync_mode(self, conn_id) # Verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # Verify actual rows were synced record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format( record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) # Ensure all records have a value for PK(s) records = runner.get_records_from_target_output() for stream in self.expected_sync_streams(): messages = records.get(stream).get('messages') for m in messages: pk_set = self.expected_pks()[stream] for pk in pk_set: self.assertIsNotNone(m.get('data', {}).get(pk), msg="oh no! {}".format(m)) bookmarks = menagerie.get_state(conn_id)['bookmarks'] self.assertTrue('orders' in bookmarks)
def automatic_fields_test_run(self): """ Testing that all the automatic fields are replicated despite de-selecting them - Verify that only the automatic fields are sent to the target. - Verify that all replicated records have unique primary key values. """ untestable_streams = {'quotes'} # For V2, we have 0 records for 'quotes' stream # Skipping streams virtual_bank_accounts, gifts and orders as we are not able to generate data expected_streams = self.expected_streams() - {'virtual_bank_accounts', 'gifts', 'orders'} # skip quotes for product catalog V2 if not self.is_product_catalog_v1: expected_streams = expected_streams - untestable_streams conn_id = connections.ensure_connection(self) found_catalogs = self.run_and_verify_check_mode(conn_id) # Select all streams and no fields within streams self.perform_and_verify_table_and_field_selection(conn_id, found_catalogs, select_all_fields=False) record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() for stream in expected_streams: with self.subTest(stream=stream): # expected values expected_primary_keys = self.expected_primary_keys()[stream] expected_keys = self.expected_automatic_fields().get(stream) # collect actual values messages = synced_records.get(stream) record_messages_keys = [set(row['data'].keys()) for row in messages['messages']] # check if the stream has collected some records self.assertGreater(record_count_by_stream.get(stream, 0), 0) # Verify that only the automatic fields are sent to the target for actual_keys in record_messages_keys: self.assertSetEqual(expected_keys, actual_keys) # Verify we did not duplicate any records across pages records_pks_list = [tuple([message.get('data').get(primary_key) for primary_key in expected_primary_keys]) for message in messages.get('messages')] self.assertCountEqual(records_pks_list, set(records_pks_list), msg="We have duplicate records for {}".format(stream))
def test_run(self): conn_id = connections.ensure_connection(self) found_catalogs = self.run_and_verify_check_mode(conn_id) # Select only the expected streams tables expected_streams = self.expected_streams() catalog_entries = [ ce for ce in found_catalogs if ce['tap_stream_id'] in expected_streams ] self.select_all_streams_and_fields(conn_id, catalog_entries, select_all_fields=True) sync_record_count = self.run_and_verify_sync(conn_id) sync_records = runner.get_records_from_target_output() # Test by stream for stream in self.expected_streams(): with self.subTest(stream=stream): record_count = sync_record_count.get(stream, 0) sync_messages = sync_records.get(stream, { 'messages': [] }).get('messages') primary_key = self.expected_primary_keys().get(stream).pop() # Verify the sync meets or exceeds the default record count stream_page_size = self.expected_page_size()[stream] self.assertLess(stream_page_size, record_count) # Verify we did not duplicate any records across pages records_pks_set = { message.get('data').get(primary_key) for message in sync_messages } records_pks_list = [ message.get('data').get(primary_key) for message in sync_messages ] self.assertCountEqual( records_pks_set, records_pks_list, msg="We have duplicate records for {}".format(stream))
def test_run(self): conn_id = connections.ensure_connection(self) found_catalogs = self.run_and_verify_check_mode(conn_id) # Select only the expected streams tables expected_streams = self.testable_streams() catalog_entries = [ ce for ce in found_catalogs if ce['tap_stream_id'] in expected_streams ] for catalog_entry in catalog_entries: stream_schema = menagerie.get_annotated_schema( conn_id, catalog_entry['stream_id']) connections.select_catalog_and_fields_via_metadata( conn_id, catalog_entry, stream_schema) # Run sync first_record_count_by_stream = self.run_and_verify_sync(conn_id) replicated_row_count = sum(first_record_count_by_stream.values()) synced_records = runner.get_records_from_target_output() # Test by Stream for stream in self.testable_streams(): with self.subTest(stream=stream): expected_fields = set( synced_records.get(stream)['schema']['properties'].keys()) print('Number of expected keys ', len(expected_fields)) actual_fields = set( runner.examine_target_output_for_fields()[stream]) print('Number of actual keys ', len(actual_fields)) print('Number of known missing keys ', len(KNOWN_MISSING_FIELDS[stream])) unexpected_fields = actual_fields & KNOWN_MISSING_FIELDS[stream] if unexpected_fields: print('WARNING: Found new fields: {}'.format( unexpected_fields)) self.assertSetEqual( expected_fields, actual_fields | KNOWN_MISSING_FIELDS[stream])
def test_run(self): """ Verify shop information fields are present in catalog for every streams. Verify shop information fields are present in every records of all streams. """ conn_id = self.create_connection(original_properties=False, original_credentials=False) # Select all streams and all fields within streams and run both mode found_catalogs = menagerie.get_catalogs(conn_id) our_catalogs = [catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in self.expected_streams()] self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=True) sync_records_count = self.run_sync(conn_id) sync_records = runner.get_records_from_target_output() expected_shop_info_fields = {'_sdc_shop_id', '_sdc_shop_name', '_sdc_shop_myshopify_domain'} for stream in self.expected_streams(): with self.subTest(stream=stream): # Verify that every stream schema contains shop info fields catalog = next(iter([catalog for catalog in found_catalogs if catalog["stream_name"] == stream])) schema_and_metadata = menagerie.get_annotated_schema(conn_id, catalog['stream_id']) metadata = schema_and_metadata["metadata"] actual_stream_fields = {item.get("breadcrumb", ["properties", None])[1] for item in metadata if item.get("breadcrumb", []) != []} self.assertTrue(expected_shop_info_fields.issubset(actual_stream_fields)) # Verify that every records of stream contains shop info fields stream_records = sync_records.get(stream, {}) upsert_messages = [m for m in stream_records.get('messages') if m['action'] == 'upsert'] for message in upsert_messages: actual_record_fields = set(message['data'].keys()) self.assertTrue(expected_shop_info_fields.issubset(actual_record_fields))
def run_test(self, child_streams): """ Testing that tap is working fine if only child streams are selected - Verify that if only child streams are selected then only child stream are replicated. """ # instantiate connection conn_id = connections.ensure_connection(self) # run check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # table and field selection self.select_found_catalogs(conn_id, found_catalogs, only_streams=child_streams) # run initial sync record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() # Verify no unexpected streams were replicated synced_stream_names = set(synced_records.keys()) self.assertSetEqual(child_streams, synced_stream_names)
def test_run(self): conn_id = connections.ensure_connection(self) # run in check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # select all catalogs for c in found_catalogs: catalog_entry = menagerie.get_annotated_schema( conn_id, c['stream_id']) for k in self.expected_primary_keys()[c['stream_name']]: mdata = next( (m for m in catalog_entry['metadata'] if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k), None) print("Validating inclusion on {}: {}".format( c['stream_name'], mdata)) self.assertTrue( mdata and mdata['metadata']['inclusion'] == 'automatic') connections.select_catalog_via_metadata(conn_id, c, catalog_entry) # clear state menagerie.set_state(conn_id, {}) # run a sync _ = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() for stream_name, data in synced_records.items(): record_messages = [ set(row['data'].keys()) for row in data['messages'] ] for record_keys in record_messages: # The symmetric difference should be empty self.assertEqual( record_keys, self.expected_automatic_fields().get(stream_name, set()))
def test_run(self): """ Verify for each stream that you can do a sync which records bookmarks. Verify that the bookmark is the max value sent to the target for the `date` PK field Verify that the 2nd sync respects the bookmark Verify that all data of the 2nd sync is >= the bookmark from the first sync Verify that the number of records in the 2nd sync is less then the first Verify inclusivivity of bookmarks PREREQUISITE For EACH stream that is incrementally replicated there are multiple rows of data with different values for the replication key """ print("\n\nTESTING IN SQUARE_ENVIRONMENT: {}".format( os.getenv('TAP_SQUARE_ENVIRONMENT'))) print("\n\nRUNNING {}\n\n".format(self.name())) # Instatiate static start date self.START_DATE = self.STATIC_START_DATE # Ensure tested streams have data expected_records_first_sync = self.create_test_data( self.testable_streams_static(), self.START_DATE) # Instantiate connection with default start conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # Select all testable streams and no fields within streams found_catalogs = menagerie.get_catalogs(conn_id) streams_to_select = self.testable_streams_static() our_catalogs = [ catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in streams_to_select ] self.select_all_streams_and_fields(conn_id, our_catalogs) # Run a sync job using orchestrator first_sync_record_count = self.run_sync(conn_id) # verify that the sync only sent records to the target for selected streams (catalogs) self.assertEqual( streams_to_select, set(first_sync_record_count.keys()), msg= "Expect first_sync_record_count keys {} to equal testable streams {}," " first_sync_record_count was {}".format( first_sync_record_count.keys(), streams_to_select, first_sync_record_count)) first_sync_state = menagerie.get_state(conn_id) # Get the set of records from a first sync runner.get_records_from_target_output() # Set expectations for 2nd sync expected_records_second_sync = {x: [] for x in self.expected_streams()} # adjust expectations for full table streams to include the expected records from sync 1 for stream in self.testable_streams_static(): if stream in self.expected_full_table_streams(): for record in expected_records_first_sync.get(stream, []): expected_records_second_sync[stream].append(record) # Run a second sync job using orchestrator second_sync_record_count = self.run_sync(conn_id) # Get the set of records from a second sync second_sync_records = runner.get_records_from_target_output() second_sync_state = menagerie.get_state(conn_id) # Loop first_sync_records and compare against second_sync_records for stream in self.testable_streams_static(): with self.subTest(stream=stream): second_sync_data = [ record.get("data") for record in second_sync_records.get( stream, {}).get("messages", {"data": {}}) ] # TESTING INCREMENTAL STREAMS if stream in self.expected_incremental_streams(): # Verify both syncs write / keep the same bookmark self.assertEqual( set(first_sync_state.get('bookmarks', {}).keys()), set(second_sync_state.get('bookmarks', {}).keys())) # Verify second sync's bookmarks move past the first sync's self.assertGreater( second_sync_state.get('bookmarks', { stream: {} }).get(stream, { 'updated_at': -1 }).get('updated_at'), first_sync_state.get('bookmarks', { stream: {} }).get(stream, { 
'updated_at': -1 }).get('updated_at')) # verify that there is more than 1 record of data - setup necessary self.assertGreater( first_sync_record_count.get(stream, 0), 1, msg="Data isn't set up to be able to test full sync") # verify that you get no data on the 2nd sync self.assertGreaterEqual( 0, second_sync_record_count.get(stream, 0), msg= "first sync didn't have more records, bookmark usage not verified" ) elif stream in self.expected_full_table_streams(): # TESTING FULL TABLE STREAMS # Verify no bookmarks are present first_state = first_sync_state.get('bookmarks', {}).get(stream) self.assertEqual({}, first_state, msg="Unexpected state for {}\n".format(stream) + \ "\tState: {}\n".format(first_sync_state) + \ "\tBookmark: {}".format(first_state)) second_state = second_sync_state.get('bookmarks', {}).get(stream) self.assertEqual({}, second_state, msg="Unexpected state for {}\n".format(stream) + \ "\tState: {}\n".format(second_sync_state) + \ "\tBookmark: {}".format(second_state)) # TESTING APPLICABLE TO ALL STREAMS # Verify that the expected records are replicated in the 2nd sync # For incremental streams we should see 0 records # For full table streams we should see the same records from the first sync expected_records = expected_records_second_sync.get(stream, []) self.assertEqual( len(expected_records), len(second_sync_data), msg= "Expected number of records do not match actual for 2nd sync.\n" + "Expected: {}\nActual: {}".format( len(expected_records), len(second_sync_data)))
def test_run(self): conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify discovery produced (at least) 1 expected catalog found_catalogs = [ found_catalog for found_catalog in menagerie.get_catalogs(conn_id) if found_catalog['tap_stream_id'] in self.expected_check_streams() ] self.assertGreaterEqual(len(found_catalogs), 1) # verify the tap discovered the expected streams found_catalog_names = { catalog['tap_stream_id'] for catalog in found_catalogs } self.assertSetEqual(self.expected_check_streams(), found_catalog_names) # verify that persisted streams have the correct properties test_catalog = found_catalogs[0] self.assertEqual(test_table_name, test_catalog['stream_name']) print("discovered streams are correct") # perform table selection print('selecting {} and all fields within the table'.format( test_table_name)) schema_and_metadata = menagerie.get_annotated_schema( conn_id, test_catalog['stream_id']) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'FULL_TABLE' } }] _ = connections.select_catalog_and_fields_via_metadata( conn_id, test_catalog, schema_and_metadata, additional_md) # clear state menagerie.set_state(conn_id, {}) # run sync job 1 and verify exit codes sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # get records record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) records_by_stream = runner.get_records_from_target_output() table_version_1 = records_by_stream[test_table_name]['table_version'] messages = records_by_stream[test_table_name]['messages'] # verify the execpted number of records were replicated self.assertEqual(3, record_count_by_stream[test_table_name]) # verify the message actions match expectations self.assertEqual(5, len(messages)) self.assertEqual('activate_version', messages[0]['action']) self.assertEqual('upsert', messages[1]['action']) self.assertEqual('upsert', messages[2]['action']) self.assertEqual('upsert', messages[3]['action']) self.assertEqual('activate_version', messages[4]['action']) # verify the persisted schema matches expectations self.assertEqual(expected_schemas[test_table_name], records_by_stream[test_table_name]['schema']) # verify replicated records match expectations self.assertDictEqual(self.expected_records[0], messages[1]['data']) self.assertDictEqual(self.expected_records[1], messages[2]['data']) self.assertDictEqual(self.expected_records[2], messages[3]['data']) print("records are correct") # grab bookmarked state state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'dev-public-postgres_full_table_replication_test'] # verify state and bookmarks meet expectations self.assertIsNone(state['currently_syncing']) self.assertIsNone(bookmark.get('lsn')) self.assertIsNone(bookmark.get('replication_key')) self.assertIsNone(bookmark.get('replication_key_value')) self.assertEqual(table_version_1, bookmark['version']) #---------------------------------------------------------------------- # invoke the sync job AGAIN and get the same 3 records #---------------------------------------------------------------------- # run sync job 2 and verify exit codes sync_job_name = 
runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # get records record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) records_by_stream = runner.get_records_from_target_output() table_version_2 = records_by_stream[test_table_name]['table_version'] messages = records_by_stream[test_table_name]['messages'] # verify the execpted number of records were replicated self.assertEqual(3, record_count_by_stream[test_table_name]) # verify the message actions match expectations self.assertEqual(4, len(messages)) self.assertEqual('upsert', messages[0]['action']) self.assertEqual('upsert', messages[1]['action']) self.assertEqual('upsert', messages[2]['action']) self.assertEqual('activate_version', messages[3]['action']) # verify the new table version increased on the second sync self.assertGreater(table_version_2, table_version_1) # verify the persisted schema still matches expectations self.assertEqual(expected_schemas[test_table_name], records_by_stream[test_table_name]['schema']) # verify replicated records still match expectations self.assertDictEqual(self.expected_records[0], messages[0]['data']) self.assertDictEqual(self.expected_records[1], messages[1]['data']) self.assertDictEqual(self.expected_records[2], messages[2]['data']) # grab bookmarked state state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'dev-public-postgres_full_table_replication_test'] # verify state and bookmarks meet expectations self.assertIsNone(state['currently_syncing']) self.assertIsNone(bookmark.get('lsn')) self.assertIsNone(bookmark.get('replication_key')) self.assertIsNone(bookmark.get('replication_key_value')) self.assertEqual(table_version_2, bookmark['version']) #---------------------------------------------------------------------- # invoke the sync job AGAIN following various manipulations to the data #---------------------------------------------------------------------- with db_utils.get_test_connection('dev') as conn: conn.autocommit = True with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: # NB | We will perform the following actions prior to the next sync: # [Action (EXPECTED RESULT)] # Insert a record # Insert a record to be updated prior to sync # Insert a record to be deleted prior to sync (NOT REPLICATED) # Update an existing record # Update a newly inserted record # Delete an existing record # Delete a newly inserted record # inserting... 
            # a new record
            nyc_tz = pytz.timezone('America/New_York')
            our_time_offset = "-04:00"
            our_ts = datetime.datetime(1996, 4, 4, 4, 4, 4, 733184)
            our_ts_tz = nyc_tz.localize(our_ts)
            our_time = datetime.time(6, 6, 6)
            our_time_tz = our_time.isoformat() + our_time_offset
            our_date = datetime.date(1970, 7, 1)
            my_uuid = str(uuid.uuid1())

            self.inserted_records.append({
                'our_varchar': "our_varchar 2", 'our_varchar_10': "varchar_10",
                'our_text': "some text 2", 'our_integer': 44101,
                'our_smallint': 2, 'our_bigint': 1000001,
                'our_decimal': decimal.Decimal('9876543210.02'),
                quote_ident('OUR TS', cur): our_ts,
                quote_ident('OUR TS TZ', cur): our_ts_tz,
                quote_ident('OUR TIME', cur): our_time,
                quote_ident('OUR TIME TZ', cur): our_time_tz,
                quote_ident('OUR DATE', cur): our_date,
                'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'),
                'our_boolean': True, 'our_bit': '1',
                'our_json': json.dumps({'nymn': 77}),
                'our_jsonb': json.dumps({'burgers': 'good++'}),
                'our_uuid': my_uuid, 'our_citext': 'cyclops 2',
                'our_store': 'dances=>"floor",name=>"betty"',
                'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04', 'our_money': '$0.98789',
            })
            self.expected_records.append({
                'id': 4,
                'our_varchar': "our_varchar 2", 'our_varchar_10': "varchar_10",
                'our_text': "some text 2", 'our_integer': 44101,
                'our_smallint': 2, 'our_bigint': 1000001,
                'our_decimal': decimal.Decimal('9876543210.02'),
                'OUR TS': self.expected_ts(our_ts),
                'OUR TS TZ': self.expected_ts_tz(our_ts_tz),
                'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz),
                'OUR DATE': '1970-07-01T00:00:00+00:00',
                'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'),
                'our_boolean': True, 'our_bit': True,
                'our_json': '{"nymn": 77}', 'our_jsonb': '{"burgers": "good++"}',
                'our_uuid': self.inserted_records[-1]['our_uuid'],
                'our_citext': self.inserted_records[-1]['our_citext'],
                'our_store': {"name": "betty", "dances": "floor"},
                'our_cidr': self.inserted_records[-1]['our_cidr'],
                'our_inet': self.inserted_records[-1]['our_inet'],
                'our_mac': self.inserted_records[-1]['our_mac'],
                'our_money': '$0.99',
                'our_alignment_enum': None,
            })

            # a new record which we will then update prior to sync
            our_ts = datetime.datetime(2007, 1, 1, 12, 12, 12, 222111)
            nyc_tz = pytz.timezone('America/New_York')
            our_ts_tz = nyc_tz.localize(our_ts)
            our_time = datetime.time(12, 11, 10)
            our_time_tz = our_time.isoformat() + "-04:00"
            our_date = datetime.date(1999, 9, 9)
            my_uuid = str(uuid.uuid1())

            self.inserted_records.append({
                'our_varchar': "our_varchar 4", 'our_varchar_10': "varchar_3",
                'our_text': "some text 4", 'our_integer': 55200,
                'our_smallint': 1, 'our_bigint': 100000,
                'our_decimal': decimal.Decimal('1234567899.99'),
                quote_ident('OUR TS', cur): our_ts,
                quote_ident('OUR TS TZ', cur): our_ts_tz,
                quote_ident('OUR TIME', cur): our_time,
                quote_ident('OUR TIME TZ', cur): our_time_tz,
                quote_ident('OUR DATE', cur): our_date,
                'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'),
                'our_boolean': True, 'our_bit': '0',
                'our_json': json.dumps('some string'),
                'our_jsonb': json.dumps(['burgers are good']),
                'our_uuid': my_uuid,
                'our_store': 'size=>"small",name=>"betty"',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04', 'our_money': None,
            })
            self.expected_records.append({
                'our_decimal': decimal.Decimal('1234567899.99'),
                'our_text': 'some text 4', 'our_bit': False,
                'our_integer': 55200, 'our_double': decimal.Decimal('1.1'),
                'id': 5,
                'our_json': self.inserted_records[-1]['our_json'],
                'our_boolean': True,
                'our_jsonb': self.inserted_records[-1]['our_jsonb'],
                'our_bigint': 100000,
                'OUR TS': self.expected_ts(our_ts),
                'OUR TS TZ': self.expected_ts_tz(our_ts_tz),
                'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz),
                'our_store': {"name": "betty", "size": "small"},
                'our_smallint': 1,
                'OUR DATE': '1999-09-09T00:00:00+00:00',
                'our_varchar': 'our_varchar 4',
                'our_uuid': self.inserted_records[-1]['our_uuid'],
                'our_real': decimal.Decimal('1.2'), 'our_varchar_10': 'varchar_3',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04', 'our_money': None,
                'our_alignment_enum': None,
            })

            # a new record to be deleted prior to sync
            our_ts = datetime.datetime(2111, 1, 1, 12, 12, 12, 222111)
            nyc_tz = pytz.timezone('America/New_York')
            our_ts_tz = nyc_tz.localize(our_ts)
            our_time = datetime.time(12, 11, 10)
            our_time_tz = our_time.isoformat() + "-04:00"
            our_date = datetime.date(1999, 9, 9)
            my_uuid = str(uuid.uuid1())

            self.inserted_records.append({
                'our_varchar': "our_varchar 4", 'our_varchar_10': "varchar_3",
                'our_text': "some text 4", 'our_integer': 55200,
                'our_smallint': 1, 'our_bigint': 100000,
                'our_decimal': decimal.Decimal('1234567899.99'),
                quote_ident('OUR TS', cur): our_ts,
                quote_ident('OUR TS TZ', cur): our_ts_tz,
                quote_ident('OUR TIME', cur): our_time,
                quote_ident('OUR TIME TZ', cur): our_time_tz,
                quote_ident('OUR DATE', cur): our_date,
                'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'),
                'our_boolean': True, 'our_bit': '0',
                'our_json': json.dumps('some string'),
                'our_jsonb': json.dumps(['burgers are good']),
                'our_uuid': my_uuid,
                'our_store': 'size=>"small",name=>"betty"',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04', 'our_money': None,
            })
            self.expected_records.append({
                'our_decimal': decimal.Decimal('1234567899.99'),
                'our_text': 'some text 4', 'our_bit': False,
                'our_integer': 55200, 'our_double': decimal.Decimal('1.1'),
                'id': 6,
                'our_json': self.inserted_records[-1]['our_json'],
                'our_boolean': True,
                'our_jsonb': self.inserted_records[-1]['our_jsonb'],
                'our_bigint': 100000,
                'OUR TS': self.expected_ts(our_ts),
                'OUR TS TZ': self.expected_ts_tz(our_ts_tz),
                'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz),
                'our_store': {"name": "betty", "size": "small"},
                'our_smallint': 1,
                'OUR DATE': '1999-09-09T00:00:00+00:00',
                'our_varchar': 'our_varchar 4',
                'our_uuid': self.inserted_records[-1]['our_uuid'],
                'our_real': decimal.Decimal('1.2'), 'our_varchar_10': 'varchar_3',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04', 'our_money': None,
                'our_alignment_enum': None,
            })

            db_utils.insert_record(cur, test_table_name, self.inserted_records[3])
            db_utils.insert_record(cur, test_table_name, self.inserted_records[4])
            db_utils.insert_record(cur, test_table_name, self.inserted_records[5])

            # updating ...
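            # Each UPDATE below is mirrored by an in-place edit of self.expected_records,
            # so the sync-3 assertions compare against post-update values. Assumed call
            # shape for the repo's helper (record_pk presumably matches the table's
            # integer id primary key; the exact SQL lives in db_utils):
            #
            #   db_utils.update_record(cur, canon_table_name, record_pk,
            #                          {"our_double": decimal.Decimal("6.6")})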
            # an existing record
            canon_table_name = db_utils.canonicalized_table_name(
                cur, test_schema_name, test_table_name)
            record_pk = 1
            our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
            our_ts_tz = nyc_tz.localize(our_ts)
            updated_data = {
                "OUR TS TZ": our_ts_tz,
                "our_double": decimal.Decimal("6.6"),
                "our_money": "$0.00",
            }
            self.expected_records[0]["OUR TS TZ"] = self.expected_ts_tz(our_ts_tz)
            self.expected_records[0]["our_double"] = decimal.Decimal("6.6")
            self.expected_records[0]["our_money"] = "$0.00"
            db_utils.update_record(cur, canon_table_name, record_pk, updated_data)

            # a newly inserted record
            canon_table_name = db_utils.canonicalized_table_name(
                cur, test_schema_name, test_table_name)
            record_pk = 5
            our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
            our_ts_tz = nyc_tz.localize(our_ts)
            updated_data = {
                "OUR TS TZ": our_ts_tz,
                "our_double": decimal.Decimal("6.6"),
                "our_money": "$0.00",
            }
            self.expected_records[4]["OUR TS TZ"] = self.expected_ts_tz(our_ts_tz)
            self.expected_records[4]["our_double"] = decimal.Decimal("6.6")
            self.expected_records[4]["our_money"] = "$0.00"
            db_utils.update_record(cur, canon_table_name, record_pk, updated_data)

            # deleting

            # an existing record
            record_pk = 2
            db_utils.delete_record(cur, canon_table_name, record_pk)

            # a newly inserted record
            record_pk = 6
            db_utils.delete_record(cur, canon_table_name, record_pk)

    #----------------------------------------------------------------------
    # invoke the sync job AGAIN after the various manipulations
    #----------------------------------------------------------------------

    # run sync job 3 and verify exit codes
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # get records
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    records_by_stream = runner.get_records_from_target_output()
    table_version_3 = records_by_stream[test_table_name]['table_version']
    messages = records_by_stream[test_table_name]['messages']

    # verify the expected number of records were replicated
    self.assertEqual(4, record_count_by_stream[test_table_name])

    # verify the message actions match expectations
    self.assertEqual(5, len(messages))
    self.assertEqual('upsert', messages[0]['action'])
    self.assertEqual('upsert', messages[1]['action'])
    self.assertEqual('upsert', messages[2]['action'])
    self.assertEqual('upsert', messages[3]['action'])
    self.assertEqual('activate_version', messages[4]['action'])

    # verify the table version increased again on the third sync
    self.assertGreater(table_version_3, table_version_2)

    # verify the persisted schema still matches expectations
    self.assertEqual(expected_schemas[test_table_name],
                     records_by_stream[test_table_name]['schema'])

    # NB | This is a little tough to track mentally so here's a breakdown of
    #      the order of operations by expected records indexes:
    #
    #      Prior to Sync 1
    #          insert 0, 1, 2
    #
    #      Prior to Sync 2
    #          No db changes
    #
    #      Prior to Sync 3
    #          insert 3, 4, 5
    #          update 0, 4
    #          delete 1, 5
    #
    #      Resulting Synced Records: 2, 3, 0, 4

    # verify replicated records still match expectations
    self.assertDictEqual(self.expected_records[2], messages[0]['data'])  # existing insert
    self.assertDictEqual(self.expected_records[3], messages[1]['data'])  # new insert
    self.assertDictEqual(self.expected_records[0], messages[2]['data'])  # existing update
    self.assertDictEqual(self.expected_records[4], messages[3]['data'])  # new insert / update

    # grab bookmarked state
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks']['dev-public-postgres_full_table_replication_test']

    # verify state and bookmarks meet expectations
    self.assertIsNone(state['currently_syncing'])
    self.assertIsNone(bookmark.get('lsn'))
    self.assertIsNone(bookmark.get('replication_key'))
    self.assertIsNone(bookmark.get('replication_key_value'))
    self.assertEqual(table_version_3, bookmark['version'])
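    # NB | The 'OUR TS' / 'OUR TS TZ' values asserted above come from self.expected_ts
    #      and self.expected_ts_tz, which are defined elsewhere in this test class. A
    #      rough, hypothetical sketch of the normalization they appear to perform
    #      (timestamps surface as ISO-8601 strings, timestamptz values converted to UTC):
    #
    #      def expected_ts(self, our_ts):
    #          return our_ts.isoformat()                          # naive ISO-8601
    #
    #      def expected_ts_tz(self, our_ts_tz):
    #          return our_ts_tz.astimezone(pytz.utc).isoformat()  # '...+00:00'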