def file_not_modified_test(self):
    # sync 1
    conn_id_1 = connections.ensure_connection(self)
    found_catalogs_1 = self.run_and_verify_check_mode(conn_id_1)
    self.perform_and_verify_table_and_field_selection(conn_id_1, found_catalogs_1)
    record_count_by_stream_1 = self.run_and_verify_sync(conn_id_1)
    self.assertGreater(sum(record_count_by_stream_1.values()), 0)

    # change the start date to "utcnow"
    self.START_DATE = dt.strftime(dt.utcnow(), "%Y-%m-%dT00:00:00Z")

    # sync 2
    conn_id_2 = connections.ensure_connection(self, original_properties=False)
    found_catalogs_2 = self.run_and_verify_check_mode(conn_id_2)
    self.perform_and_verify_table_and_field_selection(conn_id_2, found_catalogs_2)

    # no data was added, so the file is not modified; sync 2 should replicate no
    # records and raise the "failed to replicate any data" error
    try:
        self.run_and_verify_sync(conn_id_2)
    except AssertionError as e:
        self.assertRegex(str(e), r'failed to replicate any data')
def test_run(self): """Parametrized automatic fields test running against each replication method.""" # Test running a sync with no fields selected using full-table replication self.default_replication_method = self.FULL_TABLE full_table_conn_id = connections.ensure_connection(self) self.automatic_fields_test(full_table_conn_id) # NB | We expect primary keys and replication keys to have inclusion automatic for # key-based incremental replication. But that is only true for primary keys. # As a result we cannot run a sync with no fields selected. This BUG should not # be carried over into hp-postgres, but will not be fixed for this tap. # Test running a sync with no fields selected using key-based incremental replication # self.default_replication_method = self.INCREMENTAL # incremental_conn_id = connections.ensure_connection(self, original_properties=False) # self.automatic_fields_test(incremental_conn_id) # Test running a sync with no fields selected using logical replication self.default_replication_method = self.LOG_BASED with db_utils.get_test_connection('dev') as conn: conn.autocommit = True with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: db_utils.ensure_replication_slot(cur, test_db) log_based_conn_id = connections.ensure_connection( self, original_properties=False) self.automatic_fields_test(log_based_conn_id)
def file_modified_test(self):
    # sync 1
    conn_id_1 = connections.ensure_connection(self)
    found_catalogs_1 = self.run_and_verify_check_mode(conn_id_1)
    self.perform_and_verify_table_and_field_selection(conn_id_1, found_catalogs_1)
    record_count_by_stream_1 = self.run_and_verify_sync(conn_id_1)
    synced_records_1 = runner.get_records_from_target_output()

    # check that sync 1 replicated some records
    self.assertGreater(sum(record_count_by_stream_1.values()), 0)

    # change the start date to "utcnow"
    self.START_DATE = dt.strftime(dt.utcnow(), "%Y-%m-%dT00:00:00Z")

    # append some data to the file
    self.append_to_files()

    # sync 2
    conn_id_2 = connections.ensure_connection(self, original_properties=False)
    found_catalogs_2 = self.run_and_verify_check_mode(conn_id_2)
    self.perform_and_verify_table_and_field_selection(conn_id_2, found_catalogs_2)
    record_count_by_stream_2 = self.run_and_verify_sync(conn_id_2)
    synced_records_2 = runner.get_records_from_target_output()

    # check that sync 2 replicated some records
    self.assertGreater(sum(record_count_by_stream_2.values()), 0)

    # verify sync 2 replicated more records than sync 1 overall
    self.assertGreater(sum(record_count_by_stream_2.values()),
                       sum(record_count_by_stream_1.values()))

    for stream in self.expected_check_streams():
        expected_primary_keys = self.expected_pks()
        record_count_sync_1 = record_count_by_stream_1.get(stream, 0)
        record_count_sync_2 = record_count_by_stream_2.get(stream, 0)

        primary_keys_list_1 = [tuple(message.get('data').get(expected_pk)
                                     for expected_pk in expected_primary_keys)
                               for message in synced_records_1.get(stream).get('messages')
                               if message.get('action') == 'upsert']
        primary_keys_list_2 = [tuple(message.get('data').get(expected_pk)
                                     for expected_pk in expected_primary_keys)
                               for message in synced_records_2.get(stream).get('messages')
                               if message.get('action') == 'upsert']

        primary_keys_sync_1 = set(primary_keys_list_1)
        primary_keys_sync_2 = set(primary_keys_list_2)

        # Verify the number of records replicated in sync 2 is greater than the number
        # of records replicated in sync 1 for the stream
        self.assertGreater(record_count_sync_2, record_count_sync_1)

        # Verify the records replicated in sync 1 were also replicated in sync 2
        self.assertTrue(primary_keys_sync_1.issubset(primary_keys_sync_2))
def test_run(self):
    # sync 1
    conn_id_1 = connections.ensure_connection(self)
    found_catalogs_1 = self.run_and_verify_check_mode(conn_id_1)
    self.perform_and_verify_table_and_field_selection(conn_id_1, found_catalogs_1)
    record_count_by_stream_1 = self.run_and_verify_sync(conn_id_1)

    # check that sync 1 replicated some records
    self.assertGreater(sum(record_count_by_stream_1.values()), 0)

    # check that the record counts from sync 1 are as expected
    for tap_stream_id in self.expected_first_sync_streams():
        self.assertEqual(self.expected_first_sync_row_counts()[tap_stream_id],
                         record_count_by_stream_1[tap_stream_id])

    # create file "table_1_fileB"
    with self.get_test_connection() as client:
        root_dir = os.getenv('TAP_SFTP_ROOT_DIR')
        client.chdir(root_dir + '/tap_tester/folderA')

        file_group = self.get_files()[0]
        with client.open('table_1_fileB.csv', 'w') as f:
            writer = csv.writer(f)
            lines = [file_group['headers']] + file_group['generator'](file_group['num_rows'])
            writer.writerows(lines)

    # append some data to "table_1_fileA" and "table_3_fileA"
    self.append_to_files()

    # sync 2
    conn_id_2 = connections.ensure_connection(self)
    found_catalogs_2 = self.run_and_verify_check_mode(conn_id_2)
    self.perform_and_verify_table_and_field_selection(conn_id_2, found_catalogs_2)
    record_count_by_stream_2 = self.run_and_verify_sync(conn_id_2, second_sync=True)

    # check that sync 2 replicated some records
    self.assertGreater(sum(record_count_by_stream_2.values()), 0)

    # check that the record counts from sync 2 are as expected: the start date was not
    # modified, so we should receive all the data, both before and after appending
    for tap_stream_id in self.expected_second_sync_streams():
        self.assertEqual(self.expected_second_sync_row_counts()[tap_stream_id],
                         record_count_by_stream_2[tap_stream_id])
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")
def test_run(self):
    conn_id = connections.ensure_connection(self)
    runner.run_check_mode(self, conn_id)

    found_catalog = menagerie.get_catalog(conn_id)
    for catalog_entry in found_catalog['streams']:
        field_names_in_schema = set(catalog_entry['schema']['properties'].keys())
        field_names_in_breadcrumbs = {
            x['breadcrumb'][1] for x in catalog_entry['metadata']
            if len(x['breadcrumb']) == 2
        }
        self.assertEqual(field_names_in_schema, field_names_in_breadcrumbs)

        inclusions_set = {
            (x['breadcrumb'][1], x['metadata']['inclusion'])
            for x in catalog_entry['metadata']
            if len(x['breadcrumb']) == 2
        }

        # Validate that all fields are in metadata
        self.assertEqual(len(inclusions_set), len(field_names_in_schema))
        self.assertEqual({i[0] for i in inclusions_set}, field_names_in_schema)

        # Validate that all metadata['inclusion'] values are 'available'
        unique_inclusions = {i[1] for i in inclusions_set}
        self.assertTrue(
            len(unique_inclusions) == 1 and 'available' in unique_inclusions)
def test_run(self): """ Testing that sync creates the appropriate catalog with valid metadata. Verify that all fields and all streams have selected set to True in the metadata """ conn_id = connections.ensure_connection(self) found_catalogs = self.run_and_verify_check_mode(conn_id) # removed "ad_analytics_by_campaign" and "ad_analytics_by_creative" as # it makes lots of api calls so sync canary test for these streams is covered in the start date test expected_streams = self.expected_streams() - set( {"ad_analytics_by_campaign", "ad_analytics_by_creative"}) test_catalogs = [ catalog for catalog in found_catalogs if catalog.get('stream_name') in expected_streams ] self.perform_and_verify_table_and_field_selection( conn_id, test_catalogs) record_count_by_stream = self.run_and_verify_sync(conn_id) # check if all streams have collected records for stream in expected_streams: self.assertGreater(record_count_by_stream.get(stream, 0), 0)
def run_standard_sync(self, environment, data_type, select_all_fields=True):
    """
    Run the tap in check mode.
    Perform table selection based on testable streams.
    Select all fields or no fields based on the select_all_fields param.
    Run a sync.
    """
    conn_id = connections.ensure_connection(self, original_properties=False)

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    streams_to_select = self.testable_streams(environment, data_type)

    print("\n\nRUNNING {}".format(self.name()))
    print("WITH STREAMS: {}".format(streams_to_select))
    print("WITH START DATE: {}\n\n".format(self.START_DATE))

    self.perform_and_verify_table_and_field_selection(
        conn_id, found_catalogs, streams_to_select,
        select_all_fields=select_all_fields)

    return self.run_and_verify_sync(conn_id)
def test_run(self): """ Verify that we can get multiple pages of unique records for each stream """ conn_id = connections.ensure_connection(self) self.run_and_verify_check_mode(conn_id) self.select_and_verify_fields(conn_id) record_count_by_stream = self.run_and_verify_sync(conn_id) all_records_by_stream = runner.get_records_from_target_output() page_size = int(self.get_properties()['page_size']) for stream in self.expected_sync_streams(): with self.subTest(stream=stream): # Assert all expected streams synced at least a full pages of records self.assertGreater( record_count_by_stream.get(stream, 0), page_size, msg="{} did not sync more than a page of records".format(stream) ) records = [ x['data'] for x in all_records_by_stream[stream]['messages']] unique_records = self.get_unique_records(stream, records) self.assertGreater(len(unique_records), page_size)
def test_run(self): """ Verify that we can get multiple pages of automatic fields for each stream """ conn_id = connections.ensure_connection(self) self.run_and_verify_check_mode(conn_id) self.select_and_verify_fields(conn_id, select_all_fields=False) record_count_by_stream = self.run_and_verify_sync(conn_id) actual_fields_by_stream = runner.examine_target_output_for_fields() # Assert all expected streams synced at least a full pages of records for stream in self.expected_sync_streams(): with self.subTest(stream=stream): self.assertGreater( record_count_by_stream.get(stream, 0), int(self.get_properties()['page_size']), msg="{} did not sync more than a page of records".format( stream)) for stream_name, actual_fields in actual_fields_by_stream.items(): with self.subTest(stream=stream_name): self.assertSetEqual( self.expected_automatic_fields()[stream_name], actual_fields)
def test_run(self): """ Testing that all the automatic fields are replicated despite de-selecting them - Verify that only the automatic fields are sent to the target. - Verify that all replicated records have unique primary key values. """ conn_id = connections.ensure_connection(self) # we are getting duplicate records for 'id' fields for this stream # when asked support about this, but this is known behavior from the API side # Please refer card: https://jira.talendforge.org/browse/TDL-18686 for more details known_failing_streams = {"targeting_android_versions"} expected_streams = self.expected_streams( ) - known_failing_streams - self.stats_streams # run check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # de-select all the fields self.select_found_catalogs(conn_id, found_catalogs, only_streams=expected_streams, deselect_all_fields=True) # run sync record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() for stream in expected_streams: with self.subTest(stream=stream): # expected values expected_primary_keys = self.expected_primary_keys()[stream] expected_keys = expected_primary_keys | self.expected_replication_keys( )[stream] # collect actual values messages = synced_records.get(stream) record_messages_keys = [ set(row['data'].keys()) for row in messages['messages'] ] # check if the stream has collected some records self.assertGreater(record_count_by_stream.get(stream, 0), 0) # Verify that only the automatic fields are sent to the target for actual_keys in record_messages_keys: self.assertSetEqual(expected_keys, actual_keys) # Verify we did not duplicate any records across pages records_pks_list = [ tuple([ message.get('data').get(primary_key) for primary_key in expected_primary_keys ]) for message in messages.get('messages') ] self.assertCountEqual( records_pks_list, set(records_pks_list), msg="We have duplicate records for {}".format(stream))
def test_run(self): """ Run tap in check mode, then select all streams and all fields within streams. Run a sync and verify exit codes do not throw errors. This is meant to be a smoke test for the tap. If this is failing do not expect any other tests to pass. """ expected_streams = self.expected_streams() conn_id = connections.ensure_connection(self) found_catalogs = self.run_and_verify_check_mode(conn_id) test_catalogs = [ catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in expected_streams ] self.perform_and_verify_table_and_field_selection( conn_id, test_catalogs, select_all_fields=True) record_count_by_stream = self.run_and_verify_sync(conn_id) # Assert all expected streams synced at least one record for stream in self.expected_streams(): with self.subTest(stream=stream): self.assertGreater( record_count_by_stream.get(stream, 0), 0, msg="{} did not sync any records".format(stream))
def setUp(self):
    required_creds = {
        "client_id": 'TAP_XERO_CLIENT_ID',
        "client_secret": 'TAP_XERO_CLIENT_SECRET',
        "refresh_token": 'TAP_XERO_REFRESH_TOKEN',
    }
    required_props = {
        "tenant_id": 'TAP_XERO_TENANT_ID',
        "xero_user_id": 'TAP_XERO_USER_ID'
    }
    missing_creds = [v for v in required_creds.values() if not os.getenv(v)]
    missing_props = [v for v in required_props.values() if not os.getenv(v)]
    if missing_creds or missing_props:
        missing_envs = missing_creds + missing_props
        raise Exception("set " + ", ".join(missing_envs))
    self._credentials = {k: os.getenv(v) for k, v in required_creds.items()}
    self.conn_id = connections.ensure_connection(
        self, payload_hook=preserve_refresh_token)
def test_organizations_dynamic_fields(self):
    """
    Run tap in check mode and verify more than one page is returned for dynamic fields.
    """
    conn_id = connections.ensure_connection(self)

    # run and verify the tap in discovery mode
    found_catalog = self.run_and_verify_check_mode(conn_id)

    # Verify the number of dynamic fields in the organizations stream metadata
    # (there need to be enough dynamic fields for organizations)
    for catalog in found_catalog:
        if catalog['stream_name'] == "organizations":
            organization_fields_page_limit = 100
            schema_and_metadata = menagerie.get_annotated_schema(
                conn_id, catalog['stream_id'])
            schema_fields = schema_and_metadata.get(
                'annotated-schema').get('properties').keys()
            organizations_dynamic_fields = [
                field for field in schema_fields
                if field not in self.organizations_static_fields()
            ]

            # Verify the count of dynamic fields is more than the page limit for
            # organization fields (pagination)
            self.assertGreater(len(organizations_dynamic_fields),
                               organization_fields_page_limit)
def test_run(self): """ Verify that for each stream you can get multiple pages of data when no fields are selected and only the automatic fields are replicated. PREREQUISITE For EACH stream add enough data that you surpass the limit of a single fetch of data. For instance if you have a limit of 250 records ensure that 251 (or more) records have been posted for that stream. """ expected_streams = self.expected_streams() # instantiate connection conn_id = connections.ensure_connection(self) # run check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # table and field selection test_catalogs_automatic_fields = [ catalog for catalog in found_catalogs if catalog.get('stream_name') in expected_streams ] self.perform_and_verify_table_and_field_selection( conn_id, test_catalogs_automatic_fields, select_all_fields=False, ) # run initial sync record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() for stream in expected_streams: with self.subTest(stream=stream): # expected values expected_keys = self.expected_automatic_fields().get(stream) # collect actual values data = synced_records.get(stream, {}) record_messages_keys = [ set(row.get('data').keys()) for row in data.get('messages', {}) ] # Verify that you get some records for each stream self.assertGreater( record_count_by_stream.get(stream, -1), 0, msg= "The number of records is not over the stream max limit for the {} stream" .format(stream)) # Verify that only the automatic fields are sent to the target for actual_keys in record_messages_keys: self.assertSetEqual(expected_keys, actual_keys)
def run_test(self, only_automatic_fields=False):
    expected_streams = self.streams_to_select()
    conn_id = connections.ensure_connection(self)

    runner.run_check_mode(self, conn_id)

    expected_stream_fields = dict()
    found_catalogs = menagerie.get_catalogs(conn_id)
    for catalog in found_catalogs:
        stream_name = catalog['stream_name']
        catalog_entry = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        if stream_name not in expected_streams:
            continue

        # select catalog fields
        self.select_found_catalogs(
            conn_id,
            [catalog],
            only_streams=[stream_name],
            deselect_all_fields=only_automatic_fields,
            non_selected_props=[] if only_automatic_fields
            else self.non_selected_fields[stream_name])

        # add expected fields for assertion
        fields_from_field_level_md = [md_entry['breadcrumb'][1]
                                      for md_entry in catalog_entry['metadata']
                                      if md_entry['breadcrumb'] != []]
        if only_automatic_fields:
            expected_stream_fields[stream_name] = (
                self.expected_primary_keys()[stream_name]
                | self.expected_replication_keys()[stream_name])
        else:
            expected_stream_fields[stream_name] = (
                set(fields_from_field_level_md)
                - set(self.non_selected_fields[stream_name]))

    self.run_and_verify_sync(conn_id)
    synced_records = runner.get_records_from_target_output()

    for stream in expected_streams:
        with self.subTest(stream=stream):
            expected_primary_keys = self.expected_primary_keys()[stream]

            # get expected keys
            expected_keys = expected_stream_fields[stream]

            # collect all actual values
            messages = synced_records.get(stream)

            # collect actual synced fields
            actual_keys = [set(message['data'].keys())
                           for message in messages['messages']
                           if message['action'] == 'upsert'][0]

            fields = self.fields_to_remove.get(stream) or []
            expected_keys = expected_keys - set(fields)

            # verify expected and actual fields
            self.assertEqual(expected_keys, actual_keys,
                             msg='Selected keys in the catalog are not as expected')

            # Verify we did not duplicate any records across pages
            records_pks_set = {tuple(message.get('data').get(primary_key)
                                     for primary_key in expected_primary_keys)
                               for message in messages.get('messages')}
            records_pks_list = [tuple(message.get('data').get(primary_key)
                                      for primary_key in expected_primary_keys)
                                for message in messages.get('messages')]
            self.assertCountEqual(records_pks_set, records_pks_list,
                                  msg="We have duplicate records for {}".format(stream))
def run_test(self):
    conn_id = connections.ensure_connection(self)

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    catalog = menagerie.get_catalogs(conn_id)
    found_catalog_names = set(map(lambda c: c['tap_stream_id'], catalog))

    # assert we find the correct streams
    self.assertEqual(self.expected_check_streams(), found_catalog_names)

    for tap_stream_id in self.expected_check_streams():
        found_stream = [c for c in catalog if c['tap_stream_id'] == tap_stream_id][0]
        schema_and_metadata = menagerie.get_annotated_schema(
            conn_id, found_stream['stream_id'])
        main_metadata = schema_and_metadata["metadata"]
        stream_metadata = [
            mdata for mdata in main_metadata if mdata["breadcrumb"] == []
        ]

        # assert that the primary keys are correct
        self.assertEqual(
            self.expected_pks()[tap_stream_id],
            set(stream_metadata[0]['metadata']['table-key-properties']))

    for stream_catalog in catalog:
        annotated_schema = menagerie.get_annotated_schema(
            conn_id, stream_catalog['stream_id'])
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema['annotated-schema'], [])

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    messages_by_stream = runner.get_records_from_target_output()

    # assert that each of the streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_first_sync_streams(), self.expected_pks())

    # Verify that the full table was synced
    for tap_stream_id in self.expected_first_sync_streams():
        self.assertEqual(
            self.expected_first_sync_row_counts()[tap_stream_id],
            record_count_by_stream[tap_stream_id])
def test_run(self):
    page_size = 250
    conn_id = connections.ensure_connection(self)

    # Check pagination for streams with enough data
    expected_streams = [
        "addresses",
        "customers",
        "discounts",
        "metafields_subscription",
        "onetimes",
    ]

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # table and field selection
    test_catalogs = [
        catalog for catalog in found_catalogs
        if catalog.get('stream_name') in expected_streams
    ]

    self.perform_and_verify_table_and_field_selection(conn_id, test_catalogs)

    record_count_by_stream = self.run_and_verify_sync(conn_id)

    synced_records = runner.get_records_from_target_output()

    for stream in expected_streams:
        with self.subTest(stream=stream):
            # expected values
            expected_primary_keys = self.expected_primary_keys()

            # collect information for assertions from syncs 1 & 2 based on expected values
            record_count_sync = record_count_by_stream.get(stream, 0)
            primary_keys_list = [
                tuple(message.get('data').get(expected_pk)
                      for expected_pk in expected_primary_keys[stream])
                for message in synced_records.get(stream).get('messages')
                if message.get('action') == 'upsert'
            ]

            # verify more than a page of records was replicated, so pagination is exercised
            self.assertGreater(record_count_sync, page_size)

            primary_keys_list_1 = primary_keys_list[:page_size]
            primary_keys_list_2 = primary_keys_list[page_size:2 * page_size]

            primary_keys_page_1 = set(primary_keys_list_1)
            primary_keys_page_2 = set(primary_keys_list_2)

            # Verify by primary keys that the data on each page is unique
            self.assertEqual(len(primary_keys_page_1),
                             page_size)  # verify there are no dupes on a page
            self.assertTrue(primary_keys_page_1.isdisjoint(
                primary_keys_page_2))  # verify there are no dupes between pages
def setUp(self):
    missing_envs = [x for x in [os.getenv('TAP_TOGGL_API_TOKEN'),
                                os.getenv('TAP_TOGGL_DETAILED_REPORT_TRAILING_DAYS')]
                    if x is None]
    if len(missing_envs) != 0:
        # pylint: disable=line-too-long
        raise Exception("set TAP_TOGGL_API_TOKEN, TAP_TOGGL_DETAILED_REPORT_TRAILING_DAYS")
    self.conn_id = connections.ensure_connection(self)
def pre_sync_test(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # tap discovered the right streams
    catalog = menagerie.get_catalog(conn_id)

    table_configs = self.expected_table_config()

    for stream in catalog['streams']:
        # schema is open {} for each stream
        self.assertEqual({'type': 'object'}, stream['schema'])

    expected_streams = {x['TableName'] for x in table_configs}

    # assert we find the correct streams
    self.assertEqual(expected_streams,
                     {c['tap_stream_id'] for c in catalog['streams']})

    # Verify that the table_name is in the format <collection_name> for each stream
    self.assertEqual(expected_streams,
                     {c['table_name'] for c in catalog['streams']})

    for tap_stream_id in expected_streams:
        found_stream = [c for c in catalog['streams']
                        if c['tap_stream_id'] == tap_stream_id][0]
        stream_metadata = [x['metadata'] for x in found_stream['metadata']
                           if x['breadcrumb'] == []][0]
        expected_config = [x for x in table_configs
                           if x['TableName'] == tap_stream_id][0]

        # table-key-properties metadata
        keys = [expected_config['HashKey']]
        if expected_config.get('SortKey'):
            keys.append(expected_config.get('SortKey'))

        self.assertEqual(set(keys),
                         set(stream_metadata.get('table-key-properties')))

        # Assert the hash key is the first key in the list
        self.assertEqual(expected_config['HashKey'],
                         stream_metadata.get('table-key-properties')[0])

        # row-count metadata
        self.assertEqual(expected_config['num_rows'],
                         stream_metadata.get('row-count'))

        # selected metadata is None for all streams
        self.assertNotIn('selected', stream_metadata.keys())

        # is-view metadata is False
        self.assertFalse(stream_metadata.get('is-view'))

        # no forced-replication-method metadata
        self.assertNotIn('forced-replication-method', stream_metadata.keys())

    return (table_configs, conn_id, expected_streams)
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    # select all catalogs
    for c in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        if c['stream_name'] in self.expected_sync_streams().keys():
            stream = c['stream_name']
            pks = self.expected_sync_streams()[stream]

            for pk in pks:
                mdata = next((m for m in catalog_entry['metadata']
                              if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == pk),
                             None)
                print("Validating inclusion on {}: {}".format(c['stream_name'], mdata))
                self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')

            connections.select_catalog_and_fields_via_metadata(conn_id, c, catalog_entry)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    first_record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, set(self.expected_sync_streams().keys()),
        self.expected_sync_streams())

    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  first_record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(
                           first_record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Verify that automatic fields are all emitted with records
    synced_records = runner.get_records_from_target_output()
    for stream_name, data in synced_records.items():
        record_messages = [set(row['data'].keys()) for row in data['messages']]
        self.assertGreater(len(record_messages), 0,
                           msg="stream {} did not sync any records.".format(stream_name))
        for record_keys in record_messages:
            self.assertEqual(
                self.expected_sync_streams().get(stream_name, set()) - record_keys,
                set())
def test_run(self):
    conn_id = connections.ensure_connection(self)

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # select all catalogs
    for catalog in found_catalogs:
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog,
            menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

    future_time = "2050-01-01T00:00:00.000000Z"

    # clear state
    future_bookmarks = {
        "currently_syncing": None,
        "bookmarks": {
            "contacts": {"offset": {}, "versionTimestamp": future_time},
            "subscription_changes": {"startTimestamp": future_time, "offset": {}},
            "campaigns": {"offset": {}},
            "forms": {"updatedAt": future_time},
            "deals": {"offset": {}, "hs_lastmodifieddate": future_time},
            "workflows": {"updatedAt": future_time},
            "owners": {"updatedAt": future_time},
            "contact_lists": {"updatedAt": future_time, "offset": {}},
            "email_events": {"startTimestamp": future_time, "offset": {}},
            "companies": {"offset": {}, "hs_lastmodifieddate": future_time},
            "engagements": {"lastUpdated": future_time, "offset": {}},
        },
    }

    menagerie.set_state(conn_id, future_bookmarks)

    record_count_by_stream = self.run_and_verify_sync(conn_id)

    # because the bookmarks were set in the future, we should NOT replicate any data,
    # except for campaigns and deal_pipelines because those endpoints do NOT support bookmarks
    streams_with_bookmarks = self.expected_sync_streams()
    streams_with_bookmarks.remove('campaigns')
    streams_with_bookmarks.remove('deal_pipelines')
    bad_streams = streams_with_bookmarks.intersection(record_count_by_stream.keys())
    self.assertEqual(
        len(bad_streams), 0,
        msg="still pulled down records from {} despite future bookmarks".format(bad_streams))

    state = menagerie.get_state(conn_id)

    # NB: Companies and engagements won't set a bookmark in the future.
    state["bookmarks"].pop("companies")
    state["bookmarks"].pop("engagements")
    future_bookmarks["bookmarks"].pop("companies")
    future_bookmarks["bookmarks"].pop("engagements")

    self.assertEqual(state, future_bookmarks,
                     msg="state should not have been modified because we didn't replicate any data")

    bookmarks = state.get('bookmarks')
    bookmark_streams = set(state.get('bookmarks').keys())
def test_run(self): """ • Verify we can deselect all fields except when inclusion=automatic, which is handled by base.py methods • Verify that only the automatic fields are sent to the target. • Verify that all replicated records have unique primary key values. """ # We are not able to generate test data so skipping two streams(mark_as_spam, dropped_email) expected_streams = self.expected_streams() - {"mark_as_spam", "dropped_email"} conn_id = connections.ensure_connection(self) # Run in check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # table and field selection test_catalogs = [catalog for catalog in found_catalogs if catalog.get('stream_name') in expected_streams] # Select all streams and no fields within streams self.perform_and_verify_table_and_field_selection( conn_id, test_catalogs, select_all_fields=False) record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() for stream in expected_streams: with self.subTest(stream=stream): # expected values expected_keys = self.expected_automatic_fields().get(stream) expected_primary_keys = self.expected_primary_keys()[stream] # collect actual values data = synced_records.get(stream, {}) record_messages_keys = [set(row['data'].keys()) for row in data.get('messages', [])] primary_keys_list = [tuple(message.get('data').get(expected_pk) for expected_pk in expected_primary_keys) for message in data.get('messages') if message.get('action') == 'upsert'] unique_primary_keys_list = set(primary_keys_list) # Verify that you get some records for each stream self.assertGreater( record_count_by_stream.get(stream, -1), 0, msg="The number of records is not over the stream min limit") # Verify that only the automatic fields are sent to the target for actual_keys in record_messages_keys: self.assertSetEqual(expected_keys, actual_keys, msg="The fields sent to the target are not the automatic fields") #Verify that all replicated records have unique primary key values. self.assertEqual(len(primary_keys_list), len(unique_primary_keys_list), msg="Replicated record does not have unique primary key values.")
def test_run(self):
    conn_id = connections.ensure_connection(self, payload_hook=None)

    # Run the tap in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify the check's exit status
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Verify that there are catalogs found
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_check_streams().issubset(found_catalog_names)
    self.assertTrue(
        subset,
        msg="Expected check streams are not a subset of the discovered catalog")

    # Select some catalogs
    our_catalogs = [
        c for c in found_catalogs
        if c.get('tap_stream_id') in self.expected_sync_streams()
    ]
    for catalog in our_catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema, [], [])

    # Verify that all streams sync at least one row for the initial sync.
    # This test is also verifying access token expiration handling. If the test fails
    # with an authentication error, the refresh token was not replaced after expiring.
    menagerie.set_state(conn_id, {})
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    zero_count_streams = {k for k, v in record_count_by_stream.items() if v == 0}
    self.assertFalse(
        zero_count_streams,
        msg="The following streams did not sync any rows {}".format(zero_count_streams))
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    all_excluded_fields = {}

    # select all catalogs
    for c in found_catalogs:
        if c['stream_name'] == 'ads':
            continue

        discovered_schema = menagerie.get_annotated_schema(
            conn_id, c['stream_id'])['annotated-schema']
        all_excluded_fields[c['stream_name']] = list(
            set(discovered_schema.keys()) -
            self.expected_automatic_fields().get(c['stream_name'], set()))[:5]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, c, discovered_schema,
            non_selected_fields=all_excluded_fields[c['stream_name']])

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # This should be validating that the PKs are written in each record
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    synced_records = runner.get_records_from_target_output()
    self.assertTrue('ads' not in synced_records.keys())

    for stream_name, data in synced_records.items():
        record_messages = [set(row['data'].keys()) for row in data['messages']]
        for record_keys in record_messages:
            # The intersection with the excluded fields should be empty
            self.assertFalse(record_keys.intersection(all_excluded_fields[stream_name]))
def test_run(self): """ Ensure running the tap with all streams selected and all fields deselected results in the replication of just the primary keys and replication keys (automatic fields). - Verify we can deselect all fields except when inclusion=automatic (SaaS Taps). - Verify that only the automatic fields are sent to the target. """ expected_streams = self.expected_sync_streams() # instantiate connection conn_id = connections.ensure_connection(self) # run check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # table and field selection test_catalogs_automatic_fields = [ catalog for catalog in found_catalogs if catalog.get('stream_name') in expected_streams ] self.perform_and_verify_table_and_field_selection( conn_id, test_catalogs_automatic_fields, select_all_fields=False, ) # run initial sync record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() for stream in expected_streams: with self.subTest(stream=stream): # expected values expected_keys = self.expected_automatic_fields().get(stream) # collect actual values messages = synced_records.get(stream) record_messages_keys = [ set(message['data'].keys()) for message in messages['messages'] if message['action'] == 'upsert' ] # Verify that you get some records for each stream self.assertGreater(record_count_by_stream.get(stream, -1), 0) # Verify that only the automatic fields are sent to the target # BUG TDL-14241 | Replication keys are not automatic if stream == "file_metadata": expected_keys.remove('modifiedTime') for actual_keys in record_messages_keys: self.assertSetEqual(expected_keys, actual_keys)
def ensure_connection(self, original=True):
    def preserve_refresh_token(existing_conns, payload):
        if not existing_conns:
            return payload
        conn_with_creds = connections.fetch_existing_connection_with_creds(
            existing_conns[0]['id'])
        # Even though this is a credential, the API posts the entire payload
        # using properties
        payload['properties']['refresh_token'] = conn_with_creds['credentials']['refresh_token']
        return payload

    conn_id = connections.ensure_connection(
        self, payload_hook=preserve_refresh_token, original_properties=original)
    return conn_id
def pagination_test_run(self):
    """
    Testing that the sync replicates more than a page of records for the events stream
    and that records are unique across pages.
    """
    page_size = 100  # page size for events
    conn_id = connections.ensure_connection(self)

    # events is the only expected stream
    expected_streams = ["events"]

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # table and field selection
    test_catalogs = [
        catalog for catalog in found_catalogs
        if catalog.get('stream_name') in expected_streams
    ]

    self.perform_and_verify_table_and_field_selection(conn_id, test_catalogs)

    record_count_by_stream = self.run_and_verify_sync(conn_id)

    synced_records = runner.get_records_from_target_output()

    for stream in expected_streams:
        with self.subTest(stream=stream):
            # expected values
            expected_primary_keys = self.expected_primary_keys()[stream]

            # collect information for assertions based on expected values
            record_count_sync = record_count_by_stream.get(stream, 0)
            primary_keys_list = [
                tuple(message.get('data').get(expected_pk)
                      for expected_pk in expected_primary_keys)
                for message in synced_records.get(stream).get('messages')
                if message.get('action') == 'upsert'
            ]

            # verify more than a page of records was replicated, so pagination is exercised
            self.assertGreater(record_count_sync, page_size)

            if record_count_sync > page_size:
                primary_keys_list_1 = primary_keys_list[:page_size]
                primary_keys_list_2 = primary_keys_list[page_size:2 * page_size]

                primary_keys_page_1 = set(primary_keys_list_1)
                primary_keys_page_2 = set(primary_keys_list_2)

                # Verify by primary keys that the data is unique across pages
                self.assertTrue(primary_keys_page_1.isdisjoint(primary_keys_page_2))
def create_connection(self, original_properties: bool = True):
    """Create a new connection with the test name"""
    # Create the connection
    conn_id = connections.ensure_connection(self, original_properties)

    # Run a check job using orchestrator (discovery)
    check_job_name = runner.run_check_mode(self, conn_id)

    # Assert that the check job succeeded
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)
    return conn_id
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # Run the tap in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify the check's exit status
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Verify that there are catalogs found
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_check_streams().issubset(found_catalog_names)
    self.assertTrue(
        subset,
        msg="Expected check streams are not a subset of the discovered catalog")

    # Select some catalogs
    our_catalogs = [
        c for c in found_catalogs
        if c.get('tap_stream_id') in self.expected_sync_streams()
    ]
    for catalog in our_catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        connections.select_catalog_and_fields_via_metadata(conn_id, catalog, schema)

    # Clear state and run sync
    menagerie.set_state(conn_id, {})
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))