def test_run(self):
    conn_id = connections.ensure_connection(self)
    runner.run_check_mode(self, conn_id)

    found_catalog = menagerie.get_catalog(conn_id)
    for catalog_entry in found_catalog['streams']:
        field_names_in_schema = set(catalog_entry['schema']['properties'].keys())
        field_names_in_breadcrumbs = {
            x['breadcrumb'][1] for x in catalog_entry['metadata']
            if len(x['breadcrumb']) == 2
        }
        self.assertEqual(field_names_in_schema, field_names_in_breadcrumbs)

        inclusions_set = {
            (x['breadcrumb'][1], x['metadata']['inclusion'])
            for x in catalog_entry['metadata']
            if len(x['breadcrumb']) == 2
        }

        # Validate that all fields are in metadata
        self.assertEqual(len(inclusions_set), len(field_names_in_schema))
        self.assertEqual({i[0] for i in inclusions_set}, field_names_in_schema)

        # Validate that all metadata['inclusion'] are 'available'
        unique_inclusions = {i[1] for i in inclusions_set}
        self.assertTrue(
            len(unique_inclusions) == 1 and 'available' in unique_inclusions)
def run_test(self, only_automatic_fields=False):
    expected_streams = self.streams_to_select()
    conn_id = connections.ensure_connection(self)
    runner.run_check_mode(self, conn_id)

    expected_stream_fields = dict()
    found_catalogs = menagerie.get_catalogs(conn_id)
    for catalog in found_catalogs:
        stream_name = catalog['stream_name']
        catalog_entry = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        if stream_name not in expected_streams:
            continue

        # select catalog fields
        self.select_found_catalogs(
            conn_id, [catalog], only_streams=[stream_name],
            deselect_all_fields=only_automatic_fields,
            non_selected_props=[] if only_automatic_fields
            else self.non_selected_fields[stream_name])

        # add expected fields for assertion
        fields_from_field_level_md = [
            md_entry['breadcrumb'][1]
            for md_entry in catalog_entry['metadata']
            if md_entry['breadcrumb'] != []]
        if only_automatic_fields:
            expected_stream_fields[stream_name] = \
                self.expected_primary_keys()[stream_name] | self.expected_replication_keys()[stream_name]
        else:
            expected_stream_fields[stream_name] = \
                set(fields_from_field_level_md) - set(self.non_selected_fields[stream_name])

    self.run_and_verify_sync(conn_id)

    synced_records = runner.get_records_from_target_output()
    for stream in expected_streams:
        with self.subTest(stream=stream):
            expected_primary_keys = self.expected_primary_keys()[stream]

            # get expected keys
            expected_keys = expected_stream_fields[stream]

            # collect all actual values
            messages = synced_records.get(stream)

            # collect actual synced fields
            actual_keys = [
                set(message['data'].keys())
                for message in messages['messages']
                if message['action'] == 'upsert'][0]

            fields = self.fields_to_remove.get(stream) or []
            expected_keys = expected_keys - set(fields)

            # verify expected and actual fields
            self.assertEqual(expected_keys, actual_keys,
                             msg='Selected keys in catalog are not as expected')

            # Verify we did not duplicate any records across pages
            records_pks_set = {
                tuple(message.get('data').get(primary_key)
                      for primary_key in expected_primary_keys)
                for message in messages.get('messages')}
            records_pks_list = [
                tuple(message.get('data').get(primary_key)
                      for primary_key in expected_primary_keys)
                for message in messages.get('messages')]
            self.assertCountEqual(records_pks_set, records_pks_list,
                                  msg="We have duplicate records for {}".format(stream))
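# NOTE: Hypothetical sketch (not part of the original suite) of the shape the helper
# above assumes for `self.non_selected_fields` and `self.fields_to_remove`: dicts keyed
# by stream name, mapping to fields that are deselected in the catalog or stripped from
# the expectations before comparison. Stream and field names below are placeholders.
non_selected_fields = {
    'example_stream': ['optional_field_a', 'optional_field_b'],
}
fields_to_remove = {
    'example_stream': ['field_not_returned_by_the_api'],
}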
def test_run(self):
    conn_id = connections.ensure_connection(self)
    runner.run_check_mode(self, conn_id)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.select_found_catalogs(conn_id, found_catalogs,
                               only_streams=self.streams_to_select())
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
def test_run(self):
    expected_streams = self.streams_to_select()
    conn_id = connections.ensure_connection(self)
    runner.run_check_mode(self, conn_id)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.select_found_catalogs(conn_id, found_catalogs,
                               only_streams=expected_streams)
    sync_record_count = self.run_and_verify_sync(conn_id)

    for stream in expected_streams:
        self.assertGreater(sync_record_count.get(stream, 0), 0)
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")
def run_and_verify_check_mode(self, conn_id):
    """
    Run the tap in check mode and verify it succeeds.
    This should be run prior to field selection and initial sync.

    Return the found catalogs from menagerie.
    """
    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    return found_catalogs
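# NOTE: Hypothetical usage sketch for the helper above; only run_and_verify_check_mode,
# select_all_streams_and_fields, runner, and menagerie come from this suite, and the
# test name and flow below are placeholders rather than an existing test.
def test_discovery_then_sync(self):
    conn_id = connections.ensure_connection(self)
    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # select everything that was discovered, then run and verify a sync
    self.select_all_streams_and_fields(conn_id, found_catalogs)
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)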
def run_test(self):
    conn_id = connections.ensure_connection(self)

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    catalog = menagerie.get_catalogs(conn_id)
    found_catalog_names = set(map(lambda c: c['tap_stream_id'], catalog))

    # assert we find the correct streams
    self.assertEqual(self.expected_check_streams(), found_catalog_names)

    for tap_stream_id in self.expected_check_streams():
        found_stream = [
            c for c in catalog if c['tap_stream_id'] == tap_stream_id
        ][0]
        schema_and_metadata = menagerie.get_annotated_schema(
            conn_id, found_stream['stream_id'])
        main_metadata = schema_and_metadata["metadata"]
        stream_metadata = [
            mdata for mdata in main_metadata if mdata["breadcrumb"] == []
        ]

        # assert that the pks are correct
        self.assertEqual(
            self.expected_pks()[tap_stream_id],
            set(stream_metadata[0]['metadata']['table-key-properties']))

    for stream_catalog in catalog:
        annotated_schema = menagerie.get_annotated_schema(
            conn_id, stream_catalog['stream_id'])
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema['annotated-schema'], [])

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    messages_by_stream = runner.get_records_from_target_output()

    # assert that each of the streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_first_sync_streams(),
        self.expected_pks())

    # Verify that the full table was synced
    for tap_stream_id in self.expected_first_sync_streams():
        self.assertEqual(
            self.expected_first_sync_row_counts()[tap_stream_id],
            record_count_by_stream[tap_stream_id])
def pre_sync_test(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # tap discovered the right streams
    catalog = menagerie.get_catalog(conn_id)
    table_configs = self.expected_table_config()

    for stream in catalog['streams']:
        # schema is open {} for each stream
        self.assertEqual({'type': 'object'}, stream['schema'])

    expected_streams = {x['TableName'] for x in table_configs}

    # assert we find the correct streams
    self.assertEqual(expected_streams,
                     {c['tap_stream_id'] for c in catalog['streams']})

    # Verify that the table_name is in the format <collection_name> for each stream
    self.assertEqual(expected_streams,
                     {c['table_name'] for c in catalog['streams']})

    for tap_stream_id in expected_streams:
        found_stream = [c for c in catalog['streams']
                        if c['tap_stream_id'] == tap_stream_id][0]
        stream_metadata = [x['metadata'] for x in found_stream['metadata']
                           if x['breadcrumb'] == []][0]
        expected_config = [x for x in table_configs
                           if x['TableName'] == tap_stream_id][0]

        # table-key-properties metadata
        keys = [expected_config['HashKey']]
        if expected_config.get('SortKey'):
            keys.append(expected_config.get('SortKey'))
        self.assertEqual(set(keys),
                         set(stream_metadata.get('table-key-properties')))

        # Assert the hash key is the first key in the list
        self.assertEqual(expected_config['HashKey'],
                         stream_metadata.get('table-key-properties')[0])

        # row-count metadata
        self.assertEqual(expected_config['num_rows'],
                         stream_metadata.get('row-count'))

        # selected metadata is None for all streams
        self.assertNotIn('selected', stream_metadata.keys())

        # is-view metadata is False
        self.assertFalse(stream_metadata.get('is-view'))

        # no forced-replication-method metadata
        self.assertNotIn('forced-replication-method', stream_metadata.keys())

    return (table_configs, conn_id, expected_streams)
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    # select all catalogs
    for c in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        if c['stream_name'] in self.expected_sync_streams().keys():
            stream = c['stream_name']
            pks = self.expected_sync_streams()[stream]

            for pk in pks:
                mdata = next((m for m in catalog_entry['metadata']
                              if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == pk),
                             None)
                print("Validating inclusion on {}: {}".format(c['stream_name'], mdata))
                self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')

            connections.select_catalog_and_fields_via_metadata(conn_id, c, catalog_entry)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    first_record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, set(self.expected_sync_streams().keys()),
        self.expected_sync_streams())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  first_record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(first_record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Verify that automatic fields are all emitted with records
    synced_records = runner.get_records_from_target_output()
    for stream_name, data in synced_records.items():
        record_messages = [set(row['data'].keys()) for row in data['messages']]
        self.assertGreater(len(record_messages), 0,
                           msg="stream {} did not sync any records.".format(stream_name))
        for record_keys in record_messages:
            self.assertEqual(
                self.expected_sync_streams().get(stream_name, set()) - record_keys,
                set())
def test_run(self):
    conn_id = connections.ensure_connection(self, payload_hook=None)

    # Run the tap in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify the check's exit status
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Verify that there are catalogs found
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_check_streams().issubset(found_catalog_names)
    self.assertTrue(
        subset,
        msg="Expected check streams are not a subset of discovered catalog")

    # Select some catalogs
    our_catalogs = [
        c for c in found_catalogs
        if c.get('tap_stream_id') in self.expected_sync_streams()
    ]
    for catalog in our_catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema, [], [])

    # Verify that all streams sync at least one row for the initial sync.
    # This test also verifies access token expiration handling. If the test fails with an
    # authentication error, the refresh token was not replaced after expiring.
    menagerie.set_state(conn_id, {})
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    zero_count_streams = {
        k for k, v in record_count_by_stream.items() if v == 0
    }
    self.assertFalse(
        zero_count_streams,
        msg="The following streams did not sync any rows {}".format(
            zero_count_streams))
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    all_excluded_fields = {}

    # select all catalogs except 'ads'
    for c in found_catalogs:
        if c['stream_name'] == 'ads':
            continue

        discovered_schema = menagerie.get_annotated_schema(
            conn_id, c['stream_id'])['annotated-schema']
        all_excluded_fields[c['stream_name']] = list(
            set(discovered_schema.keys()) -
            self.expected_automatic_fields().get(c['stream_name'], set()))[:5]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, c, discovered_schema,
            non_selected_fields=all_excluded_fields[c['stream_name']])

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # This should be validating that the PKs are written in each record
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    synced_records = runner.get_records_from_target_output()
    self.assertTrue('ads' not in synced_records.keys())

    for stream_name, data in synced_records.items():
        record_messages = [set(row['data'].keys()) for row in data['messages']]
        for record_keys in record_messages:
            # The intersection should be empty
            self.assertFalse(record_keys.intersection(all_excluded_fields[stream_name]))
def create_connection(self, original_properties: bool = True):
    """Create a new connection with the test name"""
    # Create the connection
    conn_id = connections.ensure_connection(self, original_properties)

    # Run a check job using orchestrator (discovery)
    check_job_name = runner.run_check_mode(self, conn_id)

    # Assert that the check job succeeded
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)
    return conn_id
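# NOTE: Hypothetical usage sketch for the helper above; the test name and assertions
# below are placeholders, not part of the original suite.
def test_with_fresh_connection(self):
    # create_connection already runs and verifies check mode, so discovery results
    # can be fetched immediately afterwards
    conn_id = self.create_connection()
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0)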
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # tap discovered the right streams
    catalog = menagerie.get_catalog(conn_id)

    for stream in catalog['streams']:
        # schema is open {} for each stream
        self.assertEqual({'type': 'object'}, stream['schema'])

    # assert we find the correct streams
    self.assertEqual(self.expected_check_streams(),
                     {c['tap_stream_id'] for c in catalog['streams']})

    # Verify that the table_name is in the format <collection_name> for each stream
    self.assertEqual(self.expected_table_names(),
                     {c['table_name'] for c in catalog['streams']})

    for tap_stream_id in self.expected_check_streams():
        found_stream = [
            c for c in catalog['streams']
            if c['tap_stream_id'] == tap_stream_id
        ][0]
        stream_metadata = [
            x['metadata'] for x in found_stream['metadata']
            if x['breadcrumb'] == []
        ][0]

        # table-key-properties metadata
        self.assertEqual(self.expected_pks()[tap_stream_id],
                         set(stream_metadata.get('table-key-properties')))

        # row-count metadata
        self.assertEqual(self.expected_row_counts()[tap_stream_id],
                         stream_metadata.get('row-count'))

        # selected metadata is None for all streams
        self.assertNotIn('selected', stream_metadata.keys())

        # is-view metadata is False
        self.assertFalse(stream_metadata.get('is-view'))

        # no forced-replication-method metadata
        self.assertNotIn('forced-replication-method', stream_metadata.keys())
def starter(self):
    """
    Instantiate connection, run discovery, and initial sync.

    This entire process needs to retry if we get rate limited so that we are
    using a fresh connection and can test the activate version messages.
    """
    ##########################################################################
    ### Instantiate connection
    ##########################################################################
    self.conn_id = connections.ensure_connection(self)

    ##########################################################################
    ### Discovery without the backoff
    ##########################################################################
    check_job_name = runner.run_check_mode(self, self.conn_id)
    exit_status = menagerie.get_exit_status(self.conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(self.conn_id))
    found_catalog_names = set(map(lambda c: c['stream_name'], found_catalogs))
    self.assertSetEqual(self.expected_streams(), found_catalog_names,
                        msg="discovered schemas do not match")
    print("discovered schemas are OK")

    # table and field selection
    test_catalogs = [catalog for catalog in found_catalogs
                     if catalog.get('stream_name') in self.expected_test_streams]
    self.perform_and_verify_table_and_field_selection(
        self.conn_id, test_catalogs, select_all_fields=True,
    )

    ##########################################################################
    ### Initial sync without the backoff
    ##########################################################################
    sync_job_name = runner.run_sync_mode(self, self.conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    self.record_count_by_stream_1 = runner.examine_target_output_file(
        self, self.conn_id, self.expected_streams(), self.expected_primary_keys())
    self.assertGreater(
        sum(self.record_count_by_stream_1.values()), 0,
        msg="failed to replicate any data: {}".format(self.record_count_by_stream_1)
    )
    print("total replicated row count: {}".format(sum(self.record_count_by_stream_1.values())))
def run_and_verify_check_mode(self, conn_id):
    """
    Run the tap in check mode and verify it succeeds.
    This should be run prior to field selection and initial sync.
    """
    # Run a check job using orchestrator (discovery)
    check_job_name = runner.run_check_mode(self, conn_id)

    # Assert that the check job succeeded
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    # select all catalogs
    for c in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(
            conn_id, c['stream_id'])
        connections.select_catalog_via_metadata(conn_id, c, catalog_entry)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # This should be validating that the PKs are written in each record
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count,
                       0,
                       msg="failed to replicate any data: {}".format(
                           record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # Run the tap in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify the check's exit status
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Verify that there are catalogs found
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_check_streams().issubset(found_catalog_names)
    self.assertTrue(
        subset,
        msg="Expected check streams are not subset of discovered catalog")

    # Select some catalogs
    our_catalogs = [
        c for c in found_catalogs
        if c.get('tap_stream_id') in self.expected_sync_streams()
    ]
    for catalog in our_catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema)

    # Clear state and run sync
    menagerie.set_state(conn_id, {})
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count,
                       0,
                       msg="failed to replicate any data: {}".format(
                           record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # Run discovery
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Drop the only table so that there are no tables left in this database
    with db_utils.get_test_connection('discovery0') as conn:
        cur = conn.cursor()
        cur.execute("DROP TABLE {}".format(
            canonicalized_table_name(test_schema_name, test_table_name, cur)))

    # Run discovery again
    check_job_name = runner.run_check_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)

    # When discovery mode finds 0 tables, the tap returns an error
    self.assertEqual(exit_status['discovery_exit_status'], 1)
def test_future_date_as_start_date(self):
    self.START_DATE = datetime.datetime.strftime(
        datetime.datetime.today() + datetime.timedelta(days=1),
        "%Y-%m-%dT00:00:00Z")
    conn_id = connections.ensure_connection(self, original_properties=False)
    expected_streams = self.streams_to_select()

    runner.run_check_mode(self, conn_id)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.select_found_catalogs(conn_id, found_catalogs, only_streams=expected_streams)

    # run sync mode
    sync_record_count = self.run_and_verify_sync(conn_id)

    for stream in expected_streams:
        if self.is_incremental(stream):
            # verify that we got no records for incremental streams
            self.assertIsNone(sync_record_count.get(stream))
def test_future_date_in_state(self):
    conn_id = connections.ensure_connection(self)
    expected_streams = self.streams_to_select()

    future_date = datetime.datetime.strftime(
        datetime.datetime.today() + datetime.timedelta(days=1),
        "%Y-%m-%dT00:00:00Z")

    state = {'bookmarks': dict()}
    replication_keys = self.expected_replication_keys()
    for stream in expected_streams:
        if self.is_incremental(stream):
            state['bookmarks'][stream] = dict()
            state['bookmarks'][stream]['field'] = next(iter(replication_keys[stream]))
            state['bookmarks'][stream]['last_record'] = future_date

    # set state before running sync mode
    menagerie.set_state(conn_id, state)

    runner.run_check_mode(self, conn_id)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.select_found_catalogs(conn_id, found_catalogs, only_streams=expected_streams)

    # run sync mode
    self.run_and_verify_sync(conn_id)

    # get the state after running sync mode
    latest_state = menagerie.get_state(conn_id)

    # verify that the state passed before the sync
    # and the state we got after the sync are the same
    self.assertEqual(latest_state, state)
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    # select all catalogs
    # selected_catalogs = list(map(lambda catalog: self.perform_field_selection(conn_id, catalog), found_catalogs))
    # menagerie.post_annotated_catalogs(conn_id, selected_catalogs)
    for c in found_catalogs:
        connections.select_catalog_and_fields_via_metadata(
            conn_id, c, menagerie.get_annotated_schema(conn_id, c['stream_id']))

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # bookmarks for the 4 streams should be 2015-03-16
    states = menagerie.get_state(conn_id)["bookmarks"]
    end_date = self.get_properties()["end_date"].split()[0]
    for k, v in states.items():
        if "insights" in k:
            bm_date = v.get("date_start")
            self.assertEqual(end_date, bm_date)
    print("bookmarks match end_date of {}".format(end_date))
def run_test(self):
    """Verify that an error is raised when an attribution window other than 1, 7, or 28 is passed."""
    # create connection
    conn_id = connections.ensure_connection(self)

    # run check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # get exit status
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)

    # get discovery error message
    discovery_error_message = exit_status.get('discovery_error_message')

    # validate the error message
    self.assertEqual(discovery_error_message,
                     "The attribution window must be 1, 7 or 28.")
    self.assertIsNone(exit_status.get('target_exit_status'))
    self.assertIsNone(exit_status.get('tap_exit_status'))
def run_check_mode(self, conn_id):
    # Run a check job using orchestrator (discovery)
    check_job_name = runner.run_check_mode(self, conn_id)

    # Assert that the check job succeeded
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    try:
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)
    except AssertionError as e:
        if exit_status['discovery_error_message']:
            print("*******************RETRYING CHECK FOR DISCOVERY FAILURE*******************")
            raise RetryableTapError(e)
        raise
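# NOTE: Hypothetical sketch of how a caller might consume the RetryableTapError raised
# by the helper above. The retry count, sleep interval, and method name are assumptions,
# and `time` is assumed to be imported at module level.
def run_check_mode_with_retries(self, conn_id, attempts=3):
    for attempt in range(attempts):
        try:
            return self.run_check_mode(conn_id)
        except RetryableTapError:
            if attempt == attempts - 1:
                raise
            time.sleep(30)  # back off before retrying the discovery job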
def test_run(self):
    # Default test setup
    # Create the connection for Zendesk
    conn_id = connections.ensure_connection(self)

    # Run a check job using orchestrator
    check_job_name = runner.run_check_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Verify that schemas were discovered
    self.found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertEqual(len(self.found_catalogs),
                     len(self.expected_check_streams()))

    # Verify the schemas discovered were exactly what we expect
    found_catalog_names = {
        catalog['tap_stream_id']
        for catalog in self.found_catalogs
        if catalog['tap_stream_id'] in self.expected_check_streams()
    }
    self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

    # Get the streams for organizations and users
    streams = [
        c for c in self.found_catalogs
        if c['stream_name'] in ['organizations', 'users']
    ]

    # Create a list of tuples where the first element is the singular stream name
    # (the stream name minus its trailing "s", e.g. "organization") and the second
    # element is the annotated schema
    schemas = [(s['stream_name'][:-1],
                menagerie.get_annotated_schema(conn_id, s['stream_id']))
               for s in streams]

    # Loop over them
    for schema in schemas:
        properties = schema[1]['annotated-schema']['properties']

        # Ensure that "organization_fields" or "user_fields" are objects in the
        # annotated schema with their own set of properties
        self.assertIsNotNone(
            properties.get('{}_fields'.format(schema[0]), {}).get('properties'),
            msg='{}_fields not present in schema!'.format(schema[0]))
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))
    self.assertEqual(
        len(found_catalogs),
        len(self.expected_check_streams()),
        msg="Expected {} streams, actual was {} for connection {},"
            " actual {}".format(len(self.expected_check_streams()),
                                len(found_catalogs), found_catalogs, conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    self.assertEqual(set(self.expected_check_streams()),
                     set(found_catalog_names),
                     msg="Expected streams don't match actual streams")

    # Verify stream names follow naming convention:
    # streams should only have lowercase alphas and underscores
    self.assertTrue(all(
        [re.fullmatch(r"[a-z_]+", name) for name in found_catalog_names]),
        msg="One or more streams don't follow standard naming")

    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = set(self.expected_check_streams().keys()).symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    for catalog in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(
            conn_id, catalog['stream_id'])
        stream = catalog['stream_name']

        automatic_fields = self.expected_check_streams()[stream]
        for field in automatic_fields:
            mdata = next((m for m in catalog_entry['metadata']
                          if len(m['breadcrumb']) == 2
                          and m['breadcrumb'][1] == field), None)
            print("Validating inclusion on {}: {}".format(
                catalog['stream_name'], mdata))
            self.assertTrue(
                mdata and mdata['metadata']['inclusion'] == 'automatic')
def test_run(self):
    conn_id = self.ensure_connection()

    # Run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    # Select all tables and fields
    self.select_all_streams_and_fields(conn_id, found_catalogs)

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    sync_record_count = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(), self.expected_primary_keys())

    # Examine target output
    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            # Each stream should have 1 or more records returned
            self.assertGreaterEqual(sync_record_count[stream], 1)
def run_test(self, start_dt_1, start_dt_2, streams):
    start_date_1 = start_dt_1
    start_date_2 = start_dt_2
    start_date_1_epoch = self.dt_to_ts(start_date_1)
    start_date_2_epoch = self.dt_to_ts(start_date_2)

    ##########################################################################
    ### Update Start Date for 1st sync
    ##########################################################################
    self.START_DATE = start_date_1

    ##########################################################################
    ### First Sync
    ##########################################################################
    expected_streams = streams
    conn_id_1 = connections.ensure_connection(self, original_properties=False)
    runner.run_check_mode(self, conn_id_1)

    found_catalogs_1 = menagerie.get_catalogs(conn_id_1)
    self.select_found_catalogs(conn_id_1, found_catalogs_1, only_streams=expected_streams)
    sync_record_count_1 = self.run_and_verify_sync(conn_id_1)
    synced_records_1 = runner.get_records_from_target_output()

    ##########################################################################
    ### Update Start Date for 2nd sync
    ##########################################################################
    self.START_DATE = start_date_2

    ##########################################################################
    ### Second Sync
    ##########################################################################
    conn_id_2 = connections.ensure_connection(self, original_properties=False)
    runner.run_check_mode(self, conn_id_2)

    found_catalogs_2 = menagerie.get_catalogs(conn_id_2)
    self.select_found_catalogs(conn_id_2, found_catalogs_2, only_streams=expected_streams)
    sync_record_count_2 = self.run_and_verify_sync(conn_id_2)
    synced_records_2 = runner.get_records_from_target_output()

    self.assertGreaterEqual(sum(sync_record_count_1.values()),
                            sum(sync_record_count_2.values()))

    for stream in expected_streams:
        with self.subTest(stream=stream):

            # expected values
            expected_primary_keys = self.expected_primary_keys()[stream]
            expected_replication_keys = self.expected_replication_keys()[stream]

            # collect information for assertions from syncs 1 & 2 based on expected values
            record_count_sync_1 = sync_record_count_1.get(stream, 0)
            record_count_sync_2 = sync_record_count_2.get(stream, 0)
            primary_keys_list_1 = [
                tuple(message.get('data').get(expected_pk)
                      for expected_pk in expected_primary_keys)
                for message in synced_records_1.get(stream, {}).get('messages')
                if message.get('action') == 'upsert'
            ]
            primary_keys_list_2 = [
                tuple(message.get('data').get(expected_pk)
                      for expected_pk in expected_primary_keys)
                for message in synced_records_2.get(stream, {}).get('messages')
                if message.get('action') == 'upsert'
            ]
            primary_keys_sync_1 = set(primary_keys_list_1)
            primary_keys_sync_2 = set(primary_keys_list_2)

            if self.is_incremental(stream):

                # Expected bookmark key is one element in set so directly access it
                start_date_keys_list_1 = [
                    message.get('data').get(next(iter(expected_replication_keys)))
                    for message in synced_records_1.get(stream).get('messages')
                    if message.get('action') == 'upsert'
                ]
                start_date_keys_list_2 = [
                    message.get('data').get(next(iter(expected_replication_keys)))
                    for message in synced_records_2.get(stream).get('messages')
                    if message.get('action') == 'upsert'
                ]
                start_date_key_sync_1 = set(start_date_keys_list_1)
                start_date_key_sync_2 = set(start_date_keys_list_2)

                # Verify bookmark key values are greater than or equal to the start date of sync 1
                for start_date_key_value in start_date_key_sync_1:
                    start_date_key_value_parsed = parse(
                        start_date_key_value).strftime("%Y-%m-%dT%H:%M:%SZ")
                    self.assertGreaterEqual(
                        self.dt_to_ts(start_date_key_value_parsed),
                        start_date_1_epoch)

                # Verify bookmark key values are greater than or equal to the start date of sync 2
                for start_date_key_value in start_date_key_sync_2:
                    start_date_key_value_parsed = parse(
                        start_date_key_value).strftime("%Y-%m-%dT%H:%M:%SZ")
                    self.assertGreaterEqual(
                        self.dt_to_ts(start_date_key_value_parsed),
                        start_date_2_epoch)

                # Verify the number of records replicated in sync 1 is greater than the number
                # of records replicated in sync 2 for the stream
                self.assertGreater(record_count_sync_1, record_count_sync_2)

                # Verify the records replicated in sync 2 were also replicated in sync 1
                self.assertTrue(primary_keys_sync_2.issubset(primary_keys_sync_1))

            else:
                # Verify that the 2nd sync with a later start date replicates the same number of
                # records as the 1st sync.
                self.assertEqual(record_count_sync_2, record_count_sync_1)

                # Verify by primary key that the same records are replicated in the 1st and 2nd syncs
                self.assertSetEqual(primary_keys_sync_1, primary_keys_sync_2)
def test_run(self):
    """
    Verify for each stream that you can do a sync which records bookmarks.
    Verify that the bookmark is the max value sent to the target for the `date` PK field
    Verify that the 2nd sync respects the bookmark
    Verify that all data of the 2nd sync is >= the bookmark from the first sync
    Verify that the number of records in the 2nd sync is less than the first
    Verify inclusivity of bookmarks

    PREREQUISITE
    For EACH stream that is incrementally replicated there are multiple rows of data with
    different values for the replication key
    """
    print("\n\nTESTING IN SQUARE_ENVIRONMENT: {}".format(os.getenv('TAP_SQUARE_ENVIRONMENT')))
    print("\n\nRUNNING {}\n\n".format(self.name()))

    # Instantiate static start date
    self.START_DATE = self.STATIC_START_DATE

    # Ensure tested streams have data
    expected_records_first_sync = self.create_test_data(
        self.testable_streams_static(), self.START_DATE)

    # Instantiate connection with default start
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Select all testable streams and no fields within streams
    found_catalogs = menagerie.get_catalogs(conn_id)
    streams_to_select = self.testable_streams_static()
    our_catalogs = [
        catalog for catalog in found_catalogs
        if catalog.get('tap_stream_id') in streams_to_select
    ]
    self.select_all_streams_and_fields(conn_id, our_catalogs)

    # Run a sync job using orchestrator
    first_sync_record_count = self.run_sync(conn_id)

    # verify that the sync only sent records to the target for selected streams (catalogs)
    self.assertEqual(
        streams_to_select,
        set(first_sync_record_count.keys()),
        msg="Expect first_sync_record_count keys {} to equal testable streams {},"
            " first_sync_record_count was {}".format(
                first_sync_record_count.keys(),
                streams_to_select,
                first_sync_record_count))

    first_sync_state = menagerie.get_state(conn_id)

    # Get the set of records from a first sync
    runner.get_records_from_target_output()

    # Set expectations for 2nd sync
    expected_records_second_sync = {x: [] for x in self.expected_streams()}

    # adjust expectations for full table streams to include the expected records from sync 1
    for stream in self.testable_streams_static():
        if stream in self.expected_full_table_streams():
            for record in expected_records_first_sync.get(stream, []):
                expected_records_second_sync[stream].append(record)

    # Run a second sync job using orchestrator
    second_sync_record_count = self.run_sync(conn_id)

    # Get the set of records from a second sync
    second_sync_records = runner.get_records_from_target_output()

    second_sync_state = menagerie.get_state(conn_id)

    # Loop first_sync_records and compare against second_sync_records
    for stream in self.testable_streams_static():
        with self.subTest(stream=stream):

            second_sync_data = [
                record.get("data")
                for record in second_sync_records.get(stream, {}).get("messages", {"data": {}})
            ]

            # TESTING INCREMENTAL STREAMS
            if stream in self.expected_incremental_streams():

                # Verify both syncs write / keep the same bookmark
                self.assertEqual(
                    set(first_sync_state.get('bookmarks', {}).keys()),
                    set(second_sync_state.get('bookmarks', {}).keys()))

                # Verify second sync's bookmarks move past the first sync's
                self.assertGreater(
                    second_sync_state.get('bookmarks', {stream: {}}).get(
                        stream, {'updated_at': -1}).get('updated_at'),
                    first_sync_state.get('bookmarks', {stream: {}}).get(
                        stream, {'updated_at': -1}).get('updated_at'))

                # verify that there is more than 1 record of data - setup necessary
                self.assertGreater(
                    first_sync_record_count.get(stream, 0), 1,
                    msg="Data isn't set up to be able to test full sync")

                # verify that you get no data on the 2nd sync
                self.assertGreaterEqual(
                    0, second_sync_record_count.get(stream, 0),
                    msg="first sync didn't have more records, bookmark usage not verified")

            elif stream in self.expected_full_table_streams():

                # TESTING FULL TABLE STREAMS

                # Verify no bookmarks are present
                first_state = first_sync_state.get('bookmarks', {}).get(stream)
                self.assertEqual({}, first_state,
                                 msg="Unexpected state for {}\n".format(stream) + \
                                     "\tState: {}\n".format(first_sync_state) + \
                                     "\tBookmark: {}".format(first_state))

                second_state = second_sync_state.get('bookmarks', {}).get(stream)
                self.assertEqual({}, second_state,
                                 msg="Unexpected state for {}\n".format(stream) + \
                                     "\tState: {}\n".format(second_sync_state) + \
                                     "\tBookmark: {}".format(second_state))

            # TESTING APPLICABLE TO ALL STREAMS

            # Verify that the expected records are replicated in the 2nd sync.
            # For incremental streams we should see 0 records.
            # For full table streams we should see the same records from the first sync.
            expected_records = expected_records_second_sync.get(stream, [])
            self.assertEqual(
                len(expected_records), len(second_sync_data),
                msg="Expected number of records do not match actual for 2nd sync.\n" +
                    "Expected: {}\nActual: {}".format(len(expected_records), len(second_sync_data)))
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify discovery produced (at least) 1 expected catalog
    found_catalogs = [
        found_catalog for found_catalog in menagerie.get_catalogs(conn_id)
        if found_catalog['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertGreaterEqual(len(found_catalogs), 1)

    # verify the tap discovered the expected streams
    found_catalog_names = {
        catalog['tap_stream_id'] for catalog in found_catalogs
    }
    self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

    # verify that persisted streams have the correct properties
    test_catalog = found_catalogs[0]
    self.assertEqual(test_table_name, test_catalog['stream_name'])
    print("discovered streams are correct")

    # perform table selection
    print('selecting {} and all fields within the table'.format(test_table_name))
    schema_and_metadata = menagerie.get_annotated_schema(
        conn_id, test_catalog['stream_id'])
    additional_md = [{
        "breadcrumb": [],
        "metadata": {'replication-method': 'FULL_TABLE'}
    }]
    _ = connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog, schema_and_metadata, additional_md)

    # clear state
    menagerie.set_state(conn_id, {})

    # run sync job 1 and verify exit codes
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # get records
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    records_by_stream = runner.get_records_from_target_output()
    table_version_1 = records_by_stream[test_table_name]['table_version']
    messages = records_by_stream[test_table_name]['messages']

    # verify the expected number of records were replicated
    self.assertEqual(3, record_count_by_stream[test_table_name])

    # verify the message actions match expectations
    self.assertEqual(5, len(messages))
    self.assertEqual('activate_version', messages[0]['action'])
    self.assertEqual('upsert', messages[1]['action'])
    self.assertEqual('upsert', messages[2]['action'])
    self.assertEqual('upsert', messages[3]['action'])
    self.assertEqual('activate_version', messages[4]['action'])

    # verify the persisted schema matches expectations
    self.assertEqual(expected_schemas[test_table_name],
                     records_by_stream[test_table_name]['schema'])

    # verify replicated records match expectations
    self.assertDictEqual(self.expected_records[0], messages[1]['data'])
    self.assertDictEqual(self.expected_records[1], messages[2]['data'])
    self.assertDictEqual(self.expected_records[2], messages[3]['data'])
    print("records are correct")

    # grab bookmarked state
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks']['dev-public-postgres_full_table_replication_test']

    # verify state and bookmarks meet expectations
    self.assertIsNone(state['currently_syncing'])
    self.assertIsNone(bookmark.get('lsn'))
    self.assertIsNone(bookmark.get('replication_key'))
    self.assertIsNone(bookmark.get('replication_key_value'))
    self.assertEqual(table_version_1, bookmark['version'])

    #----------------------------------------------------------------------
    # invoke the sync job AGAIN and get the same 3 records
    #----------------------------------------------------------------------

    # run sync job 2 and verify exit codes
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # get records
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    records_by_stream = runner.get_records_from_target_output()
    table_version_2 = records_by_stream[test_table_name]['table_version']
    messages = records_by_stream[test_table_name]['messages']

    # verify the expected number of records were replicated
    self.assertEqual(3, record_count_by_stream[test_table_name])

    # verify the message actions match expectations
    self.assertEqual(4, len(messages))
    self.assertEqual('upsert', messages[0]['action'])
    self.assertEqual('upsert', messages[1]['action'])
    self.assertEqual('upsert', messages[2]['action'])
    self.assertEqual('activate_version', messages[3]['action'])

    # verify the new table version increased on the second sync
    self.assertGreater(table_version_2, table_version_1)

    # verify the persisted schema still matches expectations
    self.assertEqual(expected_schemas[test_table_name],
                     records_by_stream[test_table_name]['schema'])

    # verify replicated records still match expectations
    self.assertDictEqual(self.expected_records[0], messages[0]['data'])
    self.assertDictEqual(self.expected_records[1], messages[1]['data'])
    self.assertDictEqual(self.expected_records[2], messages[2]['data'])

    # grab bookmarked state
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks']['dev-public-postgres_full_table_replication_test']

    # verify state and bookmarks meet expectations
    self.assertIsNone(state['currently_syncing'])
    self.assertIsNone(bookmark.get('lsn'))
    self.assertIsNone(bookmark.get('replication_key'))
    self.assertIsNone(bookmark.get('replication_key_value'))
    self.assertEqual(table_version_2, bookmark['version'])

    #----------------------------------------------------------------------
    # invoke the sync job AGAIN following various manipulations to the data
    #----------------------------------------------------------------------

    with db_utils.get_test_connection('dev') as conn:
        conn.autocommit = True
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:

            # NB | We will perform the following actions prior to the next sync:
            #      [Action (EXPECTED RESULT)]
            #      Insert a record
            #      Insert a record to be updated prior to sync
            #      Insert a record to be deleted prior to sync (NOT REPLICATED)
            #      Update an existing record
            #      Update a newly inserted record
            #      Delete an existing record
            #      Delete a newly inserted record

            # inserting...

            # a new record
            nyc_tz = pytz.timezone('America/New_York')
            our_time_offset = "-04:00"
            our_ts = datetime.datetime(1996, 4, 4, 4, 4, 4, 733184)
            our_ts_tz = nyc_tz.localize(our_ts)
            our_time = datetime.time(6, 6, 6)
            our_time_tz = our_time.isoformat() + our_time_offset
            our_date = datetime.date(1970, 7, 1)
            my_uuid = str(uuid.uuid1())

            self.inserted_records.append({
                'our_varchar': "our_varchar 2",
                'our_varchar_10': "varchar_10",
                'our_text': "some text 2",
                'our_integer': 44101,
                'our_smallint': 2,
                'our_bigint': 1000001,
                'our_decimal': decimal.Decimal('9876543210.02'),
                quote_ident('OUR TS', cur): our_ts,
                quote_ident('OUR TS TZ', cur): our_ts_tz,
                quote_ident('OUR TIME', cur): our_time,
                quote_ident('OUR TIME TZ', cur): our_time_tz,
                quote_ident('OUR DATE', cur): our_date,
                'our_double': decimal.Decimal('1.1'),
                'our_real': decimal.Decimal('1.2'),
                'our_boolean': True,
                'our_bit': '1',
                'our_json': json.dumps({'nymn': 77}),
                'our_jsonb': json.dumps({'burgers': 'good++'}),
                'our_uuid': my_uuid,
                'our_citext': 'cyclops 2',
                'our_store': 'dances=>"floor",name=>"betty"',
                'our_cidr': '192.168.101.128/25',
                'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04',
                'our_money': '$0.98789'
            })
            self.expected_records.append({
                'id': 4,
                'our_varchar': "our_varchar 2",
                'our_varchar_10': "varchar_10",
                'our_text': "some text 2",
                'our_integer': 44101,
                'our_smallint': 2,
                'our_bigint': 1000001,
                'our_decimal': decimal.Decimal('9876543210.02'),
                'OUR TS': self.expected_ts(our_ts),
                'OUR TS TZ': self.expected_ts_tz(our_ts_tz),
                'OUR TIME': str(our_time),
                'OUR TIME TZ': str(our_time_tz),
                'OUR DATE': '1970-07-01T00:00:00+00:00',
                'our_double': decimal.Decimal('1.1'),
                'our_real': decimal.Decimal('1.2'),
                'our_boolean': True,
                'our_bit': True,
                'our_json': '{"nymn": 77}',
                'our_jsonb': '{"burgers": "good++"}',
                'our_uuid': self.inserted_records[-1]['our_uuid'],
                'our_citext': self.inserted_records[-1]['our_citext'],
                'our_store': {"name": "betty", "dances": "floor"},
                'our_cidr': self.inserted_records[-1]['our_cidr'],
                'our_inet': self.inserted_records[-1]['our_inet'],
                'our_mac': self.inserted_records[-1]['our_mac'],
                'our_money': '$0.99',
                'our_alignment_enum': None,
            })

            # a new record which we will then update prior to sync
            our_ts = datetime.datetime(2007, 1, 1, 12, 12, 12, 222111)
            nyc_tz = pytz.timezone('America/New_York')
            our_ts_tz = nyc_tz.localize(our_ts)
            our_time = datetime.time(12, 11, 10)
            our_time_tz = our_time.isoformat() + "-04:00"
            our_date = datetime.date(1999, 9, 9)
            my_uuid = str(uuid.uuid1())

            self.inserted_records.append({
                'our_varchar': "our_varchar 4",
                'our_varchar_10': "varchar_3",
                'our_text': "some text 4",
                'our_integer': 55200,
                'our_smallint': 1,
                'our_bigint': 100000,
                'our_decimal': decimal.Decimal('1234567899.99'),
                quote_ident('OUR TS', cur): our_ts,
                quote_ident('OUR TS TZ', cur): our_ts_tz,
                quote_ident('OUR TIME', cur): our_time,
                quote_ident('OUR TIME TZ', cur): our_time_tz,
                quote_ident('OUR DATE', cur): our_date,
                'our_double': decimal.Decimal('1.1'),
                'our_real': decimal.Decimal('1.2'),
                'our_boolean': True,
                'our_bit': '0',
                'our_json': json.dumps('some string'),
                'our_jsonb': json.dumps(['burgers are good']),
                'our_uuid': my_uuid,
                'our_store': 'size=>"small",name=>"betty"',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25',
                'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04',
                'our_money': None,
            })
            self.expected_records.append({
                'our_decimal': decimal.Decimal('1234567899.99'),
                'our_text': 'some text 4',
                'our_bit': False,
                'our_integer': 55200,
                'our_double': decimal.Decimal('1.1'),
                'id': 5,
                'our_json': self.inserted_records[-1]['our_json'],
                'our_boolean': True,
                'our_jsonb': self.inserted_records[-1]['our_jsonb'],
                'our_bigint': 100000,
                'OUR TS': self.expected_ts(our_ts),
                'OUR TS TZ': self.expected_ts_tz(our_ts_tz),
                'OUR TIME': str(our_time),
                'OUR TIME TZ': str(our_time_tz),
                'our_store': {"name": "betty", "size": "small"},
                'our_smallint': 1,
                'OUR DATE': '1999-09-09T00:00:00+00:00',
                'our_varchar': 'our_varchar 4',
                'our_uuid': self.inserted_records[-1]['our_uuid'],
                'our_real': decimal.Decimal('1.2'),
                'our_varchar_10': 'varchar_3',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25',
                'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04',
                'our_money': None,
                'our_alignment_enum': None,
            })

            # a new record to be deleted prior to sync
            our_ts = datetime.datetime(2111, 1, 1, 12, 12, 12, 222111)
            nyc_tz = pytz.timezone('America/New_York')
            our_ts_tz = nyc_tz.localize(our_ts)
            our_time = datetime.time(12, 11, 10)
            our_time_tz = our_time.isoformat() + "-04:00"
            our_date = datetime.date(1999, 9, 9)
            my_uuid = str(uuid.uuid1())

            self.inserted_records.append({
                'our_varchar': "our_varchar 4",
                'our_varchar_10': "varchar_3",
                'our_text': "some text 4",
                'our_integer': 55200,
                'our_smallint': 1,
                'our_bigint': 100000,
                'our_decimal': decimal.Decimal('1234567899.99'),
                quote_ident('OUR TS', cur): our_ts,
                quote_ident('OUR TS TZ', cur): our_ts_tz,
                quote_ident('OUR TIME', cur): our_time,
                quote_ident('OUR TIME TZ', cur): our_time_tz,
                quote_ident('OUR DATE', cur): our_date,
                'our_double': decimal.Decimal('1.1'),
                'our_real': decimal.Decimal('1.2'),
                'our_boolean': True,
                'our_bit': '0',
                'our_json': json.dumps('some string'),
                'our_jsonb': json.dumps(['burgers are good']),
                'our_uuid': my_uuid,
                'our_store': 'size=>"small",name=>"betty"',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25',
                'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04',
                'our_money': None,
            })
            self.expected_records.append({
                'our_decimal': decimal.Decimal('1234567899.99'),
                'our_text': 'some text 4',
                'our_bit': False,
                'our_integer': 55200,
                'our_double': decimal.Decimal('1.1'),
                'id': 6,
                'our_json': self.inserted_records[-1]['our_json'],
                'our_boolean': True,
                'our_jsonb': self.inserted_records[-1]['our_jsonb'],
                'our_bigint': 100000,
                'OUR TS': self.expected_ts(our_ts),
                'OUR TS TZ': self.expected_ts_tz(our_ts_tz),
                'OUR TIME': str(our_time),
                'OUR TIME TZ': str(our_time_tz),
                'our_store': {"name": "betty", "size": "small"},
                'our_smallint': 1,
                'OUR DATE': '1999-09-09T00:00:00+00:00',
                'our_varchar': 'our_varchar 4',
                'our_uuid': self.inserted_records[-1]['our_uuid'],
                'our_real': decimal.Decimal('1.2'),
                'our_varchar_10': 'varchar_3',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25',
                'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04',
                'our_money': None,
                'our_alignment_enum': None,
            })

            db_utils.insert_record(cur, test_table_name, self.inserted_records[3])
            db_utils.insert_record(cur, test_table_name, self.inserted_records[4])
            db_utils.insert_record(cur, test_table_name, self.inserted_records[5])

            # updating...

            # an existing record
            canon_table_name = db_utils.canonicalized_table_name(
                cur, test_schema_name, test_table_name)
            record_pk = 1
            our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
            our_ts_tz = nyc_tz.localize(our_ts)
            updated_data = {
                "OUR TS TZ": our_ts_tz,
                "our_double": decimal.Decimal("6.6"),
                "our_money": "$0.00"
            }
            self.expected_records[0]["OUR TS TZ"] = self.expected_ts_tz(our_ts_tz)
            self.expected_records[0]["our_double"] = decimal.Decimal("6.6")
            self.expected_records[0]["our_money"] = "$0.00"

            db_utils.update_record(cur, canon_table_name, record_pk, updated_data)

            # a newly inserted record
            canon_table_name = db_utils.canonicalized_table_name(
                cur, test_schema_name, test_table_name)
            record_pk = 5
            our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
            our_ts_tz = nyc_tz.localize(our_ts)
            updated_data = {
                "OUR TS TZ": our_ts_tz,
                "our_double": decimal.Decimal("6.6"),
                "our_money": "$0.00"
            }
            self.expected_records[4]["OUR TS TZ"] = self.expected_ts_tz(our_ts_tz)
            self.expected_records[4]["our_double"] = decimal.Decimal("6.6")
            self.expected_records[4]["our_money"] = "$0.00"

            db_utils.update_record(cur, canon_table_name, record_pk, updated_data)

            # deleting

            # an existing record
            record_pk = 2
            db_utils.delete_record(cur, canon_table_name, record_pk)

            # a newly inserted record
            record_pk = 6
            db_utils.delete_record(cur, canon_table_name, record_pk)

    #----------------------------------------------------------------------
    # invoke the sync job AGAIN after various manipulations
    #----------------------------------------------------------------------

    # run sync job 3 and verify exit codes
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # get records
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    records_by_stream = runner.get_records_from_target_output()
    table_version_3 = records_by_stream[test_table_name]['table_version']
    messages = records_by_stream[test_table_name]['messages']

    # verify the expected number of records were replicated
    self.assertEqual(4, record_count_by_stream[test_table_name])

    # verify the message actions match expectations
    self.assertEqual(5, len(messages))
    self.assertEqual('upsert', messages[0]['action'])
    self.assertEqual('upsert', messages[1]['action'])
    self.assertEqual('upsert', messages[2]['action'])
    self.assertEqual('upsert', messages[3]['action'])
    self.assertEqual('activate_version', messages[4]['action'])

    # verify the new table version increased on the third sync
    self.assertGreater(table_version_3, table_version_2)

    # verify the persisted schema still matches expectations
    self.assertEqual(expected_schemas[test_table_name],
                     records_by_stream[test_table_name]['schema'])

    # NB | This is a little tough to track mentally so here's a breakdown of
    #      the order of operations by expected records indexes:
    #      Prior to Sync 1
    #        insert 0, 1, 2
    #      Prior to Sync 2
    #        No db changes
    #      Prior to Sync 3
    #        insert 3, 4, 5
    #        update 0, 4
    #        delete 1, 5
    #      Resulting Synced Records: 2, 3, 0, 4

    # verify replicated records still match expectations
    self.assertDictEqual(self.expected_records[2], messages[0]['data'])  # existing insert
    self.assertDictEqual(self.expected_records[3], messages[1]['data'])  # new insert
    self.assertDictEqual(self.expected_records[0], messages[2]['data'])  # existing update
    self.assertDictEqual(self.expected_records[4], messages[3]['data'])  # new insert / update

    # grab bookmarked state
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks']['dev-public-postgres_full_table_replication_test']

    # verify state and bookmarks meet expectations
    self.assertIsNone(state['currently_syncing'])
    self.assertIsNone(bookmark.get('lsn'))
    self.assertIsNone(bookmark.get('replication_key'))
    self.assertIsNone(bookmark.get('replication_key_value'))
    self.assertEqual(table_version_3, bookmark['version'])