def test_run(self):
    conn_id = self.create_connection()

    # Clear state before our run
    menagerie.set_state(conn_id, {})

    # Select the expected streams, with no fields selected beyond the automatic ones
    found_catalogs = menagerie.get_catalogs(conn_id)
    our_catalogs = [
        catalog for catalog in found_catalogs
        if catalog.get('tap_stream_id') in self.expected_sync_streams()
    ]
    self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=False)

    # Run a sync job using orchestrator
    state = menagerie.get_state(conn_id)
    record_count_by_stream = self.run_and_verify_sync(conn_id, state)

    # Ensure all records have a value for the primary key(s)
    records = runner.get_records_from_target_output()
    for stream in self.expected_sync_streams():
        messages = records.get(stream, {}).get('messages')
        for m in messages:
            pk_set = self.expected_pks()[stream]
            for pk in pk_set:
                self.assertIsNotNone(
                    m.get('data', {}).get(pk),
                    msg="Record is missing primary key {}: {}".format(pk, m))

    # Verify bookmarks are only written for incremental streams
    bookmarks = menagerie.get_state(conn_id)['bookmarks']
    replication_methods = self.expected_replication_method()
    for stream in self.expected_sync_streams():
        with self.subTest(stream=stream):
            replication_method = replication_methods.get(stream)
            if replication_method == self.INCREMENTAL:
                self.assertIn(stream, bookmarks)
            elif replication_method == self.FULL_TABLE:
                self.assertNotIn(stream, bookmarks)
            else:
                raise NotImplementedError(
                    "stream {} has an invalid replication method {}".format(
                        stream, replication_method))
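# --- Illustrative sketch (not part of the original tests) --------------------
# run_and_verify_sync() is a base-class helper that is not shown in this
# section. As an assumption for illustration, it typically wraps the same
# runner and menagerie calls that later tests in this section spell out
# inline: run the sync, verify the exit status, and return per-stream counts.

def run_and_verify_sync(self, conn_id, state=None):
    # run a sync job and make sure it exited cleanly
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # count the rows that reached the target, keyed by stream
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertGreater(
        sum(record_count_by_stream.values()), 0,
        msg="failed to replicate any data: {}".format(record_count_by_stream))
    return record_count_by_stream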
def test_run(self):
    """
    Verify that for each stream you can get multiple pages of data when no fields
    are selected and only the automatic fields are replicated.

    PREREQUISITE
    For EACH stream add enough data that you surpass the limit of a single fetch of
    data. For instance, if you have a limit of 250 records, ensure that 251 (or more)
    records have been posted for that stream.
    """
    self.start_date = '2020-11-10T00:00:00Z'

    conn_id = self.create_connection(original_properties=False)

    # Select all parent streams and no fields within streams.
    # Select all (testable) report streams and only the fields which are automatic
    # and/or required by Bing to generate a report.
    found_catalogs = menagerie.get_catalogs(conn_id)
    test_catalogs = [catalog for catalog in found_catalogs
                     if catalog.get('tap_stream_id') in self.expected_sync_streams()]

    # BUG_SRCE-4313 (https://stitchdata.atlassian.net/browse/SRCE-4313) streams missing automatic fields
    specific_fields = {**self.report_automatic_fields(), **self.parent_automatic_fields()}  # COMMENT to reproduce
    # specific_fields = {**self.report_measure_fields(), **self.parent_automatic_fields()}  # UNCOMMENT to reproduce
    # specific_fields = self.report_measure_fields()  # TODO Use this line once bugs addressed.

    self.perform_and_verify_adjusted_selection(
        conn_id, test_catalogs, select_all_fields=False, specific_fields=specific_fields
    )

    # COMMENT EVERYTHING DOWN FROM HERE TO ADDRESS BUG_SRCE-4313

    # Run a sync job using orchestrator
    state = menagerie.get_state(conn_id)
    record_count_by_stream = self.run_and_verify_sync(conn_id, state)
    actual_fields_by_stream = runner.examine_target_output_for_fields()

    for stream in self.expected_sync_streams():
        with self.subTest(stream=stream):
            if stream == 'goals_and_funnels_report':  # SKIP TESTING FOR THIS STREAM
                # There is no data available, since we would need to implement a
                # tracking script on singer's site
                continue

            # verify that you get some records for each stream
            self.assertGreater(
                record_count_by_stream.get(stream, -1), 0,
                msg="The number of records is not over the stream max limit")

            # verify that only the automatic fields are sent to the target for parent
            # streams, and that automatic fields, _sdc_report_datetime, AND specific
            # measure fields are sent to the target for report streams
            actual = actual_fields_by_stream.get(stream) or set()
            expected = self.expected_automatic_fields().get(stream, set())
            if stream.endswith('_report'):  # update expectations for report streams
                expected_measure = 'Assists' if stream.startswith('goals') else 'Clicks'
                expected.update({
                    '_sdc_report_datetime',  # tap applies sdc value as pk for all reports
                    expected_measure  # reports require a perf measure (which is intentionally not automatic)
                })
            self.assertSetEqual(expected, actual)
def test_run(self):
    conn_id = connections.ensure_connection(self)

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # select all catalogs
    for catalog in found_catalogs:
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog,
            menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

    future_time = "2050-01-01T00:00:00.000000Z"

    # set the state to bookmarks far in the future
    future_bookmarks = {
        "currently_syncing": None,
        "bookmarks": {
            "contacts": {"offset": {}, "versionTimestamp": future_time},
            "subscription_changes": {"startTimestamp": future_time, "offset": {}},
            "campaigns": {"offset": {}},
            "forms": {"updatedAt": future_time},
            "deals": {"offset": {}, "hs_lastmodifieddate": future_time},
            "workflows": {"updatedAt": future_time},
            "owners": {"updatedAt": future_time},
            "contact_lists": {"updatedAt": future_time, "offset": {}},
            "email_events": {"startTimestamp": future_time, "offset": {}},
            "companies": {"offset": {}, "hs_lastmodifieddate": future_time},
            "engagements": {"lastUpdated": future_time, "offset": {}}
        }
    }

    menagerie.set_state(conn_id, future_bookmarks)

    record_count_by_stream = self.run_and_verify_sync(conn_id)

    # because the bookmarks were set into the future, we should NOT actually replicate
    # any data, except for campaigns and deal_pipelines because those endpoints do NOT
    # support bookmarks
    streams_with_bookmarks = self.expected_sync_streams()
    streams_with_bookmarks.remove('campaigns')
    streams_with_bookmarks.remove('deal_pipelines')
    bad_streams = streams_with_bookmarks.intersection(record_count_by_stream.keys())
    self.assertEqual(
        len(bad_streams), 0,
        msg="still pulled down records from {} despite future bookmarks".format(bad_streams))

    state = menagerie.get_state(conn_id)

    # NB: Companies and engagements won't set a bookmark in the future.
    state["bookmarks"].pop("companies")
    state["bookmarks"].pop("engagements")
    future_bookmarks["bookmarks"].pop("companies")
    future_bookmarks["bookmarks"].pop("engagements")

    self.assertEqual(
        state, future_bookmarks,
        msg="state should not have been modified because we didn't replicate any data")

    bookmarks = state.get('bookmarks')
    bookmark_streams = set(state.get('bookmarks').keys())
def first_sync_test(self, table_configs, conn_id):
    # run first full table sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    records_by_stream = runner.get_records_from_target_output()

    expected_pks = {}
    for config in table_configs:
        key = {config['HashKey']}
        if config.get('SortKey'):
            key |= {config.get('SortKey')}
        expected_pks[config['TableName']] = key

    # assert that each of the streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, {x['TableName'] for x in table_configs}, expected_pks)

    state = menagerie.get_state(conn_id)
    state_version = menagerie.get_state_version(conn_id)

    first_versions = {}

    # assert that we get the correct number of records for each stream
    for config in table_configs:
        table_name = config['TableName']

        self.assertEqual(config['num_rows'], record_count_by_stream[table_name])

        # assert that an activate_version message is the first and last message sent for each stream
        self.assertEqual(
            'activate_version',
            records_by_stream[table_name]['messages'][0]['action'])
        self.assertEqual(
            'activate_version',
            records_by_stream[table_name]['messages'][-1]['action'])

        # assert that the state has an initial_full_table_complete == True
        self.assertTrue(
            state['bookmarks'][table_name]['initial_full_table_complete'])

        # assert that there is a version bookmark in state
        first_versions[table_name] = state['bookmarks'][table_name]['version']
        self.assertIsNotNone(first_versions[table_name])

        # Write state with missing finished_shards so it re-reads data from all shards.
        # This should result in the next sync having the same number of records
        # as the full table sync.
        state['bookmarks'][table_name].pop('finished_shards')
        menagerie.set_state(conn_id, state, version=state_version)
def test_run(self):
    conn_id = self.create_connection()

    # Clear state before our run
    menagerie.set_state(conn_id, {})

    # Select the expected streams, with no fields selected beyond the automatic ones
    found_catalogs = menagerie.get_catalogs(conn_id)
    our_catalogs = [
        catalog for catalog in found_catalogs
        if catalog.get('tap_stream_id') in self.expected_sync_streams()
    ]
    self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=False)

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = sum(record_count_by_stream.values())
    self.assertGreater(
        replicated_row_count, 0,
        msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Ensure all records have a value for the primary key(s)
    records = runner.get_records_from_target_output()
    for stream in self.expected_sync_streams():
        messages = records.get(stream, {}).get('messages')
        for m in messages:
            pk_set = self.expected_pks()[stream]
            for pk in pk_set:
                self.assertIsNotNone(
                    m.get('data', {}).get(pk),
                    msg="Record is missing primary key {}: {}".format(pk, m))

    # Verify a bookmark was written for every stream
    bookmarks = menagerie.get_state(conn_id)['bookmarks']
    for stream in self.expected_sync_streams():
        self.assertIn(stream, bookmarks)
def test_run(self):
    # sync 1
    conn_id = connections.ensure_connection(self)

    found_catalogs_1 = self.run_and_verify_check_mode(conn_id)

    self.perform_and_verify_table_and_field_selection(conn_id, found_catalogs_1)

    record_count_by_stream_1 = self.run_and_verify_sync(conn_id)

    # checking if we got any data from sync 1
    self.assertGreater(sum(record_count_by_stream_1.values()), 0)

    # checking that the record counts from sync 1 match the expected counts
    for tap_stream_id in self.expected_first_sync_streams():
        self.assertEqual(self.expected_first_sync_row_counts()[tap_stream_id],
                         record_count_by_stream_1[tap_stream_id])

    # getting state
    state = menagerie.get_state(conn_id)

    # creating file "table_1_fileB"
    with self.get_test_connection() as client:
        root_dir = os.getenv('TAP_SFTP_ROOT_DIR')
        client.chdir(root_dir + '/tap_tester/folderA')

        file_group = self.get_files()[0]
        with client.open('table_1_fileB.csv', 'w') as f:
            writer = csv.writer(f)
            lines = [file_group['headers']] + file_group['generator'](file_group['num_rows'])
            writer.writerows(lines)

    # adding some data to files "table_1_fileA" and "table_3_fileA"
    self.append_to_files()

    # setting state
    menagerie.set_state(conn_id, state)

    # sync 2
    record_count_by_stream_2 = self.run_and_verify_sync(conn_id, second_sync=True)

    # checking if we got any data from sync 2
    self.assertGreater(sum(record_count_by_stream_2.values()), 0)

    # checking that the data from the 2nd sync is as expected: because the state was
    # restored, we should receive only the modified data, i.e. the rows added by
    # appending to the existing files and creating the new file
    for tap_stream_id in self.expected_second_sync_streams():
        self.assertEqual(self.expected_second_sync_row_counts()[tap_stream_id],
                         record_count_by_stream_2[tap_stream_id])
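# --- Illustrative sketch (not part of the original test) ---------------------
# get_files() is a helper on the test class that is not shown here. Based only
# on how it is indexed above ('headers', 'num_rows', 'generator'), a minimal,
# hypothetical return shape could look like the following; the column names
# and row contents are assumptions for illustration.

def get_files(self):
    return [
        {
            'headers': ['id', 'string_col', 'integer_col'],
            'num_rows': 50,
            # generator(n) produces n rows matching the headers above
            'generator': lambda n: [[i, 'row{}'.format(i), i * 10] for i in range(n)],
        },
    ]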
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs), 0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    # select all catalogs
    # selected_catalogs = list(map(lambda catalog: self.perform_field_selection(conn_id, catalog), found_catalogs))
    # menagerie.post_annotated_catalogs(conn_id, selected_catalogs)
    for c in found_catalogs:
        connections.select_catalog_and_fields_via_metadata(
            conn_id, c, menagerie.get_annotated_schema(conn_id, c['stream_id']))

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = sum(record_count_by_stream.values())
    self.assertGreater(
        replicated_row_count, 0,
        msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # bookmarks for the insights streams should be the end_date (2015-03-16)
    states = menagerie.get_state(conn_id)["bookmarks"]
    end_date = self.get_properties()["end_date"].split()[0]
    for k, v in states.items():
        if "insights" in k:
            bm_date = v.get("date_start")
            self.assertEqual(end_date, bm_date)
    print("bookmarks match end_date of {}".format(end_date))
def test_run(self):
    conn_id = connections.ensure_connection(self)

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # Select all catalogs
    for catalog in found_catalogs:
        if catalog['tap_stream_id'] in self.expected_sync_streams():
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog,
                menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

    # clear state
    menagerie.set_state(conn_id, {})

    record_count_by_stream = self.run_and_verify_sync(conn_id)

    max_bookmarks_from_records = runner.get_most_recent_records_from_target(
        self, self.expected_bookmarks(), self.get_properties()['start_date'])

    utc_now = datetime.datetime.utcnow()
    start_of_today = utils.strftime(
        datetime.datetime(utc_now.year, utc_now.month, utc_now.day,
                          0, 0, 0, 0, datetime.timezone.utc))
    max_bookmarks_from_records['subscription_changes'] = start_of_today
    max_bookmarks_from_records['email_events'] = start_of_today

    # if we didn't replicate data, the bookmark should be the start_date
    for k in self.expected_bookmarks().keys():
        if max_bookmarks_from_records.get(k) is None:
            max_bookmarks_from_records[k] = utils.strftime(
                datetime.datetime(2017, 5, 1, 0, 0, 0, 0, datetime.timezone.utc))

    state = menagerie.get_state(conn_id)
    bookmarks = state.get('bookmarks')
    bookmark_streams = set(state.get('bookmarks').keys())

    # verify bookmarks and offsets
    for k, v in sorted(list(self.expected_bookmarks().items())):
        for w in v:
            bk_value = bookmarks.get(k, {}).get(w)
            self.assertEqual(
                utils.strptime_with_tz(bk_value),
                utils.strptime_with_tz(max_bookmarks_from_records[k]),
                "Bookmark {} ({}) for stream {} should have been updated to {}".format(
                    bk_value, w, k, max_bookmarks_from_records[k]))
            print("bookmark {}({}) updated to {} from max record value {}".format(
                k, w, bk_value, max_bookmarks_from_records[k]))

    for k, v in self.expected_offsets().items():
        self.assertEqual(
            bookmarks.get(k, {}).get('offset', {}), v,
            msg="unexpected offset found for stream {} {}. state: {}".format(k, v, state))
        print("offsets {} cleared".format(k))

    diff = bookmark_streams.difference(self.acceptable_bookmarks())
    self.assertEqual(
        len(diff), 0,
        msg="Unexpected bookmarks: {} Expected: {} Actual: {}".format(
            diff, self.acceptable_bookmarks(), bookmarks))

    self.assertEqual(
        state.get('currently_syncing'), None,
        "Unexpected `currently_syncing` bookmark value: {} Expected: None".format(
            state.get('currently_syncing')))
def test_run(self):
    conn_id = connections.ensure_connection(self, payload_hook=None)

    # Run the tap in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify the check's exit status
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Verify that there are catalogs found
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs), 0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_check_streams().issubset(found_catalog_names)
    self.assertTrue(subset, msg="Expected check streams are not a subset of discovered catalog")

    # Select some catalogs
    our_catalogs = [c for c in found_catalogs
                    if c.get('tap_stream_id') in self.expected_sync_streams()]
    for catalog in our_catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        connections.select_catalog_and_fields_via_metadata(conn_id, catalog, schema, [], [])

    # Verify that all streams sync at least one row for the initial sync.
    # This test is also verifying access token expiration handling. If the test fails
    # with an authentication error, the refresh token was not replaced after expiring.
    menagerie.set_state(conn_id, {})
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    zero_count_streams = {k for k, v in record_count_by_stream.items() if v == 0}
    self.assertFalse(
        zero_count_streams,
        msg="The following streams did not sync any rows {}".format(zero_count_streams))

    # Verify that bookmark values are correct after incremental sync
    start_date = os.getenv(configuration['properties']['start_date'])
    bookmark_props = configuration['bookmark']
    current_state = menagerie.get_state(conn_id)
    test_bookmark = current_state['bookmarks'][bookmark_props['bookmark_key']]
    print(test_bookmark)
    self.assertTrue(
        test_bookmark['updated'] > start_date,
        msg="The bookmark value does not match the expected result")
def test_run(self):
    runner.run_check_job_and_check_status(self)

    found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.check_all_streams_in_catalogs(found_catalogs)
    self.select_found_catalogs(found_catalogs)

    # clear state and run the actual sync
    menagerie.set_state(self.conn_id, {})
    runner.run_sync_job_and_check_status(self)

    self.check_output_record_counts()

    max_bookmarks_from_records = runner.get_max_bookmarks_from_target(self)
    state = menagerie.get_state(self.conn_id)
    bookmarks = state.get("bookmarks", {})
    self.check_bookmarks(bookmarks, max_bookmarks_from_records)
    self.check_offsets(bookmarks)
    self.look_for_unexpected_bookmarks(bookmarks)
    self.assertIsNone(state.get("currently_syncing"))
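# --- Illustrative sketch (not part of the original tests) --------------------
# check_bookmarks() and its sibling helpers are defined on the test's base
# class and are not shown in this section. As an assumption for illustration,
# a check_bookmarks implementation typically mirrors the inline assertions in
# the HubSpot-style test above: compare each bookmark key in state against the
# max replication-key value observed in the target output.

def check_bookmarks(self, bookmarks, max_bookmarks_from_records):
    for stream, bookmark_keys in self.expected_bookmarks().items():
        for key in bookmark_keys:
            bookmark_value = bookmarks.get(stream, {}).get(key)
            self.assertEqual(
                bookmark_value, max_bookmarks_from_records.get(stream),
                msg="bookmark {} for stream {} should equal the max replicated value".format(key, stream))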
def test_future_date_in_state(self):
    conn_id = connections.ensure_connection(self)
    expected_streams = self.streams_to_select()
    future_date = datetime.datetime.strftime(
        datetime.datetime.today() + datetime.timedelta(days=1), "%Y-%m-%dT00:00:00Z")

    state = {'bookmarks': dict()}
    replication_keys = self.expected_replication_keys()
    for stream in expected_streams:
        if self.is_incremental(stream):
            state['bookmarks'][stream] = dict()
            state['bookmarks'][stream]['field'] = next(iter(replication_keys[stream]))
            state['bookmarks'][stream]['last_record'] = future_date

    # set state for running sync mode
    menagerie.set_state(conn_id, state)

    runner.run_check_mode(self, conn_id)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.select_found_catalogs(conn_id, found_catalogs, only_streams=expected_streams)

    # run sync mode
    self.run_and_verify_sync(conn_id)

    # get the state after running sync mode
    latest_state = menagerie.get_state(conn_id)

    # verify that the state passed before the sync and the state we got after the sync are the same
    self.assertEqual(latest_state, state)
def test_run(self):
    # append some data to particular files to test the modified date
    self.append_to_files(["table_1_file.csv", "table_3_file.csv", "table_4_file.csv"])

    # sync
    conn_id = connections.ensure_connection(self)

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    self.perform_and_verify_table_and_field_selection(conn_id, found_catalogs)

    record_count_by_stream = self.run_and_verify_sync(conn_id)

    state = menagerie.get_state(conn_id)

    # checking if we got any data from sync
    self.assertGreater(sum(record_count_by_stream.values()), 0)

    # checking if data after sync is as expected
    for tap_stream_id in self.expected_first_sync_streams():
        self.assertEqual(self.expected_sync_row_counts()[tap_stream_id],
                         record_count_by_stream[tap_stream_id])

    # getting the maximum of the last modified dates from all files
    max_date = max(self.get_last_modified()).replace(microsecond=0)
    expected_date = max_date.timestamp()

    # getting bookmark
    actual_date = datetime.datetime.fromisoformat(
        state['bookmarks']['table']['modified_since']).timestamp()

    # checking that the maximum last modified date is set as the bookmark
    self.assertEqual(int(expected_date), int(actual_date))
def test_run(self):
    expected_streams = self.expected_streams()
    expected_replication_keys = self.expected_replication_keys()
    expected_replication_methods = self.expected_replication_method()

    ##########################################################################
    ### First Sync
    ##########################################################################

    self.start_date_1 = self.get_properties().get("start_date")
    self.start_date_2 = self.timedelta_formatted(self.start_date_1, days=3)

    self.start_date = self.start_date_1

    conn_id = connections.ensure_connection(self, original_properties=False)

    # Run in check mode
    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # Select only the expected streams' tables
    catalog_entries = [ce for ce in found_catalogs
                       if ce['tap_stream_id'] in expected_streams]
    self.perform_and_verify_table_and_field_selection(
        conn_id, catalog_entries, select_all_fields=True)

    # Run a sync job using orchestrator
    first_sync_record_count = self.run_and_verify_sync(conn_id)
    first_sync_records = runner.get_records_from_target_output()
    first_sync_bookmarks = menagerie.get_state(conn_id)

    ##########################################################################
    ### Update State Between Syncs
    ##########################################################################

    new_states = {'bookmarks': dict()}
    simulated_states = self.calculated_states_by_stream(first_sync_bookmarks)
    for stream, new_state in simulated_states.items():
        new_states['bookmarks'][stream] = new_state
    menagerie.set_state(conn_id, new_states)

    for stream in simulated_states.keys():
        for state_key, state_value in simulated_states[stream].items():
            if stream not in new_states['bookmarks']:
                new_states['bookmarks'][stream] = {}
            if state_key not in new_states['bookmarks'][stream]:
                new_states['bookmarks'][stream][state_key] = state_value

    ##########################################################################
    ### Second Sync
    ##########################################################################

    self.start_date = self.start_date_2

    # run check mode
    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # table and field selection
    test_catalogs_2_all_fields = [catalog for catalog in found_catalogs
                                  if catalog.get('tap_stream_id') in expected_streams]
    self.perform_and_verify_table_and_field_selection(
        conn_id, test_catalogs_2_all_fields, select_all_fields=True)

    second_sync_record_count = self.run_and_verify_sync(conn_id)
    second_sync_records = runner.get_records_from_target_output()
    second_sync_bookmarks = menagerie.get_state(conn_id)

    ##########################################################################
    ### Test By Stream
    ##########################################################################

    for stream in expected_streams:
        with self.subTest(stream=stream):
            expected_replication_method = expected_replication_methods[stream]

            first_bookmark_key_value = first_sync_bookmarks.get(
                'bookmarks', {stream: None}).get(stream)
            second_bookmark_key_value = second_sync_bookmarks.get(
                'bookmarks', {stream: None}).get(stream)

            # expected values
            first_sync_count = first_sync_record_count.get(stream, 0)
            second_sync_count = second_sync_record_count.get(stream, 0)

            # collect information for assertions from syncs 1 & 2 based on expected values
            first_sync_messages = [
                record.get('data')
                for record in first_sync_records.get(stream).get('messages')
                if record.get('action') == 'upsert'
            ]
            second_sync_messages = [
                record.get('data')
                for record in second_sync_records.get(stream).get('messages')
                if record.get('action') == 'upsert'
            ]

            if expected_replication_method == self.INCREMENTAL:
                replication_key = next(iter(expected_replication_keys[stream]))

                if stream != 'forms':
                    for form_key in self.get_forms():
                        first_bookmark_value = first_bookmark_key_value.get(
                            form_key, {}).get(replication_key)
                        second_bookmark_value = second_bookmark_key_value.get(
                            form_key, {}).get(replication_key)
                        first_bookmark_value_utc = self.convert_state_to_utc(first_bookmark_value)
                        second_bookmark_value_utc = self.convert_state_to_utc(second_bookmark_value)
                        simulated_bookmark_value = new_states['bookmarks'][stream][form_key]
                        simulated_bookmark_minus_lookback = simulated_bookmark_value

                        # Verify the first sync sets a bookmark of the expected form
                        self.assertIsNotNone(first_bookmark_key_value)

                        # Verify the second sync sets a bookmark of the expected form
                        self.assertIsNotNone(second_bookmark_key_value)

                        # Verify the second sync bookmark is greater than or equal to the first sync
                        # bookmark (new responses could be picked up for the form in the second sync)
                        self.assertGreaterEqual(second_bookmark_value, first_bookmark_value)

                        for record in second_sync_messages:
                            # Verify the second sync records respect the previous (simulated) bookmark value
                            replication_key_value = record.get(replication_key)
                            self.assertGreaterEqual(
                                replication_key_value,
                                simulated_bookmark_minus_lookback,
                                msg="Second sync records do not respect the previous bookmark.")

                            # Verify the second sync bookmark value is the max replication key value for a given stream
                            self.assertLessEqual(
                                replication_key_value,
                                second_bookmark_value_utc,
                                msg="Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced.")

                        for record in first_sync_messages:
                            # Verify the first sync bookmark value is the max replication key value for a given stream
                            replication_key_value = record.get(replication_key)
                            self.assertLessEqual(
                                replication_key_value,
                                first_bookmark_value_utc,
                                msg="First sync bookmark was set incorrectly, a record with a greater replication-key value was synced.")

                        # Verify the number of records in the 2nd sync is less than the first
                        self.assertLess(second_sync_count, first_sync_count)

                else:
                    # collect information specific to incremental streams from syncs 1 & 2
                    first_bookmark_value = first_bookmark_key_value.get(replication_key)
                    second_bookmark_value = second_bookmark_key_value.get(replication_key)
                    first_bookmark_value_utc = self.convert_state_to_utc(first_bookmark_value)
                    second_bookmark_value_utc = self.convert_state_to_utc(second_bookmark_value)
                    simulated_bookmark_value = new_states['bookmarks'][stream][replication_key]
                    simulated_bookmark_minus_lookback = simulated_bookmark_value

                    # Verify the first sync sets a bookmark of the expected form
                    self.assertIsNotNone(first_bookmark_key_value)

                    # Verify the second sync sets a bookmark of the expected form
                    self.assertIsNotNone(second_bookmark_key_value)

                    # Verify the second sync bookmark is greater than or equal to the first sync
                    # bookmark (new responses could be picked up for the form in the second sync)
                    self.assertGreaterEqual(second_bookmark_value, first_bookmark_value)

                    for record in second_sync_messages:
                        # Verify the second sync records respect the previous (simulated) bookmark value
                        replication_key_value = record.get(replication_key)
                        self.assertGreaterEqual(
                            replication_key_value,
                            simulated_bookmark_minus_lookback,
                            msg="Second sync records do not respect the previous bookmark.")

                        # Verify the second sync bookmark value is the max replication key value for a given stream
                        self.assertLessEqual(
                            replication_key_value,
                            second_bookmark_value_utc,
                            msg="Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced.")

                    for record in first_sync_messages:
                        # Verify the first sync bookmark value is the max replication key value for a given stream
                        replication_key_value = record.get(replication_key)
                        self.assertLessEqual(
                            replication_key_value,
                            first_bookmark_value_utc,
                            msg="First sync bookmark was set incorrectly, a record with a greater replication-key value was synced.")

                    # Verify the number of records in the 2nd sync is less than the first
                    self.assertLess(second_sync_count, first_sync_count)

            elif expected_replication_method == self.FULL_TABLE:
                # Verify the syncs do not set a bookmark for full table streams
                self.assertIsNone(first_bookmark_key_value)
                self.assertIsNone(second_bookmark_key_value)

                # Verify the number of records in the second sync is the same as the first
                self.assertEqual(second_sync_count, first_sync_count)

            else:
                raise NotImplementedError(
                    "INVALID EXPECTATIONS\t\tSTREAM: {} REPLICATION_METHOD: {}".format(
                        stream, expected_replication_method))

            # Verify at least 1 record was replicated in the second sync
            self.assertGreater(
                second_sync_count, 0,
                msg="We are not fully testing bookmarking for {}".format(stream))
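# --- Illustrative sketch (not part of the original test) ---------------------
# calculated_states_by_stream() is a base-class helper that is not shown here.
# As an assumption for illustration only: it takes the bookmarks from the first
# sync and moves each replication-key value back in time so the second sync has
# records to pick up. The flat bookmark shape and the timestamp format used
# below are hypothetical.

from datetime import datetime, timedelta

def calculated_states_by_stream(self, current_state):
    simulated = {}
    for stream, bookmark in current_state.get('bookmarks', {}).items():
        simulated[stream] = {}
        for key, value in bookmark.items():
            parsed = datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
            # push the bookmark back by one day (arbitrary, illustrative offset)
            simulated[stream][key] = (parsed - timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%SZ")
    return simulated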
def binlog_json_test(self):
    print("RUNNING {}\n\n".format(self.name()))

    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    expected_check_streams = {self.tap_stream_id()}
    expected_sync_streams = {self.table_name()}
    expected_pks = {self.table_name(): {'id'}}

    # verify the tap discovered the right streams
    found_catalogs = [
        catalog for catalog in menagerie.get_catalogs(conn_id)
        if catalog['tap_stream_id'] in expected_check_streams
    ]
    self.assertGreaterEqual(
        len(found_catalogs), 1,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = expected_check_streams.symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    test_catalog = found_catalogs[0]
    self.assertEqual(self.table_name(), test_catalog['stream_name'])
    print("discovered streams are correct")

    additional_md = [{"breadcrumb": [], "metadata": {'replication-method': 'LOG_BASED'}}]
    selected_metadata = connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog,
        menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
        additional_md)

    # clear state
    menagerie.set_state(conn_id, {})

    # run initial full table sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    records_by_stream = runner.get_records_from_target_output()
    self.maxDiff = None
    for stream, recs in records_by_stream.items():
        self.assertEqual(
            recs['schema'], expected_schemas[stream],
            msg="Persisted schema did not match expected schema for stream `{}`.".format(stream))

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, expected_sync_streams, expected_pks)
    self.assertEqual(record_count_by_stream, {self.table_name(): 1})

    records_for_stream = runner.get_records_from_target_output()[self.table_name()]
    messages_for_stream = records_for_stream['messages']
    message_actions = [rec['action'] for rec in messages_for_stream]

    self.assertEqual(message_actions, ['activate_version', 'upsert', 'activate_version'])

    # ensure some log_file and log_pos state was persisted
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id()]

    self.assertIsNotNone(bookmark['log_file'])
    self.assertIsNotNone(bookmark['log_pos'])

    expected_log_file = bookmark['log_file']
    expected_log_pos = bookmark['log_pos']

    # grab version, log_file and log_pos from state to check later
    expected_table_version = records_for_stream['table_version']
    self.assertEqual(expected_table_version, bookmark['version'])

    # check for expected records
    upsert_records = [m['data'] for m in messages_for_stream if m['action'] == 'upsert']
    self.assertEqual([expected_rec_1], upsert_records)

    # run binlog sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # check that the table version is unchanged
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id()]
    self.assertEqual(expected_table_version, bookmark['version'])

    # verify the persisted schema was correct
    records_by_stream = runner.get_records_from_target_output()
    for stream, recs in records_by_stream.items():
        self.assertEqual(
            recs['schema'], expected_schemas[stream],
            msg="Persisted schema did not match expected schema for stream `{}`.".format(stream))

    # record count should be empty as we did not persist anything to the gate
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, expected_sync_streams, expected_pks)
    self.assertEqual(record_count_by_stream, {})

    # insert a new huge row
    data = dict([('foooo%i' % i, 'baaaaar%i' % i) for i in range(2560)], literal=True)
    rec = {'id': 2, 'our_json': json.dumps(data)}

    with db_utils.get_db_connection(self.get_properties(), self.get_credentials()).cursor() as cur:
        self.insert_record(cur, rec)

    # run binlog sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # check that the version from state is unchanged
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id()]
    self.assertEqual(expected_table_version, bookmark['version'])

    # Either the log_file is the same but the log_pos has increased, or the log_file
    # has rotated and the numeric suffix has increased
    if expected_log_file == bookmark['log_file']:
        self.assertGreater(bookmark['log_pos'], expected_log_pos)
    else:
        expected_log_file_suffix = re.search(r'^.*\.(\d+)$', expected_log_file).groups()[0]
        updated_log_file_suffix = re.search(r'^.*\.(\d+)$', bookmark['log_file']).groups()[0]
        self.assertGreater(int(updated_log_file_suffix), int(expected_log_file_suffix))

    expected_log_file = bookmark['log_file']
    expected_log_pos = bookmark['log_pos']

    expected_rec_2 = copy.deepcopy(rec)

    # check for expected records
    records_for_stream = runner.get_records_from_target_output()[self.table_name()]
    messages_for_stream = records_for_stream['messages']
    message_actions = [rec['action'] for rec in messages_for_stream]

    self.assertEqual(message_actions, ['upsert'])

    upsert_records = [m['data'] for m in messages_for_stream if m['action'] == 'upsert']
    del upsert_records[0]['_sdc_deleted_at']

    expected_json = json.loads(expected_rec_2.get('our_json', '{}'))
    actual_json = json.loads(upsert_records[0].get('our_json', '{}'))

    self.assertTrue(len(actual_json.keys()) > 0)
    self.assertEqual(expected_json, actual_json)
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertEqual(
        len(found_catalogs), 1,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    chicken_catalog = found_catalogs[0]

    self.assertEqual('chicken_view', chicken_catalog['stream_name'])
    print("discovered streams are correct")

    print('checking discovered metadata for ROOT-CHICKEN_VIEW')
    md = menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id'])['metadata']

    self.assertEqual(
        {
            (): {
                'database-name': 'postgres',
                'is-view': True,
                'row-count': 0,
                'schema-name': 'public',
                'table-key-properties': []
            },
            ('properties', 'fk_id'): {
                'inclusion': 'available',
                'sql-datatype': 'bigint',
                'selected-by-default': True
            },
            ('properties', 'name'): {
                'inclusion': 'available',
                'sql-datatype': 'character varying',
                'selected-by-default': True
            },
            ('properties', 'age'): {
                'inclusion': 'available',
                'sql-datatype': 'integer',
                'selected-by-default': True
            },
            ('properties', 'size'): {
                'inclusion': 'available',
                'sql-datatype': 'character varying',
                'selected-by-default': True
            },
            ('properties', 'id'): {
                'inclusion': 'available',
                'sql-datatype': 'integer',
                'selected-by-default': True
            }
        },
        metadata.to_map(md))

    # 'id' selected as view-key-properties
    replication_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-key': None,
            "replication-method": "FULL_TABLE",
            'view-key-properties': ["id"]
        }
    }]

    connections.select_catalog_and_fields_via_metadata(
        conn_id, chicken_catalog,
        menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id']),
        replication_md)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(record_count_by_stream, {'chicken_view': 1})

    records_by_stream = runner.get_records_from_target_output()
    table_version = records_by_stream['chicken_view']['table_version']

    self.assertEqual(records_by_stream['chicken_view']['messages'][0]['action'], 'activate_version')
    self.assertEqual(records_by_stream['chicken_view']['messages'][1]['action'], 'upsert')
    self.assertEqual(records_by_stream['chicken_view']['messages'][2]['action'], 'activate_version')

    # verifications about individual records
    for stream, recs in records_by_stream.items():
        # verify the persisted schema was correct
        self.assertEqual(
            recs['schema'], expected_schemas[stream],
            msg="Persisted schema did not match expected schema for stream `{}`.".format(stream))

    actual_chicken_record = records_by_stream['chicken_view']['messages'][1]['data']

    expected_chicken_record = {
        'id': 1,
        'fk_id': 1,
        'name': 'fred',
        'age': 99,
        'size': 'big'
    }
    self.assertEqual(
        actual_chicken_record, expected_chicken_record,
        msg="Expected `chicken_view` upsert record data to be {}, but target output {}".format(
            expected_chicken_record, actual_chicken_record))

    print("records are correct")

    # verify state and bookmarks
    state = menagerie.get_state(conn_id)

    chicken_bookmark = state['bookmarks']['postgres-public-chicken_view']
    self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None")
    self.assertEqual(
        chicken_bookmark['version'], table_version,
        msg="expected bookmark for stream ROOT-CHICKEN to match version")
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertGreater(
        len(found_catalogs), 0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    for c in found_catalogs:
        catalog_props_to_check = ['stream_name', 'tap_stream_id']
        stream = c['stream_name']

        for prop in catalog_props_to_check:
            self.assertEqual(
                c[prop], expected_catalogs[stream][prop],
                msg="unexpected stream catalog property `{}` for stream `{}`: `{}` != `{}`".format(
                    prop, stream, expected_catalogs[stream][prop], c[prop]))

    print("discovered streams are correct")

    print('checking discovered metadata for tap_tester_mysql_0-incremental')
    incremental_catalog = [
        c for c in found_catalogs
        if c['tap_stream_id'] == 'tap_tester_mysql_0-incremental'
    ][0]
    md = menagerie.get_annotated_schema(conn_id, incremental_catalog['stream_id'])['metadata']

    incremental_stream_metadata = {
        'database-name': 'tap_tester_mysql_0',
        'row-count': 3,
        'is-view': False,
        'selected-by-default': False,
        'table-key-properties': ['c_pk']
    }
    self.assertEqual(
        sorted(md, key=lambda x: x['breadcrumb']),
        [{'breadcrumb': [], 'metadata': incremental_stream_metadata},
         {'breadcrumb': ['properties', 'c_dt'],
          'metadata': {'selected-by-default': True, 'sql-datatype': 'datetime'}},
         {'breadcrumb': ['properties', 'c_pk'],
          'metadata': {'selected-by-default': True, 'sql-datatype': 'int(11)'}},
         {'breadcrumb': ['properties', 'c_varchar'],
          'metadata': {'selected-by-default': True, 'sql-datatype': 'varchar(255)'}},
         {'breadcrumb': ['properties', 'c_varchar_to_deselect'],
          'metadata': {'selected-by-default': True, 'sql-datatype': 'varchar(255)'}}])

    print('checking discovered metadata for tap_tester_mysql_1-view')
    view_catalog = [c for c in found_catalogs if c['tap_stream_id'] == 'tap_tester_mysql_1-view'][0]
    view_catalog_key_properties_md = [{
        'breadcrumb': [],
        'metadata': {'view-key-properties': ['c_pk']}
    }]

    connections.set_non_discoverable_metadata(
        conn_id, view_catalog,
        menagerie.get_annotated_schema(conn_id, view_catalog['stream_id']),
        view_catalog_key_properties_md)
    md = menagerie.get_annotated_schema(conn_id, view_catalog['stream_id'])['metadata']

    view_stream_metadata = {
        'database-name': 'tap_tester_mysql_1',
        'is-view': True,
        'selected-by-default': False,
        'view-key-properties': ['c_pk']
    }
    self.assertEqual(
        sorted(md, key=lambda x: x['breadcrumb']),
        [{'breadcrumb': [], 'metadata': view_stream_metadata},
         {'breadcrumb': ['properties', 'c_pk'],
          'metadata': {'selected-by-default': True, 'sql-datatype': 'int(11)'}},
         {'breadcrumb': ['properties', 'c_varchar'],
          'metadata': {'selected-by-default': True, 'sql-datatype': 'varchar(255)'}}])

    # No selected-by-default MD for c_year because it is an unsupported type
    various_types_catalog = [
        c for c in found_catalogs
        if c['tap_stream_id'] == 'tap_tester_mysql_0-various_types'
    ][0]
    md = menagerie.get_annotated_schema(conn_id, various_types_catalog['stream_id'])['metadata']
    c_year_md = [x for x in md if x['breadcrumb'] == ['properties', 'c_year']]
    self.assertEqual(
        c_year_md,
        [{'breadcrumb': ['properties', 'c_year'],
          'metadata': {'selected-by-default': False, 'sql-datatype': 'year(4)'}}])

    # select everything except the simple_example stream
    catalogs_to_select = [
        c for c in found_catalogs
        if c['tap_stream_id'] != 'tap_tester_mysql_0-simple_example'
    ]

    for a_catalog in catalogs_to_select:
        additional_md = []
        unselected_fields = []
        if a_catalog['tap_stream_id'] == 'tap_tester_mysql_0-incremental':
            additional_md = [{
                "breadcrumb": [],
                "metadata": {'replication-key': 'c_dt', 'replication-method': 'INCREMENTAL'}
            }]
            unselected_fields = ['c_varchar_to_deselect']
        elif a_catalog['tap_stream_id'] == 'tap_tester_mysql_1-view':
            additional_md = [{
                "breadcrumb": [],
                "metadata": {'view-key-properties': ['c_pk'], 'replication-method': 'FULL_TABLE'}
            }]
        else:
            additional_md = [{
                "breadcrumb": [],
                "metadata": {'replication-method': 'FULL_TABLE'}
            }]

        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, a_catalog,
            menagerie.get_annotated_schema(conn_id, a_catalog['stream_id']),
            additional_md, unselected_fields)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = sum(record_count_by_stream.values())
    expected_row_count = 8  # {'my_isam': 1, 'various_types': 3, 'incremental': 3, 'view': 1}
    self.assertEqual(
        replicated_row_count, expected_row_count,
        msg="failed to replicate correct number of rows: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    records_by_stream = runner.get_records_from_target_output()

    # verifications about individual records
    for stream, recs in records_by_stream.items():
        # verify that activate_version messages were sent in the proper position
        self.assertEqual(
            recs['messages'][0]['action'], 'activate_version',
            msg="Expected first message sent for stream `{}` to have action `activate_version`".format(stream))

        # verify the persisted schema was correct
        self.assertEqual(
            recs['schema'], expected_schemas[stream],
            msg="Persisted schema did not match expected schema for stream `{}`.".format(stream))

    # verify that the target output the proper numeric and date representations
    expected_various_types_records = [{
        'c_time': '1970-01-01T12:34:56.000000Z',
        'c_mediumint': 8388607,
        'c_smallint': 32767,
        'c_tinyint': 127,
        'c_date': '2017-09-13T00:00:00.000000Z',
        'c_bigint': 9223372036854775807,
        'c_decimal': -1,
        'c_int': 2147483647,
        'c_bit': True,
        'c_decimal_2': Decimal('123456789.0'),
        'c_pk': 1,
        'c_double': Decimal("1.234"),
        'c_float': Decimal("1.234"),
        'c_decimal_2_unsigned': Decimal("1.23"),
        'c_tinyint_1': True
    }, {
        'c_time': '1970-01-01T12:34:57.000000Z',
        'c_mediumint': -8388608,
        'c_smallint': -32768,
        'c_tinyint': -128,
        'c_date': '2017-09-14T00:00:00.000000Z',
        'c_bigint': -9223372036854775808,
        'c_decimal': 0,
        'c_int': -2147483648,
        'c_bit': False,
        'c_decimal_2': Decimal("123456790.0"),
        'c_pk': 2,
        'c_double': Decimal("2.234"),
        'c_float': Decimal("2.234"),
        'c_decimal_2_unsigned': Decimal("0.23"),
        'c_tinyint_1': False
    }, {
        'c_time': '1970-01-01T12:34:57.000000Z',
        'c_mediumint': -8388608,
        'c_smallint': -32768,
        'c_tinyint': -128,
        'c_date': '2017-09-14T00:00:00.000000Z',
        'c_bigint': -9223372036854775808,
        'c_decimal': 0,
        'c_int': -2147483648,
        'c_bit': None,
        'c_decimal_2': Decimal("123456790.0"),
        'c_pk': 3,
        'c_double': Decimal("2.234"),
        'c_float': Decimal("2.234"),
        'c_decimal_2_unsigned': Decimal("0.23"),
        'c_tinyint_1': None
    }]

    actual_various_types_records = [
        r['data'] for r in records_by_stream['various_types']['messages'][1:4]
    ]

    self.assertEqual(
        actual_various_types_records, expected_various_types_records,
        msg="Expected `various_types` upsert record data to be {}, but target output {}".format(
            expected_various_types_records, actual_various_types_records))

    # verify that the deselected property was not output
    expected_incremental_record = {
        'c_pk': 1,
        'c_dt': '2017-01-01T00:00:00.000000Z',
        'c_varchar': 'a'
    }

    actual_incremental_record = records_by_stream['incremental']['messages'][1]['data']

    self.assertEqual(
        actual_incremental_record, expected_incremental_record,
        msg="Expected first `incremental` upsert record data to be {}, but target output {}".format(
            expected_incremental_record, actual_incremental_record))

    print("records are correct")

    # verify state and bookmarks
    state = menagerie.get_state(conn_id)
    bookmarks = state['bookmarks']

    self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None")

    for k, v in bookmarks.items():
        if k == 'tap_tester_mysql_0-incremental':
            self.assertIsNotNone(
                v['version'],
                msg="expected bookmark for stream `{}` to have a version set".format(k))
            self.assertEqual(
                v['replication_key_value'], '2017-01-01T00:00:02.000000Z',
                msg="incorrect replication_key_value in bookmark for stream `{}`".format(k))
            self.assertEqual(
                v['replication_key'], 'c_dt',
                msg="incorrect replication_key specified in bookmark for stream `{}`".format(k))
        else:
            self.assertFalse(
                'version' in v,
                msg="expected bookmark for stream `{}` to not have a version key".format(k))
            self.assertTrue(
                'initial_full_table_complete' in v,
                msg="expected bookmark for stream `{}` to have a true initial_full_table_complete key".format(k))

    print("state and bookmarks are correct")

    incremental_table_initial_table_version = bookmarks['tap_tester_mysql_0-incremental']['version']

    #----------------------------------------------------------------------
    # invoke the sync job again after some modifications
    #----------------------------------------------------------------------

    print("adding a column to an existing table in the source db")
    connection = db_utils.get_db_connection(self.get_properties(), self.get_credentials())

    with connection.cursor() as cursor:
        add_column_sql = '''
            ALTER TABLE tap_tester_mysql_0.incremental
              ADD COLUMN favorite_number INTEGER;
            INSERT INTO tap_tester_mysql_0.incremental VALUES
              (4, '4', '2017-01-01 00:00:03', 'yeehaw', 999);
        '''
        cursor.execute(add_column_sql)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertGreater(
        len(found_catalogs), 0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = sum(record_count_by_stream.values())
    expected_row_count = 7  # {'my_isam': 1, 'various_types': 3, 'incremental': 2, 'view': 1}
    self.assertEqual(
        replicated_row_count, expected_row_count,
        msg="failed to replicate correct number of rows: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    records_by_stream = runner.get_records_from_target_output()

    expected_schema_of_new_column = {
        'maximum': 2147483647,
        'selected': True,
        'inclusion': 'available',
        'type': ['null', 'integer'],
        'minimum': -2147483648
    }

    # verifications about individual records
    for stream, recs in records_by_stream.items():
        # verify that activate_version messages were sent in the proper position
        if stream == 'incremental':
            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'], 'activate_version',
                msg="Expected first message sent for stream `{}` to have action `activate_version`".format(stream))
            expected_schema_of_new_column = {
                'maximum': 2147483647,
                'inclusion': 'available',
                'type': ['null', 'integer'],
                'minimum': -2147483648
            }
            self.assertEqual(
                records_by_stream[stream]['schema']['properties']['favorite_number'],
                expected_schema_of_new_column,
                msg="Expected newly-added column to be present in schema for stream `{}`, but it was not.".format(stream))
        else:
            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'], 'upsert',
                msg="Expected first message sent for stream `{}` to have action `upsert`".format(stream))
            self.assertEqual(
                records_by_stream[stream]['messages'][-1]['action'], 'activate_version',
                msg="Expected last message sent for stream `{}` to have action `activate_version`".format(stream))

    state = menagerie.get_state(conn_id)
    bookmarks = state['bookmarks']

    self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None")

    for k, v in bookmarks.items():
        if k == 'tap_tester_mysql_0-incremental':
            self.assertIsNotNone(
                v['version'],
                msg="expected bookmark for stream `{}` to have a version set".format(k))
            self.assertEqual(
                v['replication_key_value'], '2017-01-01T00:00:03.000000Z',
                msg="incorrect replication_key_value in bookmark for stream `{}`".format(k))
            self.assertEqual(
                v['replication_key'], 'c_dt',
                msg="incorrect replication_key specified in bookmark for stream `{}`".format(k))
        else:
            self.assertFalse(
                'version' in v,
                msg="expected bookmark for stream `{}` to not have a version key".format(k))
            self.assertTrue(
                'initial_full_table_complete' in v,
                msg="expected bookmark for stream `{}` to have a true initial_full_table_complete key".format(k))

    print("state and bookmarks are correct")

    # verify the incremental table_version didn't change
    incremental_table_new_table_version = bookmarks['tap_tester_mysql_0-incremental']['version']
    self.assertEqual(
        incremental_table_initial_table_version,
        incremental_table_new_table_version,
        msg="Expected incrementally-replicated table's table_version to remain unchanged over multiple invocations.")
def test_run(self): conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify the tap discovered the right streams found_catalogs = [ fc for fc in menagerie.get_catalogs(conn_id) if fc['tap_stream_id'] in self.expected_check_streams() ] self.assertGreaterEqual( len(found_catalogs), 2, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) # verify that persisted streams have the correct properties test_catalog_cows = list( filter( lambda c: c['stream_name'] == 'postgres_logical_replication_test_cows', found_catalogs))[0] self.assertEqual('postgres_logical_replication_test_cows', test_catalog_cows['stream_name']) test_catalog_chickens = list( filter( lambda c: c['stream_name' ] == 'postgres_logical_replication_test_chickens', found_catalogs))[0] self.assertEqual('postgres_logical_replication_test_chickens', test_catalog_chickens['stream_name']) print("discovered streams are correct") additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] connections.select_catalog_and_fields_via_metadata( conn_id, test_catalog_cows, menagerie.get_annotated_schema(conn_id, test_catalog_cows['stream_id']), additional_md) connections.select_catalog_and_fields_via_metadata( conn_id, test_catalog_chickens, menagerie.get_annotated_schema(conn_id, test_catalog_chickens['stream_id']), additional_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) self.assertEqual( record_count_by_stream, { 'postgres_logical_replication_test_cows': 1, 'postgres_logical_replication_test_chickens': 1 }) records_by_stream = runner.get_records_from_target_output() table_version_cows = records_by_stream[ 'postgres_logical_replication_test_cows']['table_version'] self.assertEqual( records_by_stream['postgres_logical_replication_test_cows'] ['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream['postgres_logical_replication_test_cows'] ['messages'][1]['action'], 'upsert') self.assertEqual( records_by_stream['postgres_logical_replication_test_cows'] ['messages'][2]['action'], 'activate_version') table_version_chickens = records_by_stream[ 'postgres_logical_replication_test_chickens']['table_version'] self.assertEqual( records_by_stream['postgres_logical_replication_test_chickens'] ['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream['postgres_logical_replication_test_chickens'] ['messages'][1]['action'], 'upsert') self.assertEqual( records_by_stream['postgres_logical_replication_test_chickens'] ['messages'][2]['action'], 'activate_version') # verify state and bookmarks state = menagerie.get_state(conn_id) self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") bookmark_cows = 
state['bookmarks'][ 'dev-public-postgres_logical_replication_test_cows'] self.assertIsNotNone(bookmark_cows['lsn'], msg="expected bookmark for stream to have an lsn") lsn_cows_1 = bookmark_cows['lsn'] self.assertEqual(bookmark_cows['version'], table_version_cows, msg="expected bookmark for stream to match version") bookmark_chickens = state['bookmarks'][ 'dev-public-postgres_logical_replication_test_chickens'] self.assertIsNotNone(bookmark_chickens['lsn'], msg="expected bookmark for stream to have an lsn") lsn_chickens_1 = bookmark_chickens['lsn'] self.assertEqual(bookmark_chickens['version'], table_version_chickens, msg="expected bookmark for stream to match version") #---------------------------------------------------------------------- # invoke the sync job again after adding records #---------------------------------------------------------------------- print("inserting 2 more cows and 2 more chickens") with db_utils.get_test_connection('dev') as conn: conn.autocommit = True with conn.cursor() as cur: # insert another cow self.cows_rec_2 = {'cow_name': "betty cow", 'cow_age': 21} insert_record(cur, test_table_name_cows, self.cows_rec_2) # update that cow's expected values self.cows_rec_2['id'] = 2 self.cows_rec_2['_sdc_deleted_at'] = None # insert another chicken self.chicken_rec_2 = { 'chicken_name': "burt chicken", 'chicken_age': 14 } insert_record(cur, test_table_name_chickens, self.chicken_rec_2) # update that chicken's expected values self.chicken_rec_2['id'] = 2 self.chicken_rec_2['_sdc_deleted_at'] = None # and repeat... self.cows_rec_3 = {'cow_name': "cindy cow", 'cow_age': 10} insert_record(cur, test_table_name_cows, self.cows_rec_3) self.cows_rec_3['id'] = 3 self.cows_rec_3['_sdc_deleted_at'] = None self.chicken_rec_3 = { 'chicken_name': "carl chicken", 'chicken_age': 4 } insert_record(cur, test_table_name_chickens, self.chicken_rec_3) self.chicken_rec_3['id'] = 3 self.chicken_rec_3['_sdc_deleted_at'] = None sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) self.assertEqual( record_count_by_stream, { 'postgres_logical_replication_test_cows': 2, 'postgres_logical_replication_test_chickens': 2 }) records_by_stream = runner.get_records_from_target_output() chicken_messages = records_by_stream[ "postgres_logical_replication_test_chickens"]['messages'] cow_messages = records_by_stream[ "postgres_logical_replication_test_cows"]['messages'] self.assertDictEqual(self.cows_rec_2, cow_messages[0]['data']) self.assertDictEqual(self.chicken_rec_2, chicken_messages[0]['data']) self.assertDictEqual(self.cows_rec_3, cow_messages[1]['data']) self.assertDictEqual(self.chicken_rec_3, chicken_messages[1]['data']) print("inserted records are correct") state = menagerie.get_state(conn_id) self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") cows_bookmark = state['bookmarks'][ 'dev-public-postgres_logical_replication_test_cows'] self.assertIsNotNone( cows_bookmark['lsn'], msg= "expected bookmark for stream postgres_logical_replication_test_cows to have an lsn" ) lsn_cows_2 = cows_bookmark['lsn'] self.assertGreaterEqual(lsn_cows_2, lsn_cows_1) chickens_bookmark = state['bookmarks'][ 'dev-public-postgres_logical_replication_test_chickens'] self.assertIsNotNone( 
chickens_bookmark['lsn'], msg= "expected bookmark for stream postgres_logical_replication_test_chickens to have an lsn" ) lsn_chickens_2 = chickens_bookmark['lsn'] self.assertGreaterEqual(lsn_chickens_2, lsn_chickens_1) # table_version does NOT change self.assertEqual( chickens_bookmark['version'], table_version_chickens, msg= "expected bookmark for stream postgres_logical_replication_test_chickens to match version" ) # table_version does NOT change self.assertEqual( cows_bookmark['version'], table_version_cows, msg= "expected bookmark for stream postgres_logical_replication_test_cows to match version" )
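# ----------------------------------------------------------------------
# NOTE: standalone sketch (not part of the test above) of the LSN bookmark
# checks performed between logical-replication syncs: each stream's lsn may
# only move forward and its table version must stay unchanged. The state
# dicts below are hypothetical examples of the shape menagerie returns.
# ----------------------------------------------------------------------
def verify_lsn_progression(state_before, state_after):
    """Check that lsn is non-decreasing and version is unchanged for every bookmarked stream."""
    for stream, bookmark_after in state_after['bookmarks'].items():
        bookmark_before = state_before['bookmarks'][stream]
        assert bookmark_after['lsn'] >= bookmark_before['lsn'], \
            "lsn went backwards for stream {}".format(stream)
        assert bookmark_after['version'] == bookmark_before['version'], \
            "table_version changed for stream {}".format(stream)

# example usage with made-up lsn/version values
state_sync_1 = {'bookmarks': {'dev-public-postgres_logical_replication_test_cows': {'lsn': 100, 'version': 7}}}
state_sync_2 = {'bookmarks': {'dev-public-postgres_logical_replication_test_cows': {'lsn': 150, 'version': 7}}}
verify_lsn_progression(state_sync_1, state_sync_2)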
def bookmarks_test(self, testable_streams): """ Verify for each stream that you can do a sync which records bookmarks. Verify that the bookmark is the max value sent to the target for the `date` PK field Verify that the 2nd sync respects the bookmark Verify that all data of the 2nd sync is >= the bookmark from the first sync Verify that the number of records in the 2nd sync is less than in the first Verify inclusivity of bookmarks PREREQUISITE For EACH stream that is incrementally replicated there are multiple rows of data with different values for the replication key """ print("\n\nRUNNING {}\n\n".format(self.name())) # Ensure tested streams have existing records expected_records_first_sync = self.create_test_data(testable_streams, self.START_DATE, force_create_records=True) # Instantiate connection with default start conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # Select all testable streams and no fields within streams found_catalogs = menagerie.get_catalogs(conn_id) streams_to_select = testable_streams our_catalogs = [catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in streams_to_select] self.select_all_streams_and_fields(conn_id, our_catalogs) # Run a sync job using orchestrator first_sync_record_count = self.run_sync(conn_id) # verify that the sync only sent records to the target for selected streams (catalogs) self.assertEqual(streams_to_select, set(first_sync_record_count.keys()), msg="Expect first_sync_record_count keys {} to equal testable streams {}," " first_sync_record_count was {}".format( first_sync_record_count.keys(), streams_to_select, first_sync_record_count)) first_sync_state = menagerie.get_state(conn_id) # Get the set of records from a first sync first_sync_records = runner.get_records_from_target_output() # Add data before next sync via insert and update, and set expectations created_records = {x: [] for x in self.expected_streams()} updated_records = {x: [] for x in self.expected_streams()} expected_records_second_sync = {x: [] for x in self.expected_streams()} # We should expect any records with rep-keys equal to the bookmark from the first sync to be returned by the second if 'orders' in testable_streams: for order in first_sync_records['orders']['messages']: if order['data']['updated_at'] == first_sync_state.get('bookmarks', {}).get('orders', {}).get('updated_at'): expected_records_second_sync['orders'].append(order['data']) streams_to_create_records = list(testable_streams) if 'payments' in testable_streams: streams_to_create_records.remove('payments') streams_to_create_records.append('payments') for stream in streams_to_create_records: new_records = [] if stream == 'refunds': # a CREATE for refunds is equivalent to an UPDATE for payments # a CREATE for refunds will result in a new payments object (new_refund, payment) = self.client.create_refund(start_date=self.START_DATE) new_records = new_refund created_records['payments'].append(payment) expected_records_second_sync['payments'].append(payment) else: # TEST_ISSUE_1 | get the time that the customer record was created if stream == 'customers': customers_create_time = perf_counter() # Create new_records = self.client.create(stream, start_date=self.START_DATE) assert new_records, "Failed to create a {} record".format(stream) assert len(new_records) == 1, "Created 
too many {} records: {}".format(stream, len(new_records)) expected_records_second_sync[stream] += new_records created_records[stream] += new_records for stream in testable_streams.difference(self.cannot_update_streams()): first_rec = None # Update all streams (but save payments for last) if stream == 'payments': continue if stream == 'orders': # Use the first available order that is still 'OPEN' for message in first_sync_records.get(stream).get('messages'): if message.get('data')['state'] not in ['COMPLETED', 'CANCELED']: first_rec = message.get('data') break if not first_rec: raise RuntimeError("Unable to find any orders with state other than COMPLETED") elif stream == 'roles': # Use the first available role that has limited permissions (where is_owner = False) for message in first_sync_records.get(stream).get('messages'): data = message.get('data') if not data['is_owner'] and 'role' in data['name']: first_rec = message.get('data') break if not first_rec: raise RuntimeError("Unable to find any roles with limited permissions (is_owner = False)") else: # By default we want the last created record last_message = first_sync_records.get(stream).get('messages')[-1] if last_message.get('data') and not last_message.get('data').get('is_deleted'): first_rec = last_message.get('data') else: # If the last record happens to be deleted grab the first available one that wasn't LOGGER.warning("The last created record for %s was deleted.", stream) for message in first_sync_records.get(stream).get('messages'): data = message.get('data') if not data.get('is_deleted'): first_rec = message.get('data') break if not first_rec: raise RuntimeError("Cannot find any {} records that were not deleted.".format(stream)) if stream == 'inventories': # This is an append-only stream, we will make multiple 'updates' first_rec_catalog_obj_id = first_rec.get('catalog_object_id') first_rec_location_id = first_rec.get('location_id') # IN_STOCK -> SOLD [quantity -1] updated_record = self.client.create_specific_inventory_adjustment( first_rec_catalog_obj_id, first_rec_location_id, from_state='IN_STOCK', to_state='SOLD', quantity='1.0') assert len(updated_record) == 1, "Failed to update the {} records as intended".format(stream) # UNLINKED_RETURN -> IN_STOCK [quantity +2] updated_record = self.client.create_specific_inventory_adjustment( first_rec_catalog_obj_id, first_rec_location_id, from_state='UNLINKED_RETURN', to_state='IN_STOCK', quantity='2.0') assert len(updated_record) == 1, "Failed to update the {} records as intended".format(stream) # NONE -> IN_STOCK [quantity +1] updated_record = self.client.create_specific_inventory_adjustment( first_rec_catalog_obj_id, first_rec_location_id, from_state='NONE', to_state='IN_STOCK', quantity='1.0') assert len(updated_record) == 1, "Failed to update the {} records as intended".format(stream) # IN_STOCK -> WASTE [quantity +1] updated_record = self.client.create_specific_inventory_adjustment( first_rec_catalog_obj_id, first_rec_location_id, from_state='IN_STOCK', to_state='WASTE', quantity='1.0') # creates 2 records assert len(updated_record) == 2, "Failed to update the {} records as intended".format(stream) else: first_rec_id = first_rec.get('id') first_rec_version = first_rec.get('version') if stream == 'customers': # TEST_ISSUE_1 get the time that the customer record was updated customers_update_time = perf_counter() updated_record = self.client.update(stream, obj_id=first_rec_id, version=first_rec_version, obj=first_rec, start_date=self.START_DATE) assert updated_record, "Failed to update a {} 
record".format(stream) assert len(updated_record) == 1, "Updated too many {} records".format(stream) expected_records_second_sync[stream] += updated_record updated_records[stream] += updated_record if 'payments' in testable_streams: # Update a Payment AFTER all other streams have been updated # Payments which have already been completed/cancelled can't be updated again, so find the first APPROVED payment first_rec = dict() for message in first_sync_records.get('payments').get('messages'): if message.get('data')['status'] == 'APPROVED': first_rec = message.get('data') break if not first_rec: raise RuntimeError("Unable to find any payment with status APPROVED") first_rec_id = first_rec.get('id') first_rec_version = first_rec.get('version') updated_record = self.client.update('payments', first_rec_id, first_rec_version) assert updated_record, "Failed to update a {} record".format('payments') assert len(updated_record) == 1, "Updated too many {} records".format('payments') expected_records_second_sync['payments'] += updated_record updated_records['payments'] += updated_record # adjust expectations for full table streams to include the expected records from sync 1 for stream in self.expected_full_table_streams(): if stream == 'inventories': primary_keys = self.makeshift_primary_keys().get(stream) else: primary_keys = list(self.expected_primary_keys().get(stream)) updated_pk_values = {tuple([record.get(pk) for pk in primary_keys]) for record in updated_records[stream]} for record in expected_records_first_sync.get(stream, []): record_pk_values = tuple([record.get(pk) for pk in primary_keys]) if record_pk_values in updated_pk_values: continue # do not add the original of the updated record expected_records_second_sync[stream].append(record) # Adjust expectations for datetime format for record_desc, records in [("created", created_records), ("updated", updated_records), ("2nd sync expected records", expected_records_second_sync)]: print("Adjusting expectations for {} records".format(record_desc)) for stream, expected_records in records.items(): print("\tadjusting for stream: {}".format(stream)) self.modify_expected_records(expected_records) # ensure validity of expected_records_second_sync for stream in testable_streams: if stream in self.expected_incremental_streams(): if stream in self.cannot_update_streams(): self.assertEqual(1, len(expected_records_second_sync.get(stream)), msg="Expectations are invalid for incremental stream {}".format(stream)) elif stream == 'orders': # ORDERS are returned inclusive on the datetime queried self.assertEqual(3, len(expected_records_second_sync.get(stream)), msg="Expectations are invalid for incremental stream {}".format(stream)) else: # Most streams will have 2 records from the Update and Insert self.assertEqual(2, len(expected_records_second_sync.get(stream)), msg="Expectations are invalid for incremental stream {}".format(stream)) if stream in self.expected_full_table_streams(): if stream == 'inventories': # Typically changes to an inventories object will replace an IN_STOCK record with two records # 1 IN_STOCK -> 1 IN_STOCK, 1 WASTE # if a given combination of {'catalog_object_id', 'location_id', 'state'} already has a # WASTE record then both records will be replaced # 1 IN_STOCK, 1 WASTE -> 1 IN_STOCK, 1 WASTE self.assertLessEqual( len(expected_records_second_sync.get(stream)), len(expected_records_first_sync.get(stream)) + len(created_records[stream]) + 1, msg="Expectations are invalid for full table stream {}".format(stream)) self.assertGreaterEqual( 
len(expected_records_second_sync.get(stream)), len(expected_records_first_sync.get(stream)) + len(created_records[stream]), msg="Expectations are invalid for full table stream {}".format(stream)) continue self.assertEqual(len(expected_records_second_sync.get(stream)), len(expected_records_first_sync.get(stream)) + len(created_records[stream]), msg="Expectations are invalid for full table stream {}".format(stream)) # Run a second sync job using orchestrator second_sync_time_start = perf_counter() # TEST_ISSUE_1 get the time that the 2nd sync starts second_sync_record_count = self.run_sync(conn_id) second_sync_time_end = perf_counter() # TEST_ISSUE_1 get the time that the 2nd sync ends # Get the set of records from a second sync second_sync_records = runner.get_records_from_target_output() second_sync_state = menagerie.get_state(conn_id) # BUG_1 | https://stitchdata.atlassian.net/browse/SRCE-4975 PARENT_FIELD_MISSING_SUBFIELDS = {'payments': {'card_details'}} # BUG_2 | https://stitchdata.atlassian.net/browse/SRCE-5143 MISSING_FROM_SCHEMA = {'payments': {'capabilities', 'version_token', 'approved_money'}} # Loop first_sync_records and compare against second_sync_records for stream in testable_streams: with self.subTest(stream=stream): second_sync_data = [record.get("data") for record in second_sync_records.get(stream, {}).get("messages", [])] stream_replication_keys = self.expected_replication_keys() stream_primary_keys = self.expected_primary_keys() # TESTING INCREMENTAL STREAMS if stream in self.expected_incremental_streams(): replication_keys = stream_replication_keys.get(stream) # Verify both syncs write / keep the same bookmark self.assertEqual(set(first_sync_state.get('bookmarks', {}).keys()), set(second_sync_state.get('bookmarks', {}).keys())) # verify that there is more than 1 record of data - setup necessary self.assertGreater(first_sync_record_count.get(stream, 0), 1, msg="Data isn't set up to be able to test full sync") # verify that you get less/same amount of data on the 2nd sync self.assertGreater( first_sync_record_count.get(stream, 0), second_sync_record_count.get(stream, 0), msg="first sync didn't have more records, bookmark usage not verified") for replication_key in replication_keys: # Verify second sync's bookmarks move past the first sync's self.assertGreater( second_sync_state.get('bookmarks', {stream: {}}).get( stream, {replication_key: -1}).get(replication_key), first_sync_state.get('bookmarks', {stream: {}}).get( stream, {replication_key: -1}).get(replication_key) ) # Verify that all data of the 2nd sync is >= the bookmark from the first sync first_sync_bookmark = first_sync_state.get('bookmarks').get(stream).get(replication_key) for record in second_sync_data: date_value = record[replication_key] self.assertGreaterEqual(date_value, first_sync_bookmark, msg="A 2nd sync record has a replication-key that is less than or equal to the 1st sync bookmark.") elif stream in self.expected_full_table_streams(): # TESTING FULL TABLE STREAMS # Verify no bookmarks are present first_state = first_sync_state.get('bookmarks', {}).get(stream) self.assertEqual({}, first_state, msg="Unexpected state for {}\n".format(stream) + \ "\tState: {}\n".format(first_sync_state) + \ "\tBookmark: {}".format(first_state)) second_state = second_sync_state.get('bookmarks', {}).get(stream) self.assertEqual({}, second_state, msg="Unexpected state for {}\n".format(stream) + \ "\tState: {}\n".format(second_sync_state) + \ "\tBookmark: {}".format(second_state)) if stream == 'customers' and 
len(second_sync_data) == 0: # BUG https://stitchdata.atlassian.net/browse/SRCE-4639 # NOTE: Square sometimes lags on the customers stream, so we'll give them one more shot # before we say this stream fails in catching the create and update. This was tested # manually while syncing all streams and while syncing only the customers stream # and we were unable to produce a scenario in which a subsequent sync failed to pick # up the create and update after failing to catch them in the 2nd sync. # TEST_ISSUE_1 | Log the time diffs for record created, updated, second sync ran LOGGER.warning( 'Second sync missed %s records that were just created and updated.\n' + 'Time between record create and: \n\tsync start = %s\tsync end: %s\n' + 'Time between record update and: \n\tsync start = %s\tsync end: %s', stream, second_sync_time_start - customers_create_time, second_sync_time_end - customers_create_time, second_sync_time_start - customers_update_time, second_sync_time_end - customers_update_time, ) # TODO TIMING | get the time the third sync ran # Run another sync since square can't keep up third_sync_time_start = perf_counter() # TEST_ISSUE_1 get the time that the 3rd sync starts _ = self.run_sync(conn_id) third_sync_time_end = perf_counter() # TEST_ISSUE_1 get the time that the 3rd sync ends # Get the set of records from a third sync and use it in place of the 2nd sync's data third_sync_records = runner.get_records_from_target_output() second_sync_data = [record.get("data") for record in third_sync_records.get(stream, {}).get("messages", [])] else: # TEST_ISSUE_1 third_sync_time_start = perf_counter() third_sync_time_end = perf_counter() # TESTING APPLICABLE TO ALL STREAMS # Verify that the expected records are replicated in the 2nd sync # For incremental streams we should see at least 2 records (a new record and an updated record) # but we may see more as the bookmark is inclusive. 
# For full table streams we should see 1 more record than the first sync expected_records = expected_records_second_sync.get(stream) if stream == 'inventories': primary_keys = self.makeshift_primary_keys().get(stream) else: primary_keys = stream_primary_keys.get(stream) updated_pk_values = {tuple([record.get(pk) for pk in primary_keys]) for record in updated_records[stream]} if stream == 'customers' and len(second_sync_data) != len(expected_records): # TEST_ISSUE_1 # TEST_ISSUE_1 | Log the time diffs for record created, updated, third sync ran LOGGER.warning( 'Third sync missed %s records that were just created and updated.\n' + 'Time between record create and: \n\tsync start = %s\tsync end: %s\n' + 'Time between record update and: \n\tsync start = %s\tsync end: %s', stream, third_sync_time_start - customers_create_time, third_sync_time_end - customers_create_time, third_sync_time_start - customers_update_time, third_sync_time_end - customers_update_time, ) self.assertLessEqual( len(expected_records), len(second_sync_data), msg="Expected number of records are not less than or equal to actual for 2nd sync.\n" + "Expected: {}\nActual: {}".format(len(expected_records), len(second_sync_data)) ) if (len(second_sync_data) - len(expected_records)) > 0: LOGGER.warning('Second sync replicated %s records more than our create and update for %s', len(second_sync_data), stream) if not primary_keys: raise NotImplementedError("PKs are needed for comparing records") # Verify that the inserted records are replicated by the 2nd sync and match our expectations for created_record in created_records.get(stream): record_pk_values = tuple([created_record.get(pk) for pk in primary_keys]) sync_records = [sync_record for sync_record in second_sync_data if tuple([sync_record.get(pk) for pk in primary_keys]) == record_pk_values] self.assertTrue(len(sync_records), msg="An inserted record is missing from our sync: \nRECORD: {}".format(created_record)) self.assertEqual(1, len(sync_records), msg="A duplicate record was found in the sync for {}\nRECORD: {}.".format(stream, sync_records)) sync_record = sync_records[0] # Test Workaround Start ############################## if stream == 'payments': off_keys = MISSING_FROM_SCHEMA[stream] # BUG_2 self.assertParentKeysEqualWithOffKeys( created_record, sync_record, off_keys ) off_keys = PARENT_FIELD_MISSING_SUBFIELDS[stream] | MISSING_FROM_SCHEMA[stream] # BUG_1 | # BUG_2 self.assertDictEqualWithOffKeys( created_record, sync_record, off_keys ) # Test Workaround End ############################## else: self.assertRecordsEqual(stream, created_record, sync_record) # Verify that the updated records are replicated by the 2nd sync and match our expectations for updated_record in updated_records.get(stream): if stream not in self.cannot_update_streams(): record_pk_values = tuple([updated_record.get(pk) for pk in primary_keys]) sync_records = [sync_record for sync_record in second_sync_data if tuple([sync_record.get(pk) for pk in primary_keys]) == record_pk_values] if stream != 'modifier_lists': self.assertTrue(len(sync_records), msg="An updated record is missing from our sync: \nRECORD: {}".format(updated_record)) self.assertEqual(1, len(sync_records), msg="A duplicate record was found in the sync for {}\nRECORDS: {}.".format(stream, sync_records)) sync_record = sync_records[0] # Test Workaround Start ############################## if stream == 'payments': off_keys = MISSING_FROM_SCHEMA[stream] # BUG_2 self.assertParentKeysEqualWithOffKeys( updated_record, sync_record, off_keys ) 
off_keys = PARENT_FIELD_MISSING_SUBFIELDS[stream] | MISSING_FROM_SCHEMA[stream] # BUG_1 | # BUG_2 self.assertDictEqualWithOffKeys( updated_record, sync_record, off_keys ) # Test Workaround End ############################## else: self.assertRecordsEqual(stream, updated_record, sync_record)
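# ----------------------------------------------------------------------
# NOTE: standalone sketch of the primary-key matching used throughout the
# bookmarks test above: an expected record and the synced records are
# compared by the tuple of their primary-key values, so missing records and
# duplicates can both be detected. The records and key names below are
# hypothetical.
# ----------------------------------------------------------------------
def match_records_by_pk(expected_record, synced_records, primary_keys):
    """Return every synced record whose primary-key tuple matches the expected record."""
    expected_pk_values = tuple(expected_record.get(pk) for pk in primary_keys)
    return [record for record in synced_records
            if tuple(record.get(pk) for pk in primary_keys) == expected_pk_values]

# example usage: exactly one match means the record was replicated without duplicates
synced_data = [{'id': 'A1', 'updated_at': '2021-01-02'}, {'id': 'B2', 'updated_at': '2021-01-03'}]
assert len(match_records_by_pk({'id': 'A1'}, synced_data, ['id'])) == 1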
def test_run(self): conn_id = connections.ensure_connection(self) # ------------------------------- # ----------- Discovery ---------- # ------------------------------- # run in discovery mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify the tap discovered the right streams found_catalogs = menagerie.get_catalogs(conn_id) # assert we find the correct streams self.assertEqual(self.expected_check_streams(), {c['tap_stream_id'] for c in found_catalogs}) for tap_stream_id in self.expected_check_streams(): found_stream = [ c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id ][0] # assert that the pks are correct self.assertEqual( self.expected_pks()[found_stream['stream_name']], set( found_stream.get('metadata', {}).get('table-key-properties'))) # assert that the row counts are correct self.assertEqual( self.expected_row_counts()[found_stream['stream_name']], found_stream.get('metadata', {}).get('row-count')) # ----------------------------------- # ----------- Initial Full Table --------- # ----------------------------------- # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata for stream_catalog in found_catalogs: annotated_schema = menagerie.get_annotated_schema( conn_id, stream_catalog['stream_id']) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] selected_metadata = connections.select_catalog_and_fields_via_metadata( conn_id, stream_catalog, annotated_schema, additional_md) # Run sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() # assert that each of the streams that we synced are the ones that we expect to see record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) # Verify that the full table was syncd for tap_stream_id in self.expected_sync_streams(): self.assertGreaterEqual(record_count_by_stream[tap_stream_id], self.expected_row_counts()[tap_stream_id]) # Verify that we have 'initial_full_table_complete' bookmark state = menagerie.get_state(conn_id) first_versions = {} for tap_stream_id in self.expected_check_streams(): # assert that the state has an initial_full_table_complete == True self.assertTrue(state['bookmarks'][tap_stream_id] ['initial_full_table_complete']) # assert that there is a version bookmark in state first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id][ 'version'] self.assertIsNotNone(first_versions[tap_stream_id]) # Verify that we have a oplog_ts_time and oplog_ts_inc bookmark self.assertIsNotNone( state['bookmarks'][tap_stream_id]['oplog_ts_time']) self.assertIsNotNone( state['bookmarks'][tap_stream_id]['oplog_ts_inc']) changed_ids = set() with get_test_connection() as client: # Delete two documents for each collection changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 0})[0]['_id']) client["simple_db"]["simple_coll_1"].delete_one({'int_field': 0}) changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 1})[0]['_id']) client["simple_db"]["simple_coll_1"].delete_one({'int_field': 1}) changed_ids.add(client['simple_db']['simple_coll_2'].find( 
{'int_field': 0})[0]['_id']) client["simple_db"]["simple_coll_2"].delete_one({'int_field': 0}) changed_ids.add(client['simple_db']['simple_coll_2'].find( {'int_field': 1})[0]['_id']) client["simple_db"]["simple_coll_2"].delete_one({'int_field': 1}) # Update two documents for each collection changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 48})[0]['_id']) client["simple_db"]["simple_coll_1"].update_one( {'int_field': 48}, {'$set': { 'int_field': -1 }}) changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 49})[0]['_id']) client["simple_db"]["simple_coll_1"].update_one( {'int_field': 49}, {'$set': { 'int_field': -1 }}) changed_ids.add(client['simple_db']['simple_coll_2'].find( {'int_field': 98})[0]['_id']) client["simple_db"]["simple_coll_2"].update_one( {'int_field': 98}, {'$set': { 'int_field': -1 }}) changed_ids.add(client['simple_db']['simple_coll_2'].find( {'int_field': 99})[0]['_id']) client["simple_db"]["simple_coll_2"].update_one( {'int_field': 99}, {'$set': { 'int_field': -1 }}) # Insert two documents for each collection client["simple_db"]["simple_coll_1"].insert_one({ "int_field": 50, "string_field": random_string_generator() }) changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 50})[0]['_id']) client["simple_db"]["simple_coll_1"].insert_one({ "int_field": 51, "string_field": random_string_generator() }) changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 51})[0]['_id']) client["simple_db"]["simple_coll_2"].insert_one({ "int_field": 100, "string_field": random_string_generator() }) changed_ids.add(client['simple_db']['simple_coll_2'].find( {'int_field': 100})[0]['_id']) client["simple_db"]["simple_coll_2"].insert_one({ "int_field": 101, "string_field": random_string_generator() }) changed_ids.add(client['simple_db']['simple_coll_2'].find( {'int_field': 101})[0]['_id']) # ----------------------------------- # ----------- Subsequent Oplog Sync --------- # ----------------------------------- # Run sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify the persisted schema was correct messages_by_stream = runner.get_records_from_target_output() records_by_stream = {} for stream_name in self.expected_sync_streams(): records_by_stream[stream_name] = [ x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert' ] # assert that each of the streams that we synced are the ones that we expect to see record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) # Verify that we got at least 6 records due to changes # (could be more due to overlap in gte oplog clause) for k, v in record_count_by_stream.items(): self.assertGreaterEqual(v, 6) # Verify that we got 2 records with _SDC_DELETED_AT self.assertEqual( 2, len([ x['data'] for x in records_by_stream['simple_coll_1'] if x['data'].get('_sdc_deleted_at') ])) self.assertEqual( 2, len([ x['data'] for x in records_by_stream['simple_coll_2'] if x['data'].get('_sdc_deleted_at') ])) # Verify that the _id of the records sent are the same set as the # _ids of the documents changed actual = set([ ObjectId(x['data']['_id']) for x in records_by_stream['simple_coll_1'] ]).union( set([ ObjectId(x['data']['_id']) for x in records_by_stream['simple_coll_2'] ])) self.assertEqual(changed_ids, actual)
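# ----------------------------------------------------------------------
# NOTE: standalone sketch of how the oplog assertions above classify target
# messages: deletes are emitted as 'upsert' messages whose data carries a
# populated _sdc_deleted_at value. The message list below is a hypothetical
# example of the target output shape.
# ----------------------------------------------------------------------
def count_deleted_records(messages):
    """Count upsert messages that represent documents deleted in the source."""
    return len([message for message in messages
                if message.get('action') == 'upsert'
                and message.get('data', {}).get('_sdc_deleted_at')])

# example usage: one of the two upserts below represents a delete
sample_messages = [
    {'action': 'upsert', 'data': {'_id': '1', '_sdc_deleted_at': '2021-05-01T00:00:00Z'}},
    {'action': 'upsert', 'data': {'_id': '2', '_sdc_deleted_at': None}},
    {'action': 'activate_version'},
]
assert count_deleted_records(sample_messages) == 1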
def test_run(self): conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify discovery produced (at least) 1 expected catalog found_catalogs = [ found_catalog for found_catalog in menagerie.get_catalogs(conn_id) if found_catalog['tap_stream_id'] in self.expected_check_streams() ] self.assertGreaterEqual(len(found_catalogs), 1) # verify the tap discovered the expected streams found_catalog_names = { catalog['tap_stream_id'] for catalog in found_catalogs } self.assertSetEqual(self.expected_check_streams(), found_catalog_names) # verify that persisted streams have the correct properties test_catalog = found_catalogs[0] self.assertEqual(test_table_name, test_catalog['stream_name']) print("discovered streams are correct") # perform table selection print('selecting {} and all fields within the table'.format( test_table_name)) schema_and_metadata = menagerie.get_annotated_schema( conn_id, test_catalog['stream_id']) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'FULL_TABLE' } }] _ = connections.select_catalog_and_fields_via_metadata( conn_id, test_catalog, schema_and_metadata, additional_md) # clear state menagerie.set_state(conn_id, {}) # run sync job 1 and verify exit codes sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # get records record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) records_by_stream = runner.get_records_from_target_output() table_version_1 = records_by_stream[test_table_name]['table_version'] messages = records_by_stream[test_table_name]['messages'] # verify the execpted number of records were replicated self.assertEqual(3, record_count_by_stream[test_table_name]) # verify the message actions match expectations self.assertEqual(5, len(messages)) self.assertEqual('activate_version', messages[0]['action']) self.assertEqual('upsert', messages[1]['action']) self.assertEqual('upsert', messages[2]['action']) self.assertEqual('upsert', messages[3]['action']) self.assertEqual('activate_version', messages[4]['action']) # verify the persisted schema matches expectations self.assertEqual(expected_schemas[test_table_name], records_by_stream[test_table_name]['schema']) # verify replicated records match expectations self.assertDictEqual(self.expected_records[0], messages[1]['data']) self.assertDictEqual(self.expected_records[1], messages[2]['data']) self.assertDictEqual(self.expected_records[2], messages[3]['data']) print("records are correct") # grab bookmarked state state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'dev-public-postgres_full_table_replication_test'] # verify state and bookmarks meet expectations self.assertIsNone(state['currently_syncing']) self.assertIsNone(bookmark.get('lsn')) self.assertIsNone(bookmark.get('replication_key')) self.assertIsNone(bookmark.get('replication_key_value')) self.assertEqual(table_version_1, bookmark['version']) #---------------------------------------------------------------------- # invoke the sync job AGAIN and get the same 3 records #---------------------------------------------------------------------- # run sync job 2 and verify exit codes sync_job_name = 
runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # get records record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) records_by_stream = runner.get_records_from_target_output() table_version_2 = records_by_stream[test_table_name]['table_version'] messages = records_by_stream[test_table_name]['messages'] # verify the execpted number of records were replicated self.assertEqual(3, record_count_by_stream[test_table_name]) # verify the message actions match expectations self.assertEqual(4, len(messages)) self.assertEqual('upsert', messages[0]['action']) self.assertEqual('upsert', messages[1]['action']) self.assertEqual('upsert', messages[2]['action']) self.assertEqual('activate_version', messages[3]['action']) # verify the new table version increased on the second sync self.assertGreater(table_version_2, table_version_1) # verify the persisted schema still matches expectations self.assertEqual(expected_schemas[test_table_name], records_by_stream[test_table_name]['schema']) # verify replicated records still match expectations self.assertDictEqual(self.expected_records[0], messages[0]['data']) self.assertDictEqual(self.expected_records[1], messages[1]['data']) self.assertDictEqual(self.expected_records[2], messages[2]['data']) # grab bookmarked state state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'dev-public-postgres_full_table_replication_test'] # verify state and bookmarks meet expectations self.assertIsNone(state['currently_syncing']) self.assertIsNone(bookmark.get('lsn')) self.assertIsNone(bookmark.get('replication_key')) self.assertIsNone(bookmark.get('replication_key_value')) self.assertEqual(table_version_2, bookmark['version']) #---------------------------------------------------------------------- # invoke the sync job AGAIN following various manipulations to the data #---------------------------------------------------------------------- with db_utils.get_test_connection('dev') as conn: conn.autocommit = True with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: # NB | We will perform the following actions prior to the next sync: # [Action (EXPECTED RESULT)] # Insert a record # Insert a record to be updated prior to sync # Insert a record to be deleted prior to sync (NOT REPLICATED) # Update an existing record # Update a newly inserted record # Delete an existing record # Delete a newly inserted record # inserting... 
# a new record nyc_tz = pytz.timezone('America/New_York') our_time_offset = "-04:00" our_ts = datetime.datetime(1996, 4, 4, 4, 4, 4, 733184) our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(6, 6, 6) our_time_tz = our_time.isoformat() + our_time_offset our_date = datetime.date(1970, 7, 1) my_uuid = str(uuid.uuid1()) self.inserted_records.append({ 'our_varchar': "our_varchar 2", 'our_varchar_10': "varchar_10", 'our_text': "some text 2", 'our_integer': 44101, 'our_smallint': 2, 'our_bigint': 1000001, 'our_decimal': decimal.Decimal('9876543210.02'), quote_ident('OUR TS', cur): our_ts, quote_ident('OUR TS TZ', cur): our_ts_tz, quote_ident('OUR TIME', cur): our_time, quote_ident('OUR TIME TZ', cur): our_time_tz, quote_ident('OUR DATE', cur): our_date, 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': '1', 'our_json': json.dumps({'nymn': 77}), 'our_jsonb': json.dumps({'burgers': 'good++'}), 'our_uuid': my_uuid, 'our_citext': 'cyclops 2', 'our_store': 'dances=>"floor",name=>"betty"', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': '$0.98789' }) self.expected_records.append({ 'id': 4, 'our_varchar': "our_varchar 2", 'our_varchar_10': "varchar_10", 'our_text': "some text 2", 'our_integer': 44101, 'our_smallint': 2, 'our_bigint': 1000001, 'our_decimal': decimal.Decimal('9876543210.02'), 'OUR TS': self.expected_ts(our_ts), 'OUR TS TZ': self.expected_ts_tz(our_ts_tz), 'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz), 'OUR DATE': '1970-07-01T00:00:00+00:00', 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': True, 'our_json': '{"nymn": 77}', 'our_jsonb': '{"burgers": "good++"}', 'our_uuid': self.inserted_records[-1]['our_uuid'], 'our_citext': self.inserted_records[-1]['our_citext'], 'our_store': { "name": "betty", "dances": "floor" }, 'our_cidr': self.inserted_records[-1]['our_cidr'], 'our_inet': self.inserted_records[-1]['our_inet'], 'our_mac': self.inserted_records[-1]['our_mac'], 'our_money': '$0.99', 'our_alignment_enum': None, }) # a new record which we will then update prior to sync our_ts = datetime.datetime(2007, 1, 1, 12, 12, 12, 222111) nyc_tz = pytz.timezone('America/New_York') our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(12, 11, 10) our_time_tz = our_time.isoformat() + "-04:00" our_date = datetime.date(1999, 9, 9) my_uuid = str(uuid.uuid1()) self.inserted_records.append({ 'our_varchar': "our_varchar 4", 'our_varchar_10': "varchar_3", 'our_text': "some text 4", 'our_integer': 55200, 'our_smallint': 1, 'our_bigint': 100000, 'our_decimal': decimal.Decimal('1234567899.99'), quote_ident('OUR TS', cur): our_ts, quote_ident('OUR TS TZ', cur): our_ts_tz, quote_ident('OUR TIME', cur): our_time, quote_ident('OUR TIME TZ', cur): our_time_tz, quote_ident('OUR DATE', cur): our_date, 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': '0', 'our_json': json.dumps('some string'), 'our_jsonb': json.dumps(['burgers are good']), 'our_uuid': my_uuid, 'our_store': 'size=>"small",name=>"betty"', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, }) self.expected_records.append({ 'our_decimal': decimal.Decimal('1234567899.99'), 'our_text': 'some text 4', 'our_bit': False, 'our_integer': 55200, 'our_double': decimal.Decimal('1.1'), 'id': 5, 'our_json': 
self.inserted_records[-1]['our_json'], 'our_boolean': True, 'our_jsonb': self.inserted_records[-1]['our_jsonb'], 'our_bigint': 100000, 'OUR TS': self.expected_ts(our_ts), 'OUR TS TZ': self.expected_ts_tz(our_ts_tz), 'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz), 'our_store': { "name": "betty", "size": "small" }, 'our_smallint': 1, 'OUR DATE': '1999-09-09T00:00:00+00:00', 'our_varchar': 'our_varchar 4', 'our_uuid': self.inserted_records[-1]['our_uuid'], 'our_real': decimal.Decimal('1.2'), 'our_varchar_10': 'varchar_3', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, 'our_alignment_enum': None, }) # a new record to be deleted prior to sync our_ts = datetime.datetime(2111, 1, 1, 12, 12, 12, 222111) nyc_tz = pytz.timezone('America/New_York') our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(12, 11, 10) our_time_tz = our_time.isoformat() + "-04:00" our_date = datetime.date(1999, 9, 9) my_uuid = str(uuid.uuid1()) self.inserted_records.append({ 'our_varchar': "our_varchar 4", 'our_varchar_10': "varchar_3", 'our_text': "some text 4", 'our_integer': 55200, 'our_smallint': 1, 'our_bigint': 100000, 'our_decimal': decimal.Decimal('1234567899.99'), quote_ident('OUR TS', cur): our_ts, quote_ident('OUR TS TZ', cur): our_ts_tz, quote_ident('OUR TIME', cur): our_time, quote_ident('OUR TIME TZ', cur): our_time_tz, quote_ident('OUR DATE', cur): our_date, 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': '0', 'our_json': json.dumps('some string'), 'our_jsonb': json.dumps(['burgers are good']), 'our_uuid': my_uuid, 'our_store': 'size=>"small",name=>"betty"', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, }) self.expected_records.append({ 'our_decimal': decimal.Decimal('1234567899.99'), 'our_text': 'some text 4', 'our_bit': False, 'our_integer': 55200, 'our_double': decimal.Decimal('1.1'), 'id': 6, 'our_json': self.inserted_records[-1]['our_json'], 'our_boolean': True, 'our_jsonb': self.inserted_records[-1]['our_jsonb'], 'our_bigint': 100000, 'OUR TS': self.expected_ts(our_ts), 'OUR TS TZ': self.expected_ts_tz(our_ts_tz), 'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz), 'our_store': { "name": "betty", "size": "small" }, 'our_smallint': 1, 'OUR DATE': '1999-09-09T00:00:00+00:00', 'our_varchar': 'our_varchar 4', 'our_uuid': self.inserted_records[-1]['our_uuid'], 'our_real': decimal.Decimal('1.2'), 'our_varchar_10': 'varchar_3', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, 'our_alignment_enum': None, }) db_utils.insert_record(cur, test_table_name, self.inserted_records[3]) db_utils.insert_record(cur, test_table_name, self.inserted_records[4]) db_utils.insert_record(cur, test_table_name, self.inserted_records[5]) # updating ... 
# an existing record canon_table_name = db_utils.canonicalized_table_name( cur, test_schema_name, test_table_name) record_pk = 1 our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184) our_ts_tz = nyc_tz.localize(our_ts) updated_data = { "OUR TS TZ": our_ts_tz, "our_double": decimal.Decimal("6.6"), "our_money": "$0.00" } self.expected_records[0]["OUR TS TZ"] = self.expected_ts_tz( our_ts_tz) self.expected_records[0]["our_double"] = decimal.Decimal("6.6") self.expected_records[0]["our_money"] = "$0.00" db_utils.update_record(cur, canon_table_name, record_pk, updated_data) # a newly inserted record canon_table_name = db_utils.canonicalized_table_name( cur, test_schema_name, test_table_name) record_pk = 5 our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184) our_ts_tz = nyc_tz.localize(our_ts) updated_data = { "OUR TS TZ": our_ts_tz, "our_double": decimal.Decimal("6.6"), "our_money": "$0.00" } self.expected_records[4]["OUR TS TZ"] = self.expected_ts_tz( our_ts_tz) self.expected_records[4]["our_double"] = decimal.Decimal("6.6") self.expected_records[4]["our_money"] = "$0.00" db_utils.update_record(cur, canon_table_name, record_pk, updated_data) # deleting # an existing record record_pk = 2 db_utils.delete_record(cur, canon_table_name, record_pk) # a newly inserted record record_pk = 6 db_utils.delete_record(cur, canon_table_name, record_pk) #---------------------------------------------------------------------- # invoke the sync job AGAIN after vairous manipulations #---------------------------------------------------------------------- # run sync job 3 and verify exit codes sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # get records record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) records_by_stream = runner.get_records_from_target_output() table_version_3 = records_by_stream[test_table_name]['table_version'] messages = records_by_stream[test_table_name]['messages'] # verify the execpted number of records were replicated self.assertEqual(4, record_count_by_stream[test_table_name]) # verify the message actions match expectations self.assertEqual(5, len(messages)) self.assertEqual('upsert', messages[0]['action']) self.assertEqual('upsert', messages[1]['action']) self.assertEqual('upsert', messages[2]['action']) self.assertEqual('upsert', messages[3]['action']) self.assertEqual('activate_version', messages[4]['action']) # verify the new table version increased on the second sync self.assertGreater(table_version_3, table_version_2) # verify the persisted schema still matches expectations self.assertEqual(expected_schemas[test_table_name], records_by_stream[test_table_name]['schema']) # NB | This is a little tough to track mentally so here's a breakdown of # the order of operations by expected records indexes: # Prior to Sync 1 # insert 0, 1, 2 # Prior to Sync 2 # No db changes # Prior to Sync 3 # insert 3, 4, 5 # update 0, 4 # delete 1, 5 # Resulting Synced Records: 2, 3, 0, 4 # verify replicated records still match expectations self.assertDictEqual(self.expected_records[2], messages[0]['data']) # existing insert self.assertDictEqual(self.expected_records[3], messages[1]['data']) # new insert self.assertDictEqual(self.expected_records[0], messages[2]['data']) # existing update self.assertDictEqual(self.expected_records[4], messages[3]['data']) # new insert / update # grab bookmarked 
state state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'dev-public-postgres_full_table_replication_test'] # verify state and bookmarks meet expectations self.assertIsNone(state['currently_syncing']) self.assertIsNone(bookmark.get('lsn')) self.assertIsNone(bookmark.get('replication_key')) self.assertIsNone(bookmark.get('replication_key_value')) self.assertEqual(table_version_3, bookmark['version'])
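# ----------------------------------------------------------------------
# NOTE: standalone sketch of the message-action shape asserted across the
# three full-table syncs above: the initial sync emits activate_version
# both before and after its upserts, while subsequent syncs emit it only at
# the end, and table_version strictly increases between syncs. The helper
# below is an illustration, not part of the test class.
# ----------------------------------------------------------------------
def expected_full_table_actions(record_count, first_sync):
    """Return the expected ordered message actions for a full-table sync."""
    actions = ['upsert'] * record_count + ['activate_version']
    return (['activate_version'] + actions) if first_sync else actions

# example usage matching the assertions above (3 records in syncs 1 and 2, 4 in sync 3)
assert expected_full_table_actions(3, first_sync=True) == [
    'activate_version', 'upsert', 'upsert', 'upsert', 'activate_version']
assert expected_full_table_actions(3, first_sync=False) == [
    'upsert', 'upsert', 'upsert', 'activate_version']
assert expected_full_table_actions(4, first_sync=False) == [
    'upsert', 'upsert', 'upsert', 'upsert', 'activate_version']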
def bookmarks_test(self, expected_streams): """A Parametrized Bookmarks Test""" expected_replication_keys = self.expected_replication_keys() expected_replication_methods = self.expected_replication_method() expected_insights_buffer = -1 * int( self.get_properties()['insights_buffer_days']) # lookback window ########################################################################## ### First Sync ########################################################################## conn_id = connections.ensure_connection(self, original_properties=False) # Run in check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # Select only the expected streams tables catalog_entries = [ ce for ce in found_catalogs if ce['tap_stream_id'] in expected_streams ] self.perform_and_verify_table_and_field_selection( conn_id, catalog_entries, select_all_fields=True) # Run a sync job using orchestrator first_sync_record_count = self.run_and_verify_sync(conn_id) first_sync_records = runner.get_records_from_target_output() first_sync_bookmarks = menagerie.get_state(conn_id) ########################################################################## ### Update State Between Syncs ########################################################################## new_states = {'bookmarks': dict()} simulated_states = self.calculated_states_by_stream( first_sync_bookmarks) for stream, new_state in simulated_states.items(): new_states['bookmarks'][stream] = new_state menagerie.set_state(conn_id, new_states) ########################################################################## ### Second Sync ########################################################################## second_sync_record_count = self.run_and_verify_sync(conn_id) second_sync_records = runner.get_records_from_target_output() second_sync_bookmarks = menagerie.get_state(conn_id) ########################################################################## ### Test By Stream ########################################################################## for stream in expected_streams: with self.subTest(stream=stream): # expected values expected_replication_method = expected_replication_methods[ stream] # collect information for assertions from syncs 1 & 2 base on expected values first_sync_count = first_sync_record_count.get(stream, 0) second_sync_count = second_sync_record_count.get(stream, 0) first_sync_messages = [ record.get('data') for record in first_sync_records.get( stream).get('messages') if record.get('action') == 'upsert' ] second_sync_messages = [ record.get('data') for record in second_sync_records.get( stream).get('messages') if record.get('action') == 'upsert' ] first_bookmark_key_value = first_sync_bookmarks.get( 'bookmarks', { stream: None }).get(stream) second_bookmark_key_value = second_sync_bookmarks.get( 'bookmarks', { stream: None }).get(stream) if expected_replication_method == self.INCREMENTAL: # collect information specific to incremental streams from syncs 1 & 2 replication_key = next( iter(expected_replication_keys[stream])) first_bookmark_value = first_bookmark_key_value.get( replication_key) second_bookmark_value = second_bookmark_key_value.get( replication_key) first_bookmark_value_utc = self.convert_state_to_utc( first_bookmark_value) second_bookmark_value_utc = self.convert_state_to_utc( second_bookmark_value) simulated_bookmark_value = new_states['bookmarks'][stream][ replication_key] simulated_bookmark_minus_lookback = self.timedelta_formatted( simulated_bookmark_value, days=expected_insights_buffer) if self.is_insight( stream) else 
simulated_bookmark_value # Verify the first sync sets a bookmark of the expected form self.assertIsNotNone(first_bookmark_key_value) self.assertIsNotNone( first_bookmark_key_value.get(replication_key)) # Verify the second sync sets a bookmark of the expected form self.assertIsNotNone(second_bookmark_key_value) self.assertIsNotNone( second_bookmark_key_value.get(replication_key)) # Verify the second sync bookmark is equal to the first sync bookmark self.assertEqual( second_bookmark_value, first_bookmark_value ) # assumes no changes to data during test for record in second_sync_messages: # Verify the second sync records respect the previous (simulated) bookmark value replication_key_value = record.get(replication_key) if stream == 'ads_insights_age_and_gender': # BUG | https://stitchdata.atlassian.net/browse/SRCE-4873 replication_key_value = datetime.datetime.strftime( dateutil.parser.parse(replication_key_value), self.BOOKMARK_COMPARISON_FORMAT) self.assertGreaterEqual( replication_key_value, simulated_bookmark_minus_lookback, msg= "Second sync records do not respect the previous bookmark." ) # Verify the second sync bookmark value is the max replication key value for a given stream self.assertLessEqual( replication_key_value, second_bookmark_value_utc, msg= "Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced." ) for record in first_sync_messages: # Verify the first sync bookmark value is the max replication key value for a given stream replication_key_value = record.get(replication_key) self.assertLessEqual( replication_key_value, first_bookmark_value_utc, msg= "First sync bookmark was set incorrectly, a record with a greater replication-key value was synced." ) # Verify the number of records in the 2nd sync is less than in the first self.assertLess(second_sync_count, first_sync_count) elif expected_replication_method == self.FULL_TABLE: # Verify the syncs do not set a bookmark for full table streams self.assertIsNone(first_bookmark_key_value) self.assertIsNone(second_bookmark_key_value) # Verify the number of records in the second sync is the same as the first self.assertEqual(second_sync_count, first_sync_count) else: raise NotImplementedError( "INVALID EXPECTATIONS\t\tSTREAM: {} REPLICATION_METHOD: {}" .format(stream, expected_replication_method)) # Verify at least 1 record was replicated in the second sync self.assertGreater( second_sync_count, 0, msg="We are not fully testing bookmarking for {}".format( stream))
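# ----------------------------------------------------------------------
# NOTE: standalone sketch of the lookback-window comparison used above for
# insights streams: the simulated bookmark is shifted back by the
# insights_buffer_days before second-sync records are compared against it.
# The date format and buffer value below are hypothetical examples, not the
# tap's actual configuration.
# ----------------------------------------------------------------------
import datetime

def bookmark_minus_lookback(bookmark, lookback_days, date_format='%Y-%m-%dT%H:%M:%S%z'):
    """Shift a bookmark string back by the lookback window (in days)."""
    parsed = datetime.datetime.strptime(bookmark, date_format)
    return datetime.datetime.strftime(parsed - datetime.timedelta(days=lookback_days), date_format)

# example usage: every second-sync record should satisfy
#   record[replication_key] >= bookmark_minus_lookback(simulated_bookmark, buffer_days)
assert bookmark_minus_lookback('2021-04-10T00:00:00+0000', 28) == '2021-03-13T00:00:00+0000'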
def test_run(self): conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify the tap discovered the right streams found_catalogs = [ fc for fc in menagerie.get_catalogs(conn_id) if fc['tap_stream_id'] in self.expected_check_streams() ] self.assertGreaterEqual( len(found_catalogs), 1, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) # verify that persisted streams have the correct properties test_catalog = found_catalogs[0] self.assertEqual('postgres_logical_replication_test', test_catalog['stream_name']) print("discovered streams are correct") additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] #don't selcted our_text_2 _ = connections.select_catalog_and_fields_via_metadata( conn_id, test_catalog, menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']), additional_md, ['our_text_2']) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) self.assertEqual(record_count_by_stream, {'postgres_logical_replication_test': 4}) records_by_stream = runner.get_records_from_target_output() table_version = records_by_stream['postgres_logical_replication_test'][ 'table_version'] self.assertEqual( records_by_stream['postgres_logical_replication_test']['messages'] [0]['action'], 'activate_version') self.assertEqual( records_by_stream['postgres_logical_replication_test']['messages'] [1]['action'], 'upsert') self.assertEqual( records_by_stream['postgres_logical_replication_test']['messages'] [2]['action'], 'upsert') self.assertEqual( records_by_stream['postgres_logical_replication_test']['messages'] [3]['action'], 'upsert') self.assertEqual( records_by_stream['postgres_logical_replication_test']['messages'] [4]['action'], 'upsert') self.assertEqual( records_by_stream['postgres_logical_replication_test']['messages'] [5]['action'], 'activate_version') # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'logical_1-public-postgres_logical_replication_test'] self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") self.assertIsNotNone(bookmark['lsn'], msg="expected bookmark for stream to have an lsn") lsn_1 = bookmark['lsn'] self.assertEqual(bookmark['version'], table_version, msg="expected bookmark for stream to match version") #---------------------------------------------------------------------- # invoke the sync job again after adding a record #---------------------------------------------------------------------- print("inserting a record 5") with db_utils.get_test_connection(test_db) as conn: conn.autocommit = True with conn.cursor() as cur: #insert fixture data 3 our_ts = datetime.datetime(1993, 3, 3, 3, 3, 3, 333333) nyc_tz = 
pytz.timezone('America/New_York') our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(3, 4, 5) our_time_tz = our_time.isoformat() + "-04:00" our_date = datetime.date(1933, 3, 3) my_uuid = str(uuid.uuid1()) #STRINGS: #OUR TS: '1993-03-03 03:03:03.333333' #OUR TS TZ: '1993-03-03 08:03:03.333333+00' #'OUR TIME': '03:04:05' #'OUR TIME TZ': '03:04:05+00' self.rec_5 = { 'our_varchar': "our_varchar 5", # str 'our_varchar_10': "varchar13", # str 'our_text': "some text 3", #str 'our_text_2': "NOT SELECTED", 'our_integer': 96000, #int 'our_smallint': 3, # int 'our_bigint': 3000000, #int 'our_decimal': decimal.Decimal( '1234567890.03' ), #1234567890.03 / our_decimal is a <class 'float'> quote_ident('OUR TS', cur): our_ts, # str '1993-03-03 03:03:03.333333' quote_ident('OUR TS TZ', cur): our_ts_tz, #str '1993-03-03 08:03:03.333333+00' quote_ident('OUR TIME', cur): our_time, # str '03:04:05' quote_ident('OUR TIME TZ', cur): our_time_tz, # str '03:04:05+00' quote_ident('OUR DATE', cur): our_date, #1933-03-03 / OUR DATE is a <class 'str'> 'our_double': 3.3, #3.3 / our_double is a <class 'float'> 'our_real': 6.6, #6.6 / our_real is a <class 'float'> 'our_boolean': True, #boolean 'our_bit': '1', #string 'our_json': json.dumps({'secret': 33}), #string 'our_jsonb': json.dumps(['burgers make me hungry']), 'our_uuid': my_uuid, #string 'our_store': 'jumps=>"high",name=>"betty"', #string 'our_citext': 'maGICKal 3', 'our_cidr': '192.168.102.128/32', 'our_inet': '192.168.102.128/32', 'our_mac': '08:00:2b:01:02:05', 'our_money': '$412.1234' } insert_record(cur, test_table_name, self.rec_5) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) self.assertEqual(record_count_by_stream, {'postgres_logical_replication_test': 1}) records_by_stream = runner.get_records_from_target_output() self.assertTrue(len(records_by_stream) > 0) for stream, recs in records_by_stream.items(): # verify the persisted schema was correct self.assertDictEqual(recs['schema'], expected_schemas[stream]) self.assertEqual( 1, len(records_by_stream['postgres_logical_replication_test'] ['messages'])) actual_record_2 = records_by_stream[ 'postgres_logical_replication_test']['messages'][0]['data'] actual_sdc_lsn_2 = int(actual_record_2['_sdc_lsn']) del actual_record_2['_sdc_lsn'] expected_inserted_record = { 'our_text': 'some text 3', 'our_real': decimal.Decimal('6.6'), '_sdc_deleted_at': None, 'our_store': { 'name': 'betty', 'jumps': 'high' }, 'our_bigint': 3000000, 'our_varchar': 'our_varchar 5', 'our_double': decimal.Decimal('3.3'), 'our_bit': True, 'our_uuid': self.rec_5['our_uuid'], 'OUR TS': '1993-03-03T03:03:03.333333+00:00', 'OUR TS TZ': '1993-03-03T08:03:03.333333+00:00', 'OUR TIME': '03:04:05', 'OUR TIME TZ': '03:04:05-04:00', 'OUR DATE': '1933-03-03T00:00:00+00:00', 'our_decimal': decimal.Decimal('1234567890.03'), 'id': 5, 'our_varchar_10': 'varchar13', 'our_json': '{"secret": 33}', 'our_jsonb': self.rec_5['our_jsonb'], 'our_smallint': 3, 'our_integer': 96000, 'our_boolean': True, 'our_citext': 'maGICKal 3', 'our_cidr': self.rec_5['our_cidr'], 'our_inet': '192.168.102.128', 'our_mac': self.rec_5['our_mac'], 'our_alignment_enum': None, 'our_money': '$412.12' } self.assertDictEqual(expected_inserted_record, actual_record_2) self.assertEqual( 
records_by_stream['postgres_logical_replication_test']['messages'] [0]['action'], 'upsert') print("inserted record is correct") state = menagerie.get_state(conn_id) chicken_bookmark = state['bookmarks'][ 'logical_1-public-postgres_logical_replication_test'] self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") self.assertIsNotNone( chicken_bookmark['lsn'], msg= "expected bookmark for stream public-postgres_logical_replication_test to have an scn" ) lsn_2 = chicken_bookmark['lsn'] self.assertTrue(lsn_2 >= lsn_1) #table_version does NOT change self.assertEqual( chicken_bookmark['version'], table_version, msg= "expected bookmark for stream public-postgres_logical_replication_test to match version" ) #---------------------------------------------------------------------- # invoke the sync job again after deleting a record #---------------------------------------------------------------------- print("delete row from source db") with db_utils.get_test_connection(test_db) as conn: with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: cur.execute("DELETE FROM {} WHERE id = 3".format( canonicalized_table_name(test_schema_name, test_table_name, cur))) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) # verify the inserted record's lsn is less than or equal to the bookmarked lsn self.assertGreaterEqual(lsn_2, actual_sdc_lsn_2) expected_record_count = 1 if actual_sdc_lsn_2 < lsn_2 else 2 self.assertEqual( record_count_by_stream, {'postgres_logical_replication_test': expected_record_count}) records_by_stream = runner.get_records_from_target_output() for stream, recs in records_by_stream.items(): # verify the persisted schema was correct self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." .format(stream)) # if there are 2 records... 
if expected_record_count == 2: # the 1st message will be the previous insert insert_message = records_by_stream[ 'postgres_logical_replication_test']['messages'][0]['data'] del insert_message['_sdc_lsn'] self.assertDictEqual(insert_message, expected_inserted_record) #the 2nd message will be the delete delete_message = records_by_stream[ 'postgres_logical_replication_test']['messages'][ expected_record_count - 1] self.assertEqual(delete_message['action'], 'upsert') sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at') self.assertIsNotNone(sdc_deleted_at) self.assertEqual(delete_message['data']['id'], 3) print("deleted record is correct") state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'logical_1-public-postgres_logical_replication_test'] self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") self.assertIsNotNone( bookmark['lsn'], msg="expected bookmark for stream ROOT-CHICKEN to have an scn") lsn_3 = bookmark['lsn'] self.assertTrue(lsn_3 >= lsn_2) #---------------------------------------------------------------------- # invoke the sync job again after deleting a record using the 'id IN (SELECT ...)' format #---------------------------------------------------------------------- print("delete row from source db") with db_utils.get_test_connection(test_db) as conn: with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: cur.execute( "DELETE FROM {} WHERE id IN (SELECT id FROM {} WHERE id=2)" .format( canonicalized_table_name(test_schema_name, test_table_name, cur), canonicalized_table_name(test_schema_name, test_table_name, cur))) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) self.assertEqual(record_count_by_stream, {'postgres_logical_replication_test': 2}) records_by_stream = runner.get_records_from_target_output() for stream, recs in records_by_stream.items(): # verify the persisted schema was correct self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." 
.format(stream)) #first record will be the previous delete delete_message = records_by_stream[ 'postgres_logical_replication_test']['messages'][0] sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at') self.assertIsNotNone(sdc_deleted_at) self.assertEqual(delete_message['data']['id'], 3) #the 2nd message will be the more recent delete delete_message = records_by_stream[ 'postgres_logical_replication_test']['messages'][1] self.assertEqual(delete_message['action'], 'upsert') sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at') self.assertIsNotNone(sdc_deleted_at) self.assertEqual(delete_message['data']['id'], 2) print("deleted record is correct") state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'logical_1-public-postgres_logical_replication_test'] self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") self.assertIsNotNone( bookmark['lsn'], msg="expected bookmark for stream ROOT-CHICKEN to have an scn") lsn_4 = bookmark['lsn'] self.assertTrue(lsn_4 >= lsn_3) #table_version does NOT change self.assertEqual( bookmark['version'], table_version, msg= "expected bookmark for stream postgres_logical_replication_test to match version" ) #---------------------------------------------------------------------- # invoke the sync job again after deleting a record using the 'id IN (<id>, <id>)' format #---------------------------------------------------------------------- print("delete row from source db") with db_utils.get_test_connection(test_db) as conn: with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: cur.execute("DELETE FROM {} WHERE id IN (4, 5)".format( canonicalized_table_name(test_schema_name, test_table_name, cur))) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) self.assertEqual(record_count_by_stream, {'postgres_logical_replication_test': 3}) records_by_stream = runner.get_records_from_target_output() for stream, recs in records_by_stream.items(): # verify the persisted schema was correct self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." 
.format(stream)) #first record will be the previous delete delete_message = records_by_stream[ 'postgres_logical_replication_test']['messages'][0] sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at') self.assertIsNotNone(sdc_deleted_at) self.assertEqual(delete_message['data']['id'], 2) #the 2nd message will be the more recent delete delete_message = records_by_stream[ 'postgres_logical_replication_test']['messages'][1] self.assertEqual(delete_message['action'], 'upsert') sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at') self.assertIsNotNone(sdc_deleted_at) self.assertEqual(delete_message['data']['id'], 4) print("deleted record is correct") #the 3rd message will be the more recent delete delete_message = records_by_stream[ 'postgres_logical_replication_test']['messages'][2] self.assertEqual(delete_message['action'], 'upsert') sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at') self.assertIsNotNone(sdc_deleted_at) self.assertEqual(delete_message['data']['id'], 5) print("deleted record is correct") state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'logical_1-public-postgres_logical_replication_test'] self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") self.assertIsNotNone( bookmark['lsn'], msg="expected bookmark for stream ROOT-CHICKEN to have an scn") lsn_5 = bookmark['lsn'] self.assertTrue(lsn_5 >= lsn_4) #table_version does NOT change self.assertEqual( bookmark['version'], table_version, msg= "expected bookmark for stream postgres_logical_replication_test to match version" ) #---------------------------------------------------------------------- # invoke the sync job again after updating a record #---------------------------------------------------------------------- print("updating row from source db") with db_utils.get_test_connection(test_db) as conn: with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: cur.execute( "UPDATE {} SET our_varchar = 'THIS HAS BEEN UPDATED', our_money = '$56.811', our_decimal = 'NaN', our_real = '+Infinity', our_double = 'NaN' WHERE id = 1" .format( canonicalized_table_name(test_schema_name, test_table_name, cur))) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) self.assertEqual(record_count_by_stream, {'postgres_logical_replication_test': 3}) records_by_stream = runner.get_records_from_target_output() for stream, recs in records_by_stream.items(): # verify the persisted schema was correct self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." 
.format(stream)) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) self.assertEqual( len(records_by_stream['postgres_logical_replication_test'] ['messages']), 3) #first record will be the previous first delete delete_message = records_by_stream[ 'postgres_logical_replication_test']['messages'][0] sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at') self.assertIsNotNone(sdc_deleted_at) self.assertEqual(delete_message['data']['id'], 4) #second record will be the previous second delete delete_message = records_by_stream[ 'postgres_logical_replication_test']['messages'][1] sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at') self.assertIsNotNone(sdc_deleted_at) self.assertEqual(delete_message['data']['id'], 5) #third record will be the new update updated_message = records_by_stream[ 'postgres_logical_replication_test']['messages'][2] del updated_message['data']['_sdc_lsn'] self.assertEqual(updated_message['action'], 'upsert') expected_updated_rec = { 'our_varchar': 'THIS HAS BEEN UPDATED', 'id': 1, 'our_varchar_10': "varchar_10", 'our_text': "some text", 'our_integer': 44100, 'our_smallint': 1, 'our_bigint': 1000000, 'our_decimal': None, 'OUR TS': '1997-02-02T02:02:02.722184+00:00', 'OUR TS TZ': '1997-02-02T07:02:02.722184+00:00', 'OUR TIME': '12:11:10', 'OUR TIME TZ': '12:11:10-04:00', 'OUR DATE': '1998-03-04T00:00:00+00:00', 'our_double': None, 'our_real': None, 'our_boolean': True, 'our_bit': False, 'our_json': '{"secret": 55}', 'our_jsonb': self.rec_1['our_jsonb'], 'our_uuid': self.rec_1['our_uuid'], '_sdc_deleted_at': None, 'our_store': { 'name': 'betty', 'size': 'small' }, 'our_citext': 'maGICKal', 'our_cidr': self.rec_1['our_cidr'], 'our_inet': self.rec_1['our_inet'], 'our_mac': self.rec_1['our_mac'], 'our_alignment_enum': 'bad', 'our_money': '$56.81' } self.assertDictEqual(expected_updated_rec, updated_message['data']) print("updated record is correct") #check state again state = menagerie.get_state(conn_id) self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") chicken_bookmark = state['bookmarks'][ 'logical_1-public-postgres_logical_replication_test'] self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") self.assertIsNotNone( chicken_bookmark['lsn'], msg= "expected bookmark for stream public-postgres_logical_replication_test to have an scn" ) lsn_6 = chicken_bookmark['lsn'] self.assertTrue(lsn_6 >= lsn_5) #table_version does NOT change self.assertEqual( chicken_bookmark['version'], table_version, msg= "expected bookmark for stream public-postgres_logical_replication_test to match version" ) #---------------------------------------------------------------------- # invoke the sync job one last time. 
should only get the PREVIOUS update #---------------------------------------------------------------------- sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) # we will get the previous update record again self.assertEqual(record_count_by_stream, {'postgres_logical_replication_test': 1}) # TODO the next line is not grabbing the record from the latest sync, opening potential for false negatives update_message = records_by_stream[ 'postgres_logical_replication_test']['messages'][2] self.assertEqual(update_message['action'], 'upsert') self.assertEqual( set(update_message['data'].keys()), set(expected_updated_rec.keys()), msg="keys for expected_record_1 are wrong: {}".format( set(update_message['data'].keys()).symmetric_difference( set(expected_updated_rec.keys())))) for k, v in update_message['data'].items(): self.assertEqual(v, expected_updated_rec[k], msg="{} != {} for key {}".format( v, expected_updated_rec[k], k)) #check state again state = menagerie.get_state(conn_id) chicken_bookmark = state['bookmarks'][ 'logical_1-public-postgres_logical_replication_test'] self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") self.assertIsNotNone( chicken_bookmark['lsn'], msg= "expected bookmark for stream public-postgres_logical_replication_test to have an lsn" ) lsn_7 = chicken_bookmark['lsn'] self.assertTrue(lsn_7 >= lsn_6) #table_version does NOT change self.assertEqual( chicken_bookmark['version'], table_version, msg= "expected bookmark for stream public-postgres_logical_replication_test to match version" )
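# NOTE: the `insert_record(cur, table_name, record)` helper used in the test above is provided
# by the tap-postgres test utilities and is not shown in this file. The sketch below is only a
# plausible shape for such a helper; the name `insert_record_sketch`, the psycopg2 `%s`
# placeholders, and the assumption that column names in the record dict are already quoted
# where needed (as they are for rec_5 above) are illustrative assumptions, not the actual
# implementation.
def insert_record_sketch(cursor, table_name, record):
    """Build and execute a parameterized INSERT from a column -> value mapping."""
    columns = ', '.join(record.keys())
    placeholders = ', '.join(['%s'] * len(record))
    cursor.execute(
        "INSERT INTO {} ({}) VALUES ({})".format(table_name, columns, placeholders),
        list(record.values()))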
def test_run(self): """ Verify for each stream that you can do a sync which records bookmarks. Verify that the bookmark is the max value sent to the target for the `date` PK field Verify that the 2nd sync respects the bookmark Verify that all data of the 2nd sync is >= the bookmark from the first sync Verify that the number of records in the 2nd sync is less then the first Verify inclusivivity of bookmarks PREREQUISITE For EACH stream that is incrementally replicated there are multiple rows of data with different values for the replication key """ untested_streams = self.child_streams().union({ 'transfers', 'payout_transactions', # BUG see create test 'balance_transactions', # join stream, can't be updated 'disputes', }) cannot_update_streams = { 'invoice_line_items', # updates not available via api } # Ensure tested streams have existing records expected_records_first_sync = {stream: [] for stream in self.streams_to_create} for _ in range(2): # create 3 records for each stream but only expect the 3rd for stream in self.streams_to_create: self.new_objects[stream].append(create_object(stream)) for stream in self.streams_to_create: self.new_objects[stream].append(create_object(stream)) expected_records_first_sync[stream].append({"id": self.new_objects[stream][-1]['id']}) self.START_DATE = self.get_properties().get('start_date') # Instantiate connection with default start conn_id = connections.ensure_connection(self) # run in check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # Select all testable streams and all fields within streams streams_to_select = self.expected_incremental_streams().difference(untested_streams) our_catalogs = [catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in streams_to_select] self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=True) # Run a sync job using orchestrator first_sync_start = self.local_to_utc(dt.utcnow()) first_sync_record_count = self.run_and_verify_sync(conn_id) first_sync_end = self.local_to_utc(dt.utcnow()) # verify that the sync only sent records to the target for selected streams (catalogs) self.assertEqual( streams_to_select, set(first_sync_record_count.keys()), msg="Expected only testable streams to be replicated: {}".format(first_sync_record_count) ) first_sync_state = menagerie.get_state(conn_id) # Get the set of records from a first sync first_sync_records = runner.get_records_from_target_output() # Add data before next sync via insert and update, and set expectations created_records = {x: [] for x in self.expected_streams()} updated_records = {x: [] for x in self.expected_streams()} expected_records_second_sync = {x: [] for x in self.expected_streams()} # Update one record from each stream prior to 2nd sync first_sync_created, _ = self.split_records_into_created_and_updated(first_sync_records) for stream in self.streams_to_create.difference(cannot_update_streams): # There needs to be some test data for each stream, otherwise this will break record = expected_records_first_sync[stream][0] updated_record = update_object(stream, record["id"]) updated_records[stream].append(updated_record) expected_records_second_sync[stream].append({"id": updated_record['id']}) # Ensure different times between udpates and inserts sleep(2) # Insert (create) one record for each stream prior to 2nd sync for stream in self.streams_to_create: created_record = create_object(stream) self.new_objects[stream].append(created_record) created_records[stream].append(created_record) 
expected_records_second_sync[stream].append({"id": created_record['id']}) # ensure validity of expected_records_second_sync for stream in self.streams_to_create: if stream in self.expected_incremental_streams(): if stream in cannot_update_streams: # Some streams will have only 1 record from the Insert self.assertEqual(1, len(expected_records_second_sync.get(stream)), msg="Expectations are invalid for incremental stream {}".format(stream) ) continue # Most streams will have 2 records from the Update and Insert self.assertEqual(2, len(expected_records_second_sync.get(stream)), msg="Expectations are invalid for incremental stream {}".format(stream) ) elif stream in self.expected_full_table_streams(): self.assertEqual( len(expected_records_second_sync.get(stream)), len(expected_records_first_sync.get(stream)) + len(created_records[stream]), msg="Expectations are invalid for full table stream {}".format(stream) ) # created_records[stream] = self.records_data_type_conversions(created_records.get(stream)) # updated_records[stream] = self.records_data_type_conversions(updated_records.get(stream)) # Run a second sync job using orchestrator second_sync_start = self.local_to_utc(dt.utcnow()) second_sync_record_count = self.run_and_verify_sync(conn_id) second_sync_end = self.local_to_utc(dt.utcnow()) second_sync_state = menagerie.get_state(conn_id) # Get the set of records from a second sync second_sync_records = runner.get_records_from_target_output() second_sync_created, second_sync_updated = self.split_records_into_created_and_updated(second_sync_records) # Loop first_sync_records and compare against second_sync_records for stream in self.streams_to_create.difference(untested_streams): with self.subTest(stream=stream): second_sync_data = [record.get("data") for record in second_sync_records.get(stream, {}).get("messages", [])] stream_replication_keys = self.expected_replication_keys() stream_primary_keys = self.expected_primary_keys() # TESTING INCREMENTAL STREAMS if stream in self.expected_incremental_streams(): replication_keys = stream_replication_keys.get(stream) # Verify both syncs write / keep the same bookmark keys self.assertEqual(set(first_sync_state.get('bookmarks', {}).keys()), set(second_sync_state.get('bookmarks', {}).keys())) # verify that there is more than 1 record of data - setup necessary self.assertGreater(first_sync_record_count.get(stream, 0), 1, msg="Data isn't set up to be able to test full sync") # verify that you get less data on the 2nd sync self.assertGreater( first_sync_record_count.get(stream, 0), second_sync_record_count.get(stream, 0), msg="first sync didn't have more records, bookmark usage not verified") if stream in self.streams_to_create: for replication_key in replication_keys: updates_replication_key = "updates_created" updates_stream = stream + "_events" # Verify second sync's bookmarks move past the first sync's self.assertGreater( second_sync_state.get('bookmarks', {updates_stream: {}}).get( updates_stream, {replication_key: -1}).get(updates_replication_key), first_sync_state.get('bookmarks', {updates_stream: {}}).get( updates_stream, {updates_replication_key: -1}).get(updates_replication_key) ) # Verify that all data of the 2nd sync is >= the bookmark from the first sync first_sync_bookmark = dt.fromtimestamp( first_sync_state.get('bookmarks').get(updates_stream).get(updates_replication_key) ) for record in second_sync_data: date_value = record["updated"] self.assertGreaterEqual(date_value, dt.strftime(first_sync_bookmark, self.COMPARISON_FORMAT), msg="A 2nd 
sync record has a replication-key that is less than or equal to the 1st sync bookmark.") elif stream in self.expected_full_table_streams(): raise Exception("Expectations changed, but this test was not updated to reflect them.") # TESTING APPLICABLE TO ALL STREAMS # Verify that the expected records are replicated in the 2nd sync # For incremental streams we should see at least 2 records (a new record and an updated record) # but we may see more as the bookmmark is inclusive and there are hidden creates/updates due to # dependencies between streams. # For full table streams we should see 1 more record than the first sync expected_records = expected_records_second_sync.get(stream) primary_keys = stream_primary_keys.get(stream) updated_pk_values = {tuple([record.get(pk) for pk in primary_keys]) for record in updated_records[stream]} self.assertLessEqual( len(expected_records), len(second_sync_data), msg="Expected number of records are not less than or equal to actual for 2nd sync.\n" + "Expected: {}\nActual: {}".format(len(expected_records), len(second_sync_data)) ) if (len(second_sync_data) - len(expected_records)) > 0: logging.warn('Second sync replicated %s records more than our create and update for %s', len(second_sync_data), stream) if not primary_keys: raise NotImplementedError("PKs are needed for comparing records") # Verify that the inserted and updated records are replicated by the 2nd sync for expected_record in expected_records: expected_pk_value = expected_record.get('id') sync_pk_values = [sync_record.get('id') for sync_record in second_sync_data if sync_record.get('id') == expected_pk_value] self.assertTrue( len(sync_pk_values) > 0, msg="A record is missing from our sync: \nSTREAM: {}\tPK: {}".format(stream, expected_pk_value) ) self.assertIn(expected_pk_value, sync_pk_values) # Verify updated fields are replicated as expected for updated_record in updated_records[stream]: expected_updated_key = 'metadata' expected_updated_value_substring = 'bob' updated_pk_value = updated_record.get('id') sync_records_metadata = [sync_record.get('metadata') for sync_record in second_sync_data if sync_record.get('id') == updated_pk_value] self.assertTrue(len(sync_records_metadata) == 1) self.assertIn(expected_updated_value_substring, sync_records_metadata[0].get('test_value'))
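# NOTE: `split_records_into_created_and_updated(records)` is referenced in the test above but
# defined elsewhere in this Stripe test suite. The sketch below shows one plausible shape for
# it, splitting target messages by whether a record looks like an update; the field names
# ('created', 'updated') and the heuristic are assumptions for illustration only, not the
# suite's actual logic.
def split_records_into_created_and_updated_sketch(records):
    """Return (created, updated) dicts keyed by stream, each holding a 'messages' list."""
    created, updated = {}, {}
    for stream, stream_records in records.items():
        created[stream] = {'messages': []}
        updated[stream] = {'messages': []}
        for message in stream_records.get('messages', []):
            data = message.get('data', {})
            is_update = data.get('updated') and data.get('created') \
                and data['updated'] != data['created']
            bucket = updated if is_update else created
            bucket[stream]['messages'].append(message)
    return created, updated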
def test_run(self): """ Verify that for each stream you can do a sync which records bookmarks. That the bookmark is the maximum value sent to the target for the replication key. That a second sync respects the bookmark All data of the second sync is >= the bookmark from the first sync The number of records in the 2nd sync is less then the first (This assumes that new data added to the stream is done at a rate slow enough that you haven't doubled the amount of data from the start date to the first sync between the first sync and second sync run in this test) Verify that for full table stream, all data replicated in sync 1 is replicated again in sync 2. PREREQUISITE For EACH stream that is incrementally replicated there are multiple rows of data with different values for the replication key """ expected_streams = self.expected_check_streams() expected_replication_keys = self.expected_replication_keys() expected_replication_methods = self.expected_replication_method() ########################################################################## # First Sync ########################################################################## conn_id = connections.ensure_connection(self) # Run in check mode found_catalogs = self.run_and_verify_check_mode(conn_id) # table and field selection catalog_entries = [ catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in expected_streams ] self.perform_and_verify_table_and_field_selection( conn_id, catalog_entries) # Run a first sync job using orchestrator first_sync_record_count = self.run_and_verify_sync(conn_id) first_sync_records = runner.get_records_from_target_output() first_sync_bookmarks = menagerie.get_state(conn_id) ########################################################################## # Update State Between Syncs ########################################################################## new_states = {'bookmarks': dict()} simulated_states = self.calculated_states_by_stream( first_sync_bookmarks) for stream, new_state in simulated_states.items(): new_states['bookmarks'][stream] = new_state menagerie.set_state(conn_id, new_states) ########################################################################## # Second Sync ########################################################################## second_sync_record_count = self.run_and_verify_sync(conn_id) second_sync_records = runner.get_records_from_target_output() second_sync_bookmarks = menagerie.get_state(conn_id) ########################################################################## # Test By Stream ########################################################################## for stream in expected_streams: with self.subTest(stream=stream): # expected values expected_replication_method = expected_replication_methods[ stream] # collect information for assertions from syncs 1 & 2 base on expected values first_sync_count = first_sync_record_count.get(stream, 0) second_sync_count = second_sync_record_count.get(stream, 0) first_sync_messages = [ record.get('data') for record in first_sync_records.get( stream, {}).get('messages', []) if record.get('action') == 'upsert' ] second_sync_messages = [ record.get('data') for record in second_sync_records.get( stream, {}).get('messages', []) if record.get('action') == 'upsert' ] first_bookmark_key_value = first_sync_bookmarks.get( 'bookmarks', { stream: None }).get(stream) second_bookmark_key_value = second_sync_bookmarks.get( 'bookmarks', { stream: None }).get(stream) if expected_replication_method == self.INCREMENTAL: # collect information 
specific to incremental streams from syncs 1 & 2 replication_key = next( iter(expected_replication_keys[stream])) first_bookmark_value = first_bookmark_key_value.get( replication_key) second_bookmark_value = second_bookmark_key_value.get( replication_key) first_bookmark_value_utc = self.convert_state_to_utc( first_bookmark_value) second_bookmark_value_utc = self.convert_state_to_utc( second_bookmark_value) simulated_bookmark_value = self.convert_state_to_utc( new_states['bookmarks'][stream][replication_key]) # Verify the first sync sets a bookmark of the expected form self.assertIsNotNone(first_bookmark_key_value) self.assertIsNotNone(first_bookmark_value) # Verify the second sync sets a bookmark of the expected form self.assertIsNotNone(second_bookmark_key_value) self.assertIsNotNone(second_bookmark_value) # Verify the second sync bookmark is equal to the first sync bookmark # assumes no changes to data during test if stream != "users": self.assertEqual(second_bookmark_value, first_bookmark_value) else: # For the `users` stream the tap stores the bookmark as 1 minute less than the current time if the # `updated_at` of the last record is earlier than that. So, if there is no data change, second_bookmark_value # will be 1 minute less than the current time. Therefore second_bookmark_value will always be # greater than or equal to first_bookmark_value self.assertGreaterEqual(second_bookmark_value, first_bookmark_value) for record in first_sync_messages: # Verify the first sync bookmark value is the max replication key value for a given stream replication_key_value = record.get(replication_key) # For the `tickets` stream the tap stores the bookmark as an int timestamp, so convert it to a string. if stream == "tickets": replication_key_value = datetime.utcfromtimestamp( replication_key_value).strftime( '%Y-%m-%dT%H:%M:%SZ') self.assertLessEqual( replication_key_value, first_bookmark_value_utc, msg= "First sync bookmark was set incorrectly, a record with a greater replication-key value was synced." ) for record in second_sync_messages: # Verify the second sync replication key value is greater than or equal to the first sync bookmark replication_key_value = record.get(replication_key) if stream == "tickets": replication_key_value = datetime.utcfromtimestamp( replication_key_value).strftime( '%Y-%m-%dT%H:%M:%SZ') self.assertGreaterEqual( replication_key_value, simulated_bookmark_value, msg= "Second sync records do not respect the previous bookmark." ) # Verify the second sync bookmark value is the max replication key value for a given stream self.assertLessEqual( replication_key_value, second_bookmark_value_utc, msg= "Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced." ) elif expected_replication_method == self.FULL_TABLE: # Verify the syncs do not set a bookmark for full table streams self.assertIsNone(first_bookmark_key_value) self.assertIsNone(second_bookmark_key_value) # Verify the number of records in the second sync is the same as the first # The streams below are child streams of the incremental parent stream `tickets`. # Child streams also behave like incremental streams but do not save their own state, so they won't # have the same number of records in the first and second syncs. 
if not stream in [ "ticket_comments", "ticket_audits", "ticket_metrics" ]: self.assertEqual(second_sync_count, first_sync_count) else: raise NotImplementedError( "INVALID EXPECTATIONS\t\tSTREAM: {} REPLICATION_METHOD: {}" .format(stream, expected_replication_method)) # Verify at least 1 record was replicated in the second sync self.assertGreater( second_sync_count, 0, msg="We are not fully testing bookmarking for {}".format( stream))
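# NOTE: `calculated_states_by_stream(first_sync_bookmarks)` is used above to simulate an older
# state between syncs, but it lives in the base test class. A minimal sketch of the idea follows:
# wind each stream's saved replication-key value back far enough that the second sync has records
# to pick up. The one-day delta, the state shape, the extra `expected_replication_keys` argument,
# and the assumption of string datetime bookmarks are all illustrative assumptions.
def calculated_states_by_stream_sketch(first_sync_bookmarks, expected_replication_keys):
    from datetime import timedelta
    from dateutil import parser

    simulated_states = {}
    for stream, bookmark in first_sync_bookmarks.get('bookmarks', {}).items():
        replication_key = next(iter(expected_replication_keys.get(stream, set())), None)
        if not replication_key or replication_key not in bookmark:
            continue
        # move the saved bookmark back one day so some records fall after the simulated state
        simulated = parser.parse(bookmark[replication_key]) - timedelta(days=1)
        simulated_states[stream] = {replication_key: simulated.strftime('%Y-%m-%dT%H:%M:%SZ')}
    return simulated_states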
def bookmarks_test(self, conn_id, testable_streams): # Select all streams and no fields within streams found_catalogs = menagerie.get_catalogs(conn_id) incremental_streams = { key for key, value in self.expected_replication_method().items() if value == self.INCREMENTAL and key in testable_streams } # Our test data sets for Shopify do not have any abandoned_checkouts our_catalogs = [ catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in incremental_streams ] self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=False) ################################# # Run first sync ################################# first_sync_record_count = self.run_sync(conn_id) # verify that the sync only sent records to the target for selected streams (catalogs) self.assertEqual(set(first_sync_record_count.keys()), incremental_streams) first_sync_bookmark = menagerie.get_state(conn_id) first_sync_records = runner.get_records_from_target_output() # BUG:TDL-17087 : State has additional values which are not streams # Need to remove additional values from bookmark value extra_stuff = { 'transaction_orders', 'metafield_products', 'refund_orders', 'product_variants' } for keys in list(first_sync_bookmark['bookmarks'].keys()): if keys in extra_stuff: first_sync_bookmark['bookmarks'].pop(keys) ####################### # Update State between Syncs ####################### new_state = {'bookmarks': dict()} #simulated_states = self.calculated_states_by_stream(first_sync_bookmark) # We are hardcoding the updated state to ensure that we get atleast 1 record in second sync. These values have been provided after reviewing the max bookmark value for each of the streams simulated_states = { 'products': { 'updated_at': '2021-12-20T05:10:05.000000Z' }, 'collects': { 'updated_at': '2021-09-01T09:08:28.000000Z' }, 'abandoned_checkouts': { 'updated_at': '2022-02-02T16:00:00.000000Z' }, 'inventory_levels': { 'updated_at': '2021-12-20T05:09:34.000000Z' }, 'locations': { 'updated_at': '2021-07-20T09:00:22.000000Z' }, 'events': { 'created_at': '2021-12-20T05:09:01.000000Z' }, 'inventory_items': { 'updated_at': '2021-09-15T19:44:11.000000Z' }, 'transactions': { 'created_at': '2021-12-20T00:08:52-05:00' }, 'metafields': { 'updated_at': '2021-09-07T21:18:05.000000Z' }, 'order_refunds': { 'created_at': '2021-05-01T17:41:18.000000Z' }, 'customers': { 'updated_at': '2021-12-20T05:08:17.000000Z' }, 'orders': { 'updated_at': '2021-12-20T05:09:01.000000Z' }, 'custom_collections': { 'updated_at': '2021-12-20T17:41:18.000000Z' } } for stream, updated_state in simulated_states.items(): new_state['bookmarks'][stream] = updated_state menagerie.set_state(conn_id, new_state) ############################### # Run Second Sync ############################### second_sync_record_count = self.run_sync(conn_id) second_sync_records = runner.get_records_from_target_output() second_sync_bookmark = menagerie.get_state(conn_id) for stream in testable_streams: with self.subTest(stream=stream): # expected values expected_replication_method = self.expected_replication_method( ) expected_replication_keys = self.expected_replication_keys() # information required for assertions from sync 1 and 2 based on expected values first_sync_count = first_sync_record_count.get(stream, 0) second_sync_count = second_sync_record_count.get(stream, 0) first_sync_messages = [ record.get('data') for record in first_sync_records.get( stream, {}).get('messages', []) if record.get('action') == 'upsert' ] second_sync_messages = [ record.get('data') for record 
in second_sync_records.get( stream, {}).get('messages', []) if record.get('action') == 'upsert' ] first_bookmark_value = first_sync_bookmark.get( 'bookmarks', { stream: None }).get(stream) first_bookmark_value = list(first_bookmark_value.values())[0] second_bookmark_value = second_sync_bookmark.get( 'bookmarks', { stream: None }).get(stream) second_bookmark_value = list(second_bookmark_value.values())[0] replication_key = next(iter(expected_replication_keys[stream])) first_bookmark_value_utc = self.convert_state_to_utc( first_bookmark_value) second_bookmark_value_utc = self.convert_state_to_utc( second_bookmark_value) simulated_bookmark = new_state['bookmarks'][stream] simulated_bookmark_value = list(simulated_bookmark.values())[0] # verify the syncs set a bookmark of the expected form self.assertIsNotNone(first_bookmark_value) self.assertTrue( self.is_expected_date_format(first_bookmark_value)) self.assertIsNotNone(second_bookmark_value) self.assertTrue( self.is_expected_date_format(second_bookmark_value)) # verify the 2nd bookmark is equal to 1st sync bookmark #NOT A BUG (this is the expected behaviour for Shopify as they are using date windowing : TDL-17096 : the 2nd bookmark value gets assigned from the execution time rather than the actual bookmark time, so this is an invalid assertion for Shopify) #self.assertEqual(first_bookmark_value, second_bookmark_value) for record in first_sync_messages: replication_key_value = record.get(replication_key) # verify 1st sync bookmark value is the max replication key value for a given stream self.assertLessEqual( replication_key_value, first_bookmark_value_utc, msg= "First sync bookmark was set incorrectly, a record with a greater replication key value was synced" ) for record in second_sync_messages: replication_key_value = record.get(replication_key) # verify the 2nd sync replication key value is greater or equal to the 1st sync bookmarks self.assertGreaterEqual( replication_key_value, simulated_bookmark_value, msg= "Second sync records do not respect the previous bookmark" ) # verify the 2nd sync bookmark value is the max replication key value for a given stream self.assertLessEqual( replication_key_value, second_bookmark_value_utc, msg= "Second sync bookmark was set incorrectly, a record with a greater replication key value was synced" ) # verify that we get less data in the 2nd sync # collects has all the records with the same value of replication key, so we exclude it from this assertion if stream not in ('collects',): self.assertLess( second_sync_count, first_sync_count, msg= "Second sync does not have fewer records, bookmark usage not verified" ) # verify that we get at least 1 record in the second sync if stream not in ('collects',): self.assertGreater( second_sync_count, 0, msg="Second sync did not yield any records")
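# NOTE: `convert_state_to_utc` and `is_expected_date_format` are base-class helpers used by the
# two bookmark tests above. Rough sketches of the behavior they appear to provide follow; the
# exact formats and the `*_sketch` names are assumptions, not the real implementations.
def convert_state_to_utc_sketch(date_str):
    """Normalize a bookmark string to a UTC ISO-8601 value for comparisons."""
    from dateutil import parser, tz
    parsed = parser.parse(date_str)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=tz.UTC)  # assume naive bookmarks are already UTC
    return parsed.astimezone(tz.UTC).strftime('%Y-%m-%dT%H:%M:%SZ')


def is_expected_date_format_sketch(date_str):
    """Return True if the value matches the bookmark format the assertions above expect."""
    from datetime import datetime
    try:
        datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%fZ')
    except (TypeError, ValueError):
        return False
    return True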
def binlog_edge_test(self, expected_records=[]): """ Test binlog replication edge cases • Verify an initial sync returns expected records of various datatypes • Verify we bookmark correctly when a transaction spans multiple files • Insert and delete a record prior to sync. Verify both events are replicated • Insert and update a record prior to sync. Verify both events are replicated • Verify a valid log_file and log_pos state are persisted after each sync """ conn_id = connections.ensure_connection(self) # prior to first sync update a record... updated_timestamp = datetime.datetime.now() updated_id = 1 expected_records[1]['our_timestamp_2'] = datetime.datetime.strftime( updated_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ") # insert a record and... inserted_record = self.generate_record_n(len(expected_records)) expected_records += [inserted_record] # TODO need to format # delete a record deleted_id = 2 with db_utils.get_db_connection( self.get_properties(), self.get_credentials()).cursor() as cur: cur.execute( "UPDATE {}.{} SET our_timestamp_2 = '{}' WHERE id = {}".format( self.database_name(), self.table_name_1(), updated_timestamp, updated_id)) self.insert_record(cur, inserted_record, self.table_name_1()) delete_time = datetime.datetime.now() cur.execute("DELETE FROM {}.{} WHERE id = {}".format( self.database_name(), self.table_name_1(), deleted_id)) print( "\n\nMySQL DB Actions." + \ "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_1()) + \ "\nEVENTS: {} records updated".format(1) + \ "\n {} records deleted\n\n".format(1) ) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) t1 = self.table_name_1() t2 = self.table_name_2() expected_check_streams = { self.tap_stream_id(t1), self.tap_stream_id(t2) } expected_sync_streams = {t1, t2} expected_pks = {t1: {'id'}, t2: {'id'}} # verify the tap discovered the right streams found_catalogs = [ catalog for catalog in menagerie.get_catalogs(conn_id) if catalog['tap_stream_id'] in expected_check_streams ] self.assertGreaterEqual( len(found_catalogs), 1, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = expected_check_streams.symmetric_difference(found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) # verify that persisted streams have the correct properties self.assertEqual(self.table_name_1(), found_catalogs[0]['stream_name']) self.assertEqual(self.table_name_2(), found_catalogs[1]['stream_name']) print("discovered streams are correct") additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] for catalog in found_catalogs: schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id']) _ = connections.select_catalog_and_fields_via_metadata( conn_id, catalog, catalog, additional_md) # clear state menagerie.set_state(conn_id, {}) # run initial full table sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() self.maxDiff = None for stream, recs in records_by_stream.items(): self.assertEqual( recs['schema'], expected_schemas[stream], msg= 
"Persisted schema did not match expected schema for stream `{}`." .format(stream)) record_count_by_stream = runner.examine_target_output_file( self, conn_id, expected_sync_streams, expected_pks) # BUG missing deleted record | https://stitchdata.atlassian.net/browse/SRCE-4258 # self.assertEqual({self.table_name_1(): len(expected_records)}, record_count_by_stream) records_for_stream = runner.get_records_from_target_output()[ self.table_name_1()] messages_for_stream = records_for_stream['messages'] message_actions = [rec['action'] for rec in messages_for_stream] # verify activate version messages are present self.assertEqual('activate_version', message_actions[0]) self.assertEqual('activate_version', message_actions[-1]) # ensure some log_file and log_pos state was persisted state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id(t1)] self.assertIsNotNone(bookmark['log_file']) self.assertIsNotNone(bookmark['log_pos']) expected_log_file = bookmark['log_file'] expected_log_pos = bookmark['log_pos'] # grab version, log_file and log_pos from state to check later expected_table_version = records_for_stream['table_version'] self.assertEqual(expected_table_version, bookmark['version']) # check for expected records upsert_records = [ m['data'] for m in messages_for_stream if m['action'] == 'upsert' ] # we need to compare record by record since there are so many. # a failure comparing expected_records to upsert_records would result in # an output message greater in length than a standard tmux buffer # BUG missing datetime precision | https://stitchdata.atlassian.net/browse/SRCE-4257 # for expected_record in expected_records: # upsert_record = [rec for rec in upsert_records # if rec['id'] == expected_record['id']] # self.assertEqual(1, len(upsert_record), # msg="multiple upsert_recs with same pk: {}".format(upsert_record)) # self.assertEqual(expected_record, upsert_record.pop()) # TODO add check for _sdc_delete_at for deleted record once bug addressed # run binlog sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # check that version state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id(t1)] self.assertEqual(expected_table_version, bookmark['version']) # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() for stream, recs in records_by_stream.items(): self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." .format(stream)) # record count should be empty as we did not persist anything to the gate record_count_by_stream = runner.examine_target_output_file( self, conn_id, expected_sync_streams, expected_pks) self.assertEqual(record_count_by_stream, {}) # Create 1 more record prior to 2nd sync new_record = self.generate_record_n(len(expected_records)) with db_utils.get_db_connection( self.get_properties(), self.get_credentials()).cursor() as cur: self.insert_record(cur, new_record, self.table_name_1()) print( "\n\nMySQL DB Actions." 
+ \ "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_1()) + \ "\nEVENTS: {} records inserted".format(1) ) # run binlog sync sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # check that version from state is unchanged state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id(t1)] self.assertEqual(expected_table_version, bookmark['version']) # Either the log_file is the same but the log_pos has increased or the log_file # has rotated and the numeric suffix has increased if expected_log_file == bookmark['log_file']: print("PATH A") self.assertGreater(bookmark['log_pos'], expected_log_pos) else: expected_log_file_suffix = re.search('^.*\.(\d+)$', expected_log_file).groups()[0] updated_log_file_suffix = re.search( '^.*\.(\d+)$', bookmark['log_file']).groups()[0] print("PATH B") self.assertGreater(int(updated_log_file_suffix), int(expected_log_file_suffix)) # Execute delete across tables using join prior to 3rd sync deleted_id = 4 with db_utils.get_db_connection( self.get_properties(), self.get_credentials()).cursor() as cur: delete_time = datetime.datetime.now() # DELETE T1, T2 # FROM T1 # INNER JOIN T2 ON T1.key = T2.key # WHERE condition; db = self.database_name() db_t1 = db + "." + t1 db_t2 = db + "." + t2 t1_key = db_t1 + ".id" t2_key = db_t2 + ".id" statement = "DELETE {}, {} ".format(db_t1, db_t2) + \ "FROM {} ".format(t1) + \ "INNER JOIN {} ON {} = {} ".format(db_t2, t1_key, t2_key) + \ "WHERE {} = {}".format(t1_key, deleted_id) cur.execute(statement) print( "\n\nMySQL DB Actions." + \ "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_2()) + \ "\nTABLE: {}".format(self.table_name_2()) + \ "\nEVENTS: {} records deleted\n\n".format(1) ) # run binlog sync sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # check that version from state is unchanged state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id(t1)] self.assertEqual(expected_table_version, bookmark['version']) target_records = runner.get_records_from_target_output() records_stream_1 = target_records[self.table_name_1()] upsert_records_1 = [ m['data'] for m in records_stream_1['messages'] if m['action'] == 'upsert' ] records_stream_2 = target_records[self.table_name_2()] upsert_records_2 = [ m['data'] for m in records_stream_2['messages'] if m['action'] == 'upsert' ] # make sure the record is in the target for both tables with a delete time deleted_at_t1 = upsert_records_1[0].get('_sdc_deleted_at') deleted_at_t1_timestamp = utils.strptime_to_utc( deleted_at_t1).timestamp() self.assertIsNotNone(deleted_at_t1) deleted_at_t2 = upsert_records_2[0].get('_sdc_deleted_at') deleted_at_t2_timestamp = utils.strptime_to_utc( deleted_at_t2).timestamp() self.assertIsNotNone(deleted_at_t2) # the delete times should be equal since it was a single transaction self.assertEqual(deleted_at_t1_timestamp, deleted_at_t2_timestamp) time_delta = delete_time.timestamp() - deleted_at_t1_timestamp print("Delete time vs record: difference in seconds", time_delta) self.assertLess(time_delta, 3) # time delta less than 3 seconds in magnitude
def test_run(self): """ Verify that a bookmark doesn't exist for the stream Verify that the second sync includes the same number or more records than the first sync Verify that all records in the first sync are included in the second sync Verify that the sync only sent records to the target for selected streams (catalogs) PREREQUISITE For EACH stream that is fully replicated there are multiple rows of data with different values for the replication key """ conn_id = self.create_connection_with_initial_discovery() # Select all streams and no fields within streams found_catalogs = menagerie.get_catalogs(conn_id) full_streams = { key for key, value in self.expected_replication_method().items() if value == self.FULL } our_catalogs = [ catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in full_streams ] self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=True) # Run a sync job using orchestrator first_sync_record_count = self.run_sync(conn_id) # verify that the sync only sent records to the target for selected streams (catalogs) self.assertEqual( set(first_sync_record_count.keys()), full_streams, logging="verify only full table streams were replicated") first_sync_state = menagerie.get_state(conn_id) # Get the set of records from a first sync first_sync_records_by_stream = runner.get_records_from_target_output() # Run a second sync job using orchestrator second_sync_record_count = self.run_sync(conn_id) # Get the set of records from a second sync second_sync_records_by_stream = runner.get_records_from_target_output() for stream in full_streams: with self.subTest(stream=stream): # verify there is no bookmark values from state state_value = first_sync_state.get("bookmarks", {}).get(stream) self.assertIsNone( state_value, logging="verify no bookmark value is saved in state") # verify that there is more than 1 record of data - setup necessary self.assertGreater( first_sync_record_count.get(stream, 0), 1, logging="verify multiple records are replicatied") # verify that you get the same or more data the 2nd time around self.assertGreaterEqual( second_sync_record_count.get(stream, 0), first_sync_record_count.get(stream, 0), logging= "verify the second full table sync replicates at least as many records as the first sync" ) # verify all data from 1st sync included in 2nd sync first_sync_records = [ record["data"] for record in first_sync_records_by_stream[stream]["messages"] ] second_sync_records = [ record["data"] for record in second_sync_records_by_stream[stream]["messages"] ] LOGGER.info( "verify all records from the first sync are replicated in the second sync" ) for record in first_sync_records: self.assertIn(record, second_sync_records)
def binlog_test(self): """ Test binlog replication • Verify an initial sync returns expected records of various datatypes • Verify no changes and a subsequent sync results in no replicated records • Update, Delete, and Insert records then verify the next sync captures these changes • Verify some log_file and log_pos state was persisted after each sync """ print("RUNNING {}\n\n".format(self.name())) conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) expected_check_streams = {self.tap_stream_id()} expected_sync_streams = {self.table_name()} expected_pks = {self.table_name(): {'id'}} # verify the tap discovered the right streams found_catalogs = [ catalog for catalog in menagerie.get_catalogs(conn_id) if catalog['tap_stream_id'] in expected_check_streams ] self.assertGreaterEqual( len(found_catalogs), 1, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = expected_check_streams.symmetric_difference(found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) # verify that persisted streams have the correct properties test_catalog = found_catalogs[0] self.assertEqual(self.table_name(), test_catalog['stream_name']) print("discovered streams are correct") additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] selected_metadata = connections.select_catalog_and_fields_via_metadata( conn_id, test_catalog, menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']), additional_md) # clear state menagerie.set_state(conn_id, {}) # run initial full table sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() self.maxDiff = None for stream, recs in records_by_stream.items(): self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." 
.format(stream)) record_count_by_stream = runner.examine_target_output_file( self, conn_id, expected_sync_streams, expected_pks) self.assertEqual(record_count_by_stream, {self.table_name(): 2}) records_for_stream = runner.get_records_from_target_output()[ self.table_name()] messages_for_stream = records_for_stream['messages'] message_actions = [rec['action'] for rec in messages_for_stream] self.assertEqual( message_actions, ['activate_version', 'upsert', 'upsert', 'activate_version']) # ensure some log_file and log_pos state was persisted state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id()] self.assertIsNotNone(bookmark['log_file']) self.assertIsNotNone(bookmark['log_pos']) expected_log_file = bookmark['log_file'] expected_log_pos = bookmark['log_pos'] # grab version, log_file and log_pos from state to check later expected_table_version = records_for_stream['table_version'] self.assertEqual(expected_table_version, bookmark['version']) # check for expected records upsert_records = [ m['data'] for m in messages_for_stream if m['action'] == 'upsert' ] self.assertEqual([expected_rec_1, expected_rec_2], upsert_records) # run binlog sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # check that version state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id()] self.assertEqual(expected_table_version, bookmark['version']) # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() for stream, recs in records_by_stream.items(): self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." 
.format(stream)) # record count should be empty as we did not persist anything to the gate record_count_by_stream = runner.examine_target_output_file( self, conn_id, expected_sync_streams, expected_pks) self.assertEqual(record_count_by_stream, {}) # run some inserts, updates, and deletes in source updated_rec_1_varchar = 'THIS HAS BEEN UPDATED' with db_utils.get_db_connection( self.get_properties(), self.get_credentials()).cursor() as cur: cur.execute( "UPDATE {}.{} SET our_varchar = '{}' WHERE id = {}".format( self.database_name(), self.table_name(), updated_rec_1_varchar, rec_1['id'])) delete_time = datetime.datetime.now() cur.execute("DELETE FROM {}.{} WHERE id = {}".format( self.database_name(), self.table_name(), rec_2['id'])) self.insert_record(cur, rec_3) # run binlog sync sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # check that version from state is unchanged state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][self.tap_stream_id()] self.assertEqual(expected_table_version, bookmark['version']) # Either the log_file is the same but the log_pos has increased or the log_file # has rotated and the numeric suffix has increased if expected_log_file == bookmark['log_file']: print("PATH A") self.assertGreater(bookmark['log_pos'], expected_log_pos) else: expected_log_file_suffix = re.search('^.*\.(\d+)$', expected_log_file).groups()[0] updated_log_file_suffix = re.search( '^.*\.(\d+)$', bookmark['log_file']).groups()[0] print("PATH B") self.assertGreater(int(updated_log_file_suffix), int(expected_log_file_suffix)) expected_log_file = bookmark['log_file'] expected_log_pos = bookmark['log_pos'] updated_expected_rec_1 = copy.deepcopy(expected_rec_1) updated_expected_rec_2 = copy.deepcopy(expected_rec_2) updated_expected_rec_3 = copy.deepcopy(expected_rec_3) updated_expected_rec_1['our_varchar'] = updated_rec_1_varchar # Floats that come back from binlog provide more precision # than from SELECT based queries updated_expected_rec_1['our_unsigned_float'] = Decimal( "1.2345000505447388") updated_expected_rec_1['our_signed_float'] = -Decimal( "1.2345000505447388") # updated_expected_rec_1['_sdc_deleted_at'] = None updated_expected_rec_2['our_unsigned_float'] = Decimal( "2.4690001010894775") updated_expected_rec_2['our_signed_float'] = -Decimal( "2.4690001010894775") # updated_expected_rec_2['_sdc_deleted_at'] = None # updated_expected_rec_3['_sdc_deleted_at'] = None # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() for stream, recs in records_by_stream.items(): self.assertEqual( recs['schema'], expected_schemas[stream], msg= "Persisted schema did not match expected schema for stream `{}`." 
                .format(stream))

        # check for expected records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)
        self.assertEqual(record_count_by_stream, {self.table_name(): 3})

        records_for_stream = runner.get_records_from_target_output()[
            self.table_name()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]

        self.assertEqual(message_actions, ['upsert', 'upsert', 'upsert'])

        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]

        # the deleted record's _sdc_deleted_at should be close to the time we
        # issued the DELETE (allow a few seconds of lag)
        deleted_at_rec = upsert_records[1].get('_sdc_deleted_at')
        self.assertIsNotNone(deleted_at_rec)
        deleted_at_rec_timestamp = utils.strptime_to_utc(
            deleted_at_rec).timestamp()
        time_delta = delete_time.timestamp() - deleted_at_rec_timestamp
        print("Delete time vs record: difference in seconds", time_delta)
        self.assertLess(
            time_delta, 3,
            msg="_sdc_deleted_at should be within a few seconds of the delete")

        # since we don't know exactly what the _sdc_deleted_at value will be
        # we will make the assertions we can make on that field here
        # and then remove it from all records prior to doing a full
        # record-level comparison
        self.assertIn('_sdc_deleted_at', upsert_records[0])
        self.assertIn('_sdc_deleted_at', upsert_records[1])
        self.assertIn('_sdc_deleted_at', upsert_records[2])
        self.assertIsNone(upsert_records[0].get('_sdc_deleted_at'))
        self.assertIsNotNone(upsert_records[1].get('_sdc_deleted_at'))
        self.assertIsNone(upsert_records[2].get('_sdc_deleted_at'))

        del upsert_records[0]['_sdc_deleted_at']
        del upsert_records[1]['_sdc_deleted_at']
        del upsert_records[2]['_sdc_deleted_at']

        self.assertEqual([
            updated_expected_rec_1, updated_expected_rec_2,
            updated_expected_rec_3
        ], upsert_records)

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]
        self.assertEqual(expected_table_version, bookmark['version'])

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()
        self.maxDiff = None
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg="Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # record count should be empty as we did not persist anything to the gate
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)
        self.assertEqual(record_count_by_stream, {})
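# ---------------------------------------------------------------------------
# A minimal standalone sketch (not part of the tap or the test class above) of
# the log-position comparison the binlog assertions rely on: the bookmark has
# advanced either because log_pos grew within the same log_file, or because
# the log_file rotated to a higher numeric suffix. The helper name is
# illustrative only.
import re


def binlog_has_advanced(old_file, old_pos, new_file, new_pos):
    """Return True if (new_file, new_pos) is strictly ahead of (old_file, old_pos)."""
    if old_file == new_file:
        return new_pos > old_pos
    # e.g. 'mysql-bin.000002' -> 2
    old_suffix = int(re.search(r'^.*\.(\d+)$', old_file).group(1))
    new_suffix = int(re.search(r'^.*\.(\d+)$', new_file).group(1))
    return new_suffix > old_suffix


# Usage with the bookmark captured above would look like:
#     binlog_has_advanced(expected_log_file, expected_log_pos,
#                         bookmark['log_file'], bookmark['log_pos'])  # -> True
# ---------------------------------------------------------------------------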
    def test_run(self):
        """
        Verify for each stream that you can do a sync which records bookmarks.
        Verify that the bookmark is the max value sent to the target for the `date` PK field
        Verify that the 2nd sync respects the bookmark
        Verify that all data of the 2nd sync is >= the bookmark from the first sync
        Verify that the number of records in the 2nd sync is less than the first
        Verify inclusivity of bookmarks

        PREREQUISITE
        For EACH stream that is incrementally replicated there are multiple rows of data with
            different values for the replication key
        """
        print("\n\nTESTING IN SQUARE_ENVIRONMENT: {}".format(
            os.getenv('TAP_SQUARE_ENVIRONMENT')))
        print("\n\nRUNNING {}\n\n".format(self.name()))

        # Instantiate static start date
        self.START_DATE = self.STATIC_START_DATE

        # Ensure tested streams have data
        expected_records_first_sync = self.create_test_data(
            self.testable_streams_static(), self.START_DATE)

        # Instantiate connection with default start
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Select all testable streams and no fields within streams
        found_catalogs = menagerie.get_catalogs(conn_id)
        streams_to_select = self.testable_streams_static()
        our_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in streams_to_select
        ]
        self.select_all_streams_and_fields(conn_id, our_catalogs)

        # Run a sync job using orchestrator
        first_sync_record_count = self.run_sync(conn_id)

        # verify that the sync only sent records to the target for selected streams (catalogs)
        self.assertEqual(
            streams_to_select,
            set(first_sync_record_count.keys()),
            msg="Expect first_sync_record_count keys {} to equal testable streams {},"
            " first_sync_record_count was {}".format(
                first_sync_record_count.keys(), streams_to_select,
                first_sync_record_count))

        first_sync_state = menagerie.get_state(conn_id)

        # Get the set of records from a first sync
        runner.get_records_from_target_output()

        # Set expectations for 2nd sync
        expected_records_second_sync = {x: [] for x in self.expected_streams()}
        # adjust expectations for full table streams to include the expected records from sync 1
        for stream in self.testable_streams_static():
            if stream in self.expected_full_table_streams():
                for record in expected_records_first_sync.get(stream, []):
                    expected_records_second_sync[stream].append(record)

        # Run a second sync job using orchestrator
        second_sync_record_count = self.run_sync(conn_id)

        # Get the set of records from a second sync
        second_sync_records = runner.get_records_from_target_output()

        second_sync_state = menagerie.get_state(conn_id)

        # Loop over the testable streams and compare the second sync against expectations
        for stream in self.testable_streams_static():
            with self.subTest(stream=stream):

                second_sync_data = [
                    record.get("data") for record in second_sync_records.get(
                        stream, {}).get("messages", [])
                ]

                # TESTING INCREMENTAL STREAMS
                if stream in self.expected_incremental_streams():

                    # Verify both syncs write / keep the same bookmark keys
                    self.assertEqual(
                        set(first_sync_state.get('bookmarks', {}).keys()),
                        set(second_sync_state.get('bookmarks', {}).keys()))

                    # Verify the second sync's bookmark moves past the first sync's
                    self.assertGreater(
                        second_sync_state.get('bookmarks', {}).get(
                            stream, {'updated_at': -1}).get('updated_at'),
                        first_sync_state.get('bookmarks', {}).get(
                            stream, {'updated_at': -1}).get('updated_at'))

                    # verify the first sync replicated more than 1 record so the bookmark test is meaningful
                    self.assertGreater(
                        first_sync_record_count.get(stream, 0),
                        1,
                        msg="Data isn't set up to be able to test bookmarks")

                    # verify that you get no data on the 2nd sync
                    self.assertGreaterEqual(
                        0,
                        second_sync_record_count.get(stream, 0),
                        msg="the second sync replicated records, so the bookmark was not respected")

                elif stream in self.expected_full_table_streams():

                    # TESTING FULL TABLE STREAMS

                    # Verify no bookmarks are present
                    first_state = first_sync_state.get('bookmarks', {}).get(stream)
                    self.assertEqual({}, first_state,
                                     msg="Unexpected state for {}\n".format(stream) + \
                                         "\tState: {}\n".format(first_sync_state) + \
                                         "\tBookmark: {}".format(first_state))

                    second_state = second_sync_state.get('bookmarks', {}).get(stream)
                    self.assertEqual({}, second_state,
                                     msg="Unexpected state for {}\n".format(stream) + \
                                         "\tState: {}\n".format(second_sync_state) + \
                                         "\tBookmark: {}".format(second_state))

                # TESTING APPLICABLE TO ALL STREAMS

                # Verify that the expected records are replicated in the 2nd sync
                # For incremental streams we should see 0 records
                # For full table streams we should see the same records from the first sync
                expected_records = expected_records_second_sync.get(stream, [])
                self.assertEqual(
                    len(expected_records),
                    len(second_sync_data),
                    msg="Expected number of records does not match actual for 2nd sync.\n"
                    "Expected: {}\nActual: {}".format(
                        len(expected_records), len(second_sync_data)))
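# ---------------------------------------------------------------------------
# A minimal standalone sketch, separate from the test class above, of the
# bookmark lookup pattern the incremental assertions depend on: tap state keeps
# a per-stream dict under 'bookmarks', and the replication-key value
# ('updated_at' here) is what must move forward between syncs. The helper name
# and its default value are hypothetical.
def get_bookmark_value(state, stream, replication_key='updated_at', default=-1):
    """Return the saved replication-key value for a stream, or `default` if absent."""
    return state.get('bookmarks', {}).get(stream, {}).get(replication_key, default)


# Usage against the two state snapshots captured above would look like:
#     first = get_bookmark_value(first_sync_state, stream)
#     second = get_bookmark_value(second_sync_state, stream)
#     assert second > first
# ---------------------------------------------------------------------------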
    def test_run(self):
        """
        Verify that a full sync captures all data and sends it in the correct format
        for integer and boolean (bit) data.
        Verify that the first sync sends an activate version message immediately.
        Verify that the table version is incremented.
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'FULL_TABLE'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                # TODO - test schema matches expectations based on data type, nullable, not nullable, datetimes as string, etc.
                #   This needs to be consistent based on replication method so you can change replication methods
                table_version = records_by_stream[stream]['table_version']

                # verify on the first sync you get activate version messages before and after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in zip(
                            column_names, stream_expected_data[self.VALUES][row])
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # remove sequences from actual values for comparison
                for message in records_by_stream[stream]['messages'][1:-1]:
                    message.pop("sequence")

                # Verify all data is correct
                for expected_row, actual_row in zip(
                        expected_messages,
                        records_by_stream[stream]['messages'][1:-1]):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")

                        for column_name, expected_value in expected_row[
                                "data"].items():
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual: {}".format(
                                    expected_row, actual_row))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                # TODO - change this to something for mssql once binlog (cdc) is finalized and we know what it is
                self.assertIsNone(
                    bookmark.get('lsn'),
                    msg="expected bookmark for stream to have NO lsn because we are using full-table replication")
                self.assertEqual(
                    bookmark['version'],
                    table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
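# ---------------------------------------------------------------------------
# A minimal standalone sketch, separate from the test class above, of how the
# expected upsert messages are assembled from the FIELDS/VALUES expectations:
# each row of expected values is zipped against the ordered column names. The
# helper name is hypothetical; the message shape mirrors the Singer upserts
# compared above.
def build_expected_upserts(column_names, value_rows):
    """Return Singer-style upsert messages for each expected row of values."""
    return [
        {"action": "upsert", "data": dict(zip(column_names, row))}
        for row in value_rows
    ]


# e.g. build_expected_upserts(['id', 'is_active'], [[1, True], [2, False]])
# -> [{'action': 'upsert', 'data': {'id': 1, 'is_active': True}},
#     {'action': 'upsert', 'data': {'id': 2, 'is_active': False}}]
# ---------------------------------------------------------------------------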