def test_run(self):
    """
    Verify that we can get multiple pages of automatic fields for each stream.

    Check mode is run, streams are selected with ``select_all_fields=False``,
    and a sync is performed. Each expected stream must then report more
    records than the configured ``page_size``, and the fields found in the
    target output must equal the expected automatic fields for that stream.
    """
    conn_id = connections.ensure_connection(self)
    self.run_and_verify_check_mode(conn_id)
    self.select_and_verify_fields(conn_id, select_all_fields=False)

    synced_counts = self.run_and_verify_sync(conn_id)
    target_fields = runner.examine_target_output_for_fields()

    # Every expected stream has to sync more than one full page of records.
    for stream in self.expected_sync_streams():
        with self.subTest(stream=stream):
            synced = synced_counts.get(stream, 0)
            page_size = int(self.get_properties()['page_size'])
            failure_text = "{} did not sync more than a page of records".format(stream)
            self.assertGreater(synced, page_size, msg=failure_text)

    # Only automatic fields may reach the target for any stream that produced output.
    for stream_name in target_fields:
        with self.subTest(stream=stream_name):
            wanted = self.expected_automatic_fields()[stream_name]
            self.assertSetEqual(wanted, target_fields[stream_name])
def test_run(self):
    """
    Verify that each stream replicates records when only automatic fields are selected.

    Parent streams are selected with no fields; report streams are selected
    with the fields returned by ``report_automatic_fields()``. After a sync,
    every expected stream except ``goals_and_funnels_report`` must have at
    least one record, and the fields sent to the target must equal the
    expected automatic fields (plus ``_sdc_report_datetime`` and a single
    measure field for report streams).

    PREREQUISITE
    For EACH stream add enough data that you surpass the limit of a single
    fetch of data. For instance if you have a limit of 250 records ensure
    that 251 (or more) records have been posted for that stream.
    """
    self.start_date = '2020-11-10T00:00:00Z'
    conn_id = self.create_connection(original_properties=False)

    # Evaluate once; the same collection drives both selection and verification.
    expected_streams = self.expected_sync_streams()

    # Select all parent streams and no fields within streams
    # Select all (testable) report streams and only fields which are automatic
    # and/or required by bing to generate a report
    found_catalogs = menagerie.get_catalogs(conn_id)
    test_catalogs = [catalog for catalog in found_catalogs
                     if catalog.get('tap_stream_id') in expected_streams]

    # BUG_SRCE-4313 (https://stitchdata.atlassian.net/browse/SRCE-4313) streams missing automatic fields
    specific_fields = {**self.report_automatic_fields(), **self.parent_automatic_fields()}  # COMMENT to reproduce
    # specific_fields = {**self.report_measure_fields(), **self.parent_automatic_fields()}  # UNCOMMENT to reproduce
    # specific_fields = self.report_measure_fields()  # TODO Use this line once bugs addressed.

    self.perform_and_verify_adjusted_selection(
        conn_id, test_catalogs, select_all_fields=False, specific_fields=specific_fields
    )

    # COMMENT EVERYTHING DOWN FROM HERE TO ADDRESS BUG_SRCE-4313

    # Run a sync job using orchestrator
    state = menagerie.get_state(conn_id)
    record_count_by_stream = self.run_and_verify_sync(conn_id, state)
    actual_fields_by_stream = runner.examine_target_output_for_fields()

    for stream in expected_streams:
        with self.subTest(stream=stream):

            if stream == 'goals_and_funnels_report':  # SKIP TESTING FOR THIS STREAM
                # There is no data available, since we would need to implement
                # a tracking script on singer's site
                continue

            # verify that you get some records for each stream
            self.assertGreater(
                record_count_by_stream.get(stream, -1), 0,
                msg="The number of records is not over the stream max limit")

            # verify that only the automatic fields are sent to the target for parent streams, and that
            # automatic fields, _sdc_report_datetime, AND specific measure fields are sent to target for report streams
            actual = actual_fields_by_stream.get(stream) or set()

            # Copy before extending so the set owned by expected_automatic_fields()
            # is never mutated (it may be shared between calls or streams).
            expected = set(self.expected_automatic_fields().get(stream, set()))
            if stream.endswith('_report'):  # update expectations for report streams
                expected_measure = 'Assists' if stream.startswith('goals') else 'Clicks'
                expected |= {
                    '_sdc_report_datetime',  # tap applies sdc value as pk for all reports
                    expected_measure,  # reports require a perf measure (which is intentionally not automatic)
                }

            self.assertSetEqual(expected, actual)
def automatic_test(self, conn_id, testable_streams):
    """
    Verify that for each stream you can get multiple pages of data
    when no fields are selected and only the automatic fields are replicated.

    Only streams that are both in ``testable_streams`` and use the
    INCREMENTAL replication method are selected and verified.

    :param conn_id: connection to select catalogs on and to sync.
    :param testable_streams: collection of stream names eligible for this test.

    PREREQUISITE
    For EACH stream add enough data that you surpass the limit of a single
    fetch of data. For instance if you have a limit of 250 records ensure
    that 251 (or more) records have been posted for that stream.
    """
    incremental_streams = {
        key
        for key, value in self.expected_replication_method().items()
        if value == self.INCREMENTAL and key in testable_streams
    }

    # Select all streams and no fields within streams
    # IF THERE ARE NO AUTOMATIC FIELDS FOR A STREAM
    # WE WILL NEED TO UPDATE THE BELOW TO SELECT ONE
    found_catalogs = menagerie.get_catalogs(conn_id)
    our_catalogs = [
        catalog for catalog in found_catalogs
        if catalog.get('tap_stream_id') in incremental_streams
    ]
    self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=False)

    # Run a sync job using orchestrator
    record_count_by_stream = self.run_sync(conn_id)
    actual_fields_by_stream = runner.examine_target_output_for_fields()

    # Page size configured for the connection. The property is read as
    # 'results_per_page' (previously spelled 'result_per_page' here, which
    # always fell through to the default).
    default_page_size = self.get_properties().get(
        'results_per_page', self.DEFAULT_RESULTS_PER_PAGE)

    for stream in incremental_streams:
        with self.subTest(stream=stream):

            # verify that you get more than a page of data
            # SKIP THIS ASSERTION FOR STREAMS WHERE YOU CANNOT GET
            # MORE THAN 1 PAGE OF DATA IN THE TEST ACCOUNT
            stream_metadata = self.expected_metadata().get(stream, {})
            # int() guards against a string-valued property being compared to a count.
            minimum_record_count = int(
                stream_metadata.get(self.API_LIMIT, default_page_size))
            self.assertGreater(
                record_count_by_stream.get(stream, -1),
                minimum_record_count,
                msg="The number of records is not over the stream max limit")

            # verify that only the automatic fields are sent to the target
            self.assertEqual(
                actual_fields_by_stream.get(stream, set()),
                self.expected_primary_keys().get(stream, set())
                | self.expected_replication_keys().get(stream, set()),
                msg="The fields sent to the target are not the automatic fields")
def do_test(self, conn_id):
    """
    Verify that for each stream you can get multiple pages of data
    and that when all fields are selected more than the automatic fields are replicated.

    :param conn_id: connection whose catalogs are selected and synced.

    PREREQUISITE
    For EACH stream add enough data that you surpass the limit of a single
    fetch of data. For instance if you have a limit of 250 records ensure
    that 251 (or more) records have been posted for that stream.
    """
    # Select all streams and all fields within streams
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.select_all_streams_and_fields(conn_id, found_catalogs, select_all_fields=True)

    # Run a sync job using orchestrator
    record_count_by_stream = self.run_sync(conn_id)
    actual_fields_by_stream = runner.examine_target_output_for_fields()

    for stream in self.expected_streams():
        with self.subTest(stream=stream):

            actual_fields = actual_fields_by_stream.get(stream, set())
            # One definition of "automatic" shared by both field assertions below.
            automatic_fields = (
                self.expected_primary_keys().get(stream, set())
                | self.top_level_replication_key_fields().get(stream, set())
                | self.expected_foreign_keys().get(stream, set()))

            # verify that we can paginate with all fields selected
            self.assertGreater(
                record_count_by_stream.get(stream, -1),
                self.expected_metadata().get(stream, {}).get(self.API_LIMIT, 0),
                msg="The number of records is not over the stream max limit")

            # verify that the automatic fields are sent to the target
            self.assertTrue(
                actual_fields.issuperset(automatic_fields),
                msg="The fields sent to the target don't include all automatic fields")

            # verify we have more fields sent to the target than just automatic fields
            # SKIP THIS ASSERTION IF ALL FIELDS ARE INTENTIONALLY AUTOMATIC FOR THIS STREAM
            # A plain difference is used (not symmetric_difference): an expected
            # key that never appears in the target must not satisfy this check.
            self.assertTrue(
                actual_fields.difference(automatic_fields),
                msg="The fields sent to the target don't include non-automatic fields")
def do_test(self, conn_id):
    """
    Check pagination when nothing but automatic fields is replicated.

    Test data is created first, every discovered catalog is selected with
    ``select_all_fields=False``, and a sync is run. For each expected stream
    the record count must exceed the stream's API limit and the fields in
    the target must be exactly primary keys, top-level replication keys and
    foreign keys.

    PREREQUISITE
    For EACH stream add enough data that you surpass the limit of a single
    fetch of data. For instance if you have a limit of 250 records ensure
    that 251 (or more) records have been posted for that stream.
    """
    self.create_test_data()

    # Streams are selected without any fields. If a stream has no automatic
    # fields, one field will have to be selected explicitly here.
    catalogs = menagerie.get_catalogs(conn_id)
    self.select_all_streams_and_fields(conn_id, catalogs, select_all_fields=False)

    # Sync through the orchestrator and collect what reached the target.
    counts = self.run_sync(conn_id)
    fields_by_stream = runner.examine_target_output_for_fields()

    for stream in self.expected_streams():
        with self.subTest(stream=stream):

            # More than one page of data is required. Skip this check for
            # streams that cannot hold more than a page in the test account.
            synced = counts.get(stream, -1)
            limit = self.expected_metadata().get(stream, {}).get(self.API_LIMIT)
            self.assertGreater(
                synced, limit,
                msg="The number of records is not over the stream max limit")

            # Exactly the automatic fields must have been emitted.
            expected_fields_for_stream = self.expected_primary_keys().get(stream, set())
            expected_fields_for_stream = (
                expected_fields_for_stream
                | self.top_level_replication_key_fields().get(stream, set()))
            expected_fields_for_stream = (
                expected_fields_for_stream
                | self.expected_foreign_keys().get(stream, set()))

            emitted = fields_by_stream.get(stream, set())
            failure_text = (
                "The fields sent to the target are not the automatic fields.\nExpected: {}\nActual: {}"
                .format(expected_fields_for_stream, fields_by_stream.get(stream, set())))
            self.assertEqual(emitted, expected_fields_for_stream, msg=failure_text)
def test_run(self):
    """
    Verify that the fields sent to the target match each stream's schema.

    The testable streams are selected via their annotated schema, a sync is
    run, and for each stream the set of properties in the emitted schema
    must equal the fields found in the target output plus the fields listed
    for that stream in ``KNOWN_MISSING_FIELDS``. A warning is printed when a
    field listed as known-missing does show up in the output.
    """
    conn_id = connections.ensure_connection(self)
    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # Select only the expected streams tables
    expected_streams = self.testable_streams()
    catalog_entries = [
        ce for ce in found_catalogs if ce['tap_stream_id'] in expected_streams
    ]
    for catalog_entry in catalog_entries:
        stream_schema = menagerie.get_annotated_schema(
            conn_id, catalog_entry['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog_entry, stream_schema)

    # Run sync
    self.run_and_verify_sync(conn_id)

    synced_records = runner.get_records_from_target_output()
    # Read the target output once instead of once per stream.
    actual_fields_by_stream = runner.examine_target_output_for_fields()

    # Test by Stream
    for stream in expected_streams:
        with self.subTest(stream=stream):

            expected_fields = set(
                synced_records.get(stream)['schema']['properties'].keys())
            print('Number of expected keys ', len(expected_fields))

            actual_fields = set(actual_fields_by_stream[stream])
            print('Number of actual keys ', len(actual_fields))

            known_missing = KNOWN_MISSING_FIELDS[stream]
            print('Number of known missing keys ', len(known_missing))

            # Fields listed as missing that nevertheless arrived in the target.
            unexpected_fields = actual_fields & known_missing
            if unexpected_fields:
                print('WARNING: Found new fields: {}'.format(unexpected_fields))

            self.assertSetEqual(expected_fields, actual_fields | known_missing)
def test_run(self):
    """
    Verify that for each stream you can get multiple pages of data
    and that when all fields are selected more than the automatic fields are replicated.

    Before syncing, the test lists the existing objects for every tested
    stream and creates new ones (except for ``events``) until the count
    exceeds ``self.API_LIMIT``.

    PREREQUISITE
    For EACH stream add enough data that you surpass the limit of a single
    fetch of data. For instance if you have a limit of 250 records ensure
    that 251 (or more) records have been posted for that stream.
    """
    conn_id = connections.ensure_connection(self)

    incremental_streams = {
        key for key, value in self.expected_replication_method().items()
        if value == self.INCREMENTAL
    }
    untested_streams = self.child_streams().union({
        'balance_transactions',
        # 'charges',
        # 'coupons',
        # 'customers',
        'disputes',
        # 'invoice_items',
        'invoice_line_items',
        # 'invoices',
        'payout_transactions',
        # 'payouts',
        # 'plans',
        # 'products',
        'subscription_items',
        # 'subscriptions',
        'transfers',
    })
    tested_streams = incremental_streams.difference(untested_streams)

    # Select all streams and all fields within streams
    self.run_and_verify_check_mode(conn_id)
    our_catalogs = get_catalogs(conn_id, tested_streams)
    self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=True)

    # Ensure tested streams have a record count which exceeds the API LIMIT
    logging.info("Checking record counts for tested streams...")
    streams_to_create = {}
    for stream in tested_streams:
        record_count = len(list_all_object(stream))
        streams_to_create[stream] = record_count
        logging.info(" Stream {} has {} records created today".format(stream, record_count))

    logging.info("Creating records for tested streams...")
    new_objects = {stream: [] for stream in streams_to_create}
    for stream in streams_to_create:
        if stream != "events" and streams_to_create[stream] <= self.API_LIMIT:
            while streams_to_create[stream] <= self.API_LIMIT:
                logging.info("Creating a record for {} | {} records created today ".format(
                    stream, streams_to_create[stream]))
                new_objects[stream].append(create_object(stream))
                streams_to_create[stream] += 1
            # Re-list after creating: a single listing is expected to return
            # exactly 100 objects once the stream holds more than that.
            records = list_all_object(stream)
            self.assertEqual(100, len(records))
            logging.info(" Stream {} has at least {} records created today".format(
                stream, len(records) + 1))

    # Run a sync job using orchestrator
    record_count_by_stream = self.run_and_verify_sync(conn_id)
    actual_fields_by_stream = runner.examine_target_output_for_fields()

    for stream in tested_streams:
        with self.subTest(stream=stream):

            # verify that we can paginate with all fields selected
            self.assertGreater(
                record_count_by_stream.get(stream, -1),
                self.expected_metadata().get(stream, {}).get(self.API_LIMIT, 0),
                msg="The number of records is not over the stream max limit")

            actual = actual_fields_by_stream.get(stream) or set()
            expected = self.expected_automatic_fields().get(stream, set())

            # verify that the automatic fields are sent to the target
            self.assertTrue(
                actual.issuperset(expected),
                msg="The fields sent to the target don't include all automatic fields. "
                    "Expected: {}, Actual: {}".format(expected, actual))

            # verify we have more fields sent to the target than just automatic fields
            # SKIP THIS ASSERTION IF ALL FIELDS ARE INTENTIONALLY AUTOMATIC FOR THIS STREAM
            self.assertTrue(
                actual.difference(expected),
                msg="The fields sent to the target don't include any non-automatic fields")

            # NOTE: the keys of the first created object used to be collected
            # here without ever being asserted on. That lookup raised
            # IndexError for any stream that already held enough records (no
            # object created), so it was removed.
def test_run(self):
    """
    Verify that for each stream you can get multiple pages of data
    and that when all fields are selected more than the automatic fields are replicated.

    Catalogs are selected for INCREMENTAL streams that are not in the
    untested set; verification runs over ``expected_streams()`` minus the
    same untested set.

    PREREQUISITE
    For EACH stream add enough data that you surpass the limit of a single
    fetch of data. For instance if you have a limit of 250 records ensure
    that 251 (or more) records have been posted for that stream.
    """
    conn_id = self.create_connection()

    # Select all streams and all fields within streams
    found_catalogs = menagerie.get_catalogs(conn_id)
    incremental_streams = {
        key for key, value in self.expected_replication_method().items()
        if value == self.INCREMENTAL
    }
    untested_streams = self.child_streams().union({
        'abandoned_checkouts',
        'collects',
        'metafields',
        'transactions',
        'order_refunds',
    })
    # Computed once rather than once per catalog inside the comprehension.
    streams_to_select = incremental_streams.difference(untested_streams)
    our_catalogs = [
        catalog for catalog in found_catalogs
        if catalog.get('tap_stream_id') in streams_to_select
    ]
    self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=True)

    # Run a sync job using orchestrator
    record_count_by_stream = self.run_sync(conn_id)
    actual_fields_by_stream = runner.examine_target_output_for_fields()

    for stream in self.expected_streams().difference(untested_streams):
        with self.subTest(stream=stream):

            # verify that we can paginate with all fields selected
            self.assertGreater(
                record_count_by_stream.get(stream, -1),
                self.expected_metadata().get(stream, {}).get(self.API_LIMIT, 0),
                msg="The number of records is not over the stream max limit")

            actual_fields = actual_fields_by_stream.get(stream, set())
            automatic_fields = (
                self.expected_primary_keys().get(stream, set())
                | self.expected_replication_keys().get(stream, set())
                | self.expected_foreign_keys().get(stream, set()))

            # verify that the automatic fields are sent to the target
            self.assertTrue(
                actual_fields.issuperset(automatic_fields),
                msg="The fields sent to the target don't include all automatic fields")

            # verify we have more fields sent to the target than just automatic fields
            # SKIP THIS ASSERTION IF ALL FIELDS ARE INTENTIONALLY AUTOMATIC FOR THIS STREAM
            # difference (not symmetric_difference) states the intent directly:
            # something beyond the automatic fields must have been emitted.
            self.assertTrue(
                actual_fields.difference(automatic_fields),
                msg="The fields sent to the target don't include non-automatic fields")
def test_run(self):
    """
    Verify that a bookmark doesn't exist for the stream
    Verify that the second sync includes the same number or more records than the first sync
    Verify that all records in the first sync are included in the second sync
    Verify that the sync only sent records to the target for selected streams (catalogs)

    PREREQUISITE
    For EACH stream that is fully replicated there are multiple rows of data with
    different values for the replication key
    """
    # Local import keeps this method self-contained.
    from collections import Counter

    print("running test {}".format(self.name()))

    conn_id = self.create_connection()

    # Select all full-table streams and all fields within streams
    found_catalogs = menagerie.get_catalogs(conn_id)
    full_streams = {
        key for key, value in self.expected_replication_method().items()
        if value == self.FULL
    }
    our_catalogs = [
        catalog for catalog in found_catalogs
        if catalog.get('tap_stream_id') in full_streams
    ]
    self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=True)

    # Run a sync job using orchestrator
    first_sync_record_count = self.run_sync(conn_id)

    # verify that the sync only sent records to the target for selected streams (catalogs)
    self.assertEqual(set(first_sync_record_count.keys()), full_streams)

    first_sync_state = menagerie.get_state(conn_id)

    # Get the set of records from a first sync
    first_sync_records = runner.get_records_from_target_output()

    # Get the fields for each stream from the first sync
    actual_fields_by_stream = runner.examine_target_output_for_fields()

    # Run a second sync job using orchestrator
    second_sync_record_count = self.run_sync(conn_id)

    # Get the set of records from a second sync
    second_sync_records = runner.get_records_from_target_output()

    # THIS MAKES AN ASSUMPTION THAT CHILD STREAMS DO NOT NEED TESTING.
    # ADJUST IF NECESSARY
    for stream in full_streams.difference(self.child_streams()):
        with self.subTest(stream=stream):

            # verify there is no bookmark values from state
            state_value = first_sync_state.get("bookmarks", {}).get(stream)
            self.assertIsNone(state_value)

            # verify that there is more than 1 record of data - setup necessary
            self.assertGreater(
                first_sync_record_count.get(stream, 0), 1,
                msg="Data isn't set up to be able to test full sync")

            # verify that you get the same or more data the 2nd time around
            self.assertGreaterEqual(
                second_sync_record_count.get(stream, 0),
                first_sync_record_count.get(stream, 0),
                msg="second syc didn't have more records, full sync not verified")

            # verify all data from 1st sync included in 2nd sync
            # The fallback for a missing "messages" entry is an empty list: a
            # dict fallback would be iterated by key and fail on record["data"].
            first_data = [
                record["data"]
                for record in first_sync_records.get(stream, {}).get("messages", [])
            ]
            second_data = [
                record["data"]
                for record in second_sync_records.get(stream, {}).get("messages", [])
            ]

            # Compare as multisets of canonical JSON so each second-sync record
            # can account for at most one first-sync record, in linear time.
            first_serialized = Counter(
                json.dumps(record, sort_keys=True) for record in first_data)
            second_serialized = Counter(
                json.dumps(record, sort_keys=True) for record in second_data)
            same_records = sum((first_serialized & second_serialized).values())

            self.assertEqual(
                len(first_data), same_records,
                msg="Not all data from the first sync was in the second sync")

            # verify we have more fields sent to the target than just automatic fields
            # SKIP THIS ASSERTION IF ALL FIELDS ARE INTENTIONALLY AUTOMATIC FOR THIS STREAM
            # difference (not symmetric_difference): a missing automatic field
            # must not make this check pass.
            automatic_fields = (
                self.expected_primary_keys().get(stream, set())
                | self.expected_replication_keys().get(stream, set())
                | self.expected_foreign_keys().get(stream, set()))
            self.assertTrue(
                actual_fields_by_stream.get(stream, set()).difference(automatic_fields),
                msg="The fields sent to the target don't include non-automatic fields")
def test_run(self):
    """
    Verify that for each stream you can get multiple pages of data
    and that when all fields are selected more than the automatic fields are replicated.
    Also verify that no two replicated records share primary key values.

    NOTE(review): the assertions below are passed a ``logging=`` keyword,
    which stock ``unittest`` assertions do not accept - presumably the base
    class overrides them; confirm before changing.

    PREREQUISITE
    For EACH stream add enough data that you surpass the limit of a single
    fetch of data. For instance if you have a limit of 250 records ensure
    that 251 (or more) records have been posted for that stream.
    """
    conn_id = self.create_connection_with_initial_discovery()
    self.create_test_data()

    # Select all streams and all fields within streams
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.select_all_streams_and_fields(conn_id, found_catalogs, select_all_fields=True)

    # Run a sync job using orchestrator
    record_count_by_stream = self.run_sync(conn_id)
    actual_fields_by_stream = runner.examine_target_output_for_fields()
    synced_recs = runner.get_records_from_target_output()

    for stream in self.expected_streams():
        with self.subTest(stream=stream):

            # gather expectations
            expected_pks = self.expected_primary_keys()[stream]
            # One definition of "automatic" shared by both field assertions.
            automatic_fields = (
                self.expected_primary_keys().get(stream, set())
                | self.top_level_replication_key_fields().get(stream, set())
                | self.expected_foreign_keys().get(stream, set()))

            # gather results
            record_count = record_count_by_stream.get(stream, -1)
            api_limit = self.expected_metadata().get(stream, {}).get(self.API_LIMIT)
            replicated_fields = actual_fields_by_stream.get(stream, set())
            # A stream with no output yields no messages here, so the record
            # count assertion below reports the problem instead of a KeyError.
            pk_value_list = [
                tuple(message.get("data").get(pk) for pk in expected_pks)
                for message in synced_recs.get(stream, {}).get("messages", [])
                if message["action"] == "upsert"
            ]
            unique_pk_values = set(pk_value_list)

            # verify that we can paginate with all fields selected
            self.assertGreater(
                record_count, api_limit,
                logging="verify the number of records replicated exceeds the stream api limit")

            # verify that the automatic fields are sent to the target
            self.assertTrue(
                replicated_fields.issuperset(automatic_fields),
                logging="verify the automatic fields are sent to the target")

            # verify we have more fields sent to the target than just automatic fields
            # SKIP THIS ASSERTION IF ALL FIELDS ARE INTENTIONALLY AUTOMATIC FOR THIS STREAM
            self.assertTrue(
                replicated_fields.difference(automatic_fields),
                logging="verify more than just the automatic fields are sent to the target")

            # verify no records have duplicate primary-keys value
            self.assertEqual(
                len(pk_value_list), len(unique_pk_values),
                logging="verify records have unique primary key values")
def test_run(self):
    """
    - Verify that for each stream you can get multiple pages of data
    - when no fields are selected and only the automatic fields are replicated.
    - Verify that all replicated records have unique primary key values

    NOTE(review): the assertions below are passed a ``logging=`` keyword,
    which stock ``unittest`` assertions do not accept - presumably the base
    class overrides them; confirm before changing.

    PREREQUISITE
    For EACH stream add enough data that you surpass the limit of a single
    fetch of data. For instance if you have a limit of 250 records ensure
    that 251 (or more) records have been posted for that stream.
    """
    conn_id = self.create_connection_with_initial_discovery()
    self.create_test_data()

    # Select all streams and no fields within streams
    # IF THERE ARE NO AUTOMATIC FIELDS FOR A STREAM
    # WE WILL NEED TO UPDATE THE BELOW TO SELECT ONE
    found_catalogs = menagerie.get_catalogs(conn_id)
    expected_streams = self.expected_streams()
    our_catalogs = [
        catalog for catalog in found_catalogs
        if catalog.get('tap_stream_id') in expected_streams
    ]
    self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=False)

    # Run a sync job using orchestrator
    record_count_by_stream = self.run_sync(conn_id)
    actual_fields_by_stream = runner.examine_target_output_for_fields()
    synced_records = runner.get_records_from_target_output()

    for stream in expected_streams:
        with self.subTest(stream=stream):

            # gather expectations
            expected_primary_keys = self.expected_primary_keys().get(stream, set())
            expected_automatic_fields = (
                expected_primary_keys
                | self.top_level_replication_key_fields().get(stream, set())
                | self.expected_foreign_keys().get(stream, set()))
            api_limit = self.expected_metadata().get(stream, {}).get(self.API_LIMIT)

            # collect results
            # A stream with no output yields an empty message list so that the
            # record count assertion reports it, rather than an AttributeError.
            messages = synced_records.get(stream, {}).get('messages', [])
            record_count = record_count_by_stream.get(stream, -1)
            fields_replicated = actual_fields_by_stream.get(stream, set())
            # Only upsert messages are inspected; other message kinds are
            # skipped because they may not carry a 'data' payload.
            records_pks_list = [
                tuple(message.get('data').get(primary_key)
                      for primary_key in expected_primary_keys)
                for message in messages
                if message.get('action') == 'upsert'
            ]

            # verify that you get more than a page of data
            self.assertGreater(
                record_count, api_limit,
                logging="verify multiple pages are replicated")

            # verify that only the automatic fields are sent to the target
            self.assertEqual(
                fields_replicated, expected_automatic_fields,
                logging="verify only automatic fields are replicated")

            # Verify that all replicated records have unique primary key values
            self.assertCountEqual(
                set(records_pks_list), records_pks_list,
                msg="We have duplicate records for {}".format(stream),
                logging="verify all records have unique primary key values")
def run_test(self, streams):
    """
    Exercise pagination for the given streams.

    - Each stream must sync more records than ``self.page_size``.
    - The target must receive the automatic fields and something besides them.
    - Primary key values must not repeat across the synced messages.

    :param streams: the streams to select, sync and verify.
    """
    expected_streams = streams
    conn_id = connections.ensure_connection(self)

    # Discover, then select the requested streams.
    catalogs = self.run_and_verify_check_mode(conn_id)
    self.select_found_catalogs(conn_id, catalogs, only_streams=expected_streams)

    # Sync through the orchestrator and gather the target output.
    counts = self.run_and_verify_sync(conn_id, expected_streams)
    fields_by_stream = runner.examine_target_output_for_fields()
    sync_records = runner.get_records_from_target_output()

    for stream in expected_streams:
        with self.subTest(stream=stream):

            # More than one page must have been replicated.
            minimum_record_count = self.page_size
            self.assertGreater(
                counts.get(stream, -1),
                minimum_record_count,
                msg="The number of records is not over the stream max limit")

            pk_fields = self.expected_primary_keys().get(stream, set())
            stream_output = sync_records.get(stream, {'messages': []})
            sync_messages = stream_output.get('messages')
            rk_fields = self.expected_replication_keys().get(stream, set())
            automatic = pk_fields | rk_fields

            # Automatic fields must all be present in the target.
            has_all_automatic = fields_by_stream.get(stream, set()).issuperset(automatic)
            self.assertTrue(
                has_all_automatic,
                msg="The fields sent to the target don't include all automatic fields")

            # The target must hold more than just the automatic fields.
            beyond_automatic = fields_by_stream.get(stream, set()).symmetric_difference(automatic)
            self.assertTrue(
                beyond_automatic,
                msg="The fields sent to the target don't include non-automatic fields")

            # No record may have been duplicated across pages.
            records_pks_list = []
            for message in sync_messages:
                data = message.get('data')
                records_pks_list.append(tuple([data.get(pk) for pk in pk_fields]))
            self.assertCountEqual(
                records_pks_list,
                set(records_pks_list),
                msg=f"We have duplicate records for {stream}")
def test_run(self):
    """
    Verify that each tested stream replicates records when no fields are
    selected, and that only the automatic fields are sent to the target.

    One object is created per creatable stream before discovery so that
    every stream has data to sync.

    PREREQUISITE
    For EACH stream add enough data that you surpass the limit of a single
    fetch of data. For instance if you have a limit of 250 records ensure
    that 251 (or more) records have been posted for that stream.
    """
    conn_id = connections.ensure_connection(self)

    streams_to_create = {
        # "balance_transactions",  # should be created implicity with a create in the payouts or charges streams
        "charges",
        "coupons",
        "customers",
        "invoice_items",
        "invoice_line_items",  # this is created implicity by invoices, it just creates another invoice
        "invoices",  # this will create an invoice_item
        "payouts",
        "plans",
        "products",
        "subscription_items",
        "subscriptions",  # this will create a new plan and payment method
    }
    untested_streams = {"disputes", "transfers", "payout_transactions"}

    # Only the side effect of creating the objects matters; the created
    # objects themselves are not inspected by this test.
    for stream in streams_to_create:
        create_object(stream)

    # Select all streams and no fields within streams
    # IF THERE ARE NO AUTOMATIC FIELDS FOR A STREAM
    # WE WILL NEED TO UPDATE THE BELOW TO SELECT ONE
    found_catalogs = self.run_and_verify_check_mode(conn_id)
    self.select_all_streams_and_fields(conn_id, found_catalogs, select_all_fields=False)

    # Run a sync job using orchestrator
    record_count_by_stream = self.run_and_verify_sync(conn_id)
    actual_fields_by_stream = runner.examine_target_output_for_fields()

    for stream in self.expected_streams().difference(untested_streams):
        with self.subTest(stream=stream):

            # verify that you get some records for each stream
            # SKIP THIS ASSERTION FOR STREAMS WHERE YOU CANNOT GET
            # MORE THAN 1 PAGE OF DATA IN THE TEST ACCOUNT
            self.assertGreater(
                record_count_by_stream.get(stream, -1), 0,
                msg="The number of records is not over the stream max limit")

            # verify that only the automatic fields are sent to the target
            actual = actual_fields_by_stream.get(stream) or set()
            expected = self.expected_automatic_fields().get(stream, set())
            # Format arguments follow the label order (expected, then actual).
            self.assertEqual(
                actual, expected,
                msg=("The fields sent to the target are not the automatic fields. Expected: {}, Actual: {}"
                     .format(expected, actual)))
def pagination_test(self, conn_id, testable_streams):
    """
    Verify that for each stream you can get multiple pages of data
    and that when all fields are selected more than the automatic fields are replicated.
    Also verify that no record is duplicated across pages.

    :param conn_id: connection to select catalogs on and to sync.
    :param testable_streams: collection of stream names to select and verify.

    PREREQUISITE
    For EACH stream add enough data that you surpass the limit of a single
    fetch of data. For instance if you have a limit of 250 records ensure
    that 251 (or more) records have been posted for that stream.
    """
    # Select all testable streams and all fields within streams
    found_catalogs = menagerie.get_catalogs(conn_id)
    our_catalogs = [
        catalog for catalog in found_catalogs
        if catalog.get('tap_stream_id') in testable_streams
    ]
    self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=True)

    # Run a sync job using orchestrator
    record_count_by_stream = self.run_sync(conn_id)
    actual_fields_by_stream = runner.examine_target_output_for_fields()
    sync_records = runner.get_records_from_target_output()

    api_limit = int(self.get_properties().get(
        'results_per_page', self.DEFAULT_RESULTS_PER_PAGE))

    for stream in testable_streams:
        with self.subTest(stream=stream):

            # verify that we can paginate with all fields selected
            # 'transactions' is held to a fixed minimum of 100 instead of the page size.
            minimum_record_count = 100 if stream == 'transactions' else api_limit
            self.assertGreater(
                record_count_by_stream.get(stream, -1),
                minimum_record_count,
                msg="The number of records is not over the stream max limit")

            expected_pks = self.expected_primary_keys().get(stream, set())
            sync_messages = sync_records.get(stream, {'messages': []}).get('messages')
            actual_fields = actual_fields_by_stream.get(stream, set())
            automatic_fields = (
                expected_pks
                | self.expected_replication_keys().get(stream, set())
                | self.expected_foreign_keys().get(stream, set()))

            # verify that the automatic fields are sent to the target
            self.assertTrue(
                actual_fields.issuperset(automatic_fields),
                msg="The fields sent to the target don't include all automatic fields")

            # verify we have more fields sent to the target than just automatic fields
            # SKIP THIS ASSERTION IF ALL FIELDS ARE INTENTIONALLY AUTOMATIC FOR THIS STREAM
            self.assertTrue(
                actual_fields.difference(automatic_fields),
                msg="The fields sent to the target don't include non-automatic fields")

            # Verify we did not duplicate any records across pages
            # The tuples are built once; the set is derived from the list.
            records_pks_list = [
                tuple(message.get('data').get(primary_key) for primary_key in expected_pks)
                for message in sync_messages
            ]
            self.assertCountEqual(
                set(records_pks_list), records_pks_list,
                msg=f"We have duplicate records for {stream}")
def do_test(self, conn_id):
    """
    Verify that for each tested stream a sync returns more than one page of
    data, that every expected field reaches the target, and that non-child
    streams include all automatic fields.

    PREREQUISITE
    For EACH stream add enough data that you surpass the limit of a single
    fetch of data. For instance if you have a limit of 250 records ensure
    that 251 (or more) records have been posted for that stream.

    NOTE(review): this method does not select streams or fields itself (the
    selection call was commented out); presumably the caller has already
    done so - confirm.

    Args:
        conn_id: connection identifier handed to menagerie and run_sync.
    """
    # The catalogs are fetched but not used for selection in this method.
    # NOTE(review): the call is kept in case the fetch itself matters to the
    # connection - confirm and remove if it does not.
    menagerie.get_catalogs(conn_id)

    # Run a sync job using orchestrator
    record_count_by_stream = self.run_sync(conn_id)
    actual_fields_by_stream = runner.examine_target_output_for_fields()

    # Streams flagged with a falsy 'test' entry in the master table are skipped
    untested_streams = {
        stream for stream, config in self._master.items()
        if not config['test']
    }

    for stream in self.expected_streams().difference(untested_streams):
        with self.subTest(stream=stream):
            logging.info("Testing %s", stream)

            stream_config = self._master[stream]
            actual_fields = actual_fields_by_stream.get(stream, set())

            # verify that we can paginate with all fields selected
            self.assertGreater(
                record_count_by_stream.get(stream, -1),
                self.expected_metadata().get(stream, {}).get(self.API_LIMIT, 0),
                msg="The number of records is not over the stream max limit"
            )

            # TODO - change following assertion to a full equality check and capture all fields
            # Note - This ^ is nontrivial for fields which span multiple streams
            #        ex. {event_type: send} in estimate_messages = {sent_at: time} in estimates

            # verify the target receives all expected fields for a given stream
            # (one-directional: extra fields in the target are NOT detected)
            self.assertEqual(
                set(),
                stream_config["expected_fields"].difference(actual_fields),
                msg="The fields sent to the target are missing expected fields"
            )

            # verify that the automatic fields are sent to the target for non-child streams
            if not stream_config["child"]:
                self.assertTrue(
                    actual_fields.issuperset(
                        self.expected_primary_keys().get(stream, set())
                        | self.expected_replication_keys().get(stream, set())
                        | self.expected_foreign_keys().get(stream, set())),
                    msg="The fields sent to the target don't include all automatic fields"
                )
def test_run(self):
    """
    Check that a sync with no fields selected still returns more than a page
    of records per stream, and that only the automatic fields are replicated.

    PREREQUISITE
    For EACH stream add enough data that you surpass the limit of a single
    fetch of data. For instance if you have a limit of 250 records ensure
    that 251 (or more) records have been posted for that stream.
    """
    print("running test {}".format(self.name()))

    connection = self.create_connection()

    # Select all streams and no fields within streams
    # IF THERE ARE NO AUTOMATIC FIELDS FOR A STREAM
    # WE WILL NEED TO UPDATE THE BELOW TO SELECT ONE
    catalogs = menagerie.get_catalogs(connection)

    # Top-level metadata entry applied during selection
    full_table_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'FULL_TABLE'
        }
    }]
    # Properties passed to the selection helper as non-selected
    deselected_properties = [
        "MySmallIntColumn",
        "MyBigIntColumn",
        "MyTinyIntColumn",
        "my_boolean",
        "MyIntColumn",
    ]
    self.select_all_streams_and_fields(
        connection,
        catalogs,
        select_all_fields=False,
        additional_md=full_table_md,
        non_selected_properties=deselected_properties)

    # Reset the connection state to an empty dict, then run a sync job
    # using orchestrator
    menagerie.set_state(connection, {})
    synced_counts = self.run_sync(connection)
    replicated_fields_by_stream = runner.examine_target_output_for_fields()

    for stream in self.expected_streams():
        with self.subTest(stream=stream):

            # verify that you get more than a page of data
            # TODO this isn't really testing this...
            # SKIP THIS ASSERTION FOR STREAMS WHERE YOU CANNOT GET
            # MORE THAN 1 PAGE OF DATA IN THE TEST ACCOUNT
            synced = synced_counts.get(stream, -1)
            page_limit = self.expected_metadata().get(stream, {}).get(
                self.API_LIMIT, 0)
            self.assertGreater(
                synced,
                page_limit,
                msg="The number of records is not over the stream max limit"
            )

            # verify that only the automatic fields are sent to the target
            replicated = replicated_fields_by_stream.get(stream, set())
            automatic = (
                self.expected_primary_keys_by_stream_id().get(stream, set())
                | self.expected_replication_keys().get(stream, set())
                | self.expected_foreign_keys().get(stream, set())
            )
            self.assertEqual(
                replicated,
                automatic,
                msg="The fields sent to the target are not the automatic fields"
            )