def test_run(self):
    """
    Verify that cards carrying custom fields replicate those fields, and that
    every supported custom-field type appears in the replicated data.

    PREREQUISITE
    The static NEVER_DELETE board must hold cards carrying at least one custom
    field of every type in expected_custom_fields(); otherwise the pre-sync
    sanity assertion below fails and data must be restored manually on the
    Trello account.
    """
    print("\n\nRUNNING {}\n\n".format(self.name()))

    # Resetting tracked parent objects prior to test
    utils.reset_tracked_parent_objects()

    # ensure data exists for sync streams and set expectations
    _, existing_boards = utils.get_total_record_count_and_objects('boards')
    custom_fields_dict = {x: [] for x in self.expected_custom_fields()}  # ids by stream
    custom_fields_by_board = {x.get('id'): copy.deepcopy(custom_fields_dict)
                              for x in existing_boards}  # ids by stream

    # get existing custom fields for each board
    print("Getting objects on baord with static custom field set")
    for board_id, board_cfields in custom_fields_by_board.items():
        cfields = utils.get_custom_fields('boards', board_id)
        for field in self.expected_custom_fields():
            cfields_type_field = [f for f in cfields if f['type'] == field]
            if cfields_type_field:
                board_cfields[field] += cfields_type_field

    # get expected cards with custom fields
    expected_records_cfields = list()
    board_id = utils.NEVER_DELETE_BOARD_ID
    all_cards_on_board = utils.get_objects('cards', parent_id=board_id)
    print("Setting custom fields expectations based on static data")
    for card in all_cards_on_board:
        card_with_cfields = utils.get_objects('cards', obj_id=card.get('id'),
                                              parent_id=board_id, custom_fields=True)
        if card_with_cfields:
            expected_records_cfields += card_with_cfields

    # verify at least 1 record exists for each custom field type or else our
    # assertions below are invalid
    fields_exist = {x: False for x in self.expected_custom_fields()}
    for record in expected_records_cfields:
        if all(v for _, v in fields_exist.items()):
            break  # every type already seen; stop scanning early
        value = record.get('value')
        if value:
            # 'value' appears to be a single-key dict whose key encodes the
            # field type; checkbox and list types surface as the keys
            # 'checked' and 'option' respectively
            key = next(iter(value))
            if key in self.expected_custom_fields() and not fields_exist.get(key):
                fields_exist[key] = True
            elif key == 'checked':
                fields_exist['checkbox'] = True
            elif key == 'option':
                fields_exist['list'] = True

    self.assertTrue(all(v for _, v in fields_exist.items()),
                    msg="Not all custom field types have data. Data must be restored manually on Trello account" +\
                    "\nCurrent data: {}".format(fields_exist))

    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    # Select all streams and all fields
    self.select_all_streams_and_fields(conn_id, found_catalogs, select_all_fields=True)

    for cat in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, cat['stream_id'])
        for k in self.expected_automatic_fields()[cat['stream_name']]:
            mdata = next((m for m in catalog_entry['metadata']
                          if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k), None)
            print("Validating inclusion on {}: {}".format(cat['stream_name'], mdata))
            self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')

    # clear state
    menagerie.set_state(conn_id, {})

    # run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # read target output
    first_record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  first_record_count_by_stream.values())
    synced_records = runner.get_records_from_target_output()

    # Verify target has records for all synced streams
    for stream, count in first_record_count_by_stream.items():
        assert stream in self.expected_sync_streams()
        self.assertGreater(count, 0,
                           msg="failed to replicate any data for: {}".format(stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Testing streams with custom fields
    for stream in self.testable_streams():
        with self.subTest(stream=stream):
            data = synced_records.get(stream)
            record_messages = [row['data'] for row in data['messages']]
            record_ids = [message.get('id') for message in record_messages]
            record_custom_fields = [message.get('customFieldItems')
                                    for message in record_messages
                                    if message.get('customFieldItems', None)]
            record_cfield_ids = []
            for record in record_custom_fields:
                for cfield in record:
                    record_cfield_ids.append(cfield.get('id'))

            # Verify that we replicated the records with custom_fields
            # NOTE(review): expected_records_cfields holds dicts, so testing a
            # card id for membership looks like it can never be True — confirm
            # whether a set of ids was intended here.
            for card in all_cards_on_board:
                if card.get('id') in expected_records_cfields:
                    # BUG FIX: was `records_ids` (undefined name -> NameError)
                    self.assertIn(card.get('id'), record_ids,
                                  msg="Missing a record that has custom fields:\n{}".format(card.get('id')))

            # Verify that we replicated the expected custom fields on those records
            for expected_cfield in expected_records_cfields:
                self.assertIn(expected_cfield.get('id'), record_cfield_ids,
                              msg="Missing custom field from expected {} record id={}".format(
                                  stream, expected_cfield.get('id')))

                # Verify the expected custom field attributes match the replicated data
                # BUG FIX: initialized so an empty record_custom_fields cannot
                # raise UnboundLocalError; loop runs per expected_cfield
                expected_cfield_replicated = False
                for actual_cfields in record_custom_fields:
                    expected_cfield_replicated = expected_cfield in actual_cfields
                    if expected_cfield_replicated:
                        break
                self.assertTrue(expected_cfield_replicated)

    # Reset the parent objects that we have been tracking
    utils.reset_tracked_parent_objects()
def test_run(self):
    """
    Verify that when streams are selected with no fields chosen, ONLY the
    automatic fields are replicated, and that the replicated records match
    expectations built directly from the Trello API.

    PREREQUISITE
    For EACH stream at least one record must exist; one is created here when
    none does.
    """
    print("\n\nRUNNING {}\n\n".format(self.name()))

    # Resetting tracked parent objects prior to test
    utils.reset_tracked_parent_objects()

    # ensure data exists for sync streams and set expectations
    expected_records = {x: [] for x in self.expected_sync_streams()}  # ids by stream
    for stream in self.testable_streams():
        since = None
        if stream in self.expected_incremental_streams():
            # incremental streams only return records on/after the start date
            since = dt.strptime(self.get_properties()['start_date'],
                                self.START_DATE_FORMAT).strftime(self.TEST_TIME_FORMAT)
        _, existing_objects = utils.get_total_record_count_and_objects(stream, since=since)
        if existing_objects:
            logging.info("Data exists for stream: {}".format(stream))
            for obj in existing_objects:
                expected_records[stream].append(
                    {field: obj.get(field)
                     for field in self.expected_automatic_fields().get(stream)})
            continue
        logging.info("Data does not exist for stream: {}".format(stream))
        new_object = utils.create_object(stream)
        logging.info("Data generated for stream: {}".format(stream))
        expected_records[stream].append(
            {field: new_object.get(field)
             for field in self.expected_automatic_fields().get(stream)})

    conn_id = connections.ensure_connection(self)

    #run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    #verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    # Select all streams but only automatic fields
    self.select_all_streams_and_fields(conn_id, found_catalogs, select_all_fields=False)

    for cat in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, cat['stream_id'])
        for k in self.expected_automatic_fields()[cat['stream_name']]:
            mdata = next((m for m in catalog_entry['metadata']
                          if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k), None)
            print("Validating inclusion on {}: {}".format(cat['stream_name'], mdata))
            self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')

    #clear state
    menagerie.set_state(conn_id, {})

    # run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # read target output
    first_record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  first_record_count_by_stream.values())
    synced_records = runner.get_records_from_target_output()

    # Verify target has records for all synced streams
    for stream, count in first_record_count_by_stream.items():
        assert stream in self.expected_sync_streams()
        self.assertGreater(count, 0,
                           msg="failed to replicate any data for: {}".format(stream))
    print("total replicated row count: {}".format(replicated_row_count))

    for stream in self.testable_streams():
        with self.subTest(stream=stream):
            data = synced_records.get(stream)
            record_messages_keys = [set(row['data'].keys()) for row in data['messages']]
            expected_keys = self.expected_automatic_fields().get(stream)

            # Verify that ONLY automatic fields are emitted
            for actual_keys in record_messages_keys:
                self.assertEqual(actual_keys.symmetric_difference(expected_keys), set(),
                                 msg="Expected automatic fields and nothing else.")

            actual_records = [row['data'] for row in data['messages']]

            # Verify the number of records match expectations
            # NOTE: actions seem to be getting updated by trello's backend resulting in an action from a previous
            # test run gettting synced again, so we will be less strict for this stream
            # BUG FIX: messages previously read "do match" instead of "do not match"
            if stream == 'actions':
                self.assertLessEqual(len(expected_records.get(stream)), len(actual_records),
                                     msg="Number of actual records do not match expectations. " +\
                                     "We probably have duplicate records.")
            else:
                self.assertEqual(len(expected_records.get(stream)), len(actual_records),
                                 msg="Number of actual records do not match expectations. " +\
                                 "We probably have duplicate records.")

            # verify by values, that we replicated the expected records
            for actual_record in actual_records:
                if stream != 'actions':  # see NOTE above
                    self.assertTrue(actual_record in expected_records.get(stream),
                                    msg="Actual record missing from expectations")
            for expected_record in expected_records.get(stream):
                self.assertTrue(expected_record in actual_records,
                                msg="Expected record missing from target.")

    # CLEAN UP
    stream_to_delete = 'boards'
    boards_remaining = 5
    print("Deleting all but {} records for stream {}.".format(
        boards_remaining, stream_to_delete))
    board_count = len(expected_records.get(stream_to_delete, []))
    for obj_to_delete in expected_records.get(stream_to_delete, []):  # Delete all boards between syncs
        if board_count > boards_remaining:
            utils.delete_object(stream_to_delete, obj_to_delete.get('id'))
            board_count -= 1
        else:
            break

    # Reset the parent objects that we have been tracking
    utils.reset_tracked_parent_objects()
def test_run(self):
    """
    Verify that for each stream you can get multiple pages of data
    when no fields are selected and only the automatic fields are replicated.

    PREREQUISITE
    For EACH stream add enough data that you surpass the limit of a single
    fetch of data. For instance if you have a limit of 250 records ensure
    that 251 (or more) records have been posted for that stream. (Records
    are created here when a stream is short of the limit.)
    """
    print("\n\nRUNNING {}\n\n".format(self.name()))

    # Ensure tested streams have a record count which exceeds the API LIMIT
    expected_records = {x: [] for x in self.expected_sync_streams()}  # ids by stream
    final_count = {x: 0 for x in self.expected_sync_streams()}
    for stream in self.testable_streams():  # just actions at the moment
        # Look for parent object with most number of stream records
        start_date = dt.strptime(self.get_properties().get('start_date'),
                                 self.START_DATE_FORMAT)
        since = start_date.strftime(self.TEST_TIME_FORMAT)
        parent_stream = utils.get_parent_stream(stream)
        record_count, parent_id = self.get_highest_record_count_by_parent_obj_id(
            parent_stream, stream, since)

        # hoisted: the automatic-field set is invariant per stream
        automatic_fields = self.expected_automatic_fields().get(stream)

        if record_count > 0:  # If we do have data already add it to expectations
            logging.info("Data exists for stream: {}".format(stream))
            existing_objects = utils.get_objects(obj_type=stream, parent_id=parent_id,
                                                 since=since)
            assert record_count == len(existing_objects), "TEST ISSUE | referencing wrong parent obj."
            for obj in existing_objects:
                expected_records[stream].append(
                    {field: obj.get(field) for field in automatic_fields})

        if record_count <= self.API_LIMIT:
            logging.info("Not enough data to paginate : {} has {} records".format(stream, record_count))
            while record_count <= self.API_LIMIT:
                new_object = utils.create_object(obj_type=stream, parent_id=parent_id)
                record_count += 1
                logging.info("Record Created: {} has {} records".format(stream, record_count))
                expected_records[stream].append(
                    {field: new_object.get(field) for field in automatic_fields})

        final_count[stream] = record_count
        logging.info("FINAL RECORD COUNT: {} has {} records".format(stream, final_count[stream]))

        # Verify we did in fact generate enough records to exceed the API LIMIT
        # If we are failing here, it is most likely an issue with /tests/trello_utils.py
        self.assertGreater(final_count[stream], self.API_LIMIT,
                           msg="Failed to create sufficient data prior to sync.")

    conn_id = connections.ensure_connection(self)

    #run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    #verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    #select all catalogs
    for cat in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, cat['stream_id'])
        for k in self.expected_automatic_fields()[cat['stream_name']]:
            mdata = next((m for m in catalog_entry['metadata']
                          if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k), None)
            print("Validating inclusion on {}: {}".format(cat['stream_name'], mdata))
            self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')
        connections.select_catalog_and_fields_via_metadata(conn_id, cat, catalog_entry)

    #clear state
    menagerie.set_state(conn_id, {})

    # run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # read target output
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    synced_records = runner.get_records_from_target_output()

    for stream in self.testable_streams():
        with self.subTest(stream=stream):
            # Verify we are paginating for testable synced streams
            self.assertGreater(record_count_by_stream.get(stream, -1), self.API_LIMIT,
                               msg="We didn't gaurantee pagination. The number of records should exceed the api limit.")

            # BUG FIX: default was [] which would raise TypeError on
            # data['messages'] below when a stream is absent; an empty
            # message list lets the assertions fail cleanly instead.
            data = synced_records.get(stream, {'messages': []})
            record_messages_keys = [set(row['data'].keys()) for row in data['messages']]
            automatic_fields = self.expected_automatic_fields().get(stream)
            for actual_keys in record_messages_keys:

                # Verify that the automatic fields are sent to the target for paginated streams
                self.assertEqual(automatic_fields - actual_keys, set(),
                                 msg="A paginated synced stream has a record that is missing automatic fields.")

                # Verify we have more fields sent to the target than just automatic fields (this is set above)
                # SKIP THIS ASSERTION IF ALL FIELDS ARE INTENTIONALLY AUTOMATIC FOR THIS STREAM
                self.assertGreater(actual_keys, automatic_fields,
                                   msg="A paginated synced stream has a record that is missing non-automatic fields.")

    # Reset the parent objects that we have been tracking
    utils.reset_tracked_parent_objects()
def test_run(self):
    """
    Run two syncs and verify bookmarking behavior.

    Full-table streams must re-replicate everything plus any records created
    between syncs. Incremental streams must save a 'window_start' bookmark
    and, on the second sync, pick up records created or updated after the
    first sync (an updated comment action is used to prove updates are
    captured).
    """
    print("\n\nRUNNING {}\n\n".format(self.name()))

    # ensure data exists for sync streams and set expectations
    expected_records_1 = {x: [] for x in self.expected_sync_streams()}  # ids by stream
    for stream in self.expected_sync_streams().difference(self.untestable_streams()):
        if stream in self.expected_incremental_sync_streams():
            # incremental streams only return records on/after the start date
            start_date = dt.strptime(self.get_properties().get('start_date'),
                                     self.START_DATE_FORMAT)
            since = start_date.strftime(self.TEST_TIME_FORMAT)
            _, existing_objects = utils.get_total_record_count_and_objects(stream, since=since)
        else:
            _, existing_objects = utils.get_total_record_count_and_objects(stream)
        if existing_objects:
            logging.info("Data exists for stream: {}".format(stream))
            for obj in existing_objects:  # add existing records to expectations
                expected_records_1[stream].append(obj)
            continue
        # Create 1 record if none exist
        logging.info("Data does not exist for stream: {}".format(stream))
        new_object = utils.create_object(stream)
        logging.info("Data generated for stream: {}".format(stream))
        expected_records_1[stream].append(new_object)

    # Create comment actions (comments are the only updatable action type)
    action_comments = []
    action_comments.append(utils.create_object('actions', action_type="comment"))
    action_comments.append(utils.create_object('actions', action_type="comment"))
    for action in action_comments:
        expected_records_1['actions'].append(action)

    # run in check mode
    conn_id = connections.ensure_connection(self)
    check_job_name = runner.run_check_mode(self, conn_id)

    #verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    #select all catalogs
    for c in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        for k in self.expected_automatic_fields()[c['stream_name']]:
            mdata = next((m for m in catalog_entry['metadata']
                          if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k), None)
            print("Validating inclusion on {}: {}".format(c['stream_name'], mdata))
            self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')
        connections.select_catalog_and_fields_via_metadata(conn_id, c, catalog_entry)

    #clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    #verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify data was replicated
    record_count_by_stream_1 = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count_1 = reduce(lambda accum, c: accum + c,
                                    record_count_by_stream_1.values())
    self.assertGreater(replicated_row_count_1, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream_1))
    print("total replicated row count: {}".format(replicated_row_count_1))

    # get emitted with records
    synced_records_1 = runner.get_records_from_target_output()

    # Verify bookmarks were saved for all streams
    state_1 = menagerie.get_state(conn_id)
    for stream in self.expected_incremental_sync_streams():
        self.assertTrue(state_1.get('bookmarks', {}).get(stream, {}).get('window_start', {}))
    print("Bookmarks meet expectations")

    # Generate data between syncs for bookmarking streams
    print("Generating more data prior to 2nd sync")
    expected_records_2 = {x: [] for x in self.expected_sync_streams()}
    for stream in self.expected_full_table_sync_streams().difference(self.untestable_streams()):
        for _ in range(1):
            new_object = utils.create_object(stream)
            expected_records_2[stream].append(
                {field: new_object.get(field)
                 for field in self.expected_automatic_fields().get(stream)})

    # Update a single comment action before second sync
    print("Updating existing data prior to 2nd sync")
    updated_records = {x: [] for x in self.expected_sync_streams()}
    action_id_to_update = random.choice(action_comments).get('id')
    updated_action = utils.update_object_action(obj_id=action_id_to_update)
    updated_records['actions'].append(updated_action)

    # Get new actions from data manipulation between syncs; query from the
    # bookmark minus the lookback window, matching the tap's sync behavior
    print("Acquriing in-test actions prior to 2nd sync")
    for stream in self.expected_incremental_sync_streams().difference(self.untestable_streams()):
        state = dt.strptime(state_1.get('bookmarks').get(stream).get('window_start'),
                            self.TEST_TIME_FORMAT)
        since = (state - timedelta(days=self.LOOKBACK_WINDOW)).strftime(self.TEST_TIME_FORMAT)
        _, objects = utils.get_total_record_count_and_objects(stream, since=since)
        for obj in objects:
            expected_records_2[stream].append(
                {field: obj.get(field)
                 for field in self.expected_automatic_fields().get(stream)})

    # Run another sync
    print("Running 2nd sync job")
    sync_job_name_2 = runner.run_sync_mode(self, conn_id)

    #verify tap and target exit codes
    exit_status_2 = menagerie.get_exit_status(conn_id, sync_job_name_2)
    menagerie.verify_sync_exit_status(self, exit_status_2, sync_job_name_2)

    # verify data was replicated
    record_count_by_stream_2 = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count_2 = reduce(lambda accum, c: accum + c,
                                    record_count_by_stream_2.values())
    self.assertGreater(replicated_row_count_2, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream_2))
    print("total replicated row count: {}".format(replicated_row_count_2))

    # get emitted with records
    synced_records_2 = runner.get_records_from_target_output()

    # Verify bookmarks were saved as expected inc streams
    state_2 = menagerie.get_state(conn_id)
    for stream in self.expected_incremental_sync_streams():
        self.assertTrue(state_2.get('bookmarks', {}).get(stream, {}).get('window_start', {}))
    print("Bookmarks meet expectations")

    # TESTING FULL TABLE STREAMS
    for stream in self.expected_full_table_sync_streams().difference(self.untestable_streams()):
        with self.subTest(stream=stream):
            record_count_1 = record_count_by_stream_1.get(stream, 0)
            record_count_2 = record_count_by_stream_2.get(stream, 0)

            # Assert we have data for both syncs for full table streams
            self.assertGreater(record_count_1, 0)
            self.assertGreater(record_count_2, 0)

            # Assert that we are capturing the expected number of records for full table streams
            self.assertGreater(record_count_2, record_count_1,
                               msg="Full table streams should have more data in second sync.")
            self.assertEqual((record_count_2 - record_count_1),
                             len(expected_records_2.get(stream, [])),
                             msg="The differnce in record counts between syncs should " +\
                             "equal the number of records we created between syncs.\n" +\
                             "This is not the case for {}".format(stream))

            # Test that we are capturing the expected records for full table streams
            expected_ids_1 = set(record.get('id') for record in expected_records_1.get(stream))
            data_1 = synced_records_1.get(stream, [])
            record_messages_1 = [row.get('data') for row in data_1['messages']]
            record_ids_1 = set(row.get('data').get('id') for row in data_1['messages'])
            expected_ids_2 = set(record.get('id') for record in expected_records_2.get(stream))
            data_2 = synced_records_2.get(stream, [])
            record_ids_2 = set(row.get('data').get('id') for row in data_2['messages'])

            # verify all expected records are replicated for both syncs
            self.assertEqual(expected_ids_1, record_ids_1,
                             msg="Data discrepancy. Expected records do not match actual in sync 1.")
            self.assertTrue(expected_ids_1.issubset(record_ids_2),
                            msg="Data discrepancy. Expected records do not match actual in sync 2.")

            for expected_record in expected_records_1.get(stream):
                actual_record = [message for message in record_messages_1
                                 if message.get('id') == expected_record.get('id')].pop()
                self.assertEqual(set(expected_record.keys()), set(actual_record.keys()),
                                 msg="Field mismatch between expectations and replicated records in sync 1.")

            # verify the 2nd sync gets records created after the 1st sync
            self.assertEqual(set(record_ids_2).difference(set(record_ids_1)),
                             expected_ids_2,
                             msg="We did not get the new record(s)")
    print("Full table streams tested.")

    # TESTING INCREMENTAL STREAMS
    for stream in self.expected_incremental_sync_streams().difference(self.untestable_streams()):
        with self.subTest(stream=stream):
            record_count_1 = record_count_by_stream_1.get(stream, 0)
            record_count_2 = record_count_by_stream_2.get(stream, 0)

            # Assert we have data for both syncs for inc streams
            self.assertGreater(record_count_1, 0)
            self.assertGreater(record_count_2, 0)

            # Assert that we are capturing the expected number of records for inc streams
            self.assertEqual(record_count_1, len(expected_records_1.get(stream, [])),
                             msg="Stream {} replicated an unexpedted number records on 1st sync.".format(stream))
            self.assertEqual(record_count_2, len(expected_records_2.get(stream, [])),
                             msg="Stream {} replicated an unexpedted number records on 2nd sync.".format(stream))

            # Assert that we are capturing the expected records for inc streams
            data_1 = synced_records_1.get(stream, [])
            record_messages_1 = [row.get('data').get('id') for row in data_1['messages']]
            data_2 = synced_records_2.get(stream, [])
            record_messages_2 = [row.get('data').get('id') for row in data_2['messages']]
            for record in expected_records_1.get(stream):
                self.assertTrue(record.get('id') in record_messages_1,
                                msg="Missing an expected record from sync 1.")
            for record in expected_records_2.get(stream):
                self.assertTrue(record.get('id') in record_messages_2,
                                msg="Missing an expected record from sync 2.")

            record_data_1 = [row.get('data') for row in data_1['messages']]
            record_data_2 = [row.get('data') for row in data_2['messages']]

            # Testing action comments (the only action type that can be updated)
            for action in action_comments:
                # Get text value for action comment from sync 1
                original_action_text = ""
                for record in record_data_1:
                    if record.get('id') == action.get('id'):
                        original_action_text = record.get('data').get('text')
                assert original_action_text, "Record {} is missing from 1st sync.".format(action.get('id'))

                # Get text value for action comment from sync 2
                # BUG FIX: previously uninitialized -> NameError on the first
                # action when missing, and a stale value from the previous
                # iteration silently passed the assert on later actions
                current_action_text = ""
                for record in record_data_2:
                    if record.get('id') == action.get('id'):
                        current_action_text = record.get('data').get('text')
                assert current_action_text, "Record {} is missing from 2nd sync.".format(action.get('id'))

                # Verify the action comment text matches expectations
                if action.get('id') == action_id_to_update:
                    self.assertNotEqual(original_action_text, current_action_text,
                                        msg="Update was not captured.")
                    self.assertIn("UPDATE", current_action_text,
                                  msg="Update was captured but not as expected.")
                else:
                    self.assertEqual(original_action_text, current_action_text,
                                     msg="Text does not match expected.")
    print("Incremental streams tested.")

    # CLEANING UP
    stream_to_delete = 'boards'
    boards_remaining = 5
    print("Deleting all but {} records for stream {}.".format(boards_remaining, stream_to_delete))
    board_count = len(expected_records_1.get(stream_to_delete, [])) + \
                  len(expected_records_2.get(stream_to_delete, []))
    for obj_to_delete in expected_records_2.get(stream_to_delete, []):  # Delete all boards between syncs
        if board_count > boards_remaining:
            utils.delete_object(stream_to_delete, obj_to_delete.get('id'))
            board_count -= 1
        else:
            break
    for obj_to_delete in expected_records_1.get(stream_to_delete, []):  # Delete all boards between syncs
        if board_count > boards_remaining:
            utils.delete_object(stream_to_delete, obj_to_delete.get('id'))
            board_count -= 1
        else:
            break

    # Reset the parent objects that we have been tracking
    utils.reset_tracked_parent_objects()
def test_run(self):
    """
    Exercise the tap's bookmarking / interrupted-sync behavior end to end.

    Flow:
      1. Seed each testable stream with at least 3 records (reusing existing data).
      2. Run discovery (check mode) and verify the catalogs.
      3. Select all catalogs, clear state, and run a baseline sync.
      4. Interject three crafted states and re-sync after each:
         - state_0: a standard resync from start_date (should reproduce the baseline).
         - state_1: an interrupted sync with date-windowing bookmarked on the
           second-most-recently created board.
         - state_2: an interrupted sync without date-windowing bookmarked on the
           most recently created board.
      5. Assert record counts / bookmarks after each sync, then clean up boards.

    NOTE(review): relies on project-local test services (menagerie, runner,
    connections, utils) and module-level imports (dt, timedelta, reduce, logging).
    """
    print("\n\nRUNNING {}\n\n".format(self.name()))

    # Initialize start date prior to first sync
    self.START_DATE = self.get_properties().get('start_date')

    # Ensure data exists for sync streams and set expectations.
    # expected_records maps stream name -> list of {automatic field: value} dicts.
    records_to_create = 3
    expected_records = {x: [] for x in self.expected_sync_streams()}  # ids by stream
    for stream in self.expected_sync_streams().difference(
            self.untestable_streams()):
        if stream in self.expected_incremental_sync_streams():
            # Incremental streams are counted from the configured start date.
            since = dt.strptime(self.START_DATE,
                                self.START_DATE_FORMAT).strftime(
                                    self.TEST_TIME_FORMAT)
            _, existing_objects = utils.get_total_record_count_and_objects(
                stream, since=since)
        else:
            _, existing_objects = utils.get_total_record_count_and_objects(
                stream)
        if existing_objects:
            logging.info("Data exists for stream: {}".format(stream))
            for obj in existing_objects:
                # add existing records to expectations
                expected_records[stream].append({
                    field: obj.get(field)
                    for field in self.expected_automatic_fields().get(stream)
                })
        else:
            logging.info(
                "Data does not exist for stream: {}".format(stream))
        while len(expected_records.get(stream)) < records_to_create:
            # Create more records if necessary
            new_object = utils.create_object(stream)
            logging.info("Data generated for stream: {}".format(stream))
            expected_records[stream].append({
                field: new_object.get(field)
                for field in self.expected_automatic_fields().get(stream)
            })

    # run in check mode
    conn_id = connections.ensure_connection(self)
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Verify discovery produced exactly the expected catalogs.
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))
    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    # Select all catalogs, verifying each automatic field carries
    # inclusion == 'automatic' in its metadata breadcrumb.
    for c in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(
            conn_id, c['stream_id'])
        for k in self.expected_automatic_fields()[c['stream_name']]:
            mdata = next(
                (m for m in catalog_entry['metadata']
                 if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k),
                None)
            print("Validating inclusion on {}: {}".format(
                c['stream_name'], mdata))
            self.assertTrue(
                mdata and mdata['metadata']['inclusion'] == 'automatic')
        connections.select_catalog_and_fields_via_metadata(
            conn_id, c, catalog_entry)

    # clear state
    menagerie.set_state(conn_id, {})

    # Run the baseline sync that later state-interjected syncs are compared to.
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify data was replicated
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count,
                       0,
                       msg="failed to replicate any data: {}".format(
                           record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))
    synced_records = runner.get_records_from_target_output()

    # Verify bookmarks were saved for all incremental streams
    state = menagerie.get_state(conn_id)
    for stream in self.expected_incremental_sync_streams():
        self.assertTrue(
            state.get('bookmarks', {}).get(stream, {}).get('window_start', {}))
    print("Bookmarks meet expectations")

    # Grab the empty formatted states to test; one pre-formatted state
    # template per entry in ACTIONS_STATES.
    states_to_test = [
        self.get_states_formatted(i) for i in range(len(self.ACTIONS_STATES))
    ]

    ##########################################################################
    ### Testing standard sync state_0
    ##########################################################################
    version_0 = menagerie.get_state_version(conn_id)

    # Set window_start to start_date so the resync covers the same range
    # as the baseline sync.
    window_start_0 = dt.strptime(self.START_DATE, self.START_DATE_FORMAT)
    states_to_test[0]['bookmarks']['actions'][
        'window_start'] = window_start_0.strftime(self.TEST_TIME_FORMAT)

    print("Interjecting test state:\n{}".format(states_to_test[0]))
    menagerie.set_state(conn_id, states_to_test[0], version_0)

    # Run another sync
    print("Running sync job 0")
    sync_job_name_0 = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status_0 = menagerie.get_exit_status(conn_id, sync_job_name_0)
    menagerie.verify_sync_exit_status(self, exit_status_0, sync_job_name_0)

    # verify data was replicated
    record_count_by_stream_0 = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count_0 = reduce(lambda accum, c: accum + c,
                                    record_count_by_stream_0.values())
    self.assertGreater(replicated_row_count_0,
                       0,
                       msg="failed to replicate any data: {}".format(
                           record_count_by_stream_0))
    print("total replicated row count: {}".format(replicated_row_count_0))
    synced_records_0 = runner.get_records_from_target_output()

    # Test state_0
    print("Testing State 0")
    state_0 = menagerie.get_state(conn_id)
    for stream in self.expected_incremental_sync_streams():
        # Verify bookmarks were saved as expected inc streams
        self.assertTrue(
            state_0.get('bookmarks', {}).get(stream, {}).get('window_start', {}))
    print("Bookmarks meet expectations")

    # The resync from start_date should reproduce the baseline sync exactly,
    # stream by stream.
    for stream in self.expected_sync_streams().difference(
            self.untestable_streams()):
        data = synced_records.get(stream)
        record_messages = [set(row['data']) for row in data['messages']]
        data_0 = synced_records_0.get(stream)
        record_messages_0 = [
            set(row['data']) for row in data_0['messages']
        ]
        # Verify we got the same number of records as the first sync
        self.assertEqual(
            record_count_by_stream_0.get(stream),
            record_count_by_stream.get(stream),
            msg="Syncs should replicate the samee number of records")
        self.assertEqual(
            record_messages_0,
            record_messages,
            msg="Syncs should replicate the samee number of records")
        # Verify we got the exact same records as the first sync
        for record_message in record_messages:
            self.assertTrue(record_message in record_messages_0,
                            msg="Expected {} to be in this sync.".format(
                                record_message))

    ##########################################################################
    ### Testing interrupted sync state_1 with date-windowing
    ##########################################################################
    version_1 = menagerie.get_state_version(conn_id)

    # Set parent_id to the id of the second-to-last board the tap will
    # replicate, simulating an interruption mid-way through the boards.
    sorted_parent_objs = self.get_tap_sorted_stream()
    penultimate_created_parent_id, _ = sorted_parent_objs[-2]
    last_created_parent_id, _ = sorted_parent_objs[-1]
    states_to_test[1]['bookmarks']['actions'][
        'parent_id'] = penultimate_created_parent_id

    # Set window_end based off current time
    window_end_1 = dt.utcnow().strftime(self.TEST_TIME_FORMAT)
    # window_end_1 = state['bookmarks']['actions']['window_start']
    states_to_test[1]['bookmarks']['actions']['window_end'] = window_end_1

    # Set sub_window_end to two days after the start date
    sub_window_end_1 = dt.strptime(
        self.START_DATE, self.START_DATE_FORMAT) + timedelta(days=2)
    states_to_test[1]['bookmarks']['actions'][
        'sub_window_end'] = sub_window_end_1.strftime(
            self.TEST_TIME_FORMAT)

    # Set window_start to start_date
    window_start_1 = dt.strptime(self.START_DATE, self.START_DATE_FORMAT)
    states_to_test[1]['bookmarks']['actions'][
        'window_start'] = window_start_1.strftime(self.TEST_TIME_FORMAT)

    print("Interjecting test state:\n{}".format(states_to_test[1]))
    menagerie.set_state(conn_id, states_to_test[1], version_1)

    # Run another sync (state_1)
    print("Running sync job 1")
    sync_job_name_1 = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status_1 = menagerie.get_exit_status(conn_id, sync_job_name_1)
    menagerie.verify_sync_exit_status(self, exit_status_1, sync_job_name_1)

    # verify data was replicated
    record_count_by_stream_1 = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count_1 = reduce(lambda accum, c: accum + c,
                                    record_count_by_stream_1.values())
    self.assertGreater(replicated_row_count_1,
                       0,
                       msg="failed to replicate any data: {}".format(
                           record_count_by_stream_1))
    print("total replicated row count: {}".format(replicated_row_count_1))
    synced_records_1 = runner.get_records_from_target_output()

    # Test state_1
    print("Testing State 1")
    state_1 = menagerie.get_state(conn_id)
    for stream in self.expected_incremental_sync_streams():
        # Verify bookmarks were saved as expected inc streams
        self.assertTrue(
            state_1.get('bookmarks', {}).get(stream, {}).get('window_start', {}))
        print("Bookmarks for {} meet expectations".format(stream))

        # Verify the original sync catches more data since current test state
        # bookmarks on the second most recent board
        self.assertGreater(
            record_count_by_stream.get(stream, 0),
            record_count_by_stream_1.get(stream, 0),
            msg="Expected to have more records for {}".format(stream))

        # Verify sync 1 only replicates data from the bookmarked parent object
        # (the most recently created board).
        records_last_board = utils.get_objects(
            stream, parent_id=last_created_parent_id, since=window_start_1)
        record_count_last_board = len(records_last_board)
        records_penult_window_start = utils.get_objects(
            stream,
            parent_id=penultimate_created_parent_id,
            since=window_start_1)
        record_count_penult_window_start = len(records_penult_window_start)
        records_penult_sub_window = utils.get_objects(
            stream,
            parent_id=penultimate_created_parent_id,
            since=sub_window_end_1)
        record_count_penult_sub_window = len(records_penult_sub_window)
        # Records of the penultimate board that fall before the sub-window
        # boundary (and so should be picked up by the resumed sync).
        record_count_penult_board = record_count_penult_window_start - record_count_penult_sub_window
        for record in records_penult_sub_window:
            # records_penult_window_start - records_penult_sub_window
            # (set difference by id, done in place on the list)
            for rec in records_penult_window_start:
                if record.get('id') == rec.get('id'):
                    records_penult_window_start.remove(rec)
                    break
        # Sanity check: the count arithmetic matches the in-place difference.
        assert record_count_penult_board == len(
            records_penult_window_start)

        expected_record_count_1 = record_count_penult_board + record_count_last_board
        # expected_records_1 = records_last_board + records_penult_window_start SEE FOR LOOPS

        # Log (but do not fail on) any individual expected record missing
        # from the sync output; the count assertion below is authoritative.
        synced_actions = synced_records_1.get(stream)
        actual_data = [
            row.get('data').get('id') for row in synced_actions['messages']
        ]
        for record in records_last_board:
            if record.get('id') in actual_data:
                continue
            print("MISSING RECORD {}".format(record))
        for record in records_penult_window_start:
            if record.get('id') in actual_data:
                continue
            print("MISSING RECORD {}".format(record))

        self.assertEqual(
            expected_record_count_1,
            record_count_by_stream_1.get(stream, 0),
            msg=
            "Sync 1 should only replicate data from the most recently creted board."
        )

    ##########################################################################
    ### Testing interrupted sync state_2 without date-windowing
    ##########################################################################
    version_2 = menagerie.get_state_version(conn_id)

    # Set parent_id to id of the last board the tap will replicate
    # Set window_end based off current time
    window_end_2 = dt.utcnow().strftime(self.TEST_TIME_FORMAT)

    # Set window_start to two days after the start date
    window_start_2 = dt.strptime(
        self.START_DATE, self.START_DATE_FORMAT) + timedelta(days=2)

    # Actions gets an empty bookmark; full-table streams (except boards)
    # are bookmarked mid-sync on the most recently created board.
    states_to_test[2]['bookmarks']['actions'] = {}
    for stream in self.expected_full_table_sync_streams().difference(
            {'boards'}):
        states_to_test[2]['bookmarks'][stream] = {
            'window_start': window_start_2.strftime(self.TEST_TIME_FORMAT),
            'window_end': window_end_2,
            'parent_id': last_created_parent_id
        }

    print("Interjecting test state:\n{}".format(states_to_test[2]))
    menagerie.set_state(conn_id, states_to_test[2], version_2)

    # Run another sync
    print("Running sync job 2")
    sync_job_name_2 = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status_2 = menagerie.get_exit_status(conn_id, sync_job_name_2)
    menagerie.verify_sync_exit_status(self, exit_status_2, sync_job_name_2)

    # verify data was replicated
    record_count_by_stream_2 = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count_2 = reduce(lambda accum, c: accum + c,
                                    record_count_by_stream_2.values())
    self.assertGreater(replicated_row_count_2,
                       0,
                       msg="failed to replicate any data: {}".format(
                           record_count_by_stream_2))
    print("total replicated row count: {}".format(replicated_row_count_2))
    synced_records_2 = runner.get_records_from_target_output()

    # Test state_2
    print("Testing State 2")
    state_2 = menagerie.get_state(conn_id)
    for stream in self.expected_full_table_sync_streams().difference(
            self.untestable_streams()):
        # Verify bookmarks were saved as expected inc streams
        self.assertTrue(
            state_2.get('bookmarks', {}).get(stream, {}).get('window_start', {}),
            msg="{} should have a bookmark value".format(stream))
        print("Bookmarks meet expectations")

        # Verify the smaller window replicates less data
        self.assertLessEqual(
            record_count_by_stream_2.get(stream, 0),
            record_count_by_stream.get(stream, 0),
            msg="Expected to have more records for {}".format(stream))

        # Verify the actions from today are caught in this sync
        expected_record_count_2 = len(
            utils.get_objects(stream, parent_id=last_created_parent_id))
        self.assertEqual(
            expected_record_count_2,
            record_count_by_stream_2.get(stream, 0),
            msg=
            "Should have less than or equal number of records based on whether we lookback."
        )

    ##########################################################################
    ### CLEAN UP
    ##########################################################################
    # Delete surplus boards so the account retains only boards_remaining.
    stream_to_delete = 'boards'
    boards_remaining = 5
    print("Deleting all but {} records for stream {}.".format(
        boards_remaining, stream_to_delete))
    board_count = len(expected_records.get(stream_to_delete, []))
    for obj_to_delete in expected_records.get(
            stream_to_delete, []):  # Delete all boards between syncs
        if board_count > boards_remaining:
            utils.delete_object(stream_to_delete, obj_to_delete.get('id'))
            board_count -= 1
        else:
            break

    # Reset the parent objects that we have been tracking
    utils.reset_tracked_parent_objects()