def test_persist_to_db_different_regions(self, mock_write, _mock_region, mock_session_return):
    """Persisting one region's batch must not disturb another region's data."""
    first_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    second_key = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)

    first_info = ingest_info.IngestInfo()
    first_info.create_person(
        person_id=TEST_ID,
        full_name=TEST_NAME).create_booking(booking_id=TEST_ID)
    second_info = ingest_info.IngestInfo()
    second_info.create_person(
        person_id=TEST_ID,
        full_name=TEST_NAME2).create_booking(booking_id=TEST_ID)

    first_task = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )
    second_task = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    # Write and persist the first region's batch.
    session_one = mock_session_return.return_value = create_mock_session()
    batch_persistence.write(first_info, first_key, first_task)
    expected_proto = serialization.convert_ingest_info_to_proto(first_info)
    batch_persistence.persist_to_database(first_key.region_code,
                                          session_one.start)
    self.assertEqual(mock_write.call_args[0][0], expected_proto)

    # We expect the region that we persisted to have no more ingest infos.
    first_remaining = datastore_ingest_info.batch_get_ingest_infos_for_region(
        REGIONS[0], session_one.start)
    self.assertEqual(len(first_remaining), 0)

    # The second region's batch is written after the first was persisted.
    session_two = mock_session_return.return_value = create_mock_session()
    batch_persistence.write(second_info, second_key, second_task)
    second_pending = datastore_ingest_info.batch_get_ingest_infos_for_region(
        REGIONS[1], session_two.start)
    self.assertEqual(len(second_pending), 1)

    # Persisting the second region writes its own proto, untouched by the
    # first region's earlier persist.
    expected_proto = serialization.convert_ingest_info_to_proto(second_info)
    batch_persistence.persist_to_database(second_key.region_code,
                                          session_two.start)
    self.assertEqual(mock_write.call_args[0][0], expected_proto)
    self.assertEqual(mock_write.call_count, 2)
def test_batch_delete_ingest_infos_for_region(self):
    """Deleting one region's ingest infos leaves other regions untouched."""
    task_hash = hash(
        json.dumps(
            Task(
                task_type=constants.TaskType.SCRAPE_DATA,
                endpoint=TEST_ENDPOINT,
                response_type=constants.ResponseType.TEXT,
            ).to_serializable(),
            sort_keys=True,
        ))
    start_time = datetime.now()

    # Three entities in the region that will be deleted.
    for person_id in ("1", "2", "3"):
        datastore_ingest_info.write_ingest_info(
            region="us_state_county",
            session_start_time=start_time,
            ingest_info=sample_ingest_info(person_id),
            task_hash=task_hash,
        )
    # One entity in a different region that must survive the delete.
    unrelated = datastore_ingest_info.write_ingest_info(
        region="unrelated_us_state_county",
        session_start_time=start_time,
        ingest_info=sample_ingest_info("n/a"),
        task_hash=task_hash,
    )

    datastore_ingest_info.batch_delete_ingest_infos_for_region(
        "us_state_county")

    # Deleted region is empty; the unrelated region still has its entity.
    assert (datastore_ingest_info.batch_get_ingest_infos_for_region(
        "us_state_county", start_time) == [])
    remaining = datastore_ingest_info.batch_get_ingest_infos_for_region(
        "unrelated_us_state_county", start_time)
    assert remaining == [unrelated]

    # Clean up the unrelated region too so later tests start empty.
    datastore_ingest_info.batch_delete_ingest_infos_for_region(
        "unrelated_us_state_county")
def test_persist_to_db(self, mock_write, _mock_region, mock_session_return):
    """A written batch is persisted to the database and cleared from Datastore."""
    session = mock_session_return.return_value = create_mock_session()
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

    info = ingest_info.IngestInfo()
    info.create_person(
        person_id=TEST_ID,
        full_name=TEST_NAME).create_booking(booking_id=TEST_ID)
    task = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    batch_persistence.write(info, scrape_key, task)
    expected_proto = serialization.convert_ingest_info_to_proto(info)
    batch_persistence.persist_to_database(scrape_key.region_code,
                                          session.start)

    # The proto handed to the database writer matches what we wrote.
    self.assertEqual(mock_write.call_args[0][0], expected_proto)

    # After we persist, there should no longer be ingest infos on Datastore.
    remaining = datastore_ingest_info.batch_get_ingest_infos_for_region(
        REGIONS[0], session.start)
    self.assertEqual(len(remaining), 0)
def test_write_errors(self):
    """A written scrape error is returned by the batch getter for its region."""
    task_hash = hash(
        json.dumps(
            Task(
                task_type=constants.TaskType.SCRAPE_DATA,
                endpoint=TEST_ENDPOINT,
                response_type=constants.ResponseType.TEXT,
            ).to_serializable(),
            sort_keys=True,
        ))
    session_start = datetime.now()

    error_data = datastore_ingest_info.write_error(
        region="us_state_county",
        session_start_time=session_start,
        error="error string",
        trace_id="trace",
        task_hash=task_hash,
    )

    fetched = datastore_ingest_info.batch_get_ingest_infos_for_region(
        "us_state_county", session_start)
    assert fetched == [error_data]

    # Clean up so later tests start from an empty region.
    datastore_ingest_info.batch_delete_ingest_infos_for_region(
        "us_state_county")
def test_batch_delete_over_500_ingest_infos_for_region(self):
    """Batch delete succeeds past Datastore's 500-entity write limit."""
    task_hash = hash(
        json.dumps(
            Task(
                task_type=constants.TaskType.SCRAPE_DATA,
                endpoint=TEST_ENDPOINT,
                response_type=constants.ResponseType.TEXT,
            ).to_serializable(),
            sort_keys=True,
        ))
    start_time = datetime.now()

    # The Datastore limit for entity writes in one call is 500. Write 600
    # entities so the delete helper has to handle more than one call's worth.
    for index in range(600):
        datastore_ingest_info.write_ingest_info(
            region="us_state_county",
            session_start_time=start_time,
            ingest_info=sample_ingest_info(str(index)),
            task_hash=task_hash,
        )

    datastore_ingest_info.batch_delete_ingest_infos_for_region(
        "us_state_county")

    assert (datastore_ingest_info.batch_get_ingest_infos_for_region(
        "us_state_county", start_time) == [])
def test_batch_delete_ingest_infos_for_region(self):
    """Deletes one region's ingest infos and checks other regions survive.

    NOTE(review): a test with this exact name also appears earlier in this
    file as visible here; if both definitions end up in the same class, the
    later one shadows the earlier — confirm whether this is a duplicate.
    """
    # Hash of the serialized task; used as the key for the stored entities.
    task_hash = hash(
        json.dumps(Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT).to_serializable(),
                   sort_keys=True))
    start_time = datetime.now()
    # Three entities in the region that will be deleted.
    datastore_ingest_info.write_ingest_info(
        region='us_state_county', session_start_time=start_time,
        ingest_info=sample_ingest_info('1'), task_hash=task_hash)
    datastore_ingest_info.write_ingest_info(
        region='us_state_county', session_start_time=start_time,
        ingest_info=sample_ingest_info('2'), task_hash=task_hash)
    datastore_ingest_info.write_ingest_info(
        region='us_state_county', session_start_time=start_time,
        ingest_info=sample_ingest_info('3'), task_hash=task_hash)
    # One entity in a different region that must not be deleted.
    unrelated = datastore_ingest_info \
        .write_ingest_info(region='unrelated_us_state_county',
                           session_start_time=start_time,
                           ingest_info=sample_ingest_info('n/a'),
                           task_hash=task_hash)
    datastore_ingest_info.batch_delete_ingest_infos_for_region(
        'us_state_county')
    # Deleted region is now empty...
    assert datastore_ingest_info.batch_get_ingest_infos_for_region(
        'us_state_county', start_time) == []
    # ...while the unrelated region still holds exactly its one entity.
    actual = datastore_ingest_info.batch_get_ingest_infos_for_region(
        'unrelated_us_state_county', start_time)
    assert actual == [unrelated]
    # Clean up the unrelated region as well so later tests start empty.
    datastore_ingest_info.batch_delete_ingest_infos_for_region(
        'unrelated_us_state_county')
def _get_batch_ingest_info_list(
        region_code: str,
        session_start_time: datetime.datetime) -> List[BatchIngestInfoData]:
    """Fetches every batched ingest-info message stored for one region.

    Args:
        region_code (str): The region code of the scraper.
        session_start_time (datetime): The start time of the scraper.

    Returns:
        A list of BatchIngestInfoData.
    """
    messages = datastore_ingest_info.batch_get_ingest_infos_for_region(
        region_code, session_start_time)
    return messages
def test_write_single_ingest_info(self):
    """A single written ingest info round-trips through the batch getter."""
    task_hash = hash(
        json.dumps(Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT).to_serializable(),
                   sort_keys=True))
    session_start = datetime.now()

    written = datastore_ingest_info.write_ingest_info(
        region='us_state_county',
        session_start_time=session_start,
        ingest_info=sample_ingest_info('1'),
        task_hash=task_hash)

    fetched = datastore_ingest_info.batch_get_ingest_infos_for_region(
        'us_state_county', session_start)
    assert fetched == [written]

    # Clean up so later tests start from an empty region.
    datastore_ingest_info.batch_delete_ingest_infos_for_region(
        'us_state_county')
def test_persist_to_db_same_task_one_fail_one_pass(self, mock_write,
                                                   _mock_region,
                                                   mock_session_return):
    """An error sharing its task with a successful write counts as a pass."""
    session = mock_session_return.return_value = create_mock_session()
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    mock_write.return_value = True

    info = ingest_info.IngestInfo()
    info.create_person(
        person_id=TEST_ID,
        full_name=TEST_NAME).create_booking(booking_id=TEST_ID)
    task = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )
    # Because the tasks are the same, we expect that to be counted as a
    # pass.
    same_task = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    batch_persistence.write(info, scrape_key, task)
    batch_persistence.write_error(TEST_ERROR, TEST_TRACE, same_task,
                                  scrape_key)

    expected_proto = serialization.convert_ingest_info_to_proto(info)
    self.assertTrue(
        batch_persistence.persist_to_database(scrape_key.region_code,
                                              session.start))
    self.assertEqual(mock_write.call_args[0][0], expected_proto)

    # Persisting clears the batched ingest infos for the region.
    remaining = datastore_ingest_info.batch_get_ingest_infos_for_region(
        REGIONS[0], session.start)
    self.assertEqual(len(remaining), 0)