def test_write_to_datastore(self, mock_session_return):
    mock_session = mock_session_return.return_value = create_mock_session()
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

    ii = ingest_info.IngestInfo()
    ii.create_person(full_name=TEST_NAME).create_booking(booking_id=TEST_ID)

    t = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    task_hash = hash(json.dumps(t.to_serializable(), sort_keys=True))
    expected_batch = BatchIngestInfoData(ingest_info=ii, task_hash=task_hash)

    batch_persistence.write(ii, scrape_key, t)

    batch_ingest_info_list = batch_persistence._get_batch_ingest_info_list(
        scrape_key.region_code, mock_session.start)

    self.assertEqual(len(batch_ingest_info_list), 1)
    self.assertEqual(expected_batch, batch_ingest_info_list[0])
def test_persist_to_db_different_regions(self, mock_write, _mock_region,
                                         mock_session_return):
    scrape_key1 = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    scrape_key2 = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)

    ii = ingest_info.IngestInfo()
    ii.create_person(
        person_id=TEST_ID,
        full_name=TEST_NAME).create_booking(booking_id=TEST_ID)

    ii2 = ingest_info.IngestInfo()
    ii2.create_person(
        person_id=TEST_ID,
        full_name=TEST_NAME2).create_booking(booking_id=TEST_ID)

    t = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    t2 = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    mock_session_1 = mock_session_return.return_value = create_mock_session()

    batch_persistence.write(ii, scrape_key1, t)
    expected_proto = serialization.convert_ingest_info_to_proto(ii)
    batch_persistence.persist_to_database(scrape_key1.region_code,
                                          mock_session_1.start)

    result_proto = mock_write.call_args[0][0]
    self.assertEqual(result_proto, expected_proto)

    # We expect the region that we persisted to have no more ingest infos.
    ingest_infos_1 = datastore_ingest_info.batch_get_ingest_infos_for_region(
        REGIONS[0], mock_session_1.start)
    self.assertEqual(len(ingest_infos_1), 0)

    mock_session_2 = mock_session_return.return_value = create_mock_session()

    batch_persistence.write(ii2, scrape_key2, t2)
    ingest_infos_2 = datastore_ingest_info.batch_get_ingest_infos_for_region(
        REGIONS[1], mock_session_2.start)
    self.assertEqual(len(ingest_infos_2), 1)

    expected_proto = serialization.convert_ingest_info_to_proto(ii2)
    batch_persistence.persist_to_database(scrape_key2.region_code,
                                          mock_session_2.start)

    result_proto = mock_write.call_args[0][0]
    self.assertEqual(result_proto, expected_proto)
    self.assertEqual(mock_write.call_count, 2)
def test_scrape_data_and_more_no_persist_second_time_persist(
        self, mock_get_more, mock_fetch, mock_populate, mock_write):
    populate_task = Task.evolve(TEST_TASK,
                                task_type=constants.TaskType.SCRAPE_DATA)
    mock_get_more.return_value = [populate_task]
    mock_fetch.return_value = (TEST_HTML, {})
    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=False,
    )
    start_time = datetime.datetime.now()
    t = Task.evolve(TEST_TASK,
                    task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
    req = QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=t,
        scraper_start_time=start_time,
    )
    scraper = FakeScraper("test")
    scraper.BATCH_WRITES = False
    scraper._generic_scrape(req)

    # Should send the ii since we chose not to persist.
    expected_tasks = [
        QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=populate_task,
            scraper_start_time=start_time,
            ingest_info=self.ii,
        )
    ]

    self.assertEqual(mock_get_more.call_count, 1)
    self.assertEqual(mock_populate.call_count, 1)
    self.assertEqual(mock_write.call_count, 0)
    mock_get_more.assert_called_once_with(TEST_HTML, t)
    self.assertCountEqual(expected_tasks, scraper.tasks)

    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=True,
    )
    scraper._generic_scrape(scraper.tasks[0])
    self.assertEqual(mock_get_more.call_count, 1)
    self.assertEqual(mock_populate.call_count, 2)
    self.assertEqual(mock_write.call_count, 1)

    expected_metadata = IngestMetadata(
        scraper.region.region_code,
        scraper.region.jurisdiction_id,
        start_time,
        scraper.get_enum_overrides(),
    )
    expected_proto = convert_ingest_info_to_proto(self.ii)
    mock_write.assert_called_once_with(expected_proto, expected_metadata)
def test_scrape_data_no_more_tasks(self, mock_get_more, mock_fetch,
                                   mock_populate, mock_write):
    mock_fetch.return_value = (TEST_HTML, {})
    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=True,
    )
    start_time = datetime.datetime.now()
    t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA)
    req = QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=t,
        scraper_start_time=start_time,
    )
    scraper = FakeScraper("test")
    scraper.BATCH_WRITES = False
    scraper._generic_scrape(req)

    expected_metadata = IngestMetadata(
        scraper.region.region_code,
        scraper.region.jurisdiction_id,
        start_time,
        scraper.get_enum_overrides(),
    )
    expected_proto = convert_ingest_info_to_proto(self.ii)

    self.assertEqual(mock_get_more.call_count, 0)
    self.assertEqual(mock_populate.call_count, 1)
    self.assertEqual(mock_write.call_count, 1)
    mock_write.assert_called_once_with(expected_proto, expected_metadata)
    self.assertEqual(len(scraper.tasks), 0)
def test_persist_duplicates_to_db(self, mock_write, _mock_region,
                                  mock_session_return):
    """Tests that duplicate ingest_info.Person objects are merged before
    write."""
    mock_session = mock_session_return.return_value = create_mock_session()
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

    # Arrange
    ii = IngestInfo()
    ii.create_person(person_id=TEST_ID, full_name=TEST_NAME) \
        .create_booking(booking_id=TEST_ID)

    ii_2 = IngestInfo()
    ii_2.create_person(person_id=TEST_ID2, full_name=TEST_NAME2)

    ii_1_dup = copy.deepcopy(ii)

    t1, t2, t3 = (Task(task_type=constants.TaskType.SCRAPE_DATA,
                       endpoint=TEST_ENDPOINT + str(i),
                       response_type=constants.ResponseType.TEXT)
                  for i in range(3))

    batch_persistence.write(ii, scrape_key, t1)
    batch_persistence.write(ii_2, scrape_key, t2)
    batch_persistence.write(ii_1_dup, scrape_key, t3)

    batch_persistence.persist_to_database(scrape_key.region_code,
                                          mock_session.start)

    # The duplicate people from ii_1_dup should have been dropped in the
    # merge, leaving only the people from ii and ii_2.
    expected_ii = IngestInfo(people=ii.people + ii_2.people)
    expected_proto = ingest_utils.convert_ingest_info_to_proto(expected_ii)
    result_proto = mock_write.call_args[0][0]
    self.assertEqual(result_proto, expected_proto)
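# The merge behavior the test above relies on can be sketched as follows.
# This is an illustrative sketch only: the helper name _dedupe_people and the
# use of repr() as a dedup key are assumptions for exposition, not
# batch_persistence's actual implementation.
def _dedupe_people(infos):
    """Merge several IngestInfo objects, dropping Person entries that are
    exact duplicates of one already seen."""
    merged = ingest_info.IngestInfo()
    seen = set()
    for info in infos:
        for person in info.people:
            key = repr(person)  # assumes equal people share a stable repr
            if key not in seen:
                seen.add(key)
                merged.people.append(person)
    return merged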
def test_get_more_and_updates_cookies(
        self, mock_get_more: Mock, mock_fetch: Mock) -> None:
    mock_get_more.return_value = [TEST_TASK]
    mock_fetch.return_value = (TEST_HTML, {1: 1})
    start_time = datetime.datetime.now()
    req = QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=TEST_TASK,
        scraper_start_time=start_time,
    )
    t = Task.evolve(TEST_TASK, cookies={1: 1})
    scraper = FakeScraper("test")
    scraper.BATCH_WRITES = False
    scraper._generic_scrape(req)

    expected_tasks = [
        QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )
    ]
    self.assertCountEqual(expected_tasks, scraper.tasks)
def test_scrape_data_no_more_tasks_batch(
    self,
    mock_get_more: Mock,
    mock_fetch: Mock,
    mock_populate: Mock,
    mock_write: Mock,
    mock_batch_write: Mock,
) -> None:
    mock_fetch.return_value = (TEST_HTML, {})
    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=True,
    )
    start_time = datetime.datetime.now()
    t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA)
    req = QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=t,
        scraper_start_time=start_time,
    )
    scraper = FakeScraper("test")
    scraper._generic_scrape(req)

    scrape_key = ScrapeKey("test", constants.ScrapeType.BACKGROUND)

    self.assertEqual(mock_get_more.call_count, 0)
    self.assertEqual(mock_populate.call_count, 1)
    self.assertEqual(mock_write.call_count, 0)
    mock_batch_write.assert_called_once_with(
        ingest_info=self.ii,
        task=t,
        scrape_key=scrape_key,
    )
    self.assertEqual(len(scraper.tasks), 0)
def test_scrape_data_and_more_no_persist(
        self, mock_get_more, mock_fetch, mock_populate, mock_write):
    mock_get_more.return_value = [TEST_TASK]
    mock_fetch.return_value = (TEST_HTML, {})
    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=False,
    )
    start_time = datetime.datetime.now()
    t = Task.evolve(
        TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
    req = QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=t,
        scraper_start_time=start_time
    )
    scraper = FakeScraper('test')
    scraper.BATCH_WRITES = False
    scraper._generic_scrape(req)

    # Should send the ii since we chose not to persist.
    expected_tasks = [QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=TEST_TASK,
        scraper_start_time=start_time,
        ingest_info=self.ii
    )]

    self.assertEqual(mock_get_more.call_count, 1)
    self.assertEqual(mock_populate.call_count, 1)
    self.assertEqual(mock_write.call_count, 0)
    mock_get_more.assert_called_once_with(TEST_HTML, t)
    self.assertCountEqual(expected_tasks, scraper.tasks)
def test_fetch_sends_all_args(self, mock_get_more, mock_fetch):
    mock_get_more.return_value = [TEST_TASK]
    mock_fetch.return_value = (TEST_HTML, None)
    start_time = datetime.datetime.now()
    t = Task.evolve(
        TEST_TASK,
        headers='TEST_HEADERS',
        cookies='TEST_COOKIES',
        params='TEST_PARAMS',
        post_data='TEST_POST',
        json='TEST_JSON'
    )
    req = QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=t,
        scraper_start_time=start_time
    )
    scraper = FakeScraper('test')
    scraper.BATCH_WRITES = False
    scraper._generic_scrape(req)

    expected_tasks = [QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=TEST_TASK,
        scraper_start_time=start_time,
    )]

    mock_fetch.assert_called_once_with(
        t.endpoint, t.response_type,
        headers=t.headers,
        cookies=t.cookies,
        params=t.params,
        post_data=t.post_data,
        json_data=t.json
    )
    self.assertCountEqual(expected_tasks, scraper.tasks)
def test_write_errors(self):
    task_hash = hash(
        json.dumps(
            Task(
                task_type=constants.TaskType.SCRAPE_DATA,
                endpoint=TEST_ENDPOINT,
                response_type=constants.ResponseType.TEXT,
            ).to_serializable(),
            sort_keys=True,
        ))
    start_time = datetime.now()
    batch_ingest_info_data = datastore_ingest_info.write_error(
        region="us_state_county",
        session_start_time=start_time,
        error="error string",
        trace_id="trace",
        task_hash=task_hash,
    )

    results = datastore_ingest_info.batch_get_ingest_infos_for_region(
        "us_state_county", start_time)
    assert results == [batch_ingest_info_data]

    datastore_ingest_info.batch_delete_ingest_infos_for_region(
        "us_state_county")
def test_batch_delete_over_500_ingest_infos_for_region(self):
    task_hash = hash(
        json.dumps(
            Task(
                task_type=constants.TaskType.SCRAPE_DATA,
                endpoint=TEST_ENDPOINT,
                response_type=constants.ResponseType.TEXT,
            ).to_serializable(),
            sort_keys=True,
        ))
    start_time = datetime.now()
    # The Datastore limit for entity writes in one call is 500. Confirm
    # that batch delete is properly handled when more than 500 entities
    # exist for the same region.
    for i in range(600):
        datastore_ingest_info.write_ingest_info(
            region="us_state_county",
            session_start_time=start_time,
            ingest_info=sample_ingest_info(str(i)),
            task_hash=task_hash,
        )

    datastore_ingest_info.batch_delete_ingest_infos_for_region(
        "us_state_county")

    assert (datastore_ingest_info.batch_get_ingest_infos_for_region(
        "us_state_county", start_time) == [])
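# The 500-entity comment above refers to Cloud Datastore's limit on mutations
# per commit. A minimal sketch of staying under that limit by chunking keys,
# assuming a google.cloud.datastore.Client; the helper name _delete_in_chunks
# is illustrative, not the function under test.
from google.cloud import datastore


def _delete_in_chunks(client: datastore.Client, keys, chunk_size=500):
    """Delete entities in groups small enough for a single Datastore commit."""
    for i in range(0, len(keys), chunk_size):
        client.delete_multi(keys[i:i + chunk_size])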
def test_persist_to_db(self, mock_write, _mock_region, mock_session_return):
    mock_session = mock_session_return.return_value = create_mock_session()
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

    ii = ingest_info.IngestInfo()
    ii.create_person(
        person_id=TEST_ID,
        full_name=TEST_NAME).create_booking(booking_id=TEST_ID)

    t = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    batch_persistence.write(ii, scrape_key, t)

    expected_proto = serialization.convert_ingest_info_to_proto(ii)
    batch_persistence.persist_to_database(scrape_key.region_code,
                                          mock_session.start)

    result_proto = mock_write.call_args[0][0]
    self.assertEqual(result_proto, expected_proto)

    # After we persist, there should no longer be ingest infos on Datastore.
    ingest_infos = datastore_ingest_info.batch_get_ingest_infos_for_region(
        REGIONS[0], mock_session.start)
    self.assertEqual(len(ingest_infos), 0)
def write(ingest_info: IngestInfo, scrape_key: ScrapeKey, task: Task):
    session = sessions.get_current_session(scrape_key)
    if not session:
        raise DatastoreError(scrape_key.region_code, "write")
    datastore_ingest_info.write_ingest_info(
        region=scrape_key.region_code,
        session_start_time=session.start,
        ingest_info=ingest_info,
        task_hash=hash(json.dumps(task.to_serializable(), sort_keys=True)))
def validate_and_return_populate_data(self,
                                      content,
                                      expected_ingest_info=None,
                                      expected_single_counts=None,
                                      expected_persist=True,
                                      task=None,
                                      info=None):
    """This function runs populate_data and runs some extra validation
    on the output.

    Args:
        content: the content of the page to pass into get_more_tasks
        expected_ingest_info: the ingest info expected to be returned from
            `populate_data`. If `expected_ingest_info` is `None`, then
            expects the return value of `populate_data` to be `None`.
        expected_single_counts: the list of SingleCounts expected to be
            returned from `populate_data`.
        expected_persist: the expected value of persist to be returned
            from `populate_data`.
        task: the task that is being processed, optional.
        info: an ingest_info to use if provided.

    Returns:
        The result from populate_data in case the user needs to do any
        extra validations on the output.
    """
    info = info or ingest_info.IngestInfo()
    task = task or Task(task_type=constants.TaskType.SCRAPE_DATA,
                        endpoint='')

    scrape_data = self.scraper.populate_data(content, task, info)

    print('FINAL')
    print(scrape_data.ingest_info)
    print('EXPECTED')
    print(expected_ingest_info)

    if expected_ingest_info is None and expected_single_counts is None:
        if scrape_data:
            self.assertFalse(scrape_data.persist)
        else:
            self.assertIsNone(scrape_data)
        # Nothing further to validate when no output was expected.
        return scrape_data

    self.assertCountEqual(scrape_data.single_counts,
                          expected_single_counts or [])

    metadata = IngestMetadata(self.scraper.region.region_code,
                              self.scraper.region.jurisdiction_id,
                              _FAKE_SCRAPER_START_TIME,
                              self.scraper.get_enum_overrides())

    self.validate_ingest(scrape_data.ingest_info, expected_ingest_info,
                         metadata)

    assert scrape_data.persist == expected_persist

    return scrape_data
def test_persist_to_db_same_task_one_fail_one_pass(self, mock_write,
                                                   _mock_region,
                                                   mock_session_return):
    mock_session = mock_session_return.return_value = create_mock_session()
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    mock_write.return_value = True

    ii = ingest_info.IngestInfo()
    ii.create_person(
        person_id=TEST_ID,
        full_name=TEST_NAME).create_booking(booking_id=TEST_ID)

    t = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    # Because the tasks are the same, we expect that to be counted as a
    # pass.
    t2 = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    batch_persistence.write(ii, scrape_key, t)
    batch_persistence.write_error(TEST_ERROR, TEST_TRACE, t2, scrape_key)

    expected_proto = serialization.convert_ingest_info_to_proto(ii)

    self.assertTrue(
        batch_persistence.persist_to_database(scrape_key.region_code,
                                              mock_session.start))

    result_proto = mock_write.call_args[0][0]
    self.assertEqual(result_proto, expected_proto)

    ingest_infos = datastore_ingest_info.batch_get_ingest_infos_for_region(
        REGIONS[0], mock_session.start)
    self.assertEqual(len(ingest_infos), 0)
def write_error(error: str, trace_id: Optional[str], task: Task,
                scrape_key: ScrapeKey):
    session = sessions.get_current_session(scrape_key)
    if not session:
        raise DatastoreError(scrape_key.region_code, "write_error")
    datastore_ingest_info.write_error(
        region=scrape_key.region_code,
        error=error,
        trace_id=trace_id,
        task_hash=hash(json.dumps(task.to_serializable(), sort_keys=True)),
        session_start_time=session.start)
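# Both write() and write_error() key their Datastore entries on a hash of the
# serialized task. That shared hash is what lets persist_to_database decide
# whether a task that errored was later made up for by a successful write of
# the same task, as the one-fail-one-pass and failed-no-write tests in this
# section exercise. A minimal sketch of that rule, assuming the
# BatchIngestInfoData fields shown in the tests; the helper name
# _all_errors_recovered is illustrative, not the library's API.
def _all_errors_recovered(batch_data) -> bool:
    succeeded = {b.task_hash for b in batch_data if b.error is None}
    failed = {b.task_hash for b in batch_data if b.error is not None}
    return failed <= succeeded  # every failed task hash also succeeded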
def test_persist_to_db_failed_no_write(self, mock_write, _mock_region,
                                       mock_session_return):
    mock_session = mock_session_return.return_value = create_mock_session()
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

    ii = IngestInfo()
    ii.create_person(person_id=TEST_ID, full_name=TEST_NAME) \
        .create_booking(booking_id=TEST_ID)

    t = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    # Because the tasks are different, we should fail.
    t2 = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
        params=TEST_PARAMS,
    )

    batch_persistence.write(ii, scrape_key, t)
    batch_persistence.write_error(TEST_ERROR, TEST_TRACE, t2, scrape_key)

    self.assertFalse(
        batch_persistence.persist_to_database(scrape_key.region_code,
                                              mock_session.start))

    self.assertEqual(mock_write.call_count, 0)

    # We should still have both items on Datastore because they weren't
    # persisted.
    batch_ingest_info_data_list = \
        batch_persistence._get_batch_ingest_info_list(scrape_key.region_code,
                                                      mock_session.start)
    self.assertEqual(len(batch_ingest_info_data_list), 2)
def test_write_error_to_datastore(self, mock_session_return):
    mock_session = mock_session_return.return_value = create_mock_session()
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

    error = TEST_ERROR

    t = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    task_hash = hash(json.dumps(t.to_serializable(), sort_keys=True))
    expected_batch = BatchIngestInfoData(error=error, trace_id=TEST_TRACE,
                                         task_hash=task_hash)

    batch_persistence.write_error(error, TEST_TRACE, t, scrape_key)

    batch_ingest_info_list = batch_persistence._get_batch_ingest_info_list(
        scrape_key.region_code, mock_session.start)

    self.assertEqual(len(batch_ingest_info_list), 1)
    self.assertEqual(expected_batch, batch_ingest_info_list[0])
def test_create_scrape_task(self, mock_client, mock_uuid):
    # Arrange
    uuid = 'random-uuid'
    mock_uuid.uuid4.return_value = uuid

    region_code = 'us_ca_san_francisco'
    project_id = 'recidiviz-456'

    queue_name = 'test-queue-name'
    queue_path = f'queue_path/{project_id}/{QUEUES_REGION}/{queue_name}'
    task_id = 'us_ca_san_francisco-random-uuid'
    task_path = f'{queue_path}/{task_id}'
    url = '/my_scrape/task'

    body = {
        'region': region_code,
        'params': QueueRequest(
            next_task=Task(task_type=TaskType.INITIAL,
                           endpoint='www.google.com'),
            scrape_type=ScrapeType.BACKGROUND,
            scraper_start_time=datetime.datetime.now()).to_serializable()
    }
    task = tasks_v2.types.task_pb2.Task(
        name=task_path,
        app_engine_http_request={
            'http_method': 'POST',
            'relative_uri': url,
            'body': json.dumps(body).encode()
        })

    mock_client.return_value.task_path.return_value = task_path
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    ScraperCloudTaskManager(project_id=project_id). \
        create_scrape_task(region_code=region_code,
                           queue_name=queue_name,
                           url=url,
                           body=body)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        project_id, QUEUES_REGION, queue_name)
    mock_client.return_value.task_path.assert_called_with(
        project_id, QUEUES_REGION, queue_name, task_id)
    mock_client.return_value.create_task.assert_called_with(
        queue_path, task)
def test_create_scrape_task(self, mock_client: Mock, mock_uuid: Mock) -> None:
    # Arrange
    uuid = "random-uuid"
    mock_uuid.uuid4.return_value = uuid

    region_code = "us_ca_san_francisco"
    project_id = "recidiviz-456"

    queue_name = "test-queue-name"
    queue_path = f"queue_path/{project_id}/{QUEUES_REGION}/{queue_name}"
    task_id = "us_ca_san_francisco-random-uuid"
    task_path = f"{queue_path}/{task_id}"
    url = "/my_scrape/task"

    body = {
        "region": region_code,
        "params": QueueRequest(
            next_task=Task(task_type=TaskType.INITIAL,
                           endpoint="www.google.com"),
            scrape_type=ScrapeType.BACKGROUND,
            scraper_start_time=datetime.datetime.now(),
        ).to_serializable(),
    }
    task = tasks_v2.types.task_pb2.Task(
        name=task_path,
        app_engine_http_request={
            "http_method": "POST",
            "relative_uri": url,
            "body": json.dumps(body).encode(),
        },
    )

    mock_client.return_value.task_path.return_value = task_path
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    ScraperCloudTaskManager(project_id=project_id).create_scrape_task(
        region_code=region_code, queue_name=queue_name, url=url, body=body)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        project_id, QUEUES_REGION, queue_name)
    mock_client.return_value.task_path.assert_called_with(
        project_id, QUEUES_REGION, queue_name, task_id)
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=task)
def test_batch_delete_ingest_infos_for_region(self):
    task_hash = hash(
        json.dumps(
            Task(
                task_type=constants.TaskType.SCRAPE_DATA,
                endpoint=TEST_ENDPOINT,
                response_type=constants.ResponseType.TEXT,
            ).to_serializable(),
            sort_keys=True,
        ))
    start_time = datetime.now()
    datastore_ingest_info.write_ingest_info(
        region="us_state_county",
        session_start_time=start_time,
        ingest_info=sample_ingest_info("1"),
        task_hash=task_hash,
    )
    datastore_ingest_info.write_ingest_info(
        region="us_state_county",
        session_start_time=start_time,
        ingest_info=sample_ingest_info("2"),
        task_hash=task_hash,
    )
    datastore_ingest_info.write_ingest_info(
        region="us_state_county",
        session_start_time=start_time,
        ingest_info=sample_ingest_info("3"),
        task_hash=task_hash,
    )
    unrelated = datastore_ingest_info.write_ingest_info(
        region="unrelated_us_state_county",
        session_start_time=start_time,
        ingest_info=sample_ingest_info("n/a"),
        task_hash=task_hash,
    )

    datastore_ingest_info.batch_delete_ingest_infos_for_region(
        "us_state_county")

    assert (datastore_ingest_info.batch_get_ingest_infos_for_region(
        "us_state_county", start_time) == [])

    actual = datastore_ingest_info.batch_get_ingest_infos_for_region(
        "unrelated_us_state_county", start_time)
    assert actual == [unrelated]

    datastore_ingest_info.batch_delete_ingest_infos_for_region(
        "unrelated_us_state_county")
def test_scrape_data_and_more_yes_persist(
    self,
    mock_get_more: Mock,
    mock_fetch: Mock,
    mock_populate: Mock,
    mock_write: Mock,
) -> None:
    mock_get_more.return_value = [TEST_TASK]
    mock_fetch.return_value = (TEST_HTML, {})
    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=True,
    )
    start_time = datetime.datetime.now()
    t = Task.evolve(TEST_TASK,
                    task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
    req = QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=t,
        scraper_start_time=start_time,
    )
    scraper = FakeScraper("test")
    scraper.BATCH_WRITES = False
    scraper._generic_scrape(req)

    # Should not send the ii along with the next task since we persisted.
    expected_tasks = [
        QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )
    ]
    expected_metadata = IngestMetadata(
        region=scraper.region.region_code,
        jurisdiction_id=scraper.region.jurisdiction_id,
        ingest_time=start_time,
        enum_overrides=scraper.get_enum_overrides(),
        system_level=SystemLevel.COUNTY,
        database_key=SQLAlchemyDatabaseKey.for_schema(SchemaType.JAILS),
    )
    expected_proto = convert_ingest_info_to_proto(self.ii)

    self.assertEqual(mock_get_more.call_count, 1)
    self.assertEqual(mock_populate.call_count, 1)
    self.assertEqual(mock_write.call_count, 1)
    mock_write.assert_called_once_with(expected_proto, expected_metadata)
    self.assertCountEqual(expected_tasks, scraper.tasks)
def test_content_no_fetch(self, mock_get_more: Mock, mock_fetch: Mock) -> None:
    t = Task.evolve(TEST_TASK, content=TEST_HTML)
    mock_get_more.return_value = [t]
    start_time = datetime.datetime.now()
    req = QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=t,
        scraper_start_time=start_time,
    )
    scraper = FakeScraper("test")
    scraper.BATCH_WRITES = False
    scraper._generic_scrape(req)

    expected_tasks = [req]

    self.assertEqual(mock_fetch.call_count, 0)
    self.assertCountEqual(expected_tasks, scraper.tasks)
def test_write_single_ingest_info(self):
    task_hash = hash(
        json.dumps(Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT).to_serializable(),
                   sort_keys=True))
    start_time = datetime.now()
    ingest_info = datastore_ingest_info.write_ingest_info(
        region='us_state_county',
        session_start_time=start_time,
        ingest_info=sample_ingest_info('1'),
        task_hash=task_hash)

    results = datastore_ingest_info.batch_get_ingest_infos_for_region(
        'us_state_county', start_time)
    assert results == [ingest_info]

    datastore_ingest_info.batch_delete_ingest_infos_for_region(
        'us_state_county')
def get_initial_task(self) -> Task:
    """Returns the initial parameters to use for the first call."""
    return Task(
        task_type=constants.TaskType.INITIAL_AND_MORE,
        endpoint=self.get_region().base_url,
    )
def _generic_scrape(self, request: QueueRequest):
    """
    General handler for all scrape tasks.  This function is a generic entry
    point into all types of scrapes.  It decides what to call based on
    params.

    Args:
        request: the QueueRequest passed along from the last scrape
            session, containing the next task to run.
    """
    try:
        task = request.next_task

        # Here we handle a special case where we weren't really sure
        # we were going to get data when we submitted a task, but then
        # we ended up with data, so no more requests are required,
        # just the content we already have.
        # TODO(#680): remove this
        if task.content is not None:
            content = self._parse_html_content(task.content)
            cookies = None
        else:
            post_data = task.post_data

            # Let the child transform the post_data if it wants before
            # sending the requests.  This hook is in here in case the
            # child did something like compress the post_data before
            # it put it on the queue.
            self.transform_post_data(post_data)

            # We always fetch some content before doing anything.
            # Note that we use get here for the post_data to return a
            # default value of None if this scraper doesn't set it.
            try:
                content, cookies = self._fetch_content(
                    task.endpoint,
                    task.response_type,
                    headers=task.headers,
                    cookies=task.cookies,
                    params=task.params,
                    post_data=post_data,
                    json_data=task.json)
            except Exception as e:
                raise ScraperFetchError(str(e)) from e

        scraped_data = None
        if self.should_scrape_data(task.task_type):
            # If we want to scrape data, we should either create an
            # ingest_info object or get the one that already exists.
            logging.info("Scraping data for [%s] and endpoint: [%s]",
                         self.region.region_code, task.endpoint)
            try:
                scraped_data = self.populate_data(
                    content, task, request.ingest_info or IngestInfo())
            except Exception as e:
                raise ScraperPopulateDataError(str(e)) from e

        if self.should_get_more_tasks(task.task_type):
            logging.info("Getting more tasks for [%s] and endpoint: [%s]",
                         self.region.region_code, task.endpoint)

            # Only send along ingest info if it will not be persisted now.
            ingest_info_to_send = None
            if scraped_data is not None and not scraped_data.persist:
                ingest_info_to_send = scraped_data.ingest_info

            try:
                # pylint: disable=assignment-from-no-return
                next_tasks = self.get_more_tasks(content, task)
            except Exception as e:
                raise ScraperGetMoreTasksError(str(e)) from e
            for next_task in next_tasks:
                # Include cookies received from response, if any
                if cookies:
                    cookies.update(next_task.cookies)
                    next_task = Task.evolve(next_task, cookies=cookies)
                self.add_task(
                    '_generic_scrape',
                    QueueRequest(
                        scrape_type=request.scrape_type,
                        scraper_start_time=request.scraper_start_time,
                        next_task=next_task,
                        ingest_info=ingest_info_to_send,
                    ))

        if scraped_data is not None and scraped_data.persist:
            if scraped_data.ingest_info:
                logging.info("Logging at most 4 people (were %d):",
                             len(scraped_data.ingest_info.people))
                loop_count = min(len(scraped_data.ingest_info.people),
                                 constants.MAX_PEOPLE_TO_LOG)
                for i in range(loop_count):
                    logging.info("[%s]",
                                 str(scraped_data.ingest_info.people[i]))
                logging.info("Last seen time of person being set as: [%s]",
                             request.scraper_start_time)
                metadata = IngestMetadata(self.region.region_code,
                                          self.region.jurisdiction_id,
                                          request.scraper_start_time,
                                          self.get_enum_overrides())
                if self.BATCH_WRITES:
                    logging.info(
                        "Queuing ingest_info ([%d] people) to "
                        "batch_persistence for [%s]",
                        len(scraped_data.ingest_info.people),
                        self.region.region_code)
                    scrape_key = ScrapeKey(self.region.region_code,
                                           request.scrape_type)
                    batch_persistence.write(
                        ingest_info=scraped_data.ingest_info,
                        scrape_key=scrape_key,
                        task=task,
                    )
                else:
                    logging.info(
                        "Writing ingest_info ([%d] people) to the database"
                        " for [%s]",
                        len(scraped_data.ingest_info.people),
                        self.region.region_code)
                    persistence.write(
                        ingest_utils.convert_ingest_info_to_proto(
                            scraped_data.ingest_info), metadata)
            for sc in scraped_data.single_counts:
                if not sc.date:
                    scrape_key = ScrapeKey(self.region.region_code,
                                           constants.ScrapeType.BACKGROUND)
                    session = sessions.get_current_session(scrape_key)
                    if session:
                        sc = attr.evolve(sc, date=session.start.date())
                single_count.store_single_count(sc,
                                                self.region.jurisdiction_id)
    except Exception as e:
        if self.BATCH_WRITES:
            scrape_key = ScrapeKey(self.region.region_code,
                                   request.scrape_type)
            batch_persistence.write_error(
                error=str(e),
                trace_id=get_trace_id_from_flask(),
                task=task,
                scrape_key=scrape_key,
            )
        raise e
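# The control flow above hinges on two predicates over the task type. A
# compact sketch of the mapping the TaskType constants used in this section
# imply; the exact membership is an assumption for illustration, not the
# library's definition (TaskType may well be implemented as a flag enum).
def _should_scrape_data(task_type) -> bool:
    return task_type in (constants.TaskType.SCRAPE_DATA,
                         constants.TaskType.SCRAPE_DATA_AND_MORE)


def _should_get_more_tasks(task_type) -> bool:
    return task_type in (constants.TaskType.GET_MORE_TASKS,
                         constants.TaskType.INITIAL_AND_MORE,
                         constants.TaskType.SCRAPE_DATA_AND_MORE)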
def validate_and_return_populate_data(
    self,
    content: Optional[html.HtmlElement],
    expected_ingest_info: Optional[IngestInfo] = None,
    expected_single_counts: Optional[List[SingleCount]] = None,
    expected_persist: bool = True,
    task: Optional[Task] = None,
    info: Optional[IngestInfo] = None,
) -> ScrapedData:
    """This function runs populate_data and runs some extra validation
    on the output.

    Args:
        content: the content of the page to pass into get_more_tasks
        expected_ingest_info: the ingest info expected to be returned from
            `populate_data`. If `expected_ingest_info` is `None`, then
            expects the return value of `populate_data` to be `None`.
        expected_single_counts: the list of SingleCounts expected to be
            returned from `populate_data`.
        expected_persist: the expected value of persist to be returned
            from `populate_data`.
        task: the task that is being processed, optional.
        info: an ingest_info to use if provided.

    Returns:
        The result from populate_data in case the user needs to do any
        extra validations on the output.
    """
    info_to_ingest: IngestInfo = info or ingest_info.IngestInfo()
    task_to_process: Task = task or Task(
        task_type=constants.TaskType.SCRAPE_DATA, endpoint="")
    if self.scraper:
        scrape_data = self.scraper.populate_data(content, task_to_process,
                                                 info_to_ingest)

        print("FINAL")
        print(scrape_data.ingest_info)
        print("EXPECTED")
        print(expected_ingest_info)

        if expected_ingest_info is None and expected_single_counts is None:
            if scrape_data:
                assert scrape_data.persist is False
            else:
                assert scrape_data is None

        if expected_single_counts and scrape_data.single_counts:
            assert len(scrape_data.single_counts) == \
                len(expected_single_counts)
            diff = set(expected_single_counts) ^ \
                set(scrape_data.single_counts)
            assert not diff

        metadata: IngestMetadata = IngestMetadata(
            region=self.scraper.region.region_code,
            jurisdiction_id=self.scraper.region.jurisdiction_id,
            ingest_time=_FAKE_SCRAPER_START_TIME,
            enum_overrides=self.scraper.get_enum_overrides(),
            system_level=SystemLevel.COUNTY,
            database_key=SQLAlchemyDatabaseKey.for_schema(SchemaType.JAILS),
        )

        if scrape_data.ingest_info and expected_ingest_info:
            self.validate_ingest(scrape_data.ingest_info,
                                 expected_ingest_info, metadata)

        assert scrape_data.persist == expected_persist

        if scrape_data:
            return scrape_data
    raise ValueError("Scrape data was not provided ingest info")
import datetime
from typing import Callable

import pytest
import pytz
from flask import Flask
from mock import Mock, create_autospec, patch

from recidiviz.ingest.scrape import constants, scrape_phase, sessions, worker
from recidiviz.ingest.scrape.task_params import QueueRequest, Task
from recidiviz.utils.regions import Region

PATH = "/work/us_ca"
FAKE_QUEUE_PARAMS = QueueRequest(
    scrape_type=constants.ScrapeType.BACKGROUND,
    scraper_start_time=datetime.datetime.now(tz=pytz.UTC),
    next_task=Task(
        task_type=constants.TaskType.INITIAL,
        endpoint="some.endpoint",
    ),
)

app = Flask(__name__)
app.register_blueprint(worker.worker)
app.config["TESTING"] = True


@patch("recidiviz.utils.metadata.project_id", Mock(return_value="test-project"))
@patch("recidiviz.utils.metadata.project_number", Mock(return_value="123456789"))
class TestWorker:
    """Tests for requests to the Worker API."""

    # noinspection PyAttributeOutsideInit
    def setup_method(self, _test_method: Callable) -> None:
def mock_region(region_code, queue_name=None, is_stoppable=False):
    return Region(
        region_code=region_code,
        shared_queue=queue_name or None,
        agency_name='the agency',
        agency_type='benevolent',
        base_url='localhost:3000',
        names_file='names.txt',
        timezone='America/New_York',
        environment='production',
        jurisdiction_id='jurisdiction_id',
        is_stoppable=is_stoppable or False,
    )


FAKE_TASK = Task(task_type=constants.TaskType.INITIAL, endpoint='fake')


class FakeScraper(Scraper):
    def __init__(self, region_name, initial_task_method):
        super().__init__(region_name)
        self.initial_task_method = initial_task_method

    def get_initial_task_method(self):
        return self.initial_task_method

    def get_initial_task(self):
        return FAKE_TASK