def test_write_to_datastore(self, mock_session_return):
        mock_session = mock_session_return.return_value = create_mock_session()

        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        ii = ingest_info.IngestInfo()
        ii.create_person(full_name=TEST_NAME).create_booking(
            booking_id=TEST_ID)

        t = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )
        task_hash = hash(json.dumps(t.to_serializable(), sort_keys=True))

        expected_batch = BatchIngestInfoData(ingest_info=ii,
                                             task_hash=task_hash)

        batch_persistence.write(ii, scrape_key, t)

        batch_ingest_info_list = batch_persistence._get_batch_ingest_info_list(
            scrape_key.region_code, mock_session.start)

        self.assertEqual(len(batch_ingest_info_list), 1)
        self.assertEqual(expected_batch, batch_ingest_info_list[0])

    def test_persist_to_db_different_regions(self, mock_write, _mock_region,
                                             mock_session_return):
        scrape_key1 = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
        scrape_key2 = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)

        ii = ingest_info.IngestInfo()
        ii.create_person(
            person_id=TEST_ID,
            full_name=TEST_NAME).create_booking(booking_id=TEST_ID)

        ii2 = ingest_info.IngestInfo()
        ii2.create_person(
            person_id=TEST_ID,
            full_name=TEST_NAME2).create_booking(booking_id=TEST_ID)

        t = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        t2 = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        mock_session_1 = mock_session_return.return_value = create_mock_session()

        batch_persistence.write(ii, scrape_key1, t)
        expected_proto = serialization.convert_ingest_info_to_proto(ii)
        batch_persistence.persist_to_database(scrape_key1.region_code,
                                              mock_session_1.start)

        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)

        # We expect the region that we persisted to have no more ingest infos.
        ingest_infos_1 = datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[0], mock_session_1.start)
        self.assertEqual(len(ingest_infos_1), 0)

        mock_session_2 = mock_session_return.return_value = create_mock_session()

        batch_persistence.write(ii2, scrape_key2, t2)
        ingest_infos_2 = datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[1], mock_session_2.start)
        self.assertEqual(len(ingest_infos_2), 1)

        expected_proto = serialization.convert_ingest_info_to_proto(ii2)
        batch_persistence.persist_to_database(scrape_key2.region_code,
                                              mock_session_2.start)

        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)

        self.assertEqual(mock_write.call_count, 2)
Example #3
    def test_scrape_data_and_more_no_persist_second_time_persist(
            self, mock_get_more, mock_fetch, mock_populate, mock_write):
        populate_task = Task.evolve(TEST_TASK,
                                    task_type=constants.TaskType.SCRAPE_DATA)
        mock_get_more.return_value = [populate_task]
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=False,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK,
                        task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        # Should send the ii since we chose not to persist.
        expected_tasks = [
            QueueRequest(
                scrape_type=constants.ScrapeType.BACKGROUND,
                next_task=populate_task,
                scraper_start_time=start_time,
                ingest_info=self.ii,
            )
        ]

        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 0)
        mock_get_more.assert_called_once_with(TEST_HTML, t)
        self.assertCountEqual(expected_tasks, scraper.tasks)

        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        scraper._generic_scrape(scraper.tasks[0])
        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 2)
        self.assertEqual(mock_write.call_count, 1)

        expected_metadata = IngestMetadata(
            scraper.region.region_code,
            scraper.region.jurisdiction_id,
            start_time,
            scraper.get_enum_overrides(),
        )
        expected_proto = convert_ingest_info_to_proto(self.ii)
        mock_write.assert_called_once_with(expected_proto, expected_metadata)
Example #4
    def test_scrape_data_no_more_tasks(self, mock_get_more, mock_fetch,
                                       mock_populate, mock_write):
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_metadata = IngestMetadata(
            scraper.region.region_code,
            scraper.region.jurisdiction_id,
            start_time,
            scraper.get_enum_overrides(),
        )
        expected_proto = convert_ingest_info_to_proto(self.ii)

        self.assertEqual(mock_get_more.call_count, 0)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 1)
        mock_write.assert_called_once_with(expected_proto, expected_metadata)
        self.assertEqual(len(scraper.tasks), 0)

    def test_persist_duplicates_to_db(self, mock_write, _mock_region,
                                      mock_session_return):
        """Tests that duplicate ingest_info.Person objects are merged before
        write."""
        mock_session = mock_session_return.return_value = create_mock_session()
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        # Arrange
        ii = IngestInfo()
        ii.create_person(person_id=TEST_ID,
                         full_name=TEST_NAME) \
            .create_booking(booking_id=TEST_ID)

        ii_2 = IngestInfo()
        ii_2.create_person(person_id=TEST_ID2, full_name=TEST_NAME2)

        ii_1_dup = copy.deepcopy(ii)

        t1, t2, t3 = (Task(task_type=constants.TaskType.SCRAPE_DATA,
                           endpoint=TEST_ENDPOINT + str(i),
                           response_type=constants.ResponseType.TEXT)
                      for i in range(3))

        batch_persistence.write(ii, scrape_key, t1)
        batch_persistence.write(ii_2, scrape_key, t2)
        batch_persistence.write(ii_1_dup, scrape_key, t3)

        batch_persistence.persist_to_database(scrape_key.region_code,
                                              mock_session.start)

        expected_ii = IngestInfo(people=ii.people + ii_2.people)
        expected_proto = ingest_utils.convert_ingest_info_to_proto(expected_ii)
        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)
Example #6
    def test_get_more_and_updates_cookies(
        self, mock_get_more: Mock, mock_fetch: Mock
    ) -> None:
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, {1: 1})
        start_time = datetime.datetime.now()
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )

        t = Task.evolve(TEST_TASK, cookies={1: 1})

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_tasks = [
            QueueRequest(
                scrape_type=constants.ScrapeType.BACKGROUND,
                next_task=t,
                scraper_start_time=start_time,
            )
        ]

        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #7
    def test_scrape_data_no_more_tasks_batch(
        self,
        mock_get_more: Mock,
        mock_fetch: Mock,
        mock_populate: Mock,
        mock_write: Mock,
        mock_batch_write: Mock,
    ) -> None:
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper._generic_scrape(req)

        scrape_key = ScrapeKey("test", constants.ScrapeType.BACKGROUND)
        self.assertEqual(mock_get_more.call_count, 0)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 0)
        mock_batch_write.assert_called_once_with(
            ingest_info=self.ii,
            task=t,
            scrape_key=scrape_key,
        )
        self.assertEqual(len(scraper.tasks), 0)
Example #8
    def test_scrape_data_and_more_no_persist(
            self, mock_get_more, mock_fetch, mock_populate, mock_write):
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=False,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(
            TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time
        )

        scraper = FakeScraper('test')
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        # Should send the ii since we chose not to persist.
        expected_tasks = [QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
            ingest_info=self.ii
        )]

        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 0)
        mock_get_more.assert_called_once_with(TEST_HTML, t)
        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #9
    def test_fetch_sends_all_args(self, mock_get_more, mock_fetch):
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, None)
        start_time = datetime.datetime.now()
        t = Task.evolve(
            TEST_TASK, headers='TEST_HEADERS', cookies='TEST_COOKIES',
            params='TEST_PARAMS', post_data='TEST_POST', json='TEST_JSON'
        )
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time
        )

        scraper = FakeScraper('test')
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_tasks = [QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )]

        mock_fetch.assert_called_once_with(
            t.endpoint, t.response_type, headers=t.headers, cookies=t.cookies,
            params=t.params, post_data=t.post_data, json_data=t.json
        )
        self.assertCountEqual(expected_tasks, scraper.tasks)

    def test_write_errors(self):
        task_hash = hash(
            json.dumps(
                Task(
                    task_type=constants.TaskType.SCRAPE_DATA,
                    endpoint=TEST_ENDPOINT,
                    response_type=constants.ResponseType.TEXT,
                ).to_serializable(),
                sort_keys=True,
            ))

        start_time = datetime.now()
        batch_ingest_info_data = datastore_ingest_info.write_error(
            region="us_state_county",
            session_start_time=start_time,
            error="error string",
            trace_id="trace",
            task_hash=task_hash,
        )

        results = datastore_ingest_info.batch_get_ingest_infos_for_region(
            "us_state_county", start_time)

        assert results == [batch_ingest_info_data]
        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            "us_state_county")

    def test_batch_delete_over_500_ingest_infos_for_region(self):
        task_hash = hash(
            json.dumps(
                Task(
                    task_type=constants.TaskType.SCRAPE_DATA,
                    endpoint=TEST_ENDPOINT,
                    response_type=constants.ResponseType.TEXT,
                ).to_serializable(),
                sort_keys=True,
            ))
        start_time = datetime.now()

        # The Datastore limit for entity writes in one call is 500. Confirm
        # that batch delete is properly handled when more than 500 entities
        # exist for the same region.
        for i in range(600):
            datastore_ingest_info.write_ingest_info(
                region="us_state_county",
                session_start_time=start_time,
                ingest_info=sample_ingest_info(str(i)),
                task_hash=task_hash,
            )

        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            "us_state_county")

        assert (datastore_ingest_info.batch_get_ingest_infos_for_region(
            "us_state_county", start_time) == [])
Example #12
    def test_persist_to_db(self, mock_write, _mock_region,
                           mock_session_return):
        mock_session = mock_session_return.return_value = create_mock_session()
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        ii = ingest_info.IngestInfo()
        ii.create_person(
            person_id=TEST_ID,
            full_name=TEST_NAME).create_booking(booking_id=TEST_ID)

        t = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        batch_persistence.write(ii, scrape_key, t)

        expected_proto = serialization.convert_ingest_info_to_proto(ii)

        batch_persistence.persist_to_database(scrape_key.region_code,
                                              mock_session.start)

        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)

        # After we persist, there should no longer be ingest infos on Datastore
        ingest_infos = datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[0], mock_session.start)
        self.assertEqual(len(ingest_infos), 0)
Example #13
def write(ingest_info: IngestInfo, scrape_key: ScrapeKey, task: Task):
    session = sessions.get_current_session(scrape_key)
    if not session:
        raise DatastoreError(scrape_key.region_code, "write")
    datastore_ingest_info.write_ingest_info(
        region=scrape_key.region_code,
        session_start_time=session.start,
        ingest_info=ingest_info,
        task_hash=hash(json.dumps(task.to_serializable(), sort_keys=True)))
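
The task_hash stored above is computed from the task's serialized form with sort_keys=True, so it is deterministic: two independently constructed but structurally identical tasks hash to the same value, which is what lets a later error for the "same" task be matched against an earlier successful write. A small sketch using the names from the tests above:

t_a = Task(task_type=constants.TaskType.SCRAPE_DATA,
           endpoint=TEST_ENDPOINT,
           response_type=constants.ResponseType.TEXT)
t_b = Task(task_type=constants.TaskType.SCRAPE_DATA,
           endpoint=TEST_ENDPOINT,
           response_type=constants.ResponseType.TEXT)
assert (hash(json.dumps(t_a.to_serializable(), sort_keys=True)) ==
        hash(json.dumps(t_b.to_serializable(), sort_keys=True)))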
Example #14
    def validate_and_return_populate_data(self,
                                          content,
                                          expected_ingest_info=None,
                                          expected_single_counts=None,
                                          expected_persist=True,
                                          task=None,
                                          info=None):
        """This function runs populate_data and runs some extra validation
        on the output.

        Args:
            content: the content of the page to pass into get_more_tasks
            expected_ingest_info: the ingest info expected to be returned from
                `populate_data`. If `expected_ingest_info` is `None`, then
                expects the return value of `populate_data` to be `None`.
            expected_single_counts: the list of SingleCounts expected to be
                returned from `populate_data`.
            expected_persist: the expected value of persist to be returned from
                `populate_data`.
            task: the task that is being processed, optional.
            info: an ingest_info to use if provided.

        Returns:
            The result from populate_data in case the user needs to do any
            extra validations on the output.
        """
        info = info or ingest_info.IngestInfo()
        task = task or Task(task_type=constants.TaskType.SCRAPE_DATA,
                            endpoint='')

        scrape_data = self.scraper.populate_data(content, task, info)

        print('FINAL')
        print(scrape_data.ingest_info)
        print('EXPECTED')
        print(expected_ingest_info)

        if expected_ingest_info is None and expected_single_counts is None:
            if scrape_data:
                self.assertFalse(scrape_data.persist)
            else:
                self.assertIsNone(scrape_data)

        self.assertCountEqual(scrape_data.single_counts,
                              expected_single_counts or [])

        metadata = IngestMetadata(self.scraper.region.region_code,
                                  self.scraper.region.jurisdiction_id,
                                  _FAKE_SCRAPER_START_TIME,
                                  self.scraper.get_enum_overrides())

        self.validate_ingest(scrape_data.ingest_info, expected_ingest_info,
                             metadata)

        assert scrape_data.persist == expected_persist

        return scrape_data
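
A hedged usage sketch of this helper (the page content, person name, and BaseScraperTest class are hypothetical; it only assumes a test class that provides self.scraper and mixes in this method):

from lxml import html

class UsXxScraperTest(BaseScraperTest):  # hypothetical test base class
    def test_populate_data(self):
        content = html.fromstring(
            '<html><body><h1>SMITH, JANE</h1></body></html>')
        expected = ingest_info.IngestInfo()
        expected.create_person(full_name='SMITH, JANE')
        self.validate_and_return_populate_data(
            content, expected_ingest_info=expected)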
Example #15
    def test_persist_to_db_same_task_one_fail_one_pass(self, mock_write,
                                                       _mock_region,
                                                       mock_session_return):
        mock_session = mock_session_return.return_value = create_mock_session()
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
        mock_write.return_value = True

        ii = ingest_info.IngestInfo()
        ii.create_person(
            person_id=TEST_ID,
            full_name=TEST_NAME).create_booking(booking_id=TEST_ID)

        t = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        # Because the tasks are the same, we expect that to be counted as a
        # pass.
        t2 = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        batch_persistence.write(ii, scrape_key, t)
        batch_persistence.write_error(TEST_ERROR, TEST_TRACE, t2, scrape_key)

        expected_proto = serialization.convert_ingest_info_to_proto(ii)

        self.assertTrue(
            batch_persistence.persist_to_database(scrape_key.region_code,
                                                  mock_session.start))

        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)

        ingest_infos = datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[0], mock_session.start)
        self.assertEqual(len(ingest_infos), 0)
Example #16
def write_error(error: str, trace_id: Optional[str], task: Task,
                scrape_key: ScrapeKey):
    session = sessions.get_current_session(scrape_key)
    if not session:
        raise DatastoreError(scrape_key.region_code, "write_error")

    datastore_ingest_info.write_error(region=scrape_key.region_code,
                                      error=error, trace_id=trace_id,
                                      task_hash=hash(json.dumps(
                                          task.to_serializable(),
                                          sort_keys=True)),
                                      session_start_time=session.start)
Example #17
    def test_persist_to_db_failed_no_write(self, mock_write, _mock_region,
                                           mock_session_return):
        mock_session = mock_session_return.return_value = create_mock_session()
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        ii = IngestInfo()
        ii.create_person(person_id=TEST_ID,
                         full_name=TEST_NAME) \
            .create_booking(booking_id=TEST_ID)

        t = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        # Because the tasks are different, we should fail.
        t2 = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
            params=TEST_PARAMS,
        )

        batch_persistence.write(ii, scrape_key, t)
        batch_persistence.write_error(TEST_ERROR, TEST_TRACE, t2, scrape_key)

        self.assertFalse(
            batch_persistence.persist_to_database(scrape_key.region_code,
                                                  mock_session.start))

        self.assertEqual(mock_write.call_count, 0)

        # We should still have both items still on Datastore because they
        # weren't persisted.
        batch_ingest_info_data_list = batch_persistence \
            ._get_batch_ingest_info_list(scrape_key.region_code,
                                         mock_session.start)
        self.assertEqual(len(batch_ingest_info_data_list), 2)

    def test_write_error_to_datastore(self, mock_session_return):
        mock_session = mock_session_return.return_value = create_mock_session()
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        error = TEST_ERROR

        t = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )
        task_hash = hash(json.dumps(t.to_serializable(), sort_keys=True))

        expected_batch = BatchIngestInfoData(error=error, trace_id=TEST_TRACE,
                                             task_hash=task_hash)

        batch_persistence.write_error(error, TEST_TRACE, t, scrape_key)

        batch_ingest_info_list = batch_persistence._get_batch_ingest_info_list(
            scrape_key.region_code, mock_session.start)

        self.assertEqual(len(batch_ingest_info_list), 1)
        self.assertEqual(expected_batch, batch_ingest_info_list[0])

    def test_create_scrape_task(self, mock_client, mock_uuid):
        # Arrange
        uuid = 'random-uuid'
        mock_uuid.uuid4.return_value = uuid

        region_code = 'us_ca_san_francisco'
        project_id = 'recidiviz-456'

        queue_name = 'test-queue-name'
        queue_path = f'queue_path/{project_id}/{QUEUES_REGION}/{queue_name}'
        task_id = 'us_ca_san_francisco-random-uuid'
        task_path = f'{queue_path}/{task_id}'
        url = '/my_scrape/task'

        body = {
            'region':
            region_code,
            'params':
            QueueRequest(
                next_task=Task(task_type=TaskType.INITIAL,
                               endpoint='www.google.com'),
                scrape_type=ScrapeType.BACKGROUND,
                scraper_start_time=datetime.datetime.now()).to_serializable()
        }
        task = tasks_v2.types.task_pb2.Task(name=task_path,
                                            app_engine_http_request={
                                                'http_method': 'POST',
                                                'relative_uri': url,
                                                'body':
                                                json.dumps(body).encode()
                                            })

        mock_client.return_value.task_path.return_value = task_path
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        ScraperCloudTaskManager(project_id=project_id). \
            create_scrape_task(region_code=region_code,
                               queue_name=queue_name,
                               url=url,
                               body=body)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name)
        mock_client.return_value.task_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name, task_id)
        mock_client.return_value.create_task.assert_called_with(
            queue_path, task)

    def test_create_scrape_task(self, mock_client: Mock,
                                mock_uuid: Mock) -> None:
        # Arrange
        uuid = "random-uuid"
        mock_uuid.uuid4.return_value = uuid

        region_code = "us_ca_san_francisco"
        project_id = "recidiviz-456"

        queue_name = "test-queue-name"
        queue_path = f"queue_path/{project_id}/{QUEUES_REGION}/{queue_name}"
        task_id = "us_ca_san_francisco-random-uuid"
        task_path = f"{queue_path}/{task_id}"
        url = "/my_scrape/task"

        body = {
            "region":
            region_code,
            "params":
            QueueRequest(
                next_task=Task(task_type=TaskType.INITIAL,
                               endpoint="www.google.com"),
                scrape_type=ScrapeType.BACKGROUND,
                scraper_start_time=datetime.datetime.now(),
            ).to_serializable(),
        }
        task = tasks_v2.types.task_pb2.Task(
            name=task_path,
            app_engine_http_request={
                "http_method": "POST",
                "relative_uri": url,
                "body": json.dumps(body).encode(),
            },
        )

        mock_client.return_value.task_path.return_value = task_path
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        ScraperCloudTaskManager(project_id=project_id).create_scrape_task(
            region_code=region_code, queue_name=queue_name, url=url, body=body)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name)
        mock_client.return_value.task_path.assert_called_with(
            project_id, QUEUES_REGION, queue_name, task_id)
        mock_client.return_value.create_task.assert_called_with(
            parent=queue_path, task=task)

    def test_batch_delete_ingest_infos_for_region(self):
        task_hash = hash(
            json.dumps(
                Task(
                    task_type=constants.TaskType.SCRAPE_DATA,
                    endpoint=TEST_ENDPOINT,
                    response_type=constants.ResponseType.TEXT,
                ).to_serializable(),
                sort_keys=True,
            ))
        start_time = datetime.now()

        datastore_ingest_info.write_ingest_info(
            region="us_state_county",
            session_start_time=start_time,
            ingest_info=sample_ingest_info("1"),
            task_hash=task_hash,
        )
        datastore_ingest_info.write_ingest_info(
            region="us_state_county",
            session_start_time=start_time,
            ingest_info=sample_ingest_info("2"),
            task_hash=task_hash,
        )
        datastore_ingest_info.write_ingest_info(
            region="us_state_county",
            session_start_time=start_time,
            ingest_info=sample_ingest_info("3"),
            task_hash=task_hash,
        )
        unrelated = datastore_ingest_info.write_ingest_info(
            region="unrelated_us_state_county",
            session_start_time=start_time,
            ingest_info=sample_ingest_info("n/a"),
            task_hash=task_hash,
        )

        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            "us_state_county")

        assert (datastore_ingest_info.batch_get_ingest_infos_for_region(
            "us_state_county", start_time) == [])

        actual = datastore_ingest_info.batch_get_ingest_infos_for_region(
            "unrelated_us_state_county", start_time)
        assert actual == [unrelated]

        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            "unrelated_us_state_county")
Example #22
    def test_scrape_data_and_more_yes_persist(
        self,
        mock_get_more: Mock,
        mock_fetch: Mock,
        mock_populate: Mock,
        mock_write: Mock,
    ) -> None:
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        # Should not send the ii along since we chose to persist it now.
        expected_tasks = [
            QueueRequest(
                scrape_type=constants.ScrapeType.BACKGROUND,
                next_task=TEST_TASK,
                scraper_start_time=start_time,
            )
        ]
        expected_metadata = IngestMetadata(
            region=scraper.region.region_code,
            jurisdiction_id=scraper.region.jurisdiction_id,
            ingest_time=start_time,
            enum_overrides=scraper.get_enum_overrides(),
            system_level=SystemLevel.COUNTY,
            database_key=SQLAlchemyDatabaseKey.for_schema(SchemaType.JAILS),
        )
        expected_proto = convert_ingest_info_to_proto(self.ii)

        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 1)
        mock_write.assert_called_once_with(expected_proto, expected_metadata)
        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #23
    def test_content_no_fetch(self, mock_get_more: Mock, mock_fetch: Mock) -> None:
        t = Task.evolve(TEST_TASK, content=TEST_HTML)
        mock_get_more.return_value = [t]
        start_time = datetime.datetime.now()
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )
        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_tasks = [req]

        self.assertEqual(mock_fetch.call_count, 0)
        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #24
    def test_write_single_ingest_info(self):
        task_hash = hash(
            json.dumps(Task(
                task_type=constants.TaskType.SCRAPE_DATA,
                endpoint=TEST_ENDPOINT,
                response_type=constants.ResponseType.TEXT).to_serializable(),
                       sort_keys=True))

        start_time = datetime.now()
        ingest_info = datastore_ingest_info.write_ingest_info(
            region='us_state_county',
            session_start_time=start_time,
            ingest_info=sample_ingest_info('1'),
            task_hash=task_hash)
        results = datastore_ingest_info.batch_get_ingest_infos_for_region(
            'us_state_county', start_time)
        assert results == [ingest_info]
        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            'us_state_county')
Example #25
    def test_batch_delete_ingest_infos_for_region(self):
        task_hash = hash(
            json.dumps(Task(
                task_type=constants.TaskType.SCRAPE_DATA,
                endpoint=TEST_ENDPOINT,
                response_type=constants.ResponseType.TEXT).to_serializable(),
                       sort_keys=True))
        start_time = datetime.now()

        datastore_ingest_info.write_ingest_info(
            region='us_state_county',
            session_start_time=start_time,
            ingest_info=sample_ingest_info('1'),
            task_hash=task_hash)
        datastore_ingest_info.write_ingest_info(
            region='us_state_county',
            session_start_time=start_time,
            ingest_info=sample_ingest_info('2'),
            task_hash=task_hash)
        datastore_ingest_info.write_ingest_info(
            region='us_state_county',
            session_start_time=start_time,
            ingest_info=sample_ingest_info('3'),
            task_hash=task_hash)
        unrelated = datastore_ingest_info \
            .write_ingest_info(region='unrelated_us_state_county',
                               session_start_time=start_time,
                               ingest_info=sample_ingest_info('n/a'),
                               task_hash=task_hash)

        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            'us_state_county')

        assert datastore_ingest_info.batch_get_ingest_infos_for_region(
            'us_state_county', start_time) == []

        actual = datastore_ingest_info.batch_get_ingest_infos_for_region(
            'unrelated_us_state_county', start_time)
        assert actual == [unrelated]

        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            'unrelated_us_state_county')
Example #26
    def get_initial_task(self) -> Task:
        """Returns the initial parameters to use for the first call."""
        return Task(
            task_type=constants.TaskType.INITIAL_AND_MORE,
            endpoint=self.get_region().base_url,
        )
Example #27
    def _generic_scrape(self, request: QueueRequest):
        """
        General handler for all scrape tasks.  This function is a generic entry
        point into all types of scrapes.  It decides what to call based on
        params.

        Args:
            params: dict of parameters passed from the last scrape session.
        """
        try:
            task = request.next_task

            # Here we handle a special case where we weren't really sure
            # we were going to get data when we submitted a task, but then
            # we ended up with data, so no more requests are required,
            # just the content we already have.
            # TODO(#680): remove this
            if task.content is not None:
                content = self._parse_html_content(task.content)
                cookies = None
            else:
                post_data = task.post_data

                # Let the child transform the post_data if it wants before
                # sending the requests.  This hook is in here in case the
                # child did something like compress the post_data before
                # it put it on the queue.
                self.transform_post_data(post_data)

                # We always fetch some content before doing anything.
                # post_data simply stays None if this scraper doesn't set it.
                try:
                    content, cookies = self._fetch_content(
                        task.endpoint,
                        task.response_type,
                        headers=task.headers,
                        cookies=task.cookies,
                        params=task.params,
                        post_data=post_data,
                        json_data=task.json)
                except Exception as e:
                    raise ScraperFetchError(str(e)) from e

            scraped_data = None
            if self.should_scrape_data(task.task_type):
                # If we want to scrape data, we should either create an
                # ingest_info object or get the one that already exists.
                logging.info("Scraping data for [%s] and endpoint: [%s]",
                             self.region.region_code, task.endpoint)
                try:
                    scraped_data = self.populate_data(
                        content, task, request.ingest_info or IngestInfo())
                except Exception as e:
                    raise ScraperPopulateDataError(str(e)) from e

            if self.should_get_more_tasks(task.task_type):
                logging.info("Getting more tasks for [%s] and endpoint: [%s]",
                             self.region.region_code, task.endpoint)

                # Only send along ingest info if it will not be persisted now.
                ingest_info_to_send = None
                if scraped_data is not None and not scraped_data.persist:
                    ingest_info_to_send = scraped_data.ingest_info

                try:
                    # pylint: disable=assignment-from-no-return
                    next_tasks = self.get_more_tasks(content, task)
                except Exception as e:
                    raise ScraperGetMoreTasksError(str(e)) from e
                for next_task in next_tasks:
                    # Include cookies received from response, if any
                    if cookies:
                        cookies.update(next_task.cookies)
                        next_task = Task.evolve(next_task, cookies=cookies)
                    self.add_task(
                        '_generic_scrape',
                        QueueRequest(
                            scrape_type=request.scrape_type,
                            scraper_start_time=request.scraper_start_time,
                            next_task=next_task,
                            ingest_info=ingest_info_to_send,
                        ))

            if scraped_data is not None and scraped_data.persist:
                if scraped_data.ingest_info:
                    logging.info("Logging at most 4 people (were %d):",
                                 len(scraped_data.ingest_info.people))
                    loop_count = min(len(scraped_data.ingest_info.people),
                                     constants.MAX_PEOPLE_TO_LOG)
                    for i in range(loop_count):
                        logging.info("[%s]",
                                     str(scraped_data.ingest_info.people[i]))
                    logging.info("Last seen time of person being set as: [%s]",
                                 request.scraper_start_time)
                    metadata = IngestMetadata(self.region.region_code,
                                              self.region.jurisdiction_id,
                                              request.scraper_start_time,
                                              self.get_enum_overrides())
                    if self.BATCH_WRITES:
                        logging.info(
                            "Queuing ingest_info ([%d] people) to "
                            "batch_persistence for [%s]",
                            len(scraped_data.ingest_info.people),
                            self.region.region_code)
                        scrape_key = ScrapeKey(self.region.region_code,
                                               request.scrape_type)
                        batch_persistence.write(
                            ingest_info=scraped_data.ingest_info,
                            scrape_key=scrape_key,
                            task=task,
                        )
                    else:
                        logging.info(
                            "Writing ingest_info ([%d] people) to the database"
                            " for [%s]", len(scraped_data.ingest_info.people),
                            self.region.region_code)
                        persistence.write(
                            ingest_utils.convert_ingest_info_to_proto(
                                scraped_data.ingest_info), metadata)
                for sc in scraped_data.single_counts:
                    if not sc.date:
                        scrape_key = ScrapeKey(self.region.region_code,
                                               constants.ScrapeType.BACKGROUND)
                        session = sessions.get_current_session(scrape_key)
                        if session:
                            sc = attr.evolve(sc, date=session.start.date())
                    single_count.store_single_count(
                        sc, self.region.jurisdiction_id)
        except Exception as e:
            if self.BATCH_WRITES:
                scrape_key = ScrapeKey(self.region.region_code,
                                       request.scrape_type)
                batch_persistence.write_error(
                    error=str(e),
                    trace_id=get_trace_id_from_flask(),
                    task=task,
                    scrape_key=scrape_key,
                )
            raise e
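
A minimal sketch of the hooks _generic_scrape drives (a hypothetical scraper; it assumes the Scraper, Task, ScrapedData, and constants names used throughout these examples): get_more_tasks fans out follow-up work, and populate_data turns a fetched page into a ScrapedData whose persist flag decides whether the ingest_info is written now or carried along on the next QueueRequest.

class MinimalScraper(Scraper):
    def get_initial_task(self) -> Task:
        # Start by scraping the roster page and queuing detail pages.
        return Task(task_type=constants.TaskType.INITIAL_AND_MORE,
                    endpoint=self.get_region().base_url)

    def get_more_tasks(self, content, task):
        # One SCRAPE_DATA task per detail link found on the page.
        return [Task(task_type=constants.TaskType.SCRAPE_DATA, endpoint=link)
                for link in content.xpath('//a[@class="detail"]/@href')]

    def populate_data(self, content, task, ingest_info):
        # Pull a single person off the detail page and persist immediately.
        ingest_info.create_person(full_name=content.xpath('string(//h1)'))
        return ScrapedData(ingest_info=ingest_info, persist=True)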
Example #28
    def validate_and_return_populate_data(
        self,
        content: Optional[html.HtmlElement],
        expected_ingest_info: Optional[IngestInfo] = None,
        expected_single_counts: Optional[List[SingleCount]] = None,
        expected_persist: bool = True,
        task: Optional[Task] = None,
        info: Optional[IngestInfo] = None,
    ) -> ScrapedData:
        """This function runs populate_data and runs some extra validation
        on the output.

        Args:
            content: the content of the page to pass into get_more_tasks
            expected_ingest_info: the ingest info expected to be returned from
                `populate_data`. If `expected_ingest_info` is `None`, then
                expects the return value of `populate_data` to be `None`.
            expected_single_counts: the list of SingleCounts expected to be
                returned from `populate_data`.
            expected_persist: the expected value of persist to be returned from
                `populate_data`.
            task: the task that is being processed, optional.
            info: an ingest_info to use if provided.

        Returns:
            The result from populate_data in case the user needs to do any
            extra validations on the output.
        """
        info_to_ingest: IngestInfo = info or ingest_info.IngestInfo()
        task_to_process: Task = task or Task(
            task_type=constants.TaskType.SCRAPE_DATA, endpoint="")

        if self.scraper:
            scrape_data = self.scraper.populate_data(content, task_to_process,
                                                     info_to_ingest)

            print("FINAL")
            print(scrape_data.ingest_info)
            print("EXPECTED")
            print(expected_ingest_info)

            if expected_ingest_info is None and expected_single_counts is None:
                if scrape_data:
                    assert scrape_data.persist is False
                else:
                    assert scrape_data is None

            if expected_single_counts and scrape_data.single_counts:
                assert len(
                    scrape_data.single_counts) == len(expected_single_counts)
                diff = set(expected_single_counts) ^ set(
                    scrape_data.single_counts)
                assert not diff

            metadata: IngestMetadata = IngestMetadata(
                region=self.scraper.region.region_code,
                jurisdiction_id=self.scraper.region.jurisdiction_id,
                ingest_time=_FAKE_SCRAPER_START_TIME,
                enum_overrides=self.scraper.get_enum_overrides(),
                system_level=SystemLevel.COUNTY,
                database_key=SQLAlchemyDatabaseKey.for_schema(
                    SchemaType.JAILS),
            )

            if scrape_data.ingest_info and expected_ingest_info:
                self.validate_ingest(scrape_data.ingest_info,
                                     expected_ingest_info, metadata)

            assert scrape_data.persist == expected_persist

        if scrape_data:
            return scrape_data
        raise ValueError("Scrape data was not provided ingest info")
Example #29
import datetime
from typing import Callable

import pytest
import pytz
from flask import Flask
from mock import Mock, create_autospec, patch

from recidiviz.ingest.scrape import constants, scrape_phase, sessions, worker
from recidiviz.ingest.scrape.task_params import QueueRequest, Task
from recidiviz.utils.regions import Region

PATH = "/work/us_ca"
FAKE_QUEUE_PARAMS = QueueRequest(
    scrape_type=constants.ScrapeType.BACKGROUND,
    scraper_start_time=datetime.datetime.now(tz=pytz.UTC),
    next_task=Task(
        task_type=constants.TaskType.INITIAL,
        endpoint="some.endpoint",
    ),
)

app = Flask(__name__)
app.register_blueprint(worker.worker)
app.config["TESTING"] = True


@patch("recidiviz.utils.metadata.project_id", Mock(return_value="test-project"))
@patch("recidiviz.utils.metadata.project_number", Mock(return_value="123456789"))
class TestWorker:
    """Tests for requests to the Worker API."""

    # noinspection PyAttributeOutsideInit
    def setup_method(self, _test_method: Callable) -> None:
Example #30

def mock_region(region_code, queue_name=None, is_stoppable=False):
    return Region(
        region_code=region_code,
        shared_queue=queue_name or None,
        agency_name='the agency',
        agency_type='benevolent',
        base_url='localhost:3000',
        names_file='names.txt',
        timezone='America/New_York',
        environment='production',
        jurisdiction_id='jurisdiction_id',
        is_stoppable=is_stoppable or False,
    )


FAKE_TASK = Task(task_type=constants.TaskType.INITIAL, endpoint='fake')


class FakeScraper(Scraper):
    def __init__(self, region_name, initial_task_method):
        super().__init__(region_name)
        self.initial_task_method = initial_task_method

    def get_initial_task_method(self):
        return self.initial_task_method

    def get_initial_task(self):
        return FAKE_TASK