Example #1
    def test_scrape_data_and_more_no_persist_second_time_persist(
            self, mock_get_more, mock_fetch, mock_populate, mock_write):
        populate_task = Task.evolve(TEST_TASK,
                                    task_type=constants.TaskType.SCRAPE_DATA)
        mock_get_more.return_value = [populate_task]
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=False,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK,
                        task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        # Should send the ii since we chose not to persist.
        expected_tasks = [
            QueueRequest(
                scrape_type=constants.ScrapeType.BACKGROUND,
                next_task=populate_task,
                scraper_start_time=start_time,
                ingest_info=self.ii,
            )
        ]

        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 0)
        mock_get_more.assert_called_once_with(TEST_HTML, t)
        self.assertCountEqual(expected_tasks, scraper.tasks)

        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        scraper._generic_scrape(scraper.tasks[0])
        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 2)
        self.assertEqual(mock_write.call_count, 1)

        expected_metadata = IngestMetadata(
            scraper.region.region_code,
            scraper.region.jurisdiction_id,
            start_time,
            scraper.get_enum_overrides(),
        )
        expected_proto = convert_ingest_info_to_proto(self.ii)
        mock_write.assert_called_once_with(expected_proto, expected_metadata)
Example #2
    def test_scrape_data_no_more_tasks_batch(
        self,
        mock_get_more: Mock,
        mock_fetch: Mock,
        mock_populate: Mock,
        mock_write: Mock,
        mock_batch_write: Mock,
    ) -> None:
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper._generic_scrape(req)

        scrape_key = ScrapeKey("test", constants.ScrapeType.BACKGROUND)
        self.assertEqual(mock_get_more.call_count, 0)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 0)
        mock_batch_write.assert_called_once_with(
            ingest_info=self.ii,
            task=t,
            scrape_key=scrape_key,
        )
        self.assertEqual(len(scraper.tasks), 0)
Example #3
    def test_get_more_and_updates_cookies(
        self, mock_get_more: Mock, mock_fetch: Mock
    ) -> None:
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, {1: 1})
        start_time = datetime.datetime.now()
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )

        t = Task.evolve(TEST_TASK, cookies={1: 1})

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_tasks = [
            QueueRequest(
                scrape_type=constants.ScrapeType.BACKGROUND,
                next_task=t,
                scraper_start_time=start_time,
            )
        ]

        self.assertCountEqual(expected_tasks, scraper.tasks)
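Every example here builds its input with Task.evolve, which returns a modified copy rather than mutating TEST_TASK in place. A minimal sketch of that pattern, assuming Task is a frozen attrs class whose evolve classmethod wraps attr.evolve; the fields shown are a hypothetical subset of the real Task:

import attr

@attr.s(frozen=True)
class Task:
    # Hypothetical subset of the real Task's fields.
    task_type = attr.ib()
    endpoint = attr.ib(default=None)
    cookies = attr.ib(factory=dict)

    @classmethod
    def evolve(cls, task, **kwargs):
        # attr.evolve returns a new frozen instance with the given
        # fields replaced; the original instance is left untouched.
        return attr.evolve(task, **kwargs)

base = Task(task_type="initial")
updated = Task.evolve(base, cookies={1: 1})
assert base.cookies == {} and updated.cookies == {1: 1}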
Example #4
    def test_scrape_data_no_more_tasks(self, mock_get_more, mock_fetch,
                                       mock_populate, mock_write):
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_metadata = IngestMetadata(
            scraper.region.region_code,
            scraper.region.jurisdiction_id,
            start_time,
            scraper.get_enum_overrides(),
        )
        expected_proto = convert_ingest_info_to_proto(self.ii)

        self.assertEqual(mock_get_more.call_count, 0)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 1)
        mock_write.assert_called_once_with(expected_proto, expected_metadata)
        self.assertEqual(len(scraper.tasks), 0)
Example #5
    def test_scrape_data_and_more_no_persist(
            self, mock_get_more, mock_fetch, mock_populate, mock_write):
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=False,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(
            TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time
        )

        scraper = FakeScraper('test')
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        # Should send the ii since we chose not to persist.
        expected_tasks = [QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
            ingest_info=self.ii
        )]

        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 0)
        mock_get_more.assert_called_once_with(TEST_HTML, t)
        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #6
    def test_fetch_sends_all_args(self, mock_get_more, mock_fetch):
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, None)
        start_time = datetime.datetime.now()
        t = Task.evolve(
            TEST_TASK, headers='TEST_HEADERS', cookies='TEST_COOKIES',
            params='TEST_PARAMS', post_data='TEST_POST', json='TEST_JSON'
        )
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time
        )

        scraper = FakeScraper('test')
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_tasks = [QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )]

        mock_fetch.assert_called_once_with(
            t.endpoint, t.response_type, headers=t.headers, cookies=t.cookies,
            params=t.params, post_data=t.post_data, json_data=t.json
        )
        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #7
    def test_scrape_data_and_more_yes_persist(
        self,
        mock_get_more: Mock,
        mock_fetch: Mock,
        mock_populate: Mock,
        mock_write: Mock,
    ) -> None:
        mock_get_more.return_value = [TEST_TASK]
        mock_fetch.return_value = (TEST_HTML, {})
        mock_populate.return_value = ScrapedData(
            ingest_info=self.ii,
            persist=True,
        )
        start_time = datetime.datetime.now()
        t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )

        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        # Should not send the ii along since we chose to persist.
        expected_tasks = [
            QueueRequest(
                scrape_type=constants.ScrapeType.BACKGROUND,
                next_task=TEST_TASK,
                scraper_start_time=start_time,
            )
        ]
        expected_metadata = IngestMetadata(
            region=scraper.region.region_code,
            jurisdiction_id=scraper.region.jurisdiction_id,
            ingest_time=start_time,
            enum_overrides=scraper.get_enum_overrides(),
            system_level=SystemLevel.COUNTY,
            database_key=SQLAlchemyDatabaseKey.for_schema(SchemaType.JAILS),
        )
        expected_proto = convert_ingest_info_to_proto(self.ii)

        self.assertEqual(mock_get_more.call_count, 1)
        self.assertEqual(mock_populate.call_count, 1)
        self.assertEqual(mock_write.call_count, 1)
        mock_write.assert_called_once_with(expected_proto, expected_metadata)
        self.assertCountEqual(expected_tasks, scraper.tasks)
Example #8
    def test_content_no_fetch(self, mock_get_more: Mock, mock_fetch: Mock) -> None:
        t = Task.evolve(TEST_TASK, content=TEST_HTML)
        mock_get_more.return_value = [t]
        start_time = datetime.datetime.now()
        req = QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=t,
            scraper_start_time=start_time,
        )
        scraper = FakeScraper("test")
        scraper.BATCH_WRITES = False
        scraper._generic_scrape(req)

        expected_tasks = [req]

        self.assertEqual(mock_fetch.call_count, 0)
        self.assertCountEqual(expected_tasks, scraper.tasks)
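All of these test methods receive their mocks from a stack of mock.patch decorators that the excerpts omit. The ordering convention matters: patch decorators apply bottom-up, so the bottom-most patch arrives as the first mock argument after self, which is why mock_get_more consistently precedes mock_fetch above. A runnable sketch of that convention, using stdlib targets in place of the real scraper methods (which are not shown in the source):

from unittest import TestCase, main
from unittest.mock import Mock, patch

class TestPatchOrder(TestCase):
    @patch("os.path.exists")   # outer patch -> second mock argument
    @patch("os.path.isfile")   # inner patch -> first mock argument
    def test_order(self, mock_isfile: Mock, mock_exists: Mock) -> None:
        mock_isfile.return_value = True
        mock_exists.return_value = False
        import os.path
        # Each mock lines up with the decorator closest to the method.
        self.assertTrue(os.path.isfile("anything"))
        self.assertFalse(os.path.exists("anything"))

if __name__ == "__main__":
    main()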
Example #9
    def _generic_scrape(self, request: QueueRequest) -> None:
        """General handler for all scrape tasks. This is the generic entry
        point into every type of scrape; it decides what to call based on the
        task type.

        Args:
            request: the QueueRequest for this invocation, carrying the next
                task to run and any state passed along from the previous
                scrape iteration.
        """
        try:
            task = request.next_task

            # Here we handle a special case where we weren't really sure
            # we were going to get data when we submitted a task, but then
            # we ended up with data, so no more requests are required,
            # just the content we already have.
            # TODO(#680): remove this
            if task.content is not None:
                content = self._parse_html_content(task.content)
                cookies = None
            else:
                post_data = task.post_data

                # Let the child transform the post_data if it wants before
                # sending the requests.  This hook is in here in case the
                # child did something like compress the post_data before
                # it put it on the queue.
                self.transform_post_data(post_data)

                # We always fetch some content before doing anything.
                # post_data may be None if this scraper never set it on
                # the task.
                try:
                    content, cookies = self._fetch_content(
                        task.endpoint,
                        task.response_type,
                        headers=task.headers,
                        cookies=task.cookies,
                        params=task.params,
                        post_data=post_data,
                        json_data=task.json)
                except Exception as e:
                    raise ScraperFetchError(str(e)) from e

            scraped_data = None
            if self.should_scrape_data(task.task_type):
                # If we want to scrape data, we should either create an
                # ingest_info object or get the one that already exists.
                logging.info("Scraping data for [%s] and endpoint: [%s]",
                             self.region.region_code, task.endpoint)
                try:
                    scraped_data = self.populate_data(
                        content, task, request.ingest_info or IngestInfo())
                except Exception as e:
                    raise ScraperPopulateDataError(str(e)) from e

            if self.should_get_more_tasks(task.task_type):
                logging.info("Getting more tasks for [%s] and endpoint: [%s]",
                             self.region.region_code, task.endpoint)

                # Only send along ingest info if it will not be persisted now.
                ingest_info_to_send = None
                if scraped_data is not None and not scraped_data.persist:
                    ingest_info_to_send = scraped_data.ingest_info

                try:
                    # pylint: disable=assignment-from-no-return
                    next_tasks = self.get_more_tasks(content, task)
                except Exception as e:
                    raise ScraperGetMoreTasksError(str(e)) from e
                for next_task in next_tasks:
                    # Include cookies received from response, if any
                    if cookies:
                        cookies.update(next_task.cookies)
                        next_task = Task.evolve(next_task, cookies=cookies)
                    self.add_task(
                        '_generic_scrape',
                        QueueRequest(
                            scrape_type=request.scrape_type,
                            scraper_start_time=request.scraper_start_time,
                            next_task=next_task,
                            ingest_info=ingest_info_to_send,
                        ))

            if scraped_data is not None and scraped_data.persist:
                if scraped_data.ingest_info:
                    logging.info("Logging at most 4 people (were %d):",
                                 len(scraped_data.ingest_info.people))
                    loop_count = min(len(scraped_data.ingest_info.people),
                                     constants.MAX_PEOPLE_TO_LOG)
                    for i in range(loop_count):
                        logging.info("[%s]",
                                     str(scraped_data.ingest_info.people[i]))
                    logging.info("Last seen time of person being set as: [%s]",
                                 request.scraper_start_time)
                    metadata = IngestMetadata(self.region.region_code,
                                              self.region.jurisdiction_id,
                                              request.scraper_start_time,
                                              self.get_enum_overrides())
                    if self.BATCH_WRITES:
                        logging.info(
                            "Queuing ingest_info ([%d] people) to "
                            "batch_persistence for [%s]",
                            len(scraped_data.ingest_info.people),
                            self.region.region_code)
                        scrape_key = ScrapeKey(self.region.region_code,
                                               request.scrape_type)
                        batch_persistence.write(
                            ingest_info=scraped_data.ingest_info,
                            scrape_key=scrape_key,
                            task=task,
                        )
                    else:
                        logging.info(
                            "Writing ingest_info ([%d] people) to the database"
                            " for [%s]", len(scraped_data.ingest_info.people),
                            self.region.region_code)
                        persistence.write(
                            ingest_utils.convert_ingest_info_to_proto(
                                scraped_data.ingest_info), metadata)
                for sc in scraped_data.single_counts:
                    if not sc.date:
                        scrape_key = ScrapeKey(self.region.region_code,
                                               constants.ScrapeType.BACKGROUND)
                        session = sessions.get_current_session(scrape_key)
                        if session:
                            sc = attr.evolve(sc, date=session.start.date())
                    single_count.store_single_count(
                        sc, self.region.jurisdiction_id)
        except Exception as e:
            if self.BATCH_WRITES:
                scrape_key = ScrapeKey(self.region.region_code,
                                       request.scrape_type)
                batch_persistence.write_error(
                    error=str(e),
                    trace_id=get_trace_id_from_flask(),
                    task=task,
                    scrape_key=scrape_key,
                )
            raise
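One detail of the get-more-tasks loop above is easy to miss: cookies.update(next_task.cookies) lets a task's own cookies override the response cookies on key collisions, and because the same dict object is mutated on every iteration, cookies merged for one queued task are still present when the next one is processed. A standalone sketch of that dict behavior, with made-up values:

# Response cookies as returned by _fetch_content (made-up values).
cookies = {"session": "abc", "shared": "from_response"}

task_cookies = [{"shared": "from_task_a"}, {}]

queued = []
for next_task_cookies in task_cookies:
    # Mirrors the loop body: the task's cookies win on collisions.
    cookies.update(next_task_cookies)
    queued.append(dict(cookies))

# Task A's value overrode the response value...
assert queued[0]["shared"] == "from_task_a"
# ...and, because the same dict is reused, task B inherited it too.
assert queued[1]["shared"] == "from_task_a"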