Example 1
 def test_get_sessions_background_and_open_only(self):
     first = self.create_session(
         region_code="us_ny",
         scrape_type=constants.ScrapeType.BACKGROUND,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 17)),
     )
     # snapshot
     self.create_session(
         region_code="us_ny",
         scrape_type=constants.ScrapeType.SNAPSHOT,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 18)),
     )
     # closed
     self.create_session(
         region_code="us_ny",
         scrape_type=constants.ScrapeType.BACKGROUND,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 19)),
         end=fix_dt(datetime(2009, 6, 21)),
     )
     # different region, scrape type
     self.create_session(
         region_code="us_fl",
         scrape_type=constants.ScrapeType.SNAPSHOT,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 19)),
     )
     results = sessions.get_sessions(
         "us_ny", include_closed=False, scrape_type=constants.ScrapeType.BACKGROUND
     )
     assert to_entities(results) == to_entities([first])
Example 2
 def test_get_sessions_not_open_or_closed(self):
     # different region
     self.create_session(
         region_code="us_ny",
         scrape_type=constants.ScrapeType.BACKGROUND,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 17)),
     )
     # open
     self.create_session(
         region_code="us_fl",
         scrape_type=constants.ScrapeType.SNAPSHOT,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 18)),
     )
     # closed
     self.create_session(
         region_code="us_fl",
         scrape_type=constants.ScrapeType.BACKGROUND,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 19)),
         end=fix_dt(datetime(2009, 6, 21)),
     )
     results = sessions.get_sessions(
         "us_fl", include_open=False, include_closed=False
     )
     assert not to_entities(results)
Example 3
 def test_get_sessions_none_for_scrape_type(self):
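     # different region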
     self.create_session(
         region_code="us_ny",
         scrape_type=constants.ScrapeType.BACKGROUND,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 17)),
     )
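     # different region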
     self.create_session(
         region_code="us_ny",
         scrape_type=constants.ScrapeType.SNAPSHOT,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 18)),
     )
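     # different region, closed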
     self.create_session(
         region_code="us_ny",
         scrape_type=constants.ScrapeType.BACKGROUND,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 19)),
         end=fix_dt(datetime(2009, 6, 21)),
     )
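     # matching region, different scrape type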
     self.create_session(
         region_code="us_fl",
         scrape_type=constants.ScrapeType.SNAPSHOT,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 19)),
     )
     results = sessions.get_sessions(
         "us_fl", scrape_type=constants.ScrapeType.BACKGROUND
     )
     assert not to_entities(results)
Example 4
 def test_get_sessions_open_most_recent_only(self):
     # older
     self.create_session(
         region_code="us_ny",
         scrape_type=constants.ScrapeType.BACKGROUND,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 17)),
     )
     second = self.create_session(
         region_code="us_ny",
         scrape_type=constants.ScrapeType.SNAPSHOT,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 18)),
     )
     # closed
     self.create_session(
         region_code="us_ny",
         scrape_type=constants.ScrapeType.BACKGROUND,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 19)),
         end=fix_dt(datetime(2009, 6, 21)),
     )
     # different region
     self.create_session(
         region_code="us_fl",
         scrape_type=constants.ScrapeType.SNAPSHOT,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 19)),
     )
     results = sessions.get_sessions(
         "us_ny", include_closed=False, most_recent_only=True
     )
     assert to_entities(results) == to_entities([second])
Example 5
 def test_get_sessions_defaults(self):
     first = self.create_session(
         region_code="us_ny",
         scrape_type=constants.ScrapeType.BACKGROUND,
         start=fix_dt(datetime(2009, 6, 17)),
         phase=scrape_phase.ScrapePhase.SCRAPE,
     )
     second = self.create_session(
         region_code="us_ny",
         scrape_type=constants.ScrapeType.SNAPSHOT,
         start=fix_dt(datetime(2009, 6, 18)),
         phase=scrape_phase.ScrapePhase.SCRAPE,
     )
     third = self.create_session(
         region_code="us_ny",
         scrape_type=constants.ScrapeType.BACKGROUND,
         start=fix_dt(datetime(2009, 6, 19)),
         end=fix_dt(datetime(2009, 6, 21)),
         phase=scrape_phase.ScrapePhase.SCRAPE,
     )
     # different region
     self.create_session(
         region_code="us_fl",
         scrape_type=constants.ScrapeType.SNAPSHOT,
         start=fix_dt(datetime(2009, 6, 19)),
         phase=scrape_phase.ScrapePhase.SCRAPE,
     )
     results = sessions.get_sessions("us_ny")
     assert to_entities(results) == to_entities([third, second, first])
Example 6
    def stop_scrape(self, scrape_type, respect_is_stoppable=False) -> bool:
        """Stops all active scraping tasks, resume non-targeted scrape types
        Stops the scraper, even if in the middle of a session. In
        production, this is called by a cron job scheduled to prevent
        interference with the normal operation of the scraped site.
        We share the scraping taskqueue between snapshot and
        background scraping to be certain of our throttling for the
        third-party service. As a result, cleaning up / purging the
        taskqueue necessarily kills all scrape types.  We kick off
        resume_scrape for any ongoing scraping types that aren't
        targets.
        Args:
            scrape_type: Scrape type to terminate
            respect_is_stoppable: Defaults to false, in which case the scraper
                will be stopped regardless of whether `is_stoppable` is set to
                true. Otherwise, stops the region's scraper only if its
                `is_stoppable` is set to true.
        Returns:
            A bool indicating whether or not the scrape was stopped.
        """
        region = self.get_region()

        if respect_is_stoppable and not region.is_stoppable:
            logging.info(
                "Stop scrape was called and ignored for the region: %s "
                "because the region's manifest is flagged as not stoppable",
                region.region_code,
            )
            return False

        logging.info("Stopping scrape for the region: %s", region.region_code)

        try:
            self.cloud_task_manager.purge_scrape_tasks(
                region_code=region.region_code,
                queue_name=region.get_queue_name())
        except Exception as e:
            logging.error(
                "Caught an exception while trying to purge scrape "
                "tasks. The message was:\n%s",
                str(e),
            )
            return False

        # Check for other running scrapes, and if found kick off a delayed
        # resume for them since the taskqueue purge will kill them.
        other_scrapes = set()
        open_sessions = sessions.get_sessions(region.region_code,
                                              include_closed=False)
        for session in open_sessions:
            if session.scrape_type != scrape_type:
                other_scrapes.add(session.scrape_type)

        for scrape in other_scrapes:
            logging.info("Resuming unaffected scrape type: %s.", str(scrape))
            self.resume_scrape(scrape)

        return True
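
A minimal sketch of how a caller might invoke the method above (hypothetical usage, not taken from the source; it assumes the scraper is obtained via `regions.get_region(...).get_scraper()` as in Example 7):

    # Hypothetical caller: stop the background scrape for a region, honoring
    # the region's `is_stoppable` flag from its manifest.
    scraper = regions.get_region("us_ny").get_scraper()
    stopped = scraper.stop_scrape(
        constants.ScrapeType.BACKGROUND, respect_is_stoppable=True
    )
    if not stopped:
        logging.info("Scrape for us_ny was not stopped")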
Example 7
    def _start_scraper(region, scrape_type):
        scrape_key = ScrapeKey(region, scrape_type)

        most_recent_session = next(
            sessions.get_sessions(
                region_code=scrape_key.region_code,
                include_closed=True,
                most_recent_only=True,
                scrape_type=scrape_key.scrape_type,
            ),
            None,
        )
        if (most_recent_session
                and not most_recent_session.phase.has_persisted()):
            raise Exception("Session already running for region [%s]. Could "
                            "not start a new session" % region)

        logging.info(
            "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
            scrape_key,
            BATCH_PUBSUB_TYPE,
        )
        pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

        logging.info("Starting new scraper for: [%s]", scrape_key)
        scraper = regions.get_region(region).get_scraper()

        current_session = sessions.create_session(scrape_key)

        # Help avoid race condition with new session info
        # vs updating that w/first task.
        time.sleep(1)

        # Clear prior query docket for this scrape type and start adding new
        # items in a background thread. In the case that there is a large
        # names list, loading it can take some time. Loading it in the
        # background allows us to start the scraper before it is fully
        # loaded.
        tracker.purge_docket_and_session(scrape_key)
        # Note, the request context isn't copied when launching this thread, so
        # any logs from within `load_target_list` will not be associated with
        # the start scraper request.
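        # `given_names` and `surname` are presumably defined in the enclosing
        # scope; they are not shown in this excerpt.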
        load_docket_thread = threading.Thread(
            target=structured_logging.with_context(docket.load_target_list),
            args=(scrape_key, given_names, surname),
        )
        load_docket_thread.start()

        # Start scraper, if the docket is empty this will wait for a bounded
        # period of time for an item to be published (~90 seconds).
        logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
        scraper.start_scrape(scrape_type)

        sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

        # Wait for the docket to be loaded
        load_docket_thread.join()
Example 8
 def test_get_sessions_defaults_with_order(self):
     first = self.create_session(
         region_code="us_ny", scrape_type=constants.ScrapeType.BACKGROUND,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 17)),
         end=fix_dt(datetime(2009, 6, 18)))
     second = self.create_session(
         region_code="us_ny", scrape_type=constants.ScrapeType.SNAPSHOT,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 18)),
         end=fix_dt(datetime(2009, 6, 19)))
     third = self.create_session(
         region_code="us_ny", scrape_type=constants.ScrapeType.BACKGROUND,
         phase=scrape_phase.ScrapePhase.START,
         start=fix_dt(datetime(2009, 6, 19)))
     results = sessions.get_sessions("us_ny")
     assert to_entities(results) == to_entities([third, second, first])
Example 9
 def test_get_sessions_none_at_all(self):
     results = sessions.get_sessions("us_ny")
     assert not to_entities(results)
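
Taken together, the examples above exercise the filter parameters accepted by `sessions.get_sessions`. A minimal sketch of the common call patterns (hypothetical usage assembled from the examples, not taken from the source):

    # All sessions for a region, most recent first (Examples 5 and 8).
    all_ny_sessions = sessions.get_sessions("us_ny")

    # Only open background sessions (Example 1).
    open_background = sessions.get_sessions(
        "us_ny", include_closed=False, scrape_type=constants.ScrapeType.BACKGROUND
    )

    # The single most recent open session, if any (Examples 4 and 7).
    most_recent = next(
        sessions.get_sessions("us_ny", include_closed=False, most_recent_only=True),
        None,
    )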