Example #1
    def test_get_recent_sessions(self):
        first = self.create_session(
            region_code="us_ny",
            scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 17)),
        )
        # different scrape type
        self.create_session(
            region_code="us_ny",
            scrape_type=constants.ScrapeType.SNAPSHOT,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 18)),
        )
        third = self.create_session(
            region_code="us_ny",
            scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 19)),
            end=fix_dt(datetime(2009, 6, 21)),
        )
        # different region, scrape type
        self.create_session(
            region_code="us_fl",
            scrape_type=constants.ScrapeType.SNAPSHOT,
            phase=scrape_phase.ScrapePhase.START,
            start=fix_dt(datetime(2009, 6, 19)),
        )

        results = sessions.get_recent_sessions(
            ScrapeKey("us_ny", constants.ScrapeType.BACKGROUND)
        )
        assert to_entities(results) == to_entities([third, first])
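
As the assertion shows, get_recent_sessions returns only the sessions matching the given region and scrape type, most recent first. A minimal sketch of consuming that ordering in application code (ScrapeKey, constants, and sessions come from the examples; the rest is an assumption, not the project's documented API):

    # Sketch: take the newest matching session, if any, relying on the
    # most-recent-first ordering demonstrated by the test above.
    recent = sessions.get_recent_sessions(
        ScrapeKey("us_ny", constants.ScrapeType.BACKGROUND)
    )
    latest = next(iter(recent), None)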
Example #2
    def resume_scrape(self, scrape_type):
        """Resume a stopped scrape from where it left off

        Starts the scraper up again at the same place (roughly) as it had been
        stopped previously. This allows for cron jobs to start/stop scrapers at
        different times of day.

        Args:
            scrape_type: (ScrapeType) Type of scraping to resume

        Returns:
            N/A
        """
        # Note: None of the current scrapers support resumes, so this function
        # doesn't fully work. For instance, content is thrown away.
        if scrape_type is constants.ScrapeType.BACKGROUND:
            # Background scrape

            # In most scrapers, background scrapes will use
            # short-lived docket items. However, some background
            # scrapes use only one docket item to run a giant scrape,
            # which may run for months. Limitations in GAE Pull Queues
            # make it difficult to keep track of a leased task for
            # that long, so we don't try. Resuming a background scrape
            # simply resumes from session data, and the task stays in
            # the docket un-leased. It will get deleted the next time
            # we start a new background scrape.

            recent_sessions = sessions.get_recent_sessions(
                ScrapeKey(self.get_region().region_code, scrape_type)
            )

            last_scraped = None
            for session in recent_sessions:
                if session.last_scraped:
                    last_scraped = session.last_scraped
                    break

            if last_scraped:
                content = last_scraped.split(", ")
            else:
                logging.error(
                    "No earlier session with last_scraped found; cannot resume."
                )
                return

        else:
            # Snapshot scrape

            # Get an item from the docket and continue from there. These queries
            # are very quick, so we don't bother trying to resume the same task
            # we left off on.

            content = self.iterate_docket_item(scrape_type)
            if not content:
                sessions.close_session(
                    ScrapeKey(self.get_region().region_code, scrape_type)
                )
                return

        self.add_task(
            self.get_initial_task_method(),
            QueueRequest(
                scrape_type=scrape_type,
                scraper_start_time=datetime.now(),
                next_task=self.get_initial_task(),
            ),
        )
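
The docstring notes that resume_scrape exists so cron jobs can stop and restart scrapers on a schedule. A hedged sketch of how a scheduled handler might invoke it (the handler name and the way the scraper instance is obtained are assumptions; only resume_scrape and constants.ScrapeType come from the example):

    # Hypothetical cron-triggered entry point; real deployments wire this up
    # through their own request routing and scraper lookup.
    def handle_resume(scraper):
        scraper.resume_scrape(constants.ScrapeType.BACKGROUND)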