def test_get_recent_sessions(self):
    first = self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 17)),
    )
    # different scrape type
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.SNAPSHOT,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 18)),
    )
    third = self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 19)),
        end=fix_dt(datetime(2009, 6, 21)),
    )
    # different region, scrape type
    self.create_session(
        region_code="us_fl",
        scrape_type=constants.ScrapeType.SNAPSHOT,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 19)),
    )

    results = sessions.get_recent_sessions(
        ScrapeKey("us_ny", constants.ScrapeType.BACKGROUND)
    )
    assert to_entities(results) == to_entities([third, first])
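
# For illustration only: a minimal sketch of the filtering/ordering the
# assertion above relies on. get_recent_sessions is expected to return
# sessions matching the given region and scrape type, most recently
# started first, whether or not they have ended -- hence [third, first].
# The attribute names below are assumptions for this sketch, not the
# real implementation.
def recent_sessions_sketch(all_sessions, scrape_key):
    matching = [
        s
        for s in all_sessions
        if s.region == scrape_key.region_code
        and s.scrape_type == scrape_key.scrape_type
    ]
    return sorted(matching, key=lambda s: s.start, reverse=True)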
def resume_scrape(self, scrape_type):
    """Resume a stopped scrape from where it left off

    Starts the scraper up again at the same place (roughly) as it had
    been stopped previously. This allows for cron jobs to start/stop
    scrapers at different times of day.

    Args:
        scrape_type: (ScrapeType) Type of scraping to resume

    Returns:
        N/A
    """
    # Note: None of the current scrapers support resumes, so this
    # function doesn't fully work. For instance, content is thrown away.
    if scrape_type is constants.ScrapeType.BACKGROUND:
        # Background scrape

        # In most scrapers, background scrapes will use short-lived
        # docket items. However, some background scrapes use only one
        # docket item to run a giant scrape, which may run for months.
        # Limitations in GAE Pull Queues make it difficult to keep
        # track of a leased task for that long, so we don't try.
        # Resuming a background scrape simply resumes from session
        # data, and the task stays in the docket un-leased. It will get
        # deleted the next time we start a new background scrape.
        recent_sessions = sessions.get_recent_sessions(
            ScrapeKey(self.get_region().region_code, scrape_type)
        )

        last_scraped = None
        for session in recent_sessions:
            if session.last_scraped:
                last_scraped = session.last_scraped
                break

        if last_scraped:
            content = last_scraped.split(", ")
        else:
            logging.error(
                "No earlier session with last_scraped found; "
                "cannot resume."
            )
            return

    else:
        # Snapshot scrape

        # Get an item from the docket and continue from there. These
        # queries are very quick, so we don't bother trying to resume
        # the same task we left off on.
        content = self.iterate_docket_item(scrape_type)
        if not content:
            sessions.close_session(
                ScrapeKey(self.get_region().region_code, scrape_type)
            )
            return

    self.add_task(
        self.get_initial_task_method(),
        QueueRequest(
            scrape_type=scrape_type,
            scraper_start_time=datetime.now(),
            next_task=self.get_initial_task(),
        ),
    )
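
# For illustration only: a hypothetical cron-style caller for the method
# above, matching the docstring's note that scheduled jobs start/stop
# scrapers at different times of day. get_scraper_for_region is an assumed
# helper, not part of this module; the real start/stop wiring may differ.
def resume_background_scrapes(region_codes):
    """Resume background scraping for each listed region (sketch)."""
    for region_code in region_codes:
        scraper = get_scraper_for_region(region_code)  # assumed helper
        scraper.resume_scrape(constants.ScrapeType.BACKGROUND)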