def test_get_sessions_background_and_open_only(self):
    first = self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 17)),
    )
    # snapshot
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.SNAPSHOT,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 18)),
    )
    # closed
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 19)),
        end=fix_dt(datetime(2009, 6, 21)),
    )
    # different region, scrape type
    self.create_session(
        region_code="us_fl",
        scrape_type=constants.ScrapeType.SNAPSHOT,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 19)),
    )

    results = sessions.get_sessions(
        "us_ny", include_closed=False, scrape_type=constants.ScrapeType.BACKGROUND
    )
    assert to_entities(results) == to_entities([first])
def test_get_sessions_not_open_or_closed(self):
    # different region
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 17)),
    )
    # open
    self.create_session(
        region_code="us_fl",
        scrape_type=constants.ScrapeType.SNAPSHOT,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 18)),
    )
    # closed
    self.create_session(
        region_code="us_fl",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 19)),
        end=fix_dt(datetime(2009, 6, 21)),
    )

    results = sessions.get_sessions(
        "us_fl", include_open=False, include_closed=False
    )
    assert not to_entities(results)
def test_get_sessions_none_for_scrape_type(self):
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 17)),
    )
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.SNAPSHOT,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 18)),
    )
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 19)),
        end=fix_dt(datetime(2009, 6, 21)),
    )
    self.create_session(
        region_code="us_fl",
        scrape_type=constants.ScrapeType.SNAPSHOT,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 19)),
    )

    results = sessions.get_sessions(
        "us_fl", scrape_type=constants.ScrapeType.BACKGROUND
    )
    assert not to_entities(results)
def test_get_sessions_open_most_recent_only(self):
    # older
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 17)),
    )
    second = self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.SNAPSHOT,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 18)),
    )
    # closed
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 19)),
        end=fix_dt(datetime(2009, 6, 21)),
    )
    # different region
    self.create_session(
        region_code="us_fl",
        scrape_type=constants.ScrapeType.SNAPSHOT,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 19)),
    )

    results = sessions.get_sessions(
        "us_ny", include_closed=False, most_recent_only=True
    )
    assert to_entities(results) == to_entities([second])
def test_get_sessions_defaults(self):
    first = self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        start=fix_dt(datetime(2009, 6, 17)),
        phase=scrape_phase.ScrapePhase.SCRAPE,
    )
    second = self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.SNAPSHOT,
        start=fix_dt(datetime(2009, 6, 18)),
        phase=scrape_phase.ScrapePhase.SCRAPE,
    )
    third = self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        start=fix_dt(datetime(2009, 6, 19)),
        end=fix_dt(datetime(2009, 6, 21)),
        phase=scrape_phase.ScrapePhase.SCRAPE,
    )
    # different region
    self.create_session(
        region_code="us_fl",
        scrape_type=constants.ScrapeType.SNAPSHOT,
        start=fix_dt(datetime(2009, 6, 19)),
        phase=scrape_phase.ScrapePhase.SCRAPE,
    )

    results = sessions.get_sessions("us_ny")
    assert to_entities(results) == to_entities([third, second, first])
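# Taken together, the tests above pin down the filtering semantics of
# `sessions.get_sessions`. The following is a hypothetical, pure-Python
# sketch of that filtering (the real implementation queries a datastore,
# so the name and details here are illustrative assumptions, not its API):
def _filter_sessions_sketch(all_sessions, region_code, include_open=True,
                            include_closed=True, most_recent_only=False,
                            scrape_type=None):
    matching = [
        s for s in all_sessions
        if s.region_code == region_code
        # None means "any scrape type".
        and (scrape_type is None or s.scrape_type == scrape_type)
        # An open session has no end time; a closed one does.
        and (include_open or s.end is not None)
        and (include_closed or s.end is None)
    ]
    # Results come back most recent first, ordered by start time.
    matching.sort(key=lambda s: s.start, reverse=True)
    return matching[:1] if most_recent_only else matching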
def stop_scrape(self, scrape_type, respect_is_stoppable=False) -> bool:
    """Stops all active scraping tasks and resumes non-targeted scrape types.

    Stops the scraper, even if in the middle of a session. In production,
    this is called by a cron job scheduled to prevent interference with the
    normal operation of the scraped site.

    We share the scraping taskqueue between snapshot and background scraping
    to be certain of our throttling for the third-party service. As a
    result, cleaning up / purging the taskqueue necessarily kills all scrape
    types. We kick off resume_scrape for any ongoing scrape types that
    aren't targets.

    Args:
        scrape_type: Scrape type to terminate
        respect_is_stoppable: Defaults to False, in which case the scraper
            is stopped regardless of whether `is_stoppable` is set to true.
            Otherwise, stops the region's scraper only if its
            `is_stoppable` is set to true.

    Returns:
        A bool indicating whether or not the scrape was stopped.
    """
    region = self.get_region()

    if respect_is_stoppable and not region.is_stoppable:
        logging.info(
            "Stop scrape was called and ignored for the region: %s "
            "because the region's manifest is flagged as not stoppable",
            region.region_code,
        )
        return False

    logging.info("Stopping scrape for the region: %s", region.region_code)

    try:
        self.cloud_task_manager.purge_scrape_tasks(
            region_code=region.region_code,
            queue_name=region.get_queue_name(),
        )
    except Exception as e:
        logging.error(
            "Caught an exception while trying to purge scrape "
            "tasks. The message was:\n%s",
            str(e),
        )
        return False

    # Check for other running scrapes, and if found kick off a delayed
    # resume for them since the taskqueue purge will kill them.
    other_scrapes = set()

    open_sessions = sessions.get_sessions(
        region.region_code, include_closed=False
    )
    for session in open_sessions:
        if session.scrape_type != scrape_type:
            other_scrapes.add(session.scrape_type)

    for scrape in other_scrapes:
        logging.info("Resuming unaffected scrape type: %s.", str(scrape))
        self.resume_scrape(scrape)

    return True
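# A hedged usage sketch for `stop_scrape`: in production a cron-triggered
# handler might stop background scraping for each region while respecting
# its `is_stoppable` flag. The scraper lookup mirrors the one used in
# `_start_scraper` below; whether a cron handler actually iterates regions
# like this is an assumption for illustration.
def _stop_all_background_scrapes_sketch(region_codes):
    for region_code in region_codes:
        scraper = regions.get_region(region_code).get_scraper()
        stopped = scraper.stop_scrape(
            constants.ScrapeType.BACKGROUND, respect_is_stoppable=True
        )
        if not stopped:
            logging.info("Scrape for [%s] was not stopped", region_code)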
def _start_scraper(region, scrape_type):
    scrape_key = ScrapeKey(region, scrape_type)

    most_recent_session = next(
        sessions.get_sessions(
            region_code=scrape_key.region_code,
            include_closed=True,
            most_recent_only=True,
            scrape_type=scrape_key.scrape_type,
        ),
        None,
    )
    if most_recent_session and not most_recent_session.phase.has_persisted():
        raise Exception(
            "Session already running for region [%s]. Could "
            "not start a new session" % region
        )

    logging.info(
        "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
        scrape_key,
        BATCH_PUBSUB_TYPE,
    )
    pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

    logging.info("Starting new scraper for: [%s]", scrape_key)
    scraper = regions.get_region(region).get_scraper()

    current_session = sessions.create_session(scrape_key)

    # Help avoid race condition with new session info
    # vs updating that w/first task.
    time.sleep(1)

    # Clear prior query docket for this scrape type and start adding new
    # items in a background thread. In the case that there is a large
    # names list, loading it can take some time. Loading it in the
    # background allows us to start the scraper before it is fully
    # loaded.
    tracker.purge_docket_and_session(scrape_key)
    # Note: the request context isn't copied when launching this thread, so
    # any logs from within `load_target_list` will not be associated with
    # the start scraper request.
    # `given_names` and `surname` are not defined in this function; they are
    # presumably supplied by an enclosing scope (e.g. a request handler this
    # helper is nested in).
    load_docket_thread = threading.Thread(
        target=structured_logging.with_context(docket.load_target_list),
        args=(scrape_key, given_names, surname),
    )
    load_docket_thread.start()

    # Start scraper. If the docket is empty, this will wait for a bounded
    # period of time for an item to be published (~90 seconds).
    logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
    scraper.start_scrape(scrape_type)

    sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

    # Wait for the docket to be loaded
    load_docket_thread.join()
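# The note above says the Flask request context is not copied into the
# thread, yet the thread target is still wrapped in `with_context`. Below
# is a minimal sketch of one plausible behavior, assuming the wrapper
# re-establishes thread-local structured-logging attributes (not the
# request context itself) on the worker thread. This is an illustrative
# assumption, not the real implementation.
import functools
import threading

_log_context = threading.local()


def _with_context_sketch(fn):
    # Capture the caller's logging context at wrap time...
    captured = getattr(_log_context, "value", None)

    @functools.wraps(fn)
    def wrapped(*args, **kwargs):
        # ...and restore it inside the worker thread before calling through,
        # so log lines emitted there stay tied to the originating work.
        _log_context.value = captured
        return fn(*args, **kwargs)

    return wrapped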
def test_get_sessions_defaults_with_order(self):
    first = self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 17)),
        end=fix_dt(datetime(2009, 6, 18)),
    )
    second = self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.SNAPSHOT,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 18)),
        end=fix_dt(datetime(2009, 6, 19)),
    )
    third = self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 19)),
    )

    results = sessions.get_sessions("us_ny")
    assert to_entities(results) == to_entities([third, second, first])
def test_get_sessions_none_at_all(self):
    results = sessions.get_sessions("us_ny")
    assert not to_entities(results)