def test_create_session_with_existing(self, mock_datetime, mock_client, mock_query):
    mock_datetime.now.return_value = fixed_now
    existing_session = ScrapeSession.new(
        key=datastore.key.Key("session", "existing", project=0),
        start=fixed_now,
        scrape_type=constants.ScrapeType.BACKGROUND,
        region="us_ny",
        phase=scrape_phase.ScrapePhase.START,
    )
    new_key = datastore.key.Key("session", "new", project=0)
    new_session = ScrapeSession.new(
        key=new_key,
        start=fixed_now,
        scrape_type=constants.ScrapeType.BACKGROUND,
        region="us_wy",
        phase=scrape_phase.ScrapePhase.START,
    )
    client = mock_client.return_value
    client.key.return_value = new_key
    wire_sessions_to_query(mock_client, mock_query, [existing_session])

    scrape_key = ScrapeKey("us_wy", constants.ScrapeType.BACKGROUND)
    sessions.create_session(scrape_key)

    # Creating a new session must close the existing open session and
    # persist both: the old one with its end time set, plus the new one.
    existing_session.end = fixed_now
    client.put.assert_any_call(existing_session.to_entity())
    client.put.assert_any_call(new_session.to_entity())
    assert client.put.call_count == 2

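# The test above relies on a `wire_sessions_to_query` helper that is not
# shown in this excerpt. A minimal sketch of what it is assumed to do
# (point the mocked client's query at the given sessions); the real helper
# may differ:
def wire_sessions_to_query(mock_client, mock_query, session_list):
    client = mock_client.return_value
    query = mock_query.return_value
    client.query.return_value = query
    # Return the stored sessions as raw datastore entities, as a real
    # query.fetch() would.
    query.fetch.return_value = [session.to_entity() for session in session_list]
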
def _start_scraper(region, scrape_type):
    scrape_key = ScrapeKey(region, scrape_type)

    most_recent_session = next(
        sessions.get_sessions(
            region_code=scrape_key.region_code,
            include_closed=True,
            most_recent_only=True,
            scrape_type=scrape_key.scrape_type,
        ),
        None,
    )
    if most_recent_session and not most_recent_session.phase.has_persisted():
        raise Exception(
            "Session already running for region [%s]. Could "
            "not start a new session" % region
        )

    logging.info(
        "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
        scrape_key,
        BATCH_PUBSUB_TYPE,
    )
    pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

    logging.info("Starting new scraper for: [%s]", scrape_key)
    scraper = regions.get_region(region).get_scraper()

    current_session = sessions.create_session(scrape_key)

    # Help avoid race condition with new session info
    # vs updating that w/first task.
    time.sleep(1)

    # Clear prior query docket for this scrape type and start adding new
    # items in a background thread. In the case that there is a large
    # names list, loading it can take some time. Loading it in the
    # background allows us to start the scraper before it is fully
    # loaded.
    tracker.purge_docket_and_session(scrape_key)
    # Note, the request context isn't copied when launching this thread, so
    # any logs from within `load_target_list` will not be associated with
    # the start scraper request.
    # NOTE: `given_names` and `surname` are free variables here, presumably
    # closed over from the enclosing request handler (see the sketch below).
    load_docket_thread = threading.Thread(
        target=structured_logging.with_context(docket.load_target_list),
        args=(scrape_key, given_names, surname),
    )
    load_docket_thread.start()

    # Start scraper; if the docket is empty this will wait for a bounded
    # period of time for an item to be published (~90 seconds).
    logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
    scraper.start_scrape(scrape_type)

    sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

    # Wait for the docket to be loaded
    load_docket_thread.join()

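# `_start_scraper` references `given_names` and `surname` without defining
# them, which suggests it is nested inside a request handler and closes
# over them. A hypothetical sketch of that enclosing structure; the handler
# name and parameter parsing are assumptions, not the confirmed
# implementation:
def scraper_start():
    # Region/scrape-type validation elided; see the analogous parsing of
    # request.args in scraper_resume below.
    given_names = request.args.get("given_names", "")
    surname = request.args.get("surname", "")

    def _start_scraper(region, scrape_type):
        ...  # body as defined above, closing over given_names and surname

    for region in scrape_regions:
        for scrape_type in scrape_types:
            _start_scraper(region, scrape_type)
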
def test_create_session(self, mock_datetime, mock_client):
    mock_datetime.now.return_value = fixed_now

    # Must use a full key so that the entities are equal.
    key = datastore.key.Key("session", "key", project=0)
    client = mock_client.return_value
    client.key.return_value = key

    scrape_key = ScrapeKey("us_ok", constants.ScrapeType.SNAPSHOT)
    sessions.create_session(scrape_key)

    session = ScrapeSession.new(
        key=key,
        start=fixed_now,
        scrape_type=constants.ScrapeType.SNAPSHOT,
        region="us_ok",
        phase=scrape_phase.ScrapePhase.START,
    )
    client.put.assert_called_with(session.to_entity())

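# The mock arguments above imply stacked `unittest.mock.patch` decorators,
# whose mocks are passed to the test in bottom-up order. A hypothetical
# scaffold; the patch targets, module path, and fixed_now value are
# assumptions, not the confirmed test setup:
from datetime import datetime
from unittest.mock import patch

fixed_now = datetime(2000, 1, 1)

@patch("path.to.sessions.datastore.Client")  # module path is a placeholder
@patch("path.to.sessions.datetime")  # innermost patch -> first mock argument
def test_create_session(self, mock_datetime, mock_client):
    ...
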
def scraper_resume():
    """Request handler to resume one or several stopped scrapers

    Resumes scraping for each region and scrape type in request.

    Example query:
        /scraper_control/resume?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'

    Args:
        N/A

    Returns:
        N/A
    """
    scrape_regions = ingest_utils.validate_regions(
        get_str_param_values("region", request.args)
    )
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args)
    )

    if not scrape_regions or not scrape_types:
        return (
            "Missing or invalid parameters, see service logs.",
            HTTPStatus.BAD_REQUEST,
        )

    for region in scrape_regions:
        for scrape_type in scrape_types:
            logging.info("Resuming [%s] scrape for [%s].", scrape_type, region)

            sessions.create_session(ScrapeKey(region, scrape_type))

            # Help avoid race condition with new session info
            # vs updating that w/first task.
            time.sleep(5)

            scraper = regions.get_region(region).get_scraper()
            scraper.resume_scrape(scrape_type)

    return ("", HTTPStatus.OK)

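# A hypothetical way to exercise the handler with Flask's test client,
# using the example query from the docstring; the `app` object and route
# registration are assumptions:
response = app.test_client().get(
    "/scraper_control/resume?region=us_ny&scrape_type=background"
)
assert response.status_code == HTTPStatus.OK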