def _start_scraper(region, scrape_type):
    """Launch a new scrape session for one region / scrape type pair.

    Refuses to start if the most recent session for the pair has not yet
    persisted. Otherwise it purges the batch pubsub queue, creates a new
    session, reloads the docket on a background thread, starts the scraper
    and moves the new session into the SCRAPE phase.

    Note: `given_names` and `surname` are not parameters of this function;
    they are resolved from a surrounding scope that is not visible in this
    block.

    Args:
        region: Region identifier, passed to `ScrapeKey` and
            `regions.get_region`.
        scrape_type: Scrape type, passed to `ScrapeKey` and to the
            scraper's `start_scrape`.

    Raises:
        Exception: If the most recent session for this region and scrape
            type has not persisted yet.
    """
    key = ScrapeKey(region, scrape_type)

    # Only the single most recent session (open or closed) is of interest.
    matching_sessions = sessions.get_sessions(
        region_code=key.region_code,
        include_closed=True,
        most_recent_only=True,
        scrape_type=key.scrape_type,
    )
    latest_session = next(matching_sessions, None)
    if latest_session:
        if not latest_session.phase.has_persisted():
            raise Exception(
                "Session already running for region [%s]. "
                "Could not start a new session" % region
            )

    logging.info(
        "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
        key,
        BATCH_PUBSUB_TYPE,
    )
    pubsub_helper.purge(key, BATCH_PUBSUB_TYPE)

    logging.info("Starting new scraper for: [%s]", key)
    region_scraper = regions.get_region(region).get_scraper()
    new_session = sessions.create_session(key)

    # Short pause to help avoid a race between writing the new session
    # info and the first task updating it.
    time.sleep(1)

    # Drop the previous query docket for this scrape type, then refill it
    # from a background thread. A large names list can take a while to
    # load; doing it in the background lets the scraper start before the
    # docket is fully populated.
    tracker.purge_docket_and_session(key)

    # The request context is not copied into the new thread, so anything
    # logged inside `load_target_list` will not be associated with the
    # start scraper request.
    docket_loader = structured_logging.with_context(docket.load_target_list)
    loader_thread = threading.Thread(
        target=docket_loader,
        args=(key, given_names, surname),
    )
    loader_thread.start()

    # If the docket is still empty, starting the scraper waits a bounded
    # amount of time (~90 seconds) for an item to be published.
    logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
    region_scraper.start_scrape(scrape_type)
    sessions.update_phase(new_session, scrape_phase.ScrapePhase.SCRAPE)

    # Block until the docket has finished loading.
    loader_thread.join()
def purge_query_docket(scrape_key: ScrapeKey):
    """Remove every pending docket task for a region / scrape type.

    Purging deletes our current subscription to the docket topic. The next
    attempt to add to or pull from the topic creates a fresh subscription,
    which only receives messages published after its creation.

    Args:
        scrape_key: (ScrapeKey) Identifies the scraper whose docket tasks
            should be purged.

    Returns:
        N/A
    """
    logging.info(
        "Purging existing query docket for scraper: [%s]",
        scrape_key,
    )
    pubsub_helper.purge(scrape_key, PUBSUB_TYPE)