Example #1
    def _start_scraper(region, scrape_type):
        scrape_key = ScrapeKey(region, scrape_type)

        most_recent_session = next(
            sessions.get_sessions(
                region_code=scrape_key.region_code,
                include_closed=True,
                most_recent_only=True,
                scrape_type=scrape_key.scrape_type,
            ),
            None,
        )
        if most_recent_session and not most_recent_session.phase.has_persisted():
            raise Exception("Session already running for region [%s]. Could "
                            "not start a new session" % region)

        logging.info(
            "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
            scrape_key,
            BATCH_PUBSUB_TYPE,
        )
        pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

        logging.info("Starting new scraper for: [%s]", scrape_key)
        scraper = regions.get_region(region).get_scraper()

        current_session = sessions.create_session(scrape_key)

        # Sleep briefly to avoid a race between persisting the new session
        # info and updating it with the first task.
        time.sleep(1)

        # Clear prior query docket for this scrape type and start adding new
        # items in a background thread. In the case that there is a large
        # names list, loading it can take some time. Loading it in the
        # background allows us to start the scraper before it is fully
        # loaded.
        tracker.purge_docket_and_session(scrape_key)
        # Note: the request context isn't copied when launching this thread, so
        # any logs from within `load_target_list` will not be associated with
        # the start scraper request. (A sketch of how `with_context` might
        # propagate context follows this example.)
        load_docket_thread = threading.Thread(
            target=structured_logging.with_context(docket.load_target_list),
            # NOTE: `given_names` and `surname` are not defined in this snippet;
            # they are presumably captured from the enclosing request-handler
            # scope in which this helper is defined.
            args=(scrape_key, given_names, surname),
        )
        load_docket_thread.start()

        # Start the scraper. If the docket is empty, this will wait a bounded
        # period of time (~90 seconds) for an item to be published. (A sketch
        # of such a bounded pull follows this example.)
        logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
        scraper.start_scrape(scrape_type)

        sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

        # Wait for the docket to be loaded
        load_docket_thread.join()
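
The `with_context` wrapper above suggests that request-scoped logging state is carried explicitly across the thread boundary. Below is a minimal sketch of how such a helper could work, assuming it is built on Python's `contextvars`; the actual `structured_logging.with_context` implementation may differ.

import contextvars
import functools
import threading
from typing import Any, Callable


def with_context(fn: Callable[..., Any]) -> Callable[..., Any]:
    """Wraps `fn` so it runs inside a copy of the caller's contextvars context.

    Hypothetical stand-in for `structured_logging.with_context`: the context is
    captured on the calling (request) thread, so any context-derived log fields
    survive the hop onto the worker thread.
    """
    ctx = contextvars.copy_context()

    @functools.wraps(fn)
    def _wrapper(*args: Any, **kwargs: Any) -> Any:
        return ctx.run(fn, *args, **kwargs)

    return _wrapper


# Usage mirrors the example: pass the wrapper, not the raw target, to the thread.
thread = threading.Thread(target=with_context(print), args=("docket loaded",))
thread.start()
thread.join()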
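
The docket in this codebase is backed by Pub/Sub (see the `pubsub_helper.purge` call above), so the bounded wait inside `start_scrape` plausibly amounts to a pull with a deadline. A hedged sketch using the google-cloud-pubsub client; the project and subscription names are placeholders:

from google.cloud import pubsub_v1

subscriber = pubsub_v1.SubscriberClient()
# Placeholder names; the real subscription would be derived from the scrape key.
subscription_path = subscriber.subscription_path("my-project", "docket-sub")

# `pull` blocks until a message arrives or the client-side timeout elapses,
# giving the ~90-second bounded wait described in the comment above.
response = subscriber.pull(
    subscription=subscription_path,
    max_messages=1,
    timeout=90,
)
if not response.received_messages:
    # Nothing was published in time; a scraper would stop or retry here.
    pass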
Example #2
def purge_query_docket(scrape_key: ScrapeKey):
    """Purges the docket of all tasks for provided region / scrape type

    This deletes our current subscription to the given docket topic. When we
    next try to add to or pull from the topic, we will create a new
    subscription. That subscription will only receive messages that are
    published after it is created.

    Args:
        scrape_key: (ScrapeKey) The scrape key whose docket tasks to purge

    Returns:
        N/A
    """
    logging.info("Purging existing query docket for scraper: [%s]", scrape_key)
    pubsub_helper.purge(scrape_key, PUBSUB_TYPE)
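
Per the docstring, a purge amounts to deleting the current subscription: the backlog is discarded, and a subscription created later only sees messages published after its creation. A minimal sketch of that pattern with the google-cloud-pubsub client (names are placeholders; the real `pubsub_helper` may differ):

from google.api_core import exceptions
from google.cloud import pubsub_v1

subscriber = pubsub_v1.SubscriberClient()
subscription_path = subscriber.subscription_path("my-project", "docket-sub")

try:
    # Deleting the subscription drops its backlog; messages published before
    # a replacement subscription exists are never delivered to it.
    subscriber.delete_subscription(subscription=subscription_path)
except exceptions.NotFound:
    pass  # No subscription yet, so nothing to purge.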