def infer_release():
    """Runs infer release for the given regions."""
    region_codes = validate_regions(
        get_str_param_values("region", request.args))
    regions = [get_region(region_code) for region_code in region_codes]

    for region in regions:
        with monitoring.push_tags(
                {monitoring.TagKey.REGION: region.region_code}):
            if region.agency_type != "jail":
                continue

            session = sessions.get_most_recent_completed_session(
                region.region_code)
            if session:
                logging.info(
                    "Got most recent completed session for [%s] with "
                    "start time [%s]",
                    region.region_code,
                    session.start,
                )
                persistence.infer_release_on_open_bookings(
                    region.region_code, session.start,
                    _get_custody_status(region))
                sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)

    return "", HTTPStatus.OK

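# A minimal, hedged sketch of how a handler like infer_release might be exposed
# as a Flask endpoint. The blueprint name and URL rule below are illustrative
# assumptions, not the repository's actual routing.
from flask import Blueprint

infer_release_blueprint = Blueprint("infer_release_sketch", __name__)


@infer_release_blueprint.route("/release")
def handle_infer_release():
    # infer_release() already returns a (body, HTTPStatus) tuple, which Flask
    # accepts directly as a response.
    return infer_release()
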
def test_validate_regions_one_all(self, _mock_modules):
    assert ingest_utils.validate_regions(["all"]) == {
        "us_ny",
        "us_pa",
        "us_vt",
        "us_pa_greene",
    }

def run_scraper(args: argparse.Namespace) -> None:
    use_in_memory_sqlite_database(JailsBase)
    region_codes = validate_regions(args.region.split(","))
    if not region_codes:
        sys.exit(1)

    failed_regions = []
    valid_region_codes = cast(Set[str], region_codes)
    for region_code in valid_region_codes:
        logging.info("***")
        logging.info("***")
        logging.info("Starting scraper for region: [%s]", region_code)
        logging.info("***")
        logging.info("***")
        try:
            run_scraper_for_region(regions.get_region(region_code), args)
        except Exception:
            print(traceback.format_exc())
            failed_regions.append(region_code)

    if failed_regions:
        logging.info("***")
        logging.info(
            "The following regions raised errors during scraping: [%s]",
            failed_regions,
        )

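# A hedged sketch of the CLI wiring that could drive run_scraper locally. Only
# args.region is read by run_scraper itself; the flag name and entry-point
# layout here are assumptions for illustration, and run_scraper_for_region may
# expect additional attributes on the parsed args in the real script.
import argparse
import logging


def _create_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Run one or more region scrapers against a local database.")
    parser.add_argument(
        "--region",
        required=True,
        help="Comma-separated region codes to scrape, e.g. 'us_ny,us_pa'.")
    return parser


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    run_scraper(_create_parser().parse_args())
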
def check_for_finished_scrapers():
    """Checks for any finished scrapers and kicks off next processes."""
    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None
    cloud_task_manager = ScraperCloudTaskManager()

    @monitoring.with_region_tag
    def _check_finished(region_code: str):
        # If there are no sessions currently scraping, nothing to check.
        session = sessions.get_current_session(
            ScrapeKey(region_code, constants.ScrapeType.BACKGROUND)
        )
        if not session or not session.phase.is_actively_scraping():
            return

        if is_scraper_finished(region_code, cloud_task_manager):
            logging.info("Region [%s] has finished scraping.", region_code)

            if next_phase:
                logging.info(
                    "Enqueueing [%s] for region [%s].", next_phase, region_code
                )
                ScraperCloudTaskManager().create_scraper_phase_task(
                    region_code=region_code, url=next_phase_url
                )

    region_codes = ingest_utils.validate_regions(
        get_str_param_values("region", request.args)
    )

    failed_regions = []
    with futures.ThreadPoolExecutor() as executor:
        future_to_region = {
            executor.submit(
                structured_logging.with_context(_check_finished), region_code
            ): region_code
            for region_code in region_codes
        }
        for future in futures.as_completed(future_to_region):
            region_code = future_to_region[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        "An exception occurred when checking region [%s]",
                        region_code,
                    )
                    failed_regions.append(region_code)

    if failed_regions:
        return (
            "Failed to check regions: {}".format(failed_regions),
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )

    return ("", HTTPStatus.OK)

def test_validate_regions_multiple_all(self, mock_region, mock_env, _mock_modules):
    fake_region = Mock()
    mock_region.return_value = fake_region
    fake_region.environment = "production"
    mock_env.return_value = "production"

    assert ingest_utils.validate_regions(["us_pa", "all"]) == {
        "us_ny",
        "us_pa",
        "us_vt",
        "us_pa_greene",
    }

def test_validate_regions_environments(self, mock_region, mock_env, _mock_modules):
    region_prod, region_staging, region_none = Mock(), Mock(), Mock()
    region_prod.environment = 'production'
    region_staging.environment = 'staging'
    region_none.environment = False

    mock_region.side_effect = [
        region_prod, region_none, region_prod, region_staging
    ]
    mock_env.return_value = 'production'

    assert len(ingest_utils.validate_regions(['all'])) == 2

def scraper_resume():
    """Request handler to resume one or several stopped scrapers.

    Resumes scraping for each region and scrape type in request.

    Example query:
        /scraper_control/resume?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'

    Args:
        N/A

    Returns:
        N/A
    """
    scrape_regions = ingest_utils.validate_regions(
        get_str_param_values("region", request.args))
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    if not scrape_regions or not scrape_types:
        return (
            "Missing or invalid parameters, see service logs.",
            HTTPStatus.BAD_REQUEST,
        )

    for region in scrape_regions:
        for scrape_type in scrape_types:
            logging.info("Resuming [%s] scrape for [%s].", scrape_type, region)

            sessions.create_session(ScrapeKey(region, scrape_type))

            # Help avoid race condition with new session info
            # vs updating that w/first task.
            time.sleep(5)

            scraper = regions.get_region(region).get_scraper()
            scraper.resume_scrape(scrape_type)

    return ("", HTTPStatus.OK)

def run_scraper(args):
    use_in_memory_sqlite_database(JailsBase)
    region_codes = validate_regions(args.region.split(','))
    if not region_codes:
        sys.exit(1)

    failed_regions = []
    for region_code in region_codes:
        logging.info('***')
        logging.info('***')
        logging.info("Starting scraper for region: [%s]", region_code)
        logging.info('***')
        logging.info('***')
        try:
            run_scraper_for_region(regions.get_region(region_code), args)
        except Exception:
            print(traceback.format_exc())
            failed_regions.append(region_code)

    if failed_regions:
        logging.info('***')
        logging.info("The following regions raised errors during scraping: "
                     "[%s]", failed_regions)

def test_validate_regions_multiple_invalid(self, _mock_modules):
    assert not ingest_utils.validate_regions(["us_pa", "invalid"])


def test_validate_regions_multiple_ok(self, _mock_modules):
    assert ingest_utils.validate_regions(["us_pa", "us_ny"]) == {"us_pa", "us_ny"}


def test_validate_regions_one_invalid(self, _mock_modules):
    assert not ingest_utils.validate_regions(["ca_bc"])


def test_validate_regions_empty(self, _mock_modules):
    assert ingest_utils.validate_regions([]) == set()

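# A hedged sketch of the contract the validate_regions tests above exercise:
# 'all' expands to every supported region filtered to the current environment,
# any unknown code makes the whole call fail, and an empty input yields an
# empty set. The stub region table and environment constant below are
# illustrative stand-ins, not this module's real internals.
import logging
from typing import Iterable, Optional, Set, Union

# Stand-in for the real region registry: code -> declared environment.
_SUPPORTED_REGIONS = {
    "us_ny": "production",
    "us_pa": "production",
    "us_vt": "production",
    "us_pa_greene": "production",
}
_CURRENT_ENVIRONMENT = "production"  # stand-in for the deployed environment


def validate_regions_sketch(
        region_codes: Iterable[str],
        timezone: Optional[str] = None) -> Union[Set[str], bool]:
    requested = set(region_codes)
    if "all" in requested:
        requested.remove("all")
        # Expand 'all', keeping only regions deployed in this environment.
        # (The real helper would also apply the timezone filter here.)
        requested |= {
            code for code, env in _SUPPORTED_REGIONS.items()
            if env == _CURRENT_ENVIRONMENT
        }
    for code in requested:
        if code not in _SUPPORTED_REGIONS:
            logging.error("Region [%s] is not a supported region.", code)
            return False
    return requested


# Example: mirrors test_validate_regions_multiple_ok above.
assert validate_regions_sketch(["us_pa", "us_ny"]) == {"us_pa", "us_ny"}
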
def scraper_stop():
    """Request handler to stop one or several running scrapers.

    Note: Stopping any scrape type for a region involves purging the
    scraping task queue for that region, necessarily killing any other
    in-progress scrape types. Untargeted scrapes killed by this request
    handler will be noted and resumed a moment or two later.

    Unlike the other Scraper action methods, stop_scrape doesn't call
    individually for each scrape type. That could create a race condition,
    as each call noticed the other scrape type was running at the same time,
    kicked off a resume effort with a delay, and then our second call came to
    kill the other type and missed the (delayed / not yet in taskqueue) call -
    effectively not stopping the scrape.

    Instead, we send the full list of scrape_types to stop, and
    Scraper.stop_scrape is responsible for fan-out.

    Example query:
        /scraper_control/stop?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'

    Args:
        N/A

    Returns:
        N/A
    """
    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    respect_is_stoppable = get_str_param_value("respect_is_stoppable",
                                               request.args)

    # If a timezone wasn't provided, stop all regions. If it was, only stop
    # regions that match the timezone.
    scrape_regions = ingest_utils.validate_regions(
        get_str_param_values("region", request.args), timezone=timezone)
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None

    @structured_logging.copy_trace_id_to_thread
    @monitoring.with_region_tag
    def _stop_scraper(region: str):
        closed_sessions = []
        for scrape_type in scrape_types:
            closed_sessions.extend(
                sessions.close_session(ScrapeKey(region, scrape_type)))
        for session in closed_sessions:
            sessions.update_phase(session, scrape_phase.ScrapePhase.PERSIST)
        if not closed_sessions:
            return

        was_stopped = False
        try:
            logging.info("Stopping scraper for region [%s].", region)
            region_scraper = regions.get_region(region).get_ingestor()
            was_stopped = region_scraper.stop_scrape(scrape_types,
                                                     respect_is_stoppable)
        finally:
            if next_phase and was_stopped:
                logging.info("Enqueueing %s for region [%s].",
                             next_phase, region)
                queues.enqueue_scraper_phase(region_code=region,
                                             url=next_phase_url)

    if not scrape_regions or not scrape_types:
        return ('Missing or invalid parameters, see service logs.',
                HTTPStatus.BAD_REQUEST)

    failed_stops = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls.
        future_to_regions = \
            {executor.submit(_stop_scraper, region_code): region_code
             for region_code in scrape_regions}

        # Wait for all the calls to finish.
        for future in futures.as_completed(future_to_regions):
            region_code = future_to_regions[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        'An exception occurred when stopping region [%s] for '
                        '[%s]', region_code, scrape_types)
                    failed_stops.append(region_code)
                else:
                    logging.info('Finished stopping region [%s] for [%s].',
                                 region_code, scrape_types)

    if failed_stops:
        # This causes the whole request to be retried. Any regions whose
        # session was closed during this call will be immediately skipped in
        # the next call as we won't find any sessions to close. Any regions we
        # failed to stop likely still had their sessions closed and thus will
        # be skipped, but it is worth retrying anyway.
        return ('Failed to stop regions: {}'.format(failed_stops),
                HTTPStatus.INTERNAL_SERVER_ERROR)

    return ('', HTTPStatus.OK)

def scraper_start():
    """Request handler to start one or several running scrapers.

    Kicks off a new scrape session for each region and scrape type in request.

    Example query:
        /scraper_control/start?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'
        timezone: (string) The timezone to scrape.
        surname: (string, optional) Name to start scrape at. Required if
            given_names provided
        given_names: (string, optional) Name to start scrape at

    Args:
        N/A

    Returns:
        N/A
    """

    @structured_logging.copy_trace_id_to_thread
    @monitoring.with_region_tag
    def _start_scraper(region, scrape_type):
        scrape_key = ScrapeKey(region, scrape_type)

        most_recent_session = next(
            sessions.get_sessions(region_code=scrape_key.region_code,
                                  include_closed=True,
                                  most_recent_only=True,
                                  scrape_type=scrape_key.scrape_type), None)
        if most_recent_session and not \
                most_recent_session.phase.has_persisted():
            raise Exception("Session already running for region [%s]. Could "
                            "not start a new session" % region)

        logging.info(
            "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
            scrape_key, BATCH_PUBSUB_TYPE)
        pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

        logging.info("Starting new scraper for: [%s]", scrape_key)
        scraper = regions.get_region(region).get_ingestor()

        current_session = sessions.create_session(scrape_key)

        # Help avoid race condition with new session info
        # vs updating that w/first task.
        time.sleep(1)

        # Clear prior query docket for this scrape type and start adding new
        # items in a background thread. In the case that there is a large
        # names list, loading it can take some time. Loading it in the
        # background allows us to start the scraper before it is fully
        # loaded.
        tracker.purge_docket_and_session(scrape_key)
        # Note, the request context isn't copied when launching this thread, so
        # any logs from within `load_target_list` will not be associated with
        # the start scraper request.
        #
        # TODO(#1045): Either kill this, or ensure logs are correlated and
        # exceptions are passed up to the parent thread.
        load_docket_thread = threading.Thread(
            target=docket.load_target_list,
            args=(scrape_key, given_names, surname))
        load_docket_thread.start()

        # Start scraper, if the docket is empty this will wait for a bounded
        # period of time for an item to be published (~90 seconds).
        logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
        scraper.start_scrape(scrape_type)

        sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

        # Wait for the docket to be loaded
        load_docket_thread.join()

    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    region_value = get_str_param_values("region", request.args)
    # If a timezone wasn't provided, start all regions. If it was, only start
    # regions that match the timezone.
    scrape_regions = ingest_utils.validate_regions(
        region_value, timezone=timezone)
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    if not scrape_regions or not scrape_types:
        return ('Missing or invalid parameters, or no regions found, see logs.',
                HTTPStatus.BAD_REQUEST)

    given_names = get_str_param_value("given_names", request.args, "")
    surname = get_str_param_value("surname", request.args, "")

    failed_starts = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls.
        future_to_args = \
            {executor.submit(_start_scraper, region_code, scrape_type):
             (region_code, scrape_type)
             for scrape_type in scrape_types
             for region_code in scrape_regions}

        # Wait for all the calls to finish.
        for future in futures.as_completed(future_to_args):
            region_code, scrape_type = future_to_args[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        'An exception occurred when starting region [%s] for '
                        '[%s]', region_code, scrape_type)
                    failed_starts.append((region_code, scrape_type))
                else:
                    logging.info('Finished starting region [%s] for [%s].',
                                 region_code, scrape_type)

    if failed_starts:
        # This causes the whole request to be retried. Any regions whose
        # session was opened during this call will be immediately skipped in
        # the next call when we check for open sessions. Any regions we failed
        # to start likely still had sessions opened and thus will be skipped,
        # but it is worth retrying anyway.
        return ('Failed to start regions: {}'.format(failed_starts),
                HTTPStatus.INTERNAL_SERVER_ERROR)

    return ('', HTTPStatus.OK)