def check_for_finished_scrapers():
    """Checks for any finished scrapers and kicks off next processes."""

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None
    cloud_task_manager = ScraperCloudTaskManager()

    @monitoring.with_region_tag
    def _check_finished(region_code: str):
        # If there are no sessions currently scraping, nothing to check.
        session = sessions.get_current_session(
            ScrapeKey(region_code, constants.ScrapeType.BACKGROUND)
        )
        if not session or not session.phase.is_actively_scraping():
            return

        if is_scraper_finished(region_code, cloud_task_manager):
            logging.info("Region [%s] has finished scraping.", region_code)

            if next_phase:
                logging.info(
                    "Enqueueing [%s] for region [%s].", next_phase, region_code
                )
                ScraperCloudTaskManager().create_scraper_phase_task(
                    region_code=region_code, url=next_phase_url
                )

    region_codes = ingest_utils.validate_regions(
        get_str_param_values("region", request.args)
    )

    failed_regions = []
    with futures.ThreadPoolExecutor() as executor:
        future_to_region = {
            executor.submit(
                structured_logging.with_context(_check_finished), region_code
            ): region_code
            for region_code in region_codes
        }
        for future in futures.as_completed(future_to_region):
            region_code = future_to_region[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        "An exception occurred when checking region [%s]",
                        region_code,
                    )
                    failed_regions.append(region_code)

    if failed_regions:
        return (
            "Failed to check regions: {}".format(failed_regions),
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    return ("", HTTPStatus.OK)
def read_and_persist() -> Tuple[str, HTTPStatus]:
    """Reads all of the messages from Datastore for a region and persists
    them to the database.
    """
    region = request.args.get("region")
    if not isinstance(region, str):
        raise ValueError(f"Expected string region, found [{region}]")

    batch_tags = {
        monitoring.TagKey.STATUS: "COMPLETED",
        monitoring.TagKey.PERSISTED: False,
    }
    # Note: measurements must be second so it receives the region tag.
    with monitoring.push_tags(
        {monitoring.TagKey.REGION: region}
    ), monitoring.measurements(batch_tags) as measurements:
        measurements.measure_int_put(m_batch_count, 1)

        session = sessions.get_most_recent_completed_session(
            region, ScrapeType.BACKGROUND
        )
        if not session:
            raise ValueError(
                f"Most recent session for region [{region}] is unexpectedly None"
            )

        scrape_type = session.scrape_type

        try:
            did_persist = persist_to_database(region, session.start)
            batch_tags[monitoring.TagKey.PERSISTED] = did_persist
        except Exception as e:
            logging.exception(
                "An exception occurred in read and persist: %s", type(e).__name__
            )
            batch_tags[monitoring.TagKey.STATUS] = "ERROR: {}".format(type(e).__name__)
            sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
            raise BatchPersistError(region, scrape_type) from e

        if did_persist:
            next_phase = scrape_phase.next_phase(request.endpoint)
            sessions.update_phase(session, scrape_phase.ScrapePhase.RELEASE)
            if next_phase:
                logging.info("Enqueueing %s for region %s.", next_phase, region)
                ScraperCloudTaskManager().create_scraper_phase_task(
                    region_code=region, url=url_for(next_phase)
                )
            return "", HTTPStatus.OK

        sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
        return "", HTTPStatus.ACCEPTED
def scraper_stop():
    """Request handler to stop one or several running scrapers.

    Note: Stopping any scrape type for a region involves purging the
    scraping task queue for that region, necessarily killing any other
    in-progress scrape types. Untargeted scrapes killed by this request
    handler will be noted and resumed a moment or two later.

    Unlike the other Scraper action methods, stop_scrape doesn't call
    individually for each scrape type. That could create a race condition,
    as each call noticed the other scrape type was running at the same
    time, kicked off a resume effort with a delay, and then our second
    call came to kill the other type and missed the (delayed / not yet in
    taskqueue) call - effectively not stopping the scrape.

    Instead, we send the full list of scrape_types to stop, and
    Scraper.stop_scrape is responsible for fan-out.

    Example query:
        /scraper_control/stop?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'

    Args:
        N/A

    Returns:
        N/A
    """
    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    respect_is_stoppable = get_str_param_value("respect_is_stoppable", request.args)

    # If a timezone wasn't provided, stop all regions. If it was, only stop
    # regions that match the timezone.
    scrape_regions = ingest_utils.validate_regions(
        get_str_param_values("region", request.args), timezone=timezone
    )
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args)
    )

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None

    @structured_logging.copy_trace_id_to_thread
    @monitoring.with_region_tag
    def _stop_scraper(region: str):
        closed_sessions = []
        for scrape_type in scrape_types:
            closed_sessions.extend(
                sessions.close_session(ScrapeKey(region, scrape_type))
            )
        for session in closed_sessions:
            sessions.update_phase(session, scrape_phase.ScrapePhase.PERSIST)
        if not closed_sessions:
            return

        was_stopped = False
        try:
            logging.info("Stopping scraper for region [%s].", region)
            region_scraper = regions.get_region(region).get_ingestor()
            was_stopped = region_scraper.stop_scrape(
                scrape_types, respect_is_stoppable
            )
        finally:
            if next_phase and was_stopped:
                logging.info("Enqueueing %s for region [%s].", next_phase, region)
                queues.enqueue_scraper_phase(region_code=region, url=next_phase_url)

    if not scrape_regions or not scrape_types:
        return (
            "Missing or invalid parameters, see service logs.",
            HTTPStatus.BAD_REQUEST,
        )

    failed_stops = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls.
        future_to_regions = {
            executor.submit(_stop_scraper, region_code): region_code
            for region_code in scrape_regions
        }

        # Wait for all the calls to finish.
        for future in futures.as_completed(future_to_regions):
            region_code = future_to_regions[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        "An exception occurred when stopping region [%s] for [%s]",
                        region_code,
                        scrape_types,
                    )
                    failed_stops.append(region_code)
                else:
                    logging.info(
                        "Finished stopping region [%s] for [%s].",
                        region_code,
                        scrape_types,
                    )

    if failed_stops:
        # This causes the whole request to be retried. Any regions whose
        # session was closed during this call will be immediately skipped in
        # the next call as we won't find any sessions to close. Any regions
        # we failed to stop likely still had their sessions closed and thus
        # will be skipped, but it is worth retrying anyway.
        return (
            "Failed to stop regions: {}".format(failed_stops),
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    return ("", HTTPStatus.OK)
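# check_for_finished_scrapers and scraper_stop above share the same fan-out
# shape: submit one task per region to a ThreadPoolExecutor, wait with
# as_completed, and collect the regions whose task raised instead of aborting
# the whole batch. Below is a minimal, standalone sketch of that pattern;
# _example_region_fan_out, _process_region, and the region codes are
# illustrative stand-ins, not helpers that exist in this module.
def _example_region_fan_out():
    # These mirror the module-level imports already used by the handlers above.
    from concurrent import futures
    import logging

    def _process_region(region_code: str) -> None:
        # Stand-in for the real per-region work (_check_finished /
        # _stop_scraper); fails for one region to show failure collection.
        if region_code == "us_zz":
            raise RuntimeError("simulated failure")

    region_codes = ["us_ny", "us_pa", "us_zz"]
    failed_regions = []
    with futures.ThreadPoolExecutor() as executor:
        future_to_region = {
            executor.submit(_process_region, region_code): region_code
            for region_code in region_codes
        }
        for future in futures.as_completed(future_to_region):
            region_code = future_to_region[future]
            try:
                future.result()
            except Exception:
                logging.exception("Failed to process region [%s]", region_code)
                failed_regions.append(region_code)

    # failed_regions == ["us_zz"]; the handlers above turn a non-empty list
    # into an HTTP 500 so the whole request is retried.
    return failed_regions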