def export(
    self, export_configs: Sequence[ExportBigQueryViewConfig[MetricBigQueryView]]
) -> List[Tuple[ExportBigQueryViewConfig[MetricBigQueryView], GcsfsFilePath]]:
    storage_client = storage.Client()
    output_paths = []
    with futures.ThreadPoolExecutor(
        max_workers=OPTIMIZED_VIEW_EXPORTER_MAX_WORKERS
    ) as executor:
        future_to_view = {
            executor.submit(
                structured_logging.with_context(self._export_view),
                storage_client,
                config,
            ): config
            for config in export_configs
        }
        for future in futures.as_completed(future_to_view):
            config = future_to_view.pop(future)
            try:
                output_path: GcsfsFilePath = future.result()
            except Exception as e:
                logging.error(
                    "Exception found exporting view: %s.%s",
                    config.view.dataset_id,
                    config.view.view_id,
                )
                raise e

            output_paths.append((config, output_path))

    return output_paths
def execute_validation(
    rematerialize_views: bool, region_code_filter: Optional[str] = None
) -> List[DataValidationJobResult]:
    """Executes all validation checks. If |region_code_filter| is supplied, limits
    validations to just that region."""
    if rematerialize_views:
        logging.info(
            'Received query param "should_update_views" = true, updating validation dataset and views...'
        )
        view_update_manager.rematerialize_views()

    # Fetch collection of validation jobs to perform
    validation_jobs = _fetch_validation_jobs_to_perform(region_code_filter)

    logging.info("Performing a total of %s validation jobs...", len(validation_jobs))

    # Perform all validations and track failures
    failed_to_run_validations: List[DataValidationJob] = []
    failed_validations: List[DataValidationJobResult] = []
    with futures.ThreadPoolExecutor() as executor:
        future_to_jobs = {
            executor.submit(structured_logging.with_context(_run_job), job): job
            for job in validation_jobs
        }

        for future in futures.as_completed(future_to_jobs):
            job = future_to_jobs[future]
            try:
                result = future.result()
                if not result.was_successful:
                    failed_validations.append(result)
                logging.info(
                    "Finished job [%s] for region [%s]",
                    job.validation.validation_name,
                    job.region_code,
                )
            except Exception as e:
                logging.error(
                    "Failed to execute asynchronous query for validation job [%s] due to error: %s",
                    job,
                    e,
                )
                failed_to_run_validations.append(job)

    if failed_validations or failed_to_run_validations:
        logging.error(
            "Found a total of [%s] failures, with [%s] failing to run entirely. Emitting results...",
            len(failed_validations) + len(failed_to_run_validations),
            len(failed_to_run_validations),
        )

        # Emit metrics for all failures
        _emit_failures(failed_to_run_validations, failed_validations)
    else:
        logging.info("Found no failed validations...")

    logging.info(
        "Validation run complete. Analyzed a total of %s jobs.", len(validation_jobs)
    )
    return failed_validations
def check_for_finished_scrapers():
    """Checks for any finished scrapers and kicks off next processes."""

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None
    cloud_task_manager = ScraperCloudTaskManager()

    @monitoring.with_region_tag
    def _check_finished(region_code: str):
        # If there are no sessions currently scraping, nothing to check.
        session = sessions.get_current_session(
            ScrapeKey(region_code, constants.ScrapeType.BACKGROUND)
        )
        if not session or not session.phase.is_actively_scraping():
            return

        if is_scraper_finished(region_code, cloud_task_manager):
            logging.info("Region [%s] has finished scraping.", region_code)

            if next_phase:
                logging.info(
                    "Enqueueing [%s] for region [%s].", next_phase, region_code
                )
                ScraperCloudTaskManager().create_scraper_phase_task(
                    region_code=region_code, url=next_phase_url
                )

    region_codes = ingest_utils.validate_regions(
        get_str_param_values("region", request.args)
    )

    failed_regions = []
    with futures.ThreadPoolExecutor() as executor:
        future_to_region = {
            executor.submit(
                structured_logging.with_context(_check_finished), region_code
            ): region_code
            for region_code in region_codes
        }
        for future in futures.as_completed(future_to_region):
            region_code = future_to_region[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        "An exception occurred when checking region [%s]", region_code
                    )
                    failed_regions.append(region_code)

    if failed_regions:
        return (
            "Failed to check regions: {}".format(failed_regions),
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    return ("", HTTPStatus.OK)
def build(
    func: Callable, kwargs_list: List[Dict], max_workers: Optional[int] = None
) -> Generator["FutureExecutor", None, None]:
    """Creates a ThreadPoolExecutor and corresponding FutureExecutor"""
    with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_execution = FutureExecutor(
            [
                executor.submit(structured_logging.with_context(func), **kwargs)
                for kwargs in kwargs_list
            ]
        )
        yield future_execution
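# A minimal usage sketch for `build` above, assuming it is wrapped with
# @contextlib.contextmanager (suggested by its Generator return type and the
# `yield` inside the `with` block). `ingest_file`, the `wait()` call on the
# yielded FutureExecutor, and the unqualified call to `build` (it may instead
# be a FutureExecutor classmethod) are hypothetical placeholders for
# illustration; the FutureExecutor interface itself is not shown in the source.
kwargs_list = [{"path": path} for path in ("a.csv", "b.csv", "c.csv")]
with build(ingest_file, kwargs_list, max_workers=4) as execution:
    # Hypothetical: block until every submitted future has completed.
    execution.wait()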
def process_dag(
    self, view_process_fn: Callable[[BigQueryView, ParentResultsT], ViewResultT]
) -> Dict[BigQueryView, ViewResultT]:
    """This method provides a level-by-level "breadth-first" traversal of a DAG and
    executes view_process_fn on every node in level order."""
    processed: Set[DagKey] = set()
    queue: Set[BigQueryViewDagNode] = set(self.roots)
    result: Dict[BigQueryView, ViewResultT] = {}
    with futures.ThreadPoolExecutor(max_workers=DAG_WALKER_MAX_WORKERS) as executor:
        future_to_view = {
            executor.submit(
                structured_logging.with_context(view_process_fn), node.view, {}
            ): node
            for node in self.roots
        }
        processing = {node.dag_key for node in future_to_view.values()}
        while processing:
            completed, _not_completed = futures.wait(
                future_to_view.keys(), return_when="FIRST_COMPLETED"
            )
            for future in completed:
                node = future_to_view.pop(future)
                try:
                    view_result: ViewResultT = future.result()
                except Exception as e:
                    logging.error(
                        "Exception found fetching result for view_key: %s",
                        node.dag_key,
                    )
                    raise e

                result[node.view] = view_result
                processing.remove(node.dag_key)
                processed.add(node.dag_key)

                for child_key in node.child_node_keys:
                    child_node = self.nodes_by_key[child_key]
                    # Both `processed` and `processing` hold DagKeys, so the
                    # membership checks compare against the child's key.
                    if child_key in processed or child_node in queue:
                        raise ValueError(
                            f"Unexpected situation where child node has already been processed: {child_key}"
                        )
                    if child_key in processing:
                        continue

                    parents_all_processed = True
                    parent_results = {}
                    for parent_key in child_node.parent_keys:
                        if (
                            parent_key in self.nodes_by_key
                            and parent_key not in processed
                        ):
                            parents_all_processed = False
                            break
                        if parent_key in self.nodes_by_key:
                            parent_view = self.nodes_by_key[parent_key].view
                            parent_results[parent_view] = result[parent_view]
                    if parents_all_processed:
                        future = executor.submit(
                            structured_logging.with_context(view_process_fn),
                            child_node.view,
                            parent_results,
                        )
                        future_to_view[future] = child_node
                        processing.add(child_node.dag_key)
    return result
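# A minimal sketch of how process_dag might be called, assuming the method
# lives on a DAG-walker object instantiated elsewhere (the `dag_walker`
# variable below is a hypothetical instance, not shown in the source). The
# processing function receives each view together with the results already
# computed for its parent views, so this example simply records each view's
# number of direct parents within the DAG.
def _count_parents(
    view: BigQueryView, parent_results: Dict[BigQueryView, int]
) -> int:
    # Root views receive an empty dict, so they report zero parents.
    return len(parent_results)

parent_counts: Dict[BigQueryView, int] = dag_walker.process_dag(_count_parents)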
def execute_validation(
    rematerialize_views: bool,
    region_code_filter: Optional[str] = None,
    validation_name_filter: Optional[Pattern] = None,
    sandbox_dataset_prefix: Optional[str] = None,
) -> List[DataValidationJobResult]:
    """Executes all validation checks. If |region_code_filter| is supplied, limits
    validations to just that region. If |validation_name_filter| is supplied, only
    performs validations on those that have a regex match. If
    |sandbox_dataset_prefix| is supplied, performs validation using sandbox dataset.
    """

    sandbox_dataset_overrides = None
    if sandbox_dataset_prefix:
        sandbox_dataset_overrides = dataset_overrides_for_view_builders(
            sandbox_dataset_prefix, DEPLOYED_VIEW_BUILDERS
        )

    if rematerialize_views:
        logging.info(
            'Received query param "should_update_views" = true, updating validation dataset and views...'
        )
        view_update_manager.rematerialize_views_for_view_builders(
            views_to_update_builders=DEPLOYED_VIEW_BUILDERS,
            all_view_builders=DEPLOYED_VIEW_BUILDERS,
            view_source_table_datasets=VIEW_SOURCE_TABLE_DATASETS,
            dataset_overrides=sandbox_dataset_overrides,
            # If a given view hasn't been loaded to the sandbox it will skip it
            skip_missing_views=True,
        )

    # Fetch collection of validation jobs to perform
    validation_jobs = _fetch_validation_jobs_to_perform(
        region_code_filter=region_code_filter,
        validation_name_filter=validation_name_filter,
        dataset_overrides=sandbox_dataset_overrides,
    )

    run_datetime = datetime.datetime.today()
    run_id = uuid.uuid4().hex
    logging.info(
        "Performing a total of %s validation jobs [run_datetime: %s, run_id: %s]...",
        len(validation_jobs),
        run_datetime.isoformat(),
        run_id,
    )

    # Perform all validations and track failures
    failed_to_run_validations: List[DataValidationJob] = []
    failed_validations: List[DataValidationJobResult] = []
    results_to_store: List[ValidationResultForStorage] = []
    with futures.ThreadPoolExecutor() as executor:
        future_to_jobs = {
            executor.submit(structured_logging.with_context(_run_job), job): job
            for job in validation_jobs
        }

        for future in futures.as_completed(future_to_jobs):
            job = future_to_jobs[future]
            try:
                result: DataValidationJobResult = future.result()
                results_to_store.append(
                    ValidationResultForStorage.from_validation_result(
                        run_id=run_id,
                        run_datetime=run_datetime,
                        result=result,
                    )
                )
                if not result.was_successful:
                    failed_validations.append(result)
                logging.info(
                    "Finished job [%s] for region [%s]",
                    job.validation.validation_name,
                    job.region_code,
                )
            except Exception as e:
                logging.error(
                    "Failed to execute asynchronous query for validation job [%s] due to error: %s",
                    job,
                    e,
                )
                results_to_store.append(
                    ValidationResultForStorage.from_validation_job(
                        run_id=run_id,
                        run_datetime=run_datetime,
                        job=job,
                    )
                )
                failed_to_run_validations.append(job)

    store_validation_results(results_to_store)

    if failed_validations or failed_to_run_validations:
        logging.error(
            "Found a total of [%s] failures, with [%s] failing to run entirely. Emitting results...",
            len(failed_validations) + len(failed_to_run_validations),
            len(failed_to_run_validations),
        )

        # Emit metrics for all failures
        _emit_failures(failed_to_run_validations, failed_validations)
    else:
        logging.info("Found no failed validations...")

    logging.info(
        "Validation run complete. Analyzed a total of %s jobs.", len(validation_jobs)
    )
    return failed_validations
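# A hedged invocation sketch for the sandbox-aware execute_validation above,
# using only the signature shown in the source. The region code, regex, and
# dataset prefix values are illustrative only; `re` is assumed to be imported.
failed = execute_validation(
    rematerialize_views=True,
    region_code_filter="US_XX",
    validation_name_filter=re.compile("incarceration"),
    sandbox_dataset_prefix="my_sandbox",
)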
def scraper_start():
    """Request handler to start one or several running scrapers

    Kicks off new scrape session for each region and scrape type in request

    Example query:
        /scraper_control/start?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'
        timezone: (string) The timezone to scrape.
        surname: (string, optional) Name to start scrape at. Required if
            given_names provided
        given_names: (string, optional) Name to start scrape at

    Args:
        N/A

    Returns:
        N/A
    """

    @monitoring.with_region_tag
    def _start_scraper(region, scrape_type):
        scrape_key = ScrapeKey(region, scrape_type)

        most_recent_session = next(
            sessions.get_sessions(
                region_code=scrape_key.region_code,
                include_closed=True,
                most_recent_only=True,
                scrape_type=scrape_key.scrape_type,
            ),
            None,
        )
        if most_recent_session and not most_recent_session.phase.has_persisted():
            raise Exception(
                "Session already running for region [%s]. Could "
                "not start a new session" % region
            )

        logging.info(
            "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
            scrape_key,
            BATCH_PUBSUB_TYPE,
        )
        pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

        logging.info("Starting new scraper for: [%s]", scrape_key)
        scraper = regions.get_region(region).get_scraper()

        current_session = sessions.create_session(scrape_key)

        # Help avoid race condition with new session info
        # vs updating that w/first task.
        time.sleep(1)

        # Clear prior query docket for this scrape type and start adding new
        # items in a background thread. In the case that there is a large
        # names list, loading it can take some time. Loading it in the
        # background allows us to start the scraper before it is fully
        # loaded.
        tracker.purge_docket_and_session(scrape_key)
        # Note, the request context isn't copied when launching this thread, so
        # any logs from within `load_target_list` will not be associated with
        # the start scraper request.
        load_docket_thread = threading.Thread(
            target=structured_logging.with_context(docket.load_target_list),
            args=(scrape_key, given_names, surname),
        )
        load_docket_thread.start()

        # Start scraper, if the docket is empty this will wait for a bounded
        # period of time for an item to be published (~90 seconds).
        logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
        scraper.start_scrape(scrape_type)

        sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

        # Wait for the docket to be loaded
        load_docket_thread.join()

    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    stripe_value = get_str_param_values("stripe", request.args)
    region_value = get_str_param_values("region", request.args)
    # If a timezone wasn't provided, start all regions. If it was, only start
    # regions that match the timezone.
    scrape_regions = ingest_utils.validate_regions(
        region_value, timezone=timezone, stripes=stripe_value
    )
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args)
    )

    if not scrape_regions or not scrape_types:
        return (
            "Missing or invalid parameters, or no regions found, see logs.",
            HTTPStatus.BAD_REQUEST,
        )

    given_names = get_str_param_value("given_names", request.args, "")
    surname = get_str_param_value("surname", request.args, "")

    failed_starts = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls.
        future_to_args = {
            executor.submit(
                structured_logging.with_context(_start_scraper),
                region_code,
                scrape_type,
            ): (region_code, scrape_type)
            for scrape_type in scrape_types
            for region_code in scrape_regions
        }

        # Wait for all the calls to finish.
        for future in futures.as_completed(future_to_args):
            region_code, scrape_type = future_to_args[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        "An exception occurred when starting region [%s] for [%s]",
                        region_code,
                        scrape_type,
                    )
                    failed_starts.append((region_code, scrape_type))
                else:
                    logging.info(
                        "Finished starting region [%s] for [%s].",
                        region_code,
                        scrape_type,
                    )

    if failed_starts:
        # This causes the whole request to be retried. Any regions whose session
        # was opened during this call will be immediately skipped in the next
        # call when we check for open sessions. Any regions we failed to start
        # likely still had sessions opened and thus will be skipped, but it is
        # worth retrying anyway.
        return (
            "Failed to start regions: {}".format(failed_starts),
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    return ("", HTTPStatus.OK)
def scraper_stop():
    """Request handler to stop one or several running scrapers.

    Note: Stopping any scrape type for a region involves purging the
    scraping task queue for that region, necessarily killing any other
    in-progress scrape types. Untargeted scrapes killed by this request
    handler will be noted and resumed a moment or two later.

    Unlike the other Scraper action methods, stop_scrape doesn't call
    individually for each scrape type. That could create a race condition,
    as each call noticed the other scrape type was running at the same
    time, kicked off a resume effort with a delay, and then our second
    call came to kill the other type and missed the (delayed / not yet in
    taskqueue) call - effectively not stopping the scrape.

    Instead, we send the full list of scrape_types to stop, and
    Scraper.stop_scrape is responsible for fan-out.

    Example query:
        /scraper_control/stop?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'

    Args:
        N/A

    Returns:
        N/A
    """
    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    stripe = get_str_param_values("stripe", request.args)
    respect_is_stoppable = get_str_param_value("respect_is_stoppable", request.args)

    # If a timezone wasn't provided, stop all regions. If it was, only stop
    # regions that match the timezone. If stripe provided, stop only regions
    # with matching stripe.
    scrape_regions = ingest_utils.validate_regions(
        get_str_param_values("region", request.args), timezone=timezone, stripes=stripe
    )
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args)
    )

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None

    @monitoring.with_region_tag
    def _stop_scraper(region: str):
        logging.info("Trying to stop scraper for region [%s].", region)
        for scrape_type in scrape_types:
            key = ScrapeKey(region_code=region, scrape_type=scrape_type)
            session = sessions.get_current_session(key)
            if not session:
                logging.info(
                    "No [%s] scrape to stop for region: [%s]", scrape_type, region
                )
                continue

            region_scraper = regions.get_region(region).get_scraper()
            was_stopped = region_scraper.stop_scrape(scrape_type, respect_is_stoppable)
            if was_stopped:
                closed_sessions = sessions.close_session(key)
                for closed_session in closed_sessions:
                    sessions.update_phase(
                        closed_session, scrape_phase.ScrapePhase.PERSIST
                    )
                if next_phase:
                    logging.info("Enqueueing %s for region [%s].", next_phase, region)
                    ScraperCloudTaskManager().create_scraper_phase_task(
                        region_code=region, url=next_phase_url
                    )

    if not scrape_regions or not scrape_types:
        return (
            "Missing or invalid parameters, see service logs.",
            HTTPStatus.BAD_REQUEST,
        )

    failed_stops = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls.
        future_to_regions = {
            executor.submit(
                structured_logging.with_context(_stop_scraper), region_code
            ): region_code
            for region_code in scrape_regions
        }

        # Wait for all the calls to finish.
        for future in futures.as_completed(future_to_regions):
            region_code = future_to_regions[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        "An exception occurred when stopping region [%s] for [%s]",
                        region_code,
                        scrape_types,
                    )
                    failed_stops.append(region_code)
                else:
                    logging.info(
                        "Finished stopping region [%s] for [%s].",
                        region_code,
                        scrape_types,
                    )

    if failed_stops:
        # This causes the whole request to be retried. Any regions whose session
        # was closed during this call will be immediately skipped in the next
        # call as we won't find any sessions to close. Any regions we failed to
        # stop likely still had their sessions closed and thus will be skipped,
        # but it is worth retrying anyway.
        return (
            "Failed to stop regions: {}".format(failed_stops),
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    return ("", HTTPStatus.OK)
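# A condensed sketch of the fan-out idiom the handlers above share: each worker
# is wrapped in structured_logging.with_context before being submitted to a
# ThreadPoolExecutor, each future is mapped back to the input it was submitted
# with, and the inputs whose futures raised are collected for the caller. The
# helper name and signature are illustrative only (not part of the source); it
# relies on the same `futures`, `logging`, and `structured_logging` imports
# used by the handlers above.
def _run_for_all(worker: Callable[[str], None], items: List[str]) -> List[str]:
    failed: List[str] = []
    with futures.ThreadPoolExecutor() as executor:
        future_to_item = {
            executor.submit(structured_logging.with_context(worker), item): item
            for item in items
        }
        for future in futures.as_completed(future_to_item):
            item = future_to_item[future]
            try:
                future.result()
            except Exception:
                logging.exception("Worker failed for [%s]", item)
                failed.append(item)
    return failed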