def export(
        self, export_configs: Sequence[ExportBigQueryViewConfig[MetricBigQueryView]]
    ) -> List[Tuple[ExportBigQueryViewConfig[MetricBigQueryView], GcsfsFilePath]]:
        storage_client = storage.Client()
        output_paths = []
        with futures.ThreadPoolExecutor(
            max_workers=OPTIMIZED_VIEW_EXPORTER_MAX_WORKERS
        ) as executor:
            future_to_view = {
                executor.submit(
                    structured_logging.with_context(self._export_view),
                    storage_client,
                    config,
                ): config
                for config in export_configs
            }
            for future in futures.as_completed(future_to_view):
                config = future_to_view.pop(future)
                try:
                    output_path: GcsfsFilePath = future.result()
                except Exception as e:
                    logging.error(
                        "Exception found exporting view: %s.%s",
                        config.view.dataset_id,
                        config.view.view_id,
                    )
                    raise e
                output_paths.append((config, output_path))

        return output_paths
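All of the examples on this page share the same fan-out/fan-in shape: wrap the worker callable with structured_logging.with_context so the caller's logging context travels into the pool threads, submit one future per work item, then collect results with futures.as_completed. The sketch below is a minimal, self-contained version of that shape; the real with_context is not shown on this page, so _with_context here is a hypothetical stand-in built on contextvars, and the export/config names are illustrative only.

import contextvars
import logging
from concurrent import futures
from typing import Callable, List, Tuple

request_id: contextvars.ContextVar[str] = contextvars.ContextVar("request_id", default="-")


def _with_context(fn: Callable) -> Callable:
    # Snapshot the submitting thread's context so logs emitted from the worker
    # thread can still be associated with the originating request.
    ctx = contextvars.copy_context()
    return lambda *args, **kwargs: ctx.run(fn, *args, **kwargs)


def _export_one(config_name: str) -> str:
    # Stand-in for the per-view export; returns a fake output path.
    logging.info("[request %s] exporting %s", request_id.get(), config_name)
    return f"gs://some-bucket/{config_name}.json"


def export_all(config_names: List[str]) -> List[Tuple[str, str]]:
    output_paths: List[Tuple[str, str]] = []
    with futures.ThreadPoolExecutor(max_workers=4) as executor:
        future_to_config = {
            executor.submit(_with_context(_export_one), name): name
            for name in config_names
        }
        for future in futures.as_completed(future_to_config):
            name = future_to_config.pop(future)
            try:
                output_path = future.result()
            except Exception:
                logging.exception("Export failed for config [%s]", name)
                raise
            output_paths.append((name, output_path))
    return output_paths


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    request_id.set("req-123")
    print(export_all(["view_a", "view_b", "view_c"]))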
Example #2
def execute_validation(
        rematerialize_views: bool,
        region_code_filter: Optional[str] = None
) -> List[DataValidationJobResult]:
    """Executes all validation checks. If |region_code_filter| is supplied, limits validations to just that region."""
    if rematerialize_views:
        logging.info(
            'Received query param "should_update_views" = true, updating validation dataset and views... '
        )
        view_update_manager.rematerialize_views()

    # Fetch collection of validation jobs to perform
    validation_jobs = _fetch_validation_jobs_to_perform(region_code_filter)
    logging.info("Performing a total of %s validation jobs...",
                 len(validation_jobs))

    # Perform all validations and track failures
    failed_to_run_validations: List[DataValidationJob] = []
    failed_validations: List[DataValidationJobResult] = []
    with futures.ThreadPoolExecutor() as executor:
        future_to_jobs = {
            executor.submit(structured_logging.with_context(_run_job), job): job
            for job in validation_jobs
        }

        for future in futures.as_completed(future_to_jobs):
            job = future_to_jobs[future]
            try:
                result = future.result()
                if not result.was_successful:
                    failed_validations.append(result)
                logging.info(
                    "Finished job [%s] for region [%s]",
                    job.validation.validation_name,
                    job.region_code,
                )
            except Exception as e:
                logging.error(
                    "Failed to execute asynchronous query for validation job [%s] due to error: %s",
                    job,
                    e,
                )
                failed_to_run_validations.append(job)

    if failed_validations or failed_to_run_validations:
        logging.error(
            "Found a total of [%s] failures, with [%s] failing to run entirely. Emitting results...",
            len(failed_validations) + len(failed_to_run_validations),
            len(failed_to_run_validations),
        )
        # Emit metrics for all failures
        _emit_failures(failed_to_run_validations, failed_validations)
    else:
        logging.info("Found no failed validations...")

    logging.info("Validation run complete. Analyzed a total of %s jobs.",
                 len(validation_jobs))
    return failed_validations
Example #3
    def _start_scraper(region, scrape_type):
        scrape_key = ScrapeKey(region, scrape_type)

        most_recent_session = next(
            sessions.get_sessions(
                region_code=scrape_key.region_code,
                include_closed=True,
                most_recent_only=True,
                scrape_type=scrape_key.scrape_type,
            ),
            None,
        )
        if most_recent_session and not most_recent_session.phase.has_persisted():
            raise Exception("Session already running for region [%s]. Could "
                            "not start a new session" % region)

        logging.info(
            "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
            scrape_key,
            BATCH_PUBSUB_TYPE,
        )
        pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

        logging.info("Starting new scraper for: [%s]", scrape_key)
        scraper = regions.get_region(region).get_scraper()

        current_session = sessions.create_session(scrape_key)

        # Help avoid race condition with new session info
        # vs updating that w/first task.
        time.sleep(1)

        # Clear prior query docket for this scrape type and start adding new
        # items in a background thread. In the case that there is a large
        # names list, loading it can take some time. Loading it in the
        # background allows us to start the scraper before it is fully
        # loaded.
        tracker.purge_docket_and_session(scrape_key)
        # Note, the request context isn't copied when launching this thread, so
        # any logs from within `load_target_list` will not be associated with
        # the start scraper request.
        load_docket_thread = threading.Thread(
            target=structured_logging.with_context(docket.load_target_list),
            args=(scrape_key, given_names, surname),
        )
        load_docket_thread.start()

        # Start scraper, if the docket is empty this will wait for a bounded
        # period of time for an item to be published (~90 seconds).
        logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
        scraper.start_scrape(scrape_type)

        sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

        # Wait for the docket to be loaded
        load_docket_thread.join()
Example #4
def check_for_finished_scrapers():
    """Checks for any finished scrapers and kicks off next processes."""

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None
    cloud_task_manager = ScraperCloudTaskManager()

    @monitoring.with_region_tag
    def _check_finished(region_code: str):
        # If there are no sessions currently scraping, nothing to check.
        session = sessions.get_current_session(
            ScrapeKey(region_code, constants.ScrapeType.BACKGROUND)
        )
        if not session or not session.phase.is_actively_scraping():
            return

        if is_scraper_finished(region_code, cloud_task_manager):
            logging.info("Region [%s] has finished scraping.", region_code)

            if next_phase:
                logging.info(
                    "Enqueueing [%s] for region [%s].", next_phase, region_code
                )
                ScraperCloudTaskManager().create_scraper_phase_task(
                    region_code=region_code, url=next_phase_url
                )

    region_codes = ingest_utils.validate_regions(
        get_str_param_values("region", request.args)
    )

    failed_regions = []
    with futures.ThreadPoolExecutor() as executor:
        future_to_region = {
            executor.submit(
                structured_logging.with_context(_check_finished), region_code
            ): region_code
            for region_code in region_codes
        }
        for future in futures.as_completed(future_to_region):
            region_code = future_to_region[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        "An exception occured when checking region [%s]", region_code
                    )
                    failed_regions.append(region_code)

    if failed_regions:
        return (
            "Failed to check regions: {}".format(failed_regions),
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    return ("", HTTPStatus.OK)
Example #5
    def build(
        func: Callable,
        kwargs_list: List[Dict],
        max_workers: Optional[int] = None
    ) -> Generator["FutureExecutor", None, None]:
        """ Creates a ThreadPoolExecutor and corresponding FutureExecutor """
        with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_execution = FutureExecutor([
                executor.submit(structured_logging.with_context(func),
                                **kwargs) for kwargs in kwargs_list
            ])

            yield future_execution
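The build generator above yields from inside the executor's with block, so the pool stays alive while the caller consumes the yielded FutureExecutor and is shut down once the generator resumes. A minimal, self-contained analogue is sketched below; FutureExecutor's real API is not shown on this page, so a plain list of futures stands in for it, and the context-wrapping step is omitted.

import contextlib
from concurrent import futures
from typing import Callable, Dict, Iterator, List, Optional


@contextlib.contextmanager
def build_futures(
    func: Callable, kwargs_list: List[Dict], max_workers: Optional[int] = None
) -> Iterator[List[futures.Future]]:
    with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit one call per kwargs dict; the pool is torn down on exit.
        yield [executor.submit(func, **kwargs) for kwargs in kwargs_list]


def _square(x: int) -> int:
    return x * x


if __name__ == "__main__":
    with build_futures(_square, [{"x": n} for n in range(5)]) as pending:
        print(sorted(f.result() for f in futures.as_completed(pending)))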
Example #6
    def process_dag(
        self, view_process_fn: Callable[[BigQueryView, ParentResultsT], ViewResultT]
    ) -> Dict[BigQueryView, ViewResultT]:
        """This method provides a level-by-level "breadth-first" traversal of a DAG and executes
        view_process_fn on every node in level order."""
        processed: Set[DagKey] = set()
        queue: Set[BigQueryViewDagNode] = set(self.roots)
        result: Dict[BigQueryView, ViewResultT] = {}
        with futures.ThreadPoolExecutor(max_workers=DAG_WALKER_MAX_WORKERS) as executor:
            future_to_view = {
                executor.submit(
                    structured_logging.with_context(view_process_fn), node.view, {}
                ): node
                for node in self.roots
            }
            processing = {node.dag_key for node in future_to_view.values()}
            while processing:
                completed, _not_completed = futures.wait(
                    future_to_view.keys(), return_when=futures.FIRST_COMPLETED
                )
                for future in completed:
                    node = future_to_view.pop(future)
                    try:
                        view_result: ViewResultT = future.result()
                    except Exception as e:
                        logging.error(
                            "Exception found fetching result for view_key: %s",
                            node.dag_key,
                        )
                        raise e
                    result[node.view] = view_result
                    processing.remove(node.dag_key)
                    processed.add(node.dag_key)

                    for child_key in node.child_node_keys:
                        child_node = self.nodes_by_key[child_key]
                        if child_key in processed or child_node in queue:
                            raise ValueError(
                                f"Unexpected situation where child node has already been processed: {child_key}"
                            )
                        if child_key in processing:
                            continue

                        parents_all_processed = True
                        parent_results = {}
                        for parent_key in child_node.parent_keys:
                            if (
                                parent_key in self.nodes_by_key
                                and parent_key not in processed
                            ):
                                parents_all_processed = False
                                break
                            if parent_key in self.nodes_by_key:
                                parent_view = self.nodes_by_key[parent_key].view
                                parent_results[parent_view] = result[parent_view]
                        if parents_all_processed:
                            future = executor.submit(
                                structured_logging.with_context(view_process_fn),
                                child_node.view,
                                parent_results,
                            )
                            future_to_view[future] = child_node
                            processing.add(child_node.dag_key)
        return result
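process_dag only submits a child view once every one of its parents has produced a result, re-waiting with FIRST_COMPLETED so new work is scheduled as soon as any running node finishes. A toy, self-contained version of that gating logic is sketched below, using a plain dict as the graph instead of BigQueryViewDagNode objects and a string join as the per-node work.

from concurrent import futures
from typing import Dict, List

PARENTS: Dict[str, List[str]] = {
    "a": [],
    "b": [],
    "c": ["a", "b"],
    "d": ["c"],
}
CHILDREN: Dict[str, List[str]] = {n: [] for n in PARENTS}
for _child, _parents in PARENTS.items():
    for _parent in _parents:
        CHILDREN[_parent].append(_child)


def process_node(node: str, parent_results: Dict[str, str]) -> str:
    return node + "(" + ",".join(sorted(parent_results.values())) + ")"


def process_dag() -> Dict[str, str]:
    result: Dict[str, str] = {}
    processed = set()
    with futures.ThreadPoolExecutor(max_workers=4) as executor:
        roots = [n for n, parents in PARENTS.items() if not parents]
        future_to_node = {executor.submit(process_node, n, {}): n for n in roots}
        processing = set(roots)
        while processing:
            completed, _ = futures.wait(
                future_to_node.keys(), return_when=futures.FIRST_COMPLETED
            )
            for future in completed:
                node = future_to_node.pop(future)
                result[node] = future.result()
                processing.remove(node)
                processed.add(node)
                for child in CHILDREN[node]:
                    if child in processing or child in processed:
                        continue
                    # Only submit the child once all of its parents are done.
                    if all(p in processed for p in PARENTS[child]):
                        parent_results = {p: result[p] for p in PARENTS[child]}
                        new_future = executor.submit(process_node, child, parent_results)
                        future_to_node[new_future] = child
                        processing.add(child)
    return result


if __name__ == "__main__":
    print(process_dag())  # {'a': 'a()', 'b': 'b()', 'c': 'c(a(),b())', 'd': 'd(c(a(),b()))'}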
Example #7
def execute_validation(
    rematerialize_views: bool,
    region_code_filter: Optional[str] = None,
    validation_name_filter: Optional[Pattern] = None,
    sandbox_dataset_prefix: Optional[str] = None,
) -> List[DataValidationJobResult]:
    """Executes all validation checks.
    If |region_code_filter| is supplied, limits validations to just that region.
    If |validation_name_filter| is supplied, only performs validations on those
    that have a regex match.
    If |sandbox_dataset_prefix| is supplied, performs validation using sandbox dataset
    """

    sandbox_dataset_overrides = None
    if sandbox_dataset_prefix:
        sandbox_dataset_overrides = dataset_overrides_for_view_builders(
            sandbox_dataset_prefix, DEPLOYED_VIEW_BUILDERS)

    if rematerialize_views:
        logging.info(
            'Received query param "should_update_views" = true, updating validation dataset and views... '
        )

        view_update_manager.rematerialize_views_for_view_builders(
            views_to_update_builders=DEPLOYED_VIEW_BUILDERS,
            all_view_builders=DEPLOYED_VIEW_BUILDERS,
            view_source_table_datasets=VIEW_SOURCE_TABLE_DATASETS,
            dataset_overrides=sandbox_dataset_overrides,
            # If a given view hasn't been loaded to the sandbox, it will be skipped
            skip_missing_views=True,
        )

    # Fetch collection of validation jobs to perform
    validation_jobs = _fetch_validation_jobs_to_perform(
        region_code_filter=region_code_filter,
        validation_name_filter=validation_name_filter,
        dataset_overrides=sandbox_dataset_overrides,
    )

    run_datetime = datetime.datetime.today()
    run_id = uuid.uuid4().hex
    logging.info(
        "Performing a total of %s validation jobs [run_datetime: %s, run_id: %s]...",
        len(validation_jobs),
        run_datetime.isoformat(),
        run_id,
    )

    # Perform all validations and track failures
    failed_to_run_validations: List[DataValidationJob] = []
    failed_validations: List[DataValidationJobResult] = []
    results_to_store: List[ValidationResultForStorage] = []
    with futures.ThreadPoolExecutor() as executor:
        future_to_jobs = {
            executor.submit(structured_logging.with_context(_run_job), job): job
            for job in validation_jobs
        }

        for future in futures.as_completed(future_to_jobs):
            job = future_to_jobs[future]
            try:
                result: DataValidationJobResult = future.result()
                results_to_store.append(
                    ValidationResultForStorage.from_validation_result(
                        run_id=run_id,
                        run_datetime=run_datetime,
                        result=result,
                    ))
                if not result.was_successful:
                    failed_validations.append(result)
                logging.info(
                    "Finished job [%s] for region [%s]",
                    job.validation.validation_name,
                    job.region_code,
                )
            except Exception as e:
                logging.error(
                    "Failed to execute asynchronous query for validation job [%s] due to error: %s",
                    job,
                    e,
                )
                results_to_store.append(
                    ValidationResultForStorage.from_validation_job(
                        run_id=run_id,
                        run_datetime=run_datetime,
                        job=job,
                    ))
                failed_to_run_validations.append(job)

    store_validation_results(results_to_store)

    if failed_validations or failed_to_run_validations:
        logging.error(
            "Found a total of [%s] failures, with [%s] failing to run entirely. Emitting results...",
            len(failed_validations) + len(failed_to_run_validations),
            len(failed_to_run_validations),
        )
        # Emit metrics for all failures
        _emit_failures(failed_to_run_validations, failed_validations)
    else:
        logging.info("Found no failed validations...")

    logging.info("Validation run complete. Analyzed a total of %s jobs.",
                 len(validation_jobs))
    return failed_validations
Example #8
def scraper_start():
    """Request handler to start one or several running scrapers

    Kicks off new scrape session for each region and scrape type in request

    Example query:
        /scraper_control/start?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'
        timezone: (string) The timezone to scrape.
        surname: (string, optional) Name to start scrape at. Required if
            given_names provided
        given_names: (string, optional) Name to start scrape at

    Args:
        N/A

    Returns:
        N/A
    """
    @monitoring.with_region_tag
    def _start_scraper(region, scrape_type):
        scrape_key = ScrapeKey(region, scrape_type)

        most_recent_session = next(
            sessions.get_sessions(
                region_code=scrape_key.region_code,
                include_closed=True,
                most_recent_only=True,
                scrape_type=scrape_key.scrape_type,
            ),
            None,
        )
        if most_recent_session and not most_recent_session.phase.has_persisted():
            raise Exception("Session already running for region [%s]. Could "
                            "not start a new session" % region)

        logging.info(
            "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
            scrape_key,
            BATCH_PUBSUB_TYPE,
        )
        pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

        logging.info("Starting new scraper for: [%s]", scrape_key)
        scraper = regions.get_region(region).get_scraper()

        current_session = sessions.create_session(scrape_key)

        # Help avoid race condition with new session info
        # vs updating that w/first task.
        time.sleep(1)

        # Clear prior query docket for this scrape type and start adding new
        # items in a background thread. In the case that there is a large
        # names list, loading it can take some time. Loading it in the
        # background allows us to start the scraper before it is fully
        # loaded.
        tracker.purge_docket_and_session(scrape_key)
        # Note, the request context isn't copied when launching this thread, so
        # any logs from within `load_target_list` will not be associated with
        # the start scraper request.
        load_docket_thread = threading.Thread(
            target=structured_logging.with_context(docket.load_target_list),
            args=(scrape_key, given_names, surname),
        )
        load_docket_thread.start()

        # Start scraper, if the docket is empty this will wait for a bounded
        # period of time for an item to be published (~90 seconds).
        logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
        scraper.start_scrape(scrape_type)

        sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

        # Wait for the docket to be loaded
        load_docket_thread.join()

    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    stripe_value = get_str_param_values("stripe", request.args)
    region_value = get_str_param_values("region", request.args)
    # If a timezone wasn't provided, start all regions. If it was, only start
    # regions that match the timezone.
    scrape_regions = ingest_utils.validate_regions(region_value,
                                                   timezone=timezone,
                                                   stripes=stripe_value)
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    if not scrape_regions or not scrape_types:
        return (
            "Missing or invalid parameters, or no regions found, see logs.",
            HTTPStatus.BAD_REQUEST,
        )

    given_names = get_str_param_value("given_names", request.args, "")
    surname = get_str_param_value("surname", request.args, "")

    failed_starts = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls.
        future_to_args = {
            executor.submit(
                structured_logging.with_context(_start_scraper),
                region_code,
                scrape_type,
            ): (region_code, scrape_type)
            for scrape_type in scrape_types for region_code in scrape_regions
        }

        # Wait for all the calls to finish.
        for future in futures.as_completed(future_to_args):
            region_code, scrape_type = future_to_args[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        "An exception occured when starting region [%s] for "
                        "[%s]",
                        region_code,
                        scrape_type,
                    )
                    failed_starts.append((region_code, scrape_type))
                else:
                    logging.info(
                        "Finished starting region [%s] for [%s].",
                        region_code,
                        scrape_type,
                    )

    if failed_starts:
        # This causes the whole request to be retried. Any regions whose session
        # was opened during this call will be immediately skipped in the next
        # call when we check for open sessions. Any regions we failed to start
        # likely still had sessions opened and thus will be skipped, but it is
        # worth retrying anyway.
        return (
            "Failed to start regions: {}".format(failed_starts),
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    return ("", HTTPStatus.OK)
Example #9
def scraper_stop():
    """Request handler to stop one or several running scrapers.
    Note: Stopping any scrape type for a region involves purging the
    scraping task queue for that region, necessarily killing any other
    in-progress scrape types. Untargeted scrapes killed by this request
    handler will be noted and resumed a moment or two later.
    Unlike the other Scraper action methods, stop_scrape doesn't call
    individually for each scrape type. That could create a race condition,
    as each call noticed the other scrape type was running at the same
    time, kicked off a resume effort with a delay, and then our second
    call came to kill the other type and missed the (delayed / not yet
    in taskqueue) call - effectively not stopping the scrape.
    Instead, we send the full list of scrape_types to stop, and
    Scraper.stop_scrape is responsible for fan-out.
    Example query:
        /scraper_control/stop?region=us_ny&scrape_type=background
    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'
    Args:
        N/A
    Returns:
        N/A
    """
    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    stripe = get_str_param_values("stripe", request.args)
    respect_is_stoppable = get_str_param_value("respect_is_stoppable",
                                               request.args)

    # If a timezone wasn't provided, stop all regions. If it was, only stop
    # regions that match the timezone. If a stripe was provided, stop only
    # regions with a matching stripe.
    scrape_regions = ingest_utils.validate_regions(get_str_param_values(
        "region", request.args),
                                                   timezone=timezone,
                                                   stripes=stripe)
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None

    @monitoring.with_region_tag
    def _stop_scraper(region: str):
        logging.info("Trying to stop scraper for region [%s].", region)
        for scrape_type in scrape_types:
            key = ScrapeKey(region_code=region, scrape_type=scrape_type)
            session = sessions.get_current_session(key)
            if not session:
                logging.info("No [%s] scrape to stop for region: [%s]",
                             scrape_type, region)
                continue

            region_scraper = regions.get_region(region).get_scraper()
            was_stopped = region_scraper.stop_scrape(scrape_type,
                                                     respect_is_stoppable)
            if was_stopped:
                closed_sessions = sessions.close_session(key)
                for closed_session in closed_sessions:
                    sessions.update_phase(closed_session,
                                          scrape_phase.ScrapePhase.PERSIST)
                if next_phase:
                    logging.info("Enqueueing %s for region [%s].", next_phase,
                                 region)
                    ScraperCloudTaskManager().create_scraper_phase_task(
                        region_code=region, url=next_phase_url)

    if not scrape_regions or not scrape_types:
        return (
            "Missing or invalid parameters, see service logs.",
            HTTPStatus.BAD_REQUEST,
        )

    failed_stops = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls.
        future_to_regions = {
            executor.submit(structured_logging.with_context(_stop_scraper),
                            region_code): region_code
            for region_code in scrape_regions
        }

        # Wait for all the calls to finish.
        for future in futures.as_completed(future_to_regions):
            region_code = future_to_regions[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        "An exception occured when stopping region [%s] for "
                        "[%s]",
                        region_code,
                        scrape_types,
                    )
                    failed_stops.append(region_code)
                else:
                    logging.info(
                        "Finished stopping region [%s] for [%s].",
                        region_code,
                        scrape_types,
                    )

    if failed_stops:
        # This causes the whole request to be retried. Any regions whose session
        # was closed during this call will be immediately skipped in the next
        # call as we won't find any sessions to close. Any regions we failed to
        # start likely still had their sessions closed and thus will be skipped,
        # but it is worth retrying anyway.
        return (
            "Failed to stop regions: {}".format(failed_stops),
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    return ("", HTTPStatus.OK)