Example #1
def _copy_regional_dataset_to_multi_region(
        config: CloudSqlToBQConfig,
        dataset_override_prefix: Optional[str]) -> None:
    """Copies the unioned regional dataset for a schema to the multi-region dataset
    that contains the same data. Backs up the multi-region dataset before performing
    the copy. This backup dataset will get cleaned up if the copy succeeds, but
    otherwise will stick around for 1 week before tables expire.
    """
    bq_client = BigQueryClientImpl()

    source_dataset_id = config.unioned_regional_dataset(
        dataset_override_prefix)
    destination_dataset_id = config.unioned_multi_region_dataset(
        dataset_override_prefix)
    destination_dataset = bq_client.dataset_ref_for_id(destination_dataset_id)

    backup_dataset = bq_client.backup_dataset_tables_if_dataset_exists(
        destination_dataset_id)

    try:
        if bq_client.dataset_exists(destination_dataset):
            tables = bq_client.list_tables(destination_dataset_id)
            for table in tables:
                bq_client.delete_table(table.dataset_id, table.table_id)

        bq_client.create_dataset_if_necessary(
            destination_dataset,
            default_table_expiration_ms=TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS
            if dataset_override_prefix else None,
        )

        # Copy into the canonical unioned source datasets in the US multi-region
        bq_client.copy_dataset_tables_across_regions(
            source_dataset_id=source_dataset_id,
            destination_dataset_id=destination_dataset_id,
        )
    except Exception:
        logging.error(
            "Failed to copy [%s] to [%s] - contents backup can be found at [%s]",
            source_dataset_id,
            destination_dataset_id,
            backup_dataset.dataset_id if backup_dataset else "NO BACKUP",
        )
        raise

    if backup_dataset:
        bq_client.delete_dataset(backup_dataset,
                                 delete_contents=True,
                                 not_found_ok=True)
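
How this helper might be invoked is not shown above; below is a minimal usage sketch. The CloudSqlToBQConfig.for_schema_type factory and the SchemaType enum are assumptions about the surrounding codebase, not taken from this excerpt.

# Hypothetical usage sketch; CloudSqlToBQConfig.for_schema_type and
# SchemaType are assumed to exist in the surrounding codebase.
config = CloudSqlToBQConfig.for_schema_type(SchemaType.STATE)

# Production refresh: copy into the canonical multi-region dataset.
_copy_regional_dataset_to_multi_region(config, dataset_override_prefix=None)

# Sandbox refresh: copy into prefixed datasets whose tables auto-expire.
_copy_regional_dataset_to_multi_region(config, dataset_override_prefix="my_prefix")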
Example #2
def _delete_empty_datasets() -> None:
    """Deletes all empty datasets in BigQuery."""
    bq_client = BigQueryClientImpl()
    datasets = bq_client.list_datasets()

    for dataset_resource in datasets:
        dataset_ref = bq_client.dataset_ref_for_id(dataset_resource.dataset_id)
        dataset = bq_client.get_dataset(dataset_ref)
        # peekable() makes the lazy table iterator truthiness-testable without
        # consuming it, so the emptiness check below is cheap.
        tables = peekable(bq_client.list_tables(dataset.dataset_id))
        created_time = dataset.created
        dataset_age_seconds = (datetime.datetime.now(datetime.timezone.utc) -
                               created_time).total_seconds()

        if not tables and dataset_age_seconds > DATASET_DELETION_MIN_SECONDS:
            logging.info(
                "Dataset %s is empty and was not created very recently. Deleting...",
                dataset_ref.dataset_id,
            )
            bq_client.delete_dataset(dataset_ref)
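
For reference, a minimal sketch of the emptiness check this function leans on: more_itertools.peekable makes a lazy iterator truthiness-testable without consuming it. The threshold value below is an illustrative assumption, not taken from this excerpt.

from more_itertools import peekable

# Illustrative value only: skip datasets younger than this so that datasets
# created by still-running processes are not deleted out from under them.
DATASET_DELETION_MIN_SECONDS = 2 * 60 * 60

# bool(peekable(...)) is False only when the underlying iterator is empty,
# which is what makes `if not tables` above a cheap emptiness check.
assert not peekable(iter([]))
assert peekable(iter([1, 2, 3]))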
Example #3
def main(dry_run: bool) -> None:
    client = BigQueryClientImpl()
    datasets = list(client.list_datasets())
    candidate_deletable_datasets = [
        d for d in datasets if d.dataset_id.startswith("temp_dataset_")
    ]

    # Compute the cutoff directly in UTC; tagging a naive local datetime with
    # tzinfo=pytz.UTC would skew the cutoff by the local UTC offset.
    cutoff_date = datetime.now(tz=pytz.UTC) - timedelta(days=1)
    for candidate in candidate_deletable_datasets:
        dataset = client.get_dataset(candidate.dataset_id)
        if dataset.modified is not None and dataset.modified < cutoff_date:
            if dry_run:
                logging.info("[Dry-run] Would delete %s", dataset.dataset_id)
            else:
                logging.info("Deleting %s...", dataset.dataset_id)
                client.delete_dataset(dataset,
                                      delete_contents=True,
                                      not_found_ok=True)
        else:
            logging.info("Skipping %s because it was modified too recently.",
                         dataset.dataset_id)
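
A plausible command-line entrypoint for this script, sketched as an assumption; the actual flag name and wiring are not shown in this excerpt.

import argparse
import logging

if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    parser = argparse.ArgumentParser(
        description="Delete temp_dataset_* datasets not modified in the last day.")
    # The flag name is an assumption; the real script may spell it differently.
    parser.add_argument("--dry-run", dest="dry_run", action="store_true")
    args = parser.parse_args()
    main(dry_run=args.dry_run)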
Example #4
def compare_dataflow_output_to_sandbox(
    sandbox_dataset_prefix: str,
    job_name_to_compare: str,
    base_output_job_id: str,
    sandbox_output_job_id: str,
    additional_columns_to_compare: List[str],
    allow_overwrite: bool = False,
) -> None:
    """Compares the output for all metrics produced by the daily pipeline job with the given |job_name_to_compare|
    between the output from the |base_output_job_id| job in the dataflow_metrics dataset and the output from the
    |sandbox_output_job_id| job in the sandbox dataflow dataset."""
    bq_client = BigQueryClientImpl()
    sandbox_dataflow_dataset_id = (sandbox_dataset_prefix + "_" +
                                   DATAFLOW_METRICS_DATASET)

    sandbox_comparison_output_dataset_id = (sandbox_dataset_prefix +
                                            "_dataflow_comparison_output")
    sandbox_comparison_output_dataset_ref = bq_client.dataset_ref_for_id(
        sandbox_comparison_output_dataset_id)

    if bq_client.dataset_exists(sandbox_comparison_output_dataset_ref) and any(
            bq_client.list_tables(sandbox_comparison_output_dataset_id)):
        if not allow_overwrite:
            if __name__ == "__main__":
                logging.error(
                    "Dataset %s already exists in project %s. To overwrite, set --allow_overwrite.",
                    sandbox_comparison_output_dataset_id,
                    bq_client.project_id,
                )
                sys.exit(1)
            else:
                raise ValueError(
                    f"Cannot write comparison output to a non-empty dataset. Please delete tables in dataset: "
                    f"{bq_client.project_id}.{sandbox_comparison_output_dataset_id}."
                )
        else:
            # Clean up the existing tables in the dataset
            for table in bq_client.list_tables(
                    sandbox_comparison_output_dataset_id):
                bq_client.delete_table(table.dataset_id, table.table_id)

    bq_client.create_dataset_if_necessary(
        sandbox_comparison_output_dataset_ref,
        TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS)

    query_jobs: List[Tuple[QueryJob, str]] = []

    pipelines = YAMLDict.from_path(PRODUCTION_TEMPLATES_PATH).pop_dicts(
        "daily_pipelines")

    for pipeline in pipelines:
        if pipeline.pop("job_name", str) == job_name_to_compare:
            pipeline_metric_types = pipeline.peek_optional("metric_types", str)

            if not pipeline_metric_types:
                raise ValueError(
                    f"Pipeline job {job_name_to_compare} missing required metric_types attribute."
                )

            metric_types_for_comparison = pipeline_metric_types.split()

            for metric_class, metric_table in DATAFLOW_METRICS_TO_TABLES.items():
                metric_type_value = DATAFLOW_TABLES_TO_METRIC_TYPES[
                    metric_table].value

                if metric_type_value in metric_types_for_comparison:
                    comparison_query = _query_for_metric_comparison(
                        bq_client,
                        base_output_job_id,
                        sandbox_output_job_id,
                        sandbox_dataflow_dataset_id,
                        metric_class,
                        metric_table,
                        additional_columns_to_compare,
                    )

                    query_job = bq_client.create_table_from_query_async(
                        dataset_id=sandbox_comparison_output_dataset_id,
                        table_id=metric_table,
                        query=comparison_query,
                        overwrite=True,
                    )

                    # Add query job to the list of running jobs
                    query_jobs.append((query_job, metric_table))

    for query_job, output_table_id in query_jobs:
        # Wait for the insert job to complete before looking for the table
        query_job.result()

        output_table = bq_client.get_table(
            sandbox_comparison_output_dataset_ref, output_table_id)

        if output_table.num_rows == 0:
            # If there are no rows in the output table, then the output was identical
            bq_client.delete_table(sandbox_comparison_output_dataset_id,
                                   output_table_id)

    metrics_with_different_output = peekable(
        bq_client.list_tables(sandbox_comparison_output_dataset_id))

    logging.info(
        "\n*************** DATAFLOW OUTPUT COMPARISON RESULTS ***************\n"
    )

    if metrics_with_different_output:
        for metric_table in metrics_with_different_output:
            # This will always be true, and is here to silence mypy warnings
            assert isinstance(metric_table, bigquery.table.TableListItem)

            logging.warning(
                "Dataflow output differs for metric %s. See %s.%s for diverging rows.",
                metric_table.table_id,
                sandbox_comparison_output_dataset_id,
                metric_table.table_id,
            )
    else:
        logging.info(
            "Dataflow output identical. Deleting dataset %s.",
            sandbox_comparison_output_dataset_ref.dataset_id,
        )
        bq_client.delete_dataset(sandbox_comparison_output_dataset_ref,
                                 delete_contents=True)
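
_query_for_metric_comparison is not shown in this excerpt. A common shape for this kind of diff query is a pair of EXCEPT DISTINCT selects unioned together, sketched below; the project, dataset, table, and column names are hypothetical.

# Hypothetical diff query: a row lands in the comparison output table only if
# it appears in exactly one of the two job outputs. All identifiers here are
# illustrative, not taken from the code above.
comparison_query = """
(SELECT 'base_only' AS which_output, state_code, person_id
 FROM `my-project.dataflow_metrics.my_metric_table`
 WHERE job_id = 'base-job-id'
 EXCEPT DISTINCT
 SELECT 'base_only' AS which_output, state_code, person_id
 FROM `my-project.my_prefix_dataflow_metrics.my_metric_table`
 WHERE job_id = 'sandbox-job-id')
UNION ALL
(SELECT 'sandbox_only' AS which_output, state_code, person_id
 FROM `my-project.my_prefix_dataflow_metrics.my_metric_table`
 WHERE job_id = 'sandbox-job-id'
 EXCEPT DISTINCT
 SELECT 'sandbox_only' AS which_output, state_code, person_id
 FROM `my-project.dataflow_metrics.my_metric_table`
 WHERE job_id = 'base-job-id')
"""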
Example #5
def compare_metric_view_output_to_sandbox(
    sandbox_dataset_prefix: str,
    load_sandbox_views: bool,
    check_determinism: bool,
    allow_schema_changes: bool,
    dataset_id_filters: Optional[List[str]],
) -> None:
    """Compares the output of all deployed metric views to the output of the corresponding views in the sandbox
    dataset."""
    if load_sandbox_views:
        logging.info(
            "Loading views into sandbox datasets prefixed with %s",
            sandbox_dataset_prefix,
        )
        load_views_to_sandbox(sandbox_dataset_prefix)

    bq_client = BigQueryClientImpl()
    sandbox_comparison_output_dataset_id = (sandbox_dataset_prefix +
                                            "_metric_view_comparison_output")
    sandbox_comparison_output_dataset_ref = bq_client.dataset_ref_for_id(
        sandbox_comparison_output_dataset_id)

    if bq_client.dataset_exists(sandbox_comparison_output_dataset_ref) and any(
            bq_client.list_tables(sandbox_comparison_output_dataset_id)):
        raise ValueError(
            f"Cannot write comparison output to a non-empty dataset. Please delete tables in dataset: "
            f"{bq_client.project_id}.{sandbox_comparison_output_dataset_id}.")

    bq_client.create_dataset_if_necessary(
        sandbox_comparison_output_dataset_ref,
        TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS)

    query_jobs: List[Tuple[QueryJob, str]] = []
    skipped_views: List[str] = []

    for view_builders in VIEW_BUILDERS_BY_NAMESPACE.values():
        for view_builder in view_builders:
            # Only compare output of metric views
            if not isinstance(view_builder, MetricBigQueryViewBuilder):
                continue

            base_dataset_id = view_builder.dataset_id

            if dataset_id_filters and base_dataset_id not in dataset_id_filters:
                continue

            if view_builder in VIEW_BUILDERS_WITH_KNOWN_NOT_DETERMINISTIC_OUTPUT:
                logging.warning(
                    "View %s.%s has known non-deterministic output. Skipping output comparison.",
                    view_builder.dataset_id,
                    view_builder.view_id,
                )
                skipped_views.append(
                    f"{view_builder.dataset_id}.{view_builder.view_id}")
                continue

            sandbox_dataset_id = sandbox_dataset_prefix + "_" + base_dataset_id

            if not bq_client.dataset_exists(
                    bq_client.dataset_ref_for_id(sandbox_dataset_id)):
                raise ValueError(
                    f"Trying to compare output to a sandbox dataset that does not exist: "
                    f"{bq_client.project_id}.{sandbox_dataset_id}")

            base_dataset_ref = bq_client.dataset_ref_for_id(base_dataset_id)
            base_view_id = (
                view_builder.build().materialized_view_table_id
                if view_builder.should_materialize and not check_determinism
                else view_builder.view_id)

            if not base_view_id:
                raise ValueError(
                    "Unexpected empty base_view_id. view_id or materialized_view_table_id unset"
                    f"for {view_builder}.")

            if not check_determinism and not bq_client.table_exists(
                    base_dataset_ref, base_view_id):
                logging.warning(
                    "View %s.%s does not exist. Skipping output comparison.",
                    base_dataset_ref.dataset_id,
                    base_view_id,
                )
                skipped_views.append(f"{base_dataset_id}.{base_view_id}")
                continue

            if not bq_client.table_exists(
                    bq_client.dataset_ref_for_id(sandbox_dataset_id),
                    base_view_id):
                logging.warning(
                    "View %s.%s does not exist in sandbox. Skipping output comparison.",
                    sandbox_dataset_id,
                    base_view_id,
                )
                skipped_views.append(f"{sandbox_dataset_id}.{base_view_id}")
                continue
            query_job, output_table_id = _view_output_comparison_job(
                bq_client,
                view_builder,
                base_view_id,
                base_dataset_id,
                sandbox_dataset_id,
                sandbox_comparison_output_dataset_id,
                check_determinism,
                allow_schema_changes,
            )

            # Add query job to the list of running jobs
            query_jobs.append((query_job, output_table_id))

    for query_job, output_table_id in query_jobs:
        # Wait for the insert job to complete before looking for the table
        query_job.result()

        output_table = bq_client.get_table(
            sandbox_comparison_output_dataset_ref, output_table_id)

        if output_table.num_rows == 0:
            # If there are no rows in the output table, then the view output was identical
            bq_client.delete_table(sandbox_comparison_output_dataset_id,
                                   output_table_id)

    views_with_different_output = peekable(
        bq_client.list_tables(sandbox_comparison_output_dataset_id))

    logging.info(
        "\n*************** METRIC VIEW OUTPUT RESULTS ***************\n")

    if dataset_id_filters:
        logging.info(
            "Only compared metric view output for the following datasets: \n %s \n",
            dataset_id_filters,
        )

    logging.info(
        "Skipped output comparison for the following metric views: \n %s \n",
        skipped_views,
    )

    if views_with_different_output:
        for view in views_with_different_output:
            base_dataset_id, base_view_id = view.table_id.split("--")

            logging.warning(
                "View output differs for view %s.%s. See %s.%s for diverging rows.",
                base_dataset_id,
                base_view_id,
                sandbox_comparison_output_dataset_id,
                view.table_id,
            )
    else:
        output_message = (
            "identical between deployed views and sandbox datasets"
            if not check_determinism else "deterministic")
        logging.info(
            "View output %s. Deleting dataset %s.",
            output_message,
            sandbox_comparison_output_dataset_ref.dataset_id,
        )
        bq_client.delete_dataset(sandbox_comparison_output_dataset_ref,
                                 delete_contents=True)
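
The split("--") above implies that _view_output_comparison_job (not shown) names its output tables "<base_dataset_id>--<base_view_id>". A sketch of that inferred naming round trip; the helper name is hypothetical.

# Inferred from view.table_id.split("--") above; this helper is hypothetical
# and only illustrates the apparent naming convention.
def comparison_output_table_id(base_dataset_id: str, base_view_id: str) -> str:
    return f"{base_dataset_id}--{base_view_id}"

table_id = comparison_output_table_id("dashboard_views", "my_metric_view")
assert table_id.split("--") == ["dashboard_views", "my_metric_view"]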