def _copy_regional_dataset_to_multi_region(
    config: CloudSqlToBQConfig, dataset_override_prefix: Optional[str]
) -> None:
    """Copies the unioned regional dataset for a schema to the multi-region
    dataset that contains the same data.

    Backs up the multi-region dataset before performing the copy. This backup
    dataset will get cleaned up if the copy succeeds, but otherwise will stick
    around for 1 week before tables expire.
    """
    bq_client = BigQueryClientImpl()

    source_dataset_id = config.unioned_regional_dataset(dataset_override_prefix)
    destination_dataset_id = config.unioned_multi_region_dataset(
        dataset_override_prefix
    )
    destination_dataset = bq_client.dataset_ref_for_id(destination_dataset_id)

    backup_dataset = bq_client.backup_dataset_tables_if_dataset_exists(
        destination_dataset_id
    )

    try:
        if bq_client.dataset_exists(destination_dataset):
            tables = bq_client.list_tables(destination_dataset_id)
            for table in tables:
                bq_client.delete_table(table.dataset_id, table.table_id)

        bq_client.create_dataset_if_necessary(
            destination_dataset,
            default_table_expiration_ms=TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS
            if dataset_override_prefix
            else None,
        )

        # Copy into the canonical unioned source datasets in the US multi-region
        bq_client.copy_dataset_tables_across_regions(
            source_dataset_id=source_dataset_id,
            destination_dataset_id=destination_dataset_id,
        )
    except Exception as e:
        logging.info(
            "Failed to flash [%s] to [%s] - contents backup can be found at [%s]",
            source_dataset_id,
            destination_dataset_id,
            backup_dataset.dataset_id if backup_dataset else "NO BACKUP",
        )
        raise e

    if backup_dataset:
        bq_client.delete_dataset(
            backup_dataset, delete_contents=True, not_found_ok=True
        )
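# Hedged usage sketch: how this copy step might be invoked after a regional
# refresh completes. CloudSqlToBQConfig.for_schema_type and SchemaType.STATE
# are hypothetical names introduced for illustration only; they are not
# confirmed by this excerpt.
def _example_copy_to_multi_region() -> None:
    config = CloudSqlToBQConfig.for_schema_type(SchemaType.STATE)  # hypothetical
    # Passing no prefix targets the canonical multi-region dataset rather than
    # a sandbox copy, so a backup will be taken before any tables are dropped.
    _copy_regional_dataset_to_multi_region(config, dataset_override_prefix=None)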
def _delete_empty_datasets() -> None:
    """Deletes all empty datasets in BigQuery."""
    bq_client = BigQueryClientImpl()
    datasets = bq_client.list_datasets()

    for dataset_resource in datasets:
        dataset_ref = bq_client.dataset_ref_for_id(dataset_resource.dataset_id)
        dataset = bq_client.get_dataset(dataset_ref)
        tables = peekable(bq_client.list_tables(dataset.dataset_id))
        created_time = dataset.created
        dataset_age_seconds = (
            datetime.datetime.now(datetime.timezone.utc) - created_time
        ).total_seconds()

        if not tables and dataset_age_seconds > DATASET_DELETION_MIN_SECONDS:
            logging.info(
                "Dataset %s is empty and was not created very recently. Deleting...",
                dataset_ref.dataset_id,
            )
            bq_client.delete_dataset(dataset_ref)
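# Hedged sketch: DATASET_DELETION_MIN_SECONDS is referenced above but not
# defined in this excerpt. An illustrative value, assuming the intent is to
# spare datasets created moments ago that may still be awaiting their first
# table load:
DATASET_DELETION_MIN_SECONDS = 2 * 60 * 60  # 2 hours; illustrative value only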
def main(dry_run: bool) -> None:
    """Deletes temp_dataset_-prefixed datasets not modified in the last day."""
    client = BigQueryClientImpl()
    datasets = list(client.list_datasets())
    candidate_deletable_datasets = [
        d for d in datasets if d.dataset_id.startswith("temp_dataset_")
    ]

    # Compute the cutoff in UTC directly; calling .replace(tzinfo=pytz.UTC) on
    # a naive local datetime would mislabel local time as UTC.
    cutoff_date = datetime.now(tz=pytz.UTC) - timedelta(days=1)
    for candidate in candidate_deletable_datasets:
        dataset = client.get_dataset(candidate.dataset_id)
        if dataset.modified is not None and dataset.modified < cutoff_date:
            if dry_run:
                logging.info("[Dry-run] Would delete %s", dataset.dataset_id)
            else:
                logging.info("Deleting %s...", dataset.dataset_id)
                client.delete_dataset(
                    dataset, delete_contents=True, not_found_ok=True
                )
        else:
            logging.info(
                "Skipping %s because it was modified too recently.",
                dataset.dataset_id,
            )
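# Hedged sketch of a CLI entry point for main(); the flag name and parser
# setup are illustrative, not the script's confirmed interface.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Delete temp_dataset_* datasets not modified in the last day."
    )
    parser.add_argument("--dry-run", dest="dry_run", action="store_true")
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    main(dry_run=args.dry_run)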
def compare_dataflow_output_to_sandbox(
    sandbox_dataset_prefix: str,
    job_name_to_compare: str,
    base_output_job_id: str,
    sandbox_output_job_id: str,
    additional_columns_to_compare: List[str],
    allow_overwrite: bool = False,
) -> None:
    """Compares the output for all metrics produced by the daily pipeline job
    with the given |job_name_to_compare| between the output from the
    |base_output_job_id| job in the dataflow_metrics dataset and the output
    from the |sandbox_output_job_id| job in the sandbox dataflow dataset."""
    bq_client = BigQueryClientImpl()

    sandbox_dataflow_dataset_id = (
        sandbox_dataset_prefix + "_" + DATAFLOW_METRICS_DATASET
    )

    sandbox_comparison_output_dataset_id = (
        sandbox_dataset_prefix + "_dataflow_comparison_output"
    )
    sandbox_comparison_output_dataset_ref = bq_client.dataset_ref_for_id(
        sandbox_comparison_output_dataset_id
    )

    if bq_client.dataset_exists(sandbox_comparison_output_dataset_ref) and any(
        bq_client.list_tables(sandbox_comparison_output_dataset_id)
    ):
        if not allow_overwrite:
            if __name__ == "__main__":
                logging.error(
                    "Dataset %s already exists in project %s. To overwrite, set --allow_overwrite.",
                    sandbox_comparison_output_dataset_id,
                    bq_client.project_id,
                )
                sys.exit(1)
            else:
                raise ValueError(
                    f"Cannot write comparison output to a non-empty dataset. Please delete tables "
                    f"in dataset: {bq_client.project_id}.{sandbox_comparison_output_dataset_id}."
                )
        else:
            # Clean up the existing tables in the dataset
            for table in bq_client.list_tables(sandbox_comparison_output_dataset_id):
                bq_client.delete_table(table.dataset_id, table.table_id)

    bq_client.create_dataset_if_necessary(
        sandbox_comparison_output_dataset_ref,
        TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS,
    )

    query_jobs: List[Tuple[QueryJob, str]] = []

    pipelines = YAMLDict.from_path(PRODUCTION_TEMPLATES_PATH).pop_dicts(
        "daily_pipelines"
    )

    for pipeline in pipelines:
        if pipeline.pop("job_name", str) == job_name_to_compare:
            pipeline_metric_types = pipeline.peek_optional("metric_types", str)

            if not pipeline_metric_types:
                raise ValueError(
                    f"Pipeline job {job_name_to_compare} missing required metric_types attribute."
                )

            metric_types_for_comparison = pipeline_metric_types.split()

            for metric_class, metric_table in DATAFLOW_METRICS_TO_TABLES.items():
                metric_type_value = DATAFLOW_TABLES_TO_METRIC_TYPES[
                    metric_table
                ].value

                if metric_type_value in metric_types_for_comparison:
                    comparison_query = _query_for_metric_comparison(
                        bq_client,
                        base_output_job_id,
                        sandbox_output_job_id,
                        sandbox_dataflow_dataset_id,
                        metric_class,
                        metric_table,
                        additional_columns_to_compare,
                    )

                    query_job = bq_client.create_table_from_query_async(
                        dataset_id=sandbox_comparison_output_dataset_id,
                        table_id=metric_table,
                        query=comparison_query,
                        overwrite=True,
                    )

                    # Add query job to the list of running jobs
                    query_jobs.append((query_job, metric_table))

    for query_job, output_table_id in query_jobs:
        # Wait for the insert job to complete before looking for the table
        query_job.result()

        output_table = bq_client.get_table(
            sandbox_comparison_output_dataset_ref, output_table_id
        )

        if output_table.num_rows == 0:
            # If there are no rows in the output table, then the output was identical
            bq_client.delete_table(
                sandbox_comparison_output_dataset_id, output_table_id
            )

    metrics_with_different_output = peekable(
        bq_client.list_tables(sandbox_comparison_output_dataset_id)
    )

    logging.info(
        "\n*************** DATAFLOW OUTPUT COMPARISON RESULTS ***************\n"
    )

    if metrics_with_different_output:
        for metric_table in metrics_with_different_output:
            # This will always be true, and is here to silence mypy warnings
            assert isinstance(metric_table, bigquery.table.TableListItem)

            logging.warning(
                "Dataflow output differs for metric %s. See %s.%s for diverging rows.",
                metric_table.table_id,
                sandbox_comparison_output_dataset_id,
                metric_table.table_id,
            )
    else:
        logging.info(
            "Dataflow output identical. Deleting dataset %s.",
            sandbox_comparison_output_dataset_ref.dataset_id,
        )
        bq_client.delete_dataset(
            sandbox_comparison_output_dataset_ref, delete_contents=True
        )
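# Hedged sketch of the shape of query _query_for_metric_comparison (not shown
# in this excerpt) might build: a symmetric difference over the compared
# columns via EXCEPT DISTINCT, so the output table is empty exactly when the
# two jobs produced identical rows. All table and column names here are
# illustrative assumptions.
def _example_symmetric_diff_query(
    project_id: str,
    base_dataset: str,
    sandbox_dataset: str,
    table: str,
    columns: List[str],
) -> str:
    cols = ", ".join(columns)
    return f"""
        (SELECT {cols} FROM `{project_id}.{base_dataset}.{table}`
         EXCEPT DISTINCT
         SELECT {cols} FROM `{project_id}.{sandbox_dataset}.{table}`)
        UNION ALL
        (SELECT {cols} FROM `{project_id}.{sandbox_dataset}.{table}`
         EXCEPT DISTINCT
         SELECT {cols} FROM `{project_id}.{base_dataset}.{table}`)
    """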
def compare_metric_view_output_to_sandbox(
    sandbox_dataset_prefix: str,
    load_sandbox_views: bool,
    check_determinism: bool,
    allow_schema_changes: bool,
    dataset_id_filters: Optional[List[str]],
) -> None:
    """Compares the output of all deployed metric views to the output of the
    corresponding views in the sandbox dataset."""
    if load_sandbox_views:
        logging.info(
            "Loading views into sandbox datasets prefixed with %s",
            sandbox_dataset_prefix,
        )
        load_views_to_sandbox(sandbox_dataset_prefix)

    bq_client = BigQueryClientImpl()
    sandbox_comparison_output_dataset_id = (
        sandbox_dataset_prefix + "_metric_view_comparison_output"
    )
    sandbox_comparison_output_dataset_ref = bq_client.dataset_ref_for_id(
        sandbox_comparison_output_dataset_id
    )

    if bq_client.dataset_exists(sandbox_comparison_output_dataset_ref) and any(
        bq_client.list_tables(sandbox_comparison_output_dataset_id)
    ):
        raise ValueError(
            f"Cannot write comparison output to a non-empty dataset. Please delete tables "
            f"in dataset: {bq_client.project_id}.{sandbox_comparison_output_dataset_id}."
        )

    bq_client.create_dataset_if_necessary(
        sandbox_comparison_output_dataset_ref,
        TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS,
    )

    query_jobs: List[Tuple[QueryJob, str]] = []
    skipped_views: List[str] = []

    for view_builders in VIEW_BUILDERS_BY_NAMESPACE.values():
        for view_builder in view_builders:
            # Only compare output of metric views
            if not isinstance(view_builder, MetricBigQueryViewBuilder):
                continue

            base_dataset_id = view_builder.dataset_id

            if dataset_id_filters and base_dataset_id not in dataset_id_filters:
                continue

            if view_builder in VIEW_BUILDERS_WITH_KNOWN_NOT_DETERMINISTIC_OUTPUT:
                logging.warning(
                    "View %s.%s has known non-deterministic output. Skipping output comparison.",
                    view_builder.dataset_id,
                    view_builder.view_id,
                )
                skipped_views.append(
                    f"{view_builder.dataset_id}.{view_builder.view_id}"
                )
                continue

            sandbox_dataset_id = sandbox_dataset_prefix + "_" + base_dataset_id

            if not bq_client.dataset_exists(
                bq_client.dataset_ref_for_id(sandbox_dataset_id)
            ):
                raise ValueError(
                    f"Trying to compare output to a sandbox dataset that does not exist: "
                    f"{bq_client.project_id}.{sandbox_dataset_id}"
                )

            base_dataset_ref = bq_client.dataset_ref_for_id(base_dataset_id)
            base_view_id = (
                view_builder.build().materialized_view_table_id
                if view_builder.should_materialize and not check_determinism
                else view_builder.view_id
            )

            if not base_view_id:
                raise ValueError(
                    "Unexpected empty base_view_id. view_id or materialized_view_table_id unset "
                    f"for {view_builder}."
                )

            if not check_determinism and not bq_client.table_exists(
                base_dataset_ref, base_view_id
            ):
                logging.warning(
                    "View %s.%s does not exist. Skipping output comparison.",
                    base_dataset_ref.dataset_id,
                    base_view_id,
                )
                skipped_views.append(f"{base_dataset_id}.{base_view_id}")
                continue

            if not bq_client.table_exists(
                bq_client.dataset_ref_for_id(sandbox_dataset_id), base_view_id
            ):
                logging.warning(
                    "View %s.%s does not exist in sandbox. Skipping output comparison.",
                    sandbox_dataset_id,
                    base_view_id,
                )
                skipped_views.append(f"{sandbox_dataset_id}.{base_view_id}")
                continue

            query_job, output_table_id = _view_output_comparison_job(
                bq_client,
                view_builder,
                base_view_id,
                base_dataset_id,
                sandbox_dataset_id,
                sandbox_comparison_output_dataset_id,
                check_determinism,
                allow_schema_changes,
            )

            # Add query job to the list of running jobs
            query_jobs.append((query_job, output_table_id))

    for query_job, output_table_id in query_jobs:
        # Wait for the insert job to complete before looking for the table
        query_job.result()

        output_table = bq_client.get_table(
            sandbox_comparison_output_dataset_ref, output_table_id
        )

        if output_table.num_rows == 0:
            # If there are no rows in the output table, then the view output was identical
            bq_client.delete_table(
                sandbox_comparison_output_dataset_id, output_table_id
            )

    views_with_different_output = peekable(
        bq_client.list_tables(sandbox_comparison_output_dataset_id)
    )

    logging.info("\n*************** METRIC VIEW OUTPUT RESULTS ***************\n")

    if dataset_id_filters:
        logging.info(
            "Only compared metric view output for the following datasets: \n %s \n",
            dataset_id_filters,
        )

    logging.info(
        "Skipped output comparison for the following metric views: \n %s \n",
        skipped_views,
    )

    if views_with_different_output:
        for view in views_with_different_output:
            base_dataset_id, base_view_id = view.table_id.split("--")

            logging.warning(
                "View output differs for view %s.%s. See %s.%s for diverging rows.",
                base_dataset_id,
                base_view_id,
                sandbox_comparison_output_dataset_id,
                view.table_id,
            )
    else:
        output_message = (
            "identical between deployed views and sandbox datasets"
            if not check_determinism
            else "deterministic"
        )
        logging.info(
            "View output %s. Deleting dataset %s.",
            output_message,
            sandbox_comparison_output_dataset_ref.dataset_id,
        )
        bq_client.delete_dataset(
            sandbox_comparison_output_dataset_ref, delete_contents=True
        )
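# Hedged note: the split("--") in the results loop above implies comparison
# tables are named "<base_dataset_id>--<base_view_id>" by
# _view_output_comparison_job, which is not shown in this excerpt. A minimal
# sketch of that assumed naming convention:
def _example_output_table_id(base_dataset_id: str, base_view_id: str) -> str:
    # Joins the dataset and view ids so they can be recovered later with
    # table_id.split("--").
    return f"{base_dataset_id}--{base_view_id}"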