def main(*, source_project_id, source_dataset_id, destination_project_id, destination_dataset_id):
    """Copies all views from the source_project_id.source_dataset_id to the
    destination_project_id.destination_dataset_id.

    Args:
        source_project_id: Project containing the dataset to copy views from.
        source_dataset_id: Dataset whose views should be replicated.
        destination_project_id: Project the views are copied into.
        destination_dataset_id: Dataset the views are copied into.
    """
    # Construct a BigQuery client with the source_project_id
    source_client = BigQueryClientImpl(project_id=source_project_id)
    # Construct a BigQuery client with the destination_project_id
    destination_client = BigQueryClientImpl(project_id=destination_project_id)
    destination_dataset = bigquery.DatasetReference(destination_project_id,
                                                    destination_dataset_id)

    tables_in_source_dataset = source_client.list_tables(source_dataset_id)

    for table_ref in tables_in_source_dataset:
        table = source_client.get_table(
            source_client.dataset_ref_for_id(table_ref.dataset_id), table_ref.table_id)
        view_query = table.view_query

        # Only copy this view if there is a view_query to replicate and the view doesn't
        # already exist in the destination dataset
        if view_query and not destination_client.table_exists(
                destination_dataset, table_id=table.table_id):
            # Template out references to the source project so the copied view queries
            # its own project instead of pointing back at the source project. This
            # mirrors copy_bq_views elsewhere in this file.
            updated_view_query = view_query.replace(source_project_id, "{project_id}")

            # Retrieve all of the information about the view and copy it over
            source_client.copy_view(
                view=BigQueryView(
                    project_id=destination_project_id,
                    dataset_id=destination_dataset_id,
                    view_id=table.table_id,
                    description=table.description,
                    view_query_template=updated_view_query,
                ),
                destination_client=destination_client,
                destination_dataset_ref=destination_dataset,
            )
def load_from_temp_to_permanent_table(bq_client: BigQueryClientImpl, project_id: str) -> None:
    """Appends the deduplicated contents of the temporary table to the permanent
    destination table, then drops the temporary table."""
    # Snapshot the permanent table's row count before the load so we can report
    # how many new rows the insert contributed.
    destination_ref = bigquery.DatasetReference(
        project=project_id,
        dataset_id=DATASET_ID,
    )
    rows_before_insert = bq_client.get_table(
        dataset_ref=destination_ref,
        table_id=FINAL_DESTINATION_TABLE,
    ).num_rows

    insert_query = INSERT_QUERY_TEMPLATE.format(
        project_id=project_id,
        dataset_id=DATASET_ID,
        temp_table=TEMP_DESTINATION_TABLE,
        final_table=FINAL_DESTINATION_TABLE,
    )
    append_job = bq_client.insert_into_table_from_query(
        destination_dataset_id=DATASET_ID,
        destination_table_id=FINAL_DESTINATION_TABLE,
        query=insert_query,
        write_disposition=WriteDisposition.WRITE_APPEND,
    )
    # Block until the append completes.
    append_result = append_job.result()

    # NOTE(review): this assumes the job result's total_rows reflects the final
    # table size rather than the number of rows the query produced — confirm
    # against insert_into_table_from_query's contract.
    logging.info(
        "Loaded [%d] non-duplicate rows into table [%s]",
        (append_result.total_rows - rows_before_insert),
        FINAL_DESTINATION_TABLE,
    )

    # The staging data has been persisted; clean up the temp table.
    bq_client.delete_table(dataset_id=DATASET_ID, table_id=TEMP_DESTINATION_TABLE)
def columns(self) -> List[str]:
    """Returns the column names of this table, fetching and caching the BigQuery
    schema on first access."""
    if self._columns is None:
        # Lazily resolve the schema; subsequent calls hit the cache.
        client = BigQueryClientImpl()
        table = client.get_table(
            client.dataset_ref_for_id(self.dataset_id), self.table_id)
        self._columns = [field.name for field in table.schema]
    return self._columns
def copy_bq_views(
    source_project_id: str,
    source_dataset_id: str,
    destination_project_id: str,
    destination_dataset_id: str,
) -> None:
    """Copies all views from the source_project_id.source_dataset_id to the
    destination_project_id.destination_dataset_id, skipping tables that are not
    views and views that already exist at the destination."""
    # One client per project: reads go through the source project, writes
    # through the destination project.
    reader = BigQueryClientImpl(project_id=source_project_id)
    writer = BigQueryClientImpl(project_id=destination_project_id)
    target_dataset = bigquery.DatasetReference(destination_project_id,
                                               destination_dataset_id)

    for listed_table in reader.list_tables(source_dataset_id):
        source_table = reader.get_table(
            reader.dataset_ref_for_id(listed_table.dataset_id),
            listed_table.table_id)
        query_text = source_table.view_query

        # Skip plain tables — only views carry a view_query to replicate.
        if not query_text:
            continue
        # Never overwrite a view that already exists in the destination dataset.
        if writer.table_exists(target_dataset, table_id=source_table.table_id):
            continue

        # Template out references to the source project so the copied view
        # resolves against its own project.
        templated_query = query_text.replace(source_project_id, "{project_id}")

        reader.copy_view(
            view=BigQueryView(
                project_id=destination_project_id,
                dataset_id=destination_dataset_id,
                view_id=source_table.table_id,
                description=source_table.description,
                view_query_template=templated_query,
            ),
            destination_client=writer,
            destination_dataset_ref=target_dataset,
        )
def compare_dataflow_output_to_sandbox(
    sandbox_dataset_prefix: str,
    job_name_to_compare: str,
    base_output_job_id: str,
    sandbox_output_job_id: str,
    additional_columns_to_compare: List[str],
    allow_overwrite: bool = False,
) -> None:
    """Compares the output for all metrics produced by the daily pipeline job with the
    given |job_name_to_compare| between the output from the |base_output_job_id| job in
    the dataflow_metrics dataset and the output from the |sandbox_output_job_id| job in
    the sandbox dataflow dataset.

    Results are written to one table per metric in a
    "<sandbox_dataset_prefix>_dataflow_comparison_output" dataset; tables that end up
    empty (identical output) are deleted, and if every table is empty the whole output
    dataset is deleted.

    Raises:
        ValueError: if the output dataset is non-empty and overwriting is not allowed
            (when not run as a script), or if the compared pipeline is missing its
            metric_types attribute.
    """
    bq_client = BigQueryClientImpl()

    sandbox_dataflow_dataset_id = (sandbox_dataset_prefix + "_" +
                                   DATAFLOW_METRICS_DATASET)

    sandbox_comparison_output_dataset_id = (sandbox_dataset_prefix +
                                            "_dataflow_comparison_output")
    sandbox_comparison_output_dataset_ref = bq_client.dataset_ref_for_id(
        sandbox_comparison_output_dataset_id)

    # Guard against silently clobbering a previous comparison run's output.
    if bq_client.dataset_exists(sandbox_comparison_output_dataset_ref) and any(
            bq_client.list_tables(sandbox_comparison_output_dataset_id)):
        if not allow_overwrite:
            if __name__ == "__main__":
                # Running as a script: log and exit instead of raising a traceback.
                logging.error(
                    "Dataset %s already exists in project %s. To overwrite, set --allow_overwrite.",
                    sandbox_comparison_output_dataset_id,
                    bq_client.project_id,
                )
                sys.exit(1)
            else:
                raise ValueError(
                    f"Cannot write comparison output to a non-empty dataset. Please delete tables in dataset: "
                    f"{bq_client.project_id}.{sandbox_comparison_output_dataset_id}."
                )
        else:
            # Clean up the existing tables in the dataset
            for table in bq_client.list_tables(
                    sandbox_comparison_output_dataset_id):
                bq_client.delete_table(table.dataset_id, table.table_id)

    bq_client.create_dataset_if_necessary(
        sandbox_comparison_output_dataset_ref,
        TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS)

    # Comparison queries are launched asynchronously; collect (job, table_id)
    # pairs so we can wait on them all afterwards.
    query_jobs: List[Tuple[QueryJob, str]] = []

    pipelines = YAMLDict.from_path(PRODUCTION_TEMPLATES_PATH).pop_dicts(
        "daily_pipelines")

    for pipeline in pipelines:
        if pipeline.pop("job_name", str) == job_name_to_compare:
            pipeline_metric_types = pipeline.peek_optional("metric_types", str)

            if not pipeline_metric_types:
                raise ValueError(
                    f"Pipeline job {job_name_to_compare} missing required metric_types attribute."
                )

            metric_types_for_comparison = pipeline_metric_types.split()

            # Launch one comparison query per metric table this pipeline produces.
            for metric_class, metric_table in DATAFLOW_METRICS_TO_TABLES.items(
            ):
                metric_type_value = DATAFLOW_TABLES_TO_METRIC_TYPES[
                    metric_table].value

                if metric_type_value in metric_types_for_comparison:
                    comparison_query = _query_for_metric_comparison(
                        bq_client,
                        base_output_job_id,
                        sandbox_output_job_id,
                        sandbox_dataflow_dataset_id,
                        metric_class,
                        metric_table,
                        additional_columns_to_compare,
                    )

                    query_job = bq_client.create_table_from_query_async(
                        dataset_id=sandbox_comparison_output_dataset_id,
                        table_id=metric_table,
                        query=comparison_query,
                        overwrite=True,
                    )

                    # Add query job to the list of running jobs
                    query_jobs.append((query_job, metric_table))

    for query_job, output_table_id in query_jobs:
        # Wait for the insert job to complete before looking for the table
        query_job.result()

        output_table = bq_client.get_table(
            sandbox_comparison_output_dataset_ref, output_table_id)

        if output_table.num_rows == 0:
            # If there are no rows in the output table, then the output was identical
            bq_client.delete_table(sandbox_comparison_output_dataset_id,
                                   output_table_id)

    # peekable lets us test "any tables left?" without consuming the iterator.
    metrics_with_different_output = peekable(
        bq_client.list_tables(sandbox_comparison_output_dataset_id))

    logging.info(
        "\n*************** DATAFLOW OUTPUT COMPARISON RESULTS ***************\n"
    )

    if metrics_with_different_output:
        for metric_table in metrics_with_different_output:
            # This will always be true, and is here to silence mypy warnings
            assert isinstance(metric_table, bigquery.table.TableListItem)

            logging.warning(
                "Dataflow output differs for metric %s. See %s.%s for diverging rows.",
                metric_table.table_id,
                sandbox_comparison_output_dataset_id,
                metric_table.table_id,
            )
    else:
        logging.info(
            "Dataflow output identical. Deleting dataset %s.",
            sandbox_comparison_output_dataset_ref.dataset_id,
        )
        bq_client.delete_dataset(sandbox_comparison_output_dataset_ref,
                                 delete_contents=True)
def _view_output_comparison_job(
    bq_client: BigQueryClientImpl,
    view_builder: MetricBigQueryViewBuilder,
    base_view_id: str,
    base_dataset_id: str,
    sandbox_dataset_id: str,
    sandbox_comparison_output_dataset_id: str,
    check_determinism: bool,
    allow_schema_changes: bool,
) -> Tuple[bigquery.QueryJob, str]:
    """Builds and executes the query that compares the base and sandbox views.

    Args:
        bq_client: Client used to read schemas and launch the comparison query.
        view_builder: Builder for the metric view being compared.
        base_view_id: View (or materialized table) id on the base side.
        base_dataset_id: Dataset containing the deployed view.
        sandbox_dataset_id: Dataset containing the sandbox copy of the view.
        sandbox_comparison_output_dataset_id: Dataset the diff rows are written to.
        check_determinism: When True, compares the sandbox view against itself
            (all columns) to verify deterministic output.
        allow_schema_changes: When True, only columns present in both views are
            compared; otherwise mismatched schemas raise.

    Returns a tuple with the QueryJob and the table_id where the output will be
    written to in the sandbox_comparison_output_dataset_id dataset.

    Raises:
        ValueError: if the deployed and sandbox schemas differ and
            allow_schema_changes is False.
    """
    base_dataset_ref = bq_client.dataset_ref_for_id(base_dataset_id)
    sandbox_dataset_ref = bq_client.dataset_ref_for_id(sandbox_dataset_id)

    output_table_id = f"{view_builder.dataset_id}--{base_view_id}"

    if check_determinism:
        # Compare all columns
        columns_to_compare = ["*"]
        preserve_column_types = True
    else:
        # Columns in deployed view
        deployed_base_view = bq_client.get_table(base_dataset_ref,
                                                 view_builder.view_id)
        # If there are nested columns in the deployed view then we can't allow column type changes
        preserve_column_types = _table_contains_nested_columns(
            deployed_base_view)
        base_columns_to_compare = set(field.name
                                      for field in deployed_base_view.schema)

        # Columns in sandbox view
        deployed_sandbox_view = bq_client.get_table(sandbox_dataset_ref,
                                                    view_builder.view_id)
        if not preserve_column_types:
            # If there are nested columns in the sandbox view then we can't allow column type changes
            preserve_column_types = _table_contains_nested_columns(
                deployed_sandbox_view)
        sandbox_columns_to_compare = set(
            field.name for field in deployed_sandbox_view.schema)

        if allow_schema_changes:
            # Only compare columns in both views
            shared_columns = base_columns_to_compare.intersection(
                sandbox_columns_to_compare)
            columns_to_compare = list(shared_columns)
        else:
            if base_columns_to_compare != sandbox_columns_to_compare:
                # Fixed: the original adjacent literals concatenated to
                # "...run againwith the..." (missing space).
                raise ValueError(
                    f"Schemas of the {base_dataset_id}.{base_view_id} deployed and"
                    f" sandbox views do not match. If this is expected, please run again"
                    f" with the --allow_schema_changes flag.")
            columns_to_compare = list(base_columns_to_compare)

    # Only include dimensions in both views unless we are checking the determinism of the local
    # view
    metric_dimensions = [
        dimension for dimension in view_builder.dimensions
        if dimension in columns_to_compare or check_determinism
    ]

    if not preserve_column_types:
        # Cast all columns to strings to guard against column types that may have changed
        columns_to_compare = [
            f"CAST({col} AS STRING) as {col}" for col in columns_to_compare
        ]

    # When checking determinism, both sides of the diff read from the sandbox dataset.
    base_dataset_id_for_query = (sandbox_dataset_id if check_determinism else
                                 view_builder.dataset_id)

    diff_query = OUTPUT_COMPARISON_TEMPLATE.format(
        project_id=bq_client.project_id,
        base_dataset_id=base_dataset_id_for_query,
        sandbox_dataset_id=sandbox_dataset_id,
        view_id=base_view_id,
        columns_to_compare=", ".join(columns_to_compare),
        dimensions=", ".join(metric_dimensions),
    )

    return (
        bq_client.create_table_from_query_async(
            dataset_id=sandbox_comparison_output_dataset_id,
            table_id=output_table_id,
            query=diff_query,
            overwrite=True,
        ),
        output_table_id,
    )
def compare_metric_view_output_to_sandbox(
    sandbox_dataset_prefix: str,
    load_sandbox_views: bool,
    check_determinism: bool,
    allow_schema_changes: bool,
    dataset_id_filters: Optional[List[str]],
) -> None:
    """Compares the output of all deployed metric views to the output of the
    corresponding views in the sandbox dataset.

    Diff rows for each view are written to a
    "<sandbox_dataset_prefix>_metric_view_comparison_output" dataset; empty diff
    tables are deleted, and the whole output dataset is deleted when all outputs
    match.

    Raises:
        ValueError: if the comparison output dataset is non-empty, if a required
            sandbox dataset does not exist, or if a view's base_view_id cannot be
            determined.
    """
    if load_sandbox_views:
        logging.info(
            "Loading views into sandbox datasets prefixed with %s",
            sandbox_dataset_prefix,
        )
        load_views_to_sandbox(sandbox_dataset_prefix)

    bq_client = BigQueryClientImpl()
    sandbox_comparison_output_dataset_id = (sandbox_dataset_prefix +
                                            "_metric_view_comparison_output")
    sandbox_comparison_output_dataset_ref = bq_client.dataset_ref_for_id(
        sandbox_comparison_output_dataset_id)

    # Refuse to clobber a previous comparison run's output.
    if bq_client.dataset_exists(sandbox_comparison_output_dataset_ref) and any(
            bq_client.list_tables(sandbox_comparison_output_dataset_id)):
        raise ValueError(
            f"Cannot write comparison output to a non-empty dataset. Please delete tables in dataset: "
            f"{bq_client.project_id}.{sandbox_comparison_output_dataset_id}.")

    bq_client.create_dataset_if_necessary(
        sandbox_comparison_output_dataset_ref,
        TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS)

    # Comparison queries run asynchronously; collect (job, table_id) pairs to
    # wait on afterwards. Views skipped for any reason are reported at the end.
    query_jobs: List[Tuple[QueryJob, str]] = []
    skipped_views: List[str] = []

    for view_builders in VIEW_BUILDERS_BY_NAMESPACE.values():
        for view_builder in view_builders:
            # Only compare output of metric views
            if not isinstance(view_builder, MetricBigQueryViewBuilder):
                continue

            base_dataset_id = view_builder.dataset_id

            if dataset_id_filters and base_dataset_id not in dataset_id_filters:
                continue

            if view_builder in VIEW_BUILDERS_WITH_KNOWN_NOT_DETERMINISTIC_OUTPUT:
                logging.warning(
                    "View %s.%s has known non-deterministic output. Skipping output comparison.",
                    view_builder.dataset_id,
                    view_builder.view_id,
                )
                skipped_views.append(
                    f"{view_builder.dataset_id}.{view_builder.view_id}")
                continue

            sandbox_dataset_id = sandbox_dataset_prefix + "_" + base_dataset_id

            if not bq_client.dataset_exists(
                    bq_client.dataset_ref_for_id(sandbox_dataset_id)):
                raise ValueError(
                    f"Trying to compare output to a sandbox dataset that does not exist: "
                    f"{bq_client.project_id}.{sandbox_dataset_id}")

            base_dataset_ref = bq_client.dataset_ref_for_id(base_dataset_id)

            # Compare against the materialized table when the view materializes,
            # except in determinism mode where the view itself is used.
            base_view_id = (view_builder.build().materialized_view_table_id
                            if view_builder.should_materialize
                            and not check_determinism else view_builder.view_id)

            if not base_view_id:
                # NOTE(review): the adjacent literals below concatenate to
                # "...unsetfor {view_builder}." — missing space in the message.
                raise ValueError(
                    "Unexpected empty base_view_id. view_id or materialized_view_table_id unset"
                    f"for {view_builder}.")

            if not check_determinism and not bq_client.table_exists(
                    base_dataset_ref, base_view_id):
                logging.warning(
                    "View %s.%s does not exist. Skipping output comparison.",
                    base_dataset_ref.dataset_id,
                    base_view_id,
                )
                skipped_views.append(f"{base_dataset_id}.{base_view_id}")
                continue

            if not bq_client.table_exists(
                    bq_client.dataset_ref_for_id(sandbox_dataset_id),
                    base_view_id):
                logging.warning(
                    "View %s.%s does not exist in sandbox. Skipping output comparison.",
                    sandbox_dataset_id,
                    base_view_id,
                )
                skipped_views.append(f"{sandbox_dataset_id}.{base_view_id}")
                continue

            query_job, output_table_id = _view_output_comparison_job(
                bq_client,
                view_builder,
                base_view_id,
                base_dataset_id,
                sandbox_dataset_id,
                sandbox_comparison_output_dataset_id,
                check_determinism,
                allow_schema_changes,
            )

            # Add query job to the list of running jobs
            query_jobs.append((query_job, output_table_id))

    for query_job, output_table_id in query_jobs:
        # Wait for the insert job to complete before looking for the table
        query_job.result()

        output_table = bq_client.get_table(
            sandbox_comparison_output_dataset_ref, output_table_id)

        if output_table.num_rows == 0:
            # If there are no rows in the output table, then the view output was identical
            bq_client.delete_table(sandbox_comparison_output_dataset_id,
                                   output_table_id)

    # peekable lets us test "any diff tables left?" without consuming the iterator.
    views_with_different_output = bq_client.list_tables(
        sandbox_comparison_output_dataset_id)
    views_with_different_output = peekable(views_with_different_output)

    logging.info(
        "\n*************** METRIC VIEW OUTPUT RESULTS ***************\n")

    if dataset_id_filters:
        logging.info(
            "Only compared metric view output for the following datasets: \n %s \n",
            dataset_id_filters,
        )

    logging.info(
        "Skipped output comparison for the following metric views: \n %s \n",
        skipped_views,
    )

    if views_with_different_output:
        for view in views_with_different_output:
            # Diff table ids are formatted as "<dataset>--<view>" by
            # _view_output_comparison_job.
            base_dataset_id, base_view_id = view.table_id.split("--")

            logging.warning(
                "View output differs for view %s.%s. See %s.%s for diverging rows.",
                base_dataset_id,
                base_view_id,
                sandbox_comparison_output_dataset_id,
                view.table_id,
            )
    else:
        output_message = (
            "identical between deployed views and sandbox datasets"
            if not check_determinism else "deterministic")

        logging.info(
            "View output %s. Deleting dataset %s.",
            output_message,
            sandbox_comparison_output_dataset_ref.dataset_id,
        )
        bq_client.delete_dataset(sandbox_comparison_output_dataset_ref,
                                 delete_contents=True)