def main(*, source_project_id, source_dataset_id, destination_project_id, destination_dataset_id): """Copies all views from the source_project_id.source_dataset_id to the destination_project_id.destination_dataset_id.""" # Construct a BigQuery client with the source_project_id source_client = BigQueryClientImpl(project_id=source_project_id) # Construct a BigQuery client with the destination_project_id destination_client = BigQueryClientImpl(project_id=destination_project_id) destination_dataset = bigquery.DatasetReference(destination_project_id, destination_dataset_id) tables_in_source_dataset = source_client.list_tables(source_dataset_id) for table_ref in tables_in_source_dataset: table = source_client.get_table( source_client.dataset_ref_for_id(table_ref.dataset_id), table_ref.table_id) # Only copy this view if there is a view_query to replicate and the view doesn't already exist in the # destination dataset if table.view_query and not destination_client.table_exists( destination_dataset, table_id=table.table_id): # Retrieve all of the information about the view source_client.copy_view( view=BigQueryView(dataset_id=table_ref.dataset_id, view_id=table.table_id, view_query_template=table.view_query), destination_client=destination_client, destination_dataset_ref=destination_dataset)
def run_export(project_id: str, dry_run: bool, state_codes: List[str], target_dataset: str) -> None: """Performs the export operation, exporting rows for the given state codes from the tables from the state dataset in the given project to tables with the same names in the target dataset.""" big_query_client = BigQueryClientImpl() dataset_ref = big_query_client.dataset_ref_for_id(STATE_BASE_DATASET) if not big_query_client.dataset_exists(dataset_ref): raise ValueError(f'Dataset {dataset_ref.dataset_id} does not exist') tables = big_query_client.list_tables(dataset_ref.dataset_id) for table in tables: logging.info("******************************") export_query = state_table_export_query_str(table, state_codes) logging.info(export_query) if not export_query: continue if target_dataset: if dry_run: logging.info( "[DRY RUN] Exporting to target project.dataset.table [%s.%s.%s]", project_id, target_dataset, table.table_id) else: logging.info( "Exporting to target project.dataset.table [%s.%s.%s]", project_id, target_dataset, table.table_id) copy_table_to_dataset(target_dataset, table.table_id, export_query, big_query_client)
def move_old_dataflow_metrics_to_cold_storage(): """Moves old output in Dataflow metrics tables to tables in a cold storage dataset. We only keep the MAX_DAYS_IN_DATAFLOW_METRICS_TABLE days worth of data in a Dataflow metric table at once. All other output is moved to cold storage. """ bq_client = BigQueryClientImpl() dataflow_metrics_dataset = DATAFLOW_METRICS_DATASET cold_storage_dataset = DATAFLOW_METRICS_COLD_STORAGE_DATASET dataflow_metrics_tables = bq_client.list_tables(dataflow_metrics_dataset) for table_ref in dataflow_metrics_tables: table_id = table_ref.table_id filter_clause = """WHERE created_on NOT IN (SELECT DISTINCT created_on FROM `{project_id}.{dataflow_metrics_dataset}.{table_id}` ORDER BY created_on DESC LIMIT {day_count_limit})""".format( project_id=table_ref.project, dataflow_metrics_dataset=table_ref.dataset_id, table_id=table_ref.table_id, day_count_limit=MAX_DAYS_IN_DATAFLOW_METRICS_TABLE) cold_storage_dataset_ref = bq_client.dataset_ref_for_id( cold_storage_dataset) if bq_client.table_exists(cold_storage_dataset_ref, table_id): # Move data from the Dataflow metrics dataset into the cold storage dataset insert_job = bq_client.insert_into_table_from_table_async( source_dataset_id=dataflow_metrics_dataset, source_table_id=table_id, destination_dataset_id=cold_storage_dataset, destination_table_id=table_id, source_data_filter_clause=filter_clause, allow_field_additions=True) # Wait for the insert job to complete before running the delete job insert_job.result() else: # This table doesn't yet exist in cold storage. Create it. table_query = f"SELECT * FROM `{bq_client.project_id}.{dataflow_metrics_dataset}.{table_id}` " \ f"{filter_clause}" create_job = bq_client.create_table_from_query_async( cold_storage_dataset, table_id, table_query, query_parameters=[]) # Wait for the create job to complete before running the delete job create_job.result() # Delete that data from the Dataflow dataset delete_job = bq_client.delete_from_table_async( dataflow_metrics_dataset, table_ref.table_id, filter_clause) # Wait for the delete job to complete before moving on delete_job.result()
def run_export(dry_run: bool, state_code: str, target_bucket: str) -> None: """Performs the export operation, exporting rows for the given state codes from the tables from the state dataset in the given project to CSV files with the same names as the tables to the given GCS bucket.""" today = datetime.date.today() big_query_client = BigQueryClientImpl() dataset_ref = big_query_client.dataset_ref_for_id(STATE_BASE_DATASET) if not big_query_client.dataset_exists(dataset_ref): raise ValueError(f"Dataset {dataset_ref.dataset_id} does not exist") tables = big_query_client.list_tables(dataset_ref.dataset_id) export_configs = [] for table in tables: logging.info("******************************") export_query = state_table_export_query_str(table, [state_code.upper()]) logging.info(export_query) if not export_query: continue export_dir = gcs_export_directory(target_bucket, today, state_code.lower()) export_file_name = f"{table.table_id}_{today.isoformat()}_export.csv" file = GcsfsFilePath.from_directory_and_file_name( export_dir, export_file_name) output_uri = file.uri() export_config = ExportQueryConfig( query=export_query, query_parameters=[], intermediate_dataset_id="export_temporary_tables", intermediate_table_name= f"{dataset_ref.dataset_id}_{table.table_id}_{state_code.lower()}", output_uri=output_uri, output_format=bigquery.DestinationFormat.CSV, ) export_configs.append(export_config) if dry_run: logging.info( "[DRY RUN] Created export configuration to export table to GCS: %s", export_config, ) else: logging.info( "Created export configuration to export table to GCS: %s", export_config) if dry_run: logging.info("[DRY RUN] Exporting [%d] tables to GCS", len(export_configs)) else: logging.info("Exporting [%d] tables to GCS", len(export_configs)) big_query_client.export_query_results_to_cloud_storage( export_configs, print_header=True)
def _copy_regional_dataset_to_multi_region( config: CloudSqlToBQConfig, dataset_override_prefix: Optional[str]) -> None: """Copies the unioned regional dataset for a schema to the multi-region dataset that contains the same data. Backs up the multi-region dataset before performing the copy. This backup dataset will get cleaned up if the copy succeeds, but otherwise will stick around for 1 week before tables expire. """ bq_client = BigQueryClientImpl() source_dataset_id = config.unioned_regional_dataset( dataset_override_prefix) destination_dataset_id = config.unioned_multi_region_dataset( dataset_override_prefix) destination_dataset = bq_client.dataset_ref_for_id(destination_dataset_id) backup_dataset = bq_client.backup_dataset_tables_if_dataset_exists( destination_dataset_id) try: if bq_client.dataset_exists(destination_dataset): tables = bq_client.list_tables(destination_dataset_id) for table in tables: bq_client.delete_table(table.dataset_id, table.table_id) bq_client.create_dataset_if_necessary( destination_dataset, default_table_expiration_ms=TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS if dataset_override_prefix else None, ) # Copy into the canonical unioned source datasets in the US multi-region bq_client.copy_dataset_tables_across_regions( source_dataset_id=source_dataset_id, destination_dataset_id=destination_dataset_id, ) except Exception as e: logging.info( "Failed to flash [%s] to [%s] - contents backup can be found at [%s]", source_dataset_id, destination_dataset_id, backup_dataset.dataset_id if backup_dataset else "NO BACKUP", ) raise e if backup_dataset: bq_client.delete_dataset(backup_dataset, delete_contents=True, not_found_ok=True)
def copy_bq_views( source_project_id: str, source_dataset_id: str, destination_project_id: str, destination_dataset_id: str, ) -> None: """Copies all views from the source_project_id.source_dataset_id to the destination_project_id.destination_dataset_id.""" # Construct a BigQuery client with the source_project_id source_client = BigQueryClientImpl(project_id=source_project_id) # Construct a BigQuery client with the destination_project_id destination_client = BigQueryClientImpl(project_id=destination_project_id) destination_dataset = bigquery.DatasetReference(destination_project_id, destination_dataset_id) tables_in_source_dataset = source_client.list_tables(source_dataset_id) for table_ref in tables_in_source_dataset: table = source_client.get_table( source_client.dataset_ref_for_id(table_ref.dataset_id), table_ref.table_id) view_query = table.view_query # Only copy this view if there is a view_query to replicate and the view doesn't already exist in the # destination dataset if view_query and not destination_client.table_exists( destination_dataset, table_id=table.table_id): # Remove any references to the source_project_id from the view_query updated_view_query = view_query.replace(source_project_id, "{project_id}") # Retrieve all of the information about the view source_client.copy_view( view=BigQueryView( project_id=destination_project_id, dataset_id=destination_dataset_id, view_id=table.table_id, description=table.description, view_query_template=updated_view_query, ), destination_client=destination_client, destination_dataset_ref=destination_dataset, )
def _delete_empty_datasets() -> None: """Deletes all empty datasets in BigQuery.""" bq_client = BigQueryClientImpl() datasets = bq_client.list_datasets() for dataset_resource in datasets: dataset_ref = bq_client.dataset_ref_for_id(dataset_resource.dataset_id) dataset = bq_client.get_dataset(dataset_ref) tables = peekable(bq_client.list_tables(dataset.dataset_id)) created_time = dataset.created dataset_age_seconds = (datetime.datetime.now(datetime.timezone.utc) - created_time).total_seconds() if not tables and dataset_age_seconds > DATASET_DELETION_MIN_SECONDS: logging.info( "Dataset %s is empty and was not created very recently. Deleting...", dataset_ref.dataset_id, ) bq_client.delete_dataset(dataset_ref)
def compare_dataflow_output_to_sandbox( sandbox_dataset_prefix: str, job_name_to_compare: str, base_output_job_id: str, sandbox_output_job_id: str, additional_columns_to_compare: List[str], allow_overwrite: bool = False, ) -> None: """Compares the output for all metrics produced by the daily pipeline job with the given |job_name_to_compare| between the output from the |base_output_job_id| job in the dataflow_metrics dataset and the output from the |sandbox_output_job_id| job in the sandbox dataflow dataset.""" bq_client = BigQueryClientImpl() sandbox_dataflow_dataset_id = (sandbox_dataset_prefix + "_" + DATAFLOW_METRICS_DATASET) sandbox_comparison_output_dataset_id = (sandbox_dataset_prefix + "_dataflow_comparison_output") sandbox_comparison_output_dataset_ref = bq_client.dataset_ref_for_id( sandbox_comparison_output_dataset_id) if bq_client.dataset_exists(sandbox_comparison_output_dataset_ref) and any( bq_client.list_tables(sandbox_comparison_output_dataset_id)): if not allow_overwrite: if __name__ == "__main__": logging.error( "Dataset %s already exists in project %s. To overwrite, set --allow_overwrite.", sandbox_comparison_output_dataset_id, bq_client.project_id, ) sys.exit(1) else: raise ValueError( f"Cannot write comparison output to a non-empty dataset. Please delete tables in dataset: " f"{bq_client.project_id}.{sandbox_comparison_output_dataset_id}." ) else: # Clean up the existing tables in the dataset for table in bq_client.list_tables( sandbox_comparison_output_dataset_id): bq_client.delete_table(table.dataset_id, table.table_id) bq_client.create_dataset_if_necessary( sandbox_comparison_output_dataset_ref, TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS) query_jobs: List[Tuple[QueryJob, str]] = [] pipelines = YAMLDict.from_path(PRODUCTION_TEMPLATES_PATH).pop_dicts( "daily_pipelines") for pipeline in pipelines: if pipeline.pop("job_name", str) == job_name_to_compare: pipeline_metric_types = pipeline.peek_optional("metric_types", str) if not pipeline_metric_types: raise ValueError( f"Pipeline job {job_name_to_compare} missing required metric_types attribute." ) metric_types_for_comparison = pipeline_metric_types.split() for metric_class, metric_table in DATAFLOW_METRICS_TO_TABLES.items( ): metric_type_value = DATAFLOW_TABLES_TO_METRIC_TYPES[ metric_table].value if metric_type_value in metric_types_for_comparison: comparison_query = _query_for_metric_comparison( bq_client, base_output_job_id, sandbox_output_job_id, sandbox_dataflow_dataset_id, metric_class, metric_table, additional_columns_to_compare, ) query_job = bq_client.create_table_from_query_async( dataset_id=sandbox_comparison_output_dataset_id, table_id=metric_table, query=comparison_query, overwrite=True, ) # Add query job to the list of running jobs query_jobs.append((query_job, metric_table)) for query_job, output_table_id in query_jobs: # Wait for the insert job to complete before looking for the table query_job.result() output_table = bq_client.get_table( sandbox_comparison_output_dataset_ref, output_table_id) if output_table.num_rows == 0: # If there are no rows in the output table, then the output was identical bq_client.delete_table(sandbox_comparison_output_dataset_id, output_table_id) metrics_with_different_output = peekable( bq_client.list_tables(sandbox_comparison_output_dataset_id)) logging.info( "\n*************** DATAFLOW OUTPUT COMPARISON RESULTS ***************\n" ) if metrics_with_different_output: for metric_table in metrics_with_different_output: # This will always be true, and is here to silence mypy warnings assert isinstance(metric_table, bigquery.table.TableListItem) logging.warning( "Dataflow output differs for metric %s. See %s.%s for diverging rows.", metric_table.table_id, sandbox_comparison_output_dataset_id, metric_table.table_id, ) else: logging.info( "Dataflow output identical. Deleting dataset %s.", sandbox_comparison_output_dataset_ref.dataset_id, ) bq_client.delete_dataset(sandbox_comparison_output_dataset_ref, delete_contents=True)
def main(dry_run: bool) -> None: """Moves all metrics with a PERSON methodology or a metric_period_months value greater than 1 to cold storage.""" bq_client = BigQueryClientImpl() dataflow_metrics_dataset = DATAFLOW_METRICS_DATASET cold_storage_dataset = DATAFLOW_METRICS_COLD_STORAGE_DATASET dataflow_metrics_tables = bq_client.list_tables(dataflow_metrics_dataset) for table_ref in dataflow_metrics_tables: table_id = table_ref.table_id logging.info("Migrating data to cold storage for table [%s]", table_id) filter_clause = "WHERE methodology = 'PERSON'" # Every metric except this one has a metric_period_months column if table_id != DATAFLOW_METRICS_TO_TABLES[ ReincarcerationRecidivismRateMetric]: filter_clause += " OR metric_period_months != 1" # Query for rows to be moved to the cold storage table insert_query = """ SELECT * FROM `{project_id}.{dataflow_metrics_dataset}.{table_id}` {filter_clause} """.format( project_id=table_ref.project, dataflow_metrics_dataset=table_ref.dataset_id, table_id=table_id, filter_clause=filter_clause, ) if dry_run: logging.info( "[DRY RUN] Would insert rows into [%s].[%s] from [%s].[%s] that match this query: %s", cold_storage_dataset, table_id, dataflow_metrics_dataset, table_id, insert_query, ) else: # Move data from the Dataflow metrics dataset into the cold storage table, creating the table if necessary insert_job = bq_client.insert_into_table_from_query( destination_dataset_id=cold_storage_dataset, destination_table_id=table_id, query=insert_query, allow_field_additions=True, write_disposition=WriteDisposition.WRITE_APPEND, ) # Wait for the insert job to complete before running the delete job insert_job.result() if dry_run: logging.info( "[DRY RUN] Would delete rows from [%s].[%s] %s", dataflow_metrics_dataset, table_id, filter_clause, ) else: # Delete these rows from the Dataflow metrics table delete_job = bq_client.delete_from_table_async( dataflow_metrics_dataset, table_id, filter_clause=filter_clause) # Wait for the replace job to complete before moving on delete_job.result() logging.info("Done migrating data for table [%s]", table_id)
def move_old_dataflow_metrics_to_cold_storage() -> None: """Moves old output in Dataflow metrics tables to tables in a cold storage dataset. We only keep the MAX_DAYS_IN_DATAFLOW_METRICS_TABLE days worth of data in a Dataflow metric table at once. All other output is moved to cold storage. """ bq_client = BigQueryClientImpl() dataflow_metrics_dataset = DATAFLOW_METRICS_DATASET cold_storage_dataset = DATAFLOW_METRICS_COLD_STORAGE_DATASET dataflow_metrics_tables = bq_client.list_tables(dataflow_metrics_dataset) for table_ref in dataflow_metrics_tables: table_id = table_ref.table_id source_data_join_clause = """LEFT JOIN (SELECT DISTINCT job_id AS keep_job_id FROM `{project_id}.{reference_views_dataset}.most_recent_job_id_by_metric_and_state_code_materialized`) ON job_id = keep_job_id LEFT JOIN (SELECT DISTINCT created_on AS keep_created_date FROM `{project_id}.{dataflow_metrics_dataset}.{table_id}` ORDER BY created_on DESC LIMIT {day_count_limit}) ON created_on = keep_created_date """.format( project_id=table_ref.project, dataflow_metrics_dataset=table_ref.dataset_id, reference_views_dataset=REFERENCE_VIEWS_DATASET, table_id=table_id, day_count_limit=MAX_DAYS_IN_DATAFLOW_METRICS_TABLE ) # Exclude these columns leftover from the exclusion join from being added to the metric tables in cold storage columns_to_exclude_from_transfer = ['keep_job_id', 'keep_created_date'] # This filter will return the rows that should be moved to cold storage insert_filter_clause = "WHERE keep_job_id IS NULL AND keep_created_date IS NULL" # Query for rows to be moved to the cold storage table insert_query = """ SELECT * EXCEPT({columns_to_exclude}) FROM `{project_id}.{dataflow_metrics_dataset}.{table_id}` {source_data_join_clause} {insert_filter_clause} """.format( columns_to_exclude=', '.join(columns_to_exclude_from_transfer), project_id=table_ref.project, dataflow_metrics_dataset=table_ref.dataset_id, table_id=table_id, source_data_join_clause=source_data_join_clause, insert_filter_clause=insert_filter_clause ) # Move data from the Dataflow metrics dataset into the cold storage table, creating the table if necessary insert_job = bq_client.insert_into_table_from_query( destination_dataset_id=cold_storage_dataset, destination_table_id=table_id, query=insert_query, allow_field_additions=True, write_disposition=WriteDisposition.WRITE_APPEND) # Wait for the insert job to complete before running the replace job insert_job.result() # This will return the rows that were not moved to cold storage and should remain in the table replace_query = """ SELECT * EXCEPT({columns_to_exclude}) FROM `{project_id}.{dataflow_metrics_dataset}.{table_id}` {source_data_join_clause} WHERE keep_job_id IS NOT NULL OR keep_created_date IS NOT NULL """.format( columns_to_exclude=', '.join(columns_to_exclude_from_transfer), project_id=table_ref.project, dataflow_metrics_dataset=table_ref.dataset_id, table_id=table_id, source_data_join_clause=source_data_join_clause, ) # Replace the Dataflow table with only the rows that should remain replace_job = bq_client.create_table_from_query_async( dataflow_metrics_dataset, table_ref.table_id, query=replace_query, overwrite=True) # Wait for the replace job to complete before moving on replace_job.result()
def compare_metric_view_output_to_sandbox( sandbox_dataset_prefix: str, load_sandbox_views: bool, check_determinism: bool, allow_schema_changes: bool, dataset_id_filters: Optional[List[str]], ) -> None: """Compares the output of all deployed metric views to the output of the corresponding views in the sandbox dataset.""" if load_sandbox_views: logging.info( "Loading views into sandbox datasets prefixed with %s", sandbox_dataset_prefix, ) load_views_to_sandbox(sandbox_dataset_prefix) bq_client = BigQueryClientImpl() sandbox_comparison_output_dataset_id = (sandbox_dataset_prefix + "_metric_view_comparison_output") sandbox_comparison_output_dataset_ref = bq_client.dataset_ref_for_id( sandbox_comparison_output_dataset_id) if bq_client.dataset_exists(sandbox_comparison_output_dataset_ref) and any( bq_client.list_tables(sandbox_comparison_output_dataset_id)): raise ValueError( f"Cannot write comparison output to a non-empty dataset. Please delete tables in dataset: " f"{bq_client.project_id}.{sandbox_comparison_output_dataset_id}.") bq_client.create_dataset_if_necessary( sandbox_comparison_output_dataset_ref, TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS) query_jobs: List[Tuple[QueryJob, str]] = [] skipped_views: List[str] = [] for view_builders in VIEW_BUILDERS_BY_NAMESPACE.values(): for view_builder in view_builders: # Only compare output of metric views if not isinstance(view_builder, MetricBigQueryViewBuilder): continue base_dataset_id = view_builder.dataset_id if dataset_id_filters and base_dataset_id not in dataset_id_filters: continue if view_builder in VIEW_BUILDERS_WITH_KNOWN_NOT_DETERMINISTIC_OUTPUT: logging.warning( "View %s.%s has known non-deterministic output. Skipping output comparison.", view_builder.dataset_id, view_builder.view_id, ) skipped_views.append( f"{view_builder.dataset_id}.{view_builder.view_id}") continue sandbox_dataset_id = sandbox_dataset_prefix + "_" + base_dataset_id if not bq_client.dataset_exists( bq_client.dataset_ref_for_id(sandbox_dataset_id)): raise ValueError( f"Trying to compare output to a sandbox dataset that does not exist: " f"{bq_client.project_id}.{sandbox_dataset_id}") base_dataset_ref = bq_client.dataset_ref_for_id(base_dataset_id) base_view_id = (view_builder.build().materialized_view_table_id if view_builder.should_materialize and not check_determinism else view_builder.view_id) if not base_view_id: raise ValueError( "Unexpected empty base_view_id. view_id or materialized_view_table_id unset" f"for {view_builder}.") if not check_determinism and not bq_client.table_exists( base_dataset_ref, base_view_id): logging.warning( "View %s.%s does not exist. Skipping output comparison.", base_dataset_ref.dataset_id, base_view_id, ) skipped_views.append(f"{base_dataset_id}.{base_view_id}") continue if not bq_client.table_exists( bq_client.dataset_ref_for_id(sandbox_dataset_id), base_view_id): logging.warning( "View %s.%s does not exist in sandbox. Skipping output comparison.", sandbox_dataset_id, base_view_id, ) skipped_views.append(f"{sandbox_dataset_id}.{base_view_id}") continue query_job, output_table_id = _view_output_comparison_job( bq_client, view_builder, base_view_id, base_dataset_id, sandbox_dataset_id, sandbox_comparison_output_dataset_id, check_determinism, allow_schema_changes, ) # Add query job to the list of running jobs query_jobs.append((query_job, output_table_id)) for query_job, output_table_id in query_jobs: # Wait for the insert job to complete before looking for the table query_job.result() output_table = bq_client.get_table( sandbox_comparison_output_dataset_ref, output_table_id) if output_table.num_rows == 0: # If there are no rows in the output table, then the view output was identical bq_client.delete_table(sandbox_comparison_output_dataset_id, output_table_id) views_with_different_output = bq_client.list_tables( sandbox_comparison_output_dataset_id) views_with_different_output = peekable(views_with_different_output) logging.info( "\n*************** METRIC VIEW OUTPUT RESULTS ***************\n") if dataset_id_filters: logging.info( "Only compared metric view output for the following datasets: \n %s \n", dataset_id_filters, ) logging.info( "Skipped output comparison for the following metric views: \n %s \n", skipped_views, ) if views_with_different_output: for view in views_with_different_output: base_dataset_id, base_view_id = view.table_id.split("--") logging.warning( "View output differs for view %s.%s. See %s.%s for diverging rows.", base_dataset_id, base_view_id, sandbox_comparison_output_dataset_id, view.table_id, ) else: output_message = ( "identical between deployed views and sandbox datasets" if not check_determinism else "deterministic") logging.info( "View output %s. Deleting dataset %s.", output_message, sandbox_comparison_output_dataset_ref.dataset_id, ) bq_client.delete_dataset(sandbox_comparison_output_dataset_ref, delete_contents=True)
def move_old_dataflow_metrics_to_cold_storage() -> None: """Moves old output in Dataflow metrics tables to tables in a cold storage dataset. We only keep the MAX_DAYS_IN_DATAFLOW_METRICS_TABLE days worth of data in a Dataflow metric table at once. All other output is moved to cold storage, unless it is the most recent job_id for a metric in a state where that metric is regularly calculated, and where the year and month of the output falls into the window of what is regularly calculated for that metric and state. See the production_calculation_pipeline_templates.yaml file for a list of regularly scheduled calculations. If a metric has been entirely decommissioned, handles the deletion of the corresponding table. """ bq_client = BigQueryClientImpl() dataflow_metrics_dataset = DATAFLOW_METRICS_DATASET cold_storage_dataset = dataflow_config.DATAFLOW_METRICS_COLD_STORAGE_DATASET dataflow_metrics_tables = bq_client.list_tables(dataflow_metrics_dataset) month_range_for_metric_and_state = _get_month_range_for_metric_and_state() for table_ref in dataflow_metrics_tables: table_id = table_ref.table_id if table_id not in dataflow_config.DATAFLOW_TABLES_TO_METRIC_TYPES: # This metric has been deprecated. Handle the deletion of the table _decommission_dataflow_metric_table(bq_client, table_ref) continue is_unbounded_date_pipeline = any( pipeline in table_id for pipeline in dataflow_config.ALWAYS_UNBOUNDED_DATE_PIPELINES) # This means there are no currently scheduled pipelines writing metrics to # this table with specific month ranges no_active_month_range_pipelines = not month_range_for_metric_and_state[ table_id].items() if is_unbounded_date_pipeline or no_active_month_range_pipelines: source_data_join_clause = SOURCE_DATA_JOIN_CLAUSE_STANDARD_TEMPLATE.format( project_id=table_ref.project, dataflow_metrics_dataset=table_ref.dataset_id, materialized_metrics_dataset= DATAFLOW_METRICS_MATERIALIZED_DATASET, table_id=table_id, day_count_limit=dataflow_config. MAX_DAYS_IN_DATAFLOW_METRICS_TABLE, ) else: month_limit_by_state = "\nUNION ALL\n".join([ f"SELECT '{state_code}' as state_code, {month_limit} as month_limit" for state_code, month_limit in month_range_for_metric_and_state[table_id].items() ]) source_data_join_clause = ( SOURCE_DATA_JOIN_CLAUSE_WITH_MONTH_LIMIT_TEMPLATE.format( project_id=table_ref.project, dataflow_metrics_dataset=table_ref.dataset_id, materialized_metrics_dataset= DATAFLOW_METRICS_MATERIALIZED_DATASET, table_id=table_id, day_count_limit=dataflow_config. MAX_DAYS_IN_DATAFLOW_METRICS_TABLE, month_limit_by_state=month_limit_by_state, )) # Exclude these columns leftover from the exclusion join from being added to the metric tables in cold storage columns_to_exclude_from_transfer = ["keep_job_id", "keep_created_date"] # This filter will return the rows that should be moved to cold storage insert_filter_clause = "WHERE keep_job_id IS NULL AND keep_created_date IS NULL" # Query for rows to be moved to the cold storage table insert_query = """ SELECT * EXCEPT({columns_to_exclude}) FROM `{project_id}.{dataflow_metrics_dataset}.{table_id}` {source_data_join_clause} {insert_filter_clause} """.format( columns_to_exclude=", ".join(columns_to_exclude_from_transfer), project_id=table_ref.project, dataflow_metrics_dataset=table_ref.dataset_id, table_id=table_id, source_data_join_clause=source_data_join_clause, insert_filter_clause=insert_filter_clause, ) # Move data from the Dataflow metrics dataset into the cold storage table, creating the table if necessary insert_job = bq_client.insert_into_table_from_query( destination_dataset_id=cold_storage_dataset, destination_table_id=table_id, query=insert_query, allow_field_additions=True, write_disposition=WriteDisposition.WRITE_APPEND, ) # Wait for the insert job to complete before running the replace job insert_job.result() # This will return the rows that were not moved to cold storage and should remain in the table replace_query = """ SELECT * EXCEPT({columns_to_exclude}) FROM `{project_id}.{dataflow_metrics_dataset}.{table_id}` {source_data_join_clause} WHERE keep_job_id IS NOT NULL OR keep_created_date IS NOT NULL """.format( columns_to_exclude=", ".join(columns_to_exclude_from_transfer), project_id=table_ref.project, dataflow_metrics_dataset=table_ref.dataset_id, table_id=table_id, source_data_join_clause=source_data_join_clause, ) # Replace the Dataflow table with only the rows that should remain replace_job = bq_client.create_table_from_query_async( dataflow_metrics_dataset, table_ref.table_id, query=replace_query, overwrite=True, ) # Wait for the replace job to complete before moving on replace_job.result()