def _hydrate_unioned_regional_dataset_for_schema(
    config: CloudSqlToBQConfig,
    bq_region_override: Optional[str],
    dataset_override_prefix: Optional[str],
) -> None:
    """Given a set of already hydrated single-state datasets, unions the contents
    and copies the results to a dataset that lives in the same region as the
    CloudSQL instance (e.g. us-east1). For example given these tables:

       us_xx_operations_regional
          direct_ingest_raw_file_metadata
          direct_ingest_ingest_file_metadata
       us_yy_operations_regional
          direct_ingest_raw_file_metadata
          direct_ingest_ingest_file_metadata

    ...we will create a single dataset (or overwrite what exists):

       operations_regional
          direct_ingest_raw_file_metadata  <-- has data from US_XX and US_YY
          direct_ingest_ingest_file_metadata  <-- has data from US_XX and US_YY

    Args:
        config: The refresh config for the schema being unioned. Must be a
            state-segmented refresh schema (raises otherwise).
        bq_region_override: If set, passed through to the BigQuery client and the
            view deploy so all work happens in that region.
        dataset_override_prefix: If set, all input/output dataset ids are
            prefixed (sandbox runs) instead of touching the live datasets.

    Raises:
        ValueError: If |config| is not a state-segmented refresh schema.
    """
    if not config.is_state_segmented_refresh_schema():
        raise ValueError(f"Unexpected schema_type [{config.schema_type}].")

    state_codes = get_existing_direct_ingest_states()

    # Per-state datasets that were refreshed in this run (their state is not in
    # the config's exclusion list), vs. datasets for excluded states whose table
    # schemas may be stale relative to the refreshed ones.
    refreshed_source_table_datasets = {
        config.materialized_dataset_for_segment(state_code)
        for state_code in state_codes
        if state_code.value not in config.region_codes_to_exclude
    }
    stale_schema_datasets = {
        config.materialized_dataset_for_segment(state_code)
        for state_code in state_codes
        if state_code.value in config.region_codes_to_exclude
    }
    # The union views below read from ALL per-state datasets, refreshed or not.
    source_table_datasets = refreshed_source_table_datasets | stale_schema_datasets

    if stale_schema_datasets and refreshed_source_table_datasets:
        # We need to make sure the schemas match those that are refreshed.
        #
        # DISCLAIMER: if a column were renamed in a Postgres migration, that migration
        # would not be properly reflected with this schema update - the data in the new
        # column would be wiped for the new schemas. This code is meant to handle pure
        # column/table additions and deletions.
        #
        # Any refreshed dataset can serve as the schema reference; sets are
        # unordered so next(iter(...)) just picks one arbitrarily.
        reference_dataset_id = next(iter(refreshed_source_table_datasets))
        if dataset_override_prefix:
            # In a sandbox run, both the reference and the stale datasets live
            # under the prefixed ids, so rewrite them before the schema sync.
            reference_dataset_id = f"{dataset_override_prefix}_{reference_dataset_id}"
            stale_schema_datasets = {
                f"{dataset_override_prefix}_{dataset_id}"
                for dataset_id in stale_schema_datasets
            }
        bq_client = BigQueryClientImpl(region_override=bq_region_override)
        bq_client.update_datasets_to_match_reference_schema(
            reference_dataset_id, list(stale_schema_datasets)
        )

    # One union view per exported table, each unioning that table across all
    # state segments.
    view_builders = [
        UnionedStateSegmentsViewBuilder(config=config, table=t, state_codes=state_codes)
        for t in config.get_tables_to_export()
    ]
    dataset_overrides = None
    if dataset_override_prefix:
        dataset_overrides = dataset_overrides_for_view_builders(
            view_dataset_override_prefix=dataset_override_prefix,
            view_builders=view_builders,
        )
        # Also redirect the source (input) datasets to their prefixed sandbox
        # counterparts, not just the view output datasets.
        for dataset in source_table_datasets:
            dataset_overrides[dataset] = f"{dataset_override_prefix}_{dataset}"
    # force_materialize=True so the unioned output tables are (re)written even
    # if the view definitions themselves are unchanged.
    create_managed_dataset_and_deploy_views_for_view_builders(
        view_source_table_datasets=source_table_datasets,
        view_builders_to_update=view_builders,
        dataset_overrides=dataset_overrides,
        bq_region_override=bq_region_override,
        force_materialize=True,
    )
def export_all_tables(cloud_sql_to_bq_config: CloudSqlToBQConfig) -> None:
    """Runs export_table() for every table the given refresh config exports."""
    for export_target in cloud_sql_to_bq_config.get_tables_to_export():
        export_table(export_target.name, cloud_sql_to_bq_config)