Example #1
def run_export(project_id: str, dry_run: bool, state_codes: List[str],
               target_dataset: str) -> None:
    """Performs the export operation, exporting rows for the given state codes from the tables from the state dataset
    in the given project to tables with the same names in the target dataset."""
    big_query_client = BigQueryClientImpl()
    dataset_ref = big_query_client.dataset_ref_for_id(STATE_BASE_DATASET)
    if not big_query_client.dataset_exists(dataset_ref):
        raise ValueError(f'Dataset {dataset_ref.dataset_id} does not exist')

    tables = big_query_client.list_tables(dataset_ref.dataset_id)

    for table in tables:
        logging.info("******************************")
        export_query = state_table_export_query_str(table, state_codes)
        logging.info(export_query)

        if not export_query:
            continue

        if target_dataset:
            if dry_run:
                logging.info(
                    "[DRY RUN] Exporting to target project.dataset.table [%s.%s.%s]",
                    project_id, target_dataset, table.table_id)
            else:
                logging.info(
                    "Exporting to target project.dataset.table [%s.%s.%s]",
                    project_id, target_dataset, table.table_id)
                copy_table_to_dataset(target_dataset, table.table_id,
                                      export_query, big_query_client)
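

# Illustrative invocation of run_export (all argument values here are hypothetical):
#   run_export(project_id="my-project", dry_run=True,
#              state_codes=["US_XX"], target_dataset="my_sandbox_state_dataset")
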
def load_from_temp_to_permanent_table(bq_client: BigQueryClientImpl,
                                      project_id: str) -> None:
    """Query temporary table and persist view to permanent table"""
    num_rows_before = bq_client.get_table(
        dataset_ref=bigquery.DatasetReference(
            project=project_id,
            dataset_id=DATASET_ID,
        ),
        table_id=FINAL_DESTINATION_TABLE,
    ).num_rows

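    # INSERT_QUERY_TEMPLATE is defined elsewhere in this module and is not shown
    # here. Given the WRITE_APPEND disposition below and the "non-duplicate rows"
    # log message, it is presumably a SELECT over the temp table that filters out
    # rows already present in the permanent table.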
    insert_job = bq_client.insert_into_table_from_query(
        destination_dataset_id=DATASET_ID,
        destination_table_id=FINAL_DESTINATION_TABLE,
        query=INSERT_QUERY_TEMPLATE.format(
            project_id=project_id,
            dataset_id=DATASET_ID,
            temp_table=TEMP_DESTINATION_TABLE,
            final_table=FINAL_DESTINATION_TABLE,
        ),
        write_disposition=WriteDisposition.WRITE_APPEND,
    )

    insert_job_result = insert_job.result()

    logging.info(
        "Loaded [%d] non-duplicate rows into table [%s]",
        (insert_job_result.total_rows - num_rows_before),
        FINAL_DESTINATION_TABLE,
    )

    bq_client.delete_table(dataset_id=DATASET_ID,
                           table_id=TEMP_DESTINATION_TABLE)
Example #3
def handle_bq_export_task():
    """Worker function to handle BQ export task requests.

    Form data must be a bytes-encoded JSON object with parameters listed below.

    URL Parameters:
        table_name: Table to export then import. Table must be defined
            in export_config.COUNTY_TABLES_TO_EXPORT.
        schema_type: The schema the table belongs to (JAILS or STATE).
    """
    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    table_name = data['table_name']
    schema_type_str = data['schema_type']

    bq_client = BigQueryClientImpl()
    if schema_type_str == SchemaType.JAILS.value:
        schema_type = SchemaType.JAILS
        dataset_ref = bq_client.dataset_ref_for_id(county_dataset_config.COUNTY_BASE_DATASET)
    elif schema_type_str == SchemaType.STATE.value:
        schema_type = SchemaType.STATE
        dataset_ref = bq_client.dataset_ref_for_id(state_dataset_config.STATE_BASE_DATASET)
    else:
        return '', HTTPStatus.INTERNAL_SERVER_ERROR

    logging.info("Starting BQ export task for table: %s", table_name)

    success = export_table_then_load_table(bq_client, table_name, dataset_ref, schema_type)

    return ('', HTTPStatus.OK if success else HTTPStatus.INTERNAL_SERVER_ERROR)
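
# Illustrative request payload for this handler (the exact string values of the
# SchemaType enum members are an assumption, not taken from this corpus):
#   {"table_name": "person", "schema_type": "JAILS"}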
Example #4
def materialize_views(view_builders_to_materialize: List[BigQueryViewBuilder]) -> None:
    """Materializes the views built from the given view builders into tables."""
    bq_client = BigQueryClientImpl()

    for view_builder in view_builders_to_materialize:
        view = view_builder.build()
        bq_client.materialize_view_to_table(view)
def _decommission_dataflow_metric_table(bq_client: BigQueryClientImpl,
                                        table_ref: TableListItem) -> None:
    """Decommissions a deprecated Dataflow metric table. Moves all remaining rows
    to cold storage and deletes the table in the DATAFLOW_METRICS_DATASET."""
    logging.info("Decommissioning Dataflow metric table: [%s]",
                 table_ref.table_id)

    dataflow_metrics_dataset = DATAFLOW_METRICS_DATASET
    cold_storage_dataset = dataflow_config.DATAFLOW_METRICS_COLD_STORAGE_DATASET
    table_id = table_ref.table_id

    # Move all rows in the table to cold storage
    insert_query = (
        """SELECT * FROM `{project_id}.{dataflow_metrics_dataset}.{table_id}`"""
        .format(
            project_id=table_ref.project,
            dataflow_metrics_dataset=dataflow_metrics_dataset,
            table_id=table_id,
        ))

    insert_job = bq_client.insert_into_table_from_query(
        destination_dataset_id=cold_storage_dataset,
        destination_table_id=table_id,
        query=insert_query,
        allow_field_additions=True,
        write_disposition=WriteDisposition.WRITE_APPEND,
    )

    # Wait for the insert job to complete before deleting the table
    insert_job.result()

    bq_client.delete_table(dataset_id=dataflow_metrics_dataset,
                           table_id=table_id)
def main(*, source_project_id: str, source_dataset_id: str, destination_project_id: str,
         destination_dataset_id: str) -> None:
    """Copies all views from the source_project_id.source_dataset_id to the
    destination_project_id.destination_dataset_id."""

    # Construct a BigQuery client with the source_project_id
    source_client = BigQueryClientImpl(project_id=source_project_id)

    # Construct a BigQuery client with the destination_project_id
    destination_client = BigQueryClientImpl(project_id=destination_project_id)

    destination_dataset = bigquery.DatasetReference(destination_project_id,
                                                    destination_dataset_id)

    tables_in_source_dataset = source_client.list_tables(source_dataset_id)

    for table_ref in tables_in_source_dataset:
        table = source_client.get_table(
            source_client.dataset_ref_for_id(table_ref.dataset_id),
            table_ref.table_id)

        # Only copy this view if there is a view_query to replicate and the view doesn't already exist in the
        # destination dataset
        if table.view_query and not destination_client.table_exists(
                destination_dataset, table_id=table.table_id):
            # Copy the view into the destination dataset
            source_client.copy_view(
                view=BigQueryView(dataset_id=table_ref.dataset_id,
                                  view_id=table.table_id,
                                  view_query_template=table.view_query),
                destination_client=destination_client,
                destination_dataset_ref=destination_dataset)
def update_dataflow_metric_tables_schemas(
    dataflow_metrics_dataset_id: str = DATAFLOW_METRICS_DATASET,
) -> None:
    """For each table that stores Dataflow metric output in the |dataflow_metrics_dataset_id|, ensures that all
    attributes on the corresponding metric are present in the table in BigQuery. If no |dataflow_metrics_dataset_id| is
    provided, defaults to the DATAFLOW_METRICS_DATASET."""
    bq_client = BigQueryClientImpl()
    dataflow_metrics_dataset_ref = bq_client.dataset_ref_for_id(
        dataflow_metrics_dataset_id
    )

    bq_client.create_dataset_if_necessary(dataflow_metrics_dataset_ref)

    for metric_class, table_id in DATAFLOW_METRICS_TO_TABLES.items():
        schema_for_metric_class = metric_class.bq_schema_for_metric_table()

        if bq_client.table_exists(dataflow_metrics_dataset_ref, table_id):
            # Compare the schema derived from the metric class to the existing table schema and update if necessary.
            bq_client.update_schema(
                dataflow_metrics_dataset_id, table_id, schema_for_metric_class
            )
        else:
            # Create a table with this schema
            bq_client.create_table_with_schema(
                dataflow_metrics_dataset_id, table_id, schema_for_metric_class
            )
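
# Illustrative calls (the sandbox dataset id below is hypothetical; compare the
# sandbox usage in create_or_update_dataflow_metrics_sandbox later in this corpus):
#   update_dataflow_metric_tables_schemas()  # defaults to DATAFLOW_METRICS_DATASET
#   update_dataflow_metric_tables_schemas(
#       dataflow_metrics_dataset_id="my_prefix_dataflow_metrics")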
Example #8
    def setUp(self) -> None:
        self.location = 'US'
        self.mock_project_id = 'fake-recidiviz-project'
        self.mock_dataset_id = 'fake-dataset'
        self.mock_table_id = 'test_table'
        self.mock_dataset_ref = bigquery.dataset.DatasetReference(
            self.mock_project_id, self.mock_dataset_id)
        self.mock_table = self.mock_dataset_ref.table(self.mock_table_id)

        self.metadata_patcher = mock.patch(
            'recidiviz.utils.metadata.project_id')
        self.mock_project_id_fn = self.metadata_patcher.start()
        self.mock_project_id_fn.return_value = self.mock_project_id

        self.client_patcher = mock.patch(
            'recidiviz.big_query.big_query_client.client')
        self.mock_client = self.client_patcher.start().return_value

        self.mock_view = BigQueryView(
            dataset_id='dataset',
            view_id='test_view',
            view_query_template='SELECT NULL LIMIT 0',
            should_materialize=True)

        self.bq_client = BigQueryClientImpl()
Example #9
    def setUp(self) -> None:
        self.location = "US"
        self.mock_project_id = "fake-recidiviz-project"
        self.mock_dataset_id = "fake-dataset"
        self.mock_table_id = "test_table"
        self.mock_dataset_ref = bigquery.dataset.DatasetReference(
            self.mock_project_id, self.mock_dataset_id
        )
        self.mock_table = self.mock_dataset_ref.table(self.mock_table_id)

        self.metadata_patcher = mock.patch("recidiviz.utils.metadata.project_id")
        self.mock_project_id_fn = self.metadata_patcher.start()
        self.mock_project_id_fn.return_value = self.mock_project_id

        self.client_patcher = mock.patch("recidiviz.big_query.big_query_client.client")
        self.mock_client = self.client_patcher.start().return_value

        self.mock_view = BigQueryView(
            dataset_id="dataset",
            view_id="test_view",
            view_query_template="SELECT NULL LIMIT 0",
            should_materialize=True,
        )

        self.bq_client = BigQueryClientImpl()
    def columns(self) -> List[str]:
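        # Lazily fetch the table schema from BigQuery the first time columns are
        # requested, then cache the result for subsequent calls.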
        if self._columns is None:
            bq_client = BigQueryClientImpl()
            t = bq_client.get_table(
                bq_client.dataset_ref_for_id(self.dataset_id), self.table_id)
            self._columns = [col.name for col in t.schema]

        return self._columns
Example #11
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        super().__init__(region_name, system_level)
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.max_delay_sec_between_files = max_delay_sec_between_files

        if not ingest_directory_path:
            ingest_directory_path = gcsfs_direct_ingest_directory_path_for_region(
                region_name, system_level)
        self.ingest_directory_path = GcsfsDirectoryPath.from_absolute_path(
            ingest_directory_path)

        if not storage_directory_path:
            storage_directory_path = (
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_name, system_level))

        self.storage_directory_path = GcsfsDirectoryPath.from_absolute_path(
            storage_directory_path)

        self.temp_output_directory_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_temporary_output_directory_path())

        ingest_job_file_type_filter = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled()
            else None)
        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self.get_file_tag_rank_list(),
            ingest_job_file_type_filter)

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code)

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl())

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()))
Example #12
    def __init__(self, ingest_bucket_path: GcsfsBucketPath) -> None:
        """Initialize the controller."""
        self.cloud_task_manager = DirectIngestCloudTaskManagerImpl()
        self.ingest_instance = DirectIngestInstance.for_ingest_bucket(
            ingest_bucket_path)
        self.region_lock_manager = DirectIngestRegionLockManager.for_direct_ingest(
            region_code=self.region.region_code,
            schema_type=self.system_level.schema_type(),
            ingest_instance=self.ingest_instance,
        )
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.ingest_bucket_path = ingest_bucket_path
        self.storage_directory_path = (
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code=self.region_code(),
                system_level=self.system_level,
                ingest_instance=self.ingest_instance,
            ))

        self.temp_output_directory_path = (
            gcsfs_direct_ingest_temporary_output_directory_path())

        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_bucket_path,
            self.get_file_tag_rank_list(),
        )

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code,
            ingest_database_name=self.ingest_database_key.db_name,
        )

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_bucket_path=self.ingest_bucket_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl(),
        )

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            output_bucket_name=self.ingest_bucket_path.bucket_name,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()),
            launched_file_tags=self.get_file_tag_rank_list(),
        )

        self.ingest_instance_status_manager = DirectIngestInstanceStatusManager(
            self.region_code(), self.ingest_instance)
Example #13
def _create_dataset_and_update_views(
        views_to_update: List[BigQueryView],
        set_temp_dataset_table_expiration: bool = False) -> None:
    """Create and update the given views and their parent datasets.

    For each dataset key in the given dictionary, creates the dataset if it does not exist, and creates or updates the
    underlying views mapped to that dataset.

    If a view has a set materialized_view_table_id field, materializes the view into a table.

    Args:
        views_to_update: A list of view objects to be created or updated.
    """
    new_dataset_table_expiration_ms = (TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS
                                       if set_temp_dataset_table_expiration
                                       else None)

    bq_client = BigQueryClientImpl()
    dataset_ids = set()
    for view in views_to_update:
        views_dataset_ref = bq_client.dataset_ref_for_id(view.dataset_id)
        if view.dataset_id not in dataset_ids:
            bq_client.create_dataset_if_necessary(
                views_dataset_ref, new_dataset_table_expiration_ms)
            dataset_ids.add(view.dataset_id)

        bq_client.create_or_update_view(views_dataset_ref, view)

        if view.materialized_view_table_id:
            bq_client.materialize_view_to_table(view)
Example #14
def refresh_bq_table() -> Tuple[str, int]:
    """Worker function to handle BQ export task requests.

    Form data must be a bytes-encoded JSON object with parameters listed below.

    URL Parameters:
        table_name: Table to export then import. Table must be defined
            in one of the base schema types.
        schema_type: The schema the table belongs to.
    """
    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    table_name = data['table_name']
    schema_type_str = data['schema_type']

    try:
        schema_type = SchemaType(schema_type_str)
    except ValueError:
        return (f'Unknown schema type [{schema_type_str}]',
                HTTPStatus.BAD_REQUEST)

    bq_client = BigQueryClientImpl()
    cloud_sql_to_bq_config = CloudSqlToBQConfig.for_schema_type(schema_type)

    logging.info("Starting BQ export task for table: %s", table_name)

    export_table_then_load_table(bq_client, table_name, cloud_sql_to_bq_config)
    return ('', HTTPStatus.OK)
def store_validation_results(
    validation_results: List[ValidationResultForStorage],
) -> None:
    if not environment.in_gcp():
        logging.info(
            "Skipping storing [%d] validation results in BigQuery.",
            len(validation_results),
        )
        return

    bq_client = BigQueryClientImpl()
    bq_client.insert_into_table(
        bq_client.dataset_ref_for_id(
            VALIDATION_RESULTS_BIGQUERY_ADDRESS.dataset_id),
        VALIDATION_RESULTS_BIGQUERY_ADDRESS.table_id,
        [result.to_serializable() for result in validation_results],
    )
Example #16
    def run_check(
        cls, validation_job: DataValidationJob[SamenessDataValidationCheck]
    ) -> DataValidationJobResult:
        comparison_columns = validation_job.validation.comparison_columns
        max_allowed_error = validation_job.validation.max_allowed_error

        query_job = BigQueryClientImpl().run_query_async(validation_job.query_str(), [])

        if (
            validation_job.validation.sameness_check_type
            == SamenessDataValidationCheckType.NUMBERS
        ):
            return SamenessValidationChecker.run_check_for_numbers(
                validation_job, comparison_columns, max_allowed_error, query_job
            )
        if (
            validation_job.validation.sameness_check_type
            == SamenessDataValidationCheckType.STRINGS
        ):
            return SamenessValidationChecker.run_check_for_strings(
                validation_job, comparison_columns, max_allowed_error, query_job
            )

        raise ValueError(
            f"Unexpected sameness_check_type of {validation_job.validation.sameness_check_type}."
        )
Example #17
def _create_dataset_and_deploy_views(
    views_to_update: List[BigQueryView], set_temp_dataset_table_expiration: bool = False
) -> None:
    """Create and update the given views and their parent datasets.

    For each view in the given list, creates its parent dataset if it does not exist, then creates or updates the
    view.

    If a view has a set materialized_view_table_id field, materializes the view into a table.

    Args:
        views_to_update: A list of view objects to be created or updated.
        set_temp_dataset_table_expiration: If True, new datasets will be created with an expiration of
            TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS.
    """

    bq_client = BigQueryClientImpl()
    _create_all_datasets_if_necessary(
        bq_client, views_to_update, set_temp_dataset_table_expiration
    )

    dag_walker = BigQueryViewDagWalker(views_to_update)

    def process_fn(v: BigQueryView, parent_results: Dict[BigQueryView, bool]) -> bool:
        """Returns True if this view or any of its parents were updated."""
        return _create_or_update_view_and_materialize_if_necessary(
            bq_client, v, parent_results
        )

    dag_walker.process_dag(process_fn)
Example #18
def get_delegate_export_map(
    gcsfs_client: GCSFileSystem,
    override_view_exporter: Optional[BigQueryViewExporter] = None,
) -> Dict[ExportOutputFormatType, BigQueryViewExporter]:
    """Builds the delegate_export_map, mapping the csv_exporter, json_exporter, and metric_exporter
    to the correct ExportOutputFormatType.
    """
    if override_view_exporter is None:
        bq_client = BigQueryClientImpl()

        # Some of our views intentionally export empty files (e.g. some of the ingest_metadata views),
        # so we just check for existence
        csv_exporter = CSVBigQueryViewExporter(
            bq_client, ExistsBigQueryViewExportValidator(gcsfs_client))
        json_exporter = JsonLinesBigQueryViewExporter(
            bq_client, ExistsBigQueryViewExportValidator(gcsfs_client))
        metric_exporter = OptimizedMetricBigQueryViewExporter(
            bq_client,
            OptimizedMetricBigQueryViewExportValidator(gcsfs_client))

        delegate_export_map = {
            ExportOutputFormatType.CSV: csv_exporter,
            ExportOutputFormatType.HEADERLESS_CSV: csv_exporter,
            ExportOutputFormatType.JSON: json_exporter,
            ExportOutputFormatType.METRIC: metric_exporter,
        }
    else:
        delegate_export_map = {
            ExportOutputFormatType.CSV: override_view_exporter,
            ExportOutputFormatType.HEADERLESS_CSV: override_view_exporter,
            ExportOutputFormatType.JSON: override_view_exporter,
            ExportOutputFormatType.METRIC: override_view_exporter,
        }
    return delegate_export_map
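

# Illustrative use of the map returned above (this dispatch is a sketch, not code
# from the original export pipeline):
#   gcsfs = GcsfsFactory.build()  # assumed to satisfy the GCSFileSystem interface
#   exporters = get_delegate_export_map(gcsfs)
#   metric_exporter = exporters[ExportOutputFormatType.METRIC]
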
def move_old_dataflow_metrics_to_cold_storage() -> None:
    """Moves old output in Dataflow metrics tables to tables in a cold storage dataset. We keep only the most recent
    MAX_DAYS_IN_DATAFLOW_METRICS_TABLE days' worth of data in a Dataflow metrics table at once. All other
    output is moved to cold storage.
    """
    bq_client = BigQueryClientImpl()
    dataflow_metrics_dataset = DATAFLOW_METRICS_DATASET
    cold_storage_dataset = DATAFLOW_METRICS_COLD_STORAGE_DATASET
    dataflow_metrics_tables = bq_client.list_tables(dataflow_metrics_dataset)

    for table_ref in dataflow_metrics_tables:
        table_id = table_ref.table_id

        filter_clause = """WHERE created_on NOT IN
                          (SELECT DISTINCT created_on FROM `{project_id}.{dataflow_metrics_dataset}.{table_id}` 
                          ORDER BY created_on DESC
                          LIMIT {day_count_limit})""".format(
            project_id=table_ref.project,
            dataflow_metrics_dataset=table_ref.dataset_id,
            table_id=table_ref.table_id,
            day_count_limit=MAX_DAYS_IN_DATAFLOW_METRICS_TABLE)
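
        # For example, with project "my-project", dataset "dataflow_metrics",
        # table "incarceration_metrics" and a limit of 2 (all values here are
        # illustrative), the rendered clause reads:
        #   WHERE created_on NOT IN
        #     (SELECT DISTINCT created_on FROM `my-project.dataflow_metrics.incarceration_metrics`
        #     ORDER BY created_on DESC
        #     LIMIT 2)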

        cold_storage_dataset_ref = bq_client.dataset_ref_for_id(
            cold_storage_dataset)

        if bq_client.table_exists(cold_storage_dataset_ref, table_id):
            # Move data from the Dataflow metrics dataset into the cold storage dataset
            insert_job = bq_client.insert_into_table_from_table_async(
                source_dataset_id=dataflow_metrics_dataset,
                source_table_id=table_id,
                destination_dataset_id=cold_storage_dataset,
                destination_table_id=table_id,
                source_data_filter_clause=filter_clause,
                allow_field_additions=True)

            # Wait for the insert job to complete before running the delete job
            insert_job.result()
        else:
            # This table doesn't yet exist in cold storage. Create it.
            table_query = f"SELECT * FROM `{bq_client.project_id}.{dataflow_metrics_dataset}.{table_id}` " \
                          f"{filter_clause}"

            create_job = bq_client.create_table_from_query_async(
                cold_storage_dataset,
                table_id,
                table_query,
                query_parameters=[])

            # Wait for the create job to complete before running the delete job
            create_job.result()

        # Delete that data from the Dataflow dataset
        delete_job = bq_client.delete_from_table_async(
            dataflow_metrics_dataset, table_ref.table_id, filter_clause)

        # Wait for the delete job to complete before moving on
        delete_job.result()
Example #20
def rematerialize_views_for_namespace(
    # TODO(#5785): Clarify use case of BigQueryViewNamespace filter (see ticket for more)
    bq_view_namespace: BigQueryViewNamespace,
    candidate_view_builders: Sequence[BigQueryViewBuilder],
    dataset_overrides: Optional[Dict[str, str]] = None,
    skip_missing_views: bool = False,
) -> None:
    """For all views in a given namespace, re-materializes any materialized views. This should be called only when we
    want to refresh the data in the materialized view, not when we want to update the underlying query of the view.
    """
    set_default_table_expiration_for_new_datasets = bool(dataset_overrides)
    if set_default_table_expiration_for_new_datasets:
        logging.info(
            "Found non-empty dataset overrides. New datasets created in this process will have a "
            "default table expiration of 24 hours."
        )

    try:
        views_to_update = _build_views_to_update(
            candidate_view_builders=candidate_view_builders,
            dataset_overrides=dataset_overrides,
        )

        bq_client = BigQueryClientImpl()
        _create_all_datasets_if_necessary(
            bq_client, views_to_update, set_default_table_expiration_for_new_datasets
        )

        dag_walker = BigQueryViewDagWalker(views_to_update)

        def _materialize_view(
            v: BigQueryView, _parent_results: Dict[BigQueryView, None]
        ) -> None:
            if not v.materialized_view_table_id:
                logging.info(
                    "Skipping non-materialized view [%s.%s].", v.dataset_id, v.view_id
                )
                return

            if skip_missing_views and not bq_client.table_exists(
                bq_client.dataset_ref_for_id(dataset_id=v.dataset_id), v.view_id
            ):
                logging.info(
                    "Skipping materialization of view [%s.%s] which does not exist",
                    v.dataset_id,
                    v.view_id,
                )
                return

            bq_client.materialize_view_to_table(v)

        dag_walker.process_dag(_materialize_view)

    except Exception as e:
        with monitoring.measurements(
            {monitoring.TagKey.CREATE_UPDATE_VIEWS_NAMESPACE: bq_view_namespace.value}
        ) as measurements:
            measurements.measure_int_put(m_failed_view_update, 1)
        raise e from e
def load_from_gcs_to_temp_table(bq_client: BigQueryClientImpl, project_id: str,
                                blob_name: str) -> None:
    """Upload raw data from GCS to temporary BQ table """
    bucket_name = f"{project_id}{BUCKET_SUFFIX}"
    load_job = bq_client.load_table_from_cloud_storage_async(
        source_uri=f"gs://{bucket_name}/"
        f"{blob_name}",
        destination_dataset_ref=bigquery.DatasetReference(
            project=project_id,
            dataset_id=DATASET_ID,
        ),
        destination_table_id=TEMP_DESTINATION_TABLE,
        destination_table_schema=[
            bigquery.SchemaField("processed", "TIMESTAMP", mode="NULLABLE"),
            bigquery.SchemaField("message_id", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("event", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("api_key_id", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("recv_message_id", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("credential_id", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("subject", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("from", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("email", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("asm_group_id", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("template_id", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("originating_ip", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("reason", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("outbound_ip", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("outbound_ip_type", "STRING",
                                 mode="NULLABLE"),
            bigquery.SchemaField("mx", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("url", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("attempt", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("user_agent", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("type", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("is_unique", "BOOLEAN", mode="NULLABLE"),
            bigquery.SchemaField("username", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("categories", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("marketing_campaign_id",
                                 "STRING",
                                 mode="NULLABLE"),
            bigquery.SchemaField("marketing_campaign_name",
                                 "STRING",
                                 mode="NULLABLE"),
            bigquery.SchemaField("marketing_campaign_split_id",
                                 "STRING",
                                 mode="NULLABLE"),
            bigquery.SchemaField("marketing_campaign_version",
                                 "STRING",
                                 mode="NULLABLE"),
            bigquery.SchemaField("unique_args", "STRING", mode="NULLABLE"),
        ],
        skip_leading_rows=1,
    )
    table_load_success = wait_for_table_load(bq_client, load_job)

    if not table_load_success:
        logging.info("Copy from cloud storage to temporary table failed")
        return
def update_dataflow_metric_tables_schemas() -> None:
    """For each table that stores Dataflow metric output, ensures that all attributes on the corresponding metric are
    present in the table in BigQuery."""
    bq_client = BigQueryClientImpl()
    dataflow_metrics_dataset_id = DATAFLOW_METRICS_DATASET
    dataflow_metrics_dataset_ref = bq_client.dataset_ref_for_id(dataflow_metrics_dataset_id)

    bq_client.create_dataset_if_necessary(dataflow_metrics_dataset_ref)

    for metric_class, table_id in DATAFLOW_METRICS_TO_TABLES.items():
        schema_for_metric_class = metric_class.bq_schema_for_metric_table()

        if bq_client.table_exists(dataflow_metrics_dataset_ref, table_id):
            # Add any missing fields to the table's schema
            bq_client.add_missing_fields_to_schema(dataflow_metrics_dataset_id, table_id, schema_for_metric_class)
        else:
            # Create a table with this schema
            bq_client.create_table_with_schema(dataflow_metrics_dataset_id, table_id, schema_for_metric_class)
def main(*, project_id: str, local_filepath: str, backfill: bool) -> None:
    """If filepath, uploads file at filepath to Google Cloud Storage and adds rows that
     do not already exist in BigQuery table.
    If backfill, loads all rows in CSVs in GCS to BQ table.
    """

    fs = GcsfsFactory.build()
    bq_client = BigQueryClientImpl(project_id=project_id)

    # If backfill, clear out all data in BQ table and load in new rows
    if backfill and (input(
            "Are you sure? This action will delete all data from current raw data "
            "table and replace it with rows from CSVs currently in GCS. "
            "Enter 'backfill' if you are sure. \n") == "backfill"):
        # Clear out old rows from table
        bq_client.delete_from_table_async(
            dataset_id=DATASET_ID,
            table_id=FINAL_DESTINATION_TABLE,
            filter_clause="WHERE TRUE",
        )
        # For each file in table, load into BQ
        for blob in fs.ls_with_blob_prefix(f"{project_id}{BUCKET_SUFFIX}", ""):
            if isinstance(blob, GcsfsFilePath):
                logging.info(
                    "Back filling from blob [%s] in bucket [%s]",
                    blob.file_name,
                    f"{project_id}{BUCKET_SUFFIX}",
                )
                load_from_gcs_to_temp_table(bq_client, project_id,
                                            blob.file_name)
                load_from_temp_to_permanent_table(bq_client, project_id)

    # If local file path was provided, upload that file to GCS and load data into BQ
    if local_filepath:
        # If local file path provided, upload file at file path into GCS
        upload_raw_file_to_gcs(fs, local_filepath,
                               f"{project_id}{BUCKET_SUFFIX}")
        logging.info("Found local file path, uploading from [%s]",
                     local_filepath)

        # Load data to temporary table and then to permanent
        load_from_gcs_to_temp_table(bq_client, project_id,
                                    date.today().strftime(DATE_FORMAT))
        load_from_temp_to_permanent_table(bq_client, project_id)
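

# Illustrative direct call (argument values are hypothetical):
#   main(project_id="my-project", local_filepath="./report.csv", backfill=False)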
Example #24
def copy_bq_views(
    source_project_id: str,
    source_dataset_id: str,
    destination_project_id: str,
    destination_dataset_id: str,
) -> None:
    """Copies all views from the source_project_id.source_dataset_id to the
    destination_project_id.destination_dataset_id."""

    # Construct a BigQuery client with the source_project_id
    source_client = BigQueryClientImpl(project_id=source_project_id)

    # Construct a BigQuery client with the destination_project_id
    destination_client = BigQueryClientImpl(project_id=destination_project_id)
    destination_dataset = bigquery.DatasetReference(destination_project_id,
                                                    destination_dataset_id)
    tables_in_source_dataset = source_client.list_tables(source_dataset_id)

    for table_ref in tables_in_source_dataset:
        table = source_client.get_table(
            source_client.dataset_ref_for_id(table_ref.dataset_id),
            table_ref.table_id)
        view_query = table.view_query

        # Only copy this view if there is a view_query to replicate and the view doesn't already exist in the
        # destination dataset
        if view_query and not destination_client.table_exists(
                destination_dataset, table_id=table.table_id):
            # Remove any references to the source_project_id from the view_query
            updated_view_query = view_query.replace(source_project_id,
                                                    "{project_id}")

            # Copy the view into the destination dataset
            source_client.copy_view(
                view=BigQueryView(
                    project_id=destination_project_id,
                    dataset_id=destination_dataset_id,
                    view_id=table.table_id,
                    description=table.description,
                    view_query_template=updated_view_query,
                ),
                destination_client=destination_client,
                destination_dataset_ref=destination_dataset,
            )
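

# Illustrative call copying views across projects (project and dataset ids are
# hypothetical):
#   copy_bq_views(source_project_id="project-a", source_dataset_id="shared_views",
#                 destination_project_id="project-b",
#                 destination_dataset_id="shared_views")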
Example #25
def run_export(dry_run: bool, state_code: str, target_bucket: str) -> None:
    """Performs the export operation, exporting rows for the given state codes from the tables from the state dataset
    in the given project to CSV files with the same names as the tables to the given GCS bucket."""
    today = datetime.date.today()

    big_query_client = BigQueryClientImpl()
    dataset_ref = big_query_client.dataset_ref_for_id(STATE_BASE_DATASET)
    if not big_query_client.dataset_exists(dataset_ref):
        raise ValueError(f"Dataset {dataset_ref.dataset_id} does not exist")

    tables = big_query_client.list_tables(dataset_ref.dataset_id)

    export_configs = []
    for table in tables:
        logging.info("******************************")
        export_query = state_table_export_query_str(table,
                                                    [state_code.upper()])
        logging.info(export_query)

        if not export_query:
            continue

        export_dir = gcs_export_directory(target_bucket, today,
                                          state_code.lower())
        export_file_name = f"{table.table_id}_{today.isoformat()}_export.csv"
        file = GcsfsFilePath.from_directory_and_file_name(
            export_dir, export_file_name)
        output_uri = file.uri()

        export_config = ExportQueryConfig(
            query=export_query,
            query_parameters=[],
            intermediate_dataset_id="export_temporary_tables",
            intermediate_table_name=(
                f"{dataset_ref.dataset_id}_{table.table_id}_{state_code.lower()}"
            ),
            output_uri=output_uri,
            output_format=bigquery.DestinationFormat.CSV,
        )
        export_configs.append(export_config)
        if dry_run:
            logging.info(
                "[DRY RUN] Created export configuration to export table to GCS: %s",
                export_config,
            )
        else:
            logging.info(
                "Created export configuration to export table to GCS: %s",
                export_config)

    if dry_run:
        logging.info("[DRY RUN] Exporting [%d] tables to GCS",
                     len(export_configs))
    else:
        logging.info("Exporting [%d] tables to GCS", len(export_configs))
        big_query_client.export_query_results_to_cloud_storage(
            export_configs, print_header=True)
Example #26
    def test__get_excess_schema_fields_with_matching_schema(self) -> None:
        """Tests _get_excess_schema_fields() when base_schema is the same as extended_schema."""
        base_schema = [
            bigquery.SchemaField("field_1", "INT"),
            bigquery.SchemaField("field_2", "INT"),
        ]

        excess_fields = BigQueryClientImpl._get_excess_schema_fields(
            base_schema, base_schema
        )

        self.assertListEqual(excess_fields, [])
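
    # A sketch of a companion case (not from the original test suite; it assumes
    # _get_excess_schema_fields returns the fields present in the second schema
    # but missing from the first):
    def test__get_excess_schema_fields_with_extra_field(self) -> None:
        base_schema = [bigquery.SchemaField("field_1", "INT")]
        extended_schema = base_schema + [bigquery.SchemaField("field_2", "INT")]

        excess_fields = BigQueryClientImpl._get_excess_schema_fields(
            base_schema, extended_schema
        )

        self.assertListEqual(excess_fields, [bigquery.SchemaField("field_2", "INT")])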
Example #27
def create_or_update_dataflow_metrics_sandbox(sandbox_dataset_prefix: str,
                                              allow_overwrite: bool = False
                                              ) -> None:
    """Creates or updates a Dataflow metrics sandbox dataset, prefixing the dataset name with the given prefix."""
    sandbox_dataset_id = (sandbox_dataset_prefix + "_" +
                          dataset_config.DATAFLOW_METRICS_DATASET)

    bq_client = BigQueryClientImpl()
    sandbox_dataset_ref = bq_client.dataset_ref_for_id(sandbox_dataset_id)

    if bq_client.dataset_exists(sandbox_dataset_ref) and not allow_overwrite:
        if __name__ == "__main__":
            logging.error(
                "Dataset %s already exists in project %s. To overwrite, set --allow_overwrite.",
                sandbox_dataset_id,
                bq_client.project_id,
            )
            sys.exit(1)
        else:
            raise ValueError(
                f"{sandbox_dataset_id} already exists in project {bq_client.project_id}. Cannot create a "
                f"Dataflow sandbox in an existing dataset.")

    logging.info(
        "Creating dataflow metrics sandbox in dataset %s. Tables will expire after 72 hours.",
        sandbox_dataset_ref,
    )
    bq_client.create_dataset_if_necessary(
        sandbox_dataset_ref, TEMP_DATAFLOW_DATASET_DEFAULT_TABLE_EXPIRATION_MS)

    update_dataflow_metric_tables_schemas(
        dataflow_metrics_dataset_id=sandbox_dataset_id)
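

# Illustrative call (assuming dataset_config.DATAFLOW_METRICS_DATASET is
# "dataflow_metrics", this creates or updates a sandbox dataset named
# "my_prefix_dataflow_metrics"):
#   create_or_update_dataflow_metrics_sandbox("my_prefix", allow_overwrite=False)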
Example #28
def export_dashboard_data_to_cloud_storage(bucket: str):
    """Exports data needed by the dashboard to the cloud storage bucket.

    Args:
        bucket: The cloud storage location where the exported data should go.
    """
    view_manager.create_dataset_and_update_views(view_config.VIEWS_TO_UPDATE)

    bq_client = BigQueryClientImpl()
    views_to_materialize = dashboard_export_config.VIEWS_TO_MATERIALIZE_FOR_DASHBOARD_EXPORT
    views_to_export = dashboard_export_config.VIEWS_TO_EXPORT

    # This step has to happen before the export because some views in views_to_export depend on the materialized
    # version of a view
    for view in views_to_materialize:
        bq_client.materialize_view_to_table(view)

    bq_client.export_query_results_to_cloud_storage([
        ExportQueryConfig.from_view_query(
            view=view,
            view_filter_clause=f" WHERE state_code = '{state}'",
            intermediate_table_name=f"{view.view_id}_table_{state}",
            output_uri=f"gs://{bucket}/{state}/{view.view_id}.json",
            output_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)
        for state in dashboard_export_config.STATES_TO_EXPORT
        for view in views_to_export
    ])
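
# For example, for bucket "my-dashboard-bucket", state "US_XX" and a view with
# view_id "revocations_by_month" (all names illustrative), the output URI rendered
# by the config above is gs://my-dashboard-bucket/US_XX/revocations_by_month.json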
Example #29
def export_view_data_to_cloud_storage():
    """Exports data in BigQuery views to cloud storage buckets."""
    view_builders_for_views_to_update = view_config.VIEW_BUILDERS_FOR_VIEWS_TO_UPDATE
    view_manager.create_dataset_and_update_views_for_view_builders(view_builders_for_views_to_update)

    bq_client = BigQueryClientImpl()
    project_id = bq_client.project_id

    # This step has to happen before the export because some views in views_to_export depend on the materialized
    # version of a view
    view_builders_to_materialize = view_config.VIEW_BUILDERS_FOR_VIEWS_TO_MATERIALIZE_FOR_DASHBOARD_EXPORT
    materialize_views(view_builders_to_materialize)

    datasets_to_export = view_config.DATASETS_STATES_AND_VIEW_BUILDERS_TO_EXPORT.keys()

    for dataset_id in datasets_to_export:
        states_and_view_builders_to_export = view_config.DATASETS_STATES_AND_VIEW_BUILDERS_TO_EXPORT.get(dataset_id)
        output_uri_template = view_config.OUTPUT_URI_TEMPLATE_FOR_DATASET_EXPORT.get(dataset_id)

        if not states_and_view_builders_to_export or not output_uri_template:
            raise ValueError(f"Trying to export views from an unsupported dataset: {dataset_id}")

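        # When a view is materialized, export from its materialized table id;
        # otherwise export from the view itself. That choice drives both the
        # intermediate table name and the output URI below.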
        views_to_export = [
            ExportQueryConfig.from_view_query(
                view=view,
                view_filter_clause=f" WHERE state_code = '{state_code}'",
                intermediate_table_name=(
                    f"{view.view_id if view.materialized_view_table_id is None else view.materialized_view_table_id}"
                    f"_table_{state_code}"),
                output_uri=output_uri_template.format(
                    project_id=project_id,
                    state_code=state_code,
                    view_id=view.view_id if view.materialized_view_table_id is None else view.materialized_view_table_id
                ),
                output_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)
            for state_code, view_builders in states_and_view_builders_to_export.items()
            for view in [view_builder.build() for view_builder in view_builders]
        ]

        bq_client.export_query_results_to_cloud_storage(views_to_export)
Example #30
    def run_check(
        cls, validation_job: DataValidationJob[ExistenceDataValidationCheck]
    ) -> DataValidationJobResult:
        query_job = BigQueryClientImpl().run_query_async(
            validation_job.query_str(), [])

        return DataValidationJobResult(
            validation_job=validation_job,
            result_details=ExistenceValidationResultDetails(
                num_invalid_rows=more_itertools.ilen(query_job),
                num_allowed_rows=validation_job.validation.num_allowed_rows,
            ),
        )