Example #1
def wait_for_table_load(big_query_client: BigQueryClient,
                        load_job: bigquery.job.LoadJob) -> bool:
    """Wait for a table LoadJob to finish, and log its status.

    Args:
        big_query_client: A BigQueryClient for querying the result table.
        load_job: BigQuery LoadJob whose result to wait for.
    Returns:
        True if no errors were raised, else False.
    """
    try:
        # Wait for table load job to complete.
        load_job.result(timeout=_BQ_LOAD_WAIT_TIMEOUT_SECONDS)
        logging.info("Load job %s for table %s.%s.%s completed successfully.",
                     load_job.job_id, load_job.destination.project,
                     load_job.destination.dataset_id,
                     load_job.destination.table_id)

        destination_table = big_query_client.get_table(
            big_query_client.dataset_ref_for_id(
                load_job.destination.dataset_id),
            load_job.destination.table_id)
        logging.info("Loaded %d rows in table %s.%s.%s",
                     destination_table.num_rows, load_job.destination.project,
                     load_job.destination.dataset_id,
                     load_job.destination.table_id)
        return True
    except (exceptions.NotFound, exceptions.BadRequest,
            concurrent.futures.TimeoutError):  # type: ignore
        logging.exception("Failed to load table %s.%s.%s",
                          load_job.destination.project,
                          load_job.destination.dataset_id,
                          load_job.destination.table_id)
        return False
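For context, a minimal usage sketch follows: it starts an asynchronous CSV load and blocks on it with wait_for_table_load. The BigQueryClientImpl class name, dataset id, URI and schema are illustrative assumptions, not taken from the examples on this page.

# Hypothetical usage sketch; client class, dataset, URI and schema are placeholders.
client = BigQueryClientImpl()  # assumed concrete BigQueryClient implementation
dataset_ref = client.dataset_ref_for_id("census_data")
load_job = client.load_table_from_cloud_storage_async(
    source_uri="gs://example-bucket/person.csv",
    destination_dataset_ref=dataset_ref,
    destination_table_id="person",
    destination_table_schema=[
        bigquery.SchemaField("person_id", "INTEGER", "REQUIRED"),
        bigquery.SchemaField("full_name", "STRING", "NULLABLE"),
    ],
)
if not wait_for_table_load(client, load_job):
    raise RuntimeError("Load of table 'person' failed; see logs for details.")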
Example #2
def export_all_then_load_all(big_query_client: BigQueryClient, schema_type: SchemaType):
    """Export all tables from Cloud SQL in the given schema, then load all
    tables to BigQuery.

    Exports happen in sequence (one at a time),
    then once all exports are completed, the BigQuery loads happen in parallel.

    For example, for tables A, B, C:
    1. Export Table A
    2. Export Table B
    3. Export Table C
    4. Load Tables A, B, C in parallel.
    """

    if schema_type == SchemaType.JAILS:
        tables_to_export = export_config.COUNTY_TABLES_TO_EXPORT
        base_tables_dataset_ref = big_query_client.dataset_ref_for_id(county_dataset_config.COUNTY_BASE_DATASET)
        export_queries = export_config.COUNTY_TABLE_EXPORT_QUERIES
    elif schema_type == SchemaType.STATE:
        tables_to_export = export_config.STATE_TABLES_TO_EXPORT
        base_tables_dataset_ref = big_query_client.dataset_ref_for_id(state_dataset_config.STATE_BASE_DATASET)
        export_queries = export_config.STATE_TABLE_EXPORT_QUERIES
    else:
        logging.error("Invalid schema_type requested. Must be either"
                      " SchemaType.JAILS or SchemaType.STATE.")
        return

    logging.info("Beginning CloudSQL export")
    cloudsql_export.export_all_tables(schema_type,
                                      tables_to_export,
                                      export_queries)

    logging.info("Beginning BQ table load")
    bq_load.load_all_tables_concurrently(
        big_query_client, base_tables_dataset_ref, tables_to_export, schema_type)
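A call then looks roughly like the sketch below; BigQueryClientImpl is an assumed concrete BigQueryClient implementation and is not part of the example above.

# Hypothetical invocation: export all county (JAILS) tables, then reload them into BigQuery.
export_all_then_load_all(BigQueryClientImpl(), SchemaType.JAILS)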
Example #3
def delete_unmanaged_views_and_tables_from_dataset(
    bq_client: BigQueryClient,
    dataset_id: str,
    managed_tables: Set[BigQueryAddress],
    dry_run: bool,
) -> Set[BigQueryAddress]:
    """This function takes in a set of managed views/tables and compares it to the list of
    tables BigQuery has. The function then deletes any views/tables that are in BigQuery but not
    in the set of managed views/tables. It then returns the set of BigQueryAddresses
    for the unmanaged views/tables that are to be deleted."""
    unmanaged_views_and_tables: Set[BigQueryAddress] = set()
    dataset_ref = bq_client.dataset_ref_for_id(dataset_id)
    if not bq_client.dataset_exists(dataset_ref):
        raise ValueError("Dataset %s does not exist in BigQuery" % dataset_id)
    for table in list(bq_client.list_tables(dataset_id)):
        table_bq_address = BigQueryAddress.from_list_item(table)
        if table_bq_address not in managed_tables:
            unmanaged_views_and_tables.add(table_bq_address)
    for view in unmanaged_views_and_tables:
        if dry_run:
            logging.info(
                "[DRY RUN] Regular run would delete unmanaged table/view %s from dataset %s.",
                view.table_id,
                view.dataset_id,
            )

        else:
            logging.info(
                "Deleting unmanaged table/view %s from dataset %s.",
                view.table_id,
                view.dataset_id,
            )

            bq_client.delete_table(view.dataset_id, view.table_id)
    return unmanaged_views_and_tables
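A hedged usage sketch: do a dry run first to see what would be removed. The dataset id and addresses are placeholders, and it is assumed here that BigQueryAddress can be constructed from a dataset_id and a table_id.

# Hypothetical dry run; dataset id and addresses are placeholders.
managed = {
    BigQueryAddress(dataset_id="my_views_dataset", table_id="managed_view_1"),
    BigQueryAddress(dataset_id="my_views_dataset", table_id="managed_view_2"),
}
would_delete = delete_unmanaged_views_and_tables_from_dataset(
    bq_client, "my_views_dataset", managed, dry_run=True)
logging.info("Dry run would delete %d unmanaged views/tables.", len(would_delete))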
Example #4
def copy_table_to_dataset(target_dataset: str, target_table: str,
                          export_query: str,
                          bq_client: BigQueryClient) -> None:
    """Copies the results of the given query to the target table and dataset, overwriting what lives there if the
    table already exists."""
    bq_client.create_table_from_query_async(target_dataset,
                                            target_table,
                                            export_query,
                                            overwrite=True)
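Usage is a single call; note that the query comes third and the client last. The dataset, table and query below are placeholders.

# Hypothetical invocation with placeholder names.
copy_table_to_dataset("export_dataset", "person_snapshot",
                      "SELECT * FROM `my-project.state.person`", bq_client)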
Example #5
def delete_temp_table_if_exists(
        big_query_client: BigQueryClient, temp_table_name: str,
        cloud_sql_to_bq_config: CloudSqlToBQConfig) -> None:
    dataset_ref = cloud_sql_to_bq_config.get_dataset_ref(big_query_client)
    if not big_query_client.table_exists(dataset_ref=dataset_ref,
                                         table_id=temp_table_name):
        logging.info('Delete temp table failed, table [%s] does not exist.',
                     temp_table_name)
        return
    big_query_client.delete_table(dataset_id=cloud_sql_to_bq_config.dataset_id,
                                  table_id=temp_table_name)
    logging.info('Deleted temporary table [%s]', temp_table_name)
Example #6
def _get_all_null_columns(bq_client: BigQueryClient, project_id: str,
                          table_name: str, state_code: str) -> List[str]:
    formatted_query = _SEARCH_QUERY.format(project_id=project_id,
                                           state_code=state_code,
                                           table_name=table_name)
    query_job = bq_client.run_query_async(formatted_query)
    return sorted([row["null_column"] for row in query_job])
Example #7
def _create_all_datasets_if_necessary(
    bq_client: BigQueryClient,
    dataset_ids: List[str],
    set_temp_dataset_table_expiration: bool,
) -> None:
    """Creates all required datasets for the list of dataset ids,
    with a table timeout if necessary. Done up front to avoid conflicts during a run of the DagWalker.
    """
    new_dataset_table_expiration_ms = (TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS
                                       if set_temp_dataset_table_expiration
                                       else None)

    for dataset_id in dataset_ids:
        dataset_ref = bq_client.dataset_ref_for_id(dataset_id)
        bq_client.create_dataset_if_necessary(dataset_ref,
                                              new_dataset_table_expiration_ms)
Example #8
def start_table_load(
        big_query_client: BigQueryClient,
        dataset_ref: bigquery.dataset.DatasetReference, table_name: str,
        schema_type: SchemaType) -> Optional[bigquery.job.LoadJob]:
    """Loads a table from CSV data in GCS to BigQuery.

    Given a table name, retrieve the export URI and schema from export_config,
    then load the table into BigQuery.

    This starts the job, but does not wait until it completes.

    Tables are created if they do not exist, and overwritten if they do exist.

    Because we are using bigquery.WriteDisposition.WRITE_TRUNCATE, the table's
    data will be completely wiped and overwritten with the contents of the CSV.

    Args:
        big_query_client: A BigQueryClient.
        dataset_ref: The BigQuery dataset to load the table into. Gets created
            if it does not already exist.
        table_name: Table to import. Table must be defined
            in the export_config.*_TABLES_TO_EXPORT for the given module
        schema_type: The schema of the table being loaded, either
            SchemaType.JAILS or SchemaType.STATE.
    Returns:
        The LoadJob object containing job details, or None if the job fails
            to start.
    """
    if schema_type == SchemaType.JAILS:
        export_schema = export_config.COUNTY_TABLE_EXPORT_SCHEMA
    elif schema_type == SchemaType.STATE:
        export_schema = export_config.STATE_TABLE_EXPORT_SCHEMA
    elif schema_type == SchemaType.OPERATIONS:
        export_schema = export_config.OPERATIONS_TABLE_EXPORT_SCHEMA
    else:
        logging.exception("Unknown schema type: %s", schema_type)
        return None

    uri = export_config.gcs_export_uri(table_name)

    try:
        bq_schema = [
            bigquery.SchemaField(field['name'], field['type'], field['mode'])
            for field in export_schema[table_name]
        ]
    except KeyError:
        logging.exception(
            "Unknown table name '%s'. Is it listed in "
            "the TABLES_TO_EXPORT for the %s module?", schema_type, table_name)
        return None

    load_job = big_query_client.load_table_from_cloud_storage_async(
        source_uri=uri,
        destination_dataset_ref=dataset_ref,
        destination_table_id=table_name,
        destination_table_schema=bq_schema)

    return load_job
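start_table_load pairs naturally with wait_for_table_load from Example #1: start the asynchronous load, then block on the result. The dataset id below is an assumption.

# Hypothetical usage: start the load, then wait for it (see Example #1).
dataset_ref = big_query_client.dataset_ref_for_id("census_regional")  # assumed dataset id
load_job = start_table_load(big_query_client, dataset_ref, "person", SchemaType.JAILS)
if load_job is None or not wait_for_table_load(big_query_client, load_job):
    logging.error("Table load for 'person' did not complete successfully.")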
Example #9
def cleanup_datasets_and_delete_unmanaged_views(
    bq_client: BigQueryClient,
    managed_views_map: Dict[str, Set[BigQueryAddress]],
    dry_run: bool = True,
) -> None:
    """This function filters through a list of managed dataset ids and a map of managed views to their corresponding
    datasets (which is obtained through get_managed_views_for_dataset_map()) and checks that the dataset is in the
    master list DATASETS_THAT_HAVE_EVER_BEEN_MANAGED. It then cleans up the datasets by deleting unmanaged datasets
    and deleting any unmanaged views within managed datasets."""
    datasets_that_have_ever_been_managed = get_datasets_that_have_ever_been_managed()
    managed_dataset_ids: List[str] = list(managed_views_map.keys())

    for dataset_id in managed_dataset_ids:
        if dataset_id not in datasets_that_have_ever_been_managed:
            raise ValueError(
                "Managed dataset %s not found in the list DATASETS_THAT_HAVE_EVER_BEEN_MANAGED."
                % dataset_id, )

    for dataset_id in datasets_that_have_ever_been_managed:
        if dataset_id not in managed_views_map:
            if bq_client.dataset_exists(
                    bq_client.dataset_ref_for_id(dataset_id)):
                if dry_run:
                    logging.info(
                        "[DRY RUN] Regular run would delete unmanaged dataset %s.",
                        dataset_id,
                    )
                else:
                    logging.info(
                        "Deleting dataset %s, which is no longer managed.",
                        dataset_id,
                    )
                    bq_client.delete_dataset(
                        bq_client.dataset_ref_for_id(dataset_id),
                        delete_contents=True)
            else:
                logging.info(
                    "Dataset %s isn't being managed and no longer exists in BigQuery. It can be safely removed from "
                    "the list DATASETS_THAT_HAVE_EVER_BEEN_MANAGED.",
                    dataset_id,
                )

        else:
            delete_unmanaged_views_and_tables_from_dataset(
                bq_client, dataset_id, managed_views_map[dataset_id], dry_run)
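A hedged example of invoking the cleanup with dry_run left at its default of True; the managed-views map below is a placeholder rather than real output of get_managed_views_for_dataset_map().

# Hypothetical dry run over a placeholder managed-views map. Assumes the dataset id
# appears in DATASETS_THAT_HAVE_EVER_BEEN_MANAGED, otherwise a ValueError is raised.
managed_views_map = {
    "my_views_dataset": {
        BigQueryAddress(dataset_id="my_views_dataset", table_id="managed_view_1"),
    },
}
cleanup_datasets_and_delete_unmanaged_views(bq_client, managed_views_map, dry_run=True)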
Example #10
def _create_or_update_view_and_materialize_if_necessary(
    bq_client: BigQueryClient,
    view: BigQueryView,
    parent_results: Dict[BigQueryView, bool],
    force_materialize: bool,
) -> bool:
    """Creates or updates the provided view in BigQuery and materializes that view into a table when appropriate.
    Returns True if this view or any views in its parent chain have been updated from the version that was saved in
    BigQuery before this update.
    """
    parent_changed = any(parent_results.values())
    view_changed = False
    dataset_ref = bq_client.dataset_ref_for_id(view.dataset_id)

    try:
        existing_view = bq_client.get_table(dataset_ref, view.view_id)
        if existing_view.view_query != view.view_query:
            # If the view query has changed, the view has changed
            view_changed = True
        old_schema = existing_view.schema
    except exceptions.NotFound:
        view_changed = True
        old_schema = None

    # TODO(https://issuetracker.google.com/issues/180636362): Currently we have to delete and recreate the view for
    # changes from underlying tables to be reflected in its schema.
    if old_schema is not None:
        bq_client.delete_table(dataset_ref.dataset_id, view.view_id)
    updated_view = bq_client.create_or_update_view(view)

    if updated_view.schema != old_schema:
        # We also check for schema changes, just in case a parent view or table has added a column
        view_changed = True

    if view.materialized_address:
        materialized_view_dataset_ref = bq_client.dataset_ref_for_id(
            view.materialized_address.dataset_id)
        if (view_changed or parent_changed or
                not bq_client.table_exists(materialized_view_dataset_ref,
                                           view.materialized_address.table_id)
                or force_materialize):
            bq_client.materialize_view_to_table(view)
        else:
            logging.info(
                "Skipping materialization of view [%s.%s] which has not changed.",
                view.dataset_id,
                view.view_id,
            )
    return view_changed or parent_changed or force_materialize
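In a full view update this function runs once per node of the view DAG, with parent_results holding the boolean returned for each parent already processed. A simplified sketch for a root view with no parents, assuming my_view is a BigQueryView instance:

# Simplified sketch: a root view has no parents, so parent_results is empty.
changed = _create_or_update_view_and_materialize_if_necessary(
    bq_client, my_view, parent_results={}, force_materialize=False)
if changed:
    logging.info("View [%s.%s] or one of its ancestors changed.",
                 my_view.dataset_id, my_view.view_id)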
Example #11
def _create_all_datasets_if_necessary(
    bq_client: BigQueryClient,
    views_to_update: List[BigQueryView],
    set_temp_dataset_table_expiration: bool,
) -> None:
    """Creates all required datasets for the list of views, with a table timeout if necessary. Done up front to avoid
    conflicts during a run of the DagWalker.
    """
    new_dataset_table_expiration_ms = (
        TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS
        if set_temp_dataset_table_expiration
        else None
    )
    dataset_ids = set()
    for view in views_to_update:
        views_dataset_ref = bq_client.dataset_ref_for_id(view.dataset_id)
        if view.dataset_id not in dataset_ids:
            bq_client.create_dataset_if_necessary(
                views_dataset_ref, new_dataset_table_expiration_ms
            )
            dataset_ids.add(view.dataset_id)
Example #12
def _has_any_rows(bq_client: BigQueryClient, project_id: str, table_name: str,
                  state_code: str) -> bool:
    formatted_query = _ANY_ROWS_QUERY.format(project_id=project_id,
                                             state_code=state_code,
                                             table_name=table_name)
    query_job = bq_client.run_query_async(formatted_query)
    for row in query_job:
        return int(row["count"]) > 0

    raise ValueError(
        f"Unable to check for {state_code} rows in {project_id}.state.{table_name}"
    )
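The helper assumes a module-level _ANY_ROWS_QUERY template with project_id, state_code and table_name placeholders that produces a single count column. A plausible template is sketched below; the real query is not shown in this example.

# Illustrative template only; the actual _ANY_ROWS_QUERY is not part of this example.
_ANY_ROWS_QUERY = """
SELECT COUNT(*) AS count
FROM `{project_id}.state.{table_name}`
WHERE state_code = '{state_code}'
"""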
Example #13
def delete_from_table(dry_run: bool, bq_client: BigQueryClient, table_id: str,
                      filter_clause: str) -> None:
    if dry_run:
        logging.info(
            "[DRY RUN] Would delete rows from [%s].[%s] %s",
            DATAFLOW_METRICS_DATASET,
            table_id,
            filter_clause,
        )
    else:
        # Delete these rows from the Dataflow metrics table
        delete_job = bq_client.delete_from_table_async(
            DATAFLOW_METRICS_DATASET, table_id, filter_clause=filter_clause)

        # Wait for the delete job to complete before moving on
        delete_job.result()
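A hedged call sketch; the table id and filter clause are placeholders, and the filter is assumed to be a complete WHERE clause, as suggested by the dry-run log format.

# Hypothetical invocation; table id and filter clause are placeholders.
delete_from_table(
    dry_run=True,
    bq_client=bq_client,
    table_id="incarceration_population_metrics",
    filter_clause="WHERE created_on < '2020-01-01'",
)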
Example #14
def load_table_from_gcs_and_wait(
    big_query_client: BigQueryClient,
    table_name: str,
    cloud_sql_to_bq_config: CloudSqlToBQConfig,
    destination_table_id: str,
) -> None:
    """Loads a table from CSV data in GCS to BigQuery.

    Given a table name and a destination_table_id, retrieve the export URI and schema from cloud_sql_to_bq_config,
    then load the table into the destination_table_id.

    This starts the job and waits until it completes.

    Tables are created if they do not exist, and overwritten if they do exist.

    Because we are using bigquery.WriteDisposition.WRITE_TRUNCATE, the table's
    data will be completely wiped and overwritten with the contents of the CSV.

    Args:
        big_query_client: A BigQueryClient.
        table_name: Table to import. Table must be defined in the base schema.
        cloud_sql_to_bq_config: Export config class for a specific SchemaType.
        destination_table_id: Name of the destination table to load the data
            into (for example, a temporary staging table).
    Returns:
        If the table load succeeds, returns None. If it fails it raises a ValueError.
    """
    uri = cloud_sql_to_bq_config.get_gcs_export_uri_for_table(table_name)

    logging.info("GCS URI [%s] in project [%s]", uri, metadata.project_id())

    bq_schema = cloud_sql_to_bq_config.get_bq_schema_for_table(table_name)
    dataset_ref = cloud_sql_to_bq_config.get_dataset_ref(big_query_client)

    load_job = big_query_client.load_table_from_cloud_storage_async(
        source_uri=uri,
        destination_dataset_ref=dataset_ref,
        destination_table_id=destination_table_id,
        destination_table_schema=bq_schema,
    )

    table_load_success = wait_for_table_load(big_query_client, load_job)

    if not table_load_success:
        raise ValueError(
            f"Copy from cloud storage to temp table failed. Skipping refresh for BQ table [{table_name}]"
        )
Example #15
def load_rows_excluded_from_refresh_into_temp_table_and_wait(
    big_query_client: BigQueryClient,
    table_name: str,
    cloud_sql_to_bq_config: CloudSqlToBQConfig,
    destination_table_id: str,
) -> None:
    """Load the stale rows excluded from the CLoudSQL export to the temporary table.

    New columns in the CloudSQL export data that are missing from the stale BQ Table will be added to the schema,
    using the flag hydrate_missing_columns_with_null.

    Because we are using bigquery.WriteDisposition.WRITE_APPEND, the table is not truncated and new data
    is appended.

    Args:
        big_query_client: A BigQueryClient.
        table_name: Table to select from to copy rows into the temp table.
        cloud_sql_to_bq_config: Export config class for a specific SchemaType.
        destination_table_id: Name for the temp table. If it doesn't exist, it will be created.
    Returns:
        If the table load succeeds, returns None. If it fails it raises a ValueError.
    """
    table_refresh_query_builder = (
        cloud_sql_to_bq_config.get_stale_bq_rows_for_excluded_regions_query_builder(
            table_name))

    load_job = big_query_client.insert_into_table_from_table_async(
        source_dataset_id=cloud_sql_to_bq_config.dataset_id,
        source_table_id=table_name,
        destination_dataset_id=cloud_sql_to_bq_config.dataset_id,
        destination_table_id=destination_table_id,
        source_data_filter_clause=table_refresh_query_builder.filter_clause(),
        hydrate_missing_columns_with_null=True,
        allow_field_additions=True,
    )

    table_load_success = wait_for_table_load(big_query_client, load_job)

    if not table_load_success:
        raise ValueError(
            f"Failed to load temp table with excluded rows from existing BQ table. Skipping copy of "
            f"temp table [{destination_table_id}] to BQ table [{table_name}]")
Example #16
def get_dataset_ref(
    self, big_query_client: BigQueryClient
) -> Optional[bigquery.DatasetReference]:
    """Uses the dataset_id to request the BigQuery dataset reference to load the table into.
    Gets created if it does not already exist."""
    return big_query_client.dataset_ref_for_id(self.dataset_id)
Example #17
def refresh_bq_table_from_gcs_export_synchronous(
    big_query_client: BigQueryClient,
    table_name: str,
    cloud_sql_to_bq_config: CloudSqlToBQConfig,
) -> None:
    """Loads data from Cloud SQL export and rows excluded from the SQL export from the current BQ table
    into a target BQ table. If target BQ table does not exist, it is created.

    For example:
    1. Load data from GCS to temp table and wait.
    2. Load data from stale BQ table to temp table and wait and filter for rows excluded from SQL export.
        If stale BQ table does not exist, create the table. If the temp table has schema fields missing in the
        stale BQ table, add missing fields to the BQ table query.
    3. Load data from the temp table to the final BQ table. Overwrite all the data with the temp table and add any
        missing fields to the destination table.
    4. Delete temporary table.

    Waits until each BigQuery load is completed.

    Args:
        big_query_client: A BigQueryClient.
        table_name: Table to import from temp table. Table must be defined
            in the metadata_base class for its corresponding SchemaType.
        cloud_sql_to_bq_config: The config class for the given SchemaType.
    Returns:
        If the table load succeeds, returns None. If it fails it raises a ValueError.
    """
    temp_table_name = TEMP_TABLE_NAME.format(table_name=table_name)
    # Load GCS exported CSVs to temp table
    load_table_from_gcs_and_wait(
        big_query_client,
        table_name,
        cloud_sql_to_bq_config,
        destination_table_id=temp_table_name,
    )

    # Load rows excluded from CloudSQL export to temp table if table exists.
    # If table does not exist, create BQ destination table.
    dataset_ref = cloud_sql_to_bq_config.get_dataset_ref(big_query_client)

    if big_query_client.table_exists(dataset_ref=dataset_ref,
                                     table_id=table_name):
        load_rows_excluded_from_refresh_into_temp_table_and_wait(
            big_query_client,
            table_name,
            cloud_sql_to_bq_config,
            destination_table_id=temp_table_name,
        )
    else:
        logging.info(
            "Destination table [%s.%s] does not exist! Creating table from schema.",
            cloud_sql_to_bq_config.dataset_id,
            table_name,
        )

        create_table_success = big_query_client.create_table_with_schema(
            dataset_id=cloud_sql_to_bq_config.dataset_id,
            table_id=table_name,
            schema_fields=cloud_sql_to_bq_config.get_bq_schema_for_table(
                table_name),
        )

        if not create_table_success:
            raise ValueError(
                f"Failed to create table [{table_name}. Skipping table refresh from GCS."
            )

    logging.info("Loading BQ Table [%s] from temp table [%s]", table_name,
                 temp_table_name)

    load_job = big_query_client.load_table_from_table_async(
        source_dataset_id=cloud_sql_to_bq_config.dataset_id,
        source_table_id=temp_table_name,
        destination_dataset_id=cloud_sql_to_bq_config.dataset_id,
        destination_table_id=table_name,
    )

    table_load_success = wait_for_table_load(big_query_client, load_job)

    if not table_load_success:
        raise ValueError(
            f"Failed to load BigQuery table [{table_name}] from temp table [{temp_table_name}]."
        )

    delete_temp_table_if_exists(big_query_client, temp_table_name,
                                cloud_sql_to_bq_config)
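End to end, a single-table refresh reduces to one call; how the CloudSqlToBQConfig is constructed is not shown in these examples, so it is left as a placeholder here.

# Hypothetical single-table refresh; obtaining the config object is out of scope here.
cloud_sql_to_bq_config = ...  # a CloudSqlToBQConfig for the desired SchemaType
refresh_bq_table_from_gcs_export_synchronous(
    big_query_client, "state_person", cloud_sql_to_bq_config)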
Example #18
def export_tables_to_cloud_storage(export_configs: List[ExportQueryConfig],
                                   bq_client: BigQueryClient):
    """Exports tables with the given export configurations to Google Cloud Storage."""
    bq_client.export_query_results_to_cloud_storage(export_configs)