Example #1
def handle_bq_export_task():
    """Worker function to handle BQ export task requests.

    Form data must be a bytes-encoded JSON object with the parameters listed
    below.

    JSON Parameters:
        table_name: Table to export then import. Table must be defined in
            export_config.*_TABLES_TO_EXPORT for the given schema.
        schema_type: Value of the SchemaType of the table, either
            SchemaType.JAILS.value or SchemaType.STATE.value.
    """
    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    table_name = data['table_name']
    schema_type_str = data['schema_type']

    if schema_type_str == SchemaType.JAILS.value:
        schema_type = SchemaType.JAILS
        dataset_ref = bq_utils.client().dataset(
            export_config.COUNTY_BASE_TABLES_BQ_DATASET)
    elif schema_type_str == SchemaType.STATE.value:
        schema_type = SchemaType.STATE
        dataset_ref = bq_utils.client().dataset(
            export_config.STATE_BASE_TABLES_BQ_DATASET)
    else:
        return '', HTTPStatus.INTERNAL_SERVER_ERROR

    logging.info("Starting BQ export task for table: %s", table_name)

    success = export_table_then_load_table(table_name, dataset_ref, schema_type)

    return ('', HTTPStatus.OK if success else HTTPStatus.INTERNAL_SERVER_ERROR)
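For orientation, the handler above expects both parameters in the request body. A minimal sketch of the payload a caller (in practice, a queued task) would POST; the 'person' table name is an illustrative assumption, not taken from export_config:

import json

payload = json.dumps({
    'table_name': 'person',  # hypothetical; must appear in the matching *_TABLES_TO_EXPORT
    'schema_type': SchemaType.JAILS.value,
}).encode('utf-8')
# This byte string is what request.get_data(as_text=True) decodes and
# json.loads parses inside the handler.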
Example #2
def export_then_load_all_sequentially(schema_type: SchemaType):
    """Exports then loads each table sequentially.

    No operations for a new table happen until all operations for
    the previous table have completed.

    For example, for Tables A, B, C:
    1. Export Table A
    2. Load Table A
    3. Export Table B
    4. Load Table B
    5. Export Table C
    6. Load Table C

    There is no reason to load sequentially, but we must export sequentially
    because Cloud SQL can only support one export operation at a time.
    """
    if schema_type == SchemaType.JAILS:
        tables_to_export = export_config.COUNTY_TABLES_TO_EXPORT
        dataset_ref = bq_utils.client().dataset(
            export_config.COUNTY_BASE_TABLES_BQ_DATASET)
    elif schema_type == SchemaType.STATE:
        tables_to_export = export_config.STATE_TABLES_TO_EXPORT
        dataset_ref = bq_utils.client().dataset(
            export_config.STATE_BASE_TABLES_BQ_DATASET)
    else:
        logging.error("Invalid schema_type requested. Must be either"
                      " SchemaType.JAILS or SchemaType.STATE.")
        return

    for table in tables_to_export:
        export_table_then_load_table(table.name, dataset_ref, schema_type)
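The one-export-at-a-time constraint is handled entirely inside the loop above, so a typical invocation is a single call per schema:

# Exports then loads every table in the county schema, one table at a time.
export_then_load_all_sequentially(SchemaType.JAILS)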
Example #3
def wait_for_table_load(load_job: bigquery.job.LoadJob,
                        table_ref: bigquery.table.TableReference) -> bool:
    """Wait for a table LoadJob to finish, and log its status.

    Args:
        load_job: BigQuery LoadJob whose result to wait for.
        table_ref: TableReference to retrieve final table status.
    Returns:
        True if no errors were raised, else False.
    """
    try:
        # Wait for table load job to complete.
        load_job.result(_BQ_LOAD_WAIT_TIMEOUT_SECONDS)
        logging.info("Load job %s for table %s.%s.%s completed successfully.",
                     load_job.job_id, table_ref.project, table_ref.dataset_id,
                     table_ref.table_id)

        destination_table = bq_utils.client().get_table(table_ref)
        logging.info("Loaded %d rows in table %s.%s.%s",
                     destination_table.num_rows, destination_table.project,
                     destination_table.dataset_id, destination_table.table_id)
        return True
    except (exceptions.NotFound, exceptions.BadRequest,
            concurrent.futures.TimeoutError):  # type: ignore
        logging.exception("Failed to load table %s.%s.%s", table_ref.project,
                          table_ref.dataset_id, table_ref.table_id)
        return False
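wait_for_table_load pairs with start_table_load from Example #7; a minimal sketch of the two used together, where 'person' is an illustrative table name:

start_result = start_table_load(dataset_ref, 'person', SchemaType.JAILS)
if start_result is not None:
    load_job, table_ref = start_result
    # Blocks until the load job finishes (or times out) and logs the outcome.
    success = wait_for_table_load(load_job, table_ref)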
Example #4
def export_all_then_load_all(schema_type: SchemaType):
    """Export all tables from Cloud SQL in the given schema, then load all
    tables to BigQuery.

    Exports happen in sequence (one at a time),
    then once all exports are completed, the BigQuery loads happen in parallel.

    For example, for tables A, B, C:
    1. Export Table A
    2. Export Table B
    3. Export Table C
    4. Load Tables A, B, C in parallel.
    """
    if schema_type == SchemaType.JAILS:
        tables_to_export = export_config.COUNTY_TABLES_TO_EXPORT
        base_tables_dataset_ref = bq_utils.client().dataset(
            export_config.COUNTY_BASE_TABLES_BQ_DATASET)
        export_queries = export_config.COUNTY_TABLE_EXPORT_QUERIES
    elif schema_type == SchemaType.STATE:
        tables_to_export = export_config.STATE_TABLES_TO_EXPORT
        base_tables_dataset_ref = bq_utils.client().dataset(
            export_config.STATE_BASE_TABLES_BQ_DATASET)
        export_queries = export_config.STATE_TABLE_EXPORT_QUERIES
    else:
        logging.error("Invalid schema_type requested. Must be either"
                      " SchemaType.JAILS or SchemaType.STATE.")
        return

    logging.info("Beginning CloudSQL export")
    cloudsql_export.export_all_tables(schema_type,
                                      tables_to_export,
                                      export_queries)

    logging.info("Beginning BQ table load")
    bq_load.load_all_tables_concurrently(
        base_tables_dataset_ref, tables_to_export, schema_type)
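load_all_tables_concurrently is not part of this listing. A plausible sketch, assuming it is built from start_table_load and wait_for_table_load shown in the other examples; since BigQuery load jobs run server-side, "in parallel" can simply mean starting every job before waiting on any of them:

def load_all_tables_concurrently(dataset_ref, tables_to_export, schema_type):
    # Sketch only; the real implementation lives in bq_load and may differ.
    started = []
    for table in tables_to_export:
        mapping = start_table_load(dataset_ref, table.name, schema_type)
        if mapping is not None:
            started.append(mapping)
    # All load jobs are now running in BigQuery; wait for each in turn.
    for load_job, table_ref in started:
        wait_for_table_load(load_job, table_ref)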
Example #5
def create_dataset_and_update_views(
        dataset_name: str, views_to_update: List[bqview.BigQueryView]):
    """Create and update Views and their parent Dataset.

    Creates a parent Views dataset if it does not exist, and creates or
    updates the underlying Views as defined in
    recidiviz.calculator.bq.views.bqview.

    Args:
        dataset_name: Name of BigQuery dataset to contain Views. Gets created
            if it does not already exist.
        views_to_update: View objects to be created or updated.
    """
    views_dataset_ref = bq_utils.client().dataset(dataset_name)
    bq_utils.create_dataset_if_necessary(views_dataset_ref)

    for view in views_to_update:
        bq_utils.create_or_update_view(views_dataset_ref, view)
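bq_utils.create_dataset_if_necessary is not shown in these examples. A plausible sketch of the get-or-create pattern it presumably wraps (an assumption, not the project's actual helper):

def create_dataset_if_necessary(dataset_ref):
    # Sketch only: fetch the dataset, and create it if the lookup 404s.
    try:
        bq_utils.client().get_dataset(dataset_ref)
    except exceptions.NotFound:
        bq_utils.client().create_dataset(bigquery.Dataset(dataset_ref))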
Example #6
def export_dashboard_data_to_cloud_storage(bucket: str):
    """Exports data needed by the dashboard to the cloud storage bucket.

    This is a two-step process. First, for each view, the view query is executed
    and the entire result is loaded into a table in BigQuery. Then, for each
    table, the contents are exported to the cloud storage bucket in JSON format.
    This has to be a two-step process because BigQuery doesn't support exporting
    a view directly; the view must be materialized in a table first.

    Args:
        bucket: The cloud storage location where the exported data should go.
    """
    view_manager.create_dataset_and_update_views(
        view_config.DASHBOARD_VIEWS_DATASET, view_manager.VIEWS_TO_UPDATE)

    dataset_ref = bq_utils.client().dataset(
        view_config.DASHBOARD_VIEWS_DATASET)
    views_to_export = dashboard_export_config.VIEWS_TO_EXPORT

    _export_views_to_tables(dataset_ref, views_to_export)

    _export_view_tables_to_cloud_storage(dataset_ref, views_to_export, bucket)
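Neither private helper appears in this listing. A hedged sketch of the second step, assuming each view object exposes a view_id, that the materialized table shares that name, and that the helper's signature matches the call above; the real implementation may differ:

def _export_view_tables_to_cloud_storage(dataset_ref, views_to_export, bucket):
    for view in views_to_export:
        table_ref = dataset_ref.table(view.view_id)
        destination_uri = 'gs://{}/{}.json'.format(bucket, view.view_id)
        job_config = bigquery.ExtractJobConfig()
        job_config.destination_format = (
            bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)
        # Exports the materialized view table to newline-delimited JSON in GCS.
        bq_utils.client().extract_table(
            table_ref, destination_uri, job_config=job_config)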
Example #7
def start_table_load(
        dataset_ref: bigquery.dataset.DatasetReference,
        table_name: str, schema_type: SchemaType) -> \
        Optional[Tuple[bigquery.job.LoadJob, bigquery.table.TableReference]]:
    """Loads a table from CSV data in GCS to BigQuery.

    Given a table name, retrieve the export URI and schema from export_config,
    then load the table into BigQuery.

    This starts the job, but does not wait until it completes.

    Tables are created if they do not exist, and overwritten if they do exist.

    Because we are using bigquery.WriteDisposition.WRITE_TRUNCATE, the table's
    data will be completely wiped and overwritten with the contents of the CSV.

    Args:
        dataset_ref: The BigQuery dataset to load the table into. Gets created
            if it does not already exist.
        table_name: Table to import. Table must be defined
            in the export_config.*_TABLES_TO_EXPORT for the given schema.
        schema_type: The schema of the table being loaded, either
            SchemaType.JAILS or SchemaType.STATE.
    Returns:
        (load_job, table_ref) where load_job is the LoadJob object containing
            job details, and table_ref is the destination TableReference object.
            If the job fails to start, returns None.
    """
    if schema_type == SchemaType.JAILS:
        export_schema = export_config.COUNTY_TABLE_EXPORT_SCHEMA
    elif schema_type == SchemaType.STATE:
        export_schema = export_config.STATE_TABLE_EXPORT_SCHEMA
    else:
        logging.exception("Unknown schema type: %s", schema_type)
        return None

    bq_utils.create_dataset_if_necessary(dataset_ref)

    uri = export_config.gcs_export_uri(table_name)
    table_ref = dataset_ref.table(table_name)

    try:
        bq_schema = [
            bigquery.SchemaField(field['name'], field['type'], field['mode'])
            for field in export_schema[table_name]
        ]
    except KeyError:
        logging.exception(
            "Unknown table name '%s'. Is it listed in "
            "the TABLES_TO_EXPORT for the %s module?", schema_type, table_name)
        return None

    job_config = bigquery.LoadJobConfig()
    job_config.schema = bq_schema
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    load_job = bq_utils.client().load_table_from_uri(uri,
                                                     table_ref,
                                                     job_config=job_config)

    logging.info("Started load job %s for table %s.%s.%s", load_job.job_id,
                 table_ref.project, table_ref.dataset_id, table_ref.table_id)

    return load_job, table_ref
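For orientation, start_table_load expects each export schema entry to be a list of dicts with 'name', 'type', and 'mode' keys. An illustrative (hypothetical) entry and the SchemaField list it produces; real entries live in export_config.*_TABLE_EXPORT_SCHEMA:

example_schema = {
    'person': [
        {'name': 'person_id', 'type': 'INTEGER', 'mode': 'REQUIRED'},
        {'name': 'full_name', 'type': 'STRING', 'mode': 'NULLABLE'},
    ],
}
bq_schema = [
    bigquery.SchemaField(field['name'], field['type'], field['mode'])
    for field in example_schema['person']
]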