Example #1
def export_then_load_all_sequentially(module: ModuleType):
    """Exports then loads each table sequentially.

    No operations for a new table happen until all operations for
    the previous table have completed.

    For example, for Tables A, B, C:
    1. Export Table A
    2. Load Table A
    3. Export Table B
    4. Load Table B
    5. Export Table C
    6. Load Table C

    There is no requirement to load sequentially, but we must export
    sequentially because Cloud SQL supports only one export operation at a time.
    """
    if module == ModuleType.COUNTY:
        tables_to_export = export_config.COUNTY_TABLES_TO_EXPORT
        dataset_ref = bq_utils.client().dataset(
            export_config.COUNTY_BASE_TABLES_BQ_DATASET)
    elif module == ModuleType.STATE:
        tables_to_export = export_config.STATE_TABLES_TO_EXPORT
        dataset_ref = bq_utils.client().dataset(
            export_config.STATE_BASE_TABLES_BQ_DATASET)
    else:
        logging.error("Invalid module requested. Must be either"
                      " ModuleType.COUNTY or ModuleType.STATE.")
        return

    for table in tables_to_export:
        export_table_then_load_table(table.name, dataset_ref, module)
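export_table_then_load_table is not shown on this page. A minimal sketch of what it might look like, composed from start_table_load and wait_for_table_load (Examples #4 and #6 below) plus an assumed cloudsql_export.export_table helper; that helper's name and signature are assumptions, not confirmed by the source:

def export_table_then_load_table(table_name, dataset_ref, module):
    """Exports one table from Cloud SQL, then loads it into BigQuery."""
    # Assumed helper: triggers the Cloud SQL CSV export for a single table
    # and blocks until it finishes. Name and signature are hypothetical.
    if not cloudsql_export.export_table(module, table_name):
        logging.error("Cloud SQL export failed for table: %s", table_name)
        return False

    load_result = start_table_load(dataset_ref, table_name, module)
    if load_result is None:
        return False

    load_job, table_ref = load_result
    return wait_for_table_load(load_job, table_ref)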
Example #2
def handle_bq_export_task():
    """Worker function to handle BQ export task requests.

    Form data must be a bytes-encoded JSON object with the parameters
    listed below.

    Request Parameters:
        table_name: Table to export then import. Table must be defined in
            the export_config.*_TABLES_TO_EXPORT list for the given module.
        module: Module the table belongs to; either ModuleType.COUNTY.value
            or ModuleType.STATE.value.
    """
    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    table_name = data['table_name']
    module = data['module']

    if module == ModuleType.COUNTY.value:
        module_type = ModuleType.COUNTY
        dataset_ref = bq_utils.client().dataset(
            export_config.COUNTY_BASE_TABLES_BQ_DATASET)
    elif module == ModuleType.STATE.value:
        module_type = ModuleType.STATE
        dataset_ref = bq_utils.client().dataset(
            export_config.STATE_BASE_TABLES_BQ_DATASET)
    else:
        logging.error("Unknown module '%s' requested. Must be either"
                      " ModuleType.COUNTY.value or ModuleType.STATE.value.",
                      module)
        return ('', HTTPStatus.INTERNAL_SERVER_ERROR)

    logging.info("Starting BQ export task for table: %s", table_name)

    success = export_table_then_load_table(table_name, dataset_ref,
                                           module_type)

    return ('', HTTPStatus.OK if success else HTTPStatus.INTERNAL_SERVER_ERROR)
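For reference, a hypothetical payload that a task-queue client might POST to this handler, matching the parsing code above; the table name 'person' is illustrative only:

import json

body = json.dumps({
    'table_name': 'person',             # illustrative table name
    'module': ModuleType.COUNTY.value,  # the enum's serialized value
}).encode()  # form data arrives as a bytes-encoded JSON object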
Example #3
def export_all_then_load_all(module: ModuleType):
    """Export all tables from Cloud SQL in the given module, then load all
    tables to BigQuery.

    Exports happen in sequence (one at a time); once all exports have
    completed, the BigQuery loads happen in parallel.

    For example, for tables A, B, C:
    1. Export Table A
    2. Export Table B
    3. Export Table C
    4. Load Tables A, B, C in parallel.
    """
    if module == ModuleType.COUNTY:
        tables_to_export = export_config.COUNTY_TABLES_TO_EXPORT
        base_tables_dataset_ref = bq_utils.client().dataset(
            export_config.COUNTY_BASE_TABLES_BQ_DATASET)
        export_queries = export_config.COUNTY_TABLE_EXPORT_QUERIES
    elif module == ModuleType.STATE:
        tables_to_export = export_config.STATE_TABLES_TO_EXPORT
        base_tables_dataset_ref = bq_utils.client().dataset(
            export_config.STATE_BASE_TABLES_BQ_DATASET)
        export_queries = export_config.STATE_TABLE_EXPORT_QUERIES
    else:
        logging.error("Invalid module requested. Must be either"
                      " ModuleType.COUNTY or ModuleType.STATE.")
        return

    logging.info("Beginning CloudSQL export")
    cloudsql_export.export_all_tables(tables_to_export, export_queries)

    logging.info("Beginning BQ table load")
    bq_load.load_all_tables_concurrently(base_tables_dataset_ref,
                                         tables_to_export, module)
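bq_load.load_all_tables_concurrently is not shown here either; one plausible implementation in terms of start_table_load and wait_for_table_load (Examples #6 and #4), matching the "start everything, then wait" pattern the docstring describes:

def load_all_tables_concurrently(dataset_ref, tables_to_export, module):
    """Start all load jobs at once, then wait for each to finish."""
    # Starting a job returns immediately, so the jobs run in parallel
    # on BigQuery's side; we only block when collecting results.
    load_results = [
        start_table_load(dataset_ref, table.name, module)
        for table in tables_to_export
    ]
    for result in load_results:
        if result is not None:
            load_job, table_ref = result
            wait_for_table_load(load_job, table_ref)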
Example #4
def wait_for_table_load(load_job: bigquery.job.LoadJob,
                        table_ref: bigquery.table.TableReference) -> bool:
    """Wait for a table LoadJob to finish, and log its status.

    Args:
        load_job: BigQuery LoadJob whose result to wait for.
        table_ref: TableReference to retrieve final table status.
    Returns:
        True if no errors were raised, else False.
    """
    try:
        # Wait for the table load job to complete, up to the timeout.
        # Pass the timeout by keyword so it cannot bind to a different
        # positional parameter (e.g. retry) in newer client versions.
        load_job.result(timeout=_BQ_LOAD_WAIT_TIMEOUT_SECONDS)
        logging.info("Load job %s for table %s.%s.%s completed successfully.",
                     load_job.job_id, table_ref.project, table_ref.dataset_id,
                     table_ref.table_id)

        destination_table = bq_utils.client().get_table(table_ref)
        logging.info("Loaded %d rows in table %s.%s.%s",
                     destination_table.num_rows, destination_table.project,
                     destination_table.dataset_id, destination_table.table_id)
        return True
    except (exceptions.NotFound, exceptions.BadRequest,
            concurrent.futures.TimeoutError):  # type: ignore
        logging.exception("Failed to load table %s.%s.%s", table_ref.project,
                          table_ref.dataset_id, table_ref.table_id)
        return False
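The names used above suggest google.api_core.exceptions and a module-level timeout constant; a sketch of the setup this function presumably relies on (the timeout value is an assumption, not taken from the source):

import concurrent.futures
import logging

from google.api_core import exceptions
from google.cloud import bigquery

# Assumed value; the actual constant is defined elsewhere in the module.
_BQ_LOAD_WAIT_TIMEOUT_SECONDS = 300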
Example #5
def create_dataset_and_update_views(
        dataset_name: str, views_to_update: List[bqview.BigQueryView]):
    """Create and update Views and their parent Dataset.

    Creates the parent Views dataset if it does not exist, then creates or
    updates the underlying Views as defined in
    recidiviz.calculator.bq.views.bqview.

    Args:
        dataset_name: Name of BigQuery dataset to contain Views. Gets created
            if it does not already exist.
        views_to_update: View objects to be created or updated.
    """
    views_dataset_ref = bq_utils.client().dataset(dataset_name)
    bq_utils.create_dataset_if_necessary(views_dataset_ref)

    for view in views_to_update:
        bq_utils.create_or_update_view(views_dataset_ref, view)
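bq_utils.create_dataset_if_necessary is not shown on this page; a minimal sketch of the usual get-or-create pattern it likely follows, assuming google.api_core.exceptions:

def create_dataset_if_necessary(dataset_ref):
    """Create the dataset only if it does not already exist."""
    try:
        bq_utils.client().get_dataset(dataset_ref)
    except exceptions.NotFound:
        bq_utils.client().create_dataset(bigquery.Dataset(dataset_ref))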
Example #6
def start_table_load(
        dataset_ref: bigquery.dataset.DatasetReference,
        table_name: str, module: ModuleType) -> \
        Optional[Tuple[bigquery.job.LoadJob, bigquery.table.TableReference]]:
    """Loads a table from CSV data in GCS to BigQuery.

    Given a table name, retrieve the export URI and schema from export_config,
    then load the table into BigQuery.

    This starts the job, but does not wait until it completes.

    Tables are created if they do not exist, and overwritten if they do exist.

    Because we are using bigquery.WriteDisposition.WRITE_TRUNCATE, the table's
    data will be completely wiped and overwritten with the contents of the CSV.

    Args:
        dataset_ref: The BigQuery dataset to load the table into. Gets created
            if it does not already exist.
        table_name: Table to import. Table must be defined
            in the export_config.*_TABLES_TO_EXPORT for the given module
        module: The module of the table being loaded, either ModuleType.COUNTY
            or ModuleType.STATE.
    Returns:
        (load_job, table_ref) where load_job is the LoadJob object containing
            job details, and table_ref is the destination TableReference object.
            If the job fails to start, returns None.
    """
    if module == ModuleType.COUNTY:
        export_schema = export_config.COUNTY_TABLE_EXPORT_SCHEMA
    elif module == ModuleType.STATE:
        export_schema = export_config.STATE_TABLE_EXPORT_SCHEMA
    else:
        logging.exception("Unknown module name: %s", module)
        return None

    bq_utils.create_dataset_if_necessary(dataset_ref)

    uri = export_config.gcs_export_uri(table_name)
    table_ref = dataset_ref.table(table_name)

    try:
        bq_schema = [
            bigquery.SchemaField(field['name'], field['type'], field['mode'])
            for field in export_schema[table_name]
        ]
    except KeyError:
        logging.exception(
            "Unknown table name '%s'. Is it listed in "
            "the TABLES_TO_EXPORT for the %s module?", table_name, module)
        return None

    job_config = bigquery.LoadJobConfig()
    job_config.schema = bq_schema
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    load_job = bq_utils.client().load_table_from_uri(uri,
                                                     table_ref,
                                                     job_config=job_config)

    logging.info("Started load job %s for table %s.%s.%s", load_job.job_id,
                 table_ref.project, table_ref.dataset_id, table_ref.table_id)

    return load_job, table_ref
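Hypothetical usage, pairing this with wait_for_table_load from Example #4 ('person' is an illustrative table name):

result = start_table_load(dataset_ref, 'person', ModuleType.COUNTY)
if result is not None:
    load_job, table_ref = result
    success = wait_for_table_load(load_job, table_ref)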