Example #1
def _export_table(
    client: bigquery.Client,
    project_id: str,
    dataset_id: str,
    table: str,
    bucket: str,
    storage_client: storage.Client,
):
    """Export a single table or view to GCS as JSON."""
    # since views cannot be exported directly, write the data into a temporary table
    job = client.query(
        f"""
        SELECT *
        FROM {dataset_id}.{table}
    """
    )

    job.result()

    destination_uri = f"gs://{bucket}/{table}.ndjson"
    dataset_ref = bigquery.DatasetReference(project_id, job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    logger.info(f"Export table {table} to {destination_uri}")

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = "NEWLINE_DELIMITED_JSON"
    extract_job = client.extract_table(
        table_ref, destination_uri, location="US", job_config=job_config
    )
    extract_job.result()

    # convert ndjson to json
    _convert_ndjson_to_json(bucket, table, storage_client)
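A minimal usage sketch for the function above. The project, dataset, table, and bucket names are placeholders, and _convert_ndjson_to_json and logger are helpers from the snippet's own module that are not shown here.

from google.cloud import bigquery, storage

# Hypothetical identifiers; substitute your own project, dataset, table, and bucket.
PROJECT_ID = "my-project"
DATASET_ID = "analysis"
TABLE = "experiment_results"
BUCKET = "my-export-bucket"

bq_client = bigquery.Client(project=PROJECT_ID)
gcs_client = storage.Client(project=PROJECT_ID)

# Runs the SELECT, extracts the temporary result table to
# gs://my-export-bucket/experiment_results.ndjson, and converts it to JSON.
_export_table(bq_client, PROJECT_ID, DATASET_ID, TABLE, BUCKET, gcs_client)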
Example #2
def _export_table(
    client: bigquery.Client,
    project_id: str,
    dataset_id: str,
    table: str,
    bucket: str,
    target_path: str,
    storage_client: storage.Client,
):
    """Export a single table or view to GCS as JSON."""
    # since views cannot be exported directly, write the data into a temporary table
    job = client.query(
        f"""
        SELECT *
        FROM {dataset_id}.{table}
        WHERE analysis_basis = 'enrollments'
    """
    )  # todo: once experimenter supports different analysis_bases, remove filter

    job.result()

    # add a random string to the identifier to prevent collision errors if there
    # happen to be multiple instances running that export data for the same experiment
    tmp = "".join(random.choices(string.ascii_lowercase, k=8))
    destination_uri = f"gs://{bucket}/{target_path}/{table}-{tmp}.ndjson"
    dataset_ref = bigquery.DatasetReference(project_id,
                                            job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    logger.info(f"Export table {table} to {destination_uri}")

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = "NEWLINE_DELIMITED_JSON"
    extract_job = client.extract_table(table_ref,
                                       destination_uri,
                                       location="US",
                                       job_config=job_config)
    extract_job.result()

    # convert ndjson to json
    _convert_ndjson_to_json(bucket, target_path, table, storage_client, tmp)
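The same pattern as example #1, extended with a target path inside the bucket and a random suffix that is passed on to _convert_ndjson_to_json so the converter can find the intermediate NDJSON object. A hedged call sketch, with every identifier below a placeholder:

from google.cloud import bigquery, storage

# Placeholder identifiers for illustration only.
project_id = "my-project"
dataset_id = "experiments"
table = "my_experiment_weekly"
bucket = "my-export-bucket"
target_path = "statistics"

_export_table(
    bigquery.Client(project=project_id),
    project_id,
    dataset_id,
    table,
    bucket,
    target_path,
    storage.Client(project=project_id),
)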
Example #3
def extract_rows(
    table_name: str = table_name,
    bucket_name: str = dest_bucket_name,
    path: str = dest_path,
    diff_type: str = diff_type,
    dest_data_project: str = dest_data_project,
    dest_dataset_name: str = dest_dataset_name,
    client: bigquery.Client = dest_client,
):
    # omit the header row and export as newline-delimited JSON
    job_config = bigquery.ExtractJobConfig(
        print_header=False, destination_format="NEWLINE_DELIMITED_JSON"
    )

    diff_type_val = DiffType[diff_type].value

    destination_uri = f"gs://{bucket_name}/{path}/{table_name}/{diff_type_val}/*"
    dataset_ref = bigquery.DatasetReference(dest_data_project, dest_dataset_name)
    table_ref = dataset_ref.table(f"{table_name}_{diff_type_val}")

    extract_job = client.extract_table(
        table_ref,
        destination_uri,
        job_config=job_config
    )  # API request
    print(f"The write destination is: {destination_uri}")
    try:
        extract_job.result()
    except GoogleCloudError as err:
        print(f"There was a {type(err)}")
        print(err)
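extract_rows takes all of its defaults from module-level names, including a DiffType enum that is not part of the snippet, and its except clause needs GoogleCloudError from google.cloud.exceptions. Below is a minimal sketch of what that surrounding setup could look like; every name, enum member, and value is assumed for illustration, and the definitions would have to appear before the def so the defaults resolve.

from enum import Enum

from google.cloud import bigquery
from google.cloud.exceptions import GoogleCloudError

# Assumed enum; the real DiffType may define different members and values.
class DiffType(Enum):
    ADDED = "added"
    REMOVED = "removed"

# Assumed module-level configuration read by extract_rows' default arguments.
table_name = "accounts"
dest_bucket_name = "my-diff-bucket"
dest_path = "exports"
diff_type = "ADDED"
dest_data_project = "my-project"
dest_dataset_name = "diffs"
dest_client = bigquery.Client(project=dest_data_project)

# Would extract my-project.diffs.accounts_added to
# gs://my-diff-bucket/exports/accounts/added/* as newline-delimited JSON.
extract_rows()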
Example #4
def query_to_dataframe(
    query: str,
    bigquery_client: bigquery.Client = None,
    storage_client: storage.Client = None,
    project: str = None,
    templocation: str = None,
) -> pd.DataFrame:
    """
    Run a query job on BigQuery and return the result in Pandas DataFrame format

    Args:
        query: BigQuery query e.g. "SELECT * FROM dataset.table"
        bigquery_client: optional pre-configured BigQuery client, created from project if not given
        storage_client: optional pre-configured Cloud Storage client, created from project if not given
        project: Google Cloud project id
        templocation: Google Cloud Storage location to store intermediate files, must start with "gs://"

    Returns: Pandas DataFrame of the query result

    """
    if isinstance(templocation, str) and not templocation.startswith("gs://"):
        raise RuntimeError('templocation must start with "gs://"')

    if bigquery_client is None:
        bigquery_client = bigquery.Client(project=project)

    if project is None:
        project = bigquery_client.project

    query_job = bigquery_client.query(query, project=project)
    query_job_state = ""

    while not query_job.done():
        if query_job.state != query_job_state:
            print(f"Query status: {query_job.state}")
            query_job_state = query_job.state
        time.sleep(5)

    if query_job.state != query_job_state:
        print(f"Query status: {query_job.state}")

    if query_job.exception():
        raise query_job.exception()

    if not templocation:
        templocation = get_default_templocation(bigquery_client,
                                                project=project)

    if templocation.endswith("/"):
        templocation += templocation[:-1]

    destination_uri = (
        f"{templocation}/bq-{datetime.now(pytz.utc).strftime('%Y%m%dT%H%M%SZ')}.avro"
    )
    extract_job_config = bigquery.job.ExtractJobConfig(
        destination_format="AVRO")
    extract_job = bigquery_client.extract_table(query_job.destination,
                                                destination_uri,
                                                job_config=extract_job_config)

    while not extract_job.done():
        time.sleep(5)

    if extract_job.exception():
        raise extract_job.exception()

    if not storage_client:
        storage_client = storage.Client(project=project)

    print("Reading query result into DataFrame")

    bucket_name, blob_name = (
        destination_uri.split("/")[2],
        "/".join(destination_uri.split("/")[3:]),
    )
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.get_blob(blob_name)
    downloaded_avro_filename = tempfile.NamedTemporaryFile().name
    blob.download_to_filename(downloaded_avro_filename)

    with open(downloaded_avro_filename, "rb") as avro_file:
        avro_reader = fastavro.reader(avro_file)
        df = pd.DataFrame.from_records(avro_reader)

    return df
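A hedged usage sketch for query_to_dataframe, preceded by the imports the function above appears to rely on (they would sit at the top of its module). The project, staging bucket, and query are placeholders; the query just points at a public BigQuery dataset, and get_default_templocation is a helper from the function's original module that is only needed when templocation is omitted.

import tempfile
import time
from datetime import datetime

import fastavro
import pandas as pd
import pytz
from google.cloud import bigquery, storage

# Placeholder project and staging location; swap in your own.
df = query_to_dataframe(
    "SELECT name, number FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 100",
    project="my-project",
    templocation="gs://my-staging-bucket/bq-exports",
)
print(df.head())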