Example #1
def main(gcp_credentials_filepath: str):
    query_sql = """
        SELECT
          id,
          CONCAT(
            'https://stackoverflow.com/questions/',
            CAST(id as STRING)) as url,
          view_count
        FROM `bigquery-public-data.stackoverflow.posts_questions`
        WHERE tags like '%google-bigquery%'
        ORDER BY view_count DESC
        LIMIT 10"""

    gcp_credentials, project_id = load_gcp_credentials_and_project_from_file(
        gcp_credentials_filepath)
    bq_client = create_bq_client(gcp_credentials, project_id)
    results_iter = execute_bq_query(bq_client, query_sql)

    # Print schema **of the results**, not the source table.
    print(results_iter.schema)

    # Evaluate results.
    results = list(results_iter)  # each row is an instance of 'bigquery.Row'

    # note: if the iterator has not been evaluated then 'total_rows' will be None.
    print(results_iter.total_rows)

    for row in results:
        print("{} - {} : {} views".format(row.id, row.url, row.view_count))
Example #2
from google.cloud.bigquery import SchemaField


def main(gcp_credentials_filepath: str):
    gcp_credentials, project_id = load_gcp_credentials_and_project_from_file(
        gcp_credentials_filepath)
    bq_client = create_bq_client(gcp_credentials, project_id)

    dataset = get_bq_dataset(bq_client, 'dataset_x_1')

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
    table_schema = [
        SchemaField('field_INT64', 'INT64', 'NULLABLE'),
        SchemaField('field_NUMERIC', 'NUMERIC', 'NULLABLE'),
        SchemaField('field_FLOAT64', 'FLOAT64', 'NULLABLE'),
        SchemaField('field_BOOL', 'BOOL', 'NULLABLE'),
        SchemaField('field_STRING', 'STRING', 'NULLABLE'),
        SchemaField('field_BYTES', 'BYTES', 'NULLABLE'),
        SchemaField('field_DATE', 'DATE', 'NULLABLE'),
        SchemaField('field_DATETIME', 'DATETIME', 'NULLABLE'),
        SchemaField('field_TIME', 'TIME', 'NULLABLE'),
        SchemaField('field_TIMESTAMP', 'TIMESTAMP', 'NULLABLE'),
    ]

    table = create_bq_table(
        bq_client, dataset, table_id='table_2', table_schema=table_schema,
        table_description='some table description')

    print("table fully-qualified ID:", table.full_table_id)
    print("table GRN:", table.path)
    print("table description:", table.description)
    print("table type:", table.table_type)
    print("table size [MiB]:", table.num_bytes / 1014)
    print("table size [rows]:", table.num_rows)
    print("table schema fields:")
    for _field in table.schema:
        print("\t", _field)
    print("table API representation:", table.to_api_repr())
Example #3
def main(gcp_credentials_filepath: str):
    gcp_credentials, project_id = load_gcp_credentials_and_project_from_file(
        gcp_credentials_filepath)
    bq_client = create_bq_client(gcp_credentials, project_id)

    dataset = create_bq_dataset(bq_client,
                                'dataset_x_1',
                                dataset_description='some description')

    print("dataset fully-qualified ID:", dataset.full_dataset_id)
    print("dataset GRN:", dataset.path)
    print("dataset description:", dataset.description)
    print("dataset API representation:", dataset.to_api_repr())
Example #4
def main(gcp_credentials_filepath: str):
    gcp_credentials, project_id = load_gcp_credentials_and_project_from_file(
        gcp_credentials_filepath)
    bq_client = create_bq_client(gcp_credentials, project_id)

    dataset = get_bq_dataset(
        bq_client, dataset_id='samples', project_id='bigquery-public-data')
    table = get_bq_table(
        bq_client, dataset_id='samples', table_id='shakespeare', project_id='bigquery-public-data')

    print("dataset fully-qualified ID:", dataset.full_dataset_id)
    print("dataset GRN:", dataset.path)
    print("dataset description:", dataset.description)
    # print("dataset API representation:", dataset.to_api_repr())

    print("table fully-qualified ID:", table.full_table_id)
    print("table GRN:", table.path)
    print("table description:", table.description)
    print("table type:", table.table_type)
    print("table size [MiB]:", table.num_bytes / 1014)
    print("table size [rows]:", table.num_rows)
    print("table schema fields:")
    for _field in table.schema:
        print("\t", _field)
Example #5
from datetime import date, datetime
from decimal import Decimal
import math


def main(gcp_credentials_filepath: str):
    gcp_credentials, project_id = load_gcp_credentials_and_project_from_file(
        gcp_credentials_filepath)
    bq_client = create_bq_client(gcp_credentials, project_id)

    table = get_bq_table(bq_client, 'dataset_x_1', 'table_2')

    # note: to generate a valid value for table field type 'TIME' we tried a few functions in
    #   :mod:`time` but had no luck.
    rows_to_insert = [
        (
            24 * 38,  # SchemaField('field_INT64', 'INT64', 'NULLABLE'),
            Decimal('20.83'),  # SchemaField('field_NUMERIC', 'NUMERIC', 'NULLABLE'),
            math.e ** math.pi,  # SchemaField('field_FLOAT64', 'FLOAT64', 'NULLABLE'),
            True,  # SchemaField('field_BOOL', 'BOOL', 'NULLABLE'),
            "Jürgen loves Ω! ✔ \n\r\t 123",  # SchemaField('field_STRING', 'STRING', 'NULLABLE'),
            b'SsO8cmdlbiBsb3ZlcyDOqSEg4pyUIAoNCSAxMjM=',  # SchemaField('field_BYTES', 'BYTES', 'NULLABLE'),
            date.today(),  # SchemaField('field_DATE', 'DATE', 'NULLABLE'),
            datetime.now(),  # SchemaField('field_DATETIME', 'DATETIME', 'NULLABLE'),
            datetime.now().time(),  # SchemaField('field_TIME', 'TIME', 'NULLABLE'),
            datetime.now(),  # SchemaField('field_TIMESTAMP', 'TIMESTAMP', 'NULLABLE'),
        ),
    ]

    # warning: 'insert_rows' performs serialization to JSON-compatible native Python types,
    #   and then calls 'insert_rows_json'. Obviously this impacts performance.
    # warning: if 'insert_rows' does not raise any exceptions it **does not** mean the insertion
    #   succeeded.
    # warning: this kind of insert is actually implemented as **Streaming Data into BigQuery**
    #   https://cloud.google.com/bigquery/streaming-data-into-bigquery
    # TODO: insertion using load jobs (a hedged sketch follows this example)
    #   https://cloud.google.com/bigquery/docs/loading-data-local#loading_data_from_a_local_data_source
    #   https://github.com/googleapis/google-cloud-python/blob/master/bigquery/docs/snippets.py
    #   https://googleapis.github.io/google-cloud-python/latest/bigquery/usage/tables.html#creating-a-table
    #   https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html#google.cloud.bigquery.client.Client.load_table_from_file
    errors = bq_client.insert_rows(
        table, rows_to_insert, skip_invalid_rows=False, ignore_unknown_values=False)
    if errors:
        raise RuntimeError("insert_rows reported errors: {}".format(errors))

    query_sql = """
        SELECT
          *
        FROM `dataset_x_1.table_2`
        WHERE true
        LIMIT 10"""

    results_iter = execute_bq_query(bq_client, query_sql)

    # Print schema **of the results**, not the source table.
    print(results_iter.schema)

    # Evaluate results.
    results = list(results_iter)  # each row is an instance of 'bigquery.Row'

    # note: if the iterator has not been evaluated then 'total_rows' will be None.
    print(results_iter.total_rows)

    for row in results:
        print(list(row.items()))
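For the load-job approach mentioned in the TODO above, a hedged sketch using 'Client.load_table_from_json' (not part of the original source; it assumes rows are supplied as dicts with JSON-serializable values):

from google.cloud import bigquery


def insert_rows_via_load_job(bq_client: bigquery.Client, table: bigquery.Table, json_rows):
    # Load jobs are the batch alternative to streaming inserts; they are asynchronous,
    # so 'result' blocks until the job completes or raises on failure.
    job_config = bigquery.LoadJobConfig(schema=table.schema)
    load_job = bq_client.load_table_from_json(json_rows, table, job_config=job_config)
    return load_job.result()

Note that unlike 'insert_rows' above, the rows would need to be dicts keyed by field name with JSON-compatible values (e.g. ISO-formatted dates and times) rather than tuples of Python objects.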