def instance_database_bigquery(name_database):
    dataset_ref = client.dataset(name_database)
    dataset = bigquery.Dataset(dataset_ref)
    return dataset
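
Note that client.dataset() is deprecated in newer releases of google-cloud-bigquery; an equivalent sketch using an explicit DatasetReference (the project id below is a placeholder) looks like this:

# Equivalent sketch with an explicit DatasetReference instead of the
# deprecated client.dataset() helper; the project id is a placeholder.
from google.cloud import bigquery

def instance_database_bigquery(name_database, project="your-project"):
    dataset_ref = bigquery.DatasetReference(project, name_database)
    return bigquery.Dataset(dataset_ref)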
Example #2
def test_data_scientist_tutorial(delete_dataset):
    # [START bqml_data_scientist_tutorial_create_dataset]
    dataset = bigquery.Dataset(client.dataset('bqml_tutorial'))
    dataset.location = 'US'
    client.create_dataset(dataset)
    # [END bqml_data_scientist_tutorial_create_dataset]

    # [START bqml_data_scientist_tutorial_create_model]
    sql = """
        CREATE OR REPLACE MODEL `bqml_tutorial.sample_model`
        OPTIONS(model_type='logistic_reg') AS
        SELECT
            IF(totals.transactions IS NULL, 0, 1) AS label,
            IFNULL(device.operatingSystem, "") AS os,
            device.isMobile AS is_mobile,
            IFNULL(geoNetwork.country, "") AS country,
            IFNULL(totals.pageviews, 0) AS pageviews
        FROM
            `bigquery-public-data.google_analytics_sample.ga_sessions_*`
        WHERE
            _TABLE_SUFFIX BETWEEN '20160801' AND '20170630'
    """
    df = client.query(sql).to_dataframe()
    print(df)
    # [END bqml_data_scientist_tutorial_create_model]

    # [START bqml_data_scientist_tutorial_get_training_statistics]
    sql = """
        SELECT
        *
        FROM
        ML.TRAINING_INFO(MODEL `bqml_tutorial.sample_model`)
    """
    df = client.query(sql).to_dataframe()
    print(df)
    # [END bqml_data_scientist_tutorial_get_training_statistics]

    # [START bqml_data_scientist_tutorial_evaluate_model]
    sql = """
        SELECT
            *
        FROM ML.EVALUATE(MODEL `bqml_tutorial.sample_model`, (
            SELECT
                IF(totals.transactions IS NULL, 0, 1) AS label,
                IFNULL(device.operatingSystem, "") AS os,
                device.isMobile AS is_mobile,
                IFNULL(geoNetwork.country, "") AS country,
                IFNULL(totals.pageviews, 0) AS pageviews
            FROM
                `bigquery-public-data.google_analytics_sample.ga_sessions_*`
            WHERE
                _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))
    """
    df = client.query(sql).to_dataframe()
    print(df)
    # [END bqml_data_scientist_tutorial_evaluate_model]

    # [START bqml_data_scientist_tutorial_predict_transactions]
    sql = """
        SELECT
            country,
            SUM(predicted_label) as total_predicted_purchases
        FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (
            SELECT
                IFNULL(device.operatingSystem, "") AS os,
                device.isMobile AS is_mobile,
                IFNULL(totals.pageviews, 0) AS pageviews,
                IFNULL(geoNetwork.country, "") AS country
            FROM
                `bigquery-public-data.google_analytics_sample.ga_sessions_*`
            WHERE
                _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))
            GROUP BY country
            ORDER BY total_predicted_purchases DESC
            LIMIT 10
    """
    df = client.query(sql).to_dataframe()
    print(df)
    # [END bqml_data_scientist_tutorial_predict_transactions]

    # [START bqml_data_scientist_tutorial_predict_purchases]
    sql = """
        SELECT
            fullVisitorId,
            SUM(predicted_label) as total_predicted_purchases
        FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (
            SELECT
                IFNULL(device.operatingSystem, "") AS os,
                device.isMobile AS is_mobile,
                IFNULL(totals.pageviews, 0) AS pageviews,
                IFNULL(geoNetwork.country, "") AS country,
                fullVisitorId
            FROM
                `bigquery-public-data.google_analytics_sample.ga_sessions_*`
            WHERE
                _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))
            GROUP BY fullVisitorId
            ORDER BY total_predicted_purchases DESC
            LIMIT 10
    """
    df = client.query(sql).to_dataframe()
    print(df)
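
The test above relies on a module-level client and a delete_dataset fixture that cleans up the bqml_tutorial dataset after the run; a minimal sketch of that wiring (the fixture body is an assumption, not part of the original sample) could be:

# Hypothetical setup and teardown for the test above.
import pytest
from google.cloud import bigquery

client = bigquery.Client()

@pytest.fixture
def delete_dataset():
    yield
    # Remove everything the test created, ignoring a missing dataset.
    client.delete_dataset(
        "bqml_tutorial", delete_contents=True, not_found_ok=True
    )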
Example #3
def create_dataset(dataset_name, bigquery_client, location=DEFAULT_LOCATION):
    dataset = bigquery.Dataset('{project_id}.{dataset_name}'.format(
        project_id=bigquery_client.project, dataset_name=dataset_name))
    dataset.location = location
    return bigquery_client.create_dataset(dataset, exists_ok=True)
Example #4
import requests
import os
from google.cloud import bigquery
from google.oauth2 import service_account
from urllib.parse import urlparse, urlencode, urlunparse, parse_qsl

# Construct a credentials object
credentials = service_account.Credentials.from_service_account_file(
    '<path of your service account>',
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# Construct a BigQuery client object.
bigquery_client = bigquery.Client(credentials=credentials,
                                  project=credentials.project_id)
dataset = bigquery.Dataset('<project-name>.<schema-name>')


def set_query_field(url, field, value, replace=False):
    # Parse out the different parts of the URL.
    components = urlparse(url)
    query_pairs = parse_qsl(urlparse(url).query)

    if replace:
        query_pairs = [(f, v) for (f, v) in query_pairs if f != field]
    query_pairs.append((field, value))

    new_query_str = urlencode(query_pairs)

    # Finally, construct the new URL
    new_components = (components.scheme, components.netloc, components.path,
                      components.params, new_query_str, components.fragment)
    return urlunparse(new_components)
Example #5
def _create_dataset(client, dataset_ref, location):
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = location
    return client.create_dataset(dataset)
Example #6
from google.cloud import bigquery
from google.api_core.exceptions import Conflict

from pysparkdp.config import Config, logger

Config().init_connect()

bc = bigquery.Client()

# ds == dataset
# Dataset IDs must be alphanumeric (plus underscores)
# and must be at most 1024 characters long.
ds_id = "bigquery_python"
ds_ref = bc.dataset(ds_id)
ds = bigquery.Dataset(ds_ref)
try:
    ds = bc.create_dataset(ds)
except Conflict:
    logger.info("Dataset %s already exists", ds_id)
    ds = bc.get_dataset(ds_ref)

table_id = "lines"
table_ref = ds_ref.table(table_id)
table = bigquery.Table(table_ref)
table.description = "Lines from GS process"
table.schema = [
    bigquery.SchemaField('line', 'STRING', mode='REQUIRED'),
    bigquery.SchemaField('id', 'INTEGER', mode='REQUIRED')
]
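
# Hypothetical continuation (not in the original snippet): create the table,
# reusing the Conflict-handling pattern from the dataset step above.
try:
    table = bc.create_table(table)
except Conflict:
    logger.info("Table %s already exists", table_id)
    table = bc.get_table(table_ref)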
Example #7
    def load_csv_to_table(self,
                          dataset_id,
                          table_id,
                          csv_path,
                          auto_detect_schema=True,
                          schema=[],
                          skip_rows=0,
                          append_to_table=False):
        '''
        :param dataset_id: dataset name
        :param table_id: table name
        :param csv_path: path to exported csv file
        :param auto_detect_schema: auto-detect schema types from the CSV (default True)
        :param schema: list of bigquery.SchemaField objects (default []), e.g.
            schema = [
                bigquery.SchemaField('full_name', 'STRING', mode='REQUIRED', description='blah'),
                bigquery.SchemaField('age', 'INTEGER', mode='REQUIRED'),
            ]
        :param skip_rows: how many leading rows of the CSV to skip (default 0)
        :param append_to_table: append to the existing table instead of requiring an empty one (default False)
        :return: None on success, or the list of load job errors on failure

        Details on loading local data
        https://cloud.google.com/bigquery/docs/loading-data-local
        '''

        location = 'US'

        dataset_ref = self.client.get_dataset(dataset_id)
        table_ref = dataset_ref.table(table_id)
        dataset = bigquery.Dataset(dataset_ref)
        dataset.location = location

        job_config = bigquery.LoadJobConfig()
        job_config.source_format = bigquery.SourceFormat.CSV
        # https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv#overwriting_a_table_with_csv_data
        #job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY
        job_config.skip_leading_rows = skip_rows
        job_config.quote_character = '\"'

        if not append_to_table:
            job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY

            if auto_detect_schema:
                assert schema == [], 'Auto-detect is True, but schema is specified'
                job_config.autodetect = True
            else:
                job_config.autodetect = False
                assert schema != [], 'Auto-detect is False, but no schema specified'
                job_config.schema = schema

        with open(csv_path, 'rb') as csv_file:
            load_table_job = self.client.load_table_from_file(
                csv_file,
                table_ref,
                location=location,
                job_config=job_config,
            )  # API request

        log.info('Uploading table... ')
        #print('Uploading table... ')
        try:
            load_table_job.result()  # Wait for load to complete
        except Exception:
            errors = load_table_job.errors
            log.error(errors)
            return errors
        print(load_table_job.error_result)
        table = self.client.get_table(table_ref)
        log.info('''
                {} rows loaded into
                table {} in dataset {}
                in project: {}.
                '''.format(load_table_job.output_rows, table.table_id,
                           dataset_id, self.client.project))
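
For reference, the same local-CSV load pattern can be written as a short self-contained sketch; the client defaults, table id, and file path below are placeholders rather than part of the class above:

# Minimal, self-contained sketch of loading a local CSV with schema autodetect.
from google.cloud import bigquery

client = bigquery.Client()
table_id = "your-project.your_dataset.your_table"  # placeholder

job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    autodetect=True,
    write_disposition=bigquery.WriteDisposition.WRITE_EMPTY,
)

with open("data/export.csv", "rb") as csv_file:  # placeholder path
    job = client.load_table_from_file(csv_file, table_id, job_config=job_config)
job.result()  # wait for the load to finish
print("Loaded {} rows.".format(job.output_rows))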
Example #8
import json
import os

from google.api_core.exceptions import NotFound
from google.cloud import bigquery

#BigQuery connection
bigquery_client = bigquery.Client.from_service_account_json(
    os.environ['bigquery_creds'])

#Create BigQuery structures if they do not exist
#Define dataset and table
dataset = bigquery_client.dataset('trafficdata')
table = dataset.table('chicagodata')

#Create dataset if it does not exist
try:
    bigquery_client.get_dataset(dataset)
except NotFound:
    dataset_ref = bigquery_client.create_dataset(bigquery.Dataset(dataset))
    print('Dataset created.')

#Create table if it does not exist
try:
    bigquery_client.get_table(table)
except NotFound:
    schema = [
        bigquery.SchemaField('dateAdded', 'TIMESTAMP', mode='REQUIRED'),
        bigquery.SchemaField('segmentid', 'INTEGER', mode='REQUIRED'),
        bigquery.SchemaField('_strheading', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('_traffic', 'INTEGER', mode='REQUIRED'),
        bigquery.SchemaField('_tost', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('_fromst', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('_last_updt', 'TIMESTAMP', mode='REQUIRED'),
        bigquery.SchemaField('_length', 'FLOAT', mode='REQUIRED'),
Example #9
 def createDatasetBQ(self, dataset_id, client):
     dataset = bigquery.Dataset(dataset_id)
     dataset.location = "asia-south1"
     dataset = client.create_dataset(dataset)
     print("Created dataset {}.{}".format(client.project, dataset.dataset_id))
     return dataset
Example #10
# Imports assumed by this snippet
from google.api_core.exceptions import NotFound
from google.cloud import bigquery

# Create new Google BigQuery client using Cloud Platform project default
client = bigquery.Client()

# Reference to project id and new or existing dataset
project_id = "bigquery-tour"
dataset_id = "tour_dataset"  # dataset IDs may only contain letters, numbers, and underscores
full_dataset_id = "{}.{}".format(project_id, dataset_id)

# Check to see if dataset already exists within project, if not then create new BigQuery dataset
try:
    dataset = client.get_dataset(full_dataset_id)
    print("Dataset {} exists".format(full_dataset_id))
except NotFound:
    print("Dataset {} not found, creating new dataset".format(full_dataset_id))
    db = bigquery.Dataset(full_dataset_id)
    dataset = client.create_dataset(db)

# Reference to new table for storing query results
table_id = dataset.table("nearest_ports")

# Configure query job and set destination as new table mentioned above
job_config = bigquery.QueryJobConfig(destination=table_id)

# Query to find 5 nearest ports to Singapore's JURONG ISLAND port
query = """
    SELECT port_name, ST_DISTANCE(port_geom, (SELECT port_geom
                                              FROM `bigquery-public-data.geo_international_ports.world_port_index`
                                              WHERE country = "SG" AND port_name = "JURONG ISLAND")) AS distance_in_meters
    FROM `bigquery-public-data.geo_international_ports.world_port_index`
    ORDER BY 2
    LIMIT 5
"""

# Run the query, writing the results to the destination table configured above
query_job = client.query(query, job_config=job_config)
query_job.result()  # Wait for the query to finish
Example #11
def main(mysqlConfigs, redshiftConfigs):
  SCHEMA = [
      bigquery.SchemaField('id', 'INTEGER', mode='REQUIRED'),
      bigquery.SchemaField('name', 'STRING', mode='REQUIRED'),
  ]

  project_id = os.getenv('PROJECT_ID')
  dataset_id = 'testdb'
  table_id = 'testtbl'

  client = bigquery.Client()

  
  client.delete_dataset(client.dataset(dataset_id), delete_contents=True)

  dataset = bigquery.Dataset(client.dataset(dataset_id))
  dataset.location = 'US'  # location must be set before the dataset is created
  dataset = client.create_dataset(dataset)

  table = bigquery.Table(dataset.table(table_id), schema=SCHEMA)
  table = client.create_table(table)

  query = "SELECT * FROM `{}`.`{}`.`{}` limit 100".format(project_id, dataset_id, table_id)


  client = bigquery.Client()
  table_ref = client.dataset(dataset_id).table(table_id)
  '''  
  table = client.get_table(table_ref)
  rows_to_insert = [
      ('Phred Phlyntstone', 32),
      ('Wylma Phlyntstone', 1),
  ]
  errors = client.insert_rows(table, rows_to_insert)
  print(errors)
  assert errors == []
  '''

  query_job = client.query(query, location='US')
  for row in query_job:
      print(row)
  conn = pymysql.connect(**mysqlConfigs)


  stream = BinLogStreamReader(
    connection_settings = mysqlConfigs,
    server_id=100,
    blocking=True,
    resume_stream=True,
    only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent])

  cursor = conn.cursor()
  for binlogevent in stream:
    e_start_pos, last_pos = stream.log_pos, stream.log_pos
    #print([a for a in dir(binlogevent) if not a.startswith('__')])
    for row in binlogevent.rows:
      event = {"schema": binlogevent.schema,
      "table": binlogevent.table,
      "type": type(binlogevent).__name__,
      "row": row
      }

      #if isinstance(binlog_event, QueryEvent) and binlog_event.query == 'BEGIN':
      #  e_start_pos = last_pos
      #print(json.dumps(event))
      binlog2sql = concat_sql_from_binlog_event(cursor=cursor, binlog_event=binlogevent, row=row, e_start_pos=e_start_pos).replace('`', "").replace('testtbl', '`testdb.testtbl`')
      print(binlog2sql)

      query_job = client.query(binlog2sql, location='US')
      result = query_job.result()
Example #12
def run_natality_tutorial(override_values={}):
    # [START bigquery_query_natality_tutorial]
    """Create a Google BigQuery linear regression input table.

    In the code below, the following actions are taken:
    * A new dataset is created "natality_regression."
    * A query is run against the public dataset,
        bigquery-public-data.samples.natality, selecting only the data of
        interest to the regression, the output of which is stored in a new
        "regression_input" table.
    * The output table is moved over the wire to the user's default project via
        the built-in BigQuery Connector for Spark that bridges BigQuery and
        Cloud Dataproc.
    """

    from google.cloud import bigquery

    # Create a new Google BigQuery client using Google Cloud Platform project
    # defaults.
    client = bigquery.Client()

    # Prepare a reference to a new dataset for storing the query results.
    dataset_id = 'natality_regression'
    # [END bigquery_query_natality_tutorial]
    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    dataset_id = override_values.get("dataset_id", dataset_id)
    # [START bigquery_query_natality_tutorial]

    dataset = bigquery.Dataset(client.dataset(dataset_id))

    # Create the new BigQuery dataset.
    dataset = client.create_dataset(dataset)

    # In the new BigQuery dataset, create a reference to a new table for
    # storing the query results.
    table_ref = dataset.table('regression_input')

    # Configure the query job.
    job_config = bigquery.QueryJobConfig()

    # Set the destination table to the table reference created above.
    job_config.destination = table_ref

    # Set up a query in Standard SQL, which is the default for the BigQuery
    # Python client library.
    # The query selects the fields of interest.
    query = """
        SELECT
            weight_pounds, mother_age, father_age, gestation_weeks,
            weight_gain_pounds, apgar_5min
        FROM
            `bigquery-public-data.samples.natality`
        WHERE
            weight_pounds IS NOT NULL
            AND mother_age IS NOT NULL
            AND father_age IS NOT NULL
            AND gestation_weeks IS NOT NULL
            AND weight_gain_pounds IS NOT NULL
            AND apgar_5min IS NOT NULL
    """

    # Run the query.
    query_job = client.query(query, job_config=job_config)
    query_job.result()  # Waits for the query to finish
Example #13
 def get_dataset(self, name):
     dataset_ref = self._bigquery_client.dataset(name)
     return bigquery.Dataset(dataset_ref)
Example #14
 def create_dataset(self, name):
     dataset_ref = self._bigquery_client.dataset(name)
     dataset = bigquery.Dataset(dataset_ref)
     dataset = self._bigquery_client.create_dataset(dataset)
     return dataset
Example #15
def test_load_table_add_column(client, to_delete):
    dataset_id = "load_table_add_column_{}".format(_millis())
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = "US"
    dataset = client.create_dataset(dataset)
    to_delete.append(dataset)

    snippets_dir = os.path.abspath(os.path.dirname(__file__))
    filepath = os.path.join(snippets_dir, "..", "tests", "data", "people.csv")
    table_ref = dataset_ref.table("my_table")
    old_schema = [bigquery.SchemaField("full_name", "STRING", mode="REQUIRED")]
    table = client.create_table(bigquery.Table(table_ref, schema=old_schema))

    # [START bigquery_add_column_load_append]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # dataset_ref = client.dataset('my_dataset')
    # filepath = 'path/to/your_file.csv'

    # Retrieves the destination table and checks the length of the schema
    table_id = "my_table"
    table_ref = dataset_ref.table(table_id)
    table = client.get_table(table_ref)
    print("Table {} contains {} columns.".format(table_id, len(table.schema)))

    # Configures the load job to append the data to the destination table,
    # allowing field addition
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.schema_update_options = [
        bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
    ]
    # In this example, the existing table contains only the 'full_name' column.
    # 'REQUIRED' fields cannot be added to an existing schema, so the
    # additional column must be 'NULLABLE'.
    job_config.schema = [
        bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("age", "INTEGER", mode="NULLABLE"),
    ]
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1

    with open(filepath, "rb") as source_file:
        job = client.load_table_from_file(
            source_file,
            table_ref,
            location="US",  # Must match the destination dataset location.
            job_config=job_config,
        )  # API request

    job.result()  # Waits for table load to complete.
    print(
        "Loaded {} rows into {}:{}.".format(
            job.output_rows, dataset_id, table_ref.table_id
        )
    )

    # Checks the updated length of the schema
    table = client.get_table(table)
    print("Table {} now contains {} columns.".format(table_id, len(table.schema)))
    # [END bigquery_add_column_load_append]
    assert len(table.schema) == 2
    assert table.num_rows > 0
Example #16
 def _create_dataset(self):
     from google.cloud import bigquery
     dataset = bigquery.Dataset(self.full_dataset_name)
     dataset.location = 'US'
     self.client.create_dataset(dataset, timeout=30)
Example #17
def test_load_table_relax_column(client, to_delete):
    dataset_id = "load_table_relax_column_{}".format(_millis())
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = "US"
    dataset = client.create_dataset(dataset)
    to_delete.append(dataset)

    snippets_dir = os.path.abspath(os.path.dirname(__file__))
    filepath = os.path.join(snippets_dir, "..", "tests", "data", "people.csv")
    table_ref = dataset_ref.table("my_table")
    old_schema = [
        bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("favorite_color", "STRING", mode="REQUIRED"),
    ]
    table = client.create_table(bigquery.Table(table_ref, schema=old_schema))

    # [START bigquery_relax_column_load_append]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # dataset_ref = client.dataset('my_dataset')
    # filepath = 'path/to/your_file.csv'

    # Retrieves the destination table and checks the number of required fields
    table_id = "my_table"
    table_ref = dataset_ref.table(table_id)
    table = client.get_table(table_ref)
    original_required_fields = sum(field.mode == "REQUIRED" for field in table.schema)
    # In this example, the existing table has 3 required fields.
    print("{} fields in the schema are required.".format(original_required_fields))

    # Configures the load job to append the data to a destination table,
    # allowing field relaxation
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.schema_update_options = [
        bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION
    ]
    # In this example, the existing table contains three required fields
    # ('full_name', 'age', and 'favorite_color'), while the data to load
    # contains only the first two fields.
    job_config.schema = [
        bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1

    with open(filepath, "rb") as source_file:
        job = client.load_table_from_file(
            source_file,
            table_ref,
            location="US",  # Must match the destination dataset location.
            job_config=job_config,
        )  # API request

    job.result()  # Waits for table load to complete.
    print(
        "Loaded {} rows into {}:{}.".format(
            job.output_rows, dataset_id, table_ref.table_id
        )
    )

    # Checks the updated number of required fields
    table = client.get_table(table)
    current_required_fields = sum(field.mode == "REQUIRED" for field in table.schema)
    print("{} fields in the schema are now required.".format(current_required_fields))
    # [END bigquery_relax_column_load_append]
    assert original_required_fields - current_required_fields == 1
    assert len(table.schema) == 3
    assert table.schema[2].mode == "NULLABLE"
    assert table.num_rows > 0
Example #18
#### Authentication

credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=scopes)

client = bigquery.Client(credentials=credentials, project=project)

#### Create dataset if it does not exist

dataset_id_full = f'{client.project}.{dataset_id}'
table_id_full = f"{project}.{dataset_id}.{table_id}"

datasets = list(client.list_datasets())
if dataset_id not in [d.dataset_id for d in datasets]:
    dataset = bigquery.Dataset(dataset_id_full)
    dataset = client.create_dataset(dataset)
else:
    dataset = client.get_dataset(dataset_id_full)

#### sort schema_json in same order as df columns
def _get_schema(schema_fname, csv_fname, table_id):
    with open(schema_fname, 'r') as fp:
        schema_json = json.load(fp)[table_id]
    df = pd.read_csv(csv_fname)
    schema = list()
    for key in df.columns:
        val = schema_json[key]
        schema.append(
            bigquery.SchemaField(val['name'], val['type'], val['mode'])
        )
    return schema
Example #19
    def setup(self):
        """Sets up resources for tests.
        """
        # set up bq datasets to hold staging tables
        self.staging_dataset_id = "bqbml_test_staging_dataset"
        self.resized_dataset_id = "bqbml_test_resized_staging_dataset"
        self.bq_client = bigquery.Client()
        staging_dataset_ref = self.bq_client.dataset(self.staging_dataset_id)
        self.staging_dataset = bigquery.Dataset(staging_dataset_ref)
        resized_staging_dataset_ref = self.bq_client.dataset(
            self.resized_dataset_id)
        self.resized_dataset = bigquery.Dataset(resized_staging_dataset_ref)

        # define path that holds schemas used for creating staging tables
        abs_path = os.path.abspath(os.path.dirname(__file__))
        self.json_schema_path = os.path.join(abs_path, 'test_schemas')

        # add test schemas to schema path
        schema_50_string_50_numeric = {
            "fields": [{
                "type": "STRING",
                "name": "string1",
                "mode": "REQUIRED"
            }, {
                "type": "STRING",
                "name": "string2",
                "mode": "REQUIRED"
            }, {
                "type": "NUMERIC",
                "name": "numeric1",
                "mode": "REQUIRED"
            }, {
                "type": "NUMERIC",
                "name": "numeric2",
                "mode": "REQUIRED"
            }]
        }
        with open(self.json_schema_path + '/50_STRING_50_NUMERIC_4.json',
                  'w') as sch2:
            json.dump(schema_50_string_50_numeric, sch2)

        # set up test params
        self.test_file_parameters = {
            'fileType': ['csv', 'json'],
            'fileCompressionTypes': {
                'csv': ['none'],
                'json': ['none']
            },
            'numColumns': [4],
            'numFiles': [1, 100, 1000, 10000],
            'targetDataSizes': [.000001],
            'stagingDataSizes': ['1KB'],
            'columnTypes': ['50_STRING_50_NUMERIC'],
        }

        # set up GCS resources needed for dataglow job
        df_staging_bucket_id = 'bq_benchmark_dataflow_test'
        gcs_client = storage.Client()
        self.df_staging_bucket = gcs_client.create_bucket(df_staging_bucket_id)
        staging_blob = self.df_staging_bucket.blob('staging/')
        temp_blob = self.df_staging_bucket.blob('temp/')
        staging_blob.upload_from_string('')
        temp_blob.upload_from_string('')
        self.df_staging_path = 'gs://{0:1}/staging'.format(
            df_staging_bucket_id)
        self.df_temp_path = 'gs://{0:1}/temp'.format(df_staging_bucket_id)
Example #20
    def submit(self, sql, create, dml=None):
        """
        Submit the sql query to create a de-identified table.

        :param sql:  The sql to send.
        :param create: a flag to identify if this query should create a new
            table or append to an existing table.
        :param dml:  boolean flag identifying if a statement is a dml statement
        """
        dml = False if dml is None else dml
        table_name = self.get_tablename()
        client = bq.Client.from_service_account_json(self.private_key)
        #
        # Let's make sure the out dataset exists
        datasets = list(client.list_datasets())
        found = np.sum(
            [1 for dataset in datasets if dataset.dataset_id == self.odataset])
        if not found:
            dataset = bq.Dataset(client.dataset(self.odataset))
            client.create_dataset(dataset)

        # create the output table
        if create:
            LOGGER.info('creating new table:\t%s', self.tablename)
            bq_utils.create_standard_table(self.tablename,
                                           self.tablename,
                                           drop_existing=True,
                                           dataset_id=self.odataset)
            write_disposition = bq_consts.WRITE_EMPTY
        else:
            write_disposition = bq_consts.WRITE_APPEND
            LOGGER.info('appending results to table:\t%s', self.tablename)

        job = bq.QueryJobConfig()
        job.priority = self.priority
        job.dry_run = True

        dml_job = None
        if not dml:
            job.destination = client.dataset(self.odataset).table(
                self.tablename)
            job.use_query_cache = True
            job.allow_large_results = True
            job.write_disposition = write_disposition
            if self.partition:
                job._properties['timePartitioning'] = {'type': 'DAY'}
                job._properties['clustering'] = {'field': 'person_id'}
        else:
            # create a copy of the job config to use if the dry-run passes
            dml_job = copy(job)

        LOGGER.info(
            'submitting a dry-run for:\t%s\t\tpriority:\t%s\t\tpartition:\t%s',
            self.get_tablename(), self.priority, self.partition)

        logpath = os.path.join(self.logpath, self.idataset)
        try:
            os.makedirs(logpath)
        except OSError:
            # log path already exists and we don't care
            pass

        try:
            response = client.query(sql, location='US', job_config=job)
        except Exception:
            LOGGER.exception(
                'dry run query failed for:\t%s\n'
                '\t\tSQL:\t%s\n'
                '\t\tjob config:\t%s', self.get_tablename(), sql, job)
        else:

            if response.state == 'DONE':
                if dml_job:
                    job = dml_job

                job.dry_run = False

                LOGGER.info('dry-run passed.  submitting query for execution.')

                response = client.query(sql, location='US', job_config=job)
                LOGGER.info(
                    'submitted a %s job for table:\t%s\t\tstatus:\t%s\t\tvalue:\t%s',
                    'bigquery', table_name, 'pending', response.job_id)
                self.wait(client, response.job_id)
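
The dry-run-then-execute pattern used by submit() can also be reproduced on its own; a minimal sketch (the SQL text is a placeholder):

# Standalone sketch of a dry run followed by the real query.
from google.cloud import bigquery

client = bigquery.Client()
sql = "SELECT 1 AS x"  # placeholder query

dry_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
dry_job = client.query(sql, job_config=dry_config)  # validates without running
print("Dry run would process {} bytes.".format(dry_job.total_bytes_processed))

# If the dry run raised no errors, submit the query for real.
rows = client.query(sql).result()  # waits for completion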
Example #21
 def __enter__(self):
     client = bigquery.Client()
     dataset = bigquery.Dataset(f"{client.project}.{self.name}")
     dataset.location = "US"
     dataset = client.create_dataset(dataset, exists_ok=True)
     return dataset
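
The matching __exit__ is not shown in this snippet; a plausible counterpart (an assumption) deletes the temporary dataset on the way out:

 # Hypothetical counterpart to __enter__ above.
 def __exit__(self, exc_type, exc_value, traceback):
     client = bigquery.Client()
     client.delete_dataset(
         f"{client.project}.{self.name}", delete_contents=True, not_found_ok=True
     )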
Example #22
# In[ ]:


get_ipython().run_cell_magic('bash', '', 'gcloud config set project $PROJECT\ngcloud config set compute/region $REGION')


# ## Create BigQuery tables

# If you have not already created a BigQuery dataset for our data, run the following cell:

# In[ ]:


# Create a BigQuery dataset for our data
bq = bigquery.Client(project = PROJECT)
dataset = bigquery.Dataset(bq.dataset("taxifare"))

try:
    bq.create_dataset(dataset)
    print("Dataset created")
except Exception:
    print("Dataset already exists")


# Let's create a table with 1 million examples.
# 
# Note that the order of columns is exactly what was in our CSV files.

# In[ ]:
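
# The cell that actually creates the table is missing from this capture. A
# hypothetical sketch (destination table name, source table, and column list
# are placeholders) would materialize the rows with a destination-table query:

table_ref = dataset.table("sample_table")  # placeholder table name
job_config = bigquery.QueryJobConfig(destination=table_ref)

sql = """
    SELECT *
    FROM `your-project.your_dataset.your_source_table`  -- placeholder source
    LIMIT 1000000
"""
bq.query(sql, job_config=job_config).result()
print("Table created")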

Example #23
    def create_resized_tables(self):
        """Creates resized staging tables using the targetDataSizes parameters.

        Utilizes the bq_table_resizer module from the Dataflow Data
        Generator tool to create resized staging tables.

        """
        staging_dataset_ref = self.bq_client.dataset(self.staging_dataset_id)
        staging_dataset = bigquery.Dataset(staging_dataset_ref)
        sizes = self.file_params['targetDataSizes']
        sizes.sort()
        abs_path = os.path.abspath(os.path.dirname(__file__))
        bq_resizer_path = os.path.join(
            abs_path,
            '../../dataflow-data-generator/bigquery-scripts'
        )
        # Gather staging tables that were created in
        # self.create_staging_tables()
        tables = list(self.bq_client.list_tables(staging_dataset))
        for table in tables:
            for i in range(0, len(sizes)):
                # If the size of the current iteration is the smallest size
                # in the sizes list, use the corresponding staging table
                # created in create_resized_table() as the source base table.
                # Otherwise, if the file size of the current iteration is
                # greater than the last iteration, use the resized table from
                # the previous iteration as the source base table when running
                # so that each table can take advantage of the size of the last.
                size = sizes[i]
                if size > sizes[i - 1]:
                    base = sizes[i - 1]
                    source_table = get_resized_table_name(
                        table.table_id,
                        base
                    )
                    source_dataset = self.resized_dataset_id
                else:
                    source_table = table.table_id
                    source_dataset = self.staging_dataset_id

                destination_table = get_resized_table_name(
                    table.table_id,
                    size
                )
                target_gb = size
                command_str = ('python {0:s}/bq_table_resizer.py '
                               '--project {1:s} '
                               '--source_dataset {2:s} '
                               '--source_table {3:s} '
                               '--destination_dataset {4:s} '
                               '--destination_table {5:s} '
                               '--target_gb {6:f} ')
                command = command_str.format(
                    bq_resizer_path,
                    self.project,
                    source_dataset,
                    source_table,
                    self.resized_dataset_id,
                    destination_table,
                    target_gb,
                )
                os.system(command)
                logging.info('Created resized table from {0:s}'.format(
                    source_table
                ))
                logging.info('Resized table complete: {0:s}'.format(
                    destination_table
                ))
Example #24
def bigquery(data_directory, ignore_missing_dependency, **params):
    try:
        import google.api_core.exceptions
        from google.cloud import bigquery
    except ImportError:
        msg = 'google-cloud-bigquery dependency is missing'
        if ignore_missing_dependency:
            logger.warning('Ignored: %s', msg)
            return 0
        else:
            raise click.ClickException(msg)

    project_id = os.environ['GOOGLE_BIGQUERY_PROJECT_ID']
    bqclient = bigquery.Client(project=project_id)

    # Create testing dataset.
    testing_dataset = bqclient.dataset('testing')
    try:
        bqclient.create_dataset(bigquery.Dataset(testing_dataset))
    except google.api_core.exceptions.Conflict:
        pass  # Skip if already created.

    # Set up main data table.
    data_directory = Path(data_directory)
    functional_alltypes_path = data_directory / 'functional_alltypes.csv'
    functional_alltypes_schema = []
    schema_path = data_directory / 'functional_alltypes_bigquery_schema.json'
    with open(str(schema_path)) as schemafile:
        schema_json = json.load(schemafile)
        for field in schema_json:
            functional_alltypes_schema.append(
                bigquery.SchemaField.from_api_repr(field)
            )
    load_config = bigquery.LoadJobConfig()
    load_config.skip_leading_rows = 1  # skip the header row.
    load_config.schema = functional_alltypes_schema

    # Load main data table.
    functional_alltypes_schema = []
    with open(str(functional_alltypes_path), 'rb') as csvfile:
        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('functional_alltypes'),
            job_config=load_config,
        ).result()

        if job.error_result:
            raise click.ClickException(str(job.error_result))

    # Load an ingestion time partitioned table.
    functional_alltypes_path = data_directory / 'functional_alltypes.csv'
    with open(str(functional_alltypes_path), 'rb') as csvfile:
        load_config.time_partitioning = bigquery.TimePartitioning()
        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('functional_alltypes_parted'),
            job_config=load_config,
        ).result()

        if job.error_result:
            raise click.ClickException(str(job.error_result))

    # Create a table with complex data types (nested and repeated).
    struct_table_path = data_directory / 'struct_table.avro'
    with open(str(struct_table_path), 'rb') as avrofile:
        load_config = bigquery.LoadJobConfig()
        load_config.source_format = 'AVRO'
        job = bqclient.load_table_from_file(
            avrofile,
            testing_dataset.table('struct_table'),
            job_config=load_config,
        )

        if job.error_result:
            raise click.ClickException(str(job.error_result))

    # Create empty date-partitioned table.
    date_table = bigquery.Table(testing_dataset.table('date_column_parted'))
    date_table.schema = [
        bigquery.SchemaField('my_date_parted_col', 'DATE'),
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('int_col', 'INTEGER'),
    ]
    date_table.time_partitioning = bigquery.TimePartitioning(
        field='my_date_parted_col'
    )
    bqclient.create_table(date_table)

    # Create empty timestamp-partitioned tables.
    timestamp_table = bigquery.Table(
        testing_dataset.table('timestamp_column_parted')
    )
    timestamp_table.schema = [
        bigquery.SchemaField('my_timestamp_parted_col', 'DATE'),
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('int_col', 'INTEGER'),
    ]
    timestamp_table.time_partitioning = bigquery.TimePartitioning(
        field='my_timestamp_parted_col'
    )
    bqclient.create_table(timestamp_table)

    # Create a table with a numeric column
    numeric_table = bigquery.Table(testing_dataset.table('numeric_table'))
    numeric_table.schema = [
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('numeric_col', 'NUMERIC'),
    ]
    bqclient.create_table(numeric_table)

    df = pd.read_csv(
        str(data_directory / 'functional_alltypes.csv'),
        usecols=['string_col', 'double_col'],
        header=0,
    )
    with tempfile.NamedTemporaryFile(mode='a+b') as csvfile:
        df.to_csv(csvfile, header=False, index=False)
        csvfile.seek(0)

        load_config = bigquery.LoadJobConfig()
        load_config.skip_leading_rows = 1  # skip the header row.
        load_config.schema = numeric_table.schema

        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('numeric_table'),
            job_config=load_config,
        ).result()

        if job.error_result:
            raise click.ClickException(str(job.error_result))
Example #25
def create_dataset():
    print('Creating destination dataset')
    dataset_ref = bq_client.dataset(TO_DATASET, project=TO_PROJECT)
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = TO_LOCATION
    dataset = bq_client.create_dataset(dataset)
Example #26
def run_authorized_view_tutorial():
    # Note to user: This is a group email for testing purposes. Replace with
    # your own group email address when running this code.
    analyst_group_email = '*****@*****.**'

    # [START bigquery_authorized_view_tutorial]
    # Create a source dataset
    # [START bigquery_avt_create_source_dataset]
    from google.cloud import bigquery

    client = bigquery.Client()
    source_dataset_id = 'github_source_data'

    source_dataset = bigquery.Dataset(client.dataset(source_dataset_id))
    # Specify the geographic location where the dataset should reside.
    source_dataset.location = 'US'
    source_dataset = client.create_dataset(source_dataset)  # API request
    # [END bigquery_avt_create_source_dataset]

    # Populate a source table
    # [START bigquery_avt_create_source_table]
    source_table_id = 'github_contributors'
    job_config = bigquery.QueryJobConfig()
    job_config.destination = source_dataset.table(source_table_id)
    sql = """
        SELECT commit, author, committer, repo_name
        FROM `bigquery-public-data.github_repos.commits`
        LIMIT 1000
    """
    query_job = client.query(
        sql,
        # Location must match that of the dataset(s) referenced in the query
        # and of the destination table.
        location='US',
        job_config=job_config)  # API request - starts the query

    query_job.result()  # Waits for the query to finish
    # [END bigquery_avt_create_source_table]

    # Create a separate dataset to store your view
    # [START bigquery_avt_create_shared_dataset]
    shared_dataset_id = 'shared_views'
    shared_dataset = bigquery.Dataset(client.dataset(shared_dataset_id))
    shared_dataset.location = 'US'
    shared_dataset = client.create_dataset(shared_dataset)  # API request
    # [END bigquery_avt_create_shared_dataset]

    # Create the view in the new dataset
    # [START bigquery_avt_create_view]
    shared_view_id = 'github_analyst_view'
    view = bigquery.Table(shared_dataset.table(shared_view_id))
    sql_template = """
        SELECT
            commit, author.name as author,
            committer.name as committer, repo_name
        FROM
            `{}.{}.{}`
    """
    view.view_query = sql_template.format(
        client.project, source_dataset_id, source_table_id)
    view = client.create_table(view)  # API request
    # [END bigquery_avt_create_view]

    # Assign access controls to the dataset containing the view
    # [START bigquery_avt_shared_dataset_access]
    # analyst_group_email = '*****@*****.**'
    access_entries = shared_dataset.access_entries
    access_entries.append(
        bigquery.AccessEntry('READER', 'groupByEmail', analyst_group_email)
    )
    shared_dataset.access_entries = access_entries
    shared_dataset = client.update_dataset(
        shared_dataset, ['access_entries'])  # API request
    # [END bigquery_avt_shared_dataset_access]

    # Authorize the view to access the source dataset
    # [START bigquery_avt_source_dataset_access]
    access_entries = source_dataset.access_entries
    access_entries.append(
        bigquery.AccessEntry(None, 'view', view.reference.to_api_repr())
    )
    source_dataset.access_entries = access_entries
    source_dataset = client.update_dataset(
        source_dataset, ['access_entries'])  # API request
    # [END bigquery_avt_source_dataset_access]
    # [END bigquery_authorized_view_tutorial]
    return (source_dataset, shared_dataset)
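
After experimenting with the tutorial you will likely want to remove the two datasets it creates; a minimal cleanup sketch (assuming the same default client) is:

# Hypothetical cleanup after the authorized-view tutorial above.
from google.cloud import bigquery

client = bigquery.Client()
for dataset_id in ("github_source_data", "shared_views"):
    client.delete_dataset(dataset_id, delete_contents=True, not_found_ok=True)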
Example #27
if datasets:
    print("Datasets in project {}:".format(project))
    for dataset in datasets:  # API request(s)
        print("\t{}".format(dataset.dataset_id))
else:
    print("{} project does not contain any datasets.".format(project))

dataset_id = "hackathon_dataset"

print(f"dataset ids: {[d.dataset_id for d in datasets]}")

if dataset_id in [d.dataset_id for d in datasets]:
    print("Already created")
else:
    # Construct a full Dataset object to send to the API.
    dataset = bigquery.Dataset("{}.{}".format(client.project, dataset_id))

    dataset = client.create_dataset(dataset)  # API request
    print("Created dataset {}.{}".format(client.project, dataset.dataset_id))

#Read Airports avro file and append to dataset

airports_uri = "gs://hackathon-mtl-2019/avro/flight_airports.avro/*"
airport_table_id = "flight_airports"

job_config = bigquery.LoadJobConfig()
job_config.autodetect = True
job_config.source_format = bigquery.SourceFormat.AVRO

dataset_ref = client.dataset(dataset_id)
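
# Hypothetical continuation (the original snippet is cut off here): load the
# Avro files from GCS into the flight_airports table.
load_job = client.load_table_from_uri(
    airports_uri,
    dataset_ref.table(airport_table_id),
    job_config=job_config,
)
load_job.result()  # wait for the load to finish
print("Loaded {} rows into {}.".format(load_job.output_rows, airport_table_id))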
Example #28
def test_manage_views(client, to_delete):
    project = client.project
    source_dataset_id = "source_dataset_{}".format(_millis())
    source_dataset_ref = client.dataset(source_dataset_id)
    source_dataset = bigquery.Dataset(source_dataset_ref)
    source_dataset = client.create_dataset(source_dataset)
    to_delete.append(source_dataset)

    job_config = bigquery.LoadJobConfig()
    job_config.schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
    ]
    job_config.skip_leading_rows = 1
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
    source_table_id = "us_states"
    load_job = client.load_table_from_uri(
        uri, source_dataset.table(source_table_id), job_config=job_config
    )
    load_job.result()

    shared_dataset_id = "shared_dataset_{}".format(_millis())
    shared_dataset_ref = client.dataset(shared_dataset_id)
    shared_dataset = bigquery.Dataset(shared_dataset_ref)
    shared_dataset = client.create_dataset(shared_dataset)
    to_delete.append(shared_dataset)

    # [START bigquery_create_view]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # project = 'my-project'
    # source_dataset_id = 'my_source_dataset'
    # source_table_id = 'us_states'
    # shared_dataset_ref = client.dataset('my_shared_dataset')

    # This example shows how to create a shared view of a source table of
    # US States. The source table contains all 50 states, while the view will
    # contain only states with names starting with 'W'.
    view_ref = shared_dataset_ref.table("my_shared_view")
    view = bigquery.Table(view_ref)
    sql_template = 'SELECT name, post_abbr FROM `{}.{}.{}` WHERE name LIKE "W%"'
    view.view_query = sql_template.format(project, source_dataset_id, source_table_id)
    view = client.create_table(view)  # API request

    print("Successfully created view at {}".format(view.full_table_id))
    # [END bigquery_create_view]

    # [START bigquery_update_view_query]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # project = 'my-project'
    # source_dataset_id = 'my_source_dataset'
    # source_table_id = 'us_states'
    # shared_dataset_ref = client.dataset('my_shared_dataset')

    # This example shows how to update a shared view of a source table of
    # US States. The view's query will be updated to contain only states with
    # names starting with 'M'.
    view_ref = shared_dataset_ref.table("my_shared_view")
    view = bigquery.Table(view_ref)
    sql_template = 'SELECT name, post_abbr FROM `{}.{}.{}` WHERE name LIKE "M%"'
    view.view_query = sql_template.format(project, source_dataset_id, source_table_id)
    view = client.update_table(view, ["view_query"])  # API request
    # [END bigquery_update_view_query]

    # [START bigquery_get_view]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # shared_dataset_id = 'my_shared_dataset'

    view_ref = client.dataset(shared_dataset_id).table("my_shared_view")
    view = client.get_table(view_ref)  # API Request

    # Display view properties
    print("View at {}".format(view.full_table_id))
    print("View Query:\n{}".format(view.view_query))
    # [END bigquery_get_view]
    assert view.view_query is not None

    analyst_group_email = "*****@*****.**"
    # [START bigquery_grant_view_access]
    # from google.cloud import bigquery
    # client = bigquery.Client()

    # Assign access controls to the dataset containing the view
    # shared_dataset_id = 'my_shared_dataset'
    # analyst_group_email = '*****@*****.**'
    shared_dataset = client.get_dataset(
        client.dataset(shared_dataset_id)
    )  # API request
    access_entries = shared_dataset.access_entries
    access_entries.append(
        bigquery.AccessEntry("READER", "groupByEmail", analyst_group_email)
    )
    shared_dataset.access_entries = access_entries
    shared_dataset = client.update_dataset(
        shared_dataset, ["access_entries"]
    )  # API request

    # Authorize the view to access the source dataset
    # project = 'my-project'
    # source_dataset_id = 'my_source_dataset'
    source_dataset = client.get_dataset(
        client.dataset(source_dataset_id)
    )  # API request
    view_reference = {
        "projectId": project,
        "datasetId": shared_dataset_id,
        "tableId": "my_shared_view",
    }
    access_entries = source_dataset.access_entries
    access_entries.append(bigquery.AccessEntry(None, "view", view_reference))
    source_dataset.access_entries = access_entries
    source_dataset = client.update_dataset(
        source_dataset, ["access_entries"]
    )  # API request
Example #29
    exit()

project_name = sys.argv[1]
bucket_name = sys.argv[2]

# Set up our GCS, BigQuery, and Natural Language clients
storage_client = storage.Client()
bq_client = bigquery.Client(project=project_name)
nl_client = language.LanguageServiceClient()

# Set up client objects for the vision and translate_v2 API Libraries
vision_client = vision_v1.ImageAnnotatorClient()

# Setup the BigQuery dataset and table objects
dataset_ref = bq_client.dataset('image_classification_dataset')
dataset = bigquery.Dataset(dataset_ref)
table_ref = dataset.table('image_text_detail')
table = bq_client.get_table(table_ref)

# Create an array to store results data to be inserted into the BigQuery table
rows_for_bq = []

# Get a list of the files in the Cloud Storage Bucket
files = storage_client.bucket(bucket_name).list_blobs()
bucket = storage_client.bucket(bucket_name)

print('Processing image files from GCS. This will take a few minutes..')

# Process files from Cloud Storage and save the result to send to BigQuery
for file in files:
    if file.name.endswith('jpg') or file.name.endswith('png'):
Example #30
import csv

from google.api_core.exceptions import NotFound
from google.cloud import bigquery
from google.cloud.bigquery import LoadJobConfig


# TODO: support for multiple datasets
DATASET_NAME = 'g_sheets'

client = bigquery.Client.from_service_account_json('service-account-key.json')

dataset = client.dataset(DATASET_NAME)

if DATASET_NAME not in [l.dataset_id for l in client.list_datasets()]:
    dataset = client.create_dataset(bigquery.Dataset(dataset))


def load_from_file(file_path, table):
    table_ref = dataset.table(table)
    load_config = LoadJobConfig()
    load_config.autodetect = True

    with open(file_path, 'rb') as readable:
        result = client.load_table_from_file(readable, table_ref, job_config=load_config)

    # return result
    return table_ref.dataset_id + '.' + table_ref.table_id


def table_exists(table_name):
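    # The body of this function is cut off in this capture. A typical
    # implementation (an assumption) probes the API and treats NotFound as
    # "table does not exist":
    try:
        client.get_table(dataset.table(table_name))
        return True
    except NotFound:
        return False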