def copy_table(dataset_name, table_name, new_table_name, project=None):
    """Copies a table.

    If no project is specified, then the currently active project is used.
    """
    bigquery_client = bigquery.Client(project=project)
    dataset = bigquery_client.dataset(dataset_name)
    table = dataset.table(table_name)

    # This sample shows the destination table in the same dataset and project,
    # however, it's possible to copy across datasets and projects. You can
    # also copy multiple source tables into a single destination table by
    # providing additional arguments to `copy_table`.
    destination_table = dataset.table(new_table_name)

    # Create a job to copy the table to the destination table.
    job_id = str(uuid.uuid4())
    job = bigquery_client.copy_table(
        job_id, destination_table, table)

    # Create the table if it doesn't exist.
    job.create_disposition = (
        gcloud.bigquery.job.CreateDisposition.CREATE_IF_NEEDED)

    # Start the job.
    job.begin()

    # Wait for the job to finish.
    print('Waiting for job to finish...')
    wait_for_job(job)

    print('Table {} copied to {}.'.format(table_name, new_table_name))
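# Several of the snippets in this section call a wait_for_job() helper that is
# not shown here. A minimal sketch of it, assuming the old job API used above
# (job.reload(), job.state, job.error_result), could look like this:
import time


def wait_for_job(job):
    while True:
        job.reload()  # Refreshes the job state via a GET request.
        if job.state == 'DONE':
            if job.error_result:
                raise RuntimeError(job.errors)
            return
        time.sleep(1)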
Example #2
def handler(event, context):
    rows = []

    for r in event['Records']:
        payload = r['kinesis']['data']
        try:
            data = json.loads(base64.b64decode(payload))
            row = []
            for key in ['time', 'tag', 'value']:
                if key == 'time':
                    row.append(datetime.datetime.fromtimestamp(data[key]))
                else:
                    row.append(data[key])
            rows.append(tuple(row))
        except Exception as e:
            print('Invalid data "{0}": {1}'.format(payload, e))

    if len(rows) == 0:
        return

    kms = boto3.client('kms')
    blob = base64.b64decode(BQ_CREDENTIALS)
    dec = kms.decrypt(CiphertextBlob=blob)
    keyfile_dict = json.loads(dec['Plaintext'])
    credentials = ServiceAccountCredentials.from_json_keyfile_dict(
        keyfile_dict)

    bq = bigquery.Client(credentials=credentials, project=BQ_PROJECT)
    dataset = bq.dataset(BQ_DATASET)
    table = dataset.table(BQ_TABLE)
    table.reload()
    res = table.insert_data(rows)

    print(res)
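# For local experimentation, a Kinesis-style test event matching what this
# handler expects (base64-encoded JSON with 'time', 'tag' and 'value' keys)
# could be built as below. The helper name and values are made up, and
# actually invoking handler() still requires the BQ_* settings, KMS access
# and a reachable BigQuery table.
import base64
import json
import time


def make_test_event(records):
    """Wraps plain dicts in the Kinesis record envelope the handler reads."""
    return {
        'Records': [
            {'kinesis': {'data': base64.b64encode(
                json.dumps(record).encode('utf-8'))}}
            for record in records
        ]
    }


test_event = make_test_event(
    [{'time': time.time(), 'tag': 'sensor-1', 'value': 42.0}])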
Example #3
def async_query(query):
    client = bigquery.Client()
    query_job = client.run_async_query(str(uuid.uuid4()), query)
    query_job.use_legacy_sql = False
    query_job.begin()

    wait_for_job(query_job)

    # Manually construct the QueryResults.
    # TODO: The client library will provide a helper method that does this.
    # https://github.com/GoogleCloudPlatform/gcloud-python/issues/2083
    query_results = bigquery.query.QueryResults('', client)
    query_results._properties['jobReference'] = {
        'jobId': query_job.name,
        'projectId': query_job.project
    }

    # Drain the query results by requesting a page at a time.
    page_token = None

    while True:
        rows, total_rows, page_token = query_results.fetch_data(
            max_results=10, page_token=page_token)

        for row in rows:
            print(row)

        if not page_token:
            break
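# Example invocation against a public dataset; the query itself is arbitrary
# and only illustrative:
#
#     async_query(
#         'SELECT corpus FROM `bigquery-public-data.samples.shakespeare` '
#         'GROUP BY corpus')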
Example #4
def delete_table(dataset_name, table_name, project=None):
    """Deletes a table in a given dataset.

    If no project is specified, then the currently active project is used.
    """
    bigquery_client = bigquery.Client(project=project)
    dataset = bigquery_client.dataset(dataset_name)
    table = dataset.table(table_name)

    table.delete()

    print('Table {}:{} deleted.'.format(dataset_name, table_name))
Example #5
def test_delete_table(capsys):
    # Create a table to delete
    bigquery_client = bigquery.Client()
    dataset = bigquery_client.dataset(DATASET_ID)
    table = dataset.table('test_delete_table')

    if not table.exists():
        table.schema = [bigquery.SchemaField('id', 'INTEGER')]
        table.create()

    snippets.delete_table(DATASET_ID, table.name)

    assert not table.exists()
Example #6
def export_data_to_gcs(dataset_name, table_name, destination):
    bigquery_client = bigquery.Client()
    dataset = bigquery_client.dataset(dataset_name)
    table = dataset.table(table_name)
    job_name = str(uuid.uuid4())

    job = bigquery_client.extract_table_to_storage(job_name, table,
                                                   destination)

    job.begin()

    wait_for_job(job)

    print('Exported {}:{} to {}'.format(dataset_name, table_name, destination))
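# Example invocation; the dataset, table and bucket names are placeholders,
# and the destination must be a gs:// URI the extract job can write to:
#
#     export_data_to_gcs('my_dataset', 'my_table', 'gs://my-bucket/my_table.csv')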
Example #7
def load_data_from_gcs(dataset_name, table_name, source):
    bigquery_client = bigquery.Client()
    dataset = bigquery_client.dataset(dataset_name)
    table = dataset.table(table_name)
    job_name = str(uuid.uuid4())

    job = bigquery_client.load_table_from_storage(
        job_name, table, source)

    job.begin()

    wait_for_job(job)

    print('Loaded {} rows into {}:{}.'.format(
        job.output_rows, dataset_name, table_name))
Example #8
def stream_data(dataset_name, table_name, json_data):
    bigquery_client = bigquery.Client()
    dataset = bigquery_client.dataset(dataset_name)
    table = dataset.table(table_name)
    data = json.loads(json_data)

    # Reload the table to get the schema.
    table.reload()

    rows = [data]
    errors = table.insert_data(rows)

    if not errors:
        print('Loaded 1 row into {}:{}'.format(dataset_name, table_name))
    else:
        print('Errors:')
        pprint(errors)
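# Example invocation. The insert_data() API used here expects each row as a
# sequence of values in schema-field order, so json_data should be a JSON
# array whose values line up with the table's schema. The values below are
# made up and assume the Name/Age/Weight schema from the create_table()
# snippet later in this section:
#
#     stream_data('my_dataset', 'my_table', '["Gandalf", 2000, 140.0]')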
Example #9
def load_data_from_file(dataset_name, table_name, source_file_name):
    bigquery_client = bigquery.Client()
    dataset = bigquery_client.dataset(dataset_name)
    table = dataset.table(table_name)

    # Reload the table to get the schema.
    table.reload()

    with open(source_file_name, 'rb') as source_file:
        # This example uses CSV, but you can use other formats.
        # See https://cloud.google.com/bigquery/loading-data
        job = table.upload_from_file(source_file, source_format='text/csv')

    job.begin()

    wait_for_job(job)

    print('Loaded {} rows into {}:{}.'.format(job.output_rows, dataset_name,
                                              table_name))
Example #10
def list_rows(dataset_name, table_name, project=None):
    """Prints rows in the given table.

    Will print 25 rows at most for brevity, as tables can contain a large
    number of rows.

    If no project is specified, then the currently active project is used.
    """
    bigquery_client = bigquery.Client(project=project)
    dataset = bigquery_client.dataset(dataset_name)
    table = dataset.table(table_name)

    if not table.exists():
        print('Table {}:{} does not exist.'.format(dataset_name, table_name))
        return

    # Reload the table so that the schema is available.
    table.reload()

    rows = []
    page_token = None

    # Load at most 25 results. You can change this to `while True` and change
    # the max_results argument to load more rows from BigQuery, but note
    # that this can take some time. It's preferred to use a query.
    while len(rows) < 25:
        results, total_rows, page_token = table.fetch_data(
            max_results=25, page_token=page_token)
        rows.extend(results)

        if not page_token:
            break

    # Use format to create a simple table.
    format_string = '{:<16} ' * len(table.schema)

    # Print schema field names
    field_names = [field.name for field in table.schema]
    print(format_string.format(*field_names))

    for row in rows:
        print(format_string.format(*row))
Example #11
def list_datasets(project=None):
    """Lists all datasets in a given project.

    If no project is specified, then the currently active project is used.
    """
    bigquery_client = bigquery.Client(project=project)

    datasets = []
    page_token = None

    while True:
        results, page_token = bigquery_client.list_datasets(
            page_token=page_token)
        datasets.extend(results)

        if not page_token:
            break

    for dataset in datasets:
        print(dataset.name)
Example #12
    def _init_bigquery_dataset(self):
        from gcloud import bigquery
        from gcloud.bigquery.dataset import AccessGrant
        DATASET_URI = 'bigquery.googleapis.com/projects/%s/datasets/%s' % (
            Config.CLIENT.project,
            DATASET_NAME,
        )

        # Create the destination dataset, and set up the ACL to allow
        # Stackdriver Logging to write into it.
        bigquery_client = bigquery.Client()
        dataset = bigquery_client.dataset(DATASET_NAME)
        dataset.create()
        self.to_delete.append(dataset)
        dataset.reload()
        grants = dataset.access_grants
        grants.append(
            AccessGrant('WRITER', 'groupByEmail', '*****@*****.**'))
        dataset.access_grants = grants
        dataset.update()
        return DATASET_URI
Example #13
def sync_query(query):
    client = bigquery.Client()
    query_results = client.run_sync_query(query)

    # Use standard SQL syntax for queries.
    # See: https://cloud.google.com/bigquery/sql-reference/
    query_results.use_legacy_sql = False

    query_results.run()

    # Drain the query results by requesting a page at a time.
    page_token = None

    while True:
        rows, total_rows, page_token = query_results.fetch_data(
            max_results=10, page_token=page_token)

        for row in rows:
            print(row)

        if not page_token:
            break
Example #14
def list_tables(dataset_name, project=None):
    """Lists all of the tables in a given dataset.

    If no project is specified, then the currently active project is used.
    """
    bigquery_client = bigquery.Client(project=project)
    dataset = bigquery_client.dataset(dataset_name)

    if not dataset.exists():
        print('Dataset {} does not exist.'.format(dataset_name))
        return

    tables = []
    page_token = None

    while True:
        results, page_token = dataset.list_tables(page_token=page_token)
        tables.extend(results)

        if not page_token:
            break

    for table in tables:
        print(table.name)
def create_table(dataset_name, table_name, project=None):
    """Creates a simple table in the given dataset.

    If no project is specified, then the currently active project is used.
    """
    bigquery_client = bigquery.Client(project=project)
    dataset = bigquery_client.dataset(dataset_name)

    if not dataset.exists():
        print('Dataset {} does not exist.'.format(dataset_name))
        return

    table = dataset.table(table_name)

    # Set the table schema
    table.schema = (
        bigquery.SchemaField('Name', 'STRING'),
        bigquery.SchemaField('Age', 'INTEGER'),
        bigquery.SchemaField('Weight', 'FLOAT'),
    )

    table.create()

    print('Created table {} in dataset {}.'.format(table_name, dataset_name))
def update_table_schema(destination_table, source_vcf, description=None):
  """Updates a BigQuery table with the variants schema using a VCF header.

  Args:
    destination_table: BigQuery table name, PROJECT_ID.DATASET_NAME.TABLE_NAME.
    source_vcf: Path to local or remote (Cloud Storage) VCF or gzipped VCF file.
    description: Optional description for the BigQuery table.

  Raises:
    ValueError: If destination_table cannot be parsed.
  """

  dest_table = tokenize_table_name(destination_table)
  dest_project_id, dest_dataset_name, dest_table_name = dest_table

  # Load the source VCF
  descriptions = Descriptions()
  descriptions.add_from_vcf(source_vcf)

  # Initialize the BQ client
  client = bigquery.Client(project=dest_project_id)

  # Load the destination table
  dest_dataset = client.dataset(dest_dataset_name)
  dest_dataset.reload()

  dest_table = dest_dataset.table(dest_table_name)
  dest_table.reload()

  if description is not None:
    dest_table.patch(description=description[:_MAX_LENGTH])
    if len(description) > _MAX_LENGTH:
      logging.warning(_TRUNCATION_WARNING, 'table description')

  # Set the description on the variant fields and the call fields.
  #
  # The (non-fixed) variant field descriptions come from the ##INFO headers
  # The (non-fixed) call fields descriptions can come from the ##FORMAT headers
  #   as well as the ##INFO headers.

  # Process variant fields
  call_field = None
  for field in dest_table.schema:
    if field.name.lower() in _FIXED_VARIANT_FIELDS:
      field.description = _FIXED_VARIANT_FIELDS[field.name.lower()]
      logging.debug('Variant(fixed): %s: %s', field.name, field.description)

    elif field.name in descriptions.info_fields:
      field.description = descriptions.info_fields[field.name]
      logging.debug('Variant(INFO) %s: %s', field.name, field.description)

    elif field.name.lower() == 'filter':
      field.description = descriptions.filter_description

    if field.name == 'call':
      call_field = field

    if field.description is not None and len(field.description) > _MAX_LENGTH:
      logging.warning(_TRUNCATION_WARNING, field.name)
      field.description = field.description[:_MAX_LENGTH]

  # Process call fields
  for field in call_field.fields:
    if field.name.lower() in _FIXED_CALL_FIELDS:
      field.description = _FIXED_CALL_FIELDS[field.name.lower()]
      logging.debug('Call(fixed): %s: %s', field.name, field.description)

    elif field.name in descriptions.format_fields:
      field.description = descriptions.format_fields[field.name]
      logging.debug('Call(FORMAT) %s: %s', field.name, field.description)

    elif field.name in descriptions.info_fields:
      field.description = descriptions.info_fields[field.name]
      logging.debug('Call(INFO) %s: %s', field.name, field.description)

    elif field.name.lower() == 'filter':
      field.description = descriptions.filter_description

    if field.description is not None and len(field.description) > _MAX_LENGTH:
      logging.warning(_TRUNCATION_WARNING, field.name)
      field.description = field.description[:_MAX_LENGTH]

  logging.info('Updating table %s', dest_table.path)
  dest_table.patch(schema=dest_table.schema)
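# tokenize_table_name() is referenced above but not shown. A minimal sketch,
# assuming it only needs to split the PROJECT_ID.DATASET_NAME.TABLE_NAME form
# described in the docstring:
def tokenize_table_name(table_path):
  """Splits a PROJECT_ID.DATASET_NAME.TABLE_NAME string into its parts."""
  parts = table_path.split('.')
  if len(parts) != 3:
    raise ValueError(
        'Expected PROJECT_ID.DATASET_NAME.TABLE_NAME, got: %r' % table_path)
  return tuple(parts)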
Example #17
TABLE_NAME = 'airport'
BUCKET_NAME = 'satish123'
FILE = 'airport.csv'
SOURCE = 'gs://{}/{}'.format(BUCKET_NAME, FILE)

SCHEMA = [
    bq.SchemaField('name', 'STRING', mode='required'),
    bq.SchemaField('country', 'STRING', mode='required'),
    bq.SchemaField('area_code', 'STRING', mode='required'),
    bq.SchemaField('origin', 'STRING', mode='required')
]

# CREDENTIALS = GoogleCredentials.get_application_default()

client = bq.Client(project=BILLING_PROJECT_ID)


# Dataset
# Check if the dataset exists
def create_datasets(name):
    dataset = client.dataset(name)
    if not dataset.exists():
        dataset.create()
        print("Dataset {} created".format(name))

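# A hypothetical continuation of this example: create the table with SCHEMA in
# a given dataset and start a load job from SOURCE, using the same old client
# API as the load_data_from_gcs() snippet above. The helper name and the
# uuid-based job name are assumptions, not part of the original.
import uuid


def create_table_and_load(dataset_name):
    dataset = client.dataset(dataset_name)
    table = dataset.table(TABLE_NAME, SCHEMA)
    if not table.exists():
        table.create()
    job = client.load_table_from_storage(str(uuid.uuid4()), table, SOURCE)
    job.skip_leading_rows = 1  # assumes the CSV has a header row
    job.begin()
    return job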
Example #18
def setUpModule():
    _helpers.PROJECT = TESTS_PROJECT
    Config.CLIENT = bigquery.Client()
Example #19
  * A query is run against the public dataset,
    bigquery-public-data.samples.natality, selecting only the data of interest
    to the regression, the output of which is stored in the “regression_input”
    table.
  * The output table is moved over the wire to the user's default project via
    the built-in BigQuery Connector for Spark that bridges BigQuery and Cloud
    Dataproc.
"""

from gcloud import bigquery
from gcloud.bigquery import job
from gcloud.bigquery.table import *

# Create a new Google BigQuery client using Google Cloud Platform project
# defaults.
bq = bigquery.Client()

# Create a new BigQuery dataset.
reg_dataset = bq.dataset("natality_regression")
reg_dataset.create()

# In the new BigQuery dataset, create a new table.
table = reg_dataset.table(name="regression_input")
# The table needs a schema before it can be created and accept data.
# We create an ordered list of the columns using SchemaField objects.
schema = []
schema.append(SchemaField("weight_pounds", "float"))
schema.append(SchemaField("mother_age", "integer"))
schema.append(SchemaField("father_age", "integer"))
schema.append(SchemaField("gestation_weeks", "integer"))
schema.append(SchemaField("weight_gain_pounds", "integer"))

import operator
import time

import unittest2

from gcloud import _helpers
from gcloud.environment_vars import TESTS_PROJECT
from gcloud import bigquery


_helpers.PROJECT = TESTS_PROJECT
CLIENT = bigquery.Client()
DATASET_NAME = 'system_tests_%012d' % (1000 * time.time(),)


class TestBigQuery(unittest2.TestCase):

    def setUp(self):
        self.to_delete = []

    def tearDown(self):
        for doomed in self.to_delete:
            doomed.delete()

    def test_create_dataset(self):
        dataset = CLIENT.dataset(DATASET_NAME)
        self.assertFalse(dataset.exists())