Example #1
def load(project_id, bq_client, src_dataset_id, dst_dataset_id):
    """
    Transform safely loaded tables and store results in target dataset.

    :param project_id: Identifies the BQ project
    :param bq_client: a BigQuery client object
    :param src_dataset_id: identifies the source dataset
    :param dst_dataset_id: identifies the destination dataset
    :return: List of BQ job_ids
    """
    dst_dataset = Dataset(f'{bq_client.project}.{dst_dataset_id}')
    dst_dataset.description = f'Vocabulary cleaned and loaded from {src_dataset_id}'
    dst_dataset.labels = {'type': 'vocabulary'}
    dst_dataset.location = "US"
    bq_client.create_dataset(dst_dataset, exists_ok=True)
    src_tables = list(bq_client.list_tables(dataset=src_dataset_id))

    job_config = QueryJobConfig()
    query_jobs = []
    for src_table in src_tables:
        schema = bq.get_table_schema(src_table.table_id)
        destination = f'{project_id}.{dst_dataset_id}.{src_table.table_id}'
        table = bq_client.create_table(Table(destination, schema=schema),
                                       exists_ok=True)
        job_config.destination = table
        query = SELECT_TPL.render(project_id=project_id,
                                  dataset_id=src_dataset_id,
                                  table=src_table.table_id,
                                  fields=schema)
        query_job = bq_client.query(query, job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{query_job.job_id}')
        query_jobs.append(query_job)
        query_job.result()
    return query_jobs
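A hedged usage sketch for load (the project ID and dataset IDs below are placeholders; the bq helper module, SELECT_TPL template, and LOGGER are assumed to come from the surrounding module):

from google.cloud import bigquery

bq_client = bigquery.Client(project='my-project')        # hypothetical project ID
jobs = load('my-project', bq_client,
            src_dataset_id='vocabulary_staging',          # hypothetical source dataset
            dst_dataset_id='vocabulary')                  # hypothetical destination dataset
print([job.job_id for job in jobs])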
Example #2
def createDataset(datasetname):
    # Assumes a module-level `bigquery_client` (google.cloud.bigquery.Client).
    dataset_ref = bigquery_client.dataset(datasetname)

    dataset = Dataset(dataset_ref)
    dataset.location = 'US'

    dataset = bigquery_client.create_dataset(dataset)

    return dataset
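createDataset relies on a module-level client and a Dataset import; a minimal sketch of the setup it assumes (the dataset name is a placeholder):

from google.cloud import bigquery
from google.cloud.bigquery import Dataset

bigquery_client = bigquery.Client()      # module-level client used inside createDataset
createDataset('my_dataset')              # hypothetical dataset name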
Example #3
def create_bq_dataset(dataset_name='price_data'):
    '''Create dataset if not exists'''
    client = Client()
    datasets = [
        client.project + "." + i.dataset_id
        for i in list(client.list_datasets())
    ]
    if client.project + "." + dataset_name not in datasets:
        # Dataset() expects a fully-qualified ID ("project.dataset"), not the bare name.
        dataset = Dataset(client.project + "." + dataset_name)
        dataset.location = "US"
        client.create_dataset(dataset)
    else:
        print("Dataset already exists")
Example #4
    def _create_brand_new_datasets(self, product_id):
        """
    Creates a new Google BigQuery dataset

    Args:
      product_id (str):
        A string representing the desired product_id 

    """
        dataset = Dataset(self.client.dataset(dataset_id=product_id))
        dataset.location = _DATASET_GEOLOCATION

        self.datasets[product_id]['datasets'] = (
            self.client.create_dataset(dataset))
        self._create_brand_new_tables(product_id=product_id)
        return
Example #5
def check_and_create_staging_dataset(dst_dataset_id, bucket_name, bq_client):
    """

    :param dst_dataset_id: final destination to load the vocabulary in BigQuery
    :param bucket_name: the location in GCS containing the vocabulary files
    :param bq_client: google bigquery client
    :return: staging dataset object
    """
    staging_dataset_id = f'{dst_dataset_id}_staging'
    staging_dataset = Dataset(f'{bq_client.project}.{staging_dataset_id}')
    try:
        bq_client.get_dataset(staging_dataset)
    except NotFound:
        staging_dataset.description = f'Vocabulary loaded from gs://{bucket_name}'
        staging_dataset.labels = {'type': 'vocabulary', 'phase': 'staging'}
        staging_dataset.location = "US"
        staging_dataset = bq_client.create_dataset(staging_dataset)
        LOGGER.info(f'Successfully created dataset {staging_dataset_id}')
    return staging_dataset
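A short usage sketch for the staging helper (the dataset and bucket names are placeholders):

from google.cloud import bigquery

bq_client = bigquery.Client()
staging = check_and_create_staging_dataset('vocabulary', 'my-vocab-bucket', bq_client)
print(staging.dataset_id)                # -> 'vocabulary_staging'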
Example #6
    def create_data_set(self, data_set_name):
        """
        Create a dataset in Google BigQuery.

        :param data_set_name: str - The name of the dataset to be created
        :return: 0 indicates success
        """

        data_set_ref = self.client.dataset(data_set_name)
        data_set = Dataset(data_set_ref)
        data_set.description = ''
        data_set.location = 'EU'

        try:
            self.client.create_dataset(data_set)  # API request
            logging.info('Data set - ' + data_set_name +
                         ' successfully created')
        except Conflict:
            logging.info('Data set - ' + data_set_name + ' already exists')

        return 0
Example #7
def make_dataset(project,
                 dataset_id,
                 friendly_name=None,
                 description=None,
                 default_table_expiration_ms=None,
                 location=None,
                 labels=None,
                 access_entries=None):
    """Build a BigQuery Dataset object with the given properties (no API call is made)."""
    dataset_ref = DatasetReference(project, dataset_id)
    dataset = Dataset(dataset_ref)
    dataset.friendly_name = friendly_name
    dataset.description = description
    dataset.default_table_expiration_ms = default_table_expiration_ms
    dataset.location = location
    if labels is not None:
        dataset.labels = labels
    if access_entries is not None:
        dataset.access_entries = access_entries
    return dataset
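make_dataset only builds the Dataset object; a hedged usage sketch (the dataset ID and labels are placeholders) passes the result to a client to actually create it:

from google.cloud import bigquery

client = bigquery.Client()
dataset = make_dataset(client.project, 'analytics',       # hypothetical dataset ID
                       friendly_name='Analytics',
                       location='US',
                       labels={'team': 'data'})
client.create_dataset(dataset, exists_ok=True)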
Example #8
    def create(self, dataset_id):
        """ Create a dataset in Google BigQuery

        Parameters
        ----------
        dataset_id : str
            Name of dataset to be created
        """
        from google.cloud.bigquery import Dataset

        if self.exists(dataset_id):
            raise DatasetCreationError("Dataset {0} already "
                                       "exists".format(dataset_id))

        dataset = Dataset(self.client.dataset(dataset_id))

        if self.location is not None:
            dataset.location = self.location

        try:
            self.client.create_dataset(dataset)
        except self.http_error as ex:
            self.process_http_error(ex)
Example #9
    logger.info("Found Dataset %s.", repr(dataset_id))
    try:
        table = client.get_table(table_ref)
        logger.info("Found Table %s.", repr(table_id))
        checkForDuplicates = True

    except NotFound:  # the table does not exist yet
        logger.info("Creating Table %s.", repr(table_id))
        table = Table(table_ref, schema=airmonitorSchema)
        table = client.create_table(table)

except NotFound:  # the dataset does not exist yet
    # create the dataset
    logger.info("Creating Dataset %s.", repr(dataset_id))
    dataset = Dataset(dataset_ref)
    dataset.location = "EU"
    dataset = client.create_dataset(dataset)
    # create a table
    logger.info("Creating Table %s.", repr(table_id))
    table = Table(table_ref, schema=airmonitorSchema)
    table = client.create_table(table)


# functions -------------------------------------------------------------------
def queryThis(query: Query) -> list:
    """Query the given query object and return the resulting list."""
    q = str(query)  # extra line for clarity. calls the __str__ magic function

    return list(client.query(q).result())
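queryThis only needs an object whose str() yields SQL, so a plain string works as well; a minimal sketch (the query is a placeholder):

rows = queryThis("SELECT 1 AS x")
for row in rows:
    print(dict(row))                     # each element is a BigQuery Row, mapping-like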