# Dataset, QueryJobConfig, and Table come from google.cloud.bigquery;
# LOGGER, SELECT_TPL (a Jinja2 template), and the bq helper module are
# assumed to be defined elsewhere in the module.
from google.cloud.bigquery import Dataset, QueryJobConfig, Table


def load(project_id, bq_client, src_dataset_id, dst_dataset_id):
    """
    Transform safely loaded tables and store results in target dataset.

    :param project_id: Identifies the BQ project
    :param bq_client: a BigQuery client object
    :param src_dataset_id: reference to source dataset object
    :param dst_dataset_id: reference to destination dataset object
    :return: List of completed QueryJob objects
    """
    dst_dataset = Dataset(f'{bq_client.project}.{dst_dataset_id}')
    dst_dataset.description = f'Vocabulary cleaned and loaded from {src_dataset_id}'
    dst_dataset.labels = {'type': 'vocabulary'}
    dst_dataset.location = "US"
    bq_client.create_dataset(dst_dataset, exists_ok=True)
    src_tables = list(bq_client.list_tables(dataset=src_dataset_id))

    job_config = QueryJobConfig()
    query_jobs = []
    for src_table in src_tables:
        schema = bq.get_table_schema(src_table.table_id)
        destination = f'{project_id}.{dst_dataset_id}.{src_table.table_id}'
        table = bq_client.create_table(Table(destination, schema=schema),
                                       exists_ok=True)
        # point the shared job config at this iteration's destination table
        job_config.destination = table
        query = SELECT_TPL.render(project_id=project_id,
                                  dataset_id=src_dataset_id,
                                  table=src_table.table_id,
                                  fields=schema)
        query_job = bq_client.query(query, job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{query_job.job_id}')
        query_jobs.append(query_job)
        # block until the job finishes so failures surface per table
        query_job.result()
    return query_jobs
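# A minimal sketch of what SELECT_TPL might look like, assuming it is a
# Jinja2 template that selects every schema field from the source table.
# The template text below is illustrative, not the original.
from jinja2 import Template

SELECT_TPL = Template("""
    SELECT
        {% for field in fields %}
        `{{ field.name }}`{{ "," if not loop.last }}
        {% endfor %}
    FROM `{{ project_id }}.{{ dataset_id }}.{{ table }}`
""")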
def createDataset(datasetname):
    # Client.dataset() was deprecated and later removed; a fully qualified
    # "project.dataset_id" string works in its place. bigquery_client is
    # assumed to be a module-level google.cloud.bigquery.Client.
    dataset = Dataset(f'{bigquery_client.project}.{datasetname}')
    dataset.location = 'US'
    dataset = bigquery_client.create_dataset(dataset)
    return dataset
from google.cloud.bigquery import Client, Dataset


def create_bq_dataset(dataset_name='price_data'):
    '''Create dataset if not exists'''
    client = Client()
    datasets = [
        f'{client.project}.{ds.dataset_id}' for ds in client.list_datasets()
    ]
    full_dataset_id = f'{client.project}.{dataset_name}'
    if full_dataset_id not in datasets:
        # Dataset() requires a fully qualified "project.dataset_id" string,
        # not the bare dataset name.
        dataset = Dataset(full_dataset_id)
        dataset.location = "US"
        client.create_dataset(dataset)
    else:
        print("Dataset already exists")
def _create_brand_new_datasets(self, product_id):
    """
    Creates a new Google BigQuery dataset

    Args:
        product_id (str): A string representing the desired product_id
    """
    # Client.dataset() was deprecated and later removed; build the Dataset
    # from a fully qualified "project.dataset_id" string instead.
    dataset = Dataset(f'{self.client.project}.{product_id}')
    dataset.location = _DATASET_GEOLOCATION
    self.datasets[product_id]['datasets'] = (
        self.client.create_dataset(dataset))
    self._create_brand_new_tables(product_id=product_id)
from google.api_core.exceptions import NotFound


def check_and_create_staging_dataset(dst_dataset_id, bucket_name, bq_client):
    """
    Get the staging dataset for a vocabulary load, creating it if needed.

    :param dst_dataset_id: final destination to load the vocabulary in BigQuery
    :param bucket_name: the location in GCS containing the vocabulary files
    :param bq_client: google bigquery client
    :return: staging dataset object
    """
    staging_dataset_id = f'{dst_dataset_id}_staging'
    staging_dataset = Dataset(f'{bq_client.project}.{staging_dataset_id}')
    try:
        bq_client.get_dataset(staging_dataset)
    except NotFound:
        staging_dataset.description = f'Vocabulary loaded from gs://{bucket_name}'
        staging_dataset.labels = {'type': 'vocabulary', 'phase': 'staging'}
        staging_dataset.location = "US"
        staging_dataset = bq_client.create_dataset(staging_dataset)
        LOGGER.info(f'Successfully created dataset {staging_dataset_id}')
    return staging_dataset
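# A hypothetical end-to-end call of the two vocabulary helpers above; the
# project, bucket, and dataset names are placeholders, not original values.
from google.cloud import bigquery

client = bigquery.Client(project='my-project')
staging = check_and_create_staging_dataset('vocabulary', 'my-vocab-bucket', client)
# ... load the GCS vocabulary files into staging tables here, then clean them
# into the final dataset:
jobs = load('my-project', client, staging.dataset_id, 'vocabulary')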
def create_data_set(self, data_set_name):
    """
    :param data_set_name: str - The name of the dataset to be created
    :return: 0 indicates success
    """
    # Conflict comes from google.api_core.exceptions; Client.dataset() was
    # deprecated and later removed, so build the Dataset from its full ID.
    data_set = Dataset(f'{self.client.project}.{data_set_name}')
    data_set.description = ''
    data_set.location = 'EU'
    try:
        self.client.create_dataset(data_set)  # API request
        logging.info('Data set - %s successfully created', data_set_name)
    except Conflict:
        logging.info('Data set - %s already exists', data_set_name)
    return 0
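# Design note: newer google-cloud-bigquery releases can swallow the Conflict
# for you via exists_ok; a minimal equivalent sketch (the method name
# create_data_set_idempotent is hypothetical, not from the original):
def create_data_set_idempotent(self, data_set_name):
    data_set = Dataset(f'{self.client.project}.{data_set_name}')
    data_set.location = 'EU'
    # exists_ok=True returns the existing dataset instead of raising Conflict
    self.client.create_dataset(data_set, exists_ok=True)
    return 0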
from google.cloud.bigquery import Dataset, DatasetReference


def make_dataset(project,
                 dataset_id,
                 friendly_name=None,
                 description=None,
                 default_table_expiration_ms=None,
                 location=None,
                 labels=None,
                 access_entries=None):
    """Build a Dataset object locally; no API request is made."""
    dataset_ref = DatasetReference(project, dataset_id)
    dataset = Dataset(dataset_ref)
    dataset.friendly_name = friendly_name
    dataset.description = description
    dataset.default_table_expiration_ms = default_table_expiration_ms
    dataset.location = location
    if labels is not None:
        dataset.labels = labels
    if access_entries is not None:
        dataset.access_entries = access_entries
    return dataset
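# Hypothetical usage of make_dataset: construct the Dataset locally, then hand
# it to a client to create it server-side (all names and values below are
# placeholders, not from the original).
from google.cloud import bigquery

client = bigquery.Client()
dataset = make_dataset(client.project,
                       'analytics',
                       friendly_name='Analytics',
                       default_table_expiration_ms=7 * 24 * 3600 * 1000,
                       location='US',
                       labels={'team': 'data'})
dataset = client.create_dataset(dataset, exists_ok=True)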
def create(self, dataset_id):
    """
    Create a dataset in Google BigQuery

    Parameters
    ----------
    dataset_id : str
        Name of dataset to be written
    """
    from google.cloud.bigquery import Dataset

    if self.exists(dataset_id):
        raise DatasetCreationError(
            "Dataset {0} already exists".format(dataset_id))

    # Client.dataset() was deprecated and later removed; use the full ID.
    dataset = Dataset("{0}.{1}".format(self.client.project, dataset_id))
    if self.location is not None:
        dataset.location = self.location

    try:
        self.client.create_dataset(dataset)
    except self.http_error as ex:
        self.process_http_error(ex)
logger.info("Found Dataset %s.", repr(dataset_id)) try: table = client.get_table(table_ref) logger.info("Found Table %s.", repr(table_id)) checkForDuplicates = True except: # TODO find teh right exception for this logger.info("Creating Table %s.", repr(table_id)) table = Table(table_ref, schema=airmonitorSchema) table = client.create_table(table) except: # TODO find the right exception for this # create the dataset logger.info("Creating Dataset %s.", repr(dataset_id)) dataset = Dataset(dataset_ref) dataset.location = "EU" dataset = client.create_dataset(dataset) # create a table logger.info("Creating Table %s.", repr(table_id)) table = Table(table_ref, schema=airmonitorSchema) table = client.create_table(table) # functions ------------------------------------------------------------------- def queryThis(query: Query) -> list: """Query the given query object and return the resulting list.""" q = str(query) # extra line for clarity. calls the __str__ magic function return list(client.query(q).result())