def dataset_id(bigquery_client: bigquery.Client, project_id: str):
    dataset_id = prefixer.create_prefix()
    full_dataset_id = f"{project_id}.{dataset_id}"
    dataset = bigquery.Dataset(full_dataset_id)
    bigquery_client.create_dataset(dataset)
    yield dataset_id
    bigquery_client.delete_dataset(
        dataset, delete_contents=True, not_found_ok=True
    )

def dataset(bq: bigquery.Client, dataset_id: str):
    """Context manager for creating and deleting the BigQuery dataset for a test."""
    try:
        bq.get_dataset(dataset_id)
    except NotFound:
        bq.create_dataset(dataset_id)
    try:
        yield bq.dataset(dataset_id)
    finally:
        bq.delete_dataset(dataset_id, delete_contents=True)

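# A minimal usage sketch for the generator above, assuming it is meant to be
# wrapped with contextlib.contextmanager and that NotFound comes from
# google.api_core.exceptions. The dataset name below is hypothetical.
import contextlib

from google.api_core.exceptions import NotFound
from google.cloud import bigquery

managed_dataset = contextlib.contextmanager(dataset)

bq = bigquery.Client()
with managed_dataset(bq, "my_test_dataset") as ds:
    # The dataset exists for the duration of this block and is deleted
    # (with its contents) on exit, even if the test body raises.
    print(ds.dataset_id)
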
def create_bq_dataset(dataset_name='price_data'):
    '''Create dataset if not exists'''
    client = Client()
    datasets = [
        client.project + "." + i.dataset_id
        for i in list(client.list_datasets())
    ]
    if client.project + "." + dataset_name not in datasets:
        # Dataset() requires a fully-qualified ID ("project.dataset_id");
        # passing the bare dataset name raises a ValueError.
        dataset = Dataset(client.project + "." + dataset_name)
        dataset.location = "US"
        client.create_dataset(dataset)
    else:
        print("Dataset already exists")

def _get_table_id_for_new_entity(client: Client, project: str, dataset_name: str) -> str:
    """Gets the table_id for the new entity to be uploaded."""
    # First create the BigQuery dataset if it doesn't exist
    dataset = bigquery.Dataset(f"{client.project}.{dataset_name}")
    dataset.location = "US"
    try:
        client.get_dataset(dataset)
    except NotFound:
        # Only create the dataset if it does not exist
        client.create_dataset(dataset, exists_ok=True)
    return f"{client.project}.{dataset_name}.entity_df_{project}_{int(time.time())}"

def get_client():
    # If this raises a DefaultCredentialsError:
    # * on a developer's machine, run `gcloud auth application-default login`
    #   to use OAuth
    # * elsewhere, ensure that GOOGLE_APPLICATION_CREDENTIALS is set and
    #   points to a valid set of credentials for a service account
    #
    # A warning is raised when authenticating with OAuth, recommending that
    # server applications use a service account. We can ignore this.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        client = Client(project="ebmdatalab", location="EU")
        client.create_dataset("qof", exists_ok=True)
    return client

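# A short usage sketch: the returned client is already scoped to the
# "ebmdatalab" project with the "qof" dataset guaranteed to exist. The query
# here is a hypothetical placeholder.
client = get_client()
for row in client.query("SELECT 1 AS x").result():
    print(row.x)
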
def bigquery_dataset(
    bigquery_client: bigquery.Client, bigquery_schema: List[bigquery.SchemaField]
):
    project_id = bigquery_client.project
    dataset_id = "test_pybigquery"
    dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
    dataset = bigquery_client.create_dataset(dataset, exists_ok=True)
    sample_table_id = f"{project_id}.{dataset_id}.sample"
    try:
        # Since the data changes rarely and the tests are mostly read-only,
        # only create the tables if they don't already exist.
        # TODO: Create shared sample data tables in bigquery-public-data that
        # include test values for all data types.
        bigquery_client.get_table(sample_table_id)
    except google.api_core.exceptions.NotFound:
        job1 = load_sample_data(sample_table_id, bigquery_client, bigquery_schema)
        job1.result()
    one_row_table_id = f"{project_id}.{dataset_id}.sample_one_row"
    try:
        bigquery_client.get_table(one_row_table_id)
    except google.api_core.exceptions.NotFound:
        job2 = load_sample_data(
            one_row_table_id,
            bigquery_client,
            bigquery_schema,
            filename="sample_one_row.json",
        )
        job2.result()
    view = bigquery.Table(f"{project_id}.{dataset_id}.sample_view")
    view.view_query = f"SELECT string FROM `{dataset_id}.sample`"
    bigquery_client.create_table(view, exists_ok=True)
    return dataset_id

def get_or_create_table(
    client: bigquery.Client, dataset_id: str, table_id: str
) -> bigquery.Table:
    """
    Create the BigQuery dataset and table, or fetch them if they already exist.
    """
    logging.info(f'Creating dataset {dataset_id} if not exists...')
    dataset = client.create_dataset(dataset_id, exists_ok=True)  # Create or fetch the dataset.

    logging.info(f'Creating table {dataset_id}.{table_id} if not exists...')
    table_ref = dataset.table(table_id)
    return client.create_table(  # Create or fetch the table.
        bigquery.Table(table_ref, schema=[
            bigquery.SchemaField('id', 'string', description='Tweet ID'),
            bigquery.SchemaField('lang', 'string', description='Tweet language'),
            bigquery.SchemaField('screen_name', 'string', description='User name'),
            bigquery.SchemaField('text', 'string', description='Tweet body'),
            bigquery.SchemaField('created_at', 'timestamp', description='Tweet timestamp'),
        ]),
        exists_ok=True)

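# A minimal usage sketch for the helper above: stream one row matching the
# tweet schema via insert_rows_json. The dataset/table names and row values
# are hypothetical.
from google.cloud import bigquery

client = bigquery.Client()
table = get_or_create_table(client, 'tweets', 'timeline')
errors = client.insert_rows_json(table, [{
    'id': '1',
    'lang': 'en',
    'screen_name': 'example_user',
    'text': 'hello world',
    'created_at': '2021-01-01T00:00:00Z',
}])
assert not errors, errors
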
def create_dataset(
    client: bq.Client, dataset_id: str, location: str = "US", timeout: int = 30
):
    dataset_full_id = get_full_dataset_name(client, dataset_id)
    dataset = bq.Dataset(dataset_full_id)
    dataset.location = location
    dataset = client.create_dataset(dataset, timeout=timeout)

def dataset_id(client: bigquery.Client):
    project_id = client.project
    dataset_id = prefixer.create_prefix()
    dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
    dataset = client.create_dataset(dataset)
    yield dataset_id
    client.delete_dataset(dataset_id, delete_contents=True)

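# A sketch of how a generator like the one above is typically consumed,
# assuming it is registered as a pytest fixture and that the surrounding
# module provides `prefixer`. The test body is hypothetical.
import pytest
from google.cloud import bigquery

@pytest.fixture
def temp_dataset_id():
    client = bigquery.Client()
    yield from dataset_id(client)

def test_dataset_exists(temp_dataset_id):
    client = bigquery.Client()
    assert client.get_dataset(temp_dataset_id) is not None
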
def create_dataset(
    client: bigquery.Client,
    dataset_name: str,
    description: str = 'Automatic imports of known FusionTables',
) -> bigquery.Dataset:
    ds = bigquery.Dataset(f'{client.project}.{to_safe_name(dataset_name)}')
    ds.description = description
    return client.create_dataset(ds)

def _get_table_reference_for_new_entity(
    client: Client, dataset_project: str, dataset_name: str
) -> str:
    """Gets the table_id for the new entity to be uploaded."""
    # First create the BigQuery dataset if it doesn't exist
    dataset = bigquery.Dataset(f"{dataset_project}.{dataset_name}")
    dataset.location = "US"
    try:
        client.get_dataset(dataset)
    except NotFound:
        # Only create the dataset if it does not exist
        client.create_dataset(dataset, exists_ok=True)
    table_name = offline_utils.get_temp_entity_table_name()
    return f"{dataset_project}.{dataset_name}.{table_name}"

def bigquery_dml_dataset(bigquery_client: bigquery.Client):
    project_id = bigquery_client.project
    dataset_id = "test_pybigquery_dml"
    dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
    # Add default table expiration in case cleanup fails.
    dataset.default_table_expiration_ms = 1000 * int(
        datetime.timedelta(days=1).total_seconds()
    )
    dataset = bigquery_client.create_dataset(dataset, exists_ok=True)
    return dataset_id

def create_dataset(client: bigquery.Client,
                   dataset_id: str,
                   location: str = 'EU',
                   description: str = "Creation date: {}".format(datetime.datetime.now())):
    """
    Creates a dataset with the reference project_id:dataset_id

    Args:
        client: BQ API client
        dataset_id: dataset to be created
        location: location of the dataset (default is Europe for legal reasons)
        description: description of the dataset (default is the date of
            creation; note this default is evaluated once, at import time)

    Returns:

    """
    # TODO: ADD CHECK OF EXISTENCE
    dataset_ref = client.dataset(dataset_id)
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = location
    dataset.description = description
    client.create_dataset(dataset)

def create_dataset(client: bigquery.Client, dataset_id: str, description: str,
                   location: str) -> None:
    """Creates a dataset in GCP.

    Args:
        client: The client used to create the dataset. Client should have a
            project defined.
        dataset_id: The dataset id of the dataset to create.
        description: The description of the dataset to create.
        location: The GCP location of the dataset to create.
    """
    dataset = bigquery.Dataset("{}.{}".format(client.project, dataset_id))
    dataset.description = description
    dataset.location = location
    try:
        client.create_dataset(dataset)
        print("Created dataset {}.{}".format(client.project, dataset_id))
    except gexceptions.Conflict:
        print(
            "Dataset {} already existing in project {}. Skipping dataset creation..."
            .format(dataset_id, client.project))

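# For comparison, a sketch of the same idempotent behavior without catching
# Conflict: recent versions of google-cloud-bigquery accept exists_ok=True,
# which turns "already exists" into a no-op. The dataset name is hypothetical.
from google.cloud import bigquery

client = bigquery.Client()
ds = bigquery.Dataset(f"{client.project}.my_dataset")
ds.location = "US"
client.create_dataset(ds, exists_ok=True)  # no Conflict raised if it exists
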
def bigquery_alt_dataset(
    bigquery_client: bigquery.Client, bigquery_schema: List[bigquery.SchemaField]
):
    project_id = bigquery_client.project
    dataset_id = "test_pybigquery_alt"
    dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
    dataset = bigquery_client.create_dataset(dataset, exists_ok=True)
    sample_table_id = f"{project_id}.{dataset_id}.sample_alt"
    try:
        bigquery_client.get_table(sample_table_id)
    except google.api_core.exceptions.NotFound:
        job = load_sample_data(sample_table_id, bigquery_client, bigquery_schema)
        job.result()
    return dataset_id

def get_or_create_table(client: bigquery.Client) -> bigquery.Table:
    try:
        dataset = client.get_dataset("sensors")
    except NotFound as _:
        dataset = client.create_dataset("sensors")

    # The default project ID is not set and hence a fully-qualified ID is required.
    table_ref = bigquery.TableReference(dataset.reference, table_id="particulate_matter")
    try:
        return client.get_table(table_ref)
    except NotFound as _:
        return client.create_table(
            bigquery.Table(
                table_ref,
                schema=[
                    bigquery.SchemaField("humidity", "NUMERIC",
                                         description="Sensor DHT22 humidity in %"),
                    bigquery.SchemaField("max_micro", "NUMERIC", description=""),
                    bigquery.SchemaField("min_micro", "NUMERIC", description=""),
                    bigquery.SchemaField("samples", "NUMERIC", description=""),
                    bigquery.SchemaField("sds_p1", "NUMERIC",
                                         description="Sensor SDS011 PM10 in µg/m³"),
                    bigquery.SchemaField("sds_p2", "NUMERIC",
                                         description="Sensor SDS011 PM2.5 in µg/m³"),
                    bigquery.SchemaField("signal", "NUMERIC",
                                         description="WiFi signal strength in dBm"),
                    bigquery.SchemaField("temperature", "NUMERIC",
                                         description="Sensor DHT22 temperature in °C"),
                    bigquery.SchemaField("datetime", "DATETIME",
                                         description="Datetime of measurement",
                                         mode="REQUIRED"),
                ],
            ))

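# A brief usage sketch: stream one measurement into the table returned by
# get_or_create_table. The project name and field values are made up for
# illustration; only "datetime" is REQUIRED by the schema above.
from google.cloud import bigquery

client = bigquery.Client(project="my-project")
table = get_or_create_table(client)
errors = client.insert_rows_json(table, [{
    "humidity": 41.5,
    "sds_p1": 12.3,
    "sds_p2": 7.8,
    "signal": -67,
    "temperature": 21.4,
    "datetime": "2021-06-01T12:00:00",
}])
assert not errors, errors
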
def create_dataset(dataset_name: str = dest_dataset_name,
                   client: bigquery.Client = dest_client) -> str:
    # should use the destination project's client
    dataset_id = f"{client.project}.{dataset_name}"

    # Construct a full Dataset object to send to the API.
    dataset = bigquery.Dataset(dataset_id)
    # 1 week in milliseconds
    dataset.default_table_expiration_ms = 7 * 24 * 60 * 60 * 1000
    dataset.location = "US"

    # Send the dataset to the API for creation, with an explicit timeout.
    # Raises google.api_core.exceptions.Conflict if the Dataset already
    # exists within the project.
    dataset = client.create_dataset(dataset, timeout=30)

    print("Created dataset {}.{}".format(client.project, dataset.dataset_id))
    return dataset_name

def get_or_create_dataset(bq_client: bigquery.Client, dataset_id: str,
                          location: str = "us-west1") -> bigquery.Dataset:
    """
    Tries to create a dataset in BigQuery. If it already exists, just returns
    the dataset object.
    :param bq_client: Client object to BigQuery
    :param dataset_id: Id of the new (or existing) dataset
    :param location: Geographic location in GCP
    :return:
    """
    full_dataset_id = f"{bq_client.project}.{dataset_id}"
    dataset = bigquery.Dataset(full_dataset_id)
    dataset.location = location
    dataset = bq_client.create_dataset(dataset, exists_ok=True, timeout=30)
    return dataset

def create_bq_dataset(
    client: bigquery.Client,
    dataset_id: str,
    dataset_description: str = None,
) -> bigquery.Dataset:
    """
    Create empty dataset.
    """
    # TODO: validate 'dataset_id'.
    # > Dataset IDs must be alphanumeric (plus underscores) and must be at most 1024 chars long.

    # note: the dual instantiation here is not intuitive: client.dataset() builds
    # a DatasetReference, which Dataset() then wraps in a mutable Dataset object.
    dataset = bigquery.Dataset(client.dataset(dataset_id))  # type: bigquery.Dataset
    dataset.description = dataset_description

    # API request
    return client.create_dataset(dataset)  # type: bigquery.Dataset

def bigquery_dataset(
    bigquery_client: bigquery.Client, bigquery_schema: List[bigquery.SchemaField]
):
    project_id = bigquery_client.project
    dataset_id = prefixer.create_prefix()
    dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
    dataset = bigquery_client.create_dataset(dataset)

    sample_table_id = f"{project_id}.{dataset_id}.sample"
    job1 = load_sample_data(sample_table_id, bigquery_client, bigquery_schema)
    job1.result()

    one_row_table_id = f"{project_id}.{dataset_id}.sample_one_row"
    job2 = load_sample_data(
        one_row_table_id,
        bigquery_client,
        bigquery_schema,
        filename="sample_one_row.json",
    )
    job2.result()

    view = bigquery.Table(f"{project_id}.{dataset_id}.sample_view")
    view.view_query = f"SELECT string FROM `{dataset_id}.sample`"
    bigquery_client.create_table(view)

    yield dataset_id
    bigquery_client.delete_dataset(dataset_id, delete_contents=True)