Exemplo n.º 1
0
def dataset_id(bigquery_client: bigquery.Client, project_id: str):
    """Yield the ID of a freshly created dataset; delete it on teardown."""
    new_id = prefixer.create_prefix()
    created = bigquery_client.create_dataset(
        bigquery.Dataset(f"{project_id}.{new_id}")
    )
    yield new_id
    # Best-effort cleanup: remove contents too, tolerate prior deletion.
    bigquery_client.delete_dataset(created, delete_contents=True, not_found_ok=True)
Exemplo n.º 2
0
def dataset(bq: bigquery.Client, dataset_id: str):
    """Create the BigQuery dataset for a test, yield its reference, then drop it."""
    # EAFP: probe for the dataset and create it only when missing.
    try:
        bq.get_dataset(dataset_id)
    except NotFound:
        bq.create_dataset(dataset_id)
    # Hand the dataset reference to the caller; always clean up afterwards,
    # even if the test body raises.
    try:
        yield bq.dataset(dataset_id)
    finally:
        bq.delete_dataset(dataset_id, delete_contents=True)
Exemplo n.º 3
0
def create_bq_dataset(dataset_name='price_data'):
    """Create the dataset in the client's default project if it does not exist.

    Args:
        dataset_name: Bare dataset ID to create (default ``'price_data'``).
    """
    client = Client()
    # BUG FIX: bigquery.Dataset() needs a fully-qualified "project.dataset"
    # ID; the old code passed the bare name, which the client rejects even
    # though the membership check below compares fully-qualified IDs.
    full_id = client.project + "." + dataset_name
    existing = [
        client.project + "." + d.dataset_id
        for d in client.list_datasets()  # iterator — no list() wrapper needed
    ]
    if full_id not in existing:
        dataset = Dataset(full_id)
        dataset.location = "US"
        client.create_dataset(dataset)
    else:
        print("Dataset already exists")
Exemplo n.º 4
0
def _get_table_id_for_new_entity(client: Client, project: str,
                                 dataset_name: str) -> str:
    """Gets the table_id for the new entity to be uploaded."""
    # Make sure the destination dataset exists before naming a table in it.
    ds = bigquery.Dataset(f"{client.project}.{dataset_name}")
    ds.location = "US"
    try:
        client.get_dataset(ds)
    except NotFound:
        # exists_ok absorbs the race where a concurrent creator wins.
        client.create_dataset(ds, exists_ok=True)

    # Timestamp suffix keeps repeated uploads from colliding.
    table_name = f"entity_df_{project}_{int(time.time())}"
    return f"{client.project}.{dataset_name}.{table_name}"
Exemplo n.º 5
0
def get_client():
    """Return a BigQuery client for the ebmdatalab project with the qof dataset ensured.

    If this raises a DefaultCredentialsError:
     * on a developer's machine, run ``gcloud auth application-default login``
       to use OAuth
     * elsewhere, ensure that GOOGLE_APPLICATION_CREDENTIALS is set and
       points to a valid set of credentials for a service account
    """
    # OAuth authentication emits a warning recommending that server
    # applications use a service account; it is safe to silence here.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        client = Client(project="ebmdatalab", location="EU")

    client.create_dataset("qof", exists_ok=True)

    return client
Exemplo n.º 6
0
def bigquery_dataset(bigquery_client: bigquery.Client,
                     bigquery_schema: List[bigquery.SchemaField]):
    """Idempotently create the shared test dataset, its sample tables, and a view."""
    project_id = bigquery_client.project
    dataset_id = "test_pybigquery"
    bigquery_client.create_dataset(
        bigquery.Dataset(f"{project_id}.{dataset_id}"), exists_ok=True)

    # Since the data changes rarely and the tests are mostly read-only,
    # only create the tables if they don't already exist.
    # TODO: Create shared sample data tables in bigquery-public-data that
    #       include test values for all data types.
    for table_id, load_kwargs in (
        (f"{project_id}.{dataset_id}.sample", {}),
        (f"{project_id}.{dataset_id}.sample_one_row",
         {"filename": "sample_one_row.json"}),
    ):
        try:
            bigquery_client.get_table(table_id)
        except google.api_core.exceptions.NotFound:
            job = load_sample_data(table_id, bigquery_client, bigquery_schema,
                                   **load_kwargs)
            job.result()

    view = bigquery.Table(f"{project_id}.{dataset_id}.sample_view", )
    view.view_query = f"SELECT string FROM `{dataset_id}.sample`"
    bigquery_client.create_table(view, exists_ok=True)
    return dataset_id
def get_or_create_table(client: bigquery.Client, dataset_id: str,
                        table_id: str) -> bigquery.Table:
    """Create the BigQuery dataset and table, or fetch them if they already exist."""
    logging.info(f'Creating dataset {dataset_id} if not exists...')
    # Create the dataset, or fetch it when it already exists.
    dataset = client.create_dataset(dataset_id, exists_ok=True)

    logging.info(f'Creating table {dataset_id}.{table_id} if not exists...')
    # Column descriptions are intentionally kept in the original Japanese.
    schema = [
        bigquery.SchemaField(name, field_type, description=desc)
        for name, field_type, desc in (
            ('id', 'string', 'ツイートのID'),
            ('lang', 'string', 'ツイートの言語'),
            ('screen_name', 'string', 'ユーザー名'),
            ('text', 'string', 'ツイートの本文'),
            ('created_at', 'timestamp', 'ツイートの日時'),
        )
    ]
    # Create the table, or fetch it when it already exists.
    return client.create_table(
        bigquery.Table(dataset.table(table_id), schema=schema),
        exists_ok=True)
Exemplo n.º 8
0
def create_dataset(
    client: bq.Client, dataset_id: str, location: str = "US", timeout: int = 30
):
    """Create a BigQuery dataset and return it.

    Args:
        client: BigQuery client whose project hosts the dataset.
        dataset_id: Bare dataset ID to create.
        location: Geographic location for the dataset (default "US").
        timeout: API request timeout in seconds.

    Returns:
        The newly created ``bq.Dataset`` as returned by the server.
    """
    dataset_full_id = get_full_dataset_name(client, dataset_id)
    dataset = bq.Dataset(dataset_full_id)
    dataset.location = location
    # BUG FIX: the created dataset was assigned to a local and discarded;
    # return it so callers can use the server-populated object.
    return client.create_dataset(dataset, timeout=timeout)
Exemplo n.º 9
0
def dataset_id(client: bigquery.Client):
    """Yield a newly created dataset's ID; drop the dataset on teardown."""
    new_id = prefixer.create_prefix()
    client.create_dataset(bigquery.Dataset(f"{client.project}.{new_id}"))
    yield new_id
    client.delete_dataset(new_id, delete_contents=True)
Exemplo n.º 10
0
def create_dataset(
    client: bigquery.Client,
    dataset_name: str,
    description: str = 'Automatic imports of known FusionTables'
) -> bigquery.Dataset:
    """Create a dataset named after *dataset_name* (sanitized) with a description."""
    full_id = f'{client.project}.{to_safe_name(dataset_name)}'
    dataset = bigquery.Dataset(full_id)
    dataset.description = description
    return client.create_dataset(dataset)
Exemplo n.º 11
0
def _get_table_reference_for_new_entity(client: Client, dataset_project: str,
                                        dataset_name: str) -> str:
    """Gets the table_id for the new entity to be uploaded."""
    # Ensure the destination dataset exists before referencing a table in it.
    ds = bigquery.Dataset(f"{dataset_project}.{dataset_name}")
    ds.location = "US"
    try:
        client.get_dataset(ds)
    except NotFound:
        # exists_ok absorbs the race where a concurrent creator wins.
        client.create_dataset(ds, exists_ok=True)

    table_name = offline_utils.get_temp_entity_table_name()
    return f"{dataset_project}.{dataset_name}.{table_name}"
Exemplo n.º 12
0
def bigquery_dml_dataset(bigquery_client: bigquery.Client):
    """Idempotently create the DML test dataset and return its ID."""
    dataset_id = "test_pybigquery_dml"
    dataset = bigquery.Dataset(f"{bigquery_client.project}.{dataset_id}")
    # Expire tables after one day so leftovers disappear if cleanup fails.
    one_day_ms = 1000 * int(datetime.timedelta(days=1).total_seconds())
    dataset.default_table_expiration_ms = one_day_ms
    bigquery_client.create_dataset(dataset, exists_ok=True)
    return dataset_id
Exemplo n.º 13
0
def create_dataset(client: bigquery.Client, dataset_id: str, location: str = 'EU',
                   description: str = None):
    """
    Creates a dataset with the following reference: project_id:dataset_id

    Args:
        client: BQ API client
        dataset_id: dataset to be created
        location: location of the dataset (default is Europe for legal reasons)
        description: description of the dataset (default is date of creation)

    Returns:

    """
    # BUG FIX: the old default evaluated datetime.datetime.now() once at
    # import time, stamping every dataset with the module-load timestamp.
    # Compute the default per call instead.
    if description is None:
        description = "Creation date: {}".format(datetime.datetime.now())
    # TODO: ADD CHECK OF EXISTENCE
    dataset_ref = client.dataset(dataset_id)
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = location
    dataset.description = description
    client.create_dataset(dataset)
Exemplo n.º 14
0
def create_dataset(client: bigquery.Client, dataset_id: str, description: str,
                   location: str) -> None:
    """Creates a dataset in GCP.

    Args:
        client: The client used to create the dataset. Client should have a project defined.
        dataset_id: The dataset id of the dataset to create.
        description: The description of the dataset to create.
        location: The GCP location of the dataset to create.

    """
    dataset = bigquery.Dataset("{}.{}".format(client.project, dataset_id))
    dataset.description = description
    dataset.location = location
    # A Conflict means the dataset is already there; report and move on.
    try:
        client.create_dataset(dataset)
    except gexceptions.Conflict:
        print(
            "Dataset {} already existing in project {}. Skipping dataset creation..."
            .format(dataset_id, client.project))
    else:
        print("Created dataset {}.{}".format(client.project, dataset_id))
Exemplo n.º 15
0
def bigquery_alt_dataset(bigquery_client: bigquery.Client,
                         bigquery_schema: List[bigquery.SchemaField]):
    """Idempotently create the alternate test dataset and its sample table."""
    project_id = bigquery_client.project
    dataset_id = "test_pybigquery_alt"
    bigquery_client.create_dataset(
        bigquery.Dataset(f"{project_id}.{dataset_id}"), exists_ok=True)
    table_id = f"{project_id}.{dataset_id}.sample_alt"
    # Load the sample data only when the table is missing.
    try:
        bigquery_client.get_table(table_id)
    except google.api_core.exceptions.NotFound:
        load_sample_data(table_id, bigquery_client, bigquery_schema).result()
    return dataset_id
Exemplo n.º 16
0
def get_or_create_table(client: bigquery.Client) -> bigquery.Table:
    """Return the ``sensors.particulate_matter`` table, creating the dataset
    and table (with the full sensor schema) when they do not exist yet."""
    # Ensure the "sensors" dataset exists.
    try:
        dataset = client.get_dataset("sensors")
    except NotFound as _:
        dataset = client.create_dataset("sensors")

    # The default project ID is not set and hence a fully-qualified ID is required.
    # NOTE(review): TableReference is given a Dataset object here, while the
    # API conventionally takes a DatasetReference — confirm this works with
    # the pinned google-cloud-bigquery version (Dataset.reference would be
    # the canonical argument).
    table_ref = bigquery.TableReference(dataset, table_id="particulate_matter")
    try:
        return client.get_table(table_ref)
    except NotFound as _:
        # Table missing: create it with the sensor measurement schema.
        return client.create_table(
            bigquery.Table(
                table_ref,
                schema=[
                    bigquery.SchemaField(
                        "humidity",
                        "NUMERIC",
                        description="Sensor DHT22humidity in %"),
                    bigquery.SchemaField("max_micro",
                                         "NUMERIC",
                                         description=""),
                    bigquery.SchemaField("min_micro",
                                         "NUMERIC",
                                         description=""),
                    bigquery.SchemaField("samples", "NUMERIC", description=""),
                    bigquery.SchemaField(
                        "sds_p1",
                        "NUMERIC",
                        description="Sensor SDS011 PM10 in µg/m³"),
                    bigquery.SchemaField(
                        "sds_p2",
                        "NUMERIC",
                        description="Sensor SDS011 PM2.5 in µg/m³"),
                    bigquery.SchemaField(
                        "signal",
                        "NUMERIC",
                        description="WiFi signal strength in dBm"),
                    bigquery.SchemaField(
                        "temperature",
                        "NUMERIC",
                        description="Sensor DHT22 temperature in °C"),
                    # Only required field: every row must carry a timestamp.
                    bigquery.SchemaField("datetime",
                                         "DATETIME",
                                         description="Datetime of measurement",
                                         mode="REQUIRED"),
                ],
            ))
Exemplo n.º 17
0
def create_dataset(dataset_name: str = dest_dataset_name, client: bigquery.Client = dest_client) -> str:
    """Create *dataset_name* in the destination project and return its name."""
    # Should use the destination project client.
    # Construct a full Dataset object to send to the API.
    dataset = bigquery.Dataset(f"{client.project}.{dataset_name}")
    dataset.default_table_expiration_ms = 7 * 24 * 60 * 60 * 1000  # 1 week in milliseconds
    dataset.location = "US"

    # Send the dataset to the API for creation, with an explicit timeout.
    # Raises google.api_core.exceptions.Conflict if the Dataset already
    # exists within the project.
    dataset = client.create_dataset(dataset, timeout=30)
    print("Created dataset {}.{}".format(client.project, dataset.dataset_id))
    return dataset_name
    # NOTE(review): this nested definition sits after the enclosing
    # function's return statement, so it is unreachable dead code — it was
    # likely meant to be a module-level function.
    def get_or_create_dataset(bq_client: bigquery.Client,
                              dataset_id: str,
                              location: str = "us-west1") -> bigquery.Dataset:
        """
            Tries to create a dataset in bigquery. If it already exists, just return dataset object
        :param bq_client: Client object to bigquery
        :param dataset_id: Id of the new (of existent) dataset
        :param location: Geographic location in GCP
        :return:
        """
        # BUG FIX: previously read ``client.project`` — a name captured from
        # the enclosing scope — instead of the ``bq_client`` argument, so a
        # caller-supplied client's project was silently ignored.
        full_dataset_id = f"{bq_client.project}.{dataset_id}"
        dataset = bigquery.Dataset(full_dataset_id)
        dataset.location = location

        # exists_ok makes the call idempotent.
        dataset = bq_client.create_dataset(dataset, exists_ok=True, timeout=30)

        return dataset
Exemplo n.º 19
0
def create_bq_dataset(
    client: bigquery.Client,
    dataset_id: str,
    dataset_description: str = None,
) -> bigquery.Dataset:
    """
    Create empty dataset.

    Args:
        client: BigQuery client; its project hosts the new dataset.
        dataset_id: Bare dataset ID.
        dataset_description: Optional human-readable description.

    Returns:
        The newly created dataset.
    """
    # TODO: validate 'dataset_id'.
    #   > Dataset IDs must be alphanumeric (plus underscores) and must be at most 1024 chars long.

    # Client.dataset() is deprecated; build the Dataset from the
    # fully-qualified "project.dataset" ID instead.
    dataset = bigquery.Dataset(
        f"{client.project}.{dataset_id}")  # type: bigquery.Dataset
    dataset.description = dataset_description

    # API request
    return client.create_dataset(dataset)  # type: bigquery.Dataset
Exemplo n.º 20
0
def bigquery_dataset(bigquery_client: bigquery.Client,
                     bigquery_schema: List[bigquery.SchemaField]):
    """Create a throwaway dataset with sample tables and a view; clean up after use."""
    project_id = bigquery_client.project
    dataset_id = prefixer.create_prefix()
    bigquery_client.create_dataset(
        bigquery.Dataset(f"{project_id}.{dataset_id}"))

    # Load both sample tables, waiting for each load job to finish.
    for table_name, load_kwargs in (
        ("sample", {}),
        ("sample_one_row", {"filename": "sample_one_row.json"}),
    ):
        job = load_sample_data(f"{project_id}.{dataset_id}.{table_name}",
                               bigquery_client, bigquery_schema, **load_kwargs)
        job.result()

    view = bigquery.Table(f"{project_id}.{dataset_id}.sample_view", )
    view.view_query = f"SELECT string FROM `{dataset_id}.sample`"
    bigquery_client.create_table(view)
    yield dataset_id
    bigquery_client.delete_dataset(dataset_id, delete_contents=True)