Exemplo n.º 1
0
def exist_dataset(client: bq.Client, dataset_id: str) -> bool:
    """Return True if the dataset identified by ``dataset_id`` exists.

    Args:
        client: BigQuery client used to issue the lookup.
        dataset_id: Dataset id, resolved to a full name via
            ``get_full_dataset_name`` before the API call.

    Returns:
        True when the dataset is found, False when the API raises NotFound.
    """
    full_name = get_full_dataset_name(client, dataset_id)
    try:
        # get_dataset() raises NotFound for a missing dataset; any other
        # error (e.g. permissions) propagates to the caller unchanged.
        client.get_dataset(full_name)
        return True
    except NotFound:
        return False
Exemplo n.º 2
0
def dataset(bq: bigquery.Client, dataset_id: str):
    """Context manager for creating and deleting the BigQuery dataset for a test.

    Ensures the dataset exists before yielding it, and always deletes the
    dataset (including its contents) on exit.
    """
    # Probe for the dataset first; only create it when the probe fails.
    exists = True
    try:
        bq.get_dataset(dataset_id)
    except NotFound:
        exists = False
    if not exists:
        bq.create_dataset(dataset_id)
    # bq.dataset() is called inside the try so that the cleanup in
    # ``finally`` runs even if obtaining the reference raises.
    try:
        yield bq.dataset(dataset_id)
    finally:
        bq.delete_dataset(dataset_id, delete_contents=True)
Exemplo n.º 3
0
def _get_table_id_for_new_entity(client: Client, project: str,
                                 dataset_name: str) -> str:
    """Gets the table_id for the new entity to be uploaded.

    Args:
        client: BigQuery client; its default project hosts the dataset.
        project: Name embedded in the generated table id (presumably a
            logical project name — confirm with callers).
        dataset_name: Name of the BigQuery dataset to create or reuse.

    Returns:
        A fully-qualified table id of the form
        ``<client-project>.<dataset>.entity_df_<project>_<unix-time>``.
    """
    # Ensure the BigQuery dataset exists. ``exists_ok=True`` makes creation
    # idempotent, so the previous get_dataset() existence probe was a
    # redundant extra API round-trip and has been removed.
    dataset = bigquery.Dataset(f"{client.project}.{dataset_name}")
    dataset.location = "US"
    client.create_dataset(dataset, exists_ok=True)

    # Timestamp suffix keeps concurrently generated entity tables distinct.
    return f"{client.project}.{dataset_name}.entity_df_{project}_{int(time.time())}"
Exemplo n.º 4
0
def execute_query(bq_client: bigquery.Client, env_vars: dict, query_path: object,
                  output_table_name: str, time_partition: bool) -> None:
    """Executes transformation query to a new destination table.

    Args:
        bq_client: bigquery.Client object.
        env_vars: Mapping of key to value, where value is an environment
            variable; must contain the key ``'corrected_dataset_id'``.
        query_path: Object representing location of SQL query to execute.
        output_table_name: String representing name of table that holds output.
        time_partition: Boolean indicating whether to time-partition output.
    """
    dataset_ref = bq_client.get_dataset(
        bigquery.DatasetReference(project=bq_client.project,
                                  dataset_id=env_vars['corrected_dataset_id']))
    table_ref = dataset_ref.table(output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    # WRITE_TRUNCATE is a class attribute, so access it on the class itself;
    # the previous ``WriteDisposition()`` instantiation worked only
    # incidentally and created a throwaway object.
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    # Time Partitioning table is only needed for final output query
    if time_partition:
        job_config.time_partitioning = bigquery.TimePartitioning(
            field='usage_start_time', expiration_ms=None)
    logging.info('Attempting query...')
    # Execute Query
    query_job = bq_client.query(query=render_template(query_path, env_vars),
                                job_config=job_config)

    query_job.result()  # Waits for the query to finish
Exemplo n.º 5
0
def _get_table_reference_for_new_entity(client: Client, dataset_project: str,
                                        dataset_name: str) -> str:
    """Gets the table_id for the new entity to be uploaded.

    Args:
        client: BigQuery client used for dataset creation.
        dataset_project: Project that owns (or will own) the dataset.
        dataset_name: Name of the BigQuery dataset to create or reuse.

    Returns:
        A fully-qualified reference ``<project>.<dataset>.<temp-table-name>``.
    """
    # Ensure the BigQuery dataset exists. ``exists_ok=True`` already makes
    # creation idempotent, so the previous get_dataset() existence probe was
    # a redundant extra API round-trip and has been removed.
    dataset = bigquery.Dataset(f"{dataset_project}.{dataset_name}")
    dataset.location = "US"
    client.create_dataset(dataset, exists_ok=True)

    table_name = offline_utils.get_temp_entity_table_name()

    return f"{dataset_project}.{dataset_name}.{table_name}"
Exemplo n.º 6
0
def get_bq_dataset(
    client: bigquery.Client,
    dataset_id: str,
    project_id: Optional[str] = None,
) -> bigquery.Dataset:
    """Fetch a BigQuery dataset resource.

    Args:
        client: BigQuery client.
        dataset_id: ID of the dataset to fetch.
        project_id: Project that owns the dataset. If ``None``, the default
            project of ``client`` is used.

    Returns:
        The ``bigquery.Dataset`` returned by the API.
    """
    # PEP 484 forbids the implicit-Optional ``project_id: str = None`` the
    # original used; the explicit Optional[str] is backward-compatible.
    dataset_ref = client.dataset(
        dataset_id, project=project_id)  # type: bigquery.DatasetReference

    # API request
    return client.get_dataset(dataset_ref)  # type: bigquery.Dataset
Exemplo n.º 7
0
def get_or_create_table(client: bigquery.Client) -> bigquery.Table:
    """Return the ``sensors.particulate_matter`` table, creating the dataset
    and table (with its fixed schema) if either does not exist yet.

    Args:
        client: BigQuery client used for all lookups and creations.

    Returns:
        The existing or newly created ``bigquery.Table``.
    """
    # ``except NotFound as _`` bound an unused name; the bare form is the
    # idiomatic way to ignore the exception value.
    try:
        dataset = client.get_dataset("sensors")
    except NotFound:
        dataset = client.create_dataset("sensors")

    # The default project ID is not set and hence a fully-qualified ID is required.
    table_ref = bigquery.TableReference(dataset, table_id="particulate_matter")
    try:
        return client.get_table(table_ref)
    except NotFound:
        return client.create_table(
            bigquery.Table(
                table_ref,
                schema=[
                    bigquery.SchemaField(
                        "humidity",
                        "NUMERIC",
                        description="Sensor DHT22humidity in %"),
                    bigquery.SchemaField("max_micro",
                                         "NUMERIC",
                                         description=""),
                    bigquery.SchemaField("min_micro",
                                         "NUMERIC",
                                         description=""),
                    bigquery.SchemaField("samples", "NUMERIC", description=""),
                    bigquery.SchemaField(
                        "sds_p1",
                        "NUMERIC",
                        description="Sensor SDS011 PM10 in µg/m³"),
                    bigquery.SchemaField(
                        "sds_p2",
                        "NUMERIC",
                        description="Sensor SDS011 PM2.5 in µg/m³"),
                    bigquery.SchemaField(
                        "signal",
                        "NUMERIC",
                        description="WiFi signal strength in dBm"),
                    bigquery.SchemaField(
                        "temperature",
                        "NUMERIC",
                        description="Sensor DHT22 temperature in °C"),
                    bigquery.SchemaField("datetime",
                                         "DATETIME",
                                         description="Datetime of measurement",
                                         mode="REQUIRED"),
                ],
            ))
Exemplo n.º 8
0
def get_tables(project_id: str,
               client: Client,
               dataset_id: Optional[str] = None) -> Iterator[Table]:
    """
    Gets BigQuery tables from a Google Cloud project.

    Args:
        project_id (str): ID of the project.
        dataset_id (Optional[str]): The ID of the dataset.
            If `None`, will retrieve tables from all datasets in project.
        client (Client): A Google Cloud Client instance.

    Yields:
        Table: A BigQuery table.
    """
    # Resolve the dataset references to walk: either the single requested
    # dataset, or every dataset in the project (fetched lazily).
    if dataset_id:
        dataset_refs = [f"{project_id}.{dataset_id}"]
    else:
        dataset_refs = (dataset.reference
                        for dataset in client.list_datasets(project=project_id))

    for dataset_ref in dataset_refs:
        dataset = client.get_dataset(dataset_ref)
        for table in client.list_tables(dataset):
            yield client.get_table(table)