Example #1
def extract_rows(table_name: str = table_name, bucket_name: str = dest_bucket_name, path: str = dest_path,
                 diff_type: str = diff_type, dest_data_project: str = dest_data_project,
                 dest_dataset_name: str = dest_dataset_name, client: bigquery.Client = dest_client):
    job_config = bigquery.ExtractJobConfig(print_header=False, destination_format="NEWLINE_DELIMITED_JSON")

    diff_type_val = DiffType[diff_type].value

    destination_uri = f"gs://{bucket_name}/{path}/{table_name}/{diff_type_val}/*"
    dataset_ref = bigquery.DatasetReference(dest_data_project, dest_dataset_name)
    table_ref = dataset_ref.table(f"{table_name}_{diff_type_val}")

    extract_job = client.extract_table(
        table_ref,
        destination_uri,
        job_config=job_config
    )  # API request
    print(f"The write destination is: {destination_uri}")
    try:
        extract_job.result()
    except GoogleCloudError as err:
        print(f"There was a {type(err)}")
        print(err)
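
The defaults above are bound to module-level globals at definition time. A purely illustrative call with explicit arguments might look like the sketch below (every value is a placeholder, and the DiffType member name is an assumption):

extract_rows(
    table_name="orders",
    bucket_name="my-diff-bucket",
    path="exports/daily",
    diff_type="ADDED",  # assumes DiffType defines an ADDED member
    dest_data_project="my-project",
    dest_dataset_name="diff_dataset",
    client=bigquery.Client(project="my-project"),
)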
Example #2
def create_big_query_table():

    bq_hook = BigQueryHook(bigquery_conn_id='bigquery_default',
                           use_legacy_sql=False)

    gcp_credentials = bq_hook._get_credentials()

    bq_client = bigquery.Client(credentials=gcp_credentials,
                                project=bigquery_project)

    target_dataset_ref = bigquery.DatasetReference(
        project=bigquery_project, dataset_id=reference_dataset)

    try:
        target_dataset = bq_client.get_dataset(dataset_ref=target_dataset_ref)
    except NotFound as ex:
        # LOGGER.info(f"Dataset '{target_dataset_ref}' not found, attempting to create.")
        target_dataset = bq_client.create_dataset(dataset=target_dataset_ref)
    target_table_ref = bigquery.TableReference(dataset_ref=target_dataset,
                                               table_id=reference_table)

    bq_client.delete_table(table=target_table_ref)
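
Note that delete_table raises NotFound when the target table does not exist. If the intent is "drop if exists", a hedged variant (supported by recent client versions) can pass not_found_ok instead:

    bq_client.delete_table(table=target_table_ref, not_found_ok=True)  # no error if the table is absent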
Example #3
    def test_export_query_results_to_cloud_storage(self):
        """export_query_results_to_cloud_storage creates the table from the view query and
        exports the table."""
        bucket = self.mock_project_id + '-bucket'
        query_job = futures.Future()
        query_job.set_result([])
        extract_job = futures.Future()
        extract_job.set_result(None)
        self.mock_client.query.return_value = query_job
        self.mock_client.extract_table.return_value = extract_job
        self.bq_client.export_query_results_to_cloud_storage([
            ExportQueryConfig.from_view_query(
                view=self.mock_view,
                view_filter_clause='WHERE x = y',
                intermediate_table_name=self.mock_table_id,
                output_uri=f'gs://{bucket}/view.json',
                output_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)
        ])
        self.mock_client.query.assert_called()
        self.mock_client.extract_table.assert_called()
        self.mock_client.delete_table.assert_called_with(
            bigquery.DatasetReference(self.mock_project_id, self.mock_view.dataset_id).table(self.mock_table_id))
Example #4
    def test_copy_bq_views(self, mock_table_exists: mock.MagicMock,
                           mock_copy_view: mock.MagicMock) -> None:
        """Check that copy_view is called when the view does not exist in the destination dataset."""
        self.mock_client.list_tables.return_value = [self.mock_view]
        self.mock_client.get_table.return_value = self.mock_view
        mock_table_exists.side_effect = self.table_exists_side_effect

        copy_bq_views(
            source_project_id=self.mock_source_project_id,
            source_dataset_id=self.mock_source_dataset_id,
            destination_project_id=self.mock_destination_project_id,
            destination_dataset_id=self.mock_destination_dataset_id,
        )

        expected_view = BigQueryView(
            project_id=self.mock_destination_project_id,
            dataset_id=self.mock_destination_dataset_id,
            view_id=self.mock_view.view_id,
            view_query_template=self.mock_view.view_query,
            should_materialize=True,
        )

        expected_destination_dataset_ref = bigquery.DatasetReference(
            project=self.mock_destination_project_id,
            dataset_id=self.mock_destination_dataset_id,
        )

        mock_copy_view.assert_called()
        self.assertEqual(expected_view,
                         mock_copy_view.call_args_list[0][1].get("view"))
        self.assertEqual(
            self.mock_destination_project_id,
            mock_copy_view.call_args_list[0][1].get(
                "destination_client").project_id,
        )
        self.assertEqual(
            expected_destination_dataset_ref,
            mock_copy_view.call_args_list[0][1].get("destination_dataset_ref"),
        )
Example #5
def create_external_table(project_id: str) -> None:
    client = bigquery.Client()
    dataset_id = "social_dataset"
    dataset_ref = bigquery.DatasetReference(project_id, dataset_id)

    table_id = "ja_kakei_chousa_income_divide_over_two_member"
    table = bigquery.Table(dataset_ref.table(table_id))

    external_config = bigquery.ExternalConfig("PARQUET")
    external_config.source_uris = [
        "gs://ja-kakei-chousa-income-divide-over-two-member/*"
    ]
    external_config.autodetect = True
    hive_partitioning = bigquery.external_config.HivePartitioningOptions()
    hive_partitioning.mode = "AUTO"
    hive_partitioning.require_partition_filter = False
    hive_partitioning.source_uri_prefix = (
        "gs://ja-kakei-chousa-income-divide-over-two-member")
    external_config.hive_partitioning = hive_partitioning
    table.external_data_configuration = external_config

    table = client.create_table(table, exists_ok=True)
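
Once created, the external table can be queried like a native table. A minimal sketch follows; the aggregate is illustrative, while the dataset and table names come from the example above:

def count_external_rows(project_id: str) -> None:
    client = bigquery.Client()
    query = (
        "SELECT COUNT(*) AS n "
        f"FROM `{project_id}.social_dataset.ja_kakei_chousa_income_divide_over_two_member`"
    )
    for row in client.query(query).result():
        print(row.n)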
Example #6
def create_external_table(project_id: str) -> None:
    client = bigquery.Client()
    dataset_id = "social_dataset"
    dataset_ref = bigquery.DatasetReference(project_id, dataset_id)

    table_id = "jasso_gakuseiseikatsu_stats_annual_income_divide_university"
    table = bigquery.Table(dataset_ref.table(table_id))

    external_config = bigquery.ExternalConfig("PARQUET")
    external_config.source_uris = [
        "gs://jasso-gakuseiseikatsu-stats-annual-income-divide-university/*"
    ]
    external_config.autodetect = True
    hive_partitioning = bigquery.external_config.HivePartitioningOptions()
    hive_partitioning.mode = "AUTO"
    hive_partitioning.require_partition_filter = False
    hive_partitioning.source_uri_prefix = (
        "gs://jasso-gakuseiseikatsu-stats-annual-income-divide-university")
    external_config.hive_partitioning = hive_partitioning
    table.external_data_configuration = external_config

    table = client.create_table(table, exists_ok=True)
Example #7
    def test_is_table_definition_in_match_with_bigquery_throw_user_exception(
            self):
        dataset_reference = bigquery.DatasetReference('project', 'dataset')
        table_reference = bigquery.TableReference(dataset_reference, 'table')
        schema = [
            bigquery.SchemaField('col1', 'INTEGER'),
            bigquery.SchemaField('col2', 'STRING')
        ]
        table = bigquery.Table(table_reference, schema)

        invalid_schema = [
            bigquery.SchemaField('col2', 'STRING'),
            bigquery.SchemaField('col1', 'INTEGER')
        ]
        try:
            schema_mapper.is_table_definition_in_match_with_bigquery(
                invalid_schema, table)
            pytest.fail('Must raise exception.')
        except exceptions.UserException as err:
            assert str(err) == "Column order mismatch. " \
                               "Actual configuration: col2, col1. " \
                               "Expected BigQuery: col1, col2."
Example #8
def test_update_table_cmek(client, to_delete):
    """Patch a table's metadata."""
    dataset_id = "update_table_cmek_{}".format(_millis())
    table_id = "update_table_cmek_{}".format(_millis())
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    dataset = bigquery.Dataset(dataset_ref)
    client.create_dataset(dataset)
    to_delete.append(dataset)

    table = bigquery.Table(dataset.table(table_id))
    original_kms_key_name = "projects/{}/locations/{}/keyRings/{}/cryptoKeys/{}".format(
        "cloud-samples-tests", "us", "test", "test"
    )
    table.encryption_configuration = bigquery.EncryptionConfiguration(
        kms_key_name=original_kms_key_name
    )
    table = client.create_table(table)

    # [START bigquery_update_table_cmek]
    # from google.cloud import bigquery
    # client = bigquery.Client()

    assert table.encryption_configuration.kms_key_name == original_kms_key_name

    # Set a new encryption key to use for the destination.
    # TODO: Replace this key with a key you have created in KMS.
    updated_kms_key_name = (
        "projects/cloud-samples-tests/locations/us/keyRings/test/cryptoKeys/otherkey"
    )
    table.encryption_configuration = bigquery.EncryptionConfiguration(
        kms_key_name=updated_kms_key_name
    )

    table = client.update_table(table, ["encryption_configuration"])  # API request

    assert table.encryption_configuration.kms_key_name == updated_kms_key_name
    assert original_kms_key_name != updated_kms_key_name
Example #9
def test_create_partitioned_table(client, to_delete):
    dataset_id = "create_table_partitioned_{}".format(_millis())
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    dataset = client.create_dataset(dataset_ref)
    to_delete.append(dataset)

    # [START bigquery_create_table_partitioned]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # project = client.project
    # dataset_ref = bigquery.DatasetReference(project, 'my_dataset')

    table_ref = dataset_ref.table("my_partitioned_table")
    schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
        bigquery.SchemaField("date", "DATE"),
    ]
    table = bigquery.Table(table_ref, schema=schema)
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        field="date",  # name of column to use for partitioning
        expiration_ms=7776000000,
    )  # 90 days

    table = client.create_table(table)

    print(
        "Created table {}, partitioned on column {}".format(
            table.table_id, table.time_partitioning.field
        )
    )
    # [END bigquery_create_table_partitioned]

    assert table.time_partitioning.type_ == "DAY"
    assert table.time_partitioning.field == "date"
    assert table.time_partitioning.expiration_ms == 7776000000
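
Queries prune to matching partitions when they filter on the partitioning column; a hedged continuation of the snippet above (the date range is a placeholder) could look like:

    sql = (
        "SELECT name, post_abbr "
        "FROM `my_dataset.my_partitioned_table` "
        "WHERE date BETWEEN '2021-01-01' AND '2021-01-31'"  # scans only the matching daily partitions
    )
    rows = client.query(sql).result()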
Example #10
def test_create_table_nested_repeated_schema(client, to_delete):
    dataset_id = "create_table_nested_repeated_{}".format(_millis())
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    dataset = bigquery.Dataset(dataset_ref)
    client.create_dataset(dataset)
    to_delete.append(dataset)

    # [START bigquery_nested_repeated_schema]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # project = client.project
    # dataset_ref = bigquery.DatasetReference(project, 'my_dataset')

    schema = [
        bigquery.SchemaField("id", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("first_name", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("last_name", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("dob", "DATE", mode="NULLABLE"),
        bigquery.SchemaField(
            "addresses",
            "RECORD",
            mode="REPEATED",
            fields=[
                bigquery.SchemaField("status", "STRING", mode="NULLABLE"),
                bigquery.SchemaField("address", "STRING", mode="NULLABLE"),
                bigquery.SchemaField("city", "STRING", mode="NULLABLE"),
                bigquery.SchemaField("state", "STRING", mode="NULLABLE"),
                bigquery.SchemaField("zip", "STRING", mode="NULLABLE"),
                bigquery.SchemaField("numberOfYears", "STRING", mode="NULLABLE"),
            ],
        ),
    ]
    table_ref = dataset_ref.table("my_table")
    table = bigquery.Table(table_ref, schema=schema)
    table = client.create_table(table)  # API request

    print("Created table {}".format(table.full_table_id))
Example #11
    def test_query_succeed(self, mock_client, mock_kfp_context, mock_dump_json,
                           mock_display):
        mock_kfp_context().__enter__().context_id.return_value = 'ctx1'
        mock_client().get_job.side_effect = exceptions.NotFound('not found')
        mock_dataset = bigquery.DatasetReference('project-1', 'dataset-1')
        mock_client().dataset.return_value = mock_dataset
        mock_client().get_dataset.side_effect = exceptions.NotFound(
            'not found')
        mock_response = {
            'configuration': {
                'query': {
                    'query': 'SELECT * FROM table_1'
                }
            }
        }
        mock_client(
        ).query.return_value.to_api_repr.return_value = mock_response

        result = query('SELECT * FROM table_1',
                       'project-1',
                       'dataset-1',
                       output_gcs_path='gs://output/path')

        self.assertEqual(mock_response, result)
        mock_client().create_dataset.assert_called()
        expected_job_config = bigquery.QueryJobConfig()
        expected_job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
        expected_job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
        expected_job_config.destination = mock_dataset.table('query_ctx1')
        mock_client().query.assert_called_with('SELECT * FROM table_1',
                                               mock.ANY,
                                               job_id='query_ctx1')
        actual_job_config = mock_client().query.call_args_list[0][0][1]
        self.assertDictEqual(expected_job_config.to_api_repr(),
                             actual_job_config.to_api_repr())
        mock_client().extract_table.assert_called_with(
            mock_dataset.table('query_ctx1'), 'gs://output/path')
        self.assertEqual(2, mock_dump_json.call_count)
Example #12
def test_copy_table(client, to_delete):
    dataset_id = 'copy_table_dataset_{}'.format(_millis())
    dest_dataset = bigquery.Dataset(client.dataset(dataset_id))
    dest_dataset = client.create_dataset(dest_dataset)
    to_delete.append(dest_dataset)

    # [START copy_table]
    source_dataset = bigquery.DatasetReference('bigquery-public-data',
                                               'samples')
    source_table_ref = source_dataset.table('shakespeare')

    # dataset_id = 'my_dataset'
    dest_table_ref = dest_dataset.table('destination_table')

    job = client.copy_table(source_table_ref, dest_table_ref)  # API request
    job.result()  # Waits for job to complete.

    assert job.state == 'DONE'
    dest_table = client.get_table(dest_table_ref)  # API request
    assert dest_table.table_id == 'destination_table'
    # [END copy_table]

    to_delete.insert(0, dest_table)
Example #13
    def test_upload_data_tokyo(self, project_id, tokyo_dataset,
                               bigquery_client):
        from google.cloud import bigquery

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        tokyo_destination = "{}.to_gbq_test".format(tokyo_dataset)

        # Initialize table with sample data
        gbq.to_gbq(
            df,
            tokyo_destination,
            project_id,
            credentials=self.credentials,
            location="asia-northeast1",
        )

        table = bigquery_client.get_table(
            bigquery.TableReference(
                bigquery.DatasetReference(project_id, tokyo_dataset),
                "to_gbq_test",
            ))
        assert table.num_rows > 0
Example #14
def create_table_bq(dataset, table_name, schema, field):

    client = bigquery.Client()
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset)

    table_ref = dataset_ref.table(table_name)
    schema = [
        bigquery.SchemaField(key, value) for key, value in schema.items()
    ]
    table = bigquery.Table(table_ref, schema=schema)
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        field=field,  # name of column to use for partitioning
        require_partition_filter=True)

    table = client.create_table(table)

    print(
        f"Created table {table.table_id}, partitioned on column {table.time_partitioning.field}"
    )

    return
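
A call to the helper above passes the schema as a plain name-to-type dict; everything in this sketch is illustrative:

create_table_bq(
    dataset="analytics",      # hypothetical dataset id
    table_name="events",      # hypothetical table id
    schema={"event_date": "DATE", "user_id": "STRING", "payload": "STRING"},
    field="event_date",       # partitioning column; must be a DATE, TIMESTAMP or DATETIME field
)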
Example #15
def check_up_to_date(project, dataset_id, table_name='fact_actuary'):

    sql_check = """
            SELECT CASE
            WHEN ( SELECT CREATED_ON FROM `geb-dwh-test.uat_geb_dwh_eu_act.fact_actuary` LIMIT 1)=( SELECT MAX(CREATED_ON) FROM `geb-dwh-test.uat_geb_dwh_eu_awr.fact_claimdetails`) 
            THEN 'True' ELSE 'False' END AS result;
    """

    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    table_ref = dataset_ref.table(table_name)

    if bq_if_table_exists(client, table_ref):
        print("AVAILABILITY CHECK RETURNS TRUE", table_ref, "ALREADY EXISTS.")
        df = client.query(sql_check).result().to_dataframe()
        if df.iloc[0][0] == 'True':
            print("RECENTNESS CHECK RETURNS:", df.iloc[0][0],
                  "No need to recreate fact_actuary table.\n")
            return True
        return False
    else:
        print(
            "Fact Actuary table is unavailable, it will be created with the latest data.\nThis will take approximately 3 minutes. Please be patient and don't exit."
        )
        return False
Example #16
    def test_import_bq_file_with_multibyte_raw_file_alternate_separator_and_encoding(
        self, ) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagDoubleDaggerWINDOWS1252.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagDoubleDaggerWINDOWS1252",
            destination_table_schema=[
                bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()
Example #17
def execute_query(bq_client: bigquery.Client,
                  env_vars: Dict[str, Union[str, bool]],
                  query_path: object,
                  output_table_name: str,
                  time_partition: bool) -> None:
    """Executes transformation query to a new destination table.

    Args:
        bq_client: bigquery.Client object
        env_vars: Dictionary of key: value, where value is environment variable
        query_path: Object representing location of SQL query to execute
        output_table_name: String representing name of table that holds output
        time_partition: Boolean indicating whether to time-partition output
    """
    dataset_ref = bq_client.get_dataset(bigquery.DatasetReference(
        project=bq_client.project,
        dataset_id=env_vars['corrected_dataset_id']))
    table_ref = dataset_ref.table(output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    # Time Partitioning table is only needed for final output query
    if time_partition:
        job_config.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            expiration_ms=None)
    sql = query_path.query
    sql = sql.format(**env_vars)
    logging.info('Attempting query...')

    # Execute Query
    query_job = bq_client.query(
        query=sql,
        job_config=job_config)

    query_job.result()  # Waits for the query to finish
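
A hedged invocation sketch: per the code above, query_path only needs a .query attribute, so a SimpleNamespace stands in for it here; every name and value is a placeholder.

from types import SimpleNamespace

env_vars = {"corrected_dataset_id": "corrected_dataset", "start_date": "2020-01-01"}
query_path = SimpleNamespace(
    query="SELECT * FROM `{corrected_dataset_id}.raw_table` WHERE date >= '{start_date}'")

execute_query(
    bq_client=bigquery.Client(),
    env_vars=env_vars,
    query_path=query_path,
    output_table_name="corrected_output",
    time_partition=False,
)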
Example #18
def test_copy_table(client, to_delete):
    DATASET_ID = 'copy_table_dataset_%d' % (_millis(), )
    # [START copy_table]
    source_dataset = bigquery.DatasetReference('bigquery-public-data',
                                               'samples')
    source_table_ref = source_dataset.table('shakespeare')

    dest_dataset = bigquery.Dataset(client.dataset(DATASET_ID))
    dest_dataset = client.create_dataset(dest_dataset)  # API request
    dest_table_ref = dest_dataset.table('destination_table')

    job_config = bigquery.CopyJobConfig()
    job = client.copy_table(source_table_ref,
                            dest_table_ref,
                            job_config=job_config)  # API request
    job.result()  # Waits for job to complete.

    assert job.state == 'DONE'
    dest_table = client.get_table(dest_table_ref)  # API request
    assert dest_table.table_id == 'destination_table'
    # [END copy_table]

    to_delete.append(dest_dataset)
    to_delete.insert(0, dest_table)
Example #19
def copy_bq_views(source_project_id: str, source_dataset_id: str,
                  destination_project_id: str, destination_dataset_id: str):
    """Copies all views from the source_project_id.source_dataset_id to the
    destination_project_id.destination_dataset_id."""

    # Construct a BigQuery client with the source_project_id
    source_client = BigQueryClientImpl(project_id=source_project_id)

    # Construct a BigQuery client with the destination_project_id
    destination_client = BigQueryClientImpl(project_id=destination_project_id)
    destination_dataset = bigquery.DatasetReference(destination_project_id,
                                                    destination_dataset_id)
    tables_in_source_dataset = source_client.list_tables(source_dataset_id)

    for table_ref in tables_in_source_dataset:
        table = source_client.get_table(
            source_client.dataset_ref_for_id(table_ref.dataset_id),
            table_ref.table_id)
        view_query = table.view_query

        # Only copy this view if there is a view_query to replicate and the view doesn't already exist in the
        # destination dataset
        if view_query and not destination_client.table_exists(
                destination_dataset, table_id=table.table_id):
            # Remove any references to the source_project_id from the view_query
            updated_view_query = view_query.replace(source_project_id,
                                                    '{project_id}')

            # Copy the view into the destination dataset
            source_client.copy_view(
                view=BigQueryView(project_id=destination_project_id,
                                  dataset_id=destination_dataset_id,
                                  view_id=table.table_id,
                                  view_query_template=updated_view_query),
                destination_client=destination_client,
                destination_dataset_ref=destination_dataset)
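
An illustrative invocation; the project and dataset ids are placeholders:

copy_bq_views(
    source_project_id="my-source-project",
    source_dataset_id="reference_views",
    destination_project_id="my-destination-project",
    destination_dataset_id="reference_views",
)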
Example #20
def store_temp_table_to_gcs(project_id, dataset_id, table_id, location, bucket_name, destination_full_path, client):

    destination_uri = f'gs://{bucket_name}/{destination_full_path}'
    dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
    table_ref = dataset_ref.table(table_id)

    try:
        extract_job = client.extract_table(
            table_ref,
            destination_uri,
            location="US")

        extract_job.result()
    except BadRequest:
        # Fall back to an enumerated destination file name (helper defined
        # elsewhere) and retry the export.
        destination_uri = f'gs://{bucket_name}/{enumerate_destination_file_name(destination_full_path)}'
        extract_job = client.extract_table(
            table_ref,
            destination_uri,
            location="US")
        extract_job.result()
    except Exception:
        raise

    print(f'Successfully exported your query to {destination_uri}')
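
An illustrative call; every value below is a placeholder:

store_temp_table_to_gcs(
    project_id="my-project",
    dataset_id="scratch",
    table_id="query_results",
    location="US",
    bucket_name="my-export-bucket",
    destination_full_path="exports/query_results.csv",
    client=bigquery.Client(project="my-project"),
)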
Example #21
    def test_list(self, data_dir, capsys, credentials_type):
        client = self.get_client('service_account_manage')
        dataset_reference = bigquery.DatasetReference(
            self.get_project(), os.environ.get('BIGQUERY_DATASET'))
        dataset = bigquery.Dataset(dataset_reference)
        client.create_dataset(dataset)

        os.environ['KBC_DATADIR'] = data_dir + 'sample_populated/'
        self.prepare(action='list',
                     data_dir=data_dir,
                     credentials_type=credentials_type)
        application = app.App()
        application.run()
        out, err = capsys.readouterr()
        assert err == ''
        data = json.loads(out)
        assert 'projects' in data.keys()
        assert self.get_project() in map(lambda project: project['id'],
                                         data['projects'])
        project = list(
            filter(lambda project: project['id'] == self.get_project(),
                   data['projects']))[0]
        assert os.environ.get('BIGQUERY_DATASET') in map(
            lambda dataset: dataset['id'], project['datasets'])
Example #22
    def test_import_bq_file_with_row_missing_columns(self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagRowMissingColumns.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagRowMissingColumns",
            destination_table_schema=[
                bigquery.SchemaField("id_column", "STRING", "NULLABLE"),
                bigquery.SchemaField("comment", "STRING", "NULLABLE"),
                bigquery.SchemaField("termination_code", "STRING", "NULLABLE"),
                bigquery.SchemaField("update_date", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(2, self.num_lines_uploaded)
        self._check_no_temp_files_remain()
Example #23
def export_data_for_experiment(date, dataset, bucket, gcs_path, source_project,
                               destination_project, experiment):
    """Export the monitoring data for a specific experiment in the dataset."""
    experiment_slug, start_date = experiment
    table_name = dataset.split(".")[-1]
    storage_client = storage.Client(destination_project)
    client = bigquery.Client(source_project)

    if (date - start_date) > timedelta(days=14):
        # if the experiment has been running for more than 14 days,
        # export data as 30 minute intervals
        query = f"""
            WITH data AS (
                SELECT * EXCEPT(experiment) FROM
                `{dataset}`
                WHERE experiment = '{experiment_slug}' AND
                time >= (
                    SELECT TIMESTAMP(MIN(start_date))
                    FROM `moz-fx-data-experiments.monitoring.experimenter_experiments_v1`
                    WHERE normandy_slug = '{experiment_slug}'
                )
            )
            SELECT * EXCEPT(rn) FROM (
                SELECT *, ROW_NUMBER() OVER (PARTITION BY time) AS rn
                FROM (
                    SELECT
                        * EXCEPT(time),
                        TIMESTAMP_SECONDS(
                            UNIX_SECONDS(time) - MOD(UNIX_SECONDS(time), 30 * 60) + 30 * 60
                            ) AS time,
                    FROM data
                    ORDER BY time DESC
                )
            )
            WHERE rn = 1
            ORDER BY time DESC
        """
    else:
        # for recently launched experiments, data is exported at 5 minute intervals
        query = f"""
            SELECT * EXCEPT(experiment) FROM {dataset}
            WHERE experiment = '{experiment_slug}' AND
            time >= (
                SELECT TIMESTAMP(MIN(start_date))
                FROM `moz-fx-data-experiments.monitoring.experimenter_experiments_v1`
                WHERE normandy_slug = '{experiment_slug}'
            )
            ORDER BY time DESC
        """

    job = client.query(query)
    job.result()
    dataset_ref = bigquery.DatasetReference(source_project,
                                            job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    # export data for experiment grouped by branch
    _upload_table_to_gcs(
        table_ref,
        bucket,
        gcs_path,
        experiment_slug,
        f"{table_name}_by_branch",
        source_project,
        client,
        storage_client,
    )

    # export aggregated data for experiment
    query = f"""
        SELECT
            time,
            SUM(value) AS value
        FROM `{source_project}.{job.destination.dataset_id}.{job.destination.table_id}`
        GROUP BY 1
        ORDER BY time DESC
    """

    job = client.query(query)
    job.result()
    dataset_ref = bigquery.DatasetReference(source_project,
                                            job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    _upload_table_to_gcs(
        table_ref,
        bucket,
        gcs_path,
        experiment_slug,
        table_name,
        source_project,
        client,
        storage_client,
    )
Example #24
def bigquery(schema, data_directory, ignore_missing_dependency, **params):
    try:
        import google.api_core.exceptions
        from google.cloud import bigquery
    except ImportError:
        msg = 'google-cloud-bigquery dependency is missing'
        if ignore_missing_dependency:
            logger.warning('Ignored: %s', msg)
            return 0
        else:
            raise click.ClickException(msg)

    project_id = os.environ['GOOGLE_BIGQUERY_PROJECT_ID']
    bqclient = bigquery.Client(project=project_id)

    # Create testing dataset.
    testing_dataset = bigquery.DatasetReference(bqclient.project, 'testing')
    try:
        bqclient.create_dataset(bigquery.Dataset(testing_dataset))
    except google.api_core.exceptions.Conflict:
        pass  # Skip if already created.

    # Set up main data tables.
    job = bqclient.query(schema.read())
    job.result()
    if job.error_result:
        raise click.ClickException(str(job.error_result))

    # Load main data table.
    data_directory = Path(data_directory)
    functional_alltypes_path = data_directory / 'functional_alltypes.csv'
    load_config = bigquery.LoadJobConfig()
    load_config.skip_leading_rows = 1  # skip the header row.
    with open(str(functional_alltypes_path), 'rb') as csvfile:
        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('functional_alltypes'),
            job_config=load_config,
        ).result()

        if job.error_result:
            raise click.ClickException(str(job.error_result))

    # Load an ingestion time partitioned table.
    with open(str(functional_alltypes_path), 'rb') as csvfile:
        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('functional_alltypes_parted'),
            job_config=load_config,
        ).result()

        if job.error_result:
            raise click.ClickException(str(job.error_result))

    # Create a table with complex data types (nested and repeated).
    struct_table_path = data_directory / 'struct_table.avro'
    with open(str(struct_table_path), 'rb') as avrofile:
        load_config = bigquery.LoadJobConfig()
        load_config.write_disposition = 'WRITE_TRUNCATE'
        load_config.source_format = 'AVRO'
        job = bqclient.load_table_from_file(
            avrofile,
            testing_dataset.table('struct_table'),
            job_config=load_config,
        ).result()

        if job.error_result:
            raise click.ClickException(str(job.error_result))

    # Create empty date-partitioned table.
    date_table = bigquery.Table(testing_dataset.table('date_column_parted'))
    date_table.schema = [
        bigquery.SchemaField('my_date_parted_col', 'DATE'),
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('int_col', 'INTEGER'),
    ]
    date_table.time_partitioning = bigquery.TimePartitioning(
        field='my_date_parted_col')
    bqclient.create_table(date_table, exists_ok=True)

    # Create empty timestamp-partitioned tables.
    timestamp_table = bigquery.Table(
        testing_dataset.table('timestamp_column_parted'))
    timestamp_table.schema = [
        bigquery.SchemaField('my_timestamp_parted_col', 'DATE'),
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('int_col', 'INTEGER'),
    ]
    timestamp_table.time_partitioning = bigquery.TimePartitioning(
        field='my_timestamp_parted_col')
    bqclient.create_table(timestamp_table, exists_ok=True)

    # Create a table with a numeric column
    numeric_table = bigquery.Table(testing_dataset.table('numeric_table'))
    numeric_table.schema = [
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('numeric_col', 'NUMERIC'),
    ]
    bqclient.create_table(numeric_table, exists_ok=True)

    df = pd.read_csv(
        str(functional_alltypes_path),
        usecols=['string_col', 'double_col'],
        header=0,
    )
    numeric_csv = io.StringIO()
    df.to_csv(numeric_csv, header=False, index=False)
    csvfile = io.BytesIO(numeric_csv.getvalue().encode('utf-8'))
    load_config = bigquery.LoadJobConfig()
    load_config.write_disposition = 'WRITE_TRUNCATE'
    load_config.skip_leading_rows = 1  # skip the header row.
    load_config.schema = numeric_table.schema

    job = bqclient.load_table_from_file(
        csvfile,
        testing_dataset.table('numeric_table'),
        job_config=load_config,
    ).result()

    if job.error_result:
        raise click.ClickException(str(job.error_result))
Example #25
    def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
        return bigquery.DatasetReference(project=self.project_id,
                                         dataset_id=dataset_id)
Example #26
def find_glean_targets(pool, client, project=SHARED_PROD):
    """Return a dict like DELETE_TARGETS for glean tables."""
    datasets = {dataset.dataset_id for dataset in client.list_datasets(project)}
    glean_stable_tables = [
        table
        for tables in pool.map(
            client.list_tables,
            [
                bigquery.DatasetReference(project, dataset_id)
                for dataset_id in datasets
                if dataset_id.endswith("_stable")
            ],
            chunksize=1,
        )
        for table in tables
        if table.labels.get("schema_id") == GLEAN_SCHEMA_ID
    ]
    source_doctype = "deletion_request"
    sources = {
        dataset_id: DeleteSource(qualified_table_id(table), GLEAN_CLIENT_ID, project)
        # dict comprehension will only keep the last value for a given key, so
        # sort by table_id to use the latest version
        for table in sorted(glean_stable_tables, key=lambda t: t.table_id)
        if table.table_id.startswith(source_doctype)
        # re-use source for derived tables
        for dataset_id in [
            table.dataset_id,
            re.sub("_stable$", "_derived", table.dataset_id),
        ]
        if dataset_id in datasets
    }
    return {
        **{
            # glean stable tables that have a source
            glean_target(qualified_table_id(table)): sources[table.dataset_id]
            for table in glean_stable_tables
            if table.dataset_id in sources
            and not table.table_id.startswith(source_doctype)
            # migration tables not yet supported
            and not table.table_id.startswith("migration")
        },
        **{
            # glean derived tables that contain client_id
            client_id_target(table=qualified_table_id(table)): sources[table.dataset_id]
            for table in pool.map(
                client.get_table,
                [
                    table
                    for tables in pool.map(
                        client.list_tables,
                        [
                            bigquery.DatasetReference(project, dataset_id)
                            for dataset_id in sources
                            if not dataset_id.endswith("_stable")
                        ],
                        chunksize=1,
                    )
                    for table in tables
                ],
                chunksize=1,
            )
            if any(field.name == CLIENT_ID for field in table.schema)
        },
    }
Example #27
import os
import logging
from typing import List

from google.cloud import bigquery
from google.cloud import storage
from google.api_core.exceptions import NotFound

GCP_PROJECT = os.environ['GCP_PROJECT']
BUCKET = os.environ['BUCKET']
TABLE_NAME = os.environ['TABLE_NAME']
DATASET_NAME = os.environ['DATASET_NAME']
TEMP_DATASET_NAME = os.environ['TEMP_DATASET_NAME']

bq = bigquery.Client(project=GCP_PROJECT)

temp_dataset_ref = bigquery.DatasetReference(GCP_PROJECT, TEMP_DATASET_NAME)
temp_table_ref = bigquery.TableReference(temp_dataset_ref, TABLE_NAME)


def delete_temp_table():
    """Deletes temporary BigQuery table"""

    try:
        bq.get_table(temp_table_ref)
        bq.delete_table(temp_table_ref)
        logging.info("deleted temp table")
    except NotFound:
        pass


def get_queries() -> List[str]:
 def dataset_ref_for_id(self, dataset_id: str) -> bigquery.DatasetReference:
     return bigquery.DatasetReference(self._project_id, dataset_id)
Example #29
def main_process_function(project_id, config_file, retention, backup_type, expiration):
    """
    This is the main function for exporting the big-query datasets
    to google-cloud-storage.
    :param project_id: Google Cloud Project Id (type:str)
    :param config_file: Backup Configuration File Path (type:str)
    :param retention: Retention Type ["daily", "monthly", "weekly", "yearly"] (type:str)
    :param backup_type: Backup Type ["all", "config"] (type:str)
    :param expiration: True/False (type:bool/str)
    :return NoneType:
    """
    print("Running bigquery dataset export for project:{}".format(project_id))
    # Reading backup-parameters from json config
    with open(config_file) as f:
        master_config = json.load(f)
    backup_config = master_config["backup"]

    location = backup_config["location"]
    schema_path = backup_config["schema_uri"]
    table_path = backup_config["table_uri"]
    project_backup_config = backup_config["projects_dict"][project_id]
    mapped_list = []

    # Get timestamp
    timestamp = datetime.now().strftime("%Y-%m-%d")

    # Creating Big Query Client
    client = bigquery.Client(project=project_id)

    # Getting mapped relation between datasets and their tables
    if backup_type == "all":
        # Get all datasets
        datasets = list_all_datasets(client=client)
        # Map dataset->[tables]
        dataset_tables_map = get_datasets_tables_dict(
            client=client, project_id=project_id, datasets=datasets
        )
        mapped_list.append(dataset_tables_map)
    elif backup_type == "config":
        # Extract the backup pattern from config
        backup_pattern = project_backup_config["backup_pattern"]
        for key, value in backup_pattern.items():
            dataset_tables_map = {}
            if value == "all":
                # Map dataset->[tables]
                dataset_tables_map = get_datasets_tables_dict(
                    client=client, project_id=project_id, datasets=[key]
                )
                mapped_list.append(dataset_tables_map)
            else:
                # Map dataset->[tables]
                dataset_tables_map[key] = value
                mapped_list.append(dataset_tables_map)
    else:
        print(
            "Please provide a valid backup_type option. Choose from ['all', 'config']"
        )
        return None

    # Performing dataset export to gcs (data, schema)
    if mapped_list:
        for datasets_tables_dict in mapped_list:
            for bq_dataset_name in datasets_tables_dict.keys():
                print("Backup Operation on dataset: {}".format(bq_dataset_name))
                for bq_table_name in datasets_tables_dict[bq_dataset_name]:
                    print("Backing up table: {}".format(bq_table_name))
                    try:
                        # Getting dataset and table objects
                        dataset_ref = bigquery.DatasetReference(
                            project_id, bq_dataset_name
                        )
                        table_ref = dataset_ref.table(bq_table_name)
                        table_obj = client.get_table(table_ref)

                        # Specifying extract-job parameters
                        gcs_table_path = table_path.format(
                            bucket_name=project_backup_config["bucket_name"],
                            retention=retention,
                            dataset_name=bq_dataset_name,
                            timestamp=timestamp,
                            table_file_name=bq_table_name + "-*.json",
                        )
                        job_config = bigquery.ExtractJobConfig()
                        job_config.compression = bigquery.Compression.GZIP
                        job_config.destination_format = (
                            bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON
                        )

                        # Exporting table-data to gcs
                        extract_job = client.extract_table(
                            table_ref,
                            gcs_table_path,
                            job_config=job_config,
                            location=location,
                        )
                        extract_job.result()

                        # Extracting table-schema
                        table_schema = table_obj.schema
                        table_schema = [
                            {
                                "name": item.name,
                                "mode": item.mode,
                                "type": item.field_type,
                            }
                            for item in table_schema
                        ]
                        json_schema = json.dumps(table_schema)

                        # Defining schema-path
                        gcs_schema_path = schema_path.format(
                            bucket_name=project_backup_config["bucket_name"],
                            retention=retention,
                            dataset_name=bq_dataset_name,
                            timestamp=timestamp,
                            schema_file_name=bq_table_name + "-schema.json",
                        )

                        # Writing table-schema to gcs
                        sa_credentials = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
                        fs = gcsfs.GCSFileSystem(
                            project=project_id, token=sa_credentials
                        )
                        with fs.open(
                            gcs_schema_path,
                            "w",
                            metadata={"Content-Type": "application/json"},
                        ) as f:
                            f.write(json_schema)
                    except Exception as error:
                        print(
                            "Exception occurred for project {} at function {} inside export-loop: {}".format(
                                project_id, "main_process_function", error
                            )
                        )
                # Deleting backup data based on the backup_data_policy
                backup_data_policy = {
                    "daily": 1,
                    "weekly": 7,
                    "monthly": 30,
                    "yearly": 365,
                }
                if str(expiration).title() == "True":
                    try:
                        bucket_name = project_backup_config["bucket_name"]
                        storage_client = storage.Client(project_id)
                        client_bucket = storage_client.get_bucket(bucket_name)
                        delete_date = (
                            datetime.now()
                            - timedelta(days=backup_data_policy[retention])
                        ).strftime("%Y-%m-%d")
                        delete_path = "{retention}/{dataset_name}/{timestamp}".format(
                            retention=retention,
                            dataset_name=bq_dataset_name,
                            timestamp=delete_date,
                        )
                        for file in client_bucket.list_blobs(prefix=delete_path):
                            file.delete()
                            print("Deleted '{}'".format(file.name))
                    except Exception as error:
                        print(
                            "Exception occurred at function {} inside expiration-loop: {}".format(
                                "main_process_function", error
                            )
                        )
                else:
                    pass
        return None
    else:
        print("The mapping between datasets and their tables is empty.")
        return None
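
The shape of the backup config file can be read off the keys the function accesses; a minimal illustrative structure (all values are placeholders) sketched as a Python dict:

sample_config = {
    "backup": {
        "location": "US",
        "schema_uri": "gs://{bucket_name}/{retention}/{dataset_name}/{timestamp}/{schema_file_name}",
        "table_uri": "gs://{bucket_name}/{retention}/{dataset_name}/{timestamp}/{table_file_name}",
        "projects_dict": {
            "my-project": {
                "bucket_name": "my-backup-bucket",
                "backup_pattern": {
                    "sales_dataset": "all",                   # back up every table in the dataset
                    "hr_dataset": ["employees", "salaries"],  # back up only the listed tables
                },
            }
        },
    }
}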
Example #30
def inventory_dataset(projectID, datasetID):
    # Pull metadata for a specified dataset, and return an inventory record for it.
    if debugOutput:
        print(
            f"Getting dataset metadata for project:dataset: {projectID}:{datasetID}"
        )
    if 'client' in globals():
        global client
    else:
        client = bigquery.Client()

    if 'datasetLastAccess' in globals():
        global datasetLastAccess
    else:
        global datasetLastAccess
        datasetLastAccess = get_dataset_access_log()

    dataset_ref = bigquery.DatasetReference(projectID, datasetID)
    dataset = client.get_dataset(dataset_ref)  # API request

    datasetPair = f"{projectID}:{datasetID}"

    owner = []
    for ace in dataset.access_entries:
        if ace.entity_type == 'userByEmail' and ace.role == 'OWNER':
            owner.append(ace.entity_id)
    owner.sort()

    datasetInventoryEntry = {
        'projectId':
        projectID,
        'datasetId':
        dataset.dataset_id,
        'creationTime':
        dataset.created,
        'lastModifiedTime':
        dataset.modified,
        'datasetLastAccess':
        datasetLastAccess[datasetPair] if
        (datasetPair in datasetLastAccess) else None,
        'dataLocation':
        dataset.location,
        'inventoriedTime':
        datetime.datetime.now(),
        'datasetDescription':
        dataset.description,
        'datasetDefaultTableExpiration':
        dataset.default_table_expiration_ms,
        'datasetDefaultPartitionExpiration':
        dataset.default_partition_expiration_ms,
        'owner':
        ",".join(owner),
        'costCenter':
        dataset.labels['cost-center']
        if 'cost-center' in dataset.labels else None,
        'datasetLink':
        f'https://console.cloud.google.com/bigquery?project={projectID}&p={projectID}&d={datasetID}&page=dataset'
    }

    # Remove "None" values
    datasetInventoryEntry = dict(
        filter(lambda item: item[1] is not None,
               datasetInventoryEntry.items()))
    print(dataset.labels)
    print(json.dumps(datasetInventoryEntry, cls=DateTimeEncoder))
    return