def extract_rows(table_name: str = table_name,
                 bucket_name: str = dest_bucket_name,
                 path: str = dest_path,
                 diff_type: str = diff_type,
                 dest_data_project: str = dest_data_project,
                 dest_dataset_name: str = dest_dataset_name,
                 client: bigquery.Client = dest_client):
    # Export the diff table for `table_name` to newline-delimited JSON in GCS.
    # Note: the ExtractJobConfig property is `print_header` in the Python client.
    job_config = bigquery.ExtractJobConfig(
        print_header=False, destination_format="NEWLINE_DELIMITED_JSON")
    diff_type_val = DiffType[diff_type].value
    destination_uri = f"gs://{bucket_name}/{path}/{table_name}/{diff_type_val}/*"
    dataset_ref = bigquery.DatasetReference(dest_data_project, dest_dataset_name)
    table_ref = dataset_ref.table(f"{table_name}_{diff_type_val}")
    extract_job = client.extract_table(
        table_ref, destination_uri, job_config=job_config
    )  # API request
    print(f"The write destination is: {destination_uri}")
    try:
        extract_job.result()
    except GoogleCloudError as err:
        print(f"There was a {type(err)}")
        print(err)

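
# Usage sketch (illustrative, not from the original source). extract_rows relies on
# module-level defaults (table_name, dest_bucket_name, dest_path, diff_type,
# dest_data_project, dest_dataset_name, dest_client) that are defined elsewhere, so
# every value below, including the "ADDED" DiffType member, is an assumption.
if __name__ == "__main__":
    extract_rows(
        table_name="events",
        bucket_name="my-diff-bucket",
        path="diffs",
        diff_type="ADDED",
        dest_data_project="my-project",
        dest_dataset_name="diff_dataset",
        client=bigquery.Client(project="my-project"),
    )
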
def create_big_query_table():
    bq_hook = BigQueryHook(bigquery_conn_id='bigquery_default', use_legacy_sql=False)
    gcp_credentials = bq_hook._get_credentials()
    bq_client = bigquery.Client(credentials=gcp_credentials, project=bigquery_project)
    target_dataset_ref = bigquery.DatasetReference(
        project=bigquery_project, dataset_id=reference_dataset)
    try:
        target_dataset = bq_client.get_dataset(dataset_ref=target_dataset_ref)
    except NotFound as ex:
        # LOGGER.info(f"Dataset '{target_dataset_ref}' not found, attempting to create.")
        target_dataset = bq_client.create_dataset(dataset=target_dataset_ref)
    target_table_ref = bigquery.TableReference(
        dataset_ref=target_dataset, table_id=reference_table)
    target_table = bq_client.delete_table(table=target_table_ref)

def test_export_query_results_to_cloud_storage(self):
    """export_query_results_to_cloud_storage creates the table from the view query
    and exports the table."""
    bucket = self.mock_project_id + '-bucket'
    query_job = futures.Future()
    query_job.set_result([])
    extract_job = futures.Future()
    extract_job.set_result(None)
    self.mock_client.query.return_value = query_job
    self.mock_client.extract_table.return_value = extract_job
    self.bq_client.export_query_results_to_cloud_storage([
        ExportQueryConfig.from_view_query(
            view=self.mock_view,
            view_filter_clause='WHERE x = y',
            intermediate_table_name=self.mock_table_id,
            output_uri=f'gs://{bucket}/view.json',
            output_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)
    ])
    self.mock_client.query.assert_called()
    self.mock_client.extract_table.assert_called()
    self.mock_client.delete_table.assert_called_with(
        bigquery.DatasetReference(
            self.mock_project_id,
            self.mock_view.dataset_id).table(self.mock_table_id))

def test_copy_bq_views(self, mock_table_exists: mock.MagicMock,
                       mock_copy_view: mock.MagicMock) -> None:
    """Check that copy_view is called when the view does not exist in the destination dataset."""
    self.mock_client.list_tables.return_value = [self.mock_view]
    self.mock_client.get_table.return_value = self.mock_view
    mock_table_exists.side_effect = self.table_exists_side_effect

    copy_bq_views(
        source_project_id=self.mock_source_project_id,
        source_dataset_id=self.mock_source_dataset_id,
        destination_project_id=self.mock_destination_project_id,
        destination_dataset_id=self.mock_destination_dataset_id,
    )

    expected_view = BigQueryView(
        project_id=self.mock_destination_project_id,
        dataset_id=self.mock_destination_dataset_id,
        view_id=self.mock_view.view_id,
        view_query_template=self.mock_view.view_query,
        should_materialize=True,
    )
    expected_destination_dataset_ref = bigquery.DatasetReference(
        project=self.mock_destination_project_id,
        dataset_id=self.mock_destination_dataset_id,
    )

    mock_copy_view.assert_called()
    self.assertEqual(expected_view, mock_copy_view.call_args_list[0][1].get("view"))
    self.assertEqual(
        self.mock_destination_project_id,
        mock_copy_view.call_args_list[0][1].get("destination_client").project_id,
    )
    self.assertEqual(
        expected_destination_dataset_ref,
        mock_copy_view.call_args_list[0][1].get("destination_dataset_ref"),
    )

def create_external_table(project_id: str) -> None: client = bigquery.Client() dataset_id = "social_dataset" dataset_ref = bigquery.DatasetReference(project_id, dataset_id) table_id = "ja_kakei_chousa_income_divide_over_two_member" table = bigquery.Table(dataset_ref.table(table_id)) external_config = bigquery.ExternalConfig("PARQUET") external_config.source_uris = [ "gs://ja-kakei-chousa-income-divide-over-two-member/*" ] external_config.autodetect = True hive_partitioning = bigquery.external_config.HivePartitioningOptions() hive_partitioning.mode = "AUTO" hive_partitioning.require_partition_filter = False hive_partitioning.source_uri_prefix = ( "gs://ja-kakei-chousa-income-divide-over-two-member") external_config.hive_partitioning = hive_partitioning table.external_data_configuration = external_config table = client.create_table(table, exists_ok=True)
def create_external_table(project_id: str) -> None: client = bigquery.Client() dataset_id = "social_dataset" dataset_ref = bigquery.DatasetReference(project_id, dataset_id) table_id = "jasso_gakuseiseikatsu_stats_annual_income_divide_university" table = bigquery.Table(dataset_ref.table(table_id)) external_config = bigquery.ExternalConfig("PARQUET") external_config.source_uris = [ "gs://jasso-gakuseiseikatsu-stats-annual-income-divide-university/*" ] external_config.autodetect = True hive_partitioning = bigquery.external_config.HivePartitioningOptions() hive_partitioning.mode = "AUTO" hive_partitioning.require_partition_filter = False hive_partitioning.source_uri_prefix = ( "gs://jasso-gakuseiseikatsu-stats-annual-income-divide-university") external_config.hive_partitioning = hive_partitioning table.external_data_configuration = external_config table = client.create_table(table, exists_ok=True)
def test_is_table_definition_in_match_with_bigquery_throw_user_exception(self):
    dataset_reference = bigquery.DatasetReference('project', 'dataset')
    table_reference = bigquery.TableReference(dataset_reference, 'table')
    schema = [
        bigquery.SchemaField('col1', 'INTEGER'),
        bigquery.SchemaField('col2', 'STRING')
    ]
    table = bigquery.Table(table_reference, schema)
    invalid_schema = [
        bigquery.SchemaField('col2', 'STRING'),
        bigquery.SchemaField('col1', 'INTEGER')
    ]
    try:
        schema_mapper.is_table_definition_in_match_with_bigquery(
            invalid_schema, table)
        pytest.fail('Must raise exception.')
    except exceptions.UserException as err:
        assert str(err) == "Column order mismatch. " \
                           "Actual configuration: col2, col1. " \
                           "Expected BigQuery: col1, col2."

def test_update_table_cmek(client, to_delete):
    """Patch a table's metadata."""
    dataset_id = "update_table_cmek_{}".format(_millis())
    table_id = "update_table_cmek_{}".format(_millis())
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    dataset = bigquery.Dataset(dataset_ref)
    client.create_dataset(dataset)
    to_delete.append(dataset)

    table = bigquery.Table(dataset.table(table_id))
    original_kms_key_name = "projects/{}/locations/{}/keyRings/{}/cryptoKeys/{}".format(
        "cloud-samples-tests", "us", "test", "test"
    )
    table.encryption_configuration = bigquery.EncryptionConfiguration(
        kms_key_name=original_kms_key_name
    )
    table = client.create_table(table)

    # [START bigquery_update_table_cmek]
    # from google.cloud import bigquery
    # client = bigquery.Client()

    assert table.encryption_configuration.kms_key_name == original_kms_key_name

    # Set a new encryption key to use for the destination.
    # TODO: Replace this key with a key you have created in KMS.
    updated_kms_key_name = (
        "projects/cloud-samples-tests/locations/us/keyRings/test/cryptoKeys/otherkey"
    )
    table.encryption_configuration = bigquery.EncryptionConfiguration(
        kms_key_name=updated_kms_key_name
    )

    table = client.update_table(table, ["encryption_configuration"])  # API request

    assert table.encryption_configuration.kms_key_name == updated_kms_key_name
    assert original_kms_key_name != updated_kms_key_name

def test_create_partitioned_table(client, to_delete):
    dataset_id = "create_table_partitioned_{}".format(_millis())
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    dataset = client.create_dataset(dataset_ref)
    to_delete.append(dataset)

    # [START bigquery_create_table_partitioned]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # project = client.project
    # dataset_ref = bigquery.DatasetReference(project, 'my_dataset')

    table_ref = dataset_ref.table("my_partitioned_table")
    schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
        bigquery.SchemaField("date", "DATE"),
    ]
    table = bigquery.Table(table_ref, schema=schema)
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        field="date",  # name of column to use for partitioning
        expiration_ms=7776000000,
    )  # 90 days

    table = client.create_table(table)

    print(
        "Created table {}, partitioned on column {}".format(
            table.table_id, table.time_partitioning.field
        )
    )
    # [END bigquery_create_table_partitioned]

    assert table.time_partitioning.type_ == "DAY"
    assert table.time_partitioning.field == "date"
    assert table.time_partitioning.expiration_ms == 7776000000

def test_create_table_nested_repeated_schema(client, to_delete):
    dataset_id = "create_table_nested_repeated_{}".format(_millis())
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    dataset = bigquery.Dataset(dataset_ref)
    client.create_dataset(dataset)
    to_delete.append(dataset)

    # [START bigquery_nested_repeated_schema]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # project = client.project
    # dataset_ref = bigquery.DatasetReference(project, 'my_dataset')

    schema = [
        bigquery.SchemaField("id", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("first_name", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("last_name", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("dob", "DATE", mode="NULLABLE"),
        bigquery.SchemaField(
            "addresses",
            "RECORD",
            mode="REPEATED",
            fields=[
                bigquery.SchemaField("status", "STRING", mode="NULLABLE"),
                bigquery.SchemaField("address", "STRING", mode="NULLABLE"),
                bigquery.SchemaField("city", "STRING", mode="NULLABLE"),
                bigquery.SchemaField("state", "STRING", mode="NULLABLE"),
                bigquery.SchemaField("zip", "STRING", mode="NULLABLE"),
                bigquery.SchemaField("numberOfYears", "STRING", mode="NULLABLE"),
            ],
        ),
    ]
    table_ref = dataset_ref.table("my_table")
    table = bigquery.Table(table_ref, schema=schema)
    table = client.create_table(table)  # API request

    print("Created table {}".format(table.full_table_id))

def test_query_succeed(self, mock_client, mock_kfp_context, mock_dump_json,
                       mock_display):
    mock_kfp_context().__enter__().context_id.return_value = 'ctx1'
    mock_client().get_job.side_effect = exceptions.NotFound('not found')
    mock_dataset = bigquery.DatasetReference('project-1', 'dataset-1')
    mock_client().dataset.return_value = mock_dataset
    mock_client().get_dataset.side_effect = exceptions.NotFound('not found')
    mock_response = {
        'configuration': {
            'query': {
                'query': 'SELECT * FROM table_1'
            }
        }
    }
    mock_client().query.return_value.to_api_repr.return_value = mock_response

    result = query('SELECT * FROM table_1', 'project-1', 'dataset-1',
                   output_gcs_path='gs://output/path')

    self.assertEqual(mock_response, result)
    mock_client().create_dataset.assert_called()
    expected_job_config = bigquery.QueryJobConfig()
    expected_job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
    expected_job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
    expected_job_config.destination = mock_dataset.table('query_ctx1')
    mock_client().query.assert_called_with('SELECT * FROM table_1', mock.ANY,
                                           job_id='query_ctx1')
    actual_job_config = mock_client().query.call_args_list[0][0][1]
    self.assertDictEqual(expected_job_config.to_api_repr(),
                         actual_job_config.to_api_repr())
    mock_client().extract_table.assert_called_with(
        mock_dataset.table('query_ctx1'), 'gs://output/path')
    self.assertEqual(2, mock_dump_json.call_count)

def test_copy_table(client, to_delete):
    dataset_id = 'copy_table_dataset_{}'.format(_millis())
    dest_dataset = bigquery.Dataset(client.dataset(dataset_id))
    dest_dataset = client.create_dataset(dest_dataset)
    to_delete.append(dest_dataset)

    # [START copy_table]
    source_dataset = bigquery.DatasetReference('bigquery-public-data', 'samples')
    source_table_ref = source_dataset.table('shakespeare')

    # dataset_id = 'my_dataset'
    dest_table_ref = dest_dataset.table('destination_table')

    job = client.copy_table(source_table_ref, dest_table_ref)  # API request
    job.result()  # Waits for job to complete.

    assert job.state == 'DONE'
    dest_table = client.get_table(dest_table_ref)  # API request
    assert dest_table.table_id == 'destination_table'
    # [END copy_table]

    to_delete.insert(0, dest_table)

def test_upload_data_tokyo(self, project_id, tokyo_dataset, bigquery_client):
    from google.cloud import bigquery

    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    tokyo_destination = "{}.to_gbq_test".format(tokyo_dataset)

    # Initialize table with sample data
    gbq.to_gbq(
        df,
        tokyo_destination,
        project_id,
        credentials=self.credentials,
        location="asia-northeast1",
    )

    table = bigquery_client.get_table(
        bigquery.TableReference(
            bigquery.DatasetReference(project_id, tokyo_dataset),
            "to_gbq_test",
        ))
    assert table.num_rows > 0

def create_table_bq(dataset, table_name, schema, field):
    """Create a day-partitioned BigQuery table from a {column_name: type} schema dict."""
    client = bigquery.Client()
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset)
    table_ref = dataset_ref.table(table_name)
    schema = [
        bigquery.SchemaField(key, value) for key, value in schema.items()
    ]
    table = bigquery.Table(table_ref, schema=schema)
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        field=field,  # name of column to use for partitioning
        require_partition_filter=True)
    table = client.create_table(table)
    print(
        f"Created table {table.table_id}, partitioned on column {table.time_partitioning.field}"
    )
    return

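
# Usage sketch (illustrative, not from the original source): the dataset, table,
# and schema below are placeholders showing the expected {column_name: type} shape.
if __name__ == "__main__":
    create_table_bq(
        dataset="analytics",
        table_name="daily_events",
        schema={"event_date": "DATE", "user_id": "STRING", "value": "FLOAT"},
        field="event_date",  # partition column; must appear in the schema
    )
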
def check_up_to_date(project, dataset_id, table_name='fact_actuary'):
    sql_check = """
        SELECT
          CASE
            WHEN (
              SELECT CREATED_ON
              FROM `geb-dwh-test.uat_geb_dwh_eu_act.fact_actuary`
              LIMIT 1) = (
              SELECT MAX(CREATED_ON)
              FROM `geb-dwh-test.uat_geb_dwh_eu_awr.fact_claimdetails`)
            THEN 'True'
            ELSE 'False'
          END AS result;
    """
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    table_ref = dataset_ref.table(table_name)
    if bq_if_table_exists(client, table_ref):
        print("AVAILABILITY CHECK RETURNS TRUE", table_ref, "ALREADY EXISTS.")
        df = client.query(sql_check).result().to_dataframe()
        if df.iloc[0][0] == 'True':
            print("RECENTNESS CHECK RETURNS:", df.iloc[0][0],
                  "No need to recreate fact_actuary table.\n")
            return True
    else:
        print(
            "Fact Actuary table is unavailable, it will be created with the latest data.\n"
            "This will take approximately 3 minutes. Please be patient and don't exit."
        )
        return False

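
# check_up_to_date calls a helper, bq_if_table_exists, that is not defined in this
# snippet. A minimal sketch, assuming it simply probes the table with
# client.get_table and treats NotFound as "does not exist" (the original
# implementation may differ):
from google.api_core.exceptions import NotFound


def bq_if_table_exists(client, table_ref) -> bool:
    """Return True if table_ref exists in BigQuery, False otherwise."""
    try:
        client.get_table(table_ref)
        return True
    except NotFound:
        return False
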
def test_import_bq_file_with_multibyte_raw_file_alternate_separator_and_encoding(
        self,
) -> None:
    file_path = path_for_fixture_file_in_test_gcs_directory(
        bucket_path=self.ingest_bucket_path,
        filename="tagDoubleDaggerWINDOWS1252.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )
    fixture_util.add_direct_ingest_path(
        self.fs.gcs_file_system, file_path,
        region_code=self.test_region.region_code)

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
    path = one(self.fs.gcs_file_system.uploaded_paths)

    self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
        source_uri=path.uri(),
        destination_dataset_ref=bigquery.DatasetReference(
            self.project_id, "us_xx_raw_data"),
        destination_table_id="tagDoubleDaggerWINDOWS1252",
        destination_table_schema=[
            bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
            bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
            bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
            bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
            bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
            bigquery.SchemaField("update_datetime", "DATETIME", "REQUIRED"),
        ],
    )
    self.assertEqual(5, self.num_lines_uploaded)
    self._check_no_temp_files_remain()

def execute_query(bq_client: bigquery.Client,
                  env_vars: Dict[str, Union[str, bool]],
                  query_path: object,
                  output_table_name: str,
                  time_partition: bool) -> None:
    """Executes transformation query to a new destination table.

    Args:
        bq_client: bigquery.Client object
        env_vars: Dictionary of key: value, where value is environment variable
        query_path: Object representing location of SQL query to execute
        output_table_name: String representing name of table that holds output
        time_partition: Boolean indicating whether to time-partition output
    """
    dataset_ref = bq_client.get_dataset(bigquery.DatasetReference(
        project=bq_client.project,
        dataset_id=env_vars['corrected_dataset_id']))
    table_ref = dataset_ref.table(output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    # Time Partitioning table is only needed for final output query
    if time_partition:
        job_config.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY, expiration_ms=None)

    sql = query_path.query
    sql = sql.format(**env_vars)

    logging.info('Attempting query...')
    # Execute Query
    query_job = bq_client.query(query=sql, job_config=job_config)
    query_job.result()  # Waits for the query to finish

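
# Usage sketch (illustrative, not from the original source). The _QueryPath class,
# env_vars keys, and table names below are assumptions: query_path can be anything
# exposing a .query attribute containing a SQL template formatted with env_vars.
class _QueryPath:
    """Hypothetical stand-in for the query_path object expected above."""
    query = "SELECT * FROM `{project}.{corrected_dataset_id}.source_table`"


if __name__ == "__main__":
    execute_query(
        bq_client=bigquery.Client(),
        env_vars={"project": "my-project", "corrected_dataset_id": "corrected"},
        query_path=_QueryPath(),
        output_table_name="transformed_output",
        time_partition=True,
    )
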
def test_copy_table(client, to_delete):
    DATASET_ID = 'copy_table_dataset_%d' % (_millis(),)

    # [START copy_table]
    source_dataset = bigquery.DatasetReference('bigquery-public-data', 'samples')
    source_table_ref = source_dataset.table('shakespeare')

    dest_dataset = bigquery.Dataset(client.dataset(DATASET_ID))
    dest_dataset = client.create_dataset(dest_dataset)  # API request
    dest_table_ref = dest_dataset.table('destination_table')

    job_config = bigquery.CopyJobConfig()
    job = client.copy_table(source_table_ref, dest_table_ref,
                            job_config=job_config)  # API request
    job.result()  # Waits for job to complete.

    assert job.state == 'DONE'
    dest_table = client.get_table(dest_table_ref)  # API request
    assert dest_table.table_id == 'destination_table'
    # [END copy_table]

    to_delete.append(dest_dataset)
    to_delete.insert(0, dest_table)

def copy_bq_views(source_project_id: str,
                  source_dataset_id: str,
                  destination_project_id: str,
                  destination_dataset_id: str):
    """Copies all views from the source_project_id.source_dataset_id to the
    destination_project_id.destination_dataset_id."""
    # Construct a BigQuery client with the source_project_id
    source_client = BigQueryClientImpl(project_id=source_project_id)

    # Construct a BigQuery client with the destination_project_id
    destination_client = BigQueryClientImpl(project_id=destination_project_id)
    destination_dataset = bigquery.DatasetReference(destination_project_id,
                                                    destination_dataset_id)
    tables_in_source_dataset = source_client.list_tables(source_dataset_id)

    for table_ref in tables_in_source_dataset:
        table = source_client.get_table(
            source_client.dataset_ref_for_id(table_ref.dataset_id),
            table_ref.table_id)
        view_query = table.view_query

        # Only copy this view if there is a view_query to replicate and the view
        # doesn't already exist in the destination dataset
        if view_query and not destination_client.table_exists(
                destination_dataset, table_id=table.table_id):
            # Remove any references to the source_project_id from the view_query
            updated_view_query = view_query.replace(source_project_id, '{project_id}')

            # Retrieve all of the information about the view
            source_client.copy_view(
                view=BigQueryView(project_id=destination_project_id,
                                  dataset_id=destination_dataset_id,
                                  view_id=table.table_id,
                                  view_query_template=updated_view_query),
                destination_client=destination_client,
                destination_dataset_ref=destination_dataset)

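
# Usage sketch (illustrative, not from the original source): the project and
# dataset ids below are placeholders.
if __name__ == "__main__":
    copy_bq_views(
        source_project_id="source-project",
        source_dataset_id="reference_views",
        destination_project_id="destination-project",
        destination_dataset_id="reference_views",
    )
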
def store_temp_table_to_gcs(project_id, dataset_id, table_id, location,
                            bucket_name, destination_full_path, client):
    """Export a BigQuery table to GCS, retrying with an enumerated file name on BadRequest."""
    destination_uri = f'gs://{bucket_name}/{destination_full_path}'
    dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
    table_ref = dataset_ref.table(table_id)
    try:
        extract_job = client.extract_table(
            table_ref, destination_uri, location=location)
        extract_job.result()
    except BadRequest:
        # Retry with an enumerated (wildcard) destination name.
        destination_uri = (
            f'gs://{bucket_name}/{enumerate_destination_file_name(destination_full_path)}')
        extract_job = client.extract_table(
            table_ref, destination_uri, location=location)
        extract_job.result()
    except Exception as e:
        raise e
    print(f'Successfully exported your query to {destination_uri}')

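
# store_temp_table_to_gcs references enumerate_destination_file_name, which is not
# defined in this snippet. A minimal sketch, assuming the intent is to turn a single
# destination path into a sharded wildcard pattern so BigQuery can split exports
# that exceed the single-file size limit (the original helper may behave differently):
def enumerate_destination_file_name(destination_full_path: str) -> str:
    """Insert a '-*' shard wildcard before the extension, e.g. 'out.csv' -> 'out-*.csv'."""
    base, dot, extension = destination_full_path.rpartition('.')
    if not dot:
        return f'{destination_full_path}-*'
    return f'{base}-*.{extension}'
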
def test_list(self, data_dir, capsys, credentials_type):
    client = self.get_client('service_account_manage')
    dataset_reference = bigquery.DatasetReference(
        self.get_project(), os.environ.get('BIGQUERY_DATASET'))
    dataset = bigquery.Dataset(dataset_reference)
    client.create_dataset(dataset)

    os.environ['KBC_DATADIR'] = data_dir + 'sample_populated/'
    self.prepare(action='list', data_dir=data_dir,
                 credentials_type=credentials_type)

    application = app.App()
    application.run()

    out, err = capsys.readouterr()
    assert err == ''
    data = json.loads(out)
    assert 'projects' in data.keys()
    assert self.get_project() in map(lambda project: project['id'],
                                     data['projects'])
    project = list(
        filter(lambda project: project['id'] == self.get_project(),
               data['projects']))[0]
    assert os.environ.get('BIGQUERY_DATASET') in map(
        lambda dataset: dataset['id'], project['datasets'])

def test_import_bq_file_with_row_missing_columns(self) -> None:
    file_path = path_for_fixture_file_in_test_gcs_directory(
        bucket_path=self.ingest_bucket_path,
        filename="tagRowMissingColumns.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )
    fixture_util.add_direct_ingest_path(
        self.fs.gcs_file_system, file_path,
        region_code=self.test_region.region_code)

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
    path = one(self.fs.gcs_file_system.uploaded_paths)

    self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
        source_uri=path.uri(),
        destination_dataset_ref=bigquery.DatasetReference(
            self.project_id, "us_xx_raw_data"),
        destination_table_id="tagRowMissingColumns",
        destination_table_schema=[
            bigquery.SchemaField("id_column", "STRING", "NULLABLE"),
            bigquery.SchemaField("comment", "STRING", "NULLABLE"),
            bigquery.SchemaField("termination_code", "STRING", "NULLABLE"),
            bigquery.SchemaField("update_date", "STRING", "NULLABLE"),
            bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
            bigquery.SchemaField("update_datetime", "DATETIME", "REQUIRED"),
        ],
    )
    self.assertEqual(2, self.num_lines_uploaded)
    self._check_no_temp_files_remain()

def export_data_for_experiment(date, dataset, bucket, gcs_path, source_project,
                               destination_project, experiment):
    """Export the monitoring data for a specific experiment in the dataset."""
    experiment_slug, start_date = experiment
    table_name = dataset.split(".")[-1]

    storage_client = storage.Client(destination_project)
    client = bigquery.Client(source_project)

    if (date - start_date) > timedelta(days=14):
        # if the experiment has been running for more than 14 days,
        # export data as 30 minute intervals
        query = f"""
            WITH data AS (
                SELECT * EXCEPT(experiment) FROM `{dataset}`
                WHERE experiment = '{experiment_slug}' AND
                time >= (
                    SELECT TIMESTAMP(MIN(start_date))
                    FROM `moz-fx-data-experiments.monitoring.experimenter_experiments_v1`
                    WHERE normandy_slug = '{experiment_slug}'
                )
            )
            SELECT * EXCEPT(rn) FROM (
                SELECT *, ROW_NUMBER() OVER (PARTITION BY time) AS rn
                FROM (
                    SELECT
                        * EXCEPT(time),
                        TIMESTAMP_SECONDS(
                            UNIX_SECONDS(time) - MOD(UNIX_SECONDS(time), 30 * 60) + 30 * 60
                        ) AS time,
                    FROM data
                    ORDER BY time DESC
                )
            )
            WHERE rn = 1
            ORDER BY time DESC
        """
    else:
        # for recently launched experiments, data is exported as 5 minute intervals
        query = f"""
            SELECT * EXCEPT(experiment) FROM {dataset}
            WHERE experiment = '{experiment_slug}' AND
            time >= (
                SELECT TIMESTAMP(MIN(start_date))
                FROM `moz-fx-data-experiments.monitoring.experimenter_experiments_v1`
                WHERE normandy_slug = '{experiment_slug}'
            )
            ORDER BY time DESC
        """

    job = client.query(query)
    job.result()
    dataset_ref = bigquery.DatasetReference(source_project, job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    # export data for experiment grouped by branch
    _upload_table_to_gcs(
        table_ref,
        bucket,
        gcs_path,
        experiment_slug,
        f"{table_name}_by_branch",
        source_project,
        client,
        storage_client,
    )

    # export aggregated data for experiment
    query = f"""
        SELECT time, SUM(value) AS value
        FROM `{source_project}.{job.destination.dataset_id}.{job.destination.table_id}`
        GROUP BY 1
        ORDER BY time DESC
    """
    job = client.query(query)
    job.result()
    dataset_ref = bigquery.DatasetReference(source_project, job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    _upload_table_to_gcs(
        table_ref,
        bucket,
        gcs_path,
        experiment_slug,
        table_name,
        source_project,
        client,
        storage_client,
    )

def bigquery(schema, data_directory, ignore_missing_dependency, **params):
    try:
        import google.api_core.exceptions
        from google.cloud import bigquery
    except ImportError:
        msg = 'google-cloud-bigquery dependency is missing'
        if ignore_missing_dependency:
            logger.warning('Ignored: %s', msg)
            return 0
        else:
            raise click.ClickException(msg)

    project_id = os.environ['GOOGLE_BIGQUERY_PROJECT_ID']
    bqclient = bigquery.Client(project=project_id)

    # Create testing dataset.
    testing_dataset = bigquery.DatasetReference(bqclient.project, 'testing')
    try:
        bqclient.create_dataset(bigquery.Dataset(testing_dataset))
    except google.api_core.exceptions.Conflict:
        pass  # Skip if already created.

    # Set up main data tables.
    job = bqclient.query(schema.read())
    job.result()
    if job.error_result:
        raise click.ClickException(str(job.error_result))

    # Load main data table.
    data_directory = Path(data_directory)
    functional_alltypes_path = data_directory / 'functional_alltypes.csv'
    load_config = bigquery.LoadJobConfig()
    load_config.skip_leading_rows = 1  # skip the header row.
    with open(str(functional_alltypes_path), 'rb') as csvfile:
        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('functional_alltypes'),
            job_config=load_config,
        ).result()
    if job.error_result:
        raise click.ClickException(str(job.error_result))

    # Load an ingestion time partitioned table.
    with open(str(functional_alltypes_path), 'rb') as csvfile:
        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('functional_alltypes_parted'),
            job_config=load_config,
        ).result()
    if job.error_result:
        raise click.ClickException(str(job.error_result))

    # Create a table with complex data types (nested and repeated).
    struct_table_path = data_directory / 'struct_table.avro'
    with open(str(struct_table_path), 'rb') as avrofile:
        load_config = bigquery.LoadJobConfig()
        load_config.write_disposition = 'WRITE_TRUNCATE'
        load_config.source_format = 'AVRO'
        job = bqclient.load_table_from_file(
            avrofile,
            testing_dataset.table('struct_table'),
            job_config=load_config,
        )
    if job.error_result:
        raise click.ClickException(str(job.error_result))

    # Create empty date-partitioned table.
    date_table = bigquery.Table(testing_dataset.table('date_column_parted'))
    date_table.schema = [
        bigquery.SchemaField('my_date_parted_col', 'DATE'),
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('int_col', 'INTEGER'),
    ]
    date_table.time_partitioning = bigquery.TimePartitioning(
        field='my_date_parted_col')
    bqclient.create_table(date_table, exists_ok=True)

    # Create empty timestamp-partitioned tables.
    timestamp_table = bigquery.Table(
        testing_dataset.table('timestamp_column_parted'))
    timestamp_table.schema = [
        bigquery.SchemaField('my_timestamp_parted_col', 'DATE'),
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('int_col', 'INTEGER'),
    ]
    timestamp_table.time_partitioning = bigquery.TimePartitioning(
        field='my_timestamp_parted_col')
    bqclient.create_table(timestamp_table, exists_ok=True)

    # Create a table with a numeric column
    numeric_table = bigquery.Table(testing_dataset.table('numeric_table'))
    numeric_table.schema = [
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('numeric_col', 'NUMERIC'),
    ]
    bqclient.create_table(numeric_table, exists_ok=True)

    df = pd.read_csv(
        str(functional_alltypes_path),
        usecols=['string_col', 'double_col'],
        header=0,
    )
    numeric_csv = io.StringIO()
    df.to_csv(numeric_csv, header=False, index=False)
    csvfile = io.BytesIO(numeric_csv.getvalue().encode('utf-8'))
    load_config = bigquery.LoadJobConfig()
    load_config.write_disposition = 'WRITE_TRUNCATE'
    load_config.skip_leading_rows = 1  # skip the header row.
    load_config.schema = numeric_table.schema

    job = bqclient.load_table_from_file(
        csvfile,
        testing_dataset.table('numeric_table'),
        job_config=load_config,
    ).result()
    if job.error_result:
        raise click.ClickException(str(job.error_result))

def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
    return bigquery.DatasetReference(project=self.project_id,
                                     dataset_id=dataset_id)

def find_glean_targets(pool, client, project=SHARED_PROD):
    """Return a dict like DELETE_TARGETS for glean tables."""
    datasets = {dataset.dataset_id for dataset in client.list_datasets(project)}
    glean_stable_tables = [
        table
        for tables in pool.map(
            client.list_tables,
            [
                bigquery.DatasetReference(project, dataset_id)
                for dataset_id in datasets
                if dataset_id.endswith("_stable")
            ],
            chunksize=1,
        )
        for table in tables
        if table.labels.get("schema_id") == GLEAN_SCHEMA_ID
    ]
    source_doctype = "deletion_request"
    sources = {
        dataset_id: DeleteSource(qualified_table_id(table), GLEAN_CLIENT_ID, project)
        # dict comprehension will only keep the last value for a given key, so
        # sort by table_id to use the latest version
        for table in sorted(glean_stable_tables, key=lambda t: t.table_id)
        if table.table_id.startswith(source_doctype)
        # re-use source for derived tables
        for dataset_id in [
            table.dataset_id,
            re.sub("_stable$", "_derived", table.dataset_id),
        ]
        if dataset_id in datasets
    }
    return {
        **{
            # glean stable tables that have a source
            glean_target(qualified_table_id(table)): sources[table.dataset_id]
            for table in glean_stable_tables
            if table.dataset_id in sources
            and not table.table_id.startswith(source_doctype)
            # migration tables not yet supported
            and not table.table_id.startswith("migration")
        },
        **{
            # glean derived tables that contain client_id
            client_id_target(table=qualified_table_id(table)): sources[table.dataset_id]
            for table in pool.map(
                client.get_table,
                [
                    table
                    for tables in pool.map(
                        client.list_tables,
                        [
                            bigquery.DatasetReference(project, dataset_id)
                            for dataset_id in sources
                            if not dataset_id.endswith("_stable")
                        ],
                        chunksize=1,
                    )
                    for table in tables
                ],
                chunksize=1,
            )
            if any(field.name == CLIENT_ID for field in table.schema)
        },
    }

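
# find_glean_targets relies on helpers defined elsewhere (qualified_table_id,
# glean_target, client_id_target, DeleteSource). A minimal sketch of the simplest
# of them, assuming it just renders a table list item as "project.dataset.table"
# (the original helper may differ):
def qualified_table_id(table) -> str:
    """Return the fully qualified id of a BigQuery table list item."""
    return f"{table.project}.{table.dataset_id}.{table.table_id}"
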
import logging
import os
from typing import List

from google.cloud import bigquery
from google.cloud import storage
from google.api_core.exceptions import NotFound

GCP_PROJECT = os.environ['GCP_PROJECT']
BUCKET = os.environ['BUCKET']
TABLE_NAME = os.environ['TABLE_NAME']
DATASET_NAME = os.environ['DATASET_NAME']
TEMP_DATASET_NAME = os.environ['TEMP_DATASET_NAME']

bq = bigquery.Client(project=GCP_PROJECT)
temp_dataset_ref = bigquery.DatasetReference(GCP_PROJECT, TEMP_DATASET_NAME)
temp_table_ref = bigquery.TableReference(temp_dataset_ref, TABLE_NAME)


def delete_temp_table():
    """Deletes temporary BigQuery table"""
    try:
        bq.get_table(temp_table_ref)
        bq.delete_table(temp_table_ref)
        logging.info("deleted temp table")
    except NotFound:
        pass


def get_queries() -> List[str]:
def dataset_ref_for_id(self, dataset_id: str) -> bigquery.DatasetReference:
    return bigquery.DatasetReference(self._project_id, dataset_id)

def main_process_function(project_id, config_file, retention, backup_type, expiration):
    """
    This is the main function for exporting the big-query datasets to google-cloud-storage.
    :param project_id: Google Cloud Project Id (type:str)
    :param config_file: Backup Configuration File Path (type:str)
    :param retention: Retention Type ["daily", "monthly", "weekly", "yearly"] (type:str)
    :param backup_type: Backup Type ["all", "config"] (type:str)
    :param expiration: True/False (type:bool/str)
    :return NoneType:
    """
    print("Running bigquery dataset export for project:{}".format(project_id))

    # Reading backup-parameters from json config
    with open(config_file) as f:
        master_config = json.load(f)
    backup_config = master_config["backup"]
    location = backup_config["location"]
    schema_path = backup_config["schema_uri"]
    table_path = backup_config["table_uri"]
    project_backup_config = backup_config["projects_dict"][project_id]
    mapped_list = []

    # Get timestamp
    timestamp = datetime.now().strftime("%Y-%m-%d")

    # Creating Big Query Client
    client = bigquery.Client(project=project_id)

    # Getting mapped relation between datasets and their tables
    if backup_type == "all":
        # Get all datasets
        datasets = list_all_datasets(client=client)
        # Map dataset->[tables]
        dataset_tables_map = get_datasets_tables_dict(
            client=client, project_id=project_id, datasets=datasets
        )
        mapped_list.append(dataset_tables_map)
    elif backup_type == "config":
        # Extract the backup pattern from config
        backup_pattern = project_backup_config["backup_pattern"]
        for key, value in backup_pattern.items():
            dataset_tables_map = {}
            if value == "all":
                # Map dataset->[tables]
                dataset_tables_map = get_datasets_tables_dict(
                    client=client, project_id=project_id, datasets=[key]
                )
                mapped_list.append(dataset_tables_map)
            else:
                # Map dataset->[tables]
                dataset_tables_map[key] = value
                mapped_list.append(dataset_tables_map)
    else:
        print(
            "Please provide a valid backup_type option. Choose from ['all', 'config']"
        )
        return None

    # Performing dataset export to gcs (data, schema)
    if mapped_list:
        for datasets_tables_dict in mapped_list:
            for bq_dataset_name in datasets_tables_dict.keys():
                print("Backup Operation on dataset: {}".format(bq_dataset_name))
                for bq_table_name in datasets_tables_dict[bq_dataset_name]:
                    print("Backing up table: {}".format(bq_table_name))
                    try:
                        # Getting dataset and table objects
                        dataset_ref = bigquery.DatasetReference(
                            project_id, bq_dataset_name
                        )
                        table_ref = dataset_ref.table(bq_table_name)
                        table_obj = client.get_table(table_ref)

                        # Specifying extract-job parameters
                        gcs_table_path = table_path.format(
                            bucket_name=project_backup_config["bucket_name"],
                            retention=retention,
                            dataset_name=bq_dataset_name,
                            timestamp=timestamp,
                            table_file_name=bq_table_name + "-*.json",
                        )
                        job_config = bigquery.ExtractJobConfig()
                        job_config.compression = bigquery.Compression.GZIP
                        job_config.destination_format = (
                            bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON
                        )

                        # Exporting table-data to gcs
                        extract_job = client.extract_table(
                            table_ref,
                            gcs_table_path,
                            job_config=job_config,
                            location=location,
                        )
                        extract_job.result()

                        # Extracting table-schema
                        table_schema = table_obj.schema
                        table_schema = [
                            {
                                "name": item.name,
                                "mode": item.mode,
                                "type": item.field_type,
                            }
                            for item in table_schema
                        ]
                        json_schema = json.dumps(table_schema)

                        # Defining schema-path
                        gcs_schema_path = schema_path.format(
                            bucket_name=project_backup_config["bucket_name"],
                            retention=retention,
                            dataset_name=bq_dataset_name,
                            timestamp=timestamp,
                            schema_file_name=bq_table_name + "-schema.json",
                        )

                        # Writing table-schema to gcs
                        sa_credentials = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
                        fs = gcsfs.GCSFileSystem(
                            project=project_id, token=sa_credentials
                        )
                        with fs.open(
                            gcs_schema_path,
                            "w",
                            metadata={"Content-Type": "application/json"},
                        ) as f:
                            f.write(json_schema)
                    except Exception as error:
                        print(
                            "Exception occurred for project {} at function {} inside export-loop: {}".format(
                                project_id, "main_process_function", error
                            )
                        )

                # Deleting backup data based on the backup_data_policy
                backup_data_policy = {
                    "daily": 1,
                    "weekly": 7,
                    "monthly": 30,
                    "yearly": 365,
                }
                if str(expiration).title() == "True":
                    try:
                        bucket_name = project_backup_config["bucket_name"]
                        storage_client = storage.Client(project_id)
                        client_bucket = storage_client.get_bucket(bucket_name)
                        delete_date = (
                            datetime.now() - timedelta(days=backup_data_policy[retention])
                        ).strftime("%Y-%m-%d")
                        delete_path = "{retention}/{dataset_name}/{timestamp}".format(
                            retention=retention,
                            dataset_name=bq_dataset_name,
                            timestamp=delete_date,
                        )
                        for file in client_bucket.list_blobs(prefix=delete_path):
                            file.delete()
                            print("Deleted '{}'".format(file.name))
                    except Exception as error:
                        print(
                            "Exception occurred at function {} inside expiration-loop: {}".format(
                                "main_process_function", error
                            )
                        )
                else:
                    pass
        return None
    else:
        print("The mapping between datasets and their tables is empty.")
        return None

def inventory_dataset(projectID, datasetID):
    # Pull metadata for a specified Table, and return inventory record for it.
    if debugOutput:
        print(
            f"Getting dataset metadata for project:dataset: {projectID}:{datasetID}"
        )
    if 'client' in globals():
        global client
    else:
        client = bigquery.Client()
    if 'datasetLastAccess' in globals():
        global datasetLastAccess
    else:
        global datasetLastAccess
        datasetLastAccess = get_dataset_access_log()

    dataset_ref = bigquery.DatasetReference(projectID, datasetID)
    dataset = client.get_dataset(dataset_ref)  # API request
    datasetPair = f"{projectID}:{datasetID}"

    owner = []
    for ace in dataset.access_entries:
        if ace.entity_type == 'userByEmail' and ace.role == 'OWNER':
            owner.append(ace.entity_id)
    owner.sort()

    datasetInventoryEntry = {
        'projectId': projectID,
        'datasetId': dataset.dataset_id,
        'creationTime': dataset.created,
        'lastModifiedTime': dataset.modified,
        'datasetLastAccess': datasetLastAccess[datasetPair]
        if (datasetPair in datasetLastAccess) else None,
        'dataLocation': dataset.location,
        'inventoriedTime': datetime.datetime.now(),
        'datasetDescription': dataset.description,
        'datasetDefaultTableExpiration': dataset.default_table_expiration_ms,
        'datasetDefaultPartitionExpiration': dataset.default_partition_expiration_ms,
        'owner': ",".join(owner),
        'costCenter': dataset.labels['cost-center']
        if 'cost-center' in dataset.labels else None,
        'datasetLink': f'https://console.cloud.google.com/bigquery?project={projectID}&p={projectID}&d={datasetID}&page=dataset'
    }

    # Remove "None" values
    datasetInventoryEntry = dict(
        filter(lambda item: item[1] is not None, datasetInventoryEntry.items()))
    print(dataset.labels)
    print(json.dumps(datasetInventoryEntry, cls=DateTimeEncoder))
    return
