def create_datasets(client, rdr_dataset, release_tag):
    """
    Create staging, sandbox, and clean datasets for an RDR export.

    :param client: a BigQuery client object
    :param rdr_dataset: name of the source RDR dataset
    :param release_tag: release tag string, e.g. "2023q1r1"
    :return: dict of dataset names with keys 'clean', 'staging', 'sandbox'
    """
    rdr_clean = f'{release_tag}_rdr'
    rdr_staging = f'{rdr_clean}_staging'
    rdr_sandbox = f'{rdr_clean}_sandbox'

    staging_desc = f'Intermediary dataset to apply cleaning rules on {rdr_dataset}'
    labels = {
        "phase": "staging",
        "release_tag": release_tag,
        "de_identified": "false"
    }
    staging_dataset_object = bq.define_dataset(client.project, rdr_staging,
                                               staging_desc, labels)
    client.create_dataset(staging_dataset_object)
    LOGGER.info(f'Created dataset `{client.project}.{rdr_staging}`')

    sandbox_desc = (f'Sandbox created for storing records affected by the '
                    f'cleaning rules applied to {rdr_staging}')
    labels["phase"] = "sandbox"
    sandbox_dataset_object = bq.define_dataset(client.project, rdr_sandbox,
                                               sandbox_desc, labels)
    client.create_dataset(sandbox_dataset_object)
    LOGGER.info(f'Created dataset `{client.project}.{rdr_sandbox}`')

    # placeholder string until software versioning is implemented
    version = 'implement getting software version'
    clean_desc = f'{version} clean version of {rdr_dataset}'
    labels["phase"] = "clean"
    clean_dataset_object = bq.define_dataset(client.project, rdr_clean,
                                             clean_desc, labels)
    client.create_dataset(clean_dataset_object)
    LOGGER.info(f'Created dataset `{client.project}.{rdr_clean}`')

    return {'clean': rdr_clean, 'staging': rdr_staging, 'sandbox': rdr_sandbox}
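# A minimal usage sketch for create_datasets above, assuming a standard
# google.cloud.bigquery client (the surrounding pipeline obtains one through
# its own bq helpers). The project id and argument values are hypothetical.
from google.cloud import bigquery

client = bigquery.Client(project='example-curation-test')
datasets = create_datasets(client,
                           rdr_dataset='rdr20230101',
                           release_tag='2023q1r1')
# Based on the naming scheme above, this returns:
# {'clean': '2023q1r1_rdr', 'staging': '2023q1r1_rdr_staging',
#  'sandbox': '2023q1r1_rdr_sandbox'}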
def test_define_dataset(self):
    # Tests if project_id is given
    self.assertRaises(RuntimeError, bq.define_dataset, None, self.dataset_id,
                      self.description, self.existing_labels_or_tags)
    # Tests if dataset_id is given
    self.assertRaises(RuntimeError, bq.define_dataset, self.project_id, None,
                      self.description, self.existing_labels_or_tags)
    # Tests if description is given
    self.assertRaises(RuntimeError, bq.define_dataset, self.project_id,
                      self.dataset_id, '', self.existing_labels_or_tags)
    # Tests if no label or tag is given
    self.assertRaises(RuntimeError, bq.define_dataset, self.project_id,
                      self.dataset_id, self.description, None)

    # Pre-conditions
    results = bq.define_dataset(self.project_id, self.dataset_id,
                                self.description,
                                self.existing_labels_or_tags)

    # Post conditions
    self.assertIsInstance(results, bigquery.Dataset)
    self.assertEqual(results.labels, self.existing_labels_or_tags)
def setUpClass(cls):
    # get the test project
    if 'test' not in cls.project_id:
        raise RuntimeError(
            f'Tests should only run in a test environment. '
            f'Current environment is {cls.project_id}.')

    if not cls.fq_table_names:
        raise RuntimeError(
            'Provide a list of fully qualified table names the '
            'test will manipulate.')

    cls.client = bq.get_client(cls.project_id)

    # get or create datasets, cleaning rules can assume the datasets exist
    required_datasets = []
    for table_name in cls.fq_table_names + cls.fq_sandbox_table_names:
        dataset_id = table_name.split('.')[1]
        required_datasets.append(dataset_id)

    desc = (f"dataset created by {cls.__name__} to test a "
            f"cleaning rule. deletion candidate.")
    for dataset_id in set(required_datasets):
        dataset = bq.define_dataset(cls.project_id, dataset_id, desc,
                                    {'test': ''})
        cls.client.create_dataset(dataset, exists_ok=True)
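# Illustrative wiring for the setUpClass fixture above; the class and table
# names here are hypothetical, and BaseTest stands in for whatever base class
# actually defines this fixture. A concrete test is expected to set
# project_id, fq_table_names, and fq_sandbox_table_names before setUpClass runs.
class ExampleCleaningRuleTest(BaseTest):
    project_id = 'example-curation-test'  # must contain 'test' per the guard above
    fq_table_names = ['example-curation-test.rdr_staging.observation']
    fq_sandbox_table_names = ['example-curation-test.rdr_sandbox.observation']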
def main(raw_args=None):
    """
    Run a full RDR import.

    Assumes you are passing arguments either via command line or a list.
    """
    args = parse_rdr_args(raw_args)

    pipeline_logging.configure(level=logging.INFO,
                               add_console_handler=args.console_log)

    description = f'RDR DUMP loaded from {args.bucket} dated {args.export_date}'
    export_date = args.export_date.replace('-', '')
    new_dataset_name = f'rdr{export_date}'

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)

    client = bq.get_client(args.curation_project_id,
                           credentials=impersonation_creds)

    dataset_object = bq.define_dataset(client.project, new_dataset_name,
                                       description,
                                       {'export_date': args.export_date})
    client.create_dataset(dataset_object)

    create_rdr_tables(client, new_dataset_name, args.bucket)
    copy_vocab_tables(client, new_dataset_name, args.vocabulary)
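# Hedged example of calling main with a list of arguments, as the docstring
# allows. The flag names are assumptions inferred from the attributes read off
# `args` above; parse_rdr_args is the authoritative source for the real names.
if __name__ == '__main__':
    main([
        '--run_as_email', 'rdr-importer@example.iam.gserviceaccount.com',
        '--curation_project_id', 'example-curation-test',
        '--bucket', 'example-rdr-export-bucket',
        '--export_date', '2023-01-01',
        '--vocabulary', 'example_vocabulary_dataset',
        '--console_log',
    ])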
def test_define_dataset(self):
    # project_id missing or empty
    self.assertRaises(RuntimeError, bq.define_dataset, None, self.dataset_id,
                      self.description, self.label_or_tag)
    self.assertRaises(RuntimeError, bq.define_dataset, '', self.dataset_id,
                      self.description, self.label_or_tag)
    # dataset_id falsy
    self.assertRaises(RuntimeError, bq.define_dataset, self.project_id, False,
                      self.description, self.label_or_tag)
    # description blank
    self.assertRaises(RuntimeError, bq.define_dataset, self.project_id,
                      self.dataset_id, ' ', self.label_or_tag)
    # labels/tags missing
    self.assertRaises(RuntimeError, bq.define_dataset, self.project_id,
                      self.dataset_id, self.description, None)

    dataset = bq.define_dataset(self.project_id, self.dataset_id,
                                self.description, self.label_or_tag)
    self.assertEqual(dataset.dataset_id, self.dataset_id)
def create_dataset(project, dataset_id, description, tags, app_creds):
    """
    Create a dataset with the given parameters.

    :param project: The project_id used to define the dataset.
    :param dataset_id: The string to name the dataset with.
    :param description: A string to use to describe the dataset.
    :param tags: The list of tags/labels to apply to the dataset.
    :param app_creds: Filepath to credentials file used to create the dataset
    """
    # Construct a full Dataset object to send to the API.
    dataset = bq.define_dataset(project, dataset_id, description, tags)
    client = get_client(project, app_creds)
    dataset = client.create_dataset(dataset, exists_ok=True)
    print(f"Created dataset {project}.{dataset_id}")
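# A minimal sketch of calling create_dataset, assuming service account
# credentials stored in a local JSON file; every value below is illustrative.
# The other define_dataset call sites in this listing pass labels as a dict,
# so a dict is used here as well.
create_dataset(project='example-project',
               dataset_id='example_dataset',
               description='Example dataset for illustration',
               tags={'purpose': 'demo'},
               app_creds='/path/to/credentials.json')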
def create_fitbit_datasets(client, release_tag):
    """
    Creates staging, sandbox, backup and clean datasets with descriptions and labels

    :param client: bq client
    :param release_tag: string of the form "YYYYqNrN"
    :return: dict of dataset names with keys 'clean', 'backup', 'staging', 'sandbox'
    """
    fitbit_datasets = {
        consts.CLEAN: f'{release_tag}_fitbit',
        consts.BACKUP: f'{release_tag}_fitbit_backup',
        consts.STAGING: f'{release_tag}_fitbit_staging',
        consts.SANDBOX: f'{release_tag}_fitbit_sandbox'
    }

    fitbit_desc = {
        consts.CLEAN:
            f'Cleaned version of {fitbit_datasets[consts.BACKUP]}',
        consts.BACKUP:
            f'Backup dataset during generation of {fitbit_datasets[consts.STAGING]}',
        consts.STAGING:
            f'Intermediary dataset to apply cleaning rules on {fitbit_datasets[consts.BACKUP]}',
        consts.SANDBOX: (f'Sandbox created for storing records affected by the '
                         f'cleaning rules applied to {fitbit_datasets[consts.STAGING]}'),
    }

    for phase in fitbit_datasets:
        labels = {
            "phase": phase,
            "release_tag": release_tag,
            "de_identified": "false"
        }
        dataset_object = bq.define_dataset(client.project,
                                           fitbit_datasets[phase],
                                           fitbit_desc[phase], labels)
        client.create_dataset(dataset_object)
        LOGGER.info(
            f'Created dataset `{client.project}.{fitbit_datasets[phase]}`')

    return fitbit_datasets
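# Hypothetical call to create_fitbit_datasets, assuming `client` is an
# instantiated BigQuery client and consts defines CLEAN/BACKUP/STAGING/SANDBOX.
fitbit_datasets = create_fitbit_datasets(client, release_tag='2023q1r1')
# From the naming scheme above:
#   fitbit_datasets[consts.CLEAN]   == '2023q1r1_fitbit'
#   fitbit_datasets[consts.BACKUP]  == '2023q1r1_fitbit_backup'
#   fitbit_datasets[consts.STAGING] == '2023q1r1_fitbit_staging'
#   fitbit_datasets[consts.SANDBOX] == '2023q1r1_fitbit_sandbox'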
def test_define_dataset(self):
    # Tests if project_id is given
    self.assertRaises(TypeError, define_dataset, self.dataset_id,
                      self.description, self.label_or_tag)
    # Tests if dataset_id is given
    self.assertRaises(TypeError, define_dataset, self.project_id,
                      self.description, self.label_or_tag)
    # Tests if description is given
    self.assertRaises(TypeError, define_dataset, self.project_id,
                      self.dataset_id, self.label_or_tag)
    # Tests if no label or tag is given
    self.assertRaises(TypeError, define_dataset, self.project_id,
                      self.dataset_id, self.description)

    # Pre-conditions
    results = define_dataset(self.project_id, self.dataset_id,
                             self.description, self.label_or_tag)

    # Post conditions
    self.assertIsInstance(results, bigquery.Dataset)
    self.assertEqual(results.labels, self.label_or_tag)
def create_datasets(client, name, input_dataset, tier, release_tag):
    """
    Creates backup, staging, sandbox, and final datasets with the proper
    descriptions and tag/labels applied

    :param client: an instantiated bigquery client object
    :param name: the base name of the datasets to be created
    :param input_dataset: name of the input dataset
    :param tier: tier parameter passed through from either a list or command
        line argument
    :param release_tag: release tag parameter passed through from the command
        line arguments
    :return: dict of created dataset names, keyed by phase
    """
    if not client:
        raise RuntimeError("Please specify BigQuery client object")
    if not name:
        raise RuntimeError(
            "Please specify the base name of the datasets to be created")
    if not input_dataset:
        raise RuntimeError("Please specify the name of the input dataset")
    if not tier:
        raise RuntimeError(
            "Please specify the tier intended for the output datasets")
    if not release_tag:
        raise RuntimeError(
            "Please specify the release tag for the dataset in the format of YYYY#q#r"
        )

    # Construct names of datasets needed as part of the deid process
    final_dataset_id = name
    backup_dataset_id = f'{name}_{consts.BACKUP}'
    staging_dataset_id = f'{name}_{consts.STAGING}'
    sandbox_dataset_id = f'{name}_{consts.SANDBOX}'

    datasets = {
        consts.CLEAN: final_dataset_id,
        consts.BACKUP: backup_dataset_id,
        consts.STAGING: staging_dataset_id,
        consts.SANDBOX: sandbox_dataset_id
    }

    deid_datasets = [final_dataset_id, staging_dataset_id]

    # base labels and tags for the datasets
    base_labels_and_tags = {'release_tag': release_tag, 'data_tier': tier}

    description = f'dataset created from {input_dataset} for {tier}{release_tag} CDR run'

    # Create each dataset, then update its labels and description
    for phase, dataset_id in datasets.items():
        dataset_object = bq.define_dataset(client.project, dataset_id,
                                           description, base_labels_and_tags)
        client.create_dataset(dataset_object, exists_ok=True)
        dataset = bq.get_dataset(client.project, dataset_id)

        # only the final and staging datasets hold de-identified data
        de_identified = 'true' if dataset_id in deid_datasets else 'false'
        new_labels = bq.update_labels_and_tags(dataset_id,
                                               base_labels_and_tags, {
                                                   'phase': phase,
                                                   'de-identified': de_identified
                                               })
        dataset.labels = new_labels
        dataset.description = f'{phase} {description}'
        client.update_dataset(dataset, ["labels", "description"])

    # Copy input dataset tables to backup and staging datasets
    tables = client.list_tables(input_dataset)
    for table in tables:
        backup_table = f'{backup_dataset_id}.{table.table_id}'
        staging_table = f'{staging_dataset_id}.{table.table_id}'
        client.copy_table(table, backup_table)
        client.copy_table(table, staging_table)

    return datasets
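# Sketch of a create_datasets call for a deid run; all argument values are
# assumptions for illustration, and `client` must be an instantiated BigQuery
# client as the validation above requires.
datasets = create_datasets(client,
                           name='C2023q1r1_deid',
                           input_dataset='2023q1r1_combined',
                           tier='controlled',
                           release_tag='2023q1r1')
# datasets maps consts.CLEAN/BACKUP/STAGING/SANDBOX to the created dataset
# ids; assuming consts.STAGING == 'staging', for example,
# datasets[consts.STAGING] == 'C2023q1r1_deid_staging'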
# Create dataset with labels
output_dataset_name = get_dataset_name(args.tier, args.release_tag,
                                       args.deid_stage)
description = (f'{args.deid_stage} dataset created from {args.src_dataset_id} '
               f'for {args.tier}{args.release_tag} CDR run')
labels = {
    'clean': 'yes' if args.deid_stage == 'clean' else 'no',
    'data_tier': args.tier.lower(),
    'release_tag': args.release_tag.lower()
}

LOGGER.info(
    f'Creating dataset {output_dataset_name} in {args.output_prod_project_id}...'
)
dataset_object = bq.define_dataset(args.output_prod_project_id,
                                   output_dataset_name, description, labels)
client.create_dataset(dataset_object, exists_ok=False)

# Copy tables from source to destination
LOGGER.info(
    f'Copying tables from dataset {args.src_project_id}.{args.src_dataset_id} '
    f'to {args.output_prod_project_id}.{output_dataset_name}...')
bq.copy_datasets(client, f'{args.src_project_id}.{args.src_dataset_id}',
                 f'{args.output_prod_project_id}.{output_dataset_name}')

# Append extra columns to person table
LOGGER.info('Appending extra columns to the person table...')
update_person(client, args.output_prod_project_id, output_dataset_name)

LOGGER.info('Completed successfully.')