def test_copy_datasets(self, mock_client, mock_list_tables, mock_copy_table):
    full_table_ids = [
        f'{self.project_id}.{self.dataset_id}.{table_id}'
        for table_id in CDM_TABLES
    ]
    list_tables_results = [
        list_item_from_table_id(table_id) for table_id in full_table_ids
    ]
    mock_list_tables.return_value = list_tables_results

    bq.copy_datasets(mock_client, self.dataset_id,
                     f'{self.dataset_id}_snapshot')

    mock_list_tables.assert_called_once_with(self.dataset_id)
    self.assertEqual(mock_copy_table.call_count, len(list_tables_results))
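
# A minimal sketch of the copy_datasets helper exercised by the test above,
# assuming it lists every table in the source dataset once and issues one
# copy_table call per table into the destination dataset. The body below is
# an assumption inferred from the mocked calls and assertions; only the
# google-cloud-bigquery client methods (list_tables, copy_table) are real API.
def copy_datasets(client, source_dataset_id, destination_dataset_id):
    """Copy every table in source_dataset_id into destination_dataset_id."""
    for table_item in client.list_tables(source_dataset_id):
        # full_table_id is 'project:dataset.table'; normalize to dotted form
        source_table_id = table_item.full_table_id.replace(':', '.')
        destination_table_id = (
            f'{table_item.project}.{destination_dataset_id}.{table_item.table_id}')
        # copy_table returns a job; callers may wait on .result() if needed
        client.copy_table(source_table_id, destination_table_id)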
def main(raw_args=None):
    """
    Truncate and store fitbit data.

    Assumes you are passing arguments either via command line or a list.
    """
    parser = get_fitbit_parser()
    args, kwargs = clean_cdr.fetch_args_kwargs(parser, raw_args)

    pipeline_logging.configure(level=logging.INFO,
                               add_console_handler=args.console_log)

    # Identify the cleaning classes being run for the specified data_stage
    # and validate that all the required arguments are supplied
    cleaning_classes = clean_cdr.DATA_STAGE_RULES_MAPPING[consts.FITBIT]
    clean_cdr.validate_custom_params(cleaning_classes, **kwargs)

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)
    client = bq.get_client(args.project_id, credentials=impersonation_creds)

    # create staging, sandbox, backup and clean datasets with descriptions and labels
    fitbit_datasets = create_fitbit_datasets(client, args.release_tag)

    copy_fitbit_tables_from_views(client,
                                  args.fitbit_dataset,
                                  fitbit_datasets[consts.BACKUP],
                                  table_prefix='v_')
    bq.copy_datasets(client, fitbit_datasets[consts.BACKUP],
                     fitbit_datasets[consts.STAGING])

    common_cleaning_args = [
        '-p', args.project_id, '-d', fitbit_datasets[consts.STAGING], '-b',
        fitbit_datasets[consts.SANDBOX], '-s', '-a', consts.FITBIT
    ]
    fitbit_cleaning_args = args_parser.add_kwargs_to_args(
        common_cleaning_args, kwargs)
    clean_cdr.main(args=fitbit_cleaning_args)

    # Snapshot the staging dataset to the final clean dataset
    bq.build_and_copy_contents(client, fitbit_datasets[consts.STAGING],
                               fitbit_datasets[consts.CLEAN])
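
# Hypothetical sketch of copy_fitbit_tables_from_views, assuming each fitbit
# table is exposed as a view named with the given prefix (e.g. 'v_<table>') and
# is materialized into the backup dataset under the unprefixed name. The
# function name and argument order come from the call above; the body is an
# assumption built only on standard google-cloud-bigquery calls.
from google.cloud import bigquery


def copy_fitbit_tables_from_views(client, from_dataset, to_dataset,
                                  table_prefix='v_'):
    for view_item in client.list_tables(from_dataset):
        if not view_item.table_id.startswith(table_prefix):
            continue
        destination_table = view_item.table_id[len(table_prefix):]
        # materialize the view into a table in the destination dataset
        job_config = bigquery.QueryJobConfig(
            destination=f'{client.project}.{to_dataset}.{destination_table}')
        query = (f'SELECT * FROM '
                 f'`{client.project}.{from_dataset}.{view_item.table_id}`')
        client.query(query, job_config=job_config).result()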
def create_tier(credentials_filepath, project_id, tier, input_dataset,
                release_tag, deid_stage, run_as, **kwargs):
    """
    This function is the main entry point for the deid process.
    It passes the required parameters to the implementing functions.

    :param credentials_filepath: filepath to credentials to access GCP
    :param project_id: project_id associated with the input dataset
    :param tier: controlled or registered tier intended for the output dataset
    :param input_dataset: name of the input dataset
    :param release_tag: release tag for dataset in the format of YYYYq#r#
    :param deid_stage: deid stage (deid, base or clean)
    :param run_as: email address of the service account to impersonate
    :return: dict of the datasets created for the tier (e.g. staging, sandbox)
    """
    # validation of params
    validate_create_tier_args(tier, deid_stage, release_tag)

    # today's date for QA handoff
    qa_handoff_date = datetime.strftime(datetime.now(), '%Y-%m-%d')

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        run_as, SCOPES, credentials_filepath)
    client = bq.get_client(project_id, credentials=impersonation_creds)

    # Get final dataset name
    final_dataset_name = get_dataset_name(tier, release_tag, deid_stage)

    # Create intermediary datasets and copy tables from the input dataset
    # to the newly created staging dataset
    datasets = create_datasets(client, final_dataset_name, input_dataset, tier,
                               release_tag)
    bq.copy_datasets(client, input_dataset, datasets[consts.STAGING])

    # Run cleaning rules
    cleaning_args = [
        '-p', project_id, '-d', datasets[consts.STAGING], '-b',
        datasets[consts.SANDBOX], '--data_stage', f'{tier}_tier_{deid_stage}'
    ]

    # Update the qa_handoff_date to the current date for the base stage
    if 'base' in deid_stage:
        versions = add_cdr_metadata.get_etl_version(datasets[consts.STAGING],
                                                    project_id)
        if not versions:
            raise RuntimeError(
                'etl version does not exist, make sure _cdr_metadata table was created in combined step'
            )
        add_cdr_metadata.main([
            '--component', add_cdr_metadata.INSERT, '--project_id', project_id,
            '--target_dataset', datasets[consts.STAGING], '--qa_handoff_date',
            qa_handoff_date, '--etl_version', versions[0]
        ])
    else:
        LOGGER.info(
            'deid_stage was not base, no data inserted into _cdr_metadata table'
        )

    controlled_tier_cleaning_args = add_kwargs_to_args(cleaning_args, kwargs)
    clean_cdr.main(args=controlled_tier_cleaning_args)

    # Snapshot the staging dataset to the final dataset
    create_schemaed_snapshot_dataset(project_id, datasets[consts.STAGING],
                                     final_dataset_name, False)

    return datasets
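
# One plausible shape for validate_create_tier_args, assuming it simply raises
# on values outside the documented choices: tier must be controlled/registered,
# deid_stage must be deid/base/clean, and release_tag must match YYYYq#r#.
# The accepted value sets and the regex are assumptions derived from the
# docstring above, not taken from the real implementation.
import re

TIER_LIST = ['controlled', 'registered']
DEID_STAGE_LIST = ['deid', 'base', 'clean']
RELEASE_TAG_REGEX = re.compile(r'\d{4}q\dr\d')


def validate_create_tier_args(tier, deid_stage, release_tag):
    if tier.lower() not in TIER_LIST:
        raise ValueError(
            f'{tier} is not a valid tier; expected one of {TIER_LIST}')
    if deid_stage.lower() not in DEID_STAGE_LIST:
        raise ValueError(
            f'{deid_stage} is not a valid deid stage; expected one of {DEID_STAGE_LIST}')
    if not RELEASE_TAG_REGEX.fullmatch(release_tag):
        raise ValueError(
            f'{release_tag} is not a valid release tag; expected format YYYYq#r#')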
# Create dataset with labels
output_dataset_name = get_dataset_name(args.tier, args.release_tag,
                                       args.deid_stage)
description = f'{args.deid_stage} dataset created from {args.src_dataset_id} for {args.tier}{args.release_tag} CDR run'
labels = {
    'clean': 'yes' if args.deid_stage == 'clean' else 'no',
    'data_tier': args.tier.lower(),
    'release_tag': args.release_tag.lower()
}

LOGGER.info(
    f'Creating dataset {output_dataset_name} in {args.output_prod_project_id}...'
)
dataset_object = bq.define_dataset(args.output_prod_project_id,
                                   output_dataset_name, description, labels)
client.create_dataset(dataset_object, exists_ok=False)

# Copy tables from source to destination
LOGGER.info(
    f'Copying tables from dataset {args.src_project_id}.{args.src_dataset_id} to {args.output_prod_project_id}.{output_dataset_name}...'
)
bq.copy_datasets(client, f'{args.src_project_id}.{args.src_dataset_id}',
                 f'{args.output_prod_project_id}.{output_dataset_name}')

# Append extra columns to the person table
LOGGER.info('Appending extra columns to the person table...')
update_person(client, args.output_prod_project_id, output_dataset_name)

LOGGER.info('Completed successfully.')
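
# A minimal sketch of what define_dataset is expected to build, assuming it
# only wraps construction of a bigquery.Dataset with the description and
# labels applied before it is passed to client.create_dataset above; the real
# helper may add validation. Only the standard google-cloud-bigquery Dataset
# API is used here, and the body itself is an assumption.
from google.cloud import bigquery


def define_dataset(project_id, dataset_id, description, labels):
    """Return a bigquery.Dataset ready to pass to client.create_dataset."""
    dataset = bigquery.Dataset(f'{project_id}.{dataset_id}')
    dataset.description = description
    dataset.labels = labels
    return dataset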