Example #1
    def test_copy_datasets(self, mock_client, mock_list_tables,
                           mock_copy_table):
        full_table_ids = [
            f'{self.project_id}.{self.dataset_id}.{table_id}'
            for table_id in CDM_TABLES
        ]
        list_tables_results = [
            list_item_from_table_id(table_id) for table_id in full_table_ids
        ]
        mock_list_tables.return_value = list_tables_results

        bq.copy_datasets(mock_client, self.dataset_id,
                         f'{self.dataset_id}_snapshot')
        mock_list_tables.assert_called_once_with(self.dataset_id)
        self.assertEqual(mock_copy_table.call_count, len(list_tables_results))
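The assertions above pin down the contract of bq.copy_datasets: the source dataset is listed exactly once and one copy is issued per listed table. A minimal sketch that satisfies the same contract, written here against the public google-cloud-bigquery client API rather than the project's own list_tables/copy_table helpers (so names and signatures below are assumptions), could look like:

from google.cloud import bigquery


def copy_datasets_sketch(client: bigquery.Client, input_dataset: str,
                         output_dataset: str):
    """Copy every table in input_dataset into output_dataset (sketch only)."""
    # One listing call on the source dataset, mirroring
    # mock_list_tables.assert_called_once_with(self.dataset_id).
    for table_item in client.list_tables(input_dataset):
        source_id = (f'{table_item.project}.{table_item.dataset_id}'
                     f'.{table_item.table_id}')
        destination_id = (f'{table_item.project}.{output_dataset}'
                          f'.{table_item.table_id}')
        # One copy job per listed table, mirroring the call-count assertion.
        client.copy_table(source_id, destination_id).result()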
Example #2
def main(raw_args=None):
    """
    Truncate and store fitbit data.

    Assumes you are passing arguments either via command line or a
    list.
    """
    parser = get_fitbit_parser()
    args, kwargs = clean_cdr.fetch_args_kwargs(parser, raw_args)

    pipeline_logging.configure(level=logging.INFO,
                               add_console_handler=args.console_log)

    # Identify the cleaning classes to run for the specified data_stage
    # and validate that all required arguments are supplied
    cleaning_classes = clean_cdr.DATA_STAGE_RULES_MAPPING[consts.FITBIT]
    clean_cdr.validate_custom_params(cleaning_classes, **kwargs)

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)

    client = bq.get_client(args.project_id, credentials=impersonation_creds)

    # create staging, sandbox, backup and clean datasets with descriptions and labels
    fitbit_datasets = create_fitbit_datasets(client, args.release_tag)

    copy_fitbit_tables_from_views(client,
                                  args.fitbit_dataset,
                                  fitbit_datasets[consts.BACKUP],
                                  table_prefix='v_')
    bq.copy_datasets(client, fitbit_datasets[consts.BACKUP],
                     fitbit_datasets[consts.STAGING])

    common_cleaning_args = [
        '-p', args.project_id, '-d', fitbit_datasets[consts.STAGING], '-b',
        fitbit_datasets[consts.SANDBOX], '-s', '-a', consts.FITBIT
    ]
    fitbit_cleaning_args = args_parser.add_kwargs_to_args(
        common_cleaning_args, kwargs)

    clean_cdr.main(args=fitbit_cleaning_args)

    # Snapshot the staging dataset to final dataset
    bq.build_and_copy_contents(client, fitbit_datasets[consts.STAGING],
                               fitbit_datasets[consts.CLEAN])
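create_fitbit_datasets is only consumed above as a mapping keyed by consts.BACKUP, consts.STAGING, consts.SANDBOX and consts.CLEAN, and the comment notes the datasets are created with descriptions and labels. A hypothetical sketch of such a helper follows; the naming scheme and label keys are illustrative assumptions, not the project's actual conventions:

def create_fitbit_datasets_sketch(client, release_tag):
    """Create the backup, staging, sandbox and clean fitbit datasets (sketch)."""
    # Dataset naming scheme below is an assumption for illustration only.
    fitbit_datasets = {
        consts.CLEAN: f'{release_tag}_fitbit',
        consts.BACKUP: f'{release_tag}_fitbit_backup',
        consts.STAGING: f'{release_tag}_fitbit_staging',
        consts.SANDBOX: f'{release_tag}_fitbit_sandbox',
    }
    for phase, dataset_id in fitbit_datasets.items():
        dataset_object = bq.define_dataset(
            client.project, dataset_id,
            f'{phase} fitbit dataset for the {release_tag} CDR run',
            {'release_tag': release_tag, 'phase': phase})
        client.create_dataset(dataset_object, exists_ok=True)
    return fitbit_datasets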
Example #3
def create_tier(credentials_filepath, project_id, tier, input_dataset,
                release_tag, deid_stage, run_as, **kwargs):
    """
    This function is the main entry point for the deid process.
    It passes the required parameters to the implementing functions.

    :param credentials_filepath: filepath to credentials to access GCP
    :param project_id: project_id associated with the input dataset
    :param tier: controlled or registered tier intended for the output dataset
    :param input_dataset: name of the input dataset
    :param release_tag: release tag for dataset in the format of YYYYq#r#
    :param deid_stage: deid stage (deid, base or clean)
    :param run_as: email address of the service account to impersonate
    :return: name of created controlled or registered dataset
    """
    # validation of params
    validate_create_tier_args(tier, deid_stage, release_tag)

    # today's date for QA handoff
    qa_handoff_date = datetime.now().strftime('%Y-%m-%d')

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        run_as, SCOPES, credentials_filepath)

    client = bq.get_client(project_id, credentials=impersonation_creds)

    # Get Final Dataset name
    final_dataset_name = get_dataset_name(tier, release_tag, deid_stage)

    # Create intermediary datasets and copy tables from input dataset to newly created dataset
    datasets = create_datasets(client, final_dataset_name, input_dataset, tier,
                               release_tag)
    bq.copy_datasets(client, input_dataset, datasets[consts.STAGING])

    # Run cleaning rules
    cleaning_args = [
        '-p', project_id, '-d', datasets[consts.STAGING], '-b',
        datasets[consts.SANDBOX], '--data_stage', f'{tier}_tier_{deid_stage}'
    ]

    # For the base stage, record qa_handoff_date and etl version in _cdr_metadata
    if 'base' in deid_stage:
        versions = add_cdr_metadata.get_etl_version(datasets[consts.STAGING],
                                                    project_id)
        if not versions:
            raise RuntimeError(
                'etl version does not exist, make sure _cdr_metadata table was created in combined step'
            )
        add_cdr_metadata.main([
            '--component', add_cdr_metadata.INSERT, '--project_id', project_id,
            '--target_dataset', datasets[consts.STAGING], '--qa_handoff_date',
            qa_handoff_date, '--etl_version', versions[0]
        ])
    else:
        LOGGER.info(
            'deid_stage was not base, no data inserted into _cdr_metadata table'
        )

    controlled_tier_cleaning_args = add_kwargs_to_args(cleaning_args, kwargs)
    clean_cdr.main(args=controlled_tier_cleaning_args)

    # Snapshot the staging dataset to final dataset
    create_schemaed_snapshot_dataset(project_id, datasets[consts.STAGING],
                                     final_dataset_name, False)

    return datasets
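Given the docstring, a hypothetical invocation of create_tier might look like the following; every value is a placeholder, and any extra keyword arguments are forwarded to the cleaning rules through add_kwargs_to_args:

# Hypothetical call; all values below are placeholders.
datasets = create_tier(
    credentials_filepath='/path/to/credentials.json',
    project_id='my-project',
    tier='controlled',                 # 'controlled' or 'registered'
    input_dataset='2022q1r1_combined',
    release_tag='2022q1r1',            # YYYYq#r# format
    deid_stage='base',                 # 'deid', 'base' or 'clean'
    run_as='deid-runner@my-project.iam.gserviceaccount.com')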
Example #4
    # Create dataset with labels
    output_dataset_name = get_dataset_name(args.tier, args.release_tag,
                                           args.deid_stage)
    description = f'{args.deid_stage} dataset created from {args.src_dataset_id} for {args.tier}{args.release_tag} CDR run'
    labels = {
        'clean': 'yes' if args.deid_stage == 'clean' else 'no',
        'data_tier': args.tier.lower(),
        'release_tag': args.release_tag.lower()
    }

    LOGGER.info(
        f'Creating dataset {output_dataset_name} in {args.output_prod_project_id}...'
    )
    dataset_object = bq.define_dataset(args.output_prod_project_id,
                                       output_dataset_name, description,
                                       labels)
    client.create_dataset(dataset_object, exists_ok=False)

    # Copy tables from source to destination
    LOGGER.info(
        f'Copying tables from dataset {args.src_project_id}.{args.src_dataset_id} to {args.output_prod_project_id}.{output_dataset_name}...'
    )
    bq.copy_datasets(client, f'{args.src_project_id}.{args.src_dataset_id}',
                     f'{args.output_prod_project_id}.{output_dataset_name}')

    # Append extra columns to person table
    LOGGER.info('Appending extra columns to the person table...')
    update_person(client, args.output_prod_project_id, output_dataset_name)

    LOGGER.info('Completed successfully.')
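This last fragment reads all of its inputs from an argparse namespace; a minimal parser that would supply the attributes referenced above (the flag names are assumptions) might look like:

import argparse


def get_publish_arg_parser():
    """Parser for the attributes used above; flag names are assumed."""
    parser = argparse.ArgumentParser(
        description='Copy a deid dataset into the output (prod) project.')
    parser.add_argument('--src_project_id', required=True)
    parser.add_argument('--src_dataset_id', required=True)
    parser.add_argument('--output_prod_project_id', required=True)
    parser.add_argument('--tier',
                        required=True,
                        choices=['controlled', 'registered'])
    parser.add_argument('--release_tag',
                        required=True,
                        help='release tag in YYYYq#r# format')
    parser.add_argument('--deid_stage',
                        required=True,
                        choices=['deid', 'base', 'clean'])
    return parser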