Example No. 1
    def test_hadoop_ls(self):
        path1 = resource('ls_test/f_50')
        ls1 = hl.hadoop_ls(path1)
        self.assertEqual(len(ls1), 1)
        self.assertEqual(ls1[0]['size_bytes'], 50)
        self.assertEqual(ls1[0]['is_dir'], False)
        self.assertTrue('path' in ls1[0])
        self.assertTrue('owner' in ls1[0])
        self.assertTrue('modification_time' in ls1[0])

        path2 = resource('ls_test')
        ls2 = hl.hadoop_ls(path2)
        self.assertEqual(len(ls2), 3)
        ls2_dict = {x['path'].split("/")[-1]: x for x in ls2}
        self.assertEqual(ls2_dict['f_50']['size_bytes'], 50)
        self.assertEqual(ls2_dict['f_100']['size_bytes'], 100)
        self.assertEqual(ls2_dict['f_100']['is_dir'], False)
        self.assertEqual(ls2_dict['subdir']['is_dir'], True)
        self.assertTrue('owner' in ls2_dict['f_50'])
        self.assertTrue('modification_time' in ls2_dict['f_50'])

        path3 = resource('ls_test/f*')
        ls3 = hl.hadoop_ls(path3)
        assert len(ls3) == 2, ls3

        with self.assertRaisesRegex(Exception, "FileNotFound"):
            hl.hadoop_ls('a_file_that_does_not_exist')
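
hl.hadoop_ls returns a list of dicts whose keys include path, size_bytes, is_dir, owner, and modification_time, which is what the assertions above rely on. A minimal sketch of iterating over that structure (the directory is just an example; any path or glob accepted by hadoop_ls works):

import hail as hl

# List a directory and print the non-directory entries with their sizes.
for entry in hl.hadoop_ls('/tmp'):
    if not entry['is_dir']:
        print(f"{entry['path']}\t{entry['size_bytes']} bytes")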
Example No. 2
def join_clump_hts(pop, not_pop, max_pops, high_quality=False, overwrite=False):
    r'''
    Wrapper for mwzj_hts_by_tree()
    '''
    assert not (not_pop and max_pops), '`not_pop` and `max_pops` cannot both be True'
    mt_path = get_clumping_results_path(pop=pop,
                                        not_pop=not_pop,
                                        max_pops=max_pops,
                                        high_quality=high_quality)
    if hl.hadoop_is_file(f'{mt_path}/_SUCCESS') and not overwrite:
        print(f'\nMT already written to {mt_path}! To overwrite, use overwrite=True')
        return
    else:
        print(f'Writing MT to {mt_path}')
    pop = pop.upper() if pop is not None else None
    
    clump_results_dir = (f'{ldprune_dir}/results{"_high_quality" if high_quality else ""}/' +
                         ('max_pops' if max_pops else f'{"not_" if not_pop else ""}{pop}'))
    ls = hl.hadoop_ls(f'{clump_results_dir}/*')
    all_hts = [x['path'] for x in ls if 'clump_results.ht' in x['path']]
    
    temp_dir = ('gs://ukbb-diverse-temp-30day/nb-temp/' +
                ('max_pops' if max_pops else f'{"not_" if not_pop else ""}{pop}') +
                f'{"-hq" if high_quality else ""}')
    globals_for_col_key = ukb_common.PHENO_KEY_FIELDS
    mt = mwzj_hts_by_tree(all_hts=all_hts,
                         temp_dir=temp_dir,
                         globals_for_col_key=globals_for_col_key)
#    mt = resume_mwzj(temp_dir=temp_dir, # NOTE: only use if all the temp hts have been created
#                     globals_for_col_key=globals_for_col_key)

    mt.write(mt_path, overwrite=overwrite)
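
The guard at the top of join_clump_hts relies on the _SUCCESS marker Hail writes inside a finished table or MatrixTable directory. A stripped-down sketch of that idempotent-write check, using a hypothetical local output path:

import hail as hl

mt_path = '/tmp/results/clumped.mt'  # hypothetical output path

def already_written(path: str) -> bool:
    # Hail writes a _SUCCESS marker inside a finished Table/MatrixTable directory.
    return hl.hadoop_is_file(f'{path}/_SUCCESS')

if already_written(mt_path):
    print(f'MT already written to {mt_path}; pass overwrite=True to redo')
else:
    print(f'Writing MT to {mt_path}')
    # ... build and write the MatrixTable here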
Example No. 3
def get_rows_data(rows_files):  # noqa: D103
    file_sizes = []
    partition_bounds = []
    parts_file = [x["path"] for x in rows_files if x["path"].endswith("parts")]
    if parts_file:
        parts = hl.hadoop_ls(parts_file[0])
        for i, x in enumerate(parts):
            index = x["path"].split(f"{parts_file[0]}/part-")[1].split("-")[0]
            if i < len(parts) - 1:
                test_index = (parts[i + 1]["path"].split(
                    f"{parts_file[0]}/part-")[1].split("-")[0])
                if test_index == index:
                    continue
            file_sizes.append(x["size_bytes"])
    metadata_file = [
        x["path"] for x in rows_files if x["path"].endswith("metadata.json.gz")
    ]
    if metadata_file:
        with hl.hadoop_open(metadata_file[0], "rb") as f:
            rows_meta = json.loads(f.read())
            try:
                partition_bounds = [(
                    x["start"]["locus"]["contig"],
                    x["start"]["locus"]["position"],
                    x["end"]["locus"]["contig"],
                    x["end"]["locus"]["position"],
                ) for x in rows_meta["jRangeBounds"]]
            except KeyError:
                pass
    return partition_bounds, file_sizes
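
get_rows_data expects the listing of a Hail table's rows directory, which holds the parts subdirectory and metadata.json.gz. A usage sketch that writes a small throwaway table to a hypothetical local path and then inspects it:

import hail as hl

# Write a small table so its on-disk layout exists (hypothetical local path).
hl.utils.range_table(1000, n_partitions=4).write('/tmp/demo_table.ht', overwrite=True)

rows_files = hl.hadoop_ls('/tmp/demo_table.ht/rows')
partition_bounds, file_sizes = get_rows_data(rows_files)
print(f'{len(file_sizes)} partition files, {sum(file_sizes):,} bytes total')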
Example No. 4
def get_rows_data(rows_files):
    file_sizes = []
    partition_bounds = []
    parts_file = [x['path'] for x in rows_files if x['path'].endswith('parts')]
    if parts_file:
        parts = hl.hadoop_ls(parts_file[0])
        for i, x in enumerate(parts):
            index = x['path'].split(f'{parts_file[0]}/part-')[1].split('-')[0]
            if i < len(parts) - 1:
                test_index = parts[i + 1]['path'].split(
                    f'{parts_file[0]}/part-')[1].split('-')[0]
                if test_index == index:
                    continue
            file_sizes.append(x['size_bytes'])
    metadata_file = [
        x['path'] for x in rows_files if x['path'].endswith('metadata.json.gz')
    ]
    if metadata_file:
        with hl.hadoop_open(metadata_file[0], 'rb') as f:
            rows_meta = json.loads(f.read())
            try:
                partition_bounds = [(x['start']['locus']['contig'],
                                     x['start']['locus']['position'],
                                     x['end']['locus']['contig'],
                                     x['end']['locus']['position'])
                                    for x in rows_meta['jRangeBounds']]
            except KeyError:
                pass
    return partition_bounds, file_sizes
Example No. 5
def create_full_results_file(prune, overwrite=False):
    r'''
    Concatenates PRS-phenotype regression results into a single table.
    '''
    reg_path_regex = prs_dir + f'prs_phen_reg.*.*.n_remove_{int(n_remove_per_sex)}.seed_*.{"" if prune else "not_"}pruned*.tsv'
    ls = hl.hadoop_ls(reg_path_regex)
    reg_paths = sorted([f['path'] for f in ls])
    df_list = []
    for reg_path in reg_paths:
        with hl.hadoop_open(reg_path) as f:
            df_list.append(pd.read_csv(f, sep='\t'))
    df = pd.concat(df_list, sort=False)
    df.insert(1, 'phen_desc',
              df.phen.astype(str).apply(lambda x: phen_dict[x][0])
              )  # add phenotype description to dataframe

    all_reg_results_path = prs_dir + f'prs_phen_reg.all_phens.n_remove_{int(n_remove_per_sex)}.{"" if prune else "not_"}pruned.tsv'
    if hl.hadoop_is_file(all_reg_results_path) and not overwrite:
        print('\n... Full PRS-phen regression results already written! ...')
        print(all_reg_results_path)
    else:
        print('\n... Writing PRS-phen regression results ...')
        print(all_reg_results_path)
        with hl.hadoop_open(all_reg_results_path, 'w') as f:
            df.to_csv(f, sep='\t', index=False)
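
This works because hl.hadoop_open returns a file-like handle, so pandas can read from and write to cloud (or local) storage directly. A minimal round trip with toy data and a hypothetical path:

import hail as hl
import pandas as pd

df = pd.DataFrame({'phen': ['50_irnt'], 'r2': [0.12]})  # toy data, illustrative columns

with hl.hadoop_open('/tmp/prs_reg_part1.tsv', 'w') as f:   # hypothetical path
    df.to_csv(f, sep='\t', index=False)

with hl.hadoop_open('/tmp/prs_reg_part1.tsv') as f:
    print(pd.read_csv(f, sep='\t'))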
Example No. 6
def export_loo(batch_size=256, update=False):
    r'''
    For exporting p-values of meta-analysis of leave-one-out population sets
    '''
    meta_mt0 = hl.read_matrix_table(get_meta_analysis_results_path())
    
    meta_mt0 = meta_mt0.select_rows()

    meta_mt0 = meta_mt0.annotate_cols(pheno_id = get_pheno_id(tb=meta_mt0))
    
    meta_mt0 = meta_mt0.filter_cols(hl.len(meta_mt0.pheno_data.pop)==6)
    
    if update:    
        current_dir = f'{ldprune_dir}/loo/sumstats/batch1' # directory of current results to update
        
        ss_list = hl.hadoop_ls(current_dir)
        pheno_id_list = [x['path'].replace('.tsv.bgz','').replace(f'{current_dir}/','') for x in ss_list if 'bgz' in x['path']]
    
        meta_mt0 = meta_mt0.filter_cols(~hl.literal(pheno_id_list).contains(meta_mt0.pheno_id))
                
    meta_mt0 = meta_mt0.annotate_rows(chr = meta_mt0.locus.contig,
                                      pos = meta_mt0.locus.position,
                                      SNP = (meta_mt0.locus.contig+':'+
                                             hl.str(meta_mt0.locus.position)+':'+
                                             meta_mt0.alleles[0]+':'+
                                             meta_mt0.alleles[1]))

    all_pops = sorted(['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID'])

    annotate_dict = {}
    '''
    pop_idx corresponds to the alphabetical ordering of the pops.
    The entry with idx=0 is the 6-pop meta-analysis, idx=1 is the 5-pop not-AFR
    meta-analysis, idx=2 is the 5-pop not-AMR meta-analysis, etc.
    '''
    for pop_idx, pop in enumerate(all_pops,1): 
        annotate_dict.update({f'pval_not_{pop}': meta_mt0.meta_analysis.Pvalue[pop_idx]})
    meta_mt1 = meta_mt0.annotate_entries(**annotate_dict)
    
    meta_mt1 = meta_mt1.key_cols_by('pheno_id')
    meta_mt1 = meta_mt1.key_rows_by().drop('locus','alleles','meta_analysis')
    
    meta_mt1.describe()
    
    batch_idx = 1
    get_export_path = lambda batch_idx: f'{ldprune_dir}/loo/sumstats/batch{batch_idx}'
    while hl.hadoop_is_dir(get_export_path(batch_idx)):
        batch_idx += 1
    print(f'\nExporting to: {get_export_path(batch_idx)}\n')
    print(meta_mt1.count_cols())
    hl.experimental.export_entries_by_col(mt = meta_mt1,
                                          path = get_export_path(batch_idx),
                                          bgzip = True,
                                          batch_size = batch_size,
                                          use_string_key_as_file_name = True,
                                          header_json_in_file = False)
Example No. 7
def annotate_sites(ht, bed_dir="gs://gnomad-qingbowang/finucane_et_al_hg38_ht/"):
    # Currently only supports hg38. Returns the annotated Hail Table.
    # For each functional annotation, add a binary indicator column.
    ls = hl.hadoop_ls(bed_dir)
    for i in range(len(ls)):
        func_interval = hl.read_table(ls[i]["path"])
        func_name = ls[i]["path"].split("/")[-1].split(".")[0]
        ht = ht.annotate(func_name_tmp=hl.is_defined(func_interval[ht.locus]))  # temporary column name
        ht = ht.rename({"func_name_tmp": func_name})  # rename to the functional annotation name itself
        print("done {0}".format(func_name))
    return ht
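
The core of annotate_sites is indexing an interval-keyed table with a locus; hl.is_defined on the result marks whether the locus falls inside any interval. A self-contained toy sketch of just that lookup (field and contig choices are illustrative):

import hail as hl

# Three toy variants on chr1.
variants = hl.utils.range_table(3)
variants = variants.annotate(
    locus=hl.locus('chr1', 100000 * (variants.idx + 1), reference_genome='GRCh38')
).key_by('locus')

# One toy region; keying by interval enables point-in-interval lookups.
regions = hl.utils.range_table(1)
regions = regions.annotate(
    interval=hl.parse_locus_interval('chr1:1-150000', reference_genome='GRCh38')
).key_by('interval')

variants = variants.annotate(in_region=hl.is_defined(regions[variants.locus]))
variants.show()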
Example No. 8
    def test_hadoop_ls(self):
        path1 = resource('ls_test/f_50')
        ls1 = hl.hadoop_ls(path1)
        self.assertEqual(len(ls1), 1)
        self.assertEqual(ls1[0]['size_bytes'], 50)
        self.assertEqual(ls1[0]['is_dir'], False)
        self.assertTrue('path' in ls1[0])
        self.assertTrue('owner' in ls1[0])
        self.assertTrue('modification_time' in ls1[0])

        path2 = resource('ls_test')
        ls2 = hl.hadoop_ls(path2)
        self.assertEqual(len(ls2), 3)
        ls2_dict = {x['path'].split("/")[-1]: x for x in ls2}
        self.assertEqual(ls2_dict['f_50']['size_bytes'], 50)
        self.assertEqual(ls2_dict['f_100']['size_bytes'], 100)
        self.assertEqual(ls2_dict['f_100']['is_dir'], False)
        self.assertEqual(ls2_dict['subdir']['is_dir'], True)
        self.assertTrue('owner' in ls2_dict['f_50'])
        self.assertTrue('modification_time' in ls2_dict['f_50'])
Example No. 10
def get_paths():
    pheno_manifest = hl.import_table(f'{ldprune_dir}/phenotype_manifest.tsv.bgz', impute=True)
    pheno_manifest = pheno_manifest.filter(pheno_manifest.num_pops==6)
    filenames = pheno_manifest.filename.collect()
    all_files = hl.hadoop_ls(loo_sumstats_dir)
    print(len(all_files))
    all_paths = list(map(lambda f: f['path'], all_files))
    print(len(all_paths))
    filename_path_dict = dict(zip([path.split('/')[-1] for path in all_paths],all_paths))
    paths = [filename_path_dict[f] for f in filenames if f in filename_path_dict and filename_path_dict[f] in all_paths]
    print(len(paths))
    return paths
Example No. 11
def annotate_sites_specific_histonemark(ht, bed_dir="gs://gnomad-qingbowang/finucane_et_al_hg38_ht/", mark="H3K4me1"):
    # Restrict to a single histone mark, since annotating all marks at once would blow up.
    # For each matching functional annotation, add a binary indicator column.
    ls = hl.hadoop_ls(bed_dir)
    for i in range(len(ls)):
        func_interval = hl.read_table(ls[i]["path"])
        func_name = ls[i]["path"].split("/")[-1].replace("_narrowpeak.ht", "")
        his_name = func_name.split("-")[-1]
        if his_name == mark:
            ht = ht.annotate(func_name_tmp=hl.is_defined(func_interval[ht.locus]))  # temporary column name
            ht = ht.rename({"func_name_tmp": func_name})  # rename to the functional annotation name itself
            print("done {0}".format(func_name))
    return ht
Example No. 12
def resume_mwzj(temp_dir, globals_for_col_key):
    r'''
    For resuming multiway zip join if intermediate tables have already been written
    '''
    ls = hl.hadoop_ls(temp_dir)
    paths = [x['path'] for x in ls if 'temp_output' in x['path'] ]
    chunk_size = len(paths)
    outer_hts = []
    for i in range(chunk_size):
        outer_hts.append(hl.read_table(f'{temp_dir}/temp_output_{i}.ht'))
    ht = hl.Table.multi_way_zip_join(outer_hts, 'row_field_name_outer', 'global_field_name_outer')
    ht = ht.transmute(inner_row=hl.flatmap(lambda i:
                                           hl.cond(hl.is_missing(ht.row_field_name_outer[i].row_field_name),
                                                   hl.range(0, hl.len(ht.global_field_name_outer[i].global_field_name))
                                                   .map(lambda _: hl.null(ht.row_field_name_outer[i].row_field_name.dtype.element_type)),
                                                   ht.row_field_name_outer[i].row_field_name),
                                           hl.range(hl.len(ht.global_field_name_outer))))
    ht = ht.transmute_globals(inner_global=hl.flatmap(lambda x: x.global_field_name, ht.global_field_name_outer))
    mt = ht._unlocalize_entries('inner_row', 'inner_global', globals_for_col_key)
    return mt
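
resume_mwzj reassembles a MatrixTable from per-chunk intermediate tables via hl.Table.multi_way_zip_join. A toy sketch of that primitive on two small tables with identical schemas:

import hail as hl

ht1 = hl.utils.range_table(5).annotate(a=1)
ht2 = hl.utils.range_table(5).annotate(a=2)
joined = hl.Table.multi_way_zip_join([ht1, ht2], 'data', 'globals')
joined.show()  # each row carries an array 'data' with one struct per input table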
Example No. 13
def prs_phen_reg(test_mt,
                 phen,
                 sex,
                 n_remove,
                 prune,
                 percentiles,
                 seed,
                 use_sex_spec_irnt=False,
                 overwrite=False):

    if use_sex_spec_irnt and 'irnt' not in phen:
        print(f'NOTE: Setting use_sex_spec_irnt=False because phen {phen} is not IRNT')
        use_sex_spec_irnt = False

    test_ht = test_mt.cols()

    reg_path = prs_dir + f'prs_phen_reg.{phen}.{sex}.n_remove_{int(n_remove_per_sex)}.seed_{seed}.{"" if prune else "not_"}pruned{".sexspecirnt" if use_sex_spec_irnt else ""}.tsv'

    if hl.hadoop_is_file(reg_path) and not overwrite:
        print(
            f'... Phen ~ PRS + covariates regression already complete for all gwas versions & percentiles of {phen} {sex} {"sex_spec_irnt" if use_sex_spec_irnt else ""}! ...'
        )
    else:

        row_struct_ls = []

        gwas_versions = [
            'unadjusted', f'mtag_{"def" if sex!="both_sexes" else "rg1"}'
        ]

        for gwas_version in gwas_versions:
            for percentile in percentiles:
                prs_path_without_threshold = prs_dir + f'prs.{phen}.{sex}.n_remove_{int(n_remove_per_sex)}.seed_{seed}.{gwas_version}.{"" if prune else "not_"}pruned*.perc_{percentile}.tsv'
                print(prs_path_without_threshold)
                ls = hl.hadoop_ls(prs_path_without_threshold)
                print(
                    f'WARNING: More than one file matches {prs_path_without_threshold}'
                    if len(ls) > 1 else '')
                print('\n'.join([x['path'] for x in ls]))
                # Default to the first path if more than one exists for a given p-value percentile.
                prs_path = ls[0]['path']
                # Previously used for the both_sexes PRS.
                pval_thresh = prs_path.split('pval_thresh_')[1].split('.perc')[0]
                print(
                    f'... {phen} {sex} {gwas_version} percentile={percentile} ...'
                )
                print(f'... using {prs_path} ...')
                print(f'... pval threshold: {pval_thresh} ...')
                prs_ht = hl.import_table(prs_path,
                                         impute=True,
                                         key='s',
                                         types={'s': hl.tstr})
                test_ht = test_ht.annotate(prs=prs_ht[test_ht.s].prs)

                cov_list = ['prs', 'age', 'age_squared'] + ['PC{:}'.format(i) for i in range(1, 21)]
                for isFemale in [0, 1]:  # test in males, then females
                    test_ht_sex = test_ht.filter(test_ht.isFemale == isFemale)
                    reg = test_ht_sex.aggregate(
                        hl.agg.linreg(
                            y=test_ht_sex.phen,
                            x=[1] + list(
                                map(
                                    lambda x: test_ht_sex[x]
                                    if type(x) is str else x, cov_list))))
                    print(
                        f'\n\n... {phen} {sex} {gwas_version} percentile={percentile} '
                        +
                        f'applied to {"fe" if isFemale else ""}males {"using sex-spec irnt" if use_sex_spec_irnt else ""} ...\n'
                        + f'\n... multiple R^2: {reg.multiple_r_squared} ...' +
                        f'\n... pval for multiple R^2: {reg.multiple_p_value} ...'
                        + f'\n... adjusted R^2: {reg.adjusted_r_squared} ...')
                    row_struct_ls.append({
                        'phen': phen,
                        'gwas_sex': sex,
                        'gwas_version': gwas_version,
                        'sex_spec_irnt': str(use_sex_spec_irnt),
                        'percentile': str(percentile),
                        'pval_threshold': pval_thresh,
                        'sex_tested_on': f'{"fe" if isFemale else ""}males',
                        'multiple_r2': str(reg.multiple_r_squared),
                        'multiple_r2_pval': str(reg.multiple_p_value),
                        'adjusted_r2': str(reg.adjusted_r_squared)
                    })

        ht = hl.Table.parallelize(
            hl.literal(
                row_struct_ls,
                'array<struct{phen: str, gwas_sex: str, gwas_version: str, sex_spec_irnt: str, percentile: str, pval_threshold: str, sex_tested_on: str, multiple_r2: str, multiple_r2_pval: str, adjusted_r2: str}>'
            ))
        ht = ht.annotate(percentile=hl.float(ht.percentile),
                         pval_threshold=hl.float(ht.pval_threshold),
                         multiple_r2=hl.float(ht.multiple_r2),
                         multiple_r2_pval=hl.float(ht.multiple_r2_pval),
                         adjusted_r2=hl.float(ht.adjusted_r2))
        ht.show(12)

        print(
            f'\n\n... Writing regression results to {reg_path} (overwrite={overwrite})...'
        )
        ht.export(reg_path)
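
The regression itself is a single hl.agg.linreg aggregation over the sample table. A self-contained sketch on synthetic data (field names are illustrative):

import hail as hl

# Toy table standing in for the per-sample phenotype/PRS table.
ht = hl.utils.range_table(100)
ht = ht.annotate(prs=hl.rand_norm(0, 1), age=hl.rand_unif(40, 70))
ht = ht.annotate(phen=0.5 * ht.prs + hl.rand_norm(0, 1))

reg = ht.aggregate(hl.agg.linreg(y=ht.phen, x=[1.0, ht.prs, ht.age]))
print(reg.multiple_r_squared, reg.adjusted_r_squared, reg.multiple_p_value)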
Example No. 14
os.chdir("/Users/weisburd/code/methods/gcnv_viewer")
print(os.getcwd())

#%%

#google_storage_dir = "gs://fc-secure-e2c5f2a5-2e76-4c01-a264-419262b2c7c8/dcr_tabs"
#google_storage_dir = "gs://seqr-datasets-gcnv/GRCh38/RDG_WES_Broad_Internal/v1/beds"
google_storage_dir = "gs://seqr-datasets-gcnv/GRCh38/RDG_WES_Broad_Internal/v3/beds"

assert hl.hadoop_is_dir(google_storage_dir)

#%%

batch_name_to_path_and_samples = {}

for result in hl.hadoop_ls(google_storage_dir):

    if not result['path'].endswith('.bed.gz') and not result['path'].endswith('.bed'):
        continue

    if result['size_bytes'] < 1000:
        print(f"ERROR: file size of {result['path']} is too small: {result['size_bytes']}")

    with hl.hadoop_open(result['path'], 'r') as f:
        line = f.readline()
        fields = line.rstrip("\n").split("\t")
        sample_ids = fields[3:]

    batch_name = os.path.basename(result['path']).replace(".dcr", "").replace(".bed", "").replace(".gz", "")
    batch_name_to_path_and_samples[batch_name] = (result['path'], sample_ids)

ht = hl.read_table("gs://gnomad-bw2/gnomad_v3_1_readviz_crams__that_failed_AB_filter_exploded_keyed_by_sample.ht")
current_samples_v31 = ht.distinct().S.collect()  # 4,445 samples
len(set(current_samples_v31) - set(v31_release_samples))   # 919 samples  

ht = hl.read_table("gs://gnomad-bw2/gnomad_v3_readviz_crams__that_failed_AB_filter_exploded_keyed_by_sample.ht")
current_samples_v3 = ht.distinct().S.collect()  # 68,639 samples

len(set(current_samples_v3) - set(v31_release_samples)) # 1,655

"""

#%%

tsv_paths = hl.hadoop_ls("gs://gnomad-bw2/gnomad_v3_1_readviz_tsvs")

#%%

path_tuples = [(os.path.basename(t['path']).replace(".tsv.bgz", ""), t['path']) for t in tsv_paths]
df = pd.DataFrame(path_tuples, columns=['entity:participant_id', 'variants_tsv_bgz'])
df = df.set_index('entity:participant_id')

#%%

df2 = pd.read_table("./metadata/v3_1_new_releasable_cram_paths_with_sex.txt").rename(columns={'CRAM': 'cram_path', 'CRAI': 'crai_path'})
df2 = df2[['sample_id', 'cram_path', 'crai_path']]
df2 = df2.set_index('sample_id')

#%%
Example No. 16
def plot_hail_file_metadata(
        t_path: str) -> Optional[Union[Grid, Tabs, bokeh.plotting.Figure]]:
    """
    Takes path to hail Table or MatrixTable (gs://bucket/path/hail.mt), outputs Grid or Tabs, respectively.
    Or if an unordered Table is provided, a Figure with file sizes is output.
    If metadata file or rows directory is missing, returns None.
    """
    panel_size = 600
    subpanel_size = 150

    files = hl.hadoop_ls(t_path)
    rows_file = [x['path'] for x in files if x['path'].endswith('rows')]
    entries_file = [x['path'] for x in files if x['path'].endswith('entries')]
    # cols_file = [x['path'] for x in files if x['path'].endswith('cols')]
    success_file = [
        x['modification_time'] for x in files if x['path'].endswith('SUCCESS')
    ]

    data_type = 'Table'

    metadata_file = [
        x['path'] for x in files if x['path'].endswith('metadata.json.gz')
    ]
    if not metadata_file:
        warnings.warn('No metadata file found. Exiting...')
        return None

    with hl.hadoop_open(metadata_file[0], 'rb') as f:
        overall_meta = json.loads(f.read())
        rows_per_partition = overall_meta['components']['partition_counts'][
            'counts']

    if not rows_file:
        warnings.warn('No rows directory found. Exiting...')
        return None
    rows_files = hl.hadoop_ls(rows_file[0])

    if entries_file:
        data_type = 'MatrixTable'
        rows_file = [
            x['path'] for x in rows_files if x['path'].endswith('rows')
        ]
        rows_files = hl.hadoop_ls(rows_file[0])
    row_partition_bounds, row_file_sizes = get_rows_data(rows_files)

    total_file_size, row_file_sizes, row_scale = scale_file_sizes(
        row_file_sizes)

    if not row_partition_bounds:
        warnings.warn('Table is not partitioned. Only plotting file sizes')
        row_file_sizes_hist, row_file_sizes_edges = np.histogram(
            row_file_sizes, bins=50)
        p_file_size = figure(plot_width=panel_size, plot_height=panel_size)
        p_file_size.quad(right=row_file_sizes_hist,
                         left=0,
                         bottom=row_file_sizes_edges[:-1],
                         top=row_file_sizes_edges[1:],
                         fill_color="#036564",
                         line_color="#033649")
        p_file_size.yaxis.axis_label = f'File size ({row_scale}B)'
        return p_file_size

    all_data = {
        'partition_widths':
        [-1 if x[0] != x[2] else x[3] - x[1] for x in row_partition_bounds],
        'partition_bounds':
        [f'{x[0]}:{x[1]}-{x[2]}:{x[3]}' for x in row_partition_bounds],
        'spans_chromosome': [
            'Spans chromosomes' if x[0] != x[2] else 'Within chromosome'
            for x in row_partition_bounds
        ],
        'row_file_sizes':
        row_file_sizes,
        'row_file_sizes_human':
        [f'{x:.1f} {row_scale}B' for x in row_file_sizes],
        'rows_per_partition':
        rows_per_partition,
        'index':
        list(range(len(rows_per_partition)))
    }

    if entries_file:
        entries_rows_files = hl.hadoop_ls(entries_file[0])
        entries_rows_file = [
            x['path'] for x in entries_rows_files if x['path'].endswith('rows')
        ]
        if entries_rows_file:
            entries_files = hl.hadoop_ls(entries_rows_file[0])
            entry_partition_bounds, entry_file_sizes = get_rows_data(
                entries_files)
            total_entry_file_size, entry_file_sizes, entry_scale = scale_file_sizes(
                entry_file_sizes)
            all_data['entry_file_sizes'] = entry_file_sizes
            all_data['entry_file_sizes_human'] = [
                f'{x:.1f} {entry_scale}B' for x in entry_file_sizes
            ]

    title = f'{data_type}: {t_path}'

    msg = f"Rows: {sum(all_data['rows_per_partition']):,}<br/>Partitions: {len(all_data['rows_per_partition']):,}<br/>Size: {total_file_size}<br/>"
    if success_file[0]:
        msg += success_file[0]

    source = ColumnDataSource(pd.DataFrame(all_data))
    p = figure(tools=TOOLS, plot_width=panel_size, plot_height=panel_size)
    p.title.text = title
    p.xaxis.axis_label = 'Number of rows'
    p.yaxis.axis_label = f'File size ({row_scale}B)'
    color_map = factor_cmap('spans_chromosome',
                            palette=Spectral8,
                            factors=list(set(all_data['spans_chromosome'])))
    p.scatter('rows_per_partition',
              'row_file_sizes',
              color=color_map,
              legend='spans_chromosome',
              source=source)
    p.legend.location = 'bottom_right'
    p.select_one(HoverTool).tooltips = [
        (x, f'@{x}') for x in ('rows_per_partition', 'row_file_sizes_human',
                               'partition_bounds', 'index')
    ]

    p_stats = Div(text=msg)
    p_rows_per_partition = figure(x_range=p.x_range,
                                  plot_width=panel_size,
                                  plot_height=subpanel_size)
    p_file_size = figure(y_range=p.y_range,
                         plot_width=subpanel_size,
                         plot_height=panel_size)

    rows_per_partition_hist, rows_per_partition_edges = np.histogram(
        all_data['rows_per_partition'], bins=50)
    p_rows_per_partition.quad(top=rows_per_partition_hist,
                              bottom=0,
                              left=rows_per_partition_edges[:-1],
                              right=rows_per_partition_edges[1:],
                              fill_color="#036564",
                              line_color="#033649")
    row_file_sizes_hist, row_file_sizes_edges = np.histogram(
        all_data['row_file_sizes'], bins=50)
    p_file_size.quad(right=row_file_sizes_hist,
                     left=0,
                     bottom=row_file_sizes_edges[:-1],
                     top=row_file_sizes_edges[1:],
                     fill_color="#036564",
                     line_color="#033649")

    rows_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]])

    if 'entry_file_sizes' in all_data:
        title = f'Statistics for {data_type}: {t_path}'

        msg = f"Rows: {sum(all_data['rows_per_partition']):,}<br/>Partitions: {len(all_data['rows_per_partition']):,}<br/>Size: {total_entry_file_size}<br/>"
        if success_file[0]:
            msg += success_file[0]

        source = ColumnDataSource(pd.DataFrame(all_data))
        panel_size = 600
        subpanel_size = 150
        p = figure(tools=TOOLS, plot_width=panel_size, plot_height=panel_size)
        p.title.text = title
        p.xaxis.axis_label = 'Number of rows'
        p.yaxis.axis_label = f'File size ({entry_scale}B)'
        color_map = factor_cmap('spans_chromosome',
                                palette=Spectral8,
                                factors=list(set(
                                    all_data['spans_chromosome'])))
        p.scatter('rows_per_partition',
                  'entry_file_sizes',
                  color=color_map,
                  legend='spans_chromosome',
                  source=source)
        p.legend.location = 'bottom_right'
        p.select_one(HoverTool).tooltips = [
            (x, f'@{x}')
            for x in ('rows_per_partition', 'entry_file_sizes_human',
                      'partition_bounds', 'index')
        ]

        p_stats = Div(text=msg)
        p_rows_per_partition = figure(x_range=p.x_range,
                                      plot_width=panel_size,
                                      plot_height=subpanel_size)
        p_rows_per_partition.quad(top=rows_per_partition_hist,
                                  bottom=0,
                                  left=rows_per_partition_edges[:-1],
                                  right=rows_per_partition_edges[1:],
                                  fill_color="#036564",
                                  line_color="#033649")
        p_file_size = figure(y_range=p.y_range,
                             plot_width=subpanel_size,
                             plot_height=panel_size)

        row_file_sizes_hist, row_file_sizes_edges = np.histogram(
            all_data['entry_file_sizes'], bins=50)
        p_file_size.quad(right=row_file_sizes_hist,
                         left=0,
                         bottom=row_file_sizes_edges[:-1],
                         top=row_file_sizes_edges[1:],
                         fill_color="#036564",
                         line_color="#033649")
        entries_grid = gridplot([[p_rows_per_partition, p_stats],
                                 [p, p_file_size]])

        return Tabs(tabs=[
            Panel(child=entries_grid, title='Entries'),
            Panel(child=rows_grid, title='Rows')
        ])
    else:
        return rows_grid
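
A possible way to call the helper above, assuming a table has already been written to the (hypothetical) path and that bokeh output is displayed in a notebook:

from bokeh.io import show

grid = plot_hail_file_metadata('/tmp/demo_table.ht')  # hypothetical path to a written table
if grid is not None:
    show(grid)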
Example No. 17
def plot_hail_file_metadata(
        t_path: str) -> Optional[Union[Grid, Tabs, bokeh.plotting.Figure]]:
    """
    Take path to hail Table or MatrixTable (gs://bucket/path/hail.mt), output Grid or Tabs, respectively.

    Or if an unordered Table is provided, a Figure with file sizes is output.
    If metadata file or rows directory is missing, returns None.
    """
    panel_size = 600
    subpanel_size = 150

    files = hl.hadoop_ls(t_path)
    rows_file = [x["path"] for x in files if x["path"].endswith("rows")]
    entries_file = [x["path"] for x in files if x["path"].endswith("entries")]
    # cols_file = [x['path'] for x in files if x['path'].endswith('cols')]
    success_file = [
        x["modification_time"] for x in files if x["path"].endswith("SUCCESS")
    ]

    data_type = "Table"

    metadata_file = [
        x["path"] for x in files if x["path"].endswith("metadata.json.gz")
    ]
    if not metadata_file:
        logger.warning("No metadata file found. Exiting...")
        return None

    with hl.hadoop_open(metadata_file[0], "rb") as f:
        overall_meta = json.loads(f.read())
        rows_per_partition = overall_meta["components"]["partition_counts"][
            "counts"]

    if not rows_file:
        logger.warning("No rows directory found. Exiting...")
        return None
    rows_files = hl.hadoop_ls(rows_file[0])

    if entries_file:
        data_type = "MatrixTable"
        rows_file = [
            x["path"] for x in rows_files if x["path"].endswith("rows")
        ]
        rows_files = hl.hadoop_ls(rows_file[0])
    row_partition_bounds, row_file_sizes = get_rows_data(rows_files)

    total_file_size, row_file_sizes, row_scale = scale_file_sizes(
        row_file_sizes)

    if not row_partition_bounds:
        logger.warning("Table is not partitioned. Only plotting file sizes")
        row_file_sizes_hist, row_file_sizes_edges = np.histogram(
            row_file_sizes, bins=50)
        p_file_size = figure(plot_width=panel_size, plot_height=panel_size)
        p_file_size.quad(
            right=row_file_sizes_hist,
            left=0,
            bottom=row_file_sizes_edges[:-1],
            top=row_file_sizes_edges[1:],
            fill_color="#036564",
            line_color="#033649",
        )
        p_file_size.yaxis.axis_label = f"File size ({row_scale}B)"
        return p_file_size

    all_data = {
        "partition_widths":
        [-1 if x[0] != x[2] else x[3] - x[1] for x in row_partition_bounds],
        "partition_bounds":
        [f"{x[0]}:{x[1]}-{x[2]}:{x[3]}" for x in row_partition_bounds],
        "spans_chromosome": [
            "Spans chromosomes" if x[0] != x[2] else "Within chromosome"
            for x in row_partition_bounds
        ],
        "row_file_sizes":
        row_file_sizes,
        "row_file_sizes_human":
        [f"{x:.1f} {row_scale}B" for x in row_file_sizes],
        "rows_per_partition":
        rows_per_partition,
        "index":
        list(range(len(rows_per_partition))),
    }

    if entries_file:
        entries_rows_files = hl.hadoop_ls(entries_file[0])
        entries_rows_file = [
            x["path"] for x in entries_rows_files if x["path"].endswith("rows")
        ]
        if entries_rows_file:
            entries_files = hl.hadoop_ls(entries_rows_file[0])
            entry_partition_bounds, entry_file_sizes = get_rows_data(
                entries_files)
            total_entry_file_size, entry_file_sizes, entry_scale = scale_file_sizes(
                entry_file_sizes)
            all_data["entry_file_sizes"] = entry_file_sizes
            all_data["entry_file_sizes_human"] = [
                f"{x:.1f} {entry_scale}B" for x in row_file_sizes
            ]

    title = f"{data_type}: {t_path}"

    msg = f"Rows: {sum(all_data['rows_per_partition']):,}<br/>Partitions: {len(all_data['rows_per_partition']):,}<br/>Size: {total_file_size}<br/>"
    if success_file[0]:
        msg += success_file[0]

    source = ColumnDataSource(pd.DataFrame(all_data))
    p = figure(tools=TOOLS, plot_width=panel_size, plot_height=panel_size)
    p.title.text = title
    p.xaxis.axis_label = "Number of rows"
    p.yaxis.axis_label = f"File size ({row_scale}B)"
    color_map = factor_cmap(
        "spans_chromosome",
        palette=Spectral8,
        factors=list(set(all_data["spans_chromosome"])),
    )
    p.scatter(
        "rows_per_partition",
        "row_file_sizes",
        color=color_map,
        legend="spans_chromosome",
        source=source,
    )
    p.legend.location = "bottom_right"
    p.select_one(HoverTool).tooltips = [(x, f"@{x}") for x in (
        "rows_per_partition",
        "row_file_sizes_human",
        "partition_bounds",
        "index",
    )]

    p_stats = Div(text=msg)
    p_rows_per_partition = figure(x_range=p.x_range,
                                  plot_width=panel_size,
                                  plot_height=subpanel_size)
    p_file_size = figure(y_range=p.y_range,
                         plot_width=subpanel_size,
                         plot_height=panel_size)

    rows_per_partition_hist, rows_per_partition_edges = np.histogram(
        all_data["rows_per_partition"], bins=50)
    p_rows_per_partition.quad(
        top=rows_per_partition_hist,
        bottom=0,
        left=rows_per_partition_edges[:-1],
        right=rows_per_partition_edges[1:],
        fill_color="#036564",
        line_color="#033649",
    )
    row_file_sizes_hist, row_file_sizes_edges = np.histogram(
        all_data["row_file_sizes"], bins=50)
    p_file_size.quad(
        right=row_file_sizes_hist,
        left=0,
        bottom=row_file_sizes_edges[:-1],
        top=row_file_sizes_edges[1:],
        fill_color="#036564",
        line_color="#033649",
    )

    rows_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]])

    if "entry_file_sizes" in all_data:
        title = f"Statistics for {data_type}: {t_path}"

        msg = f"Rows: {sum(all_data['rows_per_partition']):,}<br/>Partitions: {len(all_data['rows_per_partition']):,}<br/>Size: {total_entry_file_size}<br/>"
        if success_file[0]:
            msg += success_file[0]

        source = ColumnDataSource(pd.DataFrame(all_data))
        panel_size = 600
        subpanel_size = 150
        p = figure(tools=TOOLS, plot_width=panel_size, plot_height=panel_size)
        p.title.text = title
        p.xaxis.axis_label = "Number of rows"
        p.yaxis.axis_label = f"File size ({entry_scale}B)"
        color_map = factor_cmap(
            "spans_chromosome",
            palette=Spectral8,
            factors=list(set(all_data["spans_chromosome"])),
        )
        p.scatter(
            "rows_per_partition",
            "entry_file_sizes",
            color=color_map,
            legend="spans_chromosome",
            source=source,
        )
        p.legend.location = "bottom_right"
        p.select_one(HoverTool).tooltips = [(x, f"@{x}") for x in (
            "rows_per_partition",
            "entry_file_sizes_human",
            "partition_bounds",
            "index",
        )]

        p_stats = Div(text=msg)
        p_rows_per_partition = figure(x_range=p.x_range,
                                      plot_width=panel_size,
                                      plot_height=subpanel_size)
        p_rows_per_partition.quad(
            top=rows_per_partition_hist,
            bottom=0,
            left=rows_per_partition_edges[:-1],
            right=rows_per_partition_edges[1:],
            fill_color="#036564",
            line_color="#033649",
        )
        p_file_size = figure(y_range=p.y_range,
                             plot_width=subpanel_size,
                             plot_height=panel_size)

        row_file_sizes_hist, row_file_sizes_edges = np.histogram(
            all_data["entry_file_sizes"], bins=50)
        p_file_size.quad(
            right=row_file_sizes_hist,
            left=0,
            bottom=row_file_sizes_edges[:-1],
            top=row_file_sizes_edges[1:],
            fill_color="#036564",
            line_color="#033649",
        )
        entries_grid = gridplot([[p_rows_per_partition, p_stats],
                                 [p, p_file_size]])

        return Tabs(tabs=[
            Panel(child=entries_grid, title="Entries"),
            Panel(child=rows_grid, title="Rows"),
        ])
    else:
        return rows_grid
Example No. 18
def get_ss_path_list(sumstats_dir):
    ss_list = hl.hadoop_ls(sumstats_dir)
    ss_path_list = [x['path'] for x in ss_list if 'bgz' in x['path']]
    print(f'\nNumber of sumstats files: {len(ss_path_list)}\n')
    return ss_path_list
Example No. 19
def prs(mt, phen, sex, n_remove, prune, percentiles, seed, count=True):
    r'''
    Calculate PRS using betas from both sexes and sex-stratified GWAS, as well
    as MTAG meta-analyzed betas. PRS are always calculated on both sexes, 
    regardless of the sex the GWAS was run on.
    P-value thresholds are determined by percentile.
    Set `count=True` if running this for the first time, to be sure that the
    numbers make sense. To speed things up, set `count=False`.
    '''
    assert sex in [
        'both_sexes', 'female', 'male'
    ], f'WARNING: sex={sex} not allowed. sex must be one of the following: both_sexes, female, male'

    # "def" uses the MTAG results created by using the default settings
    # "rg1" uses the MTAG results created by using the --perfect-gencov flag
    gwas_versions = [
        'unadjusted', f'mtag_{"rg1" if sex=="both_sexes" else "def"}'
    ]

    mt = mt

    for gwas_version in gwas_versions:
        print(
            f'\n... Calculating PRS for "{phen_dict[phen][0]}" {sex} {gwas_version} ...\n'
        )
        gwas_version_suffix = "" if gwas_version == 'unadjusted' else '.' + gwas_version
        gwas_path = (
            prs_dir +
            f'{phen}.gwas.{sex}.n_remove_{n_remove_per_sex}.seed_{seed}{gwas_version_suffix}.tsv.{"b" if gwas_version=="unadjusted" else ""}gz'
        )

        ss = hl.import_table(
            gwas_path,
            impute=True,
            key='snpid' if gwas_version == 'unadjusted' else 'SNP',
            force=True)

        if prune:
            print('\n... Pruning SNPs ...\n')
            # define the set of SNPs
            pruned_snps_path = 'gs://nbaya/risk_gradients/ukb_imp_v3_pruned.bim'  #from Robert Maier (pruning threshold r2=0.2, random 10k UKB sample), download here: https://github.com/nikbaya/split/blob/master/ukb_imp_v3_pruned.bim.gz
            variants = hl.import_table(pruned_snps_path,
                                       delimiter='\t',
                                       no_header=True,
                                       impute=True)
            print(f'\n... Pruning to variants in {pruned_snps_path} ...\n')
            variants = variants.rename({
                'f0': 'chr',
                'f1': 'rsid',
                'f3': 'pos'
            }).key_by('rsid')
            #            mt = mt.key_rows_by('rsid')
            # filter to variants defined in variants table
            ss = ss.filter(
                hl.is_defined(variants[ss['snpid' if gwas_version ==
                                          'unadjusted' else 'SNP']]))
            if count:
                ct_rows = ss.count()
                print(
                    f'\n\n... SNP count after pruning filter: {ct_rows} ...\n')
        else:
            print(f'\n... Not pruning because prune={prune} ...\n')

        for percentile in percentiles:

            ## use path without threshold to check if PRS was already run because it doesn't require calculating the pval threshold
            prs_path_without_threshold = (
                prs_dir +
                f'prs.{phen}.{sex}.n_remove_{int(n_remove_per_sex)}.seed_{seed}.{gwas_version}.{"" if prune else "not_"}pruned.pval_thresh_*.perc_{percentile}.tsv'
            )
            # prs_path_without_threshold = (prs_dir+f'prs.{phen}.{sex}.n_remove_{int(n_remove_per_sex)}.seed_{seed}.{gwas_version}.{"" if prune else "not_"}pruned.pval_thresh_*.perc_{percentile}.opposite_sex.tsv')

            if len(hl.hadoop_ls(prs_path_without_threshold)) > 0:
                print(
                    f'\n\n... Calculation of PRS for "{phen_dict[phen][0]}" {sex} {gwas_version} for percentile {percentile} already completed! ...\n'
                )
            else:
                start = dt.now()

                if percentile != 1:
                    threshold = ss.aggregate(
                        hl.agg.approx_quantiles(
                            ss[('' if gwas_version == 'unadjusted' else 'mtag_'
                                ) + 'pval'], percentile))
                    ss = ss.filter(
                        ss[('' if gwas_version == 'unadjusted' else 'mtag_') +
                           'pval'] <= threshold)
                else:
                    threshold = 1
                threshold_str = '{:.4e}'.format(threshold)
                prs_path = prs_dir + f'prs.{phen}.{sex}.n_remove_{int(n_remove_per_sex)}.seed_{seed}.{gwas_version}.{"" if prune else "not_"}pruned.pval_thresh_{threshold_str}.perc_{percentile}.tsv'

                print(
                    f'\n\n... Using p-value threshold of {threshold} for percentile {percentile} ...\n'
                )
                mt = mt.annotate_rows(
                    beta=ss[mt.rsid]['beta' if gwas_version ==
                                     'unadjusted' else 'mtag_beta'])

                if count:
                    if percentile != 1:
                        threshold_ct = mt.filter_rows(hl.is_defined(
                            mt.beta)).count_rows()
                    else:
                        threshold_ct = ct_rows

                    print(
                        f'\n\n... Variants remaining after thresholding filter: {threshold_ct} ...\n'
                    )

                mt = mt.annotate_cols(prs=hl.agg.sum(mt.dosage * mt.beta))

                if count:
                    mt_cols_ct = mt.filter_cols(hl.is_defined(
                        mt.prs)).count_cols()

                    print(f'\n\n... Samples with PRS: {mt_cols_ct} ...\n')

                mt.cols().describe()

                mt.cols().select('prs').export(prs_path)

                elapsed = dt.now() - start

                print(
                    f'\n\n... Completed calculation of PRS for "{phen_dict[phen][0]}" {sex} {gwas_version} ...'
                )
                print(
                    f'\n... Elapsed time: {round(elapsed.seconds/60, 2)} min ...\n'
                )
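
The two Hail primitives doing the heavy lifting here are hl.agg.approx_quantiles, which turns a percentile into a p-value threshold, and the per-sample entry aggregation hl.agg.sum(mt.dosage * mt.beta) that produces the PRS. A self-contained sketch on synthetic data (all fields are stand-ins for the real GWAS inputs):

import hail as hl

# Toy MatrixTable standing in for the real genotype data.
mt = hl.balding_nichols_model(n_populations=1, n_samples=10, n_variants=100)
mt = mt.annotate_entries(dosage=hl.float(mt.GT.n_alt_alleles()))
mt = mt.annotate_rows(beta=hl.rand_norm(0, 1), pval=hl.rand_unif(0, 1))

# Percentile -> p-value threshold, then keep only variants passing it.
threshold = mt.aggregate_rows(hl.agg.approx_quantiles(mt.pval, 0.1))
mt = mt.filter_rows(mt.pval <= threshold)

# Per-sample PRS = sum over variants of dosage * beta.
mt = mt.annotate_cols(prs=hl.agg.sum(mt.dosage * mt.beta))
mt.cols().select('prs').show()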