def export_ma_format(batch_size=256):
    r'''
    Export columns in .ma format (SNP, A1, A2, freq, b, se, p, N) for select phenotypes
    '''
    meta_mt0 = hl.read_matrix_table(get_meta_analysis_results_path())

    highprev = hl.import_table(f'{ldprune_dir}/joined_ukbb_lancet_age_high_prev.tsv', impute=True)
    highprev = highprev.annotate(pheno = highprev.code.replace('_irnt',''))
    pheno_list = highprev.pheno.collect()
    pheno_list = [p for p in pheno_list if p is not None]
    meta_mt0 = meta_mt0.filter_cols(hl.literal(pheno_list).contains(meta_mt0.pheno))

    meta_mt0 = meta_mt0.annotate_cols(pheno_id = (meta_mt0.trait_type+'-'+
                                                  meta_mt0.phenocode+'-'+
                                                  meta_mt0.pheno_sex+
                                                  hl.if_else(hl.len(meta_mt0.coding)>0, '-'+meta_mt0.coding, '')+
                                                  hl.if_else(hl.len(meta_mt0.modifier)>0, '-'+meta_mt0.modifier, '')
                                                  ).replace(' ','_').replace('/','_'))

    meta_mt0 = meta_mt0.annotate_rows(SNP = meta_mt0.locus.contig+':'+hl.str(meta_mt0.locus.position)+':'+meta_mt0.alleles[0]+':'+meta_mt0.alleles[1],
                                      A1 = meta_mt0.alleles[1], # .ma format requires A1 = effect allele, which in this case is A2 for UKB GWAS
                                      A2 = meta_mt0.alleles[0])

    meta_field_rename_dict = {'BETA':'b',
                              'SE':'se',
                              'Pvalue':'p',
                              'AF_Allele2':'freq',
                              'N':'N'}

    for pop in ['AFR','EUR']: #['AFR','AMR','CSA','EAS','EUR','MID']:
        print(f'not_{pop}')

        req_pop_list = [p for p in POPS if p != pop]

        loo_pop = meta_mt0.annotate_cols(idx = meta_mt0.meta_analysis_data.pop.index(hl.literal(req_pop_list))) # get index of which meta-analysis is the leave-one-out for current pop
        loo_pop = loo_pop.filter_cols(hl.is_defined(loo_pop.idx))

        annotate_dict = {meta_field_rename_dict[field]: loo_pop.meta_analysis[field][loo_pop.idx]
                         for field in ['AF_Allele2','BETA','SE','Pvalue','N']}
        # apply the renamed meta-analysis fields as entry fields
        # (required by the filter on loo_pop.b below)
        loo_pop = loo_pop.annotate_entries(**annotate_dict)

        batch_idx = 1
        export_out = f'{ldprune_dir}/loo/not_{pop}/batch{batch_idx}'
        while hl.hadoop_is_dir(export_out):
            batch_idx += 1
            export_out = f'{ldprune_dir}/loo/not_{pop}/batch{batch_idx}'
        checkpoint_path = f'gs://ukbb-diverse-temp-30day/loo/not_{pop}/batch{batch_idx}.mt'
        # print(f'\nCheckpointing to: {checkpoint_path}\n')
        loo_pop = loo_pop.checkpoint(checkpoint_path,
                                     _read_if_exists=True,
                                     overwrite=True)
        loo_pop = loo_pop.filter_entries(hl.is_defined(loo_pop.b))

        print(f'\nExporting to: {export_out}\n')
        hl.experimental.export_entries_by_col(mt = loo_pop,
                                              path = export_out,
                                              bgzip = True,
                                              batch_size = batch_size,
                                              use_string_key_as_file_name = True,
                                              header_json_in_file = False)
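The `.index` lookup above is the crux of matching each population to its leave-one-out meta-analysis: `meta_analysis_data.pop` holds one pop list per meta-analysis, and `.index` returns the position of the entry equal to the five-pop list with the current pop removed (or missing if there is none, hence the `filter_cols` on `hl.is_defined`). A toy illustration with made-up pop lists, not the real schema:

```python
# Toy illustration of ArrayExpression.index as used above: given the per-column
# array of meta-analysis pop lists, find which entry is the leave-one-out set.
import hail as hl

pop_lists = hl.literal([['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID'],  # idx 0: full 6-pop meta-analysis
                        ['AFR', 'AMR', 'CSA', 'EAS', 'MID']])        # idx 1: leave-out-EUR meta-analysis
not_eur = hl.literal(['AFR', 'AMR', 'CSA', 'EAS', 'MID'])
print(hl.eval(pop_lists.index(not_eur)))  # prints 1; missing if no entry matches
```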
def export_loo(batch_size=256):
    r'''
    For exporting p-values of meta-analysis of leave-one-out population sets
    '''
    meta_mt0 = hl.read_matrix_table(get_meta_analysis_results_path())

    meta_mt0 = meta_mt0.filter_cols(hl.len(meta_mt0.pheno_data.pop) == 6)

    meta_mt0 = meta_mt0.annotate_cols(pheno_id=(
        meta_mt0.trait_type + '-' +
        meta_mt0.phenocode + '-' +
        meta_mt0.pheno_sex +
        hl.if_else(hl.len(meta_mt0.coding) > 0, '-' + meta_mt0.coding, '') +
        hl.if_else(hl.len(meta_mt0.modifier) > 0, '-' + meta_mt0.modifier, '')
    ).replace(' ', '_').replace('/', '_'))

    meta_mt0 = meta_mt0.annotate_rows(
        SNP=(meta_mt0.locus.contig + ':' + hl.str(meta_mt0.locus.position) +
             ':' + meta_mt0.alleles[0] + ':' + meta_mt0.alleles[1]))

    all_pops = sorted(['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID'])

    annotate_dict = {}
    for pop_idx, pop in enumerate(all_pops, 1):
        # pop_idx corresponds to the alphabetical ordering of the pops
        # (the entry with idx=0 is the 6-pop meta-analysis)
        annotate_dict.update({f'pval_not_{pop}': meta_mt0.meta_analysis.Pvalue[pop_idx]})
    meta_mt1 = meta_mt0.annotate_entries(**annotate_dict)

    meta_mt1 = meta_mt1.key_cols_by('pheno_id')
    meta_mt1 = meta_mt1.key_rows_by().drop('locus', 'alleles', 'gene', 'annotation', 'meta_analysis')

    print(meta_mt1.describe())

    batch_idx = 1
    get_export_path = lambda batch_idx: f'{ldprune_dir}/loo/sumstats/batch{batch_idx}'
    while hl.hadoop_is_dir(get_export_path(batch_idx)):
        batch_idx += 1
    print(f'\nExporting to: {get_export_path(batch_idx)}\n')
    hl.experimental.export_entries_by_col(mt=meta_mt1,
                                          path=get_export_path(batch_idx),
                                          bgzip=True,
                                          batch_size=batch_size,
                                          use_string_key_as_file_name=True,
                                          header_json_in_file=False)
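All of the export functions here probe for the first unused `batch{N}` directory with the same `while hl.hadoop_is_dir(...)` loop. A minimal sketch of a shared helper that factors the pattern out (hypothetical, not part of the original module):

```python
# Hypothetical helper (not in the original module) factoring out the
# "find the first unused batch directory" loop shared by the exporters.
import hail as hl

def next_batch_dir(base_dir: str) -> str:
    '''Return the first f'{base_dir}/batch{N}' (N >= 1) that does not yet exist.'''
    batch_idx = 1
    while hl.hadoop_is_dir(f'{base_dir}/batch{batch_idx}'):
        batch_idx += 1
    return f'{base_dir}/batch{batch_idx}'
```

With this helper, the loop in `export_loo` would reduce to `export_path = next_batch_dir(f'{ldprune_dir}/loo/sumstats')`. Note the probe-then-write sequence is not atomic, so two concurrent jobs could still race for the same batch directory, just as in the original loops.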
def export_binary_eur(cluster_idx, num_clusters=10, batch_size=256):
    r'''
    Export summary statistics for binary traits defined only for EUR.
    Given the large number of such traits (4184), it makes sense to batch this
    across `num_clusters` clusters for reduced wall time and robustness to
    mid-export errors.
    NOTE: `cluster_idx` is 1-indexed.
    '''
    mt0 = get_final_sumstats_mt_for_export()
    meta_mt0 = hl.read_matrix_table(get_meta_analysis_results_path())

    mt0 = mt0.annotate_cols(pheno_id = get_pheno_id(tb=mt0))
    mt0 = mt0.annotate_rows(chr = mt0.locus.contig,
                            pos = mt0.locus.position,
                            ref = mt0.alleles[0],
                            alt = mt0.alleles[1])

    trait_types_to_run = ['categorical','phecode', 'icd10', 'prescriptions'] # list of which trait_type to run

    # fields specific to each category of trait
    meta_fields = ['AF_Cases','AF_Controls']
    fields = ['AF.Cases','AF.Controls']

    # dictionaries for renaming fields
    meta_field_rename_dict = {'BETA':'beta_meta',
                              'SE':'se_meta',
                              'Pvalue':'pval_meta',
                              'AF_Cases':'af_cases_meta',
                              'AF_Controls':'af_controls_meta',
                              'Pvalue_het':'pval_heterogeneity'}
    field_rename_dict = {'AF.Cases':'af_cases',
                         'AF.Controls':'af_controls',
                         'BETA':'beta',
                         'SE':'se',
                         'Pvalue':'pval',
                         'low_confidence':'low_confidence'} # decided on this implementation to make later code cleaner

    all_binary_trait_types = {'categorical','phecode', 'icd10', 'prescriptions'}

    meta_fields += ['BETA','SE','Pvalue','Pvalue_het']
    fields += ['BETA','SE','Pvalue','low_confidence']

    trait_category = 'binary'
    trait_types = all_binary_trait_types.intersection(trait_types_to_run) # get set of binary trait types to run
    pop_set = {'EUR'}
    start = time()

    mt1 = mt0.filter_cols((hl.literal(trait_types).contains(mt0.trait_type))&
                          (hl.set(mt0.pheno_data.pop)==hl.literal(pop_set)))

    pheno_id_list = mt1.pheno_id.collect()

    num_traits = len(pheno_id_list) # total number of traits to run
    traits_per_cluster = ceil(num_traits/num_clusters) # maximum traits to run per cluster

    cluster_pheno_id_list = pheno_id_list[(cluster_idx-1)*traits_per_cluster:cluster_idx*traits_per_cluster] # list of traits to run in current cluster

    print(len(cluster_pheno_id_list))

    mt1 = mt1.filter_cols(hl.literal(cluster_pheno_id_list).contains(mt1.pheno_id))

    pop_list = sorted(pop_set)

    annotate_dict = {}

    keyed_mt = meta_mt0[mt1.row_key,mt1.col_key]
    if len(pop_set)>1:
        for field in meta_fields: # NOTE: Meta-analysis columns go before per-population columns
            field_expr = keyed_mt.meta_analysis[field][0]
            annotate_dict.update({f'{meta_field_rename_dict[field]}': hl.if_else(hl.is_nan(field_expr),
                                                                                 hl.str(field_expr),
                                                                                 hl.format('%.3e', field_expr))})

    for field in fields:
        for pop_idx, pop in enumerate(pop_list):
            field_expr = mt1.summary_stats[field][pop_idx]
            annotate_dict.update({f'{field_rename_dict[field]}_{pop}': hl.if_else(hl.is_nan(field_expr),
                                                                                  hl.str(field_expr),
                                                                                  hl.str(field_expr) if field=='low_confidence' else hl.format('%.3e', field_expr))})

    mt2 = mt1.annotate_entries(**annotate_dict)

    mt2 = mt2.filter_cols(mt2.coding != 'zekavat_20200409')
    mt2 = mt2.key_cols_by('pheno_id')
    mt2 = mt2.key_rows_by().drop('locus','alleles','summary_stats') # row fields that are no longer included: 'gene','annotation'

    print(mt2.describe())

    batch_idx = 1
    get_export_path = lambda batch_idx: f'{ldprune_dir}/release/{trait_category}/{"-".join(pop_list)}_batch{batch_idx}/subbatch{cluster_idx}'
    while hl.hadoop_is_dir(get_export_path(batch_idx)):
        batch_idx += 1
    print(f'\nExporting {len(cluster_pheno_id_list)} phenos to: {get_export_path(batch_idx)}\n')
    hl.experimental.export_entries_by_col(mt = mt2,
                                          path = get_export_path(batch_idx),
                                          bgzip = True,
                                          batch_size = batch_size,
                                          use_string_key_as_file_name = True,
                                          header_json_in_file = False)
    end = time()
    print(f'\nExport complete for:\n{trait_types}\n{pop_list}\ntime: {round((end-start)/3600,2)} hrs')
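Because `cluster_idx` is 1-indexed and each invocation handles a disjoint slice of `traits_per_cluster` phenotypes, a driver might look like the sketch below. This is hypothetical; to get the wall-time benefit the docstring describes, each call would be submitted to its own Dataproc cluster (e.g. with `hailctl dataproc submit`) rather than run serially in one process.

```python
# Hypothetical serial driver for export_binary_eur; real usage would dispatch
# one cluster_idx per cluster so the exports run in parallel.
NUM_CLUSTERS = 10
for cluster_idx in range(1, NUM_CLUSTERS + 1):  # cluster_idx is 1-indexed
    export_binary_eur(cluster_idx, num_clusters=NUM_CLUSTERS)
```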
def test_hadoop_is_dir(self):
    self.assertTrue(hl.hadoop_is_dir(resource('ls_test/subdir')))
    self.assertFalse(hl.hadoop_is_dir(resource('ls_test/f_50')))
    self.assertFalse(hl.hadoop_is_dir(resource('ls_test/invalid-path')))
@author: nbaya
"""
import subprocess
from itertools import combinations

import hail as hl

hl.init()

for num_pops in range(1, 6)[::-1]:
    all_pops = ['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID']
    pop_sets = [set(i) for i in list(combinations(all_pops, num_pops))]  # list of exact set of pops for which phenotype is defined
    incorrect_pops = all_pops[:num_pops]  # incorrect column suffixes

    for trait_category in ['quant', 'binary']:
        for pop_set in pop_sets:
            pop_list = sorted(pop_set)  # correct column suffixes
            bucket = f'gs://ukb-diverse-pops/ld_prune/release/{trait_category}/{"-".join(pop_list)}_batch1'
            print(bucket)
            if hl.hadoop_is_dir(bucket):
                subprocess.call(['gsutil', '-m', 'cp', f'{bucket}/*bgz', './'])
                assert False  # break
import json
import os

import hail as hl

hl.init(log="/dev/null")

#%%

os.chdir("/Users/weisburd/code/methods/gcnv_viewer")
print(os.getcwd())

#%%

#google_storage_dir = "gs://fc-secure-e2c5f2a5-2e76-4c01-a264-419262b2c7c8/dcr_tabs"
#google_storage_dir = "gs://seqr-datasets-gcnv/GRCh38/RDG_WES_Broad_Internal/v1/beds"
google_storage_dir = "gs://seqr-datasets-gcnv/GRCh38/RDG_WES_Broad_Internal/v3/beds"

assert hl.hadoop_is_dir(google_storage_dir)

#%%

batch_name_to_path_and_samples = {}
for result in hl.hadoop_ls(google_storage_dir):
    if not result['path'].endswith('.bed.gz') and not result['path'].endswith('.bed'):
        continue
    if result['size_bytes'] < 1000:
        print(f"ERROR: file size of {result['path']} is too small: {result['size_bytes']}")
    with hl.hadoop_open(result['path'], 'r') as f:
        line = f.readline()