def main(args): """Find doubleton pairs and compare to related pairs.""" try: hl.init(log="/test_doubletons_relatedness.log", default_reference="GRCh38") compare_doubletons_to_related() finally: logger.info("Copying hail log to logging bucket...") hl.copy_log(args.temp_path)
import os
import subprocess
import time

import hail as hl


def copy_logs_output(log_dir, log_file, plot_dir):
    if not log_dir.endswith("/"):
        log_dir = log_dir + "/"
    datestr = time.strftime("%Y.%m.%d")
    hail_log_name = os.path.join(log_dir, datestr + "_hail_log.txt")
    hl.copy_log(hail_log_name)
    cmd = ['gsutil', 'cp', log_file, log_dir]
    subprocess.call(cmd)
    # gsutil expands the '*' wildcards itself, so no shell is needed here.
    cmd = ['gsutil', 'cp', '*.html', plot_dir]
    subprocess.call(cmd)
    cmd = ['gsutil', 'cp', '*.pdf', plot_dir]
    subprocess.call(cmd)
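# The helper above shells out to gsutil, which must be installed on the
# driver. A minimal alternative sketch using Hail's own hl.hadoop_copy
# (assumptions: plot files are listed explicitly, since hadoop_copy does not
# expand shell globs, and local sources may need an explicit file:// scheme
# depending on the cluster's default filesystem; `plot_files` is a
# hypothetical parameter, not part of the original helper):
import os
import time

import hail as hl


def copy_logs_output_no_gsutil(log_dir, log_file, plot_files, plot_dir):
    if not log_dir.endswith("/"):
        log_dir = log_dir + "/"
    hl.copy_log(os.path.join(log_dir, time.strftime("%Y.%m.%d") + "_hail_log.txt"))
    hl.hadoop_copy(log_file, log_dir + os.path.basename(log_file))
    for f in plot_files:
        hl.hadoop_copy(f, os.path.join(plot_dir, os.path.basename(f)))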
def test_hadoop_copy_log(self):
    with with_local_temp_file('log') as r:
        hl.copy_log(r)
        stats = hl.hadoop_stat(r)
        self.assertTrue(stats['size_bytes'] > 0)
def test_hadoop_copy_log(self):
    r = resource('copy_log_test.txt')
    hl.copy_log(r)
    stats = hl.hadoop_stat(r)
    self.assertTrue(stats['size_bytes'] > 0)
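# Minimal usage sketch of hl.copy_log on its own (the bucket path is
# illustrative): it copies the log file configured in hl.init() to the given
# destination, which may be a file name or a directory prefix.
import hail as hl

hl.init(log='/tmp/example_hail.log')
# ... pipeline work ...
hl.copy_log('gs://example-bucket/logs/')  # hypothetical bucket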
def main(args):
    subsets = args.subsets
    hl.init(
        log=f"/generate_frequency_data{'.' + '_'.join(subsets) if subsets else ''}.log",
        default_reference="GRCh38",
    )

    invalid_subsets = []
    n_subsets_use_subpops = 0
    for s in subsets:
        if s not in SUBSETS:
            invalid_subsets.append(s)
        if s in COHORTS_WITH_POP_STORED_AS_SUBPOP:
            n_subsets_use_subpops += 1

    if invalid_subsets:
        raise ValueError(
            f"{', '.join(invalid_subsets)} subset(s) are not one of the following official subsets: {SUBSETS}"
        )
    # Use a logical `and` here: the original bitwise `&` evaluates to 0 whenever
    # n_subsets_use_subpops is even, silently skipping this check.
    if n_subsets_use_subpops and n_subsets_use_subpops != len(subsets):
        raise ValueError(
            f"All or none of the supplied subset(s) should be in the list of cohorts that need to use subpops instead "
            f"of pops in frequency calculations: {COHORTS_WITH_POP_STORED_AS_SUBPOP}"
        )

    try:
        logger.info("Reading full sparse MT and metadata table...")
        mt = get_gnomad_v3_mt(
            key_by_locus_and_alleles=True,
            release_only=not args.include_non_release,
            samples_meta=True,
        )

        if args.test:
            logger.info("Filtering to two partitions on chr20")
            mt = hl.filter_intervals(mt, [hl.parse_locus_interval("chr20:1-1000000")])
            mt = mt._filter_partitions(range(2))

        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

        if args.include_non_release:
            logger.info("Filtering MT columns to high quality samples")
            total_sample_count = mt.count_cols()
            mt = mt.filter_cols(mt.meta.high_quality)
            high_quality_sample_count = mt.count_cols()
            logger.info(
                f"Filtered {total_sample_count - high_quality_sample_count} samples from the full set of "
                f"{total_sample_count} samples..."
            )

        if subsets:
            mt = mt.filter_cols(hl.any([mt.meta.subsets[s] for s in subsets]))
            logger.info(
                f"Running frequency generation pipeline on {mt.count_cols()} samples in {', '.join(subsets)} subset(s)..."
            )
        else:
            logger.info(
                f"Running frequency generation pipeline on {mt.count_cols()} samples..."
            )

        logger.info("Computing adj and sex adjusted genotypes...")
        mt = mt.annotate_entries(
            GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.meta.sex_imputation.sex_karyotype),
            adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD),
        )

        logger.info("Densify-ing...")
        mt = hl.experimental.densify(mt)
        mt = mt.filter_rows(hl.len(mt.alleles) > 1)

        # Temporary hotfix for depletion of homozygous alternate genotypes
        logger.info(
            "Setting het genotypes at sites with >1% AF (using v3.0 frequencies) and > 0.9 AB to homalt..."
        )
        # Load v3.0 allele frequencies to avoid an extra frequency calculation
        # NOTE: Using previous callset AF works for small incremental changes to a callset,
        # but we will need to revisit for large increments
        freq_ht = get_freq(version="3").ht()
        freq_ht = freq_ht.select(AF=freq_ht.freq[0].AF)
        mt = mt.annotate_entries(
            GT=hl.if_else(  # hl.if_else replaces the deprecated hl.cond
                (freq_ht[mt.row_key].AF > 0.01)
                & mt.GT.is_het()
                & (mt.AD[1] / mt.DP > 0.9),
                hl.call(1, 1),
                mt.GT,
            )
        )

        logger.info("Generating frequency data...")
        if subsets:
            mt = annotate_freq(
                mt,
                sex_expr=mt.meta.sex_imputation.sex_karyotype,
                pop_expr=mt.meta.population_inference.pop
                if not n_subsets_use_subpops
                else mt.meta.project_meta.project_subpop,
                # NOTE: TGP and HGDP labeled populations are highly specific and are
                # stored in the project_subpop meta field
            )

            # NOTE: no FAFs or popmax needed for subsets
            mt = mt.select_rows("freq")

            logger.info(f"Writing out frequency data for {', '.join(subsets)} subset(s)...")
            if args.test:
                mt.rows().write(
                    get_checkpoint_path(f"chr20_test_freq.{'_'.join(subsets)}"),
                    overwrite=True,
                )
            else:
                mt.rows().write(
                    get_freq(subset="_".join(subsets)).path, overwrite=args.overwrite
                )

        else:
            logger.info("Computing age histograms for each variant...")
            mt = mt.annotate_cols(
                age=hl.if_else(
                    hl.is_defined(mt.meta.project_meta.age),
                    mt.meta.project_meta.age,
                    mt.meta.project_meta.age_alt,
                    # NOTE: most age data is stored as integers in the 'age' annotation,
                    # but for a select number of samples, age is stored as a bin range
                    # and 'age_alt' corresponds to an integer in the middle of the bin
                )
            )
            mt = mt.annotate_rows(**age_hists_expr(mt.adj, mt.GT, mt.age))

            # Compute callset-wide age histogram global
            mt = mt.annotate_globals(
                age_distribution=mt.aggregate_cols(hl.agg.hist(mt.age, 30, 80, 10))
            )

            mt = annotate_freq(
                mt,
                sex_expr=mt.meta.sex_imputation.sex_karyotype,
                pop_expr=mt.meta.population_inference.pop,
                downsamplings=DOWNSAMPLINGS,
            )
            # Remove all loci with raw AC=0
            mt = mt.filter_rows(mt.freq[1].AC > 0)

            logger.info("Calculating InbreedingCoeff...")
            # NOTE: This is not the ideal location to calculate this, but added here
            # to avoid another densify
            mt = mt.annotate_rows(InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

            logger.info("Computing filtering allele frequencies and popmax...")
            faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
            mt = mt.select_rows(
                "InbreedingCoeff",
                "freq",
                faf=faf,
                popmax=pop_max_expr(mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX),
            )
            mt = mt.annotate_globals(
                faf_meta=faf_meta, faf_index_dict=make_faf_index_dict(faf_meta)
            )
            mt = mt.annotate_rows(
                popmax=mt.popmax.annotate(
                    faf95=mt.faf[
                        mt.faf_meta.index(lambda x: x.values() == ["adj", mt.popmax.pop])
                    ].faf95
                )
            )

            logger.info("Annotating quality metrics histograms...")
            # NOTE: these are performed here as the quality metrics histograms also
            # require densifying
            mt = mt.annotate_rows(
                qual_hists=qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD, mt.adj)
            )
            ht = mt.rows()
            ht = ht.annotate(
                qual_hists=hl.Struct(
                    **{
                        i.replace("_adj", ""): ht.qual_hists[i]
                        for i in ht.qual_hists
                        if "_adj" in i
                    }
                ),
                raw_qual_hists=hl.Struct(
                    **{i: ht.qual_hists[i] for i in ht.qual_hists if "_adj" not in i}
                ),
            )

            logger.info("Writing out frequency data...")
            if args.test:
                ht.write(get_checkpoint_path("chr20_test_freq"), overwrite=True)
            else:
                ht.write(get_freq().path, overwrite=args.overwrite)

    finally:
        logger.info("Copying hail log to logging bucket...")
        hl.copy_log(f"{qc_temp_prefix()}logs/")
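# The hom-alt hotfix above, distilled: a het call at a common site (AF > 1%)
# with allele balance above 0.9 is rewritten as homozygous alternate. A
# self-contained sketch of the same transform (the function name is
# illustrative, not part of the gnomAD pipeline):
import hail as hl

def set_high_ab_hets_to_homalt(mt: hl.MatrixTable, af_expr) -> hl.MatrixTable:
    # af_expr is a float expression keyed to mt's rows,
    # e.g. freq_ht[mt.row_key].AF
    return mt.annotate_entries(
        GT=hl.if_else(
            (af_expr > 0.01) & mt.GT.is_het() & (mt.AD[1] / mt.DP > 0.9),
            hl.call(1, 1),
            mt.GT,
        )
    )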
# (the first add_argument call is truncated in the source; only its trailing
#  help text survives: help='Radius of window for LD matrix')
parser.add_argument('--ld-score-radius', type=int, default=int(1e6),  # int() so the default matches type=int
                    help='Radius of window for LD score')
parser.add_argument('--write-mt', action='store_true',
                    help='Write MatrixTable from bgen')
parser.add_argument('--write-bm', action='store_true',
                    help='Write BlockMatrix from MatrixTable')
parser.add_argument('--compute-ld-matrix', action='store_true',
                    help='Compute LD matrix')
parser.add_argument('--compute-ldscore', action='store_true',
                    help='Compute LD score')
parser.add_argument('--write-ldsc-hm3-snplist', action='store_true',
                    help='Write QCed HM3 snplist for ldsc')
parser.add_argument('--overwrite', action='store_true',
                    help='Overwrite data')
args = parser.parse_args()

atexit.register(lambda: hl.copy_log(
    timestamp_path(f'gs://ukb-diverse-pops/ld/{args.pop}/ld', suffix='.log')))

main(args)
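# Design note: registering the copy with atexit (above) uploads the log even
# if main() raises, the same guarantee the try/finally blocks in the other
# snippets give. For reference, a hypothetical sketch of the timestamp_path
# helper (the real one lives in the project's utilities; the name format here
# is an assumption):
import time

def timestamp_path(path, suffix=''):
    # timestamp_path('gs://bucket/ld/EUR/ld', suffix='.log')
    # -> 'gs://bucket/ld/EUR/ld-20240101-0000.log' (for example)
    return f"{path}-{time.strftime('%Y%m%d-%H%M')}{suffix}"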
def main(args):
    hl.init()

    # Read in all sumstats
    mt = load_final_sumstats_mt(filter_phenos=True,
                                filter_variants=False,
                                filter_sumstats=True,
                                separate_columns_by_pop=False,
                                annotate_with_nearest_gene=False)

    # Annotate per-entry sample size
    def get_n(pheno_data, i):
        return pheno_data[i].n_cases + hl.or_else(pheno_data[i].n_controls, 0)

    mt = mt.annotate_entries(summary_stats=hl.map(
        lambda x: x[1].annotate(N=hl.or_missing(hl.is_defined(x[1]),
                                                get_n(mt.pheno_data, x[0]))),
        hl.zip_with_index(mt.summary_stats)))

    # Exclude entries with low confidence flag.
    if not args.keep_low_confidence_variants:
        mt = mt.annotate_entries(summary_stats=hl.map(
            lambda x: hl.or_missing(~x.low_confidence, x), mt.summary_stats))

    # Run fixed-effect meta-analysis (all + leave-one-out)
    mt = mt.annotate_entries(
        unnorm_beta=mt.summary_stats.BETA / (mt.summary_stats.SE**2),
        inv_se2=1 / (mt.summary_stats.SE**2))
    mt = mt.annotate_entries(
        sum_unnorm_beta=all_and_leave_one_out(mt.unnorm_beta, mt.pheno_data.pop),
        sum_inv_se2=all_and_leave_one_out(mt.inv_se2, mt.pheno_data.pop))
    mt = mt.transmute_entries(
        META_BETA=mt.sum_unnorm_beta / mt.sum_inv_se2,
        META_SE=hl.map(lambda x: hl.sqrt(1 / x), mt.sum_inv_se2))
    mt = mt.annotate_entries(
        META_Pvalue=hl.map(lambda x: 2 * hl.pnorm(x),
                           -hl.abs(mt.META_BETA / mt.META_SE)))

    # Run heterogeneity test (Cochran's Q)
    mt = mt.annotate_entries(
        META_Q=hl.map(lambda x: hl.sum((mt.summary_stats.BETA - x)**2 * mt.inv_se2),
                      mt.META_BETA),
        variant_exists=hl.map(lambda x: ~hl.is_missing(x), mt.summary_stats.BETA))
    mt = mt.annotate_entries(
        META_N_pops=all_and_leave_one_out(mt.variant_exists, mt.pheno_data.pop))
    mt = mt.annotate_entries(META_Pvalue_het=hl.map(
        lambda i: hl.pchisqtail(mt.META_Q[i], mt.META_N_pops[i] - 1),
        hl.range(hl.len(mt.META_Q))))

    # Add other annotations
    mt = mt.annotate_entries(
        ac_cases=hl.map(lambda x: x["AF.Cases"] * x.N, mt.summary_stats),
        ac_controls=hl.map(lambda x: x["AF.Controls"] * x.N, mt.summary_stats),
        META_AC_Allele2=all_and_leave_one_out(
            mt.summary_stats.AF_Allele2 * mt.summary_stats.N, mt.pheno_data.pop),
        META_N=all_and_leave_one_out(mt.summary_stats.N, mt.pheno_data.pop))
    mt = mt.annotate_entries(
        META_AF_Allele2=mt.META_AC_Allele2 / mt.META_N,
        META_AF_Cases=all_and_leave_one_out(mt.ac_cases, mt.pheno_data.pop) / mt.META_N,
        META_AF_Controls=all_and_leave_one_out(mt.ac_controls, mt.pheno_data.pop) / mt.META_N)
    mt = mt.drop('unnorm_beta', 'inv_se2', 'variant_exists', 'ac_cases',
                 'ac_controls', 'summary_stats', 'META_AC_Allele2')

    # Format everything into array<struct>
    def is_finite_or_missing(x):
        return hl.or_missing(hl.is_finite(x), x)

    meta_fields = [
        'BETA', 'SE', 'Pvalue', 'Q', 'Pvalue_het', 'N', 'N_pops',
        'AF_Allele2', 'AF_Cases', 'AF_Controls'
    ]
    mt = mt.transmute_entries(meta_analysis=hl.map(
        lambda i: hl.struct(
            **{
                field: is_finite_or_missing(mt[f'META_{field}'][i])
                for field in meta_fields
            }),
        hl.range(hl.len(mt.META_BETA))))

    col_fields = ['n_cases', 'n_controls']
    mt = mt.annotate_cols(
        **{
            field: all_and_leave_one_out(mt.pheno_data[field], mt.pheno_data.pop)
            for field in col_fields
        })
    col_fields += ['pop']
    mt = mt.annotate_cols(pop=all_and_leave_one_out(
        mt.pheno_data.pop,
        mt.pheno_data.pop,
        all_f=lambda x: x,
        loo_f=lambda i, x: hl.filter(lambda y: y != x[i], x),
    ))
    mt = mt.transmute_cols(meta_analysis_data=hl.map(
        lambda i: hl.struct(**{field: mt[field][i] for field in col_fields}),
        hl.range(hl.len(mt.pop))))

    mt.describe()
    mt.write(get_meta_analysis_results_path(), overwrite=args.overwrite)

    hl.copy_log('gs://ukb-diverse-pops/combined_results/meta_analysis.log')
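# Hedged sketch of the all_and_leave_one_out helper the meta-analysis relies
# on (the real implementation lives in the project's utilities; this sketch is
# an assumption based on how it is called above): index 0 aggregates over all
# populations and index i+1 leaves population i out, which is what makes the
# META_* arrays line up with the leave-one-out column annotations.
import hail as hl

def all_and_leave_one_out(
    x,
    pop_array,
    all_f=hl.sum,
    loo_f=lambda i, x: hl.sum(x) - hl.or_else(x[i], 0),
):
    # Returns [f(all pops), f(without pop 0), f(without pop 1), ...]
    return hl.range(hl.len(pop_array) + 1).map(
        lambda i: hl.if_else(i == 0, all_f(x), loo_f(i - 1, x))
    )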