def pre_process_subset_freq(
    subset: str, global_ht: hl.Table, test: bool = False
) -> hl.Table:
    """
    Prepare subset frequency Table by filling in missing frequency fields for loci present only in the global cohort.

    .. note::

        The resulting final `freq` array will be as long as the subset `freq_meta` global (i.e., one `freq` entry for
        each `freq_meta` entry)

    :param subset: subset ID
    :param global_ht: Hail Table containing all variants discovered in the overall release cohort
    :param test: If True, filter to small region on chr20
    :return: Table containing subset frequencies with missing freq structs filled in
    :raises DataException: If no frequency Table can be found for the subset (in test mode: neither the chr20 test
        Table nor the full subset Table exists)
    """
    # Read in subset HTs
    subset_ht_path = get_freq(subset=subset).path
    subset_chr20_ht_path = qc_temp_prefix() + f"chr20_test_freq.{subset}.ht"

    if test and file_exists(subset_chr20_ht_path):
        # A pre-filtered chr20 test Table exists, so use it directly
        logger.info(
            "Loading chr20 %s subset frequency data for testing: %s",
            subset,
            subset_chr20_ht_path,
        )
        subset_ht = hl.read_table(subset_chr20_ht_path)
    elif file_exists(subset_ht_path):
        if test:
            # No chr20 test Table: fall back to the full Table and filter it to the test interval
            logger.info(
                "Loading %s subset frequency data for testing: %s",
                subset,
                subset_ht_path,
            )
            subset_ht = hl.read_table(subset_ht_path)
            subset_ht = hl.filter_intervals(
                subset_ht, [hl.parse_locus_interval("chr20:1-1000000")]
            )
        else:
            logger.info(
                "Loading %s subset frequency data: %s", subset, subset_ht_path
            )
            subset_ht = hl.read_table(subset_ht_path)
    else:
        # Reached in both test and non-test mode, so `subset_ht` is never left unassigned
        raise DataException(
            f"Hail Table containing {subset} subset frequencies not found. You may need to run the script generate_freq_data.py to generate frequency annotations first."
        )

    # Fill in missing freq structs
    # The right join keeps every key of `global_ht`; rows absent from the subset Table get a missing `freq`
    ht = subset_ht.join(global_ht.select().select_globals(), how="right")
    ht = ht.annotate(
        freq=hl.if_else(
            hl.is_missing(ht.freq),
            # One missing call stats struct per `freq_meta` entry
            hl.map(
                lambda x: missing_callstats_expr(),
                hl.range(hl.len(ht.freq_meta)),
            ),
            ht.freq,
        )
    )

    return ht
def get_rf_runs(rf_json_fp: str) -> Dict:
    """
    Load RF run data from a JSON file.

    :param rf_json_fp: File path to rf json file.
    :return: Parsed content of the JSON file, or an empty dictionary if the file does not exist.
    """
    # Guard clause: nothing to load, so warn and hand back an empty mapping
    if not file_exists(rf_json_fp):
        logger.warning(
            f"File {rf_json_fp} could not be found. Returning empty RF run hash dict."
        )
        return {}

    with hl.hadoop_open(rf_json_fp) as rf_json:
        rf_runs = json.load(rf_json)
    return rf_runs
def get_release_file(file_path: str, version: str = CURRENT_RELEASE) -> str:
    """
    Return the path of the requested file for a release version, falling back on earlier releases.

    If the file does not exist for `version`, the releases listed before `version` in `RELEASES` are tried from the
    closest one backwards, and the first existing path is returned. If no release has the file, an error is printed
    and the path for the requested version is returned anyway.

    :param str file_path: Desired file path, with {0} as placeholder(s) for the version number
    :param str version: Desired file version
    :return: Path for closest version of the file available
    """
    requested_path = file_path.format(version)
    if file_exists(requested_path):
        return requested_path

    # Releases that precede `version` in RELEASES, closest first
    for older_version in reversed(RELEASES[: RELEASES.index(version)]):
        fallback_path = file_path.format(older_version)
        if not file_exists(fallback_path):
            continue
        print(
            "WARN: Resource {} could not be found for gnomAD release version {}.\n"
            " Loading gnomAD release version {} of the file. ({})".format(
                requested_path, version, older_version, fallback_path
            )
        )
        return fallback_path

    print(
        "ERROR: Resource {} could not be found for any release.".format(
            requested_path
        )
    )
    return requested_path
def main(args):
    """
    Compute histograms of variant annotation metrics on the release Table and write them out as JSON.

    Histogram ranges are read from the JSON file at `annotation_hists_path()`. When `args.determine_bounds` is set,
    no histograms are written; the minimum and maximum (capped at 1e10) of each metric are aggregated and logged
    instead, to help choose the ranges stored in that JSON file.

    :param args: Parsed command line arguments; `determine_bounds` is the only attribute read here
    :raises DataException: If the annotation hists JSON file does not exist
    """
    hl.init(default_reference="GRCh38", log="/variant_histograms.log")

    logger.info("Loading ANNOTATIONS_HISTS dictionary...")
    if not file_exists(annotation_hists_path()):
        raise DataException(
            "Annotation hists JSON file not found. Need to create this JSON before running script!"
        )

    with hl.hadoop_open(annotation_hists_path()) as a:
        ANNOTATIONS_HISTS = json.loads(a.read())

    # NOTE: histogram aggregations on these metrics are done on the entire callset (not just PASS variants), on raw data
    ht = hl.read_table(release_ht_path(public=False))
    # Selected while InbreedingCoeff is still a key, so `ht.info.InbreedingCoeff` stays available below
    ht = ht.select(freq=ht.freq, info=ht.info.select(*ANNOTATIONS_HISTS))

    # Remove InbreedingCoeff from ANNOTATIONS_HISTS. It requires different ranges by allele frequency and needs to be
    # handled differently. It is stored as a dictionary in annotation_hists_path
    # ANNOTATIONS_HISTS is a dict (parsed JSON object), so the entry is removed with `pop`; dicts have no `remove`
    inbreeding_bin_ranges = ANNOTATIONS_HISTS.pop("InbreedingCoeff")

    logger.info("Getting info annotation histograms...")
    hist_ranges_expr = get_annotations_hists(ht, ANNOTATIONS_HISTS, LOG10_ANNOTATIONS)

    # Evaluate minimum and maximum values for each metric of interest to help determine the bounds of the hists
    # NOTE: Run this first, then update values in annotation_hists_path JSON as necessary
    if args.determine_bounds:
        logger.info(
            "Evaluating minimum and maximum values for each metric of interest. Maximum values capped at 1e10."
        )
        minmax_dict = {}
        for metric in ANNOTATIONS_HISTS:
            minmax_dict[metric] = hl.struct(
                min=hl.agg.min(ht.info[metric]),
                max=hl.if_else(
                    hl.agg.max(ht.info[metric]) < 1e10,
                    hl.agg.max(ht.info[metric]),
                    1e10,
                ),
            )
        minmax = ht.aggregate(hl.struct(**minmax_dict))
        logger.info(f"Metrics bounds: {minmax}")
    else:
        logger.info(
            "Aggregating hists over ranges defined in the annotation_hists_path JSON file. --determine_bounds can "
            "be used to help define these ranges..."
        )
        # One array holding: the per-metric hists, then QUALapprox and AS_QUALapprox hists grouped by frequency bin
        hists = ht.aggregate(
            hl.array(
                [
                    hist_expr.annotate(metric=hist_metric)
                    for hist_metric, hist_expr in hist_ranges_expr.items()
                ]
            )
            .extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(
                            hl.log10(ht.info.QUALapprox),
                            *ANNOTATIONS_HISTS["QUALapprox"],
                        ),
                    )
                ).map(lambda x: x[1].annotate(metric="QUALapprox-" + x[0]))
            )
            .extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(
                            hl.log10(ht.info.AS_QUALapprox),
                            *ANNOTATIONS_HISTS["AS_QUALapprox"],
                        ),
                    )
                ).map(lambda x: x[1].annotate(metric="AS_QUALapprox-" + x[0]))
            ),
            _localize=False,
        )

        # Defining hist range and bins for allele frequency groups because they needed different ranges
        ht = ht.annotate(af_bin=create_frequency_bins_expr_inbreeding(AF=ht.freq[1].AF))
        inbreeding_hists = [
            ht.aggregate(
                hl.agg.filter(
                    ht.af_bin == x,
                    hl.agg.hist(ht.info.InbreedingCoeff, *inbreeding_bin_ranges[x]),
                )
            ).annotate(metric="InbreedingCoeff" + "-" + x)
            for x in inbreeding_bin_ranges
        ]

        hists = hl.eval(hl.json(hists))
        inbreeding_hists = hl.eval(hl.json(inbreeding_hists))

        # Note: Both values are JSON arrays. The following drops the closing ']' from the JSON stored in hists and
        # the opening '[' from the JSON stored in inbreeding_hists, then joins them together to be written out as a
        # single JSON array
        hists = hists[:-1] + "," + inbreeding_hists[1:]

        logger.info("Writing output")
        with hl.hadoop_open(qual_hists_json_path(), "w") as f:
            f.write(hists)
def main(args):
    """
    Build the release sites Table from the global and subset frequency Tables.

    The global `freq` array and each subset's `freq` array are concatenated into a single `freq` annotation (with a
    matching concatenated `freq_meta` global), the remaining global frequency annotations are added back, release
    annotations are added, chrM and sites without filters are removed, and the result is checkpointed.

    :param args: Parsed command line arguments; reads `test`, `test_subsets` and `overwrite`
    :raises DataException: If no global frequency Table can be found (in test mode: neither the chr20 test Table nor
        the full Table exists)
    """
    hl.init(log="/create_release_ht.log", default_reference="GRCh38")

    # The concatenated HT contains all subset frequency annotations, plus the overall cohort frequency annotations,
    # concatenated together in a single freq annotation ('freq')

    # Load global frequency Table
    global_freq_path = get_freq().path
    global_freq_chr20_ht_path = "gs://gnomad-tmp/gnomad_freq/chr20_test_freq.ht"

    if args.test and file_exists(global_freq_chr20_ht_path):
        logger.info(
            "Loading chr20 global frequency data for testing: %s",
            global_freq_chr20_ht_path,
        )
        global_freq_ht = (
            hl.read_table(global_freq_chr20_ht_path)
            .select("freq")
            .select_globals("freq_meta")
        )
    elif file_exists(global_freq_path):
        if args.test:
            logger.info(
                "Loading global frequency data for testing: %s", global_freq_path
            )
        else:
            logger.info("Loading global frequency data: %s", global_freq_path)
        global_freq_ht = (
            hl.read_table(global_freq_path).select("freq").select_globals("freq_meta")
        )
        if args.test:
            # No chr20 test Table: filter the full Table to the test interval
            global_freq_ht = hl.filter_intervals(
                global_freq_ht, [hl.parse_locus_interval("chr20:1-1000000")]
            )
    else:
        # Reached in both test and non-test mode, so `global_freq_ht` is never left unassigned
        raise DataException(
            "Hail Table containing global callset frequencies not found. You may need to run the script to generate frequency annotations first."
        )

    # Load subset frequency Table(s)
    if args.test:
        test_subsets = args.test_subsets
        subset_freq_hts = [
            pre_process_subset_freq(subset, global_freq_ht, test=True)
            for subset in test_subsets
        ]
    else:
        subset_freq_hts = [
            pre_process_subset_freq(subset, global_freq_ht) for subset in SUBSETS
        ]

    logger.info("Concatenating subset frequencies...")
    freq_ht = hl.Table.multi_way_zip_join(
        [global_freq_ht] + subset_freq_hts,
        data_field_name="freq",
        global_field_name="freq_meta",
    )
    # The zip join yields one struct per input Table; flatten them into a single array
    freq_ht = freq_ht.transmute(freq=freq_ht.freq.flatmap(lambda x: x.freq))
    freq_ht = freq_ht.transmute_globals(
        freq_meta=freq_ht.freq_meta.flatmap(lambda x: x.freq_meta)
    )

    # Create frequency index dictionary on concatenated array (i.e., including all subsets)
    # NOTE: non-standard downsampling values are created in the frequency script corresponding to population totals, so
    # callset-specific DOWNSAMPLINGS must be used instead of the generic DOWNSAMPLING values
    # NOTE: this re-reads the global frequency Table with all of its row and global fields (also in test mode)
    global_freq_ht = hl.read_table(global_freq_path)
    freq_ht = freq_ht.annotate_globals(
        freq_index_dict=make_freq_index_dict(
            freq_meta=hl.eval(freq_ht.freq_meta),
            pops=POPS,
            downsamplings=hl.eval(global_freq_ht.downsamplings),
        )
    )

    # Add back in all global frequency annotations not present in concatenated frequencies HT
    row_fields = global_freq_ht.row_value.keys() - freq_ht.row_value.keys()
    logger.info(
        "Adding back the following row annotations onto concatenated frequencies: %s",
        row_fields,
    )
    freq_ht = freq_ht.annotate(**global_freq_ht[freq_ht.key].select(*row_fields))

    global_fields = global_freq_ht.globals.keys() - freq_ht.globals.keys()
    global_fields.remove("downsamplings")
    logger.info(
        "Adding back the following global annotations onto concatenated frequencies: %s",
        global_fields,
    )
    freq_ht = freq_ht.annotate_globals(
        **global_freq_ht.index_globals().select(*global_fields)
    )

    logger.info("Preparing release Table annotations...")
    ht = add_release_annotations(freq_ht)

    logger.info("Removing chrM and sites without filter...")
    ht = hl.filter_intervals(ht, [hl.parse_locus_interval("chrM")], keep=False)
    ht = ht.filter(hl.is_defined(ht.filters))

    # Test runs are written under the temp prefix; otherwise to the release sites resource path
    ht = ht.checkpoint(
        qc_temp_prefix() + "release/gnomad.genomes.v3.1.sites.chr20.ht"
        if args.test
        else release_sites().path,
        args.overwrite,
    )
    logger.info("Final variant count: %d", ht.count())
    ht.describe()
    ht.show()
    ht.summarize()
def main(args):
    """
    Create and write the final variant filter Table for the chosen filtering model.

    Loads the score bins Table for `args.model_id`, optionally removes telomere/centromere loci, drops sites flagged
    `AS_lowqual`, then passes frequency-based filter expressions and the requested cutoffs to
    `generate_final_filter_ht`. The training variables used by the model are recorded in the `filtering_model` global
    before the Table is written to `final_filter.path`.

    :param args: Parsed command line arguments
    """
    hl.init(log="/variant_qc_finalize.log")

    ht = get_score_bins(args.model_id, aggregated=False).ht()
    if args.filter_centromere_telomere:
        in_telomere_or_centromere = hl.is_defined(
            telomeres_and_centromeres.ht()[ht.locus]
        )
        ht = ht.filter(~in_telomere_or_centromere)

    info_ht = get_info(split=True).ht()
    ht = ht.filter(~info_ht[ht.key].AS_lowqual)

    is_vqsr_model = args.model_id.startswith("vqsr_")
    if is_vqsr_model:
        ht = ht.drop("info")

    freq_ht = get_freq().ht()
    ht = ht.annotate(InbreedingCoeff=freq_ht[ht.key].InbreedingCoeff)
    freq_idx = freq_ht[ht.key]

    aggregated_bin_path = get_score_bins(args.model_id, aggregated=True).path
    if not file_exists(aggregated_bin_path):
        sys.exit(
            f"Could not find binned HT for model: {args.model_id} ({aggregated_bin_path}). Please run create_ranked_scores.py for that hash."
        )
    aggregated_bin_ht = get_score_bins(args.model_id, aggregated=True).ht()

    # Frequency entries at index 0 and 1 of the `freq` array drive the frequency-based filters and flags
    freq_0 = freq_idx.freq[0]
    freq_1 = freq_idx.freq[1]
    if args.vqsr_model_id:
        vqsr_ht = get_vqsr_filters(args.vqsr_model_id, split=True).ht()
    else:
        vqsr_ht = None

    ht = generate_final_filter_ht(
        ht,
        args.model_name,
        args.score_name,
        ac0_filter_expr=freq_0.AC == 0,
        ts_ac_filter_expr=freq_1.AC == 1,
        mono_allelic_flag_expr=(freq_1.AF == 1) | (freq_1.AF == 0),
        snp_bin_cutoff=args.snp_bin_cutoff,
        indel_bin_cutoff=args.indel_bin_cutoff,
        snp_score_cutoff=args.snp_score_cutoff,
        indel_score_cutoff=args.indel_score_cutoff,
        inbreeding_coeff_cutoff=args.inbreeding_coeff_threshold,
        aggregated_bin_ht=aggregated_bin_ht,
        bin_id="bin",
        vqsr_ht=vqsr_ht,
    )

    ht = ht.annotate_globals(
        filtering_model=ht.filtering_model.annotate(model_id=args.model_id)
    )

    # VQSR models record fixed feature lists; other models record the Table's `features` global
    if is_vqsr_model:
        snv_training_variables = [
            "AS_QD",
            "AS_MQRankSum",
            "AS_ReadPosRankSum",
            "AS_FS",
            "AS_SOR",
            "AS_MQ",
        ]
        indel_training_variables = [
            "AS_QD",
            "AS_MQRankSum",
            "AS_ReadPosRankSum",
            "AS_FS",
            "AS_SOR",
        ]
    else:
        snv_training_variables = ht.features
        indel_training_variables = ht.features

    ht = ht.annotate_globals(
        filtering_model=ht.filtering_model.annotate(
            snv_training_variables=snv_training_variables,
            indel_training_variables=indel_training_variables,
        )
    )

    ht.write(final_filter.path, args.overwrite)

    # Read the written Table back and print its summary
    final_filter_ht = final_filter.ht()
    final_filter_ht.summarize()
def main(args):
    """
    Run the pedigree check: compute (or reload) pairwise kinship and write per-project annotated kinship files.

    The MatrixTable is filtered, LD pruned, remapped to the pedigree's sample names and checkpointed. Identity by
    descent is then either computed and exported, or (when `args.run_ibd` is False) read back from a previous export.
    The kinship Table is restricted to pairs involving pedigree samples, summary counts are written to a text file,
    given and inferred sex are compared, and one annotated kinship file is exported per seqr project.

    :param args: Parsed command line arguments; reads `output_dir`, `output_name`, `inferred_sex`, `mt_path`,
        `input_pedigree`, `gnomad_ld`, `run_ibd` and `filter_kinship_ht`
    """
    output_dir = args.output_dir
    output_name = args.output_name
    inferred_sex = args.inferred_sex
    mt_path = args.mt_path
    input_pedigree = args.input_pedigree
    gnomad_ld = args.gnomad_ld
    run_ibd = args.run_ibd
    filter_kinship_ht = args.filter_kinship_ht

    # Location of the exported IBD results, written when run_ibd is set and read back below
    kinship_path = f"{output_dir}/{output_name}_ibd_kinship.tsv"

    logger.info("Reading in inputs...")
    mt = hl.read_matrix_table(mt_path)
    pedigree = hl.import_table(input_pedigree, impute=True)

    # Infer build of the MatrixTable
    build = get_reference_genome(mt.locus).name

    logger.info(
        "Filtering to biallelic SNVs on autosomes and performing LD pruning..."
    )
    mt = filter_rows_for_qc(
        mt, min_af=0.001, min_callrate=0.99, apply_hard_filters=False
    )
    mt = ld_prune(mt, build, gnomad_ld)
    out_mt = f"{output_dir}/{output_name}_processed_mt.mt"

    logger.info("Remapping sample names...")
    mt, sex_ht = remap_samples(mt_path, mt, pedigree, inferred_sex)

    mt = mt.checkpoint(out_mt, overwrite=True)

    if run_ibd:
        logger.info("Running identity by descent...")
        ibd_results_ht = hl.identity_by_descent(mt, maf=mt.AF, min=0.10, max=1.0)
        # Flatten the nested `ibd` struct into top-level columns and drop the ibs counts before export
        ibd_results_ht = ibd_results_ht.annotate(
            ibd0=ibd_results_ht.ibd.Z0,
            ibd1=ibd_results_ht.ibd.Z1,
            ibd2=ibd_results_ht.ibd.Z2,
            pi_hat=ibd_results_ht.ibd.PI_HAT,
        ).drop("ibs0", "ibs1", "ibs2", "ibd")
        ibd_results_ht.export(kinship_path)
    else:
        logger.warning("Skipping IBD - using previous calculations...")
        if not file_exists(kinship_path):
            # Only a warning is logged here; the import below is still attempted
            logger.warning(
                "IBD calculation was skipped but no file with previous calculations was found..."
            )

    logger.info("Reading in kinship ht...")
    kin_ht = hl.import_table(kinship_path, impute=True)

    # Subset MatrixTable and sex ht to the samples in the pedigree
    _, sex_ht, expected_samples, vcf_samples = subset_samples(
        mt, pedigree, sex_ht, output_dir, output_name
    )

    # Subset Table to the samples in the pedigree
    subset = hl.set(expected_samples)
    kin_ht = kin_ht.filter(subset.contains(kin_ht.i) | subset.contains(kin_ht.j))

    # Key the Table
    kin_ht = kin_ht.key_by("i", "j")

    # Setup output file; the context manager closes it even if filtering or counting fails
    with hl.hadoop_open(
        f"{output_dir}/{output_name}_ped_check_summary.txt", "w"
    ) as out_summary:
        if filter_kinship_ht:
            logger.info(
                "Filtering kinship table to remove unrelated individuals from analysis..."
            )
            kin_ht = filter_kin_ht(kin_ht, out_summary)

        # Output basic stats
        out_summary.write(
            "Number individuals in pedigree: " + str(len(expected_samples)) + "\n"
        )
        out_summary.write(
            "Number individuals in subset from the VCF: "
            + str(len(vcf_samples))
            + "\n"
        )
        out_summary.write(
            "Number of relationships in the kinship table: "
            + str(kin_ht.count())
            + "\n\n"
        )

    seqr_projects, family_ids, _ = write_functional_pedigree(
        input_pedigree, vcf_samples, output_dir, output_name
    )

    # Compare inferred and given sex
    check_sex(sex_ht, output_dir, output_name)

    kin_ht = add_project_and_family_annotations(kin_ht, seqr_projects, family_ids)

    logger.info("Writing kinship ht per project...")
    # Output original ht per project
    for project in set(seqr_projects.values()):
        # Keep pairs where either sample belongs to this project
        full_ht = kin_ht.filter(
            (kin_ht.seqr_proj_i == project) | (kin_ht.seqr_proj_j == project)
        )
        full_ht.drop("seqr_proj_i", "seqr_proj_j").export(
            f"{output_dir}/{project}/{output_name}_{project}_annotated_kin.txt"
        )