def get_gnomad_v3_mt(
    split: bool = False,
    key_by_locus_and_alleles: bool = False,
    remove_hard_filtered_samples: bool = True,
    release_only: bool = False,
    samples_meta: bool = False,
) -> hl.MatrixTable:
    """
    Wrapper function to get gnomAD v3 data with the desired filtering and metadata annotations.

    :param split: Perform split on MT - Note: this will perform a split on the MT rather than grab an already split MT
    :param key_by_locus_and_alleles: Whether to key the MatrixTable by locus and alleles (only needed for v3)
    :param remove_hard_filtered_samples: Whether to remove samples that failed hard filters (only relevant after sample QC)
    :param release_only: Whether to filter the MT to only samples available for release (can only be used if metadata is present)
    :param samples_meta: Whether to add metadata to the MT in a 'meta' column
    :return: gnomAD v3 dataset with chosen annotations and filters
    """
    mt = gnomad_v3_genotypes.mt()
    if key_by_locus_and_alleles:
        # Prevents Hail from running a sort on the genotype MT, which is already
        # sorted by a unique locus.
        mt = hl.MatrixTable(
            hl.ir.MatrixKeyRowsBy(mt._mir, ["locus", "alleles"], is_sorted=True)
        )
    if remove_hard_filtered_samples:
        mt = mt.filter_cols(hl.is_missing(hard_filtered_samples.ht()[mt.col_key]))

    if samples_meta:
        mt = mt.annotate_cols(meta=meta.ht()[mt.col_key])
        if release_only:
            mt = mt.filter_cols(mt.meta.release)
    elif release_only:
        mt = mt.filter_cols(meta.ht()[mt.col_key].release)

    if split:
        mt = mt.annotate_rows(
            n_unsplit_alleles=hl.len(mt.alleles),
            mixed_site=(hl.len(mt.alleles) > 2)
            & hl.any(lambda a: hl.is_indel(mt.alleles[0], a), mt.alleles[1:])
            & hl.any(lambda a: hl.is_snp(mt.alleles[0], a), mt.alleles[1:]),
        )
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    return mt
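# Usage sketch (not part of the pipeline; assumes the gnomad_qc v3 resource
# imports above and read access to the underlying data):
def example_load_release_mt() -> hl.MatrixTable:
    # Split multiallelics, keep release samples only, and attach sample metadata
    # in a 'meta' column.
    return get_gnomad_v3_mt(
        split=True,
        key_by_locus_and_alleles=True,
        release_only=True,
        samples_meta=True,
    )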
def run_mendel_errors() -> hl.Table:
    meta_ht = meta.ht()
    ped = pedigree.versions[f"{CURRENT_RELEASE}_raw"].pedigree()
    logger.info(f"Running Mendel errors for {len(ped.trios)} trios.")

    # Create 100 fake trios from a random ~1% of samples passing hard filters and
    # QC metrics filters; these provide a baseline Mendel error rate for comparison.
    fake_ped = create_fake_pedigree(
        n=100,
        sample_list=list(
            meta_ht.aggregate(
                hl.agg.filter(
                    hl.rand_bool(0.01)
                    & (hl.len(meta_ht.qc_metrics_filters) == 0)
                    & hl.or_else(hl.len(meta_ht.hard_filters) == 0, False),
                    hl.agg.collect_as_set(meta_ht.s),
                )
            )
        ),
        real_pedigree=ped,
    )
    merged_ped = hl.Pedigree(trios=ped.trios + fake_ped.trios)

    ped_samples = hl.literal(
        {s for trio in merged_ped.trios for s in [trio.s, trio.pat_id, trio.mat_id]}
    )
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.filter_cols(ped_samples.contains(mt.s))
    # Restrict to chr20 to keep the computation tractable.
    mt = hl.filter_intervals(
        mt, [hl.parse_locus_interval("chr20", reference_genome="GRCh38")]
    )
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    mt = mt.select_entries("GT", "END")
    # Densify the sparse MT and keep bi-allelic sites only.
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)

    mendel_errors, _, _, _ = hl.mendel_errors(mt["GT"], merged_ped)
    return mendel_errors
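# Hypothetical driver for the step above (the checkpoint path is a placeholder,
# not a real pipeline resource): persist the per-error table returned by
# hl.mendel_errors and summarize error counts per sample.
def example_mendel_errors_summary() -> hl.Table:
    mendel_err_ht = run_mendel_errors()
    mendel_err_ht = mendel_err_ht.checkpoint(
        "gs://gnomad-tmp/mendel_errors_chr20.ht", overwrite=True
    )
    # The first table returned by hl.mendel_errors has one row per error, with
    # the sample ID in 's'.
    return mendel_err_ht.group_by(mendel_err_ht.s).aggregate(
        n_mendel_errors=hl.agg.count()
    )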
def run_infer_families() -> hl.Pedigree:
    logger.info("Inferring families")
    ped = infer_families(get_relatedness_annotated_ht(), sex.ht(), duplicates.ht())

    # Remove all trios containing any QC-filtered sample.
    meta_ht = meta.ht()
    filtered_samples = meta_ht.aggregate(
        hl.agg.filter(
            (hl.len(meta_ht.qc_metrics_filters) > 0)
            | hl.or_else(hl.len(meta_ht.hard_filters) > 0, False),
            hl.agg.collect_as_set(meta_ht.s),
        )
    )

    return hl.Pedigree(
        trios=[
            trio
            for trio in ped.trios
            if trio.s not in filtered_samples
            and trio.pat_id not in filtered_samples
            and trio.mat_id not in filtered_samples
        ]
    )
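# Usage sketch (hypothetical output path): persist the QC-filtered pedigree as a
# PLINK-style .fam file via hl.Pedigree.write.
def example_write_inferred_families() -> None:
    ped = run_infer_families()
    ped.write("gs://gnomad-tmp/v3_inferred_families.fam")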
def main(args):
    if args.join_qc_mt:
        # Lift the LD-pruned v2 exomes QC MT over to GRCh38 and union it with the
        # v3 genomes QC MT, tagging columns with their data type so samples from
        # the two callsets remain distinguishable.
        v2_qc_mt_liftover = get_liftover_v2_qc_mt(
            "exomes", ld_pruned=True, release_only=True
        )
        v2_qc_mt_liftover = v2_qc_mt_liftover.key_cols_by(
            s=v2_qc_mt_liftover.s, data_type="v2_exomes"
        )
        v3_qc_mt = qc.mt()
        v3_qc_mt = v3_qc_mt.filter_cols(meta.ht()[v3_qc_mt.col_key].release)
        v3_qc_mt = v3_qc_mt.select_rows().select_cols()
        v3_qc_mt = v3_qc_mt.key_cols_by(s=v3_qc_mt.s, data_type="v3_genomes")
        joint_qc_mt = v2_qc_mt_liftover.union_cols(v3_qc_mt)
        joint_qc_mt.write(
            "gs://gnomad-tmp/v2_exomes_v3_joint_qc.mt", overwrite=args.overwrite
        )

    if args.run_pc_relate:
        logger.info("Running PC-Relate")
        logger.warning(
            "PC-relate requires SSDs and doesn't work with preemptible workers!"
        )
        joint_qc_mt = hl.read_matrix_table("gs://gnomad-tmp/v2_exomes_v3_joint_qc.mt")
        # Downsample to 10% of rows for the PCA used to correct for population
        # structure in PC-Relate.
        joint_qc_mt = joint_qc_mt.sample_rows(0.1)
        eig, scores, _ = hl.hwe_normalized_pca(
            joint_qc_mt.GT, k=10, compute_loadings=False
        )
        scores = scores.checkpoint(
            v2_v3_pc_relate_pca_scores.path,
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite,
        )
        relatedness_ht = hl.pc_relate(
            joint_qc_mt.GT,
            min_individual_maf=0.01,
            scores_expr=scores[joint_qc_mt.col_key].scores,
            block_size=4096,
            min_kinship=0.1,
            statistics="all",
        )
        relatedness_ht.write(v2_v3_relatedness.path, args.overwrite)
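# In gnomad_qc, each main() lives in its own script. The flags referenced above
# suggest CLI wiring along these lines (a sketch; the help strings are assumptions):
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--join_qc_mt",
        action="store_true",
        help="Union the v2 exomes and v3 genomes QC MTs.",
    )
    parser.add_argument(
        "--run_pc_relate",
        action="store_true",
        help="Run PC-Relate on the joint QC MT.",
    )
    parser.add_argument(
        "--overwrite", action="store_true", help="Overwrite existing outputs."
    )
    main(parser.parse_args())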
def main(args):
    hl.init(default_reference="GRCh38")
    coverage_version = (
        args.coverage_version
        if args.coverage_version
        else CURRENT_GENOME_COVERAGE_RELEASE
    )
    logger = logging.getLogger("gnomad_qc.v3.load_data.compute_coverage")
    logger.warning(
        "Last time this was run (July 2020), this script required high-mem machines."
    )

    if args.compute_coverage_ht:
        logger.info("Building reference context HT")
        # One row per reference base on the primary contigs, excluding telomeres
        # and centromeres.
        ref_ht = get_reference_ht(
            hl.get_reference("GRCh38"),
            contigs=[f"chr{x}" for x in range(1, 23)] + ["chrX", "chrY"],
            excluded_intervals=telomeres_and_centromeres.ht().interval.collect(),
        )
        ref_ht = ref_ht.checkpoint("gs://gnomad-tmp/ref_context.ht", overwrite=True)
        logger.info("Done building reference context HT")

        mt = get_gnomad_v3_mt()
        mt = mt.filter_cols(meta.ht()[mt.col_key].release)

        coverage_ht = compute_coverage_stats(mt, ref_ht)
        coverage_ht = coverage_ht.checkpoint(
            "gs://gnomad-tmp/gnomad.genomes_v3.coverage.summary.ht", overwrite=True
        )
        coverage_ht = coverage_ht.naive_coalesce(5000)
        coverage_ht.write(
            coverage("genomes").versions[coverage_version].path,
            overwrite=args.overwrite,
        )

    if args.export_coverage:
        ht = coverage("genomes").versions[coverage_version].ht()
        if "count_array" in ht.row_value:
            # Note that count_array isn't computed any more, so this is v3.0-specific.
            ht = ht.drop("count_array")
        ht.export(coverage_tsv_path("genomes", coverage_version))
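# Quick sanity check on a finished coverage HT (a sketch; assumes the coverage
# resource exists for the given version):
def example_inspect_coverage(coverage_version: str) -> None:
    ht = coverage("genomes").versions[coverage_version].ht()
    ht.describe()
    ht.show(5)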
def main(args):
    hl.init(log="/frequency_data_generation.log", default_reference="GRCh38")

    logger.info("Reading sparse MT and metadata table...")
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    meta_ht = meta.ht().select("pop", "sex", "project_id", "release", "sample_filters")

    if args.test:
        logger.info("Filtering to chr20:1-1000000")
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval("chr20:1-1000000")])

    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    logger.info("Annotating sparse MT with metadata...")
    mt = mt.annotate_cols(meta=meta_ht[mt.s])
    mt = mt.filter_cols(mt.meta.release)
    samples = mt.count_cols()
    logger.info(
        f"Running frequency table prep and generation pipeline on {samples} samples"
    )

    logger.info("Computing adj and sex adjusted genotypes.")
    mt = mt.annotate_entries(
        GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.meta.sex),
        adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD),
    )

    logger.info("Densify-ing...")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)

    logger.info(
        "Setting het genotypes at sites with >1% AF (using v3.0 frequencies) and > 0.9 AB to homalt..."
    )
    # Hotfix for depletion of homozygous alternate genotypes.
    # Using v3.0 AF to avoid an extra frequency calculation.
    # TODO: Using the previous callset AF works for small incremental changes to a
    # callset, but this needs to be revisited for large increments.
    freq_ht = freq.versions["3"].ht()
    freq_ht = freq_ht.select(AF=freq_ht.freq[0].AF)
    mt = mt.annotate_entries(
        GT=hl.cond(
            (freq_ht[mt.row_key].AF > 0.01)
            & mt.GT.is_het()
            & (mt.AD[1] / mt.DP > 0.9),
            hl.call(1, 1),
            mt.GT,
        )
    )

    logger.info("Calculating InbreedingCoefficient...")
    # NOTE: This is not the ideal location to calculate this, but it is done here
    # to avoid another densify.
    mt = mt.annotate_rows(InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    logger.info("Generating frequency data...")
    mt = annotate_freq(mt, sex_expr=mt.meta.sex, pop_expr=mt.meta.pop)

    # Select freq, FAF, and popmax.
    faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
    mt = mt.select_rows(
        "InbreedingCoeff",
        "freq",
        faf=faf,
        popmax=pop_max_expr(mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX),
    )
    mt = mt.annotate_globals(faf_meta=faf_meta)

    # Annotate quality metrics histograms, as these also require densifying.
    mt = mt.annotate_rows(**qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD))

    logger.info("Writing out frequency data...")
    if args.test:
        mt.rows().write(
            "gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht", overwrite=True
        )
    else:
        mt.rows().write(freq.path, overwrite=args.overwrite)
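# Sketch for inspecting the test-mode output written above (the path comes from
# the --test branch of main):
def example_inspect_test_freq() -> None:
    ht = hl.read_table("gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht")
    ht.describe()
    ht.select("freq").show(5)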