def generate_ac(mt: hl.MatrixTable) -> hl.Table:
    """
    Build a per-variant Table of alternate-allele counts for several sample groupings.

    Columns are first restricted to samples with `meta.high_quality` set, rows to
    sites with more than one allele, and entries are annotated with `adj`.

    The returned Table carries the following row annotations:

        - `ac_qc_samples_raw`: Allele count of high quality samples
        - `ac_qc_samples_unrelated_raw`: Allele count of high quality unrelated samples
        - `ac_release_samples_raw`: Allele count of release samples
        - `ac_qc_samples_adj`: Allele count of high quality samples after adj filtering
        - `ac_qc_samples_unrelated_adj`: Allele count of high quality unrelated samples after adj filtering
        - `ac_release_samples_adj`: Allele count of release samples after adj filtering

    :param mt: Input MatrixTable
    :return: Table containing allele counts
    """
    hq_mt = mt.filter_cols(mt.meta.high_quality)
    multi_allele_mt = hq_mt.filter_rows(hl.len(hq_mt.alleles) > 1)
    adj_mt = annotate_adj(multi_allele_mt)

    def _alt_allele_count(keep=None):
        # A fresh aggregation is built on every call so that each annotation
        # owns its own expression, exactly as if it had been written inline.
        total = hl.agg.sum(adj_mt.GT.n_alt_alleles())
        if keep is None:
            return total
        return hl.agg.filter(keep, total)

    sample_filters = adj_mt.meta.sample_filters
    counted_mt = adj_mt.annotate_rows(
        ac_qc_samples_raw=_alt_allele_count(),
        ac_qc_samples_unrelated_raw=_alt_allele_count(
            ~sample_filters.all_samples_related
        ),
        ac_release_samples_raw=_alt_allele_count(adj_mt.meta.release),
        ac_qc_samples_adj=_alt_allele_count(adj_mt.adj),
        ac_qc_samples_unrelated_adj=_alt_allele_count(
            ~sample_filters.all_samples_related & adj_mt.adj
        ),
        ac_release_samples_adj=_alt_allele_count(
            adj_mt.meta.release & adj_mt.adj
        ),
    )
    return counted_mt.rows()
def generate_ac(mt: hl.MatrixTable, fam_file: str) -> hl.Table:
    """
    Build a Table of raw and adj allele counts for QC, unrelated-QC and release samples.

    Columns are restricted to samples with `meta.high_quality` set, rows to sites
    with more than one allele, and entries are annotated with `adj` before the
    counts are aggregated.

    :param mt: Input MatrixTable
    :param fam_file: Path to a tab-delimited fam file, loaded with `hl.import_fam`
    :return: Rows Table carrying the six `ac_*` annotations
    """
    high_quality_mt = mt.filter_cols(mt.meta.high_quality)

    # NOTE(review): `unrelated_sample` is annotated here but none of the
    # aggregations below read it; the "unrelated" counts are driven by
    # `meta.all_samples_related` instead. Confirm which one is intended.
    pedigree_ht = hl.import_fam(fam_file, delimiter="\t")
    high_quality_mt = high_quality_mt.annotate_cols(
        unrelated_sample=hl.is_missing(pedigree_ht[high_quality_mt.s])
    )

    variant_mt = high_quality_mt.filter_rows(
        hl.len(high_quality_mt.alleles) > 1
    )
    mt = annotate_adj(variant_mt)

    def _n_alt_sum():
        # Rebuilt per use so every annotation gets its own aggregation node.
        return hl.agg.sum(mt.GT.n_alt_alleles())

    ac_annotations = {
        "ac_qc_samples_raw": _n_alt_sum(),
        "ac_qc_samples_unrelated_raw": hl.agg.filter(
            ~mt.meta.all_samples_related, _n_alt_sum()
        ),
        "ac_release_samples_raw": hl.agg.filter(
            mt.meta.release, _n_alt_sum()
        ),
        "ac_qc_samples_adj": hl.agg.filter(mt.adj, _n_alt_sum()),
        "ac_qc_samples_unrelated_adj": hl.agg.filter(
            ~mt.meta.all_samples_related & mt.adj, _n_alt_sum()
        ),
        "ac_release_samples_adj": hl.agg.filter(
            mt.meta.release & mt.adj, _n_alt_sum()
        ),
    }
    return mt.annotate_rows(**ac_annotations).rows()
def generate_sib_stats(
    mt: hl.MatrixTable,
    relatedness_ht: hl.Table,
    i_col: str = "i",
    j_col: str = "j",
    relationship_col: str = "relationship",
    autosomes_only: bool = True,
    bi_allelic_only: bool = True,
) -> hl.Table:
    """
    Default wrapper around `generate_sib_stats_expr`.

    Returns a Table with counts of variants shared by the sibling pairs found
    in `relatedness_ht`.

    `relatedness_ht` holds one row per pair of samples i, j (unrelated pairs may
    be present as well). `relationship_col` names the field describing how the
    two samples are related; only rows whose value equals `SIBLINGS` are used.

    .. note::

        By default this pipeline function will filter `mt` to only autosomes
        and bi-allelic sites.

    :param mt: Input Matrix table
    :param relatedness_ht: Input relationship table
    :param i_col: Column containing the 1st sample of the pair in the relationship table
    :param j_col: Column containing the 2nd sample of the pair in the relationship table
    :param relationship_col: Column containing the relationship for the sample pair
    :param autosomes_only: If set, only autosomal intervals are used.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation
    :return: A Table with the sibling shared variant counts
    """
    if autosomes_only:
        mt = filter_to_autosomes(mt)
    if bi_allelic_only:
        mt = mt.filter_rows(bi_allelic_expr(mt))

    is_sibling_pair = relatedness_ht[relationship_col] == SIBLINGS
    sib_ht = relatedness_ht.filter(is_sibling_pair)

    # Gather every sample id appearing on either side of a sibling pair; the
    # result is kept as a Hail expression (`_localize=False`).
    pair_members = [sib_ht[i_col].s, sib_ht[j_col].s]
    sibling_samples = sib_ht.aggregate(
        hl.agg.explode(
            lambda sample: hl.agg.collect_as_set(sample), pair_members
        ),
        _localize=False,
    )
    mt = mt.filter_cols(sibling_samples.contains(mt.s))

    if "adj" not in mt.entry:
        mt = annotate_adj(mt)

    strata = {"raw": True, "adj": mt.adj}
    sib_stats_expr = generate_sib_stats_expr(
        mt, sib_ht, i_col=i_col, j_col=j_col, strata=strata
    )
    return mt.select_rows(**sib_stats_expr).rows()
def filter_to_adj(mt: hl.MatrixTable) -> hl.MatrixTable:
    """
    Keep only the genotypes that pass the adj criteria.

    If the entries carry no `adj` field it is added with `annotate_adj` first.
    The `adj` field is dropped from the returned MatrixTable.

    :param mt: Input MatrixTable
    :return: MatrixTable filtered to adj entries, without the `adj` field
    """
    has_adj = "adj" in list(mt.entry)
    if not has_adj:
        mt = annotate_adj(mt)

    adj_only_mt = mt.filter_entries(mt.adj)
    return adj_only_mt.drop(adj_only_mt.adj)
def main(args):
    """
    Write the hardcalls and non-ref-only MatrixTables selected by the flags in `args`.

    Each of `write_hardcalls`, `split_hardcalls`, `write_nonrefs` and
    `split_nonrefs` triggers an independent read / transform / write step.

    :param args: Parsed arguments; reads `genomes`, `exomes`, `overwrite` and
        the four step flags listed above
    """
    hl.init()

    if args.genomes:
        data_type = 'genomes'
    else:
        data_type = 'exomes'

    if args.write_hardcalls:
        raw_mt = get_gnomad_data(
            data_type, split=False, raw=True, meta_root=None
        )
        hard_filters_ht = hl.read_table(qc_ht_path(data_type, 'hard_filters'))

        # Keep only the sex column annotation, then add adj.
        sex_expr = hard_filters_ht[hl.literal(data_type), raw_mt.s].sex
        hardcalls_mt = annotate_adj(raw_mt.select_cols(sex=sex_expr))

        # Use PGT as the genotype when it encodes the same call as GT.
        pgt_matches_gt = (
            hl.call(hardcalls_mt.PGT[0], hardcalls_mt.PGT[1])
            == hardcalls_mt.GT
        )
        genotype_expr = (
            hl.case(missing_false=True)
            .when(pgt_matches_gt, hardcalls_mt.PGT)
            .default(hardcalls_mt.GT)
        )
        hardcalls_mt = hardcalls_mt.select_entries(
            GT=genotype_expr, PID=hardcalls_mt.PID, adj=hardcalls_mt.adj
        )

        hardcalls_mt = adjust_sex_ploidy(hardcalls_mt, hardcalls_mt.sex)
        hardcalls_mt = hardcalls_mt.select_cols().naive_coalesce(10000)
        hardcalls_path = get_gnomad_data_path(
            data_type, hardcalls=True, split=False
        )
        hardcalls_mt.write(hardcalls_path, args.overwrite)

    if args.split_hardcalls:
        unsplit_mt = get_gnomad_data(data_type, split=False, meta_root=None)
        split_mt = hl.split_multi_hts(unsplit_mt)
        split_hardcalls_path = get_gnomad_data_path(
            data_type, hardcalls=True, split=True
        )
        split_mt.write(split_hardcalls_path, args.overwrite)

    if args.write_nonrefs:  # CPU-hours: 600 (E)
        nonref_mt = get_gnomad_data(
            data_type, split=False, raw=True, meta_root=None
        ).select_cols()
        # Record missingness explicitly so missing genotypes survive the filter.
        nonref_mt = nonref_mt.annotate_entries(
            is_missing=hl.is_missing(nonref_mt.GT)
        )
        nonref_mt = nonref_mt.filter_entries(
            nonref_mt.is_missing | nonref_mt.GT.is_non_ref()
        )
        nonref_mt = annotate_adj(nonref_mt)
        if args.exomes:
            nonref_mt = nonref_mt.naive_coalesce(10000)
        nonref_path = get_gnomad_data_path(
            data_type, split=False, non_refs_only=True
        )
        nonref_mt.write(nonref_path, args.overwrite)

    if args.split_nonrefs:  # CPU-hours: 300 (E)
        unsplit_nonref_mt = get_gnomad_data(
            data_type, split=False, non_refs_only=True
        )
        split_nonref_mt = hl.split_multi_hts(unsplit_nonref_mt)
        # Re-apply the missing / non-ref filter on the split genotypes.
        split_nonref_mt = split_nonref_mt.filter_entries(
            split_nonref_mt.is_missing | split_nonref_mt.GT.is_non_ref()
        )
        split_nonref_path = get_gnomad_data_path(
            data_type, split=True, non_refs_only=True
        )
        split_nonref_mt.write(split_nonref_path, args.overwrite)
def generate_fam_stats(
        mt: hl.MatrixTable,
        fam_file: str
) -> hl.Table:
    """
    Compute transmission and de novo mutation statistics from the trios in the dataset.

    The MatrixTable is restricted to samples that appear in the pedigree, to
    autosomes, and (after densifying) to sites with exactly two alleles. Stats
    are produced for a `raw` and an `adj` stratum, the latter requiring all
    three trio members to be adj.

    :param mt: Input MatrixTable
    :param fam_file: path to text file containing trio pedigree
    :return: Table containing trio stats, limited to rows where the raw de novo,
        transmitted and untransmitted counts sum to more than zero
    """
    ped = hl.Pedigree.read(fam_file, delimiter="\t")

    # One row per distinct sample id that appears anywhere in the fam file.
    members_ht = hl.import_fam(fam_file, delimiter="\t")
    member_ids = [members_ht.id, members_ht.pat_id, members_ht.mat_id]
    members_ht = members_ht.annotate(fam_members=member_ids)
    members_ht = members_ht.explode('fam_members', name='s')
    members_ht = members_ht.key_by('s').select().distinct()

    in_pedigree = hl.is_defined(members_ht[mt.col_key])
    mt = mt.filter_cols(in_pedigree)
    logger.info(f"Generating family stats using {mt.count_cols()} samples from {len(ped.trios)} trios.")

    mt = filter_to_autosomes(mt)
    mt = annotate_adj(mt)
    mt = mt.select_entries('GT', 'GQ', 'AD', 'END', 'adj')
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)

    trio_mt = hl.trio_matrix(mt, pedigree=ped, complete_trios=True)
    all_members_adj = (
        trio_mt.proband_entry.adj
        & trio_mt.father_entry.adj
        & trio_mt.mother_entry.adj
    )

    transmitted_strata = {'raw': True, 'adj': all_members_adj}
    de_novo_strata = {'raw': True, 'adj': all_members_adj}
    trio_stats_expr = generate_trio_stats_expr(
        trio_mt,
        transmitted_strata=transmitted_strata,
        de_novo_strata=de_novo_strata,
        proband_is_female_expr=trio_mt.is_female,
    )
    ht = trio_mt.select_rows(**trio_stats_expr).rows()

    raw_event_count = (
        ht.n_de_novos_raw + ht.n_transmitted_raw + ht.n_untransmitted_raw
    )
    return ht.filter(raw_event_count > 0)
def main(args):
    """
    Generate the variant-QC annotation tables for the input MatrixTable.

    Everything is written under `{args.output_dir}/ddd-elgh-ukbb/`:

        - `truthset.ht`: truth-set table from `get_truth_ht`
        - `mt_trios_adj.mt`: adj-annotated trio MatrixTable (checkpoint)
        - `Sanger_cohorts_trios_stats.ht`: trio statistics
        - `Sanger_cohorts_inbreeding_new.ht`: per-site inbreeding coefficient
        - `Sanger_cohorts_qc_ac_new.ht`: allele counts from `generate_ac`
        - `Sanger_cohorts_allele_data_new.ht`: output of `generate_allele_data`

    :param args: Parsed arguments; reads `matrixtable`, `onmi`, `mills`,
        `thousand_genomes`, `hapmap`, `trio_fam` and `output_dir`
    """
    mt = hl.read_matrix_table(args.matrixtable)

    # Truthset
    # NOTE(review): `args.onmi` looks like a misspelling of `omni`; it is left
    # unchanged because the argument parser is not visible here - confirm.
    truthset_ht = get_truth_ht(args.onmi, args.mills, args.thousand_genomes,
                               args.hapmap)
    truthset_ht.write(f'{args.output_dir}/ddd-elgh-ukbb/truthset.ht',
                      overwrite=True)

    # Trio data
    # trio annotation:
    mt_adj = annotate_adj(mt)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True)
    # Keep the MatrixTable returned by `checkpoint`: it reads from the file
    # just written, so the trio stats below do not recompute the whole
    # adj / trio-matrix pipeline a second time.
    trio_dataset = trio_dataset.checkpoint(
        f'{args.output_dir}/ddd-elgh-ukbb/mt_trios_adj.mt', overwrite=True)
    trio_stats_ht = generate_trio_stats(trio_dataset,
                                        autosomes_only=True,
                                        bi_allelic_only=True)
    trio_stats_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_trios_stats.ht',
        overwrite=True)

    # inbreeding ht
    # The inbreeding annotation is built on `mt` as read, i.e. before the
    # per-locus de-duplication applied just below.
    mt_inbreeding = mt.annotate_rows(
        InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    # Keep a single row per locus, then restore the (locus, alleles) row key.
    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
        'locus', 'alleles')

    ht_inbreeding = mt_inbreeding.rows()

    # allele data and qc_ac ht
    allele_data_ht = generate_allele_data(mt)
    qc_ac_ht = generate_ac(mt, fam)

    ht_inbreeding.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_inbreeding_new.ht',
        overwrite=True)
    qc_ac_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_qc_ac_new.ht',
        overwrite=True)
    allele_data_ht.write(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_allele_data_new.ht',
        overwrite=True)
def generate_fam_stats(mt: hl.MatrixTable, fam_file: str) -> hl.Table:
    """
    Compute trio transmission and de novo statistics, including an `hq` de novo stratum.

    The MatrixTable is restricted to samples listed in the fam file, to
    autosomes, and (after densifying) to sites with exactly two alleles.

    De novo strata:

        - `raw`: passed through as `None`
        - `adj`: proband, father and mother entries are all adj
        - `hq`: `adj`, plus both parents have GQ >= 30, AD[0] + AD[1] > 20 and
          AD[1] == 0

    :param mt: Input MatrixTable
    :param fam_file: Path to a tab-delimited fam file describing the trios
    :return: Table of trio stats, limited to rows where the raw de novo,
        transmitted and untransmitted counts sum to more than zero
    """
    ped = hl.Pedigree.read(fam_file, delimiter="\t")

    # Collapse the fam file to one row per sample id seen in any role.
    sample_ht = hl.import_fam(fam_file, delimiter="\t")
    sample_ht = sample_ht.annotate(
        fam_members=[sample_ht.id, sample_ht.pat_id, sample_ht.mat_id])
    sample_ht = sample_ht.explode('fam_members', name='s')
    sample_ht = sample_ht.key_by('s').select().distinct()

    mt = mt.filter_cols(hl.is_defined(sample_ht[mt.col_key]))
    logger.info(
        f"Generating family stats using {mt.count_cols()} samples from {len(ped.trios)} trios."
    )

    mt = filter_to_autosomes(mt)
    mt = annotate_adj(mt)
    mt = mt.select_entries('GT', 'GQ', 'AD', 'END', 'adj')
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)

    tm = hl.trio_matrix(mt, pedigree=ped, complete_trios=True)
    mother = tm.mother_entry
    father = tm.father_entry

    trio_adj = (tm.proband_entry.adj & father.adj & mother.adj)

    # Parent-level requirements for the high-quality de novo stratum.
    parents_no_alt = (mother.AD[1] == 0) & (father.AD[1] == 0)
    parents_high_depth = ((mother.AD[0] + mother.AD[1] > 20)
                          & (father.AD[0] + father.AD[1] > 20))
    parents_high_gq = (mother.GQ >= 30) & (father.GQ >= 30)
    high_quality = (trio_adj & parents_high_gq & parents_high_depth
                    & parents_no_alt)

    stats_expr = generate_trio_stats_expr(
        tm,
        transmitted_strata={'raw': None, 'adj': trio_adj},
        de_novo_strata={'raw': None, 'adj': trio_adj, 'hq': high_quality},
        proband_is_female_expr=tm.is_female,
    )
    ht = tm.select_rows(**stats_expr).rows()

    n_raw_events = (ht.n_de_novos_raw + ht.n_transmitted_raw
                    + ht.n_untransmitted_raw)
    return ht.filter(n_raw_events > 0)
def filter_mt_to_trios(mt: hl.MatrixTable, fam_ht: hl.Table) -> hl.MatrixTable:
    """
    Restrict a MatrixTable to the samples of the trios in `fam_ht` and make sure it has adj.

    A sample is kept when it appears as `id`, `pat_id` or `mat_id` in any row
    of `fam_ht`. The `adj` entry annotation is added only if not already present.

    :param mt: A Matrix Table to filter to only trios
    :param fam_ht: A Table of trios to filter to, loaded using `hl.import_fam`
    :return: A MT filtered to trios and adj annotated
    """
    # Flatten the trios into a distinct, key-only table of sample ids.
    trio_member_ids = [fam_ht.id, fam_ht.pat_id, fam_ht.mat_id]
    samples_ht = (
        fam_ht.annotate(fam_members=trio_member_ids)
        .explode("fam_members", name="s")
        .key_by("s")
        .select()
        .distinct()
    )

    trio_mt = mt.filter_cols(hl.is_defined(samples_ht[mt.col_key]))
    if "adj" in trio_mt.entry:
        return trio_mt
    return annotate_adj(trio_mt)
def query(output):
    """
    Query script entry point.

    Reads the MatrixTable at `GNOMAD_HGDP_1KG_MT`, keeps adj genotypes,
    restricts to bi-allelic variants with alternate AF above 0.05, LD-prunes
    them and writes the result to `{output}/filtered_mt.mt`.

    :param output: Directory prefix under which the filtered MatrixTable is written
    """
    hl.init(default_reference='GRCh38')

    mt_path = f'{output}/filtered_mt.mt'
    mt = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)

    # reproduce gnomAD genotype filtering
    mt = annotate_adj(mt)
    mt = mt.filter_entries(mt.adj)
    mt = hl.variant_qc(mt)

    # Filter to common and biallelic variants
    is_biallelic = hl.len(mt.alleles) == 2
    is_common = mt.variant_qc.AF[1] > 0.05
    mt = mt.filter_rows(is_biallelic & is_common)

    # Variants that survive LD pruning
    pruned_variant_table = hl.ld_prune(mt.GT, r2=0.2, bp_window_size=500000)
    survives_pruning = hl.is_defined(pruned_variant_table[mt.row_key])

    # save filtered mt table
    mt.filter_rows(survives_pruning).write(mt_path, overwrite=True)
# Script fragment: load the training-set and variant-QC annotation tables,
# then annotate the sample-QC-filtered MatrixTable with adj and frequency
# information and persist the result both as a MatrixTable and as a rows Table.
# NOTE(review): inputs are read from `temp_dir` while outputs go to `tmp_dir`;
# neither is defined in this fragment - confirm both exist in the enclosing scope.

# Training sets.
thousand_genomes = f'{temp_dir}/ddd-elgh-ukbb/training_sets/1000G_phase1.snps.high_confidence.hg38.ht'
thousand_genomes_ht = hl.read_table(thousand_genomes)
hapmap = f'{temp_dir}/ddd-elgh-ukbb/training_sets/hapmap_3.3.hg38.ht'
hapmap_ht = hl.read_table(hapmap)
# ANNOTATION TABLES:
truth_data_ht = hl.read_table(
    f'{temp_dir}/ddd-elgh-ukbb/variant_qc/truthset_table.ht')
trio_stats_table = hl.read_table(
    f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_trios_stats.ht')
# The inbreeding table load is currently disabled:
#inbreeding_ht = hl.read_table(f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_inbreeding.ht')
allele_data_ht = hl.read_table(
    f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_allele_data.ht')
allele_counts_ht = hl.read_table(
    f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_qc_ac.ht')

# Sample-QC-filtered MatrixTable: add adj, then frequency annotations.
mt = hl.read_matrix_table(
    f'{temp_dir}/ddd-elgh-ukbb/filtering/Sanger_cohorts_chr1-20-XY_sampleQC_FILTERED.mt'
)
mt = annotate_adj(mt)
mt_freq = annotate_freq(mt)
# The message below is still printed although the repartition call itself is
# commented out.
print("repartitioning:")
#mt_freq = mt_freq.repartition(1000, shuffle=False)
# Checkpoint the annotated MatrixTable and continue from the returned object.
mt_freq = mt_freq.checkpoint(
    f'{tmp_dir}/Sanger_cohorts_chr1-20-XY_sampleQC_FILTERED_FREQ_adj.mt',
    overwrite=True)
# Persist the row annotations on their own as a Table.
ht_freq = mt_freq.rows()
ht_freq.describe()
ht_freq.write(
    f'{tmp_dir}/Sanger_cohorts_chr1-20-XY_sampleQC_FILTERED_FREQ_adj.ht',
    overwrite=True)
# s3 credentials required for user to access the datasets in farm flexible compute s3 environment # you may use your own here from your .s3fg file in your home directory group = "raw" mt = hl.read_matrix_table( f'{nfs_dir}/hail_data/mts/chd_ukbb_split_v2_09092020.mt') # Truthset truthset_ht = get_truth_ht() truthset_ht.write(f'{nfs_dir}/hail_data/hts/truthset.ht', overwrite=True) truthset_ht = hl.read_table(f'{nfs_dir}/hail_data/hts/truthset.ht') # Trio data # trio annotation: mt_adj = annotate_adj(mt) fam = f"{project_dir}/data/annotation/samples/sample.complete_trios.wes50k.02022021.noheader.fam" pedigree = hl.Pedigree.read(fam) trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True) trio_dataset.checkpoint(f'{hdfs_dir}/chd_ukbb.trios.adj.mt', overwrite=True) trio_stats_ht = generate_trio_stats(trio_dataset, autosomes_only=True, bi_allelic_only=True) trio_stats_ht.write(f'{hdfs_dir}/chd_ukbb.trios.stats.ht', overwrite=True) # inbreeding ht mt_inbreeding = mt.annotate_rows( InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT)) mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
def main(args):
    """
    Assemble the final sample-QC table and release the adj-filtered unphased MatrixTable.

    Per-sample annotations are gathered from the hard-filter, population-QC,
    relatedness and stratified-metrics outputs, combined into a single
    `pass_filters` flag, and checkpointed as the `final_qc` Table (optionally
    exported as `.tsv.bgz`). The raw split MatrixTable is then unphased,
    filtered to adj entries and written out.

    :param args: Parsed arguments; reads `default_ref_genome`, `exome_cohort`,
        `overwrite` and `write_to_file`
    """

    def _read_sample_qc(part):
        # Load one of the per-step sample QC tables for the current cohort.
        return hl.read_table(
            get_sample_qc_ht_path(dataset=args.exome_cohort, part=part))

    # Start Hail
    hl.init(default_reference=args.default_ref_genome)

    # Import raw split MT
    mt = get_mt_data(dataset=args.exome_cohort, part='raw',
                     split=True).select_cols()
    ht = mt.cols().key_by('s')

    # Annotate samples filters
    sample_qc_filters = {}

    # 1. Add sample hard filters annotation expr
    hard_filters_ht = _read_sample_qc('hard_filters')
    sample_qc_filters['hard_filters'] = hard_filters_ht[ht.s]['hard_filters']

    # 2. Add population qc filters annotation expr
    pop_ht = _read_sample_qc('population_qc')
    sample_qc_filters['predicted_pop'] = pop_ht[ht.s]['predicted_pop']

    # 3. Add relatedness filters annotation expr
    to_drop_ht = get_related_samples_to_drop()
    dropped_ids = to_drop_ht.aggregate(
        hl.agg.collect_as_set(to_drop_ht.node.id))
    related_samples = hl.set(dropped_ids)
    sample_qc_filters['is_related'] = related_samples.contains(ht.s)

    # 4. Add stratified sample qc (population/platform) annotation expr
    pop_platform_ht = _read_sample_qc('stratified_metrics_filter')
    sample_qc_filters['pop_platform_filters'] = (
        pop_platform_ht[ht.s]['pop_platform_filters'])

    ht = ht.annotate(**sample_qc_filters)

    # Final sample qc filter joint expression: no hard filter, no
    # population/platform filter, predicted EUR and not flagged as related.
    passes_all = ((hl.len(ht.hard_filters) == 0)
                  & (hl.len(ht.pop_platform_filters) == 0)
                  & (ht.predicted_pop == 'EUR')
                  & ~ht.is_related)
    ht = ht.annotate(pass_filters=hl.cond(passes_all, True, False))

    logger.info('Writing final sample qc HT to disk...')
    output_path_ht = get_sample_qc_ht_path(dataset=args.exome_cohort,
                                           part='final_qc')
    ht = ht.checkpoint(output_path_ht, overwrite=args.overwrite)

    # Export final sample QC annotations to file
    if args.write_to_file:
        ht.export(f'{output_path_ht}.tsv.bgz')

    ## Release final unphase MT with adjusted genotypes filtered
    mt = annotate_adj(unphase_mt(mt))
    adj_only_mt = mt.filter_entries(mt.adj)
    mt = adj_only_mt.select_entries('GT', 'DP', 'GQ', 'adj')

    logger.info('Writing unphase MT with adjusted genotypes to disk...')
    # write MT
    release_path = get_qc_mt_path(dataset=args.exome_cohort,
                                  part='unphase_adj_genotypes',
                                  split=True)
    mt.write(release_path, overwrite=args.overwrite)

    # Stop Hail
    hl.stop()

    print("Finished!")
def main(args):
    """
    Generate the MegaWES variant-QC tables, trio MatrixTables and de novo calls.

    Outputs (under `{args.output_dir}/variant_qc/` unless noted):

        - `truthset_table.ht`: truth-set table from `get_truth_ht`
        - `MegaWES_trios_adj.mt`: adj-annotated trio MatrixTable (checkpoint)
        - `MegaWES_stats.ht`: trio statistics
        - `MegaWES_inbreeding_new.ht`, `MegaWES_qc_ac_new.ht`,
          `MegaWES_allele_data_new.ht`: inbreeding, allele-count and
          allele-data tables
        - `MegaWESSanger_cohorts_sampleQC_filtered_split.mt`: split MatrixTable
        - `MegaWES_trio_table.mt`: trio MatrixTable built from the split data
        - `MegaWES_family_stats.ht` / `MegaWES_family_stats.mt`: family stats
        - `{lustre_dir}/variant_qc/MegaWES_family_stats_gnomad_AF.mt`: family
          stats MatrixTable annotated with `gnomad_maf`
        - `MegaWES_denovo_table.ht`: de novo calls collected per variant

    :param args: Parsed arguments; reads `matrixtable`, `omni`, `mills`,
        `thousand_genomes`, `hapmap`, `trio_fam`, `priors` and `output_dir`
    """
    mt = hl.read_matrix_table(args.matrixtable)

    # Truthset
    mt = hl.variant_qc(mt)
    truthset_ht = get_truth_ht(args.omni, args.mills, args.thousand_genomes,
                               args.hapmap)
    truthset_ht.write(f'{args.output_dir}/variant_qc/truthset_table.ht',
                      overwrite=True)

    # Trio data
    # trio annotation:
    logger.info("Trio annotation and writing trios_adj.mt")
    mt_adj = annotate_adj(mt)
    fam = args.trio_fam
    pedigree = hl.Pedigree.read(fam)
    trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True)
    # Keep the MatrixTable returned by `checkpoint`: it reads from the file
    # just written, so the trio stats below do not recompute the whole
    # adj / trio-matrix pipeline a second time.
    trio_dataset = trio_dataset.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWES_trios_adj.mt', overwrite=True)

    logger.info("Trio stats and writing MegaWes_stats.ht")
    trio_stats_ht = generate_trio_stats(trio_dataset,
                                        autosomes_only=True,
                                        bi_allelic_only=True)
    trio_stats_ht.write(f'{args.output_dir}/variant_qc/MegaWES_stats.ht',
                        overwrite=True)

    # inbreeding ht
    # The inbreeding annotation is built before the per-locus de-duplication
    # applied just below.
    mt_inbreeding = mt.annotate_rows(
        InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    # Keep a single row per locus, then restore the (locus, alleles) row key.
    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
        'locus', 'alleles')

    ht_inbreeding = mt_inbreeding.rows()

    # allele data and qc_ac ht
    allele_data_ht = generate_allele_data(mt)
    qc_ac_ht = generate_ac(mt, fam)

    logger.info("Writing tables for inbreeding, allele counts")
    ht_inbreeding.write(
        f'{args.output_dir}/variant_qc/MegaWES_inbreeding_new.ht',
        overwrite=True)
    qc_ac_ht.write(f'{args.output_dir}/variant_qc/MegaWES_qc_ac_new.ht',
                   overwrite=True)
    allele_data_ht.write(
        f'{args.output_dir}/variant_qc/MegaWES_allele_data_new.ht',
        overwrite=True)

    # Trio matrix table
    logger.info("Split multi allelic variants and write mt")
    mt = hl.split_multi_hts(mt,
                            keep_star=False,
                            left_aligned=False,
                            permit_shuffle=True)
    mt = mt.checkpoint(
        f'{args.output_dir}/variant_qc/MegaWESSanger_cohorts_sampleQC_filtered_split.mt',
        overwrite=True)

    # `fam` and `pedigree` loaded above are reused here; the fam path has not
    # changed, so the pedigree file is not parsed a second time.
    logger.info("Trio matrixtable generation:")
    trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True)
    trio_dataset.write(f'{args.output_dir}/variant_qc/MegaWES_trio_table.mt',
                       overwrite=True)

    # Family stats
    logger.info("Family stats")
    # Only the first element of the returned pair is used below.
    ht1, _ = generate_family_stats(mt, fam)
    print("Writing mt and family stats_ht")
    ht1.write(f'{args.output_dir}/variant_qc/MegaWES_family_stats.ht',
              overwrite=True)
    mt = mt.annotate_rows(family_stats=ht1[mt.row_key].family_stats)
    mt = mt.checkpoint(f'{args.output_dir}/variant_qc/MegaWES_family_stats.mt',
                       overwrite=True)

    # Family stats with Allele Frequencies from gnomad
    logger.info("Family stats with gnomad AF")
    priors = hl.read_table(args.priors)
    mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf)
    # NOTE(review): this is the only output written under `lustre_dir` rather
    # than `args.output_dir`, and `lustre_dir` is not defined in this
    # function - confirm it exists at module level and that the location is
    # intended.
    mt = mt.checkpoint(
        f'{lustre_dir}/variant_qc/MegaWES_family_stats_gnomad_AF.mt',
        overwrite=True)

    logger.info("De novo table cration")
    # De novo table: call de novos using `gnomad_maf` as the prior, then
    # gather all calls for a variant under one (locus, alleles) key.
    de_novo_table = hl.de_novo(mt, pedigree, mt.gnomad_maf)
    de_novo_table = de_novo_table.key_by(
        'locus', 'alleles').collect_by_key('de_novo_data')
    de_novo_table.write(
        f'{args.output_dir}/variant_qc/MegaWES_denovo_table.ht',
        overwrite=True)