def main(args): group = "raw" mt = hl.read_matrix_table(args.matrixtable) # Truthset truthset_ht = get_truth_ht(args.onmi, args.mills, args.thousand_genomes, args.hapmap) truthset_ht.write(f'{args.output_dir}/ddd-elgh-ukbb/truthset.ht', overwrite=True) # Trio data # trio annotation: mt_adj = annotate_adj(mt) fam = args.trio_fam pedigree = hl.Pedigree.read(fam) trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True) trio_dataset.checkpoint(f'{args.output_dir}/ddd-elgh-ukbb/mt_trios_adj.mt', overwrite=True) trio_stats_ht = generate_trio_stats(trio_dataset, autosomes_only=True, bi_allelic_only=True) trio_stats_ht.write( f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_trios_stats.ht', overwrite=True) # inbreeding ht mt_inbreeding = mt.annotate_rows( InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT)) mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by( 'locus', 'alleles') ht_inbreeding = mt_inbreeding.rows() # allele data and qc_ac ht allele_data_ht = generate_allele_data(mt) qc_ac_ht = generate_ac(mt, fam) ht_inbreeding.write( f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_inbreeding_new.ht', overwrite=True) qc_ac_ht.write( f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_qc_ac_new.ht', overwrite=True) allele_data_ht.write( f'{args.output_dir}/ddd-elgh-ukbb/Sanger_cohorts_allele_data_new.ht', overwrite=True)
def main(args):
    hl.init(log='/frequency_data_generation.log', default_reference='GRCh38')

    logger.info("Reading sparse MT and metadata table...")
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    meta_ht = meta.ht().select(
        'pop', 'sex', 'project_id', 'release', 'sample_filters')

    if args.test:
        logger.info("Filtering to chr20:1-1000000")
        mt = hl.filter_intervals(
            mt, [hl.parse_locus_interval('chr20:1-1000000')])

    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    logger.info("Annotating sparse MT with metadata...")
    mt = mt.annotate_cols(meta=meta_ht[mt.s])
    mt = mt.filter_cols(mt.meta.release)
    samples = mt.count_cols()
    logger.info(
        f"Running frequency table prep and generation pipeline on {samples} samples")

    logger.info("Computing adj and sex adjusted genotypes.")
    mt = mt.annotate_entries(
        GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.meta.sex),
        adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Densify-ing...")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)

    logger.info(
        "Setting het genotypes at sites with >1% AF (using v3.0 frequencies) and > 0.9 AB to homalt...")
    # hotfix for depletion of homozygous alternate genotypes
    # Using v3.0 AF to avoid an extra frequency calculation
    # TODO: Using previous callset AF works for small incremental changes to a callset, but we need to revisit for large increments
    freq_ht = freq.versions["3"].ht()
    freq_ht = freq_ht.select(AF=freq_ht.freq[0].AF)
    mt = mt.annotate_entries(
        GT=hl.cond(
            (freq_ht[mt.row_key].AF > 0.01)
            & mt.GT.is_het()
            & (mt.AD[1] / mt.DP > 0.9),
            hl.call(1, 1),
            mt.GT,
        )
    )

    logger.info("Calculating InbreedingCoefficient...")
    # NOTE: This is not the ideal location to calculate this, but added here to avoid another densify
    mt = mt.annotate_rows(
        InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    logger.info("Generating frequency data...")
    mt = annotate_freq(
        mt,
        sex_expr=mt.meta.sex,
        pop_expr=mt.meta.pop
    )

    # Select freq, FAF and popmax
    faf, faf_meta = faf_expr(
        mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
    mt = mt.select_rows(
        'InbreedingCoeff',
        'freq',
        faf=faf,
        popmax=pop_max_expr(mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX)
    )
    mt = mt.annotate_globals(faf_meta=faf_meta)

    # Annotate quality metrics histograms, as these also require densifying
    mt = mt.annotate_rows(
        **qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Writing out frequency data...")
    if args.test:
        mt.rows().write(
            "gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht",
            overwrite=True)
    else:
        mt.rows().write(freq.path, overwrite=args.overwrite)
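# For reference, `get_adj_expr` above encodes gnomAD's high-quality ("adj")
# genotype definition. A simplified stand-in for diploid, bi-allelic-split
# data is sketched here (thresholds per the published gnomAD adj criteria;
# the exact library logic also covers haploid calls and multi-allelic AD
# fields): GQ >= 20, DP >= 10, and allele balance >= 0.2 for het calls.
import hail as hl

def annotate_adj_sketch(mt: hl.MatrixTable) -> hl.MatrixTable:
    return mt.annotate_entries(
        adj=(mt.GQ >= 20)
        & (mt.DP >= 10)
        & (~mt.GT.is_het() | (mt.AD[1] / mt.DP >= 0.2))
    )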
def main(args):
    subsets = args.subsets
    hl.init(
        log=f"/generate_frequency_data{'.' + '_'.join(subsets) if subsets else ''}.log",
        default_reference="GRCh38",
    )

    invalid_subsets = []
    n_subsets_use_subpops = 0
    for s in subsets:
        if s not in SUBSETS:
            invalid_subsets.append(s)
        if s in COHORTS_WITH_POP_STORED_AS_SUBPOP:
            n_subsets_use_subpops += 1

    if invalid_subsets:
        raise ValueError(
            f"{', '.join(invalid_subsets)} subset(s) are not one of the following official subsets: {SUBSETS}"
        )
    # Use a logical `and` here: a bitwise `&` on the count evaluates to 0 for
    # any even count and would silently skip this check.
    if n_subsets_use_subpops and (n_subsets_use_subpops != len(subsets)):
        raise ValueError(
            f"All or none of the supplied subset(s) should be in the list of cohorts that need to use subpops instead "
            f"of pops in frequency calculations: {COHORTS_WITH_POP_STORED_AS_SUBPOP}"
        )

    try:
        logger.info("Reading full sparse MT and metadata table...")
        mt = get_gnomad_v3_mt(
            key_by_locus_and_alleles=True,
            release_only=not args.include_non_release,
            samples_meta=True,
        )

        if args.test:
            logger.info("Filtering to two partitions on chr20")
            mt = hl.filter_intervals(
                mt, [hl.parse_locus_interval("chr20:1-1000000")])
            mt = mt._filter_partitions(range(2))

        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

        if args.include_non_release:
            logger.info("Filtering MT columns to high quality samples")
            total_sample_count = mt.count_cols()
            mt = mt.filter_cols(mt.meta.high_quality)
            high_quality_sample_count = mt.count_cols()
            logger.info(
                f"Filtered {total_sample_count - high_quality_sample_count} from the full set of {total_sample_count} "
                f"samples...")

        if subsets:
            mt = mt.filter_cols(hl.any([mt.meta.subsets[s] for s in subsets]))
            logger.info(
                f"Running frequency generation pipeline on {mt.count_cols()} samples in {', '.join(subsets)} subset(s)..."
            )
        else:
            logger.info(
                f"Running frequency generation pipeline on {mt.count_cols()} samples..."
            )

        logger.info("Computing adj and sex adjusted genotypes...")
        mt = mt.annotate_entries(
            GT=adjusted_sex_ploidy_expr(
                mt.locus, mt.GT, mt.meta.sex_imputation.sex_karyotype),
            adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD),
        )

        logger.info("Densify-ing...")
        mt = hl.experimental.densify(mt)
        mt = mt.filter_rows(hl.len(mt.alleles) > 1)

        # Temporary hotfix for depletion of homozygous alternate genotypes
        logger.info(
            "Setting het genotypes at sites with >1% AF (using v3.0 frequencies) and > 0.9 AB to homalt..."
        )
        # Load v3.0 allele frequencies to avoid an extra frequency calculation
        # NOTE: Using previous callset AF works for small incremental changes to a callset, but we will need to revisit for large increments
        freq_ht = get_freq(version="3").ht()
        freq_ht = freq_ht.select(AF=freq_ht.freq[0].AF)
        # hl.if_else replaces the deprecated hl.cond
        mt = mt.annotate_entries(GT=hl.if_else(
            (freq_ht[mt.row_key].AF > 0.01)
            & mt.GT.is_het()
            & (mt.AD[1] / mt.DP > 0.9),
            hl.call(1, 1),
            mt.GT,
        ))

        logger.info("Generating frequency data...")
        if subsets:
            mt = annotate_freq(
                mt,
                sex_expr=mt.meta.sex_imputation.sex_karyotype,
                pop_expr=mt.meta.population_inference.pop
                if not n_subsets_use_subpops
                else mt.meta.project_meta.project_subpop,
                # NOTE: TGP and HGDP labeled populations are highly specific and are stored in the project_subpop meta field
            )

            # NOTE: no FAFs or popmax needed for subsets
            mt = mt.select_rows("freq")

            logger.info(
                f"Writing out frequency data for {', '.join(subsets)} subset(s)..."
            )
            if args.test:
                mt.rows().write(
                    get_checkpoint_path(
                        f"chr20_test_freq.{'_'.join(subsets)}"),
                    overwrite=True,
                )
            else:
                mt.rows().write(
                    get_freq(subset="_".join(subsets)).path,
                    overwrite=args.overwrite)

        else:
            logger.info("Computing age histograms for each variant...")
            mt = mt.annotate_cols(age=hl.if_else(
                hl.is_defined(mt.meta.project_meta.age),
                mt.meta.project_meta.age,
                mt.meta.project_meta.age_alt,
                # NOTE: most age data is stored as integers in 'age' annotation, but for a select number of samples, age is stored as a bin range and 'age_alt' corresponds to an integer in the middle of the bin
            ))
            mt = mt.annotate_rows(**age_hists_expr(mt.adj, mt.GT, mt.age))

            # Compute callset-wide age histogram global
            mt = mt.annotate_globals(age_distribution=mt.aggregate_cols(
                hl.agg.hist(mt.age, 30, 80, 10)))

            mt = annotate_freq(
                mt,
                sex_expr=mt.meta.sex_imputation.sex_karyotype,
                pop_expr=mt.meta.population_inference.pop,
                downsamplings=DOWNSAMPLINGS,
            )
            # Remove all loci with raw AC=0
            mt = mt.filter_rows(mt.freq[1].AC > 0)

            logger.info("Calculating InbreedingCoeff...")
            # NOTE: This is not the ideal location to calculate this, but added here to avoid another densify
            mt = mt.annotate_rows(
                InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

            logger.info("Computing filtering allele frequencies and popmax...")
            faf, faf_meta = faf_expr(
                mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
            mt = mt.select_rows(
                "InbreedingCoeff",
                "freq",
                faf=faf,
                popmax=pop_max_expr(
                    mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX),
            )
            mt = mt.annotate_globals(
                faf_meta=faf_meta,
                faf_index_dict=make_faf_index_dict(faf_meta))
            mt = mt.annotate_rows(popmax=mt.popmax.annotate(
                faf95=mt.faf[mt.faf_meta.index(
                    lambda x: x.values() == ["adj", mt.popmax.pop])].faf95))

            logger.info("Annotating quality metrics histograms...")
            # NOTE: these are performed here as the quality metrics histograms also require densifying
            mt = mt.annotate_rows(
                qual_hists=qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD, mt.adj))
            ht = mt.rows()
            ht = ht.annotate(
                qual_hists=hl.Struct(
                    **{
                        i.replace("_adj", ""): ht.qual_hists[i]
                        for i in ht.qual_hists
                        if "_adj" in i
                    }),
                raw_qual_hists=hl.Struct(**{
                    i: ht.qual_hists[i]
                    for i in ht.qual_hists
                    if "_adj" not in i
                }),
            )

            logger.info("Writing out frequency data...")
            if args.test:
                ht.write(get_checkpoint_path("chr20_test_freq"),
                         overwrite=True)
            else:
                ht.write(get_freq().path, overwrite=args.overwrite)

    finally:
        logger.info("Copying hail log to logging bucket...")
        hl.copy_log(f"{qc_temp_prefix()}logs/")
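# A note on the freq/freq_meta convention relied on above (index layout as
# released by gnomAD; treat the exact indices as an assumption here): `freq`
# is a row-level array of structs (AC, AF, AN, homozygote_count) and
# `freq_meta` is a parallel global array of dicts describing each entry, with
# index 0 = adj and index 1 = raw. That is why "Remove all loci with raw
# AC=0" reads mt.freq[1].AC, and why the popmax faf95 lookup searches
# faf_meta for the ["adj", <popmax pop>] grouping.
import hail as hl

def drop_raw_ac0(mt: hl.MatrixTable) -> hl.MatrixTable:
    # Keep rows whose raw (pre-adj-filter) allele count is nonzero.
    return mt.filter_rows(mt.freq[1].AC > 0)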
# Trio data
# trio annotation:
mt_adj = annotate_adj(mt)
fam = f"{project_dir}/data/annotation/samples/sample.complete_trios.wes50k.02022021.noheader.fam"
pedigree = hl.Pedigree.read(fam)
trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True)
trio_dataset.checkpoint(f'{hdfs_dir}/chd_ukbb.trios.adj.mt', overwrite=True)
trio_stats_ht = generate_trio_stats(
    trio_dataset, autosomes_only=True, bi_allelic_only=True)
trio_stats_ht.write(f'{hdfs_dir}/chd_ukbb.trios.stats.ht', overwrite=True)

# inbreeding ht
mt_inbreeding = mt.annotate_rows(
    InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))
mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
    'locus', 'alleles')
ht_inbreeding = mt_inbreeding.rows()

# allele data and qc_ac ht
allele_data_ht = generate_allele_data(mt)
qc_ac_ht = generate_ac(mt, fam)

ht_inbreeding.write(f'{hdfs_dir}/chd_ukbb.inbreeding.ht', overwrite=True)
qc_ac_ht.write(f'{hdfs_dir}/chd_ukbb.qc_ac.ht', overwrite=True)
allele_data_ht.write(f'{hdfs_dir}/chd_ukbb.allele_data.ht', overwrite=True)
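# hl.Pedigree.read expects a PLINK-style .fam file: six whitespace-delimited
# columns (family ID, individual ID, father ID, mother ID, sex with 1=male /
# 2=female, phenotype), where "0" marks a missing parent. An illustrative
# (made-up) trio:
#
#   FAM001  child01   father01  mother01  1  2
#   FAM001  father01  0         0         1  1
#   FAM001  mother01  0         0         2  1
#
# pedigree = hl.Pedigree.read("trios.fam")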
def _initial_filter(data_type):
    """
    Get Table of CCDG variants passing desired filters.

    Possible filters are:
        - Autosomes only
        - SNVs only
        - gnomAD v3.1.2 AC filter
        - CCDG high quality exome intervals
        - UK Biobank high quality exome intervals

    After densification of the VDS, rows are annotated with:
        - ccdg_{data_type}_was_split
        - ccdg_{data_type}_AC
        - ccdg_{data_type}_AN

    The filtered and annotated rows are returned as a Table and are also checkpointed.

    :param data_type: Whether data is from genomes or exomes
    :return: Table of CCDG filtered variants
    """
    logger.info(
        "Loading CCDG %s VDS and splitting multi-allelics for initial filtering steps...",
        data_type,
    )
    vds = get_ccdg_vds(data_type, filter_washu=filter_washu)
    logger.info(
        "%d CCDG %s samples loaded...",
        vds.variant_data.count_cols(),
        data_type,
    )
    vds = hl.vds.split_multi(vds)

    if autosomes_only:
        logger.info("Filtering CCDG %s VDS to autosomes...", data_type)
        vds = hl.vds.filter_chromosomes(vds, keep_autosomes=True)

    ht = vds.variant_data.rows()
    variant_filter_expr = True
    if snv_only:
        logger.info("Filtering CCDG %s VDS to SNVs...", data_type)
        variant_filter_expr &= hl.is_snp(ht.alleles[0], ht.alleles[1])

    if min_gnomad_v3_ac:
        logger.info(
            "Filtering CCDG %s VDS to gnomAD v3.1.2 variants with adj-filtered AC > %d...",
            data_type,
            min_gnomad_v3_ac,
        )
        variant_filter_expr &= gnomad_ht[ht.key].gnomad_AC > min_gnomad_v3_ac

    vds = hl.vds.filter_variants(vds, ht.filter(variant_filter_expr), keep=True)

    if high_qual_ccdg_exome_interval_only:
        logger.info(
            "Filtering CCDG %s VDS to high quality (>80%% of samples with %dX coverage) CCDG exome intervals...",
            data_type,
            INTERVAL_DP,
        )
        interval_qc_ht = hl.read_table(
            get_ccdg_results_path(
                data_type="exomes", result=f"intervals_{INTERVAL_DP}x"
            )
        )
        interval_qc_ht = interval_qc_ht.filter(interval_qc_ht.to_keep)
        vds = hl.vds.filter_intervals(
            vds, intervals=interval_qc_ht.interval.collect(), keep=True
        )

    if high_qual_ukbb_exome_interval_only:
        if not autosomes_only:
            raise ValueError(
                "UK Biobank interval QC filtering is only available for autosomes!"
            )
        logger.info(
            "Filtering CCDG %s VDS to high quality (>80%% of samples with 20X coverage) UK Biobank exome intervals...",
            data_type,
        )
        interval_qc_ht = hl.read_table(
            ukbb_interval_qc_path("broad", 7, "autosomes")
        )  # Note: freeze 7 is all included in gnomAD v4
        interval_qc_ht = interval_qc_ht.filter(
            interval_qc_ht["pct_samples_20x"] > pct_samples_ukbb_exome_interval
        )
        vds = hl.vds.filter_intervals(
            vds, intervals=interval_qc_ht.interval.collect(), keep=True
        )

    logger.info("Densifying filtered CCDG %s VDS...", data_type)
    mt = hl.vds.to_dense_mt(vds)
    if adj_only:
        mt = filter_to_adj(mt)

    annotation_expr = {
        f"ccdg_{data_type}_was_split": mt.was_split,
        f"ccdg_{data_type}_AC": hl.agg.sum(mt.GT.n_alt_alleles()),
        f"ccdg_{data_type}_AN": hl.agg.count_where(hl.is_defined(mt.GT)) * 2,
    }

    if min_inbreeding_coeff_threshold is not None:
        annotation_expr[
            f"ccdg_{data_type}_site_inbreeding_coeff"
        ] = bi_allelic_site_inbreeding_expr(mt.GT)
    if min_hardy_weinberg_threshold is not None:
        annotation_expr[f"ccdg_{data_type}_hwe"] = hl.agg.hardy_weinberg_test(
            mt.GT
        )

    mt = mt.annotate_rows(**annotation_expr)
    ht = mt.rows().checkpoint(
        get_ccdg_results_path(
            data_type=data_type,
            mt=False,
            result=f"pre_filtered_variants_interval{INTERVAL_DP}x{flag}",
        ),
        overwrite=(not read_per_dataset_checkpoint_if_exists),
        _read_if_exists=read_per_dataset_checkpoint_if_exists,
    )

    return ht
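# A hedged sketch of how `_initial_filter` might be consumed downstream (an
# assumption based on the per-dataset annotations it produces, not
# necessarily this script's exact code): build both tables and join them so
# each variant carries exome and genome AC/AN side by side.
exomes_ht = _initial_filter("exomes")
genomes_ht = _initial_filter("genomes")
joined_ht = exomes_ht.join(genomes_ht, how="inner")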
def filter_rows_for_qc(
    mt: hl.MatrixTable,
    min_af: Optional[float] = 0.001,
    min_callrate: Optional[float] = 0.99,
    min_inbreeding_coeff_threshold: Optional[float] = -0.8,
    min_hardy_weinberg_threshold: Optional[float] = 1e-8,
    apply_hard_filters: bool = True,
    bi_allelic_only: bool = True,
    snv_only: bool = True,
) -> hl.MatrixTable:
    """
    Annotates rows with `site_callrate`, `site_inbreeding_coeff` and `af`, then applies thresholds.

    AF and callrate thresholds are taken from gnomAD QC; inbreeding coeff, MQ, FS and QD filters are taken from GATK best practices.

    .. note::

        This function expects the typical ``info`` annotation of type struct with fields ``MQ``, ``FS`` and ``QD`` if applying hard filters.

    :param mt: Input MT
    :param min_af: Minimum site AF to keep. Not applied if set to ``None``.
    :param min_callrate: Minimum site call rate to keep. Not applied if set to ``None``.
    :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep. Not applied if set to ``None``.
    :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not applied if set to ``None``.
    :param apply_hard_filters: Whether to apply standard GATK default site hard filters: QD >= 2, FS <= 60 and MQ >= 30
    :param bi_allelic_only: Whether to only keep bi-allelic sites or include multi-allelic sites too
    :param snv_only: Whether to only keep SNVs or include other variant types
    :return: Annotated and filtered MT
    """
    annotation_expr = {}

    if min_af is not None:
        annotation_expr["af"] = hl.agg.mean(mt.GT.n_alt_alleles()) / 2
    if min_callrate is not None:
        annotation_expr["site_callrate"] = hl.agg.fraction(
            hl.is_defined(mt.GT))
    if min_inbreeding_coeff_threshold is not None:
        annotation_expr["site_inbreeding_coeff"] = \
            bi_allelic_site_inbreeding_expr(mt.GT)
    if min_hardy_weinberg_threshold is not None:
        annotation_expr["hwe"] = hl.agg.hardy_weinberg_test(mt.GT)

    if annotation_expr:
        mt = mt.annotate_rows(**annotation_expr)

    filter_expr = []
    if min_af is not None:
        filter_expr.append(mt.af > min_af)
    if min_callrate is not None:
        filter_expr.append(mt.site_callrate > min_callrate)
    if min_inbreeding_coeff_threshold is not None:
        filter_expr.append(
            mt.site_inbreeding_coeff > min_inbreeding_coeff_threshold)
    if min_hardy_weinberg_threshold is not None:
        filter_expr.append(mt.hwe.p_value > min_hardy_weinberg_threshold)
    if snv_only:
        filter_expr.append(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    if bi_allelic_only:
        filter_expr.append(bi_allelic_expr(mt))

    if apply_hard_filters:
        if "info" in mt.row_value:
            if "QD" in mt.info:
                filter_expr.append(mt.info.QD >= 2)
            else:
                logger.warning(
                    "Could not apply QD hard filter, as `info.QD` not found in schema."
                )
            if "FS" in mt.info:
                filter_expr.append(mt.info.FS <= 60)
            else:
                logger.warning(
                    "Could not apply FS hard filter, as `info.FS` not found in schema."
                )
            if "MQ" in mt.info:
                filter_expr.append(mt.info.MQ >= 30)
            else:
                logger.warning(
                    "Could not apply MQ hard filter, as `info.MQ` not found in schema."
                )
        else:
            logger.warning(
                "Could not apply hard filters as `info` not found in schema.")

    return mt.filter_rows(functools.reduce(operator.iand, filter_expr))
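# Example invocation, assuming `mt` is a dense MatrixTable with GT entries
# and an `info` row struct carrying QD/FS/MQ; the keyword values shown are
# simply the function's defaults made explicit.
qc_mt = filter_rows_for_qc(
    mt,
    min_af=0.001,
    min_callrate=0.99,
    apply_hard_filters=True,
    bi_allelic_only=True,
    snv_only=True,
)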
import hail as hl

from gnomad.utils.annotations import bi_allelic_site_inbreeding_expr
from gnomad.variant_qc.pipeline import train_rf_model
from hail_init import DEFAULT_REF

# Variant Quality hard filters
INBR_COEFF = -0.3
AB_LOWER_LIM = 0.2
AB_UPPER_LIM = 1 - AB_LOWER_LIM

# Read MatrixTable with sample QC-passing dataset
mt = hl.read_matrix_table("sampleqc_pass.mt")

# Calculate variant statistics
mt = hl.variant_qc(mt)

# Calculate inbreeding coefficient
mt = mt.annotate_rows(inbr_coeff=bi_allelic_site_inbreeding_expr(mt.GT))

# Determine the maximum p-value for sampling the observed allele balance under a binomial model
mt = mt.annotate_rows(
    pab_max=hl.agg.max(hl.binom_test(mt.AD[1], mt.DP, 0.5, "two-sided")))

# Remove variants with an excess of heterozygotes
mt = mt.filter_rows(mt.inbr_coeff > INBR_COEFF)

# Remove variants for which no sample had a high quality genotype
mt = mt.filter_rows(hl.agg.any(mt.GQ >= 20))
mt = mt.filter_rows(hl.agg.any(mt.DP >= 10))

# Annotate entries with allele balance, then keep variants supported by at
# least one balanced het call. (The original snippet was truncated at this
# final filter; the condition below is a plausible reconstruction based on
# the AB_LOWER_LIM / AB_UPPER_LIM constants defined above.)
mt = mt.annotate_entries(AB=(mt.AD[1] / hl.sum(mt.AD)))
mt = mt.filter_rows(
    hl.agg.any(mt.GT.is_het() & (mt.AB > AB_LOWER_LIM) & (mt.AB < AB_UPPER_LIM)))
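# Sanity check of the pAB statistic used above: hl.binom_test returns the
# p-value for the observed alt-read count under Binomial(n=DP, p=0.5), so a
# perfectly balanced het (5 alt reads of 10) is unremarkable, while a badly
# imbalanced one (1 of 10) is unlikely under random sampling.
print(hl.eval(hl.binom_test(5, 10, 0.5, "two-sided")))  # 1.0
print(hl.eval(hl.binom_test(1, 10, 0.5, "two-sided")))  # ~0.021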
def main(args): group = "raw" mt = hl.read_matrix_table(args.matrixtable) # Truthset mt = hl.variant_qc(mt) truthset_ht = get_truth_ht(args.omni, args.mills, args.thousand_genomes, args.hapmap) truthset_ht.write(f'{args.output_dir}/variant_qc/truthset_table.ht', overwrite=True) # Trio data # trio annotation: logger.info("Trio annotation and writing trios_adj.mt") mt_adj = annotate_adj(mt) fam = args.trio_fam pedigree = hl.Pedigree.read(fam) trio_dataset = hl.trio_matrix(mt_adj, pedigree, complete_trios=True) trio_dataset.checkpoint( f'{args.output_dir}/variant_qc/MegaWES_trios_adj.mt', overwrite=True) logger.info("Trio stats and writing MegaWes_stats.ht") trio_stats_ht = generate_trio_stats(trio_dataset, autosomes_only=True, bi_allelic_only=True) trio_stats_ht.write(f'{args.output_dir}/variant_qc/MegaWES_stats.ht', overwrite=True) # inbreeding ht mt_inbreeding = mt.annotate_rows( InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT)) mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by( 'locus', 'alleles') ht_inbreeding = mt_inbreeding.rows() # allele data and qc_ac ht allele_data_ht = generate_allele_data(mt) qc_ac_ht = generate_ac(mt, fam) logger.info("Writing tables for inbreeding, allele counts") ht_inbreeding.write( f'{args.output_dir}/variant_qc/MegaWES_inbreeding_new.ht', overwrite=True) qc_ac_ht.write(f'{args.output_dir}/variant_qc/MegaWES_qc_ac_new.ht', overwrite=True) allele_data_ht.write( f'{args.output_dir}/variant_qc/MegaWES_allele_data_new.ht', overwrite=True) # Trio matrix table logger.info("Split multi allelic variants and write mt") mt = hl.split_multi_hts(mt, keep_star=False, left_aligned=False, permit_shuffle=True) mt = mt.checkpoint( f'{args.output_dir}/variant_qc/MegaWESSanger_cohorts_sampleQC_filtered_split.mt', overwrite=True) fam = args.trio_fam pedigree = hl.Pedigree.read(fam) logger.info("Trio matrixtable generation:") trio_dataset = hl.trio_matrix(mt, pedigree, complete_trios=True) trio_dataset.write(f'{args.output_dir}/variant_qc/MegaWES_trio_table.mt', overwrite=True) # Family stats logger.info("Family stats") (ht1, famstats_ht) = generate_family_stats(mt, fam) print("Writing mt and family stats_ht") ht1.write(f'{args.output_dir}/variant_qc/MegaWES_family_stats.ht', overwrite=True) mt = mt.annotate_rows(family_stats=ht1[mt.row_key].family_stats) mt = mt.checkpoint(f'{args.output_dir}/variant_qc/MegaWES_family_stats.mt', overwrite=True) #Family stats with Allele Frequencies from gnomad logger.info("Family stats with gnomad AF") priors = hl.read_table(args.priors) mt = mt.annotate_rows(gnomad_maf=priors[mt.row_key].maf) mt = mt.checkpoint( f'{lustre_dir}/variant_qc/MegaWES_family_stats_gnomad_AF.mt', overwrite=True) logger.info("De novo table cration") #De novo table de_novo_table = hl.de_novo(mt, pedigree, mt.gnomad_maf) de_novo_table = de_novo_table.key_by( 'locus', 'alleles').collect_by_key('de_novo_data') de_novo_table.write( f'{args.output_dir}/variant_qc/MegaWES_denovo_table.ht', overwrite=True)