Example #1
    # Require at least 2 alt reads per entry, with alt support in both
    # forward and reverse read orientations
    mt3 = mt2.filter_entries(
        ((mt2.AD[1] < 2) | (mt2.F1R2[1] == 0) | (mt2.F2R1[1] == 0)),
        keep=False)

    # Remove monomorphic variants
    mt3 = hl.variant_qc(mt3)
    mt3 = mt3.filter_rows(
        (mt3.variant_qc.AF[1] > 0) & (mt3.variant_qc.AF[1] < 1), keep=True)

    mt4 = mt3.annotate_rows(
        v=hl.variant_str(mt3.locus, mt3.alleles),
        NumAltAlleles=hl.agg.max(mt3.GT.n_alt_alleles()),
        VAF=hl.agg.explode(lambda x: hl.agg.mean(x), mt3.AF),
        TLOD=mt3.info.TLOD[0],
        GERMQ=mt3.info.GERMQ,
        STR=mt3.info.STR,
        AD_alt=hl.agg.mean(mt3.AD[1]),
        AD_ref=hl.agg.mean(mt3.AD[0]))

    # One-sided test: probability of seeing at least this many alt reads out of DP
    # if ref and alt reads were equally likely (p = 0.5)
    mt4 = mt4.annotate_entries(
        Binomial_Prob=hl.binom_test(mt4.AD[1], mt4.DP, 0.5, 'greater'))
    mt4 = mt4.key_rows_by("v")
    mt4 = mt4.drop('locus', 'alleles', 'qual', 'filters', 'variant_qc', 'GQ',
                   'PGT', 'PID', 'PL', 'PS', 'info', 'rsid', 'a_index',
                   'was_split')
    filt2 = mt4.count_rows()
    mt4.entries().export(filenamev2 + "." + str(filt2) + ".GTs.bgz")
    del mt, mt2, mt3, mt4
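A quick way to sanity-check the entry-level Binomial_Prob annotation above is to evaluate the same one-sided test on concrete numbers. The depths below are made up for illustration; only hl.binom_test itself comes from the example.

import hail as hl

# Hypothetical entry: 30 alt reads out of 40 total reads.
# 'greater' asks how likely it is to see at least this many alt reads
# if ref and alt reads were drawn with equal probability (p = 0.5).
p = hl.eval(hl.binom_test(30, 40, 0.5, 'greater'))
print(p)  # small p-value -> alt depth is higher than a balanced draw would suggest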
Example #2
def main(args):
    # init hail
    hl.init(default_reference=args.default_ref_genome)

    # import MT
    mt = hl.read_matrix_table(args.mt_input_path)

    n_variants, n_samples = mt.count()

    # Get the variant table: a table keyed by <locus> or <locus, alleles>
    # with all variants in the dataset and no extra fields (a.k.a. a reference table).
    tb_variants = (mt.select_rows().rows())

    # compute overall coverage
    if args.compute_overall_coverage:
        logger.info(
            f"Computing coverage stats for {n_variants} variant over {n_samples} samples..."
        )
        ht_cov_overall = compute_coverage_stats(mt=mt,
                                                reference_ht=tb_variants)

        tb_variants = (tb_variants.annotate(
            overall=ht_cov_overall[tb_variants.key]))

    # compute coverage stratified by phenotype status (expected binary)
    # force the input MT to have a case_control bool field (is_case)
    # ***
    if args.compute_phe_coverage:
        logger.info(
            f"Computing coverage stats stratified by phenotype status...")

        # Annotate sample meta info
        # Note: temporary solution, better to import an annotated MT
        mt = (mt.annotate_cols(**get_sample_meta_data()[mt.col_key]))

        mt = (mt.annotate_cols(
            case_control=hl.if_else(mt[args.phe_field], 'case', 'control')))

        strata = (mt.aggregate_cols(hl.agg.collect_as_set(mt['case_control'])))

        dict_strata_ht = {
            s:
            compute_coverage_stats(mt=mt.filter_cols(mt['case_control'] == s),
                                   reference_ht=tb_variants)
            for s in strata
        }

        for k in dict_strata_ht.keys():
            _tb = dict_strata_ht.get(k)
            tb_variants = tb_variants.annotate(**{k: _tb[tb_variants.key]})

        if args.run_binomial_test:
            logger.info(f"Running binomial test...")
            # perform a binomial test on coverage and case/control status
            # DOI: https://doi.org/10.1002/acn3.582
            tb_binomial = (tb_variants.annotate(
                n_cases_over_10=hl.int(tb_variants.case.over_10 * 100),
                n_controls_over_10=hl.int(tb_variants.control.over_10 * 100),
                total_cases=tb_variants.case.n_samples,
                total_controls=tb_variants.control.n_samples,
            ).select('n_cases_over_10', 'n_controls_over_10', 'total_cases',
                     'total_controls'))

            binomial_expr = {
                'p_value':
                hl.binom_test(
                    x=tb_binomial.n_cases_over_10,
                    n=tb_binomial.n_cases_over_10 +
                    tb_binomial.n_controls_over_10,
                    p=tb_binomial.total_cases /
                    (tb_binomial.total_cases + tb_binomial.total_controls),
                    alternative='two.sided')
            }

            tb_binomial = (tb_binomial.annotate(**binomial_expr))

            tb_variants = (tb_variants.annotate(
                binomial_stats=tb_binomial[tb_variants.key]))

    # make coverage filter expressions
    # Note: the default number of reads is set to 10X
    logger.info(f"Assigning per site coverage filters...")

    significant_level = args.pvalue_threshold
    min_sample_prop = args.min_sample_proportion

    coverage_filter_dict_expr = {}

    if args.compute_overall_coverage:
        coverage_filter_dict_expr.update({
            'overall_hard_cutoff':
            hl.if_else((tb_variants.overall.over_10 >= min_sample_prop),
                       "pass", "fail")
        })
    if args.compute_phe_coverage:
        # DOI: https://doi.org/10.1016/j.ajhg.2018.08.016
        coverage_filter_dict_expr.update({
            'phe_hard_cutoff':
            hl.if_else((tb_variants.case.over_10 >= min_sample_prop) &
                       (tb_variants.control.over_10 >= min_sample_prop),
                       "concordant", "discordant")
        })
    if args.run_binomial_test:
        coverage_filter_dict_expr.update({
            'phe_binomial':
            hl.if_else(tb_variants.binomial_stats.p_value < significant_level,
                       'dependent', 'independent')
        })

    # annotate coverage filters
    tb_variants = (tb_variants.annotate(coverage_filter=hl.struct(
        **coverage_filter_dict_expr)))

    # add useful global annotations to final coverage stats ht
    # as well as affected/non-affected summary counts per filters
    global_ann_dict_expr = {
        'date': current_date(),
        'mt_path': args.mt_input_path,
        'min_sample_prop': min_sample_prop
    }
    if args.compute_overall_coverage:
        global_ann_dict_expr.update({
            'overall_hard_cutoff':
            tb_variants.aggregate(
                hl.agg.counter(
                    tb_variants.coverage_filter.overall_hard_cutoff))
        })
    if args.compute_phe_coverage:
        global_ann_dict_expr.update({
            'phe_hard_cutoff':
            tb_variants.aggregate(
                hl.agg.counter(tb_variants.coverage_filter.phe_hard_cutoff))
        })
    if args.run_binomial_test:
        global_ann_dict_expr.update({
            'phe_binomial':
            tb_variants.aggregate(
                hl.agg.counter(tb_variants.coverage_filter.phe_binomial)),
            'binomial_pvalue_cutoff': significant_level
        })

    tb_variants = (tb_variants.annotate_globals(**global_ann_dict_expr))

    # check
    tb_variants.globals.show()
    tb_variants.describe()

    # write HT
    tb_variants = tb_variants.checkpoint(output=args.ht_output_path,
                                         overwrite=args.overwrite)

    # export to file if true
    if args.write_to_file:
        (tb_variants.export(f'{args.ht_output_path}.tsv.bgz'))

    hl.stop()
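The binomial_expr block above boils down to a single hl.binom_test call per variant: the number of well-covered cases is compared against the total of well-covered cases and controls, with the expected proportion given by the cohort sizes. A minimal sketch with made-up counts (all numbers here are hypothetical):

import hail as hl

n_cases_over_10, n_controls_over_10 = 60, 40      # samples covered >= 10X per group
total_cases, total_controls = 100, 100            # cohort sizes
p_value = hl.eval(hl.binom_test(
    x=n_cases_over_10,
    n=n_cases_over_10 + n_controls_over_10,
    p=total_cases / (total_cases + total_controls),
    alternative='two-sided'))
print(p_value)  # small values flag coverage that depends on case/control status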
Example #3
def compute_info() -> hl.Table:
    """
    Computes an HT with the typical GATK AS and site-level info fields as well as ACs and lowqual fields.

    Note that this table doesn't split multi-allelic sites.

    :return: Table with info fields
    :rtype: Table
    """
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True,
                          remove_hard_filtered_samples=False)

    mt = mt.filter_rows((hl.len(mt.alleles) > 1))
    mt = mt.transmute_entries(**mt.gvcf_info)
    mt = mt.annotate_rows(
        alt_alleles_range_array=hl.range(1, hl.len(mt.alleles)))

    # Compute AS and site level info expr
    # Note that production defaults have changed:
    # For new releases, the `RAW_MQandDP` field replaces the `RAW_MQ` and `MQ_DP` fields
    info_expr = get_site_info_expr(
        mt,
        sum_agg_fields=INFO_SUM_AGG_FIELDS + ["RAW_MQ"],
        int32_sum_agg_fields=INFO_INT32_SUM_AGG_FIELDS + ["MQ_DP"],
        array_sum_agg_fields=["SB"],
    )
    info_expr = info_expr.annotate(**get_as_info_expr(
        mt,
        sum_agg_fields=INFO_SUM_AGG_FIELDS + ["RAW_MQ"],
        int32_sum_agg_fields=INFO_INT32_SUM_AGG_FIELDS + ["MQ_DP"],
        array_sum_agg_fields=["SB"],
    ))

    # Add AC and AC_raw:
    # First compute ACs for each non-ref allele, grouped by adj
    grp_ac_expr = hl.agg.array_agg(
        lambda ai: hl.agg.filter(
            mt.LA.contains(ai),
            hl.agg.group_by(
                get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD),
                hl.agg.sum(
                    mt.LGT.one_hot_alleles(mt.LA.map(lambda x: hl.str(x)))[
                        mt.LA.index(ai)]),
            ),
        ),
        mt.alt_alleles_range_array,
    )

    # Then, for each non-ref allele, compute
    # AC as the adj group
    # AC_raw as the sum of adj and non-adj groups
    info_expr = info_expr.annotate(
        AC_raw=grp_ac_expr.map(
            lambda i: hl.int32(i.get(True, 0) + i.get(False, 0))),
        AC=grp_ac_expr.map(lambda i: hl.int32(i.get(True, 0))),
    )

    # Annotating raw MT with pab max
    info_expr = info_expr.annotate(AS_pab_max=hl.agg.array_agg(
        lambda ai: hl.agg.filter(
            mt.LA.contains(ai) & mt.LGT.is_het(),
            hl.agg.max(
                hl.binom_test(mt.LAD[mt.LA.index(ai)], hl.sum(mt.LAD), 0.5,
                              "two-sided")),
        ),
        mt.alt_alleles_range_array,
    ))

    info_ht = mt.select_rows(info=info_expr).rows()

    # Add lowqual flag
    info_ht = info_ht.annotate(
        lowqual=get_lowqual_expr(
            info_ht.alleles,
            info_ht.info.QUALapprox,
            # The indel het prior used for gnomad v3 was 1/10k bases (phred=40).
            # This value is usually 1/8k bases (phred=39).
            indel_phred_het_prior=40,
        ),
        AS_lowqual=get_lowqual_expr(info_ht.alleles,
                                    info_ht.info.AS_QUALapprox,
                                    indel_phred_het_prior=40),
    )

    return info_ht.naive_coalesce(7500)
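Two building blocks in compute_info are easy to miss: alt_alleles_range_array is simply the list of alt-allele indices that hl.agg.array_agg maps over, and AS_pab_max keeps, per alt allele, the largest two-sided balanced-binomial p-value seen across het carriers. A small sketch with hypothetical values (nothing here comes from the gnomAD pipeline itself):

import hail as hl

# Alt-allele indices for a site with alleles ['A', 'T', 'C'] -> [1, 2]
print(hl.eval(hl.range(1, hl.len(['A', 'T', 'C']))))

# One het carrier with local allele depths [18, 12] for a given alt allele:
# this is the per-sample term that feeds hl.agg.max in AS_pab_max.
print(hl.eval(hl.binom_test(12, 18 + 12, 0.5, 'two-sided')))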
Example #4
def test_deprecated_binom_test():
    assert hl.eval(hl.binom_test(2, 10, 0.5, 'two.sided')) == \
        pytest.approx(spst.binom_test(2, 10, 0.5, 'two-sided'))
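The point of this test is the spelling of the alternative argument: 'two.sided' is the deprecated alias that older code (such as the binomial_expr in Example #2) still uses, while 'two-sided' is the current form. Assuming the alias is still accepted, both spellings should evaluate to the same value:

import hail as hl

assert hl.eval(hl.binom_test(2, 10, 0.5, 'two.sided')) == \
    hl.eval(hl.binom_test(2, 10, 0.5, 'two-sided'))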
Example #5
def test_binom_test():
    arglists = [[2, 10, 0.5, 'two-sided'], [4, 10, 0.5, 'less'],
                [32, 50, 0.4, 'greater']]
    for args in arglists:
        assert hl.eval(hl.binom_test(*args)) == pytest.approx(
            spst.binom_test(*args)), args
Example #6

INBR_COEFF = -0.3
AB_LOWER_LIM = 0.2
AB_UPPER_LIM = 1 - AB_LOWER_LIM

# Read MatrixTable with sample QC-passing dataset
mt = hl.read_matrix_table("sampleqc_pass.mt")

# Calculate variant statistics
mt = hl.variant_qc(mt)

# Calculate inbreeding coefficient
mt = mt.annotate_rows(inbr_coeff=bi_allelic_site_inbreeding_expr(mt.GT))

# Determine the maximum p-value for sampling the observed allele balance under a binomial model
mt = mt.annotate_rows(
    pab_max=hl.agg.max(hl.binom_test(mt.AD[1], mt.DP, 0.5, "two-sided")))

# Remove variants with an excess of heterozygotes (inbreeding coefficient below threshold)
mt = mt.filter_rows(mt.inbr_coeff > INBR_COEFF)

# Remove variants for which no sample had a high-quality genotype (GQ >= 20)
# or adequate depth (DP >= 10)
mt = mt.filter_rows(hl.agg.any(mt.GQ >= 20))
mt = mt.filter_rows(hl.agg.any(mt.DP >= 10))

# Per-entry allele balance: fraction of reads supporting the alt allele
mt = mt.annotate_entries(AB=(mt.AD[1] / hl.sum(mt.AD)))

# Keep variants where at least one sample has an allele balance consistent with its genotype call
mt = mt.filter_rows(
    hl.agg.any((mt.GT.is_hom_ref() & (mt.AB < AB_LOWER_LIM))
               | (mt.GT.is_het() & (mt.AB >= AB_LOWER_LIM)
                  & (mt.AB <= AB_UPPER_LIM))
               | (mt.GT.is_hom_var() & (mt.AB > AB_UPPER_LIM))))
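For intuition on how the allele-balance window and the pab_max annotation relate, consider a single het entry with hypothetical depths AD = [18, 2]: its allele balance is 0.1, below AB_LOWER_LIM, and the corresponding balanced-binomial p-value is tiny, so both views flag the same questionable het call. The numbers below are made up for illustration:

import hail as hl

ad = [18, 2]
ab = ad[1] / sum(ad)                                   # 0.1 < AB_LOWER_LIM (0.2)
p = hl.eval(hl.binom_test(ad[1], sum(ad), 0.5, 'two-sided'))
print(ab, p)                                           # p is well below common thresholds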