예제 #1
0
def test_vcf_vds_combiner_equivalence():
    """Check that the VCF combiner and the VDS combiner produce the same sparse matrix table.

    Two GVCFs are imported over three chr20 intervals, padded with missing
    ``MQ_DP``/``VarDP``/``QUALapprox`` info fields, combined through both
    code paths, and the two results are compared with ``_same``.
    """
    import hail.experimental.vcf_combiner.vcf_combiner as vcf
    import hail.vds.combiner as vds

    def chr20_interval(first, last):
        # Closed interval [first, last] on chr20, keyed by a struct holding the locus.
        return hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', first, reference_genome='GRCh38')),
                           end=hl.Struct(locus=hl.Locus('chr20', last, reference_genome='GRCh38')),
                           includes_end=True)

    gvcf_paths = [resource(name) for name in ('gvcfs/HG00096.g.vcf.gz', 'gvcfs/HG00268.g.vcf.gz')]
    intervals = [
        chr20_interval(17821257, 18708366),
        chr20_interval(18708367, 19776611),
        chr20_interval(19776612, 21144633),
    ]

    vcfs = []
    for mt in hl.import_gvcfs(gvcf_paths, intervals, reference_genome='GRCh38',
                              array_elements_required=False):
        padded_info = mt.info.annotate(
            MQ_DP=hl.missing(hl.tint32),
            VarDP=hl.missing(hl.tint32),
            QUALapprox=hl.missing(hl.tint32))
        vcfs.append(mt.annotate_rows(info=padded_info))

    # Entry fields are sampled from the rows of the first GVCF that have INFO/END defined.
    end_rows = vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END))
    entry_to_keep = defined_entry_fields(end_rows, 100_000) - {'GT', 'PGT', 'PL'}

    transformed = [vds.transform_gvcf(mt, reference_entry_fields_to_keep=entry_to_keep)
                   for mt in vcfs]
    combined_vds = vds.combine_variant_datasets(transformed)
    smt_from_vds = hl.vds.to_merged_sparse_mt(combined_vds).drop('RGQ')

    smt = vcf.combine_gvcfs([vcf.transform_gvcf(mt) for mt in vcfs])
    smt = smt.select_entries(*smt_from_vds.entry)  # harmonize fields and order
    smt = smt.key_rows_by('locus', 'alleles')

    assert smt._same(smt_from_vds)
예제 #2
0
def test_combiner_works():
    """Smoke-test ``combine_variant_datasets`` on two GVCFs imported over three intervals.

    Asserts that the combined variant data has as many partitions as there
    are import intervals, then forces evaluation of both components.
    """
    def chr20_interval(first, last):
        # Closed interval [first, last] on chr20, keyed by a struct holding the locus.
        return hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', first, reference_genome='GRCh38')),
                           end=hl.Struct(locus=hl.Locus('chr20', last, reference_genome='GRCh38')),
                           includes_end=True)

    def with_missing_info(mt):
        # Pad INFO with the three fields below, all set to missing int32.
        return mt.annotate_rows(info=mt.info.annotate(
            MQ_DP=hl.missing(hl.tint32),
            VarDP=hl.missing(hl.tint32),
            QUALapprox=hl.missing(hl.tint32)))

    gvcf_paths = [resource(name) for name in ('gvcfs/HG00096.g.vcf.gz', 'gvcfs/HG00268.g.vcf.gz')]
    parts = [
        chr20_interval(17821257, 18708366),
        chr20_interval(18708367, 19776611),
        chr20_interval(19776612, 21144633),
    ]

    imported = hl.import_gvcfs(gvcf_paths, parts, reference_genome='GRCh38', array_elements_required=False)

    # Entry fields are sampled from the rows of the first GVCF that have INFO/END defined.
    end_rows = imported[0].filter_rows(hl.is_defined(imported[0].info.END))
    entry_to_keep = defined_entry_fields(end_rows, 100_000) - {'GT', 'PGT', 'PL'}

    transformed = [
        transform_gvcf(with_missing_info(mt), reference_entry_fields_to_keep=entry_to_keep)
        for mt in imported
    ]
    comb = combine_variant_datasets(transformed)

    assert len(parts) == comb.variant_data.n_partitions()
    comb.variant_data._force_count_rows()
    comb.reference_data._force_count_rows()
예제 #3
0
def parse_as_ranksum(string, has_non_ref):
    """Parse a ``|``-separated string of ``float,int`` pairs into an array of tuples.

    When ``has_non_ref`` is true the last ``|``-separated item is dropped.
    Items that are empty, equal to ``'.'``, or that do not split into exactly
    two comma-separated parts become missing ``tuple(float64, int32)`` values.
    """
    typ = hl.ttuple(hl.tfloat64, hl.tint32)

    def parse_pair(ss):
        return hl.if_else(
            hl.len(ss) != 2,  # bad field, possibly 'NaN', just set it null
            hl.missing(typ),
            hl.tuple([hl.float64(ss[0]), hl.int32(ss[1])]))

    def parse_item(s):
        is_blank = (hl.len(s) == 0) | (s == '.')
        return hl.if_else(is_blank,
                          hl.missing(typ),
                          hl.rbind(s.split(','), parse_pair))

    items = string.split(r'\|')
    items = hl.if_else(has_non_ref, items[:-1], items)
    return items.map(parse_item)
예제 #4
0
def missing_callstats_expr() -> hl.expr.StructExpression:
    """
    Build a callstats struct whose every element is missing.

    The struct has the fields ``AC`` (int32), ``AF`` (float64), ``AN`` (int32)
    and ``homozygote_count`` (int32), in that order.

    :return: Hail Struct with missing values for each callstats element
    """
    callstats_types = {
        'AC': hl.tint32,
        'AF': hl.tfloat64,
        'AN': hl.tint32,
        'homozygote_count': hl.tint32,
    }
    return hl.struct(**{name: hl.missing(typ) for name, typ in callstats_types.items()})
예제 #5
0
    def test_from_entry_expr_options(self):
        """Exercise ``mean_impute``, ``center`` and ``normalize`` of ``BlockMatrix.from_entry_expr``.

        The same four center/normalize combinations are checked on a fully
        defined 1x3 matrix and, with mean imputation on, on a matrix whose
        middle entry is missing. Without imputation the latter must raise.
        """
        def build_mt(a):
            data = [{'v': 0, 's': s, 'x': a[s]} for s in range(3)]
            ht = hl.Table.parallelize(data, hl.dtype('struct{v: int32, s: int32, x: float64}'))
            mt = ht.to_matrix_table(['v'], ['s'])
            ids = mt.key_cols_by()['s'].collect()
            # Reorder the columns so that they follow sample ids 0, 1, 2.
            return mt.choose_cols([ids.index(s) for s in range(3)])

        def check(expr, mean_impute, center, normalize, expected):
            block_matrix = BlockMatrix.from_entry_expr(expr,
                                                       mean_impute=mean_impute,
                                                       center=center,
                                                       normalize=normalize)
            actual = np.squeeze(block_matrix.to_numpy())
            assert np.allclose(actual, expected)

        a = np.array([0.0, 1.0, 2.0])

        # (center, normalize, expected row)
        expectations = [
            (False, False, a),
            (True, False, a - 1.0),
            (False, True, a / np.sqrt(5)),
            (True, True, (a - 1.0) / np.sqrt(2)),
        ]

        mt = build_mt(a)
        for center, normalize, expected in expectations:
            check(mt.x, False, center, normalize, expected)
        check(mt.x + 1 - 1, False, False, False, a)

        mt = build_mt([0.0, hl.missing('float64'), 2.0])
        for center, normalize, expected in expectations:
            check(mt.x, True, center, normalize, expected)
        with self.assertRaises(Exception):
            BlockMatrix.from_entry_expr(mt.x)
예제 #6
0
파일: read_file.py 프로젝트: atgu/GWASpy
def read_vcf(dirname: str, basename: str) -> hl.MatrixTable:
    """Import ``<dirname><basename>.vcf.gz`` and return it with multi-allelic rows split.

    The VCF is first written to ``<dirname>GWASpy.preimpQC.mt`` (overwriting
    any existing table) and read back. Rows with exactly two alleles are kept
    as they are, tagged with a missing ``a_index`` and ``was_split=False``;
    rows with more than two alleles go through ``hl.split_multi_hts``. The
    two sets are then unioned.

    ``dirname`` is concatenated directly with the file names, so it must
    already end with a path separator.
    """
    hl._set_flags(no_whole_stage_codegen='1')

    vcf_file = f'{dirname}{basename}.vcf.gz'
    mt_path = f'{dirname}GWASpy.preimpQC.mt'

    imported = hl.import_vcf(vcf_file, force_bgz=True, block_size=16)
    imported.write(mt_path, overwrite=True)
    in_mt = hl.read_matrix_table(mt_path)

    # Unlike array data, a VCF might have multi-allelic sites
    # split multi-allelic sites into bi-allelic
    print("Checking for multi-allelic sites")
    pre_filt_multi_n = in_mt.count_rows()

    n_alleles = hl.len(in_mt.alleles)

    # Bi-allelic rows get the same extra row fields that split_multi_hts adds,
    # so that both halves can be unioned below.
    bi = in_mt.filter_rows(n_alleles == 2)
    bi = bi.annotate_rows(a_index=hl.missing(hl.tint))
    bi = bi.annotate_rows(was_split=False)

    multi = in_mt.filter_rows(n_alleles > 2)
    split = hl.split_multi_hts(multi)

    in_mt = split.union_rows(bi)
    pos_filt_multi_n = in_mt.count_rows()
    # The reported number is the growth in row count caused by splitting.
    print("Number of multi-allelic SNPs in VCF file: {}".format(
        pos_filt_multi_n - pre_filt_multi_n))

    return in_mt
예제 #7
0
        def make_entry_struct(e, alleles_len, has_non_ref, row):
            """Build the output entry struct for one GVCF entry.

            Fields derived here are placed first, in insertion order
            (``LA``, ``LGT``, then ``LAD``/``LPGT``/``LPL``/``RGQ`` when their
            source fields exist, then ``END`` and ``gvcf_info``), followed by
            every field of ``e`` whose name is not in ``handled_names``.

            :param e: entry struct expression; must contain ``GT``.
            :param alleles_len: number of alleles at the row, including the
                trailing allele that ``has_non_ref`` refers to when it is set.
            :param has_non_ref: boolean expression; when true the last allele
                is excluded from ``LA``/``LAD``/``LPL``.
            :param row: row struct expression; ``row.info`` must contain ``END``.
            :raises hl.utils.FatalError: if ``END`` is absent from ``row.info``
                or ``GT`` is absent from ``e``.

            ``get_lgt``, ``parse_as_fields`` and ``info_to_keep`` are taken
            from the enclosing scope and are not defined in this block.
            """
            handled_fields = dict()
            # Names that are either produced below or consumed to produce them;
            # they are excluded from the pass-through fields at the end.
            # NOTE(review): 'RGQ' is produced below but is not listed here, so an
            # input entry that already has an 'RGQ' field (together with 'PL')
            # would be passed twice to hl.struct(**...) -- confirm whether that
            # can occur.
            handled_names = {
                'LA', 'gvcf_info', 'END', 'LAD', 'AD', 'LGT', 'GT', 'LPL',
                'PL', 'LPGT', 'PGT'
            }

            if 'END' not in row.info:
                raise hl.utils.FatalError(
                    "the Hail GVCF combiner expects GVCFs to have an 'END' field in INFO."
                )
            if 'GT' not in e:
                raise hl.utils.FatalError(
                    "the Hail GVCF combiner expects GVCFs to have a 'GT' field in FORMAT."
                )

            # LA is 0..alleles_len-1, minus one more element when has_non_ref is true.
            handled_fields['LA'] = hl.range(
                0, alleles_len - hl.if_else(has_non_ref, 1, 0))
            handled_fields['LGT'] = get_lgt(e, alleles_len, has_non_ref, row)
            if 'AD' in e:
                # Drop the last AD element when has_non_ref is true.
                handled_fields['LAD'] = hl.if_else(has_non_ref, e.AD[:-1],
                                                   e.AD)
            if 'PGT' in e:
                handled_fields['LPGT'] = e.PGT
            if 'PL' in e:
                # With has_non_ref: keep PL minus its last `alleles_len` values,
                # or missing when there are at most two alleles.
                # Without it: keep PL as is, or missing when there is at most
                # one allele.
                handled_fields['LPL'] = hl.if_else(
                    has_non_ref,
                    hl.if_else(alleles_len > 2, e.PL[:-alleles_len],
                               hl.missing(e.PL.dtype)),
                    hl.if_else(alleles_len > 1, e.PL, hl.missing(e.PL.dtype)))
                # RGQ is the PL value at the index of the unphased diploid
                # genotype (0, last allele); missing when has_non_ref is false.
                handled_fields['RGQ'] = hl.if_else(
                    has_non_ref,
                    e.PL[hl.call(0,
                                 alleles_len - 1).unphased_diploid_gt_index()],
                    hl.missing(e.PL.dtype.element_type))

            handled_fields['END'] = row.info.END
            # gvcf_info is populated only for rows whose END is missing;
            # every other row gets a missing struct.
            handled_fields['gvcf_info'] = (hl.case().when(
                hl.is_missing(row.info.END),
                hl.struct(**(parse_as_fields(row.info.select(
                    *info_to_keep), has_non_ref)))).or_missing())

            # Everything not handled above is copied through unchanged.
            pass_through_fields = {
                k: v
                for k, v in e.items() if k not in handled_names
            }
            return hl.struct(**handled_fields, **pass_through_fields)
예제 #8
0
 def get_lgt(e, n_alleles, has_non_ref, row):
     """Return ``e.GT`` as the local genotype, or missing if it uses the excluded allele.

     The unphased diploid index of ``e.GT`` is compared against two bounds:
     below ``triangle(n_alleles - int(has_non_ref))`` the call is kept as is;
     below ``triangle(n_alleles)`` it becomes a missing call; anything else
     raises an error naming the call and the row locus.
     """
     gt_index = e.GT.unphased_diploid_gt_index()
     kept_allele_count = n_alleles - hl.int(has_non_ref)
     invalid_gt_message = 'invalid GT ' + hl.str(e.GT) + ' at site ' + hl.str(row.locus)

     lgt = hl.case()
     lgt = lgt.when(gt_index < hl.triangle(kept_allele_count), e.GT)
     lgt = lgt.when(gt_index < hl.triangle(n_alleles), hl.missing('call'))
     return lgt.or_error(invalid_gt_message)
예제 #9
0
 def rewrite_ref(r):
     """Reshape struct ``r`` to the fields of ``merged_schema`` with fixed reference values.

     ``LA`` is replaced by the literal ``[0]`` and ``LGT``/``GT`` by the call
     ``0/0``. Every other schema field is taken from ``r`` when present and
     is otherwise a missing value of the schema type.
     """
     def value_for(name, typ):
         if name == 'LA':
             return hl.literal([0])
         if name in ('LGT', 'GT'):
             return hl.call(0, 0)
         return r[name] if name in r else hl.missing(typ)

     return r.select(**{name: value_for(name, typ)
                        for name, typ in merged_schema.items()})
예제 #10
0
    def test_trio_matrix_null_keys(self):
        """``trio_matrix`` yields no columns when every column key is missing."""
        pedigree = hl.Pedigree.read(resource('triomatrix.fam'))
        fam_table = hl.import_fam(resource('triomatrix.fam'))

        dataset = hl.import_vcf(resource('triomatrix.vcf'))
        dataset = dataset.annotate_cols(fam=fam_table[dataset.s].fam_id)

        # Make keys all null
        null_keyed = dataset.key_cols_by(s=hl.missing(hl.tstr))

        trios = hl.trio_matrix(null_keyed, pedigree, complete_trios=True)
        self.assertEqual(trios.count_cols(), 0)
예제 #11
0
def _get_gnomad_variants(gnomad_exome_variants_path=None, gnomad_genome_variants_path=None):
    gnomad_exome_variants = None
    gnomad_genome_variants = None

    if gnomad_exome_variants_path:
        gnomad_exome_variants = hl.read_table(gnomad_exome_variants_path)
        gnomad_exome_variants = gnomad_exome_variants.select(
            exome=hl.struct(
                filters=gnomad_exome_variants.filters,
                ac=gnomad_exome_variants.freq[0].AC,
                an=gnomad_exome_variants.freq[0].AN,
            )
        )

        # For purposes of marking ClinVar variants as "in gnomAD", exclude AC=0 gnomAD variants
        gnomad_exome_variants = gnomad_exome_variants.filter(gnomad_exome_variants.exome.ac > 0)

    if gnomad_genome_variants_path:
        gnomad_genome_variants = hl.read_table(gnomad_genome_variants_path)
        gnomad_genome_variants = gnomad_genome_variants.select(
            genome=hl.struct(
                filters=gnomad_genome_variants.filters,
                ac=gnomad_genome_variants.freq[0].AC,
                an=gnomad_genome_variants.freq[0].AN,
            )
        )

        # For purposes of marking ClinVar variants as "in gnomAD", exclude AC=0 gnomAD variants
        gnomad_genome_variants = gnomad_genome_variants.filter(gnomad_genome_variants.genome.ac > 0)

    gnomad_variants = None
    if gnomad_exome_variants and gnomad_genome_variants:
        gnomad_variants = gnomad_exome_variants.join(gnomad_genome_variants, how="outer")
    elif gnomad_exome_variants:
        gnomad_variants = gnomad_exome_variants.annotate(genome=hl.missing(gnomad_exome_variants.exome.dtype))
    elif gnomad_genome_variants:
        gnomad_variants = gnomad_genome_variants.annotate(exome=hl.missing(gnomad_genome_variants.genome.dtype))

    return gnomad_variants
예제 #12
0
파일: helpers.py 프로젝트: chrisvittal/hail
def create_all_values():
    """Return a struct expression with one field per value kind, present and missing.

    Fields whose name starts with ``m`` (``m``, ``mstruct``, ``mset``, ``md``,
    ``ml``, ``mc``, ``mt``) are missing values of the corresponding type.
    """
    fields = {
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.missing(hl.tfloat64),
        'astruct': hl.struct(a=hl.missing(hl.tint32), b=5.5),
        'mstruct': hl.missing(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.missing(hl.tset(hl.tfloat64)),
        'd': hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.missing(hl.tstr), 'z']): 0.3,
        }),
        'md': hl.missing(hl.tdict(hl.tint32, hl.tstr)),
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.missing(hl.tlocus('GRCh37')),
        'i': hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        'c': hl.call(0, 1),
        'mc': hl.missing(hl.tcall),
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.missing(hl.tstr)]),
        'mt': hl.missing(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
        'nd': hl.nd.arange(0, 10).reshape((2, 5)),
    }
    return hl.struct(**fields)
예제 #13
0
def test_lgt_to_gt():
    """``hl.vds.lgt_to_gt`` maps local allele indices through ``la`` and keeps phasing."""
    la = [0, 3, 5]

    # (local call, expected call after translation through `la`)
    cases = [
        (hl.call(0, 0, phased=False), hl.Call([0, 0], phased=False)),
        (hl.call(0, 0, phased=True), hl.Call([0, 0], phased=True)),
        (hl.call(0, 1, phased=False), hl.Call([0, 3], phased=False)),
        (hl.call(2, 0, phased=True), hl.Call([5, 0], phased=True)),
        (hl.call(1, phased=False), hl.Call([3], phased=False)),
    ]

    translated = hl.eval(tuple(hl.vds.lgt_to_gt(local, la) for local, _ in cases))
    assert translated == tuple([expected for _, expected in cases])

    # With a missing local-alleles array, the unphased 0/0 call is expected to come back as 0/0.
    unphased_hom_ref = cases[0][0]
    result = hl.eval(hl.vds.lgt_to_gt(unphased_hom_ref, hl.missing('array<int32>')))
    assert result == hl.Call([0, 0], phased=False)
예제 #14
0
    def test_annotate_intervals(self):
        """Annotating rows from BED files and interval lists gives consistent results."""
        ds = get_dataset()

        bed1, bed2, bed3 = [
            hl.import_bed(resource(f'example{n}.bed'), reference_genome='GRCh37')
            for n in (1, 2, 3)
        ]
        self.assertTrue(list(bed2.key.dtype) == ['interval'])
        self.assertTrue(list(bed2.row.dtype) == ['interval', 'target'])

        interval_list1, interval_list2 = [
            hl.import_locus_intervals(resource(f'exampleAnnotation{n}.interval_list'))
            for n in (1, 2)
        ]
        self.assertTrue(list(interval_list2.key.dtype) == ['interval'])
        self.assertTrue(list(interval_list2.row.dtype) == ['interval', 'target'])

        # Rows strictly between the two position bounds must have no bed1 annotation.
        ann = ds.annotate_rows(in_interval=bed1[ds.locus]).rows()
        outside_gap = (ann.locus.position <= 14000000) | (ann.locus.position >= 17000000)
        self.assertTrue(ann.all(outside_gap | (hl.is_missing(ann.in_interval))))

        for bed in [bed2, bed3]:
            ann = ds.annotate_rows(target=bed[ds.locus].target).rows()
            position = ann.locus.position
            expr = (hl.case()
                    .when(position <= 14000000, ann.target == 'gene1')
                    .when(position >= 17000000, ann.target == 'gene2')
                    .default(ann.target == hl.missing(hl.tstr)))
            self.assertTrue(ann.all(expr))

        # Interval lists must annotate exactly like their BED counterparts.
        from_list1 = ds.annotate_rows(in_interval=interval_list1[ds.locus]).rows()
        from_bed1 = ds.annotate_rows(in_interval=bed1[ds.locus]).rows()
        self.assertTrue(from_list1._same(from_bed1))

        from_list2 = ds.annotate_rows(target=interval_list2[ds.locus].target).rows()
        from_bed2 = ds.annotate_rows(target=bed2[ds.locus].target).rows()
        self.assertTrue(from_list2._same(from_bed2))
예제 #15
0
    def phase_y_nonpar(
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
    ) -> hl.expr.ArrayExpression:
        """
        Phase proband and father calls in the non-PAR region of Y.

        The result is defined only when both calls are haploid and carry the
        same first allele; otherwise it is missing. The mother's slot in the
        array is always a missing call.

        :param CallExpression proband_call: Input proband genotype call
        :param CallExpression father_call: Input father genotype call
        :return: Array containing: phased proband call, phased father call, phased mother call
        :rtype: ArrayExpression
        """
        both_haploid = proband_call.is_haploid() & father_call.is_haploid()
        same_allele = father_call[0] == proband_call[0]

        phased_calls = hl.array([
            hl.call(proband_call[0], phased=True),
            hl.call(father_call[0], phased=True),
            hl.missing(hl.tcall),
        ])
        return hl.or_missing(both_haploid & same_allele, phased_calls)
예제 #16
0
def annotate_fields(mt, gencode_release, gencode_path):
    """Flatten ``mt`` into an unkeyed rows table carrying the fields listed in ``FIELDS``.

    Steps, in order: collect a per-sample genotype struct for every row,
    apply ``CORE_FIELDS``, derive ``sortedTranscriptConsequences`` and
    ``sv_type``, apply ``DERIVED_FIELDS``, rename ``rsid`` to ``variantId``,
    then drop the key and select ``FIELDS``.

    ``gencode_release`` and ``gencode_path`` are forwarded to ``load_gencode``.
    """
    num_alt = hl.if_else(hl.is_defined(mt.GT), mt.GT.n_alt_alleles(), -1)
    sample_genotype = hl.struct(sample_id=mt.s, gq=mt.GQ, cn=mt.RD_CN, num_alt=num_alt)
    rows = mt.annotate_rows(genotypes=hl.agg.collect(sample_genotype)).rows()

    core_values = {name: build(rows) for name, build in CORE_FIELDS.items()}
    rows = rows.annotate(**core_values)

    gene_id_mapping = hl.literal(
        load_gencode(gencode_release, download_path=gencode_path))

    # INFO columns named PROTEIN_CODING__<consequence> that hold an array of strings.
    gene_cols = [
        gene_col for gene_col in rows.info
        if gene_col.startswith('PROTEIN_CODING__')
        and rows.info[gene_col].dtype == hl.dtype('array<str>')
    ]

    def consequences_for(col):
        consequence = col.split('__')[-1]
        return rows.info[col].map(lambda gene: hl.struct(
            gene_symbol=gene,
            gene_id=gene_id_mapping[gene],
            predicted_consequence=consequence))

    per_column = [consequences_for(col) for col in gene_cols]
    defined_only = hl.filter(lambda x: hl.is_defined(x), per_column)

    rows = rows.annotate(
        sortedTranscriptConsequences=hl.flatmap(lambda x: x, defined_only),
        sv_type=rows.alleles[1].replace('[<>]', '').split(':', 2),
    )

    # NOTE: this writes into the module-level DERIVED_FIELDS mapping, so the
    # 'filters' entry remains there after the call.
    DERIVED_FIELDS['filters'] = lambda r: hl.if_else(
        hl.len(r.filters) > 0, r.filters,
        hl.missing(hl.dtype('array<str>')))

    derived_values = {name: build(rows) for name, build in DERIVED_FIELDS.items()}
    rows = rows.annotate(**derived_values)

    rows = rows.rename({'rsid': 'variantId'})

    return rows.key_by().select(*FIELDS)
예제 #17
0
def combine_r(ts):
    """Merge the per-input ``data`` array of each row of ``ts`` into one entries array.

    A merge function is defined once per (row type, globals type) pair and
    cached in ``_merge_function_map``. It is applied to every row through a
    ``TableMapRows`` node, after which the per-input ``__cols`` arrays held
    in the ``g`` global are flattened into a single ``__cols`` global.
    """
    signature = (ts.row.dtype, ts.globals.dtype)

    if signature not in _merge_function_map:
        def merge_row(row, gbl):
            def entries_of(i):
                datum = row.data[i]
                # An input with no data at this row contributes one missing
                # entry per column of that input.
                placeholders = hl.range(0, hl.len(gbl.g[i].__cols)).map(
                    lambda _: hl.missing(datum.__entries.dtype.element_type))
                return hl.if_else(hl.is_missing(datum), placeholders, datum.__entries)

            return hl.struct(
                locus=row.locus,
                ref_allele=hl.find(hl.is_defined, row.data.map(lambda d: d.ref_allele)),
                __entries=hl.range(0, hl.len(row.data)).flatmap(entries_of))

        _merge_function_map[signature] = hl.experimental.define_function(
            merge_row, ts.row.dtype, ts.globals.dtype)

    merge_function = _merge_function_map[signature]
    apply_merge = Apply(merge_function._name,
                        merge_function._ret_type,
                        TopLevelReference('row'),
                        TopLevelReference('global'))
    ts = Table(TableMapRows(ts._tir, apply_merge))
    return ts.transmute_globals(__cols=hl.flatten(ts.g.map(lambda g: g.__cols)))
예제 #18
0
 def struct_from_min_rep(i):
     """Build the split-row struct for alternate allele ``i`` from its minimal representation.

     ``hl.min_rep`` is applied to the reference allele paired with allele
     ``i``. If the resulting locus equals ``ds.locus`` a struct with the
     locus, the two minimal alleles, ``a_index=i`` and ``was_split=True`` is
     returned. Otherwise the result is missing when ``filter_changed_loci``
     holds, and an error describing both representations is raised when it
     does not.
     """
     moved_row_type = hl.tstruct(locus=ds.locus.dtype,
                                 alleles=hl.tarray(hl.tstr),
                                 a_index=hl.tint,
                                 was_split=hl.tbool)

     def from_min_rep(mr):
         same_locus_row = hl.struct(locus=ds.locus,
                                    alleles=[mr.alleles[0], mr.alleles[1]],
                                    a_index=i,
                                    was_split=True)
         not_left_aligned = (
             "Found non-left-aligned variant in sparse_split_multi\n"
             + "old locus: " + hl.str(ds.locus) + "\n" + "old ref  : "
             + ds.alleles[0] + "\n" + "old alt  : " + ds.alleles[i] + "\n"
             + "mr locus : " + hl.str(mr.locus) + "\n" + "mr ref   : "
             + mr.alleles[0] + "\n" + "mr alt   : " + mr.alleles[1])
         return (hl.case()
                 .when(ds.locus == mr.locus, same_locus_row)
                 .when(filter_changed_loci, hl.missing(moved_row_type))
                 .or_error(not_left_aligned))

     return hl.bind(from_min_rep,
                    hl.min_rep(ds.locus, [ds.alleles[0], ds.alleles[i]]))
예제 #19
0
    def test_locus_windows(self):
        """Check ``locus_windows`` results and its error reporting for invalid inputs."""
        locus_windows = hl.linalg.utils.locus_windows

        def assert_eq(a, b):
            assert np.array_equal(a, np.array(b)), f"a={a}, b={b}"

        def assert_windows(windows, expected_starts, expected_stops):
            starts, stops = windows
            assert_eq(starts, expected_starts)
            assert_eq(stops, expected_stops)

        def assert_fails(exc_type, fragment, call):
            # `call` is a thunk so that its arguments are evaluated inside
            # the assertRaises context.
            with self.assertRaises(exc_type) as caught:
                call()
            assert fragment in str(caught.exception)

        centimorgans = hl.literal([0.1, 1.0, 1.0, 1.5, 1.9])

        mt = hl.balding_nichols_model(1, 5, 5).add_row_index()
        mt = mt.annotate_rows(cm=centimorgans[hl.int32(mt.row_idx)]).cache()

        assert_windows(locus_windows(mt.locus, 2),
                       [0, 0, 0, 1, 2], [3, 4, 5, 5, 5])
        assert_windows(locus_windows(mt.locus, 0.5, coord_expr=mt.cm),
                       [0, 1, 1, 1, 3], [1, 4, 4, 5, 5])
        assert_windows(locus_windows(mt.locus, 1.0, coord_expr=2 * centimorgans[hl.int32(mt.row_idx)]),
                       [0, 1, 1, 1, 3], [1, 4, 4, 5, 5])

        schema = hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64)
        rows = [{'locus': hl.Locus(contig, position), 'cm': cm_value}
                for contig, position, cm_value in [('1', 1, 1.0),
                                                   ('1', 2, 3.0),
                                                   ('1', 4, 4.0),
                                                   ('2', 1, 2.0),
                                                   ('2', 1, 2.0),
                                                   ('3', 3, 5.0)]]
        ht = hl.Table.parallelize(rows, schema, key=['locus'])

        assert_windows(locus_windows(ht.locus, 1),
                       [0, 0, 2, 3, 3, 5], [2, 2, 3, 5, 5, 6])
        assert_windows(locus_windows(ht.locus, 1.0, coord_expr=ht.cm),
                       [0, 1, 1, 3, 3, 5], [1, 3, 3, 5, 5, 6])

        assert_fails(HailUserError, 'ascending order',
                     lambda: locus_windows(ht.order_by(ht.cm).locus, 1.0))
        assert_fails(ExpressionException, 'different source',
                     lambda: locus_windows(ht.locus, 1.0, coord_expr=hl.utils.range_table(1).idx))
        assert_fails(ExpressionException, "no source",
                     lambda: locus_windows(hl.locus('1', 1), 1.0))
        assert_fails(ExpressionException, "no source",
                     lambda: locus_windows(ht.locus, 1.0, coord_expr=0.0))

        # Global (not row-indexed) expressions are rejected for both arguments.
        ht = ht.annotate_globals(x=hl.locus('1', 1), y=1.0)
        assert_fails(ExpressionException, "row-indexed",
                     lambda: locus_windows(ht.x, 1.0))
        assert_fails(ExpressionException, "row-indexed",
                     lambda: locus_windows(ht.locus, 1.0, ht.y))

        # A missing locus is reported with or without a coordinate expression.
        ht = hl.Table.parallelize([{'locus': hl.missing(hl.tlocus()), 'cm': 1.0}],
                                  schema, key=['locus'])
        assert_fails(HailUserError, "missing value for 'locus_expr'",
                     lambda: locus_windows(ht.locus, 1.0))
        assert_fails(HailUserError, "missing value for 'locus_expr'",
                     lambda: locus_windows(ht.locus, 1.0, coord_expr=ht.cm))

        # A missing coordinate is reported as well.
        ht = hl.Table.parallelize([{'locus': hl.Locus('1', 1), 'cm': hl.missing(hl.tfloat64)}],
                                  schema, key=['locus'])
        assert_fails(FatalError, "missing value for 'coord_expr'",
                     lambda: locus_windows(ht.locus, 1.0, coord_expr=ht.cm))
예제 #20
0
 def _summary_aggs(self):
     """Return a missing ``int32`` expression as the summary aggregation."""
     no_aggregation = hl.missing(hl.tint32)
     return no_aggregation
예제 #21
0
 def rewrite_var(v):
     """Reshape struct ``v`` to the fields of ``merged_schema``.

     Fields already present in ``v`` are kept; fields that are absent are
     filled with a missing value of the schema type.
     """
     selected = {}
     for name, typ in merged_schema.items():
         selected[name] = v[name] if name in v else hl.missing(typ)
     return v.select(**selected)
예제 #22
0
    lambda rows: rows.info.N_HOMALT,
    'gnomad_svs_ID':
    lambda rows: rows.info.gnomAD_V2_SVID,
    'gnomad_svs_AF':
    lambda rows: rows.info.gnomAD_V2_AF,
    'pos':
    lambda rows: rows.locus.position,
    'filters':
    lambda rows: hl.array(rows.filters.filter(lambda x: x != 'PASS')),
    'xpos':
    lambda rows: get_xpos(rows.locus.contig, rows.locus.position),
    'cpx_intervals':
    lambda rows: hl.if_else(
        hl.is_defined(rows.info.CPX_INTERVALS),
        rows.info.CPX_INTERVALS.map(lambda x: get_cpx_interval(x)),
        hl.missing(hl.dtype(INTERVAL_TYPE))),
}

DERIVED_FIELDS = {
    'xstart':
    lambda rows: rows.xpos,
    'xstop':
    lambda rows: hl.if_else(hl.is_defined(rows.info.END2),
                            get_xpos(rows.info.CHR2, rows.info.END2),
                            get_xpos(rows.locus.contig, rows.info.END)),
    'svType':
    lambda rows: rows.sv_type[0],
    'transcriptConsequenceTerms':
    lambda rows: [rows.sv_type[0]],
    'sv_type_detail':
    lambda rows: hl.if_else(
예제 #23
0
def full_outer_join_mt(left: hl.MatrixTable, right: hl.MatrixTable) -> hl.MatrixTable:
    """Performs a full outer join on `left` and `right`.

    Replaces row, column, and entry fields with the following:

     - `left_row` / `right_row`: structs of row fields from left and right.
     - `left_col` / `right_col`: structs of column fields from left and right.
     - `left_entry` / `right_entry`: structs of entry fields from left and right.

    Examples
    --------

    The following creates and joins two random datasets with disjoint sample ids
    but non-disjoint variant sets. We use :func:`.or_else` to attempt to find a
    non-missing genotype. If neither genotype is non-missing, then the genotype
    is set to missing. In particular, note that Samples `2` and `3` have missing
    genotypes for loci 1:1 and 1:2 because those loci are not present in `mt2`
    and these samples are not present in `mt1`

    >>> hl.set_global_seed(0)
    >>> mt1 = hl.balding_nichols_model(1, 2, 3)
    >>> mt2 = hl.balding_nichols_model(1, 2, 3)
    >>> mt2 = mt2.key_rows_by(locus=hl.locus(mt2.locus.contig,
    ...                                      mt2.locus.position+2),
    ...                       alleles=mt2.alleles)
    >>> mt2 = mt2.key_cols_by(sample_idx=mt2.sample_idx+2)
    >>> mt1.show()
    +---------------+------------+------+------+
    | locus         | alleles    | 0.GT | 1.GT |
    +---------------+------------+------+------+
    | locus<GRCh37> | array<str> | call | call |
    +---------------+------------+------+------+
    | 1:1           | ["A","C"]  | 0/1  | 0/1  |
    | 1:2           | ["A","C"]  | 1/1  | 1/1  |
    | 1:3           | ["A","C"]  | 0/0  | 0/0  |
    +---------------+------------+------+------+
    <BLANKLINE>
    >>> mt2.show()  # doctest: +SKIP_OUTPUT_CHECK
    +---------------+------------+------+------+
    | locus         | alleles    | 0.GT | 1.GT |
    +---------------+------------+------+------+
    | locus<GRCh37> | array<str> | call | call |
    +---------------+------------+------+------+
    | 1:3           | ["A","C"]  | 0/1  | 1/1  |
    | 1:4           | ["A","C"]  | 0/1  | 0/1  |
    | 1:5           | ["A","C"]  | 1/1  | 0/0  |
    +---------------+------------+------+------+
    <BLANKLINE>
    >>> mt3 = hl.experimental.full_outer_join_mt(mt1, mt2)
    >>> mt3 = mt3.select_entries(GT=hl.or_else(mt3.left_entry.GT, mt3.right_entry.GT))
    >>> mt3.show()
    +---------------+------------+------+------+------+------+
    | locus         | alleles    | 0.GT | 1.GT | 2.GT | 3.GT |
    +---------------+------------+------+------+------+------+
    | locus<GRCh37> | array<str> | call | call | call | call |
    +---------------+------------+------+------+------+------+
    | 1:1           | ["A","C"]  | 0/1  | 0/1  | NA   | NA   |
    | 1:2           | ["A","C"]  | 1/1  | 1/1  | NA   | NA   |
    | 1:3           | ["A","C"]  | 0/0  | 0/0  | 0/1  | 1/1  |
    | 1:4           | ["A","C"]  | NA   | NA   | 0/1  | 0/1  |
    | 1:5           | ["A","C"]  | NA   | NA   | 1/1  | 0/0  |
    +---------------+------------+------+------+------+------+
    <BLANKLINE>

    Parameters
    ----------
    left : :class:`.MatrixTable`
    right : :class:`.MatrixTable`

    Returns
    -------
    :class:`.MatrixTable`

    Raises
    ------
    ValueError
        If the row key types or the column key types of `left` and `right`
        differ.
    """

    def key_types(key):
        return [x.dtype for x in key.values()]

    def positions_by_key(cols, key_fields):
        # Map each column-key tuple to the list of column positions carrying it.
        key_tuples = cols.map(lambda x: hl.tuple([x[f] for f in key_fields]))
        grouped = hl.group_by(lambda t: t[0], hl.enumerate(key_tuples, index_first=False))
        return grouped.map_values(lambda elts: elts.map(lambda t: t[1]))

    def pair_positions(s):
        # Cross product when the key is on both sides; otherwise one struct
        # per position with the other side's index left missing.
        has_left = hl.is_defined(s.left_indices)
        has_right = hl.is_defined(s.right_indices)
        on_both_sides = hl.range(0, s.left_indices.length()).flatmap(
            lambda i: hl.range(0, s.right_indices.length()).map(
                lambda j: hl.struct(k=s.k, left_index=s.left_indices[i],
                                    right_index=s.right_indices[j])))
        left_only = s.left_indices.map(
            lambda elt: hl.struct(k=s.k, left_index=elt, right_index=hl.missing('int32')))
        right_only = s.right_indices.map(
            lambda elt: hl.struct(k=s.k, left_index=hl.missing('int32'), right_index=elt))
        return (hl.case()
                .when(has_left & has_right, on_both_sides)
                .when(has_left, left_only)
                .when(has_right, right_only)
                .or_error('assertion error'))

    if key_types(left.row_key) != key_types(right.row_key):
        raise ValueError(f"row key types do not match:\n"
                         f"  left:  {list(left.row_key.values())}\n"
                         f"  right: {list(right.row_key.values())}")

    if key_types(left.col_key) != key_types(right.col_key):
        raise ValueError(f"column key types do not match:\n"
                         f"  left:  {list(left.col_key.values())}\n"
                         f"  right: {list(right.col_key.values())}")

    # Nest each side's row fields under one struct, then localize the
    # entries and columns so that the join can be done on tables.
    left = left.select_rows(left_row=left.row)
    right = right.select_rows(right_row=right.row)
    left_t = left.localize_entries('left_entries', 'left_cols')
    right_t = right.localize_entries('right_entries', 'right_cols')

    ht = left_t.join(right_t, how='outer')
    ht = ht.annotate_globals(
        left_keys=positions_by_key(ht.left_cols, left.col_key),
        right_keys=positions_by_key(ht.right_cols, right.col_key))

    every_key = hl.array(ht.left_keys.key_set().union(ht.right_keys.key_set()))
    ht = ht.annotate_globals(
        key_indices=every_key
        .map(lambda k: hl.struct(k=k,
                                 left_indices=ht.left_keys.get(k),
                                 right_indices=ht.right_keys.get(k)))
        .flatmap(pair_positions))

    ht = ht.annotate(__entries=ht.key_indices.map(
        lambda s: hl.struct(left_entry=ht.left_entries[s.left_index],
                            right_entry=ht.right_entries[s.right_index])))
    ht = ht.annotate_globals(__cols=ht.key_indices.map(
        lambda s: hl.struct(**{f: s.k[i] for i, f in enumerate(left.col_key)},
                            left_col=ht.left_cols[s.left_index],
                            right_col=ht.right_cols[s.right_index])))

    ht = ht.drop('left_entries', 'left_cols', 'left_keys',
                 'right_entries', 'right_cols', 'right_keys',
                 'key_indices')
    return ht._unlocalize_entries('__entries', '__cols', list(left.col_key))
예제 #24
0
def de_novo(mt: MatrixTable,
            pedigree: Pedigree,
            pop_frequency_prior,
            *,
            min_gq: int = 20,
            min_p: float = 0.05,
            max_parent_ab: float = 0.05,
            min_child_ab: float = 0.20,
            min_dp_ratio: float = 0.10,
            ignore_in_sample_allele_frequency: bool = False) -> Table:
    r"""Call putative *de novo* events from trio data.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Call de novo events:

    >>> pedigree = hl.Pedigree.read('data/trios.fam')
    >>> priors = hl.import_table('data/gnomadFreq.tsv', impute=True)
    >>> priors = priors.transmute(**hl.parse_variant(priors.Variant)).key_by('locus', 'alleles')
    >>> de_novo_results = hl.de_novo(dataset, pedigree, pop_frequency_prior=priors[dataset.row_key].AF)

    Notes
    -----
    This method assumes the GATK high-throughput sequencing fields exist:
    `GT`, `AD`, `DP`, `GQ`, `PL`.

    This method replicates the functionality of `Kaitlin Samocha's de novo
    caller <https://github.com/ksamocha/de_novo_scripts>`__. The version
    corresponding to git commit ``bde3e40`` is implemented in Hail with her
    permission and assistance.

    This method produces a :class:`.Table` with the following fields:

     - `locus` (``locus``) -- Variant locus.
     - `alleles` (``array<str>``) -- Variant alleles.
     - `id` (``str``) -- Proband sample ID.
     - `prior` (``float64``) -- Site frequency prior. It is the maximum of:
       the computed dataset alternate allele frequency (with one alternate
       allele subtracted to correct for the observed genotype), the
       `pop_frequency_prior` parameter, and the minimum prior
       ``100 / 3e7``. If the `ignore_in_sample_allele_frequency` parameter is ``True``,
       then the computed allele frequency is not included in the calculation, and the
       prior is the maximum of the `pop_frequency_prior` and ``100 / 3e7``.
     - `proband` (``struct``) -- Proband column fields from `mt`.
     - `father` (``struct``) -- Father column fields from `mt`.
     - `mother` (``struct``) -- Mother column fields from `mt`.
     - `proband_entry` (``struct``) -- Proband entry fields from `mt`.
     - `father_entry` (``struct``) -- Father entry fields from `mt`.
     - `mother_entry` (``struct``) -- Mother entry fields from `mt`.
     - `is_female` (``bool``) -- ``True`` if proband is female.
     - `p_de_novo` (``float64``) -- Unfiltered posterior probability
       that the event is *de novo* rather than a missed heterozygous
       event in a parent.
     - `confidence` (``str``) -- Validation confidence. One of: ``'HIGH'``,
       ``'MEDIUM'``, ``'LOW'``.

    The key of the table is ``['locus', 'alleles', 'id']``.

    The model looks for de novo events in which both parents are homozygous
    reference and the proband is heterozygous. The model makes the simplifying
    assumption that when this configuration ``x = (AA, AA, AB)`` of calls
    occurs, exactly one of the following is true:

     - ``d``: a de novo mutation occurred in the proband and all calls are
       accurate.
     - ``m``: at least one parental allele is actually heterozygous and
       the proband call is accurate.

    We can then estimate the posterior probability of a de novo mutation as:

    .. math::

        \mathrm{P_{\text{de novo}}} = \frac{\mathrm{P}(d \mid x)}{\mathrm{P}(d \mid x) + \mathrm{P}(m \mid x)}

    Applying Bayes rule to the numerator and denominator yields

    .. math::

        \frac{\mathrm{P}(x \mid d)\,\mathrm{P}(d)}{\mathrm{P}(x \mid d)\,\mathrm{P}(d) +
        \mathrm{P}(x \mid m)\,\mathrm{P}(m)}

    The prior on de novo mutation is estimated from the rate in the literature:

    .. math::

        \mathrm{P}(d) = \frac{1 \, \text{mutation}}{30{,}000{,}000 \, \text{bases}}

    The prior used for at least one alternate allele between the parents
    depends on the alternate allele frequency:

    .. math::

        \mathrm{P}(m) = 1 - (1 - AF)^4

    The likelihoods :math:`\mathrm{P}(x \mid d)` and :math:`\mathrm{P}(x \mid m)`
    are computed from the PL (genotype likelihood) fields using these
    factorizations:

    .. math::
        \mathrm{P}(x = (AA, AA, AB) \mid d) = \left(
        \begin{aligned}
                &\mathrm{P}(x_{\mathrm{father}} = AA \mid \mathrm{father} = AA) \\
                {} \cdot {} &\mathrm{P}(x_{\mathrm{mother}} = AA \mid \mathrm{mother} = AA) \\
                {} \cdot {} &\mathrm{P}(x_{\mathrm{proband}} = AB \mid \mathrm{proband} = AB)
        \end{aligned}
        \right)

    .. math::
        \begin{aligned}
        \mathrm{P}(x = (AA, AA, AB) \mid m) = &\left(
            \begin{aligned}
                &\mathrm{P}(x_{\mathrm{father}} = AA \mid \mathrm{father} = AB)
                    \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \mid \mathrm{mother} = AA) \\
                {} + {} &\mathrm{P}(x_{\mathrm{father}} = AA \mid \mathrm{father} = AA)
                    \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \mid \mathrm{mother} = AB)
            \end{aligned}
        \right) \\
        &{} \cdot \mathrm{P}(x_{\mathrm{proband}} = AB \mid \mathrm{proband} = AB)
        \end{aligned}

    (Technically, the second factorization assumes there is exactly (rather
    than at least) one alternate allele among the parents, which may be
    justified on the grounds that it is typically the most likely case by far.)

    While this posterior probability is a good metric for grouping putative de
    novo mutations by validation likelihood, there exist error modes in
    high-throughput sequencing data that are not appropriately accounted for by
    the phred-scaled genotype likelihoods. To this end, a number of hard filters
    are applied in order to assign validation likelihood.

    These filters are different for SNPs and insertions/deletions. In the below
    rules, the following variables are used:

     - ``DR`` refers to the ratio of the read depth in the proband to the
       combined read depth in the parents.
     - ``DP`` refers to the read depth (DP field) of the proband.
     - ``AB`` refers to the read allele balance of the proband (number of
       alternate reads divided by total reads).
     - ``AC`` refers to the count of alternate alleles across all individuals
       in the dataset at the site (fixed at 1 when
       `ignore_in_sample_allele_frequency` is ``True``).
     - ``p`` refers to :math:`\mathrm{P_{\text{de novo}}}`.
     - ``min_p`` refers to the `min_p` function parameter.

    HIGH-quality SNV:

    .. code-block:: text

        (p > 0.99) AND (AB > 0.3) AND (AC == 1)
            OR
        (p > 0.99) AND (AB > 0.3) AND (DR > 0.2)
            OR
        (p > 0.5) AND (AB > 0.3) AND (AC < 10) AND (DP > 10)

    MEDIUM-quality SNV:

    .. code-block:: text

        (p > 0.5) AND (AB > 0.3)
            OR
        (p > 0.5) AND (AC == 1)

    LOW-quality SNV:

    .. code-block:: text

       (AB > 0.2)

    HIGH-quality indel:

    .. code-block:: text

        (p > 0.99) AND (AB > 0.3) AND (AC == 1)

    MEDIUM-quality indel:

    .. code-block:: text

        (p > 0.5) AND (AB > 0.3) AND (AC <= 5)

    LOW-quality indel:

    .. code-block:: text

       (AB > 0.2)

    Additionally, de novo candidates are not considered if the proband GQ is
    smaller than the `min_gq` parameter, if the proband allele balance is
    lower than the `min_child_ab` parameter, if the depth ratio between the
    proband and parents is smaller than the `min_dp_ratio` parameter, if
    the allele balance in a parent is above the `max_parent_ab` parameter, or
    if the posterior probability `p` is smaller than the `min_p` parameter.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        High-throughput sequencing dataset.
    pedigree : :class:`.Pedigree`
        Sample pedigree.
    pop_frequency_prior : :class:`.Float64Expression`
        Expression for population alternate allele frequency prior.
    min_gq
        Minimum proband GQ to be considered for *de novo* calling.
    min_p
        Minimum posterior probability to be considered for *de novo* calling.
    max_parent_ab
        Maximum parent allele balance.
    min_child_ab
        Minimum proband allele balance.
    min_dp_ratio
        Minimum ratio between proband read depth and parental read depth.
    ignore_in_sample_allele_frequency
        Ignore in-sample allele frequency in computing site prior. Experimental.

    Returns
    -------
    :class:`.Table`

    Raises
    ------
    ValueError
        If `mt` is missing any of the entry fields `GT`, `AD`, `DP`, `GQ`, `PL`.
    """
    # P(d): prior probability of a de novo mutation at a site.
    DE_NOVO_PRIOR = 1 / 30000000
    # Lower bound applied to the site frequency used for the parental-het prior.
    MIN_POP_PRIOR = 100 / 30000000

    required_entry_fields = {'GT', 'AD', 'DP', 'GQ', 'PL'}
    missing_fields = required_entry_fields - set(mt.entry)
    if missing_fields:
        raise ValueError(
            f"'de_novo': expected 'MatrixTable' to have at least {required_entry_fields}, "
            f"missing {missing_fields}")

    # Wrap the user-supplied prior so that an out-of-range value errors at
    # evaluation time instead of silently producing a nonsensical posterior.
    pop_frequency_prior = hl.case() \
        .when((pop_frequency_prior >= 0) & (pop_frequency_prior <= 1), pop_frequency_prior) \
        .or_error(hl.str("de_novo: expect 0 <= pop_frequency_prior <= 1, found " + hl.str(pop_frequency_prior)))

    if ignore_in_sample_allele_frequency:
        # this mode is used when families larger than a single trio are observed, in which
        # an allele might be de novo in a parent and transmitted to a child in the dataset.
        # The original model does not handle this case correctly, and so this experimental
        # mode can be used to treat each trio as if it were the only one in the dataset.
        mt = mt.annotate_rows(__prior=pop_frequency_prior,
                              __alt_alleles=hl.int64(1),
                              __site_freq=hl.max(pop_frequency_prior,
                                                 MIN_POP_PRIOR))
    else:
        n_alt_alleles = hl.agg.sum(mt.GT.n_alt_alleles())
        total_alleles = 2 * hl.agg.sum(hl.is_defined(mt.GT))
        # subtract 1 from __alt_alleles to correct for the observed genotype
        mt = mt.annotate_rows(__prior=pop_frequency_prior,
                              __alt_alleles=n_alt_alleles,
                              __site_freq=hl.max(
                                  (n_alt_alleles - 1) / total_alleles,
                                  pop_frequency_prior, MIN_POP_PRIOR))

    mt = require_biallelic(mt, 'de_novo')

    tm = trio_matrix(mt, pedigree, complete_trios=True)

    # Classify each (site, proband) pair by which calling model applies.
    autosomal = tm.locus.in_autosome_or_par() | (tm.locus.in_x_nonpar()
                                                 & tm.is_female)
    hemi_x = tm.locus.in_x_nonpar() & ~tm.is_female
    hemi_y = tm.locus.in_y_nonpar() & ~tm.is_female
    # NOTE(review): mitochondrial sites are only matched for female probands
    # here; confirm that restriction is intended.
    hemi_mt = tm.locus.in_mito() & tm.is_female

    is_snp = hl.is_snp(tm.alleles[0], tm.alleles[1])
    n_alt_alleles = tm.__alt_alleles
    prior = tm.__site_freq
    # Candidate configuration: proband het, both parents hom-ref.
    het_hom_hom = tm.proband_entry.GT.is_het() & tm.father_entry.GT.is_hom_ref(
    ) & tm.mother_entry.GT.is_hom_ref()
    kid_ad_fail = tm.proband_entry.AD[1] / hl.sum(
        tm.proband_entry.AD) < min_child_ab

    # Missing struct used as the "rejected candidate" result of every filter.
    failure = hl.missing(hl.tstruct(p_de_novo=hl.tfloat64, confidence=hl.tstr))

    kid = tm.proband_entry
    dad = tm.father_entry
    mom = tm.mother_entry

    # Convert phred-scaled PLs to linear scale, then normalize each sample's
    # values so that they sum to 1.
    kid_linear_pl = 10**(-kid.PL / 10)
    kid_pp = hl.bind(lambda x: x / hl.sum(x), kid_linear_pl)

    dad_linear_pl = 10**(-dad.PL / 10)
    dad_pp = hl.bind(lambda x: x / hl.sum(x), dad_linear_pl)

    mom_linear_pl = 10**(-mom.PL / 10)
    mom_pp = hl.bind(lambda x: x / hl.sum(x), mom_linear_pl)

    # Proband allele balance (AB) and proband/parents depth ratio (DR).
    kid_ad_ratio = kid.AD[1] / hl.sum(kid.AD)
    dp_ratio = kid.DP / (dad.DP + mom.DP)

    def call_auto(kid_pp, dad_pp, mom_pp, kid_ad_ratio):
        # Two-parent model: posterior from both parents' PLs, followed by the
        # hard filters and the indel / SNV confidence tiers.
        p_data_given_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior)**4
        p_data_given_missed_het = (dad_pp[1] * mom_pp[0] + dad_pp[0] *
                                   mom_pp[1]) * kid_pp[1] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn +
                                       p_data_given_missed_het)

        def solve(p_de_novo):
            return (hl.case().when(kid.GQ < min_gq, failure).when(
                (kid.DP / (dad.DP + mom.DP) < min_dp_ratio)
                | ~(kid_ad_ratio >= min_child_ab), failure).when(
                    (hl.sum(mom.AD) == 0) | (hl.sum(dad.AD) == 0),
                    failure).when(
                        (mom.AD[1] / hl.sum(mom.AD) > max_parent_ab)
                        | (dad.AD[1] / hl.sum(dad.AD) > max_parent_ab),
                        failure).when(p_de_novo < min_p, failure).when(
                            ~is_snp,
                            hl.case().when(
                                (p_de_novo > 0.99) & (kid_ad_ratio > 0.3) &
                                (n_alt_alleles == 1),
                                hl.struct(p_de_novo=p_de_novo,
                                          confidence='HIGH')).when(
                                              (p_de_novo > 0.5) &
                                              (kid_ad_ratio > 0.3) &
                                              (n_alt_alleles <= 5),
                                              hl.struct(
                                                  p_de_novo=p_de_novo,
                                                  confidence='MEDIUM')).when(
                                                      kid_ad_ratio > 0.2,
                                                      hl.struct(
                                                          p_de_novo=p_de_novo,
                                                          confidence='LOW')).
                            or_missing()).default(hl.case().when(
                                ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) &
                                 (dp_ratio > 0.2))
                                | ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) &
                                   (n_alt_alleles == 1))
                                | ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) &
                                   (n_alt_alleles < 10) & (kid.DP > 10)),
                                hl.struct(p_de_novo=p_de_novo,
                                          confidence='HIGH')).when(
                                              (p_de_novo > 0.5) &
                                              ((kid_ad_ratio > 0.3) |
                                               (n_alt_alleles == 1)),
                                              hl.struct(
                                                  p_de_novo=p_de_novo,
                                                  confidence='MEDIUM')).when(
                                                      kid_ad_ratio > 0.2,
                                                      hl.struct(
                                                          p_de_novo=p_de_novo,
                                                          confidence='LOW')).
                                                  or_missing()))

        return hl.bind(solve, p_de_novo)

    def call_hemi(kid_pp, parent, parent_pp, kid_ad_ratio):
        # Single-parent model: only `parent` contributes to the posterior and
        # to the depth and allele-balance filters.
        p_data_given_dn = parent_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior)**4
        p_data_given_missed_het = (parent_pp[1] +
                                   parent_pp[2]) * kid_pp[2] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn +
                                       p_data_given_missed_het)

        # NOTE(review): the LOW indel tier below requires AB > 0.3, whereas
        # call_auto uses AB > 0.2 -- confirm the difference is intended.
        # NOTE(review): `dp_ratio` in the HIGH SNV tier is the enclosing-scope
        # value kid.DP / (dad.DP + mom.DP), not kid.DP / parent.DP -- confirm.
        def solve(p_de_novo):
            return (hl.case().when(kid.GQ < min_gq, failure).when(
                (kid.DP / (parent.DP) < min_dp_ratio)
                | (kid_ad_ratio < min_child_ab),
                failure).when((hl.sum(parent.AD) == 0), failure).when(
                    parent.AD[1] / hl.sum(parent.AD) > max_parent_ab,
                    failure).when(p_de_novo < min_p, failure).when(
                        ~is_snp,
                        hl.case().when(
                            (p_de_novo > 0.99) & (kid_ad_ratio > 0.3) &
                            (n_alt_alleles == 1),
                            hl.struct(
                                p_de_novo=p_de_novo, confidence='HIGH')).when(
                                    (p_de_novo > 0.5) & (kid_ad_ratio > 0.3) &
                                    (n_alt_alleles <= 5),
                                    hl.struct(p_de_novo=p_de_novo,
                                              confidence='MEDIUM')).when(
                                                  kid_ad_ratio > 0.3,
                                                  hl.struct(
                                                      p_de_novo=p_de_novo,
                                                      confidence='LOW')).
                        or_missing()).default(hl.case().when(
                            ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) &
                             (dp_ratio > 0.2))
                            | ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) &
                               (n_alt_alleles == 1))
                            | ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) &
                               (n_alt_alleles < 10) & (kid.DP > 10)),
                            hl.struct(
                                p_de_novo=p_de_novo, confidence='HIGH')).when(
                                    (p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) |
                                                         (n_alt_alleles == 1)),
                                    hl.struct(p_de_novo=p_de_novo,
                                              confidence='MEDIUM')).when(
                                                  kid_ad_ratio > 0.2,
                                                  hl.struct(
                                                      p_de_novo=p_de_novo,
                                                      confidence='LOW')).
                                              or_missing()))

        return hl.bind(solve, p_de_novo)

    # Dispatch: autosomal sites use both parents; hemizygous X / mitochondrial
    # sites use the mother; hemizygous Y sites use the father. Anything else
    # yields a missing call.
    de_novo_call = (hl.case().when(~het_hom_hom | kid_ad_fail, failure).when(
        autosomal,
        hl.bind(call_auto, kid_pp, dad_pp, mom_pp, kid_ad_ratio)).when(
            hemi_x | hemi_mt,
            hl.bind(call_hemi, kid_pp, mom, mom_pp, kid_ad_ratio)).when(
                hemi_y, hl.bind(call_hemi, kid_pp, dad, dad_pp,
                                kid_ad_ratio)).or_missing())

    # Keep only entries with a defined call and flatten to one row per
    # (variant, proband); `__site_freq` is exposed to callers as `prior`.
    tm = tm.annotate_entries(__call=de_novo_call)
    tm = tm.filter_entries(hl.is_defined(tm.__call))
    entries = tm.entries()
    return (entries.select('__site_freq', 'proband', 'father', 'mother',
                           'proband_entry', 'father_entry', 'mother_entry',
                           'is_female',
                           **entries.__call).rename({'__site_freq': 'prior'}))
예제 #25
0
def combine(ts):
    """Merge the per-input data held in each row of `ts` into one combined row.

    As read by the code below, each row of `ts` carries a `locus` and a `data`
    array with one element per input (each with `alleles`, `rsid` and
    `__entries`), and the globals carry a parallel array `g` whose elements
    have a `__cols` array.

    The row-merging expression is compiled once per (row type, globals type)
    pair via ``hl.experimental.define_function`` and cached in the module-level
    `_merge_function_map`, then applied to every row through a `TableMapRows`
    IR node.

    Returns the table with rows replaced by
    ``struct(locus, alleles, rsid, __entries)`` and with a global `__cols`
    that is the concatenation of every ``g[i].__cols``.
    """
    def merge_alleles(alleles):
        """Unify the allele arrays of all inputs onto a common reference allele.

        The common reference is the longest first allele among the inputs
        (a missing reference counts as the empty string). For each input, alternate
        alleles classified as SNP, Insertion, Deletion, MNP or Complex are
        extended with the tail of the common reference that lies beyond that
        input's own reference; any other alternate is left unchanged.

        Returns a struct with `globl` (the common reference followed by the
        distinct remaining alleles) and `local` (the rewritten allele array of
        each input).
        """
        from hail.expr.functions import _num_allele_type, _allele_ints
        return hl.rbind(
            alleles.map(lambda a: hl.or_else(a[0], '')).fold(
                lambda s, t: hl.if_else(hl.len(s) > hl.len(t), s, t), ''),
            lambda ref: hl.rbind(
                alleles.map(lambda al: hl.rbind(
                    al[0], lambda r: hl.array([ref]).
                    extend(al[1:].map(lambda a: hl.rbind(
                        _num_allele_type(r, a), lambda at: hl.if_else(
                            (_allele_ints['SNP'] == at)
                            | (_allele_ints['Insertion'] == at)
                            | (_allele_ints['Deletion'] == at)
                            | (_allele_ints['MNP'] == at)
                            | (_allele_ints['Complex'] == at), a + ref[hl.len(
                                r):], a)))))), lambda lal: hl.
                struct(globl=hl.array([ref]).extend(
                    hl.array(hl.set(hl.flatten(lal)).remove(ref))),
                       local=lal)))

    def renumber_entry(entry, old_to_new) -> StructExpression:
        """Rewrite `entry.LA` by mapping each element through `old_to_new`."""
        # global index of alternate (non-ref) alleles
        return entry.annotate(LA=entry.LA.map(lambda lak: old_to_new[lak]))

    # Define the merge function only once per (row, globals) type pair.
    if (ts.row.dtype, ts.globals.dtype) not in _merge_function_map:
        # In the expression below:
        #  - `combined_allele_index` maps each merged allele to its position in
        #    `alleles.globl`;
        #  - `old_to_new[j]` is the merged index of input i's j-th local allele;
        #  - an input whose `__entries` is missing contributes one missing
        #    entry per element of `gbl.g[i].__cols`, which keeps the flattened
        #    entries the same length as the flattened columns;
        #  - `rsid` is the first defined rsid among the inputs.
        f = hl.experimental.define_function(
            lambda row, gbl: hl.rbind(
                merge_alleles(row.data.map(lambda d: d.alleles)), lambda
                alleles: hl.struct(
                    locus=row.locus,
                    alleles=alleles.globl,
                    rsid=hl.find(hl.is_defined, row.data.map(lambda d: d.rsid)
                                 ),
                    __entries=hl.bind(
                        lambda combined_allele_index: hl.range(
                            0, hl.len(row.data)).flatmap(lambda i: hl.if_else(
                                hl.is_missing(row.data[i].__entries),
                                hl.range(0, hl.len(gbl.g[i].__cols)).map(
                                    lambda _: hl.missing(row.data[i].__entries.
                                                         dtype.element_type)),
                                hl.bind(
                                    lambda old_to_new: row.data[i].__entries.
                                    map(lambda e: renumber_entry(
                                        e, old_to_new)),
                                    hl.range(0, hl.len(alleles.local[i])).map(
                                        lambda j: combined_allele_index[
                                            alleles.local[i][j]])))),
                        hl.dict(
                            hl.range(0, hl.len(alleles.globl)).map(
                                lambda j: hl.tuple([alleles.globl[j], j])))))),
            ts.row.dtype, ts.globals.dtype)
        _merge_function_map[(ts.row.dtype, ts.globals.dtype)] = f
    merge_function = _merge_function_map[(ts.row.dtype, ts.globals.dtype)]
    # Apply the cached function by name directly in the IR, passing the
    # top-level `row` and `global` references as its two arguments.
    ts = Table(
        TableMapRows(
            ts._tir,
            Apply(merge_function._name, merge_function._ret_type,
                  TopLevelReference('row'), TopLevelReference('global'))))
    return ts.transmute_globals(
        __cols=hl.flatten(ts.g.map(lambda g: g.__cols)))
예제 #26
0
def parse_as_doubles(string, has_non_ref):
    """Parse a ``|``-delimited string expression into an array of float64.

    When `has_non_ref` is true the last field is dropped before parsing.
    Fields that are empty or equal to ``'.'`` become missing values.
    """
    def to_float(field):
        is_blank = (hl.len(field) == 0) | (field == '.')
        return hl.if_else(is_blank, hl.missing(hl.tfloat64), hl.float64(field))

    fields = string.split(r'\|')
    kept = hl.if_else(has_non_ref, fields[:-1], fields)
    return kept.map(to_float)
예제 #27
0
 def get_allele_type(allele_idx):
     """Look up the type of allele `allele_idx` in ``mt[variant_atypes]``.

     That array is indexed by ``allele_idx - 1``; an index that is not
     positive yields a missing int32.
     """
     is_alt = allele_idx > 0
     alt_type = mt[variant_atypes][allele_idx - 1]
     return hl.if_else(is_alt, alt_type, hl.missing(hl.tint32))
예제 #28
0
    def test_reference_genome_liftover(self):
        """Exercise liftover registration and locus / interval liftover.

        Registers chr20 chain files in both directions between GRCh37 and
        GRCh38, then checks that:

        - the registered chain files are recorded on each reference genome;
        - lifting a VCF's loci to GRCh38 and back reproduces the original loci;
        - a table of known GRCh37 loci lifts to the expected GRCh38 loci, or
          to missing where no mapping is expected;
        - a lifted locus can be used as a table key;
        - locus intervals lift as expected.

        The liftovers are removed in a ``finally`` block so that a failing
        assertion does not leave them registered on the reference genomes,
        which would break the "no liftover yet" precondition on a later run.
        """
        grch37 = hl.get_reference('GRCh37')
        grch38 = hl.get_reference('GRCh38')

        self.assertTrue(not grch37.has_liftover('GRCh38')
                        and not grch38.has_liftover('GRCh37'))
        try:
            grch37.add_liftover(
                resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
            grch38.add_liftover(
                resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')
            assert grch37.has_liftover('GRCh38')
            assert grch38.has_liftover('GRCh37')
            # assertEqual, not the deprecated assertEquals alias, which no
            # longer exists on Python 3.12+.
            self.assertEqual(
                grch37._liftovers,
                {'GRCh38': resource('grch37_to_grch38_chr20.over.chain.gz')})
            self.assertEqual(
                grch38._liftovers,
                {'GRCh37': resource('grch38_to_grch37_chr20.over.chain.gz')})

            # Round trip: 37 -> 38 -> 37 must reproduce the original locus.
            ds = hl.import_vcf(resource('sample.vcf'))
            t = ds.annotate_rows(liftover=hl.liftover(
                hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
            assert t.all(t.locus == t.liftover)

            null_locus = hl.missing(hl.tlocus('GRCh38'))

            rows = [{
                'l37': hl.locus('20', 1, 'GRCh37'),
                'l38': null_locus
            }, {
                'l37': hl.locus('20', 60000, 'GRCh37'),
                'l38': null_locus
            }, {
                'l37': hl.locus('20', 60001, 'GRCh37'),
                'l38': hl.locus('chr20', 79360, 'GRCh38')
            }, {
                'l37': hl.locus('20', 278686, 'GRCh37'),
                'l38': hl.locus('chr20', 298045, 'GRCh38')
            }, {
                'l37': hl.locus('20', 278687, 'GRCh37'),
                'l38': hl.locus('chr20', 298046, 'GRCh38')
            }, {
                'l37': hl.locus('20', 278688, 'GRCh37'),
                'l38': null_locus
            }, {
                'l37': hl.locus('20', 278689, 'GRCh37'),
                'l38': null_locus
            }, {
                'l37': hl.locus('20', 278690, 'GRCh37'),
                'l38': null_locus
            }, {
                'l37': hl.locus('20', 278691, 'GRCh37'),
                'l38': hl.locus('chr20', 298047, 'GRCh38')
            }, {
                'l37': hl.locus('20', 37007586, 'GRCh37'),
                'l38': hl.locus('chr12', 32563117, 'GRCh38')
            }, {
                'l37': hl.locus('20', 62965520, 'GRCh37'),
                'l38': hl.locus('chr20', 64334167, 'GRCh38')
            }, {
                'l37': hl.locus('20', 62965521, 'GRCh37'),
                'l38': null_locus
            }]
            schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
            t = hl.Table.parallelize(rows, schema)
            # Rows with a defined expectation must lift exactly; rows with a
            # missing expectation must lift to missing.
            self.assertTrue(
                t.all(
                    hl.if_else(hl.is_defined(t.l38),
                               hl.liftover(t.l37, 'GRCh38') == t.l38,
                               hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

            t = t.filter(hl.is_defined(t.l38))
            self.assertEqual(t.count(), 6)

            # Keying by the lifted locus must evaluate and keep the key.
            t = t.key_by('l38')
            t.count()
            self.assertEqual(list(t.key), ['l38'])

            null_locus_interval = hl.missing(hl.tinterval(hl.tlocus('GRCh38')))
            rows = [{
                'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'),
                'i38': null_locus_interval
            }, {
                'i37':
                hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
                'i38':
                hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')
            }]
            schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)),
                                i38=hl.tinterval(hl.tlocus(grch38)))
            t = hl.Table.parallelize(rows, schema)
            self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))
        finally:
            # Only remove what was actually registered, so a failure inside
            # add_liftover is not masked by a second error raised here.
            if grch37.has_liftover('GRCh38'):
                grch37.remove_liftover("GRCh38")
            if grch38.has_liftover('GRCh37'):
                grch38.remove_liftover("GRCh37")
예제 #29
0
def main(args):
    """Run the seqr sample QC pipeline for one callset.

    Steps, in order: optionally convert the input VCF to a MatrixTable, read
    that MatrixTable back, normalize genotypes to unphased calls, optionally
    validate the data type, optionally restrict to chromosome 20 for test
    runs, annotate sample metadata and filter flags, assign a QC platform,
    project population PCs, run Hail's sample QC, and finally export the
    per-sample results as a Hail Table and a flattened TSV.

    :param args: Parsed command-line namespace. Attributes read here:
        ``data_type``, ``build``, ``data_source``, ``callset_version``,
        ``is_test``, ``overwrite``, ``skip_write_mt``, ``vcf_path``,
        ``skip_validate_mt``, ``callrate_low_threshold``,
        ``contam_up_threshold``, ``chimera_up_threshold``,
        ``wes_coverage_low_threshold``, ``wgs_coverage_low_threshold``,
        ``plat_min_cluster_size``, ``plat_min_sample_size`` and
        ``plat_assignment_pcs``.
    """
    hl.init(log="/seqr_sample_qc.log")
    # Flag needed for hail 0.2.93, may be able to remove in future release.
    hl._set_flags(no_whole_stage_codegen="1")
    logger.info("Beginning seqr sample QC pipeline...")

    data_type = args.data_type
    build = args.build
    data_source = args.data_source
    version = args.callset_version
    is_test = args.is_test
    overwrite = args.overwrite

    logger.info("Importing callset...")
    # The same location is written (when converting) and then read back.
    callset_mt_path = mt_path(build, data_type, data_source, version, is_test)
    if not args.skip_write_mt:
        logger.info("Converting vcf to MatrixTable...")
        # The result of write() is deliberately not bound to a name: the
        # MatrixTable is re-read from disk immediately below.
        hl.import_vcf(
            args.vcf_path,
            force_bgz=True,
            reference_genome=f"GRCh{build}",
            min_partitions=4,
        ).write(callset_mt_path, overwrite=True)
    mt = hl.read_matrix_table(callset_mt_path)

    # Rebuild every genotype as an unphased call; calls that are neither
    # diploid nor haploid become missing.
    mt = mt.annotate_entries(
        GT=hl.case()
        .when(mt.GT.is_diploid(), hl.call(mt.GT[0], mt.GT[1], phased=False))
        .when(mt.GT.is_haploid(), hl.call(mt.GT[0], phased=False))
        .default(hl.missing(hl.tcall))
    )
    if not args.skip_validate_mt:
        logger.info("Validating data type...")
        validate_mt(mt, build, data_type)

    if is_test:
        logger.info("Creating test mt...")
        # `build` is a plain Python value, so the contig name is chosen on
        # the Python side instead of through a Hail conditional expression.
        test_contig = "20" if build == "37" else "chr20"
        mt = hl.filter_intervals(
            mt,
            [
                hl.parse_locus_interval(
                    test_contig,
                    reference_genome=f"GRCh{build}",
                )
            ],
        ).persist()

    logger.info("Annotating with sequencing metrics and filtered callrate...")
    meta_ht = get_all_sample_metadata(mt, build, data_type, data_source, version)
    mt = mt.annotate_cols(**meta_ht[mt.col_key], data_type=data_type)

    logger.info("Annotating with sample metric filter flags...")
    metric_thresholds = {
        "callrate_thres": args.callrate_low_threshold,
        "contam_thres": args.contam_up_threshold,
        "chimera_thres": args.chimera_up_threshold,
        "wes_cov_thres": args.wes_coverage_low_threshold,
        "wgs_cov_thres": args.wgs_coverage_low_threshold,
    }
    mt = mt.annotate_cols(
        filter_flags=apply_filter_flags_expr(mt, data_type, metric_thresholds)
    )

    logger.info("Assign platform or product")
    if data_type == "WES" and data_source == "External":
        logger.info("Running platform imputation...")
        plat_ht = run_platform_imputation(
            mt,
            args.plat_min_cluster_size,
            args.plat_min_sample_size,
            args.plat_assignment_pcs,
        )
        mt = mt.annotate_cols(**plat_ht[mt.col_key])
    elif data_source == "Internal":
        logger.info("Assigning platform from product in metadata...")
        mt = mt.annotate_cols(
            qc_platform=hl.if_else(hl.is_defined(mt.PRODUCT), mt.PRODUCT, "Unknown")
        )

        # Export the keys of the samples whose PRODUCT is missing.
        missing_metrics = mt.filter_cols(hl.is_defined(mt.PRODUCT), keep=False)
        missing_metrics.cols().select().export(
            missing_metrics_path(build, data_type, data_source, version)
        )  # TODO Add logging step that prints unexpected missing samples
    else:
        mt = mt.annotate_cols(qc_platform="Unknown")

    logger.info("Projecting gnomAD population PCs...")
    pop_ht = run_population_pca(mt, build)
    mt = mt.annotate_cols(**pop_ht[mt.col_key])

    logger.info("Running Hail's sample qc...")
    hail_metric_ht = run_hail_sample_qc(mt, data_type)
    mt = mt.annotate_cols(**hail_metric_ht[mt.col_key])

    logger.info("Exporting sample QC tables...")
    ht = mt.cols()
    ht = ht.checkpoint(
        sample_qc_ht_path(build, data_type, data_source, version, is_test),
        overwrite=overwrite,
    )
    ht.flatten().export(sample_qc_tsv_path(build, data_type, data_source, version))
예제 #30
0
    def transform_entries(old_entry):
        """Build the new entry struct for one old entry with local-allele fields.

        Finds the position inside ``old_entry.LA`` that equals
        ``ds[new_id].a_index`` and uses it to derive ``GT``, ``PGT``, ``AD``,
        ``PL`` and ``GQ`` from whichever of ``LGT``, ``LPGT``, ``LAD`` and
        ``LPL`` are present; the ``L*`` source fields and ``LA`` are dropped
        from the result.

        ``ds`` and ``new_id`` are taken from the enclosing scope, which is not
        visible here.

        :param old_entry: Struct expression for the entry being transformed.
        :return: Struct expression for the transformed entry.
        """
        def with_local_a_index(local_a_index):
            """Build the transformed entry given the index into ``LA``.

            ``local_a_index`` is missing when no element of ``LA`` matched.
            """
            fields = set(old_entry.keys())

            def with_pl(pl):
                """Assemble the output struct; ``pl`` is the recomputed PL.

                ``pl`` is ``None`` when the entry has no ``LPL`` field, in
                which case it is never read.
                """
                # Insertion order here (GT, PGT, AD, PL, GQ) is the order in
                # which the fields are passed to `annotate` below.
                new_exprs = {}
                dropped_fields = ['LA']
                if 'LGT' in fields:
                    # Falls back to len(LA) when no local index was found.
                    # NOTE(review): presumably chosen because it matches no
                    # local allele, so every allele becomes ref -- confirm
                    # against hl.downcode.
                    new_exprs['GT'] = hl.downcode(
                        old_entry.LGT,
                        hl.or_else(local_a_index, hl.len(old_entry.LA)))
                    dropped_fields.append('LGT')
                if 'LPGT' in fields:
                    # Same downcoding as for LGT above.
                    new_exprs['PGT'] = hl.downcode(
                        old_entry.LPGT,
                        hl.or_else(local_a_index, hl.len(old_entry.LA)))
                    dropped_fields.append('LPGT')
                if 'LAD' in fields:
                    non_ref_ad = hl.or_else(old_entry.LAD[local_a_index],
                                            0)  # zeroed if not in LAD
                    # Two-element AD: everything except the selected allele,
                    # then the selected allele; missing when LAD is missing.
                    new_exprs['AD'] = hl.or_missing(
                        hl.is_defined(old_entry.LAD),
                        [hl.sum(old_entry.LAD) - non_ref_ad, non_ref_ad])
                    dropped_fields.append('LAD')
                if 'LPL' in fields:
                    new_exprs['PL'] = pl
                    if 'GQ' in fields:
                        # Prefer GQ derived from the new PL; keep the old GQ
                        # when that derivation is missing.
                        new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl),
                                                     old_entry.GQ)

                    dropped_fields.append('LPL')

                # Three cases, tested in order:
                #   1. The row has a single allele: copy each present L-field
                #      unchanged under its name without the leading 'L'.
                #   2. LGT is hom-ref (a missing LGT counts as False): GT/PGT
                #      are copied from LGT/LPGT unchanged, the other new
                #      fields use the expressions computed above.
                #   3. Otherwise: use all the computed expressions.
                # NOTE(review): case 2 reads old_entry.LGT unconditionally,
                # unlike the guarded `'LGT' in fields` use above -- confirm
                # LGT is always present.
                return (hl.case().when(
                    hl.len(ds.alleles) == 1,
                    old_entry.annotate(
                        **{
                            f[1:]: old_entry[f]
                            for f in ['LGT', 'LPGT', 'LAD', 'LPL']
                            if f in fields
                        }).drop(*dropped_fields)).when(
                            hl.or_else(old_entry.LGT.is_hom_ref(), False),
                            old_entry.annotate(
                                **{
                                    f: old_entry[f'L{f}'] if f in
                                    ['GT', 'PGT'] else e
                                    for f, e in new_exprs.items()
                                }).drop(*dropped_fields)).default(
                                    old_entry.annotate(**new_exprs).drop(
                                        *dropped_fields)))

            if 'LPL' in fields:
                # Missing when LPL or the local index is missing. Otherwise,
                # for each genotype index i in 0..2, take the minimum LPL over
                # all local genotype indices j (j < triangle(len(LA))) whose
                # call, downcoded on local_a_index, equals the call for i.
                new_pl = hl.or_missing(
                    hl.is_defined(old_entry.LPL),
                    hl.or_missing(
                        hl.is_defined(local_a_index),
                        hl.range(0, 3).map(lambda i: hl.min(
                            hl.range(0, hl.triangle(hl.len(old_entry.LA))).
                            filter(lambda j: hl.downcode(
                                hl.unphased_diploid_gt_index_call(j),
                                local_a_index) == hl.
                                   unphased_diploid_gt_index_call(i)).map(
                                       lambda idx: old_entry.LPL[idx])))))
                # Bind so the PL expression is shared by PL and GQ.
                return hl.bind(with_pl, new_pl)
            else:
                return with_pl(None)

        # Index into LA of the element equal to ds[new_id].a_index. Starts as
        # missing and is overwritten on every match, so it stays missing when
        # nothing matches and the last match wins if there are several.
        lai = hl.fold(
            lambda accum, elt: hl.if_else(
                old_entry.LA[elt] == ds[new_id].a_index, elt, accum),
            hl.missing(hl.tint32), hl.range(0, hl.len(old_entry.LA)))
        return hl.bind(with_local_a_index, lai)