Exemplo n.º 1
0
 def make_call(gt):
     """Map a numeric genotype code to a diploid ``hl.Call``.

     ``0.0`` -> ``[0, 0]``, ``1.0`` -> ``[0, 1]``, ``2.0`` -> ``[1, 1]``.
     Any other value produces ``None``.
     """
     alleles_by_code = (
         (0.0, [0, 0]),
         (1.0, [0, 1]),
         (2.0, [1, 1]),
     )
     for code, alleles in alleles_by_code:
         # Equality (not hashing) is used so that ints and floats both match.
         if gt == code:
             return hl.Call(alleles)
     # Unrecognised codes fall through without building a call.
     return None
Exemplo n.º 2
0
def test_lgt_to_gt():
    """Check that ``hl.vds.lgt_to_gt`` rewrites local allele indices as global ones.

    Every case uses the local-alleles array ``[0, 3, 5]``; the expected call
    replaces each local index ``i`` with ``local_alleles[i]`` and keeps the
    phasing and ploidy of the input call.
    """
    local_alleles = [0, 3, 5]

    # Each pair is (local call expression, expected global call value).
    cases = [
        (hl.call(0, 0, phased=False), hl.Call([0, 0], phased=False)),
        (hl.call(0, 0, phased=True), hl.Call([0, 0], phased=True)),
        (hl.call(0, 1, phased=False), hl.Call([0, 3], phased=False)),
        (hl.call(2, 0, phased=True), hl.Call([5, 0], phased=True)),
        (hl.call(1, phased=False), hl.Call([3], phased=False)),
    ]

    # All conversions are evaluated together as a single tuple expression.
    converted = hl.eval(
        tuple(hl.vds.lgt_to_gt(local, local_alleles) for local, _ in cases))
    expected = tuple(wanted for _, wanted in cases)

    assert converted == expected
Exemplo n.º 3
0
    def test_errors(self):
        """Setting an arbitrary attribute on a Table must raise NotImplementedError."""
        row_type = hl.tstruct(status=hl.tint32, gt=hl.tcall, qPheno=hl.tint32)
        records = [
            {'status': 0, 'gt': hl.Call([0, 0]), 'qPheno': 3},
            {'status': 0, 'gt': hl.Call([0, 1]), 'qPheno': 13},
            {'status': 1, 'gt': hl.Call([0, 1]), 'qPheno': 20},
        ]
        kt = hl.Table.parallelize(records, row_type)

        # Attribute assignment is the operation under test.
        with self.assertRaises(NotImplementedError):
            kt.a = 5
Exemplo n.º 4
0
    def test_aggregate2(self):
        """Group a two-row table by ``status`` and check many aggregators at once.

        The first (and only) group is converted to a plain dict and compared
        against hard-coded expectations for every aggregator.
        """
        # Show the complete dict diff if the final comparison fails.
        self.maxDiff = None

        row_type = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32)
        kt = hl.Table.parallelize(
            [
                {'status': 0, 'GT': hl.Call([0, 0]), 'qPheno': 3},
                {'status': 0, 'GT': hl.Call([0, 1]), 'qPheno': 13},
            ],
            row_type,
        )

        # Constant struct collected by x15 (whole struct) and x16 (nested field).
        nested = hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple'))

        grouped = kt.group_by(status=kt.status).aggregate(
            x1=agg.collect(kt.qPheno * 2),
            x2=agg.explode(lambda elt: agg.collect(elt), [kt.qPheno, kt.qPheno + 1]),
            x3=agg.min(kt.qPheno),
            x4=agg.max(kt.qPheno),
            x5=agg.sum(kt.qPheno),
            x6=agg.product(hl.int64(kt.qPheno)),
            x7=agg.count(),
            x8=agg.count_where(kt.qPheno == 3),
            x9=agg.fraction(kt.qPheno == 1),
            x10=agg.stats(hl.float64(kt.qPheno)),
            x11=agg.hardy_weinberg_test(kt.GT),
            x13=agg.inbreeding(kt.GT, 0.1),
            x14=agg.call_stats(kt.GT, ["A", "T"]),
            x15=agg.collect(nested)[0],
            x16=agg.collect(nested.c.banana)[0],
            x17=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tarray(hl.tint32))),
            x18=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tset(hl.tint32))),
            x19=agg.take(kt.GT, 1, ordering=-kt.qPheno),
        )
        result = convert_struct_to_dict(grouped.take(1)[0])

        expected = {
            'status': 0,
            'x1': [6, 26],
            'x2': [3, 4, 13, 14],
            'x3': 3,
            'x4': 13,
            'x5': 16,
            'x6': 39,
            'x7': 2,
            'x8': 1,
            'x9': 0.0,
            'x10': {'min': 3.0, 'max': 13.0, 'sum': 16.0, 'stdev': 5.0, 'n': 2, 'mean': 8.0},
            'x11': {'het_freq_hwe': 0.5, 'p_value': 0.5},
            'x13': {'n_called': 2, 'expected_homs': 1.64, 'f_stat': -1.777777777777777,
                    'observed_homs': 1},
            'x14': {'AC': [3, 1], 'AF': [0.75, 0.25], 'AN': 4, 'homozygote_count': [1, 0]},
            'x15': {'a': 5, 'c': {'banana': 'apple'}, 'b': 'foo'},
            'x16': 'apple',
            'x17': [],
            'x18': [],
            'x19': [hl.Call([0, 1])],
        }

        self.assertDictEqual(result, expected)
Exemplo n.º 5
0
    def test_variant_qc(self):
        """Run ``hl.variant_qc`` on a hand-built two-variant dataset and check every field.

        The first variant is biallelic with one missing genotype; the second
        has three alleles and is fully called.
        """
        variant_1 = '1:1:A:T'
        variant_2 = '1:2:A:T,C'
        data = [
            {'v': variant_1, 's': '1', 'GT': hl.Call([0, 0]), 'GQ': 10, 'DP': 0},
            {'v': variant_1, 's': '2', 'GT': hl.Call([1, 1]), 'GQ': 10, 'DP': 5},
            {'v': variant_1, 's': '3', 'GT': hl.Call([0, 1]), 'GQ': 11, 'DP': 100},
            {'v': variant_1, 's': '4', 'GT': None, 'GQ': None, 'DP': 100},
            {'v': variant_2, 's': '1', 'GT': hl.Call([1, 2]), 'GQ': 10, 'DP': 5},
            {'v': variant_2, 's': '2', 'GT': hl.Call([2, 2]), 'GQ': 10, 'DP': 5},
            {'v': variant_2, 's': '3', 'GT': hl.Call([0, 1]), 'GQ': 10, 'DP': 5},
            {'v': variant_2, 's': '4', 'GT': hl.Call([1, 1]), 'GQ': 10, 'DP': 5},
        ]

        row_type = hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}')
        ht = hl.Table.parallelize(data, row_type)
        # Replace the variant string with the parsed locus/alleles fields.
        ht = ht.transmute(**hl.parse_variant(ht.v))
        mt = ht.to_matrix_table(['locus', 'alleles'], ['s'])
        mt = hl.variant_qc(mt, 'vqc')
        r = mt.rows().collect()

        # --- first collected row -------------------------------------------
        qc = r[0].vqc
        self.assertEqual(qc.AF, [0.5, 0.5])
        self.assertEqual(qc.AC, [3, 3])
        self.assertEqual(qc.AN, 6)
        self.assertEqual(qc.homozygote_count, [1, 1])
        self.assertEqual(qc.n_called, 3)
        self.assertEqual(qc.n_not_called, 1)
        self.assertEqual(qc.call_rate, 0.75)
        self.assertEqual(qc.n_het, 1)
        self.assertEqual(qc.n_non_ref, 2)
        self.assertEqual(qc.het_freq_hwe, 0.6)
        self.assertEqual(qc.p_value_hwe, 0.7)
        self.assertEqual(qc.dp_stats.min, 0)
        self.assertEqual(qc.dp_stats.max, 100)
        self.assertEqual(qc.dp_stats.mean, 51.25)
        self.assertAlmostEqual(qc.dp_stats.stdev, 48.782040752719645)
        self.assertEqual(qc.gq_stats.min, 10)
        self.assertEqual(qc.gq_stats.max, 11)
        self.assertAlmostEqual(qc.gq_stats.mean, 10.333333333333334)
        self.assertAlmostEqual(qc.gq_stats.stdev, 0.47140452079103168)

        # --- second collected row ------------------------------------------
        qc = r[1].vqc
        self.assertEqual(qc.AF, [0.125, 0.5, 0.375])
        self.assertEqual(qc.AC, [1, 4, 3])
        self.assertEqual(qc.AN, 8)
        self.assertEqual(qc.homozygote_count, [0, 1, 1])
        self.assertEqual(qc.n_called, 4)
        self.assertEqual(qc.n_not_called, 0)
        self.assertEqual(qc.call_rate, 1.0)
        self.assertEqual(qc.n_het, 2)
        self.assertEqual(qc.n_non_ref, 4)
        # The Hardy-Weinberg fields are expected to be missing for this row.
        self.assertEqual(qc.p_value_hwe, None)
        self.assertEqual(qc.het_freq_hwe, None)
        self.assertEqual(qc.dp_stats.min, 5)
        self.assertEqual(qc.dp_stats.max, 5)
        self.assertEqual(qc.dp_stats.mean, 5)
        self.assertEqual(qc.dp_stats.stdev, 0.0)
        self.assertEqual(qc.gq_stats.min, 10)
        self.assertEqual(qc.gq_stats.max, 10)
        self.assertEqual(qc.gq_stats.mean, 10)
        self.assertEqual(qc.gq_stats.stdev, 0)
Exemplo n.º 6
0
    def _convert_from_json(self, x):
        if x == '-':
            return hl.Call([])
        if x == '|-':
            return hl.Call([], phased=True)
        if x[0] == '|':
            return hl.Call([int(x[1:])], phased=True)

        n = len(x)
        i = 0
        while i < n:
            c = x[i]
            if c in '|/':
                break
            i += 1

        if i == n:
            return hl.Call([int(x)])

        return hl.Call([int(x[0:i]), int(x[i + 1:])], phased=(c == '|'))
Exemplo n.º 7
0
    def test_sample_qc(self):
        """Run ``hl.sample_qc`` on a single hand-built sample and check every field.

        The sample has six variants: five called genotypes (one of them
        haploid) and one missing genotype, with one missing GQ and one
        missing DP among them.
        """
        data = [
            {'v': '1:1:A:T', 's': '1', 'GT': hl.Call([0, 0]), 'GQ': 10, 'DP': 0},
            {'v': '1:2:A:T,C', 's': '1', 'GT': hl.Call([1]), 'GQ': 15, 'DP': 5},
            {'v': '1:3:A:G,C', 's': '1', 'GT': hl.Call([2, 2]), 'GQ': 10, 'DP': 4},
            {'v': '1:4:G:A', 's': '1', 'GT': hl.Call([0, 1]), 'GQ': None, 'DP': 5},
            {'v': '1:5:C:CG', 's': '1', 'GT': hl.Call([1, 1]), 'GQ': 20, 'DP': 3},
            {'v': '1:6:C:A', 's': '1', 'GT': None, 'GQ': 0, 'DP': None},
        ]

        ht = hl.Table.parallelize(data, hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}'))
        # Replace the variant string with the parsed locus/alleles fields.
        ht = ht.transmute(**hl.parse_variant(ht.v))
        mt = ht.to_matrix_table(['locus', 'alleles'], ['s'])
        mt = hl.sample_qc(mt, 'sqc')
        r = mt.cols().select('sqc').collect()

        self.assertAlmostEqual(r[0].sqc.gq_stats.mean, 11)
        self.assertAlmostEqual(r[0].sqc.gq_stats.stdev, 6.6332495807)
        self.assertAlmostEqual(r[0].sqc.gq_stats.min, 0)
        self.assertAlmostEqual(r[0].sqc.gq_stats.max, 20)
        self.assertAlmostEqual(r[0].sqc.dp_stats.mean, 3.399999999)
        self.assertAlmostEqual(r[0].sqc.dp_stats.stdev, 1.8547236990)
        self.assertAlmostEqual(r[0].sqc.dp_stats.min, 0)
        self.assertAlmostEqual(r[0].sqc.dp_stats.max, 5)
        self.assertAlmostEqual(r[0].sqc.call_rate, 0.8333333333)
        self.assertEqual(r[0].sqc.n_called, 5)
        self.assertEqual(r[0].sqc.n_not_called, 1)
        self.assertEqual(r[0].sqc.n_hom_ref, 1)
        self.assertEqual(r[0].sqc.n_het, 1)
        self.assertEqual(r[0].sqc.n_hom_var, 3)
        self.assertEqual(r[0].sqc.n_insertion, 2)
        self.assertEqual(r[0].sqc.n_deletion, 0)
        self.assertEqual(r[0].sqc.n_singleton, 3)
        self.assertEqual(r[0].sqc.n_transition, 1)
        self.assertEqual(r[0].sqc.n_transversion, 3)
        self.assertEqual(r[0].sqc.n_star, 0)
        self.assertEqual(r[0].sqc.n_non_ref, 4)
        self.assertAlmostEqual(r[0].sqc.r_ti_tv, 0.333333333)
        self.assertAlmostEqual(r[0].sqc.r_het_hom_var, 0.3333333333)
        # The ratio is expected to be missing (presumably because n_deletion
        # is 0 -- confirm against the sample_qc definition). assertIsNone
        # reports a clean failure for a non-None value, whereas
        # assertAlmostEqual(value, None) would raise TypeError on the
        # subtraction.
        self.assertIsNone(r[0].sqc.r_insertion_deletion)
Exemplo n.º 8
0
 def values(self):
     """Return ``(hail type, example Python value)`` pairs, one per covered type."""
     return [
         # Primitive types.
         (hl.tbool, True),
         (hl.tint32, 0),
         (hl.tint64, 0),
         (hl.tfloat32, 0.5),
         (hl.tfloat64, 0.5),
         (hl.tstr, "foo"),
         # Structured and container types.
         (hl.tstruct(x=hl.tint32), hl.Struct(x=0)),
         (hl.tarray(hl.tint32), [0, 1, 4]),
         (hl.tset(hl.tint32), {0, 1, 4}),
         (hl.tdict(hl.tstr, hl.tint32), {"a": 0, "b": 1, "c": 4}),
         (hl.tinterval(hl.tint32), hl.Interval(0, 1, True, False)),
         # Genomics-specific types.
         (hl.tlocus(hl.default_reference()), hl.Locus("1", 1)),
         (hl.tcall, hl.Call([0, 1])),
     ]
Exemplo n.º 9
0
def get_ldsim_dataset(n_variants=16, n_samples=4, n_contigs=2, seed=None):
    """Build a small random MatrixTable with one entry per (contig, variant, sample).

    Every contig ``1..n_contigs`` gets variants at positions ``1..n_variants``
    with alleles ``A``/``C`` and a constant ``cm`` of 0.1. Each genotype is a
    diploid call whose two alleles are drawn from ``{0, 1}`` using a
    ``numpy.random.RandomState`` seeded with ``seed``.

    Returns the result of ``add_default_plink_fields`` applied to the table.
    """
    rs = np.random.RandomState(seed)

    # The clause order (variant, sample, contig) fixes the sequence of RNG
    # draws, so it must not be rearranged.
    records = [
        {
            'v': f'{c+1}:{v+1}:A:C',
            's': f's{s+1:09d}',
            'cm': .1,
            'GT': hl.Call([rs.randint(0, 2), rs.randint(0, 2)]),
        }
        for v in range(n_variants)
        for s in range(n_samples)
        for c in range(n_contigs)
    ]

    row_type = hl.dtype('struct{v: str, s: str, cm: float64, GT: call}')
    ht = hl.Table.parallelize(records, row_type)
    # Replace the variant string with the parsed locus/alleles fields.
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(
        row_key=['locus', 'alleles'],
        col_key=['s'],
        row_fields=['cm'],
    )
    return add_default_plink_fields(mt)
Exemplo n.º 10
0
def get_plink_sim_dataset(n_variants=16, n_samples=4, n_contigs=2, seed=0):
    """Build a small random MatrixTable with each variant assigned to one contig.

    ``dividx(n_variants, n_contigs)`` supplies the contig index of every
    variant. Variant ``v`` sits at position ``v + 1`` with alleles ``A``/``C``
    and a constant ``cm`` of 0.1. Each genotype is a diploid call whose two
    alleles are drawn from ``{0, 1}`` using a ``numpy.random.RandomState``
    seeded with ``seed``.

    Returns the result of ``add_default_plink_fields`` applied to the table.
    """
    rs = np.random.RandomState(seed)

    contig_index = dividx(n_variants, n_contigs)
    assert contig_index.ndim == 1
    assert contig_index.size == n_variants

    data = []
    for v in range(n_variants):
        c = contig_index[v]
        # Samples are generated in order so the RNG draw sequence is stable.
        data.extend(
            {
                "v": f"{c+1}:{v+1}:A:C",
                "s": f"S{s+1:07d}",
                "cm": 0.1,
                "GT": hl.Call([rs.randint(0, 2), rs.randint(0, 2)]),
            }
            for s in range(n_samples)
        )

    row_type = hl.dtype("struct{v: str, s: str, cm: float64, GT: call}")
    ht = hl.Table.parallelize(data, row_type)
    # Replace the variant string with the parsed locus/alleles fields.
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(
        row_key=["locus", "alleles"],
        col_key=["s"],
        row_fields=["cm"],
    )
    return add_default_plink_fields(mt)
Exemplo n.º 11
0
def test_lgt_to_gt_invalid():
    """Check ``hl.vds.lgt_to_gt`` with a large global allele index.

    An unphased homozygous local call ``1/1`` with local alleles
    ``[0, 17495]`` must convert to the global call ``17495/17495``.
    """
    # NOTE(review): only the unphased call is asserted on. A phased 1|1 call
    # used to be constructed here but was never used; add an explicit phased
    # assertion if that behaviour should be covered.
    unphased = hl.call(1, 1)
    assert hl.eval(hl.vds.lgt_to_gt(unphased, [0, 17495])) == hl.Call([17495, 17495])
Exemplo n.º 12
0
 # Keep only the rows that fall inside the listed GRCh38 intervals.
 intervals = ["chr5"]
 mt_filtered = hl.filter_intervals(mt_filtered, [
     hl.parse_locus_interval(x, reference_genome='GRCh38')
     for x in intervals
 ])
 # Split rows on the first INFO/AC value: 2 for the "transmitted" set,
 # 1 for the "untransmitted" set.
 mt_trans = mt_filtered.filter_rows(mt_filtered.info.AC[0] == 2, keep=True)
 mt_untrans = mt_filtered.filter_rows(mt_filtered.info.AC[0] == 1,
                                      keep=True)
 # summarize() and show() print their own output and return None, so they
 # are called directly; wrapping them in print() would emit a stray "None".
 mt_filtered.info.AC.summarize()
 mt_filtered.info.AC.show()
 # Per family: count entries where the proband is heterozygous and exactly
 # one parent is heterozygous while the other is homozygous reference.
 mt_trans_count = mt_trans.group_cols_by(mt_trans.fam_id).aggregate(
     transmitted_singletons_count=hl.agg.count_where(
         (mt_trans.info.AC[0] == 2)
         & (mt_trans.proband_entry.GT == hl.Call([0, 1]))
         & (((mt_trans.father_entry.GT == hl.Call([0, 1])) &
             (mt_trans.mother_entry.GT == hl.Call([0, 0])))
            | ((mt_trans.mother_entry.GT == hl.Call([0, 1])) &
               (mt_trans.father_entry.GT == hl.Call([0, 0]))))))
 # Per family: count entries where the proband is homozygous reference and
 # either parent carries a [0, 1] / [1, 0] call.
 mt_untrans_count = mt_untrans.group_cols_by(mt_untrans.fam_id).aggregate(
     untransmitted_singletons_count=hl.agg.count_where(
         (mt_untrans.proband_entry.GT == hl.Call([0, 0]))
         & ((mt_untrans.father_entry.GT == hl.Call([0, 1]))
            | (mt_untrans.father_entry.GT == hl.Call([1, 0]))
            | (mt_untrans.mother_entry.GT == hl.Call([0, 1]))
            | (mt_untrans.mother_entry.GT == hl.Call([1, 0])))))
Exemplo n.º 13
0
    def test_concordance_n_discordant(self):
        """Check ``hl.concordance`` tables and their ``n_discordant`` counts.

        First, a dataset compared with itself must have no discordant columns.
        Then two small hand-built datasets are compared and the global,
        per-column and per-row concordance results are checked against fixed
        expectations.
        """
        dataset = get_dataset()
        _, cols_conc, _ = hl.concordance(dataset, dataset)
        n_cols_with_discordance = cols_conc.aggregate(
            hl.agg.count_where(cols_conc.n_discordant != 0))
        assert n_cols_with_discordance == 0

        def entry(position, sample, call):
            # One entry record at chromosome 1 with alleles A/T.
            return hl.Struct(locus=hl.Locus('1', position),
                             alleles=['A', 'T'],
                             s=sample,
                             GT=call)

        rows1 = [
            entry(100, '1', hl.Call([0, 0])),
            entry(100, '2', hl.Call([0, 0])),
            entry(100, '3', hl.Call([1, 1])),
            entry(100, '4', hl.Call([1, 1])),
            entry(101, '1', hl.Call([1, 1])),
        ]
        rows2 = [
            entry(100, '1', None),
            entry(100, '2', hl.Call([0, 1])),
            entry(100, '3', hl.Call([0, 1])),
            entry(100, '4', hl.Call([1, 1])),
        ]

        def make_mt(rows):
            # Key rows by variant and columns by sample id.
            ht = hl.Table.parallelize(
                rows,
                schema='struct{locus:locus<GRCh37>,alleles:array<str>,s:str,GT:call}')
            return ht.to_matrix_table(row_key=['locus', 'alleles'], col_key=['s'])

        left = make_mt(rows1)
        right = make_mt(rows2)
        global_conc_2, cols_conc_2, rows_conc_2 = hl.concordance(left, right)

        expected_cols = [
            hl.Struct(s='1',
                      concordance=[[0, 0, 0, 0, 0],
                                   [0, 0, 0, 0, 0],
                                   [0, 1, 0, 0, 0],
                                   [0, 0, 0, 0, 0],
                                   [1, 0, 0, 0, 0]],
                      n_discordant=0),
            hl.Struct(s='2',
                      concordance=[[1, 0, 0, 0, 0],
                                   [0, 0, 0, 0, 0],
                                   [0, 0, 0, 1, 0],
                                   [0, 0, 0, 0, 0],
                                   [0, 0, 0, 0, 0]],
                      n_discordant=1),
            hl.Struct(s='3',
                      concordance=[[1, 0, 0, 0, 0],
                                   [0, 0, 0, 0, 0],
                                   [0, 0, 0, 0, 0],
                                   [0, 0, 0, 0, 0],
                                   [0, 0, 0, 1, 0]],
                      n_discordant=1),
            hl.Struct(s='4',
                      concordance=[[1, 0, 0, 0, 0],
                                   [0, 0, 0, 0, 0],
                                   [0, 0, 0, 0, 0],
                                   [0, 0, 0, 0, 0],
                                   [0, 0, 0, 0, 1]],
                      n_discordant=0),
        ]
        assert cols_conc_2.collect() == expected_cols

        expected_global = [[3, 0, 0, 0, 0],
                           [0, 0, 0, 0, 0],
                           [0, 1, 0, 1, 0],
                           [0, 0, 0, 0, 0],
                           [1, 0, 0, 1, 1]]
        assert global_conc_2 == expected_global

        expected_rows = [
            hl.Struct(locus=hl.Locus('1', 100),
                      alleles=['A', 'T'],
                      concordance=[[0, 0, 0, 0, 0],
                                   [0, 0, 0, 0, 0],
                                   [0, 1, 0, 1, 0],
                                   [0, 0, 0, 0, 0],
                                   [0, 0, 0, 1, 1]],
                      n_discordant=2),
            hl.Struct(locus=hl.Locus('1', 101),
                      alleles=['A', 'T'],
                      concordance=[[3, 0, 0, 0, 0],
                                   [0, 0, 0, 0, 0],
                                   [0, 0, 0, 0, 0],
                                   [0, 0, 0, 0, 0],
                                   [1, 0, 0, 0, 0]],
                      n_discordant=0),
        ]
        assert rows_conc_2.collect() == expected_rows
0
def test_to_dense_mt():
    """Check ``hl.vds.to_dense_mt`` on the chr22 part of a two-sample VDS.

    Verifies that the dense matrix table has the same rows as the variant
    data, that the variant data can be recovered by keeping entries with a
    defined ``LA``, and that selected entries have the expected values
    (including entries that are missing).
    """
    vds = hl.vds.read_vds(
        os.path.join(resource('vds'), '1kg_2samples_starts.vds'))
    vds = hl.vds.filter_chromosomes(vds, keep='chr22')

    dense = hl.vds.to_dense_mt(vds).select_entries('LGT', 'LA', 'GQ', 'DP')

    assert dense.rows().select()._same(vds.variant_data.rows().select(
    )), "rows differ between variant data and dense mt"

    assert dense.filter_entries(hl.is_defined(dense.LA))._same(
        vds.variant_data.select_entries('LGT', 'LA', 'GQ',
                                        'DP')), "cannot recover variant data"

    # Index every entry by (locus string, sample id) for direct lookup below.
    as_dict = dense.aggregate_entries(
        hl.dict(
            hl.zip(hl.agg.collect((hl.str(dense.locus), dense.s)),
                   hl.agg.collect(dense.entry))))

    # Missing entries are compared by identity with None rather than with
    # "== None".
    assert as_dict.get(('chr22:10514784', 'NA12891')) is None
    assert as_dict.get(
        ('chr22:10514784', 'NA12878')) == hl.Struct(LGT=hl.Call([0, 1]),
                                                    LA=[0, 1],
                                                    GQ=23,
                                                    DP=4)

    assert as_dict.get(
        ('chr22:10516150', 'NA12891')) == hl.Struct(LGT=hl.Call([0, 1]),
                                                    LA=[0, 1],
                                                    GQ=64,
                                                    DP=4)
    assert as_dict.get(
        ('chr22:10516150', 'NA12878')) == hl.Struct(LGT=hl.Call([0, 1]),
                                                    LA=[0, 1],
                                                    GQ=99,
                                                    DP=10)

    assert as_dict.get(
        ('chr22:10519088', 'NA12891')) == hl.Struct(LGT=hl.Call([0, 1]),
                                                    LA=[0, 1],
                                                    GQ=99,
                                                    DP=21)
    assert as_dict.get(('chr22:10519088', 'NA12878')) is None

    # At the next two loci NA12878 has a 0/0 call with no LA, while NA12891
    # has a variant call.
    assert as_dict.get(
        ('chr22:10562435', 'NA12891')) == hl.Struct(LGT=hl.Call([0, 1]),
                                                    LA=[0, 1],
                                                    GQ=99,
                                                    DP=15)
    assert as_dict.get(
        ('chr22:10562435', 'NA12878')) == hl.Struct(LGT=hl.Call([0, 0]),
                                                    LA=None,
                                                    GQ=21,
                                                    DP=9)

    assert as_dict.get(
        ('chr22:10562436', 'NA12891')) == hl.Struct(LGT=hl.Call([0, 1]),
                                                    LA=[0, 1],
                                                    GQ=99,
                                                    DP=15)
    assert as_dict.get(
        ('chr22:10562436', 'NA12878')) == hl.Struct(LGT=hl.Call([0, 0]),
                                                    LA=None,
                                                    GQ=21,
                                                    DP=9)