예제 #1
0
    def test_trio_matrix(self):
        """Smoke-test ``methods.trio_matrix`` on the trio matrix fixtures.

        Annotates the imported VCF columns with the matching rows of the
        ``.fam`` table, builds the trio matrix restricted to complete trios
        and calls ``count_rows()`` on it. The count is discarded: the test
        only checks that the pipeline runs without raising.
        """
        fam_path = 'src/test/resources/triomatrix.fam'
        vcf_path = 'src/test/resources/triomatrix.vcf'

        pedigree = Pedigree.read(fam_path)

        from hail import KeyTable
        fam_keytable = KeyTable.import_fam(fam_path)
        fam_lookup = fam_keytable.to_hail2()

        samples = hc.import_vcf(vcf_path)
        # Attach each column's pedigree record, looked up by sample id.
        annotated = samples.annotate_cols(fam=fam_lookup[samples.s])

        trios = methods.trio_matrix(annotated, pedigree, complete_trios=True)
        trios.count_rows()
예제 #2
0
    def test_trio_matrix(self):
        """
        Check ``trio_matrix`` against a hand-built per-family aggregation.

        The expected tables are derived directly from the genotypes and
        samples tables by tagging every sample with a role (0 = proband,
        1 = father, 2 = mother) and keeping families with exactly three
        members. The same layout is then produced from the trio matrix and
        the two must be identical.

        The test relies on properties of the trio matrix VCF and pedigree.
        It is NOT valid for pedigrees that contain quads: ``trio_matrix``
        duplicates the parents as needed, but the independent
        genotypes_table / samples_table paths below would need an extra
        duplication/explode step that has not been written.
        """
        fam_path = 'src/test/resources/triomatrix.fam'
        pedigree = Pedigree.read(fam_path)
        fam_table = KeyTable.import_fam(fam_path)

        dataset = hc.import_vcf('src/test/resources/triomatrix.vcf')
        dataset = dataset.annotate_samples_table(fam_table, root='sa.fam')

        # Lookup tables keyed by parent id, used to flag samples as dad/mom.
        fathers = (fam_table.filter('isDefined(patID)')
                   .annotate('isDad = true')
                   .select(['patID', 'isDad'])
                   .key_by('patID'))
        mothers = (fam_table.filter('isDefined(matID)')
                   .annotate('isMom = true')
                   .select(['matID', 'isMom'])
                   .key_by('matID'))

        # After the left joins, unmatched samples have missing flags; turn
        # them into plain booleans.
        role_flags = 'isDad = isDefined(isDad), isMom = isDefined(isMom)'

        # test genotypes
        expected_gt = dataset.genotypes_table().key_by('s')
        expected_gt = expected_gt.join(fathers, how='left')
        expected_gt = expected_gt.join(mothers, how='left')
        expected_gt = expected_gt.annotate(role_flags)
        expected_gt = expected_gt.aggregate_by_key(
            'v = v, fam = sa.fam.famID',
            'data = g.map(g => {role: if (isDad) 1 else if (isMom) 2 else 0, g: g}).collect()')
        expected_gt = (expected_gt.filter('data.length() == 3')
                       .explode('data')
                       .select(['v', 'fam', 'data']))

        trio_gt = dataset.trio_matrix(pedigree, complete_trios=True)
        trio_gt = trio_gt.genotypes_table()
        trio_gt = trio_gt.annotate(
            'fam = sa.proband.annotations.fam.famID, data = [{role: 0, g: g.proband}, {role: 1, g: g.father}, {role: 2, g: g.mother}]')
        trio_gt = (trio_gt.select(['v', 'fam', 'data'])
                   .explode('data')
                   .filter('isDefined(data.g)')
                   .key_by(['v', 'fam']))

        self.assertTrue(expected_gt.same(trio_gt))

        # test annotations
        expected_sa = dataset.samples_table()
        expected_sa = expected_sa.join(fathers, how='left')
        expected_sa = expected_sa.join(mothers, how='left')
        expected_sa = expected_sa.annotate(role_flags)
        expected_sa = expected_sa.aggregate_by_key(
            'fam = sa.fam.famID',
            'data = sa.map(sa => {role: if (isDad) 1 else if (isMom) 2 else 0, sa: sa}).collect()')
        expected_sa = (expected_sa.filter('data.length() == 3')
                       .explode('data')
                       .select(['fam', 'data']))

        trio_sa = dataset.trio_matrix(pedigree, complete_trios=True)
        trio_sa = trio_sa.samples_table()
        trio_sa = trio_sa.annotate(
            'fam = sa.proband.annotations.fam.famID, data = [{role: 0, sa: sa.proband.annotations}, '
            '{role: 1, sa: sa.father.annotations}, '
            '{role: 2, sa: sa.mother.annotations}]')
        trio_sa = (trio_sa.select(['fam', 'data'])
                   .explode('data')
                   .filter('isDefined(data.sa)')
                   .key_by(['fam']))

        self.assertTrue(expected_sa.same(trio_sa))
예제 #3
0
    def test_keytable(self):
        """Exercise the KeyTable API on the sample annotation fixtures.

        Many calls are smoke tests: their result is discarded and the test
        only checks that the call does not raise. Explicit assertions cover
        table metadata, filtering, round-trips through a DataFrame and
        through disk, ``order_by`` with missing values, ``union``,
        ``take``/``head``, ``range``/``indexed`` and ``ungroup``/``group``.
        """
        test_resources = 'src/test/resources'

        # Import
        # columns: Sample Status qPhen
        kt = hc.import_table(test_resources + '/sampleAnnotations.tsv',
                             impute=True).key_by('Sample')
        kt2 = hc.import_table(test_resources + '/sampleAnnotations2.tsv',
                              impute=True).key_by('Sample')

        # Variables
        self.assertEqual(kt.num_columns, 3)
        self.assertEqual(kt.key[0], "Sample")
        self.assertEqual(kt.columns[2], "qPhen")
        self.assertEqual(kt.count(), 100)
        kt.schema  # accessed only to check that the attribute access succeeds

        # Export
        kt.export('/tmp/testExportKT.tsv')

        # Filter, Same
        # Keeping CASE rows and dropping CTRL rows are expected to agree.
        ktcase = kt.filter('Status == "CASE"', True)
        ktcase2 = kt.filter('Status == "CTRL"', False)
        self.assertTrue(ktcase.same(ktcase2))

        # Annotate
        (kt.annotate('X = Status').count())

        # Join
        kt.join(kt2, 'left').count()

        # AggregateByKey
        (kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()").count())

        # Forall, Exists
        self.assertFalse(kt.forall('Status == "CASE"'))
        self.assertTrue(kt.exists('Status == "CASE"'))

        # Rename: mapping form and full-list forms (results discarded)
        kt.rename({"Sample": "ID"})
        kt.rename(["Field1", "Field2", "Field3"])
        kt.rename([name + "_a" for name in kt.columns])

        # Select / drop accept a single name or a list of names
        kt.select("Sample")
        kt.select(["Sample", "Status"], qualified_name=True)

        kt.drop("Sample")
        kt.drop(["Sample", "Status"])

        kt.key_by(['Sample', 'Status'])
        kt.key_by([])

        kt.flatten()
        kt.expand_types()

        kt.to_dataframe().count()

        kt.show(10)
        kt.show(4, print_types=False, truncate_to=15)

        kt.annotate("newField = [0, 1, 2]").explode(["newField"])

        # Round-trip a flattened variants table through a DataFrame
        sample = hc.import_vcf(test_resources + '/sample.vcf')
        sample_variants = (sample.variants_table().annotate(
            'v = str(v), va.filters = va.filters.toArray()').flatten())

        sample_variants2 = KeyTable.from_dataframe(
            sample_variants.to_dataframe()).key_by('v')
        self.assertTrue(sample_variants.same(sample_variants2))

        # cseed: calculated by hand using sort -n -k 3,3 and inspection
        self.assertEqual(kt.filter('qPhen < 10000').count(), 23)

        # Round-trip through the on-disk format
        kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
        kt3 = hc.read_table('/tmp/sampleAnnotations.kt')
        self.assertTrue(kt.same(kt3))

        # test order_by
        # Some rows omit 'a' or 'b'; the expectations below place missing
        # values last for both ascending and descending order.
        schema = TStruct(['a', 'b'], [TInt32(), TString()])
        rows = [{
            'a': 5
        }, {
            'a': 5,
            'b': 'quam'
        }, {
            'a': -1,
            'b': 'quam'
        }, {
            'b': 'foo'
        }, {
            'a': 7,
            'b': 'baz'
        }]
        kt4 = KeyTable.parallelize(rows, schema, num_partitions=3)

        bya = [r.get('a') for r in kt4.order_by('a').collect()]
        self.assertEqual(bya, [-1, 5, 5, 7, None])

        bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
        self.assertEqual(bydesca, [7, 5, 5, -1, None])

        byab = [(r.get('a'), r.get('b'))
                for r in kt4.order_by('a', 'b').collect()]
        self.assertEqual(byab, [(-1, 'quam'), (5, 'quam'), (5, None),
                                (7, 'baz'), (None, 'foo')])

        bydescab = [(r.get('a'), r.get('b'))
                    for r in kt4.order_by(desc('a'), 'b').collect()]
        self.assertEqual(bydescab, [(7, 'baz'), (5, 'quam'), (5, None),
                                    (-1, 'quam'), (None, 'foo')])

        KeyTable.import_fam(test_resources + '/sample.fam')._typecheck()

        # Union with one and with two other tables
        self.assertEqual(kt.union(kt).count(), kt.count() * 2)
        self.assertEqual(kt.union(kt, kt).count(), kt.count() * 3)

        first3 = kt.take(3)
        self.assertEqual(first3[0].qPhen, 27704)
        self.assertEqual(first3[1].qPhen, 16636)
        self.assertEqual(first3[2].qPhen, 7256)
        self.assertEqual(first3[0].Sample, 'HG00096')
        self.assertEqual(first3[1].Sample, 'HG00097')
        self.assertEqual(first3[2].Sample, 'HG00099')
        self.assertTrue(all(x.Status == 'CASE' for x in first3))

        # assertEqual, not assertTrue: assertTrue's second argument is the
        # failure message, so it would accept any non-zero count.
        self.assertEqual(kt.head(3).count(), 3)

        # list(...) so the comparison also holds where range() is lazy.
        self.assertEqual(list(range(10)),
                         [x.idx for x in KeyTable.range(10).collect()])
        self.assertTrue(
            KeyTable.range(200).indexed('foo').forall('idx == foo'))

        # Ungrouping a struct column and regrouping it should be a no-op
        kt3 = KeyTable.parallelize([{
            'A': Struct(c1=5, c2=21)
        }], TStruct(['A'],
                    [TStruct(['c1', 'c2'], [TInt32(), TInt32()])]))

        self.assertTrue(kt3.ungroup('A').group('A', 'c1', 'c2').same(kt3))
예제 #4
0
    def test_keytable(self):
        """Exercise the KeyTable API on the sample annotation fixtures.

        Many calls are smoke tests: their result is discarded and the test
        only checks that the call does not raise. Explicit assertions cover
        table metadata, filtering, round-trips through a DataFrame and
        through disk, and ``order_by`` with missing values.
        """
        test_resources = 'src/test/resources'

        # Import
        # columns: Sample Status qPhen
        kt = hc.import_table(test_resources + '/sampleAnnotations.tsv',
                             impute=True).key_by('Sample')
        kt2 = hc.import_table(test_resources + '/sampleAnnotations2.tsv',
                              impute=True).key_by('Sample')

        # Variables
        self.assertEqual(kt.num_columns, 3)
        self.assertEqual(kt.key[0], "Sample")
        self.assertEqual(kt.columns[2], "qPhen")
        self.assertEqual(kt.count(), 100)
        kt.schema  # accessed only to check that the attribute access succeeds

        # Export
        kt.export('/tmp/testExportKT.tsv')

        # Filter, Same
        # Keeping CASE rows and dropping CTRL rows are expected to agree.
        ktcase = kt.filter('Status == "CASE"', True)
        ktcase2 = kt.filter('Status == "CTRL"', False)
        self.assertTrue(ktcase.same(ktcase2))

        # Annotate
        (kt.annotate('X = Status').count())

        # Join
        kt.join(kt2, 'left').count()

        # AggregateByKey
        (kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()").count())

        # Forall, Exists
        self.assertFalse(kt.forall('Status == "CASE"'))
        self.assertTrue(kt.exists('Status == "CASE"'))

        # Rename: mapping form and full-list forms (results discarded)
        kt.rename({"Sample": "ID"})
        kt.rename(["Field1", "Field2", "Field3"])
        kt.rename([name + "_a" for name in kt.columns])

        kt.select(["Sample"])
        kt.select(["Sample", "Status"])

        kt.key_by(['Sample', 'Status'])
        kt.key_by([])

        kt.flatten()
        kt.expand_types()

        kt.to_dataframe()

        kt.annotate("newField = [0, 1, 2]").explode(["newField"])

        # Round-trip a flattened variants table through a DataFrame
        sample = hc.import_vcf(test_resources + '/sample.vcf')
        sample_variants = (sample.variants_table().annotate(
            'v = str(v), va.filters = va.filters.toArray()').flatten())

        sample_variants2 = KeyTable.from_dataframe(
            sample_variants.to_dataframe()).key_by('v')
        self.assertTrue(sample_variants.same(sample_variants2))

        # cseed: calculated by hand using sort -n -k 3,3 and inspection
        # assertEqual reports the actual count when the expectation fails.
        self.assertEqual(kt.filter('qPhen < 10000').count(), 23)

        # Round-trip through the on-disk format
        kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
        kt3 = hc.read_table('/tmp/sampleAnnotations.kt')
        self.assertTrue(kt.same(kt3))

        # test order_by
        # Some rows omit 'a' or 'b'; the expectations below place missing
        # values last for both ascending and descending order.
        schema = TStruct(['a', 'b'], [TInt(), TString()])
        rows = [{
            'a': 5
        }, {
            'a': 5,
            'b': 'quam'
        }, {
            'a': -1,
            'b': 'quam'
        }, {
            'b': 'foo'
        }, {
            'a': 7,
            'b': 'baz'
        }]
        kt4 = KeyTable.from_py(hc, rows, schema, num_partitions=3)

        bya = [r.get('a') for r in kt4.order_by('a').collect()]
        self.assertEqual(bya, [-1, 5, 5, 7, None])

        bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
        self.assertEqual(bydesca, [7, 5, 5, -1, None])

        byab = [(r.get('a'), r.get('b'))
                for r in kt4.order_by('a', 'b').collect()]
        self.assertEqual(byab, [(-1, 'quam'), (5, 'quam'), (5, None),
                                (7, 'baz'), (None, 'foo')])

        bydescab = [(r.get('a'), r.get('b'))
                    for r in kt4.order_by(desc('a'), 'b').collect()]
        self.assertEqual(bydescab, [(7, 'baz'), (5, 'quam'), (5, None),
                                    (-1, 'quam'), (None, 'foo')])

        KeyTable.import_fam(test_resources + '/sample.fam')._typecheck()