Пример #1
0
    def test_keytable(self):
        """Exercise the KeyTable API against the bundled test resources.

        Covers import, column/key inspection, export, filter, annotate, join,
        aggregation, rename/select/drop, re-keying, flatten/expand_types,
        DataFrame round-trips, show, explode, write/read, order_by,
        import_fam, union, take/head, range/indexed and group/ungroup.

        Several calls are smoke tests only: their result is discarded and
        they pass as long as no exception is raised.
        """
        test_resources = 'src/test/resources'

        # Import
        # columns: Sample Status qPhen
        kt = hc.import_table(test_resources + '/sampleAnnotations.tsv',
                             impute=True).key_by('Sample')
        kt2 = hc.import_table(test_resources + '/sampleAnnotations2.tsv',
                              impute=True).key_by('Sample')

        # Variables
        self.assertEqual(kt.num_columns, 3)
        self.assertEqual(kt.key[0], "Sample")
        self.assertEqual(kt.columns[2], "qPhen")
        self.assertEqual(kt.count(), 100)
        kt.schema  # smoke test: attribute access must not raise

        # Export
        kt.export('/tmp/testExportKT.tsv')

        # Filter, Same
        # NOTE(review): the second positional argument is presumably the
        # "keep" flag, so both filters should retain the CASE rows -- confirm.
        ktcase = kt.filter('Status == "CASE"', True)
        ktcase2 = kt.filter('Status == "CTRL"', False)
        self.assertTrue(ktcase.same(ktcase2))

        # Annotate
        kt.annotate('X = Status').count()

        # Join
        kt.join(kt2, 'left').count()

        # AggregateByKey
        kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()").count()

        # Forall, Exists
        self.assertFalse(kt.forall('Status == "CASE"'))
        self.assertTrue(kt.exists('Status == "CASE"'))

        # Rename: by mapping, by full list of names, by derived names
        kt.rename({"Sample": "ID"})
        kt.rename(["Field1", "Field2", "Field3"])
        kt.rename([name + "_a" for name in kt.columns])

        # Select
        kt.select("Sample")
        kt.select(["Sample", "Status"], qualified_name=True)

        # Drop
        kt.drop("Sample")
        kt.drop(["Sample", "Status"])

        # Re-key, including an empty key
        kt.key_by(['Sample', 'Status'])
        kt.key_by([])

        kt.flatten()
        kt.expand_types()

        kt.to_dataframe().count()

        kt.show(10)
        kt.show(4, print_types=False, truncate_to=15)

        kt.annotate("newField = [0, 1, 2]").explode(["newField"])

        # Round-trip a variants table through a DataFrame and back
        sample = hc.import_vcf(test_resources + '/sample.vcf')
        sample_variants = (sample.variants_table().annotate(
            'v = str(v), va.filters = va.filters.toArray()').flatten())

        sample_variants2 = KeyTable.from_dataframe(
            sample_variants.to_dataframe()).key_by('v')
        self.assertTrue(sample_variants.same(sample_variants2))

        # cseed: calculated by hand using sort -n -k 3,3 and inspection
        # assertEqual (rather than assertTrue(x == 23)) reports the actual
        # count when the check fails.
        self.assertEqual(kt.filter('qPhen < 10000').count(), 23)

        # Write / read round-trip
        kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
        kt3 = hc.read_table('/tmp/sampleAnnotations.kt')
        self.assertTrue(kt.same(kt3))

        # test order_by
        # Rows deliberately omit 'a' and/or 'b' in places; the expected lists
        # below place the missing 'a' last for both ascending and descending.
        schema = TStruct(['a', 'b'], [TInt32(), TString()])
        rows = [{
            'a': 5
        }, {
            'a': 5,
            'b': 'quam'
        }, {
            'a': -1,
            'b': 'quam'
        }, {
            'b': 'foo'
        }, {
            'a': 7,
            'b': 'baz'
        }]
        kt4 = KeyTable.parallelize(rows, schema, num_partitions=3)

        bya = [r.get('a') for r in kt4.order_by('a').collect()]
        self.assertEqual(bya, [-1, 5, 5, 7, None])

        bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
        self.assertEqual(bydesca, [7, 5, 5, -1, None])

        byab = [(r.get('a'), r.get('b'))
                for r in kt4.order_by('a', 'b').collect()]
        self.assertEqual(byab, [(-1, 'quam'), (5, 'quam'), (5, None),
                                (7, 'baz'), (None, 'foo')])

        bydescab = [(r.get('a'), r.get('b'))
                    for r in kt4.order_by(desc('a'), 'b').collect()]
        self.assertEqual(bydescab, [(7, 'baz'), (5, 'quam'), (5, None),
                                    (-1, 'quam'), (None, 'foo')])

        KeyTable.import_fam(test_resources + '/sample.fam')._typecheck()

        # Union
        row_count = kt.count()
        self.assertEqual(kt.union(kt).count(), row_count * 2)
        self.assertEqual(kt.union(kt, kt).count(), row_count * 3)

        # Take
        first3 = kt.take(3)
        self.assertEqual(first3[0].qPhen, 27704)
        self.assertEqual(first3[1].qPhen, 16636)
        self.assertEqual(first3[2].qPhen, 7256)
        self.assertEqual(first3[0].Sample, 'HG00096')
        self.assertEqual(first3[1].Sample, 'HG00097')
        self.assertEqual(first3[2].Sample, 'HG00099')
        self.assertTrue(all(x.Status == 'CASE' for x in first3))

        # Head
        # Was assertTrue(count, 3), which treats 3 as the failure message and
        # passes for any non-zero count; assertEqual checks the value.
        self.assertEqual(kt.head(3).count(), 3)

        # Range, Indexed
        # list(...) keeps the comparison valid where range() is not a list.
        self.assertEqual(list(range(10)),
                         [x.idx for x in KeyTable.range(10).collect()])
        self.assertTrue(
            KeyTable.range(200).indexed('foo').forall('idx == foo'))

        # Ungroup / group round-trip on a single nested struct column
        nested = KeyTable.parallelize([{
            'A': Struct(c1=5, c2=21)
        }], TStruct(['A'],
                    [TStruct(['c1', 'c2'], [TInt32(), TInt32()])]))

        self.assertTrue(
            nested.ungroup('A').group('A', 'c1', 'c2').same(nested))
Пример #2
0
    def test_keytable(self):
        """Exercise the KeyTable API against the bundled test resources.

        Covers import, column/key inspection, export, filter, annotate, join,
        aggregation, rename/select, re-keying, flatten/expand_types,
        DataFrame round-trips, explode, write/read, order_by and import_fam.

        Several calls are smoke tests only: their result is discarded and
        they pass as long as no exception is raised.
        """
        test_resources = 'src/test/resources'

        # Import
        # columns: Sample Status qPhen
        kt = hc.import_table(test_resources + '/sampleAnnotations.tsv',
                             impute=True).key_by('Sample')
        kt2 = hc.import_table(test_resources + '/sampleAnnotations2.tsv',
                              impute=True).key_by('Sample')

        # Variables
        self.assertEqual(kt.num_columns, 3)
        self.assertEqual(kt.key[0], "Sample")
        self.assertEqual(kt.columns[2], "qPhen")
        self.assertEqual(kt.count(), 100)
        kt.schema  # smoke test: attribute access must not raise

        # Export
        kt.export('/tmp/testExportKT.tsv')

        # Filter, Same
        # NOTE(review): the second positional argument is presumably the
        # "keep" flag, so both filters should retain the CASE rows -- confirm.
        ktcase = kt.filter('Status == "CASE"', True)
        ktcase2 = kt.filter('Status == "CTRL"', False)
        self.assertTrue(ktcase.same(ktcase2))

        # Annotate
        kt.annotate('X = Status').count()

        # Join
        kt.join(kt2, 'left').count()

        # AggregateByKey
        kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()").count()

        # Forall, Exists
        self.assertFalse(kt.forall('Status == "CASE"'))
        self.assertTrue(kt.exists('Status == "CASE"'))

        # Rename: by mapping, by full list of names, by derived names
        kt.rename({"Sample": "ID"})
        kt.rename(["Field1", "Field2", "Field3"])
        kt.rename([name + "_a" for name in kt.columns])

        # Select
        kt.select(["Sample"])
        kt.select(["Sample", "Status"])

        # Re-key, including an empty key
        kt.key_by(['Sample', 'Status'])
        kt.key_by([])

        kt.flatten()
        kt.expand_types()

        kt.to_dataframe()

        kt.annotate("newField = [0, 1, 2]").explode(["newField"])

        # Round-trip a variants table through a DataFrame and back
        sample = hc.import_vcf(test_resources + '/sample.vcf')
        sample_variants = (sample.variants_table().annotate(
            'v = str(v), va.filters = va.filters.toArray()').flatten())

        sample_variants2 = KeyTable.from_dataframe(
            sample_variants.to_dataframe()).key_by('v')
        self.assertTrue(sample_variants.same(sample_variants2))

        # cseed: calculated by hand using sort -n -k 3,3 and inspection
        # assertEqual (rather than assertTrue(x == 23)) reports the actual
        # count when the check fails.
        self.assertEqual(kt.filter('qPhen < 10000').count(), 23)

        # Write / read round-trip
        kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
        kt3 = hc.read_table('/tmp/sampleAnnotations.kt')
        self.assertTrue(kt.same(kt3))

        # test order_by
        # Rows deliberately omit 'a' and/or 'b' in places; the expected lists
        # below place the missing 'a' last for both ascending and descending.
        schema = TStruct(['a', 'b'], [TInt(), TString()])
        rows = [{
            'a': 5
        }, {
            'a': 5,
            'b': 'quam'
        }, {
            'a': -1,
            'b': 'quam'
        }, {
            'b': 'foo'
        }, {
            'a': 7,
            'b': 'baz'
        }]
        kt4 = KeyTable.from_py(hc, rows, schema, num_partitions=3)

        bya = [r.get('a') for r in kt4.order_by('a').collect()]
        self.assertEqual(bya, [-1, 5, 5, 7, None])

        bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
        self.assertEqual(bydesca, [7, 5, 5, -1, None])

        byab = [(r.get('a'), r.get('b'))
                for r in kt4.order_by('a', 'b').collect()]
        self.assertEqual(byab, [(-1, 'quam'), (5, 'quam'), (5, None),
                                (7, 'baz'), (None, 'foo')])

        bydescab = [(r.get('a'), r.get('b'))
                    for r in kt4.order_by(desc('a'), 'b').collect()]
        self.assertEqual(bydescab, [(7, 'baz'), (5, 'quam'), (5, None),
                                    (-1, 'quam'), (None, 'foo')])

        KeyTable.import_fam(test_resources + '/sample.fam')._typecheck()