def test_trio_matrix(self):
    """Smoke-test ``methods.trio_matrix`` on the trio matrix fixtures.

    Reads the pedigree from the FAM file, annotates the columns of the
    imported VCF with the matching FAM table rows, builds the trio matrix
    with ``complete_trios=True`` and calls ``count_rows()`` on it.  The
    count itself is not asserted; the test only requires that no step
    raises.
    """
    fam_path = 'src/test/resources/triomatrix.fam'
    vcf_path = 'src/test/resources/triomatrix.vcf'

    ped = Pedigree.read(fam_path)

    from hail import KeyTable
    fam_table = KeyTable.import_fam(fam_path).to_hail2()

    dataset = hc.import_vcf(vcf_path)
    # Attach each sample's FAM row under the ``fam`` column field.
    dataset = dataset.annotate_cols(fam=fam_table[dataset.s])

    tm = methods.trio_matrix(dataset, ped, complete_trios=True)
    tm.count_rows()
def test_trio_matrix(self):
    """Compare ``trio_matrix`` output with an independently derived table.

    The expected tables are built by joining the genotypes/samples tables
    against father and mother lookups taken from the FAM file, then
    aggregating per family.  This relies on properties of the trio matrix
    VCF and pedigree fixtures.

    The test is NOT valid for pedigrees containing quads: ``trio_matrix``
    duplicates the parents appropriately, but the orthogonal
    ``genotypes_table`` / ``samples_table`` paths used here would need an
    additional duplication/explode step that has not been written.
    """
    fam_path = 'src/test/resources/triomatrix.fam'

    ped = Pedigree.read(fam_path)
    famkt = KeyTable.import_fam(fam_path)

    vds = hc.import_vcf('src/test/resources/triomatrix.vcf')
    vds = vds.annotate_samples_table(famkt, root='sa.fam')

    # Lookup tables keyed by parent ID, flagging fathers and mothers.
    dads = famkt.filter('isDefined(patID)')
    dads = dads.annotate('isDad = true')
    dads = dads.select(['patID', 'isDad']).key_by('patID')

    moms = famkt.filter('isDefined(matID)')
    moms = moms.annotate('isMom = true')
    moms = moms.select(['matID', 'isMom']).key_by('matID')

    def with_role_flags(table):
        # Left-join both parent lookups, then collapse the possibly-missing
        # flags into plain booleans.
        joined = table.join(dads, how='left').join(moms, how='left')
        return joined.annotate(
            'isDad = isDefined(isDad), isMom = isDefined(isMom)')

    # test genotypes
    expected_genotypes = with_role_flags(vds.genotypes_table().key_by('s'))
    expected_genotypes = expected_genotypes.aggregate_by_key(
        'v = v, fam = sa.fam.famID',
        'data = g.map(g => {role: if (isDad) 1 else if (isMom) 2 else 0, '
        'g: g}).collect()')
    # Only families with all three members form a complete trio.
    expected_genotypes = expected_genotypes.filter('data.length() == 3')
    gkt = expected_genotypes.explode('data').select(['v', 'fam', 'data'])

    trio_genotypes = vds.trio_matrix(ped, complete_trios=True).genotypes_table()
    trio_genotypes = trio_genotypes.annotate(
        'fam = sa.proband.annotations.fam.famID, '
        'data = [{role: 0, g: g.proband}, {role: 1, g: g.father}, '
        '{role: 2, g: g.mother}]')
    trio_genotypes = trio_genotypes.select(['v', 'fam', 'data']).explode('data')
    tkt = trio_genotypes.filter('isDefined(data.g)').key_by(['v', 'fam'])

    self.assertTrue(gkt.same(tkt))

    # test annotations
    expected_samples = with_role_flags(vds.samples_table())
    expected_samples = expected_samples.aggregate_by_key(
        'fam = sa.fam.famID',
        'data = sa.map(sa => {role: if (isDad) 1 else if (isMom) 2 else 0, '
        'sa: sa}).collect()')
    expected_samples = expected_samples.filter('data.length() == 3')
    g_sa = expected_samples.explode('data').select(['fam', 'data'])

    trio_samples = vds.trio_matrix(ped, complete_trios=True).samples_table()
    trio_samples = trio_samples.annotate(
        'fam = sa.proband.annotations.fam.famID, '
        'data = [{role: 0, sa: sa.proband.annotations}, '
        '{role: 1, sa: sa.father.annotations}, '
        '{role: 2, sa: sa.mother.annotations}]')
    trio_samples = trio_samples.select(['fam', 'data']).explode('data')
    t_sa = trio_samples.filter('isDefined(data.sa)').key_by(['fam'])

    self.assertTrue(g_sa.same(t_sa))
def test_keytable(self):
    """Exercise the KeyTable API end to end on the sample annotation fixtures.

    Covers import, basic properties, export, filter/same, annotate, join,
    aggregate_by_key, forall/exists, rename/select/drop/key_by, flatten,
    expand_types, DataFrame round-tripping, show, explode, write/read,
    order_by, import_fam, union, take/head, range/indexed and
    ungroup/group.  Many calls are smoke checks whose results are discarded;
    they only need to run without raising.
    """
    test_resources = 'src/test/resources'

    # Import
    # columns: Sample Status qPhen
    kt = hc.import_table(test_resources + '/sampleAnnotations.tsv',
                         impute=True).key_by('Sample')
    kt2 = hc.import_table(test_resources + '/sampleAnnotations2.tsv',
                          impute=True).key_by('Sample')

    # Variables
    self.assertEqual(kt.num_columns, 3)
    self.assertEqual(kt.key[0], "Sample")
    self.assertEqual(kt.columns[2], "qPhen")
    self.assertEqual(kt.count(), 100)
    kt.schema

    # Export
    kt.export('/tmp/testExportKT.tsv')

    # Filter, Same
    ktcase = kt.filter('Status == "CASE"', True)
    ktcase2 = kt.filter('Status == "CTRL"', False)
    self.assertTrue(ktcase.same(ktcase2))

    # Annotate
    kt.annotate('X = Status').count()

    # Join
    kt.join(kt2, 'left').count()

    # AggregateByKey
    kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()").count()

    # Forall, Exists
    self.assertFalse(kt.forall('Status == "CASE"'))
    self.assertTrue(kt.exists('Status == "CASE"'))

    # Smoke checks: results are intentionally discarded.
    kt.rename({"Sample": "ID"})
    kt.rename(["Field1", "Field2", "Field3"])
    kt.rename([name + "_a" for name in kt.columns])

    kt.select("Sample")
    kt.select(["Sample", "Status"], qualified_name=True)

    kt.drop("Sample")
    kt.drop(["Sample", "Status"])

    kt.key_by(['Sample', 'Status'])
    kt.key_by([])

    kt.flatten()
    kt.expand_types()

    kt.to_dataframe().count()

    kt.show(10)
    kt.show(4, print_types=False, truncate_to=15)

    kt.annotate("newField = [0, 1, 2]").explode(["newField"])

    # DataFrame round trip must preserve the table contents.
    sample = hc.import_vcf(test_resources + '/sample.vcf')
    sample_variants = (sample.variants_table()
                       .annotate('v = str(v), va.filters = va.filters.toArray()')
                       .flatten())

    sample_variants2 = KeyTable.from_dataframe(
        sample_variants.to_dataframe()).key_by('v')
    self.assertTrue(sample_variants.same(sample_variants2))

    # cseed: calculated by hand using sort -n -k 3,3 and inspection
    # assertEqual (rather than assertTrue(x == 23)) reports the actual count
    # when the check fails.
    self.assertEqual(kt.filter('qPhen < 10000').count(), 23)

    kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
    kt3 = hc.read_table('/tmp/sampleAnnotations.kt')
    self.assertTrue(kt.same(kt3))

    # test order_by
    schema = TStruct(['a', 'b'], [TInt32(), TString()])
    rows = [{'a': 5},
            {'a': 5, 'b': 'quam'},
            {'a': -1, 'b': 'quam'},
            {'b': 'foo'},
            {'a': 7, 'b': 'baz'}]
    kt4 = KeyTable.parallelize(rows, schema, num_partitions=3)

    bya = [r.get('a') for r in kt4.order_by('a').collect()]
    self.assertEqual(bya, [-1, 5, 5, 7, None])

    bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
    self.assertEqual(bydesca, [7, 5, 5, -1, None])

    byab = [(r.get('a'), r.get('b'))
            for r in kt4.order_by('a', 'b').collect()]
    self.assertEqual(byab, [(-1, 'quam'), (5, 'quam'), (5, None),
                            (7, 'baz'), (None, 'foo')])

    bydescab = [(r.get('a'), r.get('b'))
                for r in kt4.order_by(desc('a'), 'b').collect()]
    self.assertEqual(bydescab, [(7, 'baz'), (5, 'quam'), (5, None),
                                (-1, 'quam'), (None, 'foo')])

    KeyTable.import_fam(test_resources + '/sample.fam')._typecheck()

    # Union
    self.assertEqual(kt.union(kt).count(), kt.count() * 2)
    self.assertEqual(kt.union(kt, kt).count(), kt.count() * 3)

    # Take / head
    first3 = kt.take(3)
    self.assertEqual(first3[0].qPhen, 27704)
    self.assertEqual(first3[1].qPhen, 16636)
    self.assertEqual(first3[2].qPhen, 7256)
    self.assertEqual(first3[0].Sample, 'HG00096')
    self.assertEqual(first3[1].Sample, 'HG00097')
    self.assertEqual(first3[2].Sample, 'HG00099')
    self.assertTrue(all(x.Status == 'CASE' for x in first3))

    # Previously assertTrue(count, 3): the 3 was taken as the failure
    # message, so the count was never compared against it.
    self.assertEqual(kt.head(3).count(), 3)

    # list(...) keeps the comparison valid where range() is not a list.
    self.assertEqual(list(range(10)),
                     [x.idx for x in KeyTable.range(10).collect()])
    self.assertTrue(
        KeyTable.range(200).indexed('foo').forall('idx == foo'))

    # Ungroup followed by group must reproduce the original struct column.
    kt3 = KeyTable.parallelize(
        [{'A': Struct(c1=5, c2=21)}],
        TStruct(['A'], [TStruct(['c1', 'c2'], [TInt32(), TInt32()])]))

    self.assertTrue(kt3.ungroup('A').group('A', 'c1', 'c2').same(kt3))
def test_keytable(self):
    """Run the KeyTable operations against the sample annotation fixtures.

    Checks basic table properties, filter/same, forall/exists, a DataFrame
    round trip, write/read and ``order_by`` orderings.  The remaining
    calls (export, annotate, join, aggregate, rename, select, key_by,
    flatten, expand_types, explode, import_fam) are smoke checks whose
    results are discarded.
    """
    resource_dir = 'src/test/resources'

    # Import -- columns: Sample Status qPhen
    kt = hc.import_table(resource_dir + '/sampleAnnotations.tsv', impute=True)
    kt = kt.key_by('Sample')
    kt2 = hc.import_table(resource_dir + '/sampleAnnotations2.tsv', impute=True)
    kt2 = kt2.key_by('Sample')

    # Variables
    self.assertEqual(kt.num_columns, 3)
    self.assertEqual(kt.key[0], "Sample")
    self.assertEqual(kt.columns[2], "qPhen")
    self.assertEqual(kt.count(), 100)
    kt.schema

    # Export
    kt.export('/tmp/testExportKT.tsv')

    # Filter, Same: keeping CASE rows should match dropping CTRL rows.
    kept_cases = kt.filter('Status == "CASE"', True)
    dropped_ctrls = kt.filter('Status == "CTRL"', False)
    self.assertTrue(kept_cases.same(dropped_ctrls))

    # Annotate
    kt.annotate('X = Status').count()

    # Join
    kt.join(kt2, 'left').count()

    # AggregateByKey
    kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()").count()

    # Forall, Exists
    self.assertFalse(kt.forall('Status == "CASE"'))
    self.assertTrue(kt.exists('Status == "CASE"'))

    # Smoke checks: results are discarded.
    kt.rename({"Sample": "ID"})
    kt.rename(["Field1", "Field2", "Field3"])
    kt.rename([name + "_a" for name in kt.columns])

    kt.select(["Sample"])
    kt.select(["Sample", "Status"])

    kt.key_by(['Sample', 'Status'])
    kt.key_by([])

    kt.flatten()
    kt.expand_types()

    kt.to_dataframe()

    kt.annotate("newField = [0, 1, 2]").explode(["newField"])

    # DataFrame round trip
    sample = hc.import_vcf(resource_dir + '/sample.vcf')
    sample_variants = sample.variants_table().annotate(
        'v = str(v), va.filters = va.filters.toArray()')
    sample_variants = sample_variants.flatten()

    round_tripped = KeyTable.from_dataframe(sample_variants.to_dataframe())
    sample_variants2 = round_tripped.key_by('v')
    self.assertTrue(sample_variants.same(sample_variants2))

    # cseed: calculated by hand using sort -n -k 3,3 and inspection
    low_qphen_count = kt.filter('qPhen < 10000').count()
    self.assertTrue(low_qphen_count == 23)

    # Write / read
    kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
    kt3 = hc.read_table('/tmp/sampleAnnotations.kt')
    self.assertTrue(kt.same(kt3))

    # test order_by
    schema = TStruct(['a', 'b'], [TInt(), TString()])
    rows = [
        {'a': 5},
        {'a': 5, 'b': 'quam'},
        {'a': -1, 'b': 'quam'},
        {'b': 'foo'},
        {'a': 7, 'b': 'baz'},
    ]
    kt4 = KeyTable.from_py(hc, rows, schema, num_partitions=3)

    def a_values(ordered):
        # Project the 'a' field of every collected row.
        return [r.get('a') for r in ordered.collect()]

    def ab_pairs(ordered):
        # Project ('a', 'b') pairs of every collected row.
        return [(r.get('a'), r.get('b')) for r in ordered.collect()]

    bya = a_values(kt4.order_by('a'))
    self.assertEqual(bya, [-1, 5, 5, 7, None])

    bydesca = a_values(kt4.order_by(desc('a')))
    self.assertEqual(bydesca, [7, 5, 5, -1, None])

    byab = ab_pairs(kt4.order_by('a', 'b'))
    self.assertEqual(byab, [(-1, 'quam'), (5, 'quam'), (5, None),
                            (7, 'baz'), (None, 'foo')])

    bydescab = ab_pairs(kt4.order_by(desc('a'), 'b'))
    self.assertEqual(bydescab, [(7, 'baz'), (5, 'quam'), (5, None),
                                (-1, 'quam'), (None, 'foo')])

    KeyTable.import_fam(resource_dir + '/sample.fam')._typecheck()