def test_annotate(self): vds = self.get_vds() vds = vds.annotate_globals(foo=5) self.assertEqual(vds.globals.dtype, hl.tstruct(foo=hl.tint32)) vds = vds.annotate_rows(x1=agg.count(), x2=agg.fraction(False), x3=agg.count_where(True), x4=vds.info.AC + vds.foo) vds = vds.annotate_cols(apple=6) vds = vds.annotate_cols(y1=agg.count(), y2=agg.fraction(False), y3=agg.count_where(True), y4=vds.foo + vds.apple) expected_schema = hl.tstruct(s=hl.tstr, apple=hl.tint32, y1=hl.tint64, y2=hl.tfloat64, y3=hl.tint64, y4=hl.tint32) self.assertTrue(schema_eq(vds.col.dtype, expected_schema), "expected: " + str(vds.col.dtype) + "\nactual: " + str(expected_schema)) vds = vds.select_entries(z1=vds.x1 + vds.foo, z2=vds.x1 + vds.y1 + vds.foo) self.assertTrue(schema_eq(vds.entry.dtype, hl.tstruct(z1=hl.tint64, z2=hl.tint64)))
def test_annotate(self): vds = self.get_vds() vds = vds.annotate_globals(foo=5) self.assertEqual(vds.globals.dtype, hl.tstruct(foo=hl.tint32)) vds = vds.annotate_rows(x1=agg.count(), x2=agg.fraction(False), x3=agg.count_where(True), x4=vds.info.AC + vds.foo) vds = vds.annotate_cols(apple=6) vds = vds.annotate_cols(y1=agg.count(), y2=agg.fraction(False), y3=agg.count_where(True), y4=vds.foo + vds.apple) expected_schema = hl.tstruct(s=hl.tstr, apple=hl.tint32, y1=hl.tint64, y2=hl.tfloat64, y3=hl.tint64, y4=hl.tint32) self.assertTrue( schema_eq(vds.col.dtype, expected_schema), "expected: " + str(vds.col.dtype) + "\nactual: " + str(expected_schema)) vds = vds.select_entries(z1=vds.x1 + vds.foo, z2=vds.x1 + vds.y1 + vds.foo) self.assertTrue( schema_eq(vds.entry.dtype, hl.tstruct(z1=hl.tint64, z2=hl.tint64)))
def test_aggregate2(self): schema = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32) rows = [{'status': 0, 'GT': hl.Call([0, 0]), 'qPheno': 3}, {'status': 0, 'GT': hl.Call([0, 1]), 'qPheno': 13}] kt = hl.Table.parallelize(rows, schema) result = convert_struct_to_dict( kt.group_by(status=kt.status) .aggregate( x1=agg.collect(kt.qPheno * 2), x2=agg.explode(lambda elt: agg.collect(elt), [kt.qPheno, kt.qPheno + 1]), x3=agg.min(kt.qPheno), x4=agg.max(kt.qPheno), x5=agg.sum(kt.qPheno), x6=agg.product(hl.int64(kt.qPheno)), x7=agg.count(), x8=agg.count_where(kt.qPheno == 3), x9=agg.fraction(kt.qPheno == 1), x10=agg.stats(hl.float64(kt.qPheno)), x11=agg.hardy_weinberg_test(kt.GT), x13=agg.inbreeding(kt.GT, 0.1), x14=agg.call_stats(kt.GT, ["A", "T"]), x15=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')))[0], x16=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')).c.banana)[0], x17=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tarray(hl.tint32))), x18=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tset(hl.tint32))), x19=agg.take(kt.GT, 1, ordering=-kt.qPheno) ).take(1)[0]) expected = {u'status': 0, u'x13': {u'n_called': 2, u'expected_homs': 1.64, u'f_stat': -1.777777777777777, u'observed_homs': 1}, u'x14': {u'AC': [3, 1], u'AF': [0.75, 0.25], u'AN': 4, u'homozygote_count': [1, 0]}, u'x15': {u'a': 5, u'c': {u'banana': u'apple'}, u'b': u'foo'}, u'x10': {u'min': 3.0, u'max': 13.0, u'sum': 16.0, u'stdev': 5.0, u'n': 2, u'mean': 8.0}, u'x8': 1, u'x9': 0.0, u'x16': u'apple', u'x11': {u'het_freq_hwe': 0.5, u'p_value': 0.5}, u'x2': [3, 4, 13, 14], u'x3': 3, u'x1': [6, 26], u'x6': 39, u'x7': 2, u'x4': 13, u'x5': 16, u'x17': [], u'x18': [], u'x19': [hl.Call([0, 1])]} self.maxDiff = None self.assertDictEqual(result, expected)
def test_aggregate2(self): schema = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32) rows = [{'status': 0, 'GT': hl.Call([0, 0]), 'qPheno': 3}, {'status': 0, 'GT': hl.Call([0, 1]), 'qPheno': 13}] kt = hl.Table.parallelize(rows, schema) result = convert_struct_to_dict( kt.group_by(status=kt.status) .aggregate( x1=agg.collect(kt.qPheno * 2), x2=agg.explode(lambda elt: agg.collect(elt), [kt.qPheno, kt.qPheno + 1]), x3=agg.min(kt.qPheno), x4=agg.max(kt.qPheno), x5=agg.sum(kt.qPheno), x6=agg.product(hl.int64(kt.qPheno)), x7=agg.count(), x8=agg.count_where(kt.qPheno == 3), x9=agg.fraction(kt.qPheno == 1), x10=agg.stats(hl.float64(kt.qPheno)), x11=agg.hardy_weinberg_test(kt.GT), x13=agg.inbreeding(kt.GT, 0.1), x14=agg.call_stats(kt.GT, ["A", "T"]), x15=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')))[0], x16=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')).c.banana)[0], x17=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tarray(hl.tint32))), x18=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tset(hl.tint32))), x19=agg.take(kt.GT, 1, ordering=-kt.qPheno) ).take(1)[0]) expected = {u'status': 0, u'x13': {u'n_called': 2, u'expected_homs': 1.64, u'f_stat': -1.777777777777777, u'observed_homs': 1}, u'x14': {u'AC': [3, 1], u'AF': [0.75, 0.25], u'AN': 4, u'homozygote_count': [1, 0]}, u'x15': {u'a': 5, u'c': {u'banana': u'apple'}, u'b': u'foo'}, u'x10': {u'min': 3.0, u'max': 13.0, u'sum': 16.0, u'stdev': 5.0, u'n': 2, u'mean': 8.0}, u'x8': 1, u'x9': 0.0, u'x16': u'apple', u'x11': {u'het_freq_hwe': 0.5, u'p_value': 0.5}, u'x2': [3, 4, 13, 14], u'x3': 3, u'x1': [6, 26], u'x6': 39, u'x7': 2, u'x4': 13, u'x5': 16, u'x17': [], u'x18': [], u'x19': [hl.Call([0, 1])]} self.maxDiff = None self.assertDictEqual(result, expected)
# COMMAND ---------- mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4) & (mt.sample_qc.call_rate >= 0.97)) print('After filter, %d/284 samples remain.' % mt.count_cols()) # COMMAND ---------- # MAGIC %md Next is genotype QC. To start, we'll print the post-sample-QC call rate. It's actually gone _up_ since the initial summary - dropping low-quality samples disproportionally removed missing genotypes. # MAGIC # MAGIC Import the `hail.expr.functions` module. # COMMAND ---------- call_rate = mt.aggregate_entries(agg.fraction(hl.is_defined(mt.GT))) print('pre QC call rate is %.3f' % call_rate) # COMMAND ---------- # MAGIC %md It's a good idea to filter out genotypes where the reads aren't where they should be: if we find a genotype called homozygous reference with >10% alternate reads, a genotype called homozygous alternate with >10% reference reads, or a genotype called heterozygote without a ref / alt balance near 1:1, it is likely to be an error. # COMMAND ---------- ab = mt.AD[1] / hl.sum(mt.AD) filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) | (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) | (mt.GT.is_hom_var() & (ab >= 0.9))) mt = mt.filter_entries(filter_condition_ab)