def test_unions(self):
    dataset = hl.import_vcf(resource('sample2.vcf'))

    # test union_rows
    ds1 = dataset.filter_rows(dataset.locus.position % 2 == 1)
    ds2 = dataset.filter_rows(dataset.locus.position % 2 == 0)
    datasets = [ds1, ds2]
    r1 = ds1.union_rows(ds2)
    r2 = hl.MatrixTable.union_rows(*datasets)
    self.assertTrue(r1._same(r2))

    # test union_cols
    ds = dataset.union_cols(dataset).union_cols(dataset)
    for s, count in ds.aggregate_cols(agg.counter(ds.s)).items():
        self.assertEqual(count, 3)
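
# A standalone sketch of the same union operations outside the test harness.
# It assumes Hail is installed and initialized (hl.init()) and that
# 'sample2.vcf' is replaced with a path to any valid VCF.
import hail as hl

mt = hl.import_vcf('sample2.vcf')  # placeholder path

# union_rows: stack two MatrixTables that share column (sample) keys,
# concatenating their rows (variants).
odd = mt.filter_rows(mt.locus.position % 2 == 1)
even = mt.filter_rows(mt.locus.position % 2 == 0)
recombined = odd.union_rows(even)
assert recombined.count_rows() == mt.count_rows()

# union_cols: join MatrixTables that share row (variant) keys side by side,
# concatenating their columns (samples); here each sample appears 3 times.
tripled = mt.union_cols(mt).union_cols(mt)
assert tripled.count_cols() == 3 * mt.count_cols()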
# COMMAND ----------

# MAGIC %md ## Query functions and the Hail Expression Language
# MAGIC
# MAGIC Hail has a number of useful query functions that can be used for gathering statistics on our dataset. These query functions take Hail Expressions as arguments.
# MAGIC
# MAGIC We will start by looking at some statistics of the information in our table. The [aggregate](https://hail.is/docs/devel/hail.Table.html#hail.Table.aggregate) method can be used to aggregate over rows of the table.

# COMMAND ----------

# MAGIC %md `counter` is an aggregation function that counts the number of occurrences of each unique element. We can use this to pull out the population distribution by passing in a Hail Expression for the field that we want to count by.

# COMMAND ----------

pprint(table.aggregate(agg.counter(table.SuperPopulation)))

# COMMAND ----------

# MAGIC %md `stats` is an aggregation function that produces some useful statistics about numeric collections. We can use this to see the distribution of the CaffeineConsumption phenotype.

# COMMAND ----------

pprint(table.aggregate(agg.stats(table.CaffeineConsumption)))

# COMMAND ----------

# MAGIC %md However, these metrics aren't perfectly representative of the samples in our dataset. Here's why:

# COMMAND ----------
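
# MAGIC %md As a quick aside (assuming `hl` is the usual `import hail as hl` binding from earlier in this notebook), `aggregate` can also evaluate several aggregators in a single pass by wrapping them in a struct; the field names below are arbitrary.

# COMMAND ----------

pprint(table.aggregate(hl.struct(
    populations=agg.counter(table.SuperPopulation),
    caffeine=agg.stats(table.CaffeineConsumption))))

# COMMAND ----------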