Пример #1
0
    def test_unions(self):
        """Check ``union_rows`` and ``union_cols`` on the ``sample2.vcf`` dataset.

        ``union_rows`` is invoked both as a bound method and through
        ``hl.MatrixTable.union_rows`` with unpacked arguments, and the two
        results are compared.  ``union_cols`` is applied twice with the dataset
        itself, so each value of ``s`` is expected to be counted three times.
        """
        dataset = hl.import_vcf(resource('sample2.vcf'))

        # test union_rows: split the rows by the parity of the locus position
        ds1 = dataset.filter_rows(dataset.locus.position % 2 == 1)
        ds2 = dataset.filter_rows(dataset.locus.position % 2 == 0)

        datasets = [ds1, ds2]
        r1 = ds1.union_rows(ds2)
        r2 = hl.MatrixTable.union_rows(*datasets)

        # Both call styles must produce the same result.
        self.assertTrue(r1._same(r2))

        # test union_cols: three copies of the same columns in total
        ds = dataset.union_cols(dataset).union_cols(dataset)
        sample_counts = ds.aggregate_cols(agg.counter(ds.s))

        # An empty counter would let the loop below pass without checking anything.
        self.assertGreater(len(sample_counts), 0)
        # Only the counts matter here; the keys themselves are not inspected.
        for count in sample_counts.values():
            self.assertEqual(count, 3)
Пример #2
0
    def test_unions(self):
        """Check ``union_rows`` and ``union_cols`` on the ``sample2.vcf`` dataset.

        ``union_rows`` is invoked both as a bound method and through
        ``hl.MatrixTable.union_rows`` with unpacked arguments, and the two
        results are compared.  ``union_cols`` is applied twice with the dataset
        itself, so each value of ``s`` is expected to be counted three times.
        """
        dataset = hl.import_vcf(resource('sample2.vcf'))

        # test union_rows: split the rows by the parity of the locus position
        ds1 = dataset.filter_rows(dataset.locus.position % 2 == 1)
        ds2 = dataset.filter_rows(dataset.locus.position % 2 == 0)

        datasets = [ds1, ds2]
        r1 = ds1.union_rows(ds2)
        r2 = hl.MatrixTable.union_rows(*datasets)

        # Both call styles must produce the same result.
        self.assertTrue(r1._same(r2))

        # test union_cols: three copies of the same columns in total
        ds = dataset.union_cols(dataset).union_cols(dataset)
        sample_counts = ds.aggregate_cols(agg.counter(ds.s))

        # An empty counter would let the loop below pass without checking anything.
        self.assertGreater(len(sample_counts), 0)
        # Only the counts matter here; the keys themselves are not inspected.
        for count in sample_counts.values():
            self.assertEqual(count, 3)
Пример #3
0
# COMMAND ----------

# MAGIC %md ## Query functions and the Hail Expression Language
# MAGIC
# MAGIC Hail has a number of useful query functions that can be used for gathering statistics on our dataset. These query functions take Hail Expressions as arguments.
# MAGIC
# MAGIC We will start by looking at some statistics of the information in our table. The [aggregate](https://hail.is/docs/devel/hail.Table.html#hail.Table.aggregate) method can be used to aggregate over rows of the table.

# COMMAND ----------

# MAGIC %md `counter` is an aggregation function that counts the number of occurrences of each unique element. We can use this to pull out the population distribution by passing in a Hail Expression for the field that we want to count by.

# COMMAND ----------

# Tally the occurrences of each distinct SuperPopulation value, then print them.
population_counts = table.aggregate(agg.counter(table.SuperPopulation))
pprint(population_counts)

# COMMAND ----------

# MAGIC %md `stats` is an aggregation function that produces some useful statistics about numeric collections. We can use this to see the distribution of the CaffeineConsumption phenotype.

# COMMAND ----------

# Run the `stats` aggregator over CaffeineConsumption, then print the result.
caffeine_stats = table.aggregate(agg.stats(table.CaffeineConsumption))
pprint(caffeine_stats)

# COMMAND ----------

# MAGIC %md However, these metrics aren't perfectly representative of the samples in our dataset. Here's why:

# COMMAND ----------