def test_agg_cols_group_by(self):
    """Exercise ``agg.group_by`` over columns, alone and wrapping ``agg.filter`` / ``agg.explode``.

    Every case pairs an aggregation over the columns of a
    ``range_matrix_table(1, 10)`` with the dict expected in the ``result``
    field of the first collected row.
    """
    mt = hl.utils.range_matrix_table(1, 10)
    col = mt.col_idx

    def shifted_set_with_zero():
        # Distinct (col_idx + 1) values turned into an array, then a 0 appended.
        return hl.array(agg.collect_as_set(col + 1)).append(0)

    def collect_shifted(elt):
        # Per-element aggregation handed to agg.explode.
        return agg.collect(elt + 1).append(0)

    # Group by parity of the column index.
    by_parity = agg.group_by(col % 2, shifted_set_with_zero())
    by_parity_expected = {
        0: [1, 3, 5, 7, 9, 0],
        1: [2, 4, 6, 8, 10, 0],
    }

    # Filter nested inside the grouping: every key is still present, and a
    # group with no surviving columns holds only the appended 0.
    filtered_groups = agg.group_by(
        col % 3,
        agg.filter(col > 7, shifted_set_with_zero()),
    )
    filtered_groups_expected = {0: [10, 0], 1: [0], 2: [9, 0]}

    # Explode nested inside the grouping: columns with col_idx <= 7
    # contribute an empty array, i.e. no elements.
    exploded_groups = agg.group_by(
        col % 3,
        agg.explode(
            collect_shifted,
            hl.cond(col > 7, [col, col + 1], hl.empty_array(hl.tint32)),
        ),
    )
    exploded_groups_expected = {0: [10, 11, 0], 1: [0], 2: [9, 10, 0]}

    cases = [
        (by_parity, by_parity_expected),
        (filtered_groups, filtered_groups_expected),
        (exploded_groups, exploded_groups_expected),
    ]

    for aggregation, expected in cases:
        collected = mt.select_rows(result=aggregation).result.collect()
        self.assertEqual(collected[0], expected)
def test_agg_cols_group_by(self):
    """Verify column-wise ``agg.group_by`` results, including nested filter/explode aggregations.

    The aggregations are all built first, then each one is evaluated with
    ``select_rows`` and the first collected ``result`` is compared with its
    expected dict.
    """
    matrix = hl.utils.range_matrix_table(1, 10)
    idx = matrix.col_idx

    aggregations = [
        # Keyed by parity of col_idx.
        agg.group_by(
            idx % 2,
            hl.array(agg.collect_as_set(idx + 1)).append(0),
        ),
        # Keyed by col_idx mod 3, keeping only columns with col_idx > 7.
        agg.group_by(
            idx % 3,
            agg.filter(
                idx > 7,
                hl.array(agg.collect_as_set(idx + 1)).append(0),
            ),
        ),
        # Keyed by col_idx mod 3, exploding a per-column array that is empty
        # unless col_idx > 7.
        agg.group_by(
            idx % 3,
            agg.explode(
                lambda elt: agg.collect(elt + 1).append(0),
                hl.cond(idx > 7, [idx, idx + 1], hl.empty_array(hl.tint32)),
            ),
        ),
    ]

    expectations = [
        {0: [1, 3, 5, 7, 9, 0], 1: [2, 4, 6, 8, 10, 0]},
        {0: [10, 0], 1: [0], 2: [9, 0]},
        {0: [10, 11, 0], 1: [0], 2: [9, 10, 0]},
    ]

    for aggregation, expected in zip(aggregations, expectations):
        first_row_result = matrix.select_rows(result=aggregation).result.collect()[0]
        self.assertEqual(first_row_result, expected)
def test_agg_cols_filter(self):
    """Exercise ``agg.filter`` over columns, alone and wrapping ``agg.explode`` / ``agg.group_by``.

    All cases keep only columns with ``col_idx > 7`` and compare the
    ``result`` field of the first collected row with the expected value.
    """
    mt = hl.utils.range_matrix_table(1, 10)
    col = mt.col_idx
    keep = col > 7

    # Plain collect under the filter.
    filtered_collect = agg.filter(keep, agg.collect(col + 1).append(0))
    filtered_collect_expected = [9, 10, 0]

    # Explode under the filter: each surviving column contributes two elements.
    filtered_explode = agg.filter(
        keep,
        agg.explode(
            lambda elt: agg.collect(elt + 1).append(0),
            [col, col + 1],
        ),
    )
    filtered_explode_expected = [9, 10, 10, 11, 0]

    # Grouping under the filter: keys with no surviving column are absent.
    filtered_group_by = agg.filter(
        keep,
        agg.group_by(
            col % 3,
            hl.array(agg.collect_as_set(col + 1)).append(0),
        ),
    )
    filtered_group_by_expected = {0: [10, 0], 2: [9, 0]}

    cases = [
        (filtered_collect, filtered_collect_expected),
        (filtered_explode, filtered_explode_expected),
        (filtered_group_by, filtered_group_by_expected),
    ]

    for aggregation, expected in cases:
        collected = mt.select_rows(result=aggregation).result.collect()
        self.assertEqual(collected[0], expected)
def import_gene_list(gene_list_path, gene_column, ensg=False, pLI_threshold=False, peek=False):
    """
    Import a gene list TSV and return a set of ENSG ids or gene symbols.

    :param str gene_list_path: Path to TSV file with gene list of interest
    :param str or None gene_column: Column in the TSV file that specifies gene symbol or ENSG id.
        This column is collected into the returned set (unused when ``peek`` is set).
    :param str or bool ensg: If there are no ENSGs with version numbers in the file, specify False (default).
        If there are ENSGs with version numbers, specify the column containing them; the text before
        the first "." of that column is collected instead of ``gene_column``.
        ``True`` is also accepted and strips the version from ``gene_column`` itself.
    :param float or bool pLI_threshold: If the file does not contain pLI scores to filter on,
        specify False (default). Otherwise specify the threshold, e.g. ``0.95``; only rows whose
        ``pLI`` column is strictly greater than the threshold are kept.
        Any falsy value (``False``, ``None``, ``0``) disables the filter.
    :param bool peek: Default False. If True, print the first few lines of the gene list TSV
        (useful for choosing the other parameters) and return None.
    :return: Set of genes of interest, or None when ``peek`` is set
    :rtype: set or None
    """
    genes = hl.import_table(gene_list_path, impute=True)

    if peek:
        genes.show(width=200)
        return None

    if pLI_threshold:
        # The score column is expected to be literally named "pLI".
        genes = genes.filter(genes.pLI > pLI_threshold)

    if ensg:
        # Honor a column name passed via `ensg`, as the parameter is documented;
        # a bare True keeps the historical behavior of using `gene_column`.
        versioned_column = ensg if isinstance(ensg, str) else gene_column
        # split() takes a regex, so the dot is escaped; element 0 is the id
        # without its ".<version>" suffix.
        genes = genes.annotate(ensg=genes[versioned_column].split("\\.")[0])
        gene_column = "ensg"

    return genes.aggregate(agg.collect_as_set(genes[gene_column]))