def test_populate_from_datasets_with_measurments(self):
    """Merge two datasets that each carry a "dev" cell measurement.

    The assertions expect the merged ``dev_names`` to be the four column
    names present in both inputs, in sorted order, and the ``dev`` columns
    of each input to be reordered accordingly.
    """
    counts = np.random.randint(1, 5, size=(5, 10))
    gene_names = np.array(["gene_%d" % i for i in range(10)])

    # Column j of each measurement matrix is filled with the value j, so a
    # merged row shows which original column ended up in each position.
    first_values = np.ones((5, 5)) * np.arange(0, 5)
    first_columns = ["gabou", "achille", "pedro", "oclivio", "gayoso"]
    first_measurement = CellMeasurement(
        name="dev",
        data=first_values,
        columns_attr_name="dev_names",
        columns=first_columns,
    )

    second_values = np.ones((5, 4)) * np.arange(0, 4)
    second_columns = ["gabou", "oclivio", "achille", "pedro"]
    second_measurement = CellMeasurement(
        name="dev",
        data=second_values,
        columns_attr_name="dev_names",
        columns=second_columns,
    )

    first_dataset = GeneExpressionDataset()
    second_dataset = GeneExpressionDataset()
    first_dataset.populate_from_data(
        counts, Ys=[first_measurement], gene_names=gene_names
    )
    second_dataset.populate_from_data(
        counts, Ys=[second_measurement], gene_names=gene_names
    )

    merged = GeneExpressionDataset()
    merged.populate_from_datasets([first_dataset, second_dataset])

    for attribute in ("dev", "dev_names"):
        self.assertTrue(hasattr(merged, attribute))
    self.assertListEqual(
        merged.dev_names.tolist(), ["achille", "gabou", "oclivio", "pedro"]
    )
    # Row 0 comes from the first input, row 5 from the second.
    self.assertListEqual(merged.dev[0].tolist(), [1, 0, 3, 2])
    self.assertListEqual(merged.dev[5].tolist(), [2, 0, 1, 3])
def create_datasets():
    """Build two small synthetic datasets with partially overlapping genes.

    Dataset "test_a" has 100 cells over genes A-E and four cell types;
    dataset "test_b" has 100 cells over genes B, F, A and three cell types.
    Every random draw comes from a single ``RandomState(0)`` so the fixture
    is reproducible from run to run.

    Returns:
        tuple: ``(dataset_a, dataset_b)`` as populated
        ``GeneExpressionDataset`` instances.
    """
    rs = RandomState(0)

    data_a = np.sort(rs.normal(0, 10, 500)).astype(int).reshape(100, 5)
    gene_names_a = list("ABCDE")
    cell_types_a = ["alpha", "beta", "gamma", "delta"]
    labels_a = rs.choice(np.arange(len(cell_types_a)), data_a.shape[0])

    data_b = np.sort(rs.normal(100, 10, 300)).astype(int).reshape(100, 3)
    gene_names_b = list("BFA")
    cell_types_b = ["alpha", "epsilon", "rho"]
    labels_b = rs.choice(np.arange(len(cell_types_b)), data_b.shape[0])
    batch_indices_b = rs.choice(np.arange(5), size=data_b.shape[0])

    # Previously drawn from the unseeded global ``np.random``, which made the
    # fixture non-deterministic. It is now taken from ``rs`` and drawn last,
    # so every other seeded value above keeps the value it always had.
    batch_indices_a = rs.choice(np.arange(5), size=data_a.shape[0])

    dataset_a = GeneExpressionDataset()
    dataset_b = GeneExpressionDataset()

    dataset_a.populate_from_data(
        X=data_a,
        labels=labels_a,
        gene_names=gene_names_a,
        cell_types=cell_types_a,
        batch_indices=batch_indices_a,
    )
    dataset_a.name = "test_a"

    dataset_b.populate_from_data(
        X=data_b,
        labels=labels_b,
        gene_names=gene_names_b,
        cell_types=cell_types_b,
        batch_indices=batch_indices_b,
    )
    dataset_b.name = "test_b"

    return dataset_a, dataset_b
def test_dense_subsample_genes(self):
    """Check that ``subsample_genes`` keeps exactly the requested gene count.

    Each selection mode runs on a freshly populated dataset built from the
    same three dense batches.
    """
    batches = [np.random.randint(1, 5, size=(50, 26)) for _ in range(3)]

    def fresh_dataset():
        # Populate a new dataset so one mode's filtering cannot leak into
        # the next.
        populated = GeneExpressionDataset()
        populated.populate_from_per_batch_list(batches)
        return populated

    # With default
    dataset = fresh_dataset()
    n_genes = dataset.nb_genes
    n_top = n_genes // 2
    dataset.subsample_genes(new_n_genes=n_top, mode="cell_ranger")
    assert dataset.nb_genes == n_top

    # With Seurat v2, then Seurat v3
    for mode in ("seurat_v2", "seurat_v3"):
        dataset = fresh_dataset()
        dataset.subsample_genes(new_n_genes=n_top, mode=mode)
        assert dataset.nb_genes == n_top
def test_labels(self):
    """Check the shape and values of ``labels`` after ``populate_from_data``.

    Labels are expected as a ``(25, 1)`` column. Labels 0..24 are expected to
    be kept as given, while a constant label of 5 is expected to become 0.
    """
    counts = np.full((25, 10), 100.0)

    # One distinct label per cell.
    distinct_labels = np.arange(25)
    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts, labels=distinct_labels)
    self.assertTupleEqual((25, 1), dataset.labels.shape)
    self.assertEqual(dataset.labels[5, 0], 5)

    # Every cell shares the single label value 5.
    constant_labels = np.full(25, 5.0)
    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts, labels=constant_labels)
    self.assertTupleEqual(dataset.labels.shape, (25, 1))
    self.assertEqual(dataset.labels[5, 0], 0)
def test_populate_from_datasets_with_measurments(self):
    """Merge "dev" cell measurements with the default and the union setting.

    With default arguments the assertions expect only the column names
    present in both inputs. With ``cell_measurement_intersection={"dev":
    False}`` they expect all five names and a batch mask entry of 0 at
    ``mask[1][2]``.
    """
    counts = np.random.randint(1, 5, size=(5, 10))
    gene_names = np.array(["gene_%d" % i for i in range(10)])

    # Column j of each measurement matrix is filled with the value j, so a
    # merged row shows which original column ended up in each position.
    first_values = np.ones((5, 5)) * np.arange(0, 5)
    first_columns = ["gabou", "achille", "pedro", "oclivio", "gayoso"]
    first_measurement = CellMeasurement(
        name="dev",
        data=first_values,
        columns_attr_name="dev_names",
        columns=first_columns,
    )

    second_values = np.ones((5, 4)) * np.arange(0, 4)
    second_columns = ["gabou", "oclivio", "achille", "pedro"]
    second_measurement = CellMeasurement(
        name="dev",
        data=second_values,
        columns_attr_name="dev_names",
        columns=second_columns,
    )

    first_dataset = GeneExpressionDataset()
    second_dataset = GeneExpressionDataset()
    first_dataset.populate_from_data(
        counts, Ys=[first_measurement], gene_names=gene_names
    )
    second_dataset.populate_from_data(
        counts, Ys=[second_measurement], gene_names=gene_names
    )

    # Deep copies are passed so both source datasets can be merged again
    # below in their original state.
    merged = GeneExpressionDataset()
    merged.populate_from_datasets(
        [copy.deepcopy(first_dataset), copy.deepcopy(second_dataset)]
    )
    for attribute in ("dev", "dev_names"):
        self.assertTrue(hasattr(merged, attribute))
    self.assertListEqual(
        merged.dev_names.tolist(), ["achille", "gabou", "oclivio", "pedro"]
    )
    self.assertListEqual(merged.dev[0].tolist(), [1, 0, 3, 2])
    self.assertListEqual(merged.dev[5].tolist(), [2, 0, 1, 3])

    # Take union of dev columns, 0s fill remainder
    merged = GeneExpressionDataset()
    merged.populate_from_datasets(
        [copy.deepcopy(first_dataset), copy.deepcopy(second_dataset)],
        cell_measurement_intersection={"dev": False},
    )
    self.assertListEqual(
        merged.dev_names.tolist(),
        ["achille", "gabou", "gayoso", "oclivio", "pedro"],
    )
    batch_mask = merged.get_batch_mask_cell_measurement("dev")
    # Index 2 is "gayoso", which only the first input provides.
    self.assertEqual(batch_mask[1][2].astype(int), 0)
def test_collate_add(self):
    """Check that a collate function can return extra per-cell attributes.

    The collate function is built with ``add_attributes_and_types`` naming a
    cell attribute ("x_coords") and a cell measurement ("proteins"). Both are
    expected as additional trailing outputs for the requested cells.
    """
    cell_ids = np.arange(0, 25).reshape((-1, 1))

    # Row i of every array encodes the cell index i, so outputs are easy to
    # predict.
    counts = np.ones((25, 2)) * cell_ids
    batch_indices = np.arange(0, 25).reshape((-1, 1))
    x_coords = np.arange(0, 25).reshape((-1, 1))
    proteins = np.ones((25, 3)) + cell_ids + np.arange(0, 3)
    protein_names = ["A", "B", "C"]

    protein_measurement = CellMeasurement(
        name="proteins",
        data=proteins,
        columns_attr_name="protein_names",
        columns=protein_names,
    )
    dataset = GeneExpressionDataset()
    dataset.populate_from_data(
        counts,
        batch_indices=batch_indices,
        cell_attributes_dict={"x_coords": x_coords},
        Ys=[protein_measurement],
    )

    extra_attributes = {"x_coords": np.float32, "proteins": np.float32}
    collate_fn = dataset.collate_fn_builder(
        add_attributes_and_types=extra_attributes
    )
    # Five standard outputs come first, followed by the two requested extras.
    _, _, _, _, _, x_coords_tensor, proteins_tensor = collate_fn([1, 2])

    self.assertListEqual(x_coords_tensor.tolist(), [[1.0], [2.0]])
    self.assertListEqual(
        proteins_tensor.tolist(), [[2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]
    )
def create_dataset(self, path):
    """Read an RDS file through R and convert it to a GeneExpressionDataset.

    The R object must support ``logcounts()``, ``rowData()`` and
    ``colData()``, with a ``feature_symbol`` row column and a ``cell_type1``
    cell column. Cells whose total count is 10 or less are dropped.

    Args:
        path: Location of the ``.rds`` file. It is interpolated into R source
            inside single quotes, so it must not contain a single quote.

    Returns:
        GeneExpressionDataset: The populated dataset, with the R feature
        symbols attached as ``gene_symbols``.
    """
    print("Reading rds")
    ro.r("sce<-readRDS('%s')" % path)
    print("Extracting log counts")
    log_counts = ro.r("logcounts(sce)")
    print("Transforming log count to counts")
    # exp(x * ln 2) - 1 == 2**x - 1. The transpose puts cells on rows, given
    # the per-cell labels indexed with the same mask below.
    # ``int`` replaces ``np.int``, which was removed in NumPy 1.24.
    counts = (np.exp(log_counts * np.log(2)) - 1).T.astype(int)

    gene_symbols = ro.r("rowData(sce)$feature_symbol")
    labels = ro.r("colData(sce)$cell_type1")
    labels_levels = ro.r("levels(colData(sce)$cell_type1)")
    if labels_levels is not rpy2.rinterface.NULL:
        # The labels are 1-based codes into the level names.
        labels = np.array([labels_levels[int(code) - 1] for code in labels])

    # One pass gives both the sorted unique cell types and, for each cell,
    # the index of its type in that sorted list.
    unique_types, labels = np.unique(labels, return_inverse=True)
    cell_types = list(unique_types)
    labels = labels.reshape(-1)

    valid_idx = (counts.sum(axis=1) > 10).ravel()  # Filter bad quality cells
    counts = counts[valid_idx]
    labels = labels[valid_idx]

    gene_expression_dataset = GeneExpressionDataset(
        *GeneExpressionDataset.get_attributes_from_matrix(counts, labels=labels),
        cell_types=cell_types
    )
    gene_expression_dataset.gene_symbols = gene_symbols
    return gene_expression_dataset
def test_special_dataset_size(self):
    """Train several models for one epoch on a 34-cell dataset.

    All trainers use ``train_size=0.5`` and a batch size of 8.
    NOTE(review): presumably 34 cells were chosen so that a 17-cell split
    leaves a final minibatch of one cell -- confirm.
    """
    gene_dataset = GeneExpressionDataset()
    n_cells = 17 * 2
    rna_counts = np.random.randint(1, 100, (n_cells, 10))
    protein_counts = np.random.randint(1, 100, (n_cells, 10))
    gene_dataset.populate_from_data(rna_counts)
    protein_measurement = CellMeasurement(
        name="protein_expression",
        data=protein_counts,
        columns_attr_name="protein_names",
        columns=np.arange(10),
    )
    gene_dataset.initialize_cell_measurement(protein_measurement)

    # Test UnsupervisedTrainer
    vae = VAE(
        gene_dataset.nb_genes,
        n_batch=gene_dataset.n_batches,
        n_labels=gene_dataset.n_labels,
    )
    unsupervised_trainer = UnsupervisedTrainer(
        vae,
        gene_dataset,
        train_size=0.5,
        use_cuda=False,
        data_loader_kwargs={"batch_size": 8},
    )
    unsupervised_trainer.train(n_epochs=1)

    # Test JVAETrainer, feeding the same dataset in twice
    n_genes_per_dataset = [gene_dataset.nb_genes, gene_dataset.nb_genes]
    jvae = JVAE(
        n_genes_per_dataset,
        gene_dataset.nb_genes,
        [slice(None)] * 2,
        ["zinb", "zinb"],
        [True, True],
        n_batch=1,
    )
    classifier = Classifier(gene_dataset.nb_genes, n_labels=2, logits=True)
    jvae_trainer = JVAETrainer(
        jvae,
        classifier,
        [gene_dataset, gene_dataset],
        train_size=0.5,
        use_cuda=False,
        data_loader_kwargs={"batch_size": 8},
    )
    jvae_trainer.train(n_epochs=1)

    # Test TotalTrainer
    n_proteins = len(gene_dataset.protein_names)
    totalvae = TOTALVI(gene_dataset.nb_genes, n_proteins)
    total_trainer = TotalTrainer(
        totalvae,
        gene_dataset,
        train_size=0.5,
        use_cuda=False,
        data_loader_kwargs={"batch_size": 8},
        early_stopping_kwargs=None,
    )
    total_trainer.train(n_epochs=1)
def test_genes_to_idx(self):
    """Look genes up by upper-cased name and expect their positions in order.

    The dataset is populated with lower-case names ("gene_0" ...), while the
    lookup uses "GENE_0" ... and is expected to return indices 0..9.
    """
    counts = np.random.randint(1, 5, size=(5, 10))
    lower_case_names = np.array(["gene_%d" % i for i in range(10)])

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts, gene_names=lower_case_names)

    queried_names = ["GENE_%d" % i for i in range(10)]
    found = dataset.genes_to_index(queried_names)
    self.assertListEqual(list(range(10)), found.tolist())
def test_filter_cells(self):
    """Expect ``filter_cells_by_count`` to drop the two all-zero cells."""
    counts = np.full((25, 10), 100.0)
    # Rows 4 and 5 have no counts at all.
    counts[4:6, :] = 0

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts)
    self.assertEqual(25, dataset.nb_cells)

    dataset.filter_cells_by_count()
    self.assertEqual(23, dataset.nb_cells)
def test_filter_genes(self):
    """Keep two genes by name and expect exactly those names to remain."""
    counts = np.random.randint(1, 5, size=(5, 10))
    all_names = np.array(["gene_%d" % i for i in range(10)])

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts, gene_names=all_names)

    kept_names = ["GENE_1", "GENE_3"]
    dataset.filter_genes_by_attribute(kept_names)
    self.assertListEqual(kept_names, dataset.gene_names.tolist())
def test_compute_library_size_batch(self):
    """Check ``local_means`` and ``local_vars`` on identical cells.

    Each of the 7 cells has 10 genes valued ``exp(10) / 10``, so every cell
    sums to ``exp(10)``. The expected mean is 10.0 and the expected variance
    is 0.0 for every cell.
    """
    n_cells, n_genes = 7, 10
    per_gene_value = np.exp(10) / 10
    counts = per_gene_value * np.ones((n_cells, n_genes), dtype=int)

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts)

    expected_means = [[10.0]] * n_cells
    expected_vars = [[0.0]] * n_cells
    self.assertEqual(expected_means, dataset.local_means.tolist())
    self.assertEqual(expected_vars, dataset.local_vars.tolist())
def test_remap_categorical_attributes(self):
    """Expect labels {1, 2} to be stored as {0, 1} after population."""
    counts = np.random.randint(1, 5, size=(7, 11))
    raw_labels = [1, 1, 1, 1, 1, 2, 2]

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts, labels=raw_labels)

    # Labels are compared as a column: one single-element row per cell.
    expected = [[0]] * 5 + [[1]] * 2
    self.assertListEqual(expected, dataset.labels.tolist())
def test_filter_cell_types(self):
    """Keep cell type "0" and expect only its two cells to remain in ``X``."""
    counts = np.random.randint(1, 5, size=(5, 10))
    # The first two cells are of type "0", the remaining three of type "1".
    labels = [0, 0, 1, 1, 1]
    type_names = ["0", "1"]

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts, labels=labels, cell_types=type_names)
    dataset.filter_cell_types(["0"])

    expected_rows = counts[:2].tolist()
    self.assertListEqual(expected_rows, dataset.X.tolist())
def test_collate_normal(self):
    """Check the default collate function on cells 1 and 2.

    Row i of the data and of the batch indices holds the value i, so the
    collated outputs for indices ``[1, 2]`` are predictable.
    """
    cell_ids = np.arange(0, 25).reshape((-1, 1))
    counts = np.ones((25, 2)) * cell_ids
    batch_indices = np.arange(0, 25).reshape((-1, 1))

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts, batch_indices=batch_indices)

    collate = dataset.collate_fn_builder()
    # The default collate function yields five outputs; two are checked.
    x, _, _, batch, _ = collate([1, 2])

    self.assertListEqual(x.tolist(), [[1.0, 1.0], [2.0, 2.0]])
    self.assertListEqual(batch.tolist(), [[1], [2]])
def test_populate_from_datasets_dummy_data(self):
    """Merge small random datasets and check genes, labels and batches.

    Three scenarios are covered in sequence: gene handling across datasets
    with 10, 3 and 5 genes, label handling with ``shared_labels`` on and
    off, and the batch index values expected after merging.
    """

    def make_dataset(n_cells, n_genes):
        # Random counts with gene names "gene_0" ... "gene_<n_genes - 1>".
        counts = np.random.randint(1, 5, size=(n_cells, n_genes))
        names = np.array(["gene_%d" % i for i in range(n_genes)])
        populated = GeneExpressionDataset()
        populated.populate_from_data(counts, gene_names=names)
        return populated

    dataset1 = make_dataset(5, 10)
    dataset2 = make_dataset(7, 3)
    dataset3 = make_dataset(2, 5)

    merged = GeneExpressionDataset()
    merged.populate_from_datasets([dataset1, dataset2, dataset3])
    self.assertEqual(14, merged.nb_cells)
    self.assertEqual(3, merged.nb_genes)
    self.assertListEqual(
        ["GENE_0", "GENE_1", "GENE_2"], merged.gene_names.tolist()
    )

    # test for labels sharing
    dataset2.labels = [0, 0, 0, 1, 1, 1, 1]
    dataset2.initialize_mapped_attribute("labels", "cell_types", ["0", "1"])
    dataset3.labels = [0, 1]
    dataset3.initialize_mapped_attribute("labels", "cell_types", ["0", "2"])

    shared = GeneExpressionDataset()
    shared.populate_from_datasets([dataset2, dataset3], shared_labels=True)
    self.assertListEqual(
        np.squeeze(shared.labels).tolist(), [0, 0, 0, 1, 1, 1, 1, 0, 2]
    )
    self.assertListEqual(shared.cell_types, ["0", "1", "2"])

    dataset_unshared = GeneExpressionDataset()
    dataset_unshared.populate_from_datasets(
        [dataset2, dataset3], shared_labels=False
    )
    self.assertListEqual(
        np.squeeze(dataset_unshared.labels).tolist(),
        [0, 0, 0, 1, 1, 1, 1, 2, 3],
    )
    self.assertListEqual(dataset_unshared.cell_types, ["0", "1", "0", "2"])

    # test for batch_indices offsetting
    dataset2.batch_indices = [0, 0, 0, 1, 1, 1, 1]
    dataset2.initialize_mapped_attribute(
        "batch_indices", "experiment", ["fish_2", "scrna_2"]
    )
    dataset3.batch_indices = [0, 1]
    dataset3.initialize_mapped_attribute(
        "batch_indices", "experiment", ["fish_3", "scrna_3"]
    )

    offset = GeneExpressionDataset()
    offset.populate_from_datasets([dataset2, dataset3])
    self.assertListEqual(
        np.squeeze(offset.batch_indices).tolist(),
        [0, 0, 0, 1, 1, 1, 1, 2, 3],
    )
    self.assertListEqual(
        offset.experiment, ["fish_2", "scrna_2", "fish_3", "scrna_3"]
    )
def test_populate_from_datasets_gene_attributes_merging(self):
    """Merge datasets whose gene attribute "test" holds conflicting values."""
    counts = np.random.randint(1, 5, size=(5, 10))
    gene_names = np.array(["gene_%d" % i for i in range(10)])
    # Same genes in both datasets, but the attribute is "1" in one and "2"
    # in the other.
    attr_first = np.array([["1"] for _ in range(10)])
    attr_second = np.array([["2"] for _ in range(10)])

    first = GeneExpressionDataset()
    second = GeneExpressionDataset()
    first.populate_from_data(
        counts, gene_names=gene_names, gene_attributes_dict={"test": attr_first}
    )
    second.populate_from_data(
        counts, gene_names=gene_names, gene_attributes_dict={"test": attr_second}
    )

    merged = GeneExpressionDataset()
    merged.populate_from_datasets([first, second])

    # Should keep the gene attribute of the first dataset
    self.assertEqual(merged.test[0, 0], "1")
def __init__(self, data, name, n_latent=10, reconstruction_seq='zinb'):
    """Initialise the parent and build a merged fish + seq dataset.

    Args:
        data: Object exposing ``data_fish_partial`` and ``data_seq`` datasets.
        name: Forwarded to the parent initialiser.
        n_latent: Forwarded to the parent initialiser.
        reconstruction_seq: Stored unchanged on the instance.
    """
    super().__init__(data, name, n_latent)
    self.full_dataset = GeneExpressionDataset()
    # The merge is given deep copies rather than ``data``'s own datasets.
    fish_copy = copy.deepcopy(data.data_fish_partial)
    seq_copy = copy.deepcopy(data.data_seq)
    self.full_dataset.populate_from_datasets([fish_copy, seq_copy])
    self.full_dataset.compute_library_size_batch()
    self.reconstruction_seq = reconstruction_seq
def test_populate_from_data(self):
    """Populate from a bare matrix and check sizes and default attributes."""
    n_cells, n_genes = 25, 10
    counts = np.full((n_cells, n_genes), 100.0)

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts)

    self.assertEqual(dataset.nb_genes, 10)
    self.assertEqual(dataset.nb_cells, 25)
    # default batch_indices and labels
    all_zero_column = [[0] for _ in range(n_cells)]
    self.assertListEqual(all_zero_column, dataset.batch_indices.tolist())
    self.assertListEqual(all_zero_column, dataset.labels.tolist())
def test_populate_from_datasets_cell_attributes_merging(self):
    """Merge datasets and expect the cell attribute "test" to be stacked."""
    counts = np.random.randint(1, 5, size=(5, 10))
    gene_names = np.array(["gene_%d" % i for i in range(10)])
    attr_first = np.array([["1"] for _ in range(5)])
    attr_second = np.array([["2"] for _ in range(5)])

    first = GeneExpressionDataset()
    second = GeneExpressionDataset()
    first.populate_from_data(
        counts, gene_names=gene_names, cell_attributes_dict={"test": attr_first}
    )
    second.populate_from_data(
        counts, gene_names=gene_names, cell_attributes_dict={"test": attr_second}
    )

    merged = GeneExpressionDataset()
    merged.populate_from_datasets([first, second])

    # Five cells from each input, in input order.
    self.assertTupleEqual(merged.test.shape, (10, 1))
    expected_values = ["1"] * 5 + ["2"] * 5
    self.assertListEqual(np.squeeze(merged.test).tolist(), expected_values)
def test_dense_subsample_genes(self):
    """Check that ``subsample_genes`` reduces the gene count on dense data.

    Only a strict decrease is asserted, because the resulting gene count can
    differ slightly from the requested ``n_top``.
    """
    batches = [np.random.randint(1, 5, size=(50, 26)) for _ in range(3)]

    # With default
    dataset = GeneExpressionDataset()
    dataset.populate_from_per_batch_list(batches)
    n_genes = dataset.nb_genes
    n_top = n_genes // 2
    dataset.subsample_genes(new_n_genes=n_top)
    # For some reason the new number of genes can be slightly different
    # than n_top
    assert dataset.nb_genes < n_genes

    # With Seurat
    dataset = GeneExpressionDataset()
    dataset.populate_from_per_batch_list(batches)
    dataset.subsample_genes(new_n_genes=n_top, mode="seurat")
    assert dataset.nb_genes < n_genes
def test_map_cell_types(self):
    """Map groups of cell types onto new names, then remap the labels.

    Types ("0", "2") become "6" and ("3", "4") become "7". After
    ``remap_categorical_attributes`` the expected cell types are
    ["5", "6", "7"].
    """
    counts = np.random.randint(1, 5, size=(7, 10))
    labels = [0, 0, 4, 4, 2, 3, 5]
    type_names = ["0", "1", "2", "3", "4", "5"]

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts, labels=labels, cell_types=type_names)

    mapping = {("0", "2"): "6", ("3", "4"): "7"}
    dataset.map_cell_types(mapping)
    dataset.remap_categorical_attributes()

    self.assertListEqual(dataset.cell_types.tolist(), ["5", "6", "7"])
    remapped_labels = np.squeeze(dataset.labels).tolist()
    self.assertListEqual(remapped_labels, [1, 1, 2, 2, 1, 2, 0])
def test_multibatches_features():
    """Train a VAE on four batches and run imputation with batch transforms.

    Imputation is called once with a single ``transform_batch`` index and
    once with a list of indices.
    """
    # (upper bound, number of cells) for each batch, in draw order.
    batch_specs = [(5, 20), (10, 20), (10, 20), (10, 30)]
    batches = [
        np.random.randint(1, high, size=(n_cells, 10))
        for high, n_cells in batch_specs
    ]

    dataset = GeneExpressionDataset()
    dataset.populate_from_per_batch_list(batches)

    model = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(
        model, dataset, train_size=0.5, use_cuda=use_cuda
    )
    trainer.train(n_epochs=2)

    trainer.test_set.imputation(n_samples=2, transform_batch=0)
    trainer.train_set.imputation(n_samples=2, transform_batch=[0, 1, 2])
def test_reorder_genes(self):
    """Move four genes to the front and check unknown names are rejected."""
    counts = np.full((25, 100), 100.0)
    gene_names = np.array(["gene_%d" % i for i in range(100)])

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts, gene_names=gene_names)

    leading_genes = ["GENE_47", "GENE_2", "GENE_3", "GENE_12"]
    dataset.reorder_genes(leading_genes)

    # New order should be 47, 2, 3, 12, 0, 1, ...
    first_six = list(dataset.gene_names[0:6])
    self.assertListEqual(first_six, leading_genes + ["GENE_0", "GENE_1"])

    # Only GENE_0 .. GENE_99 exist.
    with self.assertRaises(KeyError):
        dataset.reorder_genes(["GENE_101"])
def test_merge_cell_types(self):
    """Merge types "0" and "1" and check labels before and after remapping.

    Right after the merge the combined type is expected under label 3. After
    ``remap_categorical_attributes`` the labels are expected as {0, 1} and
    the cell types as ["2", "0 and 1"].
    """
    counts = np.random.randint(1, 5, size=(8, 20))
    labels = [0, 0, 1, 2, 2, 1, 0, 1]
    type_names = ["0", "1", "2"]

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts, labels=labels, cell_types=type_names)
    dataset.merge_cell_types(["0", "1"], new_cell_type_name="0 and 1")

    labels_after_merge = [[3], [3], [3], [2], [2], [3], [3], [3]]
    self.assertListEqual(labels_after_merge, dataset.labels.tolist())

    dataset.remap_categorical_attributes()

    labels_after_remap = [[1], [1], [1], [0], [0], [1], [1], [1]]
    self.assertListEqual(labels_after_remap, dataset.labels.tolist())
    self.assertListEqual(["2", "0 and 1"], dataset.cell_types.tolist())
def test_subsample_cells(self):
    """Exercise ``subsample_cells`` with default, float and int sizes.

    Row i of the data is filled with the value i + 1. After ``size=0.8`` the
    rows valued 5, 4, 3, 2 are expected, in that order.
    """
    row_values = np.arange(1, 6)
    counts = row_values[:, None] * np.ones(7)[None, :]

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts)

    # default
    dataset.subsample_cells()
    self.assertEqual(5, dataset.nb_cells)

    # when size is a float
    dataset.subsample_cells(size=0.8)
    expected_rows = np.arange(5, 1, -1)[:, None] * np.ones(7)[None, :]
    self.assertListEqual(expected_rows.tolist(), dataset.X.tolist())

    # when size is an int
    dataset.subsample_cells(size=2)
    self.assertEqual(2, dataset.nb_cells)
def test_populate_from_per_label_list(self):
    """Populate from one matrix per label and check the derived labels.

    The three matrices have 7, 5 and 3 cells. Their cells are expected to
    receive labels 0, 1 and 2 respectively.
    """
    cells_per_label = (7, 5, 3)
    matrices = [
        np.random.randint(1, 5, size=(n_cells, 10)) for n_cells in cells_per_label
    ]

    dataset = GeneExpressionDataset()
    dataset.populate_from_per_label_list(matrices)

    self.assertEqual(dataset.nb_cells, 15)
    self.assertEqual(dataset.nb_genes, 10)

    # One column of constant label values per input matrix, stacked in order.
    expected_labels = np.concatenate(
        [
            np.full((n_cells, 1), label, dtype=int)
            for label, n_cells in enumerate(cells_per_label)
        ]
    )
    self.assertListEqual(dataset.labels.tolist(), expected_labels.tolist())
def test_data_loader(self):
    """Round-trip a dataset through AnnData and compare the contents.

    The dataset carries a "dev" cell measurement. It is exported with
    ``to_anndata`` and reloaded through ``AnnDatasetFromAnnData``, after
    which the measurement, ``X`` and ``cell_types`` must be unchanged.
    """
    counts = np.full((25, 10), 100.0)
    dev_values = np.ones((25, 4)) * np.arange(0, 4)
    dev_columns = ["gabou", "achille", "pedro", "oclivio"]
    measurement = CellMeasurement(
        name="dev",
        data=dev_values,
        columns_attr_name="dev_names",
        columns=dev_columns,
    )

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts, Ys=[measurement])

    exported = dataset.to_anndata()
    reloaded = AnnDatasetFromAnnData(
        exported, cell_measurements_col_mappings={"dev": "dev_names"}
    )

    self.assertTrue((dev_values == reloaded.dev).all())
    self.assertTrue((dataset.X == reloaded.X).all())
    self.assertTrue((dataset.cell_types == reloaded.cell_types).all())
def training_score_scvi(train, **kwargs):
    """Fit an scVI VAE on ``train`` and return its Poisson log-likelihood.

    Args:
        train: Count matrix; ``train.shape[1]`` is used as the model's number
            of inputs.
        **kwargs: Accepted but not used.

    Returns:
        The summed Poisson log-pmf of ``train`` under the imputed rates.
    """
    from scvi.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE

    attributes = GeneExpressionDataset.get_attributes_from_matrix(train)
    dataset = GeneExpressionDataset(*attributes)
    model = VAE(n_input=train.shape[1])
    trainer = UnsupervisedTrainer(model, dataset, verbose=False)
    trainer.train(n_epochs=100)

    # Training permuted the data for minibatching. Unpermute before "imputing"
    # (estimating lambda)
    train_rates = trainer.train_set.sequential().imputation()
    test_rates = trainer.test_set.sequential().imputation()
    lam = np.vstack([train_rates, test_rates])

    return st.poisson(mu=lam).logpmf(train).sum()
def test_populate_from_datasets_cortex(self):
    """Merge two differently filtered Cortex datasets and count the genes.

    The first dataset keeps gene indices 0-2 and the second keeps 1-3, so
    two genes are expected after merging.
    """
    first = CortexDataset(save_path="tests/data")
    first.subsample_genes(subset_genes=np.arange(0, 3), mode="variance")
    first.filter_cell_types(["microglia", "oligodendrocytes"])

    second = CortexDataset(save_path="tests/data")
    second.subsample_genes(subset_genes=np.arange(1, 4), mode="variance")
    second_types = [
        "endothelial-mural",
        "interneurons",
        "microglia",
        "oligodendrocytes",
    ]
    second.filter_cell_types(second_types)
    # Filter a second time, here by index rather than by name.
    second.filter_cell_types([2, 0])

    merged = GeneExpressionDataset()
    merged.populate_from_datasets([first, second])
    self.assertEqual(2, merged.nb_genes)