def test_populate_from_datasets_with_measurments(self):
    """Check that merging datasets aligns a same-named cell measurement.

    Both inputs carry a measurement called ``dev`` whose columns only
    partly overlap and are listed in a different order.  The merged dataset
    is expected to expose the four common columns in the order asserted
    below (alphabetical here), with every row permuted accordingly.
    """
    counts = np.random.randint(1, 5, size=(5, 10))
    genes = np.array(["gene_%d" % idx for idx in range(10)])

    # Each measurement row is 0..n-1, so a value identifies the position of
    # the column it came from in the original column list.
    first_columns = ["gabou", "achille", "pedro", "oclivio", "gayoso"]
    first_values = np.ones((5, 5)) * np.arange(0, 5)
    first_measurement = CellMeasurement(
        name="dev",
        data=first_values,
        columns_attr_name="dev_names",
        columns=first_columns,
    )

    second_columns = ["gabou", "oclivio", "achille", "pedro"]
    second_values = np.ones((5, 4)) * np.arange(0, 4)
    second_measurement = CellMeasurement(
        name="dev",
        data=second_values,
        columns_attr_name="dev_names",
        columns=second_columns,
    )

    first_dataset = GeneExpressionDataset()
    second_dataset = GeneExpressionDataset()
    first_dataset.populate_from_data(
        counts, Ys=[first_measurement], gene_names=genes
    )
    second_dataset.populate_from_data(
        counts, Ys=[second_measurement], gene_names=genes
    )

    merged = GeneExpressionDataset()
    merged.populate_from_datasets([first_dataset, second_dataset])

    for attribute in ("dev", "dev_names"):
        self.assertTrue(hasattr(merged, attribute))

    expected_columns = ["achille", "gabou", "oclivio", "pedro"]
    self.assertListEqual(merged.dev_names.tolist(), expected_columns)
    # Row 0 comes from the first dataset, row 5 from the second one.
    self.assertListEqual(merged.dev[0].tolist(), [1, 0, 3, 2])
    self.assertListEqual(merged.dev[5].tolist(), [2, 0, 1, 3])
def test_populate_from_datasets_with_measurments(self):
    """Check intersection and union merging of a shared cell measurement.

    Two datasets carry a measurement called ``dev`` with partly overlapping
    columns.  A default merge is expected to keep the common columns only;
    a merge with ``cell_measurement_intersection={"dev": False}`` is
    expected to keep all five columns.  Deep copies are handed to each
    merge, so the two source datasets are reused for both scenarios.
    """
    counts = np.random.randint(1, 5, size=(5, 10))
    genes = np.array(["gene_%d" % idx for idx in range(10)])

    # Each measurement row is 0..n-1, so a value identifies the position of
    # the column it came from in the original column list.
    first_columns = ["gabou", "achille", "pedro", "oclivio", "gayoso"]
    first_values = np.ones((5, 5)) * np.arange(0, 5)
    first_measurement = CellMeasurement(
        name="dev",
        data=first_values,
        columns_attr_name="dev_names",
        columns=first_columns,
    )

    second_columns = ["gabou", "oclivio", "achille", "pedro"]
    second_values = np.ones((5, 4)) * np.arange(0, 4)
    second_measurement = CellMeasurement(
        name="dev",
        data=second_values,
        columns_attr_name="dev_names",
        columns=second_columns,
    )

    first_dataset = GeneExpressionDataset()
    second_dataset = GeneExpressionDataset()
    first_dataset.populate_from_data(
        counts, Ys=[first_measurement], gene_names=genes
    )
    second_dataset.populate_from_data(
        counts, Ys=[second_measurement], gene_names=genes
    )

    # Scenario 1: default merge, only the shared columns are expected.
    merged = GeneExpressionDataset()
    merged.populate_from_datasets(
        [copy.deepcopy(first_dataset), copy.deepcopy(second_dataset)]
    )
    for attribute in ("dev", "dev_names"):
        self.assertTrue(hasattr(merged, attribute))
    self.assertListEqual(
        merged.dev_names.tolist(), ["achille", "gabou", "oclivio", "pedro"]
    )
    self.assertListEqual(merged.dev[0].tolist(), [1, 0, 3, 2])
    self.assertListEqual(merged.dev[5].tolist(), [2, 0, 1, 3])

    # Scenario 2: take the union of dev columns, 0s fill the remainder.
    merged = GeneExpressionDataset()
    merged.populate_from_datasets(
        [copy.deepcopy(first_dataset), copy.deepcopy(second_dataset)],
        cell_measurement_intersection={"dev": False},
    )
    union_columns = ["achille", "gabou", "gayoso", "oclivio", "pedro"]
    self.assertListEqual(merged.dev_names.tolist(), union_columns)

    # Index 2 of the union is "gayoso", which only the first dataset has.
    # NOTE(review): mask[1] is presumably the second batch - confirm.
    batch_mask = merged.get_batch_mask_cell_measurement("dev")
    self.assertEqual(batch_mask[1][2].astype(int), 0)
def test_populate_from_datasets_cortex(self):
    """Merge two differently filtered Cortex datasets and count the genes.

    The first dataset is subsampled with gene indices 0..2 and the second
    with indices 1..3; the merged dataset is expected to hold 2 genes.
    """
    first = CortexDataset(save_path="tests/data")
    first.subsample_genes(subset_genes=np.arange(0, 3), mode="variance")
    first.filter_cell_types(["microglia", "oligodendrocytes"])

    second = CortexDataset(save_path="tests/data")
    second.subsample_genes(subset_genes=np.arange(1, 4), mode="variance")
    # The second dataset is filtered twice: by name, then by index.
    named_cell_types = [
        "endothelial-mural",
        "interneurons",
        "microglia",
        "oligodendrocytes",
    ]
    second.filter_cell_types(named_cell_types)
    second.filter_cell_types([2, 0])

    merged = GeneExpressionDataset()
    merged.populate_from_datasets([first, second])
    self.assertEqual(2, merged.nb_genes)
def test_populate_from_datasets_dummy_data(self):
    """Exercise dataset merging on small random datasets.

    Three scenarios are checked in turn:

    * merging datasets with 10, 3 and 5 genes keeps all 14 cells and the
      3 genes asserted below, with upper-cased names;
    * labels are either matched by cell type name (``shared_labels=True``)
      or kept apart per dataset (``shared_labels=False``);
    * batch indices of the later dataset are offset so they stay distinct.
    """

    def make_dataset(n_cells, n_genes):
        # Random counts with gene names gene_0 .. gene_<n_genes - 1>.
        counts = np.random.randint(1, 5, size=(n_cells, n_genes))
        names = np.array(["gene_%d" % idx for idx in range(n_genes)])
        built = GeneExpressionDataset()
        built.populate_from_data(counts, gene_names=names)
        return built

    dataset1 = make_dataset(5, 10)
    dataset2 = make_dataset(7, 3)
    dataset3 = make_dataset(2, 5)

    merged = GeneExpressionDataset()
    merged.populate_from_datasets([dataset1, dataset2, dataset3])
    self.assertEqual(14, merged.nb_cells)
    self.assertEqual(3, merged.nb_genes)
    self.assertListEqual(
        ["GENE_0", "GENE_1", "GENE_2"], merged.gene_names.tolist()
    )

    # test for labels sharing
    dataset2.labels = [0, 0, 0, 1, 1, 1, 1]
    dataset2.initialize_mapped_attribute("labels", "cell_types", ["0", "1"])
    dataset3.labels = [0, 1]
    dataset3.initialize_mapped_attribute("labels", "cell_types", ["0", "2"])

    shared = GeneExpressionDataset()
    shared.populate_from_datasets([dataset2, dataset3], shared_labels=True)
    shared_labels = np.squeeze(shared.labels).tolist()
    self.assertListEqual(shared_labels, [0, 0, 0, 1, 1, 1, 1, 0, 2])
    self.assertListEqual(shared.cell_types, ["0", "1", "2"])

    dataset_unshared = GeneExpressionDataset()
    dataset_unshared.populate_from_datasets(
        [dataset2, dataset3], shared_labels=False
    )
    unshared_labels = np.squeeze(dataset_unshared.labels).tolist()
    self.assertListEqual(unshared_labels, [0, 0, 0, 1, 1, 1, 1, 2, 3])
    self.assertListEqual(dataset_unshared.cell_types, ["0", "1", "0", "2"])

    # test for batch_indices offsetting
    dataset2.batch_indices = [0, 0, 0, 1, 1, 1, 1]
    dataset2.initialize_mapped_attribute(
        "batch_indices", "experiment", ["fish_2", "scrna_2"]
    )
    dataset3.batch_indices = [0, 1]
    dataset3.initialize_mapped_attribute(
        "batch_indices", "experiment", ["fish_3", "scrna_3"]
    )

    batched = GeneExpressionDataset()
    batched.populate_from_datasets([dataset2, dataset3])
    merged_batches = np.squeeze(batched.batch_indices).tolist()
    self.assertListEqual(merged_batches, [0, 0, 0, 1, 1, 1, 1, 2, 3])
    self.assertListEqual(
        batched.experiment, ["fish_2", "scrna_2", "fish_3", "scrna_3"]
    )
def test_populate_from_datasets_cell_attributes_merging(self):
    """Check that a per-cell attribute is concatenated across datasets.

    Each input has 5 cells with a ``test`` attribute filled with "1" or
    "2"; the merged attribute is expected to have shape (10, 1) and list
    the first dataset's values followed by the second's.
    """
    counts = np.random.randint(1, 5, size=(5, 10))
    genes = np.array(["gene_%d" % idx for idx in range(10)])
    ones_column = np.array([["1"]] * 5)
    twos_column = np.array([["2"]] * 5)

    first = GeneExpressionDataset()
    second = GeneExpressionDataset()
    first.populate_from_data(
        counts, gene_names=genes, cell_attributes_dict={"test": ones_column}
    )
    second.populate_from_data(
        counts, gene_names=genes, cell_attributes_dict={"test": twos_column}
    )

    merged = GeneExpressionDataset()
    merged.populate_from_datasets([first, second])

    self.assertTupleEqual(merged.test.shape, (10, 1))
    flattened = np.squeeze(merged.test).tolist()
    self.assertListEqual(flattened, ["1"] * 5 + ["2"] * 5)
def test_populate_from_datasets_gene_attributes_merging(self):
    """Check which dataset wins when a per-gene attribute is merged.

    Both inputs define a ``test`` gene attribute with conflicting values;
    the merged dataset is expected to keep the first dataset's value.
    """
    counts = np.random.randint(1, 5, size=(5, 10))
    genes = np.array(["gene_%d" % idx for idx in range(10)])
    ones_column = np.array([["1"]] * 10)
    twos_column = np.array([["2"]] * 10)

    first = GeneExpressionDataset()
    second = GeneExpressionDataset()
    first.populate_from_data(
        counts, gene_names=genes, gene_attributes_dict={"test": ones_column}
    )
    second.populate_from_data(
        counts, gene_names=genes, gene_attributes_dict={"test": twos_column}
    )

    merged = GeneExpressionDataset()
    merged.populate_from_datasets([first, second])

    # Should keep the gene attribute of the first dataset
    self.assertEqual(merged.test[0, 0], "1")
class scVI(Base_scVI):
    """Joint model over the partial FISH dataset and the sequencing dataset.

    A merged dataset is built from deep copies of ``data.data_fish_partial``
    and ``data.data_seq`` (in that order), a VAE is trained on it, and the
    resulting latent space is used to impute FISH expression with a k-NN
    regressor.

    NOTE(review): ``self.data``, ``self.n_latent``, ``self.USE_CUDA``,
    ``self.trainer_fish`` and ``self.trainer_seq`` are read here but never
    assigned in this class - presumably provided by ``Base_scVI``; confirm.
    """

    def __init__(self, data, name, n_latent=10, reconstruction_seq='zinb'):
        """Build the merged dataset and remember the reconstruction loss.

        Args:
            data: Object exposing ``data_fish_partial`` and ``data_seq``.
            name: Forwarded unchanged to the base class.
            n_latent: Forwarded unchanged to the base class.
            reconstruction_seq: Value later passed to the VAE as
                ``reconstruction_loss``.
        """
        super().__init__(data, name, n_latent)
        # Deep copies are merged, not the caller's dataset objects.
        sources = [
            copy.deepcopy(data.data_fish_partial),
            copy.deepcopy(data.data_seq),
        ]
        self.full_dataset = GeneExpressionDataset()
        self.full_dataset.populate_from_datasets(sources)
        self.full_dataset.compute_library_size_batch()
        self.reconstruction_seq = reconstruction_seq

    def train_both(self, n_epochs=20):
        """Train a VAE on the merged dataset and store its trainer.

        Args:
            n_epochs: Number of epochs passed to the trainer.
        """
        merged = self.full_dataset
        model = VAE(
            merged.nb_genes,
            n_latent=self.n_latent,
            n_batch=merged.n_batches,
            dispersion="gene-batch",
            reconstruction_loss=self.reconstruction_seq,
        )
        trainer = UnsupervisedTrainer(
            model,
            merged,
            train_size=0.95,
            use_cuda=self.USE_CUDA,
            frequency=1,
        )
        self.trainer_both = trainer
        # No posterior is stored here; compute_latent() creates one on demand.
        trainer.train(n_epochs=n_epochs, lr=0.001)

    def compute_latent(self):
        """Compute and store the latent representations of all three models.

        The joint latent matrix is split at the number of rows of the
        partial FISH dataset: rows before that index are stored as the FISH
        part and the rest as the sequencing part.

        Returns:
            Tuple ``(latent_both_fish, latent_both_seq, latent_only_fish,
            latent_only_seq)``.
        """
        self.latent_both = (
            self.trainer_both.create_posterior().get_latent()[0]
        )
        n_fish = self.data.data_fish_partial.X.shape[0]
        self.latent_both_fish = self.latent_both[:n_fish, :]
        self.latent_both_seq = self.latent_both[n_fish:, :]

        self.latent_only_fish = (
            self.trainer_fish.create_posterior().get_latent()[0]
        )
        self.latent_only_seq = (
            self.trainer_seq.create_posterior().get_latent()[0]
        )
        return (
            self.latent_both_fish,
            self.latent_both_seq,
            self.latent_only_fish,
            self.latent_only_seq,
        )

    def compute_imputed_values(self, k=10):
        """Impute FISH expression from sequencing neighbours in latent space.

        Reads ``latent_both_seq`` and ``latent_both_fish``, so
        ``compute_latent`` must have been called beforehand.

        Args:
            k: Number of neighbours used by the regressor.

        Returns:
            Columns of the imputed matrix selected by
            ``self.data.test_indices`` (also stored as ``self.imputed``;
            the full matrix is stored as ``self.imputed_full``).
        """
        seq_data = self.data.data_seq
        # Normalise each sequencing cell by its total count.
        seq_totals = np.sum(seq_data.X, axis=1)[:, np.newaxis]
        seq_profiles = seq_data.X / seq_totals

        regressor = KNeighborsRegressor(k, weights="distance")
        regressor.fit(self.latent_both_seq, seq_profiles)
        fish_profiles = regressor.predict(self.latent_both_fish)

        # Rescale predicted profiles by each FISH cell's total count.
        fish_totals = self.data.data_fish_partial.X.sum(axis=1).reshape(-1, 1)
        self.imputed_full = fish_profiles * fish_totals
        self.imputed = self.imputed_full[:, self.data.test_indices]
        return self.imputed