def __init__(self, save_path="data/"):
    """Build the dataset from the 10X ``neuron_9k`` data plus downloaded metadata.

    The expression matrix comes from ``Dataset10X(filename="neuron_9k")``.
    Cluster labels and QC metrics are read from ``brain_small_metadata.pickle``
    (fetched through ``self.download()``) and aligned on the 10X barcodes.

    :param save_path: directory used both for the 10X data and for the
        downloaded metadata pickle.
    """
    dataset = Dataset10X(filename="neuron_9k", save_path=save_path)

    self.save_path = save_path
    self.urls = [
        "https://github.com/YosefLab/scVI-data/raw/master/brain_small_metadata.pickle"
    ]
    self.download_names = ["brain_small_metadata.pickle"]
    self.download()

    metadata_path = os.path.join(self.save_path, "brain_small_metadata.pickle")
    # NOTE(review): pickle can execute arbitrary code while loading; this is
    # only acceptable because the file comes from the fixed URL above.
    # The context manager guarantees the file handle is closed.
    with open(metadata_path, "rb") as metadata_file:
        metadata = pickle.load(metadata_file)

    # Index the metadata by the barcodes of the cells actually loaded.
    barcodes = dataset.barcodes.values.ravel()
    # Stored clusters are shifted down by one before being used as labels.
    labels = metadata["clusters"].loc[barcodes] - 1
    self.raw_qc = metadata["raw_qc"].loc[barcodes]
    self.qc_names = self.raw_qc.columns
    self.qc = self.raw_qc.values

    GeneExpressionDataset.__init__(
        self,
        dataset.X,
        dataset.local_means,
        dataset.local_vars,
        batch_indices=dataset.batch_indices,
        labels=labels,
    )
def create_dataset(self, path):
    """Read an RDS file through R and convert it to a GeneExpressionDataset.

    The R object is loaded as ``sce``; its ``logcounts`` are turned back into
    integer counts, cell-type labels are integer-encoded, and cells whose
    total count is 10 or less are dropped.

    :param path: path of the ``.rds`` file, interpolated into the R call.
    :return: the populated ``GeneExpressionDataset`` with ``gene_symbols`` set.
    """
    print("Reading rds")
    ro.r("sce<-readRDS('%s')" % path)
    print("Extracting log counts")
    log_counts = ro.r("logcounts(sce)")
    print("Transforming log count to counts")
    # exp(x * ln 2) - 1 == 2**x - 1; presumably logcounts are log2(count + 1)
    # -- TODO confirm against the producer of the RDS files.
    # ``int`` replaces the ``np.int`` alias, which was removed in NumPy 1.24.
    counts = (np.exp(log_counts * np.log(2)) - 1).T.astype(int)
    gene_symbols = ro.r("rowData(sce)$feature_symbol")

    labels = ro.r("colData(sce)$cell_type1")
    labels_levels = ro.r("levels(colData(sce)$cell_type1)")
    if labels_levels is not rpy2.rinterface.NULL:
        # R factor codes are 1-based indices into the levels vector.
        labels = np.array([labels_levels[int(code) - 1] for code in labels])

    # Sorted unique cell types plus, for every cell, its index in that list.
    unique_types, labels = np.unique(np.asarray(labels), return_inverse=True)
    cell_types = list(unique_types)

    # Filter bad quality cells
    valid_idx = (counts.sum(axis=1) > 10).ravel()
    counts = counts[valid_idx]
    labels = labels[valid_idx]

    gene_expression_dataset = GeneExpressionDataset(
        *GeneExpressionDataset.get_attributes_from_matrix(counts, labels=labels),
        cell_types=cell_types)
    gene_expression_dataset.gene_symbols = gene_symbols
    return gene_expression_dataset
def create_datasets():
    """Create two small deterministic datasets with partly overlapping genes.

    ``test_a`` has 100 cells x 5 genes (``A``-``E``) and four cell types;
    ``test_b`` has 100 cells x 3 genes (``B``, ``F``, ``A``) and three cell
    types. All random draws come from one ``RandomState(0)`` so the fixture is
    reproducible.

    :return: tuple ``(dataset_a, dataset_b)``.
    """
    rs = RandomState(0)

    data_a = np.sort(rs.normal(0, 10, 500)).astype(int).reshape(100, 5)
    gene_names_a = list("ABCDE")
    cell_types_a = ["alpha", "beta", "gamma", "delta"]
    labels_a = rs.choice(np.arange(len(cell_types_a)), data_a.shape[0])

    data_b = np.sort(rs.normal(100, 10, 300)).astype(int).reshape(100, 3)
    gene_names_b = list("BFA")
    cell_types_b = ["alpha", "epsilon", "rho"]
    labels_b = rs.choice(np.arange(len(cell_types_b)), data_b.shape[0])
    batch_indices_b = rs.choice(np.arange(5), size=data_b.shape[0])

    # Previously drawn from the unseeded global ``np.random``, which made the
    # fixture non-reproducible. Drawn last from ``rs`` so every value
    # generated above stays identical to what it was before.
    batch_indices_a = rs.choice(np.arange(5), size=data_a.shape[0])

    dataset_a = GeneExpressionDataset()
    dataset_b = GeneExpressionDataset()
    dataset_a.populate_from_data(
        X=data_a,
        labels=labels_a,
        gene_names=gene_names_a,
        cell_types=cell_types_a,
        batch_indices=batch_indices_a,
    )
    dataset_a.name = "test_a"
    dataset_b.populate_from_data(
        X=data_b,
        labels=labels_b,
        gene_names=gene_names_b,
        cell_types=cell_types_b,
        batch_indices=batch_indices_b,
    )
    dataset_b.name = "test_b"
    return dataset_a, dataset_b
def test_collate_add(self):
    """Extra attributes requested from ``collate_fn_builder`` are collated too."""
    n_cells = 25
    # Row i of every array is derived from the cell index i.
    data = np.ones((n_cells, 2)) * np.arange(0, n_cells).reshape((-1, 1))
    batch_indices = np.arange(0, n_cells).reshape((-1, 1))
    x_coords = np.arange(0, n_cells).reshape((-1, 1))
    cell_offsets = np.arange(0, n_cells).reshape((-1, 1))
    proteins = np.ones((n_cells, 3)) + cell_offsets + np.arange(0, 3)
    proteins_name = ["A", "B", "C"]

    protein_measurement = CellMeasurement(
        name="proteins",
        data=proteins,
        columns_attr_name="protein_names",
        columns=proteins_name,
    )
    dataset = GeneExpressionDataset()
    dataset.populate_from_data(
        data,
        batch_indices=batch_indices,
        cell_attributes_dict={"x_coords": x_coords},
        Ys=[protein_measurement],
    )

    extra_attributes = {"x_coords": np.float32, "proteins": np.float32}
    collate_fn = dataset.collate_fn_builder(
        add_attributes_and_types=extra_attributes)

    # Five default outputs followed by the two requested attributes.
    _x, _mean, _var, _batch, _labels, coords_out, proteins_out = collate_fn(
        [1, 2])
    self.assertListEqual(coords_out.tolist(), [[1.0], [2.0]])
    self.assertListEqual(
        proteins_out.tolist(), [[2.0, 3.0, 4.0], [3.0, 4.0, 5.0]])
def test_filter_and_concat_datasets():
    """Filtering, concatenating and subsampling on cortex and synthetic data."""
    # Two cortex datasets whose gene windows overlap on indices 100..299.
    first_cortex = CortexDataset()
    first_cortex.subsample_genes(subset_genes=np.arange(0, 300))
    first_cortex.filter_cell_types(["microglia", "oligodendrocytes"])

    second_cortex = CortexDataset()
    second_cortex.subsample_genes(subset_genes=np.arange(100, 400))
    second_cortex.filter_cell_types(
        ["endothelial-mural", "interneurons", "microglia", "oligodendrocytes"])
    second_cortex.filter_cell_types([2, 0])

    merged_cortex = GeneExpressionDataset.concat_datasets(
        first_cortex, second_cortex)
    assert merged_cortex.nb_genes == 200

    # Synthetic datasets: batches always add up, labels depend on the flag.
    synth_a = SyntheticDataset(n_batches=2, n_labels=5)
    synth_b = SyntheticDataset(n_batches=3, n_labels=3)

    merged_shared = GeneExpressionDataset.concat_datasets(synth_a, synth_b)
    assert merged_shared.n_batches == 5
    assert merged_shared.n_labels == 5

    merged_separate = GeneExpressionDataset.concat_datasets(
        synth_a, synth_b, shared_labels=False)
    assert merged_separate.n_batches == 5
    assert merged_separate.n_labels == 8

    synth_a.filter_cell_types([0, 1, 2, 3])
    assert synth_a.n_labels == 4

    synth_a.subsample_cells(50)
    assert len(synth_a) == 50
def test_filter_and_concat_datasets():
    """Filtering, concatenating, subsampling and cell-type mapping smoke test."""
    cortex_dataset_1 = CortexDataset(save_path='tests/data/')
    cortex_dataset_1.subsample_genes(subset_genes=np.arange(0, 3))
    cortex_dataset_1.filter_cell_types(["microglia", "oligodendrocytes"])

    cortex_dataset_2 = CortexDataset(save_path='tests/data/')
    cortex_dataset_2.subsample_genes(subset_genes=np.arange(1, 4))
    cortex_dataset_2.filter_cell_types(
        ["endothelial-mural", "interneurons", "microglia", "oligodendrocytes"])
    cortex_dataset_2.filter_cell_types([2, 0])

    # Gene windows [0, 3) and [1, 4) share two genes.
    cortex_dataset_merged = GeneExpressionDataset.concat_datasets(
        cortex_dataset_1, cortex_dataset_2)
    assert cortex_dataset_merged.nb_genes == 2

    synthetic_dataset_1 = SyntheticDataset(n_batches=2, n_labels=5)
    synthetic_dataset_2 = SyntheticDataset(n_batches=3, n_labels=3)

    synthetic_merged_1 = GeneExpressionDataset.concat_datasets(
        synthetic_dataset_1, synthetic_dataset_2)
    assert synthetic_merged_1.n_batches == 5
    assert synthetic_merged_1.n_labels == 5

    synthetic_merged_2 = GeneExpressionDataset.concat_datasets(
        synthetic_dataset_1, synthetic_dataset_2, shared_labels=False)
    assert synthetic_merged_2.n_batches == 5
    assert synthetic_merged_2.n_labels == 8

    synthetic_dataset_1.filter_cell_types([0, 1, 2, 3])
    assert synthetic_dataset_1.n_labels == 4

    synthetic_dataset_1.subsample_cells(50)
    assert len(synthetic_dataset_1) == 50

    synthetic_dataset_3 = SyntheticDataset(n_labels=6)
    # ``str`` replaces the ``np.str`` alias, which was removed in NumPy 1.24.
    synthetic_dataset_3.cell_types = np.arange(6).astype(str)
    # Only checks that mapping single and tuple keys runs without raising.
    synthetic_dataset_3.map_cell_types({"2": "9", ("4", "3"): "8"})
def test_genes_to_idx(self):
    """``genes_to_index`` finds genes queried with upper-cased names."""
    n_genes = 10
    data = np.random.randint(1, 5, size=(5, n_genes))
    lower_names = ["gene_%d" % i for i in range(n_genes)]
    upper_names = ["GENE_%d" % i for i in range(n_genes)]

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(data, gene_names=np.array(lower_names))

    indices = dataset.genes_to_index(upper_names)
    self.assertListEqual(list(range(n_genes)), indices.tolist())
def test_compute_library_size_batch(self):
    """Library-size statistics for a constant matrix whose rows sum to exp(10)."""
    n_cells, n_genes = 7, 10
    # Each entry is exp(10) / 10, so every row of 10 genes sums to exp(10).
    per_entry = np.exp(10) / 10
    data = per_entry * np.ones((n_cells, n_genes), dtype=int)

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(data)

    expected_means = [[10.0]] * n_cells
    expected_vars = [[0.0]] * n_cells
    self.assertEqual(expected_means, dataset.local_means.tolist())
    self.assertEqual(expected_vars, dataset.local_vars.tolist())
def test_remap_categorical_attributes(self):
    """Labels ``1``/``2`` are expected to come out of population as ``0``/``1``."""
    data = np.random.randint(1, 5, size=(7, 11))
    raw_labels = [1, 1, 1, 1, 1, 2, 2]

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(data, labels=raw_labels)

    # One single-element row per cell.
    expected = [[0]] * 5 + [[1]] * 2
    self.assertListEqual(expected, dataset.labels.tolist())
def test_collate_normal(self):
    """The default collate function returns the selected rows and batches."""
    n_cells = 25
    # Row i of ``data`` is filled with the value i; batch index i is i as well.
    data = np.ones((n_cells, 2)) * np.arange(0, n_cells).reshape((-1, 1))
    batch_indices = np.arange(0, n_cells).reshape((-1, 1))

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(data, batch_indices=batch_indices)

    collate = dataset.collate_fn_builder()
    x_out, _mean, _var, batch_out, _labels = collate([1, 2])

    self.assertListEqual(x_out.tolist(), [[1.0, 1.0], [2.0, 2.0]])
    self.assertListEqual(batch_out.tolist(), [[1], [2]])
def __init__(self, data, name, n_latent=10, reconstruction_seq='zinb'):
    """Initialise the parent and build a merged fish + seq dataset.

    :param data: object exposing ``data_fish_partial`` and ``data_seq``.
    :param name: forwarded to the parent constructor.
    :param n_latent: forwarded to the parent constructor.
    :param reconstruction_seq: stored on the instance as-is.
    """
    super().__init__(data, name, n_latent)

    self.full_dataset = merged = GeneExpressionDataset()
    # Deep copies keep the caller's datasets untouched by the merge.
    sources = [
        copy.deepcopy(data.data_fish_partial),
        copy.deepcopy(data.data_seq),
    ]
    merged.populate_from_datasets(sources)
    merged.compute_library_size_batch()

    self.reconstruction_seq = reconstruction_seq
def test_populate_from_data(self):
    """Populating from a bare matrix sets the sizes and default annotations."""
    n_cells, n_genes = 25, 10
    data = np.ones((n_cells, n_genes)) * 100

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(data)

    self.assertEqual(dataset.nb_genes, n_genes)
    self.assertEqual(dataset.nb_cells, n_cells)

    # default batch_indices and labels
    all_zero = [[0]] * n_cells
    self.assertListEqual(all_zero, dataset.batch_indices.tolist())
    self.assertListEqual(all_zero, dataset.labels.tolist())
def test_multibatches_features():
    """Train briefly on a four-batch dataset and impute with batch transforms."""
    # (exclusive upper bound, number of cells) per batch; 10 genes each.
    batch_specs = [(5, 20), (10, 20), (10, 20), (10, 30)]
    data = [
        np.random.randint(1, high, size=(n_cells, 10))
        for high, n_cells in batch_specs
    ]

    dataset = GeneExpressionDataset()
    dataset.populate_from_per_batch_list(data)

    model = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(
        model, dataset, train_size=0.5, use_cuda=use_cuda)
    trainer.train(n_epochs=2)

    # Smoke checks: a single target batch, then a list of target batches.
    trainer.test_set.imputation(n_samples=2, transform_batch=0)
    trainer.train_set.imputation(n_samples=2, transform_batch=[0, 1, 2])
def test_subsample_cells(self):
    """``subsample_cells`` with no size, a float ratio, and an int count."""
    # Five cells; every gene of cell i holds the value i + 1.
    data = np.outer(np.arange(1, 6), np.ones(7))
    dataset = GeneExpressionDataset()
    dataset.populate_from_data(data)

    # default
    dataset.subsample_cells()
    self.assertEqual(5, dataset.nb_cells)

    # when size is a float
    dataset.subsample_cells(size=0.8)
    expected = np.outer(np.arange(5, 1, -1), np.ones(7))
    self.assertListEqual(expected.tolist(), dataset.X.tolist())

    # when size is an int
    dataset.subsample_cells(size=2)
    self.assertEqual(2, dataset.nb_cells)
def __init__(
    self,
    model,
    gene_dataset: GeneExpressionDataset,
    shuffle=False,
    indices=None,
    use_cuda=True,
    data_loader_kwargs=None,
):
    """Wrap ``gene_dataset`` in a ``DataLoader`` with the matching sampler.

    :param model: stored on the instance as ``self.model``.
    :param gene_dataset: dataset to iterate; also supplies the collate function.
    :param shuffle: use a ``RandomSampler`` instead of a ``SequentialSampler``
        when ``indices`` is not given.
    :param indices: optional subset of cells, as integer indices or as a
        boolean mask. Sampled with ``SubsetRandomSampler``.
    :param use_cuda: stored on the instance as ``self.use_cuda``.
    :param data_loader_kwargs: extra keyword arguments for ``DataLoader``;
        ``None`` means no extra arguments. The mapping is copied, never
        mutated.
    :raises ValueError: if both ``indices`` and ``shuffle`` are given.

    NOTE(review): the previous docstring read "When added to annotation, has
    a private name attribute"; nothing in this method sets such an attribute.
    """
    self.model = model
    self.gene_dataset = gene_dataset
    self.to_monitor = []
    self.use_cuda = use_cuda

    if indices is not None and shuffle:
        raise ValueError("indices is mutually exclusive with shuffle")

    if indices is None:
        if shuffle:
            sampler = RandomSampler(gene_dataset)
        else:
            sampler = SequentialSampler(gene_dataset)
    else:
        # Compare dtypes by equality rather than identity.
        if hasattr(indices, "dtype") and indices.dtype == np.dtype("bool"):
            # Turn a boolean mask into the positions of its True entries.
            indices = np.where(indices)[0].ravel()
        sampler = SubsetRandomSampler(indices)

    # ``None`` replaces the former mutable ``dict()`` default.
    if data_loader_kwargs is None:
        self.data_loader_kwargs = {}
    else:
        self.data_loader_kwargs = copy.copy(data_loader_kwargs)
    self.data_loader_kwargs.update(
        {"collate_fn": gene_dataset.collate_fn_builder(), "sampler": sampler}
    )
    self.data_loader = DataLoader(gene_dataset, **self.data_loader_kwargs)
def test_populate_from_per_label_list(self):
    """One matrix per label yields stacked cells labelled by list position."""
    cells_per_label = [7, 5, 3]
    data = [np.random.randint(1, 5, size=(n, 10)) for n in cells_per_label]

    dataset = GeneExpressionDataset()
    dataset.populate_from_per_label_list(data)

    self.assertEqual(dataset.nb_cells, 15)
    self.assertEqual(dataset.nb_genes, 10)

    # Label k repeated once per cell of the k-th matrix, as a column vector.
    expected_labels = np.repeat(np.arange(3), cells_per_label)[:, None]
    self.assertListEqual(dataset.labels.tolist(), expected_labels.tolist())
def test_data_loader(self):
    """A dataset exported to AnnData and re-imported keeps its contents."""
    data = np.ones((25, 10)) * 100
    # Every row of the paired measurement is [0, 1, 2, 3].
    paired = np.tile(np.arange(0, 4, dtype=float), (25, 1))
    pair_names = ["gabou", "achille", "pedro", "oclivio"]
    measurement = CellMeasurement(
        name="dev",
        data=paired,
        columns_attr_name="dev_names",
        columns=pair_names,
    )

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(data, Ys=[measurement])

    exported = dataset.to_anndata()
    reloaded = AnnDatasetFromAnnData(
        exported, cell_measurements_col_mappings={"dev": "dev_names"})

    self.assertTrue((paired == reloaded.dev).all())
    self.assertTrue((dataset.X == reloaded.X).all())
    self.assertTrue((dataset.cell_types == reloaded.cell_types).all())
def __init__(self, filename, save_path='data/', type='filtered', dense=False,
             remote=True, genecol=0):
    """Load a 10X dataset, either downloaded or from a local directory.

    :param filename: dataset name; a key of ``to_groups`` when ``remote`` is
        true, otherwise a directory name under ``save_path``.
    :param save_path: base directory for downloaded or local data.
    :param type: matrix flavour used in the URL and in the archive name.
    :param dense: stored on the instance as ``self.dense``.
    :param remote: build a download URL if true, else use a local directory.
    :param genecol: stored on the instance as ``self.genecol``.
    :raises AssertionError: if ``remote`` is false and the directory
        ``save_path/filename`` does not exist.
    """
    self.remote = remote
    self.save_path = save_path
    self.genecol = genecol

    if self.remote:
        group = to_groups[filename]
        url_skeleton = group_to_url_skeleton[group]
        self.url = url_skeleton.format(group, filename, filename, type)
        self.save_path = os.path.join(save_path, '10X/%s/' % filename)
        self.save_name = '%s_gene_bc_matrices' % type
        self.download_name = self.save_name + '.tar.gz'
    else:
        local_dir = os.path.join(self.save_path, filename)
        # Explicit check instead of ``assert`` so it still runs under
        # ``python -O``. AssertionError is kept for existing callers.
        if not os.path.isdir(local_dir):
            print("The file %s was not found in the location you gave" % filename)
            raise AssertionError("%s is not a directory" % local_dir)
        self.save_path = local_dir

    self.dense = dense

    expression_data, gene_names = self.download_and_preprocess()
    super().__init__(
        *GeneExpressionDataset.get_attributes_from_matrix(expression_data),
        gene_names=gene_names)
def __init__(
    self,
    model: TOTALVI,
    gene_dataset: GeneExpressionDataset,
    shuffle: bool = False,
    indices: Optional[np.ndarray] = None,
    use_cuda: bool = True,
    data_loader_kwargs: Optional[dict] = None,
):
    """Initialise the parent, then rebuild the loader to include proteins.

    All arguments are forwarded unchanged to the parent constructor.
    Afterwards the collate function is replaced by one that also returns the
    ``protein_expression`` attribute as ``np.float32``, and the
    ``DataLoader`` is rebuilt with it.

    :param data_loader_kwargs: extra keyword arguments for ``DataLoader``;
        ``None`` means no extra arguments.
    """
    # ``None`` replaces the former mutable ``dict()`` default. It is
    # normalised here so the parent always receives a real mapping.
    if data_loader_kwargs is None:
        data_loader_kwargs = {}

    super().__init__(
        model,
        gene_dataset,
        shuffle=shuffle,
        indices=indices,
        use_cuda=use_cuda,
        data_loader_kwargs=data_loader_kwargs,
    )
    # Add protein tensor as another tensor to be loaded
    self.data_loader_kwargs.update(
        {
            "collate_fn": gene_dataset.collate_fn_builder(
                {"protein_expression": np.float32}
            )
        }
    )
    self.data_loader = DataLoader(gene_dataset, **self.data_loader_kwargs)
def training_score_scvi(train, **kwargs):
    """Fit scVI on ``train`` and return the Poisson log-likelihood of ``train``.

    :param train: count matrix, cells by genes.
    :param kwargs: accepted but not used.
    :return: sum of the Poisson log-pmf of ``train`` under the imputed means.
    """
    from scvi.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE

    attributes = GeneExpressionDataset.get_attributes_from_matrix(train)
    dataset = GeneExpressionDataset(*attributes)
    model = VAE(n_input=train.shape[1])
    trainer = UnsupervisedTrainer(model, dataset, verbose=False)
    trainer.train(n_epochs=100)

    # Training permuted the data for minibatching. Unpermute before "imputing"
    # (estimating lambda)
    train_part = trainer.train_set.sequential().imputation()
    test_part = trainer.test_set.sequential().imputation()
    lam = np.vstack([train_part, test_part])

    return st.poisson(mu=lam).logpmf(train).sum()
def test_populate_from_datasets_cortex(self):
    """Merging two filtered cortex datasets keeps only the shared genes."""
    first = CortexDataset(save_path="tests/data")
    first.subsample_genes(subset_genes=np.arange(0, 3), mode="variance")
    first.filter_cell_types(["microglia", "oligodendrocytes"])

    second = CortexDataset(save_path="tests/data")
    second.subsample_genes(subset_genes=np.arange(1, 4), mode="variance")
    kept_types = [
        "endothelial-mural", "interneurons", "microglia", "oligodendrocytes"
    ]
    second.filter_cell_types(kept_types)
    second.filter_cell_types([2, 0])

    merged = GeneExpressionDataset()
    merged.populate_from_datasets([first, second])

    # Gene windows [0, 3) and [1, 4) overlap on two genes.
    self.assertEqual(2, merged.nb_genes)
def __init__(self, filename, save_path='data/', type='filtered', dense=False,
             remote=True):
    """Load a 10X dataset, either downloaded or from a local directory.

    :param filename: dataset name; a key of ``to_groups`` when ``remote`` is
        true, otherwise a directory name under ``save_path``.
    :param save_path: base directory for downloaded or local data.
    :param type: matrix flavour used in the URL and in the archive name.
    :param dense: stored on the instance as ``self.dense``.
    :param remote: build a download URL if true, else use a local directory.
    :raises AssertionError: if ``remote`` is false and the directory
        ``save_path/filename`` does not exist.
    """
    self.remote = remote
    self.save_path = save_path

    if self.remote:
        group = to_groups[filename]
        self.url = (
            "http://cf.10xgenomics.com/samples/cell-exp/%s/%s/%s_%s_gene_bc_matrices.tar.gz"
            % (group, filename, filename, type))
        self.save_path = os.path.join(save_path, '10X/%s/' % filename)
        self.save_name = '%s_gene_bc_matrices' % type
        self.download_name = self.save_name + '.tar.gz'
    else:
        local_dir = os.path.join(self.save_path, filename)
        # Explicit check instead of ``assert`` so it still runs under
        # ``python -O``. AssertionError is kept for existing callers.
        if not os.path.isdir(local_dir):
            print("The file %s was not found in the location you gave" % filename)
            raise AssertionError("%s is not a directory" % local_dir)
        self.save_path = local_dir

    self.dense = dense

    expression_data, gene_names = self.download_and_preprocess()
    super(Dataset10X, self).__init__(
        *GeneExpressionDataset.get_attributes_from_matrix(expression_data),
        gene_names=gene_names)
def assign_label(cellid, geneid, labels_map, count, cell_type, seurat):
    """Build a dataset whose cells and labels follow a Seurat table.

    Row 0 of ``seurat`` is skipped. Column 4 supplies integer labels, which
    are remapped through ``labels_map``; column 5 supplies the cell ids used
    to pick the matching rows of ``count``.

    :param cellid: cell ids, in the same order as ``count``.
    :param geneid: gene names passed to the dataset.
    :param labels_map: sequence where position ``i`` holds the new label for
        old label ``i``.
    :param count: per-cell count rows, stackable with ``sparse.vstack``.
    :param cell_type: cell-type names passed to the dataset.
    :param seurat: 2-D table indexed as ``seurat[1:, 4]`` and ``seurat[1:, 5]``.
    :return: the resulting ``GeneExpressionDataset``.
    """
    original_labels = np.int64(np.asarray(seurat[1:, 4]))
    remapped_labels = deepcopy(original_labels)
    # Masks come from the untouched originals, so remaps cannot chain.
    for old_label, new_label in enumerate(labels_map):
        remapped_labels[original_labels == old_label] = new_label

    count_by_cell = dict(zip(cellid, count))
    ordered_rows = [count_by_cell[cell] for cell in seurat[1:, 5]]
    stacked_counts = sparse.vstack(ordered_rows)

    attributes = GeneExpressionDataset.get_attributes_from_matrix(
        stacked_counts, labels=remapped_labels)
    return GeneExpressionDataset(
        *attributes, gene_names=geneid, cell_types=cell_type)
def __init__(self, n_proteins=7):
    """Load the FACS dataset for the requested number of proteins.

    :param n_proteins: number of proteins; must be 2, 5 or 7.
    :raises AssertionError: if ``n_proteins`` is not one of 2, 5 or 7.
    """
    # Explicit check instead of ``assert`` so validation still runs under
    # ``python -O``. Exception type and message are kept for callers.
    if n_proteins not in (2, 5, 7):
        raise AssertionError("Only support: 2, 5 or 7 protein FACS dataset")
    self.n_proteins = int(n_proteins)

    expression_data = self.download_and_preprocess()
    super().__init__(
        *GeneExpressionDataset.get_attributes_from_matrix(expression_data))
def test_subsample_genes(self):
    """``subsample_genes`` by ratio, by count, and by explicit subset."""
    n_genes = 100
    data = np.ones((25, n_genes)) * 100
    # Make the first cell differ, then scale column j by j, so that
    # higher-indexed genes vary more across cells.
    data[0, :] = 2
    data *= np.arange(0, n_genes)
    gene_names = np.array(["gene_%d" % i for i in range(n_genes)])

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(data, gene_names=gene_names)

    dataset.subsample_genes(new_ratio_genes=0.4, mode="variance")
    self.assertTupleEqual(dataset.gene_names.shape, (40, ))

    dataset.subsample_genes(new_n_genes=25, mode="variance")
    self.assertTupleEqual(dataset.gene_names.shape, (25, ))
    # The most variable genes should be in first position
    self.assertEqual(dataset.gene_names[0], "GENE_99")

    dataset.subsample_genes(subset_genes=[1, 6, 7])
    self.assertEqual(dataset.gene_names[0], "GENE_98")
def make_gene_expression_dataset(data: np.ndarray, gene_names: np.ndarray):
    '''Wrap a count matrix in an scVI GeneExpressionDataset.

    Parameters
    ----------
    data : np.ndarray
        Cells-by-genes matrix.
    gene_names : np.ndarray
        String array holding the gene names.

    Returns
    -------
    GeneExpressionDataset
        Dataset populated from ``data`` and ``gene_names``.
    '''
    dataset = GeneExpressionDataset()
    dataset.populate_from_data(X=data, gene_names=gene_names)
    return dataset
def generalization_score_scvi(train, test, **kwargs):
    """Fit scVI on ``train`` and score the imputed means with ``pois_llik``.

    :param train: count matrix, cells by genes, used for fitting.
    :param test: matrix passed to ``pois_llik`` together with ``train``.
    :param kwargs: accepted but not used.
    :return: whatever ``pois_llik(lam, train, test)`` returns.
    """
    from scvi.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE

    attributes = GeneExpressionDataset.get_attributes_from_matrix(train)
    dataset = GeneExpressionDataset(*attributes)
    model = VAE(n_input=train.shape[1])
    trainer = UnsupervisedTrainer(model, dataset, verbose=False)
    trainer.train(n_epochs=100)

    # Training permuted the data for minibatching. Unpermute before "imputing"
    # (estimating lambda)
    with torch.autograd.set_grad_enabled(False):
        train_part = trainer.train_set.sequential().imputation()
        test_part = trainer.test_set.sequential().imputation()
        lam = np.vstack([train_part, test_part])

    return pois_llik(lam, train, test)
def test_map_cell_types(self):
    """Merging cell types and then remapping gives compact types and labels."""
    data = np.random.randint(1, 5, size=(7, 10))
    labels = [0, 0, 4, 4, 2, 3, 5]
    cell_types = [str(i) for i in range(6)]

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(data, labels=labels, cell_types=cell_types)

    # "0" and "2" are merged into "6"; "3" and "4" into "7".
    merges = {("0", "2"): "6", ("3", "4"): "7"}
    dataset.map_cell_types(merges)
    dataset.remap_categorical_attributes()

    self.assertListEqual(dataset.cell_types.tolist(), ["5", "6", "7"])
    flat_labels = np.squeeze(dataset.labels).tolist()
    self.assertListEqual(flat_labels, [1, 1, 2, 2, 1, 2, 0])
def test_labels(self):
    """Labels are stored as a column; a constant label vector becomes zeros."""
    n_cells = 25
    data = np.ones((n_cells, 10)) * 100

    # Distinct labels 0..24 are expected to be preserved.
    distinct = GeneExpressionDataset()
    distinct.populate_from_data(data, labels=np.arange(n_cells))
    self.assertTupleEqual((n_cells, 1), distinct.labels.shape)
    self.assertEqual(distinct.labels[5, 0], 5)

    # Every cell labelled 5.0: the single category is expected to map to 0.
    constant = GeneExpressionDataset()
    constant.populate_from_data(data, labels=np.full(n_cells, 5.0))
    self.assertTupleEqual(constant.labels.shape, (n_cells, 1))
    self.assertEqual(constant.labels[5, 0], 0)
def test_populate_from_data_with_measurements(self):
    """A ``CellMeasurement`` passed via ``Ys`` shows up as dataset attributes."""
    n_cells, n_genes = 25, 10
    data = np.ones((n_cells, n_genes)) * 100
    # Every row of the paired measurement is [0, 1, 2, 3].
    paired = np.tile(np.arange(0, 4, dtype=float), (n_cells, 1))
    pair_names = ["gabou", "achille", "pedro", "oclivio"]
    measurement = CellMeasurement(
        name="dev",
        data=paired,
        columns_attr_name="dev_names",
        columns=pair_names,
    )

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(data, Ys=[measurement])

    self.assertEqual(dataset.nb_genes, n_genes)
    self.assertEqual(dataset.nb_cells, n_cells)

    for attribute in ("dev", "dev_names"):
        self.assertTrue(hasattr(dataset, attribute))

    self.assertListEqual(dataset.dev_names.tolist(), pair_names)
    self.assertListEqual(dataset.dev[0].tolist(), [0, 1, 2, 3])