def test_dense_subsample_genes(self):
    """Check that subsampling genes on dense batches shrinks the gene set.

    The same three random batches are loaded into a fresh dataset twice:
    once subsampled with the default mode and once with ``mode="seurat"``.
    """
    batches = [np.random.randint(1, 5, size=(50, 26)) for _ in range(3)]

    def fresh_dataset():
        # Each scenario starts from an untouched dataset built on the same batches.
        built = GeneExpressionDataset()
        built.populate_from_per_batch_list(batches)
        return built

    # Scenario 1: default subsampling mode.
    default_ds = fresh_dataset()
    initial_count = default_ds.nb_genes
    target = initial_count // 2
    default_ds.subsample_genes(new_n_genes=target)
    # Only a strict decrease is asserted: the resulting gene count can end up
    # slightly different from the requested target.
    assert default_ds.nb_genes < initial_count

    # Scenario 2: Seurat mode, same target as above.
    seurat_ds = fresh_dataset()
    seurat_ds.subsample_genes(new_n_genes=target, mode="seurat")
    assert seurat_ds.nb_genes < initial_count
def test_batch_correction(self):
    """Exercise highly-variable-gene selection and subsampling on three batches.

    Verifies that ``_highly_variable_genes`` flags at least the requested
    number of genes and that ``subsample_genes`` reduces the gene count.
    """
    batches = [np.random.randint(1, 5, size=(50, 25)) for _ in range(3)]
    dataset = GeneExpressionDataset()
    dataset.populate_from_per_batch_list(batches)

    total_genes = dataset.nb_genes
    wanted = total_genes // 2

    # First call omits n_top_genes; its return value is not inspected here.
    dataset._highly_variable_genes(n_bins=3, flavor="seurat_v2")

    hvg_table = dataset._highly_variable_genes(
        n_bins=3, n_top_genes=wanted, flavor="seurat_v2"
    )
    flagged = hvg_table["highly_variable"].sum()
    assert flagged >= wanted

    dataset.subsample_genes(new_n_genes=wanted)
    remaining = dataset.nb_genes
    assert total_genes > remaining, "subsample_genes did not filter out genes"
def test_dense_subsample_genes(self):
    """Check that each subsampling mode keeps exactly the requested gene count.

    Modes covered, in order: ``cell_ranger``, ``seurat_v2``, ``seurat_v3``.
    Every mode runs on a fresh dataset built from the same random batches.
    """
    batches = [np.random.randint(1, 5, size=(50, 26)) for _ in range(3)]

    def fresh_dataset():
        # Rebuild from scratch so one mode's subsampling cannot affect the next.
        built = GeneExpressionDataset()
        built.populate_from_per_batch_list(batches)
        return built

    # Cell Ranger mode; this first dataset also fixes the target gene count.
    first_ds = fresh_dataset()
    target = first_ds.nb_genes // 2
    first_ds.subsample_genes(new_n_genes=target, mode="cell_ranger")
    assert first_ds.nb_genes == target

    # Seurat v2, then Seurat v3, against the same target.
    for flavor in ("seurat_v2", "seurat_v3"):
        flavor_ds = fresh_dataset()
        flavor_ds.subsample_genes(new_n_genes=target, mode=flavor)
        assert flavor_ds.nb_genes == target
def test_subsample_genes(self):
    """Check variance-based gene subsampling and explicit gene subsetting.

    The expression matrix is built so that per-gene variance grows with the
    gene index, which makes the expected ordering of retained genes known.
    """
    n_cells, n_genes = 25, 100

    # Every cell starts at 100 except the first one (2); scaling column i by i
    # then makes gene i's spread proportional to i.
    counts = np.full((n_cells, n_genes), 100.0)
    counts[0, :] = 2
    counts *= np.arange(0, n_genes)
    names = np.array(["gene_%d" % idx for idx in range(n_genes)])

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(counts, gene_names=names)

    # Keep 40% of the 100 genes.
    dataset.subsample_genes(new_ratio_genes=0.4, mode="variance")
    self.assertTupleEqual(dataset.gene_names.shape, (40,))

    # Then narrow down to an absolute count.
    dataset.subsample_genes(new_n_genes=25, mode="variance")
    self.assertTupleEqual(dataset.gene_names.shape, (25,))

    # The most variable gene is expected first, with its name upper-cased.
    self.assertEqual(dataset.gene_names[0], "GENE_99")

    # Selecting positions 1, 6 and 7 should leave the second-ranked gene first.
    dataset.subsample_genes(subset_genes=[1, 6, 7])
    self.assertEqual(dataset.gene_names[0], "GENE_98")
#initial number of sampled cells sampling_size = 500 while sampling_size < n_retained_cells: cells_sizes.append(sampling_size) sampling_size = int(sampling_size*np.sqrt(2)) # cells_sizes = np.logspace(np.log2(500), np.log2(n_retained_cells), num=9, base=2).astype(int) print('Number of sampled cells for ', ds, cells_sizes) cells_dataset = GeneExpressionDataset() X_ = adata.layers['0'] cells_dataset.populate_from_data(X_, gene_names=adata.var.index.values) #we subsambple to 1000 genes for speed and to prevent overfitting cells_dataset.filter_genes_by_count(per_batch=True) cells_dataset.subsample_genes(1000) sel_genes = cells_dataset.gene_names n_validation = adata.shape[0] - n_retained_cells print(ds, ' n_validation:', n_validation) validation_cells = np.random.choice(adata.obs.index, size=n_validation, replace=False) learning_cells = adata.obs.index.difference(validation_cells) val_adata = adata[validation_cells] lea_adata = adata[learning_cells] ne_cells = X_.sum(axis=1) > 0 to_keep = np.where(ne_cells)[0] log_counts = np.log(X_[to_keep].sum(axis=1))