예제 #1
0
    def test_dense_subsample_genes(self):
        data = [
            np.random.randint(1, 5, size=(50, 26)),
            np.random.randint(1, 5, size=(50, 26)),
            np.random.randint(1, 5, size=(50, 26)),
        ]

        # With default
        dataset = GeneExpressionDataset()
        dataset.populate_from_per_batch_list(data)
        n_genes = dataset.nb_genes
        n_top = n_genes // 2
        dataset.subsample_genes(new_n_genes=n_top)
        assert dataset.nb_genes < n_genes
        # For some reason the new number of genes can be slightly different than n_top

        # With Seurat
        dataset = GeneExpressionDataset()
        dataset.populate_from_per_batch_list(data)
        dataset.subsample_genes(new_n_genes=n_top, mode="seurat")
        assert dataset.nb_genes < n_genes
예제 #2
0
    def test_batch_correction(self):
        data = [
            np.random.randint(1, 5, size=(50, 25)),
            np.random.randint(1, 5, size=(50, 25)),
            np.random.randint(1, 5, size=(50, 25)),
        ]
        dataset = GeneExpressionDataset()
        dataset.populate_from_per_batch_list(data)

        n_genes = dataset.nb_genes
        n_top = n_genes // 2
        dataset._highly_variable_genes(n_bins=3, flavor="seurat_v2")
        df = dataset._highly_variable_genes(
            n_bins=3, n_top_genes=n_top, flavor="seurat_v2"
        )
        assert df["highly_variable"].sum() >= n_top

        dataset.subsample_genes(new_n_genes=n_top)
        new_genes = dataset.nb_genes
        assert n_genes > new_genes, "subsample_genes did not filter out genes"
        pass
예제 #3
0
    def test_dense_subsample_genes(self):
        data = [
            np.random.randint(1, 5, size=(50, 26)),
            np.random.randint(1, 5, size=(50, 26)),
            np.random.randint(1, 5, size=(50, 26)),
        ]

        # With default
        dataset = GeneExpressionDataset()
        dataset.populate_from_per_batch_list(data)
        n_genes = dataset.nb_genes
        n_top = n_genes // 2
        dataset.subsample_genes(new_n_genes=n_top, mode="cell_ranger")
        assert dataset.nb_genes == n_top

        # With Seurat v2
        dataset = GeneExpressionDataset()
        dataset.populate_from_per_batch_list(data)
        dataset.subsample_genes(new_n_genes=n_top, mode="seurat_v2")
        assert dataset.nb_genes == n_top

        # With Seurat v3
        dataset = GeneExpressionDataset()
        dataset.populate_from_per_batch_list(data)
        dataset.subsample_genes(new_n_genes=n_top, mode="seurat_v3")
        assert dataset.nb_genes == n_top
예제 #4
0
    def test_subsample_genes(self):
        data = np.ones((25, 100)) * 100
        variable_data = data
        variable_data[0, :] = 2
        variable_data *= np.arange(0, 100)

        gene_names = np.array(["gene_%d" % i for i in range(100)])
        dataset = GeneExpressionDataset()
        dataset.populate_from_data(data, gene_names=gene_names)
        dataset.subsample_genes(new_ratio_genes=0.4, mode="variance")
        self.assertTupleEqual(dataset.gene_names.shape, (40, ))
        dataset.subsample_genes(new_n_genes=25, mode="variance")
        self.assertTupleEqual(dataset.gene_names.shape, (25, ))
        # The most variable genes should be in first position
        self.assertEqual(dataset.gene_names[0], "GENE_99")
        dataset.subsample_genes(subset_genes=[1, 6, 7])
        self.assertEqual(dataset.gene_names[0], "GENE_98")
예제 #5
0
            #initial number of sampled cells
            sampling_size = 500
            while sampling_size < n_retained_cells:
                cells_sizes.append(sampling_size)
                sampling_size = int(sampling_size*np.sqrt(2))

            # cells_sizes = np.logspace(np.log2(500), np.log2(n_retained_cells), num=9, base=2).astype(int)
            print('Number of sampled cells for ', ds, cells_sizes)

            cells_dataset = GeneExpressionDataset()
            X_ = adata.layers['0']
            cells_dataset.populate_from_data(X_, gene_names=adata.var.index.values)

            #we subsambple to 1000 genes for speed and to prevent overfitting
            cells_dataset.filter_genes_by_count(per_batch=True)
            cells_dataset.subsample_genes(1000)
            sel_genes = cells_dataset.gene_names

            n_validation = adata.shape[0] - n_retained_cells
            print(ds, ' n_validation:', n_validation)

            validation_cells = np.random.choice(adata.obs.index, size=n_validation, replace=False)
            learning_cells = adata.obs.index.difference(validation_cells)

            val_adata = adata[validation_cells]
            lea_adata = adata[learning_cells]

            ne_cells = X_.sum(axis=1) > 0
            to_keep = np.where(ne_cells)[0]

            log_counts = np.log(X_[to_keep].sum(axis=1))