Exemplo n.º 1
0
    def __init__(self, save_path="data/"):
        """Build the dataset from the 10X ``neuron_9k`` data plus its metadata.

        The expression matrix comes from a ``Dataset10X`` instance; cluster
        labels and quality-control metrics come from
        ``brain_small_metadata.pickle``, looked up by the barcodes of the
        10X dataset.

        :param save_path: Directory handed to ``Dataset10X`` and in which the
            metadata pickle is read after ``self.download()`` has run.
        """
        dataset = Dataset10X(filename="neuron_9k", save_path=save_path)
        self.save_path = save_path
        self.urls = [
            "https://github.com/YosefLab/scVI-data/raw/master/brain_small_metadata.pickle"
        ]
        self.download_names = ["brain_small_metadata.pickle"]
        self.download()

        metadata_path = os.path.join(self.save_path,
                                     "brain_small_metadata.pickle")
        # NOTE(review): pickle.load can execute arbitrary code; this file is
        # fetched from a remote URL, so it is only as trustworthy as that host.
        # The context manager guarantees the handle is closed after loading.
        with open(metadata_path, "rb") as metadata_file:
            metadata = pickle.load(metadata_file)

        barcodes = dataset.barcodes.values.ravel()
        # Presumably the stored cluster ids are 1-based; shift them down by one.
        labels = metadata["clusters"].loc[barcodes] - 1

        self.raw_qc = metadata["raw_qc"].loc[barcodes]
        self.qc_names = self.raw_qc.columns
        self.qc = self.raw_qc.values
        GeneExpressionDataset.__init__(
            self,
            dataset.X,
            dataset.local_means,
            dataset.local_vars,
            batch_indices=dataset.batch_indices,
            labels=labels,
        )
Exemplo n.º 2
0
    def create_dataset(self, path):
        """Read an RDS file through R and wrap its contents as a dataset.

        The file is loaded into the R variable ``sce``; log counts, gene
        symbols and the ``cell_type1`` column are pulled out of it, log counts
        are converted back to integer counts, and cells whose total count is
        not above 10 are dropped.

        :param path: Path of the RDS file, interpolated into an R
            ``readRDS`` call.
        :return: A ``GeneExpressionDataset`` with ``gene_symbols`` attached.
        """
        print("Reading rds")
        ro.r("sce<-readRDS('%s')" % path)
        print("Extracting log counts")
        log_counts = ro.r("logcounts(sce)")
        print("Transforming log count to counts")
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy; the builtin gives the same dtype.
        # NOTE(review): astype(int) truncates, so a value that lands just
        # below an integer through floating-point error loses one count --
        # confirm whether rounding was intended.
        counts = (np.exp(log_counts * np.log(2)) - 1).T.astype(int)
        gene_symbols = ro.r("rowData(sce)$feature_symbol")
        labels = ro.r("colData(sce)$cell_type1")
        labels_levels = ro.r("levels(colData(sce)$cell_type1)")
        if labels_levels is not rpy2.rinterface.NULL:
            # Level codes are used as 1-based positions into the levels vector.
            labels = np.array(
                [labels_levels[int(code) - 1] for code in labels])

        cell_types = list(np.unique(labels))
        labels = np.array([cell_types.index(label) for label in labels])

        valid_idx = (counts.sum(axis=1) >
                     10).ravel()  # Filter bad quality cells
        counts = counts[valid_idx]
        labels = labels[valid_idx]
        gene_expression_dataset = GeneExpressionDataset(
            *GeneExpressionDataset.get_attributes_from_matrix(counts,
                                                              labels=labels),
            cell_types=cell_types)
        gene_expression_dataset.gene_symbols = gene_symbols
        return gene_expression_dataset
Exemplo n.º 3
0
def create_datasets():
    """Create two small synthetic datasets with overlapping genes and cell types.

    Every random draw comes from one ``RandomState(0)`` so the fixture is
    reproducible between runs.

    :return: Tuple ``(dataset_a, dataset_b)`` named ``"test_a"`` and
        ``"test_b"``.
    """
    rs = RandomState(0)
    data_a = np.sort(rs.normal(0, 10, 500)).astype(int).reshape(100, 5)
    gene_names_a = list("ABCDE")
    cell_types_a = ["alpha", "beta", "gamma", "delta"]
    labels_a = rs.choice(np.arange(len(cell_types_a)), data_a.shape[0])

    data_b = np.sort(rs.normal(100, 10, 300)).astype(int).reshape(100, 3)
    gene_names_b = list("BFA")
    cell_types_b = ["alpha", "epsilon", "rho"]
    labels_b = rs.choice(np.arange(len(cell_types_b)), data_b.shape[0])
    batch_indices_b = rs.choice(np.arange(5), size=data_b.shape[0])

    # Previously drawn from the unseeded global ``np.random``, which made the
    # fixture differ between runs. It is drawn last from ``rs`` so that every
    # value generated above stays exactly what it was before.
    batch_indices_a = rs.choice(np.arange(5), size=data_a.shape[0])

    dataset_a = GeneExpressionDataset()
    dataset_b = GeneExpressionDataset()
    dataset_a.populate_from_data(X=data_a,
                                 labels=labels_a,
                                 gene_names=gene_names_a,
                                 cell_types=cell_types_a,
                                 batch_indices=batch_indices_a)
    dataset_a.name = "test_a"

    dataset_b.populate_from_data(X=data_b,
                                 labels=labels_b,
                                 gene_names=gene_names_b,
                                 cell_types=cell_types_b,
                                 batch_indices=batch_indices_b)
    dataset_b.name = "test_b"
    return dataset_a, dataset_b
Exemplo n.º 4
0
    def test_collate_add(self):
        """Extra attributes requested from the collate builder are returned."""
        n_cells = 25

        def cell_column():
            # A fresh (n_cells, 1) column holding 0..n_cells-1 on each call.
            return np.arange(0, n_cells).reshape((-1, 1))

        counts = np.ones((n_cells, 2)) * cell_column()
        batch_column = cell_column()
        coords_column = cell_column()
        protein_counts = (np.ones((n_cells, 3)) + cell_column() +
                          np.arange(0, 3))
        protein_measurement = CellMeasurement(
            name="proteins",
            data=protein_counts,
            columns_attr_name="protein_names",
            columns=["A", "B", "C"],
        )

        dataset = GeneExpressionDataset()
        dataset.populate_from_data(
            counts,
            batch_indices=batch_column,
            cell_attributes_dict={"x_coords": coords_column},
            Ys=[protein_measurement],
        )

        extra_attributes = {"x_coords": np.float32, "proteins": np.float32}
        collate_fn = dataset.collate_fn_builder(
            add_attributes_and_types=extra_attributes)
        # Seven tensors are expected: the five defaults plus the two extras.
        (_x, _mean, _var, _batch, _labels,
         x_coords_tensor, proteins_tensor) = collate_fn([1, 2])

        self.assertListEqual(x_coords_tensor.tolist(), [[1.0], [2.0]])
        self.assertListEqual(proteins_tensor.tolist(),
                             [[2.0, 3.0, 4.0], [3.0, 4.0, 5.0]])
Exemplo n.º 5
0
def test_filter_and_concat_datasets():
    """Exercise gene/cell-type filtering and dataset concatenation."""
    # Two cortex datasets with gene windows 0..299 and 100..399.
    first_cortex = CortexDataset()
    first_cortex.subsample_genes(subset_genes=np.arange(0, 300))
    first_cortex.filter_cell_types(["microglia", "oligodendrocytes"])

    second_cortex = CortexDataset()
    second_cortex.subsample_genes(subset_genes=np.arange(100, 400))
    kept_names = [
        "endothelial-mural", "interneurons", "microglia", "oligodendrocytes"
    ]
    second_cortex.filter_cell_types(kept_names)
    second_cortex.filter_cell_types([2, 0])

    merged_cortex = GeneExpressionDataset.concat_datasets(
        first_cortex, second_cortex)
    assert merged_cortex.nb_genes == 200

    # Synthetic datasets: label sets are shared by default.
    synthetic_a = SyntheticDataset(n_batches=2, n_labels=5)
    synthetic_b = SyntheticDataset(n_batches=3, n_labels=3)

    merged_shared = GeneExpressionDataset.concat_datasets(
        synthetic_a, synthetic_b)
    assert merged_shared.n_batches == 5
    assert merged_shared.n_labels == 5

    merged_disjoint = GeneExpressionDataset.concat_datasets(
        synthetic_a, synthetic_b, shared_labels=False)
    assert merged_disjoint.n_batches == 5
    assert merged_disjoint.n_labels == 8

    synthetic_a.filter_cell_types([0, 1, 2, 3])
    assert synthetic_a.n_labels == 4

    synthetic_a.subsample_cells(50)
    assert len(synthetic_a) == 50
Exemplo n.º 6
0
def test_filter_and_concat_datasets():
    """Exercise filtering, concatenation and cell-type mapping of datasets.

    Cortex data is read from ``tests/data/``; the synthetic datasets are
    generated by ``SyntheticDataset``.
    """
    cortex_dataset_1 = CortexDataset(save_path='tests/data/')
    cortex_dataset_1.subsample_genes(subset_genes=np.arange(0, 3))
    cortex_dataset_1.filter_cell_types(["microglia", "oligodendrocytes"])
    cortex_dataset_2 = CortexDataset(save_path='tests/data/')
    cortex_dataset_2.subsample_genes(subset_genes=np.arange(1, 4))
    cortex_dataset_2.filter_cell_types(["endothelial-mural", "interneurons", "microglia", "oligodendrocytes"])
    cortex_dataset_2.filter_cell_types([2, 0])
    cortex_dataset_merged = GeneExpressionDataset.concat_datasets(cortex_dataset_1, cortex_dataset_2)
    # Gene windows 0..2 and 1..3 are expected to leave two genes after merging.
    assert cortex_dataset_merged.nb_genes == 2

    synthetic_dataset_1 = SyntheticDataset(n_batches=2, n_labels=5)
    synthetic_dataset_2 = SyntheticDataset(n_batches=3, n_labels=3)
    synthetic_merged_1 = GeneExpressionDataset.concat_datasets(synthetic_dataset_1, synthetic_dataset_2)
    assert synthetic_merged_1.n_batches == 5
    assert synthetic_merged_1.n_labels == 5

    synthetic_merged_2 = GeneExpressionDataset.concat_datasets(synthetic_dataset_1, synthetic_dataset_2,
                                                               shared_labels=False)
    assert synthetic_merged_2.n_batches == 5
    assert synthetic_merged_2.n_labels == 8

    synthetic_dataset_1.filter_cell_types([0, 1, 2, 3])
    assert synthetic_dataset_1.n_labels == 4

    synthetic_dataset_1.subsample_cells(50)
    assert len(synthetic_dataset_1) == 50

    synthetic_dataset_3 = SyntheticDataset(n_labels=6)
    # ``np.str`` was only an alias of the builtin ``str`` and has been removed
    # from NumPy; the builtin produces the same string array.
    synthetic_dataset_3.cell_types = np.arange(6).astype(str)
    synthetic_dataset_3.map_cell_types({"2": "9", ("4", "3"): "8"})
Exemplo n.º 7
0
    def test_genes_to_idx(self):
        """Upper-case queries are expected to resolve to lower-case gene names."""
        n_genes = 10
        counts = np.random.randint(1, 5, size=(5, n_genes))
        stored_names = np.array(["gene_%d" % i for i in range(n_genes)])

        dataset = GeneExpressionDataset()
        dataset.populate_from_data(counts, gene_names=stored_names)

        queried_names = ["GENE_%d" % i for i in range(n_genes)]
        indices = dataset.genes_to_index(queried_names)
        self.assertListEqual(list(range(n_genes)), indices.tolist())
Exemplo n.º 8
0
    def test_compute_library_size_batch(self):
        """Constant counts should give a local mean of 10 and zero variance."""
        n_cells = 7
        # Each of the 10 genes holds exp(10) / 10 for every cell.
        per_gene_value = np.exp(10) / 10
        counts = per_gene_value * np.ones((n_cells, 10), dtype=int)

        dataset = GeneExpressionDataset()
        dataset.populate_from_data(counts)

        expected_means = [[10.0] for _ in range(n_cells)]
        expected_vars = [[0.0] for _ in range(n_cells)]
        self.assertEqual(expected_means, dataset.local_means.tolist())
        self.assertEqual(expected_vars, dataset.local_vars.tolist())
Exemplo n.º 9
0
    def test_remap_categorical_attributes(self):
        """Labels 1 and 2 are expected to be stored as 0 and 1."""
        counts = np.random.randint(1, 5, size=(7, 11))
        raw_labels = [1, 1, 1, 1, 1, 2, 2]

        dataset = GeneExpressionDataset()
        dataset.populate_from_data(counts, labels=raw_labels)

        # Stored labels form a column, hence one single-item list per cell.
        expected = [[label] for label in (0, 0, 0, 0, 0, 1, 1)]
        self.assertListEqual(expected, dataset.labels.tolist())
Exemplo n.º 10
0
    def test_collate_normal(self):
        """The default collate function returns the rows of the requested cells."""
        n_cells = 25
        # Row i of the count matrix is filled with the value i.
        counts = np.ones((n_cells, 2)) * np.arange(0, n_cells).reshape((-1, 1))
        batch_column = np.arange(0, n_cells).reshape((-1, 1))

        dataset = GeneExpressionDataset()
        dataset.populate_from_data(counts, batch_indices=batch_column)

        collate = dataset.collate_fn_builder()
        x, _mean, _var, batch, _labels = collate([1, 2])
        self.assertListEqual(x.tolist(), [[1.0, 1.0], [2.0, 2.0]])
        self.assertListEqual(batch.tolist(), [[1], [2]])
Exemplo n.º 11
0
    def __init__(self, data, name, n_latent=10, reconstruction_seq='zinb'):
        """Initialise the parent and build a merged dataset from ``data``.

        :param data: Object exposing ``data_fish_partial`` and ``data_seq``;
            both are deep-copied before being merged.
        :param name: Forwarded to the parent initialiser.
        :param n_latent: Forwarded to the parent initialiser.
        :param reconstruction_seq: Stored on the instance as given.
        """
        super().__init__(data, name, n_latent)

        merged = GeneExpressionDataset()
        self.full_dataset = merged

        # Deep copies keep the merge from touching the caller's datasets.
        sources = [
            copy.deepcopy(data.data_fish_partial),
            copy.deepcopy(data.data_seq),
        ]
        merged.populate_from_datasets(sources)
        merged.compute_library_size_batch()

        self.reconstruction_seq = reconstruction_seq
Exemplo n.º 12
0
    def test_populate_from_data(self):
        """Populating from a bare matrix sets shapes and default annotations."""
        n_cells, n_genes = 25, 10
        counts = np.ones((n_cells, n_genes)) * 100

        dataset = GeneExpressionDataset()
        dataset.populate_from_data(counts)

        self.assertEqual(dataset.nb_genes, n_genes)
        self.assertEqual(dataset.nb_cells, n_cells)

        # With no batch_indices or labels given, both default to all zeros.
        all_zero_column = [[0] for _ in range(n_cells)]
        self.assertListEqual(all_zero_column, dataset.batch_indices.tolist())
        self.assertListEqual(all_zero_column, dataset.labels.tolist())
Exemplo n.º 13
0
def test_multibatches_features():
    """Train briefly on four batches and run imputation with batch transforms."""
    # (exclusive upper bound of the counts, number of cells) for each batch.
    batch_specs = [(5, 20), (10, 20), (10, 20), (10, 30)]
    per_batch_counts = [
        np.random.randint(1, upper, size=(n_cells, 10))
        for upper, n_cells in batch_specs
    ]

    dataset = GeneExpressionDataset()
    dataset.populate_from_per_batch_list(per_batch_counts)

    model = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(model, dataset, train_size=0.5, use_cuda=use_cuda)
    trainer.train(n_epochs=2)

    # transform_batch is passed both as a single id and as a list of ids.
    trainer.test_set.imputation(n_samples=2, transform_batch=0)
    trainer.train_set.imputation(n_samples=2, transform_batch=[0, 1, 2])
Exemplo n.º 14
0
    def test_subsample_cells(self):
        """Check subsample_cells with its default, a float size and an int size."""
        ones_row = np.ones(7)[None, :]
        # Five cells; cell i holds the constant value i + 1 across seven genes.
        counts = np.arange(1, 6)[:, None] * ones_row

        dataset = GeneExpressionDataset()
        dataset.populate_from_data(counts)

        # Default call: all five cells remain.
        dataset.subsample_cells()
        self.assertEqual(5, dataset.nb_cells)

        # Float size: rows valued 5, 4, 3, 2 are expected, in that order.
        dataset.subsample_cells(size=0.8)
        expected = np.arange(5, 1, -1)[:, None] * ones_row
        self.assertListEqual(expected.tolist(), dataset.X.tolist())

        # Integer size: exactly that many cells remain.
        dataset.subsample_cells(size=2)
        self.assertEqual(2, dataset.nb_cells)
Exemplo n.º 15
0
    def __init__(
        self,
        model,
        gene_dataset: GeneExpressionDataset,
        shuffle=False,
        indices=None,
        use_cuda=True,
        data_loader_kwargs=None,
    ):
        """Wrap a model and a dataset together with a configured ``DataLoader``.

        :param model: Model stored on the instance as ``self.model``.
        :param gene_dataset: Dataset to iterate over; also supplies the
            collate function via ``collate_fn_builder()``.
        :param shuffle: Use a ``RandomSampler`` over the whole dataset instead
            of a ``SequentialSampler``. Cannot be combined with ``indices``.
        :param indices: Optional subset of cells. An object whose ``dtype`` is
            NumPy ``bool`` is treated as a mask and converted to positions.
            The subset is sampled with ``SubsetRandomSampler``.
        :param use_cuda: Stored on the instance as ``self.use_cuda``.
        :param data_loader_kwargs: Extra keyword arguments for ``DataLoader``.
            The mapping is copied, never modified; ``None`` means no extras.
        :raises ValueError: If both ``indices`` and ``shuffle`` are given.
        """
        self.model = model
        self.gene_dataset = gene_dataset
        self.to_monitor = []
        self.use_cuda = use_cuda

        if indices is not None and shuffle:
            raise ValueError("indices is mutually exclusive with shuffle")
        if indices is None:
            if shuffle:
                sampler = RandomSampler(gene_dataset)
            else:
                sampler = SequentialSampler(gene_dataset)
        else:
            if hasattr(indices, "dtype") and indices.dtype is np.dtype("bool"):
                # Turn a boolean mask into the positions of its True entries.
                indices = np.where(indices)[0].ravel()
            sampler = SubsetRandomSampler(indices)

        # A ``None`` default replaces the former shared ``dict()`` default;
        # caller-supplied mappings are still shallow-copied before updating.
        if data_loader_kwargs is None:
            self.data_loader_kwargs = {}
        else:
            self.data_loader_kwargs = copy.copy(data_loader_kwargs)
        self.data_loader_kwargs.update(
            {"collate_fn": gene_dataset.collate_fn_builder(), "sampler": sampler}
        )
        self.data_loader = DataLoader(gene_dataset, **self.data_loader_kwargs)
Exemplo n.º 16
0
 def test_populate_from_per_label_list(self):
     """One matrix per label yields stacked cells labelled by list position."""
     cells_per_label = (7, 5, 3)
     per_label_counts = [
         np.random.randint(1, 5, size=(n_cells, 10))
         for n_cells in cells_per_label
     ]

     dataset = GeneExpressionDataset()
     dataset.populate_from_per_label_list(per_label_counts)

     self.assertEqual(dataset.nb_cells, 15)
     self.assertEqual(dataset.nb_genes, 10)

     # Expected labels: a column of 0s, then 1s, then 2s, one per input cell.
     expected_labels = np.concatenate([
         np.full((n_cells, 1), label, dtype=int)
         for label, n_cells in enumerate(cells_per_label)
     ])
     self.assertListEqual(dataset.labels.tolist(), expected_labels.tolist())
Exemplo n.º 17
0
 def test_data_loader(self):
     """A dataset converted to AnnData and back keeps its data and measurement."""
     counts = np.ones((25, 10)) * 100
     dev_names = ["gabou", "achille", "pedro", "oclivio"]
     # Column j of the paired measurement holds the constant value j.
     dev_values = np.ones((25, 4)) * np.arange(0, 4)
     measurement = CellMeasurement(
         name="dev",
         data=dev_values,
         columns_attr_name="dev_names",
         columns=dev_names,
     )

     dataset = GeneExpressionDataset()
     dataset.populate_from_data(counts, Ys=[measurement])

     annotated = dataset.to_anndata()
     restored = AnnDatasetFromAnnData(
         annotated, cell_measurements_col_mappings={"dev": "dev_names"})

     self.assertTrue((dev_values == restored.dev).all())
     self.assertTrue((dataset.X == restored.X).all())
     self.assertTrue((dataset.cell_types == restored.cell_types).all())
Exemplo n.º 18
0
    def __init__(self, filename, save_path='data/', type='filtered', dense=False, remote=True, genecol=0):
        """Locate a 10X dataset, then load it via ``download_and_preprocess``.

        :param filename: Dataset name. With ``remote=True`` it is looked up in
            ``to_groups``; otherwise it is a directory name under ``save_path``.
        :param save_path: Base directory for the data.
        :param type: Matrix flavour used to build the URL and archive name.
        :param dense: Stored on the instance as ``self.dense``.
        :param remote: Whether to build a download URL (``True``) or to use an
            existing local directory (``False``).
        :param genecol: Stored on the instance as ``self.genecol``.
        :raises AssertionError: If ``remote`` is false and the expected local
            directory does not exist.
        """
        self.remote = remote
        self.save_path = save_path
        self.genecol = genecol
        if self.remote:
            group = to_groups[filename]
            url_skeleton = group_to_url_skeleton[group]
            self.url = url_skeleton.format(group, filename, filename, type)
            self.save_path = os.path.join(save_path, '10X/%s/' % filename)
            self.save_name = '%s_gene_bc_matrices' % type
            self.download_name = self.save_name + '.tar.gz'
        else:
            local_dir = os.path.join(self.save_path, filename)
            # Explicit check rather than ``assert``: assertions are stripped
            # under ``python -O``, which would silently skip the validation.
            # AssertionError is kept so existing handlers still match.
            if not os.path.isdir(local_dir):
                message = "The file %s was not found in the location you gave" % filename
                print(message)
                raise AssertionError(message)
            self.save_path = local_dir

        self.dense = dense

        expression_data, gene_names = self.download_and_preprocess()
        super().__init__(*GeneExpressionDataset.get_attributes_from_matrix(
            expression_data), gene_names=gene_names)
Exemplo n.º 19
0
    def __init__(
        self,
        model: TOTALVI,
        gene_dataset: GeneExpressionDataset,
        shuffle: bool = False,
        indices: Optional[np.ndarray] = None,
        use_cuda: bool = True,
        data_loader_kwargs: Optional[dict] = None,
    ):
        """Initialise the parent, then rebuild the loader with protein data.

        :param model: Forwarded to the parent initialiser.
        :param gene_dataset: Dataset to iterate over; it must be able to
            collate a ``protein_expression`` attribute.
        :param shuffle: Forwarded to the parent initialiser.
        :param indices: Forwarded to the parent initialiser.
        :param use_cuda: Forwarded to the parent initialiser.
        :param data_loader_kwargs: Extra ``DataLoader`` keyword arguments;
            ``None`` means no extras.
        """
        # A ``None`` default replaces the former shared ``dict()`` default.
        # The parent always receives a real dict, so it works unchanged.
        if data_loader_kwargs is None:
            data_loader_kwargs = {}

        super().__init__(
            model,
            gene_dataset,
            shuffle=shuffle,
            indices=indices,
            use_cuda=use_cuda,
            data_loader_kwargs=data_loader_kwargs,
        )
        # Add protein tensor as another tensor to be loaded
        self.data_loader_kwargs.update(
            {
                "collate_fn": gene_dataset.collate_fn_builder(
                    {"protein_expression": np.float32}
                )
            }
        )
        self.data_loader = DataLoader(gene_dataset, **self.data_loader_kwargs)
Exemplo n.º 20
0
def training_score_scvi(train, **kwargs):
    """Fit an scVI VAE on ``train`` and return its Poisson log-likelihood.

    :param train: Count matrix; ``train.shape[1]`` sets the model input size.
    :param kwargs: Accepted but not used.
    :return: Sum of the Poisson log-pmf of ``train`` under the imputed rates.
    """
    from scvi.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE

    attributes = GeneExpressionDataset.get_attributes_from_matrix(train)
    dataset = GeneExpressionDataset(*attributes)
    model = VAE(n_input=train.shape[1])
    trainer = UnsupervisedTrainer(model, dataset, verbose=False)
    trainer.train(n_epochs=100)

    # Training shuffles cells for minibatching, so read both splits back
    # sequentially before estimating the rates (lambda).
    imputed_parts = [
        split.sequential().imputation()
        for split in (trainer.train_set, trainer.test_set)
    ]
    lam = np.vstack(imputed_parts)
    return st.poisson(mu=lam).logpmf(train).sum()
Exemplo n.º 21
0
 def test_populate_from_datasets_cortex(self):
     """Merging two filtered cortex datasets is expected to leave two genes."""
     first_cortex = CortexDataset(save_path="tests/data")
     first_cortex.subsample_genes(subset_genes=np.arange(0, 3),
                                  mode="variance")
     first_cortex.filter_cell_types(["microglia", "oligodendrocytes"])

     second_cortex = CortexDataset(save_path="tests/data")
     second_cortex.subsample_genes(subset_genes=np.arange(1, 4),
                                   mode="variance")
     kept_names = [
         "endothelial-mural", "interneurons", "microglia",
         "oligodendrocytes"
     ]
     second_cortex.filter_cell_types(kept_names)
     # Second pass filters by position instead of by name.
     second_cortex.filter_cell_types([2, 0])

     merged = GeneExpressionDataset()
     merged.populate_from_datasets([first_cortex, second_cortex])
     self.assertEqual(2, merged.nb_genes)
Exemplo n.º 22
0
    def __init__(self,
                 filename,
                 save_path='data/',
                 type='filtered',
                 dense=False,
                 remote=True):
        """Locate a 10X dataset, then load it via ``download_and_preprocess``.

        :param filename: Dataset name. With ``remote=True`` it is looked up in
            ``to_groups``; otherwise it is a directory name under ``save_path``.
        :param save_path: Base directory for the data.
        :param type: Matrix flavour used to build the URL and archive name.
        :param dense: Stored on the instance as ``self.dense``.
        :param remote: Whether to build a download URL (``True``) or to use an
            existing local directory (``False``).
        :raises AssertionError: If ``remote`` is false and the expected local
            directory does not exist.
        """
        self.remote = remote
        self.save_path = save_path
        if self.remote:
            group = to_groups[filename]
            self.url = (
                "http://cf.10xgenomics.com/samples/cell-exp/%s/%s/%s_%s_gene_bc_matrices.tar.gz"
                % (group, filename, filename, type))
            self.save_path = os.path.join(save_path, '10X/%s/' % filename)
            self.save_name = '%s_gene_bc_matrices' % type
            self.download_name = self.save_name + '.tar.gz'
        else:
            local_dir = os.path.join(self.save_path, filename)
            # Explicit check rather than ``assert``: assertions are stripped
            # under ``python -O``, which would silently skip the validation.
            # AssertionError is kept so existing handlers still match.
            if not os.path.isdir(local_dir):
                message = ("The file %s was not found in the location you gave" %
                           filename)
                print(message)
                raise AssertionError(message)
            self.save_path = local_dir

        self.dense = dense

        expression_data, gene_names = self.download_and_preprocess()
        super(Dataset10X, self).__init__(
            *GeneExpressionDataset.get_attributes_from_matrix(expression_data),
            gene_names=gene_names)
Exemplo n.º 23
0
def assign_label(cellid, geneid, labels_map, count, cell_type, seurat):
    """Build a dataset whose cells follow the row order of ``seurat``.

    :param cellid: Cell identifiers, paired element-wise with ``count``.
    :param geneid: Gene names handed to the dataset.
    :param labels_map: Sequence in which position ``i`` gives the new label
        for cells whose original label is ``i``.
    :param count: Per-cell count rows, stacked with ``sparse.vstack``.
    :param cell_type: Cell-type names handed to the dataset.
    :param seurat: 2-D array; its first row is skipped, column 4 holds the
        labels and column 5 the cell identifiers.
    :return: The resulting ``GeneExpressionDataset``.
    """
    original_labels = np.int64(np.asarray(seurat[1:, 4]))
    # Matches are taken against the untouched originals, so one remapping
    # cannot feed into a later one.
    remapped_labels = deepcopy(original_labels)
    for old_label, new_label in enumerate(labels_map):
        remapped_labels[original_labels == old_label] = new_label

    counts_by_cell = dict(zip(cellid, count))
    ordered_counts = sparse.vstack(
        [counts_by_cell[cell] for cell in seurat[1:, 5]])

    attributes = GeneExpressionDataset.get_attributes_from_matrix(
        ordered_counts, labels=remapped_labels)
    return GeneExpressionDataset(*attributes,
                                 gene_names=geneid,
                                 cell_types=cell_type)
Exemplo n.º 24
0
    def __init__(self, n_proteins=7):
        """Load the FACS dataset for the requested number of proteins.

        :param n_proteins: Number of proteins; must be 2, 5 or 7.
        :raises AssertionError: If ``n_proteins`` is not a supported value.
        """
        # Explicit check rather than ``assert``: assertions are stripped under
        # ``python -O``, which would let unsupported values through.
        # AssertionError is kept so existing handlers still match.
        if n_proteins not in (2, 5, 7):
            raise AssertionError(
                "Only support: 2, 5 or 7 protein FACS dataset")

        self.n_proteins = int(n_proteins)
        expression_data = self.download_and_preprocess()
        super().__init__(
            *GeneExpressionDataset.get_attributes_from_matrix(expression_data))
Exemplo n.º 25
0
    def test_subsample_genes(self):
        """Check gene subsampling by ratio, by count and by explicit subset."""
        n_genes = 100
        counts = np.ones((25, n_genes)) * 100
        # The first cell differs from the rest, then each gene column is
        # scaled in place by its own index.
        counts[0, :] = 2
        counts *= np.arange(0, n_genes)

        names = np.array(["gene_%d" % i for i in range(n_genes)])
        dataset = GeneExpressionDataset()
        dataset.populate_from_data(counts, gene_names=names)

        dataset.subsample_genes(new_ratio_genes=0.4, mode="variance")
        self.assertTupleEqual(dataset.gene_names.shape, (40, ))

        dataset.subsample_genes(new_n_genes=25, mode="variance")
        self.assertTupleEqual(dataset.gene_names.shape, (25, ))
        # The most variable gene is expected to come first.
        self.assertEqual(dataset.gene_names[0], "GENE_99")

        dataset.subsample_genes(subset_genes=[1, 6, 7])
        self.assertEqual(dataset.gene_names[0], "GENE_98")
Exemplo n.º 26
0
Arquivo: utils.py Projeto: yynst2/solo
def make_gene_expression_dataset(data: np.ndarray, gene_names: np.ndarray):
    '''Wrap a count matrix and its gene names in a GeneExpressionDataset.

    Parameters
    ----------
    data : np.array
        matrix of cells (rows) by genes (columns)
    gene_names : np.array
        array of gene name strings
    Returns
    -------
    GeneExpressionDataset
        a new dataset populated from ``data`` and ``gene_names``
    '''
    dataset = GeneExpressionDataset()
    dataset.populate_from_data(
        X=data,
        gene_names=gene_names,
    )
    return dataset
Exemplo n.º 27
0
def generalization_score_scvi(train, test, **kwargs):
    """Fit an scVI VAE on ``train`` and score it with ``pois_llik``.

    :param train: Count matrix; ``train.shape[1]`` sets the model input size.
    :param test: Passed to ``pois_llik`` together with ``train``.
    :param kwargs: Accepted but not used.
    :return: Whatever ``pois_llik(lam, train, test)`` returns.
    """
    from scvi.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE

    attributes = GeneExpressionDataset.get_attributes_from_matrix(train)
    dataset = GeneExpressionDataset(*attributes)
    model = VAE(n_input=train.shape[1])
    trainer = UnsupervisedTrainer(model, dataset, verbose=False)
    trainer.train(n_epochs=100)

    # Training shuffles cells for minibatching, so read both splits back
    # sequentially before estimating the rates (lambda). Gradient tracking
    # stays off for the imputation and the scoring.
    with torch.autograd.set_grad_enabled(False):
        imputed_parts = [
            split.sequential().imputation()
            for split in (trainer.train_set, trainer.test_set)
        ]
        lam = np.vstack(imputed_parts)
        return pois_llik(lam, train, test)
Exemplo n.º 28
0
    def test_map_cell_types(self):
        """Merging cell types and remapping yields compact names and labels."""
        counts = np.random.randint(1, 5, size=(7, 10))
        type_names = ["0", "1", "2", "3", "4", "5"]
        cell_labels = [0, 0, 4, 4, 2, 3, 5]

        dataset = GeneExpressionDataset()
        dataset.populate_from_data(counts,
                                   labels=cell_labels,
                                   cell_types=type_names)

        # Fold types "0" and "2" into "6", and "3" and "4" into "7".
        merge_plan = {("0", "2"): "6", ("3", "4"): "7"}
        dataset.map_cell_types(merge_plan)
        dataset.remap_categorical_attributes()

        self.assertListEqual(dataset.cell_types.tolist(), ["5", "6", "7"])
        flat_labels = np.squeeze(dataset.labels).tolist()
        self.assertListEqual(flat_labels, [1, 1, 2, 2, 1, 2, 0])
Exemplo n.º 29
0
    def test_labels(self):
        """Labels are stored as a column; a single repeated value becomes 0."""
        n_cells = 25
        counts = np.ones((n_cells, 10)) * 100

        # One distinct label per cell: values are kept as they are.
        distinct = GeneExpressionDataset()
        distinct.populate_from_data(counts, labels=np.array(range(n_cells)))
        self.assertTupleEqual((n_cells, 1), distinct.labels.shape)
        self.assertEqual(distinct.labels[5, 0], 5)

        # The same label (5.0) for every cell: it is expected to be stored as 0.
        constant = GeneExpressionDataset()
        constant.populate_from_data(counts, labels=np.full(n_cells, 5.0))
        self.assertTupleEqual(constant.labels.shape, (n_cells, 1))
        self.assertEqual(constant.labels[5, 0], 0)
Exemplo n.º 30
0
    def test_populate_from_data_with_measurements(self):
        """A CellMeasurement passed via ``Ys`` becomes dataset attributes."""
        n_cells, n_genes = 25, 10
        counts = np.ones((n_cells, n_genes)) * 100
        dev_names = ["gabou", "achille", "pedro", "oclivio"]
        # Column j of the paired measurement holds the constant value j.
        dev_values = np.ones((n_cells, 4)) * np.arange(0, 4)
        measurement = CellMeasurement(
            name="dev",
            data=dev_values,
            columns_attr_name="dev_names",
            columns=dev_names,
        )

        dataset = GeneExpressionDataset()
        dataset.populate_from_data(counts, Ys=[measurement])

        self.assertEqual(dataset.nb_genes, n_genes)
        self.assertEqual(dataset.nb_cells, n_cells)

        # The measurement and its column names are exposed under the names
        # given to CellMeasurement.
        for attribute in ("dev", "dev_names"):
            self.assertTrue(hasattr(dataset, attribute))

        self.assertListEqual(dataset.dev_names.tolist(), dev_names)
        self.assertListEqual(dataset.dev[0].tolist(), [0, 1, 2, 3])