Пример #1
0
    def test_remap_categories(self):
        """Check ``remap_categories`` with and without a mappings dict.

        First case: labels containing a gap (no ``1``) must come back as
        contiguous integers starting at 0, together with the category count.
        Second case: when ``mappings_dict`` is supplied and some of its
        categories are absent from the labels, a third value is returned whose
        ``"cell_types"`` entry keeps only the names that are actually present.
        """
        # Case 1: gap in the input labels, no mappings supplied.
        remapped, n_categories = remap_categories([0, 0, 0, 2, 2, 3])
        self.assertListEqual([0, 0, 0, 1, 1, 2], remapped.tolist())
        self.assertEqual(3, n_categories)

        # Case 2: categories "0" and "1" are listed in the mapping but never
        # appear in the labels.
        names_by_attribute = {"cell_types": ["0", "1", "2", "3"]}
        remapped, n_categories, remapped_names = remap_categories(
            [2, 2, 3], mappings_dict=names_by_attribute
        )
        self.assertListEqual([0, 0, 1], remapped.tolist())
        self.assertEqual(2, n_categories)
        self.assertListEqual(
            ["2", "3"], remapped_names["cell_types"].tolist()
        )
Пример #2
0
    def populate(self):
        """Load the pbmc8k/pbmc4k 10X datasets and attach the PBMC metadata.

        Reads ``gene_info_pbmc.csv`` and ``pbmc_metadata.pickle`` from
        ``self.save_path``, populates this dataset from the two
        ``Dataset10X`` instances, restricts cells to the barcodes listed in
        the metadata, restricts genes to those present in the differential
        expression table, and finally sets ``labels``, ``n_labels``,
        ``cell_types``, ``de_metadata``, ``design``, ``raw_qc``, ``qc_names``,
        ``qc``, ``qc_pc`` and ``normalized_qc`` on ``self``.
        """
        self.de_metadata = pd.read_csv(
            os.path.join(self.save_path, "gene_info_pbmc.csv"), sep=","
        )
        # NOTE(security): pickle.load can execute arbitrary code; this file
        # must only ever come from a trusted source.
        # The context manager guarantees the file handle is closed.
        metadata_path = os.path.join(self.save_path, "pbmc_metadata.pickle")
        with open(metadata_path, "rb") as metadata_file:
            pbmc_metadata = pickle.load(metadata_file)
        datasets = [
            Dataset10X(
                "pbmc8k",
                save_path=self.save_path_10X,
                remove_extracted_data=self.remove_extracted_data,
                measurement_names_column=0,
            ),
            Dataset10X(
                "pbmc4k",
                save_path=self.save_path_10X,
                remove_extracted_data=self.remove_extracted_data,
                measurement_names_column=0,
            ),
        ]
        self.populate_from_datasets(datasets)

        # filter cells according to barcodes
        dict_barcodes = dict(zip(self.barcodes, np.arange(len(self.barcodes))))
        # Builtin ``str`` replaces the ``np.str`` alias, which was removed in
        # NumPy 1.24 (it was always just an alias of ``str``).
        barcodes_metadata = (
            pbmc_metadata["barcodes"].index.values.ravel().astype(str))
        # barcodes with end -11 filtered on 10X website (49 cells): they are
        # absent from ``dict_barcodes`` and therefore skipped here.
        subset_cells = [
            dict_barcodes[barcode]
            for barcode in barcodes_metadata
            if barcode in dict_barcodes
        ]
        self.update_cells(subset_cells=np.asarray(subset_cells))

        # Boolean mask over the metadata rows, dropping the "...11" barcodes.
        # Builtin ``bool`` replaces the removed ``np.bool`` alias.
        idx_metadata = np.asarray(
            [not barcode.endswith("11") for barcode in barcodes_metadata],
            dtype=bool)
        labels = pbmc_metadata["clusters"][idx_metadata].reshape(-1,
                                                                 1)[:len(self)]
        self.labels, self.n_labels = remap_categories(labels)
        self.cell_types = pbmc_metadata["list_clusters"][:self.n_labels]

        # only keep the genes for which we have de data
        genes_to_keep = list(self.de_metadata["ENSG"].values)
        # Genes listed in the DE table but missing from this dataset.
        # Non empty only for unit tests
        difference = list(set(genes_to_keep).difference(set(self.gene_names)))
        for gene in difference:
            genes_to_keep.remove(gene)
        self.filter_genes_by_attribute(genes_to_keep)
        # this would only affect the unit tests
        self.de_metadata = self.de_metadata.head(len(genes_to_keep))

        self.design = pbmc_metadata["design"][idx_metadata]
        self.raw_qc = pbmc_metadata["raw_qc"][idx_metadata]
        self.qc_names = self.raw_qc.columns
        self.qc = self.raw_qc.values

        self.qc_pc = pbmc_metadata["qc_pc"][idx_metadata]
        self.normalized_qc = pbmc_metadata["normalized_qc"][idx_metadata]
Пример #3
0
    def extract_data_from_anndata(ad: anndata.AnnData):
        """Pull the expression matrix and annotations out of an AnnData object.

        ``data`` is taken from ``ad.X``: a copy for ``np.ndarray``, the
        underlying values for ``pd.DataFrame``, and for ``csr_matrix`` either
        a dense array (when the dense form would be under 1e9 bytes) or a
        sparse copy. For any other type ``data`` stays ``None``.

        ``batch_indices`` comes from ``ad.obs["batch_indices"]`` when that
        column exists. When ``ad.obs`` has a ``"cell_types"`` column, the
        unique names (in order of first appearance) become ``cell_types`` and
        ``labels`` is obtained from ``remap_categories``. A ``"labels"``
        column, when present, takes precedence and is returned as
        ``ad.obs["labels"]`` itself.

        Returns:
            A 10-tuple ``(data, batch_indices, labels, gene_names,
            cell_types, ad.obs, ad.obsm, ad.var, ad.varm, ad.uns)``; entries
            that could not be derived are ``None``.
        """
        data = labels = batch_indices = cell_types = None

        # treat all possible cases according to anndata doc
        matrix = ad.X
        if isinstance(matrix, np.ndarray):
            data = matrix.copy()
        elif isinstance(matrix, pd.DataFrame):
            data = matrix.values
        elif isinstance(matrix, csr_matrix):
            # keep sparsity above 1 Gb in dense form
            dense_bytes = (
                reduce(operator.mul, matrix.shape) * matrix.dtype.itemsize
            )
            if dense_bytes < 1e9:
                logger.info(
                    "Dense size under 1Gb, casting to dense format (np.ndarray)."
                )
                data = matrix.toarray()
            else:
                data = matrix.copy()

        gene_names = np.asarray(ad.var.index.values, dtype=str)

        obs_columns = ad.obs.columns

        if "batch_indices" in obs_columns:
            batch_indices = ad.obs["batch_indices"].values

        if "cell_types" in obs_columns:
            cell_type_per_cell = ad.obs["cell_types"].astype(str)
            cell_types = cell_type_per_cell.drop_duplicates().values
            labels, _ = remap_categories(
                cell_type_per_cell.values, mapping_from=cell_types
            )

        # An explicit "labels" column overrides labels derived from cell types.
        if "labels" in obs_columns:
            labels = ad.obs["labels"]

        return (
            data,
            batch_indices,
            labels,
            gene_names,
            cell_types,
            ad.obs,
            ad.obsm,
            ad.var,
            ad.varm,
            ad.uns,
        )