def wu2020() -> AnnData: """\ Return the dataset from :cite:`Wu2020` as AnnData object. 200k cells, of which 100k have TCRs. This is how the dataset was processed: .. code-block:: python {processing_code} """ url = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/wu2020.h5ad" filename = settings.datasetdir / "wu2020.h5ad" adata = read(filename, backup_url=url) return adata
def wu2020_3k() -> AnnData:
    """\
    Return the dataset from :cite:`Wu2020` as AnnData object, downsampled
    to 3000 TCR-containing cells.

    This is how the dataset was processed:

    .. code-block:: python

        {processing_code}
    """
    url = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/wu2020_3k.h5ad"
    filename = settings.datasetdir / "wu2020_3k.h5ad"
    adata = read(filename, backup_url=url)
    upgrade_schema(adata)
    return adata
def wu2020_3k() -> AnnData:
    """\
    Return the dataset from :cite:`Wu2020` as AnnData object, downsampled
    to 3000 TCR-containing cells.

    This is how the dataset was processed:

    .. code-block:: python

        {processing_code}
    """
    # os.makedirs(settings.datasetdir, exist_ok=True)
    # TODO host it on github or similar
    url = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/wu2020_3k.h5ad"
    filename = settings.datasetdir / "wu2020_3k.h5ad"
    adata = read(filename, backup_url=url)
    return adata
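# A minimal usage sketch (not part of the module above): load the downsampled
# dataset through the public scirpy API and inspect the result. The import
# path `scirpy.datasets` is an assumption based on the package layout; the
# loader itself is the `wu2020_3k` function defined above.
import scirpy as ir

adata = ir.datasets.wu2020_3k()
print(adata)               # AnnData summary: n_obs x n_vars, obs/var/uns keys
print(adata.obs.columns)   # per-cell metadata, including the receptor columns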
def wu2020() -> AnnData: """ Return the dataset from [Wu2020]_ as AnnData object. 200k cells, of which 100k have TCRs. This is how the dataset was processed: ```python {processing_code} ``` """ # os.makedirs(settings.datasetdir, exist_ok=True) # TODO host it on github or similar url = "https://github.com/icbi-lab/scirpy/releases/download/v0.1/wu2020.h5ad" filename = settings.datasetdir / "wu2020.h5ad" with _monkey_patch_tqdm(): adata = read(filename, backup_url=url) return adata
def test_read_write_hdf5_sparse():
    from scipy.sparse import csr_matrix

    adata = AnnData(
        data=csr_matrix([[1, 0], [3, 0], [5, 6]]),
        smp={
            'row_names': ['name1', 'name2', 'name3'],
            'sanno1': ['cat1', 'cat2', 'cat2'],
            'sanno2': [2.1, 2.2, 2.3],
        },
        var={'vanno1': [3.1, 3.2]},
        uns={
            'sanno1_colors': ['#000000', '#FFFFFF'],
            'uns2_sparse': csr_matrix([[1, 0], [3, 0]]),
        },
    )
    assert pd.api.types.is_string_dtype(adata.smp['sanno1'])
    write('./test.h5', adata)
    adata = read('./test.h5')
    assert pd.api.types.is_categorical(adata.smp['sanno1'])
    assert adata.smp.index.tolist() == ['name1', 'name2', 'name3']
    assert adata.smp['sanno1'].cat.categories.tolist() == ['cat1', 'cat2']
def maynard2020() -> AnnData:
    """\
    Return the dataset from :cite:`Maynard2020` as AnnData object.

    21k cells from NSCLC profiled with Smart-seq2, of which 3,500 have
    :term:`TCRs<TCR>` and 1,500 have :term:`BCRs<BCR>`.

    The raw FASTQ files have been obtained from `PRJNA591860
    <https://www.ebi.ac.uk/ena/browser/view/PRJNA591860>`__ and processed using
    the nf-core `Smart-seq2 pipeline <https://github.com/nf-core/smartseq2/>`__.

    The processed files have been imported and transformed into an
    :class:`anndata.AnnData` object using the following script:

    .. code-block:: python

        {processing_code}
    """
    url = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/maynard2020.h5ad"
    filename = settings.datasetdir / "maynard2020.h5ad"
    adata = read(filename, backup_url=url)
    return adata
def test_read_write_hdf5():
    adata = AnnData(
        data=np.array([[1, 0], [3, 0], [5, 6]]),
        smp={
            'row_names': ['name1', 'name2', 'name3'],
            'sanno1': ['cat1', 'cat2', 'cat2'],  # categorical annotation
            'sanno2': ['s1', 's2', 's3'],  # string annotation
            'sanno3': [2.1, 2.2, 2.3],  # float annotation
        },
        var={'vanno1': [3.1, 3.2]},
        uns={
            'sanno1_colors': ['#000000', '#FFFFFF'],
            'uns2': ['some annotation'],
        },
    )
    assert pd.api.types.is_string_dtype(adata.smp['sanno1'])
    write('./test.h5', adata)
    adata = read('./test.h5')
    assert pd.api.types.is_categorical(adata.smp['sanno1'])
    assert pd.api.types.is_string_dtype(adata.smp['sanno2'])
    assert adata.smp.index.tolist() == ['name1', 'name2', 'name3']
    assert adata.smp['sanno1'].cat.categories.tolist() == ['cat1', 'cat2']
def maynard2020_3k() -> AnnData:
    """\
    Return the dataset from :cite:`Maynard2020` as AnnData object, downsampled
    to 3000 cells.

    In brief, this data set was processed as follows:

    * raw data downloaded from ENA
    * gene expression quantified using Salmon and the nf-core/rnaseq pipeline
    * basic quality control (min_counts=20k, max_counts=5M, min_genes=1k,
      max_mitochondrial_fraction=0.2)
    * filtered to 6000 HVG using `sc.pp.highly_variable_genes(..., flavor="seurat_v3")`
    * raw counts processed using scVI, providing sample information as batch key
    * cell types manually annotated based on marker genes and leiden clustering
      and subclustering
    * downsampled to 3000 cells

    `adata.X` contains the `log1p` transformed, cpm-normalized raw counts.
    The `scVI` latent representation is stored in `adata.obsm["X_scVI"]`.
    A UMAP for the 3000 cells is precomputed.
    """
    url = "https://github.com/icbi-lab/infercnvpy/releases/download/d0.1.0/maynard2020_3k.h5ad"
    filename = settings.datasetdir / "maynard2020_3k.h5ad"
    adata = read(filename, backup_url=url)
    return adata
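# Illustrative sketch (not the original processing script) of the QC, HVG and
# normalization steps listed in the docstring above, expressed with standard
# scanpy calls. The thresholds come from the docstring; `adata_raw` is a
# hypothetical placeholder for the full Maynard et al. (2020) object, and the
# scVI, annotation and downsampling steps are omitted.
import scanpy as sc

# Basic quality control: min_counts=20k, max_counts=5M, min_genes=1k.
sc.pp.filter_cells(adata_raw, min_counts=20_000)
sc.pp.filter_cells(adata_raw, max_counts=5_000_000)
sc.pp.filter_cells(adata_raw, min_genes=1_000)

# Mitochondrial fraction filter (max 0.2), based on "MT-" gene name prefixes.
adata_raw.var["mt"] = adata_raw.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(
    adata_raw, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
)
adata_raw = adata_raw[adata_raw.obs["pct_counts_mt"] < 20].copy()

# 6000 highly variable genes with the seurat_v3 flavor (expects raw counts).
sc.pp.highly_variable_genes(adata_raw, n_top_genes=6000, flavor="seurat_v3")

# adata.X as described in the docstring: CPM-normalized, log1p-transformed counts.
sc.pp.normalize_total(adata_raw, target_sum=1e6)
sc.pp.log1p(adata_raw)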