Exemplo n.º 1
0
def wu2020() -> AnnData:
    """\
    Return the dataset from :cite:`Wu2020` as AnnData object.

    200k cells, of which 100k have TCRs.

    This is how the dataset was processed:

    .. code-block:: python

    {processing_code}
    """
    # Download once to the configured dataset directory; subsequent calls
    # read the cached local copy.
    backup = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/wu2020.h5ad"
    local_path = settings.datasetdir / "wu2020.h5ad"
    return read(local_path, backup_url=backup)
Exemplo n.º 2
0
def wu2020_3k() -> AnnData:
    """\
    Return the dataset from :cite:`Wu2020` as AnnData object, downsampled
    to 3000 TCR-containing cells.

    This is how the dataset was processed:

    .. code-block:: python

    {processing_code}
    """
    backup = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/wu2020_3k.h5ad"
    local_path = settings.datasetdir / "wu2020_3k.h5ad"
    result = read(local_path, backup_url=backup)
    # Bring the stored object up to the current AnnData schema in place.
    upgrade_schema(result)
    return result
Exemplo n.º 3
0
def wu2020_3k() -> AnnData:
    """\
    Return the dataset from :cite:`Wu2020` as AnnData object, downsampled
    to 3000 TCR-containing cells.

    This is how the dataset was processed:

    .. code-block:: python

    {processing_code}
    """
    # Fetch from the GitHub release if not already cached locally.
    backup = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/wu2020_3k.h5ad"
    local_path = settings.datasetdir / "wu2020_3k.h5ad"
    return read(local_path, backup_url=backup)
Exemplo n.º 4
0
def wu2020() -> AnnData:
    """
    Return the dataset from [Wu2020]_ as AnnData object.

    200k cells, of which 100k have TCRs.

    This is how the dataset was processed:

    ```python
    {processing_code}
    ```
    """
    backup = "https://github.com/icbi-lab/scirpy/releases/download/v0.1/wu2020.h5ad"
    local_path = settings.datasetdir / "wu2020.h5ad"
    # Patch tqdm while downloading so progress reporting behaves in
    # non-terminal environments (e.g. notebooks, CI logs).
    with _monkey_patch_tqdm():
        result = read(local_path, backup_url=backup)
    return result
Exemplo n.º 5
0
def test_read_write_hdf5_sparse():
    """Round-trip an AnnData with a sparse data matrix through HDF5.

    Checks that string sample annotations become categoricals on read and
    that index/category contents survive the write/read cycle.
    """
    from scipy.sparse import csr_matrix

    sample_annotations = {
        'row_names': ['name1', 'name2', 'name3'],
        'sanno1': ['cat1', 'cat2', 'cat2'],
        'sanno2': [2.1, 2.2, 2.3],
    }
    unstructured = {
        'sanno1_colors': ['#000000', '#FFFFFF'],
        'uns2_sparse': csr_matrix([[1, 0], [3, 0]]),
    }
    ad = AnnData(
        data=csr_matrix([[1, 0], [3, 0], [5, 6]]),
        smp=sample_annotations,
        var={'vanno1': [3.1, 3.2]},
        uns=unstructured,
    )
    # Before writing, the string annotation is still a plain string dtype.
    assert pd.api.types.is_string_dtype(ad.smp['sanno1'])
    write('./test.h5', ad)
    ad = read('./test.h5')
    # After the round trip the string annotation is read back as categorical.
    assert pd.api.types.is_categorical(ad.smp['sanno1'])
    assert ad.smp.index.tolist() == ['name1', 'name2', 'name3']
    assert ad.smp['sanno1'].cat.categories.tolist() == ['cat1', 'cat2']
Exemplo n.º 6
0
def maynard2020() -> AnnData:
    """\
    Return the dataset from :cite:`Maynard2020` as AnnData object.

    21k cells from NSCLC profiled with Smart-seq2, of which 3,500 have :term:`TCRs<TCR>`
    and 1,500 have :term:`BCRs<BCR>`.

    The raw FASTQ files have been obtained from `PRJNA591860 <https://www.ebi.ac.uk/ena/browser/view/PRJNA591860>`__
    and processed using the nf-core `Smart-seq2 pipeline <https://github.com/nf-core/smartseq2/>`__.

    The processed files have been imported and transformed into an :class:`anndata.AnnData`
    object using the following script:

    .. code-block:: python

    {processing_code}
    """
    # Cached in the dataset directory; downloaded from the release asset on
    # first use.
    backup = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/maynard2020.h5ad"
    local_path = settings.datasetdir / "maynard2020.h5ad"
    return read(local_path, backup_url=backup)
Exemplo n.º 7
0
def test_read_write_hdf5():
    """Round-trip a dense AnnData through HDF5.

    Verifies that categorical-like string annotations are converted to
    categoricals on read, while free-form string annotations keep their
    string dtype, and that index/category contents are preserved.
    """
    sample_annotations = {
        'row_names': ['name1', 'name2', 'name3'],
        'sanno1': ['cat1', 'cat2', 'cat2'],  # categorical anno
        'sanno2': ['s1', 's2', 's3'],  # string annotation
        'sanno3': [2.1, 2.2, 2.3],  # float annotation
    }
    unstructured = {
        'sanno1_colors': ['#000000', '#FFFFFF'],
        'uns2': ['some annotation'],
    }
    ad = AnnData(
        data=np.array([[1, 0], [3, 0], [5, 6]]),
        smp=sample_annotations,
        var={'vanno1': [3.1, 3.2]},
        uns=unstructured,
    )
    # Plain string dtype before the round trip.
    assert pd.api.types.is_string_dtype(ad.smp['sanno1'])
    write('./test.h5', ad)
    ad = read('./test.h5')
    # Repeated strings come back categorical; unique strings stay strings.
    assert pd.api.types.is_categorical(ad.smp['sanno1'])
    assert pd.api.types.is_string_dtype(ad.smp['sanno2'])
    assert ad.smp.index.tolist() == ['name1', 'name2', 'name3']
    assert ad.smp['sanno1'].cat.categories.tolist() == ['cat1', 'cat2']
Exemplo n.º 8
0
def maynard2020_3k() -> AnnData:
    """\
    Return the dataset from :cite:`Maynard2020` as AnnData object, downsampled
    to 3000 cells.

    In brief, this data set was processed as follows:
        * raw data downloaded from ENA
        * gene expression quantified using Salmon and the nf-core/rnaseq pipeline.
        * basic quality control (min_counts=20k, max_counts=5M, min_genes=1k, max_mitochondrial_fraction=0.2)
        * filtered to 6000 HVG using `sc.pp.highly_variable_genes(..., flavor="seurat_v3")`
        * raw counts processed using scVI, providing sample information as batch key.
        * cell types manually annotated based on marker genes and leiden clustering and subclustering.
        * downsampled to 3000 cells.

    `adata.X` contains the `log1p` transformed, cpm-normalized raw counts.
    The `scVI` latent representation is stored in `adata.obsm["X_scVI"]`.
    A UMAP for the 3000 cells is precomputed.
    """
    # Downloaded from the infercnvpy release on first call, then cached.
    backup = "https://github.com/icbi-lab/infercnvpy/releases/download/d0.1.0/maynard2020_3k.h5ad"
    local_path = settings.datasetdir / "maynard2020_3k.h5ad"
    return read(local_path, backup_url=backup)