Example #1
def _load_annotation_simulation(
    name: str, save_path: str = "data/", run_setup_anndata: bool = True
) -> AnnData:
    """\
    Simulated datasets for scANVI tutorials

    name
        One of "1", "2", or "3"
    """

    save_path = os.path.abspath(save_path)
    url = "https://github.com/YosefLab/scVI-data/raw/master/simulation/simulation_{}.loom".format(
        name
    )
    save_fn = "simulation_{}.loom".format(name)
    _download(url, save_path, save_fn)
    adata = _load_loom(os.path.join(save_path, save_fn))

    adata.obs["labels"] = adata.obs.ClusterID.values
    del adata.obs["ClusterID"]

    adata.obs["batch"] = adata.obs.BatchID.values
    del adata.obs["BatchID"]

    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")

    return adata
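A minimal usage sketch (hypothetical, assuming this private loader and the module-level `setup_anndata` helper are importable from the same package):

# Load the first simulation without registering it, then run setup manually.
adata = _load_annotation_simulation("1", save_path="data/", run_setup_anndata=False)
print(adata.obs["labels"].value_counts())  # cluster labels copied from ClusterID
print(adata.obs["batch"].value_counts())   # batch assignments copied from BatchID
setup_anndata(adata, batch_key="batch", labels_key="labels")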
Example #2
def _load_seqfishplus(
    save_path: str = "data/",
    tissue_region: str = "subventricular cortex",
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    """Loads the seqFISH+ dataset for the given tissue region."""
    if tissue_region == "subventricular cortex":
        file_prefix = "cortex_svz"
    elif tissue_region == "olfactory bulb":
        file_prefix = "ob"
    else:
        raise ValueError(
            '`tissue_region` must be "subventricular cortex" or "olfactory bulb", '
            "but got {}".format(tissue_region)
        )

    save_path = os.path.abspath(save_path)
    url = "https://github.com/CaiGroup/seqFISH-PLUS/raw/master/sourcedata.zip"
    save_fn = "seqfishplus.zip"

    _download(url, save_path, save_fn)
    adata = _load_seqfishplus_data(os.path.join(save_path, save_fn),
                                   file_prefix,
                                   save_path,
                                   gene_by_cell=False)
    adata.obs["batch"] = np.zeros(adata.shape[0], dtype=np.int64)
    adata.obs["labels"] = np.zeros(adata.shape[0], dtype=np.int64)

    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
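A hypothetical usage sketch showing the two supported tissue regions; any other value raises the ValueError above:

# "subventricular cortex" maps to the cortex_svz files, "olfactory bulb" to ob.
adata_cortex = _load_seqfishplus(tissue_region="subventricular cortex")
adata_ob = _load_seqfishplus(tissue_region="olfactory bulb")
try:
    _load_seqfishplus(tissue_region="hippocampus")
except ValueError as err:
    print(err)  # reports the unsupported region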
Example #3
def _load_cortex(
    save_path: str = "data/", run_setup_anndata: bool = True
) -> anndata.AnnData:
    """Loads cortex dataset."""
    save_path = os.path.abspath(save_path)
    url = "https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex/expression_mRNA_17-Aug-2014.txt"
    save_fn = "expression.bin"
    _download(url, save_path, save_fn)
    adata = _load_cortex_txt(os.path.join(save_path, save_fn))
    if run_setup_anndata:
        setup_anndata(adata, labels_key="labels")
    return adata
Example #4
def _load_seqfish(save_path: str = "data/",
                  run_setup_anndata: bool = True) -> anndata.AnnData:
    save_path = os.path.abspath(save_path)
    url = "https://www.cell.com/cms/attachment/2080562255/2072099886/mmc6.xlsx"
    save_fn = "SeqFISH.xlsx"
    _download(url, save_path, save_fn)
    adata = _load_seqfish_data(os.path.join(save_path, save_fn))
    adata.obs["batch"] = np.zeros(adata.shape[0], dtype=np.int64)
    adata.obs["labels"] = np.zeros(adata.shape[0], dtype=np.int64)
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
Example #5
def _load_smfish(
    save_path: str = "data/",
    use_high_level_cluster: bool = True,
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    save_path = os.path.abspath(save_path)
    url = "http://linnarssonlab.org/osmFISH/osmFISH_SScortex_mouse_all_cells.loom"
    save_fn = "osmFISH_SScortex_mouse_all_cell.loom"
    _download(url, save_path, save_fn)
    adata = _load_smfish_data(os.path.join(save_path, save_fn),
                              use_high_level_cluster=use_high_level_cluster)
    adata.obs["batch"] = np.zeros(adata.shape[0], dtype=np.int64)
    if run_setup_anndata:
        setup_anndata(adata, labels_key="labels", batch_key="batch")
    return adata
Example #6
def _load_mouse_ob_dataset(save_path: str = "data/", run_setup_anndata: bool = True):
    save_path = os.path.abspath(save_path)
    url = "http://www.spatialtranscriptomicsresearch.org/wp-content/uploads/2016/07/Rep11_MOB_count_matrix-1.tsv"
    save_fn = "Rep11_MOB_count_matrix-1.tsv"
    _download(url, save_path, save_fn)
    adata = _load_csv(
        os.path.join(save_path, save_fn), delimiter="\t", gene_by_cell=False
    )
    adata.obs["batch"] = np.zeros(adata.shape[0]).astype(int)
    adata.obs["labels"] = np.zeros(adata.shape[0]).astype(int)

    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")

    return adata
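The `_load_csv` helper is not shown in this listing; the following is a rough, self-contained sketch of what such a reader could look like, assuming the counts table has observations as rows when `gene_by_cell=False` (the name `_load_csv_sketch` is hypothetical):

import anndata
import pandas as pd

def _load_csv_sketch(path, delimiter="\t", gene_by_cell=False):
    # Read the counts table; the first column holds the row names.
    df = pd.read_csv(path, sep=delimiter, index_col=0)
    # Transpose when genes are rows so AnnData ends up cells x genes.
    if gene_by_cell:
        df = df.T
    return anndata.AnnData(df)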
Example #7
def _load_frontalcortex_dropseq(
    save_path: str = "data/", run_setup_anndata: bool = True
) -> AnnData:
    save_path = os.path.abspath(save_path)
    url = "https://github.com/YosefLab/scVI-data/raw/master/fc-dropseq.loom"
    save_fn = "fc-dropseq.loom"
    _download(url, save_path, save_fn)
    adata = _load_loom(os.path.join(save_path, save_fn))
    adata.obs["batch"] = adata.obs["Clusters"]
    del adata.obs["Clusters"]
    adata.obs["labels"] = np.zeros(adata.shape[0], dtype=np.int64)

    # reorder labels such that layers of the cortex are in order
    # order_labels = [5, 6, 3, 2, 4, 0, 1, 8, 7, 9, 10, 11, 12, 13]
    # self.reorder_cell_types(self.cell_types[order_labels])

    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")

    return adata
Example #8
def _load_retina(save_path: str = "data/", run_setup_anndata: bool = True) -> AnnData:
    """\
    Loads retina dataset

    The dataset of bipolar cells contains after their original pipeline for filtering 27,499 cells and
    13,166 genes coming from two batches. We use the cluster annotation from 15 cell-types from the author.
    We also extract their normalized data with Combat and use it for benchmarking.

    """
    save_path = os.path.abspath(save_path)
    url = "https://github.com/YosefLab/scVI-data/raw/master/retina.loom"
    save_fn = "retina.loom"
    _download(url, save_path, save_fn)
    adata = _load_loom(os.path.join(save_path, save_fn))
    cell_types = [
        "RBC",
        "MG",
        "BC5A",
        "BC7",
        "BC6",
        "BC5C",
        "BC1A",
        "BC3B",
        "BC1B",
        "BC2",
        "BC5D",
        "BC3A",
        "BC5B",
        "BC4",
        "BC8_9",
    ]
    adata.obs["labels"] = [
        cell_types[i] for i in adata.obs["ClusterID"].values.astype(int).ravel()
    ]
    del adata.obs["ClusterID"]
    adata.obs["batch"] = pd.Categorical(adata.obs["BatchID"].values.copy())
    del adata.obs["BatchID"]
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")

    return adata
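A hypothetical usage sketch; labels come back as human-readable cell-type strings and the batch column as a pandas Categorical:

adata = _load_retina(run_setup_anndata=False)
print(adata.obs["labels"].unique())  # e.g. "RBC", "MG", "BC5A", ...
print(adata.obs["batch"].dtype)      # category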
Example #9
def _load_prefrontalcortex_starmap(
    save_path: str = "data/", run_setup_anndata: bool = True
) -> AnnData:
    """\
    Loads a starMAP dataset of 3,704 cells and 166 genes from the mouse pre-frontal cortex (Wang et al., 2018)
    """
    save_path = os.path.abspath(save_path)
    url = "https://github.com/YosefLab/scVI-data/raw/master/mpfc-starmap.loom"
    save_fn = "mpfc-starmap.loom"
    _download(url, save_path, save_fn)
    adata = _load_loom(os.path.join(save_path, save_fn))

    adata.obs["labels"] = adata.obs.Clusters.values
    del adata.obs["Clusters"]

    adata.obs["batch"] = adata.obs.BatchID.values
    del adata.obs["BatchID"]
    adata.obs["x_coord"] = adata.obsm["Spatial_coordinates"][:, 0]
    adata.obs["y_coord"] = adata.obsm["Spatial_coordinates"][:, 1]
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
Example #10
def _load_brainlarge_dataset(
    save_path: str = "data/",
    run_setup_anndata: bool = True,
    sample_size_gene_var: int = 10000,
    max_cells_to_keep: Optional[int] = None,
    n_genes_to_keep: int = 720,
    loading_batch_size: int = 100000,
) -> anndata.AnnData:
    """Loads brain-large dataset."""
    url = "http://cf.10xgenomics.com/samples/cell-exp/1.3.0/1M_neurons/1M_neurons_filtered_gene_bc_matrices_h5.h5"
    save_fn = "brain_large.h5"

    _download(url, save_path, save_fn)
    adata = _load_brainlarge_file(
        os.path.join(save_path, save_fn),
        sample_size_gene_var=sample_size_gene_var,
        max_cells_to_keep=max_cells_to_keep,
        n_genes_to_keep=n_genes_to_keep,
        loading_batch_size=loading_batch_size,
    )
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
Example #11
def _load_purified_pbmc_dataset(
    save_path: str = "data/",
    subset_datasets: Optional[List[str]] = None,
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    """Loads the purified PBMC dataset, optionally subset to the given cell types."""
    url = "https://github.com/YosefLab/scVI-data/raw/master/PurifiedPBMCDataset.h5ad"
    save_fn = "PurifiedPBMCDataset.h5ad"
    _download(url, save_path, save_fn)
    path_to_file = os.path.join(save_path, save_fn)
    adata = anndata.read_h5ad(path_to_file)

    dataset_names = [
        "cd4_t_helper",
        "regulatory_t",
        "naive_t",
        "memory_t",
        "cytotoxic_t",
        "naive_cytotoxic",
        "b_cells",
        "cd34",
        "cd56_nk",
        "cd14_monocytes",
    ]
    if subset_datasets is not None:
        row_indices = []
        for dataset in subset_datasets:
            assert dataset in dataset_names
            idx = np.where(adata.obs["cell_types"] == dataset)[0]
            row_indices.append(idx)
        row_indices = np.concatenate(row_indices)
        adata = adata[row_indices].copy()

    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")

    return adata
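A hypothetical usage sketch: restrict the dataset to the T-cell populations; names outside dataset_names trip the assert in the loader:

adata_t = _load_purified_pbmc_dataset(
    subset_datasets=["naive_t", "memory_t", "regulatory_t"],
    run_setup_anndata=False,
)
print(adata_t.obs["cell_types"].unique())  # only the requested populations remain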
Example #12
def _load_pbmcs_10x_cite_seq(
    save_path: str = "data/",
    protein_join: str = "inner",
    run_setup_anndata: bool = True,
):
    """Filtered PBMCs from 10x Genomics profiled with RNA and protein

    Datasets were filtered for doublets and other outliers as in
    https://github.com/YosefLab/totalVI_reproducibility/blob/master/data/data_filtering_scripts/pbmc_10k/pbmc_10k.py

    Parameters
    ----------
    protein_join
        Whether to take an inner join or outer join of proteins

    Returns
    -------
    `AnnData` with `.obsm["protein_expression"]

    Missing protein values are zero, and are identified during `AnnData` setup.
    """

    url = "https://github.com/YosefLab/scVI-data/raw/master/pbmc_10k_protein_v3.h5ad?raw=true"
    save_fn = "pbmc_10k_protein_v3.h5ad"
    _download(url, save_path, save_fn)
    dataset1 = anndata.read_h5ad(os.path.join(save_path, save_fn))

    url = "https://github.com/YosefLab/scVI-data/raw/master/pbmc_5k_protein_v3.h5ad?raw=true"
    save_fn = "pbmc_5k_protein_v3.h5ad"
    _download(url, save_path, save_fn)
    dataset2 = anndata.read_h5ad(os.path.join(save_path, save_fn))

    common_genes = dataset1.var_names.intersection(dataset2.var_names)
    dataset1 = dataset1[:, common_genes]
    dataset2 = dataset2[:, common_genes]
    dataset1.obsm["protein_expression"] = pd.DataFrame(
        dataset1.obsm["protein_expression"],
        columns=dataset1.uns["protein_names"],
        index=dataset1.obs_names,
    )
    dataset2.obsm["protein_expression"] = pd.DataFrame(
        dataset2.obsm["protein_expression"],
        columns=dataset2.uns["protein_names"],
        index=dataset2.obs_names,
    )
    del dataset1.uns["protein_names"]
    del dataset2.uns["protein_names"]

    dataset = dataset1.concatenate(dataset2, join=protein_join)
    dataset.obsm["protein_expression"] = dataset.obsm[
        "protein_expression"].fillna(0)
    dataset.obs["labels"] = np.zeros(dataset.shape[0], dtype=np.int64)
    dataset.obs["batch"] = dataset.obs["batch"].astype(np.int64)

    if run_setup_anndata:
        setup_anndata(
            dataset,
            batch_key="batch",
            labels_key="labels",
            protein_expression_obsm_key="protein_expression",
        )

    return dataset
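A hypothetical usage sketch: with `protein_join="outer"` the concatenated `.obsm["protein_expression"]` keeps the union of proteins from both datasets, and values missing from one dataset are the zeros filled in above:

dataset = _load_pbmcs_10x_cite_seq(protein_join="outer")
protein_df = dataset.obsm["protein_expression"]
print(protein_df.shape)  # cells x union of the two protein panels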
Example #13
def _load_pbmc_dataset(
    save_path: str = "data/",
    run_setup_anndata: bool = True,
    remove_extracted_data: bool = True,
) -> anndata.AnnData:
    """Loads the PBMC dataset: pbmc8k and pbmc4k concatenated, with DE metadata."""
    urls = [
        "https://github.com/YosefLab/scVI-data/raw/master/gene_info.csv",
        "https://github.com/YosefLab/scVI-data/raw/master/pbmc_metadata.pickle",
    ]
    save_fns = ["gene_info_pbmc.csv", "pbmc_metadata.pickle"]

    for url, save_fn in zip(urls, save_fns):
        _download(url, save_path, save_fn)

    de_metadata = pd.read_csv(os.path.join(save_path, "gene_info_pbmc.csv"), sep=",")
    with open(os.path.join(save_path, "pbmc_metadata.pickle"), "rb") as f:
        pbmc_metadata = pickle.load(f)
    pbmc8k = _load_dataset_10x(
        "pbmc8k",
        save_path=save_path,
        var_names="gene_ids",
        remove_extracted_data=remove_extracted_data,
    )
    pbmc4k = _load_dataset_10x(
        "pbmc4k",
        save_path=save_path,
        var_names="gene_ids",
        remove_extracted_data=remove_extracted_data,
    )
    barcodes = np.concatenate((pbmc8k.obs_names, pbmc4k.obs_names))

    adata = pbmc8k.concatenate(pbmc4k)
    adata.obs_names = barcodes

    dict_barcodes = dict(zip(barcodes, np.arange(len(barcodes))))
    subset_cells = []
    barcodes_metadata = pbmc_metadata["barcodes"].index.values.ravel().astype(np.str)
    for barcode in barcodes_metadata:
        if (
            barcode in dict_barcodes
        ):  # barcodes with end -11 filtered on 10X website (49 cells)
            subset_cells += [dict_barcodes[barcode]]
    adata = adata[np.asarray(subset_cells), :].copy()
    idx_metadata = np.asarray(
        [not barcode.endswith("11") for barcode in barcodes_metadata], dtype=bool
    )
    genes_to_keep = list(
        de_metadata["ENSG"].values
    )  # only keep the genes for which we have de data
    difference = list(
        set(genes_to_keep).difference(set(adata.var_names))
    )  # Non empty only for unit tests
    for gene in difference:
        genes_to_keep.remove(gene)

    adata = adata[:, genes_to_keep].copy()
    design = pbmc_metadata["design"][idx_metadata]
    raw_qc = pbmc_metadata["raw_qc"][idx_metadata]
    normalized_qc = pbmc_metadata["normalized_qc"][idx_metadata]

    design.index = adata.obs_names
    raw_qc.index = adata.obs_names
    normalized_qc.index = adata.obs_names
    adata.obs["batch"] = adata.obs["batch"].astype(np.int64)
    adata.obsm["design"] = design
    adata.obsm["raw_qc"] = raw_qc
    adata.obsm["normalized_qc"] = normalized_qc

    adata.obsm["qc_pc"] = pbmc_metadata["qc_pc"][idx_metadata]
    labels = pbmc_metadata["clusters"][idx_metadata]
    cell_types = pbmc_metadata["list_clusters"]
    adata.obs["labels"] = labels
    adata.uns["cell_types"] = cell_types
    adata.obs["str_labels"] = [cell_types[i] for i in labels]

    adata.var["n_counts"] = np.squeeze(np.asarray(np.sum(adata.X, axis=0)))

    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
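A standalone illustration of the barcode-subsetting pattern used above: map each concatenated barcode to its row index, then keep only the metadata barcodes that survived 10X filtering (toy values, not the real barcodes):

import numpy as np

barcodes = np.array(["AAAC-1", "AAAG-1", "AAAT-2"])
dict_barcodes = dict(zip(barcodes, np.arange(len(barcodes))))
barcodes_metadata = ["AAAC-1", "AAAT-2", "GGGG-11"]  # last one was filtered out
subset_cells = [dict_barcodes[b] for b in barcodes_metadata if b in dict_barcodes]
print(subset_cells)  # [0, 2]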
Example #14
def _load_dataset_10x(
    dataset_name: Optional[str] = None,
    filename: Optional[str] = None,
    save_path: str = "data/10X",
    url: Optional[str] = None,
    return_filtered: bool = True,
    remove_extracted_data: bool = False,
    **scanpy_read_10x_kwargs,
):
    """Loads a 10x Genomics dataset, downloading and extracting it as needed."""
    try:
        import scanpy
    except ImportError:
        raise ImportError("Please install scanpy -- `pip install scanpy`")

    # form data url and filename unless manual override
    if dataset_name is not None:
        if url is not None:
            logger.warning("dataset_name provided, manual url is disregarded.")
        if filename is not None:
            logger.warning("dataset_name provided, manual filename is disregarded.")
        group = dataset_to_group[dataset_name]
        url_skeleton = group_to_url_skeleton[group]

        filter_type = "filtered" if return_filtered else "raw"
        url = url_skeleton.format(group, dataset_name, dataset_name, filter_type)
        filename_skeleton = group_to_filename_skeleton[group]
        filename = filename_skeleton.format(filter_type)
        save_path = os.path.join(save_path, dataset_name)
    elif filename is not None and url is not None:
        logger.info("Loading 10X dataset with custom url and filename")
    elif filename is not None and url is None:
        logger.info("Loading local 10X dataset with custom filename")
    else:
        logger.info("Loading extracted local 10X dataset with custom filename")
    _download(url, save_path=save_path, filename=filename)
    file_path = os.path.join(save_path, filename)

    # untar if the download is a tar.gz archive
    download_is_targz = url is not None and url.endswith(".tar.gz")
    was_extracted = False
    if download_is_targz:
        if not os.path.exists(file_path[:-7]):  # nothing extracted yet
            if tarfile.is_tarfile(file_path):
                logger.info("Extracting tar file")
                with tarfile.open(file_path, "r:gz") as tar:
                    tar.extractall(path=save_path)
                was_extracted = True
        path_to_data_folder, suffix = _find_path_to_mtx(save_path)
        adata = scanpy.read_10x_mtx(path_to_data_folder, **scanpy_read_10x_kwargs)
        if was_extracted and remove_extracted_data:
            folders_in_save_path = path_to_data_folder[len(save_path) + 1 :].split(os.sep)
            extracted_folder_path = os.path.join(save_path, folders_in_save_path[0])
            logger.info("Removing extracted data at {}".format(extracted_folder_path))
            shutil.rmtree(extracted_folder_path)
    else:
        adata = scanpy.read_10x_h5(file_path, **scanpy_read_10x_kwargs)

    adata.var_names_make_unique()
    scanpy.pp.filter_cells(adata, min_counts=1)
    scanpy.pp.filter_genes(adata, min_counts=1)

    return adata
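A hypothetical usage sketch: download the filtered pbmc4k matrix (as in Example #13) and forward `var_names` through `**scanpy_read_10x_kwargs` to `scanpy.read_10x_mtx`:

adata = _load_dataset_10x(
    dataset_name="pbmc4k",
    save_path="data/10X",
    var_names="gene_ids",        # passed through to scanpy.read_10x_mtx
    remove_extracted_data=True,  # delete the extracted tarball contents afterwards
)
print(adata.shape)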