Exemplo n.º 1
0
def read_scale_dataset(dsname="leukemia",
                       filtered_genes=True,
                       override=False,
                       verbose=True) -> SingleCellOMIC:
    r""" Load one of the datasets provided by (Xiong et al. 2019) as a
    `SingleCellOMIC`, four datasets are supported:

    - 'breast_tumor'
    - 'forebrain'
    - 'leukemia'
    - 'insilico'

  The archive is downloaded into the download folder, its members are
  unpacked (flattened, directory entries skipped) into the preprocessed
  folder the first time that folder is empty, and the arrays belonging to
  `dsname` are then loaded from there.

  Note:
    `filtered_genes`, `override` and `verbose` are accepted but are not
    used by this function.

  Reference:
    Xiong, L. et al. SCALE method for single-cell ATAC-seq analysis via latent
      feature extraction. Nat Commun 10, 4576 (2019).

  """
    available = {'breast_tumor', 'forebrain', 'leukemia', 'insilico'}
    assert dsname in available, \
      f"Cannot find dataset with name {dsname}, available datasets are: {available}"
    download_path = os.path.join(DOWNLOAD_DIR, "scale_dataset")
    preprocessed_path = os.path.join(DATA_DIR, "scale_preprocessed")
    for folder in (download_path, preprocessed_path):
        if not os.path.exists(folder):
            os.makedirs(folder)
    ### Download data
    url = str(base64.decodebytes(_URL), 'utf-8')
    archive_path = os.path.join(download_path, os.path.basename(url))
    download_file(url, archive_path, override=False, md5=_MD5)
    ### extract the data (only when nothing has been extracted yet)
    if not os.listdir(preprocessed_path):
        with zipfile.ZipFile(archive_path, "r") as archive:
            for member in archive.filelist:
                basename = os.path.basename(member.filename)
                # directory entries end with '/', hence have no basename
                if not basename:
                    continue
                target = os.path.join(preprocessed_path, basename)
                with open(target, 'wb') as fout:
                    fout.write(archive.read(member))

    ### load the data
    def _stored(suffix):
        # every array of a dataset is stored as "<dsname>_<suffix>"
        return os.path.join(preprocessed_path, f"{dsname}_{suffix}")

    cell = np.load(_stored("cell"))
    labels = np.load(_stored("labels"))
    peak = np.load(_stored("peak"))
    x = sparse.load_npz(_stored("x"))
    sco = SingleCellOMIC(X=x,
                         cell_id=cell,
                         gene_id=peak,
                         omic=OMIC.atac,
                         name=dsname)
    # one-hot encode the labels, classes are ordered alphabetically
    classes = sorted(set(labels))
    class_index = dict(zip(classes, range(len(classes))))
    encoded = np.array([class_index[i] for i in labels])
    sco.add_omic(OMIC.celltype,
                 X=one_hot(encoded, len(class_index)),
                 var_names=list(class_index.keys()))
    return sco
Exemplo n.º 2
0
def get_dataset(dataset_name, override=False, verbose=True) -> SingleCellOMIC:
  r""" Load a registered dataset by name, check `get_dataset_meta` for more
  information.

  The name is lower-cased and stripped before it is looked up in the
  mapping returned by `get_dataset_meta`.

  List of all dataset available: ['call', 'callall', 'mpal', 'mpalall',
    'mpalatac', '100yo', '8klyall', '8kmyall', '8kly', '8kmy', '8k',
    '8kall', 'ecclyall', 'eccly', 'eccmyall', 'eccmy', 'ecc', 'eccall',
    '8kx', '8kxall', 'eccx', 'eccxall', 'vdj1x', 'vdj1xall', 'vdj4x',
    'vdj4xall', 'mpalx', 'mpalxall', 'callx', 'callxall', 'pbmcciteseq',
    'cbmcciteseq', 'pbmc5000', 'facs7', 'facs5', 'facs2', 'pbmcscvi',
    'cortex', 'retina', 'hemato', 'vdj1', 'vdj1all', 'vdj2', 'vdj2all',
    'vdj3', 'vdj3all', 'vdj4', 'vdj4all', 'vdjhs3', 'vdjhs3all', 'vdjhs4',
    'vdjhs4all', 'neuron10k', 'neuron10kall', 'heart10k', 'heart10kall',
    'memoryt', 'memorytall', 'naivet', 'naivetall', 'regulatoryt',
    'regulatorytall', 'cd4t', 'cd4tall', '5k', '5kall', '18k', '18kall',
    '4k', '4kall', '10k', '10kall']

  Return:
    A single `SingleCellOMIC`. When the loader already returns a
    `SingleCellOMIC` it is returned as is; otherwise one is built from the
    'X', 'X_row' and 'X_col' entries of the loaded dataset and, if a 'y'
    entry is present, it is attached as `OMIC.celltype` (binary data) or
    `OMIC.proteomic` (anything else).

  Raise:
    RuntimeError : if `dataset_name` is not a registered dataset.

  Example:
    sco = get_dataset("cortex")
  """
  registry = get_dataset_meta()
  # ====== normalize and validate the name ====== #
  dataset_name = str(dataset_name).lower().strip()
  if dataset_name not in registry:
    known = ", ".join(list(registry.keys()))
    raise RuntimeError(
        'Cannot find dataset with name: "%s", all dataset include: %s' %
        (dataset_name, known))
  # ====== run the loader ====== #
  with catch_warnings_ignore(FutureWarning):
    loader = registry[dataset_name]
    ds = loader(override=override, verbose=verbose)
  # the loader may already provide the final object
  if isinstance(ds, SingleCellOMIC):
    return ds
  # ====== otherwise wrap the raw arrays ====== #
  validating_dataset(ds)
  with catch_warnings_ignore(FutureWarning):
    sco = SingleCellOMIC(X=ds['X'],
                         cell_id=ds['X_row'],
                         gene_id=ds['X_col'],
                         name=dataset_name)
    if 'y' in ds:
      y = ds['y']
      # binary labels are cell types, everything else is protein level
      target = OMIC.celltype if is_binary_dtype(y) else OMIC.proteomic
      sco.add_omic(target, y, ds['y_col'])
  return sco
Exemplo n.º 3
0
def read_mouse_ATLAS(filtered_genes=True,
                     override=False,
                     verbose=True) -> SingleCellOMIC:
    r""" sci-ATAC-seq, to profile genome-wide chromatin accessibility in ∼100,000
  single cells from 13 adult mouse tissues:

    - The regulatory landscape of adult mouse tissues mapped by single-cell
      chromatin assay
    - Characterization of 85 distinct chromatin patterns across 13 different
      tissues
    - Annotation of key regulators and regulatory sequences in diverse
      mammalian cell types
    - Dataset allows resolution of cell types underlying common human traits
      and diseases

  Arguments:
    filtered_genes : accepted for signature compatibility, not used here.
    override : accepted for signature compatibility, not used here (files
      already on disk are always reused).
    verbose : if True, print a message before the counts matrix is parsed.

  Return:
    `SingleCellOMIC` named "mouse_atlas" with `OMIC.atac` as the main data
    and one-hot encoded `OMIC.celltype` and `OMIC.tissue` attached.

  References:
    Cusanovich, D. A. et al. A Single-Cell Atlas of In Vivo Mammalian Chromatin
      Accessibility. Cell 174, 1309-1324.e18 (2018).
    Link https://atlas.gs.washington.edu/mouse-atac/
  """
    download_path = os.path.join(DOWNLOAD_DIR, "mouse_atac")
    preprocessed_path = os.path.join(DATA_DIR, "mouse_atac_preprocessed")
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    if not os.path.exists(preprocessed_path):
        os.makedirs(preprocessed_path)
    ### Download data
    files = {}
    for name, (url, md5) in _URLs.items():
        filepath = os.path.join(download_path, os.path.basename(url))
        files[name] = download_file(url, filepath, override=False, md5=md5)
    ### save counts matrix (cached, only converted once)
    path = os.path.join(preprocessed_path, 'counts')
    if not os.path.exists(path):
        if verbose:
            print("Reading counts matrix ...")
        counts = mmread(files['counts'])
        # `np.uint8` is the NumPy unsigned 8-bit type (there is no
        # `np.unit8`, which raises AttributeError).
        # NOTE(review): uint8 assumes every count fits in 0..255 - confirm.
        counts = counts.astype(np.uint8)
        with open(path, 'wb') as f:
            sparse.save_npz(f, counts, compressed=False)
    ### save metadata (cached, only converted once)
    path = os.path.join(preprocessed_path, 'metadata')
    if not os.path.exists(path):
        # one identifier per line, empty lines are dropped
        with open(files['cellids'], 'r') as f:
            cell = np.array([i for i in f.read().split('\n') if len(i) > 0])
        with open(files['peakids'], 'r') as f:
            peak = np.array([i for i in f.read().split('\n') if len(i) > 0])
        metadata = pd.read_csv(files['metadata'], sep="\t")
        # the metadata table must have exactly one row per cell
        assert metadata.shape[0] == len(cell)
        tissue = metadata['tissue'].to_numpy()
        celltype = metadata['cell_label'].to_numpy()
        with open(path, 'wb') as f:
            np.savez(f, cell=cell, peak=peak, tissue=tissue, celltype=celltype)
    ### Read all data and create SCO
    counts = sparse.csr_matrix(
        sparse.load_npz(os.path.join(preprocessed_path, 'counts')))
    # allow_pickle: presumably needed because the label columns are stored
    # as object arrays - the file is produced locally by the code above
    metadata = np.load(os.path.join(preprocessed_path, 'metadata'),
                       allow_pickle=True)
    cell = metadata['cell']
    peak = metadata['peak']
    tissue = metadata['tissue']
    celltype = metadata['celltype']
    # need to transpose here, counts matrix is [peaks, cells]
    sco = SingleCellOMIC(X=counts.T,
                         cell_id=cell,
                         gene_id=peak,
                         omic=OMIC.atac,
                         name="mouse_atlas")
    # add celltype (classes ordered alphabetically, one-hot encoded)
    labels = {name: i for i, name in enumerate(sorted(set(celltype)))}
    sco.add_omic(OMIC.celltype,
                 X=one_hot(np.array([labels[i] for i in celltype]),
                           len(labels)),
                 var_names=list(labels.keys()))
    # add tissue type (same encoding as the cell type)
    labels = {name: i for i, name in enumerate(sorted(set(tissue)))}
    sco.add_omic(OMIC.tissue,
                 X=one_hot(np.array([labels[i] for i in tissue]), len(labels)),
                 var_names=list(labels.keys()))
    return sco
Exemplo n.º 4
0
def read_melanoma_cisTopicData(filtered_genes=True,
                               override=False,
                               verbose=True):
  r""" melanoma ATAC data from (Bravo González-Blas, et al. 2019)

  Every URL in `_URL` is downloaded if its file is missing. The R objects
  are converted with `rpy2` and pickled into the preprocessed folder the
  first time that folder is empty, and everything found in the folder is
  then unpickled to build the returned object.

  Arguments:
    filtered_genes : accepted but not used by this function.
    override : accepted but not used by this function.
    verbose : if True, print the downloading and conversion progress.

  Return:
    `SingleCellOMIC` with `OMIC.atac` as main data, and a one-hot encoded
    `OMIC.celltype` whose classes are "<cellLine>_<LineType prefix>".

  Raise:
    ImportError : if package 'rpy2' is needed but cannot be imported.

  Reference:
    Bravo González-Blas, C. et al. cisTopic: cis-regulatory topic modeling
      on single-cell ATAC-seq data. Nat Methods 16, 397–400 (2019).
    Verfaillie, A. et al. Decoding the regulatory landscape of melanoma
      reveals TEADS as regulators of the invasive cell state.
      Nat Commun 6, (2015).
  """
  download_dir = os.path.join(DOWNLOAD_DIR, 'cistopic')
  if not os.path.exists(download_dir):
    os.makedirs(download_dir)
  preprocessed_path = os.path.join(DATA_DIR, 'cistopic_preprocessed')
  if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)
  ### downloading the data
  # maps the file name without extension (also the R object name) to its path
  downloaded = {}
  for url in _URL:
    basename = os.path.basename(url)
    local_path = os.path.join(download_dir, basename)
    if not os.path.exists(local_path):
      if verbose:
        print(f"Downloading file: {basename} ...")
      urlretrieve(url, filename=local_path)
    downloaded[basename.split(".")[0]] = local_path
  ### preprocess data
  if not os.listdir(preprocessed_path):
    try:
      import rpy2.robjects as robjects
      from rpy2.robjects import pandas2ri
      from rpy2.robjects.conversion import localconverter
      robjects.r['options'](warn=-1)
      robjects.r("library(Matrix)")
      pandas2ri.activate()
    except ImportError:
      raise ImportError("Require package 'rpy2' for reading Rdata file.")
    for rname, rdata_path in downloaded.items():
      robjects.r['load'](rdata_path)
      converted = robjects.r[rname]
      if rname != "counts_mel":
        converted = robjects.conversion.rpy2py(converted)
      else:
        with localconverter(robjects.default_converter + pandas2ri.converter):
          # dgCMatrix: reusing its (x, i, p) slots as CSR with the
          # reversed shape yields the transposed matrix
          r_shape = tuple(robjects.r("dim")(converted))
          converted = sparse.csr_matrix(
              (converted.slots["x"], converted.slots["i"],
               converted.slots["p"]),
              shape=r_shape[::-1],
              dtype=np.float32)
      with open(os.path.join(preprocessed_path, rname), "wb") as f:
        pickle.dump(converted, f)
      if verbose:
        print(f"Loaded file: {rname} - {type(converted)} - {converted.shape}")
    pandas2ri.deactivate()
  ### load_data
  loaded = {}
  for name in os.listdir(preprocessed_path):
    with open(os.path.join(preprocessed_path, name), 'rb') as f:
      loaded[name] = pickle.load(f)
  ### sco
  x = loaded['counts_mel']
  cell_meta = loaded["cellData_mel"]
  region_names = [f"Region{i + 1}" for i in range(x.shape[1])]
  sco = SingleCellOMIC(X=x,
                       cell_id=cell_meta.index,
                       gene_id=region_names,
                       omic=OMIC.atac)
  # celltype: "<cellLine>_<text before the first '-' of LineType>"
  labels = np.array([
      line + '_' + kind.split("-")[0]
      for line, kind in zip(cell_meta['cellLine'], cell_meta['LineType'])
  ])
  labels_name = {name: i for i, name in enumerate(sorted(set(labels)))}
  labels = np.array([labels_name[i] for i in labels])
  sco.add_omic(OMIC.celltype, one_hot(labels, len(labels_name)),
               list(labels_name.keys()))
  return sco
Exemplo n.º 5
0
def read_PBMCeec(subset='ly',
                 override=False,
                 verbose=True,
                 filtered_genes=True) -> SingleCellOMIC:
    r""" Load the PBMCecc dataset as a `SingleCellOMIC`.

    Arguments:
      subset : 'ly' (lymphoid), 'my' (myeloid) or 'full'; the value is
        stripped and lower-cased. Only 'ly' is currently supported.
      override : if True, remove the existing preprocessed folder so the
        data is preprocessed again.
      verbose : if True, print progress information.
      filtered_genes : if True, use the 'X_var' matrix of the downloaded
        file, otherwise the 'X_full' matrix.

    Return:
      `SingleCellOMIC` with 'transcriptomic' as main data, plus 'proteomic'
      and a two-column ('myeloid', 'lymphoid') 'progenitor' indicator.

    Raise:
      ValueError : if `subset` is not one of 'ly', 'my', 'full'.
      NotImplementedError : if `subset` is 'my' or 'full'.
    """
    subset = str(subset).strip().lower()
    if subset not in ('ly', 'my', 'full'):
        raise ValueError(
            "subset can only be 'ly'-lymphoid and 'my'-myeloid or 'full'")
    if subset in ('my', 'full'):
        raise NotImplementedError("No support for subset: %s - PBMCecc" %
                                  subset)
    gene_tag = 'filtered' if filtered_genes else 'all'
    download_path = os.path.join(DOWNLOAD_DIR, f"PBMCecc_{subset}_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = os.path.join(
        DATA_DIR, f"PBMCecc_{subset}_{gene_tag}_preprocessed")
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at path {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        # ====== full (guard, already rejected above) ====== #
        if subset == 'full':
            raise NotImplementedError
        # ====== ly and my ====== #
        encoded_url = _URL_LYMPHOID if subset == 'ly' else _URL_MYELOID
        url = str(base64.decodebytes(encoded_url), 'utf-8')
        path = os.path.join(download_path, os.path.basename(url))
        download_file(filename=path, url=url, override=False)
        # ====== extract the data ====== #
        data = np.load(path)
        X_row = data['X_row']
        y = data['y']
        y_col = data['y_col']
        matrix_key = 'X_var' if filtered_genes else 'X_full'
        X = data[matrix_key]
        X_col = data[matrix_key + '_col']
        cell_types = np.array(['ly'] * X.shape[0])
        # ====== save everything ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        assert X.shape == (len(X_row), len(X_col))
        assert len(X) == len(y)
        assert y.shape[1] == len(y_col)
        with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
            pickle.dump(cell_types, f)
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    sco = SingleCellOMIC(X=ds['X'],
                         cell_id=ds['X_row'],
                         gene_id=ds['X_col'],
                         omic='transcriptomic',
                         name=f"ecc{subset}{'' if filtered_genes else 'all'}")
    sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
    # indicator columns follow the order of `var_names` below
    myeloid, lymphoid = (1, 0), (0, 1)
    progenitor = ds['cell_types']
    indicator = np.array(
        [myeloid if i == 'my' else lymphoid for i in progenitor],
        dtype=np.float32)
    sco.add_omic(
        'progenitor',
        X=indicator,
        var_names=np.array(['myeloid', 'lymphoid']),
    )
    return sco
def read_human_embryos(filtered_genes=True,
                       override=False,
                       verbose=True) -> SingleCellOMIC:
    r""" Transcriptional map of human embryo development, including the sequenced
    transcriptomes of 1529 individual cells from 88 human preimplantation
    embryos. These data show that cells undergo an intermediate state of
    co-expression of lineage-specific genes, followed by a concurrent
    establishment of the trophectoderm, epiblast, and primitive endoderm
    lineages, which coincide with blastocyst formation.

  Arguments:
    filtered_genes : if True, keep only the genes selected by
      `filter_highly_variable_genes(n_top_genes=2000)`.
    override : if True, remove the preprocessed folder (when it exists) so
      everything is preprocessed again.
    verbose : if True, print progress information.

  Return:
    `SingleCellOMIC` named "HumanEmbryos" with the counts as
    `OMIC.transcriptomic`, plus `OMIC.rpkm`, a one-hot `OMIC.celltype`
    derived from the cell identifiers, and `OMIC.ercc`.

  References:
    Petropoulos S, Edsgärd D, Reinius B, et al. Single-Cell RNA-Seq Reveals
      Lineage and X Chromosome Dynamics in Human Preimplantation Embryos.
      Cell. 2016 Sep

  Note:
    Gene expression levels (RefSeq annotations) were estimated in terms of
      reads per kilobase exon model and per million mapped reads (RPKM)
      using rpkmforgenes
    Genes were filtered, keeping 15633/26178 genes that
      * were expressed in at least 5 out of 1919 sequenced cells (RPKM >= 10).
        and
      * for which cells with expression came from at least two
        different embryos.
    Cells were quality-filtered based on 4 criteria, keeping 1529/1919 cells.
      * First, Spearman correlations, using the RPKM expression levels of
        all genes, for every possible pair of cells were calculated and a
        histogram of the maximum correlation obtained for each cell,
        corresponding to the most similar cell, was used to identify 305
        outlier cells with a maximum pair-wise correlations below 0.63.
      * Second, a histogram of the number of expressed genes per cell was
        used to identify 330 outlier cells with less than 5000 expressed
        genes.
      * Third, a histogram of the total transcriptional expression output
        from the sex chromosomes (RPKM sum) was used to identify 33 cells
        with indeterminable sex, or a called sex that was inconsistent with
        other cells of that embryo
      * Fourth, 13 outlier cells were identified using PCA and t-SNE
        dimensionality reduction.

  """
    download_dir = os.path.join(DOWNLOAD_DIR, 'human_embryos')
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    preprocessed_path = os.path.join(DATA_DIR, 'human_embryos_preprocessed')
    # only remove the folder when it exists: `shutil.rmtree` raises
    # FileNotFoundError on a missing path (e.g. override on the first run)
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.makedirs(preprocessed_path)
    ### download data
    files = []
    for url, md5 in zip(_URLs, _MD5s):
        path = download_file(url=url,
                             filename=os.path.join(download_dir,
                                                   os.path.basename(url)),
                             override=False,
                             md5=md5)
        files.append(path)
    ### preprocessing
    if len(os.listdir(preprocessed_path)) == 0:
        data_map = {}
        for zip_path in files:
            zipname = os.path.basename(zip_path)
            with zipfile.ZipFile(zip_path, mode="r") as archive:
                for dat_file in archive.filelist:
                    filename = dat_file.filename
                    dat = str(archive.read(filename), 'utf-8')
                    # tab-separated text table, empty lines are skipped
                    x = [
                        line.split('\t')
                        for line in dat.split('\n')
                        if len(line) > 0
                    ]
                    # transpose the table, then split off the name column
                    # and the header row from the numeric values
                    x = np.asarray(x).T
                    row_name = x[1:, 0]
                    col_name = x[0, 1:]
                    x = x[1:, 1:].astype(np.float32)
                    x = sparse.coo_matrix(x)
                    data_map[filename] = (x, row_name, col_name)
                    if verbose:
                        print(f"Read: {zipname} - {filename}")
                        print(f" * Matrix: {x.shape}")
                        print(f" * Row   : {row_name.shape}-{row_name[:3]}")
                        print(f" * Col   : {col_name.shape}-{col_name[:3]}")
        # save loaded data to disk, files are named "<name>:<type>"
        for name, (x, row, col) in data_map.items():
            with open(os.path.join(preprocessed_path, f"{name}:x"), "wb") as f:
                sparse.save_npz(f, x)
            with open(os.path.join(preprocessed_path, f"{name}:row"),
                      "wb") as f:
                np.save(f, row)
            with open(os.path.join(preprocessed_path, f"{name}:col"),
                      "wb") as f:
                np.save(f, col)
        del data_map
    ### read the data
    # counts.txt (1529, 26178)
    # ercc.counts.txt (1529, 92)
    # rpkm.txt (1529, 26178)
    # ercc.rpkm.txt (1529, 92)
    data = {}
    genes_path = os.path.join(preprocessed_path, "filtered_genes")
    for path in os.listdir(preprocessed_path):
        # the cached gene list lives in the same folder, it is not a matrix
        if path == os.path.basename(genes_path):
            continue
        name, ftype = os.path.basename(path).split(':')
        with open(os.path.join(preprocessed_path, path), 'rb') as f:
            if ftype == 'x':
                x = sparse.load_npz(f).tocsr()
            else:
                x = np.load(f)
        data[f"{name}_{ftype}"] = x
    rpkm = data['rpkm.txt_x']
    counts = data['counts.txt_x']
    genes = data['counts.txt_col']
    cells = data['counts.txt_row']
    ### filter genes
    if not os.path.exists(genes_path):
        # filter genes by rpkm (total RPKM over all cells >= 10)
        ids = np.asarray(np.sum(rpkm, axis=0) >= 10).ravel()
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
        # filter genes by min 5 cells
        ids = np.asarray(np.sum(counts > 0, axis=0) >= 5).ravel()
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
        # filter highly variable genes
        sco = SingleCellOMIC(X=counts, cell_id=cells, gene_id=genes)
        sco.normalize(omic=OMIC.transcriptomic, log1p=True)
        sco.filter_highly_variable_genes(n_top_genes=2000)
        filtered = sco.var_names.to_numpy()
        # cache both gene lists so the filtering is skipped next time
        with open(genes_path, 'wb') as f:
            pickle.dump([genes, filtered], f)
        del sco
    else:
        # the pickle is written by this function itself (see above)
        with open(genes_path, 'rb') as f:
            ids, filtered = pickle.load(f)
        ids = set(ids)
        ids = np.asarray([i in ids for i in genes])
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
    # last filtering
    if filtered_genes:
        filtered = set(filtered)
        ids = np.asarray([i in filtered for i in genes])
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
    ### create the SingleCellOMIC
    sco = SingleCellOMIC(X=counts,
                         cell_id=cells,
                         gene_id=genes,
                         omic=OMIC.transcriptomic,
                         name="HumanEmbryos")
    sco.add_omic(omic=OMIC.rpkm, X=rpkm, var_names=genes)
    # the label is the cell id without its last two '.'-separated fields,
    # 'E7.4' is merged into 'E7'
    labels = ['.'.join(i.split('.')[:-2]) for i in sco.obs_names]
    labels = ['E7' if i == 'E7.4' else i for i in labels]
    labels_name = {j: i for i, j in enumerate(sorted(set(labels)))}
    labels = np.array([labels_name[i] for i in labels])
    sco.add_omic(omic=OMIC.celltype,
                 X=one_hot(labels, len(labels_name)),
                 var_names=list(labels_name.keys()))
    sco.add_omic(omic=OMIC.ercc,
                 X=data['ercc.counts.txt_x'],
                 var_names=data['ercc.counts.txt_col'])
    return sco
Exemplo n.º 7
0
def read_PBMC8k(subset='full',
                override=False,
                verbose=True,
                filtered_genes=True,
                return_arrays=False) -> SingleCellOMIC:
    r""" Load the PBMC 8k dataset.

    Arguments:
      subset : 'ly' (lymphoid), 'my' (myeloid) or 'full'; the value is
        stripped and lower-cased.
      override : if True, remove the existing preprocessed folder so the
        data is preprocessed again.
      verbose : if True, log the preprocessing progress.
      filtered_genes : if True, use the 'X_filt' matrix of the 'ly'/'my'
        files, otherwise the 'X_full' matrix. For 'full', the genes and
        proteins are the union of those of the 'ly' and 'my' subsets.
      return_arrays : if True, return the preprocessed `Dataset` instead of
        a `SingleCellOMIC`.

    Return:
      `SingleCellOMIC` with 'transcriptomic' as main data, plus 'proteomic'
      and a two-column ('myeloid', 'lymphoid') 'progenitor' indicator; or
      the `Dataset` when `return_arrays=True`.

    Raise:
      ValueError : if `subset` is not one of 'ly', 'my', 'full'.
    """
    subset = str(subset).strip().lower()
    if subset not in ('ly', 'my', 'full'):
        raise ValueError(
            "subset can only be 'ly'-lymphoid and 'my'-myeloid or 'full'")
    # prepare the path
    gene_tag = 'filtered' if filtered_genes else 'all'
    download_path = os.path.join(DOWNLOAD_DIR, f"PBMC8k_{subset}_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = os.path.join(
        DATA_DIR, f"PBMC8k_{subset}_{gene_tag}_preprocessed")
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if not os.listdir(preprocessed_path):
        if subset != 'full':
            # ====== pbmc ly and my ====== #
            encoded_url = _URL_LYMPHOID if subset == 'ly' else _URL_MYELOID
            url = str(base64.decodebytes(encoded_url), 'utf-8')
            path = os.path.join(download_path, os.path.basename(url))
            download_file(filename=path, url=url, override=False)
            # extract the data
            data = np.load(path)
            X_row = data['X_row']
            y = data['y']
            y_col = data['y_col']
            matrix_key = 'X_filt' if filtered_genes else 'X_full'
            X = data[matrix_key]
            X_col = data[matrix_key + '_col']
            cell_types = np.array([subset] * X.shape[0])
        else:
            # ====== pbmc 8k ====== #
            # both subsets are needed to select the genes and proteins
            ly = read_PBMC8k('ly',
                             filtered_genes=filtered_genes,
                             return_arrays=True)
            my = read_PBMC8k('my',
                             filtered_genes=filtered_genes,
                             return_arrays=True)
            url = str(base64.decodebytes(_URL_PBMC8k), 'utf-8')
            path = os.path.join(download_path, os.path.basename(url))
            download_file(filename=path, url=url, override=False)
            # load data
            data = np.load(path)
            X = data['X']
            X_row = data['X_row']
            X_col = data['X_col'].tolist()
            y = data['y']
            y_col = data['y_col'].tolist()
            # merge all genes from my and ly subset, as sorted column indices
            gene_names = set(ly['X_col'].tolist() + my['X_col'].tolist())
            gene_idx = sorted(X_col.index(g) for g in gene_names)
            # same for protein
            protein_names = set(ly['y_col'].tolist() + my['y_col'].tolist())
            protein_idx = sorted(y_col.index(p) for p in protein_names)
            # keep only the selected columns
            X = X[:, gene_idx]
            y = y[:, protein_idx]
            X_col = np.array(X_col)[gene_idx]
            y_col = np.array(y_col)[protein_idx]
            # a cell not found in the lymphoid subset is labelled myeloid
            ly_rows = ly['X_row']
            cell_types = np.array(
                ['ly' if i in ly_rows else 'my' for i in X_row])
        # ====== save everything ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        assert X.shape == (len(X_row), len(X_col))
        assert len(X) == len(y)
        assert y.shape[1] == len(y_col)
        with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
            pickle.dump(cell_types, f)
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    if return_arrays:
        return ds
    sco = SingleCellOMIC(X=ds['X'],
                         cell_id=ds['X_row'],
                         gene_id=ds['X_col'],
                         omic='transcriptomic',
                         name=f"8k{subset}{'' if filtered_genes else 'all'}")
    sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
    # indicator columns follow the order of `var_names` below
    myeloid, lymphoid = (1, 0), (0, 1)
    progenitor = ds['cell_types']
    indicator = np.array(
        [myeloid if i == 'my' else lymphoid for i in progenitor],
        dtype=np.float32)
    sco.add_omic(
        'progenitor',
        X=indicator,
        var_names=np.array(['myeloid', 'lymphoid']),
    )
    return sco
Exemplo n.º 8
0
def read_dataset10x(name,
                    filtered_cells=True,
                    filtered_genes=True,
                    override=False,
                    verbose=True) -> SingleCellOMIC:
    r""" Download and preprocess a 10x dataset, then load it as a
    `SingleCellOMIC`, i.e. scanpy.AnnData object.

    When the preprocessed folder is empty (first call, or `override=True`),
    the tar archive is downloaded and read, every OMIC matrix is written to
    the preprocessed folder with `MmapArrayWriter` (dtype uint16), the
    metadata are pickled to the file 'metadata', and the names of the
    selected highly variable genes (Cell Ranger recipe, copied from Scanpy)
    are pickled to the file 'top_genes'. Every call then loads the dataset
    back from these files.

    Arguments:
      name : name of the dataset, lower-cased and stripped before it is
        matched against the names listed in `all_datasets`.
      filtered_cells : if True use the 'filtered' variant of the archive,
        otherwise the 'raw' one.
      filtered_genes : if True keep only the genes stored in 'top_genes'.
      override : if True remove any existing preprocessed folder first.
      verbose : if True print progress information.

    Returns:
      SingleCellOMIC holding the main OMIC (float32) and every additional
      OMIC found in the preprocessed folder.

    Raises:
      ValueError : no dataset matches `name`, or the features matrix has an
        unsupported layout or an unknown feature type.
      RuntimeError : several datasets match `name`, or the downloaded file
        is not a tar archive.
      NotImplementedError : the experiment type is not supported.

    Reference:
      https://artyomovlab.wustl.edu/publications/supp_materials/4Oleg/2019_sc_ATAC_seq_DT1634_Denis/sc-atacseq-explorer-Denis-121119.html

    """
    ### prepare the URL
    name = str(name).lower().strip()
    spec = 'filtered' if filtered_cells else 'raw'
    flatten_datasets = [(exp, version, dsname)
                        for exp, i in all_datasets.items()
                        for version, j in i.items() for dsname in j]
    found = [ds for ds in flatten_datasets if ds[2] == name]
    if not found:
        raise ValueError(f"Cannot find data with name {name}, "
                         f"all available datasets are: {flatten_datasets}")
    if len(found) > 1:
        raise RuntimeError(
            f"Found multiple datasets {found} with name='{name}'")
    exp, version, name = found[0]
    dataset_name = name + '_' + spec
    url = group_to_url_skeleton[exp][version].format(version, name, name, spec)
    ### prepare the output path
    filename = os.path.basename(url)
    # download path
    download_path = os.path.join(DOWNLOAD_DIR, exp, version)
    os.makedirs(download_path, exist_ok=True)
    # preprocessing path
    preprocessed_path = os.path.join(DATA_DIR,
                                     f'10x_{exp}_{name}_{spec}_preprocessed')
    if override and os.path.exists(preprocessed_path):
        if verbose:
            print("Overriding path: %s" % preprocessed_path)
        shutil.rmtree(preprocessed_path)
    # makedirs (not mkdir) so a missing DATA_DIR does not abort the call
    os.makedirs(preprocessed_path, exist_ok=True)
    # ******************** preprocessed ******************** #
    if len(os.listdir(preprocessed_path)) == 0:
        if verbose:
            print("Dataset10X:")
            print(" Meta       :", found)
            print(" File       :", filename)
            print(" URL        :", url)
            print(" Download   :", download_path)
            print(" Preprocess :", preprocessed_path)
        ### download the tar file
        path = download_file(url=url,
                             filename=os.path.join(download_path, filename),
                             override=False,
                             md5=_MD5.get(f"{exp}*{version}*{name}*{spec}",
                                          None))
        if not tarfile.is_tarfile(path):
            raise RuntimeError("Expecting tarfile but received: %s" % path)
        contents = {}
        with tarfile.open(path, mode="r:gz") as f:
            all_files = [(path, info.name, info.size, verbose) for info in f
                         if info.isfile()]
        # dedicated loop variables: `name` must keep holding the dataset name
        for content_name, content in MPI(jobs=all_files,
                                         func=_read_tarinfo,
                                         batch=1,
                                         ncpu=4):
            contents[content_name] = content
        # cell barcodes
        barcodes = contents['barcodes']
        ### cell-atac
        if exp == 'cell-atac':
            n_top_genes = 20000  # this is ad-hoc value
            X = contents['matrix'].T.todense()
            peaks = contents['peaks']
            X_col_name = np.array([':'.join(i) for i in peaks])
            save_data = [(OMIC.atac.name, X)]
            save_metadata = dict(main_omic=OMIC.atac.name,
                                 barcodes=barcodes,
                                 chromatin_var=X_col_name)
            # the loader below reads `metadata[f'{main_omic}_var']`, so the
            # feature names must also be stored under the key derived from
            # the main OMIC name ('chromatin_var' is kept for compatibility)
            save_metadata[f"{OMIC.atac.name}_var"] = X_col_name
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.atac,
                                 name=name)
        ### cell-exp and cell-vdj
        elif exp in ('cell-exp', 'cell-vdj'):
            n_top_genes = 2000
            # feature (Id, Name, Type(antibody or gene-expression))
            X_col = contents[
                'features'] if 'features' in contents else contents['genes']
            # data matrix
            X = contents['matrix'].T
            if not isinstance(X, csr_matrix) and hasattr(X, 'tocsr'):
                X = X.tocsr()
            X = X.astype('float32')
            assert X.shape[0] == barcodes.shape[0] and X.shape[
                1] == X_col.shape[0]
            # antibody and gene are provided
            prot_ids = []
            pmhc_ids = []
            gene_ids = []
            if X_col.shape[1] == 3:
                for idx, (feat_id, feat_name, feat_type) in enumerate(X_col):
                    if feat_type == 'Antibody Capture':
                        # for cell-vdj, antibody features without the
                        # "_TotalSeqC" tag are stored as pMHC
                        if exp == "cell-vdj" and "_TotalSeqC" not in feat_name:
                            pmhc_ids.append(idx)
                        else:
                            prot_ids.append(idx)
                    elif feat_type == 'Gene Expression':
                        gene_ids.append(idx)
                    else:
                        raise ValueError(
                            f"Unknown feature type:{feat_id}-{feat_name}-{feat_type}"
                        )
            elif X_col.shape[1] == 2:
                # only (Id, Name) are given: every feature is a gene
                gene_ids = slice(None, None)
            else:
                raise ValueError(f"No support for features matrix\n{X_col}")
            # Antibody Name
            y = X[:, prot_ids]
            y_col_name = X_col[prot_ids][:, 1]
            # pMHC peptide
            if len(pmhc_ids) > 0:
                z = X[:, pmhc_ids]
                z_col_name = X_col[pmhc_ids][:, 1]
            # Gene Name
            X = X[:, gene_ids].todense()
            X_col_name = X_col[gene_ids][:, 1]
            # the matrices are saved as uint16 below
            assert np.min(X) >= 0 and np.max(X) < 65000, \
              f"Only support uint16 data type, given data with max={np.max(X)}"
            # data and metadata
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.transcriptomic,
                                 name=name)
            save_data = [(OMIC.transcriptomic.name, X),
                         (OMIC.proteomic.name, y)]
            save_metadata = {
                'main_omic': OMIC.transcriptomic.name,
                'barcodes': barcodes,
                f"{OMIC.transcriptomic.name}_var": X_col_name,
                f"{OMIC.proteomic.name}_var": y_col_name
            }
            if len(pmhc_ids) > 0:
                save_data.append((OMIC.pmhc.name, z))
                save_metadata[f"{OMIC.pmhc.name}_var"] = z_col_name
        ### others
        else:
            raise NotImplementedError(f"No support for experiment: {exp}")
        ### save data and metadata
        for omic_name, omic_data in save_data:
            outpath = os.path.join(preprocessed_path, omic_name)
            n_samples, n_features = omic_data.shape
            # nothing to write for an empty OMIC
            if n_samples == 0 or n_features == 0:
                continue
            with MmapArrayWriter(outpath,
                                 shape=(0, n_features),
                                 dtype=np.uint16,
                                 remove_exist=True) as f:
                if verbose:
                    # the label must go to `desc`, the first positional
                    # argument of tqdm is the iterable
                    prog = tqdm(desc=f"Saving {outpath}",
                                total=n_samples,
                                unit='samples')
                # written in batches, sparse batches are densified first
                for s, e in batching(batch_size=5120, n=n_samples):
                    x = omic_data[s:e]
                    if hasattr(x, 'todense'):
                        x = x.todense()
                    f.write(x)
                    if verbose:
                        prog.update(e - s)
                if verbose:
                    prog.clear()
                    prog.close()
        # save metadata
        outpath = os.path.join(preprocessed_path, 'metadata')
        with open(outpath, 'wb') as f:
            pickle.dump(save_metadata, f)
        if verbose:
            print(f"Saved metadata to path {outpath}")
        ### filter genes, follow 10x and use Cell Ranger recipe,
        # this is copied from Scanpy
        n_genes = sco.shape[1]
        sc.pp.filter_genes(sco, min_counts=1)
        # normalize with total UMI count per cell
        sc.pp.normalize_total(sco, key_added='n_counts_all')
        filter_result = sc.pp.filter_genes_dispersion(sco.X,
                                                      flavor='cell_ranger',
                                                      n_top_genes=n_top_genes,
                                                      log=False)
        gene_subset = filter_result.gene_subset
        indices = sco.get_var_indices()
        markers = (MARKER_GENES
                   if sco.current_omic == OMIC.transcriptomic else MARKER_ATAC)
        # the markers are always kept, whatever their dispersion
        for marker in markers:
            idx = indices.get(marker, None)
            if idx is not None:
                gene_subset[idx] = True
        sco._inplace_subset_var(gene_subset)  # filter genes
        if verbose:
            print(
                f"Filtering genes {n_genes} to {sco.shape[1]} variated genes.")
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(sco.var_names.values, f)
    # ******************** load and return the dataset ******************** #
    # every file which is not 'metadata', 'top_genes' and has no '_' in its
    # name is treated as one saved OMIC matrix
    omics = [
        fname for fname in os.listdir(preprocessed_path)
        if fname not in ('metadata', 'top_genes') and '_' not in fname
    ]
    with open(os.path.join(preprocessed_path, 'metadata'), 'rb') as f:
        metadata = pickle.load(f)
    with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
        top_genes = pickle.load(f)
    data = {
        o: MmapArray(os.path.join(preprocessed_path, o)).astype(np.float32)
        for o in omics
    }
    main_omic = metadata['main_omic']
    X = data[main_omic]
    var_names = metadata[f'{main_omic}_var']
    if filtered_genes:
        # select the columns of the cached top genes, in their saved order
        var_ids = {j: i for i, j in enumerate(var_names)}
        ids = [var_ids[i] for i in top_genes]
        X = X[:, ids]
        var_names = var_names[ids]
    sco = SingleCellOMIC(
        X,
        cell_id=metadata['barcodes'],
        gene_id=var_names,
        omic=main_omic,
        name=f"{dataset_name}{'' if filtered_genes else 'all'}")
    for o in omics:
        if o != main_omic:
            sco.add_omic(omic=o,
                         X=data[o],
                         var_names=np.asarray(metadata[f'{o}_var']))
    return sco
def read_leukemia_MixedPhenotypes(filtered_genes=True,
                                  omic='rna',
                                  ignore_na=True,
                                  override=False,
                                  verbose=True) -> SingleCellOMIC:
  r""" Integrates highly multiplexed protein quantification, transcriptome
  profiling, and chromatin accessibility analysis. Using this approach,
  we establish a normal epigenetic baseline for healthy blood development,
  which we then use to deconvolve aberrant molecular features within blood
  from mixed-phenotype acute leukemia (MPAL) patients.

  scATAC-seq and CITE-seq performed on healthy bone marrow, CD34+ bone marrow,
  peripheral blood, and MPAL donors

  Arguments:
    filtered_genes : (only for `omic='rna'`) if True keep only the selected
      highly variable genes plus the marker genes; the selection is computed
      once and cached in the file 'top_genes' of the preprocessed folder.
    omic : 'rna' loads scRNA together with the ADT (protein) and cell type
      data, 'atac' loads scATAC together with the cell type data.
    ignore_na : (only for `omic='rna'`) if True remove the ADT columns
      whose maximum is NaN.
    override : if True remove the preprocessed folder before loading.
    verbose : if True print progress information.

  Returns:
    SingleCellOMIC, named 'mpalRNA', 'mpalRNAall' or 'mpalATAC'.

  Raises:
    NotImplementedError : `omic` is neither 'rna' nor 'atac'.

  References:
    Granja JM et al., 2019. "Single-cell multiomic analysis identifies
      regulatory  programs in mixed-phenotype acute leukemia".
      Nature Biotechnology.
    https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE139369
    https://github.com/GreenleafLab/MPAL-Single-Cell-2019
  """
  # fail fast, before any folder is touched or any file is downloaded
  if omic not in ('rna', 'atac'):
    raise NotImplementedError(f"No support for omic type: {omic}")
  ### prepare the path
  download_dir = os.path.join(DOWNLOAD_DIR, 'mpal')
  os.makedirs(download_dir, exist_ok=True)
  preprocessed_path = os.path.join(DATA_DIR, 'mpal_preprocessed')
  # rmtree raises when the folder is missing, e.g. override on a first call
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
    if verbose:
      print(f"Override preprocessed data at {preprocessed_path}")
  os.makedirs(preprocessed_path, exist_ok=True)
  ### download
  files = {}
  for name, (url, md5) in _URL.items():
    files[name] = download_file(url=url,
                                filename=os.path.join(download_dir,
                                                      os.path.basename(url)),
                                override=False,
                                md5=md5)
  ### read the files
  # everything is downloaded, but only the requested omic is read
  if omic == 'atac':
    del files['rna']
    del files['adt']
  else:
    del files['atac']
  all_data = {}
  for name, data in MPI(jobs=list(files.items()),
                        func=partial(_read_data,
                                     verbose=verbose,
                                     preprocessed_path=preprocessed_path),
                        batch=1,
                        ncpu=4):
    all_data[name] = data.load()
  ### load scRNA and ADT
  if omic == 'rna':
    rna = all_data['rna']
    adt = all_data['adt']
    # cells present in both RNA and ADT, kept in RNA order so that the
    # result does not depend on the (randomized) iteration order of a set
    adt_barcodes = set(adt.celldata['Barcode'])
    cell_id = list(
        dict.fromkeys(b for b in rna.celldata['Barcode'] if b in adt_barcodes))
    #
    barcode2ids = {j: i for i, j in enumerate(rna.celldata['Barcode'])}
    ids = [barcode2ids[i] for i in cell_id]
    X_rna = rna.X[ids].astype(np.float32)
    classification = rna.celldata['ProjectClassification'][ids].values
    #
    barcode2ids = {j: i for i, j in enumerate(adt.celldata['Barcode'])}
    X_adt = adt.X[[barcode2ids[i] for i in cell_id]].astype(np.float32)
    #
    if filtered_genes:
      top_genes_path = os.path.join(preprocessed_path, 'top_genes')
      if os.path.exists(top_genes_path):
        with open(top_genes_path, 'rb') as f:
          top_genes = set(pickle.load(f))
      else:
        # The selection runs on a scratch object: filter_cells and
        # normalize_total modify it, while the returned dataset must hold
        # the raw counts of all cells, exactly as when the cache exists.
        # NOTE(review): the copy guards against an in-place normalization
        # of `X_rna` - confirm whether SingleCellOMIC already copies.
        scratch = SingleCellOMIC(X_rna.copy(),
                                 cell_id=cell_id,
                                 gene_id=rna.genenames,
                                 omic=OMIC.transcriptomic,
                                 name='mpalRNA')
        sc.pp.filter_cells(scratch, min_genes=200)
        sc.pp.filter_genes(scratch, min_cells=3)
        sc.pp.normalize_total(scratch, target_sum=1e4)
        result = sc.pp.filter_genes_dispersion(scratch.X,
                                               min_mean=0.0125,
                                               max_mean=3,
                                               min_disp=0.5,
                                               log=False,
                                               n_top_genes=2000)
        # make sure all marker genes are included
        gene_subset = result.gene_subset
        gene_indices = scratch.get_var_indices()
        for gene in MARKER_GENES:
          idx = gene_indices.get(gene, None)
          if idx is not None:
            gene_subset[idx] = True
        scratch._inplace_subset_var(gene_subset)
        selected = scratch.var_names.values
        with open(top_genes_path, 'wb') as f:
          pickle.dump(selected, f)
        top_genes = set(selected)
      ids = [i for i, j in enumerate(rna.genenames) if j in top_genes]
      sco = SingleCellOMIC(X_rna[:, ids],
                           cell_id=cell_id,
                           gene_id=rna.genenames[ids],
                           omic=OMIC.transcriptomic,
                           name='mpalRNA')
    else:
      sco = SingleCellOMIC(X_rna,
                           cell_id=cell_id,
                           gene_id=rna.genenames,
                           omic=OMIC.transcriptomic,
                           name='mpalRNAall')
    # loading dataset
    if ignore_na:
      # a column whose maximum is NaN is dropped
      ids = np.logical_not(np.isnan(np.max(X_adt, axis=0)))
      sco.add_omic(OMIC.proteomic, X_adt[:, ids], adt.genenames[ids])
    else:
      sco.add_omic(OMIC.proteomic, X_adt, adt.genenames)
    y, labels = _celltypes(classification)
    sco.add_omic(OMIC.celltype, y, labels)
    exon = {i: j for i, j in rna.genedata[['gene_name', 'exonLength']].values}
    sco.var['exonlength'] = np.array([exon[i] for i in sco.var_names],
                                     dtype=np.float32)
  ### load ATAC
  else:
    atac = all_data['atac']
    sco = SingleCellOMIC(atac.X.astype(np.float32),
                         cell_id=atac.celldata['Barcode'],
                         gene_id=atac.genenames,
                         omic=OMIC.atac,
                         name='mpalATAC')
    y, labels = _celltypes(atac.celldata['ProjectClassification'].values)
    sco.add_omic(OMIC.celltype, y, labels)
    sco.obs['clusters'] = atac.celldata['Clusters'].values
    sco.var['score'] = atac.genedata['score'].values
  return sco