def reconstruct():
    train_path = "../data/train_pbmc.h5ad"
    data = sc.read(train_path)
    ctrl_key = "control"
    stim_key = "stimulated"
    all_data = anndata.AnnData()
    print(data.obs["cell_type"].unique().tolist())
    for idx, cell_type in enumerate(data.obs["cell_type"].unique().tolist()):
        pca = PCA(n_components=100)
        train = data[~((data.obs["condition"] == stim_key) &
                       (data.obs["cell_type"] == cell_type))]
        pca.fit(train.X.A if sparse.issparse(train.X) else train.X)
        print(cell_type, end="\t")
        train_real_stimulated = data[data.obs["condition"] == stim_key, :]
        train_real_stimulated = train_real_stimulated[
            train_real_stimulated.obs["cell_type"] != cell_type]
        train_real_stimulated = scgen.util.balancer(train_real_stimulated)
        x_stim = (train_real_stimulated.X.A
                  if sparse.issparse(train_real_stimulated.X)
                  else train_real_stimulated.X)
        train_real_stimulated_PCA = pca.transform(x_stim)

        train_real_cd = data[data.obs["condition"] == ctrl_key, :]
        train_real_cd = scgen.util.balancer(train_real_cd)
        x_cd = (train_real_cd.X.A
                if sparse.issparse(train_real_cd.X)
                else train_real_cd.X)
        train_real_cd_PCA = pca.transform(x_cd)

        cell_type_adata = data[data.obs["cell_type"] == cell_type]
        cell_type_ctrl = cell_type_adata[cell_type_adata.obs["condition"] ==
                                         ctrl_key]
        cell_type_stim = cell_type_adata[cell_type_adata.obs["condition"] ==
                                         stim_key]
        if sparse.issparse(cell_type_ctrl.X):
            cell_type_ctrl_PCA = pca.transform(cell_type_ctrl.X.A)
        else:
            cell_type_ctrl_PCA = pca.transform(cell_type_ctrl.X)
        predicted_cells = predict(pca, train_real_cd_PCA,
                                  train_real_stimulated_PCA,
                                  cell_type_ctrl_PCA)
        if sparse.issparse(cell_type_ctrl.X):
            all_Data = sc.AnnData(
                np.concatenate(
                    [cell_type_ctrl.X.A, cell_type_stim.X.A, predicted_cells]))
        else:
            all_Data = sc.AnnData(
                np.concatenate(
                    [cell_type_ctrl.X, cell_type_stim.X, predicted_cells]))
        all_Data.obs["condition"] = [f"{cell_type}_ctrl"] * cell_type_ctrl.shape[0] + [f"{cell_type}_real_stim"] * \
                                    cell_type_stim.shape[0] + \
                                    [f"{cell_type}_pred_stim"] * len(predicted_cells)
        all_Data.obs["cell_type"] = [f"{cell_type}"] * (
            cell_type_ctrl.shape[0] + cell_type_stim.shape[0] +
            len(predicted_cells))
        all_Data.var_names = cell_type_adata.var_names

        if idx == 0:
            all_data = all_Data
        else:
            all_data = all_data.concatenate(all_Data)
        print(cell_type)
    sc.write("../data/reconstructed/PCAVecArithm/PCA_pbmc.h5ad", all_data)
Example #2
def vector_batch_removal():
    #projecting data to latent space
    latent_all = give_me_latent(data.X)
    latent_ann = sc.AnnData(latent_all)
    latent_ann.obs["cell_type"] = data.obs["cell_type"].tolist()
    latent_ann.obs["batch"] = data.obs["batch"].tolist()
    latent_ann.obs["sample"] = data.obs["sample"].tolist()
    unique_cell_types = np.unique(latent_ann.obs["cell_type"])
    shared_anns = []
    not_shared_ann = []
    for cell_type in unique_cell_types:
        temp_cell = latent_ann[latent_ann.obs["cell_type"] == cell_type]
        if (len(np.unique(temp_cell.obs["batch"])) < 2):
            cell_type_ann = latent_ann[latent_ann.obs["cell_type"] ==
                                       cell_type]
            not_shared_ann.append(cell_type_ann)
            continue
        print(cell_type)
        batch_list = {}
        max_batch = 0
        max_batch_ind = ""
        batches = np.unique(temp_cell.obs["batch"])
        for i in batches:
            temp = temp_cell[temp_cell.obs["batch"] == i]
            if max_batch < len(temp):
                max_batch = len(temp)
                max_batch_ind = i
            batch_list[i] = temp
        max_batch_ann = batch_list[max_batch_ind]
        for study in batch_list:
            delta = np.average(max_batch_ann.X, axis=0) - np.average(
                batch_list[study].X, axis=0)
            batch_list[study].X = delta + batch_list[study].X
        corrected = sc.AnnData.concatenate(*list(batch_list.values()))
        shared_anns.append(corrected)
    all_shared_ann = sc.AnnData.concatenate(*shared_anns)
    all_not_shared_ann = sc.AnnData.concatenate(*not_shared_ann)
    all_corrected_data = sc.AnnData.concatenate(all_shared_ann,
                                                all_not_shared_ann)
    #reconstructing data to gene expression space
    corrected = sc.AnnData(reconstruct(all_corrected_data.X, use_data=True))
    corrected.obs["cell_type"] = all_shared_ann.obs["cell_type"].tolist(
    ) + all_not_shared_ann.obs["cell_type"].tolist()
    corrected.obs["study"] = all_shared_ann.obs["sample"].tolist(
    ) + all_not_shared_ann.obs["sample"].tolist()
    corrected.var_names = data.var_names.tolist()
    #shared cell_types
    corrected_shared = sc.AnnData(reconstruct(all_shared_ann.X, use_data=True))
    corrected_shared.obs["cell_type"] = all_shared_ann.obs["cell_type"].tolist(
    )
    corrected_shared.obs["study"] = all_shared_ann.obs["sample"].tolist()
    corrected_shared.var_names = data.var_names.tolist()
    return corrected, corrected_shared
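Note: vector_batch_removal depends on a module-level data object plus give_me_latent/reconstruct helpers, presumably the encoder and decoder of a trained model. A hypothetical PCA stand-in that satisfies the same interface, for illustration only:

from scipy import sparse
from sklearn.decomposition import PCA

_pca = PCA(n_components=50)

def give_me_latent(X):
    # Stand-in encoder: project expression values into a latent space.
    return _pca.fit_transform(X.A if sparse.issparse(X) else X)

def reconstruct(latent, use_data=True):
    # Stand-in decoder: map latent coordinates back to expression space.
    return _pca.inverse_transform(latent)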
Example #3
def test_qc_metrics_format():
    a = np.random.binomial(100, .005, (1000, 1000))
    init_var = pd.DataFrame({
        "mito":
        np.concatenate((np.ones(100, dtype=bool), np.zeros(900, dtype=bool)))
    })
    adata_dense = sc.AnnData(X=a, var=init_var.copy())
    sc.pp.calculate_qc_metrics(adata_dense, qc_vars=["mito"], inplace=True)
    for fmt in [sparse.csr_matrix, sparse.csc_matrix, sparse.coo_matrix]:
        adata = sc.AnnData(X=fmt(a), var=init_var.copy())
        sc.pp.calculate_qc_metrics(adata, qc_vars=["mito"], inplace=True)
        assert np.allclose(adata.obs, adata_dense.obs)
        for col in adata.var:  # np.allclose doesn't like mix of types
            assert np.allclose(adata.var[col], adata_dense.var[col])
Example #4
def merge_matrix(ad, obskeys=None, use_raw=False, keep_only_mutual=False):
    '''Merge the matrices stored in ad.
    ad: dictionary of AnnData objects to merge
    obskeys: list of .obs columns to merge across the AnnData objects
    use_raw: if True, merge from .raw.X
    keep_only_mutual: if True, keep only .obs columns shared by every sample'''

    smp_list = list(ad.keys())
    obs_dict = defaultdict(list)
    obs_names = []

    for smp in smp_list:
        ad[smp].obs['name'] = smp

    if not obskeys:
        obskey_list = []
        obskeys = []
        for sample in smp_list:
            obskey_list.extend(list(ad[sample].obs.columns))
        for (obskey, number) in Counter(obskey_list).items():
            if number == len(smp_list):
                obskeys.append(obskey)
            elif not keep_only_mutual:
                for sample in smp_list:
                    if obskey not in ad[sample].obs.columns:
                        ad[sample].obs[obskey] = 'n/a'
                obskeys.append(obskey)

    for sample in smp_list:
        obs_names.extend(list(ad[sample].obs_names))
        for key in obskeys:
            obs_dict[key].extend(list(ad[sample].obs[key]))

    from scipy.sparse import vstack
    if use_raw:
        stack = vstack([ad[x].raw.X for x in smp_list])  # stack data
        adata = sc.AnnData(stack, var=ad[smp_list[0]].raw.var)
    else:
        stack = vstack([ad[x].X for x in smp_list])  # stack data
        adata = sc.AnnData(stack, var=ad[smp_list[0]].var)

    adata.obs_names = obs_names
    print(len(adata))
    for obs_col in obs_dict:
        print(obs_col)
        adata.obs[obs_col] = obs_dict[obs_col]
    return adata
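A usage sketch (file names are hypothetical; scipy.sparse.vstack requires each .X to be a sparse matrix):

import scanpy as sc

ad = {
    "sampleA": sc.read_h5ad("sampleA.h5ad"),
    "sampleB": sc.read_h5ad("sampleB.h5ad"),
}
merged = merge_matrix(ad, use_raw=False)
print(merged.obs["name"].value_counts())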
Example #5
def poi_data_gen(p, x_grid, Nc=10000, Nr=5, G=2, require_X=False, sigma=0.2):
    """Generate Poisson-noised reads for G genes whose underlying values are
    drawn from the distribution p over x_grid."""
    X = np.zeros([Nc, G], dtype=float)
    for i in range(G):
        temp = np.random.choice(x_grid, Nc, p=p, replace=True)
        X[:, i] = temp
    #X[:,-1] = 1
    #X = (X.T/np.sum(X,axis=1)).T    # normalize to be a probability distribution
    new_Nr = Nr * Nc / X.sum()

    ## sample the size factor
    size_factor = np.random.randn(Nc) * sigma + 1
    size_factor = size_factor.clip(min=0.5)

    ## generating the reads
    Y = np.random.poisson((X.T * size_factor).T * new_Nr)
    Y = sp.sparse.csr_matrix(Y)

    ## assign some fake gene names
    gene_name = []
    for i in range(G):
        gene_name.append('gene %d' % (i))
    var = pd.DataFrame(index=gene_name)

    data = sc.AnnData(Y, var=var)

    if require_X:
        return data, size_factor, X
    else:
        return data, size_factor
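Usage sketch, assuming a uniform sampling distribution over a small grid:

import numpy as np

x_grid = np.linspace(0.1, 1.0, 10)
p = np.ones(len(x_grid)) / len(x_grid)  # p must sum to 1
data, size_factor = poi_data_gen(p, x_grid, Nc=1000, Nr=5, G=2)
print(data)  # AnnData with 1000 cells and 2 fake genes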
Example #6
def poi_data_gen_nd(p, val, Nc=10000, Nr=5, sigma=0.2, random_seed=0):
    """Add Poisson noise to the data.
    """
    np.random.seed(random_seed)
    val_size, G = val.shape
    rand_ind = np.random.choice(np.arange(val_size), Nc, p=p, replace=True)
    X = val[rand_ind, :]

    new_Nr = Nr * Nc / X.sum()

    ## sample the size factor
    size_factor = np.random.randn(Nc) * sigma + 1
    #size_factor = np.random.randn(Nc)*0 + 1
    size_factor = size_factor.clip(min=0.5)

    ## generating the reads
    Y = np.random.poisson((X.T * size_factor).T * new_Nr)
    Y = sp.sparse.csr_matrix(Y)

    ## assign some fake gene names
    gene_name = []
    for i in range(G):
        gene_name.append('gene %d' % (i))
    var = pd.DataFrame(index=gene_name)

    data = sc.AnnData(Y, var=var)

    return data, size_factor
Example #7
def creatadata(datadir=None, exprmatrix=None, expermatrix_filename="matrix.mtx", is_mtx=True, cell_info=None, cell_info_filename="barcodes.tsv", gene_info=None, gene_info_filename="genes.tsv", project_name=None):
    """
    Construct an AnnData object.

    Construct an AnnData object from data in memory or from files on disk. If datadir is a directory, it must contain at least "matrix.mtx" or a data.txt (tab-separated, without any column or row names).
    """
    if datadir is None and exprmatrix is None and expermatrix_filename is None:
        raise ValueError("Please provide either the expression matrix or the full path to the expression matrix!")
    if datadir is not None:
        cell_and_gene_file = [f for f in os.listdir(datadir) if os.path.isfile(os.path.join(datadir, f))]
        if os.path.isdir(datadir) and is_mtx:  # sparse
            print("Start to read expression data (matrix.mtx)")
            x = sc.read_mtx(os.path.join(datadir, expermatrix_filename)).X.T
        else:  # dense, a matrix without row names or column names
            x = pd.read_csv(os.path.join(datadir, expermatrix_filename), sep="\t", header=None)
        if cell_info_filename in cell_and_gene_file:
            cell_info = pd.read_csv(os.path.join(datadir, cell_info_filename), sep="\t", header=0, na_filter=False)
        if gene_info_filename in cell_and_gene_file:
            gene_info = pd.read_csv(os.path.join(datadir, gene_info_filename), sep="\t", header=0, na_filter=False)
    else:
        x = exprmatrix  # n*p matrix, cell * gene
    # default names when no cell/gene annotations were provided or found
    if cell_info is None:
        cell_info = pd.DataFrame(["cell_" + str(i) for i in range(1, x.shape[0] + 1)], columns=["cellname"])
    if gene_info is None:
        gene_info = pd.DataFrame(["gene_" + str(i) for i in range(1, x.shape[1] + 1)], columns=["genename"])
    adata = sc.AnnData(x, obs=cell_info, var=gene_info)
    adata.obs_names = adata.obs["cellname"] if "cellname" in adata.obs.keys() else adata.obs.index
    adata.var_names = adata.var["genename"] if "genename" in adata.var.keys() else adata.var.index
    adata.obs_names_make_unique(join="-")
    adata.var_names_make_unique(join="-")
    adata.uns["ProjectName"] = "DEC_clust_algorithm" if project_name is None else project_name
    return adata
Example #8
def preprocess(X, nb_genes=500):
    """
    Preprocessing phase as proposed in the scanpy package.
    Keeps only the nb_genes most variable genes and normalizes
    the data to zero mean and unit standard deviation.
    Args:
        X (np.ndarray): raw count matrix (cells x genes).
        nb_genes (int, optional): number of genes to keep. Defaults to 500.
    Returns:
        np.ndarray: the preprocessed expression matrix.
    """
    X = np.ceil(X).astype(int)
    count_X = X
    print(X.shape, count_X.shape, f"keeping {nb_genes} genes")
    adata = sc.AnnData(X)

    adata = utils.normalize(adata,
                            copy=True,
                            highly_genes=nb_genes,
                            size_factors=True,
                            normalize_input=True,
                            logtrans_input=True)
    X = adata.X.astype(np.float32)
    return X
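utils.normalize above is project-specific. A rough scanpy-only approximation of the steps its arguments suggest (size-factor normalization, log transform, highly-variable-gene selection, scaling), stated as an assumption rather than the project's actual implementation:

import scanpy as sc

def normalize_scanpy(adata, highly_genes=500):
    # Approximate stand-in for utils.normalize: library-size normalization,
    # log1p, selection of the top variable genes, then unit-variance scaling.
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, n_top_genes=highly_genes, subset=True)
    sc.pp.scale(adata)
    return adata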
Example #9
def impute_neighbor(bdata, n_neighbor=10):
    from scipy.spatial import cKDTree
    import multiprocessing as mp
    import scipy

    n_jobs = mp.cpu_count()

    # Get neighborhood structure based on the UMAP embedding
    ckd = cKDTree(bdata.obsm["X_umap"])
    ckdout = ckd.query(x=bdata.obsm["X_umap"], k=n_neighbor, n_jobs=n_jobs)
    indices = ckdout[1]

    sum_list = []
    # average the raw expression of each cell's n_neighbor nearest
    # neighbors, processing cells in chunks of 10000
    for i in range(0, bdata.raw.X.shape[0], 10000):
        start = i
        end = min(i + 10000, bdata.raw.X.shape[0])
        X_list = [
            bdata.raw.X[indices[start:end, j]] for j in range(n_neighbor)
        ]
        X_sum = scipy.sparse.csr_matrix(sum(X_list) / n_neighbor)
        sum_list.append(X_sum)
        print(i)

    imputed = scipy.sparse.vstack(sum_list)
    idata = sc.AnnData(imputed)
    idata.obs = bdata.obs.copy()
    idata.var = bdata.raw.var.copy()
    idata.obsm = bdata.obsm.copy()
    idata.uns = bdata.uns.copy()

    return idata
Example #10
def filter_data(X, highly_genes=500):
    """
    Remove the less variable genes.

    Args:
        X (np.ndarray): raw count matrix (cells x genes).
        highly_genes (int, optional): number of genes to keep. Defaults to 500.

    Returns:
        (np.ndarray, np.ndarray): integer indices of the kept genes and cells.
    """

    X = np.ceil(X).astype(int)
    adata = sc.AnnData(X)

    sc.pp.filter_genes(adata, min_counts=3)
    sc.pp.filter_cells(adata, min_counts=1)
    sc.pp.normalize_per_cell(adata)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=4,
                                min_disp=0.5, n_top_genes=highly_genes, subset=True)
    genes_idx = np.array(adata.var_names.tolist()).astype(int)
    cells_idx = np.array(adata.obs_names.tolist()).astype(int)

    return genes_idx, cells_idx
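Usage sketch: the function returns integer indices into the original matrix, so the filtered data is recovered by fancy indexing:

# X: raw count matrix (cells x genes)
genes_idx, cells_idx = filter_data(X, highly_genes=500)
X_filtered = X[cells_idx][:, genes_idx]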
Example #11
def preprocess(hdf5_file, out_path, n_top_genes):

    h5f = h5py.File(hdf5_file, 'r')
    matrix = h5f['matrix'][:]

    adata = sc.AnnData(matrix)

    print(adata.X.shape)
    # do not normalize after cell_cycle effects are regressed out (negative values are introduced)
    #sc.pp.normalize_per_cell(adata)          # normalize with total UMI count per cell
    print(adata.X.shape)
    filter_result = sc.pp.filter_genes_dispersion(adata.X, flavor='cell_ranger', n_top_genes=n_top_genes, log=False)
    # filter results is a recarray
    # mask2 to select the top 1000 genes
    mask2 = filter_result.gene_subset
    adata = adata[:, mask2]
    #sc.pp.normalize_per_cell(adata)  # need to redo normalization after filtering

    # Writing the output hdf5 files
    matrix = adata.X

    f = h5py.File(out_path, "w")
    f.create_dataset(name = 'matrix', data = matrix)
    gg = f.create_group('gene_attrs')
    cg = f.create_group('cell_attrs')
    print(h5f['gene_attrs'].keys())
    for key in h5f['gene_attrs'].keys():
        # apply the mask to the gene attributes
        gg.create_dataset(name=key, data=h5f['gene_attrs'][key][:][mask2])
    for key in h5f['cell_attrs'].keys():
        cg.create_dataset(name=key, data=h5f['cell_attrs'][key][:])

    f.close()
    h5f.close()
Example #12
    def balancer(self, data):
        class_names = np.unique(data.obs[self.cell_type_key])
        class_pop = {}
        for cls in class_names:
            class_pop[cls] = len(data[data.obs[self.cell_type_key] == cls])

        max_number = np.max(list(class_pop.values()))

        all_data_x = []
        all_data_label = []
        all_data_condition = []

        for cls in class_names:
            temp = data[data.obs[self.cell_type_key] == cls]
            index = np.random.choice(range(len(temp)), max_number)
            temp_x = temp.X[index]
            all_data_x.append(temp_x)
            temp_ct = np.repeat(cls, max_number)
            all_data_label.append(temp_ct)
            # assumes every cell of a given class shares a single condition
            temp_cc = np.repeat(np.unique(temp.obs["condition"]), max_number)
            all_data_condition.append(temp_cc)

        balanced_data = sc.AnnData(np.concatenate(all_data_x))
        balanced_data.obs[self.cell_type_key] = np.concatenate(all_data_label)
        balanced_data.obs["condition"] = np.concatenate(all_data_label)

        class_names = np.unique(balanced_data.obs[self.cell_type_key])
        class_pop = {}
        for cls in class_names:
            class_pop[cls] = len(balanced_data[balanced_data.obs[self.cell_type_key] == cls])
        # print(class_pop)
        return balanced_data
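A usage sketch (hypothetical: balancer is a method of an object whose cell_type_key attribute names the .obs column holding the class labels):

# Oversample every class, with replacement, up to the size of the
# largest class before training.
balanced = model.balancer(train_adata)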
Example #13
def regress_batch_v2(adata, batch_key, confounder_key):
    '''Batch regression tool.
    batch_key: list of observation categories to be regressed out
    confounder_key: list of observation categories to be kept
    Returns ndata with corrected X, plus the explained component X_explained.'''

    from sklearn.linear_model import Ridge

    dummy = pd.get_dummies(adata.obs[batch_key + confounder_key],
                           drop_first=False)
    X_exp = adata.X  # scaled data
    if scipy.sparse.issparse(X_exp):
        X_exp = X_exp.todense()
    LR = Ridge(fit_intercept=False, alpha=1.0)
    LR.fit(dummy, X_exp)

    if len(batch_key) > 1:
        batch_index = np.logical_or.reduce(
            np.vstack([dummy.columns.str.startswith(x) for x in batch_key]))
    else:
        batch_index = np.vstack(
            [dummy.columns.str.startswith(x) for x in batch_key])[0]

    dm = np.array(dummy)[:, batch_index]
    X_explained = dm.dot(LR.coef_[:, batch_index].T)
    X_remain = X_exp - X_explained
    ndata = sc.AnnData(X_remain)
    ndata.obs = adata.obs
    ndata.var = adata.var
    return ndata, X_explained
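A usage sketch (the .obs column names are hypothetical):

# Regress out a technical batch while preserving cell-type structure.
ndata, X_explained = regress_batch_v2(adata,
                                      batch_key=["batch"],
                                      confounder_key=["cell_type"])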
Example #14
File: utilities.py Project: ZJQxxn/scGAN
def save_generated_cells(fake_cells, file_name, fake_labels=None):
    """
    Function that writes a gene expression matrix and the associated
    cluster indices into a file. See the AnnData documentation of the
    write method for the supported formats.

    Parameters
    ----------
    fake_cells : 2-D array
        A matrix (cells x genes) containing the expression levels.
        It can be dense or sparse. It will be encoded in a sparse format.
    file_name : str
        Path of the file to write to.
    fake_labels : array
        an array containing the cluster indices of the corresponding cells.
        Default is None.

    Returns
    -------

    """

    s_gen_mat = sp_sparse.csr_matrix(fake_cells)
    sc_fake = sc.AnnData(s_gen_mat)

    if fake_labels is not None:
        groups = fake_labels.astype('U')
        unique_groups = np.unique(groups)
        sc_fake.obs['cluster'] = pd.Categorical(
            values=groups, categories=natsorted(unique_groups))

    sc_fake.obs_names = np.repeat('fake', sc_fake.shape[0])
    sc_fake.obs_names_make_unique()

    sc_fake.write(file_name)
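A usage sketch with synthetic inputs:

import numpy as np

fake_cells = np.random.poisson(1.0, size=(100, 2000))
fake_labels = np.random.randint(0, 5, size=100)
save_generated_cells(fake_cells, "fake_cells.h5ad", fake_labels)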
Example #15
File: model.py Project: cartal/scjp
def get_common_var_raw(a,b):
    common = sorted(list(set(a.raw.var_names).intersection(set(b.raw.var_names))))
    list_a_names = list(a.raw.var_names)
    list_b_names = list(b.raw.var_names)
    a_index = np.array([list_a_names.index(x) for x in common])
    b_index = np.array([list_b_names.index(x) for x in common])
    print('calculating a...')
    a_new_X = a.raw.X[:,a_index]
    print('calculating b...')
    b_new_X = b.raw.X[:,b_index]
    a_new = sc.AnnData(a_new_X,obs = a.obs)
    a_new.obsm = a.obsm
    a_new.var_names = common
    b_new = sc.AnnData(b_new_X,obs = b.obs)
    b_new.obsm = b.obsm
    b_new.var_names = common
    return a_new,b_new
def train(data_name="pbmc", cell_type="CD4T", p_type="unbiased"):
    train_path = f"../data/train_{data_name}.h5ad"
    if data_name == "pbmc":
        ctrl_key = "control"
        stim_key = "stimulated"
        cell_type_key = "cell_type"
    elif data_name == "hpoly":
        ctrl_key = "Control"
        stim_key = "Hpoly.Day10"
        cell_type_key = "cell_label"
    elif data_name == "salmonella":
        ctrl_key = "Control"
        stim_key = "Salmonella"
        cell_type_key = "cell_label"
    data = sc.read(train_path)
    print("data has been loaded!")
    train = data[~((data.obs["condition"] == stim_key) &
                   (data.obs[cell_type_key] == cell_type))]
    pca = PCA(n_components=100)

    import scipy.sparse as sparse
    pca.fit(train.X.A if sparse.issparse(train.X) else train.X)

    train_real_cd = train[train.obs["condition"] == ctrl_key, :]
    if p_type == "unbiased":
        train_real_cd = scgen.util.balancer(train_real_cd)
    train_real_stimulated = train[train.obs["condition"] == stim_key, :]
    if p_type == "unbiased":
        train_real_stimulated = scgen.util.balancer(train_real_stimulated)

    import scipy.sparse as sparse
    if sparse.issparse(train_real_cd.X):
        train_real_cd.X = train_real_cd.X.A
        train_real_stimulated.X = train_real_stimulated.X.A

    train_real_stimulated_PCA = pca.transform(train_real_stimulated.X)
    train_real_cd_PCA = pca.transform(train_real_cd.X)

    adata_list = scgen.util.extractor(data, cell_type, {
        "ctrl": ctrl_key,
        "stim": stim_key
    })
    if sparse.issparse(adata_list[1].X):
        adata_list[1].X = adata_list[1].X.A
        adata_list[2].X = adata_list[2].X.A
    ctrl_CD4T_PCA = pca.transform(adata_list[1].X)
    predicted_cells = predict(pca, train_real_cd_PCA,
                              train_real_stimulated_PCA, ctrl_CD4T_PCA, p_type)

    all_Data = sc.AnnData(
        np.concatenate([adata_list[1].X, adata_list[2].X, predicted_cells]))
    all_Data.obs["condition"] = ["ctrl"] * len(adata_list[1].X) + ["real_stim"] * len(adata_list[2].X) + \
                                ["pred_stim"] * len(predicted_cells)
    all_Data.var_names = adata_list[3].var_names
    if p_type == "unbiased":
        sc.write(f"../data/reconstructed/PCAVecArithm/PCA_CD4T.h5ad", all_Data)
    else:
        sc.write(f"../data/reconstructed/PCAVecArithm/PCA_CD4T_biased.h5ad",
                 all_Data)
Example #17
def get_subset(idata, select, cc_genes=cc_genes, log=False, raw=True):
    if raw:
        adata = sc.AnnData(idata[select].raw.X)
        adata.var = idata.raw.var
    else:
        adata = sc.AnnData(idata[select].X)
        adata.var = idata.var
    adata.obs = idata.obs[select]
    adata.raw = adata.copy()
    #adata.X = scipy.sparse.csr_matrix(np.exp(adata.X.todense())-1)
    sc.pp.filter_genes_dispersion(adata, log=log)
    if log:
        sc.pp.log1p(adata)
    sc.pp.scale(adata, max_value=10)
    if len(cc_genes) > 0:
        remove_geneset(adata, cc_genes)
    sc.pp.pca(adata, n_comps=np.min([50, adata.X.shape[0], adata.X.shape[1]]))
    return adata
Example #18
def load_klein():
    df_klein = pd.read_csv('/data/martin/single_cell/klein/data', sep=',')
    index_name = list(df_klein.iloc[:, 0])
    mat_klein = df_klein.iloc[:, 1:].to_numpy(dtype=int).T
    # Convert to AnnData
    temp = sp.sparse.csr_matrix(mat_klein)
    data_klein = sc.AnnData(temp)
    data_klein.var_names = index_name
    return data_klein
Example #19
def read_dataset(input_file, transpose=False):
    """
    Construct a anndata object
       
    """
    if os.path.isfile(input_file):
        print("The value os", os.path.isfile(input_file))
        if str(input_file).endswith('h5ad'):
            adata = sc.read(input_file)
        elif any(str(input_file).endswith(i) for i in ["tsv", "TSV", "tab", "data"]):
            adata = sc.read_text(input_file, sep="\t", first_column_names=True)
            if transpose:
                adata = adata.T
        elif any(str(input_file).endswith(i) for i in ["csv", "CSV"]):
            adata = sc.read_text(input_file, sep=",", first_column_names=True)
            if transpose:
                adata = adata.T
        else:
            raise ValueError(
                "The file must be one of *.h5ad, *.tsv, *.TSV, *.tab, *.data, *.csv, *.CSV"
            )
    else:
        #read folder
        mtx = sc.read_mtx(os.path.join(input_file, "matrix.mtx"))
        num_lines = sum(
            1 for line in open(os.path.join(input_file, 'barcodes.tsv')))
        cellinfo = pd.read_csv(os.path.join(input_file, "barcodes.tsv"),
                               sep="\t",
                               header=None if num_lines == mtx.shape[1] else 0)
        if 'cellname' not in cellinfo.columns:
            cellinfo['cellname'] = cellinfo.iloc[:, 0]
        num_lines = sum(
            1 for line in open(os.path.join(input_file, 'genes.tsv')))
        geneinfo = pd.read_csv(os.path.join(input_file, "genes.tsv"),
                               sep="\t",
                               header=None if num_lines == mtx.shape[0] else 0)
        if 'genename' not in geneinfo.columns:
            # for 10x, the second column is the gene name and the first is the gene id
            geneinfo['genename'] = geneinfo.iloc[:, 1]
        #create anndata
        adata = sc.AnnData(mtx.X.T, obs=cellinfo, var=geneinfo)
        adata.obs_names = adata.obs["cellname"]
        adata.var_names = adata.var["genename"]
        adata.obs_names_make_unique(join="-")
        adata.var_names_make_unique(join="-")
    #create time
    now = datetime.datetime.now()
    adata.uns["ProjectName"] = "DESC created in" + str(
        now.strftime("%Y-%m-%d %H:%M"))
    print("Creat adata successfully! The adata infofation is", adata)
    return adata
Example #20
def load_klein_ercc():
    df_klein_ercc = pd.read_csv(
        '/data/martin/single_cell/ERCC_data/ERCC/klein.txt', sep=' ')
    index_name = list(df_klein_ercc.index)
    mat_klein_ercc = df_klein_ercc.to_numpy().T
    # Convert to AnnData
    temp = sp.sparse.csr_matrix(mat_klein_ercc)
    data_klein_ercc = sc.AnnData(temp)
    data_klein_ercc.var_names = index_name
    return data_klein_ercc
Example #21
def remove_cell_cycle(input_file, out_file):

    h5f = h5py.File(input_file, 'r')

    matrix = h5f['matrix'][:]
    gene_names = h5f['gene_attrs']['gene_names'][()]
    decoder = np.vectorize(lambda t: t.decode('UTF-8'))
    gene_names = decoder(gene_names)

    adata = sc.AnnData(X=matrix, var=pd.DataFrame(gene_names))

    # Load cell cycle genes defined in [Tirosh et al, 2015](https://doi.org/10.1126/science.aad0501).
    # It is a list of 97 genes, represented by their gene symbol.

    cell_cycle_genes = [
        x.strip() for x in open('./data/regev_lab_cell_cycle_genes.txt')
    ]

    s_genes = cell_cycle_genes[:43]
    g2m_genes = cell_cycle_genes[43:]
    cell_cycle_genes = [
        x for x in cell_cycle_genes if x in adata.var[0].values
    ]

    # this is needed otherwise scanpy cannot tell the index
    adata.var_names = gene_names

    # Log-transformation of data and scaling should always be performed before scoring
    # sc.pp.log1p(adata)
    sc.pp.normalize_per_cell(adata)
    #    sc.pp.scale(adata)

    # calculate the cell cycle scores
    sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)

    sc.pp.regress_out(adata, ['S_score', 'G2M_score'])
    #    sc.pp.scale(adata)

    matrix = adata.X
    cell_phase = np.array(adata.obs['phase'].values, dtype='S10')

    # write the output
    f = h5py.File(out_file, "w")
    f.create_dataset(name='matrix', data=matrix)
    gg = f.create_group('gene_attrs')
    cg = f.create_group('cell_attrs')
    cg.create_dataset(name='cell_phase', data=cell_phase)

    for key in h5f['gene_attrs'].keys():
        gg.create_dataset(name=key, data=h5f['gene_attrs'][key][()])
    for key in h5f['cell_attrs'].keys():
        cg.create_dataset(name=key, data=h5f['cell_attrs'][key][()])

    f.close()
    h5f.close()
Example #22
def load(
    loc="data_files",
    blocksize=1000000,
    anndata_write=True,
    anndata_name="mouse_retina.h5ad",
    X_dtype=np.float32,
):

    adata_fpath = os.path.join(loc, anndata_name)

    # if we've already downloaded and constructed the adata file, read it and use it
    if os.path.exists(adata_fpath) and os.path.isfile(adata_fpath):
        print("reading saved anndata h5ad file")
        adata = sc.read_h5ad(adata_fpath)

    # if the anndata file doesn't exist already, download inputs and construct it
    else:
        # download files if they don't exist locally
        if not os.path.exists(loc):
            os.makedirs(loc)
        files = {
            "10x_mouse_retina_development.mtx": "https://www.dropbox.com/s/6d76z4grcnaxgcg/10x_mouse_retina_development.mtx?dl=1",
            "10x_mouse_retina_development_phenotype.csv": "https://www.dropbox.com/s/y5lho9ifzoktjcs/10x_mouse_retina_development_phenotype.csv?dl=1",
            "10x_mouse_retina_development_feature.csv": "https://www.dropbox.com/s/1mc4geu3hixrxhj/10x_mouse_retina_development_feature.csv?dl=1",
        }
        print("downloading data files")
        for fname, url in files.items():
            if not os.path.exists(os.path.join(loc, fname)):
                download_file(url, loc=loc, blocksize=blocksize)

        # read in data
        print("reading data files")
        df_obs = pd.read_csv(
            os.path.join(loc, "10x_mouse_retina_development_phenotype.csv"), index_col=0
        )[["barcode", "sample", "age", "CellType"]]
        df_var = pd.read_csv(
            os.path.join(loc, "10x_mouse_retina_development_feature.csv"), index_col=0
        )[["id", "gene_short_name"]]
        count_mat = mmread(os.path.join(loc, "10x_mouse_retina_development.mtx"))

        # make anndata object
        print("constructing anndata object")
        adata = sc.AnnData(
            X=count_mat.toarray().astype(X_dtype).transpose(), obs=df_obs, var=df_var
        )
        genes_to_keep = np.mean(adata.X != 0, axis=0) > 0
        cells_to_keep = np.mean(adata.X != 0, axis=1) > 0
        adata = adata[:, genes_to_keep][cells_to_keep, :].copy()

        # save a local copy
        if anndata_write:
            print("saving annndata h5ad file")
            adata.write(adata_fpath)

    return adata
Example #23
def load_svensson_2x():
    input_folder = '/data/martin/single_cell/ERCC_data/ERCC'
    df_s2_ercc = pd.read_csv(
        '/data/martin/single_cell/ERCC_data/ERCC/svensson2X.txt', sep=' ')
    index_name = list(df_s2_ercc.index)
    mat_s2_ercc = df_s2_ercc.to_numpy().T
    # Convert to AnnData
    temp = sp.sparse.csr_matrix(mat_s2_ercc)
    data_s2_ercc = sc.AnnData(temp)
    data_s2_ercc.var_names = index_name
    return data_s2_ercc
Example #24
def to_AnnData(Y, gene_list=None):
    """ Convert a ndarray to AnnData with sparse csr reads
    """
    Y = sp.sparse.csr_matrix(Y)
    if gene_list is None:
        gene_list = []
        for i in range(Y.shape[1]):
            gene_list.append('gene %d' % (i))
    var = pd.DataFrame(index=gene_list)
    data = sc.AnnData(Y, var=var)
    return data
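Usage sketch:

import numpy as np

Y = np.random.poisson(1.0, size=(100, 20))
data = to_AnnData(Y)  # AnnData with sparse X and gene names "gene 0" ... "gene 19"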
Example #25
def load(
    split="train",
    original_fpath="/allen/aics/modeling/data/scRNAseq_SeeligCollaboration/data_for_modeling/scrnaseq_cardio_20181129.h5ad",
    cache_dir="data_cache",
    cache=True,
    selected_genes_path=None,
    threshold=0,
):
    """
    Load requested split of cardio data, where the whole dataset originated at original_fpath.
    Looks for local cache of split, and if it can't find that, makes a split on the fly.
    If cache=True, caches the result in cache_dir for next time.
    Loads raw count values.
    """

    original_fname = os.path.basename(original_fpath)
    original_bname, original_ext = os.path.splitext(original_fname)
    target_fname = "{0}_{1}{2}".format(original_bname, split, original_ext)
    target_fpath = os.path.join(cache_dir, target_fname)

    if not os.path.exists(target_fpath):
        adata_in = sc.read_h5ad(original_fpath)
        adata_raw = sc.AnnData(
            X=adata_in.raw.X.todense(),
            obs=adata_in.obs,
            var=adata_in.var,
            uns=adata_in.uns,
        )
        split_inds, split_adata = split_anndata(adata_raw)
        if cache:
            write_splits(
                split_inds_dict=split_inds,
                split_adata_dict=split_adata,
                basename=original_bname,
                out_dir=cache_dir,
            )

    adata = sc.read_h5ad(target_fpath)

    if selected_genes_path is not None:
        df = pd.read_csv(selected_genes_path, delimiter="\t")

        coding_genes = df["Gene name"].unique()
        coding_genes = [str(g) + "_HUMAN" for g in coding_genes]

        cols = np.array([c for c in adata.var.index if c in coding_genes])
        adata = adata[:, cols]

    gene_nz_freq = np.asarray((adata.X > 0).mean(axis=0)).ravel()
    adata = adata[:, gene_nz_freq > threshold]

    return adata
Example #26
def DCATransform(sc_data_matrix):

    # Create a scanpy AnnData object
    sc_data_matrix = sc.AnnData(numpy.transpose(sc_data_matrix.values))

    # Filter genes with count<2
    sc.pp.filter_genes(data=sc_data_matrix, min_counts=1)

    # Apply DCA transform
    dca(adata=sc_data_matrix, threads=4, epochs=10)

    print("DCA Denoised data prepared")

    return numpy.transpose(sc_data_matrix.X)
Example #27
def train(data_name="pbmc", cell_type="CD4T", p_type="unbiased"):
    train_path = f"../data/train_{data_name}.h5ad"
    if data_name == "pbmc":
        ctrl_key = "control"
        stim_key = "stimulated"
        cell_type_key = "cell_type"
    elif data_name == "hpoly":
        ctrl_key = "Control"
        stim_key = "Hpoly.Day10"
        cell_type_key = "cell_label"
    elif data_name == "salmonella":
        ctrl_key = "Control"
        stim_key = "Salmonella"
        cell_type_key = "cell_label"
    data = sc.read(train_path)
    print("data has been loaded!")
    ctrl_cell = data[(data.obs["condition"] == ctrl_key) & (data.obs[cell_type_key] == cell_type)]
    stim_cell = data[(data.obs["condition"] == stim_key) & (data.obs[cell_type_key] == cell_type)]

    train_real_cd = data[data.obs["condition"] == ctrl_key, :]
    if p_type == "unbiased":
        train_real_cd = scgen.util.balancer(train_real_cd)
    train_real_stimulated = data[data.obs["condition"] == stim_key, :]
    train_real_stimulated = train_real_stimulated[train_real_stimulated.obs[cell_type_key] != cell_type]
    if p_type == "unbiased":
        train_real_stimulated = scgen.util.balancer(train_real_stimulated)

    import scipy.sparse as sparse
    if sparse.issparse(train_real_cd.X):
        train_real_cd = train_real_cd.X.A
        train_real_stimulated = train_real_stimulated.X.A
    else:
        train_real_cd = train_real_cd.X
        train_real_stimulated = train_real_stimulated.X
    if sparse.issparse(ctrl_cell.X):
        ctrl_cell.X = ctrl_cell.X.A
        stim_cell.X = stim_cell.X.A
    predicted_cells = predict(train_real_cd, train_real_stimulated, ctrl_cell.X)

    print("Prediction has been finished")
    all_Data = sc.AnnData(np.concatenate([ctrl_cell.X, stim_cell.X, predicted_cells]))
    all_Data.obs["condition"] = ["ctrl"] * ctrl_cell.shape[0] + ["real_stim"] * stim_cell.shape[0] + \
                                ["pred_stim"] * len(predicted_cells)
    all_Data.var_names = ctrl_cell.var_names
    if p_type == "unbiased":
        sc.write(f"../data/reconstructed/VecArithm/VecArithm_CD4T.h5ad", all_Data)
    else:
        sc.write(f"../data/reconstructed/VecArithm/VecArithm_CD4T_biased.h5ad", all_Data)
Example #28
def init_scanpy(data,
                col_names,
                head_name,
                true_labels,
                fin,
                k=30,
                n_pcs=20,
                computeEmbedding=True):
    head_idx = np.where(true_labels == head_name)[0]
    if len(head_idx) > 1:
        D = pairwise_distances(data[head_idx, :], metric='euclidean')
        iroot = head_idx[np.argmin(D.sum(axis=0))]
    else:
        iroot = head_idx[0]

    adata = sc.AnnData(data)
    adata.var_names = col_names
    adata.obs['labels'] = true_labels
    adata.uns['iroot'] = iroot
    if computeEmbedding:
        if n_pcs:
            sc.pp.pca(adata, n_comps=n_pcs)
            sc.pp.neighbors(adata, n_neighbors=k, n_pcs=n_pcs)
        else:
            sc.pp.neighbors(adata, n_neighbors=k)

        sc.tl.louvain(adata, resolution=0.9)
        louvain_labels = np.array(list(adata.obs['louvain']))

        sc.tl.paga(adata)
        sc.tl.draw_graph(adata)
        sc.tl.diffmap(adata)
        sc.tl.tsne(adata)
        sc.tl.umap(adata)
        sc.tl.pca(adata, n_comps=2)

        sc.pl.paga(adata)
        sc.tl.draw_graph(adata, init_pos='paga')
    else:
        louvain_labels = []

    sc.settings.figdir = fin
    sc.settings.autosave = True
    # sc.settings.set_figure_params(dpi=80, dpi_save=300, color_map='Set1', format='pdf')
    sc.settings.set_figure_params(dpi=80, dpi_save=300, format='pdf')

    return adata, iroot, louvain_labels
Example #29
def run_leiden(data, params=None):
    """
    Performs Leiden community detection on given data.

    Args:
        data (np.ndarray): feature matrix (cells x features).
        params (dict, optional): keyword arguments passed on to sc.tl.leiden.

    Returns:
        list of int: cluster assignment for each cell.
    """
    import scanpy as sc
    adata = sc.AnnData(data)
    sc.pp.neighbors(adata, use_rep='X', n_neighbors=300, n_pcs=0)
    sc.tl.leiden(adata, **(params or {}))
    pred = adata.obs['leiden'].to_list()
    pred = [int(x) for x in pred]
    return pred
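A usage sketch (sc.tl.leiden requires the leidenalg package):

import numpy as np

labels = run_leiden(np.random.rand(500, 30), params={"resolution": 1.0})
print(len(set(labels)), "clusters")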
Example #30
def createAnnDataObject(cell_file, feature_file, count_file, feature_name):

    #read in files
    cell = pd.read_csv(cell_file, sep=',')
    feature = pd.read_csv(feature_file, sep=',')
    count = scipy.io.mmread(count_file)

    # transpose so that each row corresponds to a cell and each column to a gene or peak
    adata_t = sc.AnnData(count.toarray())
    adata = adata_t.transpose()

    # set indices for obs and var
    cell.set_index('sample', inplace=True)
    feature.set_index(feature_name, inplace=True)

    adata.obs = cell
    adata.var = feature

    return adata
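A usage sketch (file names are hypothetical; the cell file must contain a "sample" column and feature_name must be a column of the feature file):

adata = createAnnDataObject(cell_file="cells.csv",
                            feature_file="genes.csv",
                            count_file="counts.mtx",
                            feature_name="gene_id")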