# ----- Example 1 (score: 0) -----
def read_file(filename, transpose=False):
    """Load a single-cell expression matrix into an AnnData object.

    Parameters
    ----------
    filename : str
        Path to a 10x-MTX directory or to a .txt/.csv/.h5ad file.
    transpose : bool, optional (default: False)
        If True, transpose the matrix after reading.

    Returns
    -------
    anndata.AnnData
        The loaded data with ``X`` densified to a numpy ndarray.
    """
    if not os.path.exists(filename):
        sys.exit("ERROR: no such file or directory.")

    if os.path.isdir(filename):
        # A directory is assumed to contain a 10x-MTX triplet.
        adata = sc.read_10x_mtx(filename)
    else:
        _, filetype = os.path.splitext(filename)
        if filetype == ".txt":
            adata = sc.read_text(filename)
        elif filetype == ".csv":
            adata = sc.read_csv(filename)
        elif filetype == ".h5ad":
            adata = sc.read(filename)
        else:
            # Bug fix: an unrecognised extension previously left ``adata``
            # as None and crashed later at ``adata.X``; fail fast instead.
            print(
                "ERROR: the format must be [H5AD|CSV|TXT] for file or 10x-MTX for directory."
            )
            sys.exit()

    if transpose:
        adata = adata.transpose()

    # Densify sparse matrices so downstream numpy code can rely on ndarray.
    if not isinstance(adata.X, np.ndarray):
        X = adata.X.toarray()
        adata = anndata.AnnData(X, obs=adata.obs, var=adata.var)
    return adata
# ----- Example 2 (score: 0) -----
# File: utils.py  Project: HZAU-dl/fisheye
def reading(path):
    """Read a single-cell expression matrix from ``path``.

    Files ending in ``.h5`` or ``.h5ad`` are read as AnnData HDF5 files;
    anything else is treated as a plain-text matrix.
    """
    log.info("Reading single cell expression matrix.")
    # Bug fix: ".h5ad" does not end with ".h5", so .h5ad files previously
    # fell through to the text reader and failed to parse.
    if path.endswith((".h5", ".h5ad")):
        adata = sc.read_h5ad(path)
    else:
        adata = sc.read_text(path)
    adata.var_names_make_unique()
    return adata
# ----- Example 3 (score: 0) -----
def process_mereu(root_dir):
    """
    In this case, because names are informative, we only need to download the data, read the csv files and output
    the adatas.

    For each sequencing technique and organism, the matching TSV
    (genes x cells) is read and transposed, cells are restricted to those
    with a known cell-type annotation, genes detected in fewer than 5 cells
    are dropped, Ensembl IDs are converted to symbols, and the result is
    written to ``root_dir + "<technique>_<org>.h5ad"``.
    """
    tsv_dir = root_dir + "/tsv/"

    # Cell-type annotations keyed by cell barcode (the "colnames" column).
    df_cell_types_human = pd.read_csv(root_dir + "/cell_types/human.csv",
                                      index_col="colnames")
    df_cell_types_mouse = pd.read_csv(root_dir + "/cell_types/mouse.csv",
                                      index_col="colnames")

    list_techniques = [
        "CELseq2",
        "Dropseq",
        "QUARTZseq",
        "SMARTseq2",
        "SingleNuclei",
        "ddSEQ",
        "inDrop",
        "10X",
    ]
    file_list = os.listdir(tsv_dir)

    for technique in list_techniques:
        for org in ["mouse", "human"]:  # TODO: add mouse when I have the df
            print(technique, org)

            # First file whose name mentions both the technique and the
            # organism.  NOTE(review): assumes at least one such file exists
            # per pair; raises IndexError otherwise.
            file_select = [
                f for f in file_list if (technique in f) & (org in f)
            ][0]

            # TSVs are genes x cells, so transpose to AnnData's cells x genes.
            adata = sc.read_text(tsv_dir + file_select).transpose()
            adata.var_names_make_unique()

            # Keep only cells that carry a cell-type annotation.
            if org == "human":
                cells_select = np.intersect1d(df_cell_types_human.index.values,
                                              adata.obs_names.values)
                cell_types = (
                    df_cell_types_human["cell_types"].loc[cells_select].values)
            else:
                cells_select = np.intersect1d(df_cell_types_mouse.index.values,
                                              adata.obs_names.values)
                cell_types = (
                    df_cell_types_mouse["cell_types"].loc[cells_select].values)

            len_before, len_after = len(adata.obs_names), len(cells_select)
            print(
                f"{len_before} before removal, {len_after} after cell removal."
            )
            adata = adata[cells_select]

            adata.obs["cell_types"] = cell_types

            # Drop rarely-detected genes, map Ensembl IDs to symbols, and
            # write the per-technique/organism h5ad next to root_dir.
            sc.pp.filter_genes(adata, min_cells=5)
            adata = ensembl2symbol(adata, root_dir[:-1], org, ".")
            adata.write_h5ad(root_dir + f"{technique}_{org}.h5ad")
# ----- Example 4 (score: 0) -----
def get_single_batch(row_tpl, col_names, experiments_data_dir):
    """Load one batch's expression table and attach its metadata columns.

    ``row_tpl`` is an (index, row) pair as yielded by DataFrame.iterrows();
    the batch's text file lives under ``experiments_data_dir`` and is
    transposed so cells become observations.
    """
    meta_row = row_tpl[1]
    batch_id = meta_row[meta_data_columns_names.BATCH_ID]
    batch = sc.read_text(Path(experiments_data_dir, batch_id + ".txt")).T
    for column in col_names:
        batch.obs[column] = meta_row[column]
    logging.info(f"Reading , batch id - {batch_id}")
    return batch
# ----- Example 5 (score: 0) -----
def cluster(inputMat, modelName):
    """Run a standard scanpy QC + Leiden clustering pipeline on a matrix.

    ``inputMat`` (cells x genes) is round-tripped through a text file to
    keep a record of the data, filtered interactively on QC thresholds
    typed by the user, then clustered and plotted (violin QC plots, PCA
    and UMAP colored by Leiden cluster, saved under ``modelName``).
    """
    print('Working on {} cells and {} genes'.format(*inputMat.shape))
    # NOTE(review): output location is hard-coded to one user's home dir.
    dataPath = str('/home/ahmadazim/data/modelImputations' + modelName +
                   '.txt')
    # Output data as txt file
    np.savetxt(dataPath, inputMat)
    # Import data (export and then import to keep record of data)
    data = sc.read_text(dataPath)

    print("Data imported.")

    data.var_names_make_unique()
    # Flag mitochondrial genes (human-style "MT-" prefix) for QC metrics.
    data.var['mt'] = data.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(data,
                               qc_vars=['mt'],
                               percent_top=None,
                               log1p=False,
                               inplace=True)
    sc.pl.violin(data, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
                 jitter=0.4,
                 multi_panel=True)

    # Thresholds chosen interactively after inspecting the violin plots.
    maxGene = input(
        "Filter out all cells with n_genes_by_counts greater than: ")
    maxMT = input(
        "Filter out all cells with pct_counts_mt greater than (input \"NA\" to ignore): "
    )

    # NOTE(review): int() rejects fractional input such as "2.5"; float()
    # may be intended for the percentage threshold — confirm with author.
    data = data[data.obs.n_genes_by_counts < int(maxGene), :]
    if maxMT != "NA":
        data = data[data.obs.pct_counts_mt < int(maxMT), :]

    sc.pp.highly_variable_genes(data,
                                min_mean=0.0125,
                                max_mean=3,
                                min_disp=0.5)

    # Keep highly-variable genes, regress out technical covariates, and
    # clip scaled values at 10 standard deviations.
    data = data[:, data.var.highly_variable]
    sc.pp.regress_out(data, ['total_counts', 'pct_counts_mt'])
    sc.pp.scale(data, max_value=10)

    print("QC steps done.")

    sc.tl.pca(data, svd_solver='arpack')
    sc.pp.neighbors(data, n_neighbors=10, n_pcs=40)
    sc.tl.umap(data)
    sc.tl.leiden(data)

    print("Plotting PCA and UMAP...")

    sc.pl.pca(data, color='leiden', save=str(modelName + '.png'))
    sc.pl.umap(data, color='leiden', save=str(modelName + '.png'))
# ----- Example 6 (score: 0) -----
def txt_to_hfad(dge_in, dge_out):
    """Convert a text DGE matrix to a transposed, gzip-compressed .h5ad.

    Parameters
    ----------
    dge_in : str
        Path of the input text expression matrix.
    dge_out : str
        Path of the output .h5ad file.
    """
    k = sc.read_text(dge_in)
    # AnnData.transpose() swaps X, obs and var in one step, replacing the
    # original's manual shape bookkeeping (building an empty AnnData and
    # assigning X/obs/var by hand).
    k.transpose().write(dge_out, compression='gzip')
# ----- Example 7 (score: 0) -----
def ReadOldST(
    count_matrix_file: Union[str, Path] = None,
    spatial_file: Union[str, Path] = None,
    image_file: Union[str, Path] = None,
    library_id: str = "OldST",
    scale: float = 1.0,
    quality: str = "hires",
    spot_diameter_fullres: float = 50,
) -> AnnData:
    """\
    Load legacy ("old") Spatial Transcriptomics data.

    Parameters
    ----------
    count_matrix_file
        Path to the plain-text count matrix.
    spatial_file
        Path to the file with spot spatial locations.
    image_file
        Path to the tissue image.
    library_id
        Identifier for the visium library; change it when concatenating
        several adata objects.
    scale
        Scale factor applied to the image.
    quality
        Image quality used by stlearn; coordinates are stored in
        anndata.obs['imagecol' & 'imagerow'].
    spot_diameter_fullres
        Spot diameter at full resolution.

    Returns
    -------
    AnnData
    """
    # Read raw counts, then let stlearn attach spatial coordinates.
    counts = scanpy.read_text(count_matrix_file)
    adata = stlearn.add.parsing(counts, coordinates_file=spatial_file)

    # Register the tissue image and its scaling metadata on the object.
    stlearn.add.image(
        adata,
        library_id=library_id,
        quality=quality,
        imgpath=image_file,
        scale=scale,
        spot_diameter_fullres=spot_diameter_fullres,
    )
    return adata
# ----- Example 8 (score: 0) -----
# Script: compute a PAGA-initialised UMAP embedding for <wd>/NormExpr.txt
# and write the 2-D coordinates to <wd>/UMAP_Paga.txt.
import sys
import numpy as np
import scanpy as sc
from scanpy.tools._utils import get_init_pos_from_paga as get_paga

# Working directory is the single command-line argument.
wd = sys.argv[1]
adata = sc.read_text(filename="{}/NormExpr.txt".format(wd))
# Build the kNN graph directly on X (no PCA), then cluster and run PAGA.
sc.pp.neighbors(adata, use_rep='X', n_neighbors=30)
sc.tl.leiden(adata)
sc.tl.paga(adata, groups='leiden')
sc.pl.paga(adata)
# Seed UMAP from the PAGA layout for a graph-consistent embedding.
sc.tl.umap(adata, init_pos=get_paga(adata), n_components=2)
np.savetxt(X=adata.obsm['X_umap'],
           fname='{}/UMAP_Paga.txt'.format(wd),
           delimiter='\t')
# ----- Example 9 (score: 0) -----
# Reset matplotlib to defaults, then apply project-wide font/size settings.
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams.update({
    'font.sans-serif': 'Arial',
    'font.family': 'sans-serif',
    'axes.titlesize': 18,
    'axes.labelsize': 14,
})

#%% Load prenormed data and metadata
input_file = 'data/raw-data/airway-smoking-GSE134174/GSE134174_Processed_invivo_norm.txt'
metadata = pd.read_csv(
    'data/raw-data/airway-smoking-GSE134174/GSE134174_Processed_invivo_metadata.txt',
    sep='\t',
    index_col=0)
# Matrix is genes x cells on disk; transpose to cells x genes.
adata = sc.read_text(input_file).T
# NOTE(review): assigning obs wholesale assumes metadata rows align with
# the matrix's cell order/names — confirm upstream.
adata.obs = metadata
all_adata = adata.copy()
VARIANT = 'all'


#%%
def exploratory_plots(adata):
    """Print how many matrix entries are non-integer and plot the
    per-cell gene-count histogram."""

    # Count entries that are not whole numbers (sanity check that the
    # loaded matrix really is normalised rather than raw counts).
    num_non_int = (adata.to_df().applymap(
        float.is_integer) == False).sum().sum()
    print('Num non-int: ', num_non_int)
    plt.figure()
    # min_genes=0 keeps every cell but annotates obs.n_genes for the plot.
    sc.pp.filter_cells(adata, min_genes=0)
    plt.hist(adata.obs.n_genes, bins=500)
    plt.title('Genes per cell')
# ----- Example 10 (score: 0) -----
def upload(pathname):
    """Load an expression matrix from ``pathname`` into an AnnData object.

    The reader is chosen by file extension: .mat (MATLAB), .npz (numpy
    archive), .mtx (10x directory containing the file), .csv, .xlsx,
    .txt, or anything scanpy's generic ``sc.read`` understands.  For
    .mat/.npz the largest 2-D array becomes X, and every remaining 1-D
    array whose length matches a dimension of X becomes an obs (cells)
    or var (genes) column.
    """

    import anndata
    filename, file_extension = os.path.splitext(pathname)
    if file_extension == ".mat":
        x = loadmat(pathname)
        keys = []
        for key in x.keys():
            keys.append(key)

        #obs is the cell
        #var is gene
        #pick the largest
        # NOTE(review): every "+ 3" below assumes loadmat's first three
        # keys are the __header__/__version__/__globals__ metadata entries;
        # true for scipy.io.loadmat today, but fragile — confirm.
        largest = 3
        largest_size = 0
        for i in range(len(keys) - 3):
            if len(x[keys[i + 3]].shape) == 2:
                size = (x[keys[i + 3]].shape[0] * x[keys[i + 3]].shape[1])
            else:
                size = x[keys[i + 3]].shape[0]
            if size >= largest_size:
                largest = i + 3
                largest_size = size
        # Split the remaining arrays into obs/var columns by matching
        # their flattened length against X's rows (obs) or columns (var).
        obs_d, var_d = {}, {}
        for i in range(len(keys) - 3):
            if i != largest - 3:
                if (x[keys[i + 3]].flatten()).shape[0] == (
                        x[keys[largest]]).shape[0]:
                    obs_d[keys[i + 3]] = x[keys[i + 3]].flatten()
                elif (x[keys[i + 3]].flatten()).shape[0] == (
                        x[keys[largest]]).shape[1]:
                    var_d[keys[i + 3]] = x[keys[i + 3]].flatten()
                #else:
        obs_df = pd.DataFrame(data=obs_d)
        var_df = pd.DataFrame(data=var_d)

        # NOTE(review): .todense() assumes the .mat stores X sparse; a
        # dense array would raise AttributeError here — verify inputs.
        data = anndata.AnnData(X=x[keys[largest]].todense(),
                               obs=None if obs_df.empty else obs_df,
                               var=None if var_df.empty else var_df)

    elif file_extension == ".npz":
        x = np.load(pathname)
        #pick largest size file
        largest = 0
        largest_size = 0
        for i in range(len(x.files)):
            if len(x[x.files[i]].shape) == 2:
                size = (x[x.files[i]].shape[0] * x[x.files[i]].shape[1])
            else:
                size = x[x.files[i]].shape[0]
            if size >= largest_size:
                largest = i
                largest_size = size
        # Same obs/var split as the .mat branch, using len() comparisons
        # (len of a 2-D array is its row count).
        obs_d, var_d = {}, {}
        for i in range(len(x.files)):
            if i != largest:
                if len(x[x.files[i]].flatten()) == len(x[x.files[largest]]):
                    obs_d[x.files[i]] = x[x.files[i]].flatten()
                elif len(x[x.files[i]].flatten()) == len(
                        x[x.files[largest]][0]):
                    var_d[x.files[i]] = x[x.files[i]].flatten()
                #else:
        obs_df = pd.DataFrame(data=obs_d)
        var_df = pd.DataFrame(data=var_d)
        data = anndata.AnnData(X=x[x.files[largest]],
                               obs=None if obs_df.empty else obs_df,
                               var=None if var_df.empty else var_df)
    elif file_extension == ".mtx":
        # read_10x_mtx wants the directory holding the .mtx triplet.
        data = sc.read_10x_mtx(os.path.dirname(pathname))
        data.X = data.X.todense()
    elif file_extension == ".csv":
        data = sc.read_csv(pathname)
    elif file_extension == ".xlsx":
        data = sc.read_excel(pathname)
    elif file_extension == ".txt":
        data = sc.read_text(pathname)
    else:
        # Fallback: let scanpy infer the format.
        data = sc.read(pathname)

    print(pathname, " uploaded !")
    return data
# ----- Example 11 (score: 0) -----
# Script: load the Joost et al. dataset and set up a SoptSC analysis with
# named signalling pathways, ligand-receptor pairs and upregulated genes.
from matplotlib import pyplot as plt
import scanpy as sc
from scipy.sparse import csr_matrix
from soptsc import *
from _probability import *
import networkx as nx
import collections

# First initialise some settings for scanpy
sc.settings.verbosity = 3 # Possible values: (0) errors, (1) warnings, (2) info, (3) hints
sc.settings.set_figure_params(dpi = 80, facecolor='white')

# First load the data (we have to take the transpose, because we need the cells to be the rows and genes to be the columns)
# joostdata = sc.read_text('/Users/axelalmet/Documents/scRNASeqData/Joost2016/GSE67602_Joost_et_al_expression.txt').transpose() # Directory with the text file
joostdata = sc.read_text('/Users/axelalmet/Documents/MATLAB/SoptSC/Data/JoostData.txt').transpose() # Directory with the text file
joostdata.var_names_make_unique() # If var_names = 'gene_ids', when this step isn't necessary

# Log-transform (base 10) to match the original SoptSC preprocessing.
sc.pp.log1p(joostdata, base = 10) # For some reason Shuxiong does this

### Test that the soptsc object initialises correctly
joost_soptsc = SoptSC(joostdata)

# Test that we can store variables correctly
pathway_names = ['Tgfb', 'Wnt', 'Bmp'] # Name of the signalling_pathways
ligand_receptor_pairs = [[('Tgfb1', 'Tgfbr1'), ('Tgfb1', 'Tgfbr2'), ('Tgfb2', 'Tgfbr1'), ('Tgfb2', 'Tgfbr2')], \
                        [('Wnt3', 'Fzd1'), ('Wnt4', 'Fzd1'), ('Wnt5a', 'Fzd1'), ('Wnt6', 'Fzd1'), ('Wnt10a', 'Fzd1')], \
                        [('Bmp1', 'Bmpr2'), ('Bmp2', 'Bmpr2'), ('Bmp4', 'Bmpr2'), ('Bmp7', 'Bmpr2')]] # Name of the ligand-receptor pairs
upregulated_genes = [['Zeb2','Smad2','Wnt4','Wnt11','Bmp7','Sox9','Notch1'], \
                        ['Ctnnb1','Lgr5','Runx2','Apc','Mmp7','Dkk1','Ccnd1'], \
                        ['Crebbp','Fos','Id1','Jun','Runx1','Smad1','Smad5','Sox4','Cdh1']] # The upregulated genes
# scanpy HNSCC.py — downstream analysis of the HNSCC dataset.
import numpy as np
import pandas as pd
import scanpy as sc

sc.settings.verbosity = 3
sc.logging.print_versions()
results_file = '/home/ressf/Documenti/RessBachelorsThesisCode/Downstream_analysis/HNSCC/results_scanpy.h5ad'

# Tab-separated, pre-transposed (cells x genes per the filename) matrix
# loaded as float32 to halve memory use.
adata = sc.read_text(
    '/home/ressf/Documenti/RessBachelorsThesisCode/Downstream_analysis/HNSCC/hnscc_clean_trasp.txt',
    delimiter='\t',
    dtype='float32')

adata.var_names_make_unique()
adata

# Preprocessing: inspect top-expressed genes, then apply basic filters.
sc.pl.highest_expr_genes(
    adata,
    n_top=20,
)

sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# Flag mitochondrial genes (human "MT-" prefix) for QC metrics below.
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata,
                           qc_vars=['mt'],
                           percent_top=None,
                           log1p=False,
# NOTE(review): this snippet appears spliced from a plotly/sklearn notebook;
# the scanpy call immediately above it is truncated mid-argument.
py.init_notebook_mode(connected=False)

import plotly.graph_objs as go
from plotly.graph_objs import XAxis, YAxis, ZAxis, Scene
from sklearn.decomposition import FastICA as ICA
from sklearn.manifold import LocallyLinearEmbedding as LLE
from sklearn.manifold import SpectralEmbedding as LaplacianEigenMaps
from sklearn.manifold import Isomap
# In[2]:
# verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)

# In[3]:
adata = sc.read_text("C:/Users/saite/Desktop/Datasets/dataset1.txt")
#adata=sc.read_csv("C:/Users/saite/Desktop/Datasets/wang.csv")
# =============================================================================
# Reading the PBMC dataset (alternative input, kept for reference):
# adata=sc.read_10x_mtx(
#     'C:/Users/saite/Desktop/Datasets/PBMC/filtered_gene_bc_matrices/hg19',  # the directory with the `.mtx` file
#     var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
#     cache=True)
# =============================================================================
#adata=adata.transpose()
adata.var_names_make_unique()
#adata.obs_names_make_unique()
# In[4]:
print(adata)
# Computes, for each gene, the fraction of counts assigned to that gene within a cell.
sc.pl.highest_expr_genes(adata, n_top=20)
# ----- Example 14 (score: 0) -----
# File: paga.py  Project: yuxuth/lung-scrna
# Script: load the lung-scrna GEO count matrix plus cell metadata and
# attach precomputed cluster labels and tSNE coordinates.
import sys, os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import scvelo as scv
import scanpy as sc
import pandas as pd
import loompy

scv.settings.set_figure_params('scvelo')

proj_path = "/Users/kriemo/Projects/publication_repos/lung-scrna/results/revision_2/"
input_path = os.path.join(proj_path, "revision", "geo")

# Matrix is genes x cells on disk; transpose to cells x genes below.
adata = sc.read_text(os.path.join(input_path, "count_matrix.tsv.gz"),
                     delimiter="\t")
adata = adata.T

mdata = pd.read_csv(os.path.join(proj_path,
                                 "revision/geo/cell_metadata.tsv.gz"),
                    sep="\t")

# NOTE(review): positional assignment assumes mdata rows are in the same
# order as the matrix's cells — confirm upstream.
adata.obs["cluster"] = np.array(mdata["cluster"])
adata.obs["cell_type"] = np.array(mdata["cell_type"])

#add tSNE projections
tsne_mat = np.column_stack(
    (np.array(mdata["tSNE_1"]), np.array(mdata["tSNE_2"])))
adata.obsm['X_tsne'] = tsne_mat
good_clusters = [
# Combine datasets and calculate an alignment score.
# All required data can be obtained from the corresponding GEO database.

## load mp data
mpAdata = sc.read_10x_h5("mp_filtered_feature_bc_matrix.h5")
tempTools.plotCellScatter(mpAdata)
# Keep cells with a plausible number of detected genes.
mpAdata = mpAdata[mpAdata.obs.eval("500 <= n_genes <= 5000")]

## load science data
scienceAdata = sc.read_10x_h5("science_filtered_gene_bc_matrices.h5")
tempTools.plotCellScatter(scienceAdata)
# Drop duplicated gene symbols, then switch the index to Ensembl IDs.
scienceAdata = scienceAdata[:, ~scienceAdata.var.index.duplicated()]
scienceAdata.var.index = scienceAdata.var.gene_ids

## load dc data (comma-separated despite the scanpy text reader)
dcAdata = sc.read_text("dc_Root_single_cell_wt_datamatrix.csv", ",")
dcAdata = dcAdata.T
tempTools.plotCellScatter(dcAdata)

## load pp data
ppAdata = sc.read_text("pp_5way_merge_raw.tsv", "\t")
ppAdata = ppAdata.T
# Rebuild barcodes as "<barcode>-1-<lane-1>" from names like "S_1_BARCODE".
# NOTE(review): assumes obs names split on "_" into at least 3 fields with
# a trailing digit in field 1 — confirm against the raw files.
ppuseBarcodeLs = list(
    ppAdata.obs.index.str.split("_").map(
        lambda x: x[2] + "-1-" + str(int(x[1][-1]) - 1)
    )
)
ppRawAdatas = [
    sc.read_10x_h5(x)
    for x in [
        "pp_1_filtered_feature_bc_matrix.h5",
# ----- Example 16 (score: 0) -----
def read_sc_data(input_file,
                 fmt='h5ad',
                 backed=None,
                 transpose=False,
                 sparse=False,
                 delimiter=" ",
                 unique_name=True,
                 batch_name=None,
                 var_names="gene_symbols"):
    """\
    Read a single-cell dataset.

    Parameters
    ----------
    input_file : str
        The path of the file (or, for the 10x formats, the directory/file)
        to be read.
    fmt : str, optional (default: 'h5ad')
        File type: one of '10x_h5', '10x_mtx', 'mtx', 'h5ad', 'csv',
        'txt' or 'tsv'.
    backed : Union[Literal['r', 'r+'], bool, None] (default: None)
        If 'r', load AnnData in backed mode instead of fully loading it into
        memory (memory mode). Use 'r+' if backed attributes of the AnnData
        object need to be modified.
    transpose : bool, optional (default: False)
        Whether to transpose the read data.
    sparse : bool, optional (default: False)
        If True, convert ``adata.X`` to a float32 CSR sparse matrix.
    delimiter : str, optional (default: ' ')
        Delimiter that separates data within a text file. If None, splits
        at arbitrary runs of whitespace.
    unique_name : bool, optional (default: True)
        If True, make both var names and obs names unique.
    batch_name : str, optional (default: None)
        If given, stored for every cell in ``adata.obs["_batch"]``.
    var_names : Literal['gene_symbols', 'gene_ids'] (default: 'gene_symbols')
        The variables index used when ``fmt`` is '10x_mtx'.

    Returns
    -------
    :class:`~anndata.AnnData`
        adata

    Raises
    ------
    ValueError
        If ``fmt`` is not one of the supported format names.
    """
    if fmt == '10x_h5':
        adata = sc.read_10x_h5(input_file)
    elif fmt == '10x_mtx':
        adata = sc.read_10x_mtx(input_file, var_names=var_names)
    elif fmt == "mtx":
        adata = sc.read_mtx(input_file)
    elif fmt == 'h5ad':
        adata = sc.read_h5ad(input_file, backed=backed)
    elif fmt == "csv":
        adata = sc.read_csv(input_file)
    elif fmt == "txt":
        adata = sc.read_text(input_file, delimiter=delimiter)
    elif fmt == "tsv":
        adata = sc.read_text(input_file, delimiter="\t")
    else:
        # Bug fix: the old message claimed only '10x_h5'/'10x_mtx' were
        # valid even though seven formats are supported.
        raise ValueError(
            "`fmt` must be one of '10x_h5', '10x_mtx', 'mtx', 'h5ad', "
            "'csv', 'txt' or 'tsv', got %r" % (fmt,))
    if transpose:
        adata = adata.transpose()
    if sparse:
        adata.X = csr_matrix(adata.X, dtype='float32')
    if unique_name:
        adata.var_names_make_unique()
        adata.obs_names_make_unique()
    if batch_name is not None:
        adata.obs["_batch"] = batch_name
    return adata