Exemplo n.º 1
0
def read_10x_data(input_file,
                  format_type='10x_h5',
                  backed=None,
                  transpose=False,
                  sparse=False):
    """Read a single-cell expression matrix into an AnnData object.

    Parameters
    ----------
    input_file
        Path to the input file (or directory for the '10x_mtx' format).
    format_type
        One of '10x_h5', '10x_mtx', '10x_h5ad', '10x_csv', '10x_txt'.
    backed
        Passed through to ``sc.read_h5ad`` for on-disk (backed) access.
    transpose
        If True, transpose the matrix after loading.
    sparse
        If True, convert ``adata.X`` to a float32 CSR matrix.

    Returns
    -------
    AnnData

    Raises
    ------
    ValueError
        If ``format_type`` is not one of the supported formats.
    """
    if format_type == '10x_h5':
        adata = sc.read_10x_h5(input_file)
    elif format_type == '10x_mtx':
        adata = sc.read_10x_mtx(input_file)
    elif format_type == '10x_h5ad':
        adata = sc.read_h5ad(input_file, backed=backed)
    elif format_type == "10x_csv":
        adata = sc.read_csv(input_file)
    elif format_type == "10x_txt":
        adata = sc.read_csv(input_file, delimiter="\t")
    else:
        # bug fix: the old message only mentioned two of the five formats
        raise ValueError("`format_type` needs to be one of "
                         "'10x_h5', '10x_mtx', '10x_h5ad', '10x_csv' or '10x_txt'")

    if transpose:
        adata = adata.transpose()
    if sparse:
        adata.X = csr_matrix(adata.X, dtype='float32')
    # de-duplicate gene and cell names so downstream indexing is unambiguous
    adata.var_names_make_unique()
    adata.obs_names_make_unique()
    return adata
Exemplo n.º 2
0
def scanpy_deal():
    """Load the matrix configured in the global ``opt`` and print a summary."""
    # choose the reader based on whether the matrix needs transposing
    reader = sc.read if opt.need_transpose else sc.read_csv
    src_data = reader(opt.matrix_file, first_column_names=True)
    print('X:', src_data.X, ' \ncells:', src_data.obs, ' \ngenes:', src_data.var)
    print('cell name:', src_data.obs_names, '\ngene name:', src_data.var_names)
Exemplo n.º 3
0
def load_file(path):
    """
    Load single cell dataset from file

    Parameters
    ----------
    path
        the path store the file; resolution order is
        DATA_PATH + path + '.h5ad', then a 10x mtx directory, then a
        .csv/.txt/.tsv/.h5ad file (optionally gzipped)

    Return
    ------
    AnnData (with a sparse CSR ``X`` and unique var names)

    Raises
    ------
    ValueError
        if the path does not exist or the file format is not supported
    """
    if os.path.exists(DATA_PATH + path + '.h5ad'):
        adata = sc.read_h5ad(DATA_PATH + path + '.h5ad')
    elif os.path.isdir(path):  # mtx format
        adata = read_mtx(path)
    elif os.path.isfile(path):
        if path.endswith(('.csv', '.csv.gz')):
            adata = sc.read_csv(path).T
        elif path.endswith(('.txt', '.txt.gz', '.tsv', '.tsv.gz')):
            df = pd.read_csv(path, sep='\t', index_col=0).T
            adata = AnnData(df.values, dict(obs_names=df.index.values),
                            dict(var_names=df.columns.values))
        elif path.endswith('.h5ad'):
            adata = sc.read_h5ad(path)
        else:
            # bug fix: an unrecognized extension previously fell through
            # with `adata` unbound, crashing later with a NameError
            raise ValueError("Unsupported file format: {}".format(path))
    else:
        raise ValueError("File {} not exists".format(path))

    # downstream code expects a sparse matrix
    if not issparse(adata.X):
        adata.X = scipy.sparse.csr_matrix(adata.X)
    adata.var_names_make_unique()
    return adata
Exemplo n.º 4
0
def read_file(filename, transpose=False):
    """Read an expression matrix from a file or 10x-mtx directory.

    Accepts a directory (10x mtx), or a .txt/.csv/.h5ad file. Exits the
    process with an error message when the path is missing or the format
    is unsupported. Returns an AnnData whose ``X`` is a dense ndarray.
    """
    if not os.path.exists(filename):
        sys.exit("ERROR: no such file or directory.")

    adata = None
    if os.path.isdir(filename):
        adata = sc.read_10x_mtx(filename)
    elif os.path.isfile(filename):
        name, filetype = os.path.splitext(filename)
        if filetype == ".txt":
            print()
            adata = sc.read_text(filename)

        if filetype == ".csv":
            adata = sc.read_csv(filename)

        if filetype == ".h5ad":
            adata = sc.read(filename)

    # bug fix: an unsupported extension previously left `adata` as None and
    # crashed below with AttributeError instead of reporting the format error
    if adata is None:
        print(
            "ERROR: the format must be [H5AD|CSV|TXT] for file or 10x-MTX for directory."
        )
        sys.exit()

    if transpose:
        adata = adata.transpose()

    # densify sparse matrices so downstream code can assume an ndarray
    if not isinstance(adata.X, np.ndarray):
        X = adata.X.toarray()
        adata = anndata.AnnData(X, obs=adata.obs, var=adata.var)
    return adata
Exemplo n.º 5
0
def preliminaryAnalysis():
    """Standard scanpy preprocessing of the MSCNI count file.

    Reads the count CSV, transposes it, filters cells/genes, normalizes,
    log-transforms, keeps highly variable genes, regresses out total
    counts, scales and runs PCA. Returns the processed AnnData.
    """

    # f1. Read data (transposed after reading; file presumably genes x cells
    # on disk — TODO confirm)
    idata = scanpy.read_csv(
        '/Volumes/omics4tb2/alomana/projects/mscni/data/scanpy/count.file.all.day.clean.csv'
    )
    adata = idata.transpose()

    # f2. Preprocessing
    scanpy.pp.filter_cells(adata, min_genes=200)
    scanpy.pp.filter_genes(adata, min_cells=3)

    # total counts per cell, later removed as a covariate via regress_out
    adata.obs['n_counts'] = adata.X.sum(axis=1)

    scanpy.pp.normalize_per_cell(adata, counts_per_cell_after=1e5)
    scanpy.pp.log1p(adata)

    # snapshot the log-normalized data before subsetting to variable genes
    adata.raw = adata

    scanpy.pp.highly_variable_genes(adata,
                                    min_mean=0.0125,
                                    max_mean=6,
                                    min_disp=0.25)  # 2,851
    adata = adata[:, adata.var['highly_variable']]

    scanpy.pp.regress_out(adata, ['n_counts'])
    scanpy.pp.scale(adata, max_value=10)

    scanpy.tl.pca(adata, svd_solver='arpack')  ### there seem to be a bug

    return adata
Exemplo n.º 6
0
def cell_grouping(condition):
    """Cluster imputed scRNA-seq data and export marker-defined groups.

    Loads 'scRecover+scImpute_<condition>_condition.csv', runs the
    standard scanpy pipeline (filter, normalize, log1p, PCA, neighbors,
    UMAP, Leiden), then relabels each cell as '2c' / 'naive' / 'primed' /
    'unknown' by comparing Gata2 / Sox2 / Zic3 expression against their
    per-gene averages, and writes one transposed CSV per group.
    """

    # matrix is transposed after reading (file presumably genes x cells)
    adata = sc.read_csv('scRecover+scImpute_' + condition +
                        '_condition.csv').transpose()

    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)

    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    sc.tl.pca(adata, svd_solver='arpack')

    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

    sc.tl.umap(adata)

    # initial unsupervised clustering, plotted for reference
    sc.tl.leiden(adata)
    sc.pl.umap(adata, color='leiden')

    raw = pd.DataFrame(data=adata.X, columns=adata.var_names)

    # per-marker averages used as assignment thresholds
    avg_gata = np.average(raw['Gata2'].to_numpy())
    avg_sox = np.average(raw['Sox2'].to_numpy())
    avg_zic = np.average(raw['Zic3'].to_numpy())

    labels = []

    # priority: Gata2 ('2c') over Sox2 ('naive') over Zic3 ('primed')
    for i in range(0, len(raw)):
        if raw['Gata2'][i] > avg_gata:
            labels.append('2c')
        elif raw['Sox2'][i] > avg_sox:
            labels.append('naive')
        elif raw['Zic3'][i] > avg_zic:
            labels.append('primed')
        else:
            labels.append('unknown')

    # bug fix: set_index returns a new frame; the result was previously
    # discarded, leaving this statement a no-op
    raw = raw.set_index(adata.obs_names)

    # replace the Leiden cluster ids with the marker-derived labels
    adata.obs.leiden = labels

    sc.pl.umap(adata, color='leiden')

    adata_2c = adata[adata.obs.leiden == '2c']
    adata_naive = adata[adata.obs.leiden == 'naive']
    adata_primed = adata[adata.obs.leiden == 'primed']

    raw_2c = pd.DataFrame(adata_2c.X, columns=adata_2c.var_names)
    raw_naive = pd.DataFrame(adata_naive.X, columns=adata_naive.var_names)
    raw_primed = pd.DataFrame(adata_primed.X, columns=adata_primed.var_names)

    # export genes-as-rows CSVs, one per group
    raw_2c = raw_2c.transpose()
    raw_naive = raw_naive.transpose()
    raw_primed = raw_primed.transpose()

    raw_2c.to_csv(condition + '_2c.csv')
    raw_naive.to_csv(condition + '_naive.csv')
    raw_primed.to_csv(condition + '_primed.csv')
Exemplo n.º 7
0
def read_counts_and_phases(count_or_rpkm,
                           use_spike_ins,
                           biotype_to_use,
                           use_isoforms=False):
    '''
    Read data into scanpy; Read phases and FACS intensities
        - count_or_rpkm: Must be "Counts" or "Tpms"
        - use_spike_ins: if True, read the ".ercc.csv" spike-in variant
        - biotype_to_use: biotype name used to filter genes; None or ""
          disables the filter
        - use_isoforms: if True, read the isoform-level matrices
    Returns (adata, phases).
    '''
    read_file = f"input/RNAData/{count_or_rpkm}{'_Isoforms' if use_isoforms else ''}.csv" + (
        ".ercc.csv" if use_spike_ins else "")
    # idiom fix: `biotype_to_use != None and len(biotype_to_use) > 0`
    # is just a truthiness check on the string
    if biotype_to_use:
        print(f"filtering for biotype: {biotype_to_use}")
        biotype_file = f"{read_file}.{biotype_to_use}.csv"
        # build the biotype-filtered matrix once and cache it on disk
        if not os.path.exists(biotype_file):
            gene_info = pd.read_csv(
                f"input/RNAData/IdsToNames{'_Isoforms' if use_isoforms else ''}.csv.gz",
                index_col=False,
                header=None,
                names=["gene_id", "name", "biotype", "description"])
            biotyped = gene_info[gene_info["biotype"] ==
                                 biotype_to_use]["gene_id"]
            pd.read_csv(read_file)[biotyped].to_csv(biotype_file, index=False)
        read_file = biotype_file

    adata = sc.read_csv(read_file)
    print(f"data shape: {adata.X.shape}")
    # adata.raw = adata

    phases = pd.read_csv(
        "input/ProteinData/WellPlatePhasesLogNormIntensities.csv").sort_values(
            by="Well_Plate")

    # Assign phases and log intensities; require log intensity
    # NOTE: assumes `phases` rows align 1:1 with adata rows after sorting
    adata.obs["Well_Plate"] = np.array(phases["Well_Plate"])
    adata.obs["plate"] = np.array(
        [wp.split("_")[1] for wp in adata.obs["Well_Plate"]])
    adata.obs["phase"] = np.array(phases["Stage"])
    adata.obs["Green530"] = np.array(phases["Green530"])
    adata.obs["Red585"] = np.array(phases["Red585"])
    adata = adata[pd.notnull(adata.obs["Green530"]) & pd.notnull(
        adata.obs["Red585"])]  # removes dark mitotic cells

    # Read in fucci pseudotime from previous analysis
    if os.path.isfile("output/fucci_time.csv"):
        adata.obs["fucci_time"] = np.array(
            pd.read_csv("output/fucci_time.csv")["fucci_time"])

    # Get info about the genes
    gene_info = pd.read_csv(
        f"input/RNAData/IdsToNames{'_Isoforms' if use_isoforms else ''}.csv.gz",
        header=None,
        names=["name", "biotype", "description"],
        index_col=0)
    adata.var["name"] = gene_info["name"]
    adata.var["biotype"] = gene_info["biotype"]
    adata.var["description"] = gene_info["description"]

    return adata, phases
Exemplo n.º 8
0
def read_as_anndata(list_of_list: List[List[float]], roundoff_decimal: int = 5, filename: str = None) -> ad.AnnData:
	"""Round a matrix, write it to a temp CSV and read it back as AnnData.

	Parameters
	----------
	list_of_list
		Matrix of floats to persist.
	roundoff_decimal
		Number of decimals kept by ``u.roundoff``.
	filename
		Name of the CSV written inside the '__temp__' folder; required.

	Raises
	------
	ValueError
		If ``filename`` is None.
	"""
	# bug fix: the default filename=None was previously passed straight to
	# os.path.join, which raised an opaque TypeError
	if filename is None:
		raise ValueError("filename must be provided")

	temp_folder: str = '__temp__'
	complete_file_path: str = os.path.join(temp_folder, filename)

	list_of_list = [[u.roundoff(value, roundoff_decimal) for value in row] for row in list_of_list]

	u.create_path_if_not_exists(temp_folder)
	csv.writecsv(filename, list_of_list, directory=temp_folder)

	return sc.read_csv(complete_file_path)
Exemplo n.º 9
0
def read_10x_data(input_file, format_type='10x_h5', backed=None):
    """Read a 10x-style expression matrix into an AnnData object.

    Parameters
    ----------
    input_file
        Path to the input file (or directory for the '10x_mtx' format).
    format_type
        One of '10x_h5', '10x_mtx', '10x_h5ad', '10x_csv'.
    backed
        Passed through to ``sc.read_h5ad`` for on-disk (backed) access.

    Raises
    ------
    ValueError
        If ``format_type`` is not one of the supported formats.
    """
    if format_type == '10x_h5':
        adata = sc.read_10x_h5(input_file)
    elif format_type == '10x_mtx':
        adata = sc.read_10x_mtx(input_file)
    elif format_type == '10x_h5ad':
        adata = sc.read_h5ad(input_file, backed=backed)
    elif format_type == "10x_csv":
        adata = sc.read_csv(input_file)
    else:
        # bug fix: the old message only mentioned two of the four formats
        raise ValueError("`format_type` needs to be one of "
                         "'10x_h5', '10x_mtx', '10x_h5ad' or '10x_csv'")

    adata.var_names_make_unique()
    return adata
Exemplo n.º 10
0
 def readData(self,countsFile=""):
     """Load a counts matrix (10x directory, .h5ad or csv) into self.data.

     Falls back to self.CountsFile when countsFile is empty and returns
     "" on any error. For 10x directories, gzipped files are decompressed
     and features.tsv is renamed to genes.tsv before reading.
     """
     if countsFile=="":
         countsFile = self.CountsFile;

     if countsFile=="":
         print("please input counts file path");
         return ""

     self.CountsFile=countsFile;

     datapath = self.CountsFile;
     if os.path.isdir(datapath):
         # decompress gzipped 10x files in place (best effort)
         # NOTE(review): shell=True with an interpolated path is a
         # shell-injection risk if datapath is ever untrusted
         files = os.listdir(datapath)
         for i in files:
             if i.endswith(".gz"):
                 print(i)
                 target = datapath+"/*.gz";
                 print(target)
                 command = subprocess.Popen("gunzip "+target, shell=True, stdin=PIPE, stdout=PIPE,stderr=STDOUT)
                 output =command.stdout.read();
                 break;

         # CellRanger v3 names the gene file features.tsv; the legacy
         # reader layout expects genes.tsv
         files=os.listdir(datapath);
         for i in files:
             if i =="features.tsv":
                 os.rename(datapath+"/features.tsv",datapath+"/genes.tsv");
                 break;
         files = list(os.listdir(datapath));
         # bug fix: the original condition tested 'barcodes.tsv' twice and
         # never checked for the matrix file itself
         if ('barcodes.tsv' in files) and ('genes.tsv' in files) and ("matrix.mtx" in files):
             adata = sc.read_10x_mtx(datapath, var_names='gene_symbols');
             self.data=adata;
             self.preprocess();
         else:
             print("input data is not correct")
             return ""

     elif os.path.isfile(datapath):
         if datapath.endswith(".h5ad"):
             adata=sc.read_h5ad(datapath);
         else:
             # csv is transposed after reading (presumably genes x cells)
             adata = sc.read_csv(datapath)
             adata = adata.T;
         self.data=adata;
         #self.preprocess();
     else:
         print("file or dir not exists")
         return ""
Exemplo n.º 11
0
def normalizeTissue(file, dataDirectory, log):
    """Normalize one tissue expression file and write the result.

    Reads '<dataDirectory>/transpose/<file>', keeps only highly variable
    genes (seurat flavor), normalizes per cell, scales, and writes the
    normalized matrix to '<dataDirectory>/norm/<file with norm postfix>'.
    Progress is appended to ``log``. Returns the output file name.
    """
    path = "%s/transpose/%s" % (dataDirectory, file)
    log.write(path + "\n")
    tissue_transpose = sc.read_csv(path, first_column_names=True)
    log.write("Gene count (pre-filter): %s\n" %
              len(tissue_transpose.var_names))
    sc.pp.log1p(tissue_transpose)
    sc.pp.highly_variable_genes(tissue_transpose, flavor='seurat')
    highly_variable = tissue_transpose.var['highly_variable']
    # idiom fix: boolean-mask the Series directly instead of `== True`
    filter_result = highly_variable[highly_variable].index
    tissue_transpose = tissue_transpose[:, filter_result]
    log.write("Gene count (post-filter): %s\n" %
              len(tissue_transpose.var_names))
    sc.pp.normalize_per_cell(tissue_transpose, counts_per_cell_after=1)
    sc.pp.scale(tissue_transpose)
    tissue_norm = pd.DataFrame(data=tissue_transpose.X,
                               index=tissue_transpose.obs_names,
                               columns=tissue_transpose.var_names)
    tissue_norm.index.name = 'cell'
    normFile = addPostfix(file, 'norm')
    normPath = "%s/norm/%s" % (dataDirectory, normFile)
    tissue_norm.to_csv(normPath, index=True)
    return normFile
Exemplo n.º 12
0
def main(args):
    """Command-line driver for PASTE slice alignment.

    ``args.filename`` alternates expression CSVs and coordinate CSVs, two
    entries per slice. Depending on ``args.mode`` this computes pairwise
    alignments between consecutive slices or a center alignment, writing
    the transport plans (and optionally new coordinates) under
    ``<args.direc>/paste_output``.
    """
    # print(args)
    n_slices = int(len(args.filename) / 2)
    # Error check arguments
    if args.mode != 'pairwise' and args.mode != 'center':
        raise (ValueError("Please select either 'pairwise' or 'center' mode."))

    if args.alpha < 0 or args.alpha > 1:
        raise (ValueError("alpha specified outside [0, 1]"))

    if args.initial_slice < 1 or args.initial_slice > n_slices:
        raise (ValueError("Initial slice specified outside [1, n]"))

    # lambda weights default to uniform; user values are validated and
    # normalized into a probability vector
    if len(args.lmbda) == 0:
        lmbda = n_slices * [1. / n_slices]
    elif len(args.lmbda) != n_slices:
        raise (ValueError("Length of lambda does not equal number of files"))
    else:
        if not all(i >= 0 for i in args.lmbda):
            raise (ValueError("lambda includes negative weights"))
        else:
            print("Normalizing lambda weights into probability vector.")
            lmbda = args.lmbda
            lmbda = [float(i) / sum(lmbda) for i in lmbda]

    # create slices
    # even entries are expression CSVs, odd entries spatial-coordinate CSVs
    slices = []
    for i in range(n_slices):
        s = sc.read_csv(args.filename[2 * i])
        s.obsm['spatial'] = np.genfromtxt(args.filename[2 * i + 1],
                                          delimiter=',')
        slices.append(s)

    # per-spot weights default to uniform; user-supplied files are
    # normalized to sum to 1
    if len(args.weights) == 0:
        for i in range(n_slices):
            slices[i].obsm['weights'] = np.ones(
                (slices[i].shape[0], )) / slices[i].shape[0]
    elif len(args.weights) != n_slices:
        raise (ValueError(
            "Number of slices {0} != number of weight files {1}".format(
                n_slices, len(args.weights))))
    else:
        for i in range(n_slices):
            slices[i].obsm['weights'] = np.genfromtxt(args.weights[i],
                                                      delimiter=',')
            slices[i].obsm['weights'] = slices[i].obsm['weights'] / np.sum(
                slices[i].obsm['weights'])

    # optional warm-start transport plans (one per pair in pairwise mode,
    # one per slice in center mode)
    if len(args.start) == 0:
        pis_init = (n_slices - 1) * [None] if args.mode == 'pairwise' else None
    elif (args.mode == 'pairwise' and len(args.start) != n_slices - 1) or (
            args.mode == 'center' and len(args.start) != n_slices):
        raise (ValueError(
            "Number of slices {0} != number of start pi files {1}".format(
                n_slices, len(args.start))))
    else:
        pis_init = [
            pd.read_csv(args.start[i], index_col=0).to_numpy()
            for i in range(len(args.start))
        ]

    # create output folder
    output_path = os.path.join(args.direc, "paste_output")
    if not os.path.exists(output_path):
        os.mkdir(output_path)

    if args.mode == 'pairwise':
        print("Computing pairwise alignment.")
        # compute pairwise align
        pis = []
        for i in range(n_slices - 1):
            pi = pairwise_align(slices[i],
                                slices[i + 1],
                                args.alpha,
                                dissimilarity=args.cost,
                                a_distribution=slices[i].obsm['weights'],
                                b_distribution=slices[i + 1].obsm['weights'],
                                G_init=pis_init[i])
            pis.append(pi)
            # save each transport plan indexed by the two slices' spot names
            pi = pd.DataFrame(pi,
                              index=slices[i].obs.index,
                              columns=slices[i + 1].obs.index)
            output_filename = "paste_output/slice" + str(
                i + 1) + "_slice" + str(i + 2) + "_pairwise.csv"
            pi.to_csv(os.path.join(args.direc, output_filename))
        if args.coordinates:
            # optionally write the stacked (aligned) coordinates per slice
            new_slices = stack_slices_pairwise(slices, pis)
            for i in range(n_slices):
                output_filename = "paste_output/slice" + str(
                    i + 1) + "_new_coordinates.csv"
                np.savetxt(os.path.join(args.direc, output_filename),
                           new_slices[i].obsm['spatial'],
                           delimiter=",")
    elif args.mode == 'center':
        print("Computing center alignment.")
        initial_slice = slices[args.initial_slice - 1].copy()
        # compute center align
        center_slice, pis = center_align(
            initial_slice,
            slices,
            lmbda,
            args.alpha,
            args.n_components,
            args.threshold,
            dissimilarity=args.cost,
            distributions=[slices[i].obsm['weights'] for i in range(n_slices)],
            pis_init=pis_init)
        # save the NMF factors of the center slice
        W = pd.DataFrame(center_slice.uns['paste_W'],
                         index=center_slice.obs.index)
        H = pd.DataFrame(center_slice.uns['paste_H'],
                         columns=center_slice.var.index)
        W.to_csv(os.path.join(args.direc, "paste_output/W_center"))
        H.to_csv(os.path.join(args.direc, "paste_output/H_center"))
        for i in range(len(pis)):
            output_filename = "paste_output/slice_center_slice" + str(
                i + 1) + "_pairwise.csv"
            pi = pd.DataFrame(pis[i],
                              index=center_slice.obs.index,
                              columns=slices[i].obs.index)
            pi.to_csv(os.path.join(args.direc, output_filename))
        if args.coordinates:
            center, new_slices = stack_slices_center(center_slice, slices, pis)
            for i in range(n_slices):
                output_filename = "paste_output/slice" + str(
                    i + 1) + "_new_coordinates.csv"
                np.savetxt(os.path.join(args.direc, output_filename),
                           new_slices[i].obsm['spatial'],
                           delimiter=",")
            np.savetxt(os.path.join(args.direc,
                                    "paste_output/center_new_coordinates.csv"),
                       center.obsm['spatial'],
                       delimiter=",")
    return
Exemplo n.º 13
0
os.makedirs(sys.argv[1])  #create the output directory
os.chdir(sys.argv[1])

sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
results_file = './write/organ9_concatenate.h5ad'  # the file that will store the analysis results
sc.settings.autosave = True  # save figures, do not show them
sc.settings.set_figure_params(
    dpi=300,
    frameon=False)  # low dpi (dots per inch) yields small inline figures

### input gene expression matrix of each organ
# each matrix is transposed after reading (files presumably genes x cells)
#Kidney
data1 = sc.read_csv('Kidney_rawcount.txt',
                    delimiter='\t',
                    first_column_names=None,
                    dtype='float32')
adata1 = data1.T
adata1
adata1.X.shape
#Liver
data2 = sc.read_csv('Liver_rawcount.txt',
                    delimiter='\t',
                    first_column_names=None,
                    dtype='float32')
adata2 = data2.T
adata2
adata2.X.shape
#Lung
data3 = sc.read_csv('Lung_rawcount.txt',
                    delimiter='\t',
Exemplo n.º 14
0
# keep the raw label values (celltype_x is presumably a pandas object
# loaded above this fragment)
celltype_x = celltype_x.values
print(celltype_x)

# Seurat-predicted cell types for the ATAC data (one label per cell)
seurat_celltype_path = base_path + 'dann_vae/atac/seurat_pred_type.csv'
celltype_seurat = pd.read_csv(seurat_celltype_path, index_col=0)
celltype_seurat = list(celltype_seurat.values.flatten())
print(celltype_seurat)

# encode string labels as integer classes
encoder = LabelEncoder()
orig_label = encoder.fit_transform(celltype_x)
# bug fix: assigning to .dtype reinterprets the raw buffer in place
# (corrupting values where fit_transform returns int32); astype performs
# a real conversion
orig_label = orig_label.astype('int64')

batch_size = 100
epochs = 25

adata1 = sc.read_csv(file1)
adata2 = sc.read_csv(file2)
adata_davae = sc.read_h5ad(davae_path)
data = adata_davae.X
# data = adata_davae.obsm['davae']

len1 = adata1.shape[0]
len2 = adata2.shape[0]

# the DAVAE embedding is ordered [dataset1 cells, dataset2 cells]
test_set = data[0:len1, ]
train_set = data[len1:len1 + len2, ]

label = to_categorical(orig_label)
class_num = label.shape[1]

net_x = CLASSIFIER(input_size=train_set.shape[1], class_num=class_num)
Exemplo n.º 15
0
@author: antho
"""

import os
import pandas as pd
import numpy as np
import scvelo as scv
import scanpy as sc
import shutil
import matplotlib.pyplot as plt
plt.rcParams['pdf.fonttype'], plt.rcParams['ps.fonttype'], plt.rcParams[
    'savefig.dpi'] = 42, 42, 300  #Make PDF text readable
plt.rcParams['figure.figsize'] = (10, 10)

# Load TPMs (protein-coding subset) and re-attach well-plate obs names
adata = sc.read_csv("input/RNAData/Tpms.csv.protein_coding.csv")
adata.obs_names = pd.read_csv("input/RNAData/Tpms.obs_names.csv")["well_plate"]
phases = pd.read_csv("input/ProteinData/WellPlatePhasesLogNormIntensities.csv"
                     ).sort_values(by="Well_Plate")

# Assign phases and log intensities; require log intensity
# NOTE(review): relies on `phases` rows aligning 1:1 with adata rows after
# the sort — verify against the upstream file ordering
adata.obs["phase"] = np.array(phases["Stage"])
adata.obs["Green530"] = np.array(phases["Green530"])
adata.obs["Red585"] = np.array(phases["Red585"])
adata = adata[pd.notnull(adata.obs["Green530"])
              & pd.notnull(adata.obs["Red585"])]  # removes dark mitotic cells
adata.obs["fucci_time"] = np.array(
    pd.read_csv("output/fucci_time.csv")["fucci_time"])

# Get info about the genes
gene_info = pd.read_csv("input/RNAData/IdsToNames.csv.gz",
Exemplo n.º 16
0
    # make the var index unique only when explicitly requested via args
    if len(np.unique(adata.var.index)) < len(
            adata.var.index) and args.make_var_index_unique:
        adata.var_names_make_unique()
        print("Making AnnData var index unique...")
    # Sort var index
    adata = adata[:, np.sort(adata.var.index)]
    print("Writing 10x data to h5ad...")
    adata.write_h5ad(filename="{}.h5ad".format(FILE_PATH_OUT_BASENAME))

elif INPUT_FORMAT in ['tsv', 'csv'] and OUTPUT_FORMAT == 'h5ad':
    # pick the delimiter matching the declared input format
    if INPUT_FORMAT == 'tsv':
        delim = '\t'
    elif INPUT_FORMAT == 'csv':
        delim = ','
    # Expects csv/tsv to have features as rows and observations as columns
    adata = sc.read_csv(FILE_PATH_IN, delimiter=delim,
                        first_column_names=True).T
    # Convert to sparse matrix
    adata.X = csr_matrix(adata.X)
    adata = add_sample_id(adata=adata, args=args)
    # If is tag_cell_with_sample_id is given, add the sample ID as suffix
    if args.tag_cell_with_sample_id:
        adata = tag_cell(adata=adata,
                         tag=args.sample_id,
                         remove_10x_gem_well=args.remove_10x_gem_well)
    adata.var.index = adata.var.index.astype(str)
    # Check if var index is unique
    if len(np.unique(adata.var.index)) < len(
            adata.var.index) and not args.make_var_index_unique:
        raise Exception(
            "VSN ERROR: AnnData var index is not unique. This can be fixed by making it unique. To do so update the following param 'makeVarIndexUnique = true' (under params.sc.sc_file_converter) in your config."
        )
Exemplo n.º 17
0
import argparse  # bug fix: argparse is used below but was not imported here
import scanpy as sc
import numpy as np

sc.settings.autosave = True

parser = argparse.ArgumentParser()
parser.add_argument('-i', dest='input', help='counts csv file')
args = parser.parse_args()

#count_csv = 'rsc/tasic_scRNAseq/full_scRNAseq/GSE71585_RefSeq_counts.csv'
count_csv = args.input

# read in counts csv
# bug fix: first_column_names was passed the built-in `bool` type (always
# truthy) instead of the intended boolean True
adata = sc.read_csv(count_csv,
                    delimiter=',',
                    first_column_names=True,
                    dtype='float32')

# need to transpose (file presumably genes x cells on disk)
tdata = sc.AnnData.transpose(adata)

print(tdata)

# Basic pre-processing
# filter out cells have less than 200 genes expressed
sc.pp.filter_cells(tdata, min_genes=200)
print(tdata.obs['n_genes'].min())
# filter out genes expressed in less than 3 cells
sc.pp.filter_genes(tdata, min_cells=3)
print(tdata.var['n_cells'].min())
Exemplo n.º 18
0
# %%
import os
import scanpy as sc
from scipy import sparse

# %%

# Klein time-course CSVs, one per day of LIF withdrawal
adataD0 = sc.read_csv('./data/Klein/GSM1599494_ES_d0_main.csv.bz2')
adataD2 = sc.read_csv('./data/Klein/GSM1599497_ES_d2_LIFminus.csv.bz2')
adataD4 = sc.read_csv('./data/Klein/GSM1599498_ES_d4_LIFminus.csv.bz2')
adataD7 = sc.read_csv('./data/Klein/GSM1599499_ES_d7_LIFminus.csv.bz2')

# %%

# concatenate the transposed matrices, labelling each day in obs['cluster']
adata = sc.AnnData.concatenate(adataD0.T, adataD2.T, adataD4.T, adataD7.T, batch_key='cluster',
                               batch_categories=['d0', 'd2', 'd4', 'd7', ])
adata.X = sparse.csr_matrix(adata.X)

# %%

sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)

# keep only cells with total_counts below 75000
adata = adata[adata.obs.total_counts < 75000, :]

# sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')
# sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts'], jitter=False, multi_panel=True)

sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# snapshot before further processing
adata.raw = adata
# Load up whitfield dataset & incorporate ideal vector data from Figure 1 to meta
whit1 = ['whitfield_dataPlusScores_6_30_2020_', '_1134.csv', '1134']
mod1 = 'quantile'
# experiment name -> [start, end] indices (presumably column ranges used below)
experiments = {
    'TT1': [0, 12],
    'TT2': [12, 38],
    'TT3': [38, 86],
    'TN': [86, 105],
    'SHAKE': [105, 114]
}
whitfield = {}
print(whit1, mod1)
for exp1 in experiments:
    # read one experiment's matrix; samples become rows after .T
    whitfield[exp1] = sc.read_csv('data/Whitfield/data/' + whit1[0] + exp1 +
                                  whit1[1],
                                  first_column_names=True).T
    # map numeric probe ids to Ensembl gene ids via the g2e lookup table
    var_names = [
        str(g2e.loc[float(i), 'Gene stable ID'])
        for i in whitfield[exp1].var_names if float(i) in g2e.index
    ]
    # keep only the columns whose probe id is present in g2e
    whitfield[exp1] = whitfield[exp1][:, [
        True if float(i) in g2e.index else False
        for i in whitfield[exp1].var_names
    ]]
    whitfield[exp1].var_names = pd.Index(var_names)
    if mod1 == 'quantile':
        tmp = whitfield[exp1].X
        whitfield[exp1].X = quantile_transform(tmp, axis=1)
    # NOTE(review): rstrip('.0') strips ALL trailing '.' and '0' characters,
    # not the literal suffix '.0' (e.g. '100.0' -> '1') — likely a bug;
    # str.removesuffix('.0') is probably what was intended
    whitfield[exp1].var_names = [
        i.rstrip('.0') for i in whitfield[exp1].var_names
Exemplo n.º 20
0
import sys
import numpy as np
import pandas as pd
import scanpy as sc  #1.4.3
import anndata
import bbknn  #1.3.4
#####################
print('Start')
# BATCH, PCA, NB, NT, OUTPUT are presumably defined above this fragment
fi = open(BATCH)
batch = []
for line in fi:
    seq = line.rstrip().split(',')
    batch = batch + seq
fi.close()
batch = batch[1:]  # drop the first entry (presumably a header field)
used_pca = sc.read_csv(PCA)
# wrap the precomputed PCA matrix in an AnnData with batch labels as obs
adata = anndata.AnnData(X=used_pca.X, obs=batch)
PCNUM = used_pca.X.shape[1]
sc.tl.pca(adata, n_comps=PCNUM)
# overwrite scanpy's PCA with the precomputed one
adata.obsm['X_pca'] = used_pca.X
bbknn.bbknn(adata,
            batch_key=0,
            neighbors_within_batch=NB,
            n_pcs=PCNUM,
            n_trees=NT)
sc.tl.umap(adata)
umap = adata.obsm['X_umap']

# NOTE(review): fo is never closed in this fragment — confirm a close()
# (or a with-block) exists below the cut, or output may be truncated
fo = open(OUTPUT, 'w')
for one in umap:
    fo.write(str(one[0]) + '\t' + str(one[1]) + '\n')
Exemplo n.º 21
0
from fnmatch import fnmatch

# Get all evaluation files:
root = '../data/KptnMouse/RNAscope'
pattern = "Objects_Population - Nuclei.txt"
allFiles = []
slideNames = []
for path, subdirs, files in os.walk(root):
    for name in files:
        if fnmatch(name, pattern):
            allFiles.append(os.path.join(path, name))
            # slide name taken from the 5th path component — fragile if
            # the root directory layout changes
            slideNames.append(str.split(allFiles[-1], '/')[4])

slide = 0
# Import data:
# NOTE(review): sep/skiprows/header look like pandas.read_csv arguments,
# not scanpy read_csv parameters — confirm this call actually works
kptn_data_all = sc.read_csv(allFiles[slide], sep='\t', skiprows=8, header=1)
# NOTE(review): selecting columns with a list is a pandas idiom, which
# suggests kptn_data_all was meant to be a DataFrame — verify
kptn_data = np.asarray(kptn_data_all[[
    'Position X [µm]', 'Position Y [µm]',
    'Nuclei - Intensity Nucleus Alexa 568 Mean',
    'Nuclei - Intensity Nucleus Atto 490LS Mean',
    'Nuclei - Intensity Nucleus Alexa 488 Mean',
    'Nuclei - Intensity Nucleus Alexa 647 Mean',
    'Nuclei - Intensity Nucleus Atto 425 Mean'
]])
channelOrder = ('568', '490LS', '488', '647', '425')
celltypeOrder = ('Astrocyte', 'Oligodendrocyte', 'GABAergicNeuron', 'OPC',
                 'Neuron')

# Filter out 1% smallest and 5% of largest nuclei as segmentation errors:

volumes = np.asarray(kptn_data_all['Nuclei - Nucleus Volume [µm³]'])
Exemplo n.º 22
0
def upload(pathname):
    """Load an expression matrix from *pathname* into an AnnData object.

    The loader is chosen by file extension:
      - .mat  : scipy loadmat; the largest 2-D array becomes X, and every
                other array whose flattened length matches X's rows or
                columns becomes an obs/var column
      - .npz  : the same largest-array heuristic over the archive members
      - .mtx  : 10x directory containing the file (read_10x_mtx)
      - .csv / .xlsx / .txt : the matching scanpy reader
      - anything else : sc.read
    """

    import anndata
    filename, file_extension = os.path.splitext(pathname)

    def _pick_largest(names, get):
        # Return the name of the biggest array; ties keep the last name,
        # matching the original `>=` scan order.
        largest_name, largest_size = None, -1
        for name in names:
            arr = get(name)
            if len(arr.shape) == 2:
                size = arr.shape[0] * arr.shape[1]
            else:
                size = arr.shape[0]
            if size >= largest_size:
                largest_name, largest_size = name, size
        return largest_name

    def _split_metadata(names, get, main):
        # Route each non-main array to obs (length == rows of X) or var
        # (length == columns of X); anything else is dropped.
        obs_d, var_d = {}, {}
        main_arr = get(main)
        for name in names:
            if name == main:
                continue
            flat = get(name).flatten()
            if flat.shape[0] == main_arr.shape[0]:
                obs_d[name] = flat
            elif flat.shape[0] == main_arr.shape[1]:
                var_d[name] = flat
        return pd.DataFrame(data=obs_d), pd.DataFrame(data=var_d)

    if file_extension == ".mat":
        x = loadmat(pathname)
        # skip loadmat's metadata entries (__header__, __version__,
        # __globals__) explicitly instead of via the magic offset 3
        data_keys = [key for key in x.keys() if not key.startswith('__')]
        main = _pick_largest(data_keys, x.__getitem__)
        obs_df, var_df = _split_metadata(data_keys, x.__getitem__, main)
        # NOTE(review): .todense() assumes the stored matrix is sparse —
        # a dense .mat array would fail here, as in the original code
        data = anndata.AnnData(X=x[main].todense(),
                               obs=None if obs_df.empty else obs_df,
                               var=None if var_df.empty else var_df)

    elif file_extension == ".npz":
        x = np.load(pathname)
        main = _pick_largest(x.files, x.__getitem__)
        obs_df, var_df = _split_metadata(x.files, x.__getitem__, main)
        data = anndata.AnnData(X=x[main],
                               obs=None if obs_df.empty else obs_df,
                               var=None if var_df.empty else var_df)
    elif file_extension == ".mtx":
        data = sc.read_10x_mtx(os.path.dirname(pathname))
        data.X = data.X.todense()
    elif file_extension == ".csv":
        data = sc.read_csv(pathname)
    elif file_extension == ".xlsx":
        data = sc.read_excel(pathname)
    elif file_extension == ".txt":
        data = sc.read_text(pathname)
    else:
        data = sc.read(pathname)

    print(pathname, " uploaded !")
    return data
Exemplo n.º 23
0
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import os

sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
results_folder = 'write'
adata = sc.read_csv("dataset_cleaned.csv", first_column_names=True)

nn_number_list = [
    10, 12
]  # a list of numbers of nearest neighbors to consider for clustering
resolution_list = [
    0.001, 0.010, 0.015, 0.02
]  # a list of resolutions to consider for cluster annotation


def main():
    # common pre-processing steps
    sc.pp.scale(adata)
    sc.tl.pca(adata, svd_solver='auto')
    # the next steps vary per hyper-parameter
    for n_neighbors in nn_number_list:
        #clustering
        # keys used to store/retrieve this neighbor graph and its UMAP
        nn_key_added = str(n_neighbors) + '_nn'
        umap_obsm_key = 'X_umap_' + nn_key_added
        sc.pp.neighbors(adata,
                        method='umap',
                        n_neighbors=n_neighbors,
                        n_pcs=20,
Exemplo n.º 24
0
# download the posterior sagittal mouse brain Visium sample
# (adata_spatial_anterior is presumably loaded above this fragment)
adata_spatial_posterior = sc.datasets.visium_sge(
    sample_id="V1_Mouse_Brain_Sagittal_Posterior")

#Normalize and log1P
for adata in [
        adata_spatial_anterior,
        adata_spatial_posterior,
]:
    sc.pp.normalize_total(adata, inplace=True)
    #sc.pp.log1p(adata)
    #sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2000, inplace=True)

##################
#Sc data GSE115746

# transposed so cells become rows (file presumably genes x cells)
adata_cortex = sc.read_csv('../data/GSE115746_cells_exon_counts.csv').T
adata_cortex_meta = pd.read_csv(
    '../data/GSE115746_complete_metadata_28706-cells.csv', index_col=0)
# align the metadata rows with the cells present in the count matrix
adata_cortex_meta_ = adata_cortex_meta.loc[adata_cortex.obs.index, ]

adata_cortex.obs = adata_cortex_meta_

adata_cortex.var_names_make_unique()

adata_cortex.var['mt'] = adata_cortex.var_names.str.startswith(
    'Mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_cortex,
                           qc_vars=['mt'],
                           percent_top=None,
                           log1p=False,
                           inplace=True)
#!/usr/bin/env python3

import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import scvelo as scv
import csv

# Scanpy session configuration.
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)

## read in counts csv
# FIX: the original passed `first_column_names=bool` — the builtin type
# object, which only worked because it is truthy. Pass True explicitly.
adata = sc.read_csv('GSE71585_RefSeq_counts.csv', delimiter=',',
                    first_column_names=True, dtype='float32')
## check dims
print(adata)
## need to transpose

# Transpose to the AnnData cells-x-genes convention (the CSV rows are
# presumably genes — TODO confirm).
tdata = sc.AnnData.transpose(adata)
print(tdata)

# Basic pre-processing: drop near-empty cells and rarely-seen genes.
sc.pp.filter_cells(tdata, min_genes=200)
sc.pp.filter_genes(tdata, min_cells=3)
## check dims
print(tdata)

# Flag mitochondrial genes by name prefix.
# NOTE(review): lowercase 'mt-' suggests mouse gene naming — confirm species.
mito_genes = tdata.var_names.str.startswith('mt-')
# for each cell compute fraction of counts in mito genes vs. all genes
tdata.obs['percent_mito'] = np.sum(
Exemplo n.º 26
0
import mmap
import glob
from Bio.SeqUtils import MeltingTemp as mt
from Bio.Seq import Seq

# Embed fonts as editable text in exported PDF/PS figures (fonttype 42 = TrueType).
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['font.sans-serif'] = "Arial"
matplotlib.rcParams['font.family'] = "sans-serif"
matplotlib.rcParams['font.size'] = 14

#read the data
np.random.seed(1)  # fixed seed for reproducibility
import scanpy as sc

# Load two wild-type and two DKD replicate CSVs (per the path, presumably
# Slide-seq data); the first column of each file supplies the row names.
wt1 = sc.read_csv("/Users/qingbowang/Desktop/slide_seq/WT_EM_1.csv",
                  first_column_names=True)
wt2 = sc.read_csv("/Users/qingbowang/Desktop/slide_seq/WT_EM_2.csv",
                  first_column_names=True)
# Batch labels are set on .var BEFORE transposing; AnnData.T swaps the axes,
# so after .T the 'batch' annotation lives on .obs.
wt1.var['batch'] = "wt1"
wt2.var['batch'] = "wt2"
wt1 = wt1.T
wt2 = wt2.T
dkd1 = sc.read_csv("/Users/qingbowang/Desktop/slide_seq/DKD_EM_1.csv",
                   first_column_names=True)
dkd2 = sc.read_csv("/Users/qingbowang/Desktop/slide_seq/DKD_EM_2.csv",
                   first_column_names=True)
dkd1.var['batch'] = "dkd1"
dkd2.var['batch'] = "dkd2"
dkd1 = dkd1.T
dkd2 = dkd2.T
Exemplo n.º 27
0

import numpy,pandas,datetime
import matplotlib,matplotlib.pyplot
import scanpy
scanpy.settings.verbosity=5  # maximum logging detail


# # 1. Reading data

# In[2]:


# Timestamp before/after the (potentially slow) CSV read.
print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# Read the full count matrix, then transpose; the CSV rows are presumably
# genes, since the object is flipped to cells x genes — TODO confirm.
idata=scanpy.read_csv('/Volumes/omics4tb2/alomana/projects/mscni/data/scanpy/count.file.all.day.clean.csv')
adata=idata.transpose()
print(adata)
print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))


# # 2. Preprocessing

# In[3]:


# QC plot: the 20 genes that take up the highest fraction of counts per cell.
scanpy.pl.highest_expr_genes(adata,n_top=20)


# In[4]:
Exemplo n.º 28
0
# Expected cluster count (not used in this visible section).
cluster_count = 2
# Paths to the outputs of the batch-correction / integration methods being
# compared (DAVAE, DESC, Seurat, Scanorama, scGen, Harmony) plus the
# uncorrected original.
base_path='/Users/zhongyuanke/data/'
orig_path = 'pbmc/zheng/mcl_pre.h5ad'
desc_path = 'desc/desc_jurkat.h5ad'
davae_path = 'dann_vae/pbmc/293t_save04_label.h5ad'
seurat_path = 'seurat_result/mcl.h5ad'
scan_path = 'scanorama/scan_mcl.h5ad'
scgen_path = 'scgen/scgen_mcl.h5ad'
harmony_path = 'harmony_result/mcl.csv'

# Load each method's result (Harmony is stored as CSV, the rest as h5ad).
adata_davae = sc.read_h5ad(base_path+davae_path)
adata_scan = sc.read_h5ad(base_path+scan_path)
adata_orig = sc.read_h5ad(base_path+orig_path)
adata_seurat = sc.read_h5ad(base_path+seurat_path)
adata_scgen=sc.read_h5ad(base_path+scgen_path)
adata_harmony = sc.read_csv(base_path+harmony_path)
adata_desc = sc.read_h5ad(base_path+desc_path)

# Only the Seurat result is embedded here (neighbors on its PCA, then UMAP);
# the equivalent steps for the other methods are left commented out below.
sc.pp.neighbors(adata_seurat, use_rep='X_pca')
sc.tl.umap(adata_seurat)
# print(adata_scgen)
# sc.pp.neighbors(adata_orig)
# sc.tl.umap(adata_orig)
# sc.pp.neighbors(adata_davae)
# sc.tl.umap(adata_davae)
# sc.pp.neighbors(adata_scan, use_rep='X_scanorama')
# sc.tl.umap(adata_scan)
# sc.pp.neighbors(adata_seurat)
# sc.tl.umap(adata_seurat)
# sc.pp.neighbors(adata_scgen,use_rep='corrected_latent')
# sc.tl.umap(adata_scgen)
def read_counts_and_phases(count_or_rpkm, use_spike_ins, biotype_to_use, u_plates, use_isoforms=False, load_velocities=False):
    '''
    Read RNA data into scanpy and attach FACS intensities and cell-cycle phases.

    Parameters:
        count_or_rpkm: Must be "Counts" or "Tpms"; selects the input CSV.
        use_spike_ins: if truthy, read the ERCC spike-in version of the file.
        biotype_to_use: if non-empty, subset genes to this biotype (a filtered
            CSV is built next to the input on first use and cached on disk).
        u_plates: iterable of plate identifiers; one index-sort export CSV is
            read per plate.
        use_isoforms: read the isoform-level files instead of gene-level.
        load_velocities: additionally merge spliced/unspliced layers from a
            loom file via scvelo.

    Returns:
        (adata, phases): the annotated AnnData object and the raw phase table.
    '''
    read_file = f"input/RNAData/{count_or_rpkm}{'_Isoforms' if use_isoforms else ''}.csv" + (".ercc.csv" if use_spike_ins else "")
    if biotype_to_use is not None and len(biotype_to_use) > 0:
        print(f"filtering for biotype: {biotype_to_use}")
        biotype_file = f"{read_file}.{biotype_to_use}.csv"
        if not os.path.exists(biotype_file):
            # Build the biotype-filtered CSV once; subsequent runs reuse it.
            gene_info = pd.read_csv(f"input/RNAData/IdsToNames{'_Isoforms' if use_isoforms else ''}.csv.gz",
                                    index_col=False, header=None, names=["gene_id", "name", "biotype", "description"])
            biotyped = gene_info[gene_info["biotype"] == biotype_to_use]["gene_id"]
            pd.read_csv(read_file)[biotyped].to_csv(biotype_file, index=False)
        read_file = biotype_file

    adata = sc.read_csv(read_file)
    print(f"data shape: {adata.X.shape}")
    if load_velocities:
        adata.obs_names = pd.read_csv("input/RNAData/Tpms.obs_names.csv")["well_plate"]

    # Collect per-plate FACS frames and concatenate once at the end.
    # FIX: the original grew the frames with DataFrame.append(ignore_index=True),
    # which was removed in pandas 2.0 and was quadratic; pd.concat on a list of
    # frames is the supported equivalent. Downstream use is index-independent
    # (sort_values / iterrows / np.array), so the index reset is harmless.
    intensity_frames, phase_frames = [], []
    for plate in u_plates:
        file = f"input/RNAData/180911_Fucci_single cell seq_ss2-18-{plate}_index sort export.csv"
        plateIntensities = pd.read_csv(file, skiprows=2)
        # Columns 5 and 6 carry the FUCCI reporter intensities; give them stable names.
        newColumns = list(plateIntensities.columns)
        newColumns[5] = "MeanGreen530"
        newColumns[6] = "MeanRed585"
        plateIntensities.columns = newColumns
        plateIntensities["Plate"] = [plate] * len(plateIntensities)
        plateIntensities["Well_Plate"] = [f"{w}_{plate}" for w in plateIntensities["Well"]]
        # "All Events" rows hold the per-well intensity measurements.
        intensity_frames.append(plateIntensities[plateIntensities["Population"] == "All Events"])
        # Phase rows: gated populations other than the generic gates, fully assigned.
        isPhaseRow = ~plateIntensities["Population"].isin(["All Events", "Cells", "Singlets"])
        phase_frames.append(plateIntensities[isPhaseRow & (plateIntensities["% Total"] == "100.00%")])
    intensities = pd.concat(intensity_frames, ignore_index=True)
    phases = pd.concat(phase_frames, ignore_index=True)

    # Build a well_plate -> phase-population-name lookup.
    wp_idx = list(phases.columns).index("Well_Plate")
    pop_idx = list(phases.columns).index("Population")
    phases_lookup = dict([(row[1][wp_idx], row[1][pop_idx]) for row in phases.iterrows()])

    # Assign phases and log intensities; require log intensity
    intensities = intensities.sort_values(by="Well_Plate")
    adata.obs["Well_Plate"] = np.array(intensities["Well_Plate"])
    adata.obs["plate"] = np.array(intensities["Plate"])
    adata.obs["phase"] = np.array([phases_lookup[wp] if wp in phases_lookup else "N/A" for wp in intensities["Well_Plate"]])
    adata.obs["MeanGreen530"] = np.array(intensities["MeanGreen530"])
    adata.obs["MeanRed585"] = np.array(intensities["MeanRed585"])
    adata = adata[pd.notnull(adata.obs["MeanGreen530"]) & pd.notnull(adata.obs["MeanRed585"])] # removes 6 dark likely mitotic cells

    # Read in fucci pseudotime from previous analysis
    if os.path.isfile("output/fucci_time.csv"):
        adata.obs["fucci_time"] = np.array(pd.read_csv("output/fucci_time.csv")["fucci_time"])

    # Get info about the genes (indexed by gene id; aligns on adata.var index).
    gene_info = pd.read_csv(f"input/RNAData/IdsToNames{'_Isoforms' if use_isoforms else ''}.csv.gz",
                            header=None, names=["name", "biotype", "description"], index_col=0)
    adata.var["name"] = gene_info["name"]
    adata.var["biotype"] = gene_info["biotype"]
    adata.var["description"] = gene_info["description"]

    if load_velocities:
        # Merge spliced/unspliced layers; ldata obs_names are set to match the
        # well_plate obs_names assigned to adata above.
        ldata = scv.read("input/RNAData/a.loom", cache=True)
        ldata.obs_names = pd.read_csv("input/RNAData/a.obs_names.csv")["well_plate"]
        ldata.var["GeneName"] = ldata.var_names
        ldata.var_names = ldata.var["Accession"]
        adata = scv.utils.merge(adata, ldata, copy=True)

    return adata, phases
Exemplo n.º 30
0
#!/usr/bin/env python
##################################################
# File Name: test.py
# Author: Rui
# mail: [email protected]
# Created Time: Thu 11 Jul 2019 11:49:02 AM EDT
################################################

import scanpy as sc
import numpy as np
import giniclust3 as gc
import anndata

####Load and filter dataset####
adataRaw = sc.read_csv("./data/GSM1599495_ES_d0_biorep_techrep1.csv",
                       first_column_names=True)
sc.pp.filter_cells(adataRaw,
                   min_genes=3)  #####remover gene expressed less than N cell
sc.pp.filter_genes(adataRaw,
                   min_cells=200)  #####remove cell express less than M gene
adataSC = anndata.AnnData(X=adataRaw.X.T, obs=adataRaw.var, var=adataRaw.obs)
sc.pp.normalize_per_cell(adataSC, counts_per_cell_after=1e4)

####GiniIndexClust and FanoFactorClust####
gc.gini.calGini(adataSC)
adataGini = gc.gini.clusterGini(adataSC, neighbors=3)

gc.fano.calFano(adataSC)
adataFano = gc.fano.clusterFano(adataSC)

####ConsensusClust####