예제 #1
0
파일: read_10x.py 프로젝트: timrandg/scanpy
def test_read_10x_mtx():
    """Smoke-test ``sc.read_10x_mtx`` on the 1.2.0 and 3.0.0 fixture folders.

    Nothing is asserted; the test passes as long as neither read raises.
    """
    fixture_layouts = (
        ('1.2.0', 'filtered_gene_bc_matrices', 'hg19_chr21'),
        ('3.0.0', 'filtered_feature_bc_matrix'),
    )
    for path_parts in fixture_layouts:
        matrix_dir = os.path.join(ROOT, *path_parts)
        sc.read_10x_mtx(matrix_dir, var_names='gene_symbols', cache=True)
예제 #2
0
 def create_scanpy_adata_basic(self, assay="counts", sample_key=None):
     """Read the filtered 10x matrix directory and return what scanpy loads.

     ``assay`` and ``sample_key`` are accepted but not used by this method.
     No further preprocessing is applied to the loaded data.
     """
     matrix_dir = self.filtered_matrices()
     return sc.read_10x_mtx(matrix_dir, make_unique=True)
예제 #3
0
    def readData(self, countsFile=""):
        """Load a counts matrix into ``self.data`` and run ``self.preprocess()``.

        Args:
            countsFile: path to a 10x matrix directory, an ``.h5ad`` file, or
                any other file (read with ``sc.read_csv`` and transposed).
                When empty, the previously stored ``self.CountsFile`` is used.

        Returns:
            ``""`` after printing a message when no path is available, the
            path does not exist, or a directory lacks the expected 10x files;
            ``None`` on success.

        Side effects:
            Stores the path in ``self.CountsFile``. For a directory, ``.gz``
            members are decompressed in place and ``features.tsv`` is renamed
            to ``genes.tsv`` before reading.
        """
        if countsFile == "":
            countsFile = self.CountsFile

        if countsFile == "":
            print("please input counts file path")
            return ""

        self.CountsFile = countsFile
        datapath = self.CountsFile

        if os.path.isdir(datapath):
            # Decompress gzipped members in place. An argument list (no shell)
            # keeps directory names with spaces or shell metacharacters from
            # being interpreted by a shell.
            gz_paths = [
                os.path.join(datapath, name)
                for name in os.listdir(datapath)
                if name.endswith(".gz")
            ]
            if gz_paths:
                command = subprocess.Popen(["gunzip"] + gz_paths,
                                           stdin=PIPE,
                                           stdout=PIPE,
                                           stderr=STDOUT)
                # Wait for completion; the tool's output is not used.
                command.communicate()

            # The directory is later read under the name "genes.tsv".
            if "features.tsv" in os.listdir(datapath):
                os.rename(os.path.join(datapath, "features.tsv"),
                          os.path.join(datapath, "genes.tsv"))

            # All three files must be present; the previous check tested
            # 'barcodes.tsv' twice and never looked for the matrix itself.
            present = set(os.listdir(datapath))
            required = ("matrix.mtx", "barcodes.tsv", "genes.tsv")
            if not all(name in present for name in required):
                print("input data is not correct")
                return ""

            adata = sc.read_10x_mtx(datapath, var_names='gene_symbols')
            self.data = adata
            self.preprocess()

        elif os.path.isfile(datapath):
            if datapath.endswith(".h5ad"):
                adata = sc.read(datapath)
            else:
                # Any other file is read as CSV and transposed.
                adata = sc.read_csv(datapath)
                adata = adata.T
            self.data = adata
            self.preprocess()
        else:
            print("file or dir not exists")
            return ""
예제 #4
0
 def create_scanpy_adata(self,
                         sce,
                         fast_load=True,
                         assay="counts",
                         high_var=False,
                         subset=None):
     """Load the filtered 10x matrix, restrict it to ``sce``'s genes and barcodes, and embed it.

     Args:
         sce: object exposing ``colData["Barcode"]`` and
             ``rowData["hgnc_symbol"]``.
         fast_load: accepted but not used by this method.
         assay: accepted but not used by this method.
         high_var: when true, additionally keep only the genes flagged by a
             second ``highly_variable_genes`` pass (``n_top_genes=1000``).
         subset: optional collection of symbols to keep; defaults to every
             symbol in ``sce.rowData["hgnc_symbol"]``.

     Returns:
         The result of running PCA, neighbors, UMAP and t-SNE (each called
         with ``copy=True``) on the subsetted data.

     Raises:
         AssertionError: if the selected variables do not match the converted
             symbol list, or the highly-variable result has unexpected length.
     """
     barcodes = sce.colData["Barcode"]
     _transcripts = sce.rowData["hgnc_symbol"]
     adata = sc.read_10x_mtx(self.filtered_matrices(), make_unique=True)
     adata.var_names_make_unique()
     adata.obs_names_make_unique()
     print(adata.X)
     # First pass keeps only the highly variable genes (subset=True).
     sc.pp.highly_variable_genes(adata, flavor="cell_ranger", subset=True)
     transcripts = []
     # Identity check: `== None` is evaluated element-wise by array-like
     # subsets, and the resulting array cannot be used as an `if` condition.
     if subset is None:
         subset = _transcripts
     for symbol in _transcripts:
         if symbol not in subset: continue
         # Try progressively different spellings until one matches a variable
         # name in the loaded matrix.
         if symbol not in adata.var.index:
             # 1) dots replaced by dashes
             symbol = symbol.replace(".", "-")
             if symbol not in adata.var.index:
                 # 2) last dash-separated piece re-attached with a dot
                 symbol = symbol.split("-")
                 symbol = "-".join(symbol[:-1]) + ".{}".format(symbol[-1])
                 if symbol not in adata.var.index:
                     # 3) everything before the first dot
                     symbol = symbol.split(".")[0]
         transcripts.append(symbol)
     # NOTE(review): this attribute is set on the object that is replaced by
     # the subsetting below, so it presumably does not survive -- confirm.
     adata.barcodes = pandas.read_csv(os.path.join(self.filtered_matrices(),
                                                   'barcodes.tsv'),
                                      header=None)[0]
     adata = adata[:, transcripts]
     assert set(adata.var.index) == set(
         transcripts), "Issues with symbol conversion."
     adata = adata[barcodes, :]
     adata.var_names_make_unique()
     if high_var:
         var_transcripts = sc.pp.highly_variable_genes(adata,
                                                       flavor="cell_ranger",
                                                       inplace=False,
                                                       n_top_genes=1000,
                                                       n_bins=100)
         assert len(var_transcripts) == len(adata.var.index)
         # NOTE(review): assumes each entry of the result is indexable and its
         # first field is the highly-variable flag -- confirm against the
         # scanpy version in use.
         var_transcripts = [
             x[0] for x in zip(adata.var.index, var_transcripts)
             if x[1][0] == True
         ]
         adata = adata[:, var_transcripts]
     adata = sc.tl.pca(adata, copy=True)
     adata = sc.pp.neighbors(adata, copy=True)
     adata = sc.tl.umap(adata, copy=True)
     adata = sc.tl.tsne(adata, copy=True)
     return adata
예제 #5
0
def zheng():
    """Prepare the Zheng dataset.

    Reads the 68k PBMC matrix, attaches the bulk labels as an ``obs`` column,
    processes the clusters, writes the result as ``.h5ad`` and saves a set of
    random folds.

    Reference: Zheng GX, et al. "Massively parallel digital transcriptional
    profiling of single cells." Nature Communications, 2017.
    """
    adata = sc.read_10x_mtx("data/zheng/filtered_matrices_mex/hg19/")

    # The label file has no header row; its values become the obs column.
    bulk_labels = pd.read_csv("data/zheng/zheng17_bulk_lables.txt",
                              header=None)
    adata.obs["bulk_labels"] = bulk_labels.values
    pr.read.process_clusts(adata, "bulk_labels")

    sc.write("data/zheng/fresh_68k_bulk_labels.h5ad", adata)

    fold_tester = pr.performance.FoldTester(adata)
    fold_tester.makefolds(random=True)
    fold_tester.savefolds("output/zheng_folds.npz")
예제 #6
0
 def create_scanpy_adata(self,
                         sce,
                         fast_load=True,
                         assay="counts",
                         high_var=False,
                         subset=None):
     """Load the filtered 10x matrix and keep only the barcodes listed in ``sce``.

     ``fast_load``, ``assay``, ``high_var`` and ``subset`` are accepted but
     not used by this method.
     """
     cell_barcodes = sce.colData["Barcode"]
     # Looked up but not used afterwards; kept so a missing column still fails.
     _transcripts = sce.rowData["hgnc_symbol"]

     matrix_dir = self.filtered_matrices()
     adata = sc.read_10x_mtx(matrix_dir, make_unique=True)
     adata.var_names_make_unique()
     adata.obs_names_make_unique()

     barcode_file = os.path.join(self.filtered_matrices(), 'barcodes.tsv')
     adata.barcodes = pandas.read_csv(barcode_file, header=None)[0]

     return adata[cell_barcodes, :]
예제 #7
0
 def get_genes(self, sce):
     """Return ``sce``'s ``hgnc_symbol`` entries converted to the names used by the 10x matrix.

     The matrix is read from the filtered h5 file when possible, otherwise
     from the filtered matrix directory. Each symbol that is not a variable
     name in that matrix is re-spelled in up to three steps (see the loop);
     the last attempt is kept even if it still does not match.

     Args:
         sce: object exposing ``rowData["hgnc_symbol"]``.

     Returns:
         list with one converted symbol per input symbol, in input order.
     """
     _transcripts = sce.rowData["hgnc_symbol"]
     try:
         adata = sc.read_10x_h5(self.filtered_h5(), genome=config.build)
     except Exception:
         # Deliberate best-effort fallback to the matrix directory.
         adata = sc.read_10x_mtx(self.filtered_matrices())
     transcripts = []
     # Iterate the input symbols. The loop previously walked the (empty)
     # output list, so no conversion ever ran and the raw input was returned.
     for symbol in _transcripts:
         if symbol not in adata.var.index:
             # 1) dots replaced by dashes
             symbol = symbol.replace(".", "-")
             if symbol not in adata.var.index:
                 # 2) last dash-separated piece re-attached with a dot
                 symbol = symbol.split("-")
                 symbol = "-".join(symbol[:-1]) + ".{}".format(symbol[-1])
                 if symbol not in adata.var.index:
                     # 3) everything before the first dot
                     symbol = symbol.split(".")[0]
         transcripts.append(symbol)
     return transcripts
예제 #8
0
 def gene_map(self, sce, original=False):
     """Map ``sce``'s ``Symbol`` entries to the names used by the 10x matrix.

     The matrix is read from the filtered h5 file when possible, otherwise
     from the filtered matrix directory. Each symbol that is not a variable
     name in that matrix is re-spelled in up to three steps (see the loop).

     Args:
         sce: object exposing ``rowData["Symbol"]``.
         original: when true, the returned dict is keyed by the original
             symbol and maps to the converted one; otherwise it is keyed by
             the converted symbol and maps back to the original.

     Returns:
         dict relating original and converted symbols, in the direction
         selected by ``original``.
     """
     _transcripts = sce.rowData["Symbol"]
     try:
         adata = sc.read_10x_h5(self.filtered_h5(), genome=config.build)
     except Exception:
         # Deliberate best-effort fallback to the matrix directory.
         adata = sc.read_10x_mtx(self.filtered_matrices())
     transcripts = {}
     for symbol in _transcripts:
         # Kept under its own name: reusing `original` here overwrote the
         # flag parameter, so the requested direction was ignored.
         source_symbol = symbol
         if symbol not in adata.var.index:
             # 1) dots replaced by dashes
             symbol = symbol.replace(".", "-")
             if symbol not in adata.var.index:
                 # 2) last dash-separated piece re-attached with a dot
                 symbol = symbol.split("-")
                 symbol = "-".join(symbol[:-1]) + ".{}".format(symbol[-1])
                 if symbol not in adata.var.index:
                     # 3) everything before the first dot
                     symbol = symbol.split(".")[0]
         if original:
             transcripts[source_symbol] = symbol
         else:
             transcripts[symbol] = source_symbol
     return transcripts
예제 #9
0
    def load_data(data):
        """Read ``data`` with scanpy and apply the naming options of the enclosing scope.

        A file must end in ``.h5ad`` or ``.loom``; a directory is read as a
        10x matrix folder. ``set_obs_names``, ``set_var_names``,
        ``make_obs_names_unique`` and ``make_var_names_unique`` are resolved
        from outside this function.

        Raises:
            click.FileError: ``data`` is neither a file nor a directory, or
                the file has an unsupported extension.
            click.UsageError: a requested obs/var column does not exist.
        """
        data_is_file = isfile(data)
        if not data_is_file and not isdir(data):
            raise click.FileError(data, hint="not a valid file or path")

        if data_is_file:
            _, suffix = splitext(data)
            if suffix == ".h5ad":
                adata = sc.read_h5ad(data)
            elif suffix == ".loom":
                adata = sc.read_loom(data)
            else:
                raise click.FileError(
                    data,
                    hint="does not have a valid extension [.h5ad | .loom]")
        else:
            # The directory is passed on with a trailing separator.
            directory = data if data.endswith(sep) else data + sep
            adata = sc.read_10x_mtx(directory)

        # Optionally replace the observation names with an obs column.
        if set_obs_names != "":
            if set_obs_names not in adata.obs_keys():
                raise click.UsageError(
                    f"obs {set_obs_names} not found, options are: {adata.obs_keys()}"
                )
            adata.obs_names = adata.obs[set_obs_names]

        # Optionally replace the variable names with a var column.
        if set_var_names != "":
            if set_var_names not in adata.var_keys():
                raise click.UsageError(
                    f"var {set_var_names} not found, options are: {adata.var_keys()}"
                )
            adata.var_names = adata.var[set_var_names]

        if make_obs_names_unique:
            adata.obs_names_make_unique()
        if make_var_names_unique:
            adata.var_names_make_unique()

        # Duplicates that remain are reported but do not stop the load.
        obs_unique = adata._obs.index.is_unique
        if not obs_unique:
            click.echo("Warning: obs index is not unique")
        var_unique = adata._var.index.is_unique
        if not var_unique:
            click.echo("Warning: var index is not unique")

        return adata
예제 #10
0
def getAnnData_10x_mtx(input_file):
	"""Read ``input_file`` with ``sc.read_10x_mtx`` and return the result."""
	return sc.read_10x_mtx(input_file)
예제 #11
0
    print("The run time for all resolution is:", get_time() - time_start)
    print("After training, the information of adata is:\n", adata)
    return data


if __name__ == '__main__':
    import argparse

    def _str_to_bool(text):
        """Parse a command-line boolean.

        ``type=bool`` would turn every non-empty string, including
        ``"False"``, into True. The empty string stays False, as before.
        """
        lowered = str(text).strip().lower()
        if lowered in ('1', 'true', 't', 'yes', 'y'):
            return True
        if lowered in ('', '0', 'false', 'f', 'no', 'n'):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got %r' % (text,))

    parser = argparse.ArgumentParser(
        description='just for simple test train.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--use_GPU', default=True, type=_str_to_bool)
    args = parser.parse_args()
    print(args)
    # Test run on the pbmc dataset.
    adata = sc.read_10x_mtx("../datasets/pbmc",
                            var_names="gene_symbols",
                            cache=True)
    # Basic cell and gene filtering.
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)
    # Per-cell share of counts coming from genes whose name starts with 'MT-'.
    mito_genes = adata.var_names.str.startswith('MT-')
    adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X,
                                       axis=1).A1 / np.sum(adata.X, axis=1).A1
    adata.obs['n_counts'] = adata.X.sum(axis=1).A1
    # NOTE(review): 'n_genes' is not assigned in this block; presumably it is
    # added by sc.pp.filter_cells above -- confirm.
    adata = adata[adata.obs['n_genes'] < 2500, :]
    adata = adata[adata.obs['percent_mito'] < 0.05, :]
    # Normalize, log-transform, then flag the highly variable genes.
    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata,
                                min_mean=0.0125,
                                max_mean=3,
                                min_disp=0.5)
예제 #12
0
def build(pyfit):
    celltypes = pickle.load(open(pyfit,"rb"))
    adata = sc.read_10x_mtx(tenx_path,var_names='gene_symbols')
    rho = get_rho(rho_path)
    adata =
예제 #13
0
    args = parser.parse_args()

    # load dataset
    optimizer1 = Adam(amsgrad=True)
    optimizer2 = 'adadelta'

    # data_mat = h5py.File(args.data_file)
    # x = np.array(data_mat['X'])
    # y = np.array(data_mat['Y'])

    # # preprocessing scRNA-seq read counts matrix
    # adata = sc.AnnData(x)
    # adata.obs['Group'] = y

    adata = sc.read_10x_mtx(args.data_file,
                            var_names='gene_symbols',
                            cache=True)
    adata = read_dataset(adata, transpose=False, test_split=False, copy=True)
    adata = normalize(adata,
                      size_factors=True,
                      normalize_input=True,
                      logtrans_input=True)
    y = None

    input_size = adata.n_vars

    print("X:", type(adata.X), adata.X.shape)
    # print("y:", type(adata.X), adata.X.shape)
    # print(y.shape)

    x_sd = adata.X.std(0)
예제 #14
0
파일: SCveloTest.py 프로젝트: mtvector/MTsc
# Script: merge a 10x count matrix with a loom file, then run the scvelo
# velocity steps below and save the embedding plots.
import scvelo as scv
scv.settings.set_figure_params('scvelo')
# NOTE(review): `scanpy.api` is presumably the legacy import path -- confirm
# it exists in the installed scanpy version.
import scanpy.api as sc
# Figures are saved to `figdir` instead of being shown.
sc.settings.autoshow=False
sc.settings.autosave=True
sc.settings.figdir='/scrapp2/mtschmitz/data/Exonic/fig'
# Inputs: the 10x matrix directory and the loom file (both read with cache=True).
adata = sc.read_10x_mtx('/scrapp2/mtschmitz/data/Exonic/E40_motor_Out/outs/filtered_gene_bc_matrices/refdata-celranger-mmul8-toplevel/', cache=True)
ldata = scv.read('/scrapp2/mtschmitz/data/Exonic/E40_motor_Out_velocyto/possorted_genome_bam_RWRQ2.loom', cache=True)
# Make variable names unique on both objects before merging, and again after.
adata.var_names_make_unique()
ldata.var_names_make_unique()
adata = scv.utils.merge(adata, ldata)
adata.var_names_make_unique()
# Preprocessing: gene filtering, per-cell normalization, dispersion filter, log1p.
print('norm')
scv.pp.filter_genes(adata)
scv.pp.normalize_per_cell(adata)
scv.pp.filter_genes_dispersion(adata)
scv.pp.log1p(adata)
print(adata)
# Moments with 30 PCs and 30 neighbors.
print('moment')
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
# UMAP embedding, then the velocity step.
print('velo')
scv.tl.umap(adata)
scv.tl.velocity(adata)
# Velocity graph, then velocity embedding on the UMAP basis.
print('graph')
scv.tl.velocity_graph(adata)
scv.tl.velocity_embedding(adata, basis='umap')
# Three plot variants on the UMAP basis, each passed its own `save` name.
scv.pl.velocity_embedding(adata, basis='umap',save='Embed')
scv.pl.velocity_embedding_grid(adata, basis='umap',save='Grid')
scv.pl.velocity_embedding_stream(adata, basis='umap',save='stream')
# Leiden clustering is run last; its result is not used in the lines shown.
sc.tl.leiden(adata)
예제 #15
0
def cli(dataset, engine, format, layout, recipe, output, sparse, plotting):
    """
    Hi! This is a tool for preprocessing data for use with cellxgene.
    """
    # NOTE(review): the docstring above presumably doubles as the CLI help
    # text, so it is left unchanged and parameters are documented here.
    #   dataset  - path handed to the scanpy reader chosen by `format`
    #   engine   - accepted but not used in this function
    #   format   - one of 'h5ad', '10x_mtx', 'loom'
    #   layout   - 'umap', 'tsne' or 'umap+tsne'; anything else skips layouts
    #   recipe   - 'seurat' or 'zheng17'; anything else runs the default
    #              filtering and scaling
    #   output   - destination path for the result; '' skips writing
    #   sparse   - read loom as sparse and avoid zero-centering
    #   plotting - also save a louvain-colored plot per computed layout
    # Raises ValueError when `format` is not supported.

    # Reject unsupported formats before the heavy imports. This used to leave
    # `adata` as None and fail later with an opaque AttributeError.
    supported_formats = ('h5ad', '10x_mtx', 'loom')
    if format not in supported_formats:
        raise ValueError(
            'unsupported format {!r}; expected one of {}'.format(
                format, ', '.join(supported_formats)))

    import matplotlib
    matplotlib.use('Agg')
    import scanpy.api as sc
    import numpy as np

    # scanpy settings
    sc.settings.verbosity = 2
    sc.settings.autosave = True

    # data loading
    if format == 'h5ad':
        adata = sc.read_h5ad(dataset)
    elif format == '10x_mtx':
        adata = sc.read_10x_mtx(dataset)
    else:  # 'loom'
        adata = sc.read_loom(dataset, sparse=bool(sparse))

    adata.var_names_make_unique()

    # run a recipe if requested
    if recipe == 'seurat':
        sc.pp.recipe_seurat(adata)
    elif recipe == 'zheng17':
        sc.pp.recipe_zheng17(adata)
    else:
        sc.pp.filter_cells(adata, min_genes=5)
        sc.pp.filter_genes(adata, min_cells=25)
        if sparse:
            sc.pp.scale(adata, zero_center=False)
        else:
            sc.pp.scale(adata)

    # dimensionality reduction
    if sparse:
        sc.pp.pca(adata, svd_solver='arpack', zero_center=False)
    else:
        sc.pp.pca(adata, svd_solver='arpack')

    # neighbors and clustering
    sc.pp.neighbors(adata)
    sc.tl.louvain(adata)

    # layout and plotting: the palette depends on the number of clusters
    if len(np.unique(adata.obs['louvain'].values)) < 10:
        palette = 'tab10'
    else:
        palette = 'tab20'

    if layout in ('umap', 'umap+tsne'):
        sc.tl.umap(adata)
        if plotting:
            sc.pl.umap(adata, color='louvain', palette=palette, save='_louvain')

    if layout in ('tsne', 'umap+tsne'):
        sc.tl.tsne(adata)
        if plotting:
            sc.pl.tsne(adata, color='louvain', palette=palette, save='_louvain')

    # show the structure
    print('data structure...')
    print(adata)

    # saving file
    if output != '':
        print('saving output...')
        adata.write(output)
예제 #16
0
 def create_scanpy_adata_basic(self, assay="counts", sample_key=None):
     """Read the filtered 10x matrix directory and return what scanpy loads.

     ``assay`` and ``sample_key`` are accepted but not used by this method.
     """
     matrix_dir = self.filtered_matrices()
     return sc.read_10x_mtx(matrix_dir, make_unique=True)
chdir(CWD)


####### main
# Sample metadata table; its `lanes` column drives the per-sample loop below.
meta = pd.read_csv('./liver_metadata.csv',header=0)

sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)

scorenames = ['scrublet_score','scrublet_cluster_score','bh_pval']
# exist_ok lets the script be re-run; without it a second run stops here
# with FileExistsError.
os.makedirs('scrublet-scores', exist_ok=True)

###

# One pass per distinct value of the metadata `lanes` column: load the
# sample, score doublets with Scrublet, then run a quick scanpy pipeline.
for sample in meta.lanes.unique():
    # import data from Liver/<sample>/filtered
    adata_sample = sc.read_10x_mtx('Liver/'+sample+'/filtered', cache=True)
    # rename cells to SAMPLE_BARCODE, keeping only the text before the first '-'
    adata_sample.obs_names = [sample+'_'+i.split('-')[0] for i in adata_sample.obs_names]
    # set up and run Scrublet on the matrix as loaded
    scrub = scr.Scrublet(adata_sample.X)
    # only the scores are stored; predicted_doublets is not used in the lines shown
    doublet_scores, predicted_doublets = scrub.scrub_doublets(verbose=False)
    adata_sample.obs['scrublet_score'] = doublet_scores
    # overclustering prep: run a quick basic scanpy pipeline
    sc.pp.filter_genes(adata_sample, min_cells=3)
    sc.pp.normalize_per_cell(adata_sample, counts_per_cell_after=1e4)
    sc.pp.log1p(adata_sample)
    sc.pp.highly_variable_genes(adata_sample, min_mean=0.0125, max_mean=3, min_disp=0.5)
    # keep only the genes flagged as highly variable
    adata_sample = adata_sample[:, adata_sample.var['highly_variable']]
    sc.pp.scale(adata_sample, max_value=10)
    sc.tl.pca(adata_sample, svd_solver='arpack')
    sc.pp.neighbors(adata_sample)