Example #1
def load(
    anndata_name_original="mouse_retina.h5ad",
    split="train",
    cache_dir="data_cache",
    cache=True,
):
    """
    Load requested split of mouse data, where the whole dataset.
    Looks for a local cache of the original data, and creates it in cache_dir if not there and cache=True.
    Then Looks for local cache of the requested split, and if it can't find that, makes a split on the fly.
    If cache=True, caches the result in cache_dir for next time."""

    original_fpath = os.path.join(cache_dir, anndata_name_original)
    if not os.path.exists(original_fpath):
        _create_anndata(anndata_name_out=anndata_name_original,
                        cache_dir=cache_dir)

    original_fname = os.path.basename(original_fpath)
    original_bname, original_ext = os.path.splitext(original_fname)
    target_fname = "{0}_{1}{2}".format(original_bname, split, original_ext)
    target_fpath = os.path.join(cache_dir, target_fname)

    if not os.path.exists(target_fpath):
        adata_in = sc.read_h5ad(original_fpath)
        split_inds, split_adata = split_anndata(adata_in)
        if cache:
            write_splits(
                split_inds_dict=split_inds,
                split_adata_dict=split_adata,
                basename=original_bname,
                out_dir=cache_dir,
            )
        else:
            # with cache=False nothing is written to disk, so return the
            # in-memory split directly (assumes split_adata is keyed by split name)
            return split_adata[split]

    return sc.read_h5ad(target_fpath)
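A hypothetical usage sketch (file names follow the <basename>_<split>.h5ad convention used above):
# Hypothetical usage: the first call builds data_cache/mouse_retina_train.h5ad,
# later calls read it straight from the cache.
adata_train = load(split="train")
adata_test = load(split="test", cache=False)  # computed in memory, not written
print(adata_train.shape, adata_test.shape)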
Example #2
def load(
    split="train",
    original_fpath="/allen/aics/modeling/data/scRNAseq_SeeligCollaboration/data_for_modeling/scrnaseq_cardio_20181129.h5ad",
    cache_dir="data_cache",
    cache=True,
    selected_genes_path=None,
    threshold=0,
):
    """
    Load requested split of cardio data, where the whole dataset originated at original_fpath.
    Looks for local cache of split, and if it can't find that, makes a split on the fly.
    If cache=True, caches the result in cache_dir for next time.
    Loads raw count values.
    """

    original_fname = os.path.basename(original_fpath)
    original_bname, original_ext = os.path.splitext(original_fname)
    target_fname = "{0}_{1}{2}".format(original_bname, split, original_ext)
    target_fpath = os.path.join(cache_dir, target_fname)

    if not os.path.exists(target_fpath):
        adata_in = sc.read_h5ad(original_fpath)
        adata_raw = sc.AnnData(
            X=adata_in.raw.X.todense(),
            obs=adata_in.obs,
            var=adata_in.var,
            uns=adata_in.uns,
        )
        split_inds, split_adata = split_anndata(adata_raw)
        if cache:
            write_splits(
                split_inds_dict=split_inds,
                split_adata_dict=split_adata,
                basename=original_bname,
                out_dir=cache_dir,
            )

    if os.path.exists(target_fpath):
        adata = sc.read_h5ad(target_fpath)
    else:
        # cache=False: nothing was written, so use the in-memory split
        # (assumes split_adata is keyed by split name)
        adata = split_adata[split]

    if selected_genes_path is not None:
        df = pd.read_csv(selected_genes_path, delimiter="\t")

        coding_genes = df["Gene name"].unique()
        coding_genes = [str(g) + "_HUMAN" for g in coding_genes]

        cols = np.array([c for c in adata.var.index if c in coding_genes])
        adata = adata[:, cols]

    gene_nz_freq = np.asarray((adata.X > 0).mean(axis=0)).ravel()
    adata = adata[:, gene_nz_freq > threshold]

    return adata
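A sketch of how the gene-filtering options might be combined; the file name is hypothetical, but the code above expects a tab-separated file with a "Gene name" column:
# Hypothetical usage: keep only listed coding genes, then drop genes
# detected in fewer than 5% of cells.
adata = load(
    split="train",
    selected_genes_path="coding_genes.tsv",
    threshold=0.05,
)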
Example #3
def preprocess_h5ad_data(raw_input_path,
                         processed_path,
                         scaling_option="log_min_max",
                         sig_genes=None):
    """
    Preprocess raw input data for the model
    :param raw_input_path:
    :param scaling_option:
    :param group_small:
    :param signature_genes:
    :return:
    """
    print("Pre-processing raw data ...")
    raw_input = sc.read_h5ad(raw_input_path)

    print("Subsetting genes ...")
    # Select features go use
    raw_input = raw_input[:, sig_genes]

    print("Scaling using " + str(scaling_option))
    # Scaling
    raw_input.X = sample_scaling(raw_input.X, scaling_option)

    print("Writing to disk ...")
    # Write processed data to disk
    raw_input.write(processed_path)
    print("Data pre-processing done.")
Example #4
    def load_h5ad_file(self, input_path, batch_size, datasets=None):
        """
        Load input data from a h5ad file and divide into training and test set
        :param input_path: path to h5ad file
        :param batch_size: batch size to use for training
        :param datasets: a list of datasets to extract from the file
        :return: Dataset object
        """
        raw_input = sc.read_h5ad(input_path)
        # Subset to the requested datasets, if any
        if datasets:
            all_ds = collections.Counter(raw_input.obs['ds'])
            for ds in all_ds:
                if ds not in datasets:
                    raw_input = raw_input[raw_input.obs['ds'] != ds].copy()
        # Create training dataset
        ratios = [raw_input.obs[ctype] for ctype in raw_input.uns['cell_types']]
        self.x_data = raw_input.X.astype(np.float32)
        self.y_data = np.array(ratios, dtype=np.float32).transpose()
        # create placeholders
        self.x_data_ph = tf.compat.v1.placeholder(self.x_data.dtype, self.x_data.shape, name="x_data_ph")
        self.y_data_ph = tf.compat.v1.placeholder(self.y_data.dtype, self.y_data.shape, name="y_data_ph")
        self.data = tf.compat.v1.data.Dataset.from_tensor_slices((self.x_data_ph, self.y_data_ph))
        self.data = self.data.shuffle(1000).repeat().batch(batch_size=batch_size)

        # Extract celltype and feature info
        self.labels = raw_input.uns['cell_types']
        self.sig_genes = list(raw_input.var_names)
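The placeholders suggest the usual TF1 feeding pattern: an initializable iterator whose initializer is run once with the arrays bound via feed_dict. A minimal sketch under that assumption, where `loader` stands for an instance of the enclosing class (not shown here):
import tensorflow as tf

# Hypothetical consumption of loader.data in TF1 graph mode, after
# load_h5ad_file has run.
iterator = tf.compat.v1.data.make_initializable_iterator(loader.data)
x_batch, y_batch = iterator.get_next()
with tf.compat.v1.Session() as sess:
    sess.run(iterator.initializer,
             feed_dict={loader.x_data_ph: loader.x_data,
                        loader.y_data_ph: loader.y_data})
    x, y = sess.run([x_batch, y_batch])  # one shuffled batch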
Example #5
import numpy
import scanpy as sc
import yaml

def verify_anndata(matrix_path, test_yaml_path):

    with open(test_yaml_path) as f:
        expected_values = yaml.safe_load(f)['expected_output']

    output_matrix = sc.read_h5ad(matrix_path).X.T

    assert numpy.count_nonzero(
        output_matrix) == expected_values["non_zero_count"]
    assert numpy.sum(output_matrix) == expected_values["sum"]
    assert tuple(output_matrix.shape) == tuple(expected_values["shape"])
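For reference, the test above expects an expected_output mapping in the YAML file; a hypothetical spec, generated from Python for illustration:
import yaml

# All values below are made up for illustration.
spec = {"expected_output": {"non_zero_count": 123456,
                            "sum": 7890123.0,
                            "shape": [2000, 500]}}
with open("test_spec.yaml", "w") as f:
    yaml.safe_dump(spec, f)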
Example #6
def load(
    loc="data_files",
    blocksize=1000000,
    anndata_write=True,
    anndata_name="mouse_retina.h5ad",
    X_dtype=np.float32,
):

    adata_fpath = os.path.join(loc, anndata_name)

    # if we've already downloaded and constructed the adata file, read it and use it
    if os.path.exists(adata_fpath) and os.path.isfile(adata_fpath):
        print("reading saved anndata h5ad file")
        adata = sc.read_h5ad(adata_fpath)

    # if the anndata file doesn't exist already, download inputs and construct it
    else:
        # download files if they don't exist locally
        if not os.path.exists(loc):
            os.makedirs(loc)
        files = {
            "10x_mouse_retina_development.mtx": "https://www.dropbox.com/s/6d76z4grcnaxgcg/10x_mouse_retina_development.mtx?dl=1",
            "10x_mouse_retina_development_phenotype.csv": "https://www.dropbox.com/s/y5lho9ifzoktjcs/10x_mouse_retina_development_phenotype.csv?dl=1",
            "10x_mouse_retina_development_feature.csv": "https://www.dropbox.com/s/1mc4geu3hixrxhj/10x_mouse_retina_development_feature.csv?dl=1",
        }
        print("downloading data files")
        for fname, url in files.items():
            if not os.path.exists(os.path.join(loc, fname)):
                download_file(url, loc=loc, blocksize=blocksize)

        # read in data
        print("reading data files")
        df_obs = pd.read_csv(
            os.path.join(loc, "10x_mouse_retina_development_phenotype.csv"), index_col=0
        )[["barcode", "sample", "age", "CellType"]]
        df_var = pd.read_csv(
            os.path.join(loc, "10x_mouse_retina_development_feature.csv"), index_col=0
        )[["id", "gene_short_name"]]
        count_mat = mmread(os.path.join(loc, "10x_mouse_retina_development.mtx"))

        # make anndata object
        print("constructing anndata object")
        adata = sc.AnnData(
            X=count_mat.toarray().astype(X_dtype).transpose(), obs=df_obs, var=df_var
        )
        genes_to_keep = np.mean(adata.X != 0, axis=0) > 0
        cells_to_keep = np.mean(adata.X != 0, axis=1) > 0
        adata = adata[:, genes_to_keep][cells_to_keep, :].copy()

        # save a local copy
        if anndata_write:
            print("saving annndata h5ad file")
            adata.write(adata_fpath)

    return adata
Example #7
    def __init__(self, download=True, dir_path='.', tabula_muris_senis=False):
        '''
        download: if True, data will be downloaded automatically and saved in dir_path; otherwise
                  data will be read from dir_path
        dir_path: path to the directory where the data is stored (if already downloaded), or where
                  it should be saved
        tabula_muris_senis: if False, a generator for Tabula Muris data only will be created;
                            otherwise for Tabula Muris Senis
        '''
        if download:
            self.download_data(dir_path)
        self.adata = sc.read_h5ad(os.path.join(dir_path, 'tms-facs-mars.h5ad'))
        self.preprocess()
        if not tabula_muris_senis:
            self.adata = self.adata[self.adata.obs['age'] == '3m']
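Assuming the enclosing class is named, say, TabulaMurisDataset (the real name is not shown), usage might look like:
loader = TabulaMurisDataset(download=False, dir_path='data')  # hypothetical class name
adata_3m = loader.adata  # Tabula Muris only (3-month cells), per the default flag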
Example #8
def processing(data_path, training_data, processed_path):
    """
    Process a training dataset to contain only the genes also available in the prediction data
    :param data_path: path to prediction data
    :param training_data: path to training data (h5ad file)
    :param processed_path: name of processed file
    :return:
    """
    # Get the common genes (signature genes)
    raw_input = sc.read_h5ad(training_data)
    sig_genes_complete = list(raw_input.var_names)
    sig_genes = get_signature_genes(input_path=data_path,
                                    sig_genes_complete=sig_genes_complete)

    # Pre-process data with new signature genes
    preprocess_h5ad_data(raw_input_path=training_data,
                         processed_path=processed_path,
                         sig_genes=sig_genes)
Example #9
from os.path import splitext
from h5py import File
from scanpy import read_10x_h5, read_h5ad, read_loom

def read_anndata(input, genome=None):
    _, input_ext = splitext(input)
    if input_ext == ".h5":
        if not genome:
            keys = list(File(input, "r").keys())
            if len(keys) == 1:
                genome = keys[0]
            else:
                raise Exception(
                    "Set --genome flag when converting from 10x HDF5 (.h5) to Anndata HDF5 (.h5ad); top-level groups in file %s: %s"
                    % (input, ",".join(keys)))
        return read_10x_h5(input, genome=genome)
    elif input_ext == ".h5ad":
        return read_h5ad(input)
    elif input_ext == ".loom":
        # reads the whole dataset in memory!
        return read_loom(input)
    else:
        raise Exception("Unrecognized input extension: %s" % input_ext)
Example #10
    def load_data(data):
        if isfile(data):
            name, extension = splitext(data)
            if extension == ".h5ad":
                adata = sc.read_h5ad(data)
            elif extension == ".loom":
                adata = sc.read_loom(data)
            else:
                raise click.FileError(
                    data,
                    hint="does not have a valid extension [.h5ad | .loom]")
        elif isdir(data):
            if not data.endswith(sep):
                data += sep
            adata = sc.read_10x_mtx(data)
        else:
            raise click.FileError(data, hint="not a valid file or path")

        if set_obs_names != "":
            if set_obs_names not in adata.obs_keys():
                raise click.UsageError(
                    f"obs {set_obs_names} not found, options are: {adata.obs_keys()}"
                )
            adata.obs_names = adata.obs[set_obs_names]
        if set_var_names != "":
            if set_var_names not in adata.var_keys():
                raise click.UsageError(
                    f"var {set_var_names} not found, options are: {adata.var_keys()}"
                )
            adata.var_names = adata.var[set_var_names]
        if make_obs_names_unique:
            adata.obs_names_make_unique()
        if make_var_names_unique:
            adata.var_names_make_unique()
        if not adata.obs.index.is_unique:
            click.echo("Warning: obs index is not unique")
        if not adata.var.index.is_unique:
            click.echo("Warning: var index is not unique")

        return adata
Example #11
import os
import sys

import numpy as np
import scanorama
import scanpy.api as sc
import scipy as sp
import scipy.sparse  # makes sp.sparse available
from umap import UMAP

script_path = os.path.dirname(os.path.realpath(__file__))
output_dir = os.path.join(script_path, '../../Figures') + '/'
adata_scv_pru = sc.read_h5ad(output_dir + '../Data/pru/adata_sc_velocyto.h5ad')
adata_scv_me49 = sc.read_h5ad(output_dir +
                              '../Data/011_me49/adata_sc_velocyto.h5ad')

adatas = [adata_scv_me49.copy(), adata_scv_pru.copy()]
integrated, corrected = scanorama.correct_scanpy(adatas, return_dimred=True)
merged_x = np.concatenate(integrated)
umap_merged_x = UMAP(n_components=2,
                     random_state=4,
                     min_dist=0.3,
                     n_neighbors=50).fit_transform(merged_x)
adatas = corrected[0].concatenate(corrected[1])
adatas.obs_names = [x.split('-')[0] for x in adatas.obs_names]
adatas.obsm['X_corrected'] = merged_x
adatas.obsm['X_corrected_umap'] = umap_merged_x
adatas.layers['original_mat'] = sp.sparse.csr_matrix(
    np.concatenate([adata_scv_me49.X.A, adata_scv_pru.X.A]))
batch = ['ME49' if '10099011' in x else 'Pru' for x in adatas.obs_names]
adatas.obs['batch'] = batch

## Save scanorama results
adatas.write_h5ad(filename=output_dir +
                  '../Data/pru/adata_integrated_0506_me49.h5ad',
                  compression='gzip')
Example #12
def cli(dataset, engine, format, layout, recipe, output, sparse, plotting):
    """
    Hi! This is a tool for preprocessing data for use with cellxgene.
    """
    import matplotlib
    matplotlib.use('Agg')
    import scanpy.api as sc
    import pandas as pd
    import numpy as np

    # scanpy settings 
    sc.settings.verbosity = 2
    sc.settings.autosave = True

    # data loading
    adata = None

    if format == 'h5ad':
        adata = sc.read_h5ad(dataset)
    elif format == '10x_mtx':
        adata = sc.read_10x_mtx(dataset)
    elif format == 'loom':
        adata = sc.read_loom(dataset, sparse=sparse)
    else:
        raise ValueError("unrecognized format: {}".format(format))

    adata.var_names_make_unique()

    # run a recipe if requested
    if recipe == 'seurat':
        sc.pp.recipe_seurat(adata)
    elif recipe == 'zheng17':
        sc.pp.recipe_zheng17(adata)
    else:
        sc.pp.filter_cells(adata, min_genes=5)
        sc.pp.filter_genes(adata, min_cells=25)
        if sparse:
            sc.pp.scale(adata, zero_center=False)
        else:
            sc.pp.scale(adata)

    # dimensionality reduction
    if sparse:
        sc.pp.pca(adata, svd_solver='arpack', zero_center=False)
    else:
        sc.pp.pca(adata, svd_solver='arpack')

    # neighbors and clustering
    sc.pp.neighbors(adata)
    sc.tl.louvain(adata)

    # layout and plotting
    if len(np.unique(adata.obs['louvain'].values)) < 10:
        palette = 'tab10'
    else:
        palette = 'tab20'

    if layout == 'umap' or layout == 'umap+tsne':
        sc.tl.umap(adata)
        if plotting:
            sc.pl.umap(adata, color='louvain', palette=palette, save='_louvain')

    if layout == 'tsne' or layout == 'umap+tsne':
        sc.tl.tsne(adata)
        if plotting:
            sc.pl.tsne(adata, color='louvain', palette=palette, save='_louvain')    

    # show the structure
    print('data structure...')
    print(adata)

    # saving file
    if output != '':
        print('saving output...')
        adata.write(output)
Example #13
    4689, 4708, 4722, 4730, 4739, 4747, 4749, 4800
]) / 4800.0
cov_milestones = [
    2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40
]
dmx_perturb = 0
cov_round = reads_per_singlet / 1000
for i in range(len(cov_milestones)):
    if i == 0:
        if cov_round <= cov_milestones[i]:
            dmx_perturb = demux_perturbation[i]
    elif cov_milestones[i - 1] < cov_round <= cov_milestones[i]:
        dmx_perturb = demux_perturbation[i]

data = api.read_h5ad(
    "GROUND_TRUTH_CT_DMX/IGOR_120-individuals_3000-cells.h5ad")

matrix_ = data.X
observations_ = data.obs
observations_.index = observations_.index.astype(int)

genes = list(data.var["gene"])
individuals = list(observations_["ind_cov"].unique())

print(Counter(observations_["ind_cov"]))
# permute the individual labels due to multiplexing performance at different levels of coverage

ind_minus = {}
for x in individuals:
    ind_minus[x] = [y for y in individuals if y != x]
if dmx_perturb > 0:
Example #14
def import_data(data_p1, data_p2, create = True):
    """
    Utility function to import both samples together with proteins.
    
    Parameters
    --------
    data_p1, data_p2: str
        data paths
        
    Output
    --------
    adata: AnnData Object
    """

    if not create:
        path = '../data_update_cite_seq/tec_cite_h5_2019jun18'
        return sc.read_h5ad(path + '/adata.h5') 

    adata_list = []
    for d in [data_p1, data_p2]:
        p_genes = d + 'filtered_feature_bc_matrix/'
        features = pd.read_csv(p_genes + 'features.tsv', delimiter='\t', header=None)[1]
        barcodes = pd.read_csv(p_genes + 'barcodes.tsv', delimiter='\t', header=None)
        matrix = scipy.io.mmread(p_genes + 'matrix.mtx')
        adata = anndata.AnnData(matrix.tocsr())  # compared execution time with anndata.read_mtx and this is faster
        adata.obs.index = features.values.tolist()
        adata.var.index = barcodes[0].str.slice(stop=-2).values.tolist()  # index for cells
        # adata.var.index = barcodes[0].values.tolist()
        adata_list.append(adata)


    adata_r1 = adata_list[0].T
    adata_r2 = adata_list[1].T         
    adata_r1.var_names_make_unique()
    adata_r2.var_names_make_unique() 


    # Proteomic Data
    prot_list = []
    for d in [data_p1, data_p2]:
        p_prot = d + 'umi_count/'
        features = pd.read_csv(p_prot + 'features.tsv', delimiter='\t', header=None)[0].values.tolist() 
        features = [f[:f.find('-')] for f in features]
        features[-1] = features[-1] + 'd'
        barcodes = pd.read_csv(p_prot + 'barcodes.tsv', delimiter='\t', header=None)
        matrix = scipy.io.mmread(p_prot + 'matrix.mtx')
        prot = pd.DataFrame(data = matrix.todense(), index = features, columns = barcodes[0].values)
        prot_list.append(prot)

    prot_r1 = prot_list[0].T
    prot_r2 = prot_list[1].T

    protein_names = list(prot_r1.columns)

    # bring the adata object in the right order
    adata_r1 = adata_r1[prot_r1.index]
    adata_r2 = adata_r2[prot_r2.index]

    # make names unique before combining
    adata_r1.obs_names_make_unique()
    adata_r2.obs_names_make_unique()

    # combine these again
    adata = adata_r1.concatenate(adata_r2)

    # combine
    prot = pd.concat((prot_r1, prot_r2), axis=0)

    # add the proteins to the adata object
    adata.obsm['prot'] = prot.values

    # add the protein names
    adata.uns['prot_names'] = protein_names

    # Add some annotations 
    # mitochondrial genes
    mito_genes = [name for name in adata.var_names if name.startswith('mt-')]
    adata.obs['percent_mito'] = np.asarray(
        adata[:, mito_genes].X.sum(axis=1)).ravel() / np.asarray(adata.X.sum(axis=1)).ravel()
    adata.obs['n_counts_0'] = np.asarray(adata.X.sum(axis=1)).ravel()  # counts per cell
    adata.obs['n_genes_0'] = np.asarray((adata.X > 0).sum(axis=1)).ravel()  # genes detected per cell
    adata.var['n_cells_0'] = np.asarray((adata.X > 0).sum(axis=0)).ravel()  # cells expressing each gene
    adata.var['n_counts_gene'] = np.asarray(adata.X.sum(axis=0)).ravel()  # total counts per gene

    # add some protein annotations
    adata.obs['n_proteins'] = np.sum(adata.obsm['prot'][:, :-1] > 0, axis = 1)
    adata.obs['unmapped'] = adata.obsm['prot'][:, -1]

    #write h5 file
    adata.write('../data_update_cite_seq/tec_cite_h5_2019jun18/adata.h5')

    return adata
Example #15
import sys

import numpy as np
from collections import Counter
from scanpy import api
import pandas as pd
from scipy.sparse import csr_matrix, vstack
import time
import pickle

exname = sys.argv[1]
number_individuals = int(sys.argv[2])
singlets_per_individual = int(sys.argv[3])
multiplets_per_individual = int(sys.argv[4])
reads_per_singlet = int(sys.argv[5])
reads_per_doublet = int(sys.argv[6])

data = api.read_h5ad("IGOR_120-individuals_3000-cells.h5ad")

matrix_ = data.X
observations_ = data.obs
observations_.index = observations_.index.astype(int)

genes = list(data.var["gene"])
individuals = list(observations_["ind_cov"].unique())
if number_individuals < len(individuals):
    individuals = np.random.choice(list(observations_["ind_cov"].unique()),
                                   size=number_individuals,
                                   replace=False)

good_rows = []
doublets = {}
for individual in individuals:
Example #16
import scanpy as sc

def merge_anndatas(anndata_paths, output_path):

    first_adata = sc.read_h5ad(anndata_paths[0])
    # concatenate takes *adatas, so unpack the remaining files
    concat_adata = first_adata.concatenate(
        *(sc.read_h5ad(a) for a in anndata_paths[1:]))
    concat_adata.write(output_path)
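A one-line usage sketch with hypothetical paths:
merge_anndatas(["batch1.h5ad", "batch2.h5ad", "batch3.h5ad"], "merged.h5ad")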
Example #17
#!/usr/bin/env python
# coding: utf-8

# import
import os
import sys
import scanpy.api as sc
from anndata import AnnData

sample_name = sys.argv[1]
cluster_to_filter = sys.argv[2]

print('filtering clusters: {0} for {1}'.format(cluster_to_filter, sample_name))

wd = os.path.join(os.getcwd(), sample_name)
adata = sc.read_h5ad(
    filename=os.path.join(wd, '{0}.adata.h5ad'.format(sample_name)))

barcodes = adata.obs.index[adata.obs.leiden.isin([cluster_to_filter])]
filename = os.path.join(wd, '{}.filtered.txt'.format(sample_name))

print('total {0} low quality cells will be recorded in {1}'.format(
    len(barcodes), filename))

with open(filename, 'w') as f:
    f.writelines('\n'.join(barcodes))

print('END')
Example #18
def createClusterFigure(doc):
    active_gene = None
    print('Starting Document....')

    if doc.session_context.request.arguments is not None:
        args = doc.session_context.request.arguments

    dataPath = doc.session_context.db_path
    if dataPath != 'None':
        dataSet = sc.read_h5ad(dataPath)
    else:
        dataSet = sc.read_h5ad('/app/ProcessedData.h5ad')

    geneList = [args[x][0].decode() for x in args.keys() if 'Gene' in x]
    if 'None' not in geneList and len(geneList) != 0:
        print(geneList[0])
        active_gene = geneList[0]

    def makePlot(doc, active_gene, adata):

        cdsDict = {}
        cdsDict['x'] = adata.obsm['X_umap'][:, 0]
        cdsDict['y'] = adata.obsm['X_umap'][:, 1]

        single_gene_colors = []
        # Color by Cluster
        color_Dict = dict(zip(adata.obs['louvain'].cat.categories, adata.uns['louvain_colors']))
        colors = [
            color_Dict[cluster] for cluster in adata.obs['louvain'] if cluster in color_Dict.keys()
        ]
        cdsDict['color'] = colors

        # Color by n_genes
        gene_colors = []
        for x, y, z, _ in 255 * mpl.cm.viridis(mpl.colors.Normalize()(adata.obs['n_genes'].tolist())):
            gene_colors.append("#%02x%02x%02x" % (int(x), int(y), int(z)))

        cdsDict['gene_colors'] = gene_colors

        if active_gene is not None:
            # Sort matrix by gene column, then normalize the count values
            geneExpression = adata.X[:, adata.var.index == active_gene].flatten()
            single_gene_colors = []
            for x, y, z, _ in 255 * mpl.cm.viridis(mpl.colors.Normalize()(geneExpression)):
                single_gene_colors.append("#%02x%02x%02x" % (int(x), int(y), int(z)))
            if len(single_gene_colors) == 2638:
                cdsDict['single_gene'] = single_gene_colors

        source = ColumnDataSource(cdsDict)
        # source = ColumnDataSource(dict( x=adata.obsm['X_umap'][:, 0], y=adata.obsm['X_umap'][:, 1], color=colors, gene_colors=gene_colors, single_gene=single_gene_colors))
        title = 'T-SNE visualization of sequences'

        geneTitle = 'n_genes'

        plotDict = {}

        plot_lda = figure(plot_width=800, plot_height=600, title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave", x_axis_type=None, y_axis_type=None, min_border=1)

        plot_lda.scatter(x='x', y='y', legend='label', source=source, color='color',
                         alpha=0.8, size=5)

        plotDict['tsne'] = plot_lda

        genePlot = figure(plot_width=800, plot_height=600, title=geneTitle, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave", x_axis_type=None, y_axis_type=None, min_border=1)
        genePlot.scatter(x='x', y='y', legend='label', source=source, color='gene_colors',
                         alpha=0.8, size=5)

        plotDict['nGene'] = genePlot

        if 'single_gene' in cdsDict.keys():
            singleGene = figure(plot_width=800, plot_height=600, title=active_gene, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave", x_axis_type=None, y_axis_type=None, min_border=1)
            singleGene.scatter(x='x', y='y', legend='label', source=source, color='single_gene', alpha=0.8, size=5)

            plotDict['sGene'] = singleGene

        return plotDict

    def update(new):
        active_gene = geneList[new]
        sgCol.children[0] = makePlot(doc, active_gene, dataSet)['sGene']

    plotDict = makePlot(doc, active_gene, dataSet)

    # hover tools
    hover = plotDict['tsne'].select(dict(type=HoverTool))
    hover.tooltips = {"content": "Sequence: @seq, CCS: @ccs, Charge: @charge "}
    plotDict['tsne'].legend.location = "top_left"

    button_group = RadioButtonGroup(labels=geneList)
    button_group.on_click(update)

    tabList = []

    if 'sGene' in plotDict.keys():
        controls = widgetbox([button_group], width=800)
        sgCol = column(plotDict['sGene'], controls)
        sgTab = Panel(child=sgCol, title="Single Gene")
        tabList.append(sgTab)

    tsneTab = Panel(child=plotDict['tsne'], title="Louvain")
    tabList.append(tsneTab)

    nGeneTab = Panel(child=plotDict['nGene'], title="nGene")
    tabList.append(nGeneTab)

    tabs = Tabs(tabs=tabList)

    doc.add_root(tabs)

    return doc
Example #19
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy.api as sc

PARENT_DIR = os.path.join(sys.path[0], '..')

dataset = sys.argv[1]
data_dir = "{}/results/downstream/{}".format(PARENT_DIR, dataset)

imputation = [
    "deepImpute", "DCA", "VIPER", "MAGIC", "SAVER", "scImpute", "DrImpute",
    "raw"
]

raw = sc.read_h5ad('{}/paper_data/downstream/raw_{}.h5ad'.format(
    PARENT_DIR, dataset))
cells = raw.obs.index
genes = raw.var.index
metadata = raw.obs.celltype.values

components_all = []
colnames = ["UMAP_1", "UMAP_2"]
for method in imputation:
    try:
        data = np.load("{}/results/downstream/UMAP/{}/{}.npy".format(
            PARENT_DIR, dataset, method))
        df = pd.DataFrame(data, index=cells, columns=colnames)
        df["meta"] = metadata
        df["imputation"] = method
        components_all.append(df)
    except:
Example #20
    def asScanpy(self):
        adata = sc.read_h5ad("./rdata/h5/assays.h5")
        return adata
Example #21
import scanpy.api as sc
import scipy.sparse as sp_sparse

# andata = sc.read_h5ad("./ExprMatrix.h5ad")
andata = sc.read_h5ad("./100_test_data.h5ad")
print("Finished reading.")
andata.var_names_make_unique()
if sp_sparse.issparse(andata.X):
    andata.X = andata.X.toarray()
partial_data = andata[:100, :]
print("Finished processing")
sc.write("100_test_data.h5ad", partial_data)
print("Finished writing.")