Example #1
import scanpy as sc  # assumed import

def annotate_cell_cycle_scores_human(
    adata_results_file,
    cell_cycle_file='/ahg/regevdata/users/oursu/code/general_data/cellcycle/regev_lab_cell_cycle_genes.txt'
):
    with open(cell_cycle_file) as f:
        cell_cycle_genes = [x.strip() for x in f]

    # the Regev lab list stores the 43 S-phase genes first, then the G2/M genes
    s_genes = cell_cycle_genes[:43]
    g2m_genes = cell_cycle_genes[43:]

    adata_cellcycle = sc.read(adata_results_file + '.basic.h5ad')

    sc.pp.log1p(adata_cellcycle)
    sc.pp.scale(adata_cellcycle)

    sc.tl.score_genes_cell_cycle(adata_cellcycle,
                                 s_genes=s_genes,
                                 g2m_genes=g2m_genes)

    adata_annotated = sc.read(adata_results_file + '.basic.h5ad')
    #now, assign the cell cycle scores from the adata_cellcycle to adata_annotated
    s_scores = adata_cellcycle.obs['S_score']
    g2m_scores = adata_cellcycle.obs['G2M_score']
    adata_annotated.obs['S_score_added'] = s_scores.loc[
        adata_annotated.obs_names]
    adata_annotated.obs['G2M_score_added'] = g2m_scores.loc[
        adata_annotated.obs_names]

    adata_annotated.write(adata_results_file + '.basic.cc.h5ad')
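
A minimal usage sketch for the function above. The prefix and file names are hypothetical; a matching 'results/sample1.basic.h5ad' must already exist, and the gene list file holds one symbol per line with the S-phase genes first.

annotate_cell_cycle_scores_human(
    'results/sample1',
    cell_cycle_file='regev_lab_cell_cycle_genes.txt')
# writes 'results/sample1.basic.cc.h5ad' with S_score_added / G2M_score_added in .obs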
Example #2
    def __init__(self, ah5_path, scanpy_pca):
        self.prefix = ah5_path.split("/")[0]
        self.scanpy_pca = scanpy_pca
        self.adata_dict = {}


        # Create output directories
        for output_dir in glob.glob(ah5_path+"/*"):
            # Made change for in vitro data
            if output_dir.split("/")[-1] == 'preprocessing_summary':
                pass
            else:
                try:
                    if not os.path.exists(output_dir + "/principle_component_matrices"):
                        os.makedirs(output_dir + "/principle_component_matrices")
                    if not os.path.exists(output_dir + "/principle_component_analysis_figures"):
                        os.makedirs(output_dir + "/principle_component_analysis_figures")
                except OSError:
                    print("Error creating directory")

        for processed_file in glob.glob(ah5_path+"*/gene_matrices/*.h5ad"):
            # Made changes for in vitro data
            if len(processed_file.split('/')[-1].split('_')) <= 2:
                print(processed_file)
                self.adata_dict[processed_file.split('/')[-1].split('.')[0]] = sc.read(processed_file)
Example #3
def generate_cluster_expression_output_file(sample_cluster_dict,
                                            raw_log_scanpy_obj, output_dir):
    """
    :param sample_clustered_dict:
    :param output_dir:
    return:
    """
    for sample_key, adata in sample_cluster_dict.items():
        sample_cluster_dict[sample_key].raw = sc.read(raw_log_scanpy_obj)
        for cluster in list(set(adata.obs.louvain)):
            cluster_specific_cells = sample_cluster_dict[sample_key].obs.loc[(
                sample_cluster_dict[sample_key].obs.louvain == cluster
            )].index.tolist()
            sample_cluster_subset = sample_cluster_dict[sample_key][
                cluster_specific_cells, :]
            try:
                # this works for sparse matrices
                subset_df = pd.DataFrame(
                    data=sample_cluster_subset.raw.X.toarray(),
                    index=sample_cluster_subset.obs.index.tolist(),
                    columns=sample_cluster_subset.raw.var.index.tolist())
            except AttributeError:
                # AttributeError occurs for dense (non-sparse) matrices (no .toarray())
                subset_df = pd.DataFrame(
                    data=sample_cluster_subset.raw.X,
                    index=sample_cluster_subset.obs.index.tolist(),
                    columns=sample_cluster_subset.raw.var.index.tolist())
            subset_df.to_csv(output_dir + sample_key + '/' + sample_key +
                             '_cluster_' + cluster + '.tsv',
                             sep='\t',
                             index=True)
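
A hedged usage sketch for the function above; the sample name and paths are hypothetical, each AnnData must carry Louvain labels in .obs['louvain'], and the 'results/sampleA' directory must already exist.

import scanpy as sc  # assumed import

clustered = {'sampleA': sc.read('sampleA.clustered.h5ad')}
generate_cluster_expression_output_file(clustered, 'sampleA.raw_log.h5ad',
                                        'results/')
# writes results/sampleA/sampleA_cluster_<k>.tsv, one TSV per Louvain cluster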
Example #4
    def read_raw_file(self):
        """
        Reads the raw data file and turns it into a dense matrix, stored as
        an attribute.

        Returns
        -------

        """

        print("reading single cell data from {}".format(self.raw_file))

        file_format = self.raw_file.split('.')[-1]

        if file_format == 'h5':
            adata = sc.read_10x_h5(self.raw_file)
        elif file_format == 'h5ad':
            adata = sc.read(self.raw_file)
        else:
            raise ValueError('Reading [ %s ] failed: the inferred file '
                             'format [ %s ] is not supported. Please convert '
                             'your file to either h5 or h5ad format.' %
                             (self.raw_file, file_format))

        # appends -1, -2, ... to the names of genes that already exist
        adata.var_names_make_unique()
        if sp_sparse.issparse(adata.X):
            adata.X = adata.X.toarray()

        self.sc_raw = adata
Example #5
def load_10x_scanpy(path, batch_label):
    sc01 = sc.read('{}/matrix.mtx'.format(path), cache=True).T
    sc01.var_names = pd.read_table('{}/genes.tsv'.format(path), header=None)[1]
    sc01.obs_names = pd.read_table('{}/barcodes.tsv'.format(path),
                                   header=None)[0]
    # strip the trailing "-1" barcode suffix (anchored so only the suffix is removed)
    sc01.obs_names = sc01.obs_names.str.replace(r'-1$', '', regex=True)
    sc01.var_names_make_unique()
    sc.pp.filter_cells(sc01, min_genes=200)
    sc.pp.filter_genes(sc01, min_cells=3)

    sc01.obs['n_UMI'] = np.sum(sc01.X, axis=1).A1

    mito_genes = sc01.var_names[sc01.var_names.str.match(r'^mt-')]
    sc01.obs['percent_mito'] = np.sum(sc01[:, mito_genes].X,
                                      axis=1).A1 / sc01.obs['n_UMI']

    ribo_genes = sc01.var_names[sc01.var_names.str.match(
        r'^(Rpl|Rps|Mrpl|Mrps)')]
    sc01.obs['percent_ribo'] = np.sum(sc01[:, ribo_genes].X,
                                      axis=1).A1 / sc01.obs['n_UMI']

    assgn = pd.read_csv('{}/{}_assgn.csv'.format(
        os.path.join(CUR_DIR, '..', '01-cluster-sc01-sc02'),
        batch_label,
    ),
                        index_col=0)
    assgn.columns = ['cluster']

    sc01.obs['cluster'] = assgn.cluster[sc01.obs.index]
    return sc01
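
A hedged usage sketch; the 10x directory and batch label are hypothetical, and the function additionally expects the '<batch_label>_assgn.csv' assignment file at the relative path hard-coded above.

# 'data/sc01' must contain matrix.mtx, genes.tsv and barcodes.tsv
sc01 = load_10x_scanpy('data/sc01', batch_label='sc01')
print(sc01.obs[['n_UMI', 'percent_mito', 'percent_ribo', 'cluster']].head())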
Example #6
def read_dataset(adata, transpose=False, test_split=False, copy=False):

    if isinstance(adata, sc.AnnData):
        if copy:
            adata = adata.copy()
    elif isinstance(adata, str):
        adata = sc.read(adata, first_column_names=True)
    else:
        raise NotImplementedError

    # sanity-check the first 10 rows: adata.X must contain raw (integer) counts
    X_subset = adata.X[:10]
    norm_error = 'Make sure that the dataset (adata.X) contains unnormalized count data.'
    if sp.sparse.issparse(X_subset):
        assert (X_subset.astype(int) != X_subset).nnz == 0, norm_error
    else:
        assert np.all(X_subset.astype(int) == X_subset), norm_error

    if transpose: adata = adata.transpose()

    if test_split:
        train_idx, test_idx = train_test_split(np.arange(adata.n_obs),
                                               test_size=0.1,
                                               random_state=42)
        spl = pd.Series(['train'] * adata.n_obs)
        spl.iloc[test_idx] = 'test'
        adata.obs['dca_split'] = spl.values
    else:
        adata.obs['dca_split'] = 'train'

    adata.obs['dca_split'] = adata.obs['dca_split'].astype('category')
    print('dca: Successfully preprocessed {} genes and {} cells.'.format(
        adata.n_vars, adata.n_obs))

    return adata
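
A minimal usage sketch, assuming a raw-count .h5ad at a hypothetical path; the asserts above require that adata.X still holds unnormalized integer counts.

adata = read_dataset('counts_raw.h5ad', test_split=True)
print(adata.obs['dca_split'].value_counts())  # roughly 90% 'train', 10% 'test'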
Example #7
def main():
    # check there is exactly one command-line argument provided
    if len(sys.argv) != 2:
        sys.stderr.write("usage: " + __file__ + " <adata-file-path>\n")
        sys.exit(1)

    anndata = sc.read(sys.argv[1])

    # Run PCA, compute pairwise distances, and build the k-nearest-neighbors
    # graph. The trick to replacing the scanpy implementation with our own is
    # to update the anndata object with our intermediate values:
    #   - replace scanpy's PCA by setting adata.obsm['X_pca'] to our PCA() output
    #   - replace scanpy's k-nearest-neighbors by setting
    #     adata.uns['neighbors']['connectivities'] and
    #     adata.uns['neighbors']['distances'] to our knn() output
    knng = KnnG(anndata, n_neighbors=12, runPCA=True, nPC=50)

    # umap() reduces the results to 2 dimensions so that we can plot the data
    sc.tl.umap(anndata)


    # ### 1.c. [5 pts] Turn in a UMAP plot of your 12-NN graph calculated from the combined chemistry PBMC dataset
    # colored by batch (the chemistry used)
    sc.pl.umap(anndata, color=['Method'])

    # ### 1.d. [5 pts] Turn in another UMAP plot of your 12-NN graph calculated from the combined chemistry PBMC dataset
    # but colored by cell type
    sc.pl.umap(anndata, color=['Cell type'])
Example #8
    def testAdata(self):
        '''
        The real deal
        '''
        self.logger.info("BEGIN")
        anndata = sc.read("../PBMC.merged.h5ad")

        xxx = anndata.uns['neighbors']['connectivities']

        # anndata.uns['neighbors']['connectivities'] csr_matrix
        # shape <class 'tuple'>: (15476, 15476)
        #
        # adata.obs['louvain']
        # Series: index
        # data_3p-AAACCTGAGCATCATC-0     9
        # data_3p-AAACCTGAGCTAGTGG-0     5
        # <class 'tuple'>: (15476,)

        # knn takes about 3 or 4 min
        # run our implementation of nearest neighbors and update anndata
        # TODO: try running without KnnG; adata may already have the values, which would save time
        KnnG(anndata, n_neighbors=12, runPCA=True, nPC=50)

        self.logger.info("begin Louvain.runWithAdata")
        start = timer()
        root = Louvain.runWithAdata(anndata)
        end = timer()
        self.logger.info("Louvain.runWithAdata execution time:{}"\
                         .format(timedelta(seconds=end-start)))

        self.logger.info("END\n")
Example #9
def Smillie2019_processed():
    """Processed data from Smillie et al. Intra- and Inter-cellular Rewiring of the Human Colon during Ulcerative Colitis. Cell. 2019

    The data consists of processed single-cell expression data from the colon mucosa of 7 ulcerative colitis (UC) patients and 10 healthy controls, with paired samples (inflamed and non-inflamed for UC, location-matched for healthy): 34 samples in total. Epithelial (EPI) and lamina propria (LP) fractions were enriched in a two-step digestion process. The data was filtered, batch-corrected using BBKNN, and cell types were annotated.

    Returns
    -------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
            
    Example
    -------

    >>> import besca as bc
    >>> adata = bc.datasets.Smillie2019_processed()
    >>> adata
    
    """

    filename = pkg_resources.resource_filename(
        'besca', 'datasets/data/Smillie2019_processed.h5ad')
    adata = read(filename, cache=True)
    return adata
Example #10
def read10xData(path,min_genes):
    """ 
    Reads, precesses and returns single cell data. 
    
    Parameters
    ----------
    path : Str
        Diractory path, the location of single cell data.
        
    min_genes : Int
        The minimum number of genes for a cell to have in order to participate the analysis.
        
    Returns
    -------
    scData : AnnData
        Single cell data. 
        
    """
    
    result = sc.read(path + 'matrix.mtx').transpose() #, cache=True
    result.var_names = np.genfromtxt(path + 'genes.tsv', dtype=str)[:, 1]
    result.obs_names = np.genfromtxt(path + 'barcodes.tsv', dtype=str)
    result.var_names_make_unique()
    result.obs['n_counts'] = np.sum(result.X, axis=1).A1
    sc.pp.filter_cells(result, min_genes=min_genes)
    
    return result
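
A hedged usage sketch; the directory path is hypothetical, and the trailing slash matters because the function concatenates file names onto it directly.

scData = read10xData('filtered_matrices/', min_genes=200)
print(scData.obs['n_counts'].describe())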
Example #11
def generate_sample_expression_output_file(sample_cluster_dict,
                                           raw_log_scanpy_obj, output_dir):
    """
    :param sample_clustered_dict:
    :param output_dir:
    return:
    """
    for sample_key, adata in sample_cluster_dict.items():
        sample_cluster_dict[sample_key].raw = sc.read(raw_log_scanpy_obj)
        #sample_expression_df = pd.DataFrame(adata.X, index = adata.obs.index.tolist(), columns = adata.var.index.tolist())
        try:
            # this works for sparse matrices
            sample_expression_df = pd.DataFrame(
                data=sample_cluster_dict[sample_key].raw.X.toarray(),
                index=sample_cluster_dict[sample_key].obs.index.tolist(),
                columns=sample_cluster_dict[sample_key].raw.var.index.tolist())
        except AttributeError:
            # AttributeError occurs for dense (non-sparse) matrices (no .toarray())
            sample_expression_df = pd.DataFrame(
                data=sample_cluster_dict[sample_key].raw.X,
                index=sample_cluster_dict[sample_key].obs.index.tolist(),
                columns=sample_cluster_dict[sample_key].raw.var.index.tolist())
        sample_expression_df.to_csv(output_dir + sample_key +
                                    '_log_expression.tsv',
                                    sep='\t',
                                    index=True)
Example #12
    def __init__(self, pca_h5ad, n_pcs, n_neighbors):

        self.adata_dict = {}
        self.prefix = pca_h5ad

        for processed_file in glob.glob(
                pca_h5ad + "*/principle_component_matrices/*.h5ad"):
            self.adata_dict[processed_file.split('/')[-3]] = sc.read(
                processed_file)

        for output_dir in glob.glob(pca_h5ad + "*/"):
            # glob paths ending in "/" split to an empty last element,
            # so the directory name is the second-to-last element
            if output_dir.split("/")[-2] == 'preprocessing_summary':
                pass
            else:
                try:
                    if not os.path.exists(output_dir + '/cluster_matrices'):
                        os.makedirs(output_dir + '/cluster_matrices')

                    if not os.path.exists(output_dir + "/cluster_analysis"):
                        os.makedirs(output_dir + "/cluster_analysis")

                    # if not os.path.exists(output_dir + "/cluster_analysis/tSNE"):
                    #     os.makedirs(output_dir + "/cluster_analysis/tSNE")
                    #
                    # if not os.path.exists(output_dir + "/cluster_analysis/umap"):
                    #     os.makedirs(output_dir + "/cluster_analysis/umap")
                    #
                    # if not os.path.exists(output_dir + "/cluster_analysis/louvain"):
                    #     os.makedirs(output_dir + "/cluster_analysis/louvain")

                except OSError:
                    print("Error creating directory")

        self.n_pcs = n_pcs
        self.n_neighbors = n_neighbors
Example #13
def read_h5ad(args):

    #read input h5ad
    dataset = sc.read(args.input)
    print("File read!")

    compute_entropy(dataset)
Example #14
def getdata(dataset):
    basedir = os.path.abspath(os.path.join(__file__, ".."))
    if dataset == "green":
        adata = sc.read(basedir + "/data/green/green.h5ad")
        process_clusts(adata, "CellType")
    elif dataset == "paul":
        adata = sc.read(basedir + "/data/paul/paul.h5ad")
        process_clusts(adata, "paul15_clusters")
    elif dataset == "zeisel":
        adata = sc.read(basedir + "/data/zeisel/zeisel.h5ad")
        process_clusts(adata, "group")
    elif dataset == "zheng":
        adata = sc.read(basedir + "/data/zheng/fresh_68k_bulk_labels.h5ad")
        process_clusts(adata)
    else:
        raise ValueError("No such dataset")
    return adata
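
A minimal usage sketch; the .h5ad files must exist under data/ next to the module, and process_clusts is assumed to be defined elsewhere in the same package.

adata = getdata("paul")  # loads data/paul/paul.h5ad and annotates clusters
try:
    getdata("unknown")
except ValueError as err:
    print(err)  # "No such dataset"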
Example #15
def check_dl(filename, url):
    try:
        adata = read(filename, backup_url=url, cache=True)
    except Exception:
        raise URLError(
            f'\n\n\n {filename} could not be downloaded from {url}; \n Please download it manually and store it in your besca installation: besca/datasets/data/'
        )
    return adata
Example #16
def train(data_name="pbmc", cell_type="CD4T", p_type="unbiased"):
    train_path = f"../data/train_{data_name}.h5ad"
    if data_name == "pbmc":
        ctrl_key = "control"
        stim_key = "stimulated"
        cell_type_key = "cell_type"
    elif data_name == "hpoly":
        ctrl_key = "Control"
        stim_key = "Hpoly.Day10"
        cell_type_key = "cell_label"
    elif data_name == "salmonella":
        ctrl_key = "Control"
        stim_key = "Salmonella"
        cell_type_key = "cell_label"
    data = sc.read(train_path)
    print("data has been loaded!")
    train = data[~((data.obs["condition"] == stim_key) &
                   (data.obs[cell_type_key] == cell_type))]
    pca = PCA(n_components=100)

    pca.fit(train.X.A)

    # use the dataset-specific condition keys rather than hard-coded pbmc labels
    train_real_cd = train[train.obs["condition"] == ctrl_key, :]
    if p_type == "unbiased":
        train_real_cd = scgen.util.balancer(train_real_cd)
    train_real_stimulated = train[train.obs["condition"] == stim_key, :]
    if p_type == "unbiased":
        train_real_stimulated = scgen.util.balancer(train_real_stimulated)

    import scipy.sparse as sparse
    if sparse.issparse(train_real_cd.X):
        train_real_cd.X = train_real_cd.X.A
        train_real_stimulated.X = train_real_stimulated.X.A

    train_real_stimulated_PCA = pca.transform(train_real_stimulated.X)
    train_real_cd_PCA = pca.transform(train_real_cd.X)

    adata_list = scgen.util.extractor(data, cell_type, {
        "ctrl": ctrl_key,
        "stim": stim_key
    })
    if sparse.issparse(adata_list[1].X):
        adata_list[1].X = adata_list[1].X.A
        adata_list[2].X = adata_list[2].X.A
    ctrl_CD4T_PCA = pca.transform(adata_list[1].X)
    predicted_cells = predict(pca, train_real_cd_PCA,
                              train_real_stimulated_PCA, ctrl_CD4T_PCA, p_type)

    all_Data = sc.AnnData(
        np.concatenate([adata_list[1].X, adata_list[2].X, predicted_cells]))
    all_Data.obs["condition"] = ["ctrl"] * len(adata_list[1].X) + ["real_stim"] * len(adata_list[2].X) + \
                                ["pred_stim"] * len(predicted_cells)
    all_Data.var_names = adata_list[3].var_names
    if p_type == "unbiased":
        sc.write(f"../data/reconstructed/PCAVecArithm/PCA_CD4T.h5ad", all_Data)
    else:
        sc.write(f"../data/reconstructed/PCAVecArithm/PCA_CD4T_biased.h5ad",
                 all_Data)
Example #17
    def __init__(self, cluster_h5ad, marker_gene_file, cell_type):

        self.prefix = cluster_h5ad
        self.cell_type = cell_type

        for clusters in glob.glob(self.prefix + "/*"):
            if clusters.split("/")[-1] != 'preprocessing_summary':
                # Create the marker gene and cell type analysis directory trees
                subdirs = [
                    "cluster_analysis",
                    "cluster_analysis/marker_gene_analysis",
                    "cluster_analysis/marker_gene_analysis/tSNE",
                    "cluster_analysis/marker_gene_analysis/louvain",
                    "cluster_analysis/marker_gene_analysis/umap",
                    "cluster_analysis/marker_gene_analysis/heatmap",
                    "cluster_analysis/marker_gene_analysis/dotplot",
                    "cluster_analysis/marker_gene_analysis/cluster_gene_rankings",
                    "cluster_analysis/cell_type_analysis",
                    "cluster_analysis/cell_type_analysis/tSNE",
                    "cluster_analysis/cell_type_analysis/louvain",
                    "cluster_analysis/cell_type_analysis/umap",
                ]
                try:
                    for subdir in subdirs:
                        if not os.path.exists(clusters + "/" + subdir):
                            os.makedirs(clusters + "/" + subdir)
                except OSError:
                    print("Error creating directory.")

        # Create dictionary containing the cell cluster matrices
        self.cluster_matrices_dict = {}
        for cluster_matrices in glob.glob(self.prefix + "/*/cluster_matrices/*"):
            self.cluster_matrices_dict[cluster_matrices.split('/')[-3]] = sc.read(cluster_matrices)

        # Create marker gene list with provided txt file
        self.marker_gene_file = marker_gene_file
        if self.marker_gene_file:
            self.marker_gene_list = list(map(lambda x: x.strip(), marker_gene_file.readlines()))
Example #18
def adata_neighbors():
    adata = sc.read('./data/pbmc3k_raw.h5ad',
                    backup_url='http://falexwolf.de/data/pbmc3k_raw.h5ad')
    sc.pp.filter_genes(adata, min_cells=1)
    sc.pp.normalize_per_cell(adata)
    sc.pp.log1p(adata)
    sc.pp.pca(adata)
    sc.pp.neighbors(adata)
    return adata
Example #19
def reconstruct():
    train_path = "../data/train_pbmc.h5ad"
    data = sc.read(train_path)
    ctrl_key = "control"
    stim_key = "stimulated"
    all_data = anndata.AnnData()
    print(data.obs["cell_type"].unique().tolist())
    for idx, cell_type in enumerate(data.obs["cell_type"].unique().tolist()):
        pca = PCA(n_components=100)
        train = data[~((data.obs["condition"] == stim_key) &
                       (data.obs["cell_type"] == cell_type))]
        pca.fit(train.X.A)
        print(cell_type, end="\t")
        train_real_stimulated = data[data.obs["condition"] == stim_key, :]
        train_real_stimulated = train_real_stimulated[
            train_real_stimulated.obs["cell_type"] != cell_type]
        train_real_stimulated = scgen.util.balancer(train_real_stimulated)
        train_real_stimulated_PCA = pca.transform(train_real_stimulated.X)

        train_real_cd = data[data.obs["condition"] == ctrl_key, :]
        train_real_cd = scgen.util.balancer(train_real_cd)
        train_real_cd_PCA = pca.transform(train_real_cd.X)

        cell_type_adata = data[data.obs["cell_type"] == cell_type]
        cell_type_ctrl = cell_type_adata[cell_type_adata.obs["condition"] ==
                                         ctrl_key]
        cell_type_stim = cell_type_adata[cell_type_adata.obs["condition"] ==
                                         stim_key]
        if sparse.issparse(cell_type_ctrl.X):
            cell_type_ctrl_PCA = pca.transform(cell_type_ctrl.X.A)
        else:
            cell_type_ctrl_PCA = pca.transform(cell_type_ctrl.X)
        predicted_cells = predict(pca, train_real_cd_PCA,
                                  train_real_stimulated_PCA,
                                  cell_type_ctrl_PCA)
        if sparse.issparse(cell_type_ctrl.X):
            all_Data = sc.AnnData(
                np.concatenate(
                    [cell_type_ctrl.X.A, cell_type_stim.X.A, predicted_cells]))
        else:
            all_Data = sc.AnnData(
                np.concatenate(
                    [cell_type_ctrl.X, cell_type_stim.X, predicted_cells]))
        all_Data.obs["condition"] = [f"{cell_type}_ctrl"] * cell_type_ctrl.shape[0] + [f"{cell_type}_real_stim"] * \
                                    cell_type_stim.shape[0] + \
                                    [f"{cell_type}_pred_stim"] * len(predicted_cells)
        all_Data.obs["cell_type"] = [f"{cell_type}"] * (
            cell_type_ctrl.shape[0] + cell_type_stim.shape[0] +
            len(predicted_cells))
        all_Data.var_names = cell_type_adata.var_names

        if idx == 0:
            all_data = all_Data
        else:
            all_data = all_data.concatenate(all_Data)
        print(cell_type)
    sc.write("../data/reconstructed/PCAVecArithm/PCA_pbmc.h5ad", all_data)
Example #20
def read_h5ad(x):
    #read input h5ad
    dataset = sc.read(x)

    kwargs = {}
    kwargs["batch_vector"] = dataset.obs["Batch"]
    kwargs["cell_type_vector"] = dataset.obs["cell_type1"]

    do_the_filtering(dataset, **kwargs)
Example #21
def load_10x_12k_mix_mouse():
    filename_data = '/data/martin/single_cell/10x_12k_mix/filtered_gene_bc_matrices/mm10/matrix.mtx'
    filename_genes = '/data/martin/single_cell/10x_12k_mix/filtered_gene_bc_matrices/mm10/genes.tsv'
    filename_barcodes = '/data/martin/single_cell/10x_12k_mix/filtered_gene_bc_matrices/mm10/barcodes.tsv'

    data = sc.read(filename_data, cache=True).transpose()
    data.var_names = np.genfromtxt(filename_genes, dtype=str)[:, 1]
    # obs_names (smp_names was the early-scanpy name for the same attribute)
    data.obs_names = np.genfromtxt(filename_barcodes, dtype=str)
    return data
Example #22
def paul_test(n_top_gene=100):
    adata = sc.read("data/paul15/paul15.h5ad")
    sc.pp.filter_cells(adata, min_genes=10)
    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
    sc.pp.filter_genes(adata, min_cells=20)
    # keep the n_top_gene most variable genes
    sc.pp.filter_genes_dispersion(adata, n_top_genes=n_top_gene)
    sc.pp.log1p(adata)
    sc.pp.scale(adata, zero_center=True, max_value=None)
    return adata
Example #23
def read_dataset(input_file, transpose=False):
    """
    Construct a anndata object
       
    """
    if os.path.isfile(input_file):
        print("The value os", os.path.isfile(input_file))
        if str(input_file).endswith('h5ad'):
            adata = sc.read(input_file)
        elif any(
                str(input_file).endswith(i)
                for i in ("tsv", "TSV", "tab", "data")):
            adata = sc.read_text(input_file, sep="\t", first_column_names=True)
            if transpose:
                adata = adata.T
        elif any(str(input_file).endswith(i) for i in ("csv", "CSV")):
            adata = sc.read_text(input_file, sep=",", first_column_names=True)
            if transpose:
                adata = adata.T
        else:
            raise ValueError(
                'The file must be one of *.h5ad, *.tsv, *.TSV, *.tab, *.data, *.csv, *.CSV'
            )
    else:
        #read folder
        mtx = sc.read_mtx(os.path.join(input_file, "matrix.mtx"))
        num_lines = sum(
            1 for line in open(os.path.join(input_file, 'barcodes.tsv')))
        cellinfo = pd.read_csv(os.path.join(input_file, "barcodes.tsv"),
                               sep="\t",
                               header=None if num_lines == mtx.shape[1] else 0)
        if 'cellname' not in cellinfo.columns:
            cellinfo['cellname'] = cellinfo.iloc[:, 0]
        num_lines = sum(
            1 for line in open(os.path.join(input_file, 'genes.tsv')))
        geneinfo = pd.read_csv(os.path.join(input_file, "genes.tsv"),
                               sep="\t",
                               header=None if num_lines == mtx.shape[0] else 0)
        if 'genename' not in geneinfo.columns:
            # for 10x, the second column is the gene name and the first is the gene id
            geneinfo['genename'] = geneinfo.iloc[:, 1]
        #create anndata
        adata = sc.AnnData(mtx.X.T, obs=cellinfo, var=geneinfo)
        adata.obs_names = adata.obs["cellname"]
        adata.var_names = adata.var["genename"]
        adata.obs_names_make_unique(join="-")
        adata.var_names_make_unique(join="-")
    # record creation time
    now = datetime.datetime.now()
    adata.uns["ProjectName"] = "DESC created in " + str(
        now.strftime("%Y-%m-%d %H:%M"))
    print("Created adata successfully! The adata information is", adata)
    return adata
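
A hedged usage sketch covering both branches of the function above; the file and folder names are hypothetical.

# single file: cells are stored in columns, so transpose after reading
adata = read_dataset('expression.tsv', transpose=True)

# 10x-style folder containing matrix.mtx, genes.tsv and barcodes.tsv
adata10x = read_dataset('filtered_gene_bc_matrices/')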
Example #24
def getAnnData(matrix, genelist, barcodes):
    sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)
    sc.settings.autoshow = False
    print('Reading matrix...')
    adata = sc.read(matrix, cache=False).T
    print(adata)
    print('Reading gene list...')
    genes = pd.read_csv(genelist, header=None, sep='\t')
    geneNames = anndata.utils.make_index_unique(pd.Index(genes[1]))
    adata.var_names = geneNames
    adata.var['gene_ids'] = genes[0].values
    adata.obs_names = pd.read_csv(barcodes, header=None)[0]

    adata.var_names_make_unique()

    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)

    mito_genes = [name for name in adata.var_names if name.startswith('MT-')]
    # for each cell compute fraction of counts in mito genes vs. all genes
    # the `.A1` is only necessary as X is sparse to transform to a dense array after summing
    adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X,
                                       axis=1).A1 / np.sum(adata.X, axis=1).A1
    # add the total counts per cell as observations-annotation to adata
    adata.obs['n_counts'] = adata.X.sum(axis=1).A1

    adata = adata[adata.obs['n_genes'] < 2500, :]
    adata = adata[adata.obs['percent_mito'] < 0.05, :]

    adata.raw = sc.pp.log1p(adata, copy=True)

    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
    filter_result = sc.pp.filter_genes_dispersion(adata.X,
                                                  min_mean=0.0125,
                                                  max_mean=3,
                                                  min_disp=0.5)

    adata = adata[:, filter_result.gene_subset]

    sc.pp.log1p(adata)

    sc.pp.regress_out(adata, ['n_counts', 'percent_mito'])

    sc.pp.scale(adata, max_value=10)

    sc.tl.pca(adata, svd_solver='arpack')

    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
    sc.tl.umap(adata)

    sc.tl.louvain(adata)
    sc.pl.umap(adata, color=['louvain'], show=False)

    sc.tl.rank_genes_groups(adata, 'louvain', method='logreg')

    return adata
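
A minimal usage sketch with hypothetical 10x file paths; the returned object already carries the Louvain clustering and the logreg marker ranking computed above.

adata = getAnnData('matrix.mtx', 'genes.tsv', 'barcodes.tsv')
sc.pl.rank_genes_groups(adata, n_genes=20)  # inspect the top markers per cluster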
Example #25
def main():

    ad_path, cs_name, output, gmt_file = parse_args()

    gene_sets = read_gmt(gmt_file)

    ad = sc.read(ad_path)
    percentage_markers_expressed_in_cluster(ad, cs_name, gene_sets).to_csv(
        output, header=True)
Example #26
def normalize(adata, copy=True, highly_genes = None, filter_min_counts=True, 
              size_factors=True, normalize_input=True, logtrans_input=True):
    """
    Normalizes input data and retains only most variable genes 
    (indicated by highly_genes parameter)

    Args:
        adata ([type]): [description]
        copy (bool, optional): [description]. Defaults to True.
        highly_genes ([type], optional): [description]. Defaults to None.
        filter_min_counts (bool, optional): [description]. Defaults to True.
        size_factors (bool, optional): [description]. Defaults to True.
        normalize_input (bool, optional): [description]. Defaults to True.
        logtrans_input (bool, optional): [description]. Defaults to True.

    Raises:
        NotImplementedError: [description]

    Returns:
        [type]: [description]
    """
    if isinstance(adata, sc.AnnData):
        if copy:
            adata = adata.copy()
    elif isinstance(adata, str):
        adata = sc.read(adata)
    else:
        raise NotImplementedError
    norm_error = 'Make sure that the dataset (adata.X) contains unnormalized count data.'
    assert 'n_count' not in adata.obs, norm_error
    if adata.X.size < 50e6: # check if adata.X is integer only if array is small
        if sp.sparse.issparse(adata.X):
            assert (adata.X.astype(int) != adata.X).nnz == 0, norm_error
        else:
            assert np.all(adata.X.astype(int) == adata.X), norm_error

    if filter_min_counts:
        sc.pp.filter_genes(adata, min_counts=1)#3
        sc.pp.filter_cells(adata, min_counts=1)
    if size_factors or normalize_input or logtrans_input:
        adata.raw = adata.copy()
    else:
        adata.raw = adata
    if size_factors:
        sc.pp.normalize_per_cell(adata)
        adata.obs['size_factors'] = adata.obs.n_counts / np.median(adata.obs.n_counts)
    else:
        adata.obs['size_factors'] = 1.0
    if logtrans_input:
        sc.pp.log1p(adata)
    if highly_genes is not None:
        sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5, n_top_genes = highly_genes, subset=True)
    if normalize_input:
        sc.pp.scale(adata)
    return adata
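
A minimal usage sketch, assuming a raw-count .h5ad at a hypothetical path; keeping 2000 highly variable genes is an illustrative choice.

import scanpy as sc  # assumed import

adata = normalize(sc.read('counts_raw.h5ad'), highly_genes=2000)
print(adata.obs['size_factors'].head())
print(adata.raw.X.shape)  # .raw keeps all genes from before HVG selection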
Example #27
def load_10x_1_3mil_subsample(opt=10):
    if opt == 10:
        filename_data = '/data/martin/exp_sceb/subsample_1.3mil/data_1.3mil_high10_gene.h5ad'
    elif opt == 5:
        filename_data = '/data/martin/exp_sceb/subsample_1.3mil/data_1.3mil_high5_gene.h5ad'
    elif opt == 1:
        filename_data = '/data/martin/exp_sceb/subsample_1.3mil/data_1.3mil_high1_gene.h5ad'
    elif opt == 0.5:
        filename_data = '/data/martin/exp_sceb/subsample_1.3mil/data_1.3mil_high0.5_gene.h5ad'
    else:
        raise ValueError('opt must be one of 10, 5, 1 or 0.5')
    data = sc.read(filename_data)
    return data
Example #28
    def readData(self, countsFile=""):
        if countsFile == "":
            countsFile = self.CountsFile

        if countsFile == "":
            print("please input counts file path")
            return ""

        self.CountsFile = countsFile

        datapath = self.CountsFile
        if os.path.isdir(datapath):
            files = os.listdir(datapath)
            for i in files:
                if i.endswith(".gz"):
                    print(i)
                    target = datapath + "/*.gz"
                    print(target)
                    command = subprocess.Popen("gunzip " + target,
                                               shell=True,
                                               stdin=PIPE,
                                               stdout=PIPE,
                                               stderr=STDOUT)
                    output = command.stdout.read()
                    break

            files = os.listdir(datapath)
            for i in files:
                if i == "features.tsv":
                    os.rename(datapath + "/features.tsv",
                              datapath + "/genes.tsv")
                    break
            files = list(os.listdir(datapath))
            # read_10x_mtx requires barcodes.tsv, genes.tsv and matrix.mtx
            if (('barcodes.tsv' in files) and ('genes.tsv' in files)
                    and ('matrix.mtx' in files)):
                adata = sc.read_10x_mtx(datapath, var_names='gene_symbols')
                self.data = adata
                self.preprocess()
            else:
                print("input data is not correct")
                return ""

        elif os.path.isfile(datapath):
            if datapath.endswith(".h5ad"):
                adata = sc.read(datapath)
            else:
                adata = sc.read_csv(datapath)
                adata = adata.T
            self.data = adata
            self.preprocess()
        else:
            print("file or dir not exists")
            return ""
Example #29
def load_10x_ercc_1k():
    """ 
        https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/ercc
    """
    filename_data = '/data/martin/single_cell/10x_ERCC_1k/filtered_matrices_mex/ercc92/matrix.mtx'
    filename_genes = '/data/martin/single_cell/10x_ERCC_1k/filtered_matrices_mex/ercc92/genes.tsv'
    filename_barcodes = '/data/martin/single_cell/10x_ERCC_1k/filtered_matrices_mex/ercc92/barcodes.tsv'
    data = sc.read(filename_data, cache=True).transpose()
    data.var_names = np.genfromtxt(filename_genes, dtype=str)[:, 1]
    data.obs_names = np.genfromtxt(filename_barcodes, dtype=str)
    return data
Example #30
def preprocessing(data_folder,
                  min_genes=200,
                  min_cells=3,
                  max_genes=7000,
                  mito_cutoff=False,
                  normalize=True):
    """
	Combined function for preprocessing using Scanpy. For a more complete documentation on preprocessing, please
	visit

	Input:
		data_folder = Path to data files
		min_genes = Minimum amount of genes required for a gene to be valid (default is set at 200)
		min_cells = Minimum amount of cells required for a gene to be valid (default is set at 3)
		max_genes = Maximum amount of genes permitted for a cell to be valid (default is set at 7000)
		mito_cutoff = Percentage of genes permitted to be assigned to  
		mitochondrial Genes in a cell (default is set at False=0)
		normalize = Normalize the Anndata object (default set at True)

	Returns AnnData type from matrix - genes - barcodes. Full documentation on AnnData can be found on github.

	"""

    #Read data and create initial AnnData Frame
    path = '{}/'.format(data_folder)
    adata = sc.read(path + 'matrix.mtx', cache=True).T  # transpose the data
    adata.var_names = pd.read_csv(path + 'genes.tsv', header=None, sep='\t')[1]
    adata.obs_names = pd.read_csv(path + 'barcodes.tsv', header=None)[0]

    adata.var_names_make_unique()

    #Filter data with min_genes per cell, max_genes per cell, min_cells per genes
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    adata = adata[adata.obs['n_genes'] < max_genes, :]

    # add the total counts per cell as observations-annotation to adata
    adata.obs['n_counts'] = adata.X.sum(axis=1).A1

    #Create mito_genes and possible filter
    mito_genes = [name for name in adata.var_names if name.startswith('MT-')]
    adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X,
                                       axis=1).A1 / np.sum(adata.X, axis=1).A1

    # mito_cutoff is False/0 to disable the filter; any other value is a fraction
    if mito_cutoff:
        adata = adata[adata.obs['percent_mito'] < float(mito_cutoff), :]

    #Normalize data option
    if normalize:
        sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)

    return adata
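
A hedged usage sketch; the folder is hypothetical and must contain matrix.mtx, genes.tsv and barcodes.tsv, and the 5% mitochondrial cutoff is an illustrative choice.

adata = preprocessing('data/pbmc',
                      min_genes=200,
                      min_cells=3,
                      max_genes=7000,
                      mito_cutoff=0.05)
print(adata)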