示例#1
0
    def write_filtered(self,
                       biotypes=None,
                       removeMIR=True,
                       subsample=False,
                       n_obs=2000,
                       min_counts=1000,
                       min_genes=300,
                       min_cells=0,
                       max_mito=20):
        """
        Load per-sample matrices, filter and merge them, and write the
        result to ``self.filtered``.

        Args:
            biotypes (list of str, optional): gene biotypes passed to
                ``load_mtx`` (default: ['lincRNA', 'antisense']).
            removeMIR (bool): passed to ``load_mtx``.
            subsample (bool): passed to ``filter_and_merge``.
            n_obs (int): passed to ``filter_and_merge``.
            min_counts (int): per-cell count threshold for filtering.
            min_genes (int): per-cell gene threshold for filtering.
            min_cells (int): per-gene cell threshold for filtering.
            max_mito (float): mitochondrial-fraction threshold for filtering.
        """
        # None sentinel avoids a mutable default argument shared across calls
        if biotypes is None:
            biotypes = ['lincRNA', 'antisense']

        outdir = os.path.join(self.outdir, "filter")
        mkdir(outdir)
        adatas = self.load_mtx(biotypes=biotypes, removeMIR=removeMIR)
        from scimmunity.filtering import filter_and_merge

        # preprocess, filter, and merge
        adata = filter_and_merge(adatas,
                                 self.sample_names,
                                 outdir,
                                 subsample=subsample,
                                 n_obs=n_obs,
                                 min_counts=min_counts,
                                 min_genes=min_genes,
                                 min_cells=min_cells,
                                 max_mito=max_mito)

        adata = self.add_metadata_to_adata(adata)

        # write adata
        adata.write(self.filtered)
        return
示例#2
0
    def infercnv(self,
                 reduction,
                 ref_groups,
                 annotation_key='Phenotype',
                 sample_key='sample_name',
                 gene_order_dir=None,
                 use_name=True,
                 write=True,
                 cores=4,
                 mem=32,
                 partition='all',
                 time=24):
        """Run the inferCNV pipeline on all samples of a reduction,
        writing results to an 'infercnv' folder parallel to the reduction."""
        import infercnv.pipeline as cnv

        # default to the pipeline's bundled gene-order files
        if gene_order_dir is None:
            gene_order_dir = cnv.GENE_ORDER_DIR

        cnv_out = reduction.out.replace('reduction', 'infercnv')
        mkdir(cnv_out)
        cnv.run_all_samples(
            reduction.adata, annotation_key, sample_key, ref_groups,
            self.reference, out=cnv_out, gene_order_dir=gene_order_dir,
            use_name=use_name, write=write, cores=cores, mem=mem,
            partition=partition, time=time)
        return


###
示例#3
0
    def set_highest_ranked_phenotype(self,
                                     df_rank,
                                     name='',
                                     keepcluster=False):
        """Map each cluster to its top-ranked phenotype (lowest rank value)
        and store the mapping as a new obs annotation, with plots."""
        # per-cluster best phenotype; 'NA' when all ranks are identical
        cluster2phenotype = {
            col: ('NA' if len(df_rank[col].unique()) == 1
                  else str(df_rank[col].idxmin()))
            for col in df_rank.columns
        }

        # make new clustering
        suffix = '_' + name if len(name) > 0 else ''
        if keepcluster:
            key = 'phenotype_' + self.clustering + suffix
            mapper = lambda c: '{}:{}'.format(c, cluster2phenotype[c])
        else:
            key = 'phenotype' + suffix
            mapper = cluster2phenotype.__getitem__
        self.adata.obs[key] = self.adata.obs[self.clustering].apply(mapper)

        out = os.path.join(self.out, 'phenotype')
        mkdir(out)
        plot_reps(self.adata, key, save_name=key, outdir=out)
        plot_reps(self.adata, key, save_name=key + '_ondata', outdir=out,
                  legend_loc='on data', legend_fontweight='normal',
                  legend_fontsize=10)

        return cluster2phenotype
示例#4
0
    def __init__(self, outdir, filtered, scaled, out='batchcorrect',
                 n_epochs=50, use_batches=True, use_cuda=False,
                 n_latent=30, train_size=1.0):
        '''
        Args:
            outdir (str): path to analysis output directory
            filtered (str): path to filtered raw adata input
            scaled (str): path to scaled adata output
            out (str, optional): output subfolder name
        '''
        # output locations
        self.outdir = outdir
        self.out = os.path.join(outdir, out)
        self.filtered = filtered
        self.scaled = scaled

        # load raw filtered dataset
        self.gene_dataset = self.load_filtered()

        # scvi model variables
        self.n_epochs = n_epochs
        self.use_batches = use_batches
        self.use_cuda = use_cuda
        self.n_latent = n_latent
        self.train_size = train_size

        # build model and trainer
        self.vae = self.get_vae()
        self.trainer = self.get_trainer(self.vae, self.train_size)

        # make output folder
        mkdir(self.out)
        return
示例#5
0
 def plot_signature_dict(self, groupby, markersets, mode='heatmap',
     layers=['corrected_regressed', 'normalized_regressed']):
     """Plot marker-set signatures grouped by ``groupby`` into a
     'heatmap' folder parallel to the reduction output."""
     heatmap_out = self.out.replace('reduction', 'heatmap')
     mkdir(heatmap_out)
     plot_phenotype_markerset(
         self.adata, groupby, markersets,
         out=heatmap_out, mode=mode, layers=layers)
     return
示例#6
0
 def run_pca(self, n_comps=50):
     """Run PCA, save the variance-ratio plot, and set the number of PCs.

     Args:
         n_comps (int): number of principal components to compute.
     """
     pca_dir = os.path.join(self.out, 'pca')  # hoisted: was computed twice
     mkdir(pca_dir)
     sc.settings.figdir = pca_dir
     # arpack can give zero variance explained, so we use auto solver
     sc.tl.pca(self.adata, n_comps=n_comps, svd_solver='auto',
               use_highly_variable=False)
     sc.settings.figdir = self.out
     sc.pl.pca_variance_ratio(self.adata, log=False, save=True)
     self.set_n_pcs(min_n_pcs=self.min_n_pcs)
示例#7
0
def save_de(adata,
            label_name,
            method,
            layer,
            comparison,
            out='./',
            filtered=False,
            query=True,
            pval_cutoff=5e-3,
            log2fc_min=1,
            enrich=True):
    """
    Save rank genes groups results as csv per group. 
    Annotate and enrich up- and down-regulated genes per group filtered based on criteria.

    Args:
        adata (AnnData): object with rank_genes_groups results in ``adata.uns``.
        label_name (str): grouping label; part of the result key name.
        method (str): DE method name; part of the result key name.
        layer (str): expression layer name; part of the result key name.
        comparison (str): comparison name; part of the result key name.
        out (str): root output directory.
        filtered (bool): read the "_filtered" variant of the result key.
        query (bool): annotate non-empty gene tables before saving.
        pval_cutoff (float): p-value cutoff passed to rank_genes_groups_df.
        log2fc_min (float): log2 fold-change magnitude threshold; genes with
            lfc >= log2fc_min are "up", lfc <= -log2fc_min are "down".
        enrich (bool): run gsea on each non-empty up/down gene list.
    """
    # "_filtered" suffix only when `filtered` is truthy (str * bool idiom)
    filt = "_filtered" * filtered
    method_name = f"{method}_{layer}_{comparison}" + filt
    key = f"rank_genes_groups_{label_name}_{method_name}"
    # get overview of ranked genes
    df_ranked_genes = get_df_ranked_genes(adata, key)
    outdir = os.path.join(out, label_name, method_name)
    mkdir(outdir)
    df_ranked_genes.to_csv(os.path.join(outdir, f'{method_name}.csv'))

    # get up- and down-regulated per group based on criteria
    for i in adata.uns[key]['names'].dtype.names:
        # up-regulated: lfc >= log2fc_min, no upper bound
        df_up = rank_genes_groups_df(adata,
                                     group=i,
                                     key=key,
                                     pval_cutoff=pval_cutoff,
                                     log2fc_min=log2fc_min,
                                     log2fc_max=None)
        # down-regulated: lfc <= -log2fc_min, no lower bound
        df_down = rank_genes_groups_df(adata,
                                       group=i,
                                       key=key,
                                       pval_cutoff=pval_cutoff,
                                       log2fc_max=-log2fc_min,
                                       log2fc_min=None)
        for df, direction in zip([df_up, df_down], ['up', 'down']):
            # drop rows with empty or missing gene names
            df = df.loc[(df['names'] != '') & (~df['names'].isnull()), :]
            name = f"{method}_{layer}_{clean_up_str(i)}_{comparison+filt}"
            path = os.path.join(outdir, f"{name}_{direction}.csv")
            # gene name and description annotation
            if (len(df) > 0) and query:
                df_annotation = annotate_de(df)
                df_annotation.to_csv(path)
            else:
                df.to_csv(path)
            # Gene set enrichment
            if (len(df) > 0) and enrich:
                name = f"{clean_up_str(i)}_{direction}"
                gsea(list(df.names),
                     description=name,
                     out=os.path.join(outdir, name))
    return
示例#8
0
 def plot_clustering_metrics(self, reps=['latent_regressed', 'latent', 'pcs']):
     """Plot variance-ratio and silhouette metrics for each representation."""
     for rep in reps:
         metrics_out = os.path.join(self.out, 'clustering', rep)
         mkdir(metrics_out)
         # same arguments for both metric plots
         for plot_fn in (plot_variance_ratio, plot_silhouette_coeff):
             plot_fn(self.adata, self.res_list, X=rep,
                     prefix=self.prefix, rep=rep, out=metrics_out)
     return
示例#9
0
def save_de_overlap(adata, key, markersets, out='./'):
    """
    Save the overlap between the DE result under ``key`` and the combined
    gene signatures of all ``markersets`` as a csv.

    Args:
        adata (AnnData): object holding the DE result under ``key``.
        key (str): key of the rank_genes_groups result.
        markersets (list of str): marker set names to combine.
        out (str): output directory.
    """
    combined_dict = {}
    for markerset in markersets:
        signature_dict = get_signature_dict(markerset, adata=adata)
        combined_dict.update(signature_dict)
    df = de_overlap(adata, key, combined_dict)
    mkdir(out)
    # name the file after every markerset, not just the last loop value;
    # this also avoids a NameError when markersets is empty
    markerset_name = '_'.join(markersets)
    df.to_csv(os.path.join(out, f'de_overlap_{key}_{markerset_name}.csv'),
              index=False)
    return
示例#10
0
def corr_rep_gene_dict(adata,
                       markerset,
                       rep,
                       prefix='',
                       dims=[0, 1],
                       offset=1,
                       layer=None,
                       out='./',
                       save=True):
    """
    Calculate correlation coefficient R of each component with each gene. 
    Then for each component take the average of R for each gene signature/phenotype in the markerset.
    Args:
        adata (AnnData)
        markerset: name of markerset to annotate with
        rep (str): name of dimension reduction representation
        prefix (str): prefix for component column names in the output
        dims (list of int): indices (0-based) of components from dimension reduction 
        offset (int): Offset indices to skip unwanted component (ex. 1 for diffmap, 0 for others)
        layer (str): adata.layers key to get gene expression from
        out (str): output path
        save (bool): save the phenotype-by-component average-R matrix as csv
    Return:
        df (pd.DataFrame): phenotype x component matrix of average R values.
        Saves correlation matrix as csv and heatmap.
    """
    mkdir(out)
    # load markerset
    gene_dict = get_signature_dict(markerset, adata=adata)
    phenotypes = list(gene_dict.keys())
    df = pd.DataFrame(index=phenotypes)

    for i in dims:
        df_list = []
        corr = corr_comp_gene(adata, rep, i, offset=offset, layer=layer)
        rs = []
        for phenotype in phenotypes:
            genes = gene_dict[phenotype]
            subset = corr.loc[genes, :]
            subset['phenotype'] = phenotype
            avg_r = corr.loc[genes, 'R'].mean()
            rs.append(avg_r)
            df_list.append(subset.sort_values('R'))
        df['{}{}'.format(prefix, i + 1)] = rs
        # join with os.path so the path stays valid when `out` lacks a
        # trailing separator (plain string concat silently broke then)
        pd.concat(df_list).to_csv(os.path.join(
            out, '{}_{}{}.csv'.format(markerset, prefix, i + 1)))
    _, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(df, cmap='coolwarm', center=0, ax=ax)
    plt.tight_layout()
    plt.savefig(os.path.join(
        out, '{}_{}_{}_avg_gene_corr.png'.format(rep, markerset, layer)))
    plt.close()
    if save:
        df.to_csv(os.path.join(
            out, '{}_{}_{}_avg_gene_corr.csv'.format(rep, markerset, layer)))
    return df
示例#11
0
 def call_doublet(self,
                  reduction,
                  sample_key='sample_name',
                  thresh_list=[]):
     """Run per-sample doublet calling, writing results to a 'doublet'
     folder parallel to the reduction output."""
     from scimmunity.qc import call_doublet_per_sample
     doublet_out = reduction.out.replace('reduction', 'doublet')
     mkdir(doublet_out)
     call_doublet_per_sample(
         reduction.adata, sample_key,
         thresh_list=thresh_list, out=doublet_out)
     return
示例#12
0
def plot_reps_signature_dict(adata,
                             markerset,
                             use_raw=False,
                             layer=None,
                             out='./',
                             reps=['umap', 'diffmap'],
                             figsize=(4, 4)):
    """Plot every gene signature of a markerset on each representation."""
    mkdir(out)
    for celltype, genes in get_signature_dict(markerset, adata=adata).items():
        save_name = markerset + '_' + celltype.strip()
        plot_reps_markers(adata, genes, save_name, outdir=out, reps=reps,
                          use_raw=use_raw, layer=layer, figsize=figsize)
    return
示例#13
0
    def plot_reps_signature_dict(self,
                                 signature_dict,
                                 markerset,
                                 use_raw=False,
                                 dpi=300,
                                 layer=None,
                                 figsize=(2, 2)):
        """
        Plot each signature on all of ``self.reps`` at the requested dpi.

        Args:
            signature_dict (dict): mapping of cell type name to gene list.
            markerset (str): name used to prefix saved figure names.
            use_raw (bool): passed to plot_reps_markers.
            dpi (int): savefig dpi used for these plots only.
            layer (str): adata.layers key to plot expression from.
            figsize (tuple): per-plot figure size.
        """
        old_dpi = rcParams["savefig.dpi"]
        rcParams["savefig.dpi"] = dpi
        out = os.path.join(self.out, 'expression')
        mkdir(out)
        try:
            for celltype, genes in signature_dict.items():
                plot_reps_markers(self.adata, genes,
                                  markerset+'_'+celltype.strip(),
                                  outdir=out, reps=self.reps, use_raw=use_raw,
                                  layer=layer, figsize=figsize)
        finally:
            # restore the global dpi even if plotting raises
            rcParams["savefig.dpi"] = old_dpi
        return
示例#14
0
    def __init__(self,
                 adata,
                 outdir,
                 clustering,
                 out='annotation',
                 bulkprofiles=BULKPROFILES,
                 markersets=MARKERSETS,
                 pop2markersetchoices=POP2MARKERSETCHOICES,
                 pop2markersets=POP2MARKERSETS,
                 pop2bulkprofiles=POP2BULKPROFILES,
                 pop2phenotype=POP2PHENOTYPE,
                 reps=None):
        """
        Automatic cluster annotation 
        1) Avg cell marker detection rate (fraction of total genes detected)
        2) Correlate cluster centroid with bulk profiles
            - use only genes with coeff. of variation >20% in bulk dataset?
        3) compare DE genes to know marker genes
        Args:
            adata (AnnData): object to annotate
            outdir (str): output directory root
            clustering (str): key name of clustering stored in adata.obs
            out (str): name of subfolder for output (default: annotation)
        """
        self.out = os.path.join(outdir, clustering)
        self.adata = adata
        self.clustering = clustering

        # marker / bulk reference resources
        self.bulkprofiles = bulkprofiles
        self.markersets = markersets
        self.pop2markersetchoices = pop2markersetchoices
        self.pop2markersets = pop2markersets
        self.pop2phenotype = pop2phenotype
        self.pop2bulkprofiles = pop2bulkprofiles

        if reps is None:
            # default: pca plus every reduction/representation combination
            default_reps = ['pca']
            for reduction in ('umap', 'tsne', 'diffmap'):
                for source in ('pcs', 'latent', 'latent_regressed'):
                    default_reps.append(f'{reduction}_{source}')
            self.reps = default_reps
        else:
            self.reps = reps

        # make output folder
        mkdir(self.out)
        return
示例#15
0
 def plot_frequency(self,
                    reduction,
                    x,
                    y,
                    xrot=0,
                    yrot=45,
                    xorder=None,
                    yorder=None,
                    sort_x=False,
                    sort_y=False,
                    explode=[],
                    swap_axes=True,
                    dropna=False,
                    **kwargs):
     """
     Plot and save frequency/count tables of obs column ``y`` per ``x``.

     Args:
         reduction: reduction object providing ``adata`` and output path.
         x (str): obs column for the primary axis.
         y (str): obs column for the secondary axis.
         xrot, yrot (int): tick label rotations.
         xorder, yorder (list): explicit category orders; when sort_x/sort_y
             is set they are filtered into descending-count order.
         sort_x, sort_y (bool): order categories by descending count.
         explode (list): obs columns (or lists of columns) holding
             ';'-joined strings to explode into rows before counting.
         swap_axes (bool): passed through to the plotting helper.
         dropna (bool): drop NA categories.
     Returns:
         (xorder, yorder): the category orders actually used.
     """
     import scimmunity.frequency as freq
     subfolder = f"{clean_up_str(x)}_{clean_up_str(y)}"
     out = os.path.join(reduction.out.replace('reduction', 'frequency'),
                        subfolder)
     mkdir(out)
     df = reduction.adata.obs.copy()
     if len(explode) > 0:
         for cols in explode:
             # allow a bare column name as shorthand for a 1-element list
             # (isinstance, not type()==, handles str subclasses too)
             if isinstance(cols, str):
                 cols = [cols]
             df = tcr.explode_strs(df, cols, ';')
     if sort_x:
         props = df.groupby(x)[y].count().sort_values(ascending=False)
         sort_order = list(props.index)
         print(sort_order)
         if xorder is not None:
             # renamed loop var: the old comprehension shadowed parameter x
             xorder = [cat for cat in sort_order if cat in xorder]
         else:
             xorder = sort_order
     if sort_y:
         props = df.groupby(y)[x].count().sort_values(ascending=False)
         sort_order = props.index
         if yorder is not None:
             yorder = [cat for cat in sort_order if cat in yorder]
         else:
             yorder = sort_order
     freq.plots(reduction.adata, df, out, x=x, y=y, xrot=xrot, yrot=yrot,
                xorder=xorder, yorder=yorder, swap_axes=swap_axes,
                dropna=dropna, **kwargs)
     freq.save_df(x, y, df, 'Frequency', out, dropna=dropna)
     freq.save_df(x, y, df, 'Count', out, dropna=dropna)
     return xorder, yorder
示例#16
0
    def set_annotation(self, population, pop2phenotype, markersets=None):
        """Copy the population's phenotype column into
        adata.obs['Phenotype'] and plot the annotation plus markerset
        summaries."""
        phenotype = pop2phenotype[population]
        self.adata.obs['Phenotype'] = self.adata.obs[phenotype]
        # reset phenotype colors to avoid slicing error
        if 'Phenotype_colors' in self.adata.uns:
            del self.adata.uns['Phenotype_colors']

        # plot the set phenotype
        out = os.path.join(self.out, 'phenotype')
        mkdir(out)

        plot_reps(self.adata, 'Phenotype',
                  save_name='Phenotype' + '_ondata', outdir=out,
                  reps=self.reps, legend_loc='on data',
                  legend_fontweight='normal', legend_fontsize=10)
        plot_reps(self.adata, 'Phenotype', save_name='Phenotype',
                  outdir=out, reps=self.reps)

        # plot markerset summary in each supported mode
        if markersets is None:
            markersets = self.pop2markersets[population]
        for mode in ('heatmap', 'matrixplot', 'dotplot'):
            plot_phenotype_markerset(self.adata, 'Phenotype', markersets,
                                     out=self.out, mode=mode)
        return
示例#17
0
 def annotate_comp(self,
                   reduction,
                   rep,
                   prefix='',
                   dims=[0, 1],
                   offset=1,
                   layer=None,
                   thresh=0.8,
                   markersets=[],
                   gsets=[
                       'GO_Biological_Process_2018', 'KEGG_2019_Human',
                       'WikiPathways_2019_Human'
                   ]):
     """Annotate dimension-reduction components by gene correlation:
     once against enrichment gene sets, then once per markerset."""
     from scvi_analysis.component import corr_rep_gene_dict, corr_rep_gene
     annot_out = os.path.join(
         reduction.out.replace('reduction', 'annotation'), rep)
     mkdir(annot_out)
     # keyword arguments shared by both correlation helpers
     shared = dict(prefix=prefix, dims=dims, offset=offset,
                   layer=layer, out=annot_out)
     corr_rep_gene(reduction.adata, rep,
                   thresh=thresh, gsets=gsets, **shared)
     for markerset in markersets:
         corr_rep_gene_dict(reduction.adata, markerset, rep, **shared)
     return
示例#18
0
    def __init__(self, outdir,
                 parent_name='Whole', parent_h5ad='corrected.h5ad',
                 subset_name='Whole', subset_h5ad='corrected.h5ad',
                 subset_cond=None,
                 subfolder='reduction',
                 pca_s=1.0, min_n_pcs=5, n_neighbors=20, n_jobs=None,
                 verify_barcodes=False,
                 neighbors_reps=None,
                 default_rep='pcs',
                 reductions=None,
                 res_list=None,
                 regress=False,
                 regress_vars=None):

        """
        Args:
            outdir (str): path to analysis output directory
            parent_name (str): folder name of the parent dataset
            parent_h5ad (str): file name of the parent adata
            subset_name (str): name of subset
            subset_h5ad (str): file name of the subset adata
            subset_cond (dict of str to list): dictionary mapping obs column
                to list of values to include (ex. {'louvain':['0', '1', '2']})
            subfolder (str): name of the output subfolder
            pca_s (float): sensitivity for knee detection in determining
                number of PCs
            min_n_pcs (int): minimum number of PCs to keep
            n_neighbors (int): number of neighbors for constructing
                neighborhood graph
            n_jobs (int): number of jobs for regressing out variable
            verify_barcodes (bool): re-check that the subset conditions give
                the same barcodes when loading an existing subset
            neighbors_reps (list): Representations used for neighborhood graphs
            default_rep (str): Default representation for neighborhood graph
            reductions (list): List of dimension reduction to perform
            res_list (list): List of clustering resolution to perform
            regress (bool): whether to regress out variables
            regress_vars (dict): mapping of layer name to list of variables
                to regress out
        """
        # None sentinels replace mutable defaults that would be shared
        # across every instance of the class
        if subset_cond is None:
            subset_cond = {}
        if neighbors_reps is None:
            neighbors_reps = ['pcs', 'latent', 'latent_regressed']
        if reductions is None:
            reductions = ['pca', 'umap', 'tsne', 'diffmap']
        if res_list is None:
            res_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6,
                        0.7, 0.8, 0.9, 1.0, 1.1, 1.2]
        if regress_vars is None:
            regress_vars = {
                'latent': ['percent_mito'],
                'normalized': ['n_counts', 'percent_mito',
                               'S_score', 'G2M_score'],
                'corrected': ['percent_mito'],
            }

        self.outdir = outdir
        self.parent = os.path.join(self.outdir, parent_name, parent_h5ad)
        self.subset = os.path.join(self.outdir, subset_name, subset_h5ad)

        self.subset_name = subset_name
        self.subset_cond = subset_cond
        self.subfolder = subfolder

        # set output folders
        self.out = os.path.join(self.outdir, subset_name, subfolder)
        self.prefix = subset_name

        # make output folder
        mkdir(self.out)

        sc.settings.figdir = self.out

        if not os.path.isfile(self.subset):
            # create new subset if none exists
            self.adata = self.create_subset()
        else:
            # load existing subset adata
            self.adata = sc.read(self.subset)
            if verify_barcodes:
                # verify that the subset conditions give the same barcodes
                self.verify_barcodes()

        # Parameters for dimension reduction
        self.pca_s = pca_s
        self.min_n_pcs = min_n_pcs
        self.n_neighbors = n_neighbors
        self.n_jobs = n_jobs
        self.regress = regress
        self.regress_vars = regress_vars
        self.default_rep = default_rep

        # define representations used for constructing neighborhood graph
        self.neighbors_reps = neighbors_reps
        self.neighbors_kwargs_list = [{'use_rep': 'X_' + rep}
                                      for rep in self.neighbors_reps]
        self.neighbors_list = []  # store name of neighborhood graphs
        self.all_reps = reductions + \
            [f'{x}_{y}' for x in reductions if x != 'pca'
             for y in neighbors_reps]

        # Clustering resolutions
        self.res_list = res_list

        return
示例#19
0
    def __init__(
            self,
            name,
            samplesheet,
            gtf,
            scdir,
            sample_names=None,
            sample_name_col='sample_name',
            whole='Whole',
            gex_col='gex',
            vdj_col='vdj',
            metadata_cols=None,
            dpi=300,
            n_epochs=50,
            use_batches=True,
            use_cuda=False,
            n_latent=30,
            train_size=1.0):
        """
        Args:
            name (str): analysis name; used as the output folder name.
            samplesheet (str): path to csv describing the samples.
            gtf (str): path to gene annotation gtf.
            scdir (str): root directory for analysis outputs.
            sample_names (list, optional): subset of samples to analyze;
                defaults to every row of the samplesheet.
            sample_name_col (str): samplesheet column holding sample names.
            whole (str): subfolder name for the full (unsubset) dataset.
            gex_col (str): samplesheet column with gene-expression paths.
            vdj_col (str): samplesheet column with VDJ alignment paths.
            metadata_cols (list, optional): samplesheet columns to carry as
                metadata; defaults to all columns except gex/vdj.
            dpi (int): plotting dpi.
            n_epochs, use_batches, use_cuda, n_latent, train_size:
                scvi settings collected into ``self.scvi_kwargs``.
        """
        self.samplesheet = pd.read_csv(samplesheet)

        # None sentinel (was a mutable default []); falsy means "all samples"
        if not sample_names:
            self.sample_names = self.samplesheet[sample_name_col].tolist()
        else:
            inds = self.samplesheet[sample_name_col].isin(sample_names)
            self.samplesheet = self.samplesheet[inds].reset_index(drop=True)
            self.sample_names = self.samplesheet[sample_name_col].tolist()
        self.gtf = gtf
        self.whole = whole
        self.alignments = self.samplesheet[gex_col].tolist()
        self.vdj_alignments = self.samplesheet[vdj_col].tolist()

        # import metadata (every column except the alignment path columns)
        if not metadata_cols:
            metadata_cols = list(self.samplesheet.columns)
            metadata_cols.remove(gex_col)
            metadata_cols.remove(vdj_col)
        self.metadata_cols = metadata_cols

        # scvi arguments
        self.scvi_kwargs = {
            'n_epochs': n_epochs,
            'use_batches': use_batches,
            'use_cuda': use_cuda,
            'n_latent': n_latent,
            'train_size': train_size
        }

        # set analysis name
        self.name = name

        # set output paths
        self.scdir = scdir
        self.outdir = os.path.join(self.scdir, self.name)
        self.filtered = os.path.join(self.outdir, whole, 'filtered.h5ad')
        self.corrected = os.path.join(self.outdir, whole, 'corrected.h5ad')
        self.pkl = os.path.join(self.outdir, 'scvi.model.pkl')
        self.no_batch_pkl = os.path.join(self.outdir,
                                         'no_batch_scvi.model.pkl')

        print('Analysis saved at ' + self.outdir)
        mkdir(self.outdir)

        # set working directory for cache files
        # NOTE(review): chdir is a process-wide side effect
        os.chdir(self.outdir)

        set_plotting_params(dpi=dpi)

        return