def run(path, f, output_path, dictionary, column='gene', **kwargs):
    """Run an enrichment analysis on one gene table and save the result.

    The table at ``path + f`` is loaded with ``pd.read_csv`` (any extra
    keyword arguments are forwarded to it). The values found in ``column``
    are handed to ``tea.enrichment_analysis`` together with ``dictionary``.
    Only result rows whose ``Observed`` value is greater than 2 are kept,
    and they are written, without the index, to
    ``output_path + f + '.csv'``.
    """
    gene_table = pd.read_csv(path + f, **kwargs)
    results = tea.enrichment_analysis(gene_table[column], dictionary,
                                      show=False)

    # Discard terms supported by two or fewer observed genes.
    well_supported = results[results.Observed > 2]

    destination = output_path + f + '.csv'
    well_supported.to_csv(destination, index=False)
def run_tea(parent, sleuth, out, q_threshold=0.05):
    """Run tissue, phenotype and GO enrichment analyses on sleuth output.

    The process changes its working directory to ``parent`` for the duration
    of the run and always switches back afterwards, even if an analysis
    raises. Every ``.csv`` file in ``sleuth`` is read, the ``ens_gene``
    values of the rows with ``qval < q_threshold`` are collected, and for
    each analysis type the list is passed to ``tea.enrichment_analysis`` and
    ``tea.plot_enrichment_results``, both asked to save under ``out``.

    Parameters
    ----------
    parent : directory to change into while the analyses run
    sleuth : directory holding the sleuth result ``.csv`` files (resolved
        after the change of directory)
    out : output directory; created if it does not exist
    q_threshold : rows with ``qval`` strictly below this value are analysed

    Notes
    -----
    If ``tissue_enrichment_analysis`` or ``pandas`` cannot be imported, a
    message is printed and the interpreter exits with status 0.
    """
    try:
        import tissue_enrichment_analysis as tea
    except ImportError:
        print_with_flush('# TEA is not installed...skipping')
        sys.exit(0)

    try:
        import pandas as pd
    except ImportError:
        print_with_flush('# pandas is not installed...skipping')
        sys.exit(0)

    analyses = ['tissue', 'phenotype', 'go']
    # Each dictionary is fetched once, on first use, and shared by all files.
    dictionaries = {}

    wdir = os.getcwd()
    print_with_flush('# entering 3_diff_exp')
    os.chdir(parent)
    try:
        print_with_flush('# creating {} directory'.format(out))
        os.makedirs(out, exist_ok=True)

        # Load sleuth results.
        for file in os.listdir(sleuth):
            if not file.endswith('.csv'):
                continue

            df = pd.read_csv(os.path.join(sleuth, file), index_col=0)
            gene_list = df[df.qval < q_threshold].ens_gene
            name = os.path.splitext(os.path.basename(file))[0]

            if len(gene_list) == 0:
                print_with_flush(
                    '# there are no genes with q < {} in {}!'.format(
                        q_threshold, file))
                print_with_flush(
                    '# this means there are no significantly '
                    'differentially-expressed genes for '
                    'this set of conditions.')
                continue

            for analysis in analyses:
                print_with_flush(
                    '# performing {} enrichment analysis for {}'.format(
                        analysis, file))
                fname = '{}_{}'.format(name.replace('betas_wt', out),
                                       analysis)
                title = os.path.join(out, fname)

                if analysis not in dictionaries:
                    dictionaries[analysis] = tea.fetch_dictionary(analysis)
                df_dict = dictionaries[analysis]

                df_results = tea.enrichment_analysis(gene_list, df_dict,
                                                     aname=title + '.csv',
                                                     save=True, show=False)
                tea.plot_enrichment_results(df_results, analysis=analysis,
                                            title=title, save=True)
    finally:
        # Restore the caller's working directory no matter what happened.
        os.chdir(wdir)
    print_with_flush('# returned to root')
def walker(tissue_df, directory, save=True):
    """Analyze every gene set found under ``path_sets`` and store the results.

    Each file yielded by ``os.walk(path_sets)`` is read as a CSV, its
    ``gene`` column is tested with ``tea.enrichment_analysis`` against
    ``tissue_df``, and, when ``save`` is true, the outcome is deposited in
    ``directory``:

    * non-empty results are written to ``<directory><file name>.csv`` with a
      ``#<file name>`` comment line prepended, and a PDF plot is produced;
    * gene sets with no enrichment are listed in ``<directory>empty.txt``.

    ``empty.txt`` is (re)created with its header line on every call,
    regardless of ``save``.

    Parameters:
    -------------------
    tissue_df - pandas dataframe containing specified tissue dictionary
    directory - where to save to; used as a plain string prefix, so it is
                expected to end with a path separator
    save - boolean indicating whether to save results or not.

    NOTE(review): ``path_sets`` is a module-level name defined outside this
    function.
    """
    empty_log = directory + 'empty.txt'
    with open(empty_log, 'w') as f:
        f.write('Genesets with no enrichment:\n')

    # go through each file in the folder
    for root, _dirs, files in os.walk(path_sets):
        for f_set in files:
            # Join with the directory os.walk is currently visiting so that
            # files inside sub-folders are opened from the right place.
            df = pd.read_csv(os.path.join(root, f_set))

            # extract gene list and analyze
            short_name = f_set
            test = df.gene.values
            df_analysis, _unused = tea.enrichment_analysis(test, tissue_df,
                                                           show=False)

            if not save:
                continue

            if df_analysis.empty:
                # nothing enriched: record the gene set name in empty.txt
                with open(empty_log, 'a+') as f:
                    f.write(short_name + '\n')
                continue

            # save without index
            out_csv = directory + short_name + '.csv'
            df_analysis.to_csv(out_csv, index=False)
            # add a comment line naming the gene set at the top of the file
            line_prepender(out_csv, '#' + short_name + '\n')
            # plot
            tea.plot_enrichment_results(df_analysis, title=short_name,
                                        dirGraphs=directory, ftype='pdf')
            plt.close()
def walker(tissue_df, directory, save=True):
    """Analyze every gene set found under ``path_sets`` and store the results.

    Each file yielded by ``os.walk(path_sets)`` is read as a CSV, its
    ``gene`` column is tested with ``tea.enrichment_analysis`` against
    ``tissue_df``, and, when ``save`` is true, the outcome is deposited in
    ``directory``:

    * non-empty results are written to ``<directory><file name>.csv`` with a
      ``#<file name>`` comment line prepended, and a PDF plot is produced;
    * gene sets with no enrichment are listed in ``<directory>empty.txt``.

    ``empty.txt`` is (re)created with its header line on every call,
    regardless of ``save``.

    Parameters:
    -------------------
    tissue_df - pandas dataframe containing specified tissue dictionary
    directory - where to save to; used as a plain string prefix, so it is
                expected to end with a path separator
    save - boolean indicating whether to save results or not.

    NOTE(review): ``path_sets`` is a module-level name defined outside this
    function.
    """
    empty_log = directory + 'empty.txt'
    with open(empty_log, 'w') as f:
        f.write('Genesets with no enrichment:\n')

    # go through each file in the folder
    for root, _dirs, files in os.walk(path_sets):
        for f_set in files:
            # Join with the directory os.walk is currently visiting so that
            # files inside sub-folders are opened from the right place.
            df = pd.read_csv(os.path.join(root, f_set))

            # extract gene list and analyze
            short_name = f_set
            test = df.gene.values
            df_analysis, _unused = tea.enrichment_analysis(test, tissue_df,
                                                           show=False)

            if not save:
                continue

            if df_analysis.empty:
                # nothing enriched: record the gene set name in empty.txt
                with open(empty_log, 'a+') as f:
                    f.write(short_name + '\n')
                continue

            # save without index
            out_csv = directory + short_name + '.csv'
            df_analysis.to_csv(out_csv, index=False)
            # add a comment line naming the gene set at the top of the file
            line_prepender(out_csv, '#' + short_name + '\n')
            # plot
            tea.plot_enrichment_results(df_analysis, title=short_name,
                                        dirGraphs=directory, ftype='pdf')
            plt.close()
thresh = float(thresh) # typecasting method = f_dict[-7:-4] ntiss = len(tissue_df.columns) ngenes = tissue_df.shape[0] # open each enrichment set for fodder in os.walk(path_sets): for f_set in fodder[2]: df = pd.read_csv(path_sets + f_set) test = df.gene.values ntest = len(test) short_name = f_set[16:len(f_set)-16] df_analysis, unused = tea.enrichment_analysis(test, tissue_df, alpha=0.05, show=False) nana = len(df_analysis) # len of results nun = len(unused) # number of genes dropped avf = df_analysis['Fold Change'].mean() avq = df_analysis['Q value'].mean() s = '{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}'.format( annot, thresh, method, f_set, ntiss, ntest, nana, ntest-nun, avf, avq, ngenes) with open(dirSummaries+'ExecutiveSummary.csv', 'a+') as fSum: fSum.write(s) fSum.write('\n') # Print summary to csv df_summary = pd.read_csv(dirSummaries+'ExecutiveSummary.csv', comment='#')
for i, df in enumerate(Ldf): fname = Lnames[i] obj = receptacle(fname) for direction in Ldirection: ind = g(df[df[direction] == 1.0], 'SequenceNameGene') x = names[ind].WBID print('---------') print(fname + ' ' + direction) print('Number of genes submitted for analysis ', len(x)) y = tissue_df[tissue_df.wbid.isin(x)].wbid.unique().shape[0] print('Number of genes used for analysis ', y) print('\n') df_res, unused = tea.enrichment_analysis(x, tissue_df, show=False, alpha=0.1) print(df_res.empty) if len(df_res) > 0: obj.add_result(direction, df_res) n_genes.append(['{0}, {1}'.format(fname, direction), y, len(unused)]) Lreceptacles[fname] = obj # make for species in Lreceptacles: current = Lreceptacles[species] n = current.n # number of dfs
def run(path, f, output_path, dictionary, **kwargs):
    """Run an enrichment analysis on one gene table and save the result.

    The table at ``path + f`` is loaded with ``pd.read_csv`` (extra keyword
    arguments are forwarded to it) and its ``gene`` column is passed to
    ``tea.enrichment_analysis`` along with ``dictionary``. The first element
    of the returned pair is written, without the index, to
    ``output_path + f + '.csv'``; the second element is ignored.
    """
    gene_table = pd.read_csv(path + f, **kwargs)

    outcome = tea.enrichment_analysis(gene_table.gene, dictionary,
                                      show=False)
    enrichment, _ignored = outcome

    destination = output_path + f + '.csv'
    enrichment.to_csv(destination, index=False)
print('-----------------------------------') print('Genes with annotated phenotype terms: ', get_n(pheno_traits, worm_genes)) print('Genes with annotated tissue terms: ', get_n(tissue_traits, worm_genes)) print('Genes with annotated go terms: ', get_n(go_traits, worm_genes)) print('-----------------------------------\n') # one of the traits foolishly has a '/' if '/' in trait: # rename the trait with this character or it breaks your code trait = 'post bronchodilator fev1 fevc ratio' # enrichment analyses: df = tea.enrichment_analysis(worm_genes, phenotype_df, show=False) df = df[df.Observed > n_min_obs].copy() df.to_csv('../output/phenologues_2/pea_' + trait + '.csv', index=False) if 'lupus' in trait: print('Graphing PEA results for ', trait) fig, ax = plt.subplots() tea.plot_enrichment_results(df, title='../output/lupus_pea', save=True, analysis='phenotype') plt.close() df = tea.enrichment_analysis(worm_genes, tissue_df, show=False) df = df[df.Observed > n_min_obs].copy() df.to_csv('../output/disease_tissues_2/tea_' + trait + '.csv', index=False) if 'lupus' in trait: print('Graphing TEA results for ', trait)
import tissue_enrichment_analysis as tea
import pandas as pd

# --- annotation dictionaries -------------------------------------------
phenotype_df = pd.read_csv('../input/phenotype_ontology.csv')
go_df = pd.read_csv('../input/gene_ontology.csv')
tissue_df = tea.fetch_dictionary()

# --- SFARI gene table and the name-conversion table ---------------------
sfari = pd.read_excel('../input/sfari.xlsx')
name_df = pd.read_excel('../input/sfari_name_converter.xlsx')

# Preview call; its return value is not used.
sfari.head()

# --- enrichment of the SFARI genes against each dictionary --------------
# Only the first element of each returned pair is saved.

# tissue enrichment
df, _ = tea.enrichment_analysis(
    sfari.Gene, tissue_df, show=False
)
df.to_csv('../output/tea_sfari.csv', index=False)

# phenotype enrichment
df, _ = tea.enrichment_analysis(
    sfari.Gene, phenotype_df, show=False
)
df.to_csv('../output/pea_sfari.csv', index=False)

# gene ontology enrichment
df, _ = tea.enrichment_analysis(
    sfari.Gene, go_df, show=False
)
df.to_csv('../output/goa_sfari.csv', index=False)

# --- long-format phenotype table, keeping only entries equal to 1 -------
melt_pheno = pd.melt(
    phenotype_df,
    id_vars='wbid',
    var_name='phenotype',
)
melt_pheno = melt_pheno[melt_pheno.value == 1]


def convert(x):
    """Return the ``gene_name`` of the first ``name_df`` row whose ``wbid`` equals ``x``."""
    matching_rows = name_df[name_df.wbid == x]
    return matching_rows.gene_name.values[0]
def go_enrichment(self, mode: str = 'go', alpha: float = 0.05, save_csv: bool = False, fname: str = None):
    """
    Analyzes GO, Tissue and/or Phenotype enrichment of the given group of features. \
    Uses the Anatomy, Phenotype and Gene Ontology annotations for C. elegans. \
    Corrected p-values are calculated using hypergeometric statistics. \
    For more details see GitHub page of the developers: https://github.com/dangeles/TissueEnrichmentAnalysis

    :param mode: the enrichment you wish to perform. 'go' for gene ontology enrichment, \
    'tissue' for tissue enrichment, 'phenotype' for phenotype enrichment, \
    'all' to run the three analyses one after the other and combine their results.
    :param alpha: float. Significance threshold. Default is 0.05
    :param save_csv: bool. False by default. If True, save the result to a csv.
    :param fname: Name and path in which to save the results. Must be filled if save_csv is True.
    :return: a DataFrame which contains the significant enrichment terms

    .. figure::  go_en.png
       :align:   center
       :scale: 40 %

       Example plot of GO enrichment

    .. figure::  tissue_en.png
       :align:   center
       :scale: 40 %

       Example plot of Tissue enrichment

    """
    # Validation stays assert-based so callers see the same exception type.
    assert isinstance(alpha, float), "alpha must be a float!"
    assert isinstance(mode, str), "'mode' must be a string!"
    single_modes = ('go', 'tissue', 'phenotype')
    assert mode == 'all' or mode in single_modes, "Invalid mode!"

    def _dictionary(kind):
        # Fetch each annotation dictionary once and reuse it from the
        # class-level cache on later calls.
        cache = EnrichmentProcessing._go_dicts
        if kind not in cache:
            cache[kind] = tea.fetch_dictionary(kind)
        return cache[kind]

    if mode == 'all':
        frames = []
        for k, arg in enumerate(single_modes):
            print(f'Calculating... {100 * k / 3 :.2f}% done')
            df = tea.enrichment_analysis(self.gene_set, _dictionary(arg), alpha=alpha)
            frames.append(df)
            tea._plot_enrichment_results(
                df, title=f'{arg.capitalize()} Enrichment Analysis', analysis=arg)
            plt.title(f'{arg.capitalize()} Enrichment Analysis', fontsize=20)
            print(df)
        # DataFrame.append no longer exists in pandas >= 2.0; concat is the
        # supported way to stack the per-analysis results.
        df_comb = pd.concat(frames)
    else:
        # alpha is forwarded here too, so the requested threshold is honoured
        # in single-mode analyses as well.
        df_comb = tea.enrichment_analysis(self.gene_set, _dictionary(mode),
                                          alpha=alpha, show=True)
        tea._plot_enrichment_results(
            df_comb, title=f'{mode.capitalize()} Enrichment Analysis', analysis=mode)
        plt.title(f'{mode.capitalize()} Enrichment Analysis', fontsize=20)

    if save_csv:
        self._enrichment_save_csv(df_comb, fname)
    plt.show()
    return df_comb
thresh = float(thresh) # typecasting method = f_dict[-7:-4] ntiss = len(tissue_df.columns) ngenes = tissue_df.shape[0] # open each enrichment set for fodder in os.walk(path_sets): for f_set in fodder[2]: df = pd.read_csv(path_sets + f_set) test = df.gene.values ntest = len(test) short_name = f_set[16:len(f_set) - 16] df_analysis, unused = tea.enrichment_analysis(test, tissue_df, alpha=0.05, show=False) # save the analysis to the relevant folder savepath = '../output/HGT' + annot + '_' + method + '_Results/' df_analysis.to_csv(savepath + f_set + '.csv', index=False) tea.plot_enrichment_results(df_analysis, save='savepath' + f_set + 'Graph', ftype='pdf') nana = len(df_analysis) # len of results nun = len(unused) # number of genes dropped avf = df_analysis['Enrichment Fold Change'].mean() avq = df_analysis['Q value'].mean() s = '{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}'.format(
def run_post(project, code='post', requires='diff'):
    """Run the post (enrichment) analyses for ``project`` and archive them.

    The organism is taken from the first sample of the project; anything
    other than genus ``caenorhabditis`` / species ``elegans`` prints a notice
    and returns without doing any work.

    Otherwise, every ``betas*.csv`` file in ``project.paths[requires]`` is
    read, the ``ens_gene`` values of rows with ``qval`` below the configured
    ``qThreshold`` are collected, and for each configured ``teaTypes`` entry
    the list is passed to ``tea.enrichment_analysis`` and
    ``tea.plot_enrichment_results``, both saving under
    ``project.paths[code]``. Files with no gene under the threshold are
    reported and skipped. Finally the output is archived, the archive path is
    stored in ``project.files[code]['archive']`` and the project is saved.

    Parameters
    ----------
    project : project object providing ``objectId``, ``relation``, ``paths``,
        ``files`` and ``save``
    code : key of the output path / files entry for this step
    requires : key of the path holding the differential-expression results
    """
    print_with_flush('# starting post for project {}'.format(project.objectId))

    organism = project.relation('samples').query()[0].reference.organism
    if organism.genus != 'caenorhabditis' or organism.species != 'elegans':
        print_with_flush('# Currently, post analysis is only supported for '
                         'C. elegans')
        return

    config = Config.get()
    q_threshold = config['qThreshold']
    tea_types = config['teaTypes']

    diff_path = project.paths[requires]
    post_path = project.paths[code]

    # Each dictionary is fetched once, on first use, and shared by all files.
    dictionaries = {}

    for file in os.listdir(diff_path):
        if not (file.startswith('betas') and file.endswith('.csv')):
            continue

        file_name = os.path.splitext(os.path.basename(file))[0]
        file_path = os.path.join(diff_path, file)

        df = pd.read_csv(file_path, index_col=0)
        gene_list = df[df.qval < q_threshold].ens_gene

        # Skip if gene list is empty.
        if len(gene_list) == 0:
            print_with_flush(
                '# there are no genes with q < {} in {}!'.format(
                    q_threshold, file))
            print_with_flush('# this means there are no significantly '
                             'differentially-expressed genes for '
                             'this set of conditions.')
            continue

        for tea_type in tea_types:
            tea_file = '{}_{}'.format(
                file_name.replace('betas_wt', 'enrichment'), tea_type)
            tea_title = os.path.join(post_path, tea_file)
            print_with_flush(
                '# performing {} enrichment analysis for {}'.format(
                    tea_type, file))

            if tea_type not in dictionaries:
                dictionaries[tea_type] = tea.fetch_dictionary(tea_type)
            df_dict = dictionaries[tea_type]

            df_results = tea.enrichment_analysis(gene_list, df_dict,
                                                 aname=tea_title + '.csv',
                                                 save=True, show=False)
            tea.plot_enrichment_results(df_results, analysis=tea_type,
                                        title=tea_title, save=True)

    # Archive.
    archive_path = archive(project, code)

    if code not in project.files:
        project.files[code] = {}
    project.files[code]['archive'] = archive_path
    project.save()

    print_with_flush('# done')