def run_tea(parent, sleuth, out, q_threshold=0.05):
    """
    Runs TEA on sleuth output.

    The working directory is switched to ``parent`` for the duration of the
    run and is always restored afterwards, even if an analysis raises.

    Parameters:
    -------------------
    parent - directory to change into before doing any work; ``sleuth`` and
             ``out`` are resolved after this change of directory
    sleuth - directory whose ``.csv`` files are read (first column is used
             as the index; the ``qval`` and ``ens_gene`` columns are read)
    out - directory that receives the results (created if missing); it also
          replaces the ``betas_wt`` part of each input name when building
          output file names
    q_threshold - rows with ``qval`` strictly below this value make up the
                  gene list handed to TEA

    Exits the interpreter with status 0 when ``tissue_enrichment_analysis``
    or ``pandas`` cannot be imported.
    """
    try:
        import tissue_enrichment_analysis as tea
    except ImportError:
        print_with_flush('# TEA is not installed...skipping')
        sys.exit(0)

    try:
        import pandas as pd
    except ImportError:
        print_with_flush('# pandas is not installed...skipping')
        sys.exit(0)

    analyses = ['tissue', 'phenotype', 'go']

    # Load sleuth results.
    wdir = os.getcwd()
    print_with_flush('# entering 3_diff_exp')
    os.chdir(parent)

    # Everything below runs inside ``parent``; the finally clause guarantees
    # the caller gets its original working directory back on any failure.
    try:
        print_with_flush('# creating {} directory'.format(out))
        os.makedirs(out, exist_ok=True)

        for file in os.listdir(sleuth):
            if not file.endswith('.csv'):
                continue

            df = pd.read_csv(os.path.join(sleuth, file), index_col=0)
            gene_list = df[df.qval < q_threshold].ens_gene
            name = os.path.splitext(os.path.basename(file))[0]

            # len() is used on purpose: the truth value of a pandas Series
            # is ambiguous.
            if len(gene_list) == 0:
                print_with_flush(('# there are no genes with q < {} in ' +
                                  '{}!').format(q_threshold, file))
                print_with_flush('# this means there are no significantly ' +
                                 'differentially-expressed genes for ' +
                                 'this set of conditions.')
                continue

            for analysis in analyses:
                print_with_flush(('# performing {} enrichment analysis ' +
                                  'for {}').format(analysis, file))
                fname = '{}_{}'.format(name.replace('betas_wt', out),
                                       analysis)
                title = os.path.join(out, fname)
                df_dict = tea.fetch_dictionary(analysis)
                df_results = tea.enrichment_analysis(gene_list, df_dict,
                                                     aname=title + '.csv',
                                                     save=True, show=False)
                tea.plot_enrichment_results(df_results, analysis=analysis,
                                            title=title, save=True)
    finally:
        os.chdir(wdir)
        print_with_flush('# returned to root')
def walker(tissue_df, directory, save=True):
    """Given the tissue dictionary and a directory to save to,
    open all the gene sets, analyze them and deposit the results in the
    specified directory.

    Gene sets are every file found by walking the module-level ``path_sets``
    directory; each one is read as a CSV with a ``gene`` column.

    Parameters:
    -------------------
    tissue_df - pandas dataframe containing specified tissue dictionary
    directory - where to save to. It is used as a plain string prefix for
                every output path, so it must end with a path separator.
    save - boolean indicating whether to save results or not.
           NOTE: the ``empty.txt`` header is written regardless of ``save``.
    """
    with open(directory + 'empty.txt', 'w') as f:
        f.write('Genesets with no enrichment:\n')

    # go through each file in the folder
    for root, _subdirs, files in os.walk(path_sets):
        for f_set in files:
            # open df from the directory the file was actually found in, so
            # gene sets inside sub-folders are read from the right place
            df = pd.read_csv(os.path.join(root, f_set))

            # extract gene list and analyze
            short_name = f_set
            test = df.gene.values
            df_analysis, unused = tea.enrichment_analysis(test, tissue_df,
                                                          show=False)

            # Nothing is written for this gene set unless saving was asked
            # for. (The previous test, ``empty is False & save``, parsed as
            # ``empty is (False & save)`` and ignored ``save`` entirely.)
            if not save:
                continue

            if df_analysis.empty:
                # no enrichment: just record the gene set name
                with open(directory + 'empty.txt', 'a+') as f:
                    f.write(short_name + '\n')
                continue

            # save without index
            csv_path = directory + short_name + '.csv'
            df_analysis.to_csv(csv_path, index=False)
            # add a comment line naming the gene set at the top of the csv
            line_prepender(csv_path, '#' + short_name + '\n')
            # plot
            tea.plot_enrichment_results(df_analysis, title=short_name,
                                        dirGraphs=directory, ftype='pdf')
            plt.close()
def walker(tissue_df, directory, save=True):
    """Given the tissue dictionary and a directory to save to,
    open all the gene sets, analyze them and deposit the results in the
    specified directory.

    Gene sets are every file found by walking the module-level ``path_sets``
    directory; each one is read as a CSV with a ``gene`` column.

    Parameters:
    -------------------
    tissue_df - pandas dataframe containing specified tissue dictionary
    directory - where to save to. It is used as a plain string prefix for
                every output path, so it must end with a path separator.
    save - boolean indicating whether to save results or not.
           NOTE: the ``empty.txt`` header is written regardless of ``save``.
    """
    with open(directory+'empty.txt', 'w') as f:
        f.write('Genesets with no enrichment:\n')

    # go through each file in the folder
    for root, _subdirs, files in os.walk(path_sets):
        for f_set in files:
            # open df from the directory the file was actually found in, so
            # gene sets inside sub-folders are read from the right place
            df = pd.read_csv(os.path.join(root, f_set))

            # extract gene list and analyze
            short_name = f_set
            test = df.gene.values
            df_analysis, unused = tea.enrichment_analysis(test, tissue_df,
                                                          show=False)

            # Nothing is written for this gene set unless saving was asked
            # for. (The previous test, ``empty is False & save``, parsed as
            # ``empty is (False & save)`` and ignored ``save`` entirely.)
            if not save:
                continue

            if df_analysis.empty:
                # no enrichment: just record the gene set name
                with open(directory+'empty.txt', 'a+') as f:
                    f.write(short_name+'\n')
                continue

            # save without index
            csv_path = directory+short_name+'.csv'
            df_analysis.to_csv(csv_path, index=False)
            # add a comment line naming the gene set at the top of the csv
            line_prepender(csv_path, '#' + short_name+'\n')
            # plot
            tea.plot_enrichment_results(df_analysis, title=short_name,
                                        dirGraphs=directory, ftype='pdf')
            plt.close()
# Draw a two-panel enrichment figure (down-regulated tissues on top,
# up-regulated below) for every species and save it as a PDF.
for species in Lreceptacles:
    current = Lreceptacles[species]
    n = current.n  # number of dfs
    keys = current.result_dict.keys()
    if n == 0:
        # nothing to plot for this species (a bare `next` here was a no-op)
        continue
    i = 0
    # NOTE(review): the original indentation was lost; the plotting below is
    # assumed to sit inside this `if`, since it indexes ax[0] and ax[1].
    if n > 1:
        fig, ax = plt.subplots(nrows=n, figsize=(8, 8))
        fig.subplots_adjust(top=2)
        fig.suptitle(species, fontsize=15, y=1.02)

        tea.plot_enrichment_results(
            current.result_dict['Infection_downregulated'],
            title='Name', save=False, fig=fig, ax=ax[0])
        # suppress xlabel
        ax[0].set_xlabel('')
        ax[0].set_ylabel('Down-Regulated Tissues')
        ax[0].yaxis.set_label_position('right')

        tea.plot_enrichment_results(
            current.result_dict['Infection_upregulated'],
            title='Name', save=False, fig=fig, ax=ax[1])
        ax[1].set_ylabel('Up-Regulated Tissues')
        xlabel = ax[1].set_xlabel(
            'Enrichment Fold Change - {0}'.format(species))
        ax[1].yaxis.set_label_position('right')

        fig.tight_layout()
        # `rect` is a tight_layout() option, not a savefig() one, so it is no
        # longer forwarded here. NOTE(review): newer Matplotlib releases are
        # believed to reject unknown savefig keywords - confirm against the
        # version in use.
        plt.savefig('../output/Engelmann/Graphs/'+species+'Enrichment.pdf',
                    bbox_extra_artists=[xlabel], bbox_inches='tight')
        # plt.close()
# Analyze every gene-set file of the current walk entry, save its results
# and append one summary row per gene set to ExecutiveSummary.csv.
for f_set in fodder[2]:
    df = pd.read_csv(path_sets + f_set)
    test = df.gene.values
    ntest = len(test)
    short_name = f_set[16:len(f_set)-16]

    df_analysis, unused = tea.enrichment_analysis(test, tissue_df,
                                                  alpha=0.05, show=False)

    # save the analysis to the relevant folder
    savepath = '../output/HGT'+annot + '_' + method + '_Results/'
    df_analysis.to_csv(savepath + f_set+'.csv', index=False)

    # Use the savepath variable; the literal string 'savepath' was passed
    # before, so the value never pointed into the results folder.
    tea.plot_enrichment_results(df_analysis,
                                save=savepath+f_set+'Graph', ftype='pdf')

    nana = len(df_analysis)  # len of results
    nun = len(unused)  # number of genes dropped
    avf = df_analysis['Enrichment Fold Change'].mean()
    avq = df_analysis['Q value'].mean()

    # one comma-separated row; ntest-nun is the number of genes kept
    s = '{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}'.format(
        annot, thresh, method, f_set, ntiss, ntest, nana,
        ntest-nun, avf, avq, ngenes)
    with open(dirSummaries+'ExecutiveSummary.csv', 'a+') as fSum:
        fSum.write(s)
        fSum.write('\n')

# Print summary to csv
# NOTE(review): the original indentation was lost; this read is assumed to
# run once after the loop rather than once per gene set - confirm.
df_summary = pd.read_csv(dirSummaries+'ExecutiveSummary.csv', comment='#')
get_n(go_traits, worm_genes)) print('-----------------------------------\n') # one of the traits foolishly has a '/' if '/' in trait: # rename the trait with this character or it breaks your code trait = 'post bronchodilator fev1 fevc ratio' # enrichment analyses: df = tea.enrichment_analysis(worm_genes, phenotype_df, show=False) df = df[df.Observed > n_min_obs].copy() df.to_csv('../output/phenologues_2/pea_' + trait + '.csv', index=False) if 'lupus' in trait: print('Graphing PEA results for ', trait) fig, ax = plt.subplots() tea.plot_enrichment_results(df, title='../output/lupus_pea', save=True, analysis='phenotype') plt.close() df = tea.enrichment_analysis(worm_genes, tissue_df, show=False) df = df[df.Observed > n_min_obs].copy() df.to_csv('../output/disease_tissues_2/tea_' + trait + '.csv', index=False) if 'lupus' in trait: print('Graphing TEA results for ', trait) fig, ax = plt.subplots() tea.plot_enrichment_results(df, title='../output/lupus_tea', save=True, analysis='tissue') plt.close() df = tea.enrichment_analysis(worm_genes, go_df, show=False) df = df[df.Observed > n_min_obs].copy()
# Analyze one gene-set file (f_set), save its results and append one summary
# row to ExecutiveSummary.csv.
# NOTE(review): the enclosing loop header is outside this view and the
# original indentation was lost - confirm the nesting before applying.
df = pd.read_csv(path_sets + f_set)
test = df.gene.values
ntest = len(test)
short_name = f_set[16:len(f_set) - 16]

df_analysis, unused = tea.enrichment_analysis(test, tissue_df,
                                              alpha=0.05, show=False)

# save the analysis to the relevant folder
savepath = '../output/HGT' + annot + '_' + method + '_Results/'
df_analysis.to_csv(savepath + f_set + '.csv', index=False)

# Use the savepath variable; the literal string 'savepath' was passed
# before, so the value never pointed into the results folder.
tea.plot_enrichment_results(df_analysis,
                            save=savepath + f_set + 'Graph', ftype='pdf')

nana = len(df_analysis)  # len of results
nun = len(unused)  # number of genes dropped
avf = df_analysis['Enrichment Fold Change'].mean()
avq = df_analysis['Q value'].mean()

# one comma-separated row; ntest - nun is the number of genes kept
s = '{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}'.format(
    annot, thresh, method, f_set, ntiss, ntest, nana,
    ntest - nun, avf, avq, ngenes)
with open(dirSummaries + 'ExecutiveSummary.csv', 'a+') as fSum:
    fSum.write(s)
    fSum.write('\n')

# Print summary to csv
df_summary = pd.read_csv(dirSummaries + 'ExecutiveSummary.csv', comment='#')
def run_post(project, code='post', requires='diff'):
    """
    Runs the enrichment analyses for a project and archives the output.

    Every ``betas*.csv`` file in the project's ``requires`` path is read,
    the genes with ``qval`` below the configured ``qThreshold`` are
    selected, and one enrichment analysis per configured ``teaTypes`` entry
    is run and plotted into the project's ``code`` path. Afterwards the
    results are archived and the archive path is stored on the project.

    Parameters:
    -------------------
    project - project object; its samples, paths and files are used
    code - key of this step in ``project.paths`` and ``project.files``
    requires - key in ``project.paths`` of the step whose output is read

    Returns early, without analyzing or archiving, when the organism of the
    first sample is not C. elegans.
    """
    print_with_flush(
        '# starting post for project {}'.format(project.objectId))

    organism = project.relation('samples').query()[0].reference.organism
    is_elegans = (organism.genus == 'caenorhabditis'
                  and organism.species == 'elegans')
    if not is_elegans:
        print_with_flush('# Currently, post analysis is only supported for '
                         'C. elegans')
        return

    config = Config.get()
    q_threshold = config['qThreshold']
    tea_types = config['teaTypes']
    diff_path = project.paths[requires]
    post_path = project.paths[code]

    for entry in os.listdir(diff_path):
        # only the betas tables are of interest
        if not (entry.startswith('betas') and entry.endswith('.csv')):
            continue

        stem = os.path.splitext(os.path.basename(entry))[0]
        table = pd.read_csv(os.path.join(diff_path, entry), index_col=0)
        significant = table[table.qval < q_threshold].ens_gene

        # Skip if gene list is empty.
        if len(significant) == 0:
            print_with_flush(
                '# there are no genes with q < {} in {}!'.format(
                    q_threshold, entry))
            print_with_flush('# this means there are no significantly '
                             'differentially-expressed genes for '
                             'this set of conditions.')
            continue

        for tea_type in tea_types:
            tea_file = '{}_{}'.format(
                stem.replace('betas_wt', 'enrichment'), tea_type)
            tea_title = os.path.join(post_path, tea_file)
            print_with_flush(
                '# performing {} enrichment analysis for {}'.format(
                    tea_type, entry))

            dictionary = tea.fetch_dictionary(tea_type)
            results = tea.enrichment_analysis(significant, dictionary,
                                              aname=tea_title + '.csv',
                                              save=True, show=False)
            tea.plot_enrichment_results(results, analysis=tea_type,
                                        title=tea_title, save=True)

    # Archive.
    archive_path = archive(project, code)

    if code not in project.files:
        project.files[code] = {}
    project.files[code]['archive'] = archive_path
    project.save()

    print_with_flush('# done')