def load_and_prepare_data(indir, file_patt, comparisons, pids=consts.PIDS, alpha=0.005, alpha_relevant=0.05, outdir=None):
    """
    Load IPA pathway data from the raw exported files, preparing several representations.
    If requested, save some of these to the specified output directory.
    :param indir: Directory containing the raw exported IPA reports.
    :param file_patt: Filename pattern used to locate the report for each (patient, comparison) pair.
    :param comparisons: Iterable of comparison names to load.
    :param pids: Iterable of patient IDs.
    :param alpha: Significance threshold applied to the p value.
    :param alpha_relevant: Less stringent threshold used to retain 'relevant' pathways.
    :param outdir: If supplied, export Excel files to this directory.
    :return: Tuple of (relevant results dict, wideform DataFrame, set of significant pathways).
    """
    plogalpha = -np.log10(alpha)
    plogalpha_relevant = -np.log10(alpha_relevant)

    res = ipa.load_raw_reports(indir, file_patt, pids, comparisons)

    # retain only 'relevant' pathways and flatten the tuple keys to strings
    # (iterate over a copy of the items, since the dict is modified as we go)
    for k, v in list(res.items()):
        rele_ix = v.index[v['-logp'] >= plogalpha_relevant]
        res['_'.join(k)] = v.loc[rele_ix]
        res.pop(k)

    # wideform version of this (i.e. 30 blocks)
    res_wide = ipa_results_to_wideform(res, plogalpha)

    # get a list of significant pathways (in at least one comparison)
    pathways_significant = set()
    for k, v in res.items():
        pathways_significant.update(v.index[v['-logp'] > plogalpha])

    if outdir is not None:
        # export full wideform results
        res_wide.to_excel(os.path.join(outdir, "ipa_results_full.xlsx"))

        # export significant results to an Excel file with separate tabs
        res_sign = dict([
            (k, v.loc[v['-logp'] > plogalpha]) for k, v in res.items()
        ])
        excel.pandas_to_excel(res_sign, os.path.join(outdir, "ipa_results_significant_separated.xlsx"))

        # export wideform, reduced to include only significant pathways
        res_wide.loc[sorted(pathways_significant)].to_excel(
            os.path.join(outdir, "ipa_results_significant.xlsx")
        )

    return res, res_wide, pathways_significant
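# --- Illustrative sketch (not part of the original source) ----------------------
# excel.pandas_to_excel is used throughout these scripts to write a dict of
# DataFrames to a single workbook, one sheet per key. Its real implementation
# lives elsewhere in the repository; a minimal stand-in using only pandas might
# look like this (the 31-character truncation reflects Excel's sheet name limit).
import pandas as pd

def pandas_to_excel_sketch(dict_of_dfs, fn, write_index=True):
    """Write each DataFrame in dict_of_dfs to its own sheet of the workbook fn."""
    with pd.ExcelWriter(fn) as writer:
        for sheet_name, df in dict_of_dfs.items():
            df.to_excel(writer, sheet_name=str(sheet_name)[:31], index=write_index)
# ---------------------------------------------------------------------------------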
print "Found %d relevant input (DMP) files: %s" % (len(dmp_fns), ', '.join(dmp_fns)) outdir = output.unique_output_dir("mb_dmps") res = {} for fn in dmp_fns: base = os.path.splitext(os.path.basename(fn))[0] res[base] = {} dat = pd.read_excel(fn, sheet_name=None) for cmp, df in dat.items(): res[base][cmp] = annot_one(df, anno) # save to Excel out_fn = os.path.join(outdir, os.path.basename(fn)) excel.pandas_to_excel(res[base], out_fn, write_index=False) # 2.1 Look for common DMPs # 'Refold' the previous results dictionary res_flat = dictionary.nested_dict_to_flat(res) res_by_cmp = dict([(k[::-1], v) for k, v in res_flat.items()]) res_by_cmp = dictionary.flat_dict_to_nested(res_by_cmp) common_dmps = {} for cmp, d in res_by_cmp.items(): common_dmps[cmp] = setops.reduce_intersection( *[t.probe_id for t in d.values()]) # 2.2 Look for common genes
        the_counts = pd.Series(the_counts)
        the_counts_full = pd.Series(the_counts_full)
        probe_counts[p][typ] = the_counts
        probe_counts_full[p][typ] = the_counts_full
        probe_dist[p][typ] = the_counts.divide(the_counts.sum())
        probe_dist_full[p][typ] = the_counts_full.divide(the_counts_full.sum())
        probe_dist_rel_bg[p][typ] = probe_dist[p][typ] / bg_dist
        probe_dist_full_rel_bg[p][typ] = probe_dist_full[p][typ] / bg_dist

excel.pandas_to_excel(to_xls, os.path.join(outdir, "dmr_motif_analysis_patient_specific.xlsx"))

# motif_count_breaks = [0, 2, 5, 10, 20, 30, 100]
# motif_count_breaks_colnames = interval_names_from_bin_edges(motif_count_breaks, add_infty=True)
# motif_count_breaks_nif = [0, 2, 4, 6, 8, 10]
# motif_count_breaks_nif_colnames = interval_names_from_bin_edges(motif_count_breaks_nif, add_infty=True)
#
# motif_counts_binned = {}
# motif_counts_nif_binned = {}
# for p in pids:
#     motif_counts_binned[p] = {}
#     motif_counts_nif_binned[p] = {}
#     for typ in ['hypo', 'hyper']:
#         motif_counts_binned[p][typ] = pd.Series(index=cpg_statuses)
#         motif_counts_nif_binned[p][typ] = pd.Series(index=cpg_statuses)
p_res = {}
ss_res = {}
for typ in ('cell_culture', 'ffpe'):
    for src in ('star', 'salmon', 'star/cufflinks'):
        fn = os.path.join(outdir, "%s_%s.gct" % (SRC_MAP[src], typ))
        the_dir, the_stem = os.path.split(fn)
        outfn = os.path.join(the_dir, "p_result_%s.txt" % the_stem)
        if not os.path.exists(outfn):
            continue
        this_pres = load_pvalue_results(outfn)
        p_res.setdefault(typ, {})[SRC_MAP[src]] = this_pres
        ss_res.setdefault(typ, {})[SRC_MAP[src]] = simplicity_score(this_pres)

# export
# easiest way is to flatten the dictionary, then combine
export_p = dictionary.nested_dict_to_flat(p_res)
export_ss = dictionary.nested_dict_to_flat(ss_res)

to_export = {}
for k in export_p:
    the_key = '_'.join(k)
    this = export_p[k].copy()
    this.insert(this.shape[1], 'Simplicity score', export_ss[k])
    if k[0] == 'ffpe':
        this.insert(this.shape[1], 'Patient ID', nh_id_to_patient_id(this.index))
    to_export[the_key] = this

excel.pandas_to_excel(to_export, os.path.join(outdir, "wang_results.xlsx"))
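# --- Illustrative sketch (not part of the original source) ----------------------
# dictionary.nested_dict_to_flat is assumed to turn a nested dict such as
# {'ffpe': {'star': df}} into a flat dict keyed by tuples, e.g. {('ffpe', 'star'): df},
# which is what the '_'.join(k) step above relies on. A minimal recursive stand-in:
def nested_dict_to_flat_sketch(d, parent_key=()):
    """Flatten a nested dict into {tuple_of_keys: leaf_value}."""
    flat = {}
    for k, v in d.items():
        new_key = parent_key + (k,)
        if isinstance(v, dict):
            flat.update(nested_dict_to_flat_sketch(v, new_key))
        else:
            flat[new_key] = v
    return flat

# e.g. nested_dict_to_flat_sketch({'ffpe': {'star': 1, 'salmon': 2}})
#      -> {('ffpe', 'star'): 1, ('ffpe', 'salmon'): 2}
# ---------------------------------------------------------------------------------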
    jobs[lbl] = pool.apply_async(run_one_de, args=(dat, groups, cmp), kwds=de_params)
    # res[lbl] = run_one_de(dat, groups, cmp, **de_params)
    # print "%d DE genes\n" % (res[lbl].FDR <= de_params['fdr']).sum()

for lbl in jobs:
    res[lbl] = jobs[lbl].get(1e6)
    print lbl
    print "%d DE genes\n" % (res[lbl].FDR <= de_params['fdr']).sum()

for k, v in res.items():
    general.add_gene_symbols_to_ensembl_data(v, tax_id=10090)
    res_sign[k] = v.loc[v.FDR <= de_params['fdr']]

excel.pandas_to_excel(res, os.path.join(outdir, "mouse_GBM_NSC_DE_all.xlsx"))
excel.pandas_to_excel(res_sign, os.path.join(outdir, "mouse_GBM_NSC_DE_significant.xlsx"))

# finally, re-run with a lfc of zero
# disabled for now to speed things up
if False:
    de_params['lfc'] = 0
    jobs2 = {}
    print "No logFC requirement"
    for cmp in comparisons:
        lbl = "%s_vs_%s" % cmp
        jobs2[lbl] = pool.apply_async(run_one_de, args=(dat, groups, cmp),
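# --- Illustrative sketch (not part of the original source) ----------------------
# The dispatch/collect pattern above submits each comparison to a multiprocessing
# pool with apply_async and gathers the results with .get(). A self-contained toy
# version of the same pattern (the worker and its arguments are stand-ins, not
# run_one_de itself; wrap in an `if __name__ == '__main__'` guard on platforms
# that spawn rather than fork worker processes):
import multiprocessing as mp

def toy_worker(name, threshold=0.01):
    """Stand-in worker: returns a string derived from its inputs."""
    return "%s (threshold=%s)" % (name, threshold)

pool_sketch = mp.Pool(processes=2)
jobs_sketch = {}
for lbl_sketch in ("A_vs_B", "A_vs_C"):
    jobs_sketch[lbl_sketch] = pool_sketch.apply_async(toy_worker, args=(lbl_sketch,), kwds={'threshold': 0.05})
pool_sketch.close()
results_sketch = dict((lbl, job.get(1e6)) for lbl, job in jobs_sketch.items())
pool_sketch.join()
# ---------------------------------------------------------------------------------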
# for separated data, combine single and paired PC for maximum efficiency
for first_dim in dims:
    dims_pair = (first_dim, first_dim + 1)
    ix_all = setops.reduce_union(*[
        selected_by_quantile_separate_logfc[k].index for k in [(first_dim,), dims_pair]
    ])
    this_df = pd.DataFrame(index=ix_all)
    for k in [(first_dim,), dims_pair]:
        tt = selected_by_quantile_separate_logfc[k].copy()
        tt = tt.loc[:, tt.columns.str.contains('logFC')]
        tt.columns = tt.columns.str.replace(
            '_logFC', '_%s_logFC' % '-'.join([str(t + 1) for t in k]))
        this_df = pd.concat((this_df, tt), axis=1, sort=True)
    this_df.to_excel(os.path.join(outdir, "for_ipa_separate_logfc_pc%d.xlsx" % (first_dim + 1)))

# combine with DE results and export to table
for_export = {}
for first_dim in dims:
    dims_pair = (first_dim, first_dim + 1)
    for dim in [(first_dim,), dims_pair]:
        the_key = "PC_%s" % '-'.join([str(t + 1) for t in dim])
        this_feat = svd_res['feat_dat'][[i + 1 for i in dim]]
        this_ens = get_topmost_quantile_by_loading(this_feat, quantile).intersection(de_res.index)
        for_export[the_key] = de_res.loc[this_ens]

excel.pandas_to_excel(for_export, os.path.join(outdir, "full_de_syngeneic_only_filtered_by_biplot.xlsx"))
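# --- Illustrative sketch (not part of the original source) ----------------------
# setops.reduce_union is assumed to union an arbitrary number of index-like
# collections, as used above to build the combined row index. A minimal
# pandas-only stand-in:
import pandas as pd
from functools import reduce

def reduce_union_sketch(*indexes):
    """Union an arbitrary number of pandas Index objects (or iterables)."""
    idxs = [pd.Index(ix) for ix in indexes]
    return reduce(lambda a, b: a.union(b), idxs) if idxs else pd.Index([])

# e.g. reduce_union_sketch(['g1', 'g2'], ['g2', 'g3']) -> Index(['g1', 'g2', 'g3'], dtype='object')
# ---------------------------------------------------------------------------------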
    for_plot = {}
    for pid in groups[grp]:
        for_plot[pid] = de_dmr_de_logfc_tss[grp][[pid]].dropna()
        for_plot[pid].columns = ['logFC']
    plt_dict = same_de.bar_plot(for_plot, keys=groups[grp], figsize=(len(groups[grp]) - .5, 4.5))
    plt_dict['fig'].savefig(os.path.join(outdir, "de_direction_by_group_%s_tss.png" % grp.lower()), dpi=200)

# export for publication
de_dmr_dmr_all_for_export = {}
for grp, x in de_dmr_dmr_median_delta_all.items():
    this_ = x.dropna(axis=1, how="all").reset_index().rename({"index": "dmr_id"}, axis=1).copy()
    this_["consistent"] = this_["consistent"].astype(int)
    de_dmr_dmr_all_for_export[grp] = this_
fn = os.path.join(outdir, "dmr_from_group_spec_de_dmrs_all.xlsx")
excel.pandas_to_excel(de_dmr_dmr_all_for_export, fn, write_index=False)

# Venn diagrams of DE
fig = plt.figure(figsize=(5., 3.3))
ax = fig.add_subplot(111)
plot_venn_de_directions(de_dmr_de_logfc_all, set_colours_dict, ax=ax)
fig.savefig(os.path.join(outdir, "de_from_group_spec_de_dmrs_all.png"), dpi=200)

fig = plt.figure(figsize=(5., 3.3))
ax = fig.add_subplot(111)
plot_venn_de_directions(de_dmr_de_logfc_tss, set_colours_dict, ax=ax)
fig.savefig(os.path.join(outdir, "de_from_group_spec_de_dmrs_tss.png"), dpi=200)

# export for publication
de_dmr_de_all_for_export = {}
for grp, x in de_dmr_de_logfc_all.items():
for_export.to_excel(os.path.join(outdir, 'consistently_in_pair_only_across_all_refs.xlsx'))

# correct the reference PO lists, take the intersection, then export to a file
po_de_export = {}
for pid in pids:
    this_row = pair_only.loc[pid, external_ref_labels]
    this_genes_pre = reduce(intersecter, this_row)
    this_genes = sorted(this_genes_pre.difference(po_specific_to_all_refs))
    print "PID %s. Subtracted %d correction genes from the %d PO intersection genes to leave %d PO genes" % (
        pid, len(po_specific_to_all_refs), len(this_genes_pre), len(this_genes))
    po_de_export[pid] = de_res[(pid, pid)].loc[this_genes]

excel.pandas_to_excel(po_de_export, os.path.join(outdir, 'pair_only_de_lists_corrected.xlsx'))

# export with a different layout, analogous to trial 2
venn_set, venn_ct = setops.venn_from_arrays(*[po_de_export[pid].index for pid in pids])
po_combination_export = differential_expression.venn_set_to_dataframe(po_de_export, venn_set, pids)
po_combination_export.to_excel(os.path.join(outdir, 'pair_only_de_lists_combined_corrected.xlsx'))

# plot: how many DE genes are present in each reference comparison?
fig, axs = plt.subplots(nrows=2, ncols=3)
for pid in pids:
    if pid in subgroups['RTK I']:
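# --- Illustrative sketch (not part of the original source) ----------------------
# setops.venn_from_arrays is assumed to compute, for every membership pattern
# across the supplied collections, the elements showing exactly that pattern
# (keyed by a binary string) together with the corresponding counts. A minimal
# stand-in built from the standard library:
import itertools

def venn_from_arrays_sketch(*arrays):
    """Return ({pattern: sorted members}, {pattern: count}) for all non-null patterns."""
    sets = [set(a) for a in arrays]
    universe = set().union(*sets) if sets else set()
    venn_sets, venn_counts = {}, {}
    for pattern in itertools.product('01', repeat=len(sets)):
        key = ''.join(pattern)
        if key == '0' * len(sets):
            continue
        members = universe.copy()
        for included, s in zip(pattern, sets):
            members &= s if included == '1' else (universe - s)
        venn_sets[key] = sorted(members)
        venn_counts[key] = len(members)
    return venn_sets, venn_counts
# ---------------------------------------------------------------------------------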
    this_ix = (df.p_bonferroni <= alpha) & (df.NS.isin(namespaces)) & (df.enrichment == 'e')
    df_filt = df.loc[this_ix]
    # include bottom-most nodes only
    ix = []
    for go_id in df_filt.index:
        ix.append(len(go_obj.obo[go_id].get_all_children()) == 0)
    goea_res_filt[k] = df_filt.loc[ix]

# minor manipulation of results, then save to a single Excel file
# do this for full results and filtered
all_res = reannotate(goea_res)
all_res_filt = reannotate(goea_res_filt)

excel.pandas_to_excel(all_res, os.path.join(outdir, "goea_de_all_results.xlsx"))
excel.pandas_to_excel(all_res_filt, os.path.join(outdir, "goea_de_all_results_filtered.xlsx"))

# create (mega)heatmap of all results
tmp = pd.concat([v.name for v in all_res_filt.values()])
tmp = tmp.loc[~tmp.duplicated()]
for_plot = pd.DataFrame(index=tmp.values)
for pid in pids[::-1]:
    this = all_res[pid].reindex(tmp.index)
    this.index = tmp.values
    for_plot.insert(0, pid, -np.log10(this['p_bonferroni']))
to_export = collections.OrderedDict()
to_export['Explanation'] = pd.Series(
    collections.OrderedDict([
        ('DE ESC line 1', 'ENCODE H1 ESC (%d replicates)' % the_groups.value_counts()['ESC_encode']),
        ('DE ESC line 2', 'Cacchiarelli et al. (%d lines)' % the_groups.value_counts()['ESC_cacchiarelli']),
        ('DMR ESC line 1', 'ENCODE H7 ESC (no replicates)'),
        ('DMR ESC line 2', 'Weltner et al. H9 (3 replicates)'),
    ]),
    name='All comparisons are stated as iPSC - ESC.'
)
to_export.update(
    collections.OrderedDict([(pid, dedmr_results[pid]) for pid in pids if pid in dedmr_results])
)
excel.pandas_to_excel(to_export, os.path.join(outdir, "DE_DMR_results_combined.xlsx"))


def aggregate_dm_results_by_gene(dmr_res, genes):
    delta_m = {}
    fdr = {}
    for g in genes:
        this_ix = dmr_res.index[dmr_res.genes.apply(lambda x: g in x)]
        delta_m[g] = dmr_res.loc[this_ix, dmr_res.columns.str.contains('median_delta')].mean(axis=0)
        fdr[g] = dmr_res.loc[this_ix, dmr_res.columns.str.contains('padj')].mean(axis=0)
    delta_m = pd.DataFrame(delta_m).transpose().sort_index()
        others = [
            de_res_sign[("GBM%s" % gic_pid, "iNSC%s" % p)] for p in pd.Index(pids).drop(insc_pid)
        ]
        # 'syn only' index
        this_so_ix = this_syn.index.difference(setops.reduce_union(*[t.index for t in others]))
        syn_only[("GBM%s" % gic_pid, "iNSC%s" % insc_pid)] = this_syn.loc[this_so_ix]
        n_syn_only.loc[gic_pid, insc_pid] = this_so_ix.size

true_syn_only = dict([(p, syn_only[("GBM%s" % p, "iNSC%s" % p)]) for p in pids])

# export to list
excel.pandas_to_excel(true_syn_only, os.path.join(outdir, "de_only_in_syngeneic.xlsx"))

# export for IPA
# we're going to run the true syngeneic (10) against non-syngeneic chosen to give the greatest number of DE genes
# in practice, this means fixing the identity of the iNSC
selected_insc = ['018', '030', '054', '052']
cols = []
common_probes = set()
for pid in pids:
    cols.append("GBM%siNSC%s_logFC" % (pid, pid))
    common_probes.update(syn_only[("GBM%s" % pid, "iNSC%s" % pid)].index)
for p1 in selected_insc:
    for p2 in pids:
        k = "GBM%siNSC%s_logFC" % (p1, p2)
        if k not in cols:
## genes that are pair-only in every possible ref comparison
po_each = [
    sorted(
        reduce(intersecter, pair_only.loc[pid, ~pair_only.columns.str.contains(pid)])
    ) for pid in pids
]
po_each = pd.Series(po_each, index=pids)

# export gene lists here
po_export = {}
for pid in pids:
    po_export["GBM%s_pair_only" % pid] = de_res[(pid, pid)].loc[po_each.loc[pid]]
excel.pandas_to_excel(po_export, os.path.join(outdir, "pair_only_all_consistent.xlsx"))

subdir = os.path.join(outdir, "ipa_all_consistent")
if not os.path.isdir(subdir):
    os.makedirs(subdir)
ipa.results_to_ipa_format(po_export, outdir=subdir)

# now relax this requirement: which genes would be included if we require their inclusion in N of the cells
# (rather than all)?
possible_counts = range(1, pair_only.shape[1])
po_each_threshold = pd.DataFrame(index=pids, columns=possible_counts)
for pid in pids:
    this_counter = collections.Counter()
    # iterate over each column
    # we can include the empty diagonal cell, since it will not affect the counting
    for col in pair_only.columns:
        for e in pair_only.loc[pid, col]:
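# --- Illustrative sketch (not part of the original source) ----------------------
# `intersecter` is assumed to be a two-argument set-intersection helper suitable
# for use with reduce(), i.e. something equivalent to:
def intersecter_sketch(a, b):
    """Pairwise intersection, coercing both arguments to sets."""
    return set(a) & set(b)

# e.g. reduce(intersecter_sketch, [['g1', 'g2'], ['g2', 'g3'], ['g2']]) -> {'g2'}
# ---------------------------------------------------------------------------------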
res = collections.OrderedDict()
res_full = collections.OrderedDict()
for pid in pids:
    for c in comparison_names:
        fn = os.path.join(indir, "%s%s.csv" % (pid, c))
        this = pd.read_csv(fn, sep='\t', header=0, index_col=0, usecols=[0, 3, 5, 7])
        this.columns = ['n_gene', 'nes', 'fdr']
        this = this.reindex(keep_pathways).dropna(how='all')
        res_full["%s_%s" % (pid, comparison_names[c])] = this.loc[this.fdr < alpha_relevant]
        res["%s_%s" % (pid, comparison_names[c])] = this.loc[this.fdr < alpha]

pathways_sign = sorted(setops.reduce_union(*[t.index for t in res.values()]))
pathways_rele = sorted(setops.reduce_union(*[t.index for t in res_full.values()]))

excel.pandas_to_excel(res, os.path.join(outdir, "gsea_results_significant_by_patient.xlsx"))

# use this list to export a second wideform Excel file with the top list of pathways
for_export = pd.DataFrame(index=pathways_sign, columns=['n_gene'])
nes_columns = []
fdr_columns = []
for k, v in res.items():
    for_export.loc[v.index, 'n_gene'] = v.n_gene
    this_yn = pd.Series('N', index=pathways_sign)
    this_yn.loc[v.index] = 'Y'
    for_export.insert(for_export.shape[1], k, this_yn)
    for_export.insert(
    ]]
    # to_add.columns = ['chrom', 'coord', 'genes', 'gene_relation']
    df.insert(df.shape[1], 'chrom', to_add.CHR)
    df.insert(df.shape[1], 'coord', to_add.MAPINFO.fillna(-1).astype(int))
    df.insert(df.shape[1], 'gene', [
        ','.join(t) if hasattr(t, '__iter__') else '' for t in to_add.UCSC_RefGene_Name
    ])
    df.insert(df.shape[1], 'gene_relation', [
        ','.join(t) if hasattr(t, '__iter__') else '' for t in to_add.UCSC_RefGene_Group
    ])
    new_dat[k] = df

excel.pandas_to_excel(new_dat, os.path.join(outdir, fn.replace('.xlsx', '.annotated.xlsx')))

dmp_fn = os.path.join(indir, 'dmps_3021_swan.xlsx')
dmps = pd.read_excel(dmp_fn, header=0, index_col=0, sheet_name=None)

# combine all DMPs into a single wideform
cols = reduce(
    lambda x, y: x + y,
    [['%s' % t, '%s_logFC' % t, '%s_FDR' % t] for t in dmps]
)
all_probes = setops.reduce_union(*[v.loc[v['adj.P.Val'] < 0.05].index for v in dmps.values()])
all_probes = all_probes.intersection(anno.index)
dmps_all = pd.DataFrame(index=all_probes, columns=['CHR', 'coord', 'genes'] + cols)
1]

# run clustering to order the rows/cols nicely
rl = hc.linkage(co.fillna(0.).transpose(), method='average', metric='euclidean')
row_ix = hc.leaves_list(rl)
cl = hc.linkage(co.fillna(0.), method='average', metric='euclidean')
col_ix = hc.leaves_list(cl)

# reorder the data based on the clustering
co = co.iloc[col_ix, row_ix]
co_p = co_p.iloc[col_ix, row_ix]

excel.pandas_to_excel(
    {corr_metric: co, 'pval': co_p},
    os.path.join(outdir, "correlation_%s_syngeneic.xlsx" % corr_metric)
)

# quantify the number of patients involved in each of the pathways for follow up
follow_up_pathways = quantify_follow_up_pathways(
    ipa_res, co_p, comparisons, pids, alpha=alpha, alpha_strict=alpha_strict
)

# for plotting, we only need an indicator of which values are significant
plot_dict = plot_heatmap_with_quantification(
    co, co_p, follow_up_pathways,
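# --- Illustrative sketch (not part of the original source) ----------------------
# The reordering above uses hierarchical clustering purely to obtain a pleasing
# leaf order for rows and columns (hc is assumed to be scipy.cluster.hierarchy).
# A self-contained example of the same idea on a random DataFrame:
import numpy as np
import pandas as pd
from scipy.cluster import hierarchy as hc

example = pd.DataFrame(np.random.rand(6, 4), index=list('abcdef'), columns=list('wxyz'))
row_order = hc.leaves_list(hc.linkage(example.values, method='average', metric='euclidean'))
col_order = hc.leaves_list(hc.linkage(example.values.T, method='average', metric='euclidean'))
example_reordered = example.iloc[row_order, col_order]
# ---------------------------------------------------------------------------------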
po_counts = pair_only.applymap(len)
ro_counts = ref_only.applymap(len)

## genes that are pair-only in every possible ref comparison
po_each = [
    sorted(
        reduce(intersecter, pair_only.loc[pid, ~pair_only.columns.str.contains(pid)])
    ) for pid in pids
]
po_each = pd.Series(po_each, index=pids)

# export gene lists here
po_export = {}
for pid in pids:
    po_export["GBM%s_pair_only" % pid] = de_res[(pid, pid)].loc[po_each.loc[pid]]
excel.pandas_to_excel(po_export, os.path.join(outdir, "pair_only_all_consistent.xlsx"))

subdir = os.path.join(outdir, "ipa_all_consistent")
if not os.path.isdir(subdir):
    os.makedirs(subdir)
ipa.results_to_ipa_format(po_export, outdir=subdir)

# What is present in X vs Y_i that isn't in X vs any other Y?
po_diff = pd.DataFrame(index=pair_only.index, columns=pair_only.columns)
for pid in pids:
    for pid2 in pair_only.columns:
        the_ref = pair_only.loc[pid, pid2]
        all_else = pair_only.loc[pid, pair_only.columns != pid2]
        union_all_else = reduce(set.union, all_else, set())
        po_diff.loc[pid, pid2] = sorted(set(the_ref).difference(union_all_else))

# find DE genes that are always PO when a (non-matching) iNSC reference is used, but NOT when an external reference
    subgroup_ind,
    subgroup_set_colours,
    venn_set=venn_set,
    min_size=1,
    n_plot=30,
)
ups['axes']['set_size'].set_xlabel("Number of pathways in single patient")
ups['axes']['main'].set_ylabel("Number of pathways in set")
ups['figure'].savefig(os.path.join(outdir, "upset_pathways.png"), dpi=200)

# export
s1_specific = {}
specific_sets = setops.specific_sets(pids)
for p, s in specific_sets.items():
    s1_specific[p] = s1_reports_all[p].loc[venn_set[s]]
excel.pandas_to_excel(s1_specific, os.path.join(outdir, "s1_patient_specific.xlsx"))

# S2 syngeneic-only
s2_syngeneic = {}
for p in pids:
    in_ours = s1_reports_all[p].index
    in_refs = setops.reduce_union(*[s2_reports_all["%s_%s" % (p, r)].index for r in refs])
    in_so = in_ours.difference(in_refs)
    tmp = s1_reports_all[p].loc[in_so]
    s2_syngeneic[p] = tmp.loc[tmp.nes.abs().sort_values(ascending=False).index]

fig = plt.figure()
ax = fig.add_subplot(111)