def generate_summary_df(all_in, comps, pids=consts.PIDS):
    """
    Summarise, per pathway, how many patients show it in the syngeneic comparison only, in at least one
    reference comparison only, or in both, and list which patients those are.
    :param all_in: DataFrame, indexes are pathways and columns are comparison names, looked up here as
    "<pid>_syngeneic" and "<pid>_<comparison>". Entries are boolean, where True indicates that the pathway is
    significant in that comparison.
    :param comps: List of comparisons, not including syngeneic.
    :param pids: List of PIDs. They are joined with ';' in the output, so they must be strings.
    :return: Tuple (n_set, at_a_glance). n_set is an integer DataFrame (pathways x 'Syngen. only', 'Ref. only',
    'Intersect.') of patient counts. at_a_glance is an object DataFrame holding, for each of the three
    categories, the patient count and a ';'-separated list of the PIDs involved.
    """
    pathways = all_in.index

    # (venn key, n_set column, at_a_glance count column, at_a_glance PID column)
    categories = [
        ('10', 'Syngen. only', 'n_syngen_only', 'syngen_only_pids'),
        ('01', 'Ref. only', 'n_ref_only', 'ref_only_pids'),
        ('11', 'Intersect.', 'n_intersect', 'intersect_pids'),
    ]

    n_set = pd.DataFrame(
        0,
        index=pathways,
        columns=['Syngen. only', 'Ref. only', 'Intersect.'],
        dtype=int
    )

    # venn key -> pathway -> list of PIDs falling in that category
    pids_by_category = {}
    for cat in categories:
        pids_by_category[cat[0]] = {pw: [] for pw in pathways}

    for pid in pids:
        syn_flags = all_in.loc[:, "%s_syngeneic" % pid]
        ref_flags = all_in.loc[:, ["%s_%s" % (pid, t) for t in comps]].any(axis=1)
        vs, _ = setops.venn_from_arrays(all_in.index[syn_flags], all_in.index[ref_flags])
        for cat in categories:
            venn_key, count_col = cat[0], cat[1]
            n_set.loc[vs[venn_key], count_col] += 1
            for pw in vs[venn_key]:
                pids_by_category[venn_key][pw].append(pid)

    # at-a-glance table of which patients are involved in each pathway, categorised as
    # 'syn only', 'ref only' and 'intersection'
    at_a_glance = pd.DataFrame(
        index=pathways,
        columns=['n_syngen_only', 'syngen_only_pids', 'n_ref_only', 'ref_only_pids', 'n_intersect', 'intersect_pids'],
        dtype=object
    )
    for pw in pathways:
        for cat in categories:
            involved = pids_by_category[cat[0]][pw]
            at_a_glance.loc[pw, cat[2]] = len(involved)
            at_a_glance.loc[pw, cat[3]] = ';'.join(involved)

    return n_set, at_a_glance
def compute_cross_comparison_correction(res, samples, external_refs, set_type='pair_only'):
    """
    Compute the _correction_ list of features for the supplied results. These are the features that are
    EITHER present in every reference comparison but no cross-comparisons (set_type='ref_only')
    OR present in no reference comparison but all cross-comparisons (set_type='pair_only')
    :param res: Dictionary containing comparison results. Each comparison is keyed by the tuple (i, j), where i and j
    are the IDs of the two groups being compared. Values are iterables of unique feature identifiers (e.g. gene IDs,
    DMR cluster IDs). Must contain (i, i) for every sample i, and (i, j) for every sample i and every j in
    samples + external_refs.
    :param samples: The core sample list, without including external references. Must be a list, as it is
    concatenated with external_refs.
    :param external_refs: A list of external reference sample names.
    :param set_type: See description.
    :return: Dictionary with the entries
    'venn_set': DataFrame (samples x samples + external_refs). Each cell is the venn region of (i, i) vs (i, j)
    selected by set_type.
    'ref_diff_set': DataFrame (samples x external_refs). Each cell is the sorted list of features in the
    corresponding 'venn_set' cell that are in none of that row's cells for the core samples.
    'specific_to_each_ref': For each external reference, the intersection of 'ref_diff_set' down the column.
    'specific_to_all_refs': The intersection of 'specific_to_each_ref' across all references.
    :raises AttributeError: If set_type is not 'pair_only' or 'ref_only'.
    """
    # Validate before doing any work, so that a bad set_type is reported even when samples is empty.
    venn_key_by_set_type = {'pair_only': '10', 'ref_only': '01'}
    if set_type not in venn_key_by_set_type:
        raise AttributeError("set_type must be 'pair_only' or 'ref_only'.")
    kset = venn_key_by_set_type[set_type]

    members_rows = samples
    members_cols = members_rows + external_refs

    the_venn_set = pd.DataFrame(index=members_rows, columns=members_cols)
    for i in members_rows:
        p = res[(i, i)]
        for j in members_cols:
            x, _ = setops.venn_from_arrays(p, res[(i, j)])
            the_venn_set.loc[i, j] = x[kset]

    # For each reference, get the features that are pair only in that reference and not in any of the iNSC
    vs_diff = pd.DataFrame(index=members_rows, columns=external_refs)
    for i in members_rows:
        # the union over the core samples only depends on the row, so compute it once per row
        union_all_else = setops.reduce_union(*the_venn_set.loc[i, members_rows])
        for j in external_refs:
            the_ref = the_venn_set.loc[i, j]
            vs_diff.loc[i, j] = sorted(set(the_ref).difference(union_all_else))

    # Intersection down the columns gives us a correction list for each reference
    vs_specific_to_ref = vs_diff.apply(lambda x: setops.reduce_intersection(*x))

    # Intersection across the references gives us a final list that need correcting
    vs_specific_to_all_refs = setops.reduce_intersection(*vs_specific_to_ref)

    return {
        'specific_to_each_ref': vs_specific_to_ref,
        'specific_to_all_refs': vs_specific_to_all_refs,
        'venn_set': the_venn_set,
        'ref_diff_set': vs_diff
    }
def set_permutation_test(data, n_iter=1000, parallel=True):
    """
    Compare the observed venn region sizes of several sets against the sizes obtained from random sets of the
    same sizes, drawn from a universe as large as the union of the supplied sets.
    :param data: Dictionary of named collections. Only the number of members of each collection and the size of
    their union are used for the simulation.
    :param n_iter: Number of random draws.
    :param parallel: If True, the draws are distributed over a multiprocessing pool.
    :return: Dictionary with the entries
    'simulated_set_sizes': venn key -> list of the n_iter simulated region sizes
    'observed_set_sizes': venn region sizes of the supplied data
    'p': venn key -> two-sided value, computed as twice the smaller tail of the observed size's percentile in
    the simulated sizes
    'z': venn key -> percentile of the observed size minus 50 (i.e. a percentile offset in [-50, 50], not a
    standard score)
    """
    N = len(setops.reduce_union(*data.values()))
    set_sizes = collections.OrderedDict([(k, len(v)) for k, v in data.items()])

    simulated_sizes = collections.defaultdict(list)

    if parallel:
        # local import: this block only needs numpy for the worker initialiser
        import numpy as np

        # Workers created by fork start with a copy of the parent's numpy random state and would otherwise
        # all generate the same draws. Reseed each worker from fresh entropy when it starts.
        pool = mp.Pool(initializer=np.random.seed)
        try:
            jobs = [
                pool.apply_async(one_random_perm, args=(set_sizes, N)) for _ in range(n_iter)
            ]
            pool.close()
            pool.join()
            perm_counts = [j.get() for j in jobs]
        finally:
            # make sure the worker processes go away even if scheduling or collection failed
            pool.terminate()
    else:
        perm_counts = [one_random_perm(set_sizes, N) for _ in range(n_iter)]

    for vc in perm_counts:
        for k, v in vc.items():
            simulated_sizes[k].append(v)

    _, vc_true = setops.venn_from_arrays(*data.values())

    # to calculate the P value, we EITHER need to specify a single sided test OR decide how to compute a two-sided P
    # Some interesting discussions on this topic:
    # https://stats.stackexchange.com/questions/140107/p-value-in-a-two-tail-test-with-asymmetric-null-distribution
    # https://stats.stackexchange.com/questions/360864/2-tailed-permutation-tests-for-obviously-non-symmetric-data
    # https://stats.stackexchange.com/questions/34052/two-sided-permutation-test-vs-two-one-sided
    # However, a 'Z' value is easier to compute

    z = {}
    p = {}
    for k in simulated_sizes:
        obs = vc_true[k]
        t = stats.percentileofscore(simulated_sizes[k], obs)
        # double whichever tail the observation falls in
        if t <= 50:
            p[k] = 2 * t / 100.
        else:
            p[k] = 2 * (1 - t / 100.)
        z[k] = t - 50.

    return {
        'simulated_set_sizes': simulated_sizes,
        'observed_set_sizes': vc_true,
        'p': p,
        'z': z
    }
def venn_diagram(*args, **kwargs):
    """
    Draw a 2, 3 or 4 way Venn diagram of the supplied collections.
    :param args: Between 2 and 4 collections, one per set.
    :param kwargs: 'ax' (optional) is the axis to plot into; if it is missing or None, the current axis is used.
    All remaining keyword arguments are passed both to setops.venn_from_arrays and to the plotting function.
    :return: Tuple (venn, venn_sets, venn_counts): the object returned by the plotting function, followed by the
    two outputs of setops.venn_from_arrays.
    :raises NotImplementedError: If the number of collections is not 2, 3 or 4.
    """
    n = len(args)
    if n not in {2, 3, 4}:
        raise NotImplementedError(
            "At present, we only support 2, 3 and 4 way Venn diagrams")

    # Only fall back to the current axis when none was supplied: calling plt.gca() unconditionally creates a
    # new figure/axis as a side effect if none exists yet.
    ax = kwargs.pop('ax', None)
    if ax is None:
        ax = plt.gca()

    venn_sets, venn_counts = setops.venn_from_arrays(*args, **kwargs)
    if n == 2:
        venn = venn2(subsets=venn_counts, ax=ax, **kwargs)
    elif n == 3:
        venn = venn3(subsets=venn_counts, ax=ax, **kwargs)
    else:
        venn = venn4(venn_counts, ax=ax, **kwargs)
    return venn, venn_sets, venn_counts
def quantify_follow_up_pathways(ipa_res, corr_pval_df, comparisons, pids, alpha=0.05, alpha_strict=0.005):
    """
    For every pathway with at least one value below alpha in corr_pval_df, count the patients in which the
    pathway passes the strict threshold in the syngeneic comparison only, in at least one reference comparison
    only, or in both.
    :param ipa_res: DataFrame indexed by pathway. Columns are read as "<pid>_syngeneic_-logp" and
    "<pid>_<comparison>_-logp" and are compared against -log10(alpha_strict).
    :param corr_pval_df: DataFrame with pathways as columns. A pathway is followed up if any entry in its column
    is below alpha.
    :param comparisons: List of reference comparison names (not including syngeneic).
    :param pids: List of PIDs.
    :param alpha: Threshold applied to corr_pval_df when selecting pathways.
    :param alpha_strict: Threshold applied to the values in ipa_res when counting patients.
    :return: DataFrame, indexed by the selected pathways, with the columns 'Syngen. only', 'Ref. only' and
    'Intersect.' holding patient counts.
    """
    log_alpha_strict = -np.log10(alpha_strict)
    pws = corr_pval_df.columns[(corr_pval_df < alpha).any(axis=0)]
    pval_cols_syn = ipa_res.columns[ipa_res.columns.str.contains(
        r'_syngeneic_-logp')]

    follow_up_pathways = pd.DataFrame(
        index=pws, columns=['Syngen. only', 'Ref. only', 'Intersect.'])

    for pw in pws:
        # syngeneic values, relabelled by patient; dropna() returns a new object rather than modifying a slice
        this_ipa_pvals_syn = ipa_res.loc[pw, pval_cols_syn].dropna()
        this_ipa_pvals_syn.index = [
            c.replace('_syngeneic_-logp', '') for c in this_ipa_pvals_syn.index
        ]

        # patients passing the strict threshold in at least one reference comparison
        this_ipa_refs_sign = set()
        for r in comparisons:
            t = ipa_res.loc[pw, ["%s_%s_-logp" % (pid, r) for pid in pids]]
            # Keep only the text before the first underscore. This is done in plain Python rather than with
            # str.replace(r'_.*', ''), which is only treated as a regular expression by some pandas versions.
            t.index = [c.split('_', 1)[0] for c in t.index]
            this_ipa_refs_sign.update(t.index[(t >= log_alpha_strict).values])

        # use the venn set machinery for convenient counting
        for_venn = [
            this_ipa_pvals_syn.index[
                (this_ipa_pvals_syn >= log_alpha_strict).values].tolist(),
            sorted(this_ipa_refs_sign)
        ]
        _, vc = setops.venn_from_arrays(*for_venn)
        follow_up_pathways.loc[pw, 'Syngen. only'] = vc['10']
        follow_up_pathways.loc[pw, 'Ref. only'] = vc['01']
        follow_up_pathways.loc[pw, 'Intersect.'] = vc['11']

    return follow_up_pathways
['GIBCO']) pair_and_ref_discordant = pd.DataFrame(index=pids, columns=pids + additional_pids + ['GIBCO']) # loop over GBM samples for pid in pids: # syngeneic comparison the_pair = de_res[(pid, pid)] # loop over (i)NSC samples # when this is the same as the syngeneic comparison, there will (obviously) be no 'pair only' or 'ref only' # genes! for pid2 in pids + additional_pids + ['GIBCO']: the_ref = de_res[(pid, pid2)] the_sets, _ = setops.venn_from_arrays(the_pair.index, the_ref.index) pair_only.loc[pid, pid2] = the_sets['10'] ref_only.loc[pid, pid2] = the_sets['01'] # for overlapping genes: separate based on direction (matching or non matching) the_conc_idx = (the_pair.loc[the_sets['11']].Direction == the_ref.loc[the_sets['11']].Direction) pair_and_ref_concordant.loc[pid, pid2] = the_pair.loc[ the_sets['11']].loc[the_conc_idx].index pair_and_ref_discordant.loc[pid, pid2] = the_pair.loc[ the_sets['11']].loc[~the_conc_idx].index # can get counts like this po_counts = pair_only.applymap(len) ro_counts = ref_only.applymap(len) # the permutation part
# number of random draws passed to the permutation routine
n_perm = 1000

# DE
pids = consts.PIDS
outdir = output.unique_output_dir()

# load previously generated DE results
fn = os.path.join(
    HGIC_LOCAL_DIR,
    'current',
    'core_pipeline',
    'rnaseq',
    'full_de_syngeneic_only.xlsx'
)
de_res = pd.read_excel(fn, header=0, index_col=0)

# a feature is counted for a patient when its entry in that patient's column is 'Y'
all_ens = de_res.index[de_res[pids].eq('Y').any(axis=1)]
de_per_pat = {}
for pid in pids:
    de_per_pat[pid] = de_res.index[de_res[pid] == 'Y']
n_tot = {pid: len(de_per_pat[pid]) for pid in pids}

# observed number of features found in exactly one patient
vs, vc = setops.venn_from_arrays(*[de_per_pat[pid] for pid in pids])
pp = setops.specific_sets(pids)
n_ps = {pid: vc[pp[pid]] for pid in pids}

# perms
n_all = len(all_ens)
n_spec = run_patient_specific_permutations(n_tot, n_all, n_perm=n_perm)
fig, axs = plot_perms_kde_vs_obs(
    n_spec,
    n_ps,
    xlabel='Number of patient-specific DE genes',
    order=pids
)
# raster formats are written at 200 dpi, the PDF with the default settings
for out_name, save_opts in [
    ("patient_specific_de.png", {'dpi': 200}),
    ("patient_specific_de.tiff", {'dpi': 200}),
    ("patient_specific_de.pdf", {}),
]:
    fig.savefig(os.path.join(outdir, out_name), **save_opts)

fn = os.path.join(
    HGIC_LOCAL_DIR,
    'current',
    'core_pipeline',
    'methylation',
    'full_dmr_syngeneic_only.xlsx'
)
venn.venn_diagram(*[this_res[pid][t].index for t in ['iNSC'] + refs], set_labels=['iNSC'] + refs, ax=ax) ax.set_title(pid, fontsize=16) for i in range(len(pids), 12): ax = axs.flat[i] ax.set_visible(False) fig.subplots_adjust(left=0.02, right=0.98, bottom=0.02, top=0.95) fig.savefig(os.path.join(outdir, "number_de_genes_ref_comparison_%s_%s.png" % (nm, m)), dpi=200) # number of PO genes in Venn diagrams fig, axs = plt.subplots(nrows=3, ncols=4, figsize=(10, 6)) for i, pid in enumerate(pids): a = this_res[pid]['iNSC'].index po = [] for ref in refs: b = this_res[pid][ref].index vs, vc = setops.venn_from_arrays(a, b) po.append(vs['10']) ax = axs.flat[i] venn.venn_diagram(*po, set_labels=refs, ax=ax) ax.set_title("GBM%s pair only" % pid, fontsize=16) for i in range(len(pids), 12): ax = axs.flat[i] ax.set_visible(False) fig.subplots_adjust(left=0.02, right=0.98, bottom=0.02, top=0.95) fig.savefig(os.path.join(outdir, "po_de_genes_ref_comparison_%s_%s.png" % (nm, m)), dpi=200) # number of PO DE genes # overlap between individual references in terms of PO genes shared pct_pair_only_intersect = pd.DataFrame(index=pids, columns=refs)
# run the dgidb lookup against all genes
# have to chunk this operation to avoid error
gene_arrays = [t.gene.values for t in joint_de_dmr_s1.values()]
all_genes = sorted(setops.reduce_union(*gene_arrays))
dgi_all = druggable_genome.dgidb_lookup_drug_gene_interactions(all_genes)

# manually resolve a few known ambiguities:
# for each queried symbol, take the first ambiguous candidate whose geneName is the one we want
ambig = {'ELTD1': 'ADGRL4', 'ODZ3': 'TENM3'}
for k, v in ambig.items():
    candidates = [t for t in dgi_all['ambiguous'][k] if t['geneName'] == v]
    x = candidates[0]
    dgi_all['interactions'][k] = x['interactions']

de_dmr_by_member = [joint_de_dmr_s1[pid].index for pid in pids]
venn_set, venn_ct = setops.venn_from_arrays(*de_dmr_by_member)

# define short and long list

# long list: members of each patient's specific venn set
ss = setops.specific_sets(pids)
ps_de_dm_long = collections.OrderedDict(
    (pid, venn_set[ss[pid]]) for pid in pids
)
ps_de_dm_long_list = setops.reduce_union(*ps_de_dm_long.values())

# short list
dm_by_member = [dmr_res_s1[pid].results_significant.keys() for pid in pids]
vs_dm, vc_dm = setops.venn_from_arrays(*dm_by_member)
de_by_symbol = [de_res_s1[pid]['Gene Symbol'].dropna() for pid in pids]
vs_de, vc_de = setops.venn_from_arrays(*de_by_symbol)
# check that signature genes are all found in the data
# NOTE(review): SOURCE has lost its indentation; `genesets[k] = g_in` is assumed to run for every signature
# (not only when genes are missing) - confirm against the original layout.
for k, v in genesets.items():
    # apply the manual gene name corrections in place
    for i, t in enumerate(v):
        if t in manual_gene_name_correction:
            v[i] = manual_gene_name_correction[t]
    g_in = rnaseq_dat.index.intersection(v)
    if set(g_in) != set(v):
        missing = set(v).difference(rnaseq_dat.index)
        # Logger.warn is a deprecated alias; Logger.warning is the supported spelling
        logger.warning(
            "%d genes in the %s signature do not match with the data index and will be dropped: %s.",
            len(missing), k, ', '.join(missing))
    genesets[k] = g_in

# check here whether there is any overlap
vs, vc = setops.venn_from_arrays(*genesets.values())
# total number of genes in any venn region involving two or more signatures
n_overlap = sum(
    [vc[t] for t in setops.binary_combinations_sum_gte(len(genesets), 2)])
if n_overlap > 0:
    logger.warning(
        "The %d gene signatures used here have %d overlapping genes - please check this is OK.",
        len(genesets), n_overlap)

# run ssGSEA then Z transform the results
es = gsva.ssgsea(rnaseq_dat, genesets)
es_z = z_transform(es, axis=1)

# export: transposed scores, with the classification from the metadata appended as the last column
for_export = es_z.transpose()
for_export.insert(for_export.shape[1], 'Verhaak classification',
                  rnaseq_meta.loc[for_export.index, 'expression_subclass'])
main_fig_bounds['top'] - main_fig_bounds['bottom'], ]) fig.savefig(os.path.join(outdir, "dmr_direction_effect_size_pie_array.png"), dpi=200) # run down the rows or columns and generate an 'overlap spectrum' for each one # rows: check the effect of varying the iNSC line (CONSISTENCY) # cols: check the effect of varying the GIC line (non-syngeneic DIFFERENCE) # also repeat for the columns, which is just the S1 approach (SYNGENEIC) row_collapse = pd.DataFrame( dict([(pid, setops.quantify_feature_membership( setops.venn_from_arrays( *[dmr_res_all['%s-%s' % (pid, p)].keys() for p in pids])[1])) for pid in pids]))[pids] col_collapse = pd.DataFrame( dict([(pid, setops.quantify_feature_membership( setops.venn_from_arrays( *[dmr_res_all['%s-%s' % (p, pid)].keys() for p in pids])[1])) for pid in pids]))[pids] syn_dist = setops.quantify_feature_membership( setops.venn_from_arrays( *[dmr_res_all['%s-%s' % (p, p)].keys() for p in pids])[1]) # bar charts fig, axs = plt.subplots(len(pids),
pool = mp.Pool() jobs = {} for i in range(n_iter): jobs[i] = pool.apply_async(one_random_perm, args=(set_sizes, N)) pool.close() pool.join() for i, j in jobs.items(): vc = j.get() for k, v in vc.items(): simulated_sizes[k].append(v) for i in range(n_iter): rand_sets = [np.random.choice(N, v) for v in set_sizes.values()] _, vc = setops.venn_from_arrays(*rand_sets) for k, v in vc.items(): simulated_sizes[k].append(v) _, vc_true = setops.venn_from_arrays(*data.values()) # to calculate the P value, we EITHER need to specify a single sided test OR decide how to compute a two-sided P # Some interesting discussions on this topic: # https://stats.stackexchange.com/questions/140107/p-value-in-a-two-tail-test-with-asymmetric-null-distribution # https://stats.stackexchange.com/questions/360864/2-tailed-permutation-tests-for-obviously-non-symmetric-data # https://stats.stackexchange.com/questions/34052/two-sided-permutation-test-vs-two-one-sided # However, a 'Z' value is easier to compute z = {} p = {} for k in simulated_sizes.keys(): obs = vc_true[k]
# we're going to use passthrough mapping to customise the node colour # we'll define 3 colourmaps, with -log10(p) assigning the shade: # greyscale for syn. and ref. # reds for ref. only # blues for syn. only # colours are defined by HEX values? Add these to the nodes logp_vals = [t['-logp'] for t in this_ipa] vmax = max([t.max() for t in logp_vals]) # we need a lower offset for the non-grey colours, otherwise all the white shades look very similar vmin = -2 cmap_both_func = common.continuous_cmap(0, vmax, cmap='Greys') cmap_syn_func = common.continuous_cmap(vmin, vmax, cmap='Blues') cmap_ref_func = common.continuous_cmap(vmin, vmax, cmap='Reds') vs, _ = setops.venn_from_arrays(*[t.index for t in this_ipa]) node_significance = {} node_colours = {} node_attrs = {} for k, p_arr in vs.items(): ix = [t == '1' for t in k] n = float(sum(ix)) for pth in p_arr: m = 0 for i, t in enumerate(ix): if t: m += logp_vals[i][pth] node_attrs.setdefault( pth, {})["plogp_%s" % comparisons[i]] = logp_vals[i][pth]
u_hypo = {} u_hyper = {} core_dmrs_hypo[k1] = setops.reduce_intersection(*[ setops.reduce_intersection( *[x['Hypomethylated'] for k, x in u.items() if r in k]) for r in esc_ref_names ]) core_dmrs_hyper[k1] = setops.reduce_intersection(*[ setops.reduce_intersection( *[x['Hypermethylated'] for k, x in u.items() if r in k]) for r in esc_ref_names ]) # outcome vs, vc = setops.venn_from_arrays(*core_dmrs_hypo.values()) print "Hypomethylated core DMRs (hypo in both ESC comparisons). " print "Of the %d DMRs in our data, %d are shared with both HipSci and E-MTAB-6194" % ( len(core_dmrs_hypo[k_our_ipsc]), vc['111']) vs, vc = setops.venn_from_arrays(*core_dmrs_hyper.values()) print "Hypermethylated core DMRs (hyper in both ESC comparisons). " print "Of the %d DMRs in our data, %d are shared with both HipSci and E-MTAB-6194" % ( len(core_dmrs_hyper[k_our_ipsc]), vc['111']) # for each PID in iPSC vs ESC, define the core DMRs (shared by both ref comparisons) # then split into hyper and hypo core_dmr_our_ipsc_ref_esc = core_dmrs(dmr_res_our_ipsc_vs_esc, pids, esc_ref_names) core_dmr_direction_our_ipsc_ref_esc = core_dmr_by_direction(
# re-key the full results by patient ID alone, keeping only the matched GBM vs iNSC comparison
de_res_full_s1 = {
    pid: de_res_full_s1[("GBM%s" % pid, "iNSC%s" % pid)] for pid in pids
}

with open(fn, 'wb') as f:
    pickle.dump(de_res_full_s1, f)
logger.info("Saved S1 DE results to %s", fn)

# extract only significant DE genes
fdr_cutoff = de_params['fdr']
de_res_s1 = {}
for k, v in de_res_full_s1.items():
    de_res_s1[k] = v.loc[v.FDR < fdr_cutoff]

# generate wide-form lists and save to Excel file
de_by_member = [de_res_s1[pid].index for pid in pids]
venn_set, venn_ct = setops.venn_from_arrays(*de_by_member)

# add null set manually from full DE results:
# everything in the first patient's full table that is not in any venn set
de_genes_all = setops.reduce_union(*venn_set.values())
k_null = '0' * len(pids)
venn_set[k_null] = list(de_res_full_s1[pids[0]].index.difference(de_genes_all))
venn_ct[k_null] = len(venn_set[k_null])

de_data = setops.venn_set_to_wide_dataframe(
    de_res_s1,
    venn_set,
    pids,
    full_data=de_res_full_s1,
    cols_to_include=['logFC', 'FDR'],
    consistency_check_col='logFC',
    consistency_check_method='sign'
)
# add gene symbols back in
# reds for ref. only
# blues for syn. only
# colours are defined by HEX values? Add these to the nodes

# -log10 of the FDR for each comparison, offset by eps
logp_syn = -np.log10(res_syn.fdr + eps)
logp_r1 = -np.log10(res_r1.fdr + eps)
logp_r2 = -np.log10(res_r2.fdr + eps)

# all three colourmaps share one upper limit
vmax = max(s.max() for s in (logp_syn, logp_r1, logp_r2))
cmap_both_func = common.continuous_cmap(0, vmax, cmap='Greys')
cmap_syn_func = common.continuous_cmap(0, vmax, cmap='Blues')
cmap_ref_func = common.continuous_cmap(0, vmax, cmap='Reds')

vs, _ = setops.venn_from_arrays(
    res_syn.index, res_r1.index, res_r2.index
)

node_colours = {}

# syngeneic plus at least one reference: grey, shaded by the syngeneic value
for venn_key in ('111', '101', '110'):
    for pth in vs[venn_key]:
        node_colours[pth] = cmap_both_func(logp_syn[pth])

# syngeneic only
for pth in vs['100']:
    node_colours[pth] = cmap_syn_func(logp_syn[pth])

# both references but not syngeneic
for pth in vs['011']:
    # mean P for refs
    mean_ref = 0.5 * (logp_r1[pth] + logp_r2[pth])
    node_colours[pth] = cmap_ref_func(mean_ref)

# a single reference only
for pth in vs['010']:
    node_colours[pth] = cmap_ref_func(logp_r1[pth])

for pth in vs['001']:
    node_colours[pth] = cmap_ref_func(logp_r2[pth])
args=(the_data.loc[p], the_data), kwds=dict(method=corr_method)) # cor, pval = one_vs_many_correlation(the_data.loc[p], the_data, method=corr_method) # these_probes = cor.index[(cor.abs() > cross_corr_threshold) & (pval < alpha)] # myc_corr_probes.append(these_probes) pool.close() pool.join() for p in myc_probes: cor, pval = jobs[p].get(1e4) these_probes = cor.index[(cor.abs() > cross_corr_threshold) & (pval < alpha)] myc_corr_probes.append(these_probes) # out of interest, what is the overlap between these? (presumably quite high?) vs, vc = setops.venn_from_arrays(*myc_corr_probes) # union of probes keep_probes = setops.reduce_union(*myc_corr_probes) print "After comparing all data against each MYC probe, we are left with %d correlated probes" % len( keep_probes) genes_corr_with_myc = the_symbols.loc[keep_probes].dropna() print "These correspond to %d unique genes." % len( genes_corr_with_myc.unique()) # check the overlap with validated genes overlap = pd.Index(validated_genes).intersection( genes_corr_with_myc.unique()) if len(overlap) == len(validated_genes):
fdr_dat = fdr_dat[fdr_dat < alpha] fdr_dat.columns = res.keys() # dict is ordered, so this is OK all_in = ~fdr_dat.isnull() all_in.columns = res.keys() # dict is ordered, so this is OK log_fdr_dat = np.log10(fdr_dat + 1e-6) * -1 # number syngen. only, ref. only and intersection n_set = pd.DataFrame(0, index=pathways_sign, columns=['Syngen. only', 'Ref. only', 'Intersect.'], dtype=int) so = {} ro = {} inters = {} for pid in pids: s = all_in.index[all_in.loc[:, "%s_syngeneic" % pid]] r = all_in.index[all_in.loc[:, ["%s_%s" % (pid, t) for t in comparison_names.values()[1:]]].any(axis=1)] vs, _ = setops.venn_from_arrays(s, r) n_set.loc[vs['10'], 'Syngen. only'] += 1 n_set.loc[vs['01'], 'Ref. only'] += 1 n_set.loc[vs['11'], 'Intersect.'] += 1 n_set = n_set.fillna(0) from ipa_results_s1_s2 import pathway_involvement_heatmap_by_p comparison_dict = { 'syngeneic': 'Syngen.', 'h9': 'H9', 'gibco': 'Gibco' } # plot 1) P values, ordered by sum of -log(P) p_order = log_fdr_dat.sum(axis=1).sort_values(ascending=False).index plot_dict = pathway_involvement_heatmap_by_p(
dat_classified = {
    pid: run_one_sort(var_dat[pid], 'GIC', 'iNSC') for pid in pids
}

# search through GIC only and GIC hom/iNSC het SNPs and 'other' and generate upset
members = {}
for pid, d in dat_classified.items():
    members[pid] = set()
    for typ in ['GIC only', 'GIC hom iNSC het', 'other']:
        for x in d[typ]:
            # dictionary entries are represented by their 'GIC' value, anything else by itself
            label = x['GIC'] if isinstance(x, dict) else x
            members[pid].add(str(label))

member_sets = [members[pid] for pid in pids]
vs, vc = setops.venn_from_arrays(*member_sets)

venn_sets_by_group = setops.full_partial_unique_other_sets_from_groups(
    pids, groups)
full_sets = venn_sets_by_group['full']
partial_sets = venn_sets_by_group['partial']

# counts for the first 'full' set of each group
hypo_count_full = vc[full_sets['Hypo'][0]]
hyper_count_full = vc[full_sets['Hyper'][0]]

# (members, count) for every 'partial' set of each group
hypo_counts_partial = []
for t in partial_sets['Hypo']:
    hypo_counts_partial.append((setops.key_to_members(t, pids), vc[t]))
hyper_counts_partial = []
for t in partial_sets['Hyper']:
    hyper_counts_partial.append((setops.key_to_members(t, pids), vc[t]))

# is this significant in any way?
# focus on 3/4 of hypo OR 5/6 of hyper
it = itertools.combinations(pids, 4)
# now we need to compare the paired results with every other result (Gibco and other iNSC)
# one table per category; each cell holds the collection of genes for (GBM sample, comparator)
pair_only = pd.DataFrame(index=pids, columns=cols)
ref_only = pd.DataFrame(index=pids, columns=cols)
pair_and_ref_concordant = pd.DataFrame(index=pids, columns=cols)
pair_and_ref_discordant = pd.DataFrame(index=pids, columns=cols)

# loop over GBM samples
for pid in pids:
    # syngeneic comparison
    the_pair = de_res[(pid, pid)]

    # loop over (i)NSC samples
    # when this is the same as the syngeneic comparison, there will (obviously) be no 'pair only' or 'ref only'
    # genes!
    for pid2 in cols:
        the_ref = de_res[(pid, pid2)]
        the_sets, _ = setops.venn_from_arrays(the_pair.index, the_ref.index)

        pair_only.loc[pid, pid2] = the_sets['10']
        ref_only.loc[pid, pid2] = the_sets['01']

        # for overlapping genes: separate based on direction (matching or non matching)
        shared = the_sets['11']
        pair_shared = the_pair.loc[shared]
        the_conc_idx = (pair_shared.Direction == the_ref.loc[shared].Direction)
        pair_and_ref_concordant.loc[pid, pid2] = pair_shared.loc[the_conc_idx].index
        pair_and_ref_discordant.loc[pid, pid2] = pair_shared.loc[~the_conc_idx].index

# can get counts like this
po_counts = pair_only.applymap(len)
ro_counts = ref_only.applymap(len)

## genes that are pair-only in every possible ref comparison
# secondary axis: grey ticks and label, no grid
ax2.yaxis.set_ticks(new_ticks)
ax2.yaxis.set_ticklabels(new_ticklabels, rotation=90, color='gray')
ax2.set_ylabel("Number of patients sharing pathway", color='gray')
ax2.grid(False)

ax.figure.tight_layout()
for name_fmt in ("hgic_de_ipa_top%d.png", "hgic_de_ipa_top%d.tiff"):
    ax.figure.savefig(os.path.join(outdir, name_fmt % top_n), dpi=200)

# export a wideform dataframe containing all these pathways with log_p, etc.
for_export = {pid: res[pid].loc[p_top[pid]] for pid in pids}

top_by_member = [p_top[pid] for pid in pids]
vs, vc = setops.venn_from_arrays(*top_by_member)
out = setops.venn_set_to_wide_dataframe(
    for_export,
    vs,
    pids,
    full_data=res,
    cols_to_include=['-log_p', 'ratio', 'z'],
    static_cols_to_include=['genes']
)
# excel.pandas_to_excel(out, os.path.join(outdir, "ipa_de_top_%d_pathways.xlsx" % top_n))
xlsx_path = os.path.join(outdir, "ipa_de_top_%d_pathways.xlsx" % top_n)
out.to_excel(xlsx_path)

""" Note to myself: I did consider an UpSet plot here. However, with the full DE lists, the result isn't very edifying... With the exception of patient-specific pathways, all sets have 2 or fewer pathways. """
def get_de_dmr_groups(
        joint_de_dmr,
        clusters,
        groups,
        pids=consts.PIDS,
        relation_filter=None
):
    """
    Get group-specific DE/DMRs. These are defined as DEs that are consistent with the DMRs in a given selection of
    patients (from one to many) that are NOT shared across groups.
    :param joint_de_dmr: Dictionary keyed by PID. Values are DataFrames whose index entries t are used as
    (t[0]: DMR cluster ID, t[1]: gene) and which have the columns 'dmr_median_delta', 'de_logFC' and 'de_FDR'.
    :param clusters: Lookup by DMR cluster ID. Each entry has an attribute `genes`, an iterable of
    (gene, relation) tuples.
    :param groups: Dictionary, keyed by group name. Values are iterables giving patient IDs in each group.
    :param pids: List of PIDs, in the order used for the venn set keys.
    :param relation_filter: If supplied, a relation or an iterable of relations. Only DE/DMR pairs whose cluster
    lists the gene with one of these relations are retained.
    :return: Dictionary with the entries 'dmr_median_delta_m', 'de_logFC', 'de_FDR' and 'de_dmr_groups'. Each is
    a dictionary keyed by group name. The first three hold DataFrames with one column per PID plus a
    'consistent' column (True if all available values in the row have the same sign; the DE FDR table reuses
    the flag computed from the logFC). The last holds the sorted list of DE/DMR pairs for the group.
    """
    venn_sets_by_group = setops.full_partial_unique_other_sets_from_groups(pids, groups)

    if relation_filter is not None:
        # Wrap a single relation in a list. Strings must be tested explicitly: under Python 3 they have an
        # __iter__ attribute and would otherwise be iterated character by character.
        try:
            string_types = basestring  # Python 2
        except NameError:
            string_types = str
        if isinstance(relation_filter, string_types) or not hasattr(relation_filter, '__iter__'):
            relation_filter = [relation_filter]
        else:
            # materialise, since the filter is iterated many times
            relation_filter = list(relation_filter)

    def has_requested_relation(t):
        # True if the cluster t[0] lists the gene t[1] with any of the requested relations
        gene_rel_options = [(t[1], rel) for rel in relation_filter]
        return len(set(clusters[t[0]].genes).intersection(gene_rel_options)) > 0

    def sign_consistency(row):
        # True/False if the available per-patient values do/don't share a sign; None if there are no values
        signs = np.sign(row.drop('consistent').dropna().astype(float))
        if len(signs) == 0:
            return None
        return (signs == signs.iloc[0]).all()

    de_dmr_groups = {}
    de_dmr_de_logfc = {}
    de_dmr_de_fdr = {}
    de_dmr_dmr_delta = {}

    if relation_filter is None:
        de_dmr_by_member = [joint_de_dmr[pid].index for pid in pids]
    else:
        de_dmr_by_member = [
            [t for t in joint_de_dmr[pid].index if has_requested_relation(t)]
            for pid in pids
        ]

    venn_set, venn_count = setops.venn_from_arrays(*de_dmr_by_member)

    for grp in groups:
        this_sets = venn_sets_by_group['full'][grp] + venn_sets_by_group['partial'][grp]
        this_de_dmrs = sorted(setops.reduce_union(*[venn_set[k] for k in this_sets]))

        if relation_filter is not None:
            # look for any intersection here
            this_de_dmrs = [t for t in this_de_dmrs if has_requested_relation(t)]

        de_dmr_groups[grp] = this_de_dmrs

        # get separate lists of DE genes and DMR IDs
        # DMRs is straightforward
        de_dmr_dmr_delta[grp] = pd.DataFrame(
            index=sorted(set([t[0] for t in this_de_dmrs])),
            columns=pids + ['consistent'],
        )
        # DEs is trickier: some genes have mapped twice because I was so diligent in curating the original lists!
        this_de_genes = sorted(set([t[1] for t in this_de_dmrs]))
        this_de_ens = annotation_gene_to_ensembl.gene_to_ens(this_de_genes)
        this_de_ens = this_de_ens[~this_de_ens.duplicated()]
        this_de_genes = this_de_ens.index
        de_dmr_de_logfc[grp] = pd.DataFrame(
            index=this_de_genes.tolist(),
            columns=pids + ['consistent'],
        )
        de_dmr_de_fdr[grp] = pd.DataFrame(
            index=this_de_genes.tolist(),
            columns=pids + ['consistent'],
        )

        # fill them in
        for k in this_sets:
            this_vs = [t for t in venn_set[k] if t[1] in this_de_genes]
            # the patients participating in this venn set are flagged by '1' in the key
            this_pids = [pids[i] for i, t in enumerate(k) if t == '1']
            for pid in this_pids:
                de_dmr_dmr_delta[grp].loc[[t[0] for t in this_vs], pid] = joint_de_dmr[pid].loc[
                    this_vs, 'dmr_median_delta'].values
                de_dmr_de_logfc[grp].loc[[t[1] for t in this_vs], pid] = joint_de_dmr[pid].loc[
                    this_vs, 'de_logFC'].values
                de_dmr_de_fdr[grp].loc[[t[1] for t in this_vs], pid] = joint_de_dmr[pid].loc[
                    this_vs, 'de_FDR'].values

        # Write the flags through .loc: the rows yielded by iterrows() may be copies, in which case assigning
        # to them would not change the table. Rows without any values are left unset.
        for k, row in de_dmr_dmr_delta[grp].iterrows():
            flag = sign_consistency(row)
            if flag is not None:
                de_dmr_dmr_delta[grp].loc[k, 'consistent'] = flag

        for k, row in de_dmr_de_logfc[grp].iterrows():
            flag = sign_consistency(row)
            if flag is not None:
                de_dmr_de_logfc[grp].loc[k, 'consistent'] = flag
                de_dmr_de_fdr[grp].loc[k, 'consistent'] = flag

    return {
        'dmr_median_delta_m': de_dmr_dmr_delta,
        'de_logFC': de_dmr_de_logfc,
        'de_FDR': de_dmr_de_fdr,
        'de_dmr_groups': de_dmr_groups
    }
the_contrast = "GBM - NSC" # de_gibco[pid] = differential_expression.edger_exacttest( # the_data, # the_groups, # pair=['NSC', 'GBM'], # lfc=lfc, # fdr=fdr # ) # de_gibco[pid] = differential_expression.edger_glmqlfit(the_data, the_groups, the_contrast) de_gibco[pid] = differential_expression.edger_glmfit( the_data, the_groups, the_contrast) # Separate into sets # all de[pid], de_counts[pid] = setops.venn_from_arrays( de_matched[pid].index, de_gibco[pid].index) # up only idx_up_match = de_matched[pid].loc[de_matched[pid].logFC > 0].index idx_up_ref = de_gibco[pid].loc[de_gibco[pid].logFC > 0].index de_up[pid], de_counts_up[pid] = setops.venn_from_arrays( idx_up_match, idx_up_ref) # down only idx_down_match = de_matched[pid].loc[de_matched[pid].logFC < 0].index idx_down_ref = de_gibco[pid].loc[de_gibco[pid].logFC < 0].index de_down[pid], de_counts_down[pid] = setops.venn_from_arrays( idx_down_match, idx_down_ref) # write to files, one worksheet per list (5 per individual) # paired comparison (all)
# compute DE between hGIC and paired iNSC de_res = {} de_res_full = {} for pid in pids: hgic_samples = rnaseq_obj.meta.index[ rnaseq_obj.meta.index.str.contains(pid)] the_data = rnaseq_obj.data.loc[:, hgic_samples] the_groups = rnaseq_obj.meta.loc[hgic_samples, 'type'] the_comparison = ['GBM', 'iNSC'] de_res[pid] = differential_expression.run_one_de( the_data, the_groups, the_comparison, **de_params) # de_res_full[pid] = differential_expression.run_one_de(the_data, the_groups, the_comparison, return_full=True, **de_params) print "GBM %s paired comparison, %d DE genes" % (pid, de_res[pid].shape[0]) venn_set, venn_ct = setops.venn_from_arrays( *[de_res[pid].index for pid in pids]) # add null set manually de_genes_all = reduce(lambda x, y: set(x).union(y), venn_set.values()) k_null = ''.join(['0'] * len(pids)) venn_set[k_null] = list( de_res_full[pids[0]].index.difference(de_genes_all)) venn_ct[k_null] = len(venn_set[k_null]) # check direction is the same venn_set_consistent = {} venn_set_inconsistent = {} for k in venn_set: the_genes = venn_set[k] the_pids = [pids[i] for i, t in enumerate(k) if t == '1'] the_de_direction = pd.DataFrame(
def one_random_perm(set_sizes, N):
    """
    Draw one random set for each requested size from the integers 0 .. N - 1 and return the resulting venn
    region sizes.
    :param set_sizes: Dictionary of set sizes. Only the values are used, in iteration order.
    :param N: Size of the universe the members are drawn from.
    :return: The venn counts (second output of setops.venn_from_arrays) of the random sets.
    :raises ValueError: (from numpy) if any requested size is greater than N.
    """
    # Sample WITHOUT replacement: each draw must contain distinct members, otherwise duplicates make the
    # random sets smaller than the sizes they are supposed to reproduce.
    rand_sets = [np.random.choice(N, size, replace=False) for size in set_sizes.values()]
    _, vc = setops.venn_from_arrays(*rand_sets)
    return vc
# DE results
the_hash = tsgd.de_results_hash(meta_s1.index.tolist(), de_params)
filename = 'de_results_paired_comparison.%d.pkl' % the_hash
fn = os.path.join(DE_LOAD_DIR, filename)

# the results must have been computed already: fail early if the cache file is missing
if not os.path.isfile(fn):
    raise AttributeError(
        "Unable to find pre-computed S1 comparison results.")

logger.info("Reading S1 DE results from %s", fn)
with open(fn, 'rb') as f:
    de_res_full_s1 = pickle.load(f)

# keep only the rows below the FDR cutoff
de_res_s1 = {}
for k, v in de_res_full_s1.items():
    de_res_s1[k] = v.loc[v.FDR < de_params['fdr']]

de_by_member = [de_res_s1[pid].index for pid in pids]
vs, vc = setops.venn_from_arrays(*de_by_member)
de_res_wide = setops.venn_set_to_wide_dataframe(
    de_res_s1,
    vs,
    pids,
    cols_to_include=['logFC', 'FDR']
)

ipa_de_res = collections.OrderedDict()
for pid in pids:
    fn = os.path.join(de_indir, "full_de_patient{pid}.xls".format(pid=pid))
    this_df = pd.read_excel(fn, skiprows=1, header=0, index_col=0)
    this_df.columns = ['-logp', 'ratio', 'z', 'genes']
    # number of comma-separated entries in the 'genes' column, inserted before it
    n_gene = this_df.genes.str.split(',').apply(len)
    this_df.insert(3, 'n_gene', n_gene)
    # filter to include only relevant pathways
    ipa_de_res[pid] = this_df.loc[this_df['-logp'] >= plogalpha]

# for plotting
groups = [
    (pid, dat_s1.columns[dat_s1.columns.str.contains(pid)]) for pid in pids
]
def upset_set_size_plot(
        data,
        set_labels,
        set_colours=None,
        order_by_n_members=False,
        include_singletons=False,
        min_size=1,
        n_plot=None,
        bar_width=0.9,
        point_ms=10,
        default_colour='#4C72B0',
        venn_set=None,
        **kwargs
):
    """
    Produce a summary plot showing the set sizes when the number of sets is > 4.
    Inspired / totally copying UpsetR: https://cran.r-project.org/web/packages/UpSetR/vignettes/basic.usage.html
    The figure has three visible panels: the main bar chart of set sizes (top right), the grid of markers showing
    which members participate in each plotted set (bottom right) and a stacked horizontal bar chart of the total
    size of each member (bottom left).
    :param data: Array of iterables containing the full data set of each member.
    :param set_labels: Array of strings giving the name of each member, in the same order as data.
    :param set_colours: Dict/list of tuples/OrderedDict giving the name and shading of one or more sets. E.g.
    [
        (group_A, {'sets': ['010', '011'], 'colour': 'red'}),
        (group_B, {'sets': ['110', '001'], 'colour': 'blue'}),
    ]
    The name is used for the legend. It can be 'None' to skip any entry for this group.
    Use the ordered options if order matters in the lower left stacked plot. Use a list to have multiple entries
    with the same group name.
    If supplied, these will be used for shading all three plots. Any set not mentioned is shaded with
    default_colour.
    If not, we just shade the singleton sets in the lower left set size plot.
    :param order_by_n_members: If True, order the plot by the number of members participating in each set. This has
    the effect of generating a bar chart that has multiple bunches of descending bars.
    :param include_singletons: If True, singleton sets are included in the main bar. Not really necessary as they
    are also plotted in the lower left bar.
    :param min_size: Sets with fewer than this many members are excluded from the main bar chart; sets with exactly
    min_size members are retained. Can be disabled (set to None), but this is pointless since it involves plotting
    empty sets, which cannot be ordered meaningfully.
    :param n_plot: If not None, this is used to limit the number of sets plotted.
    :param bar_width: Used for plotting bar charts.
    :param point_ms: Size of the circles in the lower right plot.
    :param default_colour: The colour used for anything that isn't otherwise shaded.
    :param venn_set: Optional pre-computed dictionary of sets, keyed by binary string (e.g. '0110'). If supplied,
    the count of each set is taken as the length of its entry, setops.venn_from_arrays is not called and kwargs
    are not used.
    :param kwargs: Passed to setops.venn_from_arrays when venn_set is not supplied.
    :return: Dictionary with entries 'gs' (the GridSpec), 'axes' (dictionary of the four axes, keyed 'set_size',
    'intersection', 'main' and 'top_left') and 'figure'.
    :raises AttributeError: If data and set_labels differ in length.
    :raises ValueError: If the same set appears in more than one group of set_colours.
    """
    n_set = len(set_labels)
    if len(data) != n_set:
        raise AttributeError(
            "Number of supplied data sets (%d) doesn't match the length of set_labels (%d)."
            % (len(data), n_set))

    if venn_set is None:
        _, venn_ct = setops.venn_from_arrays(*data, **kwargs)
    else:
        venn_ct = dict((k, len(v)) for k, v in venn_set.items())

    if set_colours is None:
        str_fmt = "{0:0%db}" % n_set
        # NB the string must be reversed here
        singleton_sets = set(
            str_fmt.format(2 ** i)[::-1] for i in range(n_set))
        other_sets = set(k for k in venn_ct if k not in singleton_sets)
        set_colours = [
            ('Non-unique', {'sets': other_sets, 'colour': default_colour}),
            ('Unique', {'sets': singleton_sets, 'colour': '#ff8484'}),
        ]
    else:
        # we need a real list (not a dictionary view) because it may be concatenated with a list below
        try:
            set_colours = list(set_colours.items())
        except AttributeError:
            set_colours = list(set_colours)

        sets_seen = set()
        for nm, d in set_colours:
            this_sets = d['sets']
            if len(sets_seen.intersection(this_sets)):
                raise ValueError(
                    "Group %s contains one or more sets already contained elsewhere in set_colours"
                    % nm)
            sets_seen.update(this_sets)

        # anything the caller didn't mention goes into an unlabelled group with the default shading
        sets_remaining = set(venn_ct.keys()).difference(sets_seen)
        if len(sets_remaining) > 0:
            set_colours = [(None, {
                'sets': sets_remaining,
                'colour': default_colour
            })] + set_colours

    # lookup of the colour matching a given set; the first group listing a set takes precedence
    colour_by_set = {}
    for _, d in set_colours:
        for k in d['sets']:
            colour_by_set.setdefault(k, d['colour'])

    lightgrey = '#cecece'

    if include_singletons:
        sort_input = dict(venn_ct)
    else:
        # exclude any results with only one set
        sort_input = dict(
            (k, v) for k, v in venn_ct.items() if k.count('1') > 1)

    if min_size is not None:
        # only sets strictly BELOW the minimum size are dropped
        sort_input = dict(
            (k, v) for k, v in sort_input.items() if v >= min_size)

    if order_by_n_members:
        ordered_counts = []
        for i in range(1, n_set + 1):
            # make a collection of results with this many members
            # if an entry is absent, it has already been filtered out
            this_collection = [
                (k, sort_input[k])
                for k in setops.binary_combinations_sum_eq(n_set, i)
                if k in sort_input
            ]
            # sort in descending order and append to list
            ordered_counts.extend(
                sorted(this_collection, key=lambda x: x[1], reverse=True))
    else:
        ordered_counts = sorted(
            sort_input.items(), key=lambda x: x[1], reverse=True)

    if n_plot:
        ordered_counts = ordered_counts[:n_plot]

    gs_kw = dict(
        left=0.05,
        right=0.99,
        top=0.99,
        bottom=0.1,
        wspace=0.1,
        hspace=0.01,
        height_ratios=[6, 3],
        width_ratios=[3, 6],
    )

    # set up axis grid
    gs = gridspec.GridSpec(nrows=2, ncols=2, **gs_kw)
    fig = plt.figure(figsize=(9, 6))
    ax_tl = fig.add_subplot(gs[0, 0])
    ax_set_size = fig.add_subplot(gs[1, 0])
    ax_intersect = fig.add_subplot(gs[1, 1], sharey=ax_set_size)
    ax_main = fig.add_subplot(gs[0, 1], sharex=ax_intersect)

    # hide some things
    ax_tl.set_visible(False)
    plt.setp(ax_intersect.get_yticklabels(), visible=False)
    plt.setp(ax_main.get_xticklabels(), visible=False)
    plt.setp(ax_intersect.get_xticklabels(), visible=False)

    # data
    x_arr = np.arange(len(ordered_counts)) + 0.5
    y_arr = np.arange(n_set)

    # main bar chart
    colours = [colour_by_set.get(k, default_colour) for k, _ in ordered_counts]
    ax_main.bar(
        x_arr,
        [v for _, v in ordered_counts],
        width=bar_width,
        color=colours
    )
    ax_main.set_ylabel('Number of DE genes in set')

    # bottom right set intersections
    # grey markers everywhere
    for y in y_arr:
        ax_intersect.plot(
            x_arr,
            np.ones_like(x_arr) * y,
            marker='o',
            mfc=lightgrey,
            mec='none',
            ms=point_ms,
            ls='none'
        )

    # overplot shaded markers on sets that are included
    for i, (k, v) in enumerate(ordered_counts):
        x = x_arr[i]
        y = [j for j, u in enumerate(k) if u == '1']
        c = colour_by_set.get(k, default_colour)
        ax_intersect.plot(
            x * np.ones(len(y)),
            y,
            marker='o',
            mfc=c,
            mec=c,
            ms=point_ms,
            ls='none'
        )

    # bottom left : set size and singleton (unique) set size
    # one stacked segment per colour group; `left` tracks where the next segment starts for each member
    left = np.zeros(n_set)
    for nm, d in set_colours:
        this_ss = np.zeros(n_set)
        for k in d['sets']:
            # a set listed in a colour group but absent from the data contributes nothing
            n_in_set = venn_ct.get(k, 0)
            for i in range(n_set):
                if k[i] == '1':
                    this_ss[i] += n_in_set
        ax_set_size.barh(
            y_arr + 0.5,
            this_ss,
            height=-bar_width,
            left=left,
            align='edge',
            label=nm,
            color=d['colour']
        )
        left += this_ss

    ax_set_size.invert_xaxis()
    ax_set_size.set_ylim([-.5, len(set_labels) - .5])
    ax_set_size.yaxis.tick_right()
    ax_set_size.set_yticks(y_arr)
    ax_set_size.set_yticklabels(set_labels)
    ax_set_size.set_xlabel("Number of DE genes in single comparison")
    ax_set_size.legend(
        loc='lower left',
        frameon=False,
        facecolor='w',
        bbox_to_anchor=(0.05, 1.1),  # place above and outside the axis
    )

    return {
        'gs': gs,
        'axes': {
            'set_size': ax_set_size,
            'intersection': ax_intersect,
            'main': ax_main,
            'top_left': ax_tl
        },
        'figure': fig
    }
# NOTE(review): this first assignment uses pid / pid2 and presumably belongs to a loop whose header is outside this
# excerpt - confirm its indentation against the full file.
dmr_sign.loc[pid, pid2] = sorted(dmr_res[pid][pid2].clusters_significant.keys())
dmr_counts = dmr_sign.applymap(len)

# Four result tables with identical layout: one row per patient, one column per comparator (patients + GIBCO).
# Each cell will hold a list of cluster IDs:
# pair_only: significant in the paired comparison only
# ref_only: significant in the other comparison only
# pair_and_ref_concordant / pair_and_ref_discordant: significant in both, split by whether the sign of the
# median change agrees
comparison_cols = pids + ['GIBCO']
pair_only = pd.DataFrame(index=pids, columns=comparison_cols)
ref_only = pd.DataFrame(index=pids, columns=comparison_cols)
pair_and_ref_concordant = pd.DataFrame(index=pids, columns=comparison_cols)
pair_and_ref_discordant = pd.DataFrame(index=pids, columns=comparison_cols)

for pid in pids:
    for pid2 in comparison_cols:
        # cluster IDs: paired comparison (p) vs the comparison against pid2 (r)
        p = dmr_sign.loc[pid, pid]
        r = dmr_sign.loc[pid, pid2]
        pres = dmr_res[pid][pid].results_significant
        rres = dmr_res[pid][pid2].results_significant

        x, _ = setops.venn_from_arrays(p, r)
        pair_only.loc[pid, pid2] = x['10']
        ref_only.loc[pid, pid2] = x['01']

        # IDs found in both comparisons
        pr_id = x['11']

        # direction of the median change for each shared ID, in each of the two comparisons
        pmed_change_sign = np.sign([pres[t]['median_change'] for t in pr_id])
        rmed_change_sign = np.sign([rres[t]['median_change'] for t in pr_id])

        shared_ids = np.array(pr_id)
        pair_and_ref_concordant.loc[pid, pid2] = list(
            shared_ids[pmed_change_sign == rmed_change_sign]
        )
        pair_and_ref_discordant.loc[pid, pid2] = list(
            shared_ids[pmed_change_sign != rmed_change_sign]
        )
de_res = differential_expression.compute_cross_de(rnaseq_obj, pids, external_references=external_refs, **de_params) # add the combined DE results for the refs combined for pid in pids: # complete intersection the_idx = sorted(reduce(intersecter, [de_res[(pid, t)].index for t in external_ref_labels])) one_cols = de_res[(pid, pid)].columns tups = reduce(lambda x, y: x + y, [zip([t] * one_cols.size, one_cols.tolist()) for t in external_ref_labels]) the_cols = pd.MultiIndex.from_tuples(tups, names=['ref', 'field']) the_block = pd.DataFrame(index=the_idx, columns=the_cols) for t in external_ref_labels: the_block.loc[the_idx, t] = de_res[(pid, t)].loc[the_idx].values de_res[(pid, 'ref_intersect')] = the_block # intersect 2 this_venn, _ = setops.venn_from_arrays(*[de_res[(pid, t)].index for t in external_ref_labels]) the_idx = reduce(unioner, [this_venn[k] for k in setops.binary_combinations_sum_gte(len(external_refs), 2)]) the_block = pd.DataFrame(index=the_idx, columns=the_cols) for t in external_ref_labels: try: the_block.loc[the_idx, t] = de_res[(pid, t)].loc[the_idx].values except KeyError: # no matches for this ref - no problem pass de_res[(pid, 'ref_intersect2')] = the_block # union the_idx = sorted(reduce(unioner, [de_res[(pid, t)].index for t in external_ref_labels])) the_block = pd.DataFrame(index=the_idx, columns=the_cols) for t in external_ref_labels: try:
def upset_plot_with_groups(
        data,
        set_labels,
        subgroup_ind,
        subgroup_colours,
        venn_set=None,
        other_lbl='Expanded core',
        specific_lbl='Specific',
        default_colour='gray',
        **kwargs
):
    """
    Wrapper around the basic upset plotting function. This allows us to highlight sets that fully or partially
    overlap with a pre-defined subgroup.
    Each set (keyed by a binary string such as '0110') is classified as follows:
    - exactly one member: 'specific'
    - two or more members, all inside one subgroup, covering the whole subgroup: '<subgroup> full'
    - two or more members, all inside one subgroup, not covering it: '<subgroup> partial'
    The sets shaded with `other_lbl` are determined by expanded_core_sets.
    :param data: Passed to upset_set_size_plot. Iterable of identifiers used to process venn sets.
    :param set_labels: Iterable of set labels.
    :param subgroup_ind: Dictionary, keys are the subgroup names, entries are Boolean indexes showing which of
    set_labels are in this subgroup. If ordering is desired, use an OrderedDict.
    :param subgroup_colours: Dict giving the colour for each of the subsets defined in subgroup ind. For each set S,
    two entries are needed, keyed `S full` and `S partial`. Any that are missing are shaded with default_colour.
    We can also define two additional colours, which otherwise have default values: `Expanded core` (or whatever
    `other_lbl` is set to) and `Specific` (or whatever `specific_lbl` is set to).
    :param venn_set: Output of setops.venn_from_arrays(data). Can supply it to skip recomputing.
    :param other_lbl: Label used to identify those sets that span multiple subgroups.
    :param specific_lbl: Label used to identify those sets that are specific to a single member.
    :param default_colour: Fallback colour for subgroup entries missing from subgroup_colours; also passed on to
    upset_set_size_plot.
    :param kwargs: Passed to upset_set_size_plot
    :return: Same output as upset plot function.
    """
    # UpsetR attribute plots
    default_colour_other = '#4C72B0'
    default_colour_specific = '#f4e842'

    if venn_set is None:
        venn_set, _ = setops.venn_from_arrays(*data)

    # Coerce each group indexer to a boolean array once, so that plain lists are also supported, and precompute
    # the group sizes
    group_masks = []
    for grp, grp_idx in subgroup_ind.items():
        mask = np.asarray(grp_idx, dtype=bool)
        group_masks.append((grp, mask, mask.sum()))

    # set colours for UpsetR plot
    sets_full = {}
    sets_partial = {}
    sets_unique = []

    ## TODO: merge this with setops.full_partial_unique_other_sets_from_groups
    for k in venn_set:
        # Membership mask for this set. Compare with '1' explicitly: casting the characters to bool is not
        # reliable, since any non-empty string (including '0') is truthy under NumPy >= 2.0.
        this_k = np.array([t == '1' for t in k], dtype=bool)
        n_in_set = this_k.sum()
        if n_in_set == 1:
            sets_unique.append(k)
        elif n_in_set > 1:
            for grp, mask, n_member in group_masks:
                if this_k[~mask].any():
                    # at least one member outside of this group
                    continue
                if this_k[mask].sum() == n_member:
                    sets_full.setdefault(grp, []).append(k)
                else:
                    sets_partial.setdefault(grp, []).append(k)

    # legend entries follow the order of subgroup_ind; groups without any matching set are left out
    set_colours = []
    for grp_name in subgroup_ind:
        k_full = "%s full" % grp_name
        if grp_name in sets_full:
            set_colours.append((k_full, {
                'sets': sets_full[grp_name],
                'colour': subgroup_colours.get(k_full, default_colour)
            }))

        k_part = "%s partial" % grp_name
        if grp_name in sets_partial:
            set_colours.append((k_part, {
                'sets': sets_partial[grp_name],
                'colour': subgroup_colours.get(k_part, default_colour)
            }))

    set_colours.append((other_lbl, {
        'sets': expanded_core_sets(venn_set, subgroup_ind),
        'colour': subgroup_colours.get(other_lbl, default_colour_other)
    }))

    set_colours.append((specific_lbl, {
        'sets': sets_unique,
        'colour': subgroup_colours.get(specific_lbl, default_colour_specific)
    }))

    return upset_set_size_plot(
        data,
        set_labels,
        set_colours=set_colours,
        default_colour=default_colour,
        venn_set=venn_set,
        **kwargs
    )