Example #1
def generate_summary_df(all_in, comps, pids=consts.PIDS):
    """
    Given the full results dictionary, generate a single DataFrame summary of the number of comparisons and IDs
    exhibiting each pathway.
    :param all_in: DataFrame, indexes are pathways and columns are comparison names. Entries are boolean, where a T
    indicates that the pathway is significant in that comparison
    :param comps: List of comparisons, not including syngeneic
    :param pids: List of PIDs.
    :return: Tuple of (n_set, at_a_glance) DataFrames: per-pathway counts, and per-pathway patient membership.
    """
    pathways = all_in.index

    n_set = pd.DataFrame(
        0,
        index=pathways,
        columns=['Syngen. only', 'Ref. only', 'Intersect.'],
        dtype=int
    )
    so = dict([(pw, []) for pw in pathways])
    ro = dict([(pw, []) for pw in pathways])
    inters = dict([(pw, []) for pw in pathways])
    for pid in pids:
        s = all_in.index[all_in.loc[:, "%s_syngeneic" % pid]]
        r = all_in.index[all_in.loc[:, ["%s_%s" % (pid, t) for t in comps]].any(axis=1)]
        vs, _ = setops.venn_from_arrays(s, r)

        n_set.loc[vs['10'], 'Syngen. only'] += 1
        n_set.loc[vs['01'], 'Ref. only'] += 1
        n_set.loc[vs['11'], 'Intersect.'] += 1

        for pw in vs['10']:
            so[pw].append(pid)
        for pw in vs['01']:
            ro[pw].append(pid)
        for pw in vs['11']:
            inters[pw].append(pid)

    # output excel file giving at-a-glance access to which patients are involved in each pathway, categorised as
    # 'syn only', 'ref only' and 'intersection'
    at_a_glance = pd.DataFrame(
        index=pathways,
        columns=['n_syngen_only', 'syngen_only_pids', 'n_ref_only', 'ref_only_pids', 'n_intersect', 'intersect_pids'],
        dtype=object
    )
    for pw in pathways:
        at_a_glance.loc[pw, 'n_syngen_only'] = len(so[pw])
        at_a_glance.loc[pw, 'syngen_only_pids'] = ';'.join(so[pw])
        at_a_glance.loc[pw, 'n_ref_only'] = len(ro[pw])
        at_a_glance.loc[pw, 'ref_only_pids'] = ';'.join(ro[pw])
        at_a_glance.loc[pw, 'n_intersect'] = len(inters[pw])
        at_a_glance.loc[pw, 'intersect_pids'] = ';'.join(inters[pw])

    return n_set, at_a_glance
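Every example on this page keys Venn partitions by binary strings such as '10', '01' and '11'. The following is a minimal sketch of the decomposition that setops.venn_from_arrays appears to implement (a hypothetical re-implementation for illustration, not the project's actual helper): each key carries one 0/1 flag per input set, and the value lists the features found in exactly that combination of sets.

import itertools

def venn_from_arrays_sketch(*arrs):
    # hypothetical stand-in for setops.venn_from_arrays
    sets = [set(a) for a in arrs]
    universe = set().union(*sets)
    venn_sets = {}
    for flags in itertools.product('01', repeat=len(sets)):
        k = ''.join(flags)
        if k == '0' * len(sets):
            continue
        members = set(universe)
        for s, flag in zip(sets, flags):
            members &= s if flag == '1' else (universe - s)
        venn_sets[k] = sorted(members)
    venn_counts = dict((k, len(v)) for k, v in venn_sets.items())
    return venn_sets, venn_counts

vs, vc = venn_from_arrays_sketch(['a', 'b', 'c'], ['b', 'c', 'd'])
print(vs['10'])  # ['a']      - first set only ('Syngen. only' above)
print(vs['01'])  # ['d']      - second set only ('Ref. only')
print(vs['11'])  # ['b', 'c'] - both sets ('Intersect.')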
Example #2
def compute_cross_comparison_correction(res, samples, external_refs, set_type='pair_only'):
    """
    Compute the _correction_ list of features for the supplied results. These are the features that are
    EITHER present in every reference comparison but no cross-comparisons (set_type='ref_only')
    OR present in no reference comparison but all cross-comparisons (set_type='pair_only')
    :param res: Dictionary containing comparison results. Each comparison is keyed by the tuple (i, j), where i and j
    are the IDs of the two groups being compared. Values are iterables of unique feature identifiers (e.g. gene IDs,
    DMR cluster IDs).
    :param samples: The core sample list, excluding external references.
    :param external_refs: A list of external reference sample names.
    :param set_type: See description.
    :return: Dictionary containing the correction list ('specific_to_all_refs'), the per-reference lists
    ('specific_to_each_ref') and the intermediate venn sets ('venn_set', 'ref_diff_set').
    """

    members_rows = samples
    members_cols = members_rows + external_refs

    the_venn_set = pd.DataFrame(index=members_rows, columns=members_cols)
    for i in members_rows:
        p = res[(i, i)]
        for j in members_cols:
            r = res[(i, j)]
            x, _ = setops.venn_from_arrays(p, r)
            if set_type == 'pair_only':
                kset = '10'
            elif set_type == 'ref_only':
                kset = '01'
            else:
                raise AttributeError("set_type must be 'pair_only' or 'ref_only'.")
            the_venn_set.loc[i, j] = x[kset]

    # For each reference, get the features that are pair only in that reference and not in any of the iNSC
    vs_diff = pd.DataFrame(index=members_rows, columns=external_refs)
    for i in members_rows:
        for j in external_refs:
            the_ref = the_venn_set.loc[i, j]
            all_else = the_venn_set.loc[i, members_rows]
            union_all_else = setops.reduce_union(*all_else)
            vs_diff.loc[i, j] = sorted(set(the_ref).difference(union_all_else))

    # Intersection down the columns gives us a correction list for each reference
    vs_specific_to_ref = vs_diff.apply(lambda x: setops.reduce_intersection(*x))

    # Intersection across the references gives us a final list that need correcting
    vs_specific_to_all_refs = setops.reduce_intersection(*vs_specific_to_ref)

    return {
        'specific_to_each_ref': vs_specific_to_ref,
        'specific_to_all_refs': vs_specific_to_all_refs,
        'venn_set': the_venn_set,
        'ref_diff_set': vs_diff
    }
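A hypothetical usage sketch (sample names and feature IDs invented) showing the expected shape of res: one feature list per (row sample, column sample) comparison, with the syngeneic comparison on the diagonal.

samples = ['P1', 'P2']
external_refs = ['REF']
res = {
    ('P1', 'P1'): ['g1', 'g2'],
    ('P1', 'P2'): ['g2', 'g3'],
    ('P1', 'REF'): ['g1', 'g4'],
    ('P2', 'P1'): ['g2'],
    ('P2', 'P2'): ['g2', 'g5'],
    ('P2', 'REF'): ['g5', 'g6'],
}
out = compute_cross_comparison_correction(res, samples, external_refs, set_type='pair_only')
# out['specific_to_all_refs'] holds the features that look 'pair only' in every
# reference comparison but in none of the within-cohort comparisons.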
def set_permutation_test(data, n_iter=1000, parallel=True):
    K = len(data)
    N = len(setops.reduce_union(*data.values()))

    set_sizes = collections.OrderedDict([(k, len(v)) for k, v in data.items()])
    simulated_sizes = collections.defaultdict(list)

    if parallel:
        pool = mp.Pool()
        jobs = {}
        for i in range(n_iter):
            jobs[i] = pool.apply_async(one_random_perm, args=(set_sizes, N))

        pool.close()
        pool.join()
        for i, j in jobs.items():
            vc = j.get()
            for k, v in vc.items():
                simulated_sizes[k].append(v)
    else:
        for i in range(n_iter):
            vc = one_random_perm(set_sizes, N)
            for k, v in vc.items():
                simulated_sizes[k].append(v)

    _, vc_true = setops.venn_from_arrays(*data.values())

    # to calculate the P value, we EITHER need to specify a single sided test OR decide how to compute a two-sided P
    # Some interesting discussions on this topic:
    # https://stats.stackexchange.com/questions/140107/p-value-in-a-two-tail-test-with-asymmetric-null-distribution
    # https://stats.stackexchange.com/questions/360864/2-tailed-permutation-tests-for-obviously-non-symmetric-data
    # https://stats.stackexchange.com/questions/34052/two-sided-permutation-test-vs-two-one-sided
    # However, a 'Z' value is easier to compute
    z = {}
    p = {}
    for k in simulated_sizes.keys():
        obs = vc_true[k]
        t = stats.percentileofscore(simulated_sizes[k], obs)
        if t <= 50:
            p[k] = 2 * t / 100.
        else:
            p[k] = 2 * (1 - t / 100.)

        z[k] = t - 50.

    return {
        'simulated_set_sizes': simulated_sizes,
        'observed_set_sizes': vc_true,
        'p': p,
        'z': z
    }
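The two-sided p-value rule above doubles whichever tail the observation falls in. A small stand-alone illustration, using the same scipy.stats.percentileofscore call as the function (null values invented):

from scipy import stats

null = [4, 5, 5, 6, 7, 8, 9, 10, 11, 12]   # simulated set sizes
obs = 5                                     # observed set size
t = stats.percentileofscore(null, obs)      # percentile within the null
p = 2 * t / 100. if t <= 50 else 2 * (1 - t / 100.)
print(t, p)  # a low percentile gives a small left-tail p, doubled for two sides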
Example #4
def venn_diagram(*args, **kwargs):
    ax = kwargs.pop('ax', plt.gca())
    n = len(args)
    venn = None
    if n not in {2, 3, 4}:
        raise NotImplementedError(
            "At present, we only support 2, 3 and 4 way Venn diagrams")
    venn_sets, venn_counts = setops.venn_from_arrays(*args, **kwargs)
    if n == 2:
        venn = venn2(subsets=venn_counts, ax=ax, **kwargs)
    elif n == 3:
        venn = venn3(subsets=venn_counts, ax=ax, **kwargs)
    elif n == 4:
        venn = venn4(venn_counts, ax=ax, **kwargs)
    return venn, venn_sets, venn_counts
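A short usage sketch (labels invented). It assumes matplotlib_venn-style venn2/venn3 plus the project's venn4 helper are importable, and that setops.venn_from_arrays tolerates the extra plotting kwargs the function forwards to it (as the calls elsewhere on this page suggest).

fig, ax = plt.subplots()
v, venn_sets, venn_counts = venn_diagram(
    ['a', 'b', 'c'], ['b', 'c', 'd'], set_labels=('X', 'Y'), ax=ax)
print(venn_counts)  # e.g. {'10': 1, '01': 1, '11': 2}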
def quantify_follow_up_pathways(ipa_res,
                                corr_pval_df,
                                comparisons,
                                pids,
                                alpha=0.05,
                                alpha_strict=0.005):
    log_alpha_strict = -np.log10(alpha_strict)

    pws = corr_pval_df.columns[(corr_pval_df < alpha).any(axis=0)]
    pval_cols_syn = ipa_res.columns[ipa_res.columns.str.contains(
        r'_syngeneic_-logp')]
    follow_up_pathways = pd.DataFrame(
        index=pws, columns=['Syngen. only', 'Ref. only', 'Intersect.'])

    for pw in pws:
        this_ipa_pvals_syn = ipa_res.loc[pw, pval_cols_syn]
        this_ipa_pvals_syn.index = this_ipa_pvals_syn.index.str.replace(
            '_syngeneic_-logp', '')
        this_ipa_pvals_syn.dropna(inplace=True)

        this_ipa_pvals_refs = pd.DataFrame(columns=pids, index=comparisons)
        this_ipa_refs_sign = set()
        for r in comparisons:
            t = ipa_res.loc[pw, ["%s_%s_-logp" % (pid, r) for pid in pids]]
            t.index = t.index.str.replace(r'_.*', '')
            this_ipa_pvals_refs.loc[r, pids] = t
            this_ipa_refs_sign.update(t.index[t >= log_alpha_strict])

        # use the venn set machinery for convenient counting
        for_venn = [
            this_ipa_pvals_syn.index[
                this_ipa_pvals_syn >= log_alpha_strict].tolist(),
            sorted(this_ipa_refs_sign)
        ]
        vs, vc = setops.venn_from_arrays(*for_venn)

        follow_up_pathways.loc[pw, 'Syngen. only'] = vc['10']
        follow_up_pathways.loc[pw, 'Ref. only'] = vc['01']
        follow_up_pathways.loc[pw, 'Intersect.'] = vc['11']

    return follow_up_pathways
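A synthetic usage sketch (all names invented); the column labels follow the patterns the function assumes, '<pid>_syngeneic_-logp' and '<pid>_<comparison>_-logp'.

import numpy as np
import pandas as pd

pids_demo = ['018', '019']
comparisons_demo = ['h9', 'gibco']
pathways = ['PW1', 'PW2']
cols = (['%s_syngeneic_-logp' % p for p in pids_demo] +
        ['%s_%s_-logp' % (p, c) for p in pids_demo for c in comparisons_demo])
ipa_res = pd.DataFrame(np.random.uniform(0, 5, (len(pathways), len(cols))),
                       index=pathways, columns=cols)
corr_pval_df = pd.DataFrame(np.random.uniform(0, 0.1, (len(pids_demo), len(pathways))),
                            index=pids_demo, columns=pathways)
out = quantify_follow_up_pathways(ipa_res, corr_pval_df, comparisons_demo, pids_demo)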
    pair_only = pd.DataFrame(index=pids,
                             columns=pids + additional_pids + ['GIBCO'])
    ref_only = pd.DataFrame(index=pids,
                            columns=pids + additional_pids + ['GIBCO'])
    pair_and_ref_concordant = pd.DataFrame(index=pids,
                                           columns=pids + additional_pids +
                                           ['GIBCO'])
    pair_and_ref_discordant = pd.DataFrame(index=pids,
                                           columns=pids + additional_pids +
                                           ['GIBCO'])

    # loop over GBM samples
    for pid in pids:
        # syngeneic comparison
        the_pair = de_res[(pid, pid)]

        # loop over (i)NSC samples
        # when this is the same as the syngeneic comparison, there will (obviously) be no 'pair only' or 'ref only'
        # genes!
        for pid2 in pids + additional_pids + ['GIBCO']:
            the_ref = de_res[(pid, pid2)]
            the_sets, _ = setops.venn_from_arrays(the_pair.index,
                                                  the_ref.index)
            pair_only.loc[pid, pid2] = the_sets['10']
            ref_only.loc[pid, pid2] = the_sets['01']
            # for overlapping genes: separate based on direction (matching or non matching)
            the_conc_idx = (the_pair.loc[the_sets['11']].Direction ==
                            the_ref.loc[the_sets['11']].Direction)
            pair_and_ref_concordant.loc[pid, pid2] = the_pair.loc[
                the_sets['11']].loc[the_conc_idx].index
            pair_and_ref_discordant.loc[pid, pid2] = the_pair.loc[
                the_sets['11']].loc[~the_conc_idx].index

    # can get counts like this
    po_counts = pair_only.applymap(len)
    ro_counts = ref_only.applymap(len)

    # the permutation part
Example #7
    n_perm = 1000

    # DE

    pids = consts.PIDS
    outdir = output.unique_output_dir()
    # load previously generated DE results
    fn = os.path.join(HGIC_LOCAL_DIR, 'current', 'core_pipeline', 'rnaseq',
                      'full_de_syngeneic_only.xlsx')
    de_res = pd.read_excel(fn, header=0, index_col=0)
    all_ens = de_res.index[(de_res[pids] == 'Y').any(axis=1)]

    de_per_pat = {pid: de_res.index[de_res[pid] == 'Y'] for pid in pids}
    n_tot = {pid: de_per_pat[pid].size for pid in pids}

    vs, vc = setops.venn_from_arrays(*[de_per_pat[pid] for pid in pids])
    pp = setops.specific_sets(pids)
    n_ps = {pid: vc[pp[pid]] for pid in pids}

    # perms
    n_all = len(all_ens)
    n_spec = run_patient_specific_permutations(n_tot, n_all, n_perm=n_perm)

    fig, axs = plot_perms_kde_vs_obs(
        n_spec, n_ps, xlabel='Number of patient-specific DE genes', order=pids)
    fig.savefig(os.path.join(outdir, "patient_specific_de.png"), dpi=200)
    fig.savefig(os.path.join(outdir, "patient_specific_de.tiff"), dpi=200)
    fig.savefig(os.path.join(outdir, "patient_specific_de.pdf"))

    fn = os.path.join(HGIC_LOCAL_DIR, 'current', 'core_pipeline',
                      'methylation', 'full_dmr_syngeneic_only.xlsx')
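A sketch of what setops.specific_sets (used above to pick out the patient-specific partitions) presumably returns, as a hypothetical re-implementation: for each member, the binary key of the Venn partition unique to it.

def specific_sets_sketch(members):
    # hypothetical stand-in for setops.specific_sets
    n = len(members)
    return dict(
        (m, ''.join('1' if j == i else '0' for j in range(n)))
        for i, m in enumerate(members)
    )

print(specific_sets_sketch(['018', '019', '030']))
# {'018': '100', '019': '010', '030': '001'}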
Example #8
                venn.venn_diagram(*[this_res[pid][t].index for t in ['iNSC'] + refs], set_labels=['iNSC'] + refs, ax=ax)
                ax.set_title(pid, fontsize=16)
            for i in range(len(pids), 12):
                ax = axs.flat[i]
                ax.set_visible(False)
            fig.subplots_adjust(left=0.02, right=0.98, bottom=0.02, top=0.95)
            fig.savefig(os.path.join(outdir, "number_de_genes_ref_comparison_%s_%s.png" % (nm, m)), dpi=200)

            # number of PO genes in Venn diagrams
            fig, axs = plt.subplots(nrows=3, ncols=4, figsize=(10, 6))
            for i, pid in enumerate(pids):
                a = this_res[pid]['iNSC'].index
                po = []
                for ref in refs:
                    b = this_res[pid][ref].index
                    vs, vc = setops.venn_from_arrays(a, b)
                    po.append(vs['10'])
                ax = axs.flat[i]
                venn.venn_diagram(*po, set_labels=refs, ax=ax)
                ax.set_title("GBM%s pair only" % pid, fontsize=16)
            for i in range(len(pids), 12):
                ax = axs.flat[i]
                ax.set_visible(False)
            fig.subplots_adjust(left=0.02, right=0.98, bottom=0.02, top=0.95)
            fig.savefig(os.path.join(outdir, "po_de_genes_ref_comparison_%s_%s.png" % (nm, m)), dpi=200)

            # number of PO DE genes

            # overlap between individual references in terms of PO genes shared
            pct_pair_only_intersect = pd.DataFrame(index=pids, columns=refs)
    # run the dgidb lookup against all genes
    # have to chunk this operation to avoid error
    all_genes = sorted(
        setops.reduce_union(*[t.gene.values
                              for t in joint_de_dmr_s1.values()]))
    dgi_all = druggable_genome.dgidb_lookup_drug_gene_interactions(all_genes)

    # manually resolve a few known ambiguities
    ambig = {'ELTD1': 'ADGRL4', 'ODZ3': 'TENM3'}
    for k, v in ambig.items():
        x = [t for t in dgi_all['ambiguous'][k] if t['geneName'] == v][0]
        dgi_all['interactions'][k] = x['interactions']

    de_dmr_by_member = [joint_de_dmr_s1[pid].index for pid in pids]
    venn_set, venn_ct = setops.venn_from_arrays(*de_dmr_by_member)

    # define short and long list

    # long list
    ss = setops.specific_sets(pids)
    ps_de_dm_long = collections.OrderedDict([(pid, venn_set[ss[pid]])
                                             for pid in pids])

    ps_de_dm_long_list = setops.reduce_union(*ps_de_dm_long.values())

    # short list
    vs_dm, vc_dm = setops.venn_from_arrays(
        *[dmr_res_s1[pid].results_significant.keys() for pid in pids])
    vs_de, vc_de = setops.venn_from_arrays(
        *[de_res_s1[pid]['Gene Symbol'].dropna() for pid in pids])
Example #10
    # check that signature genes are all found in the data
    for k, v in genesets.items():
        for i, t in enumerate(v):
            if t in manual_gene_name_correction:
                v[i] = manual_gene_name_correction[t]
        g_in = rnaseq_dat.index.intersection(v)
        if set(g_in) != set(v):
            missing = set(v).difference(rnaseq_dat.index)
            logger.warn(
                "%d genes in the %s signature do not match with the data index and will be dropped: %s.",
                len(missing), k, ', '.join(missing))
            genesets[k] = g_in

    # check here whether there is any overlap
    vs, vc = setops.venn_from_arrays(*genesets.values())
    n_overlap = sum(
        [vc[t] for t in setops.binary_combinations_sum_gte(len(genesets), 2)])
    if n_overlap > 0:
        logger.warn(
            "The %d gene signatures used here have %d overlapping genes - please check this is OK.",
            len(genesets), n_overlap)

    # run ssGSEA then Z transform the results
    es = gsva.ssgsea(rnaseq_dat, genesets)
    es_z = z_transform(es, axis=1)

    # export
    for_export = es_z.transpose()
    for_export.insert(for_export.shape[1], 'Verhaak classification',
                      rnaseq_meta.loc[for_export.index, 'expression_subclass'])
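The overlap check above sums counts over every partition key with two or more members, via setops.binary_combinations_sum_gte. A hypothetical re-implementation for illustration:

import itertools

def binary_combinations_sum_gte_sketch(n, m):
    # all length-n binary keys with at least m members set
    for flags in itertools.product('01', repeat=n):
        if flags.count('1') >= m:
            yield ''.join(flags)

print(list(binary_combinations_sum_gte_sketch(3, 2)))
# ['011', '101', '110', '111'] (ordering may differ from the real helper)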
Example #11
        main_fig_bounds['top'] - main_fig_bounds['bottom'],
    ])
    fig.savefig(os.path.join(outdir,
                             "dmr_direction_effect_size_pie_array.png"),
                dpi=200)

    # run down the rows or columns and generate an 'overlap spectrum' for each one
    # rows: check the effect of varying the iNSC line (CONSISTENCY)
    # cols: check the effect of varying the GIC line (non-syngeneic DIFFERENCE)
    # also repeat for the columns, which is just the S1 approach (SYNGENEIC)

    row_collapse = pd.DataFrame(
        dict([(pid,
               setops.quantify_feature_membership(
                   setops.venn_from_arrays(
                       *[dmr_res_all['%s-%s' % (pid, p)].keys()
                         for p in pids])[1])) for pid in pids]))[pids]

    col_collapse = pd.DataFrame(
        dict([(pid,
               setops.quantify_feature_membership(
                   setops.venn_from_arrays(
                       *[dmr_res_all['%s-%s' % (p, pid)].keys()
                         for p in pids])[1])) for pid in pids]))[pids]

    syn_dist = setops.quantify_feature_membership(
        setops.venn_from_arrays(
            *[dmr_res_all['%s-%s' % (p, p)].keys() for p in pids])[1])

    # bar charts
    fig, axs = plt.subplots(len(pids),
Example #13
        # we're going to use passthrough mapping to customise the node colour
        # we'll define 3 colourmaps, with -log10(p) assigning the shade:
        # greyscale for syn. and ref.
        # reds for ref. only
        # blues for syn. only
        # colours are defined by HEX values? Add these to the nodes
        logp_vals = [t['-logp'] for t in this_ipa]
        vmax = max([t.max() for t in logp_vals])

        # we need a lower offset for the non-grey colours, otherwise all the white shades look very similar
        vmin = -2
        cmap_both_func = common.continuous_cmap(0, vmax, cmap='Greys')
        cmap_syn_func = common.continuous_cmap(vmin, vmax, cmap='Blues')
        cmap_ref_func = common.continuous_cmap(vmin, vmax, cmap='Reds')
        vs, _ = setops.venn_from_arrays(*[t.index for t in this_ipa])

        node_significance = {}
        node_colours = {}
        node_attrs = {}
        for k, p_arr in vs.items():
            ix = [t == '1' for t in k]
            n = float(sum(ix))
            for pth in p_arr:
                m = 0
                for i, t in enumerate(ix):
                    if t:
                        m += logp_vals[i][pth]
                        node_attrs.setdefault(
                            pth, {})["plogp_%s" %
                                     comparisons[i]] = logp_vals[i][pth]
Example #14
        u_hypo = {}
        u_hyper = {}

        core_dmrs_hypo[k1] = setops.reduce_intersection(*[
            setops.reduce_intersection(
                *[x['Hypomethylated'] for k, x in u.items() if r in k])
            for r in esc_ref_names
        ])
        core_dmrs_hyper[k1] = setops.reduce_intersection(*[
            setops.reduce_intersection(
                *[x['Hypermethylated'] for k, x in u.items() if r in k])
            for r in esc_ref_names
        ])

    # outcome
    vs, vc = setops.venn_from_arrays(*core_dmrs_hypo.values())
    print "Hypomethylated core DMRs (hypo in both ESC comparisons). "
    print "Of the %d DMRs in our data, %d are shared with both HipSci and E-MTAB-6194" % (
        len(core_dmrs_hypo[k_our_ipsc]), vc['111'])

    vs, vc = setops.venn_from_arrays(*core_dmrs_hyper.values())
    print "Hypermethylated core DMRs (hyper in both ESC comparisons). "
    print "Of the %d DMRs in our data, %d are shared with both HipSci and E-MTAB-6194" % (
        len(core_dmrs_hyper[k_our_ipsc]), vc['111'])

    # for each PID in iPSC vs ESC, define the core DMRs (shared by both ref comparisons)
    # then split into hyper and hypo

    core_dmr_our_ipsc_ref_esc = core_dmrs(dmr_res_our_ipsc_vs_esc, pids,
                                          esc_ref_names)
    core_dmr_direction_our_ipsc_ref_esc = core_dmr_by_direction(
Example #15
    de_res_full_s1 = dict([
        (pid, de_res_full_s1[("GBM%s" % pid, "iNSC%s" % pid)]) for pid in pids
    ])

    with open(fn, 'wb') as f:
        pickle.dump(de_res_full_s1, f)

    logger.info("Saved S1 DE results to %s", fn)

# extract only significant DE genes
de_res_s1 = dict([(k, v.loc[v.FDR < de_params['fdr']])
                  for k, v in de_res_full_s1.items()])

# generate wide-form lists and save to Excel file
de_by_member = [de_res_s1[pid].index for pid in pids]
venn_set, venn_ct = setops.venn_from_arrays(*de_by_member)

# add null set manually from full DE results
de_genes_all = setops.reduce_union(*venn_set.values())
k_null = ''.join(['0'] * len(pids))
venn_set[k_null] = list(de_res_full_s1[pids[0]].index.difference(de_genes_all))
venn_ct[k_null] = len(venn_set[k_null])

de_data = setops.venn_set_to_wide_dataframe(de_res_s1,
                                            venn_set,
                                            pids,
                                            full_data=de_res_full_s1,
                                            cols_to_include=['logFC', 'FDR'],
                                            consistency_check_col='logFC',
                                            consistency_check_method='sign')
# add gene symbols back in
Example #16
        # reds for ref. only
        # blues for syn. only
        # colours are defined by HEX values? Add these to the nodes
        logp_syn = -np.log10(res_syn.fdr + eps)
        logp_r1 = -np.log10(res_r1.fdr + eps)
        logp_r2 = -np.log10(res_r2.fdr + eps)
        vmax = max(
            logp_syn.max(),
            logp_r1.max(),
            logp_r2.max(),
        )

        cmap_both_func = common.continuous_cmap(0, vmax, cmap='Greys')
        cmap_syn_func = common.continuous_cmap(0, vmax, cmap='Blues')
        cmap_ref_func = common.continuous_cmap(0, vmax, cmap='Reds')
        vs, _ = setops.venn_from_arrays(
            *[t.index for t in (res_syn, res_r1, res_r2)])

        node_colours = {}
        for pth in vs['111'] + vs['101'] + vs['110']:
            node_colours[pth] = cmap_both_func(logp_syn[pth])
        for pth in vs['100']:
            node_colours[pth] = cmap_syn_func(logp_syn[pth])
        for pth in vs['011']:
            # mean P for refs
            node_colours[pth] = cmap_ref_func(0.5 *
                                              (logp_r1[pth] + logp_r2[pth]))
        for pth in vs['010']:
            node_colours[pth] = cmap_ref_func(logp_r1[pth])
        for pth in vs['001']:
            node_colours[pth] = cmap_ref_func(logp_r2[pth])
Example #17
    pool = mp.Pool()
    jobs = {}
    myc_corr_probes = []
    for p in myc_probes:
        jobs[p] = pool.apply_async(one_vs_many_correlation,
                                   args=(the_data.loc[p], the_data),
                                   kwds=dict(method=corr_method))
        # cor, pval = one_vs_many_correlation(the_data.loc[p], the_data, method=corr_method)
        # these_probes = cor.index[(cor.abs() > cross_corr_threshold) & (pval < alpha)]
        # myc_corr_probes.append(these_probes)

    pool.close()
    pool.join()
    for p in myc_probes:
        cor, pval = jobs[p].get(1e4)
        these_probes = cor.index[(cor.abs() > cross_corr_threshold)
                                 & (pval < alpha)]
        myc_corr_probes.append(these_probes)

    #  out of interest, what is the overlap between these? (presumably quite high?)
    vs, vc = setops.venn_from_arrays(*myc_corr_probes)

    # union of probes
    keep_probes = setops.reduce_union(*myc_corr_probes)

    print "After comparing all data against each MYC probe, we are left with %d correlated probes" % len(
        keep_probes)

    genes_corr_with_myc = the_symbols.loc[keep_probes].dropna()
    print "These correspond to %d unique genes." % len(
        genes_corr_with_myc.unique())

    # check the overlap with validated genes
    overlap = pd.Index(validated_genes).intersection(
        genes_corr_with_myc.unique())
    if len(overlap) == len(validated_genes):
Example #18
    fdr_dat = fdr_dat[fdr_dat < alpha]
    fdr_dat.columns = res.keys()  # dict is ordered, so this is OK
    all_in = ~fdr_dat.isnull()
    all_in.columns = res.keys()  # dict is ordered, so this is OK
    log_fdr_dat = np.log10(fdr_dat + 1e-6) * -1


    # number syngen. only, ref. only and intersection
    n_set = pd.DataFrame(0, index=pathways_sign, columns=['Syngen. only', 'Ref. only', 'Intersect.'], dtype=int)
    so = {}
    ro = {}
    inters = {}
    for pid in pids:
        s = all_in.index[all_in.loc[:, "%s_syngeneic" % pid]]
        r = all_in.index[all_in.loc[:, ["%s_%s" % (pid, t) for t in comparison_names.values()[1:]]].any(axis=1)]
        vs, _ = setops.venn_from_arrays(s, r)
        n_set.loc[vs['10'], 'Syngen. only'] += 1
        n_set.loc[vs['01'], 'Ref. only'] += 1
        n_set.loc[vs['11'], 'Intersect.'] += 1
    n_set = n_set.fillna(0)

    from ipa_results_s1_s2 import pathway_involvement_heatmap_by_p
    comparison_dict = {
        'syngeneic': 'Syngen.',
        'h9': 'H9',
        'gibco': 'Gibco'
    }

    # plot 1) P values, ordered by sum of -log(P)
    p_order = log_fdr_dat.sum(axis=1).sort_values(ascending=False).index
    plot_dict = pathway_involvement_heatmap_by_p(
Example #19
    dat_classified = dict([(pid, run_one_sort(var_dat[pid], 'GIC', 'iNSC'))
                           for pid in pids])

    # search through GIC only and GIC hom/iNSC het SNPs and 'other' and generate upset
    members = {}
    for pid, d in dat_classified.items():
        members[pid] = set()
        for typ in ['GIC only', 'GIC hom iNSC het', 'other']:
            for x in d[typ]:
                if isinstance(x, dict):
                    members[pid].add(str(x['GIC']))
                else:
                    members[pid].add(str(x))

    vs, vc = setops.venn_from_arrays(*[members[pid] for pid in pids])

    venn_sets_by_group = setops.full_partial_unique_other_sets_from_groups(
        pids, groups)

    hypo_count_full = vc[venn_sets_by_group['full']['Hypo'][0]]
    hyper_count_full = vc[venn_sets_by_group['full']['Hyper'][0]]

    hypo_counts_partial = [(setops.key_to_members(t, pids), vc[t])
                           for t in venn_sets_by_group['partial']['Hypo']]
    hyper_counts_partial = [(setops.key_to_members(t, pids), vc[t])
                            for t in venn_sets_by_group['partial']['Hyper']]

    # is this significant in any way?
    # focus on 3/4 of hypo OR 5/6 of hyper
    it = itertools.combinations(pids, 4)
Example #20
    # now we need to compare the paired results with every other result (Gibco and other iNSC)
    pair_only = pd.DataFrame(index=pids, columns=cols)
    ref_only = pd.DataFrame(index=pids, columns=cols)
    pair_and_ref_concordant = pd.DataFrame(index=pids, columns=cols)
    pair_and_ref_discordant = pd.DataFrame(index=pids, columns=cols)
    # loop over GBM samples
    for pid in pids:
        # syngeneic comparison
        the_pair = de_res[(pid, pid)]

        # loop over (i)NSC samples
        # when this is the same as the syngeneic comparison, there will (obviously) be no 'pair only' or 'ref only'
        # genes!
        for pid2 in cols:
            the_ref = de_res[(pid, pid2)]
            the_sets, _ = setops.venn_from_arrays(the_pair.index,
                                                  the_ref.index)
            pair_only.loc[pid, pid2] = the_sets['10']
            ref_only.loc[pid, pid2] = the_sets['01']
            # for overlapping genes: separate based on direction (matching or non matching)
            the_conc_idx = (the_pair.loc[the_sets['11']].Direction ==
                            the_ref.loc[the_sets['11']].Direction)
            pair_and_ref_concordant.loc[pid, pid2] = the_pair.loc[
                the_sets['11']].loc[the_conc_idx].index
            pair_and_ref_discordant.loc[pid, pid2] = the_pair.loc[
                the_sets['11']].loc[~the_conc_idx].index

    # can get counts like this
    po_counts = pair_only.applymap(len)
    ro_counts = ref_only.applymap(len)

    ## genes that are pair-only in every possible ref comparison
Example #21
    ax2.yaxis.set_ticks(new_ticks)
    ax2.yaxis.set_ticklabels(new_ticklabels, rotation=90, color='gray')
    ax2.set_ylabel("Number of patients sharing pathway", color='gray')
    ax2.grid(False)

    ax.figure.tight_layout()
    ax.figure.savefig(os.path.join(outdir, "hgic_de_ipa_top%d.png" % top_n),
                      dpi=200)
    ax.figure.savefig(os.path.join(outdir, "hgic_de_ipa_top%d.tiff" % top_n),
                      dpi=200)

    # export a wideform dataframe containing all these pathways with log_p, etc.
    for_export = {}
    for pid in pids:
        for_export[pid] = res[pid].loc[p_top[pid]]

    vs, vc = setops.venn_from_arrays(*[p_top[pid] for pid in pids])
    out = setops.venn_set_to_wide_dataframe(
        for_export,
        vs,
        pids,
        full_data=res,
        cols_to_include=['-log_p', 'ratio', 'z'],
        static_cols_to_include=['genes'])
    # excel.pandas_to_excel(out, os.path.join(outdir, "ipa_de_top_%d_pathways.xlsx" % top_n))
    out.to_excel(os.path.join(outdir, "ipa_de_top_%d_pathways.xlsx" % top_n))
    """
    Note to myself:
    I did consider an UpSet plot here. However, with the full DE lists, the result isn't very edifying...
    With the exception of patient-specific pathways, all sets have 2 or fewer pathways. 
    """
def get_de_dmr_groups(
        joint_de_dmr,
        clusters,
        groups,
        pids=consts.PIDS,
        relation_filter=None
):
    """
    Get group-specific DE/DMRs. These are defined as DEs that are consistent with the DMRs in a given selection of
    patients (from one to many) that are NOT shared across groups.
    :param joint_de_dmr: Dictionary keyed by patient ID. Values are DataFrames indexed by (DMR cluster ID, gene)
    tuples, with columns including 'dmr_median_delta', 'de_logFC' and 'de_FDR'.
    :param clusters: Dictionary of DMR cluster objects keyed by cluster ID; each exposes a `genes` attribute
    containing (gene, relation) tuples.
    :param groups: Dictionary, keyed by group name. Values are iterables giving patient IDs in each group.
    :param pids: List of patient IDs.
    :param relation_filter: If supplied, a single relation (e.g. 'TSS200') or an iterable of relations; only
    DE/DMR entries whose cluster-gene relation matches are retained.
    :return: Dictionary with keys 'dmr_median_delta_m', 'de_logFC', 'de_FDR' (each a dict of DataFrames keyed by
    group) and 'de_dmr_groups' (dict of DE/DMR ID lists keyed by group).
    """
    venn_sets_by_group = setops.full_partial_unique_other_sets_from_groups(pids, groups)

    if relation_filter is not None:
        if not hasattr(relation_filter, '__iter__'):
            relation_filter = [relation_filter]

    de_dmr_groups = {}
    de_dmr_de_logfc = {}
    de_dmr_de_fdr = {}
    de_dmr_dmr_delta = {}

    if relation_filter is None:
        de_dmr_by_member = [joint_de_dmr[pid].index for pid in pids]
    else:
        de_dmr_by_member = []
        for pid in pids:
            this_members = []
            for t in joint_de_dmr[pid].index:
                gene_rel_options = [(t[1], rel) for rel in relation_filter]
                if len(set(clusters[t[0]].genes).intersection(gene_rel_options)) > 0:
                    this_members.append(t)
            de_dmr_by_member.append(this_members)
    venn_set, venn_count = setops.venn_from_arrays(*de_dmr_by_member)

    for grp in groups:
        this_sets = venn_sets_by_group['full'][grp] + venn_sets_by_group['partial'][grp]
        this_de_dmrs = sorted(setops.reduce_union(*[venn_set[k] for k in this_sets]))

        if relation_filter is not None:
            new_de_dmrs = []
            for t in this_de_dmrs:
                # look for any intersection here
                gene_rel_options = [(t[1], rel) for rel in relation_filter]
                if len(set(clusters[t[0]].genes).intersection(gene_rel_options)) > 0:
                    new_de_dmrs.append(t)
            this_de_dmrs = new_de_dmrs

        de_dmr_groups[grp] = this_de_dmrs

        # get separate lists of DE genes and DMR IDs
        # DMRs is straightforward
        de_dmr_dmr_delta[grp] = pd.DataFrame(
            index=sorted(set([t[0] for t in this_de_dmrs])),
            columns=pids + ['consistent'],
        )
        # DEs is trickier: some genes have mapped twice because I was so diligent in curating the original lists!
        this_de_genes = sorted(set([t[1] for t in this_de_dmrs]))
        this_de_ens = annotation_gene_to_ensembl.gene_to_ens(this_de_genes)
        this_de_ens = this_de_ens[~this_de_ens.duplicated()]
        this_de_genes = this_de_ens.index

        de_dmr_de_logfc[grp] = pd.DataFrame(
            index=this_de_genes.tolist(),
            columns=pids + ['consistent'],
        )
        de_dmr_de_fdr[grp] = pd.DataFrame(
            index=this_de_genes.tolist(),
            columns=pids + ['consistent'],
        )

        # fill them in
        for k in this_sets:
            this_vs = [t for t in venn_set[k] if t[1] in this_de_genes]
            this_pids = [pids[i] for i, t in enumerate(k) if t == '1']
            for pid in this_pids:
                de_dmr_dmr_delta[grp].loc[[t[0] for t in this_vs], pid] = joint_de_dmr[pid].loc[
                    this_vs, 'dmr_median_delta'].values
                de_dmr_de_logfc[grp].loc[[t[1] for t in this_vs], pid] = joint_de_dmr[pid].loc[
                    this_vs, 'de_logFC'].values
                de_dmr_de_fdr[grp].loc[[t[1] for t in this_vs], pid] = joint_de_dmr[pid].loc[
                    this_vs, 'de_FDR'].values

        # NB iterrows() yields copies, so results must be written back via .loc
        for k, row in de_dmr_dmr_delta[grp].iterrows():
            tmp_dm = np.sign(row[pids].dropna().astype(float))
            de_dmr_dmr_delta[grp].loc[k, 'consistent'] = (tmp_dm == tmp_dm.iloc[0]).all()

        for k, row in de_dmr_de_logfc[grp].iterrows():
            tmp_de = np.sign(row[pids].dropna().astype(float))
            this_consistent = (tmp_de == tmp_de.iloc[0]).all()
            de_dmr_de_logfc[grp].loc[k, 'consistent'] = this_consistent
            de_dmr_de_fdr[grp].loc[k, 'consistent'] = this_consistent

    return {
        'dmr_median_delta_m': de_dmr_dmr_delta,
        'de_logFC': de_dmr_de_logfc,
        'de_FDR': de_dmr_de_fdr,
        'de_dmr_groups': de_dmr_groups
    }
Example #23
        the_contrast = "GBM - NSC"

        # de_gibco[pid] = differential_expression.edger_exacttest(
        #     the_data,
        #     the_groups,
        #     pair=['NSC', 'GBM'],
        #     lfc=lfc,
        #     fdr=fdr
        # )
        # de_gibco[pid] = differential_expression.edger_glmqlfit(the_data, the_groups, the_contrast)
        de_gibco[pid] = differential_expression.edger_glmfit(
            the_data, the_groups, the_contrast)

        # Separate into sets
        # all
        de[pid], de_counts[pid] = setops.venn_from_arrays(
            de_matched[pid].index, de_gibco[pid].index)

        # up only
        idx_up_match = de_matched[pid].loc[de_matched[pid].logFC > 0].index
        idx_up_ref = de_gibco[pid].loc[de_gibco[pid].logFC > 0].index
        de_up[pid], de_counts_up[pid] = setops.venn_from_arrays(
            idx_up_match, idx_up_ref)

        # down only
        idx_down_match = de_matched[pid].loc[de_matched[pid].logFC < 0].index
        idx_down_ref = de_gibco[pid].loc[de_gibco[pid].logFC < 0].index
        de_down[pid], de_counts_down[pid] = setops.venn_from_arrays(
            idx_down_match, idx_down_ref)

        # write to files, one worksheet per list (5 per individual)
        # paired comparison (all)
Example #24
    # compute DE between hGIC and paired iNSC
    de_res = {}
    de_res_full = {}
    for pid in pids:
        hgic_samples = rnaseq_obj.meta.index[
            rnaseq_obj.meta.index.str.contains(pid)]
        the_data = rnaseq_obj.data.loc[:, hgic_samples]
        the_groups = rnaseq_obj.meta.loc[hgic_samples, 'type']
        the_comparison = ['GBM', 'iNSC']
        de_res[pid] = differential_expression.run_one_de(
            the_data, the_groups, the_comparison, **de_params)
        # the full (unfiltered) results are needed below to construct the null set
        de_res_full[pid] = differential_expression.run_one_de(
            the_data, the_groups, the_comparison, return_full=True, **de_params)
        print "GBM %s paired comparison, %d DE genes" % (pid,
                                                         de_res[pid].shape[0])

    venn_set, venn_ct = setops.venn_from_arrays(
        *[de_res[pid].index for pid in pids])

    # add null set manually
    de_genes_all = reduce(lambda x, y: set(x).union(y), venn_set.values())
    k_null = ''.join(['0'] * len(pids))
    venn_set[k_null] = list(
        de_res_full[pids[0]].index.difference(de_genes_all))
    venn_ct[k_null] = len(venn_set[k_null])

    # check direction is the same
    venn_set_consistent = {}
    venn_set_inconsistent = {}
    for k in venn_set:
        the_genes = venn_set[k]
        the_pids = [pids[i] for i, t in enumerate(k) if t == '1']
        the_de_direction = pd.DataFrame(
def one_random_perm(set_sizes, N):
    rand_sets = [np.random.choice(N, v) for v in set_sizes.values()]
    _, vc = setops.venn_from_arrays(*rand_sets)
    return vc
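One caveat worth flagging: np.random.choice draws with replacement by default, so a simulated "set" here can contain duplicate elements and its effective size can fall below the requested one. If genuine fixed-size subsets are intended, replace=False would be needed; a quick demonstration:

import numpy as np

rng = np.random.RandomState(0)
with_dups = rng.choice(10, 8)               # replacement: duplicates possible
no_dups = rng.choice(10, 8, replace=False)  # a genuine 8-element subset
print(len(set(with_dups)), len(set(no_dups)))  # first is often < 8, second is always 8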
Example #26
    # DE results
    the_hash = tsgd.de_results_hash(meta_s1.index.tolist(), de_params)
    filename = 'de_results_paired_comparison.%d.pkl' % the_hash
    fn = os.path.join(DE_LOAD_DIR, filename)

    if os.path.isfile(fn):
        logger.info("Reading S1 DE results from %s", fn)
        with open(fn, 'rb') as f:
            de_res_full_s1 = pickle.load(f)
    else:
        raise AttributeError(
            "Unable to find pre-computed S1 comparison results.")

    de_res_s1 = dict([(k, v.loc[v.FDR < de_params['fdr']])
                      for k, v in de_res_full_s1.items()])
    vs, vc = setops.venn_from_arrays(*[de_res_s1[pid].index for pid in pids])
    de_res_wide = setops.venn_set_to_wide_dataframe(
        de_res_s1, vs, pids, cols_to_include=['logFC', 'FDR'])

    ipa_de_res = collections.OrderedDict()
    for pid in pids:
        fn = os.path.join(de_indir, "full_de_patient{pid}.xls".format(pid=pid))
        this_df = pd.read_excel(fn, skiprows=1, header=0, index_col=0)
        this_df.columns = ['-logp', 'ratio', 'z', 'genes']
        this_df.insert(3, 'n_gene', this_df.genes.str.split(',').apply(len))
        # filter to include only relevant pathways
        ipa_de_res[pid] = this_df.loc[this_df['-logp'] >= plogalpha]

    # for plotting
    groups = [(pid, dat_s1.columns[dat_s1.columns.str.contains(pid)])
              for pid in pids]
def upset_set_size_plot(data,
                        set_labels,
                        set_colours=None,
                        order_by_n_members=False,
                        include_singletons=False,
                        min_size=1,
                        n_plot=None,
                        bar_width=0.9,
                        point_ms=10,
                        default_colour='#4C72B0',
                        **kwargs):
    """
    Produce a summary plot showing the set sizes when the number of sets is > 4.
    Inspired / totally copying UpsetR: https://cran.r-project.org/web/packages/UpSetR/vignettes/basic.usage.html
    :param data: Array of iterables containing the full data set of each member.
    :param set_labels: Array of strings giving the name of each member, in the same order as data.
    :param set_colours: Dict/list of tuples/OrderedDict giving the name and shading of one or more sets. E.g.
    [
        (group_A, {'sets': ['010', '011'], 'colour': 'red'}),
        (group_B, {'sets': ['110', '001'], 'colour': 'blue'}),
    ]
    The name is used for the legend. It can be None to skip the legend entry for this group.
    Use the ordered options if order matters in the lower left stacked plot. Use a list to have multiple entries
    with the same group name.
    If supplied, these will be used for shading all three plots. If not, we just shade the singleton sets in the
    lower left set size plot.
    :param order_by_n_members: If True, order the plot by the number of members participating in each set. This has
    the effect of generating a bar chart that has multiple bunches of descending bars.
    :param include_singletons: If True, singleton sets are included in the main bar. Not really necessary as they are
    also plotted in the lower left bar.
    :param min_size: This is used to exclude sets falling below the minimum size. Can be disabled (set to None), but
    this is pointless since it involves plotting empty sets, which cannot be ordered meaningfully.
    :param n_plot: If not None, this is used to limit the number of sets plotted.
    :param bar_width: Used for plotting bar charts.
    :param point_ms: Size of the circles in the lower right plot.
    :param default_colour: The colour used for anything that isn't otherwise shaded.
    """

    n_set = len(set_labels)
    if len(data) != len(set_labels):
        raise AttributeError(
            "Number of supplied data sets (%d) doesn't match the length of set_labels (%d)."
            % (len(data), n_set))
    venn_sets, venn_ct = setops.venn_from_arrays(*data, **kwargs)

    if set_colours is None:
        str_fmt = "{0:0%db}" % n_set
        # NB the string must be reversed here
        singleton_sets = set(
            [str_fmt.format(2**i)[::-1] for i in range(n_set)])
        other_sets = set([k for k in venn_ct if k not in singleton_sets])
        set_colours = [
            ('Non-unique', {
                'sets': other_sets,
                'colour': default_colour
            }),
            ('Unique', {
                'sets': singleton_sets,
                'colour': '#ff8484'
            }),
        ]
    else:
        try:
            set_colours = set_colours.items()
        except AttributeError:
            set_colours = list(set_colours)
        sets_seen = set()
        for nm, d in set_colours:
            this_sets = d['sets']
            if len(sets_seen.intersection(this_sets)):
                raise ValueError(
                    "Group %s contains one or more sets already contained elsewhere in set_colours"
                    % nm)
            sets_seen.update(this_sets)
        sets_remaining = set(venn_ct.keys()).difference(sets_seen)
        if len(sets_remaining) > 0:
            set_colours = [(None, {
                'sets': sets_remaining,
                'colour': default_colour
            })] + set_colours

    # convenience function to find the colour matching a given set
    def set_lookup(k):
        for t in set_colours:
            if k in t[1]['sets']:
                return t

    lightgrey = '#cecece'

    if include_singletons:
        sort_input = venn_ct
    else:
        # exclude any results with only one set
        sort_input = dict(
            [t for t in venn_ct.items() if len(t[0].replace('0', '')) > 1])

    if min_size is not None:
        sort_input = dict([(k, v) for k, v in sort_input.items()
                           if v > min_size])

    if order_by_n_members:
        ordered_counts = []
        for i in range(1, len(set_labels) + 1):
            # make a collection of results with this many members
            this_collection = []
            for k in setops.binary_combinations_sum_eq(len(set_labels), i):
                # check whether this entry is present, if not it has already been filtered out
                if k in sort_input:
                    this_collection.append((k, sort_input[k]))
            # sort in descending order and append to list
            ordered_counts.extend(
                sorted(this_collection, key=lambda x: x[1], reverse=True))
    else:
        ordered_counts = sorted(sort_input.items(),
                                key=lambda x: x[1],
                                reverse=True)

    if n_plot:
        ordered_counts = ordered_counts[:n_plot]

    gs_kw = dict(
        left=0.05,
        right=0.99,
        top=0.99,
        bottom=0.1,
        wspace=0.1,
        hspace=0.01,
        height_ratios=[6, 3],
        width_ratios=[3, 6],
    )

    # set up axis grid
    gs = gridspec.GridSpec(nrows=2, ncols=2, **gs_kw)
    fig = plt.figure(figsize=(9, 6))
    ax_tl = fig.add_subplot(gs[0, 0])
    ax_set_size = fig.add_subplot(gs[1, 0])
    ax_intersect = fig.add_subplot(gs[1, 1], sharey=ax_set_size)
    ax_main = fig.add_subplot(gs[0, 1], sharex=ax_intersect)

    # hide some things
    ax_tl.set_visible(False)
    plt.setp(ax_intersect.get_yticklabels(), visible=False)
    plt.setp(ax_main.get_xticklabels(), visible=False)
    plt.setp(ax_intersect.get_xticklabels(), visible=False)

    # data
    x_arr = np.arange(len(ordered_counts)) + 0.5
    y_arr = np.arange(n_set)

    # main bar chart
    colours = [set_lookup(t[0])[1]['colour'] for t in ordered_counts]
    ax_main.bar(x_arr, [t[1] for t in ordered_counts],
                width=bar_width,
                color=colours)
    ax_main.set_ylabel('Number of DE genes in set')

    # bottom right set intersections
    # grey markers everywhere
    for y in y_arr:
        ax_intersect.plot(x_arr,
                          np.ones_like(x_arr) * y,
                          marker='o',
                          mfc=lightgrey,
                          mec='none',
                          ms=point_ms,
                          ls='none')
    # overplot shaded markers on sets that are included
    for i, (k, v) in enumerate(ordered_counts):
        x = x_arr[i]
        y = [j for j, u in enumerate(k) if u == '1']
        c = set_lookup(k)[1]['colour']
        ax_intersect.plot(x * np.ones(len(y)),
                          y,
                          marker='o',
                          mfc=c,
                          mec=c,
                          ms=point_ms,
                          ls='none')

    # bottom left : set size and singleton (unique) set size
    left = np.zeros(n_set)

    set_sizes = []
    for nm, d in set_colours:
        this_ss = np.zeros(n_set)
        for k in d['sets']:
            for i in range(n_set):
                if k[i] == '1':
                    this_ss[i] += venn_ct[k]
        set_sizes.append([nm, this_ss])
        ax_set_size.barh(y_arr + 0.5,
                         this_ss,
                         height=-bar_width,
                         left=left,
                         align='edge',
                         label=nm,
                         color=d['colour'])
        left += this_ss

    ax_set_size.invert_xaxis()
    ax_set_size.set_ylim([-.5, len(set_labels) - .5])
    ax_set_size.yaxis.tick_right()
    ax_set_size.set_yticks(y_arr)
    ax_set_size.set_yticklabels(set_labels)
    ax_set_size.set_xlabel("Number of DE genes in single comparison")
    ax_set_size.legend(
        loc='lower left',
        # fontsize=8,
        frameon=False,
        facecolor='w',
        # edgecolor='k',
        bbox_to_anchor=(0.05, 1.1),  # place above and outside the axis
    )

    return {
        'gs': gs,
        'axes': {
            'set_size': ax_set_size,
            'intersection': ax_intersect,
            'main': ax_main,
            'top_left': ax_tl
        },
        'figure': fig
    }
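A minimal usage sketch with synthetic data (labels invented). Given the imports the function already relies on (setops, gridspec, numpy, matplotlib), this should produce a five-set UpSet-style figure:

data = [
    ['g%d' % i for i in range(0, 60)],
    ['g%d' % i for i in range(30, 90)],
    ['g%d' % i for i in range(50, 120)],
    ['g%d' % i for i in range(0, 40)],
    ['g%d' % i for i in range(80, 140)],
]
set_labels = ['A', 'B', 'C', 'D', 'E']
res = upset_set_size_plot(data, set_labels, order_by_n_members=True, n_plot=15)
res['figure'].savefig('upset_demo.png', dpi=200)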
Example #28
            dmr_sign.loc[pid, pid2] = sorted(dmr_res[pid][pid2].clusters_significant.keys())

    dmr_counts = dmr_sign.applymap(len)

    # pair only
    pair_only = pd.DataFrame(index=pids, columns=pids + ['GIBCO'])
    ref_only = pd.DataFrame(index=pids, columns=pids + ['GIBCO'])
    pair_and_ref_concordant = pd.DataFrame(index=pids, columns=pids + ['GIBCO'])
    pair_and_ref_discordant = pd.DataFrame(index=pids, columns=pids + ['GIBCO'])
    for pid in pids:
        for pid2 in pids + ['GIBCO']:
            p = dmr_sign.loc[pid, pid]
            r = dmr_sign.loc[pid, pid2]
            pres = dmr_res[pid][pid].results_significant
            rres = dmr_res[pid][pid2].results_significant
            x, _ = setops.venn_from_arrays(p, r)
            pair_only.loc[pid, pid2] = x['10']
            ref_only.loc[pid, pid2] = x['01']
            # ref and pair IDs
            pr_id = x['11']
            # signs
            pmed_change_sign = np.array([np.sign(pres[t]['median_change']) for t in pr_id])
            rmed_change_sign = np.array([np.sign(rres[t]['median_change']) for t in pr_id])

            pair_and_ref_concordant.loc[pid, pid2] = list(
                np.array(x['11'])[pmed_change_sign == rmed_change_sign]
            )

            pair_and_ref_discordant.loc[pid, pid2] = list(
                np.array(x['11'])[pmed_change_sign != rmed_change_sign]
            )
    de_res = differential_expression.compute_cross_de(rnaseq_obj, pids, external_references=external_refs, **de_params)

    # add the combined DE results for the refs combined
    for pid in pids:
        # complete intersection
        the_idx = sorted(reduce(intersecter, [de_res[(pid, t)].index for t in external_ref_labels]))
        one_cols = de_res[(pid, pid)].columns
        tups = reduce(lambda x, y: x + y, [zip([t] * one_cols.size, one_cols.tolist()) for t in external_ref_labels])
        the_cols = pd.MultiIndex.from_tuples(tups, names=['ref', 'field'])
        the_block = pd.DataFrame(index=the_idx, columns=the_cols)
        for t in external_ref_labels:
            the_block.loc[the_idx, t] = de_res[(pid, t)].loc[the_idx].values
        de_res[(pid, 'ref_intersect')] = the_block

        # intersect 2
        this_venn, _ = setops.venn_from_arrays(*[de_res[(pid, t)].index for t in external_ref_labels])
        the_idx = reduce(unioner, [this_venn[k] for k in setops.binary_combinations_sum_gte(len(external_refs), 2)])
        the_block = pd.DataFrame(index=the_idx, columns=the_cols)
        for t in external_ref_labels:
            try:
                the_block.loc[the_idx, t] = de_res[(pid, t)].loc[the_idx].values
            except KeyError:
                # no matches for this ref - no problem
                pass
        de_res[(pid, 'ref_intersect2')] = the_block

        # union
        the_idx = sorted(reduce(unioner, [de_res[(pid, t)].index for t in external_ref_labels]))
        the_block = pd.DataFrame(index=the_idx, columns=the_cols)
        for t in external_ref_labels:
            try:
Example #30
def upset_plot_with_groups(data,
                           set_labels,
                           subgroup_ind,
                           subgroup_colours,
                           venn_set=None,
                           other_lbl='Expanded core',
                           specific_lbl='Specific',
                           default_colour='gray',
                           **kwargs):
    """
    Wrapper around the basic upset plotting function. This allows us to highlight sets that fully or partially
    overlap with a pre-defined subgroup.
    :param data: Passed to upset_set_size_plot. Iterable of identifiers used to process venn sets.
    :param set_labels: Iterable of set labels.
    :param subgroup_ind: Dictionary, keys are set_labels, entries are Boolean indexes showing which of set_labels
    are in this subgroup. If ordering is desired, use an OrderedDict.
    :param subgroup_colours: Dict giving the colour for each of the subsets defined in subgroup ind. For each set S,
    two entries are needed, keyed `S full` and `S partial`.
    We can also define two additional colours, which otherwise have default values:
    `Expanded core` (or whatever `other_lbl` is set to) and `Specific`.
    :param venn_set: Output of setops.venn_from_arrays(data). Can supply it to skip recomputing.
    :param other_lbl: Label used to identify those sets that span multiple subgroups.
    :param specific_lbl: Label used to identify those sets that are specific to a single member.
    :param kwargs: Passed to upset_set_size_plot
    :return: Same output as upset plot function.
    """
    # UpsetR attribute plots
    default_colour_other = '#4C72B0'
    default_colour_specific = '#f4e842'

    if venn_set is None:
        venn_set, _ = setops.venn_from_arrays(*data)

    # set colours for UpsetR plot
    sets_full = {}
    sets_partial = {}
    sets_unique = []

    ## TODO: merge this with setops.full_partial_unique_other_sets_from_groups
    for k in venn_set:
        # NB compare the '0'/'1' characters explicitly: astype(bool) on a string
        # array treats any non-empty string (including '0') as True
        this_k = np.array([t == '1' for t in k])
        if this_k.sum() == 1:
            sets_unique.append(k)
        elif this_k.sum() > 1:
            for grp, grp_idx in subgroup_ind.items():
                n_member = grp_idx.sum()
                # no other matches
                if this_k[~grp_idx].sum() == 0:
                    if this_k[grp_idx].sum() == n_member:
                        sets_full.setdefault(grp, []).append(k)
                    else:
                        sets_partial.setdefault(grp, []).append(k)

    set_colours = []
    for grp_name in subgroup_ind:
        k_full = "%s full" % grp_name
        if grp_name in sets_full:
            set_colours.append((k_full, {
                'sets':
                sets_full[grp_name],
                'colour':
                subgroup_colours.get(k_full, default_colour)
            }))

        k_part = "%s partial" % grp_name
        if grp_name in sets_partial:
            set_colours.append((k_part, {
                'sets':
                sets_partial[grp_name],
                'colour':
                subgroup_colours.get(k_part, default_colour)
            }))

    set_colours.append(
        (other_lbl, {
            'sets': expanded_core_sets(venn_set, subgroup_ind),
            'colour': subgroup_colours.get(other_lbl, default_colour_other)
        }), )

    set_colours.append(
        (specific_lbl, {
            'sets': sets_unique,
            'colour': subgroup_colours.get(specific_lbl,
                                           default_colour_specific)
        }), )

    return upset_set_size_plot(data,
                               set_labels,
                               set_colours=set_colours,
                               default_colour=default_colour,
                               venn_set=venn_set,
                               **kwargs)
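A hypothetical usage sketch building on the synthetic data from the upset_set_size_plot example above: subgroup_ind flags which of the set_labels belong to each subgroup, and subgroup_colours supplies the '<name> full' / '<name> partial' shades the docstring describes. Note that expanded_core_sets is assumed to be defined elsewhere in this module, and that upset_set_size_plot is assumed to tolerate the venn_set kwarg it is passed here.

import collections
import numpy as np

set_labels = ['A', 'B', 'C', 'D', 'E']
subgroup_ind = collections.OrderedDict([
    ('Grp1', np.array([True, True, False, False, False])),
    ('Grp2', np.array([False, False, True, True, True])),
])
subgroup_colours = {
    'Grp1 full': '#1b9e77', 'Grp1 partial': '#a6dba0',
    'Grp2 full': '#d95f02', 'Grp2 partial': '#fdbf6f',
}
res = upset_plot_with_groups(data, set_labels, subgroup_ind, subgroup_colours, n_plot=15)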