예제 #1
0
            # Report the gene identifiers that are about to be dropped from
            # signature `k`. Logger.warn is a deprecated alias (removed in
            # Python 3.13); Logger.warning takes exactly the same arguments.
            logger.warning(
                "%d genes in the %s signature do not match with the data index and will be dropped: %s.",
                len(missing), k, ', '.join(missing))
            # Replace the stored signature with `g_in`
            # (presumably the genes found in the data index - defined above this view; confirm).
            genesets[k] = g_in

    # Check whether any gene appears in more than one signature.
    # NOTE(review): `vc` is assumed to map each set-membership key to a gene
    # count, and binary_combinations_sum_gte(n, 2) to yield the keys that cover
    # two or more of the n sets - confirm against the setops module.
    vs, vc = setops.venn_from_arrays(*genesets.values())
    # A generator is enough here; sum() does not need an intermediate list.
    n_overlap = sum(
        vc[t] for t in setops.binary_combinations_sum_gte(len(genesets), 2))
    if n_overlap > 0:
        # Logger.warning replaces the deprecated Logger.warn alias
        # (removed in Python 3.13); the arguments are unchanged.
        logger.warning(
            "The %d gene signatures used here have %d overlapping genes - please check this is OK.",
            len(genesets), n_overlap)

    # Run ssGSEA on the expression data with the gene signatures, then apply
    # z_transform to the resulting scores along axis=1.
    es = gsva.ssgsea(rnaseq_dat, genesets)
    es_z = z_transform(es, axis=1)

    # Export the transposed scores to an Excel file in `outdir`, with two extra
    # columns taken from rnaseq_meta ('expression_subclass' and
    # 'wang_classification'), looked up by the index of the exported table.
    for_export = es_z.transpose()
    # Inserting at position shape[1] appends the new column after the existing ones.
    for_export.insert(for_export.shape[1], 'Verhaak classification',
                      rnaseq_meta.loc[for_export.index, 'expression_subclass'])
    for_export.insert(for_export.shape[1], 'Wang classification',
                      rnaseq_meta.loc[for_export.index, 'wang_classification'])
    for_export.to_excel(
        os.path.join(outdir, "tcga_signature_scores_and_subgroups.xlsx"))

    # boxplot by subgroup
    if class_method == 'verhaak':
        groups = rnaseq_meta.expression_subclass
        # remove any small groups (e.g. a single G-CIMP instance)
예제 #2
0
    # Work on a copy so that relabelling the columns leaves the_list_mo untouched.
    to_export = the_list_mo.copy()
    to_export.columns = ['Mouse BMDM', 'Mouse MG']


    # Combine the values of the_list_hu into one collection
    # (presumably the set union of all gene lists, going by the helper's name - confirm in setops).
    all_genes_in_set = setops.reduce_union(*the_list_hu.values())

    # DEBUG: disable filtering genes - why would we need to?
    # The branch below is deliberately switched off and never executes.
    if False:
        # Remove genes that have no appreciable expression level, but always
        # keep any gene listed in all_genes_in_set.
        # NOTE(review): the original note reads ">=10 samples must have FPKM >= 1",
        # whereas both comparisons below are strict (>) - confirm which is intended.
        to_keep = ((rnaseq_dat > fpkm_cutoff).sum(axis=1) > fpkm_min_samples) | (rnaseq_dat.index.isin(all_genes_in_set))
        # Single parenthesised argument: prints the same text under the Python 2
        # print statement and the Python 3 print function, so the module still
        # parses on Python 3.
        print("Keeping %d / %d genes that are sufficiently abundant" % (to_keep.sum(), to_keep.size))
        rnaseq_dat = rnaseq_dat.loc[to_keep]

    # Run ssGSEA on both datasets (rnaseq_dat and ffpe_dat) with the same gene lists.
    # NOTE(review): rna_list_hu is passed here while the surrounding code reads
    # the_list_hu - confirm that the two are meant to differ.
    rna_es = gsva.ssgsea(rnaseq_dat, rna_list_hu)
    ffpe_es = gsva.ssgsea(ffpe_dat, rna_list_hu)

    # Scale both score tables with the Z transform along axis=1.
    # TODO: previous operation had axis=None
    rna_z = z_transform(rna_es, axis=1)
    ffpe_z = z_transform(ffpe_es, axis=1)

    # Overlay one KDE curve per key of the_list_hu on a single set of axes,
    # each drawn from rna_z.loc[g_name].
    fig = plt.figure(num="TCGA RNA-Seq")
    ax = fig.add_subplot(111)
    for g_name in the_list_hu:
        sns.kdeplot(rna_z.loc[g_name], ax=ax)
    ax.set_xlabel("Normalised ssGSEA score")
    ax.set_ylabel("Density")
    # Save the figure to `outdir` twice: as a 200 dpi PNG and as a PDF.
    fig.savefig(os.path.join(outdir, 'rnaseq_ssgsea_score_tcga.png'), dpi=200)
    fig.savefig(os.path.join(outdir, 'rnaseq_ssgsea_score_tcga.pdf'))