# NOTE(review): the next two statements look like the tail of a per-signature
# loop (they use `k`, `missing` and `g_in`, all bound outside this view).
# Their original nesting cannot be recovered from this chunk - confirm the
# indentation against the enclosing loop before merging.
logger.warning(
    "%d genes in the %s signature do not match with the data index and will be dropped: %s.",
    len(missing), k, ', '.join(missing))
genesets[k] = g_in

# Check here whether there is any overlap between the signatures.
# `vc` is indexed with every key yielded by
# setops.binary_combinations_sum_gte(len(genesets), 2); presumably these are
# the set combinations involving two or more signatures - TODO confirm
# against setops.
vs, vc = setops.venn_from_arrays(*genesets.values())
n_overlap = sum(
    vc[t] for t in setops.binary_combinations_sum_gte(len(genesets), 2))
if n_overlap > 0:
    logger.warning(
        "The %d gene signatures used here have %d overlapping genes - please check this is OK.",
        len(genesets), n_overlap)

# Run ssGSEA then Z transform the results.
es = gsva.ssgsea(rnaseq_dat, genesets)
es_z = z_transform(es, axis=1)

# Export: transpose the scores, then append the two classification columns.
# The transposed index is used to look up the matching rows of `rnaseq_meta`.
for_export = es_z.transpose()
# DataFrame.insert is kept (rather than plain column assignment) because it
# raises if the column name already exists instead of silently overwriting.
for_export.insert(
    for_export.shape[1],
    'Verhaak classification',
    rnaseq_meta.loc[for_export.index, 'expression_subclass'])
for_export.insert(
    for_export.shape[1],
    'Wang classification',
    rnaseq_meta.loc[for_export.index, 'wang_classification'])
for_export.to_excel(
    os.path.join(outdir, "tcga_signature_scores_and_subgroups.xlsx"))

# Boxplot by subgroup.
if class_method == 'verhaak':
    groups = rnaseq_meta.expression_subclass
    # remove any small groups (e.g. a single G-CIMP instance)
# Copy of the mouse signature table with human-readable column names.
to_export = the_list_mo.copy()
to_export.columns = ['Mouse BMDM', 'Mouse MG']

# Union of every gene that appears in any of the human signatures.
all_genes_in_set = setops.reduce_union(*the_list_hu.values())

# DEBUG: disable filtering genes - why would we need to?
# The branch is deliberately switched off with `if False:` and is retained
# only so that the abundance filter can be re-enabled easily.
if False:
    # Remove genes that have no appreciable expression level: keep a gene when
    # more than `fpkm_min_samples` samples exceed `fpkm_cutoff`, or when the
    # gene belongs to one of the signatures.
    # NOTE(review): the original comment read ">=10 samples must have
    # FPKM >= 1", but both comparisons below are strict (>) - confirm which
    # is intended before re-enabling.
    to_keep = (
        ((rnaseq_dat > fpkm_cutoff).sum(axis=1) > fpkm_min_samples)
        | (rnaseq_dat.index.isin(all_genes_in_set))
    )
    # Single-argument parenthesised print: valid and identical on Python 2
    # and Python 3 (the bare print statement is a syntax error on Python 3).
    print("Keeping %d / %d genes that are sufficiently abundant" % (to_keep.sum(), to_keep.size))
    rnaseq_dat = rnaseq_dat.loc[to_keep]

# Run ssGSEA.
# NOTE(review): scoring uses `rna_list_hu`, whereas the gene union above and
# the plotting loop below use `the_list_hu`; both are defined outside this
# view - confirm they carry the same signature names.
rna_es = gsva.ssgsea(rnaseq_dat, rna_list_hu)
ffpe_es = gsva.ssgsea(ffpe_dat, rna_list_hu)

# Scale using the Z transform.
# TODO: previous operation had axis=None
rna_z = z_transform(rna_es, axis=1)
ffpe_z = z_transform(ffpe_es, axis=1)

# One kernel density curve per signature, drawn on a shared axis.
fig = plt.figure(num="TCGA RNA-Seq")
ax = fig.add_subplot(111)
for g_name in the_list_hu:
    sns.kdeplot(rna_z.loc[g_name], ax=ax)
ax.set_xlabel("Normalised ssGSEA score")
ax.set_ylabel("Density")

# Save both a raster (200 dpi) and a vector copy of the figure.
fig.savefig(os.path.join(outdir, 'rnaseq_ssgsea_score_tcga.png'), dpi=200)
fig.savefig(os.path.join(outdir, 'rnaseq_ssgsea_score_tcga.pdf'))