def reannotate(this_res):
    """
    Replace the Ensembl IDs listed in each result table with gene symbols, for easier interpretability.

    :param this_res: Dict of DataFrames. Each must have a `study_items` column holding ', '-separated
    Ensembl IDs (null entries are permitted).
    :return: New dict with the same keys. Each value is a copy of the input table in which `study_items`
    has been dropped and a final column `genes_in_term` added, holding the ','-joined symbols (empty string
    where `study_items` was null). The input tables are not modified.
    """
    # gather every Ensembl ID across all tables so that the conversion is carried out in one go
    ens_ids = set()
    for tbl in this_res.values():
        for ids in tbl.study_items.str.split(', ').dropna():
            ens_ids.update(ids)
    gene_conv = reference_genomes.ensembl_to_gene_symbol(sorted(ens_ids))

    def _to_symbols(items):
        # null entries become an empty string; IDs that have no symbol are left out of the joined result
        if pd.isnull(items):
            return ''
        return ','.join(gene_conv.loc[items.split(', ')].dropna().values)

    new_res = {}
    for key, tbl in this_res.items():
        out = tbl.copy()
        symbols = [_to_symbols(items) for items in out.study_items]
        out.drop('study_items', axis=1, inplace=True)
        # append as the last column
        out.insert(out.shape[1], 'genes_in_term', symbols)
        new_res[key] = out
    return new_res
def top_genes(
        data,
        n=100,
        convert_to_symbols=True,
        tax_id=9606,
):
    """
    Retrieve the top n genes (largest values first) from each column of the data.

    :param data: DataFrame indexed by Ensembl ID. Each column is ranked independently.
    :param n: Number of top-ranked genes to keep per column.
    :param convert_to_symbols: If True, report gene symbols in place of Ensembl IDs. An ID that has no
    symbol, or whose symbol was already claimed by an earlier ID, is reported as the Ensembl ID itself.
    :param tax_id: Forwarded to reference_genomes.ensembl_to_gene_symbol().
    :return: Dict keyed by column name. Each value is the set of gene identifiers in the top n of that column.
    """
    gs = None
    if convert_to_symbols:
        # get gene symbols and drop all NaN
        gs = reference_genomes.ensembl_to_gene_symbol(data.index, tax_id=tax_id).dropna()
        # keep only the first entry for any repeated Ensembl ID and for any repeated symbol
        gs = gs.loc[~gs.index.duplicated()]
        gs = gs.loc[~gs.duplicated()]

    res = {}
    for col in data.columns:
        t = data.loc[:, col].sort_values(ascending=False).iloc[:n]
        if gs is None:
            res[col] = set(t.index)
            continue
        # reindex() yields NaN for IDs that were filtered out of gs above. Using .loc with those missing
        # labels raises KeyError in recent pandas, which would prevent the fallback below from ever running.
        symbols = gs.reindex(t.index)
        unresolved = symbols.isnull().values
        res[col] = set(
            ens if missing else symb
            for ens, symb, missing in zip(t.index, symbols.values, unresolved)
        )
    return res
def load_rnaseq_htseq_count_data(by_gene=False):
    """
    Load in HTSeq counting data from pre-existing ht-seq run.

    :param by_gene: If True, translate the raw Ensembl codes to gene symbol. Discard any that do not translate,
    except the entries listed in RNA_COUNT_FIELDS (_ambiguous, _no_feature, _unmapped), which are retained under
    their original labels after the translated genes.
    :return: pd.DataFrame with one column per entry in `infiles` (keyed by tag).
    """
    infiles = {
        'XZ1': 'xz1_exon_counts_gr37_reverse.dill',
    }
    res = pd.DataFrame()
    for tag, fn in infiles.items():
        ff = os.path.join(RNASEQ_GENE_COUNTS_DIR, fn)
        # NOTE(review): unpickling can execute arbitrary code - only load files from a trusted location.
        with open(ff, 'rb') as f:
            t = pickle.load(f)
        if by_gene:
            count_fields = list(RNA_COUNT_FIELDS)
            trans = reference_genomes.ensembl_to_gene_symbol(t.index)
            # keep only the non-null entries; the summary fields are handled explicitly below
            trans = trans.dropna()
            trans = trans.loc[~trans.index.isin(count_fields)]
            # Select the rows in exactly the order in which the new labels are assigned.
            # Index.union() may sort its result, which can leave the labels attached to the wrong rows.
            t = t.loc[list(trans.index) + count_fields]
            # reindex
            t.index = list(trans.values) + count_fields
        res[tag] = t
    return res
def add_gene_symbols(df):
    """
    Insert a 'Gene Symbol' column at the front of df, which must be indexed by Ensembl IDs.
    The DataFrame is modified in place and nothing is returned.
    """
    symbols = reference_genomes.ensembl_to_gene_symbol(df.index)
    # where an Ensembl ID appears more than once in the lookup, keep the first entry (these should be rare)
    first_seen = ~symbols.index.duplicated()
    df.insert(0, 'Gene Symbol', symbols.loc[first_seen])
def prepare_gct_files(outdir=None):
    """
    Write the GCT files required to perform classification:
    - Our GBM FFPE and cell culture samples (gbm_ffpe_cc_fpkm.gct)
    - TCGA RNA-Seq cohort, IDH1 WT only (tcga_idh1_wt_fpkm.gct)
    - Both combined (tcga_idh1_wt_and_gbm_ffpe_cc_fpkm.gct)
    In all cases, use FPKM units and gene symbols, as these are used by Wang

    :param outdir: Directory to write to. If None, a new one is requested from unique_output_dir().
    :return: None
    """
    if outdir is None:
        outdir = unique_output_dir("gct_files_for_wang")

    # 1) Our data: FFPE columns are renamed <reference_id>_FFPE; only GBM cell culture samples are kept
    ffpe = rnaseq_data.load_by_patient('all', type='ffpe')
    fpkm_ffpe = ffpe.get_fpkm()
    fpkm_ffpe.columns = ['%s_FFPE' % ref for ref in ffpe.meta.reference_id]

    culture = rnaseq_data.load_by_patient(patient_ids='all')
    fpkm_culture = culture.get_fpkm()
    fpkm_culture = fpkm_culture.loc[:, culture.meta.type == 'GBM']

    ours = pd.concat((fpkm_culture, fpkm_ffpe), axis=1)
    # keep only the genes with a symbol and index by that symbol
    symbols = reference_genomes.ensembl_to_gene_symbol(ours.index).dropna()
    ours = ours.loc[symbols.index]
    ours.index = symbols
    ours_fn = os.path.join(outdir, "gbm_ffpe_cc_fpkm.gct")
    gsea.data_to_gct(ours, ours_fn)

    # 2) TCGA (IDH1 WT only)
    tcga_dat, tcga_meta = rnaseq_data.tcga_primary_gbm(units='fpkm')
    tcga_dat = tcga_dat.loc[:, tcga_meta.idh1_status == 'WT']
    symbols = reference_genomes.ensembl_to_gene_symbol(tcga_dat.index).dropna()
    # here, repeated Ensembl IDs in the lookup are also reduced to their first entry
    symbols = symbols.loc[~symbols.index.duplicated()]
    tcga_dat = tcga_dat.loc[symbols.index]
    tcga_dat.index = symbols
    tcga_fn = os.path.join(outdir, "tcga_idh1_wt_fpkm.gct")
    gsea.data_to_gct(tcga_dat, tcga_fn)

    # 3) Combined: built from the two files just written, in that order
    combined = gsea.combine_gct_files(ours_fn, tcga_fn)
    combined_fn = os.path.join(outdir, "tcga_idh1_wt_and_gbm_ffpe_cc_fpkm.gct")
    gsea.data_to_gct(combined, combined_fn)
design=design) de_res_separate = {} for p in pids: de_res_separate[p] = differential_expression.edger_test( fit, design, "groupSmartSeq%s - groupPolyA%s" % (p, p)) general.add_gene_symbols_to_ensembl_data(de_res_separate[p]) print "Patient %s: %d DE genes in SmartSeq2 - PolyA (%d up, %d down)." % ( p, de_res_separate[p].shape[0], (de_res_separate[p].logFC > 0).sum(), (de_res_separate[p].logFC < 0).sum(), ) de_in_all = reference_genomes.ensembl_to_gene_symbol( setops.reduce_intersection( *[t.index for t in de_res_separate.values()])) # sort this by the avg logFC logfc_in_all = pd.DataFrame.from_dict( dict([(p, v.loc[de_in_all.index, 'logFC']) for p, v in de_res_separate.items()])) logfc_in_all = logfc_in_all.loc[logfc_in_all.mean( axis=1).abs().sort_values(ascending=False).index] general.add_gene_symbols_to_ensembl_data(logfc_in_all) fig = plt.figure(figsize=(5, 5)) ax = fig.add_subplot(111, facecolor='w') venn.venn_diagram(set_labels=de_res_separate.keys(), *[t.index for t in de_res_separate.values()], ax=ax) fig.tight_layout()
# pick one in each case # duplicated() doesn't mark the first entry by default aa = aa[~pd.Index(aa.values).duplicated()] dat2 = dat.loc[:, aa.index.intersection(dat.columns)] cols = aa.loc[dat2.columns].values bb = cases_mani.loc[cols] # these should be unique... if pd.Index(bb.values).duplicated().any(): raise AttributeError("Some case IDs are duplicated in the final dataset") dat2.columns = bb.values # finally, only keep those cases that are also in the Brennan table not_in_meta = pd.Index(bb.values).difference(meta.index) if len(not_in_meta): this = bb.loc[pd.Index(bb.values).isin(not_in_meta)] print "Cases not in meta: \n%s" % this.to_string() meta = meta.loc[meta.index.intersection(bb.values)] dat2 = dat2.loc[:, meta.index] # add gene symbols gs = reference_genomes.ensembl_to_gene_symbol(dat2.index) dat2.loc[:, 'Approved Symbol'] = gs # export meta.to_csv(os.path.join(indir, 'sources.csv')) dat2.to_csv(os.path.join(indir, 'counts.csv'))
for g in cl.genes: gene_to_dm_cluster.setdefault(g[0], set()).add(c) ens_to_dm_cluster = {} for c, cl in dmr_res_s1.clusters.items(): for g in cl.genes: if g[0] in all_dm_ens: e = all_dm_ens[g[0]] ens_to_dm_cluster.setdefault(e, set()).add(c) all_de_ens_with_dm = sorted( setops.reduce_union(*[ de_res_full_s1[pid].reindex(all_dm_ens).dropna().index for pid in pids ])) ens_to_gs = reference_genomes.ensembl_to_gene_symbol(all_de_ens_with_dm) # single mega table single_de_dm_df = [] de_fdr = {} dm_fdr = {} # plus separate DGIdb dump dgi_db_df = [] # use to determine which relations are relevant all_relations = sorted( setops.reduce_union(*[[t[1] for t in cl.genes] for cl in dmr_res_s1.clusters.values()])) for e in all_de_ens_with_dm:
def plot_biplot(
        dat,
        meta,
        dims,
        scatter_colours,
        scatter_markers,
        annotate_features_radius=None,
        annotate_features_quantile=None,
        adjust_annotation=True,
        adjust_annotation_kwargs=None,
        **kwargs
):
    """
    Draw a biplot with pca.biplot(), connect the samples of differing type within each patient, add a custom
    legend outside the axes and, optionally, annotate outlying features with their gene symbol.

    :param dat: Passed to pca.biplot(). The index is used to look up gene symbols when annotating.
    :param meta: pd.DataFrame, must have columns entitled `type` and `patient_id`
    :param dims: Passed to pca.biplot() as `plot_dims`.
    :param scatter_colours: Dict keyed by patient ID, giving the colour. Must contain every entry of consts.PIDS.
    :param scatter_markers: Dict keyed by sample type, giving the marker.
    :param annotate_features_radius: If supplied, this is the biplot radius outside of which we annotate genes
    (by symbol).
    :param annotate_features_quantile: If supplied, must satisfy 0 < value < 1. Features whose radius is at or
    above the cutoff computed from this quantile are annotated. Cannot be combined with annotate_features_radius.
    :param adjust_annotation: If True, the annotation labels are rearranged to avoid overlaps.
    :param adjust_annotation_kwargs: Dict passed to adjuster.adjust_text_radial_plus_repulsion().
    :param **kwargs: Passed to pca.biplot()
    :return: (fig, ax, res), where res is the dict returned by pca.biplot() with `legend_dict` added.
    """
    radius_given = annotate_features_radius is not None
    quantile_given = annotate_features_quantile is not None

    if radius_given and quantile_given:
        raise AttributeError("Supply EITHER annotate_features_radius OR annotate_features_quantile.")
    if quantile_given:
        assert 0 < annotate_features_quantile < 1, "annotate_features_quantile must be between 0 and 1 (not inclusive)."

    text_adjust_kwargs = {} if adjust_annotation_kwargs is None else adjust_annotation_kwargs

    res = pca.biplot(
        dat,
        plot_dims=dims,
        sample_colours=meta.patient_id.map(scatter_colours.get).to_dict(),
        sample_markers=meta.type.map(scatter_markers.get).to_dict(),
        **kwargs
    )
    sample_x, sample_y = res['sample_data']
    feat_x, feat_y = res['feature_data']
    ax = res['ax']
    fig = res['fig']

    # the distinct sample types, in order of first appearance
    cell_types = meta.type.factorize()[1]

    def _first_at(values, label):
        # coordinate of the (first) sample carrying this label in meta.index
        return values[meta.index == label][0]

    # connect patients: within each patient, join every pair of samples that are of two different types
    for pid in meta.patient_id.unique():
        in_patient = meta.patient_id == pid
        for type_a, type_b in itertools.combinations(cell_types, 2):
            # all possible connections between these two cell types (in one direction only)
            members_a = meta.index[in_patient & (meta.type == type_a)]
            members_b = meta.index[in_patient & (meta.type == type_b)]
            for src, dst in itertools.product(members_a, members_b):
                ax.plot(
                    [_first_at(sample_x, src), _first_at(sample_x, dst)],
                    [_first_at(sample_y, src), _first_at(sample_y, dst)],
                    lw=1.5,
                    color=scatter_colours[pid],
                    zorder=9
                )

    # custom legend outside of plot: one patch per patient, one marker per cell type
    marker_style = {
        'class': 'line',
        'markerfacecolor': 'none',
        'markeredgecolor': 'k',
        'markeredgewidth': 1.0,
        'linestyle': 'none'
    }
    patch_style = {
        'class': 'patch',
        'edgecolor': 'k',
        'linewidth': 1.
    }
    patient_legend = collections.OrderedDict()
    for pid in consts.PIDS:
        patient_legend[pid] = dict(patch_style, facecolor=scatter_colours[pid])
    type_legend = collections.OrderedDict()
    for cell_type in cell_types:
        type_legend[cell_type] = dict(marker_style, marker=scatter_markers[cell_type])
    legend_dict = {'Patient': patient_legend, 'Cell type': type_legend}
    res['legend_dict'] = legend_dict

    common.add_custom_legend(ax, legend_dict, loc_outside=True)
    fig.tight_layout()
    fig.subplots_adjust(right=0.8)

    # decide which features (if any) to annotate
    selected = None
    if radius_given:
        # annotate most influential genes
        selected = pca.highlight_biplot_features(feat_x, feat_y, annotate_features_radius, ax)
    elif quantile_given:
        rad = (feat_x ** 2 + feat_y ** 2) ** .5
        cut = sorted(rad)[int(len(rad) * annotate_features_quantile)]
        selected = rad >= cut

    if selected is not None:
        symbols_selected = reference_genomes.ensembl_to_gene_symbol(dat.index[selected])
        # add gene symbol annotations, skipping any feature without a symbol
        text_handles = [
            ax.text(feat_x[pos], feat_y[pos], symbol, zorder=10)
            for pos, symbol in zip(np.where(selected)[0], symbols_selected)
            if not pd.isnull(symbol)
        ]
        # rearrange them to avoid overlaps
        if adjust_annotation:
            adjuster.adjust_text_radial_plus_repulsion(text_handles, **text_adjust_kwargs)

    return fig, ax, res
de_res = differential_expression.compute_cross_de( rnaseq_obj, pids, external_references=external_refs, **de_params) # this is useful for volcano plots, but otherwise not worth computing? # de_res_full = differential_expression.compute_cross_de(rnaseq_obj, pids, external_references=external_refs, return_full=True, **de_params) cc_dict = cross_comparison.compute_cross_comparison_correction( dict([(k, v.index) for k, v in de_res.items()]), pids, external_ref_labels, set_type='pair_only') po_specific_to_all_refs = sorted(cc_dict['specific_to_all_refs']) pair_only = cc_dict['venn_set'] # get the genes that consistently differ in the pair comparison only and NOT in Gibco (across all patients) # these will have an expression pattern in Gibco similar to GBM, so that they do NOT appear po_specific_to_all_refs_gs = reference_genomes.ensembl_to_gene_symbol( po_specific_to_all_refs) po_specific_to_all_refs_gs = po_specific_to_all_refs_gs.where( ~po_specific_to_all_refs_gs.isnull(), po_specific_to_all_refs) po_dat = rnaseq_obj.data.loc[po_specific_to_all_refs] po_dat.index = po_specific_to_all_refs_gs po_dat = np.log2(po_dat + 1) # rearrange columns the_cols = (po_dat.columns[po_dat.columns.str.contains('GBM')].tolist() + ref_samples + po_dat.columns[po_dat.columns.str.contains('DURA')].tolist()) spacing1 = po_dat.columns.str.contains('GBM').sum() spacing2 = spacing1 + len( ref_samples ) + 1 # +1 required as we will already have added a space to the left of this
ssgsea_rnaseq_data, xcell_tcga, corr_metric=corr_metric) # heatmap showing correlation between pathways and cell types # precursor: check for cases where there is a substantial overlap in genes in pathways and cell type signatures # load xCell signatures xcell_s = pd.read_excel(XCELL_SIGNATURE_FN, header=0, index_row=0) xcell_signatures = {} for i, row in xcell_s.iterrows(): xcell_signatures[row.Celltype_Source_ID] = set( row.iloc[2:].dropna().values) # convert IPA pathway Ensembl IDs to symbols for compatibility ipa_signatures_symb = {} for k, v in ipa_signatures.items(): ipa_signatures_symb[k] = reference_genomes.ensembl_to_gene_symbol( v).dropna() # compute overlap between cell type signatures and IPA signatures pct_shared = analyse_xcell_results.compute_cell_type_pathway_overlap( ipa_signatures_symb, xcell_signatures, ) # aggregate taking max over pathways cc = pct_shared.columns.str.replace(r'(?P<ct>[^_]*)_.*', r'\g<ct>') pct_shared_aggr = pct_shared.groupby(cc, axis=1).max() # set of pathways with any significance logger.info( "%d pathways enriched in at least one patient and retained after correlation analysis" % co.shape[1])
if remove_idh1:
    # filter IDH1 mutants: keep only samples with a non-null IDH1 status equal to 'WT'
    idh1_wt = (~rnaseq_meta.idh1_status.isnull()) & (rnaseq_meta.idh1_status == 'WT')
    rnaseq_meta = rnaseq_meta.loc[idh1_wt]
    rnaseq_dat = rnaseq_dat_raw.loc[:, rnaseq_meta.index]
else:
    # no IDH1 filtering: keep every column whose name contains 'TCGA'
    # NOTE(review): rnaseq_meta is not restricted in this branch - confirm it already matches these columns
    rnaseq_dat = rnaseq_dat_raw.loc[:, rnaseq_dat_raw.columns.str.contains('TCGA')]

if rnaseq_type != 'gliovis':
    # add gene symbols for gene signature scoring?
    # rows without a gene symbol are dropped and the remainder are indexed by symbol
    gs = reference_genomes.ensembl_to_gene_symbol(rnaseq_dat.index).dropna()
    rnaseq_dat = rnaseq_dat.loc[gs.index]
    rnaseq_dat.index = gs.values

if rnaseq_type == 'counts':
    # convert to CPM: divide each column by its total and scale by one million
    rnaseq_dat = rnaseq_dat.divide(rnaseq_dat.sum(axis=0), axis=1) * 1e6

# Attach the Wang classification results to the metadata.
# Each insert() is at position 0, so the final column order is the reverse of the order of these statements.
rnaseq_meta.insert(0, 'wang_classification_simplicity', wang_classes.loc[rnaseq_meta.index, 'Simplicity score'])
rnaseq_meta.insert(0, 'wang_classification_num_matches', wang_classes.loc[rnaseq_meta.index, 'Number of matches'])
rnaseq_meta.insert(0, 'wang_classification', wang_classes.loc[rnaseq_meta.index, 'Wang subclass'])
from rnaseq import general, gsea
from utils import reference_genomes
from utils.output import unique_output_dir

if __name__ == '__main__':
    outdir = unique_output_dir("mouse_gsea_files", reuse_empty=True)
    dat = rnaseq_data.mouse_nsc_salmon()
    # aggregate the transcript-level quantification to gene level
    dat = general.ensembl_transcript_quant_to_gene(dat, tax_id=10090)
    # keep only the samples matching one of these two naming patterns
    idx = dat.columns.str.contains(r'eNSC[0-9]med') | dat.columns.str.contains(r'mDura[0-9AN]*human')
    dat = dat.loc[:, idx]
    # group labels: every retained sample is 'eNSC' unless its name contains 'mDura', in which case 'iNSC'
    the_groups = pd.Series('eNSC', index=dat.columns)
    the_groups[dat.columns.str.contains('mDura')] = 'iNSC'

    # now switch from Ensembl to gene symbol and capitalize (why?)
    gs = reference_genomes.ensembl_to_gene_symbol(dat.index, tax_id=10090)
    gs = gs.str.upper()
    # keep the first entry for any repeated Ensembl ID, then discard the IDs without a symbol
    gs = gs.loc[~gs.index.duplicated()]
    gs.dropna(inplace=True)
    dat = dat.loc[gs.index]
    dat.index = gs

    # this leaves some duplicate values
    # we'll take the average
    dupe_idx = dat.index[dat.index.duplicated()]
    dupe_map = dat.index.isin(dupe_idx)
    dupes = dat.loc[dupe_map]
    dat = dat.loc[~dupe_map]
    dupes_mean = dupes.groupby(dupes.index).mean()
    # DataFrame.append() was removed in pandas 2.0; pd.concat() gives the same row-wise result
    dat = pd.concat([dat, dupes_mean])
'ENSG00000135679', 'ENSG00000198625', 'ENSG00000141510', 'ENSG00000100393', 'ENSG00000149311', 'ENSG00000012048', 'ENSG00000139618', 'ENSG00000116062', ] # kde for one gene counts = np.arange(8000) example_col = X.columns[0] example_ens = X.index[0] example_gene = reference_genomes.ensembl_to_gene_symbol(example_ens) x1 = X.loc[example_ens] n = float(len(x1)) p = X.shape[0] fr1 = reduce(operator.add, (stats.poisson.pmf(counts, t + r) for t in x1)) Fr1 = fr1.cumsum() / n # run for all genes (rows) pool = mp.Pool() jobs = {} for ei, xi in X.iterrows(): jobs[ei] = pool.apply_async(eval_one_kde_poisson, args=(xi,)) pool.close()
# mean logFC across patients for each gene, ignoring NaN entries
mean_logfc = pd.Series(np.nanmean(de_res[["%s_logFC" % p for p in pids]], axis=1), index=de_res.index)
# genes with no logFC in any patient have a NaN mean and are dropped
mean_logfc.dropna(inplace=True)
# restrict to the genes that also appear in the biplot feature data
ix = feat_dat.index.intersection(mean_logfc.index)
mean_logfc = mean_logfc.loc[ix]
# order by decreasing absolute mean logFC
mean_logfc = mean_logfc.loc[mean_logfc.abs().sort_values(ascending=False).index]

# mark the top 50 genes on the existing axes
ax.scatter(
    feat_dat.loc[mean_logfc.index[:50], 'x'],
    feat_dat.loc[mean_logfc.index[:50], 'y'],
    c='k',
    facecolor='k',
    marker='^',
)
# label each of them with its gene symbol, falling back to the Ensembl ID where there is no symbol
gg = reference_genomes.ensembl_to_gene_symbol(mean_logfc.index[:50]).dropna()
for k, v in feat_dat.loc[mean_logfc.index[:50]].iterrows():
    g = gg[k] if k in gg else k
    ax.text(v['x'], v['y'], g)

dims = (2, 3)  # for copy paste convenience
fig, ax, res = plot_biplot(
    dat,
    obj.meta,
    dims,
    scatter_colours,
    scatter_markers,
    scale=0.05
)
# feature coordinates as a DataFrame, one row per entry of dat.index
# NOTE(review): no column names are assigned here, whereas the code above reads columns 'x' and 'y' - confirm
feat_dat = pd.DataFrame(np.array(res['feature_data']).transpose(), index=dat.index)
export_hypo = [] export_hyper = [] for tt, out_arr in zip([partial_hypo_recs, partial_hyper_recs], [export_hypo, export_hyper]): for pid_arr, rec in tt: genes = set() gene_names = [] for t in rec.INFO['ANN']: srch = re.search(r'(?P<g>ENSG[0-9]*)', t) if srch is not None: genes.add(srch.group('g')) if len(genes) > 0: try: gene_names = reference_genomes.ensembl_to_gene_symbol( genes).dropna().unique() except KeyError: gene_names = [] out = collections.OrderedDict([ ('id', rec.ID), ('chrom', rec.CHROM), ('start', rec.start), ('end', rec.end), ('ref', rec.REF), ('alt_seq', '|'.join([t.sequence for t in rec.ALT])), ('alt_type', '|'.join([t.type for t in rec.ALT])), ('gene_ens', ','.join(genes)), ('gene_symbol', ','.join(gene_names)), ]) for p in pids:
# now, find the union of genes that are PO when ANY of the external references is used tmp2 = reduce(unioner, pair_only.loc[pid, external_ref_labels]) po_intersection_insc.loc[pid, 'any'] = tmp.difference(tmp2) # find DE genes po_specific_to_reference = [ sorted( reduce(intersecter, po_diff.loc[~po_diff.index.str.contains(pid), pid])) for pid in cols ] po_specific_to_reference = pd.Series(po_specific_to_reference, index=cols) # get the genes that consistently differ in the pair comparison only and NOT in Gibco (across all patients) # these will have an expression pattern in Gibco similar to GBM, so that they do NOT appear po_gibco_diff = po_specific_to_reference.loc['GIBCO'] po_gibco_diff_gs = reference_genomes.ensembl_to_gene_symbol(po_gibco_diff) po_gibco_diff_gs = po_gibco_diff_gs.where(~po_gibco_diff_gs.isnull(), po_gibco_diff) po_dat = rnaseq_obj.data.loc[po_gibco_diff] po_dat.index = po_gibco_diff_gs po_dat = np.log2(po_dat + 1) # po_dat = salmon_dat.loc[po_gibco_diff] # po_dat.index = po_gibco_diff_gs # # dropna() here loses one gene - LINC01090 / ENSG00000231689 # # all others are present # po_dat = np.log2(po_dat.dropna() + 0.01) # rearrange columns the_cols = (po_dat.columns[po_dat.columns.str.contains('GBM')].tolist() +
med_dev_nsc_rel.loc[rel_dev_candidates], c='g') ax.scatter(ranked_perc.loc[hkg_ens], med_dev_nsc_rel.loc[hkg_ens], c='r') for g, e in zip(hkg, hkg_ens): ax.text(ranked_perc.loc[e], med_dev_nsc_rel.loc[e], g) ax.set_ylim([0, 2]) ax.set_xlabel("Abundance percentile") ax.set_ylabel("Relative median absolute difference from NSC") ax.set_title("%d genes meet relative NSC-MAD criteria" % rel_dev_candidates.size) ax.figure.savefig(os.path.join( outdir, 'relative_median_absolute_deviation_from_nsc.png'), dpi=200) final_candidates = ensembl_to_gene_symbol( mad_candidates.intersection(rel_dev_candidates).intersection( range_candidates)) print "Identified %d candidates" % final_candidates.size print '\n'.join(final_candidates) # now re-plot the first figure but with a subset of these new_hkg = ['GAPDH', 'ATP5B', 'ACTB', 'PPIA', 'H3F3B'] new_hkg_ens = gene_symbol_to_ensembl(new_hkg) hkg_dat = dat_n.loc[new_hkg_ens, sorted(dat_n.columns)] hkg_dat.index = pd.Index(new_hkg, name='Housekeeping gene') hkg_dat_rel = hkg_dat.divide(hkg_dat.loc[:, ref], axis=0) cols = [ref] + sorted(hkg_dat_rel.columns[hkg_dat_rel.columns != ref]) hkg_dat_rel = hkg_dat_rel.loc[:, cols]
mfc = np.sign(mfc) * 20 log2_mfc.loc[g] = mfc t_values.dropna(inplace=True) log2_mfc.dropna(inplace=True) idx = t_values.index.intersection(log2_mfc.index) t_values = t_values.loc[idx] log2_mfc = log2_mfc.loc[idx] p_values = p_values.loc[idx] from statsmodels.sandbox.stats import multicomp tmp = multicomp.multipletests(p_values.values, method='fdr_bh', alpha=0.001) # get the genes responsible for the observed changes reference_genomes.ensembl_to_gene_symbol(the_insc.loc[tmp[0]].index, tax_id=10090) # compare within mice data = obj.data.loc[obj.data.index.str.contains('ENS')] meta = obj.meta # cpm = data.divide(meta.loc[:, 'read_count'].values, axis=1) * 1e6 cpm = data.divide(data.sum(axis=0), axis=1) * 1e6 keep = (cpm > .5).sum(axis=1) > 5 the_dat_cv = np.log2(data.loc[keep] + 1) groups_by_mouse = [ ['eNSC3med', 'eNSC3mouse', 'mDura3N1mouse', 'mDura3N1human'], ['eNSC5med', 'eNSC5mouse', 'mDura5N24Amouse', 'mDura5N24Ahuman'],
tmp2 = pair_only.loc[pid, c] # we want anything in the first part that is NOT in the second part po_intersection_insc.loc[pid] = tmp.difference(tmp2) # find DE genes that are always unique to a given reference (regardless of the GBM) po_specific_to_reference = [ sorted( reduce(intersecter, po_diff.loc[~po_diff.index.str.contains(pid), pid]) ) for pid in cols ] po_specific_to_reference = pd.Series(po_specific_to_reference, index=cols) # get the genes that consistently differ in the pair comparison only and NOT in Gibco (across all patients) # these will have an expression pattern in Gibco similar to GBM, so that they do NOT appear po_ref_diff = po_specific_to_reference.loc[c] po_ref_diff_gs = reference_genomes.ensembl_to_gene_symbol(po_ref_diff) po_ref_diff_gs = po_ref_diff_gs.where(~po_ref_diff_gs.isnull(), po_ref_diff) po_dat = rnaseq_obj.data.loc[po_ref_diff] po_dat.index = po_ref_diff_gs po_dat = np.log2(po_dat + 1) # po_dat = salmon_dat.loc[po_gibco_diff] # po_dat.index = po_gibco_diff_gs # # dropna() here loses one gene - LINC01090 / ENSG00000231689 # # all others are present # po_dat = np.log2(po_dat.dropna() + 0.01) # rearrange columns the_cols = ( po_dat.columns[po_dat.columns.str.contains('GBM')].tolist() +
col_order = plot_all_clustermaps(data_nsc, filestem, col_colors=col_colors) # for every sample, extract the top N by count and summarise topNs = [10, 50, 100] for topN in topNs: common_genes = set() top_dat = [] for i in range(data_rr.shape[1]): t = data_rr_mt.iloc[:, i].sort_values(ascending=False)[:topN] common_genes.update(t.index) top_dat = data_rr_mt.loc[list(common_genes)].divide(data_rr.sum(), axis=1) symb = reference_genomes.ensembl_to_gene_symbol(top_dat.index) tidx = np.array(top_dat.index) tidx[~symb.isnull().values] = symb.loc[~symb.isnull()].values top_dat.index = tidx filestem = os.path.join(OUTDIR, 'clustermap_sub_rrna_mt_top_%d' % topN) col_order = plot_all_clustermaps(top_dat, filestem, col_colors=col_colors) filestem = os.path.join(OUTDIR, 'correlation_sub_rrna_mt_top_%d' % topN) plot_all_correlation_heatmaps(top_dat, filestem, col_order, vmin=0.5, vmax=1.) # bar charts of successive markers, used to characterise based on timeline # for this, only astrocytes and NSCs useful, so remove oligo and neuron astro_markers2 = [ 'NFIA',