def plot_acc_quartiles(acc_df, args, cdata):
    """Scatter first-quartile AUC and AUPR against mutation frequency.

    Two side-by-side panels are drawn. For every entry ``gene`` of
    ``acc_df.index`` the x-value is
    ``len(cdata.train_mut[gene]) / len(cdata.samples)``; the y-value is the
    0.25 quantile across columns of ``acc_df['AUC']`` (left panel) or
    ``acc_df['AUPR']`` (right panel). Label positions come from
    ``place_annot``.

    Args:
        acc_df: frame with 'AUC' and 'AUPR' column groups whose index
            entries are keys of ``cdata.train_mut``.
        args: object providing ``model_name`` (split on '__'; parts 0 and 1
            are used), ``expr_source``, ``cohort`` and ``samp_cutoff``.
        cdata: object exposing ``train_mut`` and ``samples``.

    Side effects:
        Sets two global Matplotlib rcParams, writes a PNG below
        ``plot_dir`` and closes the current pyplot figure.
    """
    mpl.rcParams['axes.linewidth'] = 1.2
    mpl.rcParams['axes.edgecolor'] = '0.05'

    fig, (ax_auc, ax_aupr) = plt.subplots(figsize=(22, 10), ncols=2)

    mut_props = [len(cdata.train_mut[gene]) / len(cdata.samples)
                 for gene in acc_df.index]
    auc_quarts = acc_df['AUC'].quantile(q=0.25, axis=1)
    aupr_quarts = acc_df['AUPR'].quantile(q=0.25, axis=1)
    panels = ((ax_auc, auc_quarts), (ax_aupr, aupr_quarts))

    for panel_ax, quart_vals in panels:
        panel_ax.scatter(mut_props, quart_vals,
                         s=15, c='black', alpha=0.47)

    for panel_ax, quart_vals in panels:
        # Labels for both panels are read from the AUPR quartiles' index;
        # both quartile series are indexed by the rows of acc_df.
        lbl_spots = place_annot(
            mut_props, quart_vals.values.tolist(),
            size_vec=[15] * len(mut_props),
            annot_vec=aupr_quarts.index, x_range=1, y_range=1
        )

        for lbl_x, lbl_y, lbl_txt, lbl_align in lbl_spots:
            panel_ax.text(lbl_x, lbl_y, lbl_txt, size=11, ha=lbl_align)

    for panel_ax, _ in panels:
        panel_ax.set_xlim(0, 1)
        panel_ax.set_ylim(0, 1)

    # Reference lines: y = 0.5 on the AUC panel, y = x on the AUPR panel.
    ref_style = dict(linewidth=1.7, linestyle='--',
                     color='#550000', alpha=0.6)
    ax_auc.plot([-1, 2], [0.5, 0.5], **ref_style)
    ax_aupr.plot([-1, 2], [-1, 2], **ref_style)

    lbl_font = dict(fontsize=22, weight='semibold')
    fig.text(0.5, -0.03,
             'Proportion of {} Samples Mutated'.format(args.cohort),
             ha='center', va='center', **lbl_font)
    ax_auc.set_ylabel('1st Quartile AUC', **lbl_font)
    ax_aupr.set_ylabel('1st Quartile AUPR', **lbl_font)

    fig.tight_layout(w_pad=2.2, h_pad=5.1)

    name_parts = args.model_name.split('__')
    out_file = '{}__acc-quartiles__{}-{}_samps-{}.png'.format(
        name_parts[1], args.expr_source, args.cohort, args.samp_cutoff)
    fig.savefig(os.path.join(plot_dir, name_parts[0], out_file),
                dpi=250, bbox_inches='tight')

    plt.close()
def plot_auc_quartiles(auc_df, args):
    """Compare first-quartile test AUCs between the two values of each row.

    The 'test' item is extracted from every cell of ``auc_df``. For each
    row the extracted records are turned into a frame, its column-wise 0.25
    quantiles are sorted ascending, and the two resulting values are stored
    as 'Min' and 'Max'. Each row is relabelled as
    ``"(<col> x <col>) <mtype>"`` using the sorted column names. 'Min' is
    plotted against 'Max' with a y = x reference line.

    Args:
        auc_df: frame whose cells support ``cell['test']`` and whose index
            entries unpack as ``((coh1, coh2), mtype)``.
            NOTE(review): presumably one column of AUCs per cohort --
            confirm against the caller.
        args: object providing ``model_name`` (split on '__'; parts 0 and 1
            are used), ``expr_source`` and ``samp_cutoff``.

    Side effects:
        Sets two global Matplotlib rcParams, writes a PNG below
        ``plot_dir`` and closes the current pyplot figure.
    """
    mpl.rcParams['axes.linewidth'] = 1.2
    mpl.rcParams['axes.edgecolor'] = '0.05'
    fig, ax = plt.subplots(figsize=(14, 13))

    test_aucs = auc_df.applymap(itemgetter('test'))
    quart_df = pd.DataFrame(index=auc_df.index, columns=['Min', 'Max'])
    row_lbls = []

    for row_pos, (row_key, auc_dicts) in enumerate(test_aucs.iterrows()):
        (coh1, coh2), mtype = row_key

        rec_df = pd.DataFrame.from_records(auc_dicts.values)
        sorted_quants = rec_df.quantile(q=0.25).sort_values()

        row_lbls.append("({}) {}".format(' x '.join(sorted_quants.index),
                                         str(mtype)))
        quart_df.iloc[row_pos, :] = sorted_quants.values

    quart_df.index = row_lbls

    # Both axes start slightly below the smallest quartile that is drawn.
    plot_min = np.min(quart_df.values) - 0.01
    axis_span = 1 - plot_min

    ax.scatter(quart_df['Min'], quart_df['Max'],
               s=15, c='black', alpha=0.47)

    lbl_spots = place_annot(
        quart_df['Min'].tolist(), quart_df['Max'].tolist(),
        size_vec=[15] * len(auc_df.index), annot_vec=quart_df.index,
        x_range=axis_span, y_range=axis_span, gap_adj=79
    )
    for lbl_x, lbl_y, lbl_txt, lbl_align in lbl_spots:
        ax.text(lbl_x, lbl_y, lbl_txt, size=11, ha=lbl_align)

    ax.tick_params(pad=5.1)
    ax.set_xlim(plot_min, 1)
    ax.set_ylim(plot_min, 1)

    lbl_font = dict(fontsize=22, weight='semibold')
    ax.set_xlabel('1st Qrt. AUC, min cohort', **lbl_font)
    ax.set_ylabel('1st Qrt. AUC, max cohort', **lbl_font)

    ax.plot([-1, 2], [-1, 2],
            linewidth=1.7, linestyle='--', color='#550000', alpha=0.6)

    name_parts = args.model_name.split('__')
    out_file = '{}__acc-quartiles__{}_samps-{}.png'.format(
        name_parts[1], args.expr_source, args.samp_cutoff)
    fig.savefig(os.path.join(plot_dir, name_parts[0], out_file),
                dpi=250, bbox_inches='tight')

    plt.close()
def plot_tuning_gene(par_df, acc_df, use_clf, args, cdata):
    """Plot median tuned hyper-parameter values against first-quartile AUC.

    One panel is drawn per ``(name, grid)`` pair in ``use_clf.tune_priors``.
    The x-value of a point is the median of ``par_df[name]`` grouped by
    index level 0 (log10-transformed when ``detect_log_distr(grid)`` is
    true) plus Gaussian jitter; the y-value is the 0.25 quantile across
    columns of ``acc_df['AUC']``. Marker area is proportional to
    ``len(cdata.train_mut[gene]) / len(cdata.samples)``.

    Args:
        par_df: frame of tuned values with one column group per parameter.
        acc_df: frame with an 'AUC' column group; its row count is used as
            the number of jitter draws.
        use_clf: object exposing ``tune_priors``, an iterable of
            ``(name, grid)`` pairs where each grid has at least two values.
        args: object providing ``model_name`` (split on '__'; parts 0 and 1
            are used), ``expr_source``, ``cohort`` and ``samp_cutoff``.
        cdata: object exposing ``train_mut`` and ``samples``.

    Side effects:
        Consumes NumPy global random state, writes a PNG below ``plot_dir``
        and closes the current pyplot figure.
    """
    prior_count = len(use_clf.tune_priors)
    fig, axarr = plt.subplots(figsize=(13, 12 * prior_count),
                              nrows=prior_count, ncols=1, squeeze=False)

    lbl_font = dict(fontsize=26, weight='semibold')

    for ax, (par_name, tune_distr) in zip(axarr.flatten(),
                                          use_clf.tune_priors):
        par_vals = par_df[par_name].groupby(level=0).median()
        acc_vals = acc_df['AUC'].quantile(q=0.25, axis=1)
        size_vec = [1073 * len(cdata.train_mut[gene]) / len(cdata.samples)
                    for gene in acc_vals.index]

        # Work on the log10 scale when the tuning grid is log-spaced.
        if detect_log_distr(tune_distr):
            par_vals = np.log10(par_vals)
            scale = np.log10
        else:
            def scale(grid_val):
                return grid_val

        # Extend the axis one grid step beyond each end of the grid.
        plt_xmin = 2 * scale(tune_distr[0]) - scale(tune_distr[1])
        plt_xmax = 2 * scale(tune_distr[-1]) - scale(tune_distr[-2])
        x_span = plt_xmax - plt_xmin

        # Jitter so that points sharing a grid value do not fully overlap.
        jitter = np.random.normal(0, x_span / (len(tune_distr) * 19),
                                  acc_df.shape[0])
        par_vals += jitter

        ax.scatter(par_vals, acc_vals, s=size_vec, c='black', alpha=0.23)
        ax.set_xlim(plt_xmin, plt_xmax)
        ax.set_ylim(0, 1)
        ax.axhline(y=0.5, color='#550000',
                   linewidth=3.1, linestyle='--', alpha=0.32)

        lbl_spots = place_annot(par_vals, acc_vals.values.tolist(),
                                size_vec=size_vec, annot_vec=acc_vals.index,
                                x_range=plt_xmax - plt_xmin, y_range=1)
        for lbl_x, lbl_y, lbl_txt, lbl_align in lbl_spots:
            ax.text(lbl_x, lbl_y, lbl_txt, size=11, ha=lbl_align)

        ax.set_xlabel('Median Tuned {} Value'.format(par_name), **lbl_font)
        ax.set_ylabel('1st Quartile AUC', **lbl_font)

    plt.tight_layout()

    name_parts = args.model_name.split('__')
    out_file = '{}__tuning-gene__{}-{}_samps-{}.png'.format(
        name_parts[1], args.expr_source, args.cohort, args.samp_cutoff)
    fig.savefig(os.path.join(plot_dir, name_parts[0], out_file),
                dpi=250, bbox_inches='tight')

    plt.close()
def plot_aupr_time(out_dict, args):
    """Plot AUPR summaries against log2 fitting time, one point per model.

    For every key of ``out_dict`` the fit time is the double mean of the
    0.75 quantile of ``avg + std`` of ``['Tune']['Time']['fit']``, grouped
    by column level 0, and is shown on a log2 scale. Row-wise 0.25 quantiles
    of ``['Fit']['test']['AUPR']`` are summarised by their mean (upper
    panel) and by their 0.75 quantile (lower panel).

    Marker shape is chosen from ``use_marks`` by the first element of each
    key; colour is chosen by the text before '__' in the second element.

    Args:
        out_dict: mapping from 2-tuples of strings to nested output data
            with the 'Tune' and 'Fit' entries described above.
            NOTE(review): keys look like (expression source, model name) --
            confirm against the caller.
        args: object providing ``cohort``, used in the output file name.

    Side effects:
        Writes an SVG into ``plot_dir`` and closes the current pyplot
        figure.
    """
    fig, axarr = plt.subplots(figsize=(9, 15), nrows=2, sharex=True)

    fit_times = {}
    for mdl, out_data in out_dict.items():
        time_data = out_data['Tune']['Time']['fit']
        fit_times[mdl] = (time_data['avg'] + time_data['std']).groupby(
            axis=1, level=0).quantile(q=0.75).mean().mean()
    time_quarts = np.log2(pd.Series(fit_times))

    aupr_vals = {
        mdl: out_data['Fit']['test']['AUPR'].quantile(q=0.25, axis=1)
        for mdl, out_data in out_dict.items()
    }
    aupr_list = [
        pd.Series({mdl: vals.mean() for mdl, vals in aupr_vals.items()}),
        pd.Series({mdl: vals.quantile(q=0.75)
                   for mdl, vals in aupr_vals.items()}),
    ]

    # Marker shape encodes the first key level.
    expr_vec = time_quarts.index.get_level_values(0)
    expr_order = sorted(set(expr_vec))
    expr_shapes = [use_marks[expr_order.index(expr)] for expr in expr_vec]

    # Colour encodes the model-name prefix of the second key level.
    model_vec = time_quarts.index.get_level_values(1).str.split('__').map(
        itemgetter(0))
    model_order = sorted(set(model_vec))
    model_cmap = sns.color_palette('Set1', n_colors=len(model_order),
                                   desat=.34)
    model_clrs = [model_cmap[model_order.index(mdl)] for mdl in model_vec]

    time_span = time_quarts.max() - time_quarts.min()

    for ax, auprs in zip(axarr, aupr_list):
        points = zip(time_quarts.values, auprs.values,
                     expr_shapes, model_clrs)
        for time_val, aupr_val, expr_shape, model_clr in points:
            ax.scatter(time_val, aupr_val, marker=expr_shape, c=model_clr,
                       s=71, alpha=0.41)

        lbl_spots = place_annot(
            time_quarts.values.tolist(), auprs.values.tolist(),
            size_vec=[71] * len(time_quarts),
            annot_vec=[' '.join(tst) for tst in time_quarts.index],
            x_range=time_span, y_range=auprs.max() - auprs.min(),
            gap_adj=79
        )
        for lbl_x, lbl_y, lbl_txt, lbl_align in lbl_spots:
            ax.text(lbl_x, lbl_y, lbl_txt, size=10, ha=lbl_align)

        ax.tick_params(axis='y', labelsize=14)

    upper_ax, lower_ax = axarr[0], axarr[1]

    # Times were log2-transformed, so ticks are rendered as powers of two.
    lower_ax.xaxis.set_major_formatter(
        ticker.FormatStrFormatter(r'$2^{%d}$'))
    lower_ax.tick_params(axis='x', labelsize=21, pad=7)

    lbl_font = dict(size=23, weight='semibold')
    upper_ax.set_ylabel('Average AUPR', **lbl_font)
    lower_ax.set_ylabel('Third Quartile AUPR', **lbl_font)
    plt.xlabel('Fitting Time (seconds)', **lbl_font)

    plt.tight_layout(h_pad=3.3)
    out_path = os.path.join(plot_dir,
                            '{}__aupr-time.svg'.format(args.cohort))
    fig.savefig(out_path, bbox_inches='tight', format='svg')

    plt.close()
def _mtype_grid_param_stats(par_df, par_name, tune_distr):
    """Summarise one tuned parameter for the tuning grid plot.

    Returns:
        A tuple ``(grid_vals, medians, means, axis_min, axis_max)``. The
        grid values and the row-wise median/mean of ``par_df[par_name]``
        are log10-transformed when ``detect_log_distr(tune_distr)`` is
        true. The axis limits extend half the mean grid spacing beyond the
        first and last grid values.
    """
    if detect_log_distr(tune_distr):
        grid_vals = [np.log10(par_val) for par_val in tune_distr]
        par_data = np.log10(par_df[par_name])

        grid_gap = np.mean(np.log10(np.array(tune_distr[1:]))
                           - np.log10(np.array(tune_distr[:-1])))
        axis_min = np.log10(tune_distr[0]) - grid_gap / 2
        axis_max = np.log10(tune_distr[-1]) + grid_gap / 2

    else:
        grid_vals = tune_distr
        par_data = par_df[par_name]

        grid_gap = np.mean(np.array(tune_distr[1:])
                           - np.array(tune_distr[:-1]))
        axis_min = tune_distr[0] - grid_gap / 2
        axis_max = tune_distr[-1] + grid_gap / 2

    return (grid_vals, par_data.median(axis=1), par_data.mean(axis=1),
            axis_min, axis_max)


def plot_tuning_mtype_grid(par_df, auc_df, use_clf, args, cdata):
    """Draw a pairwise grid of tuned hyper-parameter values per mutation type.

    The figure has one row and one column per ``(name, grid)`` pair in
    ``use_clf.tune_priors``:

    * diagonal panels show the parameter name and its tuning grid lines;
    * panel ``[i, j]`` for a pair ``i < j`` shows row-wise medians of
      parameter ``j`` (x) against parameter ``i`` (y), with mutation types
      that share the same pair of medians packed around it by ``circlify``;
    * panel ``[j, i]`` shows jittered row-wise means, parameter ``i`` on x.

    Marker colour is ``auc_cmap`` applied to the 0.25 quantile across
    columns of ``auc_df``; marker area is proportional to
    ``sum(cdata.train_pheno(mtype)) / len(cdata.get_samples())``.

    Args:
        par_df: frame of tuned values with one column group per parameter.
        auc_df: frame of AUCs indexed by mutation type.
        use_clf: object exposing ``tune_priors``, a sequence of
            ``(name, grid)`` pairs where each grid is sliceable.
        args: object providing ``model_name`` (split on '__'; parts 0 and 1
            are used), ``expr_source``, ``cohort`` and ``samp_cutoff``.
        cdata: object exposing ``train_pheno`` and ``get_samples``.

    Side effects:
        Selects the Agg backend, consumes NumPy global random state, writes
        an SVG below ``plot_dir`` and closes the figure.
    """
    # Imported locally; hoisted out of the plotting loops so that it runs
    # once per call.
    from circlify import circlify

    # Choose the non-interactive backend before the figure is created;
    # switching once a figure exists may detach it from pyplot in some
    # Matplotlib releases.
    mpl.use('Agg')

    par_count = len(use_clf.tune_priors)

    # squeeze=False keeps axarr two-dimensional for a single parameter.
    fig, axarr = plt.subplots(figsize=(0.5 + 7 * par_count, 7 * par_count),
                              nrows=par_count, ncols=par_count,
                              squeeze=False)

    auc_vals = auc_df.quantile(q=0.25, axis=1)
    auc_clrs = auc_vals.apply(auc_cmap)
    size_vec = [
        461 * sum(cdata.train_pheno(mtype))
        / (len(cdata.get_samples()) * par_count)
        for mtype in auc_vals.index
    ]

    diag_style = dict(color='#116611', ls='--', linewidth=4.1, alpha=0.27)
    grid_style = dict(color='#116611', ls=':', linewidth=2.3, alpha=0.19)

    # Diagonal panels: parameter label over its tuning grid.
    for diag, (par_name, tune_distr) in enumerate(use_clf.tune_priors):
        diag_ax = axarr[diag, diag]
        diag_ax.grid(False)

        if detect_log_distr(tune_distr):
            use_distr = [np.log10(par_val) for par_val in tune_distr]
            par_lbl = par_name + '\n(log-scale)'
        else:
            use_distr = tune_distr
            par_lbl = par_name

        distr_diff = np.mean(np.array(use_distr[1:])
                             - np.array(use_distr[:-1]))
        plt_min = use_distr[0] - distr_diff / 2
        plt_max = use_distr[-1] + distr_diff / 2

        diag_ax.set_xlim(plt_min, plt_max)
        diag_ax.set_ylim(plt_min, plt_max)
        diag_ax.text((plt_min + plt_max) / 2, (plt_min + plt_max) / 2,
                     par_lbl, ha='center', fontsize=28, weight='semibold')

        for par_val in use_distr:
            diag_ax.axhline(y=par_val, **diag_style)
            diag_ax.axvline(x=par_val, **diag_style)

    # Off-diagonal panels: one pass per unordered pair of parameters.
    for (i, (par_name1, tn_distr1)), (j, (par_name2, tn_distr2)) in combn(
            enumerate(use_clf.tune_priors), 2):
        med_ax = axarr[i, j]
        mean_ax = axarr[j, i]

        (use_distr1, par_meds1, par_means1,
         plt_ymin, plt_ymax) = _mtype_grid_param_stats(
             par_df, par_name1, tn_distr1)
        (use_distr2, par_meds2, par_means2,
         plt_xmin, plt_xmax) = _mtype_grid_param_stats(
             par_df, par_name2, tn_distr2)

        # Put the medians in the same order as the colours and sizes.
        par_meds1 = par_meds1[auc_clrs.index]
        par_meds2 = par_meds2[auc_clrs.index]

        y_adj = (plt_ymax - plt_ymin) / len(tn_distr1)
        x_adj = (plt_xmax - plt_xmin) / len(tn_distr2)
        plt_adj = (plt_xmax - plt_xmin) / (plt_ymax - plt_ymin)

        # Mutation types sharing a pair of medians are packed around it.
        for med1, med2 in set(zip(par_meds1, par_meds2)):
            use_indx = (par_meds1 == med1) & (par_meds2 == med2)
            cnt_adj = use_indx.sum() ** 0.49

            use_sizes = [s for s, ix in zip(size_vec, use_indx) if ix]
            sort_indx = sorted(enumerate(use_sizes),
                               key=itemgetter(1), reverse=True)
            grp_clrs = auc_clrs[use_indx]

            # NOTE(review): this pairs the k-th returned circle with the
            # k-th largest marker -- confirm the ordering of circlify's
            # output for the installed version.
            circles = circlify([s for _, s in sort_indx])
            for (orig_pos, pnt_size), circ in zip(sort_indx, circles):
                med_ax.scatter(
                    med2 + (1 / 23) * cnt_adj * circ.y * plt_adj,
                    med1 + (1 / 23) * cnt_adj * circ.x * plt_adj ** -1,
                    s=pnt_size,
                    # orig_pos is a position within the group, so index
                    # positionally rather than by label.
                    c=grp_clrs.iloc[orig_pos],
                    alpha=0.36, edgecolor='black'
                )

        par_means1 += np.random.normal(0, y_adj / 27, auc_df.shape[0])
        par_means2 += np.random.normal(0, x_adj / 27, auc_df.shape[0])

        mean_ax.scatter(par_means1[auc_clrs.index],
                        par_means2[auc_clrs.index],
                        s=size_vec, c=auc_clrs,
                        alpha=0.36, edgecolor='black')

        med_ax.set_xlim(plt_xmin, plt_xmax)
        med_ax.set_ylim(plt_ymin, plt_ymax)
        mean_ax.set_ylim(plt_xmin, plt_xmax)
        mean_ax.set_xlim(plt_ymin, plt_ymax)

        annot_placed = place_annot(par_meds2, par_meds1,
                                   size_vec=size_vec,
                                   annot_vec=auc_vals.index,
                                   x_range=plt_xmax - plt_xmin,
                                   y_range=plt_ymax - plt_ymin)
        for annot_x, annot_y, annot, halign in annot_placed:
            med_ax.text(annot_x, annot_y, annot, size=11, ha=halign)

        for par_val1 in use_distr1:
            med_ax.axhline(y=par_val1, **grid_style)
            mean_ax.axvline(x=par_val1, **grid_style)

        for par_val2 in use_distr2:
            med_ax.axvline(x=par_val2, **grid_style)
            mean_ax.axhline(y=par_val2, **grid_style)

    # Act on this figure explicitly instead of pyplot's current figure.
    fig.tight_layout()

    name_parts = args.model_name.split('__')
    fig.savefig(
        os.path.join(
            plot_dir, args.expr_source,
            "{}__samps-{}".format(args.cohort, args.samp_cutoff),
            name_parts[0],
            "{}__tuning-mtype-grid.svg".format(name_parts[1])
        ),
        bbox_inches='tight', format='svg'
    )

    plt.close(fig)
def plot_tuning_mtype(par_df, auc_df, use_clf, args, cdata):
    """Plot tuned hyper-parameter values against first-quartile AUC.

    One column of three stacked panels is drawn per ``(name, grid)`` pair
    in ``use_clf.tune_priors``: jittered row-wise medians of
    ``par_df[name]`` on top, a label strip with the tuning grid in the
    middle, and jittered row-wise means at the bottom. Values are
    log10-transformed when ``detect_log_distr(grid)`` is true. The y-value
    is the 0.25 quantile across columns of ``auc_df``; marker area is
    proportional to
    ``len(mtype.get_samples(cdata.mtree)) / len(cdata.get_samples())``.
    Only the top panel is annotated.

    Args:
        par_df: frame of tuned values with one column group per parameter.
        auc_df: frame of AUCs whose index entries provide ``get_samples``.
        use_clf: object exposing ``tune_priors``, a sequence of
            ``(name, grid)`` pairs where each grid is sliceable.
        args: object providing ``model_name`` (split on '__'; parts 0 and 1
            are used), ``expr_source``, ``cohort`` and ``samp_cutoff``.
        cdata: object exposing ``mtree`` and ``get_samples``.

    Side effects:
        Consumes NumPy global random state, writes an SVG below
        ``plot_dir`` and closes the current pyplot figure.
    """
    prior_count = len(use_clf.tune_priors)
    fig, axarr = plt.subplots(
        figsize=(1 + 9 * prior_count, 13), nrows=3, ncols=prior_count,
        gridspec_kw={'height_ratios': [1, 0.3, 1]},
        squeeze=False, sharex=False, sharey=True
    )

    auc_vals = auc_df.quantile(q=0.25, axis=1)
    size_vec = [
        198 * len(mtype.get_samples(cdata.mtree)) / len(cdata.get_samples())
        for mtype in auc_vals.index
    ]

    pnt_style = dict(s=size_vec, c='black', alpha=0.23)
    auc_lbl_font = dict(size=19, weight='semibold')
    half_style = dict(y=0.5, color='#550000',
                      linewidth=2.3, linestyle='--', alpha=0.32)
    faint_style = dict(color='#116611', ls=':', linewidth=1.3, alpha=0.16)

    for col, (par_name, tune_distr) in enumerate(use_clf.tune_priors):
        med_ax, lbl_ax, mean_ax = axarr[0, col], axarr[1, col], axarr[2, col]

        lbl_ax.set_axis_off()
        mean_ax.tick_params(length=6)

        if detect_log_distr(tune_distr):
            par_data = np.log10(par_df[par_name])
            use_distr = [np.log10(par_val) for par_val in tune_distr]
            par_lbl = par_name + '\n(log-scale)'
        else:
            par_data = par_df[par_name]
            use_distr = tune_distr
            par_lbl = par_name

        # Order both summaries like the AUC quartiles.
        med_vals = par_data.median(axis=1)[auc_vals.index]
        mean_vals = par_data.mean(axis=1)[auc_vals.index]

        distr_diff = np.mean(np.array(use_distr[1:])
                             - np.array(use_distr[:-1]))
        grid_lo, grid_hi = use_distr[0], use_distr[-1]

        for panel in (med_ax, lbl_ax, mean_ax):
            panel.set_xlim(grid_lo - distr_diff / 2,
                           grid_hi + distr_diff / 2)

        lbl_ax.text((grid_lo + grid_hi) / 2, 0.5, par_lbl,
                    ha='center', va='center',
                    fontsize=25, weight='semibold')

        # Jitter medians first, then means, to separate overlapping points.
        med_vals += np.random.normal(
            0, (grid_hi - grid_lo) / (len(tune_distr) * 17),
            auc_df.shape[0]
        )
        mean_vals += np.random.normal(
            0, (grid_hi - grid_lo) / (len(tune_distr) * 23),
            auc_df.shape[0]
        )

        med_ax.scatter(med_vals, auc_vals, **pnt_style)
        mean_ax.scatter(mean_vals, auc_vals, **pnt_style)

        med_ax.set_ylim(0, 1)
        mean_ax.set_ylim(0, 1)
        med_ax.set_ylabel("1st Quartile AUC", **auc_lbl_font)
        mean_ax.set_ylabel("1st Quartile AUC", **auc_lbl_font)

        med_ax.axhline(**half_style)
        mean_ax.axhline(**half_style)

        for par_val in use_distr:
            lbl_ax.axvline(x=par_val, color='#116611', ls='--',
                           linewidth=3.4, alpha=0.27)
            med_ax.axvline(x=par_val, **faint_style)
            mean_ax.axvline(x=par_val, **faint_style)

        lbl_spots = place_annot(
            med_vals, auc_vals.values.tolist(),
            size_vec=size_vec, annot_vec=auc_vals.index,
            x_range=grid_hi - grid_lo + 2 * distr_diff, y_range=1
        )
        for lbl_x, lbl_y, lbl_txt, lbl_align in lbl_spots:
            med_ax.text(lbl_x, lbl_y, lbl_txt, size=8, ha=lbl_align)

    plt.tight_layout(h_pad=0)

    name_parts = args.model_name.split('__')
    fig.savefig(
        os.path.join(
            plot_dir, args.expr_source,
            "{}__samps-{}".format(args.cohort, args.samp_cutoff),
            name_parts[0],
            "{}__tuning-mtype.svg".format(name_parts[1])
        ),
        bbox_inches='tight', format='svg'
    )

    plt.close()
def plot_acc_quartiles(auc_df, aupr_df, args, cdata):
    """Scatter first-quartile AUC and AUPR against mutation frequency.

    Two side-by-side panels are drawn. For every ``mtype`` in
    ``auc_df.index`` the x-value is
    ``len(mtype.get_samples(cdata.mtree)) / len(cdata.get_samples())``; the
    y-value is the 0.25 quantile across columns of ``auc_df`` (left panel)
    or ``aupr_df`` (right panel). Label positions come from ``place_annot``.

    Args:
        auc_df: frame of AUCs whose index entries provide ``get_samples``.
        aupr_df: frame of AUPRs.
            NOTE(review): x-values are derived from ``auc_df.index`` for
            both panels, so ``aupr_df`` is assumed to list the same rows in
            the same order -- confirm with callers.
        args: object providing ``model_name`` (split on '__'; parts 0 and 1
            are used), ``expr_source``, ``cohort`` and ``samp_cutoff``.
        cdata: object exposing ``mtree`` and ``get_samples``.

    Raises:
        ValueError: if ``auc_df`` has no rows (``max`` of an empty list).

    Side effects:
        Sets two global Matplotlib rcParams, writes an SVG below
        ``plot_dir`` and closes the figure.
    """
    mpl.rcParams['axes.linewidth'] = 1.2
    mpl.rcParams['axes.edgecolor'] = '0.05'

    fig, (ax_auc, ax_aupr) = plt.subplots(figsize=(22, 10), ncols=2)

    auc_vals = auc_df.quantile(q=0.25, axis=1)
    aupr_vals = aupr_df.quantile(q=0.25, axis=1)
    mtype_sizes = [
        len(mtype.get_samples(cdata.mtree)) / len(cdata.get_samples())
        for mtype in auc_df.index
    ]

    # The AUC panel is cropped just past the most frequent mutation type.
    auc_xmax = max(mtype_sizes) * 1.03

    ax_auc.scatter(mtype_sizes, auc_vals, s=17, c='black', alpha=0.47)
    ax_aupr.scatter(mtype_sizes, aupr_vals, s=17, c='black', alpha=0.47)

    # Label AUC points from the AUC quartiles' own index so that a label
    # cannot be taken from a differently ordered aupr_df.
    for annot_x, annot_y, annot, halign in place_annot(
            mtype_sizes, auc_vals.values.tolist(),
            size_vec=[15 for _ in mtype_sizes],
            annot_vec=auc_vals.index,
            x_range=auc_xmax, y_range=1, gap_adj=53):
        ax_auc.text(annot_x, annot_y, annot, size=11, ha=halign)

    for annot_x, annot_y, annot, halign in place_annot(
            mtype_sizes, aupr_vals.values.tolist(),
            size_vec=[15 for _ in mtype_sizes],
            annot_vec=aupr_vals.index,
            x_range=1, y_range=1, gap_adj=53):
        ax_aupr.text(annot_x, annot_y, annot, size=11, ha=halign)

    ax_auc.set_xlim(0, auc_xmax)
    ax_aupr.set_xlim(0, 1)

    for ax in (ax_auc, ax_aupr):
        ax.tick_params(pad=3.9)
        ax.set_ylim(0, 1)

    # Reference lines: y = 0.5 on the AUC panel, y = x on the AUPR panel.
    ax_auc.plot([-1, 2], [0.5, 0.5],
                linewidth=1.7, linestyle='--', color='#550000', alpha=0.6)
    ax_aupr.plot([-1, 2], [-1, 2],
                 linewidth=1.7, linestyle='--', color='#550000', alpha=0.6)

    fig.text(0.5, -0.03,
             'Proportion of {} Samples Mutated'.format(args.cohort),
             ha='center', va='center', fontsize=22, weight='semibold')
    ax_auc.set_ylabel('1st Quartile AUC', fontsize=22, weight='semibold')
    ax_aupr.set_ylabel('1st Quartile AUPR', fontsize=22, weight='semibold')

    fig.tight_layout(w_pad=2.2, h_pad=5.1)

    name_parts = args.model_name.split('__')
    fig.savefig(
        os.path.join(
            plot_dir, args.expr_source,
            "{}__samps-{}".format(args.cohort, args.samp_cutoff),
            name_parts[0],
            "{}__acc-quartiles.svg".format(name_parts[1])
        ),
        bbox_inches='tight', format='svg'
    )

    # Close this figure explicitly instead of pyplot's current figure.
    plt.close(fig)