def quantile_accuracy(gene_targets, gene_preds, gene_stat, out_pdf, numq=4):
    """Plot accuracy (PearsonR) in quantile bins across targets.

    Rows are binned into ``numq`` quantiles of ``gene_stat`` using
    ``stats.quantile_indexes``; within every bin, PearsonR between targets
    and predictions is computed separately for each target column.

    Files written (``<prefix>`` is ``out_pdf`` without its extension):
      * ``<prefix>.csv``  - one row per (Quantile, Target) with its PearsonR.
      * ``<prefix>.txt``  - per-quantile mean and median PearsonR.
      * ``out_pdf``       - per-target lines plus a violin plot per quantile.
      * ``<prefix>_qt.txt`` - for each scatter plot: quantile index, sample
        index, target index and that target's (ratio, target) tuple.
      * ``<prefix>_pq<P>_q<Q>.pdf`` - target vs prediction scatter plots for
        11 targets spread evenly over the ranking by last/first quantile
        PearsonR ratio.

    Args:
      gene_targets: 2D array indexed [row, target]; rows are presumably
        genes (TODO confirm against caller).
      gene_preds: predictions, indexed the same way as ``gene_targets``.
      gene_stat: per-row statistic handed to ``stats.quantile_indexes``.
      out_pdf: output path of the summary figure; also determines the
        prefix of all other output files.
      numq: number of quantile bins.

    Side effects:
      Calls ``sns.set(font_scale=1.2, style='ticks')``, which changes the
      global seaborn style.
    """
    import os

    # Strip the real extension rather than assuming it is 4 characters long;
    # for '*.pdf' names this is identical to the historical out_pdf[:-4].
    out_prefix = os.path.splitext(out_pdf)[0]

    # plot PearsonR in variance statistic bins
    quant_indexes = stats.quantile_indexes(gene_stat, numq)

    quantiles_series = []
    targets_series = []
    pcor_series = []

    for qi in range(numq):
        # slice quantile
        gene_targets_quant = gene_targets[quant_indexes[qi]].astype('float32')
        gene_preds_quant = gene_preds[quant_indexes[qi]].astype('float32')

        # compute target PearsonR
        for ti in range(gene_targets_quant.shape[1]):
            pcor, _ = pearsonr(gene_targets_quant[:, ti],
                               gene_preds_quant[:, ti])

            quantiles_series.append(qi)
            targets_series.append(ti)
            pcor_series.append(pcor)

    # construct DataFrame
    df_quant = pd.DataFrame({'Quantile': quantiles_series,
                             'Target': targets_series,
                             'PearsonR': pcor_series})
    df_quant.to_csv('%s.csv' % out_prefix)

    # print summary table; the context manager closes the file even if a
    # statistic computation raises
    with open('%s.txt' % out_prefix, 'w') as table_out:
        for qi in range(numq):
            quantile_cors = df_quant[df_quant.Quantile == qi].PearsonR
            print('%2d %.4f %.4f' %
                  (qi, np.mean(quantile_cors), np.median(quantile_cors)),
                  file=table_out)

    # construct figure
    plt.figure()

    # plot individual targets as light lines
    for ti in range(gene_targets.shape[1]):
        df_quant_target = df_quant[df_quant.Target == ti]
        plt.plot(df_quant_target.Quantile, df_quant_target.PearsonR,
                 alpha=0.1)

    # plot PearsonR distributions in quantiles
    sns.violinplot(x='Quantile', y='PearsonR', data=df_quant, color='tomato')

    plt.savefig(out_pdf)
    plt.close()

    # sort targets by their decrease (last-quantile / first-quantile PearsonR)
    target_ratios = []
    for ti in range(gene_targets.shape[1]):
        df_quant_target = df_quant[df_quant.Target == ti]
        # rows were appended quantile-major, so each target's rows must run
        # from quantile 0 to quantile numq-1
        assert df_quant_target.Quantile.iloc[0] == 0
        assert df_quant_target.Quantile.iloc[-1] == numq - 1
        cor_ratio = (df_quant_target.PearsonR.iloc[-1] /
                     df_quant_target.PearsonR.iloc[0])
        target_ratios.append((cor_ratio, ti))
    target_ratios = sorted(target_ratios)

    # take 10+1 evenly spaced ranks across the sorted ratios (ranks repeat
    # when there are fewer targets than samples)
    pct_indexes = np.linspace(0, len(target_ratios) - 1, 10 + 1).astype('int')

    sns.set(font_scale=1.2, style='ticks')

    # write quantile targets
    with open('%s_qt.txt' % out_prefix, 'w') as table_out:
        # scatter plot each quantile
        for qi in range(numq):
            # slice quantile
            gene_targets_quant = \
                gene_targets[quant_indexes[qi]].astype('float32')
            gene_preds_quant = \
                gene_preds[quant_indexes[qi]].astype('float32')

            for pqi, pct_i in enumerate(pct_indexes):
                ti = target_ratios[pct_i][1]

                # target_ratios is ordered by ratio, so it must be indexed
                # by the rank pct_i (not by the target id ti) to report the
                # ratio that belongs to the plotted target
                print(qi, pqi, ti, target_ratios[pct_i], file=table_out)

                qout_pdf = '%s_pq%d_q%d.pdf' % (out_prefix, pqi, qi)
                plots.jointplot(gene_targets_quant[:, ti],
                                gene_preds_quant[:, ti],
                                qout_pdf, alpha=0.8, point_size=8,
                                kind='reg', figsize=5,
                                x_label='log2 Experiment',
                                y_label='log2 Prediction')
def variance_accuracy(gene_targets, gene_preds, out_prefix, log_pseudo=None):
    """Relate per-gene prediction error to the gene's mean and variability.

    Assumes the targets and predictions have been normalized.

    For every row (gene) the mean, max and standard deviation of the targets
    and the mean squared error of the predictions are computed across
    axis 1. Genes with mean > 0.5 and max > 3 are kept. Scatter plots are
    drawn on at most 2000 randomly chosen kept genes; the quantile box plot
    and the two ``quantile_accuracy`` calls use all kept genes.

    Files written: ``<out_prefix>_mean-std.pdf``, ``_mean.pdf``, ``_std.pdf``,
    ``_cv.pdf``, ``_quant.txt``, ``_quant.pdf``, plus everything
    ``quantile_accuracy`` writes for ``_qcv.pdf`` and ``_qstd.pdf``.

    Args:
      gene_targets: 2D array indexed [gene, target].
      gene_preds: predictions with the same layout as ``gene_targets``.
      out_prefix: prefix for every output path.
      log_pseudo: accepted but not used anywhere in this function.
    """
    print('gene_targets', gene_targets.shape)

    # per-gene summary statistics, reduced across targets
    gene_mean = np.mean(gene_targets, axis=1, dtype='float64')
    gene_max = np.max(gene_targets, axis=1)
    gene_std = np.std(gene_targets, axis=1, dtype='float64')
    squared_error = np.power(gene_targets - gene_preds, 2)
    gene_mse = squared_error.mean(axis=1, dtype='float64')

    # keep only sufficiently expressed genes
    expr_mask = (gene_mean > 0.5) & (gene_max > 3)
    gene_targets = gene_targets[expr_mask, :]
    gene_preds = gene_preds[expr_mask, :]
    gene_mean = gene_mean[expr_mask]
    gene_std = gene_std[expr_mask]
    gene_mse = gene_mse[expr_mask]
    print('%d "expressed genes" considered in variance plots'
          % expr_mask.sum())

    sns.set(style='ticks', font_scale=1.3)

    # one shared subsample of genes for all scatter plots
    num_genes = len(gene_mse)
    if num_genes >= 2000:
        ri = np.random.choice(np.arange(num_genes), 2000, replace=False)
    else:
        ri = np.arange(num_genes)

    # coefficient of variation
    gene_cv = gene_std / gene_mean

    # (file name pattern, x values, y values, x label, y label)
    mse_label = 'Mean squared prediction error'
    scatter_specs = [
        ('%s_mean-std.pdf', gene_mean, gene_std,
         'Mean across experiments', 'Std Dev across experiments'),
        ('%s_mean.pdf', gene_mean, gene_mse,
         'Mean across experiments', mse_label),
        ('%s_std.pdf', gene_std, gene_mse,
         'Std Dev across experiments', mse_label),
        ('%s_cv.pdf', gene_cv, gene_mse,
         'Coef Var across experiments', mse_label),
    ]
    for pdf_pattern, x_vals, y_vals, x_text, y_text in scatter_specs:
        plots.jointplot(x_vals[ri], y_vals[ri], pdf_pattern % out_prefix,
                        point_size=10, cor='spearmanr',
                        x_label=x_text, y_label=y_text)

    # MSE distributions in CV quantile bins
    numq = 5
    quant_indexes = stats.quantile_indexes(gene_cv, numq)
    mse_rows = [[qi, gene_mse[gi]]
                for qi in range(numq)
                for gi in quant_indexes[qi]]
    quant_mse = pd.DataFrame(mse_rows, columns=['Quantile', 'MSE'])
    quant_mse.to_csv('%s_quant.txt' % out_prefix, sep='\t')

    plt.figure()
    sns.boxplot(x='Quantile', y='MSE', data=quant_mse,
                palette=sns.cubehelix_palette(numq), showfliers=False)
    axes = plt.gca()
    axes.grid(True, linestyle=':')
    axes.set_ylabel('Mean squared prediction error')
    plt.savefig('%s_quant.pdf' % out_prefix)
    plt.close()

    # PearsonR accuracy within CV quantiles, then within std dev quantiles
    for stat_values, pdf_pattern in ((gene_cv, '%s_qcv.pdf'),
                                     (gene_std, '%s_qstd.pdf')):
        quantile_accuracy(gene_targets, gene_preds, stat_values,
                          pdf_pattern % out_prefix, 4)
def variance_accuracy(gene_targets, gene_preds, out_prefix, log=False):
    """Compare MSE accuracy to gene mean and variance.

    Assumes the targets and predictions have been normalized.

    For every row (gene), the mean, max and standard deviation of the
    targets and the mean squared error of the predictions are computed
    across the row. Genes with mean > 0.1 and max > 3 are kept. Each scatter
    plot uses its own random subsample of at most 2000 kept genes; the
    quantile box plot uses all kept genes.

    Files written: ``<out_prefix>_mean.pdf``, ``_std.pdf``, ``_cv.pdf``,
    ``_quant.txt`` and ``_quant.pdf``.

    Args:
      gene_targets: 2D array indexed [gene, target].
      gene_preds: predictions with the same layout as ``gene_targets``.
      out_prefix: prefix for every output path.
      log: if True, targets and predictions are transformed with
        ``log2(x + 1)`` before any statistic is computed.

    NOTE(review): this file appears to contain another definition named
    ``variance_accuracy``; if both are at module level, the later one is the
    one callers get - confirm which is intended.
    """
    num_genes = gene_targets.shape[0]

    # compute mean, var, and MSE across targets
    gene_mse = np.zeros(num_genes)
    gene_mean = np.zeros(num_genes)
    gene_max = np.zeros(num_genes)
    gene_std = np.zeros(num_genes)

    # processed row by row so that only one gene's values are transformed
    # and held at a time
    for gi in range(num_genes):
        if log:
            gti = np.log2(gene_targets[gi, :] + 1)
            gpi = np.log2(gene_preds[gi, :] + 1)
        else:
            gti = gene_targets[gi, :]
            gpi = gene_preds[gi, :]

        gene_mse[gi] = np.power(gti - gpi, 2).mean()
        gene_mean[gi] = gti.mean()
        gene_max[gi] = gti.max()
        # variability of the gene across experiments is a property of the
        # targets (like the mean and max above), not of the predictions
        gene_std[gi] = gti.std()

    # filter for expression
    expr_indexes = (gene_mean > 0.1) & (gene_max > 3)
    gene_mse = gene_mse[expr_indexes]
    gene_mean = gene_mean[expr_indexes]
    gene_std = gene_std[expr_indexes]
    print('%d "expressed genes" considered in variance plots' %
          expr_indexes.sum())

    sns.set(style='ticks', font_scale=1.3)

    # Sampling without replacement raises ValueError when asked for more
    # items than exist, so cap the sample at the number of kept genes.
    gene_indexes = np.arange(len(gene_mse))
    num_sample = min(2000, len(gene_mse))

    # plot mean vs MSE
    out_pdf = '%s_mean.pdf' % out_prefix
    ri = np.random.choice(gene_indexes, num_sample, replace=False)
    basenji.plots.jointplot(gene_mean[ri], gene_mse[ri], out_pdf,
                            point_size=10, cor='spearmanr',
                            x_label='Mean across experiments',
                            y_label='Mean squared prediction error')

    # plot std vs MSE
    out_pdf = '%s_std.pdf' % out_prefix
    ri = np.random.choice(gene_indexes, num_sample, replace=False)
    basenji.plots.jointplot(gene_std[ri], gene_mse[ri], out_pdf,
                            point_size=10, cor='spearmanr',
                            x_label='Std Dev across experiments',
                            y_label='Mean squared prediction error')

    # plot CV vs MSE
    gene_cv = np.divide(gene_std, gene_mean)
    out_pdf = '%s_cv.pdf' % out_prefix
    ri = np.random.choice(gene_indexes, num_sample, replace=False)
    basenji.plots.jointplot(gene_cv[ri], gene_mse[ri], out_pdf,
                            point_size=10, cor='spearmanr',
                            x_label='Coef Var across experiments',
                            y_label='Mean squared prediction error')

    # plot MSE distributions in CV bins
    numq = 4
    quant_indexes = stats.quantile_indexes(gene_cv, numq)
    quant_mse = [[qi, gene_mse[gi]]
                 for qi in range(numq)
                 for gi in quant_indexes[qi]]
    quant_mse = pd.DataFrame(quant_mse, columns=['Quantile', 'MSE'])

    quant_mse.to_csv('%s_quant.txt' % out_prefix, sep='\t')

    plt.figure()

    sns.boxplot(x='Quantile', y='MSE', data=quant_mse,
                palette=sns.cubehelix_palette(numq), showfliers=False)

    ax = plt.gca()
    ax.grid(True, linestyle=':')
    ax.set_ylabel('Mean squared prediction error')

    plt.savefig('%s_quant.pdf' % out_prefix)
    plt.close()