def reject(y, y_exp, var, plot=False, L1=None, name='Rejection.png'):
    """Build rejection curves and summarise them with AUC-based scores.

    Samples are "rejected" one at a time: the prediction of a rejected sample
    is replaced by its reference value and ``pearson`` is recomputed against
    the references.  Three curves over the rejection fraction are produced:

    * optimal: reject in order of decreasing squared error,
    * model: reject in order of decreasing ``var``,
    * expected random: the straight line ``P_0 + (1 - P_0) * fraction``.

    Args:
        y: Predictions.  The inputs are concatenated along axis 1 and then
            addressed as columns 0-3, so each is expected to be an (n, 1)
            array.
        y_exp: Reference values, same layout as ``y``.
        var: Per-sample score used to order the model curve, same layout.
        plot: When true, save 'Variance.png' and the rejection plot ``name``.
        L1: Optional 1-D array used to colour the scatter plots.  Entries
            from index 196 onwards are set to 40 on a private copy.
        name: File name of the rejection plot.

    Returns:
        Tuple ``(var_auc / (1 - P_0), max_auc / (1 - P_0),
        min_auc / (1 - P_0), (var_auc - min_auc) / (max_auc - min_auc),
        mean_tpr)``.  The AUCs are areas of the curves above ``P_0``;
        ``mean_tpr`` is ``np.mean`` of the model-curve values whose rejection
        fraction lies in [0.090009, 0.100001] (nan if there are none).
    """
    error = (y - y_exp) ** 2
    P_0 = pearson(y, y_exp)[0][0]

    if L1 is None:
        array = np.concatenate((y, y_exp, error, var), axis=1)
    else:
        # Clamp on a private copy so the caller's array is left untouched
        # (the clamp used to be applied to the argument in place).
        L1 = np.array(L1, copy=True)
        L1[196:] = 40
        array = np.concatenate(
            (y, y_exp, error, var, L1[:, np.newaxis]), axis=1)

    n_samples = array.shape[0]
    steps = range(1, n_samples)

    def fraction(i):
        return float(i) / float(n_samples)

    def curve(ordered):
        # Rows are sorted ascending, so the last ``i`` rows are the ones
        # rejected first; their predictions are swapped for the references.
        points = [[0.0, P_0]]
        for i in steps:
            x = np.concatenate((ordered[:-i, 0], ordered[-i:, 1]), axis=0)
            points.append([fraction(i), pearson(x, ordered[:, 1])[0]])
        return points

    def area_above_baseline(points):
        # NOTE(review): if `auc` is sklearn.metrics.auc, the `reorder`
        # keyword only exists in older scikit-learn releases -- confirm.
        return auc([pt[0] for pt in points],
                   [pt[1] - P_0 for pt in points], reorder=True)

    by_error = array[array[:, 2].argsort()]
    results = curve(by_error)
    results_min = [[0.0, P_0]] + [
        [fraction(i), P_0 + (1.0 - P_0) * float(i) / float(n_samples)]
        for i in steps
    ]

    by_var = array[array[:, 3].argsort()]
    results_var = curve(by_var)
    # Model-curve values at roughly 9-10% rejection.
    tpr = [p for frac, p in results_var[1:]
           if 0.090009 <= frac <= 0.100001]

    max_auc = area_above_baseline(results)
    var_auc = area_above_baseline(results_var)
    min_auc = area_above_baseline(results_min)

    if plot:
        frac_best = [pt[0] for pt in results]
        frac_var = [pt[0] for pt in results_var]
        corr_best = [pt[1] for pt in results]
        corr_var = [pt[1] for pt in results_var]
        corr_min = [pt[1] for pt in results_min]

        plt.scatter(frac_best, list(np.asarray(sorted(var, reverse=True))))
        plt.xlim(0.0, 1.0)
        plt.savefig('Variance.png', bbox_inches='tight')
        plt.close()

        if L1 is not None:
            L1_best = by_error[:, 4]
            L1_var = by_var[:, 4]
            plt.scatter(frac_best, corr_best, c=L1_best, cmap=plt.cm.winter)
            plt.scatter(frac_var, corr_var, c=L1_var, cmap=plt.cm.winter)
            plt.scatter(frac_var, corr_min, c=L1_var, cmap=plt.cm.winter)
        else:
            plt.plot(frac_best, corr_best, 'b^',
                     frac_var, corr_var, 'ro',
                     frac_var, corr_min, 'go')
        plt.legend(['Optimal-Rejection', 'Model-Rejection', 'Expected Random-Rejection'],
                   loc=4, prop={'size': 18.5})
        plt.xlim(0.0, 1.0)
        plt.ylim(0.86, 1.0)
        plt.xlabel('Rejection Fraction')
        plt.ylabel('Pearson Correlation')
        plt.savefig(name, bbox_inches='tight')
        plt.close()

    # Area under the raw model curve (not baseline-corrected), for logging.
    raw_var_auc = auc([pt[0] for pt in results_var],
                      [pt[1] for pt in results_var], reorder=True)
    print('AUC {}'.format(raw_var_auc))

    return (var_auc / (1.0 - P_0),
            max_auc / (1.0 - P_0),
            min_auc / (1.0 - P_0),
            (var_auc - min_auc) / (max_auc - min_auc),
            np.mean(tpr))
def get_pearson(gp, test_data, samples=1000, its=10):
    """Score GP predictions against gold labels, before and after ``prs_loss``.

    The last column of ``test_data`` is taken as the gold labels and all
    other columns as input features.

    Args:
        gp: Model whose ``predict(features, full_cov=True)`` returns a
            ``(mean, cov)`` pair.
        test_data: 2-D array, features followed by one label column.
        samples: Forwarded to ``prs_loss``.
        its: Forwarded to ``prs_loss``.

    Returns:
        ``(r_mean, r_loss)``: the ``pearson`` result for the flattened
        predictive mean and for the flattened ``prs_loss`` output.
    """
    inputs, targets = test_data[:, :-1], test_data[:, -1]
    predictive_mean, predictive_cov = gp.predict(inputs, full_cov=True)
    predictive_mean = predictive_mean.flatten()
    refined = prs_loss(predictive_mean, predictive_cov,
                       samples=samples, its=its)
    return (pearson(predictive_mean, targets),
            pearson(refined.flatten(), targets))
def reject_fill(y, y_exp, var, plot=False, L1=None, name='Rejection.png'):
    """Draw the shaded-area version of the rejection plot.

    The same three curves as in ``reject`` are computed (optimal, ordered by
    ``var``, and the straight "random" line) and, when ``plot`` is true, the
    regions between them are filled and saved to 'auc_diagramm.png'.

    Args:
        y: Predictions.  The inputs are concatenated along axis 1 and then
            addressed as columns 0-3, so each is expected to be an (n, 1)
            array.
        y_exp: Reference values, same layout as ``y``.
        var: Per-sample score used to order the model curve, same layout.
        plot: When true, render and save the figure.
        L1: Accepted for signature compatibility; nothing in this function
            reads it, and it is no longer modified.
        name: Accepted for signature compatibility; the figure is always
            written to 'auc_diagramm.png', as before.

    Returns:
        None.
    """
    error = (y - y_exp) ** 2
    P_0 = pearson(y, y_exp)[0][0]
    array = np.concatenate((y, y_exp, error, var), axis=1)
    n_samples = array.shape[0]

    def curve(column):
        # Sort ascending by `column`; the last ``i`` rows are rejected, i.e.
        # their predictions are replaced by the reference values.
        ordered = array[array[:, column].argsort()]
        values = [P_0]
        for i in range(1, n_samples):
            x = np.concatenate((ordered[:-i, 0], ordered[-i:, 1]), axis=0)
            values.append(pearson(x, ordered[:, 1])[0])
        return values

    fractions = [float(i) / float(n_samples) for i in range(n_samples)]
    optimal = curve(2)
    by_var = curve(3)
    random_line = [P_0 + (1.0 - P_0) * float(i) / float(n_samples)
                   for i in range(n_samples)]

    if plot:
        plt.subplots()
        # Sized from the data; these arrays used to be fixed at 224 entries,
        # which only matched inputs with exactly 224 samples.
        n_points = len(fractions)
        floor = np.zeros(n_points)
        baseline = P_0 * np.ones(n_points)
        plt.fill_between(fractions, floor, baseline, alpha=0.01, color='g')
        plt.fill_between(fractions, baseline, random_line, alpha=0.07, color='g')
        plt.fill_between(fractions, random_line, by_var, alpha=0.5, color='r')
        plt.fill_between(fractions, by_var, optimal, alpha=0.5, color='b')
        plt.xlim(0.0, 1.0)
        plt.ylim(0.86, 1.0)
        # NOTE(review): 0.86 and 0.897 are fixed tick positions; presumably
        # tuned for one particular data set -- confirm before reuse.
        ypoints = [0.86, P_0, 0.897, 1.0]
        xpoints = [0.0, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
        plt.yticks(ypoints, ['0.0', 'PCC', '10% Rej.\nPCC', '1.0'], fontsize=17)
        # Second call kept from the original implementation.
        plt.yticks(ypoints, fontsize=17)
        plt.xticks(xpoints, ['0.0', '0.1', '0.2', '0.4', '0.6', '0.8', '1.0'], fontsize=17)
        plt.xlabel('Rejection Fraction', fontsize=17)
        plt.ylabel('Pearson Correlation', fontsize=17)
        plt.savefig('auc_diagramm.png', bbox_inches='tight')
        plt.close()
def exploratory_analysis(data):
    """Show two-group box plots and print squared correlations with 'Category'.

    For each box-plot feature, one figure compares the first 242 rows of the
    column (tick label 'Control') with the rows where ``data['Category'] == 1``
    (tick label 'AD').  Afterwards, for a second list of features,
    ``pearson(data[feature], data['Category'])`` is evaluated and
    ``feature--r**2--p`` is printed.

    Args:
        data: Column-indexable table (e.g. a pandas DataFrame) containing the
            listed feature columns and a 'Category' column.
    """
    boxplot_columns = (
        'ARI', 'CLI', 'count_trailing', 'count_repetitions', 'count_pauses',
        'SIM_score', 'MMSE',
    )
    for column in boxplot_columns:
        control_values = np.array(data[column][:242])
        ad_values = np.array(data[column][data['Category'] == 1])

        plt.figure()
        plt.boxplot(
            [control_values, ad_values],
            medianprops=dict(linestyle='-', linewidth=2, color='firebrick'))
        plt.xticks([1, 2], ['Control', 'AD'], fontsize=21.0, fontweight='bold')
        plt.yticks(fontsize=21.0, fontweight='bold')
        plt.title(column, fontsize=21.0, fontweight='bold')
        plt.show()

    correlation_columns = (
        'ttr', 'R', 'num_concepts_mentioned', 'ARI', 'CLI', 'prp_count',
        'VP_count', 'NP_count', 'prp_noun_ratio', 'word_sentence_ratio',
        'count_pauses', 'count_unintelligible', 'count_trailing',
        'count_repetitions',
    )
    for column in correlation_columns:
        r, p = pearson(data[column], data['Category'])
        print('{}--{}--{}'.format(column, r**2, p))
def interpolate(targets, model_1, model_2, dir, name, mse_plot=False):
    """
    Create an interpolation plot of two models.

    The two prediction sets are blended linearly in 100 steps; the weight of
    ``model_2`` runs from 0.00 to 0.99 (plotted as 'DNN Fraction').  For each
    blend, ``pearson`` and ``MSE`` against ``targets`` are recorded.  The best
    correlation and the lowest MSE are printed.

    targets: Targets which both models are compared with
    model_1: Predictions from model 1
    model_2: Predictions from model 2
    dir: Directory where to save to.
    name: name of chart
    mse_plot: Also save the MSE curve when true.
    """
    pearson_curve = []
    mse_curve = []
    for step in range(100):
        weight = float(step)
        blend = (100.0 - weight) / 100.0 * model_1 + model_2 * weight / 100.0
        corr = pearson(blend, targets)
        err = MSE(blend, targets)
        pearson_curve.append([weight / 100.0, corr[0]])
        mse_curve.append([weight / 100.0, err])

    print(np.max(np.asarray(pearson_curve)[:, 1]))
    print(np.min(np.asarray(mse_curve)[:, 1]))

    plt.plot([pt[0] for pt in pearson_curve], [pt[1] for pt in pearson_curve])
    plt.xlabel('DNN Fraction')
    plt.ylabel('Pearson Correlation')
    plt.savefig(os.path.join(dir, 'interpolation_pearson_' + name + '.png'))
    plt.close()

    if not mse_plot:
        return
    plt.plot([pt[0] for pt in mse_curve], [pt[1] for pt in mse_curve])
    plt.xlabel('DNN Fraction')
    plt.ylabel('MSE')
    plt.savefig(os.path.join(dir, 'interpolation_mse_' + name + '.png'))
    plt.close()
def save_cautious_curves(model, test_data, target, median=False):
    """
    Sort predictions by variance and calculate metrics on the top X%
    most confident ones, generating a curve on X.

    Args:
        model: Object whose ``predict(features)`` returns a sequence with the
            predicted means first and the variances second (assumed from how
            the result is indexed -- confirm for non-GPy models).
        test_data: 2-D array; last column holds the gold labels, the rest the
            features.
        target: File name or handle passed to ``np.savetxt``.
        median: Pass ``median=True`` to ``model.predict``.

    Writes one row per prefix length ``i`` (1..n) with MAE, RMSE and the two
    leading values of ``pearson`` computed on the ``i`` lowest-variance
    predictions.
    """
    feats = test_data[:, :-1]
    gold_labels = test_data[:, -1]
    if median:  # should only be used for Warped GPs
        preds = model.predict(feats, median=True)
    else:
        preds = model.predict(feats)

    # Rows are (mean, variance, gold).  Order by the variance column so that
    # the most confident predictions come first; the previous key (column 0)
    # ordered by predicted mean, contradicting the documented behaviour.
    rows = sorted(
        zip(preds[0].flatten(), preds[1].flatten(), gold_labels),
        key=lambda row: row[1])
    rows = np.array(rows)

    metric_vals = []
    for i in range(1, len(rows) + 1):
        sub_preds = rows[:i, 0]
        sub_gold = rows[:i, 2]
        mae = MAE(sub_preds, sub_gold)
        rmse = np.sqrt(MSE(sub_preds, sub_gold))
        prs = pearson(sub_preds, sub_gold)
        metric_vals.append([mae, rmse, prs[0], prs[1]])
    np.savetxt(target, metric_vals, fmt='%.4f')
def get_rec_metrics(model, test_data, median=False):
    """
    Evaluate reciprocal predictions against the reciprocal gold labels.

    Two estimates are scored: the element-wise reciprocal of the predicted
    mean ("naive") and the output of ``model.predict_reciprocal``.

    Returns:
        ``(mae_naive, rmse_naive, prs_naive, mae_rec, rmse_rec, prs_rec)``.
    """
    features = test_data[:, :-1]
    reciprocal_gold = 1 / test_data[:, -1]

    # median=True should only be used for Warped GPs
    if median:
        prediction = model.predict(features, median=True)
    else:
        prediction = model.predict(features)
    predicted_mean = prediction[0].flatten()
    model_reciprocal = model.predict_reciprocal(features).flatten()

    def score(estimate):
        return (MAE(estimate, reciprocal_gold),
                np.sqrt(MSE(estimate, reciprocal_gold)),
                pearson(estimate, reciprocal_gold))

    return score(1 / predicted_mean) + score(model_reciprocal)
def prs_loss(mean, cov, samples=1000, its=10): curr_a = np.copy(mean) # start with mean #curr_a = np.ones_like(mean) + np.random.random(size=(SIZE)) n = curr_a.shape[0] initial_a = np.copy(curr_a) curr_a = norm_a(curr_a, mean) for evals in xrange(its): mv_samples = np.random.multivariate_normal(mean, cov, samples) print pearson(mean, curr_a) for i in xrange(n): mask = np.ones(curr_a.shape, dtype=bool) mask[i] = 0 ai = curr_a[mask] yi = mv_samples.T[mask] yk = mv_samples.T[i] ak = dloss(ai, yi, yk) curr_a[i] = ak / samples #curr_a[i] = ak #curr_a = norm_a(curr_a, mean) print curr_a print np.mean(np.abs(initial_a - curr_a)) return curr_a
def get_metrics(model, test_data):
    """
    Predict with ``model.predict_y`` and evaluate against the gold labels.

    The last column of ``test_data`` holds the gold labels; the remaining
    columns are the features.

    Returns:
        ``(mae, rmse, prs, nlpd)`` where ``prs`` is the ``pearson`` result and
        ``nlpd`` is the negated mean of ``model.predict_density``.
    """
    features, gold = test_data[:, :-1], test_data[:, -1]
    prediction = model.predict_y(features)
    # Second element is read as in the original but not used by any metric.
    predicted_mean, _predicted_var = prediction[0].flatten(), prediction[1]

    abs_err = MAE(predicted_mean, gold)
    root_sq_err = np.sqrt(MSE(predicted_mean, gold))
    correlation = pearson(predicted_mean, gold)
    neg_log_density = - np.mean(model.predict_density(features, gold[:, None]))
    return abs_err, root_sq_err, correlation, neg_log_density
def get_metrics(model, test_data, median=False):
    """
    Get predictions and evaluate.

    Args:
        model: Model exposing ``predict`` and ``log_predictive_density``.
        test_data: 2-D array; last column holds the gold labels, the rest the
            features.
        median: Request median predictions.  Only honoured when ``model`` is
            a ``GPy.models.WarpedGP``; ignored otherwise.

    Returns:
        ``(mae, rmse, prs, nlpd)`` where ``prs`` is the ``pearson`` result and
        ``nlpd`` is the negated mean log predictive density.
    """
    feats = test_data[:, :-1]
    gold_labels = test_data[:, -1]
    if median and isinstance(model, GPy.models.WarpedGP):
        # should only be used for Warped GPs
        preds = model.predict(feats, median=True)
    else:
        preds = model.predict(feats)
    preds_mean = preds[0].flatten()

    mae = MAE(preds_mean, gold_labels)
    rmse = np.sqrt(MSE(preds_mean, gold_labels))
    prs = pearson(preds_mean, gold_labels)
    nlpd = - np.mean(model.log_predictive_density(feats, gold_labels[:, None]))
    # The former predict_quantiles() call was dropped: its result was never
    # used or returned, so it only cost an extra prediction pass.
    return mae, rmse, prs, nlpd
def make_reg_plot(vehicle_df):
    '''
    Create a plot relating price difference to total reservations
    showing the regression line, annotated with the ``pearson`` result,
    and save it as 'total_res_vs_price_diff'.

    Args:
        vehicle_df: pandas data frame of vehicle attributes; must contain
            the columns 'price_difference' and 'total_reservations'

    Returns:
        None
    '''
    x = vehicle_df['price_difference']
    y = vehicle_df['total_reservations']

    stat = pearson(x, y)
    annotation = "pearsonr= {:0.2f}; p={:0.2e}".format(stat[0], stat[1])

    fig, ax = plt.subplots()
    # Keyword arguments: newer seaborn releases no longer accept x and y
    # positionally.  The axes are passed explicitly rather than relying on
    # the current-axes default.
    sns.regplot(x=x, y=y, ax=ax)
    ax.set_ylabel('Total Reservations')
    ax.set_xlabel('Price Difference')
    ax.set_title('Total Reservations vs. Price Difference')
    ax.annotate(annotation, xy=(350, 320), xycoords='axes points')
    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    plt.savefig('total_res_vs_price_diff')
# Demo script: seaborn regression and correlation plots on random data and on
# the "tips" example data set.
# NOTE(review): several calls below (regplot's corr_func/func_name, corrplot,
# lmplot(color=...)) look like an early seaborn API -- confirm the pinned
# seaborn version before running.

# Both palette calls are kept as found; presumably the second one wins.
sns.set(palette='Purples_r')
sns.set(palette='Reds_r')
mpl.rc('figure', figsize=(5, 5))

# Fixed seed so the random example data is reproducible.
np.random.seed(9221999)
x = randn(50)
y = x + randn(50)

# Regression plot straight from two arrays.
sns.regplot(x, y)

# Same data via a DataFrame and column names.
df = pd.DataFrame(np.transpose([x, y]), columns=["X", "Y"])
sns.regplot("X", 'Y', df)
sns.regplot("X", 'Y', df, ci=None, color='slategray')

# Custom statistic shown on the plot: squared correlation coefficient.
# NOTE(review): if `stats` is scipy.stats, the function is `pearsonr`, not
# `pearson`; this lambda would then raise AttributeError when called --
# confirm.
r2 = lambda x, y: stats.pearson(x, y)[0] ** 2
sns.regplot('X', 'Y', df, corr_func=r2, func_name='$R^2$', color='seagreen')

# Load the tips data (network access required) and derive boolean columns.
tips = pd.read_csv("https://raw.github.com/mwaskom/seaborn/master/examples/tips.csv")
tips["big_tip"] = tips.tip > (.2 * tips.total_bill)
tips["smoker"] = tips["smoker"] == "Yes"
tips["female"] = tips["sex"] == "Female"

# Correlation-matrix plots on a larger figure.
mpl.rc("figure", figsize=(7, 7))
sns.corrplot(tips)
sns.corrplot(tips, sig_stars=False)
sns.corrplot(tips, sig_tail='upper', cmap='PuRd', cmap_range=(-.2, .8))

# Linear-model plots, back on the smaller figure size.
mpl.rc('figure', figsize=(5, 5))
sns.lmplot('total_bill', 'tip', tips)
sns.lmplot('total_bill', 'tip', tips, color='time')