def test_gradient_boosting_estimator_with_smooth_quantile_loss():
    np.random.seed(0)
    m = 15000
    n = 10
    p = .8
    X = np.random.normal(size=(m, n))
    beta = np.random.normal(size=n)
    mu = np.dot(X, beta)
    y = np.random.lognormal(mu)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33333333333333)
    loss_function = SmoothQuantileLossFunction(1, p, .0001)
    q_loss = QuantileLossFunction(1, p)
    model = Booster(BaggingRegressor(Earth(max_degree=2, verbose=False,
                                           use_fast=True, max_terms=10)),
                    loss_function,
                    n_estimators=150,
                    stopper=stop_after_n_iterations_without_percent_improvement_over_threshold(3, .01),
                    verbose=True)
    assert_raises(NotFittedError, lambda: model.predict(X_train))
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    model2 = GradientBoostingRegressor(loss='quantile', alpha=p)
    model2.fit(X_train, y_train)
    prediction2 = model2.predict(X_test)
    assert_less(q_loss(y_test, prediction), q_loss(y_test, prediction2))
    assert_greater(r2_score(y_test, prediction), r2_score(y_test, prediction2))
    q = np.mean(y_test <= prediction)
    assert_less(np.abs(q - p), .05)
    assert_greater(model.score_, 0.)
    assert_approx_equal(model.score(X_train, y_train), model.score_)
def question_four():
    """Training models on high-degree polynomial features can result in overly
    complex models that overfit, so we often use regularized versions of the
    model to constrain model complexity, as we saw with Ridge and Lasso linear
    regression.

    For this question, train two models on polynomial features of degree 12: a
    non-regularized LinearRegression model (default parameters) and a
    regularized Lasso model (with parameters alpha=0.01, max_iter=10000).
    Return the R2 score of each model on the test set.

    This function should return one tuple
    (LinearRegression_R2_test_score, Lasso_R2_test_score).
    """
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import Lasso, LinearRegression
    from sklearn.metrics import r2_score

    X_tr = X_train.reshape(-1, 1)
    X_tst = X_test.reshape(-1, 1)
    poly = PolynomialFeatures(degree=12)
    X_poly_tr = poly.fit_transform(X_tr)
    X_poly_tst = poly.transform(X_tst)  # reuse the transformer fitted on the training data

    linreg = LinearRegression()
    linreg.fit(X_poly_tr, y_train)
    linear_r2 = r2_score(y_test, linreg.predict(X_poly_tst))

    lasso = Lasso(alpha=0.01, max_iter=10000)
    lasso.fit(X_poly_tr, y_train)
    lasso_r2 = r2_score(y_test, lasso.predict(X_poly_tst))

    return linear_r2, lasso_r2
def answer_two():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import r2_score

    train_result = np.zeros(10)
    test_result = np.zeros(10)
    for i in range(10):
        poly = PolynomialFeatures(degree=i)
        X_train_poly = poly.fit_transform(X_train.reshape(-1, 1))
        X_test_poly = poly.transform(X_test.reshape(-1, 1))
        linreg = LinearRegression().fit(X_train_poly, y_train)
        train_result[i] = r2_score(y_train, linreg.predict(X_train_poly))
        test_result[i] = r2_score(y_test, linreg.predict(X_test_poly))
    return (train_result, test_result)
def answer_four():
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import Lasso, LinearRegression
    from sklearn.metrics import r2_score

    # Expand x to degree-12 polynomial features, then split
    poly = PolynomialFeatures(degree=12)
    X_poly = poly.fit_transform(x.reshape(-1, 1))
    X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state=0)

    # Fit a non-regularized LinearRegression model (default parameters)
    linreg = LinearRegression().fit(X_train, y_train)
    # Fit a regularized Lasso model (alpha=0.01, max_iter=10000)
    lasso = Lasso(alpha=0.01, max_iter=10000).fit(X_train, y_train)

    # Predict on the test set
    y_linreg_test = linreg.predict(X_test)
    y_lasso_test = lasso.predict(X_test)

    # Compute R2 scores
    Linreg_R2_test_score = r2_score(y_test, y_linreg_test)
    Lasso_R2_test_score = r2_score(y_test, y_lasso_test)
    return Linreg_R2_test_score, Lasso_R2_test_score
def answer_two():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import r2_score

    degrees = np.arange(10)
    nbLines_train, = X_train.shape
    nbLines_test, = X_test.shape
    score_train = []
    score_test = []
    for n in degrees:
        poly = PolynomialFeatures(degree=n)
        X_train_poly = poly.fit_transform(X_train.reshape(nbLines_train, 1))
        X_test_poly = poly.transform(X_test.reshape(nbLines_test, 1))
        linreg = LinearRegression().fit(X_train_poly, y_train)
        score_train.append(r2_score(y_train, linreg.predict(X_train_poly)))
        score_test.append(r2_score(y_test, linreg.predict(X_test_poly)))
    return (np.array(score_train), np.array(score_test))
def main():
    path1 = "analysis/wine/log_data.csv"
    path2 = "analysis/wine/pc_data.csv"
    df1 = pd.read_csv(path1)
    df2 = pd.read_csv(path2)

    # Hold out 250 randomly chosen rows as the test set
    np.random.seed(777)
    randint = np.random.choice(df1.index, size=250)
    test_x = torch.from_numpy(df1.loc[randint].drop('quality', axis=1).values).float()
    test_y = torch.from_numpy(df1.loc[randint, 'quality'].values).float()
    train_x_pre = torch.from_numpy(df1.drop(randint).drop('quality', axis=1).values).float()
    train_y_pre = torch.from_numpy(df1.drop(randint).loc[:, 'quality'].values).float()

    model, likelihood = train(train_x_pre, train_y_pre, training_iter=1000)
    observed_pred, mean, lower, upper = test(test_x, model, likelihood)
    sns.scatterplot(x=test_y.numpy(), y=mean.cpu().numpy())
    print(r2_score(test_y.numpy(), mean.cpu().numpy()))

    # test_x2 = torch.from_numpy(df2.loc[randint].drop('quality', axis=1).values).float()
    # test_y2 = torch.from_numpy(df2.loc[randint, 'quality'].values).float()
    # train_x_pre2 = torch.from_numpy(df2.drop(randint).drop('quality', axis=1).values).float()
    # train_y_pre2 = torch.from_numpy(df2.drop(randint).loc[:, 'quality'].values).float()

    model2, likelihood2 = train2(train_x_pre, train_y_pre, training_iter=1000)
    observed_pred2, mean2, lower2, upper2 = test(test_x, model2, likelihood2)
    sns.scatterplot(x=test_y.numpy(), y=mean2.cpu().numpy())
    print(r2_score(test_y.numpy(), mean2.cpu().numpy()))
def answer_two():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import r2_score

    R2_train = np.zeros(10)
    R2_test = np.zeros(10)
    for deg in range(10):
        poly = PolynomialFeatures(degree=deg)
        X_train_poly = poly.fit_transform(X_train.reshape(-1, 1))
        X_test_poly = poly.transform(X_test.reshape(-1, 1))
        model = LinearRegression().fit(X_train_poly, y_train)
        R2_train[deg] = r2_score(y_train, model.predict(X_train_poly))
        R2_test[deg] = r2_score(y_test, model.predict(X_test_poly))
    return (R2_train, R2_test)
def answer_two():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import r2_score

    r2_train, r2_test = [], []
    for i in range(10):
        poly = PolynomialFeatures(i)
        X_train_poly = poly.fit_transform(X_train.reshape(-1, 1))
        X_test_poly = poly.transform(X_test.reshape(-1, 1))
        clf = LinearRegression()
        clf.fit(X_train_poly, y_train)
        r2_train.append(r2_score(y_train, clf.predict(X_train_poly)))
        r2_test.append(r2_score(y_test, clf.predict(X_test_poly)))
    return (r2_train, r2_test)
def answer_two():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import r2_score

    x_train = X_train.reshape(-1, 1)
    x_test = X_test.reshape(-1, 1)
    r2_train = np.array([])
    r2_test = np.array([])
    for i in range(10):
        poly_transform = PolynomialFeatures(degree=i).fit(x_train)
        X_train_poly = poly_transform.transform(x_train)
        X_test_poly = poly_transform.transform(x_test)
        model = LinearRegression().fit(X_train_poly, y_train)
        r2_train = np.append(r2_train, r2_score(y_train, model.predict(X_train_poly)))
        r2_test = np.append(r2_test, r2_score(y_test, model.predict(X_test_poly)))
    return (r2_train, r2_test)
def answer_two():
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import r2_score

    # Regenerate the assignment data
    np.random.seed(0)
    n = 15
    x = np.linspace(0, 10, n) + np.random.randn(n) / 5
    y = np.sin(x) + x / 6 + np.random.randn(n) / 10
    x = x.reshape(-1, 1)
    y = y.reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

    r2_train = []
    r2_test = []
    for i in range(10):
        poly = PolynomialFeatures(degree=i)
        X_train_poly = poly.fit_transform(X_train)
        model = LinearRegression()
        model.fit(X_train_poly, y_train)
        r2_train.append(r2_score(y_train, model.predict(X_train_poly)))
        X_test_poly = poly.transform(X_test)
        r2_test.append(r2_score(y_test, model.predict(X_test_poly)))
    return np.array(r2_train), np.array(r2_test)
def answer_two():
    R2_train = []
    R2_test = []
    for i in np.arange(0, 10):
        poly = PolynomialFeatures(degree=i)
        X_train_poly = poly.fit_transform(X_train.reshape(-1, 1))
        X_test_poly = poly.transform(X_test.reshape(-1, 1))
        linreg = LinearRegression().fit(X_train_poly, y_train)
        # Manual prediction X.w + b (equivalent to linreg.predict)
        w = linreg.coef_.transpose()
        b = linreg.intercept_
        y_predict_train = np.dot(X_train_poly, w) + b
        y_predict_test = np.dot(X_test_poly, w) + b
        R2_train.append(r2_score(y_train, y_predict_train))
        R2_test.append(r2_score(y_test, y_predict_test))
    return (np.array(R2_train), np.array(R2_test))
def answer_two():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import r2_score

    X_tr = X_train.reshape(-1, 1)
    X_tst = X_test.reshape(-1, 1)
    train_scores = np.zeros(10)
    test_scores = np.zeros(10)
    for degree in range(10):
        poly = PolynomialFeatures(degree=degree)
        X_poly_tr = poly.fit_transform(X_tr)
        X_poly_tst = poly.transform(X_tst)
        linreg = LinearRegression()
        linreg.fit(X_poly_tr, y_train)
        train_scores[degree] = r2_score(y_train, linreg.predict(X_poly_tr))
        test_scores[degree] = r2_score(y_test, linreg.predict(X_poly_tst))
    return train_scores, test_scores
def answer_two():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import r2_score

    r2_train = np.zeros(10)
    r2_test = np.zeros(10)
    for d in range(10):
        poly = PolynomialFeatures(degree=d)
        X_train_poly = poly.fit_transform(X_train.reshape(-1, 1))
        X_test_poly = poly.transform(X_test.reshape(-1, 1))
        linreg = LinearRegression().fit(X_train_poly, y_train)
        r2_train[d] = r2_score(y_train, linreg.predict(X_train_poly))
        r2_test[d] = r2_score(y_test, linreg.predict(X_test_poly))
    return (r2_train, r2_test)
def evaluate(self, y_predict, y_true, target_names=None):
    # Return the metric appropriate to the configured task type
    if self.config.task_type == 'classification':
        return classification_report(y_true=y_true, y_pred=y_predict,
                                     target_names=target_names)
    elif self.config.task_type == 'ranking':
        return roc_auc_score(y_true=y_true, y_score=y_predict)
    elif self.config.task_type == 'regression':
        return r2_score(y_true=y_true, y_pred=y_predict)
def answer_two():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import r2_score

    # Regenerate the assignment data
    np.random.seed(0)
    n = 15
    x = np.linspace(0, 10, n) + np.random.randn(n) / 5
    y_true = np.sin(x) + x / 6 + np.random.randn(n) / 10

    r2_train_arr = np.empty(0)
    r2_test_arr = np.empty(0)

    # Fit polynomials of degree 0..9, predict, and collect the R2 scores
    for p in range(10):
        poly = PolynomialFeatures(degree=p)
        X_poly = poly.fit_transform(x.reshape(-1, 1))
        X_train, X_test, y_train, y_test = train_test_split(
            X_poly, y_true.reshape(-1, 1), random_state=0)

        linreg = LinearRegression().fit(X_train, y_train)

        y_preds_train = linreg.predict(X_train)
        y_preds_test = linreg.predict(X_test)

        r2_train_arr = np.hstack([r2_train_arr, r2_score(y_train, y_preds_train)])
        r2_test_arr = np.hstack([r2_test_arr, r2_score(y_test, y_preds_test)])

    # Sanity check
    assert r2_train_arr.shape == (10,), str(r2_train_arr.shape)
    assert r2_test_arr.shape == (10,), str(r2_test_arr.shape)
    return r2_train_arr, r2_test_arr
def answer_four():
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import Lasso, LinearRegression
    from sklearn.metrics import r2_score

    poly = PolynomialFeatures(degree=12)
    X_train_poly = poly.fit_transform(X_train.reshape(11, 1))
    X_test_poly = poly.transform(X_test.reshape(4, 1))
    linreg = LinearRegression().fit(X_train_poly, y_train)
    lasso = Lasso(alpha=0.01, max_iter=10000).fit(X_train_poly, y_train)
    return (r2_score(y_test, linreg.predict(X_test_poly)),
            r2_score(y_test, lasso.predict(X_test_poly)))
def answer_two():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import r2_score

    r2_train = np.zeros(10)
    r2_test = np.zeros(10)
    for degree in range(10):
        poly = PolynomialFeatures(degree=degree)
        X_poly = poly.fit_transform(X_train.reshape(11, 1))
        linreg = LinearRegression().fit(X_poly, y_train)
        r2_train[degree] = r2_score(y_train, linreg.predict(X_poly))
        X_test_poly = poly.transform(X_test.reshape(4, 1))
        r2_test[degree] = r2_score(y_test, linreg.predict(X_test_poly))
    return (r2_train, r2_test)
def test_r2_within_business_value(self):
    m = MyModel()
    m.init()
    # Add code to load your super secret cross-validation data here.
    # Use secrets encoded in a variable group available in the ADO Pipeline,
    # because scientists can't access that and game the system.
    super_secret_data = None
    y_predicted = m.predict(super_secret_data[x])
    score = r2_score(super_secret_data[y], y_predicted)
    # Placeholder threshold: replace with the agreed business-value bar.
    assert score > 0.5
def test_with_response_transformation():
    X, y = load_boston(return_X_y=True)
    log_y = np.log(y)
    X = pandas.DataFrame(X, columns=['x%d' % i for i in range(X.shape[1])])
    y = pandas.DataFrame(y, columns=['y'])
    transformer = VariableTransformer(dict(y=Log(Identity('y'))))
    model = ResponseTransformingEstimator(Earth(), transformer)
    model.fit(X, y)
    log_y_pred = model.predict(X)
    # Predictions are on the log scale: high R2 against log(y), low against y
    assert r2_score(log_y, log_y_pred) > .8
    assert r2_score(y, log_y_pred) < .1
def answer_two():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import r2_score

    r2_train, r2_test = [], []
    X_train_data = X_train.reshape(-1, 1)
    X_test_data = X_test.reshape(-1, 1)
    for degree in range(10):
        poly = PolynomialFeatures(degree=degree)
        X_poly = poly.fit_transform(X_train_data)
        X_test_poly = poly.transform(X_test_data)
        linreg = LinearRegression().fit(X_poly, y_train)
        r2_train.append(r2_score(y_train, linreg.predict(X_poly)))
        r2_test.append(r2_score(y_test, linreg.predict(X_test_poly)))
    return r2_train, r2_test
def answer_four():
    degree = 12
    poly = PolynomialFeatures(degree=degree)
    x_fit = poly.fit_transform(X_train.reshape(-1, 1))
    x_test_fit = poly.transform(X_test.reshape(-1, 1))

    model = LinearRegression()
    model.fit(x_fit, y_train)
    y_linear_pred = model.predict(x_test_fit)
    r_linear = r2_score(y_test, y_linear_pred)

    lasso = Lasso(alpha=0.01, max_iter=10000)
    lasso.fit(x_fit, y_train)
    y_lasso_pred = lasso.predict(x_test_fit)
    r_lasso = r2_score(y_test, y_lasso_pred)

    return (r_linear, r_lasso)
def main():
    data = get_data_without_cols(2)
    X_train, y_train, X_test, y_test = data
    model = RandomForestRegressor(
        n_estimators=100,
        n_jobs=-1,
        random_state=1,
        criterion="mse",
        max_features="sqrt",
        # min_samples_split=3,
        max_depth=9,
        # min_samples_leaf=1,
        # min_weight_fraction_leaf=0,
        # min_impurity_decrease=0,
    )
    model.fit(X_train, y_train.iloc[:, 0])
    y_pred = model.predict(X_test).reshape(-1, 1)
    r2 = r2_score(y_test, y_pred)
    m_e = mean_absolute_error(y_test, y_pred)
    mean = y_test.mean(axis=0).mean()
    mean_pred = y_pred.mean(axis=0).mean()
    print("{} | mean error : {} | mean {} / {}".format(
        round(r2, 3),
        round(m_e, 3),
        round(mean_pred, 3),
        round(mean, 3),
    ))
def answer_four():
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import Lasso, LinearRegression
    from sklearn.metrics import r2_score

    poly = PolynomialFeatures(12)
    X_train_new = poly.fit_transform(X_train.reshape(X_train.shape[0], 1))
    X_test_new = poly.transform(X_test.reshape(X_test.shape[0], 1))

    lr = LinearRegression()
    lr.fit(X_train_new, y_train)
    lr_r2 = r2_score(y_test, lr.predict(X_test_new))

    lasso = Lasso(alpha=0.01, max_iter=10000)
    lasso.fit(X_train_new, y_train)
    ls_r2 = r2_score(y_test, lasso.predict(X_test_new))

    return (lr_r2, ls_r2)
def answer_two():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import r2_score

    results_train = np.zeros(10)
    results_test = np.zeros(10)
    for i in range(10):
        poly = PolynomialFeatures(degree=i)
        X_train_poly = poly.fit_transform(X_train.reshape(-1, 1))
        X_test_poly = poly.transform(X_test.reshape(-1, 1))
        linreg = LinearRegression().fit(X_train_poly, y_train)
        results_train[i] = r2_score(y_train, linreg.predict(X_train_poly))
        results_test[i] = r2_score(y_test, linreg.predict(X_test_poly))
    return (results_train, results_test)
def plotScatterPlot(actual, predicted, outFileName):
    'Make a scatter plot of model-predicted vs. QM-calculated H2 binding energy'
    plt.scatter(actual, predicted, s=7, color='#4b9da6')
    axes = plt.gca()
    # make plot square with equal x and y axes
    bounds = [min(list(actual) + list(predicted) + [0]) - 1,
              max(list(actual) + list(predicted)) + 1]
    plt.axis(bounds * 2)
    axes.set_aspect('equal', adjustable='box')
    # plot the identity line for visual reference (10% darker than the data)
    plt.plot([bounds[0], bounds[1]], [bounds[0], bounds[1]], color='#d95d41')
    rSquared = r2_score(actual, predicted)
    print(rSquared)
    plt.figtext(0.6, 0.15, '$R^2 = $' + format(rSquared, '.4f'), fontsize=11)
    plt.xlabel(r'QM Calculated $\mathregular{H_{2}}$ Binding Energy', fontsize=10)
    plt.ylabel(r'Model Predicted $\mathregular{H_{2}}$ Binding Energy', fontsize=10)
    plt.title(r'Model Predicted vs. QM Calculated $\mathregular{H_{2}}$ Binding',
              fontsize=12)
    plt.tight_layout()
    plt.savefig(str(outFileName) + '.png')
    plt.clf()
def test_adjusted_r2_score(self):
    y_true = [1, 2, 3, 4, 5]
    y_pred = [1, 3, 2, 5, 4]
    r2 = r2_score(y_true, y_pred)
    assert r2 == 0.6
    for df in range(4):
        n = len(y_true)
        f = (n - 1) / (n - df - 1)
        expected = 1 - (1 - r2) * f
        self.assertAlmostEqual(
            adjusted_r2_score(y_true, y_pred, MockModel(coef_=[1] * df)),
            expected, places=3, msg=f'failed for df={df}')
    # if the degrees of freedom get too large, the score is clipped to 0
    self.assertAlmostEqual(
        adjusted_r2_score(y_true, y_pred, MockModel(coef_=[1] * 4)),
        0, places=3, msg='failed for df=4')
    self.assertAlmostEqual(
        adjusted_r2_score(y_true, y_pred, MockModel(coef_=[1] * 5)),
        0, places=3, msg='failed for df=5')
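# The adjusted_r2_score under test is not shown in this section. As a point of
# reference, here is a minimal sketch consistent with the expectations above,
# assuming the degrees of freedom are taken as len(model.coef_) and the score
# is clipped to 0 once the correction factor becomes undefined. The name
# adjusted_r2_score_sketch is hypothetical; this is not the code under test.
from sklearn.metrics import r2_score

def adjusted_r2_score_sketch(y_true, y_pred, model):
    n = len(y_true)
    df = len(model.coef_)  # degrees of freedom ~ number of fitted coefficients (assumption)
    if df >= n - 1:
        # correction factor (n - 1) / (n - df - 1) is undefined or negative: clip to 0
        return 0.0
    r2 = r2_score(y_true, y_pred)
    return 1 - (1 - r2) * (n - 1) / (n - df - 1)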
def score(self, X, y, method="r2", verbose=False):
    """
    Produce a multi-step prediction of y and compute the chosen metric
    against y. NaNs are ignored when computing the metric.

    :param array-like X: exogenous input time series, shape = (n_samples, n_exog_inputs)
    :param array-like y: target time series to predict, shape = (n_samples)
    :param string method: either "r2" (R squared) or "mse" (mean squared error).
    :return: prediction metric, computed with NaNs ignored.
    """
    ypred = self.predict(X, y)
    mask = np.isnan(y) | np.isnan(ypred)
    if verbose:
        print('Evaluating {} score, {} of {} data points are evaluated.'.format(
            method, np.sum(~mask), y.shape[0]))
    if method == "r2":
        return r2_score(y[~mask], ypred[~mask])
    elif method == "mse":
        return mean_squared_error(y[~mask], ypred[~mask])
    else:
        raise ValueError('{} method is not supported. Please choose from '
                         '"r2" or "mse".'.format(method))
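# The NaN masking used by score() above is easy to reproduce standalone; a
# minimal sketch with made-up data (the values here are illustrative only):
import numpy as np
from sklearn.metrics import r2_score

y = np.array([1.0, 2.0, np.nan, 4.0, 5.0])
ypred = np.array([1.1, np.nan, 3.0, 3.9, 5.2])
mask = np.isnan(y) | np.isnan(ypred)      # drop positions where either series is NaN
print(r2_score(y[~mask], ypred[~mask]))   # R2 over the three valid points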
def log_r2_score(y_true, y_pred, sample_weight=None, multioutput="uniform_average"):
    """R squared on the log of the prediction.

    Parameters
    ----------
    y_true : array-like of shape = (n_samples) or (n_samples, n_outputs)
        Ground truth (correct) target values.
    y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs)
        Estimated target values.
    sample_weight : array-like of shape = (n_samples), optional
        Sample weights.
    multioutput : string in ['raw_values', 'uniform_average'] \
            or array-like of shape = (n_outputs)
        Defines aggregating of multiple output values. Array-like value
        defines weights used to average errors.
        'raw_values' : Returns a full set of errors when the input is of
        multioutput format.
        'uniform_average' : Errors of all outputs are averaged with uniform
        weight.
    """
    y_type, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred,
                                                             multioutput)
    if (y_true < 0).any() or (y_pred < 0).any():
        raise ValueError("Log R squared cannot be used when "
                         "targets contain negative values.")
    return r2_score(np.log(y_true + 1), np.log(y_pred + 1), sample_weight,
                    multioutput)
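# Since log_r2_score is just r2_score applied to log1p-transformed values, its
# output can be sanity-checked directly; a small usage sketch with made-up data:
import numpy as np
from sklearn.metrics import r2_score

y_true = np.array([1.0, 10.0, 100.0, 1000.0])
y_pred = np.array([2.0, 8.0, 120.0, 900.0])
# Equivalent manual computation: R2 on log(1 + y)
manual = r2_score(np.log1p(y_true), np.log1p(y_pred))
print(manual)  # should match log_r2_score(y_true, y_pred)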
def get_scores(y_true, y_pred):
    brier_score = brier_score_loss(y_true, y_pred)
    log_score = log_loss(y_true, y_pred)
    roc_score = roc_auc_score(y_true, y_pred)
    pr_score = average_precision_score(y_true, y_pred)
    r2score = r2_score(y_true, y_pred)
    return math.sqrt(brier_score), log_score, roc_score, pr_score, r2score
def Hyd(actual, predicted, nlmos, slope, yint, outFileName):
    'Make a scatter plot of QM-calculated pKa vs. Co-H NLMO energy, with the fitted line'
    xmin = -0.17
    xmax = -0.38
    ymin = [xmin * slope + yint]
    ymax = [xmax * slope + yint]
    print(ymin, ymax)
    plt.scatter(nlmos, actual, s=7, color='#4b9da6')
    axes = plt.gca()
    # bounds = [min(list(actual) + list(predicted) + [0]) - 1,
    #           max(list(actual) + list(predicted)) + 1]
    boundsx = [min(list(nlmos)), max(list(nlmos))]
    boundsy = [max(list(actual)), min(list(actual))]
    print(boundsx[0], boundsx[1])
    # plt.axis(boundsy * 2)
    axes.set_aspect(aspect=1 / 280, adjustable='box')
    # plt.axis('tight')
    # plot the fitted line for visual reference (10% darker than the data)
    plt.plot([xmin, xmax], [ymin, ymax], color='#d95d41')
    rSquared = r2_score(actual, predicted)
    plt.figtext(0.4, 0.15, '$R^2 = $' + format(rSquared, '.3f'), fontsize=12)
    plt.ylabel(r'QM Calculated $\mathregular{pK_{a}}$', fontsize=14)
    plt.xlabel('Co-H NLMO Energy (eV)', fontsize=14)
    plt.title(r'QM Calculated $\mathregular{pK_{a}}$ vs. Co-H NLMO Energy',
              fontsize=14)
    plt.tight_layout()
    plt.savefig(str(outFileName) + '.png')
    plt.clf()
def answer_four():
    X_tr = X_train.reshape(-1, 1)
    X_tt = X_test.reshape(-1, 1)
    poly = PolynomialFeatures(degree=12)
    X_tr_poly = poly.fit_transform(X_tr)
    X_tt_poly = poly.transform(X_tt)
    linreg = LinearRegression().fit(X_tr_poly, y_train)
    lasso = Lasso(alpha=0.01, max_iter=10000).fit(X_tr_poly, y_train)
    LinearRegression_R2_test_score = r2_score(y_test, linreg.predict(X_tt_poly))
    Lasso_R2_test_score = r2_score(y_test, lasso.predict(X_tt_poly))
    return LinearRegression_R2_test_score, Lasso_R2_test_score
def get_scores(shots):
    y_true = [shot.result for shot in shots]
    y_pred = [shot.pred for shot in shots]
    brier_score = brier_score_loss(y_true, y_pred)
    log_score = log_loss(y_true, y_pred)
    roc_score = roc_auc_score(y_true, y_pred)
    pr_score = average_precision_score(y_true, y_pred)
    r2score = r2_score(y_true, y_pred)
    return math.sqrt(brier_score), log_score, roc_score, pr_score, r2score
def score(self, X, y, sample_weight=None):
    from commonml.skchainer.classifier import Classifier
    from commonml.skchainer.regressor import Regressor
    if isinstance(self.model, Classifier):
        from sklearn.metrics import accuracy_score
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
    elif isinstance(self.model, Regressor):
        from sklearn.metrics import r2_score
        return r2_score(y, self.predict(X), sample_weight=sample_weight,
                        multioutput='variance_weighted')
    else:
        raise ValueError('Unsupported model.')
def calc_error_metrics(self):
    # Log loss, aka logistic loss or cross-entropy loss
    self.scores['LogLoss'] = log_loss(self.y_true, self.y_pred)
    # Mean squared error
    self.scores['Mean Squared Error'] = mean_squared_error(self.y_true, self.y_pred)
    # Mean absolute error
    self.scores['Mean Absolute Error'] = mean_absolute_error(self.y_true, self.y_pred)
    # R^2 (coefficient of determination) regression score: indicates how well
    # the data fit the statistical model
    self.scores['R2 Score'] = r2_score(self.y_true, self.y_pred)
    # TBD: compute the log loss to consider booleans
    return
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from dbn.tensorflow import SupervisedDBNRegression

# Loading dataset
boston = load_boston()
X, Y = boston.data, boston.target

# Splitting data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Data scaling
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)

# Training
regressor = SupervisedDBNRegression(hidden_layers_structure=[100],
                                    learning_rate_rbm=0.01,
                                    learning_rate=0.01,
                                    n_epochs_rbm=20,
                                    n_iter_backprop=200,
                                    batch_size=16,
                                    activation_function='relu')
regressor.fit(X_train, Y_train)

# Test
X_test = min_max_scaler.transform(X_test)
Y_pred = regressor.predict(X_test)
print('Done.\nR-squared: %f\nMSE: %f' % (r2_score(Y_test, Y_pred),
                                         mean_squared_error(Y_test, Y_pred)))
def evaluateAndPredict(estimator, opt, output, evaluate=True, predict=False,
                       model_output=None, features_output=None):
    # Load data
    print("loading data")
    d = load_datasets(
        opt["subchallenge"],
        opt["final"],
        opt["filterQA"],
        opt["use_mut"],
        opt["use_CNV"],
        opt["use_exp"],
        opt["use_methyl"],
        opt["use_drug_info"],
        opt["use_cell_info"],
    )
    saveObserved(d)

    if opt["use_drug_info"]:
        # Get split datasets; data is a list containing 3 (X, y, leaderboard) sets
        data = [
            (d["comb_train_input_None"], d["comb_train_output_None"], d["leaderboard_None"]),
            (d["comb_train_input_One"], d["comb_train_output_One"], d["leaderboard_One"]),
            (d["comb_train_input_Both"], d["comb_train_output_Both"], d["leaderboard_Both"]),
        ]
    else:
        data = [(d["comb_train_input"], d["comb_train_output"], d["leaderboard"])]

    if path.exists(output):
        remove(output)
    f = open(output, "a")  # append mode
    f.write("Estimator:\t" + estimator[0] + "\n\n")
    f.write("Options:\n")
    for key in opt:
        f.write(key + "\t" + str(opt[key]) + "\n")
    f.write("\n")

    iters_cv_pred = []
    iters_predictions = []
    iters_confidence = []
    feature_lists = []
    for i in range(len(data)):
        if len(data) > 1:
            f.write("Model " + str(i + 1) + "\n")

        # Preprocess data
        print("preprocessing data")
        X_train, LB, feature_names = preprocess(
            data[i][0],
            data[i][2],
            d,
            opt["use_mut"],
            opt["use_CNV"],
            opt["use_exp"],
            opt["exp_threshold"],
            opt["use_methyl"],
            opt["use_cell_info"],
            opt["scale"],
        )
        y_train = data[i][1].values

        # Feature selection (if using SelectKBest)
        if opt["selection_method"] == "kbest":
            print("feature selection")
            if opt["num_features"] > len(feature_names):
                selected = featureSelection(X_train, y_train, opt["selection_method"],
                                            estimator[1], "all")
            else:
                selected = featureSelection(X_train, y_train, opt["selection_method"],
                                            estimator[1], opt["num_features"])
            X_train = X_train[:, selected]
            LB = LB[:, selected]

        # Optimize parameters (only does the actual optimization if the
        # parameters dict is not empty) and write the best parameters to file
        opt_est = optParameters(estimator[1], X_train, y_train, estimator[2], data[i][0])
        f.write("Best parameters:\n" + str(opt_est[1]) + "\n\n")

        # Feature selection (if not using SelectKBest)
        if opt["selection_method"] != "kbest":
            selected = featureSelection(X_train, y_train, opt["selection_method"],
                                        opt_est[0], opt["num_features"])
            X_train = X_train[:, selected]
            LB = LB[:, selected]

        feature_lists.append([feature for (feature, mask)
                              in zip(feature_names, selected) if mask])

        if evaluate:
            print("evaluating model")
            # Model evaluation
            predictions, confidence = evalModel(opt_est[0], X_train, y_train, 5,
                                                opt["cv_iterations"])
            iters_cv_pred.append(predictions)
            iters_confidence.append(confidence)

        if predict:
            print("predicting")
            # Predict leaderboard/test output
            opt_est[0].fit(X_train, y_train)
            iters_predictions.append(opt_est[0].predict(LB))

    if features_output is not None:
        saveFeatures(feature_lists, features_output)

    # Join dataset subsets and calculate scores/save to file
    if evaluate:
        print("calculating scores")
        if opt["use_drug_info"]:
            cv_predictions = pd.Series(data=iters_cv_pred[0], name="PREDICTION",
                                       index=d["comb_train_input_None"].index).append(
                pd.Series(data=iters_cv_pred[1], name="PREDICTION",
                          index=d["comb_train_input_One"].index))
            cv_predictions = cv_predictions.append(
                pd.Series(data=iters_cv_pred[2], name="PREDICTION",
                          index=d["comb_train_input_Both"].index))
            confidence = pd.concat(
                [
                    pd.Series(data=iters_confidence[0], name="CONFIDENCE",
                              index=d["comb_train_input_None"].index),
                    pd.Series(data=iters_confidence[1], name="CONFIDENCE",
                              index=d["comb_train_input_One"].index),
                    pd.Series(data=iters_confidence[2], name="CONFIDENCE",
                              index=d["comb_train_input_Both"].index),
                ],
                axis=0,
            )
        else:
            if estimator[0] == "PLS":
                iters_cv_pred[0] = [float(v) for v in iters_cv_pred[0]]
                iters_confidence[0] = [float(v) for v in iters_confidence[0]]
            cv_predictions = pd.Series(data=iters_cv_pred[0], name="PREDICTION",
                                       index=d["comb_train_input"].index)
            confidence = pd.Series(data=iters_confidence[0], name="CONFIDENCE",
                                   index=d["comb_train_input"].index)
        savePredictions(cv_predictions, d, CV=True)
        saveConfidence(confidence, d)

        # Calculate scores and write to file
        r2 = r2_score(d["comb_train_output"].values, cv_predictions.values)
        challenge_performance = drugCombiScore(d["comb_train_output"].values,
                                               cv_predictions.values, d)
        challenge_global = globalScore(d["comb_train_output"].values,
                                       cv_predictions.values, d)
        print(challenge_performance)
        print(challenge_global)
        f.write("Model Evaluation\n")
        f.write("R2:\t" + str(r2) + "\n")
        f.write("Performance score:\t" + str(challenge_performance[0]) + "\n")
        f.write("Standard error:\t" + str(challenge_performance[1]) + "\n")
        f.write("Global score:\t" + str(challenge_global[0]) + "\n")
        f.write("Primary metric:\t" + str(challenge_global[1]) + "\n")
        f.write("Tie-breaking metric:\t" + str(challenge_global[2]) + "\n")

    if predict:
        if opt["use_drug_info"]:
            predictions = pd.concat(
                [
                    pd.Series(data=iters_predictions[0], name="PREDICTION",
                              index=d["leaderboard_None"].index),
                    pd.Series(data=iters_predictions[1], index=d["leaderboard_One"].index),
                    pd.Series(data=iters_predictions[2], index=d["leaderboard_Both"].index),
                ],
                axis=0,
            )
        else:
            predictions = pd.Series(data=iters_predictions[0], name="PREDICTION",
                                    index=d["leaderboard"].index)
        savePredictions(predictions, d, CV=False)

    # Save model
    if model_output is not None:
        joblib.dump(estimator, model_output)

    f.close()
def validate(y_true, y_pred):
    print('Kolmogorov-Smirnov test = ', ks_2samp(y_true, y_pred))
    print('mean_squared_error = ', mean_squared_error(y_true, y_pred))
    print('mean_absolute_error = ', mean_absolute_error(y_true, y_pred))
    print('r2_score = ', r2_score(y_true, y_pred))

    # TBD: compute the log loss to consider booleans
    # Log loss, aka logistic loss or cross-entropy loss
    print('log_loss = ' + str(log_loss(y_true, y_pred)))

    # Compute precision-recall pairs for different probability thresholds
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    # print('precision = ' + str(precision))
    # print('recall = ' + str(recall))
    # print('thresholds = ' + str(thresholds))

    # Compute average precision (AP) from prediction scores
    average_precision = average_precision_score(y_true, y_pred)
    print('average_precision_score = ', average_precision)

    ##########################################################################
    # Plot the precision-recall curve
    plt.figure()
    plt.plot(precision, recall, label='AUC = %0.2f' % average_precision)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower right")
    plt.show()
    ##########################################################################

    # Compute the receiver operating characteristic (ROC)
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    print('fpr = ' + str(fpr))
    print('tpr = ' + str(tpr))
    print('thresholds = ' + str(thresholds))
    # Compute area under the curve (AUC) from prediction scores
    print('roc_auc_score = ' + str(roc_auc_score(y_true, y_pred)))
    roc_auc = auc(fpr, tpr)

    ##########################################################################
    # Plot the ROC curve
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()
    ##########################################################################

    # Other metrics that could be reported here:
    # pearsonr(y_true, y_pred), spearmanr(y_true, y_pred)
    # matthews_corrcoef(y_true, y_pred)         # Matthews correlation coefficient (binary classes)
    # confusion_matrix(y_true, y_pred)          # confusion matrix for classification accuracy
    # accuracy_score(y_true, y_pred)            # accuracy classification score
    # classification_report(y_true, y_pred)     # text report of the main classification metrics
    # f1_score(y_true, y_pred)                  # F1 score (balanced F-score / F-measure)
    # fbeta_score(y_true, y_pred, beta=...)     # F-beta score
    # hamming_loss(y_true, y_pred)              # average Hamming loss
    # jaccard_similarity_score(y_true, y_pred)  # Jaccard similarity coefficient
    # precision_recall_fscore_support(y_true, y_pred)
    # precision_score(y_true, y_pred), recall_score(y_true, y_pred)
    # zero_one_loss(y_true, y_pred)             # zero-one classification loss
    # hinge_loss(y_true, y_pred)                # average hinge loss (non-regularized)
    return