def test_posterior(self):
    """Verify that `_posterior` yields the expected weight mean and covariance."""
    expected_mean = np.array([6.103885e-03, 3.750334e-08, 6.666294e-01])
    expected_cov = np.array([
        [9.997764e-01, -1.373791e-09, -6.103885e-03],
        [-1.373791e-09, 1.000000e+00, -3.750334e-08],
        [-6.103885e-03, -3.750334e-08, 3.333706e-01],
    ])

    model = RVR()
    # phi is produced by the model's own kernel helper from two 2x2 inputs.
    model.phi = model._apply_kernel(
        np.array([[1, 2], [3, 4]]),
        np.array([[5, 6], [7, 8]]),
    )
    # Hand-set the remaining state before `_posterior` is invoked.
    model.alpha_ = np.ones(3)
    model.m_ = np.ones(3)
    model.beta_ = 1
    model.y = np.array([1, 1])

    model._posterior()

    np.testing.assert_allclose(model.m_, expected_mean)
    np.testing.assert_allclose(model.sigma_, expected_cov)
def test_predict(self):
    """Check `predict` against hand-set relevance vectors and weights."""
    model = RVR(kernel='linear', bias_used=False)
    # Skip fitting: inject the fitted attributes directly.
    model.relevance_ = np.array([[1, 1]])
    model.m_ = np.array([1])

    sample = np.array([1, 1])
    result = model.predict(sample)

    self.assertEqual(result, 2)
def __init__(self, HI):
    """Create a linear-kernel RVR and fit it to `HI` against its sample indices.

    Args:
        HI: array with a ``shape`` attribute and ``reshape`` method. When its
            first dimension is 1 it is reshaped to ``(1, 1)`` before fitting.
    """
    if HI.shape[0] == 1:
        HI = HI.reshape(1, 1)
        n_cols = HI.shape[0]
    else:
        n_cols = HI.shape[1]

    # Regressor input is simply 0..len(HI)-1, laid out as (len(HI), n_cols).
    n_rows = len(HI)
    timesteps = np.array(list(range(n_rows))).reshape(n_rows, n_cols)

    self.rvrmodel = RVR(kernel='linear')
    self.optimize(timesteps, HI)
def test_fit(self):
    """Check that `fit` yields the expected relevance vectors and weights."""
    model = RVR(kernel='linear', threshold_alpha=1e3, verbose=True)
    features = np.array([[1], [2], [3]])
    clean_targets = np.array([1, 2, 3])

    # Fixed seed keeps the added noise, and so the expected weights, reproducible.
    np.random.seed(1)
    noisy_targets = clean_targets + 0.1 * np.random.randn(clean_targets.shape[0])

    model.fit(features, noisy_targets)

    expected_weights = np.array([0.065906, 0.131813, 0.197719, 0.159155])
    np.testing.assert_array_equal(model.relevance_, features)
    np.testing.assert_allclose(model.m_, expected_weights, rtol=1e-3)
def test_regression_sinc(self):
    """Fit noisy y=sinc(x) samples and check score, weights and one prediction."""
    expected_weights = [
        1.117655e+00, -6.334513e-01, 5.868671e-01, -4.370936e-01,
        2.320311e-01, -4.638864e-05, -7.505325e-02, 6.133291e-02,
    ]

    model = RVR()
    grid = np.linspace(0, 10, 101)
    clean = np.sinc(grid)
    # Fixed seed keeps the noise, and so the expected values, reproducible.
    np.random.seed(1)
    noisy = clean + 0.1 * np.random.randn(clean.shape[0])
    features = grid[:, np.newaxis]

    model.fit(features, noisy)
    fit_score = model.score(features, noisy)

    self.assertGreater(fit_score, 0.85)
    np.testing.assert_allclose(model.m_, expected_weights, rtol=1e-3)
    self.assertEqual(model.relevance_.shape, (8, 1))

    query = np.array([[0.5]])
    prediction, mse = model.predict(query, eval_MSE=True)
    self.assertAlmostEqual(prediction[0], 0.611, places=3)
    self.assertAlmostEqual(mse[0], 0.00930, places=5)
def test_regression_linear_noise(self):
    """Fit y = x + 5 with added Gaussian noise and check the fitted model."""
    model = RVR(kernel='linear', alpha=1e11)
    inputs = np.arange(1, 101)
    clean = inputs + 5
    # Fixed seed keeps the noise, and so the expected values, reproducible.
    np.random.seed(1)
    noisy = clean + 0.1 * np.random.randn(clean.shape[0])
    features = inputs[:, np.newaxis]

    model.fit(features, noisy)
    fit_score = model.score(features, noisy)

    expected_weights = np.array([1, 5])
    expected_relevance = np.array([[1]])
    self.assertGreater(fit_score, 0.99)
    np.testing.assert_allclose(model.m_, expected_weights, rtol=1e-2)
    np.testing.assert_allclose(model.relevance_, expected_relevance)
    self.assertAlmostEqual(model.beta_, 126.583, places=3)

    query = np.array([[50]])
    prediction, mse = model.predict(query, eval_MSE=True)
    self.assertAlmostEqual(prediction[0], 55.006, places=3)
    self.assertAlmostEqual(mse[0], 0.00798, places=5)
class RVRDegradationModel:
    """Wrap a linear-kernel RVR that is fitted to `HI` against its sample indices."""

    def __init__(self, HI):
        """Build the regressor and fit it to `HI`.

        Args:
            HI: array with a ``shape`` attribute and ``reshape`` method. When
                its first dimension is 1 it is reshaped to ``(1, 1)`` first.
        """
        if HI.shape[0] == 1:
            HI = HI.reshape(1, 1)
            n_cols = HI.shape[0]
        else:
            n_cols = HI.shape[1]

        # Regressor input is simply 0..len(HI)-1, laid out as (len(HI), n_cols).
        n_rows = len(HI)
        timesteps = np.array(list(range(n_rows))).reshape(n_rows, n_cols)

        self.rvrmodel = RVR(kernel='linear')
        self.optimize(timesteps, HI)

    def optimize(self, X, Y):
        """Fit the wrapped model on ``(X, Y)``."""
        self.rvrmodel.fit(X, Y)

    def update(self, X, Y):
        """Refit on ``(X, Y)``; delegates to `optimize`."""
        self.optimize(X, Y)

    def predict(self, X):
        """Return the wrapped model's prediction for `X`, printing it as well."""
        predicted = self.rvrmodel.predict(X)
        print(predicted)
        return predicted
def rvr_pipeline(x, y, pca, kernel, x_p=0, y_p=0, fold=10, seed=2019,
                 predict_data=False):
    """Cross-validate a PCA -> scaler -> RVR chain and optionally score held-out data.

    Args:
        x, y: samples and targets, indexable by the index arrays KFold yields.
        pca: transformer exposing ``fit``/``transform``; it is refitted in place.
        kernel: kernel name forwarded to ``RVR``.
        x_p, y_p: held-out samples and targets, only read when `predict_data`.
        fold: number of shuffled KFold splits.
        seed: random state for the KFold shuffle.
        predict_data: when true, refit on all of ``(x, y)`` and predict `x_p`.

    Returns:
        The most recent prediction array: the held-out prediction when
        `predict_data` is true, otherwise the last fold's prediction.

    Relies on a ``scaler`` object defined outside this function.
    """

    def _reduce_and_scale(fit_rows, other_rows):
        # PCA and scaler are both fitted on `fit_rows` only, then applied to both sets.
        pca.fit(fit_rows)
        reduced_fit = pca.transform(fit_rows)
        reduced_other = pca.transform(other_rows)
        scaler.fit(reduced_fit)
        return scaler.transform(reduced_fit), scaler.transform(reduced_other)

    rvr = RVR(kernel=kernel)
    splitter = KFold(n_splits=fold, shuffle=True, random_state=seed)
    score = np.zeros((fold,))

    for fold_no, (train, test) in enumerate(splitter.split(x, y), start=1):
        started = time.time()
        new_train, new_test = _reduce_and_scale(x[train], x[test])
        rvr.fit(new_train, y[train])
        pred = rvr.predict(new_test)
        abs_err = abs(pred - y[test])
        score[fold_no - 1] = sum(abs_err) / abs_err.shape[0]
        finished = time.time()
        print('fold ' + str(fold_no) + ':', finished - started, 'sec')

    print('=' * 40)
    print('MAE:', np.mean(score))

    if predict_data:
        new_train, new_test = _reduce_and_scale(x, x_p)
        rvr.fit(new_train, y)
        pred = rvr.predict(new_test)
        error = abs(pred - y_p)
        print('Test MAE:', sum(error) / error.shape[0])

    return pred
def test_regression_linear(self):
    """Fit the noise-free line y = x + 5 and check weights and one prediction."""
    model = RVR(kernel='linear', alpha=1e11)
    inputs = np.arange(1, 100)
    targets = inputs + 5
    features = inputs[:, np.newaxis]

    model.fit(features, targets)
    fit_score = model.score(features, targets)

    expected_weights = np.array([1, 5])
    self.assertGreater(fit_score, 0.99)
    np.testing.assert_allclose(model.m_, expected_weights)

    query = np.array([[50]])
    prediction, mse = model.predict(query, eval_MSE=True)
    self.assertAlmostEqual(prediction[0], 55, places=3)
    self.assertAlmostEqual(mse[0], 6.18e-6, places=3)
from skrvm import RVR
from skrvm import RVC
from sklearn.datasets import load_iris

# Minimal regression example: two 2-D training points and their targets.
X = [[0, 0], [2, 2]]
y = [0.5, 2.5]

# Alternative kernels that can be swapped in here: 'rbf', 'poly'.
clf = RVR(kernel='linear')
clf.fit(X, y)
# The bare `RVR(alpha=1e-06, ...)` expression that used to follow `fit` was a
# pasted estimator repr; it built an unused object and has been dropped.
print(clf.predict([[1, 1]]))

# Classification counterpart, left disabled:
# clf = RVC()
# clf.fit(load_iris().data, load_iris().target)
def benchmark():
    """Fit GPR, skrvm RVR and sklearn_rvm EMRVR on generated data and plot them pairwise."""
    # Any integer value between 1 and 3 to select the number of subplots to show:
    num_figures = 2

    # Parameters to generate training data
    num_samples = 100
    noise_level = 0.1
    training_data_range = 10
    X, y = generate_training_data(num_samples, noise_level, training_data_range)

    # Fit the three regressors on the same training data.
    gpr = GaussianProcessRegressor(kernel=RBF() + WhiteKernel())
    gpr.fit(X, y)
    rvr = RVR(kernel='rbf')  # RVR as implemented by skrvm
    rvr.fit(X, y)
    # RVR as implemented by sklearn_rvm.
    # Caveat: Since sklearn v.0.22, the default value of gamma changed from 'auto' to 'scale'.
    # Reference: https://github.com/Mind-the-Pineapple/sklearn-rvm/issues/9
    emrvr = EMRVR(kernel='rbf', gamma='auto')
    emrvr.fit(X, y)

    # Predict on a dense grid spanning the plot limits.
    plot_params = get_plot_params()
    X_plot = np.linspace(plot_params['x_low'], plot_params['x_high'], 10000)[:, None]
    # Caveat: generating the variance of the predictive distribution takes
    # considerably longer than just predicting the mean.
    # Reference:
    # https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_compare_gpr_krr.html
    y_gpr, y_gpr_std = gpr.predict(X_plot, return_std=True)
    y_rvr = rvr.predict(X_plot)
    y_rvr_std = None  # no predictive std is requested from the skrvm model
    y_emrvr, y_emrvr_std = emrvr.predict(X_plot, return_std=True)

    # Plot
    _fig, axs = plt.subplots(num_figures, 1, figsize=(15, 7))
    try:
        # With strictly more than one subplot, axs already supports len().
        num_sub_plots = len(axs)
    except TypeError:
        # With exactly one subplot axs is a single object; wrap it so the
        # indexing code below works unchanged.
        axs = [axs]
        num_sub_plots = len(axs)
    print('Plotting {} subplots.'.format(num_sub_plots))

    # One entry per subplot, in display order; only the first len(axs) are drawn.
    comparisons = [
        (emrvr, gpr, y_emrvr, y_gpr, "sklearn_rvm", "GPR",
         y_emrvr_std, y_gpr_std, 'navy', 'darkorange'),
        (emrvr, rvr, y_emrvr, y_rvr, "sklearn_rvm", "skrvm",
         y_emrvr_std, y_rvr_std, 'navy', 'purple'),
        (rvr, gpr, y_rvr, y_gpr, "skrvm", "GPR",
         y_rvr_std, y_gpr_std, 'purple', 'darkorange'),
    ]
    for ax, spec in zip(axs, comparisons):
        (first_model, second_model, first_pred, second_pred, first_label,
         second_label, first_std, second_std, first_color, second_color) = spec
        plot_results(X, y, first_model, second_model, X_plot, first_pred,
                     second_pred, first_label, second_label, first_std,
                     second_std, rvr_color=first_color, gpr_color=second_color,
                     training_data_range=training_data_range, ax=ax)

    plt.show()
def final_train(x, y, x_test, y_test, out_list, mn, age_group_all):
    """Train the final model of family `mn` and evaluate it on the test split.

    For the grid-searched families the hyper-parameter is fixed to the median
    of the values recorded in `out_list`, and the reported score is the mean
    of the recorded scores.

    Args:
        x, y: training samples and targets.
        x_test, y_test: evaluation samples and targets, forwarded to `predict`.
        out_list: earlier results; ``item[5]`` is read as a score and
            ``item[6]`` as a parameter dict. Only read for 'LAD', 'RFR',
            'PLSR' and 'RR'.
        mn: model name, one of 'LAD', 'RFR', 'PLSR', 'RR', 'RVM', 'COMB'.
        age_group_all: iterable of target-value groups, only used for 'COMB'.

    Returns:
        ``(model, best_score, pred_var)`` where `pred_var` is whatever the
        external `predict` helper returns.

    Raises:
        ValueError: if `mn` is not one of the supported names.
    """
    # Each branch imports what it needs itself. A function-scope import binds a
    # *local* name for the whole function, so importing LAD and
    # RandomForestRegressor only in the COMB branch (as this code used to do)
    # made the 'LAD' and 'RFR' branches fail with UnboundLocalError.
    model = []
    best_score = []
    if mn == 'LAD':
        from mord import LAD
        print(out_list)
        [C_list, score_list] = zip(*[(item[6]['C'], item[5]) for item in out_list])
        C_final = np.median(C_list)
        best_score = np.mean(score_list)
        print('in final LAD')
        print('para', C_list, C_final, 'score', score_list, best_score)
        model = LAD(epsilon=0.0, tol=0.0001, C=C_final,
                    loss='epsilon_insensitive', fit_intercept=True,
                    intercept_scaling=1.0, dual=True, verbose=0,
                    random_state=None, max_iter=10000)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)
    elif mn == 'RFR':
        from sklearn.ensemble import RandomForestRegressor
        [n_est_list, score_list] = zip(
            *[(item[6]['n_estimators'], item[5]) for item in out_list])
        n_est = int(np.median(n_est_list))
        best_score = np.mean(score_list)
        print('in final RFR')
        print('n_est_list', n_est_list, n_est, 'score', score_list, best_score)
        rfr = RandomForestRegressor(criterion='mse')
        params = {"n_estimators": [n_est]}
        model = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)
    elif mn == 'PLSR':
        from sklearn.cross_decomposition import PLSRegression
        [n_comp_list, score_list] = zip(
            *[(item[6]['n_components'], item[5]) for item in out_list])
        n_comp = int(np.median(n_comp_list))
        best_score = np.mean(score_list)
        print('in final PLSR')
        print('n_comp_list', n_comp_list, n_comp, 'score', score_list, best_score)
        pls_reg = PLSRegression()
        params = {'n_components': [n_comp]}
        model = GridSearchCV(pls_reg, param_grid=params, cv=5, verbose=0)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)
    elif mn == 'RR':
        from sklearn.linear_model import Ridge
        [alpha_list, score_list] = zip(
            *[(item[6]['alpha'], item[5]) for item in out_list])
        # alpha is a real-valued penalty; truncating it with int() would turn
        # any median below 1 into 0.
        alpha_final = float(np.median(alpha_list))
        best_score = np.mean(score_list)
        print('in final RR')
        print('n_comp_list', alpha_list, alpha_final, 'score', score_list, best_score)
        ridge = Ridge()
        params = {'alpha': [alpha_final]}
        model = GridSearchCV(ridge, param_grid=params, cv=5, verbose=0)
        model.fit(x, y)
        pred_var = predict(mn, model, x_test, y_test)
    elif mn == 'RVM':
        from skrvm import RVR
        print('in final RVM')
        model = RVR(kernel='linear')
        model.fit(x, y)
        best_score = 0
        pred_var = predict(mn, model, x_test, y_test)
    elif mn == 'COMB':
        from mord import LAD
        from sklearn.ensemble import RandomForestRegressor
        print('IN COMB')
        group_lad = dict()
        print('shapes', x.shape, y.shape)
        lad1 = LAD(epsilon=0.0, tol=0.0001, loss='epsilon_insensitive',
                   fit_intercept=True, intercept_scaling=1.0, dual=True,
                   verbose=0, random_state=None, max_iter=10000)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        broad_lad = GridSearchCV(lad1, param_grid=params, cv=5,
                                 scoring='neg_mean_absolute_error', verbose=0)
        broad_lad.fit(x, y)
        for ages in age_group_all:
            # Indices of the training rows whose target is one of this group's
            # values, ordered by group value and then by row position.
            idx_grp = [idx
                       for item in ages
                       for idx, val in enumerate(y)
                       if val == item]
            key_age_grp = str(np.min(ages)) + '_' + str(np.max(ages))
            x_samples_train = x[idx_grp]
            y_samples_train = y[idx_grp]
            lad2 = LAD(epsilon=0.0, tol=0.0001, loss='epsilon_insensitive',
                       fit_intercept=True, intercept_scaling=1.0, dual=True,
                       verbose=0, random_state=None, max_iter=10000)
            params2 = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
            specific_lad = GridSearchCV(lad2, param_grid=params2, cv=5,
                                        scoring='neg_mean_absolute_error',
                                        verbose=0)
            specific_lad.fit(x_samples_train, y_samples_train)
            group_lad[key_age_grp] = specific_lad
        # Second stage: a random forest trained on the first-stage predictions.
        pred_all = make_predictions(x, broad_lad, group_lad)
        rfr = RandomForestRegressor(criterion='mse')
        params = {"n_estimators": [500]}
        model = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model.fit(pred_all, y)
        print("[INFO] RFR grid search best parameters: {}".format(
            model.best_params_))
        best_score = model.best_score_
        pred_all_test = make_predictions(x_test, broad_lad, group_lad)
        pred_var = predict(mn, model, pred_all_test, y_test)
    else:
        # Without this the function died on the return statement with an
        # UnboundLocalError for `pred_var`.
        raise ValueError('unknown model: {}'.format(mn))
    return model, best_score, pred_var
def __init__(self):
    """Create the wrapped skrvm RVR with an RBF kernel and verbose output off."""
    # Function-scope import: skrvm is only loaded when an instance is created.
    from skrvm import RVR

    settings = {"kernel": "rbf", "verbose": False}
    self.model = RVR(**settings)
def train(m, x_train, y_train, x_test, y_test):
    """Fit the model family named by `m` and evaluate it on both splits.

    Args:
        m: model name, one of 'LAD', 'MCLog', 'LogAT', 'LinearSVC', 'RFC',
            'Lasso', 'RFR', 'RR', 'PLSR', 'RVM', 'DTR', 'COMB'.
        x_train, y_train: training samples and targets.
        x_test, y_test: evaluation samples and targets.

    Returns:
        ``(model, best_score, best_params, pred_var, train_var)``.
        `best_score` and `best_params` come from the grid search; they are
        ``0, 0`` for 'RVM' and 'DTR', which are fitted without one. For
        'COMB', `model` is ``[broad_lad, group_lad, second_stage]`` and the
        score and params are those of the second stage. `pred_var` and
        `train_var` are whatever the external `predict` helper returns for the
        test and training split.

    Raises:
        ValueError: if `m` is not one of the supported names.
    """
    print('training', m)
    if m == 'LAD':
        from mord import LAD
        lad = LAD(epsilon=0.0, tol=0.0001, loss='epsilon_insensitive',
                  fit_intercept=True, intercept_scaling=1.0, dual=True,
                  verbose=0, random_state=None, max_iter=10000)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        model = GridSearchCV(lad, param_grid=params, cv=5,
                             scoring='neg_mean_absolute_error', verbose=0)
        # This branch fits and evaluates on integer-rounded training targets.
        y_train = y_train.astype(float).round()
        y_train = y_train.astype(int)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] LAD grid search best parameters: {}".format(
            model.best_params_))
    elif m == 'MCLog':
        from sklearn.linear_model import LogisticRegression
        mcl = LogisticRegression(multi_class='multinomial', max_iter=10000,
                                 solver='newton-cg', fit_intercept=True)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        model = GridSearchCV(mcl, param_grid=params, cv=5,
                             scoring='neg_mean_absolute_error', verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] MCLog grid search best parameters: {}".format(
            model.best_params_))
    elif m == 'LogAT':
        # takes quite some time
        from mord import LogisticAT
        lat = LogisticAT()
        params = {"alpha": np.linspace(0, 1, 5)}
        model = GridSearchCV(lat, param_grid=params, cv=5,
                             scoring='neg_mean_absolute_error', verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] LogAT grid search best parameters: {}".format(
            model.best_params_))
    elif m == 'LinearSVC':
        from sklearn.svm import LinearSVC
        svm = LinearSVC()
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        model = GridSearchCV(svm, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] LinearSVC grid search best parameters: {}".format(
            model.best_params_))
    elif m == 'RFC':
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier()
        params = {"n_estimators": [10, 100, 500, 1000]}
        model = GridSearchCV(rfc, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] RFC grid search best parameters: {}".format(
            model.best_params_))
    elif m == 'Lasso':
        from sklearn.linear_model import Lasso
        lasso = Lasso()
        params = {"alpha": [10]}
        model = GridSearchCV(lasso, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        # The message used to say "RFR" (copy-paste from the RFR branch).
        print("[INFO] Lasso grid search best parameters: {}".format(
            model.best_params_))
    elif m == 'RFR':
        from sklearn.ensemble import RandomForestRegressor
        # NOTE(review): newer scikit-learn releases spell this criterion
        # 'squared_error' - confirm against the pinned version.
        rfr = RandomForestRegressor(criterion='mse')
        params = {"n_estimators": [500]}
        model = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] RFR grid search best parameters: {}".format(
            model.best_params_))
    elif m == 'RR':
        from sklearn.linear_model import Ridge
        ridge = Ridge()
        params = {
            'alpha': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                      0.9, 1]
        }
        model = GridSearchCV(ridge, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        print("[INFO] Ridge Regression grid search best parameters: {}".format(
            model.best_params_))
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
    elif m == 'PLSR':
        from sklearn.cross_decomposition import PLSRegression
        pls_reg = PLSRegression()
        params = {'n_components': list(range(1, 21))}
        model = GridSearchCV(pls_reg, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        print("[INFO] PLS Regression grid search best parameters: {}".format(
            model.best_params_))
        pred_var = predict(m, model, x_test, y_test)
    elif m == 'RVM':
        from skrvm import RVR
        print('in RVM')
        model = RVR(kernel='linear')
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
    elif m == 'DTR':
        from sklearn.tree import DecisionTreeRegressor
        model = DecisionTreeRegressor()
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
    elif m == 'COMB':
        from sklearn.ensemble import RandomForestRegressor
        from mord import LAD
        from group_pred import create_age_groups
        print('IN COMB')
        group_lad = dict()
        print('shapes', x_train.shape, y_train.shape)
        lad1 = LAD(epsilon=0.0, tol=0.0001, loss='epsilon_insensitive',
                   fit_intercept=True, intercept_scaling=1.0, dual=True,
                   verbose=0, random_state=None, max_iter=10000)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        broad_lad = GridSearchCV(lad1, param_grid=params, cv=5,
                                 scoring='neg_mean_absolute_error', verbose=0)
        # First stage works on integer-rounded targets; the second stage below
        # is fitted on the original (unrounded) ones.
        y_train_r = y_train.astype(float).round()
        y_train_r = y_train_r.astype(int)
        broad_lad.fit(x_train, y_train_r)
        age_group_all = create_age_groups(y_train_r, 10, 5)
        for ages in age_group_all:
            # Indices of the training rows whose rounded target is one of this
            # group's values, ordered by group value and then by row position.
            idx_grp = [idx
                       for item in ages
                       for idx, val in enumerate(y_train_r)
                       if val == item]
            print('group info', ages, len(idx_grp))
            # Groups with 5 or fewer samples get no dedicated model.
            if len(idx_grp) > 5:
                key_age_grp = str(np.min(ages)) + '_' + str(np.max(ages))
                x_samples_train = x_train[idx_grp]
                y_samples_train = y_train_r[idx_grp]
                lad2 = LAD(epsilon=0.0, tol=0.0001,
                           loss='epsilon_insensitive', fit_intercept=True,
                           intercept_scaling=1.0, dual=True, verbose=0,
                           random_state=None, max_iter=10000)
                params2 = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
                specific_lad = GridSearchCV(
                    lad2, param_grid=params2, cv=5,
                    scoring='neg_mean_absolute_error', verbose=0)
                specific_lad.fit(x_samples_train, y_samples_train)
                group_lad[key_age_grp] = specific_lad
        print('len_groups', len(group_lad))
        # Second stage: a random forest trained on the first-stage predictions.
        pred_all = make_predictions(x_train, broad_lad, group_lad)
        rfr = RandomForestRegressor(criterion='mse')
        params = {"n_estimators": [500]}
        model_2 = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model_2.fit(pred_all, y_train)
        train_var = predict(m, model_2, pred_all, y_train)
        print("[INFO] RFR grid search best parameters: {}".format(
            model_2.best_params_))
        pred_all_test = make_predictions(x_test, broad_lad, group_lad)
        pred_var = predict(m, model_2, pred_all_test, y_test)
        model = [broad_lad, group_lad, model_2]
    else:
        # An unknown name used to fall through to the return statement and die
        # there with an UnboundLocalError for `train_var`.
        raise ValueError('unknown model: {}'.format(m))

    # The previous test here, `m == 'RVM' or 'DTR'`, was always true (a
    # non-empty string is truthy), so every model reported 0 / 0 instead of
    # its grid-search score and parameters.
    if m in ('RVM', 'DTR'):
        return model, 0, 0, pred_var, train_var
    if m == 'COMB':
        return model, model_2.best_score_, model_2.best_params_, pred_var, train_var
    return model, model.best_score_, model.best_params_, pred_var, train_var
#Scale the entire input dataset Coulomb_df = scaler.transform(Coulomb_df) X_train_scaled, X_test_scaled, y_train, y_test = train_test_split( Coulomb_df, Output_df, test_size=.2, random_state=None) reports_df = pd.DataFrame( columns=['Name', 'MARE', 'MSE', 'R2']) for regr_choice in range(5): regr_names = ['RF', 'SVM', 'RVM', 'Huber', 'XGBOOST'] regr_objects = [RandomForestRegressor(n_estimators=400, max_depth=1000, random_state=0), svm.SVR(kernel='rbf', epsilon=0.1, verbose=True), RVR(kernel='rbf', n_iter=10000, tol=0.0001, verbose=True), linear_model.HuberRegressor( epsilon=1.35, max_iter=100, alpha=0.0001, warm_start=False, fit_intercept=True, tol=1e-05), XGBRegressor(objective='reg:linear', colsample_bytree=0.3, learning_rate=0.1, max_depth=400, alpha=10, n_estimators=400) ] regr = regr_objects[regr_choice] regr_name = regr_names[regr_choice] if reusingModels: regr = joblib.load('SavedModels_'+regr_name+'.pkl') else: regr.fit(X_train_scaled, y_train) if 'XGB' in regr_name:
def train(x, y):
    """Fit an RBF-kernel RVR on ``(x, y)`` and return the fitted estimator."""
    regressor = RVR(kernel='rbf')
    regressor.fit(x, y)
    return regressor
# Inputs are the columns named by Label.Rain / Label.Wind; the target is the
# column named by Label.PM2_5.
X = data[[Label.Rain.value, Label.Wind.value]]
y = data[Label.PM2_5.value]
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
# The scaler is fitted on the training split only and then applied to the test split.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Machine learning with Random Forest, RVR and Lasso
model1 = RandomForestRegressor(bootstrap=True, criterion="mse")
model2 = RVR(kernel="rbf")
model3 = Lasso(alpha=0.1)
model1.fit(X_train_std, y_train)
model2.fit(X_train_std, y_train)
model3.fit(X_train_std, y_train)

# Report train/test MSE of the random forest.
y_train_pred = model1.predict(X_train_std)
y_test_pred = model1.predict(X_test_std)
print("Random Forest MSE train: {0}, test: {1}".format(
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred)))

# The same two prediction variables are reused for the RVR model.
y_train_pred = model2.predict(X_train_std)
y_test_pred = model2.predict(X_test_std)
# Only the first 20 targets are used.
y = boston.target[:20]
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)
# The scaler is fitted on the training split only and then applied to the test split.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Three regressors fitted on the same standardised training data.
model = LinearRegression()
model2 = SVR(kernel="rbf", C=1000.0, epsilon=6.5)
model3 = RVR(kernel="rbf")
model.fit(X_train_std, y_train)
model2.fit(X_train_std, y_train)
model3.fit(X_train_std, y_train)

# Report train/test MSE of the SVR.
y_train_pred = model2.predict(X_train_std)
y_test_pred = model2.predict(X_test_std)
print("SVR MSE train: {0}, test: {1}".format(
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred)
))

# The same two prediction variables are reused for the RVR model.
y_train_pred = model3.predict(X_train_std)
y_test_pred = model3.predict(X_test_std)
def rvr_analysis(random_seed, save_path, n_folds, analysis):
    """Cross-validate a linear RVR on the saved dataset, then train and plot a final model.

    Args:
        random_seed: seed used in the output sub-directory name and for the
            final train/test split.
        save_path: path-like supporting ``/``; the dataset pickle is read from
            ``save_path / 'random_seed_XXX'`` and the plot is written there.
        n_folds: number of cross-validation folds.
        analysis: 'vanilla_combi' (85% of the samples held out for the final
            test) or 'uniform_dist' (20% held out).

    Returns:
        ``(mae_cv, r_test, t_time_train, t_time_test)``: the per-fold MAE
        array, the Pearson r of the LAST fold only, and the per-fold training
        and prediction times in seconds.

    Raises:
        ValueError: if `analysis` is not one of the two supported values.

    Relies on the names ``debug`` and ``dataset`` defined outside this function.
    """
    # Fail fast: an unsupported value used to surface only after the whole CV
    # had run, as a NameError on x_train_all.
    if analysis not in ('vanilla_combi', 'uniform_dist'):
        raise ValueError(
            "analysis must be 'vanilla_combi' or 'uniform_dist', got {!r}"
            .format(analysis))

    save_path = save_path / ('random_seed_%03d' % random_seed)
    print('Random seed: %03d' % random_seed)

    # Load the saved validation dataset
    project_ukbio_wd, project_data_ukbio, _ = get_paths(debug, dataset)
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # point this at dataset files produced by this project.
    with open(save_path / ('splitted_dataset_%s.pickle' % dataset),
              'rb') as handle:
        splitted_dataset = pickle.load(handle)

    # random_state only has an effect together with shuffle=True, and current
    # scikit-learn rejects it otherwise. It is dropped so the folds stay the
    # contiguous, unshuffled ones this code has always produced.
    kf = KFold(n_splits=n_folds)
    mae_cv = np.zeros((n_folds, 1))
    pearsons_corr = np.zeros((n_folds, 1))
    pearsons_pval = np.zeros((n_folds, 1))

    # Set target and features
    x = splitted_dataset['Xtest_scaled']
    y = splitted_dataset['Ytest']

    t_time_train = []
    t_time_test = []

    for i_fold, (train_idx, test_idx) in enumerate(kf.split(x, y)):
        x_train, x_test = x[train_idx, :], x[test_idx, :]
        y_train, y_test = y[train_idx], y[test_idx]

        print('CV iteration: %d' % (i_fold + 1))
        print('Shape of the trainig and test dataset')
        print(y_train.shape, y_test.shape)

        # train the model
        model = RVR(kernel='linear')
        cv_time_train = time.process_time()
        model.fit(x_train, y_train)
        elapsed_time = time.process_time() - cv_time_train
        print('CV - Elapased time in seconds to train:')
        t_time_train.append(elapsed_time)
        print('%.03f' % elapsed_time)

        # test the model
        cv_time_test = time.process_time()
        y_predicted = model.predict(x_test)
        elapsed_time = time.process_time() - cv_time_test
        t_time_test.append(elapsed_time)
        print('CV - Elapased time in seconds to test:')
        print('%.03f' % elapsed_time)

        mae_kfold = mean_absolute_error(y_test, y_predicted)
        mae_cv[i_fold, :] = mae_kfold

        # now look at the pearson's correlation
        r_test, r_p_value_test = pearsonr(y_test, y_predicted)
        pearsons_corr[i_fold, :] = r_test
        pearsons_pval[i_fold, :] = r_p_value_test

    print('CV results')
    print('MAE: Mean(SD) = %.3f(%.3f)' % (mae_cv.mean(), mae_cv.std()))
    # Summarise over all folds; this used to report the last fold's scalar r,
    # whose SD is always 0.
    print('Pearson\'s Correlation: Mean(SD) = %.3f(%.3f)' %
          (pearsons_corr.mean(), pearsons_corr.std()))
    print('Mean CV time: %.3f s ' % np.mean(t_time_train))
    print('SD CV time: %.3f s' % np.std(t_time_train))
    print('Mean CV time: %.3f s ' % np.mean(t_time_test))
    print('SD CV time: %.3f s' % np.std(t_time_test))
    print('')

    if analysis == 'vanilla_combi':
        # Train the entire dataset
        x_train_all, x_test_all, y_train_all, y_test_all = \
            train_test_split(x, y, test_size=.85, random_state=random_seed)
        print('All: Shape of the trainig and test dataset')
        print(y_train_all.shape, y_test_all.shape)
    elif analysis == 'uniform_dist':
        # Train the entire dataset
        x_train_all, x_test_all, y_train_all, y_test_all = \
            train_test_split(x, y, test_size=.20, random_state=random_seed)
        print('ALL: Shape of the trainig and test dataset')
        print(y_train_all.shape, y_test_all.shape)

    print('Training RVR model:')
    model_all = RVR(kernel='linear')
    model_all.fit(x_train_all, y_train_all)

    # plot predicted vs true for the test (Entire sample)
    print('Plotting Predicted Vs True Age for all the sample')
    # Predict with the model trained just above, not the last CV fold's model.
    y_predicted_test = model_all.predict(x_test_all)
    output_path_test = save_path / (
        'rvr_test_predicted_true_age_rnd_seed%d.eps' % random_seed)
    plot_predicted_vs_true(y_test_all, y_predicted_test, output_path_test,
                           'Age')

    # NOTE(review): r_test is the last fold's correlation only; the per-fold
    # values are in pearsons_corr. Kept as-is so the return shape is unchanged.
    return mae_cv, r_test, t_time_train, t_time_test
normal_data_all = preprocessing.scale(full_data_matrix)#normalize pca = PCA(10, svd_solver='auto') pca.fit(normal_data_all) normal_data_pca = pca.transform(normal_data_all)#transform data to xx components ##################################################################################### ##We use the image_sematics from each to train after n_subject_to_use = 1 n_observations = n_subject_to_use*690 X_train, X_test, y_train_index, y_test_index = train_test_split(normal_data_pca[range(n_observations),:],range(n_observations),test_size=0.2) mean_err = np.zeros((2048,1)) for i in range(2048): n_semantic_as_y = i #the 1st semantic is used as output clf1=RVR(kernel='rbf') clf1.fit(X_train,full_semantics_matrix[y_train_index,n_semantic_as_y]) predicted_out = clf1.predict(X_test) #full_semantics_matrix[y_test_index,n_semantic_as_y] ## calc error from test output err = predicted_out-full_semantics_matrix[y_test_index,n_semantic_as_y] mean_err[i] = np.mean(err) print(i) plt.figure() plt.plot(mean_err) plt.show()