def gp_fit_sklearn(x_input, x_tar, y_input, y_tar, params=None, title=''):
    """Fit a 1-D Gaussian-process regressor and plot input/target/prediction.

    The kernel is the product DotProduct * RBF * RationalQuadratic; length-scale
    upper bounds are tied to the last target abscissa ``x_tar[-1]``.

    :param x_input: 1-D array of training abscissae (reshaped to a column internally)
    :param x_tar: 1-D array of prediction abscissae
    :param y_input: training ordinates
    :param y_tar: target ordinates (plotted for comparison only, never fitted)
    :param params: optional dict of GaussianProcessRegressor parameters to override
    :param title: plot title
    :returns: tuple ``(fitted_regressor, predictions_at_x_tar)``
    """
    k1 = kernels.DotProduct(sigma_0=1, sigma_0_bounds=(1e-05, 5))
    k2 = kernels.RBF(length_scale=10, length_scale_bounds=(1e-3, x_tar[-1]))
    k3 = kernels.RationalQuadratic(alpha=1, length_scale=10,
                                   length_scale_bounds=(1e-3, x_tar[-1]))
    kernel = k1 * k2 * k3
    gp1 = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10,
                                   normalize_y=True, alpha=0)
    if params:
        # BUG FIX: set_params takes keyword arguments, not a positional dict;
        # the original gp1.set_params(params) raised TypeError whenever params
        # was supplied.
        gp1.set_params(**params)
    gp1.fit(x_input.reshape(-1, 1), y_input)
    pred, std = gp1.predict(x_tar.reshape(-1, 1), return_std=True)
    plt.plot(x_input, y_input, 'bo', label='Input', alpha=0.4)
    plt.plot(x_tar, y_tar, 'go', label='Target', alpha=0.4)
    plt.plot(x_tar, pred, 'ro', label='Prediction', alpha=0.4)
    # Shade the 2-sigma predictive band around the mean prediction.
    plt.gca().fill_between(x_tar,
                           pred.reshape(-1) - 2 * std,
                           pred.reshape(-1) + 2 * std,
                           color='lightblue', alpha=0.5, label=r"$2\sigma$")
    plt.title(title)
    plt.legend()
    plt.show()
    return gp1, pred
def get_gpr(kernel_type, X, y):
    """Build and fit a fixed-hyperparameter GP regressor on (X, y).

    The kernel amplitude is seeded from the mean of ``y`` and the length/sigma
    scale from its standard deviation (via ``get_distribution_measures``); the
    optimizer is disabled so those values are used as-is.

    :param kernel_type: 'rbf' for ConstantKernel * RBF, 'dot' for
        ConstantKernel * DotProduct
    :param X: training inputs
    :param y: training targets
    :returns: the fitted GaussianProcessRegressor
    :raises ValueError: if ``kernel_type`` is not one of 'rbf' or 'dot'
    """
    mean, _, std = get_distribution_measures(y)
    if kernel_type == 'rbf':
        kernel = kernels.ConstantKernel(mean) * kernels.RBF(std)
    elif kernel_type == 'dot':
        kernel = kernels.ConstantKernel(mean) * kernels.DotProduct(std)
    else:
        # BUG FIX: previously an unknown kernel_type fell through and raised an
        # opaque NameError on `kernel`; fail fast with a clear message instead.
        raise ValueError(
            "unknown kernel_type: %r (expected 'rbf' or 'dot')" % (kernel_type,))
    gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.05, optimizer=None)
    gpr.fit(X, y)
    return gpr
def gp_fit_sklearn_xy(x_input, x_tar, y_input, y_tar, title='', route=None, gp=None):
    """Fit (or reuse) a GP regressor mapping a 1-D input to 2-D (x, y) outputs
    and plot input/target/prediction point clouds.

    :param x_input: 1-D array of training abscissae (reshaped to a column)
    :param x_tar: 1-D array of prediction abscissae
    :param y_input: (n, 2) array of training coordinates
    :param y_tar: (n, 2) array of target coordinates (plotted only)
    :param title: plot title
    :param route: optional (n, 2) polyline to draw behind the points
    :param gp: optional pre-built regressor; when given, the default kernel is skipped
    :returns: tuple ``(fitted_regressor, predictions_at_x_tar)``
    """
    if gp:
        gp1 = gp
    else:
        k1 = kernels.DotProduct(sigma_0=1., sigma_0_bounds=(1e-3, 1e1))
        k3 = kernels.RationalQuadratic(alpha=1.5, length_scale=2.5,
                                       length_scale_bounds=(1e-3, 20),
                                       alpha_bounds=(1e-3, 10))
        k4 = kernels.ConstantKernel(1., (1e-3, 1e2))
        k5 = kernels.ConstantKernel(1., (1e-2, 1e2))
        # Scaled linear trend plus scaled rational-quadratic residual.
        kernel = k1 * k4 + k3 * k5
        gp1 = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10,
                                       alpha=0, random_state=0)
    x_input = x_input.reshape(-1, 1)
    x_tar = x_tar.reshape(-1, 1)
    gp1.fit(x_input, y_input)
    pred, std = gp1.predict(x_tar, return_std=True)
    # BUG FIX: route defaults to None, so the original `if route.any()` raised
    # AttributeError whenever no route was passed; guard for None first.
    if route is not None and route.any():
        # BUG FIX: label was a copy-pasted duplicate 'Prediction', which
        # produced a wrong legend entry for the route polyline.
        plt.plot(route[:, 0], route[:, 1], 'b', label='Route', alpha=0.2)
    plt.plot(y_input[:, 0], y_input[:, 1], 'bo', label='Input', alpha=0.4)
    plt.plot(y_tar[:, 0], y_tar[:, 1], 'go', label='Target', alpha=0.4)
    plt.plot(pred[:, 0], pred[:, 1], 'ro', label='Prediction', alpha=0.4)
    # plt.gca().fill_between(pred[:, 0].reshape(-1) - 2 * std, pred[:, 0].reshape(-1) + 2 * std,
    #                        pred[:, 1].reshape(-1) - 2 * std, pred[:, 1].reshape(-1) + 2 * std,
    #                        color='lightblue', alpha=0.5, label=r"$2\sigma$")
    plt.title(title)
    plt.legend()
    plt.show()
    return gp1, pred
# Transpose so samples become rows (the raw frame stored them as columns).
df = df.transpose()
df[df.columns[-1]].head()  # NOTE(review): result unused — looks like a leftover notebook inspection
df = df.drop('Unnamed: 0', axis=0)
# presumably pre_processing encodes the string-cast frame into features/labels — TODO confirm
X, y = pre_processing(df.astype(str))
X = X.astype(float)
skf = StratifiedKFold(n_splits=metrics.folds, shuffle=True)
scorer = make_scorer(accuracy_score)
# models to be trained; each entry maps name -> [estimator, optional hyper-parameter grid]
nmodels = {
    'gauss': [
        GaussianProcessClassifier(n_jobs=2),
        {
            # candidate kernels for grid search; `1 *` wraps each in a tunable ConstantKernel
            'kernel': [
                1 * kernels.RBF(),
                1 * kernels.DotProduct(),
                1 * kernels.Matern(),
                1 * kernels.RationalQuadratic(),
                1 * kernels.WhiteKernel()
            ]
        }
    ],
    'nb': [GaussianNB()],  # no grid: plain Gaussian naive Bayes
    'rf': [
        RandomForestClassifier(),
        {
            'n_estimators': [10, 50, 100, 200, 500],
            'criterion': ["gini", "entropy"]
        }
    ],
    'dt': [
        DecisionTreeClassifier(),
        {
            "criterion": ["gini", "entropy"],
def to_sklearn(self):
    """Convert this kernel to its scikit-learn equivalent, if one exists."""
    # Linear part first, then raise to the polynomial degree and scale.
    linear = self.slope * sklearn_kern.DotProduct(0) + self.intercept
    return self.variance * linear ** self.degree
from sklearn.gaussian_process import kernels
from sklearn import gaussian_process
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
import pandas

# Load the combined wine dataset and separate the quality target from the features.
wineData = pandas.read_csv('wine-combined.csv', sep=",")
Y = wineData['quality']
X = wineData.drop(['quality'], axis=1)

# Encode the categorical wine type ("red"/"white") as an integer feature.
le = preprocessing.LabelEncoder()
le.fit(["red", "white"])
X['type'] = le.transform(X['type'])

# Two GP classifiers, differing only in kernel choice.
rbf = kernels.RBF()
dotp = kernels.DotProduct()
gp_rbf = gaussian_process.GaussianProcessClassifier(kernel=rbf)
gp_dotp = gaussian_process.GaussianProcessClassifier(kernel=dotp)

# 10-fold cross-validation for each kernel, reporting mean accuracy +/- 2 std.
for banner, clf in (('Training with rbf...', gp_rbf),
                    ('Training with dot product...', gp_dotp)):
    print(banner)
    scores = cross_val_score(clf, X, Y, cv=10)
    print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))
def run_baselines_particle_size(data, options):
    """Run regression baselines on a single replica of data (no random
    resampling unless NUM_REP > 1; manually specified splits are honoured via
    TRN_TEST_INDEX). Good for testing training/testing on manual splits.

    :param data: input dataset, training and test mixed; labels in the last
        column ('parbin')
    :param options: dictionary of settings. Fields are:

        | SEP_METHOD : ['interpolation','prediction'] -> mode of learning / testing
        | NUM_REP : N -> N random resamplings of training and test sets (ONLY 'interpolation')
        | LEG_P : [1, 2, 3] -> on which leg to predict; treated separately by the models
        | METHODS : ['ridgereg', 'pls', 'lasso', 'rbfgpr', 'rbfgprard', 'rf'] -> regression method
        | LOG_Y : BOOL -> whether to take the log of y (e.g. concentrations)
        | NORMALIZE_Y : BOOL -> whether to normalize outputs y
        | NORMALIZE_X : BOOL -> whether to normalize inputs x
        | SAVEFOLDER : STRING -> folder address to store results
        | MODELNAME : STRING -> name of the model file and scores
        | SPLIT_SIZE : FLOAT [0,1] -> percentage of training to test datapoints
        | TRN_TEST_INDEX : DF -> integers marking each point as training (=1) or
            test (=2); has to be the same size as data.shape[0]
        | SAVE_PRED : BOOL -> store predicted values for trn and test
            (denormalized if needed)

    :returns: dictionary containing weights and accuracy scores for training
        and test sets, keyed by 'leg_<leg>_<sep>_<method>_<rep>'
    """
    # Unpack the options dict once, up front.
    SEP_METHOD = options['SEP_METHOD']
    NUM_REP = options['NUM_REP']
    LEG_P = options['LEG_P']
    METHODS = options['METHODS']
    NORMALIZE_Y = options['NORMALIZE_Y']
    NORMALIZE_X = options['NORMALIZE_X']
    SAVEFOLDER = options['SAVEFOLDER']
    MODELNAME = options['MODELNAME']
    SPLIT_SIZE = options['SPLIT_SIZE']
    TRN_TEST_INDEX = options['TRN_TEST_INDEX']
    LOG_Y = options['LOG_Y']
    SAVE_PRED = options['SAVE_PRED']
    #SAVE_TEXT_DUMP = kwargs['SAVE_TEXT_DUMP']
    # NOTE(review): SAVEFOLDER / MODELNAME implies SAVEFOLDER is a pathlib.Path,
    # not a plain string as the docstring says — confirm against callers.
    if not os.path.isdir(SAVEFOLDER):
        os.mkdir(SAVEFOLDER)
    if os.path.exists(SAVEFOLDER / MODELNAME):
        print("file exists, overwriting")
    summ = {}
    #if SAVE_TEXT_DUMP:
    #    results = pd.DataFrame(index=[],columns=['tst_r2','tst_rmse','trn_r2','trn_rmse','n_tr','n_ts'])
    # if LOG_Y:
    #     data.parbin = data.parbin.apply(np.log) #(data.loc[:,'parbin'].copy()+10e-6)
    for sep_method in SEP_METHOD:
        # print(sep_method)
        for leg in LEG_P:
            # print(leg)
            for meth in METHODS:
                nre = 0
                while nre < NUM_REP:
                    # Seed per repetition so resampled splits are reproducible.
                    np.random.seed(nre)
                    string_exp = 'leg_' + str(leg) + '_' + sep_method + '_' + meth + '_' + str(nre)
                    nre += 1
                    data_f = data.copy()
                    # leg_whole_.loc[:,'parbin'] = np.log(leg_whole_.loc[:,'parbin'])
                    # leg == 0 means "all legs"; otherwise restrict to one leg.
                    if leg != 0:
                        if 'leg' not in data.columns.tolist():
                            data_f = dataset.add_legs_index(data_f)
                        data_f = data_f.loc[data_f['leg'] == leg]
                        data_f.drop('leg', axis=1, inplace=True)
                    else:
                        if 'leg' in data.columns.tolist():
                            data_f.drop('leg', axis=1, inplace=True)
                    leg_whole_ = data_f.dropna().copy()
                    if LOG_Y:
                        # Small epsilon guards log(0); undone via exp below.
                        leg_whole_.loc[:,'parbin'] = leg_whole_.parbin.apply(lambda x: np.log(x + 1e-10))
                    s1, s2 = leg_whole_.shape
                    # Too few rows to fit anything meaningful: skip this combo.
                    if s1 < 10:
                        continue
                    if not TRN_TEST_INDEX.values.any():
                        # No manual split given: sample one.
                        # mode = 'interpolation', 'prediction', 'temporal_subset'
                        inds = modeling.sample_trn_test_index(leg_whole_.index, split=SPLIT_SIZE,
                                                              mode=sep_method, group='all',
                                                              options=options['SUB_OPTIONS'])
                        trn = leg_whole_.loc[(inds.iloc[:,0]==1),:].copy()
                        tst = leg_whole_.loc[(inds.iloc[:,0]==2),:].copy()
                        ###### INSERT SPLIT FUNCTION HERE:
                        # separation = SPLIT_SIZE
                        # trn_size = ceil(s1*separation) #;
                        #
                        # if sep_method.lower() == 'prediction':
                        #     # print('training data until ' + str(separation) + ', then test.')
                        #     trn = leg_whole_.iloc[:trn_size,:].copy()
                        #     tst = leg_whole_.iloc[trn_size:,:].copy()
                        #
                        # elif sep_method.lower() == 'interpolation':
                        #     # print('training data random %f pc subset, rest test'%(separation*100))
                        #     leg_whole_ = shuffle(leg_whole_)
                        #     trn = leg_whole_.iloc[:trn_size,:].copy()
                        #     tst = leg_whole_.iloc[trn_size:,:].copy()
                    elif TRN_TEST_INDEX.values.any():
                        # Manual split: 1 = training row, 2 = test row.
                        trn = leg_whole_.loc[TRN_TEST_INDEX.values == 1,:].copy()
                        tst = leg_whole_.loc[TRN_TEST_INDEX.values == 2,:].copy()
                    inds_trn = trn.index
                    inds_tst = tst.index
                    # Standardize data to 0 mean unit variance based on training
                    # statistics (assuming stationarity).
                    # SCALE TRAINING DATA X, y
                    if NORMALIZE_X:
                        scalerX = preprocessing.StandardScaler().fit(trn.iloc[:,:-1])
                        X = scalerX.transform(trn.iloc[:,:-1])#, columns=trn.iloc[:,:-1].columns, index=trn.index)
                    else:
                        X = trn.iloc[:,:-1]
                    if NORMALIZE_Y:
                        scalerY = preprocessing.StandardScaler().fit(trn.iloc[:,-1].values.reshape(-1, 1))
                        y = scalerY.transform(trn.iloc[:,-1].values.reshape(-1, 1))
                    else:
                        y = trn.iloc[:,-1]
                    ######### 1 : Ridge Regression
                    if meth.lower() == 'ridgereg':
                        MSE_error = make_scorer(mean_squared_error, greater_is_better=False)
                        regModel = RidgeCV(alphas=np.logspace(-3,0), fit_intercept=True,
                                           normalize=False, store_cv_values=False,
                                           gcv_mode='svd', cv=5).fit(X,y) #(trn.iloc[:,:-1], trn.iloc[:,-1]
                        # y was a column vector, so coef_ is (1, n_features); flatten.
                        regModel.coef_ = regModel.coef_[0]
                    elif meth.lower() == 'bayesianreg':
                        regModel = sk.linear_model.BayesianRidge(n_iter=500, tol=1.e-6,
                                                                 alpha_1=1.e-6, alpha_2=1.e-6,
                                                                 lambda_1=1.e-6, lambda_2=1.e-6,
                                                                 compute_score=False,
                                                                 fit_intercept=False,
                                                                 normalize=False).fit(X,y.ravel())
                    elif meth.lower() == 'pls':
                        n = 3
                        regModel = PLSRegression(n_components=n, scale=False).fit(X,y)
                        regModel.coef_ = np.squeeze(np.transpose(regModel.coef_))
                    elif meth.lower() == 'lasso':
                        regModel = LassoCV(alphas=np.logspace(-2,0,1), n_alphas=500,
                                           fit_intercept=True, max_iter=5000, cv=5).fit(X,y.ravel())
                    elif meth.lower() == 'lingpr':
                        # Linear GP: dot-product kernel plus white noise.
                        kernel = kernels.DotProduct(sigma_0 = 1, sigma_0_bounds=(1e-05, 1e05)) + \
                                 1.0 * kernels.WhiteKernel(noise_level=1e-3, noise_level_bounds=(1e-3, 1e+3))
                        regModel = GaussianProcessRegressor(kernel=kernel, optimizer='fmin_l_bfgs_b',
                                                            alpha=0, n_restarts_optimizer=5).fit(X,y)
                        str_kernel = str(kernel)
                        # print(str_kernel)
                    elif meth.lower() == 'rf':
                        import sklearn.ensemble
                        regModel = sklearn.ensemble.RandomForestRegressor(n_estimators=500,
                                                                          criterion='mse',
                                                                          max_features='sqrt',
                                                                          max_depth=15,
                                                                          min_samples_split=2,
                                                                          min_samples_leaf=1).fit(X,np.ravel(y))
                        # Reuse coef_ slot so the summary code below finds "weights".
                        regModel.coef_ = regModel.feature_importances_
                    elif meth.lower() == 'gpr':
                        kernel = 1.0 * kernels.RBF(length_scale=1.0, length_scale_bounds=(1e0, 1e2)) + \
                                 1.0 * kernels.WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-2, 1e2))
                        # kernels.ExpSineSquared(length_scale=1, periodicity=1) + \
                        # 1.0 * kernels.DotProduct(sigma_0=1.0, sigma_0_bounds=(1e-02, 1e2)) #* kernels.ExpSineSquared(length_scale=1, periodicity=1) + \ 1.0 * kernels.ConstantKernel(constant_value=1.0, constant_value_bounds=(1e-02, 100.0)) + \
                        regModel = GaussianProcessRegressor(kernel=kernel, optimizer='fmin_l_bfgs_b',
                                                            alpha=0.5, n_restarts_optimizer=5).fit(X,y)
                        # print(regModel.kernel_)
                    elif meth.lower() == 'gprard':
                        # GPy GP with per-dimension (ARD) RBF length-scales.
                        #x = trn.iloc[:,:-1].values
                        #y = trn.iloc[:,-1].values.reshape(-1,1)
                        s1 = X.shape[1]
                        k = (GPy.kern.RBF(s1, ARD=True) + GPy.kern.White(s1, 1) + GPy.kern.Bias(s1, 1)) #+ GPy.kern.Linear(s1, variances=0.001, ARD=False))
                        regModel = GPy.models.GPRegression(X, y, kernel=k)
                        #regModel.optimize_restarts(parallel=True, robust=True, num_restarts=5, max_iters=200)
                        regModel.optimize('scg', max_iters=200) # 'scg'
                        regModel.coef_ = np.array(regModel.sum.rbf.lengthscale)
                    else:
                        print('method not implemented yet. Or check the spelling')
                        break
                    # SCALE TEST DATA with the training-set scaler.
                    if NORMALIZE_X:
                        x = scalerX.transform(tst.iloc[:,:-1])#, columns=tst.iloc[:,:-1].columns, index=tst.index)
                    else:
                        x = tst.iloc[:,:-1]
                    y_ts_gt = tst.iloc[:,-1]
                    # Predict: GPy returns (mean, var) tuples; sklearn GPs can
                    # return (mean, std) when asked; others return mean only.
                    if meth.lower() == 'gprard':
                        # x_ = tst_.values
                        # x = trn.iloc[:,:-1].values
                        y_ts_h = regModel.predict(x)[0].reshape(-1,)
                        y_tr_h = regModel.predict(X)[0].reshape(-1,)
                    elif (meth.lower() == 'bayesianreg') or (meth.lower() == 'gpr'):
                        [y_ts_h, y_ts_std] = regModel.predict(x,return_std=SAVE_PRED)
                        y_ts_h, y_ts_std = y_ts_h.reshape(-1,), y_ts_std.reshape(-1,)
                        [y_tr_h, y_tr_std] = regModel.predict(X,return_std=SAVE_PRED)
                        y_tr_h, y_tr_std = y_tr_h.reshape(-1,), y_tr_std.reshape(-1,)
                    else:
                        y_ts_h = regModel.predict(x).reshape(-1,)
                        y_tr_h = regModel.predict(X).reshape(-1,)
                    # Undo output normalization before scoring.
                    if NORMALIZE_Y:
                        y_tr_h = scalerY.inverse_transform(y_tr_h)
                        y_ts_h = scalerY.inverse_transform(y_ts_h)
                        y_tr_gt = scalerY.inverse_transform(y)#trn.iloc[:,-1]
                        # print(trn.iloc[:,-1].values[0:10],y_tr_gt[0:10], y[0:10])
                    else:
                        y_tr_gt = y#trn.iloc[:,-1]
                        # print(y[:10], y_tr_gt[:10])
                    # Compute scores
                    if LOG_Y:
                        # Undo the log transform (and its epsilon) applied above.
                        y_ts_gt = np.exp(y_ts_gt) - 1e-10
                        y_ts_h = np.exp(y_ts_h) - 1e-10
                        y_tr_gt = np.exp(y_tr_gt) - 1e-10
                        y_tr_h = np.exp(y_tr_h) - 1e-10
                        # print(np.min(y_tr_gt),np.max(y_tr_gt), ' -- ', np.min(y_tr_h),np.max(y_tr_h))
                        # print(np.min(y_ts_gt),np.max(y_ts_gt), ' -- ', np.min(y_ts_h),np.max(y_ts_h))
                    # Despite the names, mse/t_mse hold RMSE (sqrt applied).
                    mse = np.sqrt(mean_squared_error(y_ts_gt, y_ts_h))
                    r2 = r2_score(y_ts_gt, y_ts_h)
                    t_mse = np.sqrt(mean_squared_error(y_tr_gt, y_tr_h))
                    t_r2 = r2_score(y_tr_gt, y_tr_h)
                    # NOTE(review): bitwise & / ~ on Python bools below —
                    # ~hasattr(...) is -1 or -2 (always truthy), so that term
                    # never filters anything; likely meant `and not`.
                    if hasattr(regModel, 'alpha_') & hasattr(regModel, 'coef_'):
                        summ[string_exp] = {'regularizer': regModel.alpha_,
                                            'weights': regModel.coef_,
                                            'tr_RMSE': t_mse, 'tr_R2': t_r2,
                                            'ts_RMSE': mse, 'ts_R2': r2,
                                            'tr_size': trn.shape[0],
                                            'ts_size': tst.shape[0]}#,
                                            # 'y_tr_hat': y_tr_h,
                                            # 'y_ts_hat': y_ts_h}
                        if 'str_kernel' in locals():
                            summ[string_exp].update({'kernel': str_kernel})
                    elif hasattr(regModel, 'coef_') & ~hasattr(regModel, 'alpha_'):
                        summ[string_exp] = {'weights': regModel.coef_,
                                            'tr_RMSE': t_mse, 'tr_R2': t_r2,
                                            'ts_RMSE': mse, 'ts_R2': r2,
                                            'tr_size': trn.shape[0],
                                            'ts_size': tst.shape[0]}#,
                                            # 'y_tr_hat': y_tr_h,
                                            # 'y_ts_hat': y_ts_h}
                    else:
                        summ[string_exp] = {'tr_RMSE': t_mse, 'tr_R2': t_r2,
                                            'ts_RMSE': mse, 'ts_R2': r2,
                                            'tr_size': trn.shape[0],
                                            'ts_size': tst.shape[0]}#,
                                            # 'y_tr_hat': y_tr_h,
                                            # 'y_ts_hat': y_ts_h}
                    if 'str_kernel' in locals():
                        summ[string_exp].update({'kernel': str_kernel})
                    if SAVE_PRED:
                        # Convert to pandas, re-attaching the original indices.
                        # print(y_tr_gt[:10])
                        y_tr_h = pd.Series(y_tr_h,index=inds_trn)
                        y_ts_h = pd.Series(y_ts_h,index=inds_tst)
                        y_tr_gt = pd.Series(np.reshape(y_tr_gt,(-1,)),index=inds_trn)
                        y_ts_gt = pd.Series(np.reshape(y_ts_gt,(-1,)),index=inds_tst)
                        # print(y_tr_gt.iloc[:10])
                        if 'y_ts_std' in locals():
                            y_ts_std = pd.Series(y_ts_std,index=inds_tst)
                            y_tr_std = pd.Series(y_tr_std,index=inds_trn)
                        # Add to dictionary
                        summ[string_exp].update({'y_tr_hat': y_tr_h, 'y_ts_hat': y_ts_h,
                                                 'y_tr_gt': y_tr_gt, 'y_ts_gt': y_ts_gt})
                        # print(summ[string_exp]['y_tr_gt'].head(), summ[string_exp]['y_ts_gt'].head())
                        if 'y_ts_std' in locals():
                            summ[string_exp].update({'y_tr_std': y_tr_std, 'y_ts_std': y_ts_std})
                    #if SAVE_TEXT_DUMP:
                    #    results = pd.DataFrame(index=[],columns=['n_ts', 'tst_r2','tst_rmse',' n_tr', 'trn_r2','trn_rmse'])
                    #    results.loc[nre-1] = [len(y_ts_h), r2, mse, len(y_tr_h), t_r2, t_mse]
                    # Free the per-repetition objects before the next iteration.
                    del leg_whole_, regModel, y_tr_gt, y_ts_gt, y_tr_h, y_ts_h, trn, tst
    # NOTE(review): saving is commented out, so results only leave via the
    # return value — confirm this is intentional.
    # save_obj(summ, SAVEFOLDER / MODELNAME)
    #results.to_csv(path_or_buf=SAVEFOLDER + MODELNAME + '.csv', sep='\t')
    return summ
def run_regression_indexed_data(data, inds, regression_model, NORM_X=True, NORM_Y=True):
    """Run a regression model on a single replica of data (no random
    resampling). Good for testing training and testing on manually
    specified splits.

    :param data: df, input dataset, training and test mixed, labels in the
        last column
    :param inds: df or series with an 'ind' column: training rows are 1 and
        test rows are 2,...,S for S test splits
    :param regression_model: string, regression method; options are hard
        coded here but could be extracted into a dict in the future
    :param NORM_X: bool, whether to normalize input data
    :param NORM_Y: bool, whether to normalize output data
    :returns: dict containing weights, accuracy scores for training and each
        test split, and the time difference between first and last training
        points; returns [] for an unknown regression_model
    """
    # Training rows are those flagged ind == 1.
    tr_ = data.loc[inds.loc[inds['ind'] == 1].index,:].copy()
    if NORM_X:
        scalerX = sk.preprocessing.StandardScaler().fit(tr_.iloc[:,:-1])
        trn = pd.DataFrame(scalerX.transform(tr_.iloc[:,:-1]),
                           columns=tr_.iloc[:,:-1].columns, index=tr_.index)
    else:
        trn = tr_.iloc[:,:-1]
    if NORM_Y:
        scalerY = sk.preprocessing.StandardScaler().fit(tr_.iloc[:,-1].values.reshape(-1, 1))
        y_trn = scalerY.transform(tr_.iloc[:,-1].values.reshape(-1, 1))
    else:
        y_trn = tr_.iloc[:,-1]
    # Re-attach the (possibly scaled) labels as the last column 'labels'.
    trn = trn.assign(labels=y_trn)
    # print(trn.columns.tolist())
    if regression_model.lower() == 'ridgereg':
        # MSE_error = make_scorer(mean_squared_error, greater_is_better=False)
        # regModel = RidgeCV(alphas=np.logspace(-6,6,13), fit_intercept=not NORM_Y,
        #                    normalize=False, store_cv_values=False, gcv_mode='svd',
        #                    cv=3, scoring=MSE_error).fit(trn.iloc[:,:-1], trn.iloc[:,-1])
        # Intercept only needed when y is not already centred by the scaler.
        regModel = sk.linear_model.Ridge(alpha=0.1, fit_intercept=not NORM_Y,
                                         normalize=False).fit(trn.iloc[:,:-1], trn.iloc[:,-1])
        weights = regModel.coef_
    elif regression_model.lower() == 'lasso':
        # regModel = LassoCV(alphas=np.logspace(-3,-1,3), n_alphas=200,
        #                    fit_intercept=not NORM_Y, cv=3).fit(trn.iloc[:,:-1], trn.iloc[:,-1])
        regModel = sk.linear_model.Lasso(alpha=0.1, fit_intercept=not NORM_Y,
                                         normalize=False).fit(trn.iloc[:,:-1], trn.iloc[:,-1])
        weights = regModel.coef_
    elif regression_model.lower() == 'pls':
        n = 3
        regModel = PLSRegression(n_components=n, scale=False).fit(trn.iloc[:,:-1], trn.iloc[:,-1])
        regModel.coef_ = np.squeeze(np.transpose(regModel.coef_))
        weights = regModel.coef_
    elif regression_model.lower() == 'rf':
        import sklearn.ensemble
        # NOTE(review): 'rf' never sets `weights`, so the return below will
        # raise UnboundLocalError for this model — confirm intended usage.
        regModel = sklearn.ensemble.RandomForestRegressor(n_estimators=100, criterion='mse',
                                                          max_depth=10, min_samples_split=2,
                                                          min_samples_leaf=1).fit(trn.iloc[:,:-1], trn.iloc[:,-1])
    elif regression_model.lower() == 'rbfgpr':
        # RBF + white noise + constant + linear (dot product) composite kernel.
        kernel = 1.0 * kernels.RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)) + \
                 1.0 * kernels.WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-1, 1e+4)) + \
                 1.0 * kernels.ConstantKernel(constant_value=1.0, constant_value_bounds=(1e-05, 100000.0)) + \
                 1.0 * kernels.DotProduct(sigma_0=1.0, sigma_0_bounds=(1e-05, 100000.0))
        regModel = GaussianProcessRegressor(kernel=kernel, optimizer='fmin_l_bfgs_b',
                                            alpha=0, n_restarts_optimizer=5).fit(trn.iloc[:,:-1], trn.iloc[:,-1])
    elif regression_model.lower() == 'rbfgprard':
        # GPy GP with per-dimension (ARD) RBF length-scales.
        inds_trn = trn.index
        x = trn.iloc[:,:-1].values
        y = trn.iloc[:,-1].values.reshape(-1,1)
        k = (GPy.kern.RBF(x.shape[1], ARD=True) + GPy.kern.White(x.shape[1], 0.01) + GPy.kern.Linear(x.shape[1], variances=0.01, ARD=False))
        regModel = GPy.models.GPRegression(x,y,kernel=k)
        regModel.optimize('bfgs', max_iters=200)
        # print(regModel)
        # Inverse length-scale as a relevance proxy; 50 is an arbitrary scale.
        weights = 50/regModel.sum.rbf.lengthscale
    else:
        print('method not implemented yet. Or check the spelling')
        return []
    # from_to = [str(trn.index.tolist()[0]) + ' - ' + str(trn.index.tolist()[-1])]
    # Span between first and last training timestamps (index assumed temporal).
    gap_time_delta = str(trn.index.tolist()[-1] - trn.index.tolist()[0])
    # weights_summary[gap_num,:] =
    tr_r2 = []; tr_mse = []# [[],[]]; mse = [[],[]]
    ts_r2 = []; ts_mse = []
    a = 1
    # Evaluate on every test split label (all ind values except 1).
    for bb in np.setdiff1d(np.unique(inds),1):
        ts_ = data.loc[inds.loc[inds['ind'] == bb].index,:]
        if NORM_X:
            tst = pd.DataFrame(scalerX.transform(ts_.iloc[:,:-1]),
                               columns=ts_.iloc[:,:-1].columns, index=ts_.index)
        else:
            tst = ts_.iloc[:,:-1]
        if NORM_Y:
            y_tst = scalerY.transform(ts_.iloc[:,-1].values.reshape(-1, 1))
        else:
            y_tst = ts_.iloc[:,-1]
        tst = tst.assign(labels=y_tst)
        if regression_model.lower() == 'rbfgprard':
            # GPy predict returns (mean, variance); keep the mean only.
            inds_tst = tst.index
            x_ = tst.iloc[:,:-1].values
            y_ts_h = regModel.predict(x_)[0].reshape(-1,)
            y_ts_h = pd.Series(y_ts_h,index=inds_tst)
            y_tr_h = regModel.predict(x)[0].reshape(-1,)
            y_tr_h = pd.Series(y_tr_h,index=inds_trn)
        else:
            y_ts_h = regModel.predict(tst.iloc[:,:-1])
            y_tr_h = regModel.predict(trn.iloc[:,:-1])
        # Undo output normalization before scoring.
        if NORM_Y:
            y_tr_h = scalerY.inverse_transform(y_tr_h)
            y_ts_h = scalerY.inverse_transform(y_ts_h)
            y_tr_gt = scalerY.inverse_transform(trn.iloc[:,-1])
            y_ts_gt = scalerY.inverse_transform(tst.iloc[:,-1])
        else:
            y_tr_gt = trn.iloc[:,-1]
            y_ts_gt = tst.iloc[:,-1]
        # Training scores are identical across splits: compute them once.
        if a == 1:
            tr_r2.append(r2_score(y_tr_gt, y_tr_h))
            tr_mse.append(np.sqrt(mean_squared_error(y_tr_gt, y_tr_h)))
            a = 2
        ts_r2.append(r2_score(y_ts_gt, y_ts_h))
        ts_mse.append(np.sqrt(mean_squared_error(y_ts_gt, y_ts_h)))
        # Disabled debug output (references undefined t_mse/t_r2/mse/r2).
        if 0:
            print('trn: MSE %f, R2 %f' %(t_mse,t_r2))
            print('%f -- trn: MSE %f, R2 %f' %(bb,t_mse,t_r2))
            print('%f -- tst: MSE %f, R2 %f' %(bb,mse,r2))
    del inds
    return {'weights': weights, 'gap_time_delta': gap_time_delta,
            'tr_r2': tr_r2, 'ts_r2': ts_r2, 'tr_mse': tr_mse, 'ts_mse': ts_mse}
import code.DataHelper as dh
import datetime
import dateutil

# Integer identifiers for the candidate model families in MODELS_DIC.
LIN = 0
RBF = 1
POLY2 = 2
POLY3 = 3
POLY4 = 4
Rand_F = 5
GPR = 6
SGD = 7
KRR = 8
DT = 9

# Gaussian-process kernel: linear (dot product) trend plus white noise.
gp_kernel = kernels.DotProduct() \
    + kernels.WhiteKernel(1e-1)

# Models keyed by the ids above; commented entries are earlier tuning attempts.
MODELS_DIC ={
    #RBF:svm.SVR(kernel='rbf', C=8, epsilon=0.1, gamma=0.001),
    #RBF: svm.SVR(kernel='rbf', C=12, epsilon=0.16, gamma=0.0001),
    RBF: svm.SVR(kernel='rbf', C=50, epsilon=0.18, gamma=0.00012),
    POLY2:svm.SVR(kernel='poly', C=0.2, degree=2, epsilon=0.25),
    POLY3:svm.SVR(kernel='poly', C=0.12, degree=3, epsilon=0.33),
    POLY4:svm.SVR(kernel='poly', C=0.07, degree=4, epsilon=1.2),
    #LIN:svm.SVR(kernel='linear', C=5, epsilon=0.3),
    #LIN:svm.SVR(kernel='linear', C=120, epsilon=0.25),
    LIN:svm.SVR(kernel='linear', C=7, epsilon=0.2),
    #ensemble
    DT:DecisionTreeRegressor(),
    #LIN:svm.LinearSVR(C=1, epsilon=0.3),
    #LIN:svm.LinearSVR(C=0.009, epsilon=0.4),
# Binarize the NMBA dosage column into a received / not-received indicator.
mask = NMBA > 0
NMBA = mask * 1
X = pd.concat([NMBA, Age, Berlin, Sex, Weight], axis=1)
collist = list(X.columns)
# Impute missing entries with the per-column mode; Imputer returns a bare
# array, so the column names are restored afterwards.
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(X)
X = imp.transform(X)
X = pd.DataFrame(X, columns=collist)
X_train, X_test, Y_train, Y_test = \
    train_test_split(X,Y,test_size=0.1,random_state=1)
# Kernel
# myKernel = kernels.Sum(kernels.Matern(), kernels.RBF())
# myKernel = kernels.Sum(myKernel,kernels.RationalQuadratic())
# myKernel = kernels.Sum(myKernel,kernels.DotProduct())
# Composite kernel built incrementally: RBF + DotProduct + ConstantKernel.
myKernel = kernels.RBF()
myKernel = kernels.Sum(myKernel, kernels.DotProduct())
myKernel = kernels.Sum(myKernel, kernels.ConstantKernel())
# myKernel = kernels.Product(myKernel, kernels.DotProduct())
# myKernel = kernels.Sum(myKernel,kernels.ConstantKernel())
model = GaussianProcessClassifier(kernel=myKernel, warm_start=True, n_jobs=2)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
# Round predicted labels to hard 0/1 values before scoring.
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(Y_test, predictions)
print(round(accuracy, 2))
# filename = 'gp.pkl'
# pickle.dump(model, open(filename, 'wb'))
def gaussian_process(x_train, y_train):
    """Fit a dot-product-kernel GP regressor and return its training R^2 score."""
    regressor = GaussianProcessRegressor(kernel=kernels.DotProduct())
    regressor.fit(x_train, y_train)
    return regressor.score(x_train, y_train)