# The error metric compares the log of SalePrice with the log of the
# predicted price, so the models are trained on log(SalePrice). Predictions
# therefore have to be passed through exp() to obtain an actual sale price.
label_df = pd.DataFrame(index=train_df_munged.index, columns=['SalePrice'])
label_df['SalePrice'] = np.log(train_df['SalePrice'])

print(datetime.datetime.now() - start_time)
print('Training set size:', train_df_munged.shape)
print('Test set size:', test_df_munged.shape)

################################################################################

# Candidate regressors. Only lasso, ENSTest and GBest are handed to the
# ensemble at the bottom; the others are merely instantiated here.
ridge = linear_model.RidgeCV()
svr = SVR(kernel='rbf', degree=2, C=5, epsilon=1e-2, verbose=1)
lasso = linear_model.LassoCV()
regr4 = KernelRidge(alpha=0.3, kernel='polynomial', degree=2, coef0=1.85)
regr3 = ElasticNet(alpha=0.001)

# Cross-validated elastic net, fitted right away on the munged training data.
# fit() returns the estimator itself, so ENSTest is the fitted model.
ENSTest = linear_model.ElasticNetCV(
    alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
    l1_ratio=[.01, .1, .5, .9, .99],
    max_iter=5000,
)
ENSTest.fit(train_df_munged, label_df)

GBest = ensemble.GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
)

regr = CustomEnsembleRegressor([lasso, ENSTest, GBest])
# Buffers for the test-fold baseline and follow-up scores; both take their
# shape and dtype from the prediction buffer.
test_baseline_total_PANSS = np.zeros_like(predicted_followup_total_PANSS)
test_followup_total_PANSS = np.zeros_like(predicted_followup_total_PANSS)

# Pull out targets (follow-up total PANSS), baseline total PANSS and
# subject ids from the metadata table.
baseline_total_PANSS = metadata['Baseline|total_panss'].values
followup_total_PANSS = metadata[timepoint + '|total_panss'].values
subjectids = metadata['Subject ID'].to_list()

# Test subjects are collected here.
test_subjects = []

# Precompute the kernel matrix as the Gram matrix of the connectivity data,
# so the regressor can be used with kernel='precomputed'.
K = np.dot(logm_connectivity_data, np.transpose(logm_connectivity_data))

# Regressor operating on the precomputed kernel.
rgr = KernelRidge(kernel='precomputed')

# Monte-Carlo cross-validation: one train/test split per repeat.
for i in range(n_repeats):
    train_index = train_inds_all[i]
    test_index = test_inds_all[i]
    print(i)

    # Slice of the output buffers that belongs to this repeat.
    start_ind = i * test_size
    stop_ind = start_ind + test_size

    train_targets = followup_total_PANSS[train_index]
    test_targets = followup_total_PANSS[test_index]
# Sample data: 10000 inputs scaled to [0, 5) with a sine-shaped target.
X = 5 * rng.rand(10000, 1)
y = np.sin(X).ravel()

# Add noise to every fifth target
y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))

X_plot = np.linspace(0, 5, 100000)[:, None]

# #############################################################################
# Fit regression model
train_size = 100

# Both searches use 5-fold CV over the same gamma grid; SVR additionally
# scans C while kernel ridge scans alpha.
svr = GridSearchCV(
    SVR(kernel='rbf', gamma=0.1),
    param_grid={
        "C": [1e0, 1e1, 1e2, 1e3],
        "gamma": np.logspace(-2, 2, 5),
    },
    cv=5,
)
kr = GridSearchCV(
    KernelRidge(kernel='rbf', gamma=0.1),
    param_grid={
        "alpha": [1e0, 0.1, 1e-2, 1e-3],
        "gamma": np.logspace(-2, 2, 5),
    },
    cv=5,
)

# Time model selection + fitting on the first train_size samples only.
t0 = time.time()
svr.fit(X[:train_size], y[:train_size])
svr_fit = time.time() - t0
print("SVR complexity and bandwidth selected and model fitted in %.3f s"
      % svr_fit)

t0 = time.time()
kr.fit(X[:train_size], y[:train_size])
kr_fit = time.time() - t0
print("KRR complexity and bandwidth selected and model fitted in %.3f s"
      % kr_fit)
rng = np.random.RandomState(0)

# Generate sample data: a noisy sine over inputs scaled to [0, 15).
X = 15 * rng.rand(100, 1)
y = np.sin(X).ravel()
y += 3 * (0.5 - rng.rand(X.shape[0]))  # add noise

# Fit KernelRidge with parameter selection based on 5-fold cross validation.
# The kernel grid is every (length scale, periodicity) pair, with the length
# scale varying slowest.
param_grid = {
    "alpha": [1e0, 1e-1, 1e-2, 1e-3],
    "kernel": [
        ExpSineSquared(length_scale, periodicity)
        for length_scale in np.logspace(-2, 2, 10)
        for periodicity in np.logspace(0, 2, 10)
    ],
}
kr = GridSearchCV(KernelRidge(), param_grid=param_grid, cv=5)
stime = time.time()
kr.fit(X, y)
print("Time for KRR fitting: %.3f" % (time.time() - stime))

# Gaussian process with a periodic kernel plus a white-noise term.
periodic_part = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1))
gp_kernel = periodic_part + WhiteKernel(1e-1)
gpr = GaussianProcessRegressor(kernel=gp_kernel)
stime = time.time()
gpr.fit(X, y)
print("Time for GPR fitting: %.3f" % (time.time() - stime))

# Predict using kernel ridge
X_plot = np.linspace(0, 20, 10000)[:, None]
stime = time.time()
y_kr = kr.predict(X_plot)
def make_kernel_ridge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5):
    """Return a new, unfitted ``KernelRidge`` built from the given settings.

    Args:
        alpha: forwarded to ``KernelRidge(alpha=...)``.
        kernel: forwarded to ``KernelRidge(kernel=...)``.
        degree: forwarded to ``KernelRidge(degree=...)``.
        coef0: forwarded to ``KernelRidge(coef0=...)``.

    Returns:
        The constructed ``KernelRidge`` instance.
    """
    settings = {
        'alpha': alpha,
        'kernel': kernel,
        'degree': degree,
        'coef0': coef0,
    }
    return KernelRidge(**settings)
# Kernel Ridge Regression with hyperparameter optimization and
# cross-validation using GridSearchCV.
#
# In[79]:

## Train Kernel Ridge Regression Model ##
# The grid pairs four regularization strengths with every
# (length scale, periodicity) combination of the ExpSineSquared kernel.
param_grid = {
    "alpha": [1e0, 1e-1, 1e-2, 1e-3],
    "kernel": [
        ExpSineSquared(length_scale, periodicity)
        for length_scale in np.logspace(-2, 2, 10)
        for periodicity in np.logspace(0, 2, 10)
    ],
}

krr_opt = GridSearchCV(KernelRidge(), cv=5, param_grid=param_grid)
krr_opt.fit(X_train_fl, Prop_train_fl)

# Predict with the refitted best estimator and persist each set of
# predictions as soon as it is available.
Pred_train_fl = krr_opt.predict(X_train_fl)
np.savetxt('Pred_train.csv', Pred_train_fl)

Pred_test_fl = krr_opt.predict(X_test_fl)
np.savetxt('Pred_test.csv', Pred_test_fl)

# To compare random forest with another ML technique:
#
# LASSO Regression with hyperparameter optimization and cross-validation
# using GridSearchCV.

# In[21]:

## Train LASSO Regression Model ##
# Add noise to every fifth target
y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))

X_plot = np.linspace(0, 5, 100000)[:, None]

# #############################################################################
# Fit regression model
train_size = 100

# 5-fold grid searches sharing one gamma grid: SVR scans C, kernel ridge
# scans alpha.
svr = GridSearchCV(
    SVR(kernel='rbf', gamma=0.1),
    param_grid={
        "C": [1e0, 1e1, 1e2, 1e3],
        "gamma": np.logspace(-2, 2, 5),
    },
    cv=5,
)

kr = GridSearchCV(
    KernelRidge(kernel='rbf', gamma=0.1),
    param_grid={
        "alpha": [1e0, 0.1, 1e-2, 1e-3],
        "gamma": np.logspace(-2, 2, 5),
    },
    cv=5,
)

# Wall-clock time of model selection + fit on the first train_size samples.
t0 = time.time()
svr.fit(X[:train_size], y[:train_size])
svr_fit = time.time() - t0
print("SVR complexity and bandwidth selected and model fitted in %.3f s"
      % svr_fit)

t0 = time.time()
kr.fit(X[:train_size], y[:train_size])
kr_fit = time.time() - t0
def ml_krr(features,
           labels,
           train_test_ids,
           to_predict_features,
           to_predict_ids,
           alpha_list=np.logspace(-1, -9, 9),
           gamma_list=np.logspace(-1, -9, 9),
           kernel_list=['rbf'],
           sample_size=0.8,
           is_scaled=False,
           n_cv=5,
           path="."):
    """
    Grid-search a kernel ridge regressor, evaluate it and predict new points.

    The data is split (and optionally scaled) by ``split_scale_data``. A
    ``GridSearchCV`` over alpha, gamma and kernel, scored by negative mean
    absolute error with ``n_cv`` folds, is fitted on the training part. The
    best estimator is passed to ``predict_and_error`` to obtain the test
    MAE/MSE and the train/test predictions, and is then used to predict
    ``to_predict_features``. Results are written via ``write_output`` and
    also returned.

    Args:
        features (2D ndarray)   :   descriptor input for the machine
                                    learning algorithm for training/testing
        labels (1D ndarray)     :   property labels for the machine
                                    learning algorithm for training/testing
        train_test_ids (1D ndarray) :   pythonic ids (of features and
                                    labels) for training and testing.
        to_predict_features (ndarray) : descriptor input for the points to
                                    predict; passed straight to
                                    ``predict`` (presumably same layout as
                                    ``features`` -- confirm with caller)
        to_predict_ids (1D ndarray) :   pythonic ids (of features and
                                    labels) omitted from training and
                                    testing.
        alpha_list (list)   :   Regularization parameter grid.
                                Defaults to np.logspace(-1, -9, 9)
        gamma_list (list)   :   Kernel function scaling parameter grid.
                                Defaults to np.logspace(-1, -9, 9)
        kernel_list (list)  :   List of kernel functions (see sklearn
                                documentation for options).
                                Defaults to ['rbf']
        sample_size (float) :   Forwarded to ``split_scale_data``; defines
                                the ratio of the training-test split.
                                Defaults to 0.8
        is_scaled (bool)    :   Forwarded to ``split_scale_data``; if set
                                to True, the features are scaled.
                                Defaults to False
        n_cv (int)          :   Number of cross-validation splits.
                                Defaults to 5
        path (str)          :   path to write the machine learning output
                                to. Defaults to the current working
                                directory

    Returns:
        dict :  machine learning results with the following keys:
                ids_train, ids_test, ids_predicted, method_params,
                output (.label_predicted, .label_train, .label_test),
                metrics_test, metrics_validation, metrics_training
    """
    # load, split and scale data
    x_train, x_test, y_train, y_test, ids_train, ids_test = split_scale_data(
        features, labels, train_test_ids, sample_size, is_scaled)

    # Grid-searched kernel ridge regression. The 'kernel' entry of the grid
    # overrides the kernel given to the template estimator.
    search_space = {
        "alpha": alpha_list,
        "gamma": gamma_list,
        "kernel": kernel_list,
    }
    learner = GridSearchCV(KernelRidge(kernel='rbf'),
                           param_grid=search_space,
                           scoring='neg_mean_absolute_error',
                           cv=n_cv,
                           n_jobs=8,
                           return_train_score=True)

    fit_started = time.time()
    learner.fit(x_train, y_train)
    fit_finished = time.time()
    print("ml time", str(fit_finished - fit_started))

    # Evaluate the best parameter combination on the held-out test set.
    learner_best = learner.best_estimator_
    mae, mse, y_pred, train_y_pred, learner_best = predict_and_error(
        learner_best, x_test, x_train, y_test)

    # predict remaining datapoints
    y_to_predict = learner_best.predict(to_predict_features)

    ### OUTPUT ###
    write_output(
        learner,
        sample_size,
        "krr",
        mae,
        mse,
        "param",
        ids_test,
        y_test,
        y_pred,
        ids_train,
        y_train,
        train_y_pred,
        to_predict_ids,
        y_to_predict,
        path,
    )

    # Scores were negated MAE, so flip the sign back for reporting.
    cv_results = learner.cv_results_
    best = learner.best_index_
    validation_metrics = {
        "mae": -1 * cv_results['mean_test_score'][best],
        "std": cv_results['std_test_score'][best]
    }
    training_metrics = {
        "mae": -1 * cv_results['mean_train_score'][best],
        "std": cv_results['std_train_score'][best]
    }

    ml_results = {
        "ids_train": ids_train,
        "ids_test": ids_test,
        "ids_predicted": to_predict_ids,
        "method_params": learner.best_params_,
        "output": {
            "label_predicted": y_to_predict.tolist(),
            "label_train": train_y_pred.tolist(),
            "label_test": y_pred.tolist()
        },
        "metrics_test": {
            "mae": mae,
            "mse": mse
        },
        "metrics_validation": validation_metrics,
        "metrics_training": training_metrics,
    }
    return ml_results
# Random inputs drawn from rng (created earlier in the file), scaled to [0, 5).
x = 5 * rng.rand(100, 1)
y = np.sin(x).ravel()  # the labels follow a sine curve
print(y.shape)

# Add noise to every fifth target (100 samples -> 20 noisy points).
y[::5] += 3 * (0.5 - rng.rand(20, 1).ravel())
print(y.shape)
print(y[::5].shape)  # (20,)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

# Search kernel, regularization strength and kernel bandwidth together.
# The kernel name was previously misspelled 'polynomail', which is not a
# valid kernel, so every candidate using it failed to fit. Three hand-picked
# KernelRidge instances that were assigned to `kr` and immediately
# overwritten have been dropped.
kr = GridSearchCV(KernelRidge(),
                  param_grid={
                      "kernel": ['rbf', 'laplacian', 'polynomial', 'sigmoid'],
                      "alpha": [1e0, 0.1, 1e-2, 1e-3],
                      "gamma": np.logspace(-2, 2, 5)
                  })
print(np.logspace(-2, 2, 5))

# Fit the model
kr.fit(x_train, y_train)

# Inspect the tuning result: the best score and the best parameters
print(kr.best_score_, kr.best_params_)
def evaluate_algorithms(features, targets):
    """Cross-validate a fixed set of regressors and print a comparison table.

    For every regressor two numbers are printed, tab-separated, after its
    name:

    * the relative error (``relative_error``) of the out-of-fold predictions
      from a 10-fold ``cross_val_predict``;
    * the absolute value of the mean ``neg_mean_absolute_error`` score from
      ``cross_val_score`` over a default ``ShuffleSplit``.

    ``LinearRegression`` is additionally fitted on the full data so its
    coefficients can be printed. A blank line separates the rows.

    Args:
        features: feature matrix passed unchanged to scikit-learn.
        targets: target values passed unchanged to scikit-learn.

    Returns:
        None. All results are written to stdout.
    """
    # Shuffled splits for cross_val_score. cross_val_predict keeps cv=10
    # because it needs folds that partition the data.
    cv = ShuffleSplit()

    # (row label, estimator) pairs, in the order they are reported.
    candidates = [
        ('LinearRegression', LinearRegression()),
        ('Lasso', Lasso()),
        ('Ridge', Ridge()),
        ('KernelRidge', KernelRidge()),
        ('SVR_Lin', SVR(kernel='linear')),
        ('SVR_Poly', SVR(kernel='poly')),
        ('SVR_RBF', SVR(kernel='rbf')),
        ('KNeighborsRegressor, weight uniform', KNeighborsRegressor()),
        ('KNeighborsRegressor, weight inversely proportional to distance',
         KNeighborsRegressor(weights='distance')),
        ('GaussianProcessRegressor', GaussianProcessRegressor()),
        ('MLPRegressor', MLPRegressor()),
        ('DecisionTreeRegressor', DecisionTreeRegressor()),
        ('RandomForestRegressor', RandomForestRegressor()),
        ('BaggingRegressor with RandomForestRegressor',
         BaggingRegressor(RandomForestRegressor())),
        ('BaggingRegressor with DecisionTreeRegressor',
         BaggingRegressor(DecisionTreeRegressor())),
        ('BaggingRegressor with LinearRegression',
         BaggingRegressor(LinearRegression())),
        ('GradientBoostingRegressor', GradientBoostingRegressor()),
    ]

    print('Method\tMeanRelativeError\tMeanAbsoluteError')
    # A bare `print` only emits a blank line under Python 2; under Python 3
    # it is a no-op expression. print('') yields a blank line on both.
    print('')

    for label, estimator in candidates:
        predicted = cross_val_predict(estimator, features, targets, cv=10)
        scores = cross_val_score(estimator,
                                 features,
                                 targets,
                                 cv=cv,
                                 scoring='neg_mean_absolute_error')
        print('%s\t%0.3f\t%0.3f' %
              (label, relative_error(targets, predicted), abs(scores.mean())))

        if label == 'LinearRegression':
            # Fit on the full data only to expose the coefficients.
            estimator.fit(features, targets)
            print('coefficients', estimator.coef_)

        print('')

    # Two extra blank lines close the table.
    print('')
    print('')
def getvalue():
    """Predict crop production for the submitted form and render the result.

    Reads ``state_name``, ``district_name``, ``crop``, ``season``, ``area``
    and ``year`` from ``request.form``. It loads ``crop_modified.csv``,
    keeps the rows matching the requested state, district, season and crop,
    and fits an averaging ensemble (kernel ridge + scaled lasso) of
    ``Production`` on ``Crop_Year`` and ``Area``. The ensemble then predicts
    production for the requested year and area.

    Returns:
        The rendered ``crop.html`` template with ``pr`` set to the absolute
        predicted production and ``yl`` set to that value divided by the
        submitted area.

    Raises:
        ValueError: if ``area`` or ``year`` cannot be parsed as a number.
        Errors raised by pandas / scikit-learn (e.g. when no rows match the
        selection) propagate unchanged.
    """
    state = request.form['state_name']
    district = request.form['district_name'].upper()
    crop = request.form['crop']
    season = request.form['season']
    area_float = float(request.form['area'])
    year_int = int(request.form['year'])

    import os

    import numpy as np
    import pandas as pd
    from sklearn.base import (BaseEstimator, RegressorMixin,
                              TransformerMixin, clone)
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.linear_model import Lasso
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import RobustScaler

    # NOTE(review): this changes the working directory of the whole process
    # to a machine-specific path; kept because the CSV is opened relative to
    # it, but it should become configuration.
    os.chdir(r"C:\Users\Hp\Downloads\indian-farming-prediction-master")
    crop_data = pd.read_csv("crop_modified.csv")
    crop_data = crop_data.dropna()
    # Stored names carry trailing whitespace; strip it so equality matches.
    crop_data['State_Name'] = crop_data['State_Name'].str.rstrip()
    crop_data['Season'] = crop_data['Season'].str.rstrip()

    # Select the history for the requested state/district/season/crop once.
    selection = ((crop_data['State_Name'] == state)
                 & (crop_data['District_Name'] == district)
                 & (crop_data['Season'] == season)
                 & (crop_data['Crop'] == crop))
    final = crop_data.loc[selection, ['Crop_Year', 'Area', 'Production']]
    X = final[['Crop_Year', 'Area']]
    Y = final['Production']

    class StackedAveragingModels(BaseEstimator, RegressorMixin,
                                 TransformerMixin):
        """Ensemble whose prediction is the mean of its models' predictions."""

        def __init__(self, models):
            self.models = models

        def fit(self, X, y):
            # Fit clones so the estimators passed in stay untouched.
            self.models_ = [clone(model) for model in self.models]
            for model in self.models_:
                model.fit(X, y)
            return self

        def predict(self, X):
            # One column per fitted model, averaged row-wise.
            predictions = np.column_stack(
                [model.predict(X) for model in self.models_])
            return np.mean(predictions, axis=1)

    lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
    KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

    averaged_models = StackedAveragingModels(models=(KRR, lasso))
    model = averaged_models.fit(X, Y)

    # Query with the same column names the models were fitted on, so the
    # estimators see consistently named features.
    query = pd.DataFrame([[year_int, area_float]],
                         columns=['Crop_Year', 'Area'])
    prod2 = model.predict(query)
    prod2 = abs(prod2)
    print("Prediction is: ", prod2)

    yld = prod2 / area_float
    return render_template("crop.html", pr=prod2, yl=yld)
cv_elastic.plot(title="Validation")
plt.xlabel("Alpha")
plt.ylabel("Rmse")

# # 4. Kernel ridge regression
# Kernel ridge regression (KRR) combines Ridge Regression (linear least
# squares with l2-norm regularization) with the 'kernel trick'

# In[8]:

# List of alphas to try
alphas = [30, 25, 20, 15, 10, 5, 1, 0.1, 0.01, 0.001]

# Mean cross-validated RMSE for each alpha, indexed by that alpha
cv_krr = pd.Series(
    [rmse_cv(KernelRidge(alpha=candidate)).mean() for candidate in alphas],
    index=alphas,
)

# Plot findings
cv_krr.plot(title="Validation")
plt.xlabel("Alpha")
plt.ylabel("Rmse")

# # Model initializing

# In[6]:

# Different models that are being initialized
# 1. Ridge Regression
def test_featurizations_and_plot(featurization_dict,
                                 y,
                                 inner_cv=KFold(n_splits=5, shuffle=True),
                                 outer_cv=ShuffleSplit(n_splits=20,
                                                       test_size=0.2),
                                 make_plots=False,
                                 save_plot=False,
                                 verbose=False,
                                 target_prop_name='',
                                 units='',
                                 make_combined_plot=False):
    '''
    Score kernel ridge regression on several featurizations of the same
    data and print a LaTeX table sorted by mean test MAE (best first).

    Each featurization is evaluated with ``nested_grid_search_CV`` over an
    RBF ``KernelRidge`` alpha/gamma grid. Optionally one actual-vs-predicted
    scatter plot per featurization is drawn.

    inputs:
        featurization_dict: dictionary of the form {name : x}, where 'name'
            is a string and 'x' is the feature array for that featurization
            (1-D arrays are reshaped to a single column).
        y: training data labels, indexable by the integer index arrays
            produced by ``outer_cv.split``.
        inner_cv: CV splitter forwarded to ``nested_grid_search_CV``.
        outer_cv: CV splitter forwarded to ``nested_grid_search_CV``; its
            first split is also used for the scatter plots.
        make_plots: if True, draw one scatter subplot per featurization.
        save_plot: if True (and make_plots), save the figure as
            'model_comparison<target_prop_name>.pdf'.
        verbose: print progress and forward to ``nested_grid_search_CV``.
        target_prop_name: property name used in axis labels and file name.
        units: unit string appended to the MAE shown in each subplot.
        make_combined_plot: if True, a figure is created even when
            make_plots is False.

    returns:
        None. The table is printed to stdout.
    '''
    RMSE = {}
    mean_abs_err = {}
    mean_abs_err_train = {}
    std_abs_err_train = {}
    std_abs_err = {}
    mean_MAPE = {}
    mean_R2train = {}
    mean_R2test = {}
    mean_rPtest = {}
    mean_rPtrain = {}
    model_dict = {}

    subplot_index = 1
    num_featurizations = len(featurization_dict)
    num_fig_rows = 5
    # np.ceil returns a float, but plt.subplot needs an integer grid size.
    num_fig_columns = int(np.ceil((num_featurizations + 1) / num_fig_rows))

    if make_combined_plot or make_plots:
        plt.clf()
        plt.figure(figsize=(6 * num_fig_columns, 6 * num_fig_rows))

    for name, x in featurization_dict.items():
        if verbose:
            print("running %s" % name)

        if x.ndim == 1:
            x = x.reshape(-1, 1)

        # ------ model selection & grid search (nested CV) ----
        model = KernelRidge()
        param_grid = {
            "alpha": np.logspace(-15, 2, 200),
            "gamma": np.logspace(-15, -2, 50),
            "kernel": ['rbf']
        }

        scores_dict = nested_grid_search_CV(x,
                                            y,
                                            model,
                                            param_grid,
                                            inner_cv=inner_cv,
                                            outer_cv=outer_cv,
                                            verbose=verbose)

        # Error scores are multiplied by -1 here, i.e. the helper is
        # expected to report them negated (scikit-learn "neg_*" convention).
        RMSE[name] = np.sqrt(-1 * scores_dict['RMSE'].mean())
        mean_MAPE[name] = -1 * scores_dict['MAPE'].mean()
        mean_abs_err_train[name] = -1 * scores_dict['MAE_train'].mean()
        mean_abs_err[name] = -1 * scores_dict['MAE'].mean()
        std_abs_err_train[name] = np.std(-1 * scores_dict['MAE_std_train'])
        std_abs_err[name] = np.std(-1 * scores_dict['MAE_std'])
        mean_R2test[name] = scores_dict['R2'].mean()
        mean_R2train[name] = scores_dict['R2_train'].mean()
        mean_rPtrain[name] = scores_dict['rP_train'].mean()
        mean_rPtest[name] = scores_dict['rP'].mean()
        # NOTE(review): this is the estimator object handed to
        # nested_grid_search_CV, not necessarily the tuned one -- confirm
        # whether the helper sets the selected hyperparameters on it.
        model_dict[name] = model

    # Best (lowest) mean test MAE first.
    sorted_names = sorted(mean_abs_err, key=mean_abs_err.__getitem__)

    if make_plots:
        for name in sorted_names:
            x = featurization_dict[name]
            if x.ndim == 1:
                x = x.reshape(-1, 1)
            model = model_dict[name]

            ax = plt.subplot(num_fig_rows, num_fig_columns, subplot_index)
            subplot_index += 1

            plt.xlabel('Actual ' + target_prop_name, fontsize=19)
            plt.ylabel('Predicted ' + target_prop_name, fontsize=19)
            label = r'$\langle$MAE$\rangle$ (test) = ' + " %4.2f " % (
                mean_abs_err[name]
            ) + units + "\n" + r'$\langle r\rangle$ (test) = %4.2f' % (
                mean_rPtest[name])
            plt.text(.045, .85, label, fontsize=21, transform=ax.transAxes)

            # Illustrate with the first split of the outer CV only.
            train, test = next(outer_cv.split(x))
            model.fit(x[train], y[train])
            y_pred_test = model.predict(x[test])
            y_pred_train = model.predict(x[train])

            plt.scatter(y[test], y_pred_test, label='Test', c='blue',
                        alpha=0.7)
            plt.scatter(y[train],
                        y_pred_train,
                        label='Train',
                        c='lightgreen',
                        alpha=0.7)
            plt.legend(loc=4, fontsize=21)

            # square axes
            maxy = 1.05 * max([max(y_pred_train), max(y_pred_test), max(y)])
            miny = .95 * min([min(y_pred_train), min(y_pred_test), min(y)])

            # reference line (perfect prediction)
            plt.plot([miny, maxy], [miny, maxy], 'k-')
            plt.xlim([miny, maxy])
            plt.ylim([miny, maxy])

        plt.tight_layout()
        if save_plot:
            plt.savefig('model_comparison' + target_prop_name.strip() +
                        '.pdf')
        plt.show()

    print("\\begin{tabular}{c c c c c c c c c}")
    print(
        " name & MAE_{\\ff{train}} & MAE_{\\ff{test}} & MAPE_{\\ff{test}} & RMSE_{\\ff{test}} & R^2_{\\ff{train}} & R^2_{\\ff{test}} & r_{\\ff{train}} & r_{\\ff{test}} \\\\ "
    )
    print("\\hline")
    for name in sorted_names:
        print(
            "%30s & %5.3f $\\pm$ %3.2f & %5.3f $\\pm$ %3.2f & %5.2f & %5.3f & %5.2f & %5.2f & %5.2f & %5.2f \\\\"
            % (name, mean_abs_err_train[name], std_abs_err_train[name],
               mean_abs_err[name], std_abs_err[name], mean_MAPE[name],
               RMSE[name], mean_R2train[name], mean_R2test[name],
               mean_rPtrain[name], mean_rPtest[name]))
    print("\\end{tabular}")
# Hold out 30% of the Boston housing data for testing.
X, Y = boston.data, boston.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3)

# Summary of the note below (kept verbatim): kernel ridge regression adds
# the kernel technique to the l2-regularized linear model (ridge
# regression). The weights w are expressed through dual coefficients beta,
# giving beta = (lambda*I + K)^-1 * y. It is fast on medium-sized data
# sets but struggles on large ones; the note puts training at O(n^3) and
# prediction at O(n).
'''
核岭回归:
    在l2正则化的线性模型(岭回归)的基础上,引入了核技术的概念
    在岭回归中,用w* = ∑β*z,也就是β代替w
    代价函数随之替换一下即可
    使用梯度下降求解,β = (λI + K)^-1 * y
特点:
    对于中型数据集较快,但对于大数据集就很吃力了
    训练时间复杂度O(n^3),挺高的
    预测时间复杂度O(n)
'''

# Linear-kernel ridge regression; the remaining keyword arguments are
# spelled out explicitly.
rg = KernelRidge(
    alpha=1,
    kernel='linear',
    gamma=None,
    degree=3,
    coef0=1,
    kernel_params=None,
)
rg.fit(X_train, Y_train)
Y_pre = rg.predict(X_test)
# The score on the held-out set is computed but its value is not stored.
rg.score(X_test, Y_test)

# Parameter glossary (translation of the note below):
#   alpha          penalty (regularization) coefficient
#   kernel         choice of kernel function
#   gamma          a parameter of the kernel function
#   degree         degree of the polynomial kernel
#   coef0          a parameter of the polynomial and sigmoid kernels
#   kernel_params  additional parameters for the kernel function
'''
    alpha 惩罚项系数
    kernel 核函数的选定
    gamma 核函数的中的一个参数项
    degree 多项式核的程度
    coef0 多项式核和sigmoid核中一个参数设定
    kernel_params 核函数的附加参数
'''
def test_generalization_across_time():
    """Test time generalization decoding.

    Exercises ``GeneralizationAcrossTime`` end to end on the epochs returned
    by ``make_epochs()``: string representations, fitting with optional
    ``y``, the different ``predict_method`` / ``predict_mode`` /
    ``score_mode`` settings, training and testing time parameters, custom
    cross-validation objects, empty folds, and finally a sweep over
    classifier / regressor / scorer combinations.

    The statements are order dependent: ``gat``, ``scores`` and ``epochs``
    are re-bound or mutated (``epochs.crop``) as the test advances.
    """
    from sklearn.svm import SVC
    from sklearn.base import is_classifier
    # KernelRidge is used for testing 1) regression analyses 2) n-dimensional
    # predictions.
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import roc_auc_score, mean_squared_error

    epochs = make_epochs()
    y_4classes = np.hstack((epochs.events[:7, 2], epochs.events[7:, 2] + 1))
    if check_version('sklearn', '0.18'):
        from sklearn.model_selection import (KFold, StratifiedKFold,
                                             ShuffleSplit, LeaveOneGroupOut)
        cv = LeaveOneGroupOut()
        cv_shuffle = ShuffleSplit()
        # XXX we cannot pass any other parameters than X and y to cv.split
        # so we have to build it before hand
        cv_lolo = [
            (train, test)
            for train, test in cv.split(y_4classes, y_4classes, y_4classes)
        ]

        # With sklearn >= 0.17, `clf` can be identified as a regressor, and
        # the scoring metrics can therefore be automatically assigned.
        # NOTE(review): this branch is gated on '0.18', not '0.17' -- confirm
        # which version the comment above should name.
        scorer_regress = None
    else:
        from sklearn.cross_validation import (KFold, StratifiedKFold,
                                              ShuffleSplit, LeaveOneLabelOut)
        cv_shuffle = ShuffleSplit(len(epochs))
        cv_lolo = LeaveOneLabelOut(y_4classes)

        # With sklearn < 0.17, `clf` cannot be identified as a regressor, and
        # therefore the scoring metrics cannot be automatically assigned.
        scorer_regress = mean_squared_error

    # Test default running
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(picks='foo')
    assert_equal("<GAT | no fit, no prediction, no score>", "%s" % gat)
    assert_raises(ValueError, gat.fit, epochs)
    with warnings.catch_warnings(record=True):
        # check classic fit + check manual picks
        gat.picks = [0]
        gat.fit(epochs)
        # check optional y as array
        gat.picks = None
        gat.fit(epochs, y=epochs.events[:, 2])
        # check optional y as list
        gat.fit(epochs, y=epochs.events[:, 2].tolist())
    # NOTE(review): the third positional argument (1) is presumably taken as
    # the failure message by assert_equal rather than compared -- confirm.
    assert_equal(len(gat.picks_), len(gat.ch_names), 1)
    assert_equal(
        "<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), no "
        "prediction, no score>", '%s' % gat)
    assert_equal(gat.ch_names, epochs.ch_names)

    # test different predict function:
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(predict_method='decision_function')
    gat.fit(epochs)
    # With classifier, the default cv is StratifiedKFold
    assert_true(gat.cv_.__class__ == StratifiedKFold)
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 1))
    gat.predict_method = 'predict_proba'
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 2))
    gat.predict_method = 'foo'
    assert_raises(NotImplementedError, gat.predict, epochs)
    gat.predict_method = 'predict'
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 1))
    assert_equal(
        "<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), "
        "predicted 14 epochs, no score>", "%s" % gat)
    gat.score(epochs)
    assert_true(gat.scorer_.__name__ == 'accuracy_score')
    # check clf / predict_method combinations for which the scoring metrics
    # cannot be inferred.
    gat.scorer = None
    gat.predict_method = 'decision_function'
    assert_raises(ValueError, gat.score, epochs)
    # Check specifying y manually
    gat.predict_method = 'predict'
    gat.score(epochs, y=epochs.events[:, 2])
    gat.score(epochs, y=epochs.events[:, 2].tolist())
    assert_equal(
        "<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), "
        "predicted 14 epochs,\n scored "
        "(accuracy_score)>", "%s" % gat)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs, y=epochs.events[:, 2])

    # An unknown predict_mode must be rejected; restore the old one after.
    old_mode = gat.predict_mode
    gat.predict_mode = 'super-foo-mode'
    assert_raises(ValueError, gat.predict, epochs)
    gat.predict_mode = old_mode

    gat.score(epochs, y=epochs.events[:, 2])
    assert_true("accuracy_score" in '%s' % gat.scorer_)
    epochs2 = epochs.copy()

    # check _DecodingTime class
    assert_equal(
        "<DecodingTime | start: -0.200 (s), stop: 0.499 (s), step: "
        "0.050 (s), length: 0.050 (s), n_time_windows: 15>",
        "%s" % gat.train_times_)
    assert_equal(
        "<DecodingTime | start: -0.200 (s), stop: 0.499 (s), step: "
        "0.050 (s), length: 0.050 (s), n_time_windows: 15 x 15>",
        "%s" % gat.test_times_)

    # the y-check
    gat.predict_mode = 'mean-prediction'
    epochs2.events[:, 2] += 10  # shift the event codes of the copy
    gat_ = copy.deepcopy(gat)
    with use_log_level('error'):
        assert_raises(ValueError, gat_.score, epochs2)
    gat.predict_mode = 'cross-validation'

    # Test basics
    # --- number of trials
    assert_true(gat.y_train_.shape[0] == gat.y_true_.shape[0] == len(
        gat.y_pred_[0][0]) == 14)
    # --- number of folds
    assert_true(np.shape(gat.estimators_)[1] == gat.cv)
    # --- length training size
    assert_true(
        len(gat.train_times_['slices']) == 15 == np.shape(gat.estimators_)[0])
    # --- length testing sizes
    assert_true(
        len(gat.test_times_['slices']) == 15 == np.shape(gat.scores_)[0])
    assert_true(
        len(gat.test_times_['slices'][0]) == 15 == np.shape(gat.scores_)[1])

    # Test score_mode
    gat.score_mode = 'foo'
    assert_raises(ValueError, gat.score, epochs)
    gat.score_mode = 'fold-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15, 5])
    gat.score_mode = 'mean-sample-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15])
    gat.score_mode = 'mean-fold-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15])
    gat.predict_mode = 'mean-prediction'
    with warnings.catch_warnings(record=True) as w:
        gat.score(epochs)
        assert_true(
            any("score_mode changed from " in str(ww.message) for ww in w))

    # Test longer time window
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times={'length': .100})
    with warnings.catch_warnings(record=True):
        gat2 = gat.fit(epochs)
    assert_true(gat is gat2)  # return self
    assert_true(hasattr(gat2, 'cv_'))
    assert_true(gat2.cv_ != gat.cv)
    with warnings.catch_warnings(record=True):  # not vectorizing
        scores = gat.score(epochs)
    assert_true(isinstance(scores, np.ndarray))  # type check
    assert_equal(len(scores[0]), len(scores))  # shape check
    assert_equal(len(gat.test_times_['slices'][0][0]), 2)

    # Decim training steps
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times={'step': .100})
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)
    gat.score(epochs)
    assert_true(len(gat.scores_) == len(gat.estimators_) == 8)  # training time
    assert_equal(len(gat.scores_[0]), 15)  # testing time

    # Test start stop training & test cv without n_fold params
    y_4classes = np.hstack((epochs.events[:7, 2], epochs.events[7:, 2] + 1))
    train_times = dict(start=0.090, stop=0.250)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_lolo, train_times=train_times)
    # predict without fit
    assert_raises(RuntimeError, gat.predict, epochs)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs, y=y_4classes)
    gat.score(epochs)
    assert_equal(len(gat.scores_), 4)
    assert_equal(gat.train_times_['times'][0], epochs.times[6])
    assert_equal(gat.train_times_['times'][-1], epochs.times[9])

    # Test score without passing epochs & Test diagonal decoding
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(test_times='diagonal')
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.fit(epochs)
    assert_raises(RuntimeError, gat.score)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.predict(epochs)
    scores = gat.score()
    assert_true(scores is gat.scores_)
    assert_equal(np.shape(gat.scores_), (15, 1))
    assert_array_equal(
        [tim for ttime in gat.test_times_['times'] for tim in ttime],
        gat.train_times_['times'])

    # Test generalization across conditions
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(predict_mode='mean-prediction', cv=2)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs[0:6])
    with warnings.catch_warnings(record=True):
        # There are some empty test folds because of n_trials
        gat.predict(epochs[7:])
        gat.score(epochs[7:])

    # Test training time parameters
    gat_ = copy.deepcopy(gat)
    # --- start stop outside time range
    gat_.train_times = dict(start=-999.)
    with use_log_level('error'):
        assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(start=999.)
    assert_raises(ValueError, gat_.fit, epochs)
    # --- impossible slices
    gat_.train_times = dict(step=.000001)
    assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(length=.000001)
    assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(length=999.)
    assert_raises(ValueError, gat_.fit, epochs)

    # Test testing time parameters
    # --- outside time range
    gat.test_times = dict(start=-999.)
    with warnings.catch_warnings(record=True):  # no epochs in fold
        assert_raises(ValueError, gat.predict, epochs)
    gat.test_times = dict(start=999.)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    # --- impossible slices
    gat.test_times = dict(step=.000001)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    gat_ = copy.deepcopy(gat)
    gat_.train_times_['length'] = .000001
    gat_.test_times = dict(length=.000001)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat_.predict, epochs)
    # --- test time region of interest
    gat.test_times = dict(step=.150)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 5, 14, 1))
    # --- silly value
    gat.test_times = 'foo'
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    assert_raises(RuntimeError, gat.score)
    # --- unmatched length between training and testing time
    gat.test_times = dict(length=.150)
    assert_raises(ValueError, gat.predict, epochs)
    # --- irregular length training and testing times
    # 2 estimators, the first one is trained on two successive time samples
    # whereas the second one is trained on a single time sample.
    train_times = dict(slices=[[0, 1], [1]])
    # The first estimator is tested once, the second estimator is tested on
    # two successive time samples.
    test_times = dict(slices=[[[0, 1]], [[0], [1]]])
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times=train_times,
                                       test_times=test_times)
    gat.fit(epochs)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.score(epochs)
    assert_array_equal(np.shape(gat.y_pred_[0]), [1, len(epochs), 1])
    assert_array_equal(np.shape(gat.y_pred_[1]), [2, len(epochs), 1])
    # check cannot Automatically infer testing times for adhoc training times
    gat.test_times = None
    assert_raises(ValueError, gat.predict, epochs)

    svc = SVC(C=1, kernel='linear', probability=True)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(clf=svc, predict_mode='mean-prediction')
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)

    # sklearn needs it: c.f.
    # https://github.com/scikit-learn/scikit-learn/issues/2723
    # and http://bit.ly/1u7t8UT
    with use_log_level('error'):
        assert_raises(ValueError, gat.score, epochs2)
        # Keep the result: the range checks below are meant for THIS run.
        # Previously the return value was dropped and the assertions read the
        # stale `scores` left over from the diagonal-decoding section.
        scores = gat.score(epochs)
    assert_true(0.0 <= np.min(scores) <= 1.0)
    assert_true(0.0 <= np.max(scores) <= 1.0)

    # Test that error if cv is not partition
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_shuffle,
                                       predict_mode='cross-validation')
    gat.fit(epochs)
    assert_raises(ValueError, gat.predict, epochs)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_shuffle,
                                       predict_mode='mean-prediction')
    gat.fit(epochs)
    gat.predict(epochs)

    # Test that gets error if train on one dataset, test on another, and don't
    # specify appropriate cv:
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime()
    gat.fit(epochs)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)

    gat.predict(epochs)
    assert_raises(ValueError, gat.predict, epochs[:10])

    # Make CV with some empty train and test folds:
    # --- empty test fold(s) should warn when gat.predict()
    gat._cv_splits[0] = [gat._cv_splits[0][0], np.empty(0)]
    with warnings.catch_warnings(record=True) as w:
        gat.predict(epochs)
        assert_true(len(w) > 0)
        assert_true(
            any('do not have any test epochs' in str(ww.message) for ww in w))
    # --- empty train fold(s) should raise when gat.fit()
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=[([0], [1]), ([], [0])])
    assert_raises(ValueError, gat.fit, epochs[:2])

    # Check that still works with classifier that output y_pred with
    # shape = (n_trials, 1) instead of (n_trials,)
    if check_version('sklearn', '0.17'):  # no is_regressor before v0.17
        with warnings.catch_warnings(record=True):  # dep
            gat = GeneralizationAcrossTime(clf=KernelRidge(), cv=2)
        # Mutates `epochs` in place; everything below uses the cropped data.
        epochs.crop(None, epochs.times[2])
        gat.fit(epochs)
        # With regression the default cv is KFold and not StratifiedKFold
        assert_true(gat.cv_.__class__ == KFold)
        gat.score(epochs)
        # with regression the default scoring metrics is mean squared error
        assert_true(gat.scorer_.__name__ == 'mean_squared_error')

    # Test combinations of complex scenarios
    # 2 or more distinct classes
    n_classes = [2, 4]  # 4 tested
    # nicely ordered labels or not
    le = LabelEncoder()
    y = le.fit_transform(epochs.events[:, 2])
    y[len(y) // 2:] += 2
    ys = (y, y + 1000)
    # Univariate and multivariate prediction
    svc = SVC(C=1, kernel='linear', probability=True)
    reg = KernelRidge()

    def scorer_proba(y_true, y_pred):
        # AUC computed from the first column of the predicted probabilities.
        return roc_auc_score(y_true, y_pred[:, 0])

    # We re testing 3 scenario: default, classifier + predict_proba, regressor
    scorers = [None, scorer_proba, scorer_regress]
    predict_methods = [None, 'predict_proba', None]
    clfs = [svc, svc, reg]
    # Test all combinations
    for clf, predict_method, scorer in zip(clfs, predict_methods, scorers):
        for y in ys:
            for n_class in n_classes:
                for predict_mode in ['cross-validation', 'mean-prediction']:
                    # Cannot use AUC for n_class > 2
                    if (predict_method == 'predict_proba' and n_class != 2):
                        continue

                    y_ = y % n_class

                    with warnings.catch_warnings(record=True):
                        gat = GeneralizationAcrossTime(
                            cv=2,
                            clf=clf,
                            scorer=scorer,
                            predict_mode=predict_mode)
                        gat.fit(epochs, y=y_)
                        gat.score(epochs, y=y_)

                    # Check that scorer is correctly defined manually and
                    # automatically.
                    scorer_name = gat.scorer_.__name__
                    if scorer is None:
                        if is_classifier(clf):
                            assert_equal(scorer_name, 'accuracy_score')
                        else:
                            assert_equal(scorer_name, 'mean_squared_error')
                    else:
                        assert_equal(scorer_name, scorer.__name__)
def kernel_ridge_gamma(gamma):
    """Return an RBF ``KernelRidge`` using the given ``gamma``.

    The regularisation strength is fixed at ``alpha=0.001``; only the kernel
    coefficient varies.
    """
    settings = {'kernel': 'rbf', 'gamma': gamma, 'alpha': 0.001}
    return KernelRidge(**settings)
import numpy as np
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# importing the dataset: first column is the target, the rest are features
dataset = pd.read_csv('regressionDataSet.csv')
x = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values

# splitting the dataset into training set (80%) and test set (20%)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1 / 5)

# fitting the model (default KernelRidge settings) on the training set
regressor = KernelRidge()
regressor.fit(x_train, y_train)

# predicting the test set results
y_pred = regressor.predict(x_test)

# calculating r2
r2 = r2_score(y_test, y_pred)

# calculating r
# r2_score can be negative when the model fits worse than predicting the
# mean; the square root is undefined there, so report NaN instead of letting
# the sqrt call fail and abort the script.
r = m.sqrt(r2) if r2 >= 0 else float('nan')

# calculating error
error = mean_absolute_error(y_test, y_pred)

#calculating accuracy
def kernel_ridge_alpha(alpha):
    """Return an RBF ``KernelRidge`` using the given ``alpha``.

    The kernel coefficient is fixed at ``gamma=0.1``; only the regularisation
    strength varies.
    """
    settings = {'kernel': 'rbf', 'gamma': 0.1, 'alpha': alpha}
    return KernelRidge(**settings)
# Read the pseudo-coverage table: each line holds two tab-separated floats,
# collected into the parallel lists px (column 0) and py (column 1).
px = []
py = []
with open('/home/redwards/Desktop/genus_species_analysis/pseudo_coverage.txt', 'r') as fin:
    for l in fin:
        p = l.strip().split("\t")
        px.append(float(p[0]))
        py.append(float(p[1]))

# Array versions of both data sets.
# NOTE(review): `x` and `y` are not defined in this snippet -- presumably
# they are filled in earlier in the file; confirm.
ny = np.array(y)
nx = np.array(x)
pnx = np.array(px)
pny = np.array(py)

# RBF kernel ridge model; `[:, None]` adds a trailing axis so each sample is
# passed as a one-element row.
kr = KernelRidge(kernel='rbf', gamma=7.5e-5, alpha=0.001)
kr.fit(nx[:, None], ny[:, None])
# Evaluate the fitted curve on 10000 evenly spaced points over the x range.
x_pred = np.linspace(min(x), max(x), 10000)[:, None]
y_pred = kr.predict(x_pred)

# The same estimator object is re-fitted on the pseudo-coverage data, so the
# first fit is no longer available after this call.
kr.fit(pnx[:, None], pny[:, None])
px_pred = np.linspace(min(px), max(px), 10000)[:, None]
py_pred = kr.predict(px_pred)

fig = plt.figure()
ax = fig.add_subplot(111)
"""
def test_kernel_ridge():
    """Linear-kernel KernelRidge must match intercept-free Ridge predictions."""
    # Reference: plain ridge regression without an intercept term.
    ridge_fit = Ridge(alpha=1, fit_intercept=False).fit(X, y)
    expected = ridge_fit.predict(X)

    # Candidate: kernel ridge with a linear kernel and the same alpha.
    kernel_fit = KernelRidge(kernel="linear", alpha=1).fit(X, y)
    actual = kernel_fit.predict(X)

    assert_array_almost_equal(expected, actual)
def kernelridge(xtrain, ytrain, xtest, ytest, alp):
    """Fit a KernelRidge model and print its test-set MAE and MSE.

    ``alp`` is passed to ``KernelRidge`` as ``alpha``; every other setting
    keeps its default.  The model is trained on ``xtrain``/``ytrain`` and
    evaluated on ``xtest``/``ytest``.  Nothing is returned.
    """
    model = KernelRidge(alpha=alp)
    model.fit(xtrain, ytrain)
    predictions = model.predict(xtest)

    mae = metrics.mean_absolute_error(ytest, predictions)
    print('MAE:', mae)
    mse = metrics.mean_squared_error(ytest, predictions)
    print('MSE:', mse)
def test_kernel_ridge_csc():
    """Same Ridge-vs-KernelRidge agreement check, run on the ``Xcsc`` input."""
    # Reference: intercept-free ridge regression with the Cholesky solver.
    ridge_fit = Ridge(alpha=1, fit_intercept=False,
                      solver="cholesky").fit(Xcsc, y)
    expected = ridge_fit.predict(Xcsc)

    # Candidate: kernel ridge with a linear kernel and the same alpha.
    kernel_fit = KernelRidge(kernel="linear", alpha=1).fit(Xcsc, y)
    actual = kernel_fit.predict(Xcsc)

    assert_array_almost_equal(expected, actual)
# NOTE(review): block nesting below is reconstructed from a flattened source
# line -- confirm against the original layout.

# Label every 8th sample with "<col0>-<col1>" built from the first two
# columns of `data` (both truncated to int).
for i in range(len(train_data)):
    if i % 8 == 0:
        week_data[i] = str(int(data[i, 0])) + '-' + str(int(data[i, 1]))
X_train_weeks, Y_train_weeks = reshape_dataset(week_data, lags, steps_ahead)

# Candidate hyper-parameters: 100 x 100 grid of (alpha, gamma) values.
alpha = np.linspace(1e-15, 5, 100)
gamma = np.linspace(1e-15, 1e-1, 100)
best_r2 = -1000
# The first 75% of X_train seeds the training window; the rest is validated
# one sample at a time.
training_size = int(X_train.shape[0] * 0.75)
validation_size = X_train.shape[0] - training_size
for a in alpha:
    for g in gamma:
        kr = KernelRidge(kernel='rbf', gamma=g, alpha=a)
        j = training_size
        validation_predictions = np.zeros(validation_size)
        for i in range(validation_size):
            # Sliding window: train on samples [i, j), predict sample j, then
            # advance both ends by one.
            kr.fit(X_train[i:j], Y_train[i:j])
            validation_predictions[i] = kr.predict(np.array([X_train[j]]))
            j += 1
        r2 = metrics.r2_score(Y_train[training_size:], validation_predictions)
        # Keep the best-scoring combination and its predictions.
        if r2 > best_r2:
            best_r2 = r2
            best_params = (lags, a, g)
            best_predictions = np.copy(validation_predictions)
# NOTE(review): best_params is only bound once some r2 exceeds -1000.
print(best_params)
mape = np.mean(
def test_kernel_ridge_precomputed():
    """A precomputed Gram matrix must give the same predictions as the kernel name."""
    for kernel_name in ("linear", "rbf", "poly", "cosine"):
        gram = pairwise_kernels(X, X, metric=kernel_name)

        # Kernel evaluated internally from the raw features.
        by_name = KernelRidge(kernel=kernel_name).fit(X, y)
        expected = by_name.predict(X)

        # Kernel supplied up front as a Gram matrix.
        by_matrix = KernelRidge(kernel="precomputed").fit(gram, y)
        actual = by_matrix.predict(gram)

        assert_array_almost_equal(expected, actual)
# NOTE(review): the first two statements use `i` as the random seed and
# accumulate into `score`, so they presumably sit inside a loop whose header
# is not part of this snippet -- confirm the original nesting.

# 50/50 train/test split seeded by `i`.
Xtrain, Xtest, ytrain, ytest = model_selection.train_test_split(
    X, y, train_size=0.5, test_size=0.5, random_state=i)
# Add this split's CV errors element-wise onto the running totals.
score = [
    sum(x) for x in zip(
        score, house_prices_functions.find_cv_error(Xtrain, ytrain))
]
# Turn the accumulated totals into averages over k_fold.
score = [x / k_fold for x in score]
print(score[0], " ", score[1])

#final Crossvalidation
# Candidate regressors, all with default settings except LightGBM's verbosity.
clfList = [
    linear_model.LinearRegression(),
    ensemble.RandomForestRegressor(),
    ensemble.GradientBoostingRegressor(),
    xgb.XGBRegressor(),
    KernelRidge(),
    linear_model.BayesianRidge(),
    lgb.LGBMRegressor(verbose=-1)
]
# Ten random 50/50 splits with a fixed seed.
cvSplit = model_selection.ShuffleSplit(n_splits=10,
                                       train_size=0.5,
                                       test_size=0.5,
                                       random_state=0)
# Hyper-parameter candidate lists.
maxDepthList = [2, 4]
nEstimatorsList = [400, 500]
num_leavesList = [4, 5]
etaList = [0.1, 0.05, 0.01]
rndStateList = [0, 1, 2]
gammaList = [0]
colsample_bytreeList = [0.4]
alphaList = [4]
def test_kernel_ridge_precomputed_kernel_unchanged():
    """Fitting on a precomputed kernel must leave the kernel matrix untouched."""
    gram = np.dot(X, X.T)
    # Snapshot taken before fitting, to compare against afterwards.
    snapshot = gram.copy()

    estimator = KernelRidge(kernel="precomputed")
    estimator.fit(gram, y)

    assert_array_almost_equal(gram, snapshot)
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV

# Descriptor columns used as model inputs.
feature_columns = [
    'mass_density',
    'ratio_oxygen_by_transition_metal_atom',
    'ratio_atoms_cell_by_cell_vol',
    'electronic_energy_band_gap',
    'energy_atom',
    'point_group',
    'c/a_ratio',
    'AGL_bulk_mod',
]
X = pd.DataFrame(dataset, columns=feature_columns)
# Target column, kept as a single-column frame.
y = pd.DataFrame(dataset["AGL_thermal_conductivity"])

# 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_test.head()

# Standardise with statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(X_train_scaled)
print(X_test_scaled)

# Grid over regularisation strength and kernel coefficient.
# Replace with kernel = 'rbf' for rbf and 'linear' for linear kernel
krr_param_grid = {
    "alpha": [1e0, 0.1, 1e-2, 1e-3],
    "gamma": np.logspace(-2, 2, 5),
}
krr = GridSearchCV(KernelRidge(kernel='poly', gamma=0.1),
                   param_grid=krr_param_grid)
krr.fit(X_train_scaled, y_train)

# Report the best estimator's score on the held-out split.
print("KRR with Polynomial Kernel Model accuracy = ",
      krr.score(X_test_scaled, y_test))
loss='huber')  # tail of a call that begins before this snippet
''' LASSO '''
# Robust-scaled Lasso pipeline, fitted immediately on the *_st training data.
lasso = make_pipeline(RobustScaler(),
                      Lasso(alpha=0.0005,
                            random_state=1)).fit(x_train_st, y_train_st)
# Version 24 -> alpha = 0.2, degree = 2 and coef = 1
# Version 25 -> Gradient boosting n_estimators = 5000
# Version 26 -> Lasso alpha = 0.001
''' KRR '''
# Degree-2 polynomial kernel ridge, fitted immediately on the *_st data.
KRR = KernelRidge(alpha=0.2, kernel='polynomial', degree=2,
                  coef0=1).fit(x_train_st, y_train_st)
# Retraining models
GB_model = GBest.fit(train_features, train_labels)
ENST_model = ENSTest.fit(train_features_st, train_labels)
lasso_model = lasso.fit(train_features_st, train_labels)
# NOTE(review): `KRR_model` is read below but never assigned in this snippet
# (only `KRR` is) -- presumably it is defined elsewhere or a
# `KRR_model = KRR.fit(...)` line is missing; confirm.
## Getting our SalePrice estimation
# Each model's prediction is passed through exp() and the four results are
# averaged with equal weight.
Final_labels = (np.exp(GB_model.predict(test_features)) +
                np.exp(ENST_model.predict(test_features_st)) +
                np.exp(lasso_model.predict(test_features_st)) +
                np.exp(KRR_model.predict(test_features_st))) / 4
# Same equal-weight blend evaluated on the training features.
Final_labels_train = (np.exp(GB_model.predict(train_features)) +
                      np.exp(ENST_model.predict(train_features_st)) +
                      np.exp(lasso_model.predict(train_features_st)) +
                      np.exp(KRR_model.predict(train_features_st))) / 4
def rmsle_cv(model): kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values) rmse = np.sqrt(-cross_val_score( model, train.values, y_train, scoring="neg_mean_squared_error", cv=kf)) return (rmse) lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1)) ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)) KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5) GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state=5) model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, max_depth=3, min_child_weight=1.7817,
GBest = ensemble.GradientBoostingRegressor(n_estimators=5000, learning_rate=0.05, max_depth=3, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber') ''' LASSO ''' lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1)) ''' KRR ''' KRR = KernelRidge(alpha=0.1, kernel='polynomial', degree=4, coef0=1) # Retraining models GB_model = GBest.fit(train_features, train_labels) KRR_model = KRR.fit(train_features_st, train_labels) ENST_model = ENSTest.fit(train_features_st, train_labels) lasso_model = lasso.fit(train_features_st, train_labels) ## Getting our SalePrice estimation Final_labels = (np.exp(GB_model.predict(test_features)) + np.exp(ENST_model.predict(test_features_st)) + np.exp(lasso_model.predict(test_features_st)) + np.exp(KRR_model.predict(test_features_st))) / 4 Final_labels_train = (np.exp(GB_model.predict(train_features)) + np.exp(ENST_model.predict(train_features_st)) + np.exp(lasso_model.predict(train_features_st)) +