Пример #1
0
# Build the regression target for the house-price models.
# We take the log here because the error metric is between the log of the
# SalePrice and the log of the predicted price. That does mean we need to
# exp() the prediction to get an actual sale price.
label_df = pd.DataFrame(index=train_df_munged.index, columns=['SalePrice'])
label_df['SalePrice'] = np.log(train_df['SalePrice'])
# Elapsed wall-clock time since `start_time`, which is set outside this
# fragment.
print(datetime.datetime.now() - start_time)

print('Training set size:', train_df_munged.shape)
print('Test set size:', test_df_munged.shape)

################################################################################

# Candidate regressors. Only `lasso`, `ENSTest` and `GBest` are handed to the
# ensemble at the bottom; `ridge`, `svr`, `regr4` and `regr3` are not used
# anywhere in this fragment.
ridge = linear_model.RidgeCV()
# NOTE(review): `degree` is presumably ignored for kernel='rbf' -- confirm
# against the SVR documentation.
svr = SVR(kernel='rbf', degree=2, C=5, epsilon=1e-2, verbose=1)
lasso = linear_model.LassoCV()
regr4 = KernelRidge(alpha=0.3, kernel='polynomial', degree=2, coef0=1.85)
regr3 = ElasticNet(alpha=0.001)
# Unlike the other candidates, this ElasticNetCV is fitted right away on the
# munged training frame; the target is passed as the one-column `label_df`.
ENSTest = linear_model.ElasticNetCV(
    alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
    l1_ratio=[.01, .1, .5, .9, .99],
    max_iter=5000).fit(train_df_munged, label_df)
GBest = ensemble.GradientBoostingRegressor(n_estimators=3000,
                                           learning_rate=0.05,
                                           max_depth=3,
                                           max_features='sqrt',
                                           min_samples_leaf=15,
                                           min_samples_split=10,
                                           loss='huber')

# Combine the selected models with CustomEnsembleRegressor, which is defined
# outside this fragment.
regr = CustomEnsembleRegressor([lasso, ENSTest, GBest])
# Output buffers with the same shape and dtype as
# `predicted_followup_total_PANSS` (allocated outside this fragment).
test_baseline_total_PANSS = np.zeros_like(predicted_followup_total_PANSS)
test_followup_total_PANSS = np.zeros_like(predicted_followup_total_PANSS)

# pull out targets (followup total PANSS), baseline total PANSS and subject ids
baseline_total_PANSS = metadata['Baseline|total_panss'].values
followup_total_PANSS = metadata[timepoint + '|total_panss'].values
subjectids = metadata['Subject ID'].to_list()

# initialise list of test subjects
test_subjects = []

# precompute kernel matrix (Gram matrix of the rows of
# `logm_connectivity_data` with themselves)
K = np.dot(logm_connectivity_data, np.transpose(logm_connectivity_data))

# initialise regressor; kernel='precomputed' means it is given kernel values
# rather than raw features
rgr = KernelRidge(kernel='precomputed')

# do MCCV: one iteration per pre-generated train/test index pair
# NOTE(review): this snippet ends inside the loop; the fit/predict steps that
# would use `rgr`, `K`, `start_ind` and `stop_ind` are not visible here.
for i in range(n_repeats) :
    
    train_index = train_inds_all[i]
    test_index = test_inds_all[i]

    print (i)    
    
    # calculate output indices: repeat i owns the slice
    # [i * test_size, (i + 1) * test_size)
    start_ind = i * test_size
    stop_ind = start_ind + test_size
    
    # split the follow-up targets according to this repeat's indices
    train_targets = followup_total_PANSS[train_index]
    test_targets = followup_total_PANSS[test_index]
# Synthetic 1-D regression problem: 10000 inputs scaled by 5 and a noiseless
# sine target. `rng` is created outside this fragment.
X = 5 * rng.rand(10000, 1)
y = np.sin(X).ravel()

# Add noise to targets (only every fifth sample is perturbed)
y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))

# Dense evaluation grid on [0, 5], shaped as a single-feature column.
X_plot = np.linspace(0, 5, 100000)[:, None]

# #############################################################################
# Fit regression model
# Both grid searches below are fitted on the first `train_size` samples only.
train_size = 100
# 5-fold grid search over C and gamma; the gamma=0.1 given to the SVR
# constructor is replaced by the grid values during the search.
svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1), cv=5,
                   param_grid={"C": [1e0, 1e1, 1e2, 1e3],
                               "gamma": np.logspace(-2, 2, 5)})

# Same search layout for kernel ridge, over alpha and gamma.
kr = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5,
                  param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
                              "gamma": np.logspace(-2, 2, 5)})

# Wall-clock time of the SVR search (hyper-parameter selection plus fit).
t0 = time.time()
svr.fit(X[:train_size], y[:train_size])
svr_fit = time.time() - t0
print("SVR complexity and bandwidth selected and model fitted in %.3f s"
      % svr_fit)

# Wall-clock time of the kernel ridge search on the same subset.
t0 = time.time()
kr.fit(X[:train_size], y[:train_size])
kr_fit = time.time() - t0
print("KRR complexity and bandwidth selected and model fitted in %.3f s"
      % kr_fit)
Пример #4
0
# Fixed seed so the sample data below is reproducible.
rng = np.random.RandomState(0)

# Generate sample data: 100 inputs scaled by 15, sine target
X = 15 * rng.rand(100, 1)
y = np.sin(X).ravel()
y += 3 * (0.5 - rng.rand(X.shape[0]))  # add noise to every target

# Fit KernelRidge with parameter selection based on 5-fold cross validation
# The grid is 4 alphas x 100 ExpSineSquared kernels (10 values for the first
# kernel argument times 10 for the second).
param_grid = {
    "alpha": [1e0, 1e-1, 1e-2, 1e-3],
    "kernel": [
        ExpSineSquared(l, p) for l in np.logspace(-2, 2, 10)
        for p in np.logspace(0, 2, 10)
    ]
}
kr = GridSearchCV(KernelRidge(), cv=5, param_grid=param_grid)
stime = time.time()
kr.fit(X, y)
print("Time for KRR fitting: %.3f" % (time.time() - stime))

# Gaussian process with a periodic kernel plus a white-noise term, fitted on
# the same data for a timing comparison.
gp_kernel = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) \
    + WhiteKernel(1e-1)
gpr = GaussianProcessRegressor(kernel=gp_kernel)
stime = time.time()
gpr.fit(X, y)
print("Time for GPR fitting: %.3f" % (time.time() - stime))

# Predict using kernel ridge
# The grid spans [0, 20], which is wider than the training inputs (scaled by
# 15 above).
X_plot = np.linspace(0, 20, 10000)[:, None]
# NOTE(review): `stime` is restarted here but the elapsed time is not
# reported within this fragment.
stime = time.time()
y_kr = kr.predict(X_plot)
def make_kernel_ridge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5):
    """Build an unfitted KernelRidge from the given hyper-parameters.

    Args:
        alpha: forwarded to ``KernelRidge(alpha=...)``. Defaults to 0.6.
        kernel: forwarded to ``KernelRidge(kernel=...)``. Defaults to
            'polynomial'.
        degree: forwarded to ``KernelRidge(degree=...)``. Defaults to 2.
        coef0: forwarded to ``KernelRidge(coef0=...)``. Defaults to 2.5.

    Returns:
        A new ``KernelRidge`` instance; nothing is fitted here.
    """
    settings = {
        'alpha': alpha,
        'kernel': kernel,
        'degree': degree,
        'coef0': coef0,
    }
    estimator = KernelRidge(**settings)
    return estimator
Пример #6
0
# Kernel Ridge Regression with hyperparameter optimization and cross-validation using GridSearchCV.
#

# In[79]:

##  Train Kernel Ridge Regression Model  ##

# Search space: 4 regularisation strengths x 100 ExpSineSquared kernels
# (10 values for the first kernel argument times 10 for the second).
param_grid = {
    "alpha": [1e0, 1e-1, 1e-2, 1e-3],
    "kernel": [
        ExpSineSquared(l, p) for l in np.logspace(-2, 2, 10)
        for p in np.logspace(0, 2, 10)
    ]
}

# 5-fold cross-validated grid search over the space above.
krr_opt = GridSearchCV(KernelRidge(), param_grid=param_grid, cv=5)

# Fit on the training split (arrays defined outside this fragment), then
# predict both splits with the fitted search object.
krr_opt.fit(X_train_fl, Prop_train_fl)
Pred_train_fl = krr_opt.predict(X_train_fl)
Pred_test_fl = krr_opt.predict(X_test_fl)

# Write the predictions as text files relative to the current working
# directory.
np.savetxt('Pred_train.csv', Pred_train_fl)
np.savetxt('Pred_test.csv', Pred_test_fl)

# To compare random forest with another ML technique:
#
# LASSO Regression with hyperparameter optimization and cross-validation using GridSearchCV.

# In[21]:

##  Train LASSO Regression Model  ##
Пример #7
0
# Add noise to targets (only every fifth sample is perturbed).
# `X`, `y` and `rng` are created outside this fragment.
y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5))

# Dense evaluation grid on [0, 5], shaped as a single-feature column.
X_plot = np.linspace(0, 5, 100000)[:, None]

# #############################################################################
# Fit regression model
# Both grid searches below are fitted on the first `train_size` samples only.
train_size = 100
# 5-fold grid search over C and gamma; the gamma=0.1 given to the SVR
# constructor is replaced by the grid values during the search.
svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1),
                   cv=5,
                   param_grid={
                       "C": [1e0, 1e1, 1e2, 1e3],
                       "gamma": np.logspace(-2, 2, 5)
                   })

# Same search layout for kernel ridge, over alpha and gamma.
kr = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1),
                  cv=5,
                  param_grid={
                      "alpha": [1e0, 0.1, 1e-2, 1e-3],
                      "gamma": np.logspace(-2, 2, 5)
                  })

# Wall-clock time of the SVR search (hyper-parameter selection plus fit).
t0 = time.time()
svr.fit(X[:train_size], y[:train_size])
svr_fit = time.time() - t0
print("SVR complexity and bandwidth selected and model fitted in %.3f s" %
      svr_fit)

# Wall-clock time of the kernel ridge search on the same subset; the result
# is stored in `kr_fit` but not printed within this fragment.
t0 = time.time()
kr.fit(X[:train_size], y[:train_size])
kr_fit = time.time() - t0
Пример #8
0
def ml_krr(features,
           labels,
           train_test_ids,
           to_predict_features,
           to_predict_ids,
           alpha_list=None,
           gamma_list=None,
           kernel_list=None,
           sample_size=0.8,
           is_scaled=False,
           n_cv=5,
           path=".",
           n_jobs=8):
    """
    Helper function to estimate the generalization error (MAE, MSE) of kernel
    ridge regression. The hyperparameters alpha and gamma are by default
    scanned on a logarithmic scale. The data set is split randomly into
    training and test set. The ratio of the split is defined by sample_size.
    The training set is used for cross validation.

    Args:
        features (2D ndarray) : descriptor input for the machine learning algorithm for training/testing
        labels (1D ndarray) :   property labels for the machine learning algorithm for training/testing
        train_test_ids (1D ndarray) :   pythonic ids (of features and labels) for training and
                                        testing.
        to_predict_features (ndarray) : descriptor input for the machine learning algorithm
                                        for prediction; passed straight to the fitted
                                        estimator's predict(), so presumably laid out like
                                        `features` (verify against callers)
        to_predict_ids (1D ndarray) :   pythonic ids (of features and labels) omitted from training and
                                        testing.
        alpha_list (list) :     Regularization parameter. Defaults to np.logspace(-1, -9, 9)
        gamma_list (list) :     Kernel function scaling parameter. Defaults to np.logspace(-1, -9, 9)
        kernel_list (list) :    List of kernel functions (see sklearn documentation for options).
                                Defaults to ['rbf']
        sample_size (float) : The ratio of the training-test split is defined by this. Defaults to 0.8
        is_scaled (bool) : If set to True, the features are scaled. Defaults to False
        n_cv (int) :    Number of cross-validation splits. Defaults to 5
        path (str) :    path where the machine learning output is written. Defaults to the
                        current working directory
        n_jobs (int) :  number of parallel jobs handed to GridSearchCV. Defaults to 8

    Returns:
        dict :  machine learning results with the following keys:
                ids_train, ids_test, ids_predicted, method_params,
                output (.label_predicted, .label_train, .label_test),
                metrics_test, metrics_validation, metrics_training
    """
    # Defaults are built per call instead of in the signature, so no mutable
    # list/array object is shared between invocations.
    if alpha_list is None:
        alpha_list = np.logspace(-1, -9, 9)
    if gamma_list is None:
        gamma_list = np.logspace(-1, -9, 9)
    if kernel_list is None:
        kernel_list = ['rbf']

    # load, split and scale data
    x_train, x_test, y_train, y_test, ids_train, ids_test = split_scale_data(
        features, labels, train_test_ids, sample_size, is_scaled)

    # Create kernel ridge regression grid search; the kernel given to the
    # constructor is only a placeholder, the grid's "kernel" entry decides.
    learner = GridSearchCV(KernelRidge(kernel='rbf'),
                           n_jobs=n_jobs,
                           cv=n_cv,
                           param_grid={
                               "alpha": alpha_list,
                               "gamma": gamma_list,
                               "kernel": kernel_list
                           },
                           scoring='neg_mean_absolute_error',
                           return_train_score=True)

    t_ml0 = time.time()
    learner.fit(x_train, y_train)
    t_ml1 = time.time()
    print("ml time", str(t_ml1 - t_ml0))

    # getting best parameters
    learner_best = learner.best_estimator_

    mae, mse, y_pred, train_y_pred, learner_best = predict_and_error(
        learner_best, x_test, x_train, y_test)

    # predict remaining datapoints
    y_to_predict = learner_best.predict(to_predict_features)

    ### OUTPUT ###
    write_output(
        learner,
        sample_size,
        "krr",
        mae,
        mse,
        "param",
        ids_test,
        y_test,
        y_pred,
        ids_train,
        y_train,
        train_y_pred,
        to_predict_ids,
        y_to_predict,
        path,
    )

    # Scores are negated because the scorer is 'neg_mean_absolute_error'.
    best = learner.best_index_
    cv_results = learner.cv_results_
    ml_results = {
        "ids_train": ids_train,
        "ids_test": ids_test,
        "ids_predicted": to_predict_ids,
        "method_params": learner.best_params_,
        "output": {
            "label_predicted": y_to_predict.tolist(),
            "label_train": train_y_pred.tolist(),
            "label_test": y_pred.tolist()
        },
        "metrics_test": {
            "mae": mae,
            "mse": mse
        },
        "metrics_validation": {
            "mae": -1 * cv_results['mean_test_score'][best],
            "std": cv_results['std_test_score'][best]
        },
        "metrics_training": {
            "mae": -1 * cv_results['mean_train_score'][best],
            "std": cv_results['std_train_score'][best]
        },
    }

    return ml_results
Пример #9
0
# Toy regression data: 100 inputs scaled by 5. `rng` is created outside this
# fragment (the original note says it is seeded -- verify where it is built).
x = 5 * rng.rand(100, 1)
y = np.sin(x).ravel()  # the targets follow a sine curve

print(y.shape)

# Add noise to every fifth target
y[::5] += 3 * (0.5 - rng.rand(20, 1).ravel())
print(y.shape)
print(y[::5].shape)  # (20,)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

# Grid search over kernel type, regularisation strength and kernel width.
# The kernel name must be spelled 'polynomial': the previous 'polynomail'
# typo is not a valid kernel, so every fit using it failed.
# (Three earlier `kr = KernelRidge(...)` assignments were dropped: each was
# overwritten by this search object before being used.)
kr = GridSearchCV(KernelRidge(),
                  param_grid={
                      "kernel": ['rbf', 'laplacian', 'polynomial', 'sigmoid'],
                      "alpha": [1e0, 0.1, 1e-2, 1e-3],
                      "gamma": np.logspace(-2, 2, 5)
                  })
print(np.logspace(-2, 2, 5))
# Fit the model
kr.fit(x_train, y_train)
# Inspect the search result: best score and best parameters
print(kr.best_score_, kr.best_params_)
def evaluate_algorithms(features, targets):
    """Cross-validate a fixed set of regressors and print a tab-separated report.

    For every estimator two numbers are printed after its name:

    * the value of ``relative_error(targets, predicted)``, where ``predicted``
      comes from ``cross_val_predict`` with 10 folds, and
    * the absolute value of the mean ``neg_mean_absolute_error`` score from
      ``cross_val_score`` using a ``ShuffleSplit`` with library defaults.

    The two columns are therefore computed on different splits. For
    LinearRegression the model is additionally fitted on all data and its
    coefficients are printed.

    Args:
        features: feature matrix passed unchanged to scikit-learn.
        targets: target values passed unchanged to scikit-learn.

    Returns:
        None. All results are written to standard output.
    """
    # Shuffled splitter used for the MAE column; constructed with defaults.
    cv = ShuffleSplit()

    def report(label, estimator):
        # One report line: name, relative error (10-fold predictions) and
        # mean absolute error (ShuffleSplit scores).
        predicted = cross_val_predict(estimator, features, targets, cv=10)
        scores = cross_val_score(estimator, features, targets, cv=cv,
                                 scoring='neg_mean_absolute_error')
        print('%s\t%0.3f\t%0.3f' % (label,
                                    relative_error(targets, predicted),
                                    abs(scores.mean())))

    print('Method\tMeanRelativeError\tMeanAbsoluteError')
    # The original used a bare `print`, which is a no-op expression under
    # Python 3; `print()` emits the intended blank separator line.
    print()

    linear = LinearRegression()
    report('LinearRegression', linear)
    linear.fit(features, targets)
    print('coefficients', linear.coef_)
    print()

    remaining = [
        ('Lasso', Lasso()),
        ('Ridge', Ridge()),
        ('KernelRidge', KernelRidge()),
        ('SVR_Lin', SVR(kernel='linear')),
        ('SVR_Poly', SVR(kernel='poly')),
        ('SVR_RBF', SVR(kernel='rbf')),
        ('KNeighborsRegressor, weight uniform', KNeighborsRegressor()),
        ('KNeighborsRegressor, weight inversely proportional to distance',
         KNeighborsRegressor(weights='distance')),
        ('GaussianProcessRegressor', GaussianProcessRegressor()),
        ('MLPRegressor', MLPRegressor()),
        ('DecisionTreeRegressor', DecisionTreeRegressor()),
        ('RandomForestRegressor', RandomForestRegressor()),
        ('BaggingRegressor with RandomForestRegressor',
         BaggingRegressor(RandomForestRegressor())),
        ('BaggingRegressor with DecisionTreeRegressor',
         BaggingRegressor(DecisionTreeRegressor())),
        ('BaggingRegressor with LinearRegression',
         BaggingRegressor(LinearRegression())),
        ('GradientBoostingRegressor', GradientBoostingRegressor()),
    ]
    for label, estimator in remaining:
        report(label, estimator)
        print()

    # Two extra blank lines close the report, as in the original layout.
    print()
    print()
def getvalue():
    """Handle the crop form: fit a small model ensemble and render the result.

    Reads ``state_name``, ``district_name``, ``crop``, ``season``, ``area``
    and ``year`` from ``request.form``, filters ``crop_modified.csv`` down to
    the matching rows, fits an average of a kernel ridge model and a scaled
    Lasso pipeline on (Crop_Year, Area) -> Production, and predicts production
    for the submitted year and area.

    Returns:
        The result of ``render_template("crop.html", pr=..., yl=...)`` where
        ``pr`` is the absolute predicted production and ``yl`` is that value
        divided by the submitted area.

    Raises:
        ValueError: from ``float(area)`` / ``int(year)`` when the submitted
            text is not numeric.
    """
    # --- form input -------------------------------------------------------
    state = request.form['state_name']
    district = request.form['district_name']
    # Districts are compared in upper case; presumably the CSV stores them
    # that way -- verify against the data file.
    district = district.upper()
    crop = request.form['crop']
    season = request.form['season']
    area = request.form['area']
    area_float = float(area)
    year = request.form['year']
    year_int = int(year)
    # --- function-scope imports (executed on every call) ------------------
    import pandas as pd
    import numpy as np
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import RobustScaler
    from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
    from sklearn.ensemble import RandomForestRegressor
    import os
    # NOTE(review): hard-coded, machine-specific path; os.chdir also changes
    # the working directory for the whole process, not just this request.
    os.chdir(r"C:\Users\Hp\Downloads\indian-farming-prediction-master")
    # --- load and filter the data ------------------------------------------
    crop_data = pd.read_csv("crop_modified.csv")
    crop_data = crop_data.dropna()
    # Trailing whitespace is stripped so the equality filters below can match.
    crop_data['State_Name'] = crop_data['State_Name'].str.rstrip()
    crop_data['Season'] = crop_data['Season'].str.rstrip()
    # Successive narrowing: state -> district -> season -> crop.
    a = crop_data[crop_data['State_Name'] == state]
    b = a[a['District_Name'] == district]
    c = b[b['Season'] == season]
    f = c[c['Crop'] == crop]['Crop_Year']
    x = c[c['Crop'] == crop]['Area']
    y = c[c['Crop'] == crop]['Production']
    from pandas import DataFrame
    # Reassemble the three selected columns into one frame, then split it
    # into the feature matrix X and the target Y.
    variables = {'Crop_Year': f, 'Area': x, 'Production': y}
    final = DataFrame(variables, columns=['Crop_Year', 'Area', 'Production'])
    X = final[['Crop_Year', 'Area']]
    Y = final['Production']

    # NOTE(review): this stacking class is defined but never instantiated in
    # this function.
    class StackingAveragedModels(BaseEstimator, RegressorMixin,
                                 TransformerMixin):
        """Stacking regressor: base models feed out-of-fold predictions to a meta-model."""

        def __init__(self, base_models, meta_model, n_folds=5):
            self.base_models = base_models
            self.meta_model = meta_model
            self.n_folds = n_folds

        # We again fit the data on clones of the original models
        def fit(self, X, y):
            """Fit fold-wise clones of each base model, then the meta-model."""
            self.base_models_ = [list() for x in self.base_models]
            self.meta_model_ = clone(self.meta_model)
            # NOTE(review): KFold is not imported inside getvalue(); it must
            # come from module scope or this line raises NameError -- confirm.
            kfold = KFold(n_splits=self.n_folds,
                          shuffle=True,
                          random_state=156)

            # Train cloned base models then create out-of-fold predictions
            # that are needed to train the cloned meta-model
            out_of_fold_predictions = np.zeros(
                (X.shape[0], len(self.base_models)))
            for i, model in enumerate(self.base_models):
                for train_index, holdout_index in kfold.split(X, y):
                    # NOTE(review): X.columns and X[train_index] assume
                    # conflicting input types (DataFrame vs. positional
                    # array indexing) -- confirm before using this class.
                    print(X.columns)
                    instance = clone(model)
                    self.base_models_[i].append(instance)
                    instance.fit(X[train_index], y[train_index])
                    y_pred = instance.predict(X[holdout_index])
                    out_of_fold_predictions[holdout_index, i] = y_pred

            # Now train the cloned  meta-model using the out-of-fold predictions as new feature
            self.meta_model_.fit(out_of_fold_predictions, y)
            return self

        # Do the predictions of all base models on the test data and use the averaged predictions as
        # meta-features for the final prediction which is done by the meta-model
        def predict(self, X):
            """Average each base model's fold clones, then apply the meta-model."""
            meta_features = np.column_stack([
                np.column_stack([model.predict(X)
                                 for model in base_models]).mean(axis=1)
                for base_models in self.base_models_
            ])
            return self.meta_model_.predict(meta_features)

    class StackedAveragingModels(BaseEstimator, RegressorMixin,
                                 TransformerMixin):
        """Ensemble that returns the plain mean of its models' predictions."""

        def __init__(self, models):
            self.models = models

        # we define clones of the original models to fit the data in
        def fit(self, X, y):
            """Clone every model and fit each clone on the full data."""
            self.models_ = [clone(x) for x in self.models]

            # Train cloned base models
            for model in self.models_:
                model.fit(X, y)

            return self

        # Now we do the predictions for cloned models and average them
        def predict(self, X):
            """Return the row-wise mean of the fitted clones' predictions."""
            predictions = np.column_stack(
                [model.predict(X) for model in self.models_])
            return np.mean(predictions, axis=1)

    # --- model definition ---------------------------------------------------
    lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
    # NOTE(review): ENet is built but not passed to the ensemble below.
    ENet = make_pipeline(RobustScaler(),
                         ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
    KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
    # from mlxtend.regressor import StackingRegressor
    # stack = StackingRegressor(regressors=[ENet, KRR],  meta_regressor=lasso)
    # model1=stack.fit(X,Y)
    # prod2 = model.predict([[year_int, area_float]])
    averaged_models = StackedAveragingModels(models=(KRR, lasso))

    # import pickle
    # pickle.dump(averaged_models,open('model.pkl','wb'))
    # model = pickle.load(open('model.pkl','rb'))
    # --- fit on the filtered rows and predict the submitted point ----------
    model = averaged_models.fit(X, Y)
    prod2 = model.predict([[year_int, area_float]])
    # The sign is discarded so the reported production is never negative.
    prod2 = abs(prod2)
    print("Prediction is: ", prod2)
    # Yield = production per unit area.
    # NOTE(review): an area of 0 is not guarded against here.
    yld = prod2 / area_float
    return render_template("crop.html", pr=prod2, yl=yld)
Пример #12
0
# Plot the cross-validation series `cv_elastic` (built outside this fragment).
cv_elastic.plot(title = "Validation")
plt.xlabel("Alpha")
plt.ylabel("Rmse")


# # 4. Kernel ridge regression
# Kernel ridge regression (KRR) combines Ridge Regression (linear least squares with l2-norm regularization) with the 'kernel trick'

# In[8]:


# Setting up list of alphas
alphas = [30,25,20,15,10,5,1,0.1,0.01,0.001]

# Iterate over alphas: mean of rmse_cv (defined outside this fragment) for a
# KernelRidge built with each alpha and otherwise default settings
cv_krr = [rmse_cv(KernelRidge(alpha = alpha)).mean() for alpha in alphas]

# Plot findings, indexed by alpha
cv_krr = pd.Series(cv_krr, index = alphas)
cv_krr.plot(title = "Validation")
plt.xlabel("Alpha")
plt.ylabel("Rmse")


# # Model initialization

# In[6]:


# Different models that are being initialized
#1. Ridge Regression
Пример #13
0
def test_featurizations_and_plot(featurization_dict,
                                 y,
                                 inner_cv=KFold(n_splits=5, shuffle=True),
                                 outer_cv=ShuffleSplit(n_splits=20,
                                                       test_size=0.2),
                                 make_plots=False,
                                 save_plot=False,
                                 verbose=False,
                                 target_prop_name='',
                                 units='',
                                 make_combined_plot=False):
    ''' Compare featurizations with nested-CV kernel ridge regression.

        Every featurization is scored with nested_grid_search_CV using a
        KernelRidge model and an rbf-kernel alpha/gamma grid. A LaTeX table,
        sorted by ascending mean test MAE, is printed; optionally one
        actual-vs-predicted scatter plot per featurization is drawn.

        inputs:
            featurization_dict: dictionary of the form {name : x}, where 'name'
                        is a string and 'x' is the feature array for that
                        featurization (1-D arrays are reshaped to one column).
            y: training data labels, indexable with integer index arrays
            inner_cv: splitter passed to nested_grid_search_CV as inner_cv
            outer_cv: splitter passed to nested_grid_search_CV as outer_cv; its
                        first split is also used for the scatter plots
            make_plots: if True, draw one scatter subplot per featurization
            save_plot: if True, save the figure as
                        'model_comparison<target_prop_name>.pdf'
            verbose: if True, print progress and forward verbose=True
            target_prop_name: property name used in axis labels and file name
            units: unit string appended to the MAE shown in each subplot
            make_combined_plot: if True, the figure is created even when
                        make_plots is False
        returns:
            None; results are printed and/or plotted.
    '''
    RMSE = {}
    mean_abs_err = {}
    mean_abs_err_train = {}
    std_abs_err_train = {}
    std_abs_err = {}
    mean_MAPE = {}
    mean_R2train = {}
    mean_R2test = {}
    mean_rPtest = {}
    mean_rPtrain = {}
    model_dict = {}
    subplot_index = 1

    num_featurizations = len(featurization_dict)

    num_fig_rows = 5
    # np.ceil returns a float; plt.subplot needs an integer column count.
    num_fig_columns = int(np.ceil((num_featurizations + 1) / num_fig_rows))

    if make_combined_plot or make_plots:
        plt.clf()
        plt.figure(figsize=(6 * num_fig_columns, 6 * num_fig_rows))

    for (name, x) in featurization_dict.items():
        if verbose:
            print("running %s" % name)

        if x.ndim == 1:
            x = x.reshape(-1, 1)

        #------ model selection & grid search (nested CV) ----
        model = KernelRidge()
        param_grid = {
            "alpha": np.logspace(-15, 2, 200),
            "gamma": np.logspace(-15, -2, 50),
            "kernel": ['rbf']
        }

        scores_dict = nested_grid_search_CV(x,
                                            y,
                                            model,
                                            param_grid,
                                            inner_cv=inner_cv,
                                            outer_cv=outer_cv,
                                            verbose=verbose)

        # Error scores are multiplied by -1 before being reported.
        RMSE[name] = np.sqrt(-1 * scores_dict['RMSE'].mean())
        mean_MAPE[name] = -1 * scores_dict['MAPE'].mean()
        mean_abs_err_train[name] = -1 * scores_dict['MAE_train'].mean()
        mean_abs_err[name] = -1 * scores_dict['MAE'].mean()
        std_abs_err_train[name] = np.std(-1 * scores_dict['MAE_std_train'])
        std_abs_err[name] = np.std(-1 * scores_dict['MAE_std'])
        mean_R2test[name] = scores_dict['R2'].mean()
        mean_R2train[name] = scores_dict['R2_train'].mean()
        mean_rPtrain[name] = scores_dict['rP_train'].mean()
        mean_rPtest[name] = scores_dict['rP'].mean()
        # NOTE(review): this stores the KernelRidge object created above;
        # whether nested_grid_search_CV tunes it in place is not visible
        # here -- confirm before trusting the scatter plots.
        model_dict[name] = model

    # Best (lowest mean test MAE) featurization first.
    sorted_names = sorted(mean_abs_err,
                          key=mean_abs_err.__getitem__,
                          reverse=False)

    if make_plots:
        for name in sorted_names:
            x = featurization_dict[name]
            if x.ndim == 1:
                x = x.reshape(-1, 1)
            model = model_dict[name]
            ax = plt.subplot(num_fig_rows, num_fig_columns, subplot_index)
            subplot_index += 1
            plt.xlabel('Actual ' + target_prop_name, fontsize=19)
            plt.ylabel('Predicted ' + target_prop_name, fontsize=19)
            label = r'$\langle$MAE$\rangle$ (test) = ' + " %4.2f " % (
                mean_abs_err[name]
            ) + units + "\n" + r'$\langle r\rangle$ (test) = %4.2f' % (
                mean_rPtest[name])
            plt.text(.045, .85, label, fontsize=21, transform=ax.transAxes)

            # Refit on the first split of the outer CV for illustration.
            train, test = next(outer_cv.split(x))
            model.fit(x[train], y[train])
            y_pred_test = model.predict(x[test])
            y_pred_train = model.predict(x[train])
            plt.scatter(y[test],
                        y_pred_test,
                        label='Test',
                        c='blue',
                        alpha=0.7)
            plt.scatter(y[train],
                        y_pred_train,
                        label='Train',
                        c='lightgreen',
                        alpha=0.7)
            plt.legend(loc=4, fontsize=21)

            #square axes
            maxy = 1.05 * max([max(y_pred_train), max(y_pred_test), max(y)])
            miny = .95 * min([min(y_pred_train), min(y_pred_test), min(y)])

            #reference line
            plt.plot([miny, maxy], [miny, maxy], 'k-')
            plt.xlim([miny, maxy])
            plt.ylim([miny, maxy])

    plt.tight_layout()
    if save_plot:
        plt.savefig('model_comparison' + target_prop_name.strip() + '.pdf')
    plt.show()

    print("\\begin{tabular}{c c c c c c c c c}")
    print(
        "                   name          & MAE_{\\ff{train}}   &  MAE_{\\ff{test}}  & MAPE_{\\ff{test}} & RMSE_{\\ff{test}}  & R^2_{\\ff{train}} &  R^2_{\\ff{test}} &  r_{\\ff{train}} & r_{\\ff{test}}          \\\\  "
    )
    print("\\hline")
    for name in sorted_names:
        print(
            "%30s &   %5.3f $\\pm$ %3.2f & %5.3f $\\pm$ %3.2f & %5.2f &  %5.3f &  %5.2f & %5.2f & %5.2f & %5.2f  \\\\"
            % (name, mean_abs_err_train[name], std_abs_err_train[name],
               mean_abs_err[name], std_abs_err[name], mean_MAPE[name],
               RMSE[name], mean_R2train[name], mean_R2test[name],
               mean_rPtrain[name], mean_rPtest[name]))
    print("\\end{tabular}")
Пример #14
0
# Features and targets come from the `boston` dataset object (loaded outside
# this fragment); 30% of the samples are held out for testing.
X, Y = boston.data, boston.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3)
# The bare string below is an inline note written in Chinese (a string
# expression, not a comment). English summary of what it says: kernel ridge
# regression adds the kernel trick to L2-regularised linear (ridge)
# regression; the weights w are expressed through coefficients beta
# (w* = sum(beta * z)) and the cost function is rewritten accordingly, giving
# beta = (lambda*I + K)^-1 * y. It describes the method as reasonably fast
# on medium-sized data but slow on large data, with O(n^3) training time and
# O(n) prediction time.
# NOTE(review): the note says this is "solved with gradient descent", but the
# formula it gives is a closed-form expression -- the wording looks
# inaccurate; confirm.
'''
    核岭回归:
        在l2正则化的线性模型(岭回归)的基础上,引入了核技术的概念
        在岭回归中,用w* = ∑β*z,也就是β代替w
        代价函数随之替换一下即可
        使用梯度下降求解,β = (λI + K)^-1 * y
        特点:
            对于中型数据集较快,但对于大数据集就很吃力了
            训练时间复杂度O(n^3),挺高的
            预测时间复杂度O(n)
'''

# Linear-kernel ridge regression with every constructor argument spelled out.
rg = KernelRidge(alpha=1,
                 kernel='linear',
                 gamma=None,
                 degree=3,
                 coef0=1,
                 kernel_params=None)
rg.fit(X_train, Y_train)
Y_pre = rg.predict(X_test)
# The score is computed but neither stored nor printed here.
rg.score(X_test, Y_test)
# The bare string below is a parameter glossary in Chinese. English summary:
#   alpha          penalty (regularisation) coefficient
#   kernel         which kernel function to use
#   gamma          a parameter of the kernel function
#   degree         degree of the polynomial kernel
#   coef0          a parameter of the polynomial and sigmoid kernels
#   kernel_params  additional parameters for the kernel function
'''
    alpha               惩罚项系数
    kernel              核函数的选定
    gamma               核函数的中的一个参数项
    degree              多项式核的程度
    coef0               多项式核和sigmoid核中一个参数设定
    kernel_params       核函数的附加参数
'''
Пример #15
0
def test_generalization_across_time():
    """Test time generalization decoding.

    Runs GeneralizationAcrossTime on the epochs returned by make_epochs() and
    checks, in order: the default fit / predict / score cycle and the string
    representation at each stage, the predict_method and predict_mode options,
    the score_mode options, custom train_times / test_times (including
    out-of-range and irregular slices), custom cross-validation objects
    (including non-partitioning and empty folds), a regressor whose
    predictions are 2-D, and finally a grid of classifier / scorer /
    label / predict_mode combinations.
    """
    from sklearn.svm import SVC
    from sklearn.base import is_classifier
    # KernelRidge is used for testing 1) regression analyses 2) n-dimensional
    # predictions.
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import roc_auc_score, mean_squared_error

    epochs = make_epochs()
    # Labels with the second part of the events shifted by one, used below
    # for the leave-one-group-out cross-validation.
    y_4classes = np.hstack((epochs.events[:7, 2], epochs.events[7:, 2] + 1))
    # NOTE(review): this branch is gated on sklearn 0.18 while the comments
    # inside both branches speak of 0.17 -- confirm which version is meant.
    if check_version('sklearn', '0.18'):
        from sklearn.model_selection import (KFold, StratifiedKFold,
                                             ShuffleSplit, LeaveOneGroupOut)
        cv = LeaveOneGroupOut()
        cv_shuffle = ShuffleSplit()
        # XXX we cannot pass any other parameters than X and y to cv.split
        # so we have to build it before hand
        cv_lolo = [
            (train, test)
            for train, test in cv.split(y_4classes, y_4classes, y_4classes)
        ]

        # With sklearn >= 0.17, `clf` can be identified as a regressor, and
        # the scoring metrics can therefore be automatically assigned.
        scorer_regress = None
    else:
        from sklearn.cross_validation import (KFold, StratifiedKFold,
                                              ShuffleSplit, LeaveOneLabelOut)
        cv_shuffle = ShuffleSplit(len(epochs))
        cv_lolo = LeaveOneLabelOut(y_4classes)

        # With sklearn < 0.17, `clf` cannot be identified as a regressor, and
        # therefore the scoring metrics cannot be automatically assigned.
        scorer_regress = mean_squared_error
    # Test default running
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(picks='foo')
    assert_equal("<GAT | no fit, no prediction, no score>", "%s" % gat)
    assert_raises(ValueError, gat.fit, epochs)
    with warnings.catch_warnings(record=True):
        # check classic fit + check manual picks
        gat.picks = [0]
        gat.fit(epochs)
        # check optional y as array
        gat.picks = None
        gat.fit(epochs, y=epochs.events[:, 2])
        # check optional y as list
        gat.fit(epochs, y=epochs.events[:, 2].tolist())
    # NOTE(review): only the first two arguments look like the values being
    # compared; the trailing 1 is presumably consumed as the failure message
    # rather than checked -- confirm the intended assertion.
    assert_equal(len(gat.picks_), len(gat.ch_names), 1)
    assert_equal(
        "<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), no "
        "prediction, no score>", '%s' % gat)
    assert_equal(gat.ch_names, epochs.ch_names)
    # test different predict function:
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(predict_method='decision_function')
    gat.fit(epochs)
    # With classifier, the default cv is StratifiedKFold
    assert_true(gat.cv_.__class__ == StratifiedKFold)
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 1))
    gat.predict_method = 'predict_proba'
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 2))
    gat.predict_method = 'foo'
    assert_raises(NotImplementedError, gat.predict, epochs)
    gat.predict_method = 'predict'
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 1))
    assert_equal(
        "<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), "
        "predicted 14 epochs, no score>", "%s" % gat)
    gat.score(epochs)
    assert_true(gat.scorer_.__name__ == 'accuracy_score')
    # check clf / predict_method combinations for which the scoring metrics
    # cannot be inferred.
    gat.scorer = None
    gat.predict_method = 'decision_function'
    assert_raises(ValueError, gat.score, epochs)
    # Check specifying y manually
    gat.predict_method = 'predict'
    gat.score(epochs, y=epochs.events[:, 2])
    gat.score(epochs, y=epochs.events[:, 2].tolist())
    assert_equal(
        "<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), "
        "predicted 14 epochs,\n scored "
        "(accuracy_score)>", "%s" % gat)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs, y=epochs.events[:, 2])

    # An unknown predict_mode must be rejected; restore the valid one after.
    old_mode = gat.predict_mode
    gat.predict_mode = 'super-foo-mode'
    assert_raises(ValueError, gat.predict, epochs)
    gat.predict_mode = old_mode

    gat.score(epochs, y=epochs.events[:, 2])
    assert_true("accuracy_score" in '%s' % gat.scorer_)
    # Copy whose event codes are shifted further down for the y-check.
    epochs2 = epochs.copy()

    # check _DecodingTime class
    assert_equal(
        "<DecodingTime | start: -0.200 (s), stop: 0.499 (s), step: "
        "0.050 (s), length: 0.050 (s), n_time_windows: 15>",
        "%s" % gat.train_times_)
    assert_equal(
        "<DecodingTime | start: -0.200 (s), stop: 0.499 (s), step: "
        "0.050 (s), length: 0.050 (s), n_time_windows: 15 x 15>",
        "%s" % gat.test_times_)

    # the y-check
    gat.predict_mode = 'mean-prediction'
    epochs2.events[:, 2] += 10
    gat_ = copy.deepcopy(gat)
    with use_log_level('error'):
        assert_raises(ValueError, gat_.score, epochs2)
    gat.predict_mode = 'cross-validation'

    # Test basics
    # --- number of trials
    assert_true(gat.y_train_.shape[0] == gat.y_true_.shape[0] == len(
        gat.y_pred_[0][0]) == 14)
    # ---  number of folds
    assert_true(np.shape(gat.estimators_)[1] == gat.cv)
    # ---  length training size
    assert_true(
        len(gat.train_times_['slices']) == 15 == np.shape(gat.estimators_)[0])
    # ---  length testing sizes
    assert_true(
        len(gat.test_times_['slices']) == 15 == np.shape(gat.scores_)[0])
    assert_true(
        len(gat.test_times_['slices'][0]) == 15 == np.shape(gat.scores_)[1])

    # Test score_mode
    gat.score_mode = 'foo'
    assert_raises(ValueError, gat.score, epochs)
    gat.score_mode = 'fold-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15, 5])
    gat.score_mode = 'mean-sample-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15])
    gat.score_mode = 'mean-fold-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15])
    gat.predict_mode = 'mean-prediction'
    with warnings.catch_warnings(record=True) as w:
        gat.score(epochs)
        assert_true(
            any("score_mode changed from " in str(ww.message) for ww in w))

    # Test longer time window
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times={'length': .100})
    with warnings.catch_warnings(record=True):
        gat2 = gat.fit(epochs)
    assert_true(gat is gat2)  # return self
    assert_true(hasattr(gat2, 'cv_'))
    assert_true(gat2.cv_ != gat.cv)
    with warnings.catch_warnings(record=True):  # not vectorizing
        scores = gat.score(epochs)
    assert_true(isinstance(scores, np.ndarray))  # type check
    assert_equal(len(scores[0]), len(scores))  # shape check
    assert_equal(len(gat.test_times_['slices'][0][0]), 2)
    # Decim training steps
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times={'step': .100})
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)
    gat.score(epochs)
    assert_true(len(gat.scores_) == len(gat.estimators_) == 8)  # training time
    assert_equal(len(gat.scores_[0]), 15)  # testing time

    # Test start stop training & test cv without n_fold params
    # (same expression as the y_4classes built at the top of the function)
    y_4classes = np.hstack((epochs.events[:7, 2], epochs.events[7:, 2] + 1))
    train_times = dict(start=0.090, stop=0.250)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_lolo, train_times=train_times)
    # predict without fit
    assert_raises(RuntimeError, gat.predict, epochs)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs, y=y_4classes)
    gat.score(epochs)
    assert_equal(len(gat.scores_), 4)
    assert_equal(gat.train_times_['times'][0], epochs.times[6])
    assert_equal(gat.train_times_['times'][-1], epochs.times[9])

    # Test score without passing epochs & Test diagonal decoding
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(test_times='diagonal')
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.fit(epochs)
    assert_raises(RuntimeError, gat.score)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.predict(epochs)
    scores = gat.score()
    assert_true(scores is gat.scores_)
    assert_equal(np.shape(gat.scores_), (15, 1))
    assert_array_equal(
        [tim for ttime in gat.test_times_['times'] for tim in ttime],
        gat.train_times_['times'])
    # Test generalization across conditions
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(predict_mode='mean-prediction', cv=2)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs[0:6])
    with warnings.catch_warnings(record=True):
        # There are some empty test folds because of n_trials
        gat.predict(epochs[7:])
        gat.score(epochs[7:])

    # Test training time parameters
    gat_ = copy.deepcopy(gat)
    # --- start stop outside time range
    gat_.train_times = dict(start=-999.)
    with use_log_level('error'):
        assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(start=999.)
    assert_raises(ValueError, gat_.fit, epochs)
    # --- impossible slices
    gat_.train_times = dict(step=.000001)
    assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(length=.000001)
    assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(length=999.)
    assert_raises(ValueError, gat_.fit, epochs)

    # Test testing time parameters
    # --- outside time range
    gat.test_times = dict(start=-999.)
    with warnings.catch_warnings(record=True):  # no epochs in fold
        assert_raises(ValueError, gat.predict, epochs)
    gat.test_times = dict(start=999.)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    # --- impossible slices
    gat.test_times = dict(step=.000001)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    gat_ = copy.deepcopy(gat)
    gat_.train_times_['length'] = .000001
    gat_.test_times = dict(length=.000001)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat_.predict, epochs)
    # --- test time region of interest
    gat.test_times = dict(step=.150)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 5, 14, 1))
    # --- silly value
    gat.test_times = 'foo'
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    assert_raises(RuntimeError, gat.score)
    # --- unmatched length between training and testing time
    gat.test_times = dict(length=.150)
    assert_raises(ValueError, gat.predict, epochs)
    # --- irregular length training and testing times
    # 2 estimators, the first one is trained on two successive time samples
    # whereas the second one is trained on a single time sample.
    train_times = dict(slices=[[0, 1], [1]])
    # The first estimator is tested once, the second estimator is tested on
    # two successive time samples.
    test_times = dict(slices=[[[0, 1]], [[0], [1]]])
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times=train_times,
                                       test_times=test_times)
    gat.fit(epochs)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.score(epochs)
    assert_array_equal(np.shape(gat.y_pred_[0]), [1, len(epochs), 1])
    assert_array_equal(np.shape(gat.y_pred_[1]), [2, len(epochs), 1])
    # check cannot Automatically infer testing times for adhoc training times
    gat.test_times = None
    assert_raises(ValueError, gat.predict, epochs)

    svc = SVC(C=1, kernel='linear', probability=True)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(clf=svc, predict_mode='mean-prediction')
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)

    # sklearn needs it: c.f.
    # https://github.com/scikit-learn/scikit-learn/issues/2723
    # and http://bit.ly/1u7t8UT
    with use_log_level('error'):
        assert_raises(ValueError, gat.score, epochs2)
        gat.score(epochs)
    # NOTE(review): the return value of gat.score(epochs) just above is not
    # stored, so `scores` here is still the array assigned by an earlier
    # gat.score() call in this function -- confirm that is intended.
    assert_true(0.0 <= np.min(scores) <= 1.0)
    assert_true(0.0 <= np.max(scores) <= 1.0)

    # Test that error if cv is not partition
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_shuffle,
                                       predict_mode='cross-validation')
    gat.fit(epochs)
    assert_raises(ValueError, gat.predict, epochs)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_shuffle,
                                       predict_mode='mean-prediction')
    gat.fit(epochs)
    gat.predict(epochs)

    # Test that gets error if train on one dataset, test on another, and don't
    # specify appropriate cv:
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime()
    # NOTE(review): fit is called twice in a row on the same epochs (once
    # outside and once inside the warnings context) -- looks redundant.
    gat.fit(epochs)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)

    gat.predict(epochs)
    assert_raises(ValueError, gat.predict, epochs[:10])

    # Make CV with some empty train and test folds:
    # --- empty test fold(s) should warn when gat.predict()
    gat._cv_splits[0] = [gat._cv_splits[0][0], np.empty(0)]
    with warnings.catch_warnings(record=True) as w:
        gat.predict(epochs)
        assert_true(len(w) > 0)
        assert_true(
            any('do not have any test epochs' in str(ww.message) for ww in w))
    # --- empty train fold(s) should raise when gat.fit()
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=[([0], [1]), ([], [0])])
    assert_raises(ValueError, gat.fit, epochs[:2])

    # Check that still works with classifier that output y_pred with
    # shape = (n_trials, 1) instead of (n_trials,)
    if check_version('sklearn', '0.17'):  # no is_regressor before v0.17
        with warnings.catch_warnings(record=True):  # dep
            gat = GeneralizationAcrossTime(clf=KernelRidge(), cv=2)
        # Note: this crops `epochs` itself, so the combinations tested below
        # run on the cropped data whenever this branch is taken.
        epochs.crop(None, epochs.times[2])
        gat.fit(epochs)
        # With regression the default cv is KFold and not StratifiedKFold
        assert_true(gat.cv_.__class__ == KFold)
        gat.score(epochs)
        # with regression the default scoring metrics is mean squared error
        assert_true(gat.scorer_.__name__ == 'mean_squared_error')

    # Test combinations of complex scenarios
    # 2 or more distinct classes
    n_classes = [2, 4]  # 4 tested
    # nicely ordered labels or not
    le = LabelEncoder()
    y = le.fit_transform(epochs.events[:, 2])
    y[len(y) // 2:] += 2
    ys = (y, y + 1000)
    # Univariate and multivariate prediction
    svc = SVC(C=1, kernel='linear', probability=True)
    reg = KernelRidge()

    def scorer_proba(y_true, y_pred):
        # AUC computed from the first column of the predicted probabilities.
        return roc_auc_score(y_true, y_pred[:, 0])

    # We re testing 3 scenario: default, classifier + predict_proba, regressor
    scorers = [None, scorer_proba, scorer_regress]
    predict_methods = [None, 'predict_proba', None]
    clfs = [svc, svc, reg]
    # Test all combinations
    for clf, predict_method, scorer in zip(clfs, predict_methods, scorers):
        for y in ys:
            for n_class in n_classes:
                for predict_mode in ['cross-validation', 'mean-prediction']:
                    # Cannot use AUC for n_class > 2
                    if (predict_method == 'predict_proba' and n_class != 2):
                        continue

                    # Fold the labels into n_class distinct values.
                    y_ = y % n_class

                    with warnings.catch_warnings(record=True):
                        gat = GeneralizationAcrossTime(
                            cv=2,
                            clf=clf,
                            scorer=scorer,
                            predict_mode=predict_mode)
                        gat.fit(epochs, y=y_)
                        gat.score(epochs, y=y_)

                    # Check that scorer is correctly defined manually and
                    # automatically.
                    scorer_name = gat.scorer_.__name__
                    if scorer is None:
                        if is_classifier(clf):
                            assert_equal(scorer_name, 'accuracy_score')
                        else:
                            assert_equal(scorer_name, 'mean_squared_error')
                    else:
                        assert_equal(scorer_name, scorer.__name__)
Пример #16
0
def kernel_ridge_gamma(gamma):
    """Return a new RBF-kernel ``KernelRidge`` configured with ``gamma``.

    ``alpha`` is fixed at 0.001; the estimator is returned without being
    fitted.
    """
    settings = {'kernel': 'rbf', 'gamma': gamma, 'alpha': 0.001}
    model = KernelRidge(**settings)
    return model
Пример #17
0
import numpy as np
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

#importing the dataset
# Column 0 is used as the target, every remaining column as a feature.
dataset = pd.read_csv('regressionDataSet.csv')
x = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values

#splitting the dataset into training set and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1 / 5)

#fitting the model on the training set
from sklearn.kernel_ridge import KernelRidge
regressor = KernelRidge()
regressor.fit(x_train, y_train)

#predicting the test set results
y_pred = regressor.predict(x_test)

#calculating r2
r2 = r2_score(y_test, y_pred)

#calculating r
# r2_score can be negative on a held-out split (a model worse than predicting
# the mean), and a square root of a negative number would raise instead of
# finishing the script. Report NaN in that case: no real-valued r exists.
# NOTE(review): `m` is presumably the math module; its import is not visible
# in this excerpt.
r = m.sqrt(r2) if r2 >= 0 else float('nan')

#calculating error
error = mean_absolute_error(y_test, y_pred)

#calculating accuracy
Пример #18
0
def kernel_ridge_alpha(alpha):
    """Return a new RBF-kernel ``KernelRidge`` configured with ``alpha``.

    ``gamma`` is fixed at 0.1; the estimator is returned without being
    fitted.
    """
    settings = {'kernel': 'rbf', 'gamma': 0.1, 'alpha': alpha}
    model = KernelRidge(**settings)
    return model
Пример #19
0
# Read the pseudo-coverage table: each line is split on tabs, the first field
# is appended to px and the second to py (both converted to float).
px = []
py = []
with open('/home/redwards/Desktop/genus_species_analysis/pseudo_coverage.txt', 'r') as fin:
    for l in fin:
        p = l.strip().split("\t")
        px.append(float(p[0]))
        py.append(float(p[1]))

# NOTE(review): x and y are not defined in this excerpt -- presumably they are
# loaded earlier in the script; verify.
ny = np.array(y)
nx = np.array(x)
pnx = np.array(px)
pny = np.array(py)


# Fit an RBF kernel ridge model on (x, y); [:, None] adds a trailing axis
# before the arrays are handed to fit().
kr = KernelRidge(kernel='rbf', gamma=7.5e-5, alpha=0.001)
kr.fit(nx[:, None], ny[:, None])

# Evaluate the fitted model on 10000 evenly spaced points between min(x) and
# max(x).
x_pred = np.linspace(min(x), max(x), 10000)[:, None]
y_pred = kr.predict(x_pred)


# Re-fit the same estimator object on the pseudo-coverage data and evaluate it
# on 10000 evenly spaced points between min(px) and max(px). y_pred above was
# computed before this re-fit.
kr.fit(pnx[:, None], pny[:, None])
px_pred = np.linspace(min(px), max(px), 10000)[:, None]
py_pred = kr.predict(px_pred)

# Single-axes figure; what is drawn on it is not visible in this excerpt.
fig = plt.figure()
ax = fig.add_subplot(111)


"""
Пример #20
0
def test_kernel_ridge():
    """Linear-kernel KernelRidge must predict like intercept-free Ridge."""
    ridge_model = Ridge(alpha=1, fit_intercept=False)
    ridge_model.fit(X, y)
    expected = ridge_model.predict(X)

    kernel_model = KernelRidge(kernel="linear", alpha=1)
    kernel_model.fit(X, y)
    actual = kernel_model.predict(X)

    assert_array_almost_equal(expected, actual)
Пример #21
0
def kernelridge(xtrain, ytrain, xtest, ytest, alp):
    """Fit a KernelRidge model and print its test-set MAE and MSE.

    The model is built with ``alpha=alp``, fitted on ``xtrain``/``ytrain``
    and used to predict ``xtest``; the two errors are computed against
    ``ytest``. Nothing is returned.
    """
    model = KernelRidge(alpha=alp)
    model.fit(xtrain, ytrain)
    predictions = model.predict(xtest)

    mae = metrics.mean_absolute_error(ytest, predictions)
    print('MAE:', mae)
    mse = metrics.mean_squared_error(ytest, predictions)
    print('MSE:', mse)
Пример #22
0
def test_kernel_ridge_csc():
    """Same Ridge vs. linear KernelRidge comparison, run on ``Xcsc``."""
    ridge_model = Ridge(alpha=1, fit_intercept=False, solver="cholesky")
    ridge_model.fit(Xcsc, y)
    expected = ridge_model.predict(Xcsc)

    kernel_model = KernelRidge(kernel="linear", alpha=1)
    kernel_model.fit(Xcsc, y)
    actual = kernel_model.predict(Xcsc)

    assert_array_almost_equal(expected, actual)
Пример #23
0
# Label every 8th sample with a "<col 0>-<col 1>" string built from `data`.
for i in range(0, len(train_data), 8):
    week_data[i] = str(int(data[i, 0])) + '-' + str(int(data[i, 1]))

X_train_weeks, Y_train_weeks = reshape_dataset(week_data, lags, steps_ahead)

# Hyper-parameter grids searched exhaustively below.
alpha = np.linspace(1e-15, 5, 100)
gamma = np.linspace(1e-15, 1e-1, 100)

best_r2 = -1000

# The first 75% of X_train is the initial training window; the remainder is
# predicted one sample at a time.
training_size = int(X_train.shape[0] * 0.75)
validation_size = X_train.shape[0] - training_size
for a in alpha:
    for g in gamma:
        kr = KernelRidge(kernel='rbf', gamma=g, alpha=a)
        j = training_size
        validation_predictions = np.zeros(validation_size)
        # Sliding window: fit on samples [i, j) and predict sample j, then
        # move both ends of the window forward by one.
        for i in range(validation_size):
            kr.fit(X_train[i:j], Y_train[i:j])
            validation_predictions[i] = kr.predict(np.array([X_train[j]]))
            j += 1
        # Score once, after every validation sample has been predicted.
        # Scoring inside the loop re-evaluated partly filled predictions on
        # each step and only the last value was ever used.
        r2 = metrics.r2_score(Y_train[training_size:], validation_predictions)
        if r2 > best_r2:
            best_r2 = r2
            best_params = (lags, a, g)
            best_predictions = np.copy(validation_predictions)

print(best_params)
mape = np.mean(
Пример #24
0
def test_kernel_ridge_precomputed():
    """A precomputed Gram matrix must give the same predictions as the
    corresponding named kernel, for each kernel in the list below."""
    kernel_names = ["linear", "rbf", "poly", "cosine"]
    for kernel_name in kernel_names:
        gram = pairwise_kernels(X, X, metric=kernel_name)

        named_model = KernelRidge(kernel=kernel_name)
        named_model.fit(X, y)
        from_named = named_model.predict(X)

        precomputed_model = KernelRidge(kernel="precomputed")
        precomputed_model.fit(gram, y)
        from_gram = precomputed_model.predict(gram)

        assert_array_almost_equal(from_named, from_gram)
    Xtrain, Xtest, ytrain, ytest = model_selection.train_test_split(
        X, y, train_size=0.5, test_size=0.5, random_state=i)
    score = [
        sum(x) for x in zip(
            score, house_prices_functions.find_cv_error(Xtrain, ytrain))
    ]
# Divide every entry of `score` by k_fold and print the first two entries.
score = [x / k_fold for x in score]
print(score[0], " ", score[1])

#final Crossvalidation
# Regressors to compare, all constructed with default settings except
# LightGBM, which is given verbose=-1.
clfList = [
    linear_model.LinearRegression(),
    ensemble.RandomForestRegressor(),
    ensemble.GradientBoostingRegressor(),
    xgb.XGBRegressor(),
    KernelRidge(),
    linear_model.BayesianRidge(),
    lgb.LGBMRegressor(verbose=-1)
]
# Ten 50/50 shuffled train/test splits with a fixed seed.
cvSplit = model_selection.ShuffleSplit(n_splits=10,
                                       train_size=0.5,
                                       test_size=0.5,
                                       random_state=0)
# Candidate hyper-parameter values; presumably consumed by a tuning loop that
# is not visible in this excerpt.
maxDepthList = [2, 4]
nEstimatorsList = [400, 500]
num_leavesList = [4, 5]
etaList = [0.1, 0.05, 0.01]
rndStateList = [0, 1, 2]
gammaList = [0]
colsample_bytreeList = [0.4]
alphaList = [4]
Пример #26
0
def test_kernel_ridge_precomputed_kernel_unchanged():
    """Fitting on a precomputed kernel must leave that kernel matrix intact."""
    gram = np.dot(X, X.T)
    snapshot = gram.copy()

    model = KernelRidge(kernel="precomputed")
    model.fit(gram, y)

    assert_array_almost_equal(gram, snapshot)
# Feature table: the descriptor columns selected from `dataset`.
feature_columns = [
    'mass_density', 'ratio_oxygen_by_transition_metal_atom',
    'ratio_atoms_cell_by_cell_vol',
    'electronic_energy_band_gap', 'energy_atom',
    'point_group', 'c/a_ratio', 'AGL_bulk_mod'
]
X = pd.DataFrame(dataset, columns=feature_columns)
# Target table: the single thermal-conductivity column.
y = pd.DataFrame(dataset["AGL_thermal_conductivity"])

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_test.head()

# Fit the scaler on the training features only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(split)
                                 for split in (X_train, X_test))
print(X_train_scaled)
print(X_test_scaled)

from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV

#Replace with kernel = 'rbf' for rbf and 'linear' for linear kernel
base_estimator = KernelRidge(kernel='poly', gamma=0.1)
# Grid searched over the regularisation strength and the kernel's gamma.
param_grid = {
    "alpha": [1e0, 0.1, 1e-2, 1e-3],
    "gamma": np.logspace(-2, 2, 5)
}
krr = GridSearchCV(base_estimator, param_grid=param_grid)

krr.fit(X_train_scaled, y_train)
test_score = krr.score(X_test_scaled, y_test)
print("KRR with Polynomial Kernel Model accuracy = ", test_score)
Пример #28
0
                                           loss='huber')
'''
    LASSO
'''
lasso = make_pipeline(RobustScaler(),
                      Lasso(alpha=0.0005,
                            random_state=1)).fit(x_train_st, y_train_st)

# Version 24 -> KRR alpha = 0.2, degree = 2 and coef0 = 1
# Version 25 -> Gradient boosting n_estimators = 5000
# Version 26 -> Lasso alpha = 0.001
'''
    KRR
'''

KRR = KernelRidge(alpha=0.2, kernel='polynomial', degree=2,
                  coef0=1).fit(x_train_st, y_train_st)

# Retraining models
GB_model = GBest.fit(train_features, train_labels)
# KRR_model is averaged into both ensembles below, so it has to be assigned
# here alongside the other three retrained models; without this line the
# name is undefined in this script.
KRR_model = KRR.fit(train_features_st, train_labels)
ENST_model = ENSTest.fit(train_features_st, train_labels)
lasso_model = lasso.fit(train_features_st, train_labels)

## Getting our SalePrice estimation
# Each model's prediction is passed through exp() and the four results are
# averaged with equal weight.
Final_labels = (np.exp(GB_model.predict(test_features)) +
                np.exp(ENST_model.predict(test_features_st)) +
                np.exp(lasso_model.predict(test_features_st)) +
                np.exp(KRR_model.predict(test_features_st))) / 4
Final_labels_train = (np.exp(GB_model.predict(train_features)) +
                      np.exp(ENST_model.predict(train_features_st)) +
                      np.exp(lasso_model.predict(train_features_st)) +
                      np.exp(KRR_model.predict(train_features_st))) / 4
Пример #29
0

def rmsle_cv(model):
    """Return the cross-validated RMSE of ``model``, one value per fold.

    The model is scored on ``train.values`` against ``y_train`` with the
    "neg_mean_squared_error" scorer; the sign is flipped and the square root
    taken, so each entry is the root mean squared error of one fold.
    NOTE(review): the "rmsle" name presumably reflects a log-transformed
    ``y_train``; that transform is not visible here.
    """
    # Pass the splitter object itself as cv. Calling get_n_splits() on it
    # yields only the number of folds, and handing that int to
    # cross_val_score silently discards shuffle=True and random_state=42.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(
        model, train.values, y_train, scoring="neg_mean_squared_error", cv=kf))
    return rmse


# Lasso with RobustScaler applied first, chained in one pipeline.
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))

# Elastic net (l1_ratio=0.9), likewise preceded by RobustScaler.
ENet = make_pipeline(RobustScaler(),
                     ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

# Kernel ridge with a degree-2 polynomial kernel; unlike the two models above
# it is not wrapped in a scaling pipeline.
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

# Gradient boosting with the Huber loss and a fixed seed.
GBoost = GradientBoostingRegressor(n_estimators=3000,
                                   learning_rate=0.05,
                                   max_depth=4,
                                   max_features='sqrt',
                                   min_samples_leaf=15,
                                   min_samples_split=10,
                                   loss='huber',
                                   random_state=5)

model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603,
                             gamma=0.0468,
                             learning_rate=0.05,
                             max_depth=3,
                             min_child_weight=1.7817,
Пример #30
0
# Gradient boosting regressor with the Huber loss; no random_state is set.
GBest = ensemble.GradientBoostingRegressor(n_estimators=5000,
                                           learning_rate=0.05,
                                           max_depth=3,
                                           max_features='sqrt',
                                           min_samples_leaf=15,
                                           min_samples_split=10,
                                           loss='huber')
'''
    LASSO
'''
# Lasso with RobustScaler applied first, chained in one pipeline.
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
'''
    KRR
'''

# Kernel ridge with a degree-4 polynomial kernel.
KRR = KernelRidge(alpha=0.1, kernel='polynomial', degree=4, coef0=1)

# Retraining models
# GBest is fitted on train_features; the other three are fitted on
# train_features_st. ENSTest is not defined in this excerpt.
GB_model = GBest.fit(train_features, train_labels)
KRR_model = KRR.fit(train_features_st, train_labels)
ENST_model = ENSTest.fit(train_features_st, train_labels)
lasso_model = lasso.fit(train_features_st, train_labels)

## Getting our SalePrice estimation
# Each model's prediction is passed through exp() and the four results are
# averaged with equal weight.
Final_labels = (np.exp(GB_model.predict(test_features)) +
                np.exp(ENST_model.predict(test_features_st)) +
                np.exp(lasso_model.predict(test_features_st)) +
                np.exp(KRR_model.predict(test_features_st))) / 4
Final_labels_train = (np.exp(GB_model.predict(train_features)) +
                      np.exp(ENST_model.predict(train_features_st)) +
                      np.exp(lasso_model.predict(train_features_st)) +