def test_check_gcv_mode_error(mode):
    """Both ``RidgeCV.fit`` and ``_check_gcv_mode`` must reject ``mode``."""
    expected_message = "Unknown value for 'gcv_mode'"
    features, target = make_regression(n_samples=5, n_features=2)
    estimator = RidgeCV(gcv_mode=mode)

    # Through the public estimator.
    with pytest.raises(ValueError, match=expected_message):
        estimator.fit(features, target)

    # Through the validation helper directly.
    with pytest.raises(ValueError, match=expected_message):
        _check_gcv_mode(features, mode)
def pred_pH(train, val, test, all_vars, loop):
    """Fit the pH base models and pass their predictions to ``write_preds``.

    A random forest (on the selected features), a lasso and a ridge
    regression (both on all features) are fitted on ``train``. Each model's
    predictions are written as a new column on ``val``, ``test`` and
    ``train``, so all three frames are modified in place.

    Parameters
    ----------
    train, val, test : pandas DataFrames holding the ``all_vars`` columns;
        ``train`` must also hold a ``'pH'`` column.
    all_vars : sequence of column labels used as candidate features.
    loop : value appended (via ``str``) to ``'pH_prds'`` to build the name
        handed to ``write_preds``.
    """
    data = (val, test, train)

    # Variable selection: keep a feature when either the mask returned by
    # ``lass_varselect`` or the univariate F-test selector flags it.
    lasso_mask = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['pH'])
    univ_mask = univ_selector.get_support()
    chosen = [all_vars[i] for i in range(len(all_vars))
              if lasso_mask[i] | univ_mask[i]]

    # Random forest on the selected features. ``.loc`` replaces the
    # ``.ix`` indexer, which no longer exists in current pandas.
    forest = RandomForestRegressor(n_estimators=100)
    forest.fit(train.loc[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = forest.predict(dset.loc[:, chosen])

    # Lasso on every candidate feature.
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])

    # Ridge with a single fixed alpha.
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])

    # Combination.
    # NOTE(review): 'pH_for_prds' is listed twice; presumably this doubles
    # its weight inside write_preds -- confirm.
    models = ['pH_rdg_prds', 'pH_las_prds', 'pH_for_prds', 'pH_for_prds']
    name = 'pH_prds' + str(loop)
    write_preds(models, name, train, val, test, 'pH')
def _test_ridge_cv_normalize(filter_):
    """Check RidgeCV(normalize=True) agrees with an equivalent grid search.

    ``filter_`` is applied to the scaled diabetes features before each fit.
    """
    n_folds = 3
    scaled_features = 10. * X_diabetes

    cv_model = RidgeCV(normalize=True, cv=n_folds)
    cv_model.fit(filter_(scaled_features), y_diabetes)

    # Search the very same alpha grid with a plain normalized Ridge.
    alpha_grid = {'alpha': cv_model.alphas}
    grid_search = GridSearchCV(Ridge(normalize=True), cv=n_folds,
                               param_grid=alpha_grid)
    grid_search.fit(filter_(scaled_features), y_diabetes)

    assert_equal(grid_search.best_estimator_.alpha, cv_model.alpha_)
def test_ridgecv_int_alphas():
    """Fitting RidgeCV with integer alphas must not raise."""
    features = np.array([[-1.0, -1.0],
                         [-1.0, 0],
                         [-.8, -1.0],
                         [1.0, 1.0],
                         [1.0, 0.0]])
    labels = [1, 1, 1, -1, -1]

    # Integers
    integer_alphas = (1, 10, 100)
    RidgeCV(alphas=integer_alphas).fit(features, labels)
def pred_sand(train, val, test, all_vars, loop):
    """Fit the Sand base models and pass their predictions to ``write_preds``.

    A random forest (on the selected features), an SVR and a ridge
    regression (both on all features) are fitted on ``train``. Each model's
    predictions are written as a new column on ``val``, ``test`` and
    ``train``, so all three frames are modified in place.

    Parameters
    ----------
    train, val, test : pandas DataFrames holding the ``all_vars`` columns;
        ``train`` must also hold a ``'Sand'`` column.
    all_vars : sequence of column labels used as candidate features.
    loop : value appended (via ``str``) to ``'sand_prds'`` to build the name
        handed to ``write_preds``.
    """
    data = (val, test, train)

    # Variable selection: keep a feature when either the mask returned by
    # ``lass_varselect`` or the univariate F-test selector flags it.
    lasso_mask = lass_varselect(train, all_vars, 'Sand', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['Sand'])
    univ_mask = univ_selector.get_support()
    chosen = [all_vars[i] for i in range(len(all_vars))
              if lasso_mask[i] | univ_mask[i]]

    # Random forest on the selected features. ``.loc`` replaces the
    # ``.ix`` indexer, which no longer exists in current pandas.
    forst = RandomForestRegressor(n_estimators=200)
    forst.fit(train.loc[:, chosen], train['Sand'])
    for dset in data:
        dset['sand_for_prds'] = forst.predict(dset.loc[:, chosen])

    # SVM on every candidate feature.
    svr = svm.SVR(C=23000)
    svr.fit(train.loc[:, all_vars], train['Sand'])
    for dset in data:
        dset['sand_svr_prds'] = svr.predict(dset.loc[:, all_vars])

    # Ridge with a single fixed alpha.
    sand_ridge = RidgeCV(np.array([1.135]), normalize=True)
    sand_ridge.fit(train[all_vars], train['Sand'])
    for dset in data:
        dset['sand_rdg_prds'] = sand_ridge.predict(dset[all_vars])

    # Combination.
    # NOTE(review): 'sand_svr_prds' is listed twice; presumably this doubles
    # its weight inside write_preds -- confirm.
    models = ['sand_rdg_prds', 'sand_svr_prds', 'sand_for_prds',
              'sand_svr_prds']
    name = 'sand_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Sand')
def pred_sand(train, val, test, all_vars, loop):
    """Build the Sand ensemble inputs and forward them to ``write_preds``.

    Fits a random forest on the selected features plus an SVR and a ridge
    regression on all features, using ``train``. The predictions of every
    model are stored as new columns on ``val``, ``test`` and ``train``
    (the frames are mutated in place).

    Parameters
    ----------
    train, val, test : pandas DataFrames holding the ``all_vars`` columns;
        ``train`` must also hold a ``'Sand'`` column.
    all_vars : sequence of column labels used as candidate features.
    loop : value appended (via ``str``) to ``'sand_prds'`` to build the name
        handed to ``write_preds``.
    """
    data = (val, test, train)

    # Variable selection: union of the ``lass_varselect`` mask and the
    # univariate F-test selector's support mask.
    lasso_mask = lass_varselect(train, all_vars, 'Sand', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['Sand'])
    univ_mask = univ_selector.get_support()
    chosen = [all_vars[i] for i in range(len(all_vars))
              if lasso_mask[i] | univ_mask[i]]

    # Random forest on the selected features. The removed ``.ix`` indexer
    # is replaced by the label-based ``.loc``.
    forst = RandomForestRegressor(n_estimators=200)
    forst.fit(train.loc[:, chosen], train['Sand'])
    for dset in data:
        dset['sand_for_prds'] = forst.predict(dset.loc[:, chosen])

    # SVM on every candidate feature.
    svr = svm.SVR(C=23000)
    svr.fit(train.loc[:, all_vars], train['Sand'])
    for dset in data:
        dset['sand_svr_prds'] = svr.predict(dset.loc[:, all_vars])

    # Ridge with a single fixed alpha.
    sand_ridge = RidgeCV(np.array([1.135]), normalize=True)
    sand_ridge.fit(train[all_vars], train['Sand'])
    for dset in data:
        dset['sand_rdg_prds'] = sand_ridge.predict(dset[all_vars])

    # Combination.
    # NOTE(review): the SVR column appears twice; presumably intentional
    # extra weight in write_preds -- confirm.
    models = ['sand_rdg_prds', 'sand_svr_prds', 'sand_for_prds',
              'sand_svr_prds']
    name = 'sand_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Sand')
def pred_SOC(train, val, test, all_vars, loop):
    """Fit the SOC base models and pass their predictions to ``write_preds``.

    A random forest and an SVR (on the wider feature selection), a gradient
    boosting model (on the narrower selection) and a ridge regression (on
    all features) are fitted on ``train``. Each model's predictions are
    written as a new column on ``val``, ``test`` and ``train``, so all three
    frames are modified in place.

    Parameters
    ----------
    train, val, test : pandas DataFrames holding the ``all_vars`` columns;
        ``train`` must also hold a ``'SOC'`` column.
    all_vars : sequence of column labels used as candidate features.
    loop : value appended (via ``str``) to ``'SOC_prds'`` to build the name
        handed to ``write_preds``.
    """
    data = (val, test, train)

    # Variable selection: the ``lass_varselect`` mask is combined with two
    # univariate selectors of different size (k=4500 and k=200).
    lasso_mask = lass_varselect(train, all_vars, 'SOC', .000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=4500)
    univ_selector.fit(train[all_vars], train['SOC'])
    univ_selector2 = SelectKBest(score_func=f_regression, k=200)
    univ_selector2.fit(train[all_vars], train['SOC'])
    wide_mask = univ_selector.get_support()
    narrow_mask = univ_selector2.get_support()
    positions = range(len(all_vars))
    chosen = [all_vars[i] for i in positions
              if lasso_mask[i] | wide_mask[i]]
    chosen2 = [all_vars[i] for i in positions
               if lasso_mask[i] | narrow_mask[i]]

    # Random forest on the wide selection. ``.loc`` replaces the ``.ix``
    # indexer, which no longer exists in current pandas.
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.loc[:, chosen], train['SOC'])
    for dset in data:
        dset['SOC_for_prds'] = forst.predict(dset.loc[:, chosen])

    # Gradient boosting on the narrow selection.
    gbr = GradientBoostingRegressor(n_estimators=900, learning_rate=.0785,
                                    max_depth=1, random_state=42, verbose=0,
                                    min_samples_leaf=4, subsample=.4)
    gbr.fit(train[chosen2], train['SOC'])
    for dset in data:
        dset['SOC_gbr_prds'] = gbr.predict(dset.loc[:, chosen2])

    # Ridge with a single fixed alpha.
    SOC_ridge = RidgeCV(np.array([.315]), normalize=True)
    SOC_ridge.fit(train[all_vars], train['SOC'])
    for dset in data:
        dset['SOC_rdg_prds'] = SOC_ridge.predict(dset[all_vars])

    # SVR on the wide selection.
    svr = svm.SVR(C=9000, epsilon=.1)
    svr.fit(train.loc[:, chosen], train['SOC'])
    for dset in data:
        dset['SOC_svr_prds'] = svr.predict(dset.loc[:, chosen])

    # Combination.
    # NOTE(review): 'SOC_svr_prds' is listed twice; presumably this doubles
    # its weight inside write_preds -- confirm.
    models = ['SOC_rdg_prds', 'SOC_svr_prds', 'SOC_gbr_prds', 'SOC_for_prds',
              'SOC_svr_prds']
    name = 'SOC_prds' + str(loop)
    write_preds(models, name, train, val, test, 'SOC')
def test_ridgecv_negative_alphas():
    """RidgeCV.fit must raise ValueError when the alphas are negative."""
    features = np.array([[-1.0, -1.0],
                         [-1.0, 0],
                         [-.8, -1.0],
                         [1.0, 1.0],
                         [1.0, 0.0]])
    labels = [1, 1, 1, -1, -1]

    negative_grids = (
        (-1, -10, -100),       # Negative integers
        (-0.1, -1.0, -10.0),   # Negative floats
    )
    for bad_alphas in negative_grids:
        model = RidgeCV(alphas=bad_alphas)
        assert_raises_regex(ValueError, "alphas must be positive",
                            model.fit, features, labels)
def _test_ridge_loo(filter_):
    """Exercise ``_RidgeGCV``/``RidgeCV`` on the diabetes data.

    ``filter_`` converts the feature matrix before every fit/predict call,
    so the same checks run on dense and sparse input. Returns a list
    holding the alpha selected by the first ``_RidgeGCV`` fit.
    """
    sample_count = X_diabetes.shape[0]
    is_dense = filter_ == DENSE_FILTER

    # Reference alpha; the intercept is only fitted for the dense filter.
    reference = _RidgeGCV(fit_intercept=is_dense)
    reference.fit(filter_(X_diabetes), y_diabetes)
    best_alpha = reference.alpha_
    results = [best_alpha]

    # Same best alpha with a custom loss function.
    loss_scorer = make_scorer(mean_squared_error, greater_is_better=False)
    with_loss = RidgeCV(fit_intercept=False, scoring=loss_scorer)
    ignore_warnings(with_loss.fit)(filter_(X_diabetes), y_diabetes)
    assert with_loss.alpha_ == pytest.approx(best_alpha)

    # Same best alpha with a custom score function.
    def negated_mse(x, y):
        return -mean_squared_error(x, y)

    score_scorer = make_scorer(negated_mse)
    with_score = RidgeCV(fit_intercept=False, scoring=score_scorer)
    ignore_warnings(with_score.fit)(filter_(X_diabetes), y_diabetes)
    assert with_score.alpha_ == pytest.approx(best_alpha)

    # Same best alpha with a ready-made scorer object.
    named_scorer = get_scorer('neg_mean_squared_error')
    with_scorer = RidgeCV(fit_intercept=False, scoring=named_scorer)
    with_scorer.fit(filter_(X_diabetes), y_diabetes)
    assert with_scorer.alpha_ == pytest.approx(best_alpha)

    # Same best alpha with unit sample weights (dense filter only).
    if is_dense:
        reference.fit(filter_(X_diabetes), y_diabetes,
                      sample_weight=np.ones(sample_count))
        assert reference.alpha_ == pytest.approx(best_alpha)

    # Two identical responses must give the single-response prediction twice.
    stacked_target = np.vstack((y_diabetes, y_diabetes)).T
    reference.fit(filter_(X_diabetes), stacked_target)
    multi_pred = reference.predict(filter_(X_diabetes))
    reference.fit(filter_(X_diabetes), y_diabetes)
    single_pred = reference.predict(filter_(X_diabetes))
    assert_allclose(np.vstack((single_pred, single_pred)).T, multi_pred,
                    rtol=1e-5)

    return results
def bag_of_words_ridge(variable):
    """Add ridge predictions built from TF-IDF features of a text column.

    Reads the module-level ``train_and_validation`` and ``test`` frames and
    writes a new column named ``"b_of_wds_prds_" + variable`` to both.
    """
    posted = pd.to_datetime(train_and_validation.date_posted)
    vocab_rows = posted > pd.to_datetime('2013-11-1')
    fit_rows = posted > pd.to_datetime('2013-11-8')

    # The vectorizer is fitted only on rows posted after 2013-11-1, then
    # applied to every row of both frames.
    vectorizer = TfidfVectorizer(min_df=.1, max_df=.9)
    vectorizer.fit(train_and_validation[variable][vocab_rows])
    train_matrix = vectorizer.transform(train_and_validation[variable])
    test_matrix = vectorizer.transform(test[variable])

    # The ridge model is fitted only on rows posted after 2013-11-8
    # (recency, and shorter run time).
    ridge = RidgeCV(array([18]), store_cv_values=True, normalize=True)
    ridge.fit(train_matrix[fit_rows],
              train_and_validation.is_exciting[fit_rows])

    # Predictions are stored on the frames for use later as base
    # classifiers in ada boost.
    column = "b_of_wds_prds_" + variable
    train_and_validation[column] = ridge.predict(train_matrix)
    test[column] = ridge.predict(test_matrix)
def pred_Ca(train, val, test, all_vars, loop):
    """Fit the Ca base models and pass their predictions to ``write_preds``.

    A gradient boosting model (on the narrower feature selection), a random
    forest and an SVR (on the wider selection) and a ridge regression (on
    all features) are fitted on ``train``. Each model's predictions are
    written as a new column on ``val``, ``test`` and ``train``, so all three
    frames are modified in place.

    Parameters
    ----------
    train, val, test : pandas DataFrames holding the ``all_vars`` columns;
        ``train`` must also hold a ``'Ca'`` column.
    all_vars : sequence of column labels used as candidate features.
    loop : value appended (via ``str``) to ``'Ca_prds'`` to build the name
        handed to ``write_preds``.
    """
    data = (val, test, train)

    # Variable selection: the ``lass_varselect`` mask is combined with two
    # univariate selectors of different size (k=5000 and k=200).
    lasso_mask = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=5000)
    univ_selector.fit(train[all_vars], train['Ca'])
    univ_selector2 = SelectKBest(score_func=f_regression, k=200)
    univ_selector2.fit(train[all_vars], train['Ca'])
    wide_mask = univ_selector.get_support()
    narrow_mask = univ_selector2.get_support()
    positions = range(len(all_vars))
    chosen = [all_vars[i] for i in positions
              if lasso_mask[i] | wide_mask[i]]
    chosen2 = [all_vars[i] for i in positions
               if lasso_mask[i] | narrow_mask[i]]

    # Gradient boosting on the narrow selection. ``.loc`` replaces the
    # ``.ix`` indexer, which no longer exists in current pandas.
    gbr = GradientBoostingRegressor(n_estimators=1000, learning_rate=.1695,
                                    max_depth=1, random_state=42, verbose=0,
                                    min_samples_leaf=4)
    gbr.fit(train[chosen2], train['Ca'])
    for dset in data:
        dset['Ca_gbr_prds'] = gbr.predict(dset.loc[:, chosen2])

    # Random forest on the wide selection.
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.loc[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.loc[:, chosen])

    # Ridge with a single fixed alpha.
    Ca_ridge = RidgeCV(np.array([4.925]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])

    # SVR on the wide selection.
    svr = svm.SVR(C=9500)
    svr.fit(train.loc[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_svr_prds'] = svr.predict(dset.loc[:, chosen])

    # Combination.
    # NOTE(review): 'Ca_svr_prds' is listed twice; presumably this doubles
    # its weight inside write_preds -- confirm.
    models = ['Ca_rdg_prds', 'Ca_gbr_prds', 'Ca_for_prds', 'Ca_svr_prds',
              'Ca_svr_prds']
    name = 'Ca_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Ca')
def test_ridge_loo_cv_asym_scoring():
    """GCV and explicit leave-one-out must agree under an asymmetric scorer."""
    n_samples, n_features = 10, 5
    n_targets = 1
    scoring = 'explained_variance'

    X, y = _make_sparse_offset_regression(
        n_samples=n_samples, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=1, n_informative=5)

    alphas = [1e-3, .1, 1., 10., 1e3]
    common = dict(fit_intercept=True, alphas=alphas, scoring=scoring,
                  normalize=True)

    # cv=n_samples gives one fold per sample, i.e. leave-one-out.
    loo_ridge = RidgeCV(cv=n_samples, **common)
    gcv_ridge = RidgeCV(**common)

    loo_ridge.fit(X, y)
    gcv_ridge.fit(X, y)

    assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_)
    assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3)
def test_ridge_gcv_vs_ridge_loo_cv(gcv_mode, X_constructor, X_shape, y_shape,
                                   fit_intercept, normalize, noise):
    """RidgeCV in GCV mode must match an explicit leave-one-out RidgeCV."""
    n_samples, n_features = X_shape
    if len(y_shape) == 2:
        n_targets = y_shape[-1]
    else:
        n_targets = 1

    X, y = _make_sparse_offset_regression(
        n_samples=n_samples, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise, n_informative=5)
    y = y.reshape(y_shape)

    alphas = [1e-3, .1, 1., 10., 1e3]
    common = dict(fit_intercept=fit_intercept, alphas=alphas,
                  normalize=normalize)

    # cv=n_samples gives one fold per sample, i.e. leave-one-out.
    loo_ridge = RidgeCV(cv=n_samples, scoring='neg_mean_squared_error',
                        **common)
    gcv_ridge = RidgeCV(gcv_mode=gcv_mode, **common)

    loo_ridge.fit(X, y)

    # Only the GCV estimator sees the converted feature matrix.
    X_gcv = X_constructor(X)
    gcv_ridge.fit(X_gcv, y)

    assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_)
    assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3)
def test_ridgecv_store_cv_values():
    """Check the shape of ``cv_values_`` and the cv/store_cv_values clash."""
    rng = np.random.RandomState(42)
    n_samples, n_features = 8, 5
    x = rng.randn(n_samples, n_features)
    alphas = [1e-1, 1e0, 1e1]
    n_alphas = len(alphas)

    model = RidgeCV(alphas=alphas, cv=None, store_cv_values=True)

    # with len(y.shape) == 1
    y = rng.randn(n_samples)
    model.fit(x, y)
    assert model.cv_values_.shape == (n_samples, n_alphas)

    # with len(y.shape) == 2
    n_targets = 3
    y = rng.randn(n_samples, n_targets)
    model.fit(x, y)
    assert model.cv_values_.shape == (n_samples, n_targets, n_alphas)

    # An explicit cv together with store_cv_values must be rejected at fit.
    model = RidgeCV(cv=3, store_cv_values=True)
    assert_raises_regex(ValueError, 'cv!=None and store_cv_values',
                        model.fit, x, y)
def run(self):
    """Collect the regression parameters currently set in the UI.

    Returns
    -------
    tuple
        ``(params, changed)``: ``params`` is the dict of values read from
        the widgets, ``changed`` is whatever ``self.getChangedValues``
        returns for ``params`` and a default-constructed ``RidgeCV`` (when
        the cross-validate box is checked) or ``Ridge`` (otherwise).

    Raises
    ------
    KeyError
        If the max-iterations or random-state field holds text that is
        neither an integer nor the literal ``'None'``.
    """
    def int_or_none(text):
        # Only conversion failures are handled; the previous bare
        # ``except:`` also swallowed unrelated errors.
        try:
            return int(text)
        except (TypeError, ValueError):
            # Only the literal 'None' is accepted as a non-integer value;
            # anything else raises KeyError, as before.
            return {'None': None}[text]

    def combo_choice(text):
        # Map the 'None' entry to None and pass any other entry through.
        # NOTE(review): the previous ``{'None': None}.get(text)`` turned
        # every selection into None, discarding the user's choice --
        # confirm the combo boxes only offer values the estimator accepts.
        return {'None': None}.get(text, text)

    m_state = int_or_none(self.maxNumOfIterationslineEdit.text())
    r_state = int_or_none(self.randomStateLineEdit.text())
    cross_validate = self.crossValidateCheckBox.isChecked()

    if cross_validate:
        params = {
            'alphas': ast.literal_eval(self.alphasLineEdit_cv.text()),
            'fit_intercept': self.fitInterceptCheckBox_cv.isChecked(),
            'normalize': self.normalizeCheckBox_cv.isChecked(),
            'scoring': combo_choice(self.scoringComboBox_cv.currentText()),
            'gcv_mode': combo_choice(self.gCVModeComboBox_cv.currentText()),
            'store_cv_values': self.storeCVValuesCheckBox_cv.isChecked(),
            'CV': cross_validate,
        }
        return params, self.getChangedValues(params, RidgeCV())

    params = {
        'alpha': self.alphaDoubleSpinBox.value(),
        'copy_X': self.copyXCheckBox.isChecked(),
        'fit_intercept': self.fitInterceptCheckBox.isChecked(),
        'max_iter': m_state,
        'normalize': self.normalizeCheckBox.isChecked(),
        'solver': self.solverComboBox.currentText(),
        'tol': self.toleranceDoubleSpinBox.value(),
        'random_state': r_state,
        'CV': cross_validate,
    }
    return params, self.getChangedValues(params, Ridge())
def fit(self, X, y=None):
    """Create a fresh ``SKLModel`` from the stored hyperparameters and fit it.

    ``y`` is forwarded only when it is not ``None``. Returns ``self``.
    """
    model = SKLModel(**self._hyperparams)
    # Stored before fitting, so the attribute exists even if fit raises.
    self._sklearn_model = model
    fit_args = (X,) if y is None else (X, y)
    model.fit(*fit_args)
    return self
def pred_Ca(train, val, test, all_vars, loop):
    """Build the Ca ensemble inputs and forward them to ``write_preds``.

    Fits gradient boosting on the narrower feature selection, a random
    forest and an SVR on the wider selection, and a ridge regression on all
    features, using ``train``. The predictions of every model are stored as
    new columns on ``val``, ``test`` and ``train`` (the frames are mutated
    in place).

    Parameters
    ----------
    train, val, test : pandas DataFrames holding the ``all_vars`` columns;
        ``train`` must also hold a ``'Ca'`` column.
    all_vars : sequence of column labels used as candidate features.
    loop : value appended (via ``str``) to ``'Ca_prds'`` to build the name
        handed to ``write_preds``.
    """
    data = (val, test, train)

    # Variable selection: union of the ``lass_varselect`` mask with each of
    # two univariate selectors (k=5000 and k=200).
    lasso_mask = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=5000)
    univ_selector.fit(train[all_vars], train['Ca'])
    univ_selector2 = SelectKBest(score_func=f_regression, k=200)
    univ_selector2.fit(train[all_vars], train['Ca'])
    wide_mask = univ_selector.get_support()
    narrow_mask = univ_selector2.get_support()
    positions = range(len(all_vars))
    chosen = [all_vars[i] for i in positions
              if lasso_mask[i] | wide_mask[i]]
    chosen2 = [all_vars[i] for i in positions
               if lasso_mask[i] | narrow_mask[i]]

    # Gradient boosting on the narrow selection. The removed ``.ix``
    # indexer is replaced by the label-based ``.loc``.
    gbr = GradientBoostingRegressor(n_estimators=1000, learning_rate=.1695,
                                    max_depth=1, random_state=42, verbose=0,
                                    min_samples_leaf=4)
    gbr.fit(train[chosen2], train['Ca'])
    for dset in data:
        dset['Ca_gbr_prds'] = gbr.predict(dset.loc[:, chosen2])

    # Random forest on the wide selection.
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.loc[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.loc[:, chosen])

    # Ridge with a single fixed alpha.
    Ca_ridge = RidgeCV(np.array([4.925]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])

    # SVR on the wide selection.
    svr = svm.SVR(C=9500)
    svr.fit(train.loc[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_svr_prds'] = svr.predict(dset.loc[:, chosen])

    # Combination.
    # NOTE(review): the SVR column appears twice; presumably intentional
    # extra weight in write_preds -- confirm.
    models = ['Ca_rdg_prds', 'Ca_gbr_prds', 'Ca_for_prds', 'Ca_svr_prds',
              'Ca_svr_prds']
    name = 'Ca_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Ca')
def test_ridge_gcv_sample_weights(gcv_mode, X_constructor, fit_intercept,
                                  n_features, y_shape, noise):
    """Weighted GCV must match grouped k-fold on row-repeated data.

    Integer sample weights are emulated by repeating each row that many
    times; GroupKFold keeps all copies of a row in the same fold.
    """
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)
    if len(y_shape) == 2:
        n_targets = y_shape[-1]
    else:
        n_targets = 1

    X, y = _make_sparse_offset_regression(
        n_samples=11, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise)
    y = y.reshape(y_shape)
    n_rows = X.shape[0]

    # Integer weights >= 1, reused as repetition counts.
    raw_weight = 3 * rng.randn(len(X))
    int_weight = (raw_weight - raw_weight.min() + 1).astype(int)
    indices = np.repeat(np.arange(n_rows), int_weight)
    sample_weight = int_weight.astype(float)
    X_tiled, y_tiled = X[indices], y[indices]

    cv = GroupKFold(n_splits=n_rows)
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    kfold = RidgeCV(alphas=alphas, cv=splits,
                    scoring='neg_mean_squared_error',
                    fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    # Per-original-row squared errors at the alpha chosen by k-fold.
    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
    squared_errors = (y_tiled - predictions) ** 2
    kfold_errors = np.asarray(
        [np.sum(squared_errors[indices == row], axis=0)
         for row in range(n_rows)])

    X_gcv = X_constructor(X)
    gcv_ridge = RidgeCV(alphas=alphas, store_cv_values=True,
                        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)

    best_position = alphas.index(kfold.alpha_)
    if len(y_shape) == 2:
        gcv_errors = gcv_ridge.cv_values_[:, :, best_position]
    else:
        gcv_errors = gcv_ridge.cv_values_[:, best_position]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
def pred_P(train, val, test, all_vars, loop):
    """Fit the P base models and pass their predictions to ``write_preds``.

    An SVR and a ridge regression (on all features) and a gradient boosting
    model (on the selected features plus four ``*_prds<loop>`` columns) are
    fitted on ``train``. Each model's predictions are written as a new
    column on ``val``, ``test`` and ``train``, so all three frames are
    modified in place.

    Parameters
    ----------
    train, val, test : pandas DataFrames holding the ``all_vars`` columns
        and the ``sand_prds``/``pH_prds``/``SOC_prds``/``Ca_prds`` columns
        suffixed with ``str(loop)``; ``train`` must also hold ``'P'``.
    all_vars : sequence of column labels used as candidate features.
    loop : value appended (via ``str``) to the column-name prefixes.
    """
    data = (val, test, train)
    suffix = str(loop)

    # Variable selection: keep a feature when either the mask returned by
    # ``lass_varselect`` or the univariate F-test selector flags it.
    lasso_mask = lass_varselect(train, all_vars, 'P', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1600)
    univ_selector.fit(train[all_vars], train['P'])
    univ_mask = univ_selector.get_support()
    chosen = [all_vars[i] for i in range(len(all_vars))
              if lasso_mask[i] | univ_mask[i]]

    # The combined predictions for the other four targets are used as
    # additional features.
    chosen.extend(prefix + suffix
                  for prefix in ('sand_prds', 'pH_prds', 'SOC_prds',
                                 'Ca_prds'))

    # SVM on every candidate feature. ``.loc`` replaces the ``.ix``
    # indexer, which no longer exists in current pandas.
    svr = svm.SVR(C=10000, epsilon=.1)
    svr.fit(train.loc[:, all_vars], train['P'])
    for dset in data:
        dset['P_svr_prds'] = svr.predict(dset.loc[:, all_vars])

    # Gradient boosting on the selected features.
    gbr = GradientBoostingRegressor(n_estimators=60, learning_rate=0.1,
                                    max_depth=5, random_state=42, verbose=0,
                                    min_samples_leaf=4)
    gbr.fit(train.loc[:, chosen], train['P'])
    for dset in data:
        dset['P_gbr_prds'] = gbr.predict(dset.loc[:, chosen])

    # Ridge with a single fixed alpha.
    P_ridge = RidgeCV(np.array([.55]), normalize=True)
    P_ridge.fit(train[all_vars], train['P'])
    for dset in data:
        dset['P_rdg_prds'] = P_ridge.predict(dset[all_vars])

    # Combination.
    models = ['P_rdg_prds', 'P_svr_prds', 'P_gbr_prds']
    name = 'P_prds' + suffix
    write_preds(models, name, train, val, test, 'P')
def pred_pH(train, val, test, all_vars, loop):
    """Fit the pH base models and pass their predictions to ``write_preds``.

    A random forest (on the selected features), a ridge regression and an
    SVR (both on all features) are fitted on ``train``. Each model's
    predictions are written as a new column on ``val``, ``test`` and
    ``train``, so all three frames are modified in place.

    Parameters
    ----------
    train, val, test : pandas DataFrames holding the ``all_vars`` columns;
        ``train`` must also hold a ``'pH'`` column.
    all_vars : sequence of column labels used as candidate features.
    loop : value appended (via ``str``) to ``'pH_prds'`` to build the name
        handed to ``write_preds``.
    """
    data = (val, test, train)

    # Variable selection: keep a feature when either the mask returned by
    # ``lass_varselect`` or the univariate F-test selector flags it.
    lasso_mask = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)  # intentionally unchanged
    univ_selector.fit(train[all_vars], train['pH'])
    univ_mask = univ_selector.get_support()
    chosen = [all_vars[i] for i in range(len(all_vars))
              if lasso_mask[i] | univ_mask[i]]

    # Random forest on the selected features. ``.loc`` replaces the
    # ``.ix`` indexer, which no longer exists in current pandas.
    forst = RandomForestRegressor(n_estimators=200)
    forst.fit(train.loc[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = forst.predict(dset.loc[:, chosen])

    # Ridge with a single fixed alpha.
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])

    # SVR on every candidate feature.
    svr = svm.SVR(C=11000, epsilon=.1)
    svr.fit(train.loc[:, all_vars], train['pH'])
    for dset in data:
        dset['pH_svr_prds'] = svr.predict(dset.loc[:, all_vars])

    # Combination.
    # NOTE(review): 'pH_svr_prds' is listed twice; presumably this doubles
    # its weight inside write_preds -- confirm.
    models = ['pH_rdg_prds', 'pH_svr_prds', 'pH_svr_prds', 'pH_for_prds']
    name = 'pH_prds' + str(loop)
    write_preds(models, name, train, val, test, 'pH')
def pred_Ca(train, val, test, all_vars, loop):
    """Fit the Ca base models and pass their predictions to ``write_preds``.

    A random forest (on the selected features), a lasso and a ridge
    regression (both on all features) are fitted on ``train``. Each model's
    predictions are written as a new column on ``val``, ``test`` and
    ``train``, so all three frames are modified in place.

    Parameters
    ----------
    train, val, test : pandas DataFrames holding the ``all_vars`` columns;
        ``train`` must also hold a ``'Ca'`` column.
    all_vars : sequence of column labels used as candidate features.
    loop : value appended (via ``str``) to ``'Ca_prds'`` to build the name
        handed to ``write_preds``.
    """
    data = (val, test, train)

    # Variable selection: keep a feature when either the mask returned by
    # ``lass_varselect`` or the univariate F-test selector flags it.
    # The scan starts at position 0; it previously started at 1, which
    # silently left the first variable out of the selection.
    lasso_mask = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1400)
    univ_selector.fit(train[all_vars], train['Ca'])
    univ_mask = univ_selector.get_support()
    chosen = [all_vars[i] for i in range(len(all_vars))
              if lasso_mask[i] | univ_mask[i]]

    # Random forest on the selected features. ``.loc`` replaces the
    # ``.ix`` indexer, which no longer exists in current pandas.
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.loc[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.loc[:, chosen])

    # Lasso on every candidate feature.
    lass = Lasso(alpha=.0000001, positive=True)
    lass.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_las_prds'] = lass.predict(dset[all_vars])

    # Ridge with a single fixed alpha.
    Ca_ridge = RidgeCV(np.array([.5]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])

    # Combination.
    # NOTE(review): 'Ca_for_prds' is listed twice; presumably this doubles
    # its weight inside write_preds -- confirm.
    models = ['Ca_las_prds', 'Ca_rdg_prds', 'Ca_for_prds', 'Ca_for_prds']
    name = 'Ca_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Ca')
def pred_P(train, val, test, all_vars, loop):
    """Build the P ensemble inputs and forward them to ``write_preds``.

    Fits an SVR and a ridge regression on all features and a gradient
    boosting model on the selected features plus four ``*_prds<loop>``
    columns, using ``train``. The predictions of every model are stored as
    new columns on ``val``, ``test`` and ``train`` (the frames are mutated
    in place).

    Parameters
    ----------
    train, val, test : pandas DataFrames holding the ``all_vars`` columns
        and the ``sand_prds``/``pH_prds``/``SOC_prds``/``Ca_prds`` columns
        suffixed with ``str(loop)``; ``train`` must also hold ``'P'``.
    all_vars : sequence of column labels used as candidate features.
    loop : value appended (via ``str``) to the column-name prefixes.
    """
    data = (val, test, train)
    suffix = str(loop)

    # Variable selection: union of the ``lass_varselect`` mask and the
    # univariate F-test selector's support mask.
    lasso_mask = lass_varselect(train, all_vars, 'P', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1600)
    univ_selector.fit(train[all_vars], train['P'])
    univ_mask = univ_selector.get_support()
    chosen = [all_vars[i] for i in range(len(all_vars))
              if lasso_mask[i] | univ_mask[i]]

    # Combined predictions for the other four targets become extra features.
    for prefix in ('sand_prds', 'pH_prds', 'SOC_prds', 'Ca_prds'):
        chosen.append(prefix + suffix)

    # SVM on every candidate feature. The removed ``.ix`` indexer is
    # replaced by the label-based ``.loc``.
    svr = svm.SVR(C=10000, epsilon=.1)
    svr.fit(train.loc[:, all_vars], train['P'])
    for dset in data:
        dset['P_svr_prds'] = svr.predict(dset.loc[:, all_vars])

    # Gradient boosting on the selected features.
    gbr = GradientBoostingRegressor(n_estimators=60, learning_rate=0.1,
                                    max_depth=5, random_state=42, verbose=0,
                                    min_samples_leaf=4)
    gbr.fit(train.loc[:, chosen], train['P'])
    for dset in data:
        dset['P_gbr_prds'] = gbr.predict(dset.loc[:, chosen])

    # Ridge with a single fixed alpha.
    P_ridge = RidgeCV(np.array([.55]), normalize=True)
    P_ridge.fit(train[all_vars], train['P'])
    for dset in data:
        dset['P_rdg_prds'] = P_ridge.predict(dset[all_vars])

    # Combination.
    models = ['P_rdg_prds', 'P_svr_prds', 'P_gbr_prds']
    name = 'P_prds' + suffix
    write_preds(models, name, train, val, test, 'P')
def test_ridge_gcv_vs_ridge_loo_cv(
        gcv_mode, X_constructor, X_shape, y_shape,
        fit_intercept, normalize, noise):
    """Compare GCV-mode RidgeCV against RidgeCV with one fold per sample."""
    n_samples, n_features = X_shape
    n_targets = 1
    if len(y_shape) == 2:
        n_targets = y_shape[-1]

    data_options = dict(
        n_samples=n_samples, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise, n_informative=5)
    X, y = _make_sparse_offset_regression(**data_options)
    y = y.reshape(y_shape)

    alphas = [1e-3, .1, 1., 10., 1e3]

    # Reference: as many folds as samples, on the unconverted X.
    loo_ridge = RidgeCV(
        cv=n_samples, fit_intercept=fit_intercept, alphas=alphas,
        scoring='neg_mean_squared_error', normalize=normalize)
    loo_ridge.fit(X, y)

    # Candidate: GCV on the converted X.
    gcv_ridge = RidgeCV(
        gcv_mode=gcv_mode, fit_intercept=fit_intercept, alphas=alphas,
        normalize=normalize)
    gcv_ridge.fit(X_constructor(X), y)

    assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_)
    for fitted_attribute in ('coef_', 'intercept_'):
        assert_allclose(getattr(gcv_ridge, fitted_attribute),
                        getattr(loo_ridge, fitted_attribute), rtol=1e-3)
class RidgeCVImpl():
    """Thin wrapper that builds and drives an ``SKLModel`` estimator.

    The constructor only records the hyperparameters; the underlying
    ``SKLModel`` is created anew on every call to :meth:`fit`.
    """

    def __init__(self, alphas=[0.1, 1.0, 10.0], fit_intercept=True,
                 normalize=False, scoring=None, cv=None, gcv_mode=None,
                 store_cv_values=False):
        """Record the hyperparameters later passed to ``SKLModel``.

        Every argument is stored under its own name in
        ``self._hyperparams``. A list given for ``alphas`` is copied.
        """
        # The default list object is created once and shared by every call.
        # Storing it by reference let a mutation through one instance leak
        # into all later instances, so lists are copied. The signature
        # default itself is left untouched for introspecting callers.
        if isinstance(alphas, list):
            alphas = list(alphas)
        self._hyperparams = {
            'alphas': alphas,
            'fit_intercept': fit_intercept,
            'normalize': normalize,
            'scoring': scoring,
            'cv': cv,
            'gcv_mode': gcv_mode,
            'store_cv_values': store_cv_values}

    def fit(self, X, y=None):
        """Create a fresh ``SKLModel`` and fit it; ``y`` is passed if given.

        Returns ``self``.
        """
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        """Return the fitted model's predictions for ``X``.

        Requires a prior call to :meth:`fit`, which creates the model.
        """
        return self._sklearn_model.predict(X)
def test_solver_consistency(
        solver, proportion_nonzero, n_samples, dtype, sparse_X, seed):
    """Every solver must reproduce the coefficients of the 'svd' solver."""
    alpha = 1.
    if proportion_nonzero > .9:
        noise = 50.
    else:
        noise = 500.

    X, y = _make_sparse_offset_regression(
        bias=10, n_features=30, proportion_nonzero=proportion_nonzero,
        noise=noise, random_state=seed, n_samples=n_samples)

    # Reference fit, done before the dtype cast and sparse conversion.
    reference = Ridge(solver='svd', normalize=True, alpha=alpha)
    svd_ridge = reference.fit(X, y)

    X = X.astype(dtype, copy=False)
    y = y.astype(dtype, copy=False)
    if sparse_X:
        X = sp.csr_matrix(X)

    if solver == 'ridgecv':
        ridge = RidgeCV(alphas=[alpha], normalize=True)
    else:
        ridge = Ridge(solver=solver, tol=1e-10, normalize=True, alpha=alpha)
    ridge.fit(X, y)

    for fitted_attribute in ('coef_', 'intercept_'):
        assert_allclose(getattr(ridge, fitted_attribute),
                        getattr(svd_ridge, fitted_attribute),
                        atol=1e-3, rtol=1e-3)
def test_ridgecv_store_cv_values():
    """Test RidgeCV's store_cv_values attribute."""
    rng = np.random.RandomState(42)
    n_samples, n_features = 8, 5
    x = rng.randn(n_samples, n_features)
    alphas = [1e-1, 1e0, 1e1]
    n_alphas = len(alphas)

    model = RidgeCV(alphas=alphas, store_cv_values=True)

    # with len(y.shape) == 1
    y = rng.randn(n_samples)
    model.fit(x, y)
    assert_equal(model.cv_values_.shape, (n_samples, n_alphas))

    # with len(y.shape) == 2
    n_responses = 3
    y = rng.randn(n_samples, n_responses)
    model.fit(x, y)
    assert_equal(model.cv_values_.shape,
                 (n_samples, n_responses, n_alphas))
def test_ridgecv_sample_weight():
    """RidgeCV with sample weights must agree with a weighted grid search."""
    rng = np.random.RandomState(0)
    alphas = (0.1, 1.0, 10.0)

    # There are different algorithms for n_samples > n_features
    # and the opposite, so test them both.
    shapes = ((6, 5), (5, 10))
    for n_samples, n_features in shapes:
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)

        folds = KFold(5)
        ridgecv = RidgeCV(alphas=alphas, cv=folds)
        ridgecv.fit(X, y, sample_weight=sample_weight)

        # Check using GridSearchCV directly
        grid_search = GridSearchCV(Ridge(), {'alpha': alphas}, cv=folds)
        grid_search.fit(X, y, sample_weight=sample_weight)
        best = grid_search.best_estimator_

        assert_equal(ridgecv.alpha_, best.alpha)
        assert_array_almost_equal(ridgecv.coef_, best.coef_)
def pred_Ca(train, val, test, all_vars, loop):
    """Build the Ca ensemble inputs and forward them to ``write_preds``.

    Fits a random forest on the selected features plus a lasso and a ridge
    regression on all features, using ``train``. The predictions of every
    model are stored as new columns on ``val``, ``test`` and ``train``
    (the frames are mutated in place).

    Parameters
    ----------
    train, val, test : pandas DataFrames holding the ``all_vars`` columns;
        ``train`` must also hold a ``'Ca'`` column.
    all_vars : sequence of column labels used as candidate features.
    loop : value appended (via ``str``) to ``'Ca_prds'`` to build the name
        handed to ``write_preds``.
    """
    data = (val, test, train)

    # Variable selection: union of the ``lass_varselect`` mask and the
    # univariate F-test selector's support mask. Every position is
    # examined; the scan used to begin at index 1 and therefore could
    # never select the first variable.
    lasso_mask = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1400)
    univ_selector.fit(train[all_vars], train['Ca'])
    univ_mask = univ_selector.get_support()
    chosen = [all_vars[i] for i in range(len(all_vars))
              if lasso_mask[i] | univ_mask[i]]

    # Random forest on the selected features. The removed ``.ix`` indexer
    # is replaced by the label-based ``.loc``.
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.loc[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.loc[:, chosen])

    # Lasso on every candidate feature.
    lass = Lasso(alpha=.0000001, positive=True)
    lass.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_las_prds'] = lass.predict(dset[all_vars])

    # Ridge with a single fixed alpha.
    Ca_ridge = RidgeCV(np.array([.5]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])

    # Combination.
    # NOTE(review): the forest column appears twice; presumably intentional
    # extra weight in write_preds -- confirm.
    models = ['Ca_las_prds', 'Ca_rdg_prds', 'Ca_for_prds', 'Ca_for_prds']
    name = 'Ca_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Ca')
def __init__(self, alphas=[0.1, 1.0, 10.0], fit_intercept=True,
             normalize=False, scoring=None, cv=None, gcv_mode=None,
             store_cv_values=False):
    """Record the hyperparameters and build the wrapped ``SKLModel``.

    Every argument is stored under its own name in ``self._hyperparams``,
    which is then unpacked into ``SKLModel`` to create
    ``self._wrapped_model``. A list given for ``alphas`` is copied.
    """
    # The default list object is created once and shared by every call.
    # Storing it by reference let a mutation through one instance leak
    # into all later instances, so lists are copied. The signature default
    # itself is left untouched for introspecting callers.
    if isinstance(alphas, list):
        alphas = list(alphas)
    self._hyperparams = {
        'alphas': alphas,
        'fit_intercept': fit_intercept,
        'normalize': normalize,
        'scoring': scoring,
        'cv': cv,
        'gcv_mode': gcv_mode,
        'store_cv_values': store_cv_values,
    }
    self._wrapped_model = SKLModel(**self._hyperparams)
def test_ridge_gcv_sample_weights(
        gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
    """Compare weighted GCV with grouped k-fold CV on repeated rows.

    Integer sample weights are generated and the data set is expanded by
    repeating each row ``weight`` times.  A RidgeCV run with one fold per
    original row on the expanded data serves as the reference; a RidgeCV
    using ``gcv_mode`` and ``sample_weight`` on the original data must give
    the same alpha and matching errors, coefficients and intercept.
    """
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)
    if len(y_shape) == 2:
        n_targets = y_shape[-1]
    else:
        n_targets = 1
    X, y = _make_sparse_offset_regression(
        n_samples=11, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise)
    y = y.reshape(y_shape)
    n_rows = X.shape[0]

    # Integer weights >= 1, so that weighting a row is the same as repeating it
    raw_weight = 3 * rng.randn(len(X))
    int_weight = (raw_weight - raw_weight.min() + 1).astype(int)
    indices = np.repeat(np.arange(n_rows), int_weight)
    sample_weight = int_weight.astype(float)
    X_tiled = X[indices]
    y_tiled = y[indices]

    # Reference model: grouped k-fold with one group per original row
    cv = GroupKFold(n_splits=n_rows)
    kfold = RidgeCV(
        alphas=alphas,
        cv=cv.split(X_tiled, y_tiled, groups=indices),
        scoring='neg_mean_squared_error',
        fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    # Per-row squared error of the reference model; a new split iterator is
    # requested for the second pass, as a fresh one was used originally
    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    predictions = cross_val_predict(
        ridge_reg, X_tiled, y_tiled,
        cv=cv.split(X_tiled, y_tiled, groups=indices))
    squared_errors = (y_tiled - predictions) ** 2
    kfold_errors = np.asarray([
        np.sum(squared_errors[indices == row], axis=0)
        for row in np.arange(n_rows)])

    # Model under test: weighted GCV on the original (non-repeated) data
    X_gcv = X_constructor(X)
    gcv_ridge = RidgeCV(
        alphas=alphas, store_cv_values=True,
        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
    best = alphas.index(kfold.alpha_)
    if len(y_shape) == 2:
        gcv_errors = gcv_ridge.cv_values_[:, :, best]
    else:
        gcv_errors = gcv_ridge.cv_values_[:, best]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
def connectWidgets(self):
    """Initialise the ridge widgets from default-constructed estimators.

    Hides ``self.Ridge``, then copies the attribute values of a fresh
    ``RidgeCV()`` into the ``*_cv`` widgets and those of a fresh ``Ridge()``
    into the remaining widgets.
    """
    self.Ridge.setVisible(False)

    # --- RidgeCV widgets ---
    cv_defaults = RidgeCV()
    self.alphasLineEdit_cv.setText(str(cv_defaults.alphas))
    self.fitInterceptCheckBox_cv.setChecked(cv_defaults.fit_intercept)
    self.normalizeCheckBox_cv.setChecked(cv_defaults.normalize)
    self.defaultComboItem(self.scoringComboBox_cv, cv_defaults.scoring)
    self.defaultComboItem(self.gCVModeComboBox_cv, cv_defaults.gcv_mode)
    self.storeCVValuesCheckBox_cv.setChecked(cv_defaults.store_cv_values)

    # --- Ridge widgets ---
    ridge_defaults = Ridge()
    self.alphaDoubleSpinBox.setValue(ridge_defaults.alpha)
    self.fitInterceptCheckBox.setChecked(ridge_defaults.fit_intercept)
    self.normalizeCheckBox.setChecked(ridge_defaults.normalize)
    self.copyXCheckBox.setChecked(ridge_defaults.copy_X)
    self.defaultComboItem(self.solverComboBox, ridge_defaults.solver)
    self.toleranceDoubleSpinBox.setValue(ridge_defaults.tol)
    self.randomStateLineEdit.setText(str(ridge_defaults.random_state))
def bag_of_words_ridge(variable):
    """Fit a TF-IDF + ridge model on one text column and store its predictions.

    Reads the module-level ``train_and_validation`` and ``test`` data sets.
    The vectorizer is fitted on rows posted after 2013-11-1, the ridge model
    on rows posted after 2013-11-8, and predictions for all rows are written
    to a new column named ``"b_of_wds_prds_" + variable`` on both data sets.
    """
    posted = pd.to_datetime(train_and_validation.date_posted)

    # use a vectorizer to count word usage instances and create sparse matrix;
    # the vectorizer itself is fitted on the recent training rows only
    vectorizer = TfidfVectorizer(min_df=.1, max_df=.9)
    vectorizer_rows = posted > pd.to_datetime('2013-11-1')
    vectorizer.fit(train_and_validation[variable][vectorizer_rows])
    bag_of_words_X = vectorizer.transform(train_and_validation[variable])
    test_bag_of_words = vectorizer.transform(test[variable])

    # restrict the fit by date to guarantee recency and to limit run time
    ridge = RidgeCV(array([18]), store_cv_values=True, normalize=True)
    ridge_rows = posted > pd.to_datetime('2013-11-8')
    ridge.fit(bag_of_words_X[ridge_rows],
              train_and_validation.is_exciting[ridge_rows])

    # put predictions into samples for use later as base classifiers in
    # ada boost
    var_nm = "b_of_wds_prds_" + variable
    train_and_validation[var_nm] = ridge.predict(bag_of_words_X)
    test[var_nm] = ridge.predict(test_bag_of_words)
def test_ridgecv_store_cv_values():
    """Check the shape of ``cv_values_`` for 1-D and 2-D targets."""
    rng = np.random.RandomState(42)

    n_samples, n_features = 8, 5
    x = rng.randn(n_samples, n_features)
    alphas = [1e-1, 1e0, 1e1]
    n_alphas = len(alphas)
    n_targets = 3

    r = RidgeCV(alphas=alphas, store_cv_values=True)

    # (shape of y, expected shape of cv_values_), first with
    # len(y.shape) == 1 and then with len(y.shape) == 2
    cases = (
        ((n_samples,), (n_samples, n_alphas)),
        ((n_samples, n_targets), (n_samples, n_targets, n_alphas)),
    )
    for y_shape, expected_shape in cases:
        y = rng.randn(*y_shape)
        r.fit(x, y)
        assert r.cv_values_.shape == expected_shape
def _test_ridge_loo(filter_):
    """Check the efficient leave-one-out of ``_RidgeGCV`` in several ways.

    ``filter_`` is applied to ``X_diabetes`` before fitting, so the same
    checks can be run on dense or sparse input.  Returns a one-element list
    holding the alpha selected by ``_RidgeGCV``.
    """
    n_samples = X_diabetes.shape[0]

    ridge_gcv = _RidgeGCV(fit_intercept=False)
    ridge = Ridge(alpha=1.0, fit_intercept=False)

    # generalized cross-validation (efficient leave-one-out)
    decomp = ridge_gcv._pre_compute(X_diabetes, y_diabetes)
    errors, _ = ridge_gcv._errors(1.0, y_diabetes, *decomp)
    values, _ = ridge_gcv._values(1.0, y_diabetes, *decomp)

    # brute-force leave-one-out: refit with one example held out at a time
    errors2, values2 = [], []
    for held_out in range(n_samples):
        keep = np.arange(n_samples) != held_out
        ridge.fit(X_diabetes[keep], y_diabetes[keep])
        value = ridge.predict([X_diabetes[held_out]])[0]
        errors2.append((y_diabetes[held_out] - value) ** 2)
        values2.append(value)

    # efficient and brute-force LOO must agree
    assert_almost_equal(errors, errors2)
    assert_almost_equal(values, values2)

    # generalized cross-validation (efficient leave-one-out, SVD variation)
    decomp = ridge_gcv._pre_compute_svd(X_diabetes, y_diabetes)
    errors3, _ = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp)
    values3, _ = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp)

    # efficient and SVD-based efficient LOO must agree
    assert_almost_equal(errors, errors3)
    assert_almost_equal(values, values3)

    # best alpha found by the GCV estimator itself
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret = [alpha_]

    # same best alpha with a custom loss function
    loss_scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=loss_scoring)
    ignore_warnings(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv2.alpha_, alpha_)

    # same best alpha with a custom score function
    def neg_mse(x, y):
        return -mean_squared_error(x, y)

    score_scoring = make_scorer(neg_mse)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=score_scoring)
    ignore_warnings(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv3.alpha_, alpha_)

    # same best alpha with a scorer object
    scorer = get_scorer('mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv4.alpha_, alpha_)

    # same best alpha with (uniform) sample weights
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                  sample_weight=np.ones(n_samples))
    assert_equal(ridge_gcv.alpha_, alpha_)

    # simulate several responses: two identical target columns must give the
    # single-target prediction in each column
    Y = np.vstack((y_diabetes, y_diabetes)).T
    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred,
                              decimal=5)

    return ret
from .._utils import CacheMixin from .._utils.cache_mixin import _check_memory from .._utils.param_validation import (_adjust_screening_percentile, check_feature_screening) from ..input_data.masker_validation import check_embedded_nifti_masker SUPPORTED_ESTIMATORS = dict( svc_l1=LinearSVC(penalty='l1', dual=False, max_iter=1e4), svc_l2=LinearSVC(penalty='l2', max_iter=1e4), svc=LinearSVC(penalty='l2', max_iter=1e4), logistic_l1=LogisticRegression(penalty='l1', solver='liblinear'), logistic_l2=LogisticRegression(penalty='l2', solver='liblinear'), logistic=LogisticRegression(penalty='l2', solver='liblinear'), ridge_classifier=RidgeClassifierCV(), ridge_regressor=RidgeCV(), ridge=RidgeCV(), svr=SVR(kernel='linear', max_iter=1e4), ) def _check_param_grid(estimator, X, y, param_grid=None): """Check param_grid and return sensible default if param_grid is None. Parameters ----------- estimator: str, optional The estimator to choose among: 'svc', 'svc_l2', 'svc_l1', 'logistic', 'logistic_l1', 'logistic_l2', 'ridge', 'ridge_classifier', 'ridge_regressor', and 'svr'. Note that the 'svc' and 'svc_l2'; 'logistic' and 'logistic_l2'; 'ridge' and 'ridge_regressor'
def ensemble_ridge(penalty):
    """Return the RMSE of a ridge model fitted with the given penalty.

    The model is fitted on the module-level ``data_for_ensemble`` against
    ``train.is_exciting`` and the error is measured on that same data.
    """
    model = RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    model.fit(data_for_ensemble, train.is_exciting)
    residuals = train.is_exciting - model.predict(data_for_ensemble)
    return np.sqrt(np.mean(residuals**2))
lassoed_geo, lassoed_crime, dum_dict['dummy_dummies'], dum_dict['var8_dummies'], dum_dict['var1_dummies'], dum_dict['var2_dummies'], dum_dict['var3_dummies'], dum_dict['var4_dummies'], dum_dict['var5_dummies'], dum_dict['var6_dummies'], dum_dict['var9_dummies']) for var_type in var_types_for_svc: for cols in var_type: svc_feats.append(cols) ##################### Run classifiers ############################ weights = np.array([fire_train_TRAIN_smp['var11']]).squeeze() # Ridge ## using weights crashes this, don't know why ridge = RidgeCV(np.array([1.5]), store_cv_values=True, normalize=True) ridge.fit(fire_train_TRAIN_smp[ridge_feats], fire_train_TRAIN_smp.target) write_preds_allsamps(ridge, "fin_rdg_preds", ridge_feats) # claim size conditional on claim ridge ones_only = fire_train_TRAIN_smp['target']>0 size_train = fire_train_TRAIN_smp.ix[ones_only, :] size_ridge = RidgeCV(np.array([1.5]), store_cv_values=True, normalize=True) size_ridge.fit(size_train[ridge_feats], size_train.target) write_preds_allsamps(size_ridge, "size_rdg_preds", ridge_feats) # Lasso lass = Lasso(alpha=.0000001, positive=True, max_iter=100000 , tol=.001, normalize=True) lass.fit(np.array(fire_train_TRAIN_smp[ridge_feats]), np.array(fire_train_TRAIN_smp.target))
def pc_ridge(penalty):
    """Take a complexity penalty and return the resulting ridge RMSE.

    The model is fitted on the first ``documents`` rows of the module-level
    ``train_tokens`` / ``train.is_exciting``; the RMSE is computed from its
    predictions for all of ``train_tokens``.
    """
    model = RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    model.fit(train_tokens[0:documents], train.is_exciting[0:documents])
    residuals = train.is_exciting - model.predict(train_tokens)
    return np.sqrt(np.mean(residuals**2))
def _test_ridge_cv(filter_):
    """Check the fitted attributes of RidgeCV with default and k-fold CV.

    ``filter_`` is applied to ``X_diabetes`` before every fit and predict.
    In both configurations ``coef_`` must be one-dimensional and
    ``intercept_`` must be a ``np.float64``.
    """
    n_samples = X_diabetes.shape[0]
    ridge_cv = RidgeCV()

    def fit_and_check():
        # fit, make sure predict runs, then inspect the fitted attributes
        ridge_cv.fit(filter_(X_diabetes), y_diabetes)
        ridge_cv.predict(filter_(X_diabetes))
        assert_equal(len(ridge_cv.coef_.shape), 1)
        assert_equal(type(ridge_cv.intercept_), np.float64)

    # default cross-validation
    fit_and_check()

    # explicit k-fold cross-validation
    ridge_cv.set_params(cv=KFold(n_samples, 5))
    fit_and_check()
# Report how long the forest stage (timed from t0) took.
print("It took {time} minutes to run forests".format(time=(time.time()-t0)/60))
forest_features = len(train_features.columns)

# run logistic and score it on the validation set
logit = LogisticRegression()
logit.fit(train_features, train_outcome)
logit_feats = len(train_features.columns)
validation['predictions'] = logit.predict_proba(validation_for_p)[:, 1]
fpr, tpr, thresholds = roc_curve(validation.is_exciting, validation.predictions)
auc_score = auc(fpr, tpr)
auc_score

# run ridge and score it on the validation set
full_ridge = RidgeCV(np.array([7]), store_cv_values=True, normalize=True)
full_ridge.fit(train_features, train_outcome)
# Use the ridge model's own predictions here.  The previous code re-used
# logit.predict_proba, so this AUC merely repeated the logistic score and the
# fitted ridge model was never evaluated.
validation['predictions'] = full_ridge.predict(validation_for_p)
fpr, tpr, thresholds = roc_curve(validation.is_exciting, validation.predictions)
auc_score = auc(fpr, tpr)
auc_score

# add base-model predictions as features for the ensemble
# NOTE(review): the Adaboost columns are assigned as DataFrames (aligned on
# index) while the Forest columns are assigned as plain arrays -- confirm the
# target frames have a default integer index.
ens_train_features['Adaboost'] = pd.DataFrame(
    clf.predict_proba(train_features.iloc[:, 0:30])[:, 1])
validation_for_p['Adaboost'] = pd.DataFrame(
    clf.predict_proba(validation_for_p.iloc[:, 0:30])[:, 1])
test_X['Adaboost'] = pd.DataFrame(
    clf.predict_proba(test_X.iloc[:, 0:30])[:, 1])
ens_train_features['Forest'] = rndm_forest_clf.predict_proba(
    train_features.iloc[:, 0:forest_features])[:, 1]
validation_for_p['Forest'] = rndm_forest_clf.predict_proba(
    validation_for_p.iloc[:, 0:forest_features])[:, 1]
def test_ridge_cv_sparse_svd():
    """Check that RidgeCV with ``gcv_mode="svd"`` raises TypeError on sparse X."""
    X = sp.csr_matrix(X_diabetes)
    ridge = RidgeCV(gcv_mode="svd")
    # Pass the targets as well: ``fit(X)`` on its own raises TypeError simply
    # because the required ``y`` argument is missing, so the test would pass
    # without ever exercising the sparse/SVD code path.
    assert_raises(TypeError, ridge.fit, X, y_diabetes)
# init_guess seeds the optimization with a guess of the optimal penalty
t0 = time.time()
optimizer = minimize(
    pc_ridge,
    init_guess,
    method='nelder-mead',
    options=dict(xtol=1e-2, disp=True),
)
print("It took {time} minutes to optimize".format(
    time=(time.time() - t0) / 60))

# run ridge with optimal penalization;
# optimizer.x is the ridge penalty that minimized rmse
t0 = time.time()
ridge = RidgeCV(alphas=optimizer.x, store_cv_values=True, normalize=True)
ridge.fit(train_tokens[0:documents], train.is_exciting[0:documents])
print("It took {time} minutes to run the optimized ridge".format(
    time=(time.time() - t0) / 60))

# create an OLS regression of the outcome on word count
ols = sm.regression.linear_model.OLS(train.is_exciting, train.word_count)
results = ols.fit()

# add ridge and ols predictions to train, then to test
train['ridge_predictions'] = ridge.predict(train_tokens)
train['length_predictions'] = train.word_count * results.params[0]
test['ridge_predictions'] = ridge.predict(test_tokens)
test['length_predictions'] = test.word_count * results.params[0]
def _test_ridge_loo(filter_):
    """Check the efficient leave-one-out of ``_RidgeGCV`` in several ways.

    ``filter_`` is applied to ``X_diabetes`` before fitting, so the same
    checks can be run on dense or sparse input.  Returns a one-element list
    holding the alpha selected by ``_RidgeGCV``.
    """
    n_samples = X_diabetes.shape[0]

    ridge_gcv = _RidgeGCV(fit_intercept=False)
    ridge = Ridge(alpha=1.0, fit_intercept=False)

    # generalized cross-validation (efficient leave-one-out)
    decomp = ridge_gcv._pre_compute(X_diabetes, y_diabetes)
    errors, _ = ridge_gcv._errors(1.0, y_diabetes, *decomp)
    values, _ = ridge_gcv._values(1.0, y_diabetes, *decomp)

    # brute-force leave-one-out: refit with one example held out at a time
    errors2, values2 = [], []
    for held_out in range(n_samples):
        keep = np.arange(n_samples) != held_out
        ridge.fit(X_diabetes[keep], y_diabetes[keep])
        value = ridge.predict([X_diabetes[held_out]])[0]
        errors2.append((y_diabetes[held_out] - value) ** 2)
        values2.append(value)

    # efficient and brute-force LOO must agree
    assert_almost_equal(errors, errors2)
    assert_almost_equal(values, values2)

    # generalized cross-validation (efficient leave-one-out, SVD variation)
    decomp = ridge_gcv._pre_compute_svd(X_diabetes, y_diabetes)
    errors3, _ = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp)
    values3, _ = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp)

    # efficient and SVD-based efficient LOO must agree
    assert_almost_equal(errors, errors3)
    assert_almost_equal(values, values3)

    # best alpha found by the GCV estimator itself
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret = [alpha_]

    # same best alpha with a custom loss function
    loss_scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=loss_scoring)
    ignore_warnings(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv2.alpha_, alpha_)

    # same best alpha with a custom score function
    def neg_mse(x, y):
        return -mean_squared_error(x, y)

    score_scoring = make_scorer(neg_mse)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=score_scoring)
    ignore_warnings(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv3.alpha_, alpha_)

    # same best alpha with a scorer object
    scorer = get_scorer('mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv4.alpha_, alpha_)

    # same best alpha with (uniform) sample weights
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                  sample_weight=np.ones(n_samples))
    assert_equal(ridge_gcv.alpha_, alpha_)

    # simulate several responses: two identical target columns must give the
    # single-target prediction in each column
    Y = np.vstack((y_diabetes, y_diabetes)).T
    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred,
                              decimal=5)

    return ret
def pc_ridge(penalty):
    """Take a complexity penalty and return the resulting ridge RMSE.

    The model is fitted on the first ``documents`` rows of the module-level
    ``train_tokens`` / ``train.is_exciting``; the RMSE is computed from its
    predictions for all of ``train_tokens``.
    """
    model = RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    model.fit(train_tokens[0:documents], train.is_exciting[0:documents])
    residuals = train.is_exciting - model.predict(train_tokens)
    return np.sqrt(np.mean(residuals**2))
predictions = ridge.predict(data_for_ensemble) return np.sqrt(np.mean((train.is_exciting-predictions)**2)) # we run an optimizer to find the penalty that minimizes rmse of ridge init_guess = array([35]) # init_guess initializes the opimization with a guess of the optimal penalty t0= time.time() optimizer = minimize(pc_ridge, init_guess, method='nelder-mead', options= {'xtol':1e-2, 'disp':True}) print "It took {time} minutes to optimize".format(time=(time.time()-t0)/60) # run ridge with optimal penalization t0= time.time() ridge= RidgeCV(alphas=optimizer.x, store_cv_values=True, normalize=True) # optimizer.x is the ridge penalty that minimized rmse ridge.fit(train_tokens[0:documents], train.is_exciting[0:documents]) print "It took {time} minutes to run the optimized ridge".format(time=(time.time()-t0)/60) # create an OLS regression for word count ols= sm.regression.linear_model.OLS(train.is_exciting, train.word_count) results= ols.fit() # add ols and ridge predictions to train and test data train['ridge_predictions']=ridge.predict(train_tokens) train['length_predictions'] = train.word_count*results.params[0] test['ridge_predictions']=ridge.predict(test_tokens) test['length_predictions'] = test.word_count*results.params[0]
def ensemble_ridge(penalty):
    """Return the RMSE of a ridge model fitted with the given penalty.

    The model is fitted on the module-level ``data_for_ensemble`` against
    ``train.is_exciting`` and the error is measured on that same data.
    """
    model = RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    model.fit(data_for_ensemble, train.is_exciting)
    residuals = train.is_exciting - model.predict(data_for_ensemble)
    return np.sqrt(np.mean(residuals**2))