def test_check_gcv_mode_error(mode):
    X, y = make_regression(n_samples=5, n_features=2)
    gcv = RidgeCV(gcv_mode=mode)
    with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"):
        gcv.fit(X, y)
    with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"):
        _check_gcv_mode(X, mode)
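By way of contrast with the error case above, here is a minimal sketch (not taken from any of the listed projects) that fits RidgeCV with one of the accepted gcv_mode values; in scikit-learn the valid choices are 'auto', 'svd', and 'eigen', and the toy data below is purely illustrative.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV

# toy data; any small regression problem works here
X, y = make_regression(n_samples=20, n_features=4, random_state=0)
# 'auto', 'svd' and 'eigen' are the accepted gcv_mode values
gcv = RidgeCV(alphas=(0.1, 1.0, 10.0), gcv_mode='svd')
gcv.fit(X, y)
print(gcv.alpha_)  # alpha selected by the efficient leave-one-out CV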
def pred_pH(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen =  []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only =  []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # random forest
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.ix[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.ix[:, chosen])  
    # lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])
    # combination
    models= [ 'pH_rdg_prds', 'pH_las_prds', 
              'pH_for_prds', 'pH_for_prds' ] 
    name = 'pH_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'pH')
Example #3
def _test_ridge_cv_normalize(filter_):
    ridge_cv = RidgeCV(normalize=True, cv=3)
    ridge_cv.fit(filter_(10. * X_diabetes), y_diabetes)

    gs = GridSearchCV(Ridge(normalize=True), cv=3,
                      param_grid={'alpha': ridge_cv.alphas})
    gs.fit(filter_(10. * X_diabetes), y_diabetes)
    assert_equal(gs.best_estimator_.alpha, ridge_cv.alpha_)
Example #4
def _test_ridge_cv_normalize(filter_):
    ridge_cv = RidgeCV(normalize=True, cv=3)
    ridge_cv.fit(filter_(10. * X_diabetes), y_diabetes)

    gs = GridSearchCV(Ridge(normalize=True), cv=3,
                      param_grid={'alpha': ridge_cv.alphas})
    gs.fit(filter_(10. * X_diabetes), y_diabetes)
    assert_equal(gs.best_estimator_.alpha, ridge_cv.alpha_)
Example #5
def test_ridgecv_int_alphas():
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                  [1.0, 1.0], [1.0, 0.0]])
    y = [1, 1, 1, -1, -1]

    # Integers
    ridge = RidgeCV(alphas=(1, 10, 100))
    ridge.fit(X, y)
Example #6
def test_ridgecv_int_alphas():
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                  [1.0, 1.0], [1.0, 0.0]])
    y = [1, 1, 1, -1, -1]

    # Integers
    ridge = RidgeCV(alphas=(1, 10, 100))
    ridge.fit(X, y)
def pred_sand(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    sand_lassoed_vars = lass_varselect(train, all_vars, 'Sand', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['Sand'])
    pvals = univ_selector.get_support()
    chosen = []
    for x in range(0, len(all_vars)):
        if sand_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only = []
    for x in range(0, len(all_vars)):
        if sand_lassoed_vars[x]:
            lass_only.append(all_vars[x])

    # nearest neighbors
    #neigh = KNeighborsRegressor(n_neighbors=2)
    #neigh.fit(train.ix[:, chosen], train['Sand'])
    #for dset in data:
    #  dset['sand_ngh_prds'] = neigh.predict(dset.ix[:, chosen])

    # SVM
    #svr = svm.SVR()
    #svr.fit(train.ix[:, lass_only], train['Sand'])
    #for dset in data:
    #dset['sand_svr_prds'] = svr.predict(dset.ix[:, lass_only])
    # randomforest
    forst = RandomForestRegressor(n_estimators=200)
    forst.fit(train.ix[:, chosen], train['Sand'])
    for dset in data:
        dset['sand_for_prds'] = forst.predict(dset.ix[:, chosen])

    # SVM
    svr = svm.SVR(C=23000)
    svr.fit(train.ix[:, all_vars], train['Sand'])
    for dset in data:
        dset['sand_svr_prds'] = svr.predict(dset.ix[:, all_vars])

    # lasso
    #lass = Lasso(alpha=.0000001, positive=True)
    #lass.fit(train[all_vars], train['Sand'])
    #for dset in data:
    #    dset['sand_las_prds'] = lass.predict(dset[all_vars])

    # ridge
    sand_ridge = RidgeCV(np.array([1.135]), normalize=True)
    sand_ridge.fit(train[all_vars], train['Sand'])
    for dset in data:
        dset['sand_rdg_prds'] = sand_ridge.predict(dset[all_vars])
    # combination
    models = [
        'sand_rdg_prds', 'sand_svr_prds', 'sand_for_prds', 'sand_svr_prds'
    ]
    #print train.ix[0:20, models]
    name = 'sand_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'Sand')
def pred_sand(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    sand_lassoed_vars = lass_varselect(train, all_vars, 'Sand', .00000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 1200)
    univ_selector.fit(train[all_vars], train['Sand'])
    pvals = univ_selector.get_support()
    chosen =  []
    for x in range(0, len(all_vars)):
        if sand_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only =  []
    for x in range(0, len(all_vars)):
        if sand_lassoed_vars[x]:
            lass_only.append(all_vars[x]) 
   
    # nearest neighbors
    #neigh = KNeighborsRegressor(n_neighbors=2)
    #neigh.fit(train.ix[:, chosen], train['Sand'])
    #for dset in data:
      #  dset['sand_ngh_prds'] = neigh.predict(dset.ix[:, chosen])
        
    # SVM
    #svr = svm.SVR()
    #svr.fit(train.ix[:, lass_only], train['Sand'])
    #for dset in data:
        #dset['sand_svr_prds'] = svr.predict(dset.ix[:, lass_only])
    # randomforest
    forst = RandomForestRegressor(n_estimators=200)
    forst.fit(train.ix[:, chosen], train['Sand'])
    for dset in data:
        dset['sand_for_prds'] = forst.predict(dset.ix[:, chosen])
        
    # SVM
    svr = svm.SVR(C=23000)
    svr.fit(train.ix[:, all_vars], train['Sand'])
    for dset in data:
        dset['sand_svr_prds'] = svr.predict(dset.ix[:, all_vars])
        
    # lasso
    #lass = Lasso(alpha=.0000001, positive=True)
    #lass.fit(train[all_vars], train['Sand'])
    #for dset in data:
    #    dset['sand_las_prds'] = lass.predict(dset[all_vars])

    # ridge
    sand_ridge = RidgeCV(np.array([1.135]), normalize=True)
    sand_ridge.fit(train[all_vars], train['Sand'])
    for dset in data:
        dset['sand_rdg_prds'] = sand_ridge.predict(dset[all_vars])
    # combination
    models= [ 'sand_rdg_prds', 'sand_svr_prds',
             'sand_for_prds',  'sand_svr_prds'] 
    #print train.ix[0:20, models]
    name = 'sand_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'Sand')
def pred_SOC(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    SOC_lassoed_vars = lass_varselect(train, all_vars, 'SOC', .000000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 4500)
    univ_selector.fit(train[all_vars], train['SOC'])
    univ_selector2 = SelectKBest(score_func = f_regression, k = 200)
    univ_selector2.fit(train[all_vars], train['SOC'])
    pvals = univ_selector.get_support()
    pvals2 = univ_selector2.get_support()
    chosen =  []
    for x in range(0, len(all_vars)):
        if SOC_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    chosen2 =  []
    for x in range(0, len(all_vars)):
        if SOC_lassoed_vars[x] | pvals2[x]:
            chosen2.append(all_vars[x])
    lass_only =  []
    for x in range(0, len(all_vars)):
        if SOC_lassoed_vars[x]:
            lass_only.append(all_vars[x])    
    #randomforest
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.ix[:, chosen], train['SOC'])
    for dset in data:
        dset['SOC_for_prds'] = forst.predict(dset.ix[:, chosen])
    gbr = GradientBoostingRegressor(n_estimators = 900,
            learning_rate = .0785, max_depth =1, random_state = 42, 
            verbose = 0, min_samples_leaf=4, subsample = .4)
    gbr.fit(train[chosen2], train['SOC'])
    for dset in data:
        dset['SOC_gbr_prds'] = gbr.predict(dset.ix[:, chosen2])    
    # lasso
    #lass = Lasso(alpha=.00000025, positive=True)
    #lass.fit(train[all_vars], train['SOC'])
    #for dset in data:
    #    dset['SOC_las_prds'] = lass.predict(dset[all_vars])

    # ridge
    SOC_ridge = RidgeCV(np.array([.315]), normalize=True)
    SOC_ridge.fit(train[all_vars], train['SOC'])
    for dset in data:
        dset['SOC_rdg_prds'] = SOC_ridge.predict(dset[all_vars])
    # SVR
    svr = svm.SVR(C=9000, epsilon=.1)
    svr.fit(train.ix[:, chosen], train['SOC'])
    for dset in data:
        dset['SOC_svr_prds'] = svr.predict(dset.ix[:, chosen])
    # combination
    models= ['SOC_rdg_prds', 'SOC_svr_prds',
              'SOC_gbr_prds', 'SOC_for_prds',  'SOC_svr_prds' ]
    name = 'SOC_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'SOC')
Example #10
def test_ridgecv_negative_alphas():
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0], [1.0,
                                                                     0.0]])
    y = [1, 1, 1, -1, -1]

    # Negative integers
    ridge = RidgeCV(alphas=(-1, -10, -100))
    assert_raises_regex(ValueError, "alphas must be positive", ridge.fit, X, y)

    # Negative floats
    ridge = RidgeCV(alphas=(-0.1, -1.0, -10.0))
    assert_raises_regex(ValueError, "alphas must be positive", ridge.fit, X, y)
def _test_ridge_loo(filter_):
    # test that it can work with both dense and sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    fit_intercept = filter_ == DENSE_FILTER
    ridge_gcv = _RidgeGCV(fit_intercept=fit_intercept)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret.append(alpha_)

    # check that we get same best alpha with custom loss_func
    f = ignore_warnings
    scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv2.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with custom score_func
    func = lambda x, y: -mean_squared_error(x, y)
    scoring = make_scorer(func)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv3.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with a scorer
    scorer = get_scorer('neg_mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv4.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with sample weights
    if filter_ == DENSE_FILTER:
        ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                      sample_weight=np.ones(n_samples))
        assert ridge_gcv.alpha_ == pytest.approx(alpha_)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_allclose(np.vstack((y_pred, y_pred)).T,
                    Y_pred, rtol=1e-5)

    return ret
def bag_of_words_ridge(variable):
    vectorizer = TfidfVectorizer(min_df=.1, max_df=.9)  # use a vectorizer to count word usage instances and create a sparse matrix
    bag_of_words_X = vectorizer.fit(train_and_validation[variable][pd.to_datetime(train_and_validation.date_posted)>pd.to_datetime('2013-11-1')])
    # normalization of vectorizer is fit using train only
    bag_of_words_X = vectorizer.transform(train_and_validation[variable])
    test_bag_of_words= vectorizer.transform(test[variable])
    ridge = RidgeCV(array([18]), store_cv_values=True, normalize=True)
    # using a date range to guarantee recency and limit run time
    ridge.fit(bag_of_words_X[pd.to_datetime(train_and_validation.date_posted)>pd.to_datetime('2013-11-8')], train_and_validation.is_exciting[pd.to_datetime(train_and_validation.date_posted)>pd.to_datetime('2013-11-8')])
    var_nm = "b_of_wds_prds_" + variable
    # put predictions into samples for use later as base classifiers in ada boost    
    train_and_validation[var_nm]=ridge.predict(bag_of_words_X)
    test[var_nm]=ridge.predict(test_bag_of_words)
def pred_Ca(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    Ca_lassoed_vars = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=5000)
    univ_selector.fit(train[all_vars], train['Ca'])
    univ_selector2 = SelectKBest(score_func=f_regression, k=200)
    univ_selector2.fit(train[all_vars], train['Ca'])
    pvals = univ_selector.get_support()
    pvals2 = univ_selector2.get_support()
    chosen = []
    for x in range(0, len(all_vars)):
        if Ca_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    chosen2 = []
    for x in range(0, len(all_vars)):
        if Ca_lassoed_vars[x] | pvals2[x]:
            chosen2.append(all_vars[x])
    gbr = GradientBoostingRegressor(n_estimators=1000,
                                    learning_rate=.1695,
                                    max_depth=1,
                                    random_state=42,
                                    verbose=0,
                                    min_samples_leaf=4)
    gbr.fit(train[chosen2], train['Ca'])
    for dset in data:
        dset['Ca_gbr_prds'] = gbr.predict(dset.ix[:, chosen2])
    # random forest
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.ix[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.ix[:, chosen])

    # ridge
    Ca_ridge = RidgeCV(np.array([4.925]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])
    # SVR model
    svr = svm.SVR(C=9500)
    svr.fit(train.ix[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_svr_prds'] = svr.predict(dset.ix[:, chosen])

    # combination
    models = [
        'Ca_rdg_prds', 'Ca_gbr_prds', 'Ca_for_prds', 'Ca_svr_prds',
        'Ca_svr_prds'
    ]
    name = 'Ca_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'Ca')
Example #14
def test_ridge_loo_cv_asym_scoring():
    # checking on asymmetric scoring
    scoring = 'explained_variance'
    n_samples, n_features = 10, 5
    n_targets = 1
    X, y = _make_sparse_offset_regression(n_samples=n_samples,
                                          n_features=n_features,
                                          n_targets=n_targets,
                                          random_state=0,
                                          shuffle=False,
                                          noise=1,
                                          n_informative=5)

    alphas = [1e-3, .1, 1., 10., 1e3]
    loo_ridge = RidgeCV(cv=n_samples,
                        fit_intercept=True,
                        alphas=alphas,
                        scoring=scoring,
                        normalize=True)

    gcv_ridge = RidgeCV(fit_intercept=True,
                        alphas=alphas,
                        scoring=scoring,
                        normalize=True)

    loo_ridge.fit(X, y)
    gcv_ridge.fit(X, y)

    assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_)
    assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3)
Example #15
def test_ridge_gcv_vs_ridge_loo_cv(gcv_mode, X_constructor, X_shape, y_shape,
                                   fit_intercept, normalize, noise):
    n_samples, n_features = X_shape
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(n_samples=n_samples,
                                          n_features=n_features,
                                          n_targets=n_targets,
                                          random_state=0,
                                          shuffle=False,
                                          noise=noise,
                                          n_informative=5)
    y = y.reshape(y_shape)

    alphas = [1e-3, .1, 1., 10., 1e3]
    loo_ridge = RidgeCV(cv=n_samples,
                        fit_intercept=fit_intercept,
                        alphas=alphas,
                        scoring='neg_mean_squared_error',
                        normalize=normalize)
    gcv_ridge = RidgeCV(gcv_mode=gcv_mode,
                        fit_intercept=fit_intercept,
                        alphas=alphas,
                        normalize=normalize)

    loo_ridge.fit(X, y)

    X_gcv = X_constructor(X)
    gcv_ridge.fit(X_gcv, y)

    assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_)
    assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3)
Example #16
def test_ridgecv_store_cv_values():
    rng = np.random.RandomState(42)

    n_samples = 8
    n_features = 5
    x = rng.randn(n_samples, n_features)
    alphas = [1e-1, 1e0, 1e1]
    n_alphas = len(alphas)

    r = RidgeCV(alphas=alphas, cv=None, store_cv_values=True)

    # with len(y.shape) == 1
    y = rng.randn(n_samples)
    r.fit(x, y)
    assert r.cv_values_.shape == (n_samples, n_alphas)

    # with len(y.shape) == 2
    n_targets = 3
    y = rng.randn(n_samples, n_targets)
    r.fit(x, y)
    assert r.cv_values_.shape == (n_samples, n_targets, n_alphas)

    r = RidgeCV(cv=3, store_cv_values=True)
    assert_raises_regex(ValueError, 'cv!=None and store_cv_values', r.fit, x,
                        y)
Example #17
    def run(self):
        m_attrib = {'None': None}
        r_attrib = {'None': None}
        try:
            m_state = int(self.maxNumOfIterationslineEdit.text())
        except:
            m_state = m_attrib[self.maxNumOfIterationslineEdit.text()]
        try:
            r_state = int(self.randomStateLineEdit.text())
        except:
            r_state = r_attrib[self.randomStateLineEdit.text()]

        if self.crossValidateCheckBox.isChecked():
            params = {'alphas': ast.literal_eval(self.alphasLineEdit_cv.text()),
                      'fit_intercept': self.fitInterceptCheckBox_cv.isChecked(),
                      'normalize': self.normalizeCheckBox_cv.isChecked(),
                      'scoring': {'None': None}.get(self.scoringComboBox_cv.currentText()),
                      'gcv_mode': {'None': None}.get(self.gCVModeComboBox_cv.currentText()),
                      'store_cv_values': self.storeCVValuesCheckBox_cv.isChecked(),
                      'CV': self.crossValidateCheckBox.isChecked()}
            return params, self.getChangedValues(params, RidgeCV())

        else:
            params = {'alpha': self.alphaDoubleSpinBox.value(),
                      'copy_X': self.copyXCheckBox.isChecked(),
                      'fit_intercept': self.fitInterceptCheckBox.isChecked(),
                      'max_iter': m_state,
                      'normalize': self.normalizeCheckBox.isChecked(),
                      'solver': self.solverComboBox.currentText(),
                      'tol': self.toleranceDoubleSpinBox.value(),
                      'random_state': r_state,
                      'CV': self.crossValidateCheckBox.isChecked()}
            return params, self.getChangedValues(params, Ridge())
Example #18
 def fit(self, X, y=None):
     self._sklearn_model = SKLModel(**self._hyperparams)
     if (y is not None):
         self._sklearn_model.fit(X, y)
     else:
         self._sklearn_model.fit(X)
     return self
def pred_Ca(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    Ca_lassoed_vars = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 5000)
    univ_selector.fit(train[all_vars], train['Ca'])
    univ_selector2 = SelectKBest(score_func = f_regression, k = 200)
    univ_selector2.fit(train[all_vars], train['Ca'])
    pvals = univ_selector.get_support()
    pvals2 = univ_selector2.get_support()
    chosen =  []
    for x in range(0, len(all_vars)):
        if Ca_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    chosen2 =  []
    for x in range(0, len(all_vars)):
        if Ca_lassoed_vars[x] | pvals2[x]:
            chosen2.append(all_vars[x])
    gbr = GradientBoostingRegressor(n_estimators = 1000,
        learning_rate = .1695, max_depth =1, random_state = 42, 
        verbose = 0, min_samples_leaf=4)
    gbr.fit(train[chosen2], train['Ca'])
    for dset in data:
       dset['Ca_gbr_prds'] = gbr.predict(dset.ix[:, chosen2])
    # random forest
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.ix[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.ix[:, chosen])
        
    # ridge
    Ca_ridge = RidgeCV(np.array([4.925]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])
    # SVR model
    svr = svm.SVR(C=9500)
    svr.fit(train.ix[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_svr_prds'] = svr.predict(dset.ix[:, chosen])

    # combination
    models= [ 'Ca_rdg_prds', 'Ca_gbr_prds',  
              'Ca_for_prds', 'Ca_svr_prds', 'Ca_svr_prds' ]   
    name = 'Ca_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'Ca')
Example #20
def test_ridge_gcv_sample_weights(gcv_mode, X_constructor, fit_intercept,
                                  n_features, y_shape, noise):
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(n_samples=11,
                                          n_features=n_features,
                                          n_targets=n_targets,
                                          random_state=0,
                                          shuffle=False,
                                          noise=noise)
    y = y.reshape(y_shape)

    sample_weight = 3 * rng.randn(len(X))
    sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
    indices = np.repeat(np.arange(X.shape[0]), sample_weight)
    sample_weight = sample_weight.astype(float)
    X_tiled, y_tiled = X[indices], y[indices]

    cv = GroupKFold(n_splits=X.shape[0])
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    kfold = RidgeCV(alphas=alphas,
                    cv=splits,
                    scoring='neg_mean_squared_error',
                    fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
    kfold_errors = (y_tiled - predictions)**2
    kfold_errors = [
        np.sum(kfold_errors[indices == i], axis=0)
        for i in np.arange(X.shape[0])
    ]
    kfold_errors = np.asarray(kfold_errors)

    X_gcv = X_constructor(X)
    gcv_ridge = RidgeCV(alphas=alphas,
                        store_cv_values=True,
                        gcv_mode=gcv_mode,
                        fit_intercept=fit_intercept)
    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
    if len(y_shape) == 2:
        gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
    else:
        gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
def pred_P(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    P_lassoed_vars = lass_varselect(train, all_vars, 'P', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1600)
    univ_selector.fit(train[all_vars], train['P'])
    pvals = univ_selector.get_support()
    chosen = []
    for x in range(0, len(all_vars)):
        if P_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only = []
    for x in range(0, len(all_vars)):
        if P_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    chosen.append('sand_prds' + str(object=loop))
    chosen.append('pH_prds' + str(object=loop))
    chosen.append('SOC_prds' + str(object=loop))
    chosen.append('Ca_prds' + str(object=loop))
    # SVM
    svr = svm.SVR(C=10000, epsilon=.1)
    svr.fit(train.ix[:, all_vars], train['P'])
    for dset in data:
        dset['P_svr_prds'] = svr.predict(dset.ix[:, all_vars])

    gbr = GradientBoostingRegressor(n_estimators=60,
                                    learning_rate=0.1,
                                    max_depth=5,
                                    random_state=42,
                                    verbose=0,
                                    min_samples_leaf=4)
    gbr.fit(train.ix[:, chosen], train['P'])
    for dset in data:
        dset['P_gbr_prds'] = gbr.predict(dset.ix[:, chosen])
    # ridge
    P_ridge = RidgeCV(np.array([.55]), normalize=True)
    P_ridge.fit(train[all_vars], train['P'])
    for dset in data:
        dset['P_rdg_prds'] = P_ridge.predict(dset[all_vars])
    # combination
    models = ['P_rdg_prds', 'P_svr_prds',
              'P_gbr_prds']  #, 'P_las_prds' , 'P_gbr_prds'
    name = 'P_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'P')
def pred_pH(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func=f_regression,
                                k=1200)  # intentionally unchanged
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen = []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only = []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # nearest neighbors
    #neigh = KNeighborsRegressor(n_neighbors=4)
    #neigh.fit(train.ix[:, chosen], train['pH'])
    #for dset in data:
    #dset['pH_ngh_prds'] = neigh.predict(dset.ix[:, chosen])
    # random forest
    forst = RandomForestRegressor(n_estimators=200)
    forst.fit(train.ix[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = forst.predict(dset.ix[:, chosen])
    # lasso
    #lass = Lasso(alpha=.000000275, positive=True)
    #lass.fit(train[all_vars], train['pH'])
    #for dset in data:
    #    dset['pH_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])
    svr = svm.SVR(C=11000, epsilon=.1)
    svr.fit(train.ix[:, all_vars], train['pH'])
    for dset in data:
        dset['pH_svr_prds'] = svr.predict(dset.ix[:, all_vars])
    # combination
    models = ['pH_rdg_prds', 'pH_svr_prds', 'pH_svr_prds', 'pH_for_prds']
    name = 'pH_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'pH')
def pred_Ca(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    Ca_lassoed_vars = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1400)
    univ_selector.fit(train[all_vars], train['Ca'])
    pvals = univ_selector.get_support()
    chosen = []
    for x in range(1, len(all_vars)):
        if Ca_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only = []
    for x in range(0, len(all_vars)):
        if Ca_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # random forest
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.ix[:, chosen], train['Ca'])
    #print forst.feature_importances_
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.ix[:, chosen])

    # lasso
    lass = Lasso(alpha=.0000001, positive=True)
    lass.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    Ca_ridge = RidgeCV(np.array([.5]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])
    # combination
    models = [
        'Ca_las_prds',
        'Ca_rdg_prds',
        'Ca_for_prds',
        'Ca_for_prds',
    ]
    name = 'Ca_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'Ca')
def pred_P(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    P_lassoed_vars = lass_varselect(train, all_vars, 'P', .00000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 1600)
    univ_selector.fit(train[all_vars], train['P'])
    pvals = univ_selector.get_support()
    chosen =  []
    for x in range(0, len(all_vars)):
        if P_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only =  []
    for x in range(0, len(all_vars)):
        if P_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    chosen.append('sand_prds' + str(object=loop))
    chosen.append('pH_prds' + str(object=loop))
    chosen.append('SOC_prds' + str(object=loop))
    chosen.append('Ca_prds' + str(object=loop))
    # SVM
    svr = svm.SVR(C=10000, epsilon=.1)
    svr.fit(train.ix[:, all_vars], train['P'])
    for dset in data:
        dset['P_svr_prds'] = svr.predict(dset.ix[:, all_vars])
  
    gbr = GradientBoostingRegressor(n_estimators = 60,
        learning_rate = 0.1, max_depth =5, random_state = 42, 
        verbose = 0, min_samples_leaf=4)
    gbr.fit(train.ix[:, chosen], train['P'])
    for dset in data:
        dset['P_gbr_prds'] = gbr.predict(dset.ix[:,chosen])
    # ridge
    P_ridge = RidgeCV(np.array([.55]), normalize=True)
    P_ridge.fit(train[all_vars], train['P'])
    for dset in data:
        dset['P_rdg_prds'] = P_ridge.predict(dset[all_vars])
    # combination
    models= [ 'P_rdg_prds', 
              'P_svr_prds', 'P_gbr_prds'] #, 'P_las_prds' , 'P_gbr_prds'
    name = 'P_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'P')
def test_ridgecv_store_cv_values():
    rng = np.random.RandomState(42)

    n_samples = 8
    n_features = 5
    x = rng.randn(n_samples, n_features)
    alphas = [1e-1, 1e0, 1e1]
    n_alphas = len(alphas)

    r = RidgeCV(alphas=alphas, cv=None, store_cv_values=True)

    # with len(y.shape) == 1
    y = rng.randn(n_samples)
    r.fit(x, y)
    assert r.cv_values_.shape == (n_samples, n_alphas)

    # with len(y.shape) == 2
    n_targets = 3
    y = rng.randn(n_samples, n_targets)
    r.fit(x, y)
    assert r.cv_values_.shape == (n_samples, n_targets, n_alphas)

    r = RidgeCV(cv=3, store_cv_values=True)
    assert_raises_regex(ValueError, 'cv!=None and store_cv_values',
                        r.fit, x, y)
def test_ridge_gcv_vs_ridge_loo_cv(
        gcv_mode, X_constructor, X_shape, y_shape,
        fit_intercept, normalize, noise):
    n_samples, n_features = X_shape
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(
        n_samples=n_samples, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise, n_informative=5
    )
    y = y.reshape(y_shape)

    alphas = [1e-3, .1, 1., 10., 1e3]
    loo_ridge = RidgeCV(cv=n_samples, fit_intercept=fit_intercept,
                        alphas=alphas, scoring='neg_mean_squared_error',
                        normalize=normalize)
    gcv_ridge = RidgeCV(gcv_mode=gcv_mode, fit_intercept=fit_intercept,
                        alphas=alphas, normalize=normalize)

    loo_ridge.fit(X, y)

    X_gcv = X_constructor(X)
    gcv_ridge.fit(X_gcv, y)

    assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_)
    assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3)
Example #27
class RidgeCVImpl():

    def __init__(self, alphas=[0.1, 1.0, 10.0], fit_intercept=True, normalize=False, scoring=None, cv=None, gcv_mode=None, store_cv_values=False):
        self._hyperparams = {
            'alphas': alphas,
            'fit_intercept': fit_intercept,
            'normalize': normalize,
            'scoring': scoring,
            'cv': cv,
            'gcv_mode': gcv_mode,
            'store_cv_values': store_cv_values}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)
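A short usage sketch for the RidgeCVImpl wrapper above, assuming SKLModel is bound to sklearn.linear_model.RidgeCV (as the hyperparameter names suggest); the data and variable names below are illustrative only.

import numpy as np
from sklearn.linear_model import RidgeCV as SKLModel  # assumed binding for SKLModel

rng = np.random.RandomState(0)
X = rng.randn(20, 3)
y = X @ np.array([1.0, -2.0, 0.5])

model = RidgeCVImpl(alphas=[0.1, 1.0, 10.0], store_cv_values=True)
model.fit(X, y)                 # delegates to the wrapped RidgeCV
predictions = model.predict(X)  # predictions from the fitted sklearn model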
Example #28
def test_solver_consistency(
        solver, proportion_nonzero, n_samples, dtype, sparse_X, seed):
    alpha = 1.
    noise = 50. if proportion_nonzero > .9 else 500.
    X, y = _make_sparse_offset_regression(
        bias=10, n_features=30, proportion_nonzero=proportion_nonzero,
        noise=noise, random_state=seed, n_samples=n_samples)
    svd_ridge = Ridge(
        solver='svd', normalize=True, alpha=alpha).fit(X, y)
    X = X.astype(dtype, copy=False)
    y = y.astype(dtype, copy=False)
    if sparse_X:
        X = sp.csr_matrix(X)
    if solver == 'ridgecv':
        ridge = RidgeCV(alphas=[alpha], normalize=True)
    else:
        ridge = Ridge(solver=solver, tol=1e-10, normalize=True, alpha=alpha)
    ridge.fit(X, y)
    assert_allclose(
        ridge.coef_, svd_ridge.coef_, atol=1e-3, rtol=1e-3)
    assert_allclose(
        ridge.intercept_, svd_ridge.intercept_, atol=1e-3, rtol=1e-3)
Example #29
def test_ridgecv_store_cv_values():
    # Test _RidgeCV's store_cv_values attribute.
    rng = np.random.RandomState(42)

    n_samples = 8
    n_features = 5
    x = rng.randn(n_samples, n_features)
    alphas = [1e-1, 1e0, 1e1]
    n_alphas = len(alphas)

    r = RidgeCV(alphas=alphas, store_cv_values=True)

    # with len(y.shape) == 1
    y = rng.randn(n_samples)
    r.fit(x, y)
    assert_equal(r.cv_values_.shape, (n_samples, n_alphas))

    # with len(y.shape) == 2
    n_responses = 3
    y = rng.randn(n_samples, n_responses)
    r.fit(x, y)
    assert_equal(r.cv_values_.shape, (n_samples, n_responses, n_alphas))
Example #30
def test_ridgecv_sample_weight():
    rng = np.random.RandomState(0)
    alphas = (0.1, 1.0, 10.0)

    # There are different algorithms for n_samples > n_features
    # and the opposite, so test them both.
    for n_samples, n_features in ((6, 5), (5, 10)):
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)

        cv = KFold(5)
        ridgecv = RidgeCV(alphas=alphas, cv=cv)
        ridgecv.fit(X, y, sample_weight=sample_weight)

        # Check using GridSearchCV directly
        parameters = {'alpha': alphas}
        gs = GridSearchCV(Ridge(), parameters, cv=cv)
        gs.fit(X, y, sample_weight=sample_weight)

        assert_equal(ridgecv.alpha_, gs.best_estimator_.alpha)
        assert_array_almost_equal(ridgecv.coef_, gs.best_estimator_.coef_)
def pred_Ca(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    Ca_lassoed_vars = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 1400)
    univ_selector.fit(train[all_vars], train['Ca'])
    pvals = univ_selector.get_support()
    chosen =  []
    for x in range(1, len(all_vars)):
        if Ca_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only =  []
    for x in range(0, len(all_vars)):
        if Ca_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # random forest
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.ix[:, chosen], train['Ca'])
    #print forst.feature_importances_
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.ix[:, chosen])
        
    # lasso
    lass = Lasso(alpha=.0000001, positive=True)
    lass.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    Ca_ridge = RidgeCV(np.array([.5]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])
    # combination
    models= ['Ca_las_prds', 'Ca_rdg_prds', 
             'Ca_for_prds', 'Ca_for_prds',  ] 
    name = 'Ca_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'Ca')
Example #32
def _test_ridge_loo(filter_):
    # test that it can work with both dense and sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    fit_intercept = filter_ == DENSE_FILTER
    ridge_gcv = _RidgeGCV(fit_intercept=fit_intercept)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret.append(alpha_)

    # check that we get same best alpha with custom loss_func
    f = ignore_warnings
    scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv2.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with custom score_func
    func = lambda x, y: -mean_squared_error(x, y)
    scoring = make_scorer(func)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv3.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with a scorer
    scorer = get_scorer('neg_mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv4.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with sample weights
    if filter_ == DENSE_FILTER:
        ridge_gcv.fit(filter_(X_diabetes),
                      y_diabetes,
                      sample_weight=np.ones(n_samples))
        assert ridge_gcv.alpha_ == pytest.approx(alpha_)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_allclose(np.vstack((y_pred, y_pred)).T, Y_pred, rtol=1e-5)

    return ret
Example #33
 def __init__(self,
              alphas=[0.1, 1.0, 10.0],
              fit_intercept=True,
              normalize=False,
              scoring=None,
              cv=None,
              gcv_mode=None,
              store_cv_values=False):
     self._hyperparams = {
         'alphas': alphas,
         'fit_intercept': fit_intercept,
         'normalize': normalize,
         'scoring': scoring,
         'cv': cv,
         'gcv_mode': gcv_mode,
         'store_cv_values': store_cv_values
     }
     self._wrapped_model = SKLModel(**self._hyperparams)
def test_ridge_gcv_sample_weights(
        gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(
        n_samples=11, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise)
    y = y.reshape(y_shape)

    sample_weight = 3 * rng.randn(len(X))
    sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
    indices = np.repeat(np.arange(X.shape[0]), sample_weight)
    sample_weight = sample_weight.astype(float)
    X_tiled, y_tiled = X[indices], y[indices]

    cv = GroupKFold(n_splits=X.shape[0])
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    kfold = RidgeCV(
        alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
        fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
    kfold_errors = (y_tiled - predictions)**2
    kfold_errors = [
        np.sum(kfold_errors[indices == i], axis=0) for
        i in np.arange(X.shape[0])]
    kfold_errors = np.asarray(kfold_errors)

    X_gcv = X_constructor(X)
    gcv_ridge = RidgeCV(
        alphas=alphas, store_cv_values=True,
        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
    if len(y_shape) == 2:
        gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
    else:
        gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
Example #35
    def connectWidgets(self):
        self.Ridge.setVisible(False)
        ridgecv = RidgeCV()

        self.alphasLineEdit_cv.setText(str(ridgecv.alphas))
        self.fitInterceptCheckBox_cv.setChecked(ridgecv.fit_intercept)
        self.normalizeCheckBox_cv.setChecked(ridgecv.normalize)
        self.defaultComboItem(self.scoringComboBox_cv, ridgecv.scoring)
        self.defaultComboItem(self.gCVModeComboBox_cv, ridgecv.gcv_mode)
        self.storeCVValuesCheckBox_cv.setChecked(ridgecv.store_cv_values)

        ridge = Ridge()

        self.alphaDoubleSpinBox.setValue(ridge.alpha)
        self.fitInterceptCheckBox.setChecked(ridge.fit_intercept)
        self.normalizeCheckBox.setChecked(ridge.normalize)
        self.copyXCheckBox.setChecked(ridge.copy_X)
        self.defaultComboItem(self.solverComboBox, ridge.solver)
        self.toleranceDoubleSpinBox.setValue(ridge.tol)
        self.randomStateLineEdit.setText(str(ridge.random_state))
Example #36
def bag_of_words_ridge(variable):
    vectorizer = TfidfVectorizer(
        min_df=.1, max_df=.9
    )  # use a vectorizer to count word usage instances and create a sparse matrix
    bag_of_words_X = vectorizer.fit(
        train_and_validation[variable][pd.to_datetime(
            train_and_validation.date_posted) > pd.to_datetime('2013-11-1')])
    # normalization of vectorizer is fit using train only
    bag_of_words_X = vectorizer.transform(train_and_validation[variable])
    test_bag_of_words = vectorizer.transform(test[variable])
    ridge = RidgeCV(array([18]), store_cv_values=True, normalize=True)
    # using a date range to guarantee recency and limit run time
    ridge.fit(
        bag_of_words_X[pd.to_datetime(train_and_validation.date_posted) >
                       pd.to_datetime('2013-11-8')],
        train_and_validation.is_exciting[pd.to_datetime(
            train_and_validation.date_posted) > pd.to_datetime('2013-11-8')])
    var_nm = "b_of_wds_prds_" + variable
    # put predictions into samples for use later as base classifiers in ada boost
    train_and_validation[var_nm] = ridge.predict(bag_of_words_X)
    test[var_nm] = ridge.predict(test_bag_of_words)
Example #37
def test_ridgecv_store_cv_values():
    rng = np.random.RandomState(42)

    n_samples = 8
    n_features = 5
    x = rng.randn(n_samples, n_features)
    alphas = [1e-1, 1e0, 1e1]
    n_alphas = len(alphas)

    r = RidgeCV(alphas=alphas, store_cv_values=True)

    # with len(y.shape) == 1
    y = rng.randn(n_samples)
    r.fit(x, y)
    assert r.cv_values_.shape == (n_samples, n_alphas)

    # with len(y.shape) == 2
    n_targets = 3
    y = rng.randn(n_samples, n_targets)
    r.fit(x, y)
    assert r.cv_values_.shape == (n_samples, n_targets, n_alphas)
Example #38
def test_ridgecv_store_cv_values():
    # Test _RidgeCV's store_cv_values attribute.
    rng = np.random.RandomState(42)

    n_samples = 8
    n_features = 5
    x = rng.randn(n_samples, n_features)
    alphas = [1e-1, 1e0, 1e1]
    n_alphas = len(alphas)

    r = RidgeCV(alphas=alphas, store_cv_values=True)

    # with len(y.shape) == 1
    y = rng.randn(n_samples)
    r.fit(x, y)
    assert_equal(r.cv_values_.shape, (n_samples, n_alphas))

    # with len(y.shape) == 2
    n_responses = 3
    y = rng.randn(n_samples, n_responses)
    r.fit(x, y)
    assert_equal(r.cv_values_.shape, (n_samples, n_responses, n_alphas))
Example #39
def _test_ridge_loo(filter_):
    # test that it can work with both dense and sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    ridge_gcv = _RidgeGCV(fit_intercept=False)
    ridge = Ridge(alpha=1.0, fit_intercept=False)

    # generalized cross-validation (efficient leave-one-out)
    decomp = ridge_gcv._pre_compute(X_diabetes, y_diabetes)
    errors, c = ridge_gcv._errors(1.0, y_diabetes, *decomp)
    values, c = ridge_gcv._values(1.0, y_diabetes, *decomp)

    # brute-force leave-one-out: remove one example at a time
    errors2 = []
    values2 = []
    for i in range(n_samples):
        sel = np.arange(n_samples) != i
        X_new = X_diabetes[sel]
        y_new = y_diabetes[sel]
        ridge.fit(X_new, y_new)
        value = ridge.predict([X_diabetes[i]])[0]
        error = (y_diabetes[i] - value)**2
        errors2.append(error)
        values2.append(value)

    # check that efficient and brute-force LOO give same results
    assert_almost_equal(errors, errors2)
    assert_almost_equal(values, values2)

    # generalized cross-validation (efficient leave-one-out,
    # SVD variation)
    decomp = ridge_gcv._pre_compute_svd(X_diabetes, y_diabetes)
    errors3, c = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp)
    values3, c = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp)

    # check that efficient and SVD efficient LOO give same results
    assert_almost_equal(errors, errors3)
    assert_almost_equal(values, values3)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret.append(alpha_)

    # check that we get same best alpha with custom loss_func
    f = ignore_warnings
    scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv2.alpha_, alpha_)

    # check that we get same best alpha with custom score_func
    func = lambda x, y: -mean_squared_error(x, y)
    scoring = make_scorer(func)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv3.alpha_, alpha_)

    # check that we get same best alpha with a scorer
    scorer = get_scorer('mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv4.alpha_, alpha_)

    # check that we get same best alpha with sample weights
    ridge_gcv.fit(filter_(X_diabetes),
                  y_diabetes,
                  sample_weight=np.ones(n_samples))
    assert_equal(ridge_gcv.alpha_, alpha_)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=5)

    return ret
Example #40
from .._utils import CacheMixin
from .._utils.cache_mixin import _check_memory
from .._utils.param_validation import (_adjust_screening_percentile,
                                       check_feature_screening)
from ..input_data.masker_validation import check_embedded_nifti_masker

SUPPORTED_ESTIMATORS = dict(
    svc_l1=LinearSVC(penalty='l1', dual=False, max_iter=1e4),
    svc_l2=LinearSVC(penalty='l2', max_iter=1e4),
    svc=LinearSVC(penalty='l2', max_iter=1e4),
    logistic_l1=LogisticRegression(penalty='l1', solver='liblinear'),
    logistic_l2=LogisticRegression(penalty='l2', solver='liblinear'),
    logistic=LogisticRegression(penalty='l2', solver='liblinear'),
    ridge_classifier=RidgeClassifierCV(),
    ridge_regressor=RidgeCV(),
    ridge=RidgeCV(),
    svr=SVR(kernel='linear', max_iter=1e4),
)


def _check_param_grid(estimator, X, y, param_grid=None):
    """Check param_grid and return sensible default if param_grid is None.

    Parameters
    -----------
    estimator: str, optional
        The estimator to choose among: 'svc', 'svc_l2', 'svc_l1', 'logistic',
        'logistic_l1', 'logistic_l2', 'ridge', 'ridge_classifier',
        'ridge_regressor', and 'svr'. Note that the 'svc' and 'svc_l2';
        'logistic' and 'logistic_l2'; 'ridge' and 'ridge_regressor'
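An illustrative sketch (not part of the original module) of how the SUPPORTED_ESTIMATORS mapping above is typically consumed: the estimator name keys into the dict, and a fresh clone of the returned scikit-learn estimator is fitted; the toy data is made up for the example.

from sklearn.base import clone
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=30, n_features=5, random_state=0)
# 'ridge' and 'ridge_regressor' both map to a RidgeCV instance in the dict above
estimator = clone(SUPPORTED_ESTIMATORS['ridge'])
estimator.fit(X, y)
print(estimator.alpha_)  # alpha chosen by RidgeCV's internal cross-validation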
def ensemble_ridge(penalty):
    
    ridge= RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    ridge.fit(data_for_ensemble, train.is_exciting)
    predictions = ridge.predict(data_for_ensemble)
    return np.sqrt(np.mean((train.is_exciting-predictions)**2))
                        lassoed_geo, lassoed_crime,
                        dum_dict['dummy_dummies'], dum_dict['var8_dummies'],
                        dum_dict['var1_dummies'], dum_dict['var2_dummies'],
                        dum_dict['var3_dummies'], dum_dict['var4_dummies'],
                        dum_dict['var5_dummies'], dum_dict['var6_dummies'],
                        dum_dict['var9_dummies'])
 for var_type in var_types_for_svc:
     for cols in var_type:
         svc_feats.append(cols)
         
 ##################### Run classifiers ############################
 weights = np.array([fire_train_TRAIN_smp['var11']]).squeeze()
 
 # Ridge
 ## using weights crashes this, don't know why       
 ridge = RidgeCV(np.array([1.5]), store_cv_values=True, normalize=True)
 ridge.fit(fire_train_TRAIN_smp[ridge_feats], fire_train_TRAIN_smp.target)
 write_preds_allsamps(ridge, "fin_rdg_preds", ridge_feats)
 
 # claim size conditional on claim ridge
 ones_only = fire_train_TRAIN_smp['target']>0
 size_train = fire_train_TRAIN_smp.ix[ones_only, :]
 size_ridge = RidgeCV(np.array([1.5]), store_cv_values=True, normalize=True)
 size_ridge.fit(size_train[ridge_feats], size_train.target)
 write_preds_allsamps(size_ridge, "size_rdg_preds", ridge_feats)
 
 # Lasso
 lass = Lasso(alpha=.0000001, positive=True, max_iter=100000 ,
              tol=.001, normalize=True)
 lass.fit(np.array(fire_train_TRAIN_smp[ridge_feats]),
          np.array(fire_train_TRAIN_smp.target))
def pc_ridge(penalty): 
    # this function takes a complexity penalty as an input and outputs the RMSE
    ridge = RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    ridge.fit(train_tokens[0:documents], train.is_exciting[0:documents])
    predictions = ridge.predict(train_tokens)
    return np.sqrt(np.mean((train.is_exciting-predictions)**2))
Example #44
def _test_ridge_cv(filter_):
    n_samples = X_diabetes.shape[0]

    ridge_cv = RidgeCV()
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)
    ridge_cv.predict(filter_(X_diabetes))

    assert_equal(len(ridge_cv.coef_.shape), 1)
    assert_equal(type(ridge_cv.intercept_), np.float64)

    cv = KFold(n_samples, 5)
    ridge_cv.set_params(cv=cv)
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)
    ridge_cv.predict(filter_(X_diabetes))

    assert_equal(len(ridge_cv.coef_.shape), 1)
    assert_equal(type(ridge_cv.intercept_), np.float64)
Example #45
def _test_ridge_cv(filter_):
    n_samples = X_diabetes.shape[0]

    ridge_cv = RidgeCV()
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)
    ridge_cv.predict(filter_(X_diabetes))

    assert_equal(len(ridge_cv.coef_.shape), 1)
    assert_equal(type(ridge_cv.intercept_), np.float64)

    cv = KFold(n_samples, 5)
    ridge_cv.set_params(cv=cv)
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)
    ridge_cv.predict(filter_(X_diabetes))

    assert_equal(len(ridge_cv.coef_.shape), 1)
    assert_equal(type(ridge_cv.intercept_), np.float64)
print "It took {time} minutes to run forests".format(time=(time.time()-t0)/60)
forest_features = len(train_features.columns)

#run logistic
logit = LogisticRegression()
logit.fit(train_features, train_outcome)
logit_feats = len(train_features.columns)

validation['predictions']=logit.predict_proba(validation_for_p)[:,1]
fpr, tpr, thresholds = roc_curve(validation.is_exciting, validation.predictions)
auc_score = auc(fpr,tpr)
auc_score 

# run ridge

full_ridge = RidgeCV(np.array([7]), store_cv_values=True, normalize=True)
# using a date range to guarantee recency and limit run time
full_ridge.fit(train_features, train_outcome)
validation['predictions']=logit.predict_proba(validation_for_p)[:,1]
fpr, tpr, thresholds = roc_curve(validation.is_exciting, validation.predictions)
auc_score = auc(fpr,tpr)
auc_score 
  
    
# add predictions to train features
ens_train_features['Adaboost'] = pd.DataFrame(clf.predict_proba(train_features.iloc[:,0:30])[:,1])
validation_for_p['Adaboost'] = pd.DataFrame(clf.predict_proba(validation_for_p.iloc[:,0:30])[:,1])
test_X['Adaboost'] = pd.DataFrame(clf.predict_proba(test_X.iloc[:,0:30])[:,1])

ens_train_features['Forest'] = rndm_forest_clf.predict_proba(train_features.iloc[:,0:forest_features])[:,1]
validation_for_p['Forest'] = rndm_forest_clf.predict_proba(validation_for_p.iloc[:,0:forest_features])[:,1]
Example #47
def test_ridge_cv_sparse_svd():
    X = sp.csr_matrix(X_diabetes)
    ridge = RidgeCV(gcv_mode="svd")
    assert_raises(TypeError, ridge.fit, X)
Example #48
# init_guess initializes the optimization with a guess of the optimal penalty

t0 = time.time()
optimizer = minimize(pc_ridge,
                     init_guess,
                     method='nelder-mead',
                     options={
                         'xtol': 1e-2,
                         'disp': True
                     })
print "It took {time} minutes to optimize".format(time=(time.time() - t0) / 60)

# run ridge with optimal penalization

t0 = time.time()
ridge = RidgeCV(alphas=optimizer.x, store_cv_values=True, normalize=True)
# optimizer.x is the ridge penalty that minimized rmse
ridge.fit(train_tokens[0:documents], train.is_exciting[0:documents])
print "It took {time} minutes to run the optimized ridge".format(
    time=(time.time() - t0) / 60)

# create an OLS regression for word count
ols = sm.regression.linear_model.OLS(train.is_exciting, train.word_count)
results = ols.fit()

# add ols and ridge predictions to train and test data

train['ridge_predictions'] = ridge.predict(train_tokens)
train['length_predictions'] = train.word_count * results.params[0]
test['ridge_predictions'] = ridge.predict(test_tokens)
test['length_predictions'] = test.word_count * results.params[0]
Example #49
def _test_ridge_loo(filter_):
    # test that it can work with both dense and sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    ridge_gcv = _RidgeGCV(fit_intercept=False)
    ridge = Ridge(alpha=1.0, fit_intercept=False)

    # generalized cross-validation (efficient leave-one-out)
    decomp = ridge_gcv._pre_compute(X_diabetes, y_diabetes)
    errors, c = ridge_gcv._errors(1.0, y_diabetes, *decomp)
    values, c = ridge_gcv._values(1.0, y_diabetes, *decomp)

    # brute-force leave-one-out: remove one example at a time
    errors2 = []
    values2 = []
    for i in range(n_samples):
        sel = np.arange(n_samples) != i
        X_new = X_diabetes[sel]
        y_new = y_diabetes[sel]
        ridge.fit(X_new, y_new)
        value = ridge.predict([X_diabetes[i]])[0]
        error = (y_diabetes[i] - value) ** 2
        errors2.append(error)
        values2.append(value)

    # check that efficient and brute-force LOO give same results
    assert_almost_equal(errors, errors2)
    assert_almost_equal(values, values2)

    # generalized cross-validation (efficient leave-one-out,
    # SVD variation)
    decomp = ridge_gcv._pre_compute_svd(X_diabetes, y_diabetes)
    errors3, c = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp)
    values3, c = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp)

    # check that efficient and SVD efficient LOO give same results
    assert_almost_equal(errors, errors3)
    assert_almost_equal(values, values3)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret.append(alpha_)

    # check that we get same best alpha with custom loss_func
    f = ignore_warnings
    scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv2.alpha_, alpha_)

    # check that we get same best alpha with custom score_func
    func = lambda x, y: -mean_squared_error(x, y)
    scoring = make_scorer(func)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv3.alpha_, alpha_)

    # check that we get same best alpha with a scorer
    scorer = get_scorer('mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv4.alpha_, alpha_)

    # check that we get same best alpha with sample weights
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                  sample_weight=np.ones(n_samples))
    assert_equal(ridge_gcv.alpha_, alpha_)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T,
                              Y_pred, decimal=5)

    return ret
Example #50
def pc_ridge(penalty):
    # this function takes a complexity penalty as an input and outputs the RMSE
    ridge = RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    ridge.fit(train_tokens[0:documents], train.is_exciting[0:documents])
    predictions = ridge.predict(train_tokens)
    return np.sqrt(np.mean((train.is_exciting - predictions)**2))
   
   
# we run an optimizer to find the penalty that minimizes the RMSE of the ridge
init_guess = array([35])  
# init_guess initializes the optimization with a guess of the optimal penalty
   
t0= time.time()
optimizer = minimize(pc_ridge, init_guess, method='nelder-mead', options= {'xtol':1e-2, 'disp':True})
print "It took {time} minutes to optimize".format(time=(time.time()-t0)/60)

# run ridge with optimal penalization

t0= time.time()
ridge= RidgeCV(alphas=optimizer.x, store_cv_values=True, normalize=True)
# optimizer.x is the ridge penalty that minimized rmse
ridge.fit(train_tokens[0:documents], train.is_exciting[0:documents])
print "It took {time} minutes to run the optimized ridge".format(time=(time.time()-t0)/60)

# create an OLS regression for word count
ols= sm.regression.linear_model.OLS(train.is_exciting, train.word_count)
results= ols.fit()

# add ols and ridge predictions to train and test data 

train['ridge_predictions']=ridge.predict(train_tokens) 
train['length_predictions'] = train.word_count*results.params[0]
test['ridge_predictions']=ridge.predict(test_tokens) 
test['length_predictions'] = test.word_count*results.params[0]
Example #52
def ensemble_ridge(penalty):

    ridge = RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    ridge.fit(data_for_ensemble, train.is_exciting)
    predictions = ridge.predict(data_for_ensemble)
    return np.sqrt(np.mean((train.is_exciting - predictions)**2))
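Earlier in the listing, pc_ridge is tuned with a Nelder-Mead search over the penalty; the same pattern can be applied to ensemble_ridge. This is a sketch under the snippet's own assumptions (data_for_ensemble and train.is_exciting already exist, and minimize comes from scipy.optimize); the starting guess is illustrative.

from numpy import array
from scipy.optimize import minimize

init_guess = array([35.0])  # illustrative starting penalty, as in the pc_ridge search
optimizer = minimize(ensemble_ridge, init_guess, method='nelder-mead',
                     options={'xtol': 1e-2, 'disp': True})

# refit a ridge with the penalty that minimized the ensemble RMSE
ensemble_ridge_model = RidgeCV(alphas=optimizer.x, store_cv_values=True, normalize=True)
ensemble_ridge_model.fit(data_for_ensemble, train.is_exciting)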