def pred_pH(train, val, test, all_vars, loop):
    """Fit the pH base models and pass their predictions to ``write_preds``.

    A random forest (on a selected subset of ``all_vars``), a lasso and a
    ridge regression (both on every column in ``all_vars``) are fitted on
    ``train`` against the ``'pH'`` column.  Each model's predictions are
    written as a new column on ``train``, ``val`` and ``test`` in place.

    Parameters
    ----------
    train, val, test
        Frames supporting ``frame[cols]`` selection and column assignment
        (presumably pandas DataFrames -- confirm against the caller).
    all_vars
        Sequence of feature column labels.
    loop
        Value appended (via ``str``) to ``'pH_prds'`` to name the output.
    """
    data = (val, test, train)

    # Variable selection: keep a column when either the lasso-based selector
    # or the univariate F-test selector flags it.
    lassoed = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['pH'])
    support = univ_selector.get_support()
    chosen = [var for i, var in enumerate(all_vars)
              if lassoed[i] | support[i]]

    # Random forest on the selected columns.  Plain label selection replaces
    # the removed ``DataFrame.ix`` indexer.
    forest = RandomForestRegressor(n_estimators=100)
    forest.fit(train[chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = forest.predict(dset[chosen])

    # Lasso on all variables.
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])

    # Ridge on all variables (single fixed alpha).
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])

    # Combination.  'pH_for_prds' is listed twice in the original;
    # presumably write_preds weights a model by how often it appears --
    # TODO confirm against write_preds.
    models = ['pH_rdg_prds', 'pH_las_prds', 'pH_for_prds', 'pH_for_prds']
    name = 'pH_prds' + str(loop)
    write_preds(models, name, train, val, test, 'pH')
def test_ridgecv_store_cv_values():
    """Check the shape of ``cv_values_`` and the cv/store_cv_values clash."""
    n_samples, n_features, n_targets = 8, 5, 3
    alphas = [1e-1, 1e0, 1e1]
    n_alphas = len(alphas)

    rng = np.random.RandomState(42)
    x = rng.randn(n_samples, n_features)

    r = RidgeCV(alphas=alphas, cv=None, store_cv_values=True)

    # 1-D target
    y = rng.randn(n_samples)
    r.fit(x, y)
    expected_1d = (n_samples, n_alphas)
    assert r.cv_values_.shape == expected_1d

    # 2-D target
    y = rng.randn(n_samples, n_targets)
    r.fit(x, y)
    expected_2d = (n_samples, n_targets, n_alphas)
    assert r.cv_values_.shape == expected_2d

    # An explicit cv together with store_cv_values=True must be rejected.
    r = RidgeCV(cv=3, store_cv_values=True)
    assert_raises_regex(ValueError, 'cv!=None and store_cv_values',
                        r.fit, x, y)
def test_ridge_gcv_vs_ridge_loo_cv(
        gcv_mode, X_constructor, X_shape, y_shape,
        fit_intercept, normalize, noise):
    """Compare RidgeCV in GCV mode against RidgeCV with ``cv=n_samples``.

    Both estimators must pick the same alpha and produce matching
    coefficients and intercepts (relative tolerance 1e-3).
    """
    n_samples, n_features = X_shape
    if len(y_shape) == 2:
        n_targets = y_shape[-1]
    else:
        n_targets = 1

    X, y = _make_sparse_offset_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_targets=n_targets,
        random_state=0,
        shuffle=False,
        noise=noise,
        n_informative=5,
    )
    y = y.reshape(y_shape)

    alphas = [1e-3, .1, 1., 10., 1e3]
    shared = dict(fit_intercept=fit_intercept, alphas=alphas,
                  normalize=normalize)
    loo_ridge = RidgeCV(cv=n_samples, scoring='neg_mean_squared_error',
                        **shared)
    gcv_ridge = RidgeCV(gcv_mode=gcv_mode, **shared)

    loo_ridge.fit(X, y)

    # Only the GCV estimator sees the converted container.
    X_gcv = X_constructor(X)
    gcv_ridge.fit(X_gcv, y)

    assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_)
    for attr in ('coef_', 'intercept_'):
        assert_allclose(getattr(gcv_ridge, attr), getattr(loo_ridge, attr),
                        rtol=1e-3)
def test_check_gcv_mode_error(mode):
    """An unknown ``gcv_mode`` raises ValueError from fit and the checker."""
    expected_message = "Unknown value for 'gcv_mode'"
    X, y = make_regression(n_samples=5, n_features=2)
    gcv = RidgeCV(gcv_mode=mode)

    # Through the estimator ...
    with pytest.raises(ValueError, match=expected_message):
        gcv.fit(X, y)

    # ... and through the validation helper directly.
    with pytest.raises(ValueError, match=expected_message):
        _check_gcv_mode(X, mode)
def _test_ridge_cv_normalize(filter_):
    """RidgeCV(normalize=True) must select the same alpha as a grid search.

    ``filter_`` is applied to the scaled diabetes features before each fit.
    """
    ridge_cv = RidgeCV(normalize=True, cv=3)
    ridge_cv.fit(filter_(10. * X_diabetes), y_diabetes)

    # Grid-search the same alpha candidates with a plain Ridge.
    param_grid = {'alpha': ridge_cv.alphas}
    gs = GridSearchCV(Ridge(normalize=True), cv=3, param_grid=param_grid)
    gs.fit(filter_(10. * X_diabetes), y_diabetes)

    best_alpha = gs.best_estimator_.alpha
    assert_equal(best_alpha, ridge_cv.alpha_)
def test_ridgecv_int_alphas():
    """Fitting RidgeCV with integer alphas must not raise."""
    X = np.array([
        [-1.0, -1.0],
        [-1.0, 0],
        [-.8, -1.0],
        [1.0, 1.0],
        [1.0, 0.0],
    ])
    y = [1, 1, 1, -1, -1]

    # Integers
    int_alphas = (1, 10, 100)
    ridge = RidgeCV(alphas=int_alphas)
    ridge.fit(X, y)
def pred_sand(train, val, test, all_vars, loop):
    """Fit the Sand base models and pass their predictions to ``write_preds``.

    A random forest (on a selected subset of ``all_vars``), an SVR and a
    ridge regression (both on every column in ``all_vars``) are fitted on
    ``train`` against the ``'Sand'`` column.  Each model's predictions are
    written as a new column on ``train``, ``val`` and ``test`` in place.

    Parameters
    ----------
    train, val, test
        Frames supporting ``frame[cols]`` selection and column assignment
        (presumably pandas DataFrames -- confirm against the caller).
    all_vars
        Sequence of feature column labels.
    loop
        Value appended (via ``str``) to ``'sand_prds'`` to name the output.
    """
    data = (val, test, train)

    # Variable selection: keep a column when either the lasso-based selector
    # or the univariate F-test selector flags it.
    lassoed = lass_varselect(train, all_vars, 'Sand', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['Sand'])
    support = univ_selector.get_support()
    chosen = [var for i, var in enumerate(all_vars)
              if lassoed[i] | support[i]]

    # Random forest on the selected columns.  Plain label selection replaces
    # the removed ``DataFrame.ix`` indexer.
    forst = RandomForestRegressor(n_estimators=200)
    forst.fit(train[chosen], train['Sand'])
    for dset in data:
        dset['sand_for_prds'] = forst.predict(dset[chosen])

    # SVM on all variables.
    svr = svm.SVR(C=23000)
    svr.fit(train[all_vars], train['Sand'])
    for dset in data:
        dset['sand_svr_prds'] = svr.predict(dset[all_vars])

    # Ridge on all variables (single fixed alpha).
    sand_ridge = RidgeCV(np.array([1.135]), normalize=True)
    sand_ridge.fit(train[all_vars], train['Sand'])
    for dset in data:
        dset['sand_rdg_prds'] = sand_ridge.predict(dset[all_vars])

    # Combination.  'sand_svr_prds' is listed twice in the original;
    # presumably write_preds weights a model by how often it appears --
    # TODO confirm against write_preds.
    models = ['sand_rdg_prds', 'sand_svr_prds', 'sand_for_prds',
              'sand_svr_prds']
    name = 'sand_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Sand')
def pred_SOC(train, val, test, all_vars, loop):
    """Fit the SOC base models and pass their predictions to ``write_preds``.

    A random forest and an SVR (on the wider selected subset), a gradient
    boosting regressor (on the narrower selected subset) and a ridge
    regression (on every column in ``all_vars``) are fitted on ``train``
    against the ``'SOC'`` column.  Each model's predictions are written as a
    new column on ``train``, ``val`` and ``test`` in place.

    Parameters
    ----------
    train, val, test
        Frames supporting ``frame[cols]`` selection and column assignment
        (presumably pandas DataFrames -- confirm against the caller).
    all_vars
        Sequence of feature column labels.
    loop
        Value appended (via ``str``) to ``'SOC_prds'`` to name the output.
    """
    data = (val, test, train)

    # Variable selection: the lasso-based selector is combined with two
    # univariate F-test selectors of different widths (k=4500 and k=200).
    lassoed = lass_varselect(train, all_vars, 'SOC', .000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=4500)
    univ_selector.fit(train[all_vars], train['SOC'])
    univ_selector2 = SelectKBest(score_func=f_regression, k=200)
    univ_selector2.fit(train[all_vars], train['SOC'])
    support = univ_selector.get_support()
    support2 = univ_selector2.get_support()
    chosen = [var for i, var in enumerate(all_vars)
              if lassoed[i] | support[i]]
    chosen2 = [var for i, var in enumerate(all_vars)
               if lassoed[i] | support2[i]]

    # Random forest on the wide subset.  Plain label selection replaces the
    # removed ``DataFrame.ix`` indexer.
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train[chosen], train['SOC'])
    for dset in data:
        dset['SOC_for_prds'] = forst.predict(dset[chosen])

    # Gradient boosting on the narrow subset.
    gbr = GradientBoostingRegressor(n_estimators=900, learning_rate=.0785,
                                    max_depth=1, random_state=42, verbose=0,
                                    min_samples_leaf=4, subsample=.4)
    gbr.fit(train[chosen2], train['SOC'])
    for dset in data:
        dset['SOC_gbr_prds'] = gbr.predict(dset[chosen2])

    # Ridge on all variables (single fixed alpha).
    SOC_ridge = RidgeCV(np.array([.315]), normalize=True)
    SOC_ridge.fit(train[all_vars], train['SOC'])
    for dset in data:
        dset['SOC_rdg_prds'] = SOC_ridge.predict(dset[all_vars])

    # SVR on the wide subset.
    svr = svm.SVR(C=9000, epsilon=.1)
    svr.fit(train[chosen], train['SOC'])
    for dset in data:
        dset['SOC_svr_prds'] = svr.predict(dset[chosen])

    # Combination.  'SOC_svr_prds' is listed twice in the original;
    # presumably write_preds weights a model by how often it appears --
    # TODO confirm against write_preds.
    models = ['SOC_rdg_prds', 'SOC_svr_prds', 'SOC_gbr_prds',
              'SOC_for_prds', 'SOC_svr_prds']
    name = 'SOC_prds' + str(loop)
    write_preds(models, name, train, val, test, 'SOC')
def _test_ridge_loo(filter_):
    """Check that RidgeCV variants agree with ``_RidgeGCV`` on the best alpha.

    ``filter_`` converts the diabetes features (dense or sparse) before each
    fit.  Returns a one-element list holding the alpha picked by
    ``_RidgeGCV``.
    """
    n_samples = X_diabetes.shape[0]

    # The intercept is only fitted for the dense filter.
    ridge_gcv = _RidgeGCV(fit_intercept=(filter_ == DENSE_FILTER))

    # Reference alpha.
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_

    # Same best alpha with a custom loss function.
    loss_scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=loss_scoring)
    ignore_warnings(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv2.alpha_ == pytest.approx(alpha_)

    # Same best alpha with a custom score function.
    def neg_mse(x, y):
        return -mean_squared_error(x, y)

    score_scoring = make_scorer(neg_mse)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=score_scoring)
    ignore_warnings(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv3.alpha_ == pytest.approx(alpha_)

    # Same best alpha with a named scorer.
    scorer = get_scorer('neg_mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv4.alpha_ == pytest.approx(alpha_)

    # Same best alpha with unit sample weights (dense filter only).
    if filter_ == DENSE_FILTER:
        unit_weights = np.ones(n_samples)
        ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                      sample_weight=unit_weights)
        assert ridge_gcv.alpha_ == pytest.approx(alpha_)

    # Several responses: duplicating the target column must duplicate the
    # predictions.
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    stacked = np.vstack((y_pred, y_pred)).T
    assert_allclose(stacked, Y_pred, rtol=1e-5)

    return [alpha_]
def bag_of_words_ridge(variable):
    """Add ridge predictions built from a TF-IDF bag of words of ``variable``.

    Reads the module-level ``train_and_validation`` and ``test`` frames and
    writes a new column named ``"b_of_wds_prds_" + variable`` to both of
    them; nothing is returned.

    Parameters
    ----------
    variable : str
        Name of the text column to vectorize.  It is concatenated onto a
        string to build the output column name.
    """
    # Parse the posting dates once; the original re-parsed the whole column
    # for every mask.
    posted = pd.to_datetime(train_and_validation.date_posted)
    vocab_rows = posted > pd.to_datetime('2013-11-1')
    fit_rows = posted > pd.to_datetime('2013-11-8')
    # NOTE(review): the vectorizer and the ridge use different cut-off dates
    # (2013-11-1 vs 2013-11-8).  Kept exactly as in the original -- confirm
    # this is intended.

    # The vectorizer is fitted on the rows posted after 2013-11-1 only and
    # then applied to every row.  (The original bound the fitted vectorizer
    # to ``bag_of_words_X`` before overwriting it; that dead assignment is
    # dropped.)
    vectorizer = TfidfVectorizer(min_df=.1, max_df=.9)
    vectorizer.fit(train_and_validation[variable][vocab_rows])
    bag_of_words_X = vectorizer.transform(train_and_validation[variable])
    test_bag_of_words = vectorizer.transform(test[variable])

    # Fit on a recent date range only; the original comment cites recency
    # and run time as the reasons.
    ridge = RidgeCV(array([18]), store_cv_values=True, normalize=True)
    ridge.fit(bag_of_words_X[fit_rows],
              train_and_validation.is_exciting[fit_rows])

    # Store the predictions on both frames; per the original comment they
    # are used later as base classifiers in AdaBoost.
    var_nm = "b_of_wds_prds_" + variable
    train_and_validation[var_nm] = ridge.predict(bag_of_words_X)
    test[var_nm] = ridge.predict(test_bag_of_words)
def test_ridge_gcv_sample_weights(
        gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
    """Weighted GCV must match grouped k-fold on row-repeated data.

    Integer sample weights are emulated by repeating each row ``weight``
    times and cross-validating with one group per original row.
    """
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)
    if len(y_shape) == 2:
        n_targets = y_shape[-1]
    else:
        n_targets = 1
    X, y = _make_sparse_offset_regression(
        n_samples=11, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise)
    y = y.reshape(y_shape)
    n_rows = X.shape[0]

    # Strictly positive integer weights, then the matching row repetition.
    raw_weight = 3 * rng.randn(len(X))
    int_weight = (raw_weight - raw_weight.min() + 1).astype(int)
    indices = np.repeat(np.arange(n_rows), int_weight)
    sample_weight = int_weight.astype(float)
    X_tiled = X[indices]
    y_tiled = y[indices]

    cv = GroupKFold(n_splits=n_rows)
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    kfold = RidgeCV(
        alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
        fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    # Per-original-row squared errors of the selected model; a fresh split
    # generator is needed because the first one was consumed by the fit.
    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
    squared_errors = (y_tiled - predictions) ** 2
    kfold_errors = np.asarray([
        np.sum(squared_errors[indices == row], axis=0)
        for row in range(n_rows)
    ])

    X_gcv = X_constructor(X)
    gcv_ridge = RidgeCV(
        alphas=alphas, store_cv_values=True,
        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)

    # The alpha axis is the last one for both 1-D and 2-D targets.
    best_idx = alphas.index(kfold.alpha_)
    gcv_errors = gcv_ridge.cv_values_[..., best_idx]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
def _test_ridge_cv(filter_):
    """Fit/predict with RidgeCV using the default cv and then ``KFold(5)``.

    After each fit the coefficients must be 1-D and the intercept must be a
    ``np.float64``.
    """
    def _fit_predict_check(estimator):
        # Fit and predict on the filtered diabetes data, then check the
        # fitted attributes.
        estimator.fit(filter_(X_diabetes), y_diabetes)
        estimator.predict(filter_(X_diabetes))
        assert_equal(len(estimator.coef_.shape), 1)
        assert_equal(type(estimator.intercept_), np.float64)

    ridge_cv = RidgeCV()
    _fit_predict_check(ridge_cv)

    cv = KFold(5)
    ridge_cv.set_params(cv=cv)
    _fit_predict_check(ridge_cv)
def pred_Ca(train, val, test, all_vars, loop):
    """Fit the Ca base models and pass their predictions to ``write_preds``.

    A gradient boosting regressor (on the narrower selected subset), a
    random forest and an SVR (on the wider selected subset) and a ridge
    regression (on every column in ``all_vars``) are fitted on ``train``
    against the ``'Ca'`` column.  Each model's predictions are written as a
    new column on ``train``, ``val`` and ``test`` in place.

    Parameters
    ----------
    train, val, test
        Frames supporting ``frame[cols]`` selection and column assignment
        (presumably pandas DataFrames -- confirm against the caller).
    all_vars
        Sequence of feature column labels.
    loop
        Value appended (via ``str``) to ``'Ca_prds'`` to name the output.
    """
    data = (val, test, train)

    # Variable selection: the lasso-based selector is combined with two
    # univariate F-test selectors of different widths (k=5000 and k=200).
    lassoed = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=5000)
    univ_selector.fit(train[all_vars], train['Ca'])
    univ_selector2 = SelectKBest(score_func=f_regression, k=200)
    univ_selector2.fit(train[all_vars], train['Ca'])
    support = univ_selector.get_support()
    support2 = univ_selector2.get_support()
    chosen = [var for i, var in enumerate(all_vars)
              if lassoed[i] | support[i]]
    chosen2 = [var for i, var in enumerate(all_vars)
               if lassoed[i] | support2[i]]

    # Gradient boosting on the narrow subset.  Plain label selection
    # replaces the removed ``DataFrame.ix`` indexer.
    gbr = GradientBoostingRegressor(n_estimators=1000, learning_rate=.1695,
                                    max_depth=1, random_state=42, verbose=0,
                                    min_samples_leaf=4)
    gbr.fit(train[chosen2], train['Ca'])
    for dset in data:
        dset['Ca_gbr_prds'] = gbr.predict(dset[chosen2])

    # Random forest on the wide subset.
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train[chosen], train['Ca'])
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset[chosen])

    # Ridge on all variables (single fixed alpha).
    Ca_ridge = RidgeCV(np.array([4.925]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])

    # SVR on the wide subset.
    svr = svm.SVR(C=9500)
    svr.fit(train[chosen], train['Ca'])
    for dset in data:
        dset['Ca_svr_prds'] = svr.predict(dset[chosen])

    # Combination.  'Ca_svr_prds' is listed twice in the original;
    # presumably write_preds weights a model by how often it appears --
    # TODO confirm against write_preds.
    models = ['Ca_rdg_prds', 'Ca_gbr_prds', 'Ca_for_prds', 'Ca_svr_prds',
              'Ca_svr_prds']
    name = 'Ca_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Ca')
def pred_P(train, val, test, all_vars, loop):
    """Fit the P base models and pass their predictions to ``write_preds``.

    An SVR and a ridge regression (on every column in ``all_vars``) and a
    gradient boosting regressor (on the selected subset plus four stacked
    prediction columns) are fitted on ``train`` against the ``'P'`` column.
    Each model's predictions are written as a new column on ``train``,
    ``val`` and ``test`` in place.

    Parameters
    ----------
    train, val, test
        Frames supporting ``frame[cols]`` selection and column assignment
        (presumably pandas DataFrames -- confirm against the caller).
    all_vars
        Sequence of feature column labels.
    loop
        Value appended (via ``str``) to ``'P_prds'`` to name the output, and
        to the names of the stacked input columns.
    """
    data = (val, test, train)

    # Variable selection: keep a column when either the lasso-based selector
    # or the univariate F-test selector flags it.
    lassoed = lass_varselect(train, all_vars, 'P', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1600)
    univ_selector.fit(train[all_vars], train['P'])
    support = univ_selector.get_support()
    chosen = [var for i, var in enumerate(all_vars)
              if lassoed[i] | support[i]]

    # Stack the combined predictions for the other targets as extra inputs.
    # These columns must already exist on all three frames; presumably they
    # are produced by the sibling pred_* functions for the same ``loop`` --
    # TODO confirm the call order.
    suffix = str(loop)
    chosen.extend(prefix + suffix
                  for prefix in ('sand_prds', 'pH_prds', 'SOC_prds',
                                 'Ca_prds'))

    # SVM on all variables.  Plain label selection replaces the removed
    # ``DataFrame.ix`` indexer.
    svr = svm.SVR(C=10000, epsilon=.1)
    svr.fit(train[all_vars], train['P'])
    for dset in data:
        dset['P_svr_prds'] = svr.predict(dset[all_vars])

    # Gradient boosting on the selected + stacked columns.
    gbr = GradientBoostingRegressor(n_estimators=60, learning_rate=0.1,
                                    max_depth=5, random_state=42, verbose=0,
                                    min_samples_leaf=4)
    gbr.fit(train[chosen], train['P'])
    for dset in data:
        dset['P_gbr_prds'] = gbr.predict(dset[chosen])

    # Ridge on all variables (single fixed alpha).
    P_ridge = RidgeCV(np.array([.55]), normalize=True)
    P_ridge.fit(train[all_vars], train['P'])
    for dset in data:
        dset['P_rdg_prds'] = P_ridge.predict(dset[all_vars])

    # Combination.
    models = ['P_rdg_prds', 'P_svr_prds', 'P_gbr_prds']
    name = 'P_prds' + suffix
    write_preds(models, name, train, val, test, 'P')
def test_ridgecv_store_cv_values():
    """Check the shape of ``cv_values_`` for 1-D and 2-D targets."""
    n_samples, n_features, n_targets = 8, 5, 3
    alphas = [1e-1, 1e0, 1e1]
    n_alphas = len(alphas)

    rng = np.random.RandomState(42)
    x = rng.randn(n_samples, n_features)

    r = RidgeCV(alphas=alphas, store_cv_values=True)

    # 1-D target
    y = rng.randn(n_samples)
    r.fit(x, y)
    expected_1d = (n_samples, n_alphas)
    assert r.cv_values_.shape == expected_1d

    # 2-D target
    y = rng.randn(n_samples, n_targets)
    r.fit(x, y)
    expected_2d = (n_samples, n_targets, n_alphas)
    assert r.cv_values_.shape == expected_2d
def test_ridgecv_store_cv_values():
    """Test RidgeCV's ``store_cv_values`` option.

    After fitting with ``store_cv_values=True`` the ``cv_values_`` attribute
    must have one entry per (sample, alpha) for a 1-D target and per
    (sample, response, alpha) for a 2-D target.
    """
    # The original read ``rng = rng = ...``; the duplicated assignment
    # target was a typo with no effect and is removed.
    rng = np.random.RandomState(42)

    n_samples = 8
    n_features = 5
    x = rng.randn(n_samples, n_features)
    alphas = [1e-1, 1e0, 1e1]
    n_alphas = len(alphas)

    r = RidgeCV(alphas=alphas, store_cv_values=True)

    # with len(y.shape) == 1
    y = rng.randn(n_samples)
    r.fit(x, y)
    assert_equal(r.cv_values_.shape, (n_samples, n_alphas))

    # with len(y.shape) == 2
    n_responses = 3
    y = rng.randn(n_samples, n_responses)
    r.fit(x, y)
    assert_equal(r.cv_values_.shape, (n_samples, n_responses, n_alphas))
def test_ridgecv_sample_weight():
    """RidgeCV with sample weights must agree with an explicit grid search."""
    rng = np.random.RandomState(0)
    alphas = (0.1, 1.0, 10.0)

    # There are different algorithms for n_samples > n_features
    # and the opposite, so test them both.
    shapes = ((6, 5), (5, 10))
    for n_samples, n_features in shapes:
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)

        cv = KFold(5)
        ridgecv = RidgeCV(alphas=alphas, cv=cv)
        ridgecv.fit(X, y, sample_weight=sample_weight)

        # Check using GridSearchCV directly
        gs = GridSearchCV(Ridge(), {'alpha': alphas}, cv=cv)
        gs.fit(X, y, sample_weight=sample_weight)
        best = gs.best_estimator_

        assert_equal(ridgecv.alpha_, best.alpha)
        assert_array_almost_equal(ridgecv.coef_, best.coef_)
def pred_Ca(train, val, test, all_vars, loop):
    """Fit the Ca base models and pass their predictions to ``write_preds``.

    A random forest (on a selected subset of ``all_vars``), a lasso and a
    ridge regression (both on every column in ``all_vars``) are fitted on
    ``train`` against the ``'Ca'`` column.  Each model's predictions are
    written as a new column on ``train``, ``val`` and ``test`` in place.

    Parameters
    ----------
    train, val, test
        Frames supporting ``frame[cols]`` selection and column assignment
        (presumably pandas DataFrames -- confirm against the caller).
    all_vars
        Sequence of feature column labels.
    loop
        Value appended (via ``str``) to ``'Ca_prds'`` to name the output.
    """
    data = (val, test, train)

    # Variable selection: keep a column when either the lasso-based selector
    # or the univariate F-test selector flags it.
    lassoed = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1400)
    univ_selector.fit(train[all_vars], train['Ca'])
    support = univ_selector.get_support()
    # The original looped over range(1, len(all_vars)), so the first
    # variable could never be selected.  Every sibling selection loop starts
    # at 0, so this is treated as an off-by-one and index 0 is now
    # considered too.
    chosen = [var for i, var in enumerate(all_vars)
              if lassoed[i] | support[i]]

    # Random forest on the selected columns.  Plain label selection replaces
    # the removed ``DataFrame.ix`` indexer.
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train[chosen], train['Ca'])
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset[chosen])

    # Lasso on all variables.
    lass = Lasso(alpha=.0000001, positive=True)
    lass.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_las_prds'] = lass.predict(dset[all_vars])

    # Ridge on all variables (single fixed alpha).
    Ca_ridge = RidgeCV(np.array([.5]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])

    # Combination.  'Ca_for_prds' is listed twice in the original;
    # presumably write_preds weights a model by how often it appears --
    # TODO confirm against write_preds.
    models = ['Ca_las_prds', 'Ca_rdg_prds', 'Ca_for_prds', 'Ca_for_prds']
    name = 'Ca_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Ca')
# Run an optimizer to find the ridge penalty that minimizes the RMSE returned
# by pc_ridge.
# init_guess seeds the optimization with a guess of the optimal penalty.
init_guess = array([35])
t0 = time.time()
optimizer = minimize(pc_ridge, init_guess, method='nelder-mead',
                     options={'xtol': 1e-2, 'disp': True})
# print is called with a single parenthesised argument so the statement is
# valid on both Python 2 and Python 3 (the original used the Python 2-only
# print statement).
print("It took {time} minutes to optimize".format(
    time=(time.time() - t0) / 60))

# Run ridge with the optimal penalization; optimizer.x is the penalty that
# minimized the RMSE.
t0 = time.time()
ridge = RidgeCV(alphas=optimizer.x, store_cv_values=True, normalize=True)
ridge.fit(train_tokens[0:documents], train.is_exciting[0:documents])
print("It took {time} minutes to run the optimized ridge".format(
    time=(time.time() - t0) / 60))

# Create an OLS regression of is_exciting on word count.
ols = sm.regression.linear_model.OLS(train.is_exciting, train.word_count)
results = ols.fit()

# Add the OLS and ridge predictions to the train and test data.
train['ridge_predictions'] = ridge.predict(train_tokens)
train['length_predictions'] = train.word_count * results.params[0]
test['ridge_predictions'] = ridge.predict(test_tokens)
test['length_predictions'] = test.word_count * results.params[0]

data_for_ensemble = pd.DataFrame({
    "length_predictions": train.length_predictions,
    "ridge_predictions": train.ridge_predictions,
})
def _test_ridge_loo(filter_):
    """Check efficient leave-one-out ridge against brute force and RidgeCV.

    ``filter_`` converts the diabetes features (dense or sparse) before the
    estimator fits.  Returns a one-element list holding the alpha picked by
    ``_RidgeGCV``.
    """
    n_samples = X_diabetes.shape[0]

    ridge_gcv = _RidgeGCV(fit_intercept=False)
    ridge = Ridge(alpha=1.0, fit_intercept=False)

    # generalized cross-validation (efficient leave-one-out)
    decomp = ridge_gcv._pre_compute(X_diabetes, y_diabetes)
    errors, c = ridge_gcv._errors(1.0, y_diabetes, *decomp)
    values, c = ridge_gcv._values(1.0, y_diabetes, *decomp)

    # brute-force leave-one-out: remove one example at a time
    errors2 = []
    values2 = []
    sample_ids = np.arange(n_samples)
    for held_out in range(n_samples):
        keep = sample_ids != held_out
        ridge.fit(X_diabetes[keep], y_diabetes[keep])
        value = ridge.predict([X_diabetes[held_out]])[0]
        errors2.append((y_diabetes[held_out] - value) ** 2)
        values2.append(value)

    # check that efficient and brute-force LOO give same results
    assert_almost_equal(errors, errors2)
    assert_almost_equal(values, values2)

    # generalized cross-validation (efficient leave-one-out,
    # SVD variation)
    decomp = ridge_gcv._pre_compute_svd(X_diabetes, y_diabetes)
    errors3, c = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp)
    values3, c = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp)

    # check that efficient and SVD efficient LOO give same results
    assert_almost_equal(errors, errors3)
    assert_almost_equal(values, values3)

    # reference best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_

    # same best alpha with a custom loss function
    loss_scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=loss_scoring)
    ignore_warnings(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv2.alpha_, alpha_)

    # same best alpha with a custom score function
    def neg_mse(x, y):
        return -mean_squared_error(x, y)

    score_scoring = make_scorer(neg_mse)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=score_scoring)
    ignore_warnings(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv3.alpha_, alpha_)

    # same best alpha with a named scorer
    scorer = get_scorer('mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv4.alpha_, alpha_)

    # same best alpha with unit sample weights
    unit_weights = np.ones(n_samples)
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                  sample_weight=unit_weights)
    assert_equal(ridge_gcv.alpha_, alpha_)

    # several responses: duplicating the target column must duplicate the
    # predictions
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    stacked = np.vstack((y_pred, y_pred)).T
    assert_array_almost_equal(stacked, Y_pred, decimal=5)

    return [alpha_]
def ensemble_ridge(penalty):
    """Return the in-sample RMSE of a ridge fitted with ``penalty``.

    The ridge is fitted on the module-level ``data_for_ensemble`` against
    ``train.is_exciting`` and evaluated on the same data.  ``penalty`` is
    passed straight through as RidgeCV's ``alphas``.
    """
    model = RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    model.fit(data_for_ensemble, train.is_exciting)

    fitted = model.predict(data_for_ensemble)
    squared_error = (train.is_exciting - fitted) ** 2
    return np.sqrt(np.mean(squared_error))
dum_dict['dummy_dummies'], dum_dict['var8_dummies'], dum_dict['var1_dummies'], dum_dict['var2_dummies'], dum_dict['var3_dummies'], dum_dict['var4_dummies'], dum_dict['var5_dummies'], dum_dict['var6_dummies'], dum_dict['var9_dummies']) for var_type in var_types_for_svc: for cols in var_type: svc_feats.append(cols) ##################### Run classifiers ############################ weights = np.array([fire_train_TRAIN_smp['var11']]).squeeze() # Ridge ## using weights crashes this, don't know why ridge = RidgeCV(np.array([1.5]), store_cv_values=True, normalize=True) ridge.fit(fire_train_TRAIN_smp[ridge_feats], fire_train_TRAIN_smp.target) write_preds_allsamps(ridge, "fin_rdg_preds", ridge_feats) # claim size conditional on claim ridge ones_only = fire_train_TRAIN_smp['target']>0 size_train = fire_train_TRAIN_smp.ix[ones_only, :] size_ridge = RidgeCV(np.array([1.5]), store_cv_values=True, normalize=True) size_ridge.fit(size_train[ridge_feats], size_train.target) write_preds_allsamps(size_ridge, "size_rdg_preds", ridge_feats) # Lasso lass = Lasso(alpha=.0000001, positive=True, max_iter=100000 , tol=.001, normalize=True) lass.fit(np.array(fire_train_TRAIN_smp[ridge_feats]), np.array(fire_train_TRAIN_smp.target)) write_preds_allsamps(lass, "fin_lass_preds", ridge_feats)
#run logistic logit = LogisticRegression() logit.fit(train_features, train_outcome) logit_feats = len(train_features.columns) validation['predictions'] = logit.predict_proba(validation_for_p)[:, 1] fpr, tpr, thresholds = roc_curve(validation.is_exciting, validation.predictions) auc_score = auc(fpr, tpr) auc_score # run ridge full_ridge = RidgeCV(np.array([7]), store_cv_values=True, normalize=True) # using data range to gaurantee recency and also run time full_ridge.fit(train_features, train_outcome) validation['predictions'] = logit.predict_proba(validation_for_p)[:, 1] fpr, tpr, thresholds = roc_curve(validation.is_exciting, validation.predictions) auc_score = auc(fpr, tpr) auc_score # add predictions to train features ens_train_features['Adaboost'] = pd.DataFrame( clf.predict_proba(train_features.iloc[:, 0:30])[:, 1]) validation_for_p['Adaboost'] = pd.DataFrame( clf.predict_proba(validation_for_p.iloc[:, 0:30])[:, 1]) test_X['Adaboost'] = pd.DataFrame( clf.predict_proba(test_X.iloc[:, 0:30])[:, 1]) ens_train_features['Forest'] = rndm_forest_clf.predict_proba(
def _test_ridge_loo(filter_):
    """Check efficient leave-one-out ridge against brute force and RidgeCV.

    ``filter_`` converts the diabetes features (dense or sparse) before the
    estimator fits.  Returns a one-element list holding the alpha picked by
    ``_RidgeGCV``.
    """
    n_samples = X_diabetes.shape[0]

    ridge_gcv = _RidgeGCV(fit_intercept=False)
    ridge = Ridge(alpha=1.0, fit_intercept=False)

    # generalized cross-validation (efficient leave-one-out)
    decomp = ridge_gcv._pre_compute(X_diabetes, y_diabetes)
    errors, c = ridge_gcv._errors(1.0, y_diabetes, *decomp)
    values, c = ridge_gcv._values(1.0, y_diabetes, *decomp)

    # brute-force leave-one-out: remove one example at a time
    errors2 = []
    values2 = []
    all_rows = np.arange(n_samples)
    for left_out in range(n_samples):
        mask = all_rows != left_out
        ridge.fit(X_diabetes[mask], y_diabetes[mask])
        value = ridge.predict([X_diabetes[left_out]])[0]
        errors2.append((y_diabetes[left_out] - value) ** 2)
        values2.append(value)

    # check that efficient and brute-force LOO give same results
    assert_almost_equal(errors, errors2)
    assert_almost_equal(values, values2)

    # generalized cross-validation (efficient leave-one-out,
    # SVD variation)
    decomp = ridge_gcv._pre_compute_svd(X_diabetes, y_diabetes)
    errors3, c = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp)
    values3, c = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp)

    # check that efficient and SVD efficient LOO give same results
    assert_almost_equal(errors, errors3)
    assert_almost_equal(values, values3)

    # reference best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_

    # same best alpha with a custom loss function
    loss_scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=loss_scoring)
    ignore_warnings(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv2.alpha_, alpha_)

    # same best alpha with a custom score function
    def negated_mse(x, y):
        return -mean_squared_error(x, y)

    score_scoring = make_scorer(negated_mse)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=score_scoring)
    ignore_warnings(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv3.alpha_, alpha_)

    # same best alpha with a named scorer
    scorer = get_scorer('mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv4.alpha_, alpha_)

    # same best alpha with unit sample weights
    unit_weights = np.ones(n_samples)
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                  sample_weight=unit_weights)
    assert_equal(ridge_gcv.alpha_, alpha_)

    # several responses: duplicating the target column must duplicate the
    # predictions
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    doubled = np.vstack((y_pred, y_pred)).T
    assert_array_almost_equal(doubled, Y_pred, decimal=5)

    return [alpha_]
def pc_ridge(penalty):
    """Return the RMSE of a ridge fitted with the given complexity penalty.

    The ridge is fitted on the first ``documents`` rows of the module-level
    ``train_tokens`` / ``train.is_exciting`` and scored against every row of
    ``train``.  ``penalty`` is passed straight through as RidgeCV's
    ``alphas``.
    """
    model = RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    model.fit(train_tokens[0:documents], train.is_exciting[0:documents])

    fitted = model.predict(train_tokens)
    squared_error = (train.is_exciting - fitted) ** 2
    return np.sqrt(np.mean(squared_error))
# Logistic regression: fit, then score the validation set by ROC AUC.
logit = LogisticRegression()
logit.fit(train_features, train_outcome)
logit_feats = len(train_features.columns)
validation['predictions'] = logit.predict_proba(validation_for_p)[:, 1]
fpr, tpr, thresholds = roc_curve(validation['is_exciting'],
                                 validation['predictions'])
auc_score = auc(fpr, tpr)
# Bare expression kept from the original; it only shows a value in an
# interactive session.
auc_score

# Ridge on the full feature set.  The original comment here mentions using a
# date range for recency and run time, but no date filter is applied in this
# block.
full_ridge = RidgeCV(np.array([7]), store_cv_values=True, normalize=True)
full_ridge.fit(train_features, train_outcome)
# NOTE(review): the predictions scored below still come from ``logit``, not
# from ``full_ridge``, so this AUC repeats the logistic one.  Looks like a
# copy-paste slip -- confirm before changing, since later code may read
# validation['predictions'].
validation['predictions'] = logit.predict_proba(validation_for_p)[:, 1]
fpr, tpr, thresholds = roc_curve(validation['is_exciting'],
                                 validation['predictions'])
auc_score = auc(fpr, tpr)
auc_score

# Add the AdaBoost predictions (first 30 columns as inputs) to each frame.
ens_train_features['Adaboost'] = pd.DataFrame(
    clf.predict_proba(train_features.iloc[:, :30])[:, 1])
validation_for_p['Adaboost'] = pd.DataFrame(
    clf.predict_proba(validation_for_p.iloc[:, :30])[:, 1])
test_X['Adaboost'] = pd.DataFrame(
    clf.predict_proba(test_X.iloc[:, :30])[:, 1])

# Add the random-forest predictions (first ``forest_features`` columns as
# inputs) to each frame.
ens_train_features['Forest'] = rndm_forest_clf.predict_proba(
    train_features.iloc[:, :forest_features])[:, 1]
validation_for_p['Forest'] = rndm_forest_clf.predict_proba(
    validation_for_p.iloc[:, :forest_features])[:, 1]
test_X['Forest'] = rndm_forest_clf.predict_proba(
    test_X.iloc[:, :forest_features])[:, 1]