def test_regularized_options():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)

    # Subtracting a constant from endog should be equivalent to passing the
    # same constant via `offset`.
    model1 = OLS(yvec - 1, xmat)
    result1 = model1.fit_regularized(alpha=1., L1_wt=0.5)

    model2 = OLS(yvec, xmat, offset=1)
    result2 = model2.fit_regularized(alpha=1., L1_wt=0.5,
                                     start_params=np.zeros(5))

    assert_allclose(result1.params, result2.params)

def test_regularized_refit_deterministic():
    # fitting the same specification twice with refit=True should give
    # identical parameters and standard errors
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)

    model1 = OLS(yvec, xmat)
    result1 = model1.fit_regularized(alpha=2., L1_wt=0.5, refit=True)

    model2 = OLS(yvec, xmat)
    result2 = model2.fit_regularized(alpha=2., L1_wt=0.5, refit=True)

    assert_allclose(result1.params, result2.params)
    assert_allclose(result1.bse, result2.bse)

def test_repeat_partition():
    # tests that if we use identical partitions the average is the same
    # as the estimate for the full data
    np.random.seed(435265)
    N = 200
    p = 10
    m = 1

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    def _rep_data_gen(endog, exog, partitions):
        """yields the full data set `partitions` times instead of
        splitting it, so every partition is identical"""
        n_exog = exog.shape[0]
        n_part = np.ceil(n_exog / partitions)
        ii = 0
        while ii < n_exog:
            yield endog, exog
            ii += int(n_part)

    nv_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_rep_data_gen(y, X, m), fit_kwds={"alpha": 0.1})

    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)

    assert_allclose(fitOLSnv.params, fitOLS.params)

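# For contrast with _rep_data_gen above, a minimal sketch (hypothetical
# helper, not the statsmodels _data_gen) of a generator that actually splits
# the rows across partitions instead of repeating the full data:
def _split_data_gen(endog, exog, partitions):
    """Yield (endog, exog) row blocks of roughly equal size."""
    n = exog.shape[0]
    n_part = int(np.ceil(n / partitions))
    for start in range(0, n, n_part):
        yield endog[start:start + n_part], exog[start:start + n_part]
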
def test_regularized_weights(self):
    np.random.seed(1432)
    exog1 = np.random.normal(size=(100, 3))
    endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100)
    exog2 = np.random.normal(size=(100, 3))
    endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100)

    exog_a = np.vstack((exog1, exog1, exog2))
    endog_a = np.concatenate((endog1, endog1, endog2))

    # Should be equivalent to exog_a, endog_a.
    exog_b = np.vstack((exog1, exog2))
    endog_b = np.concatenate((endog1, endog2))
    wgts = np.ones(200)
    wgts[0:100] = 2
    sigma = np.diag(1 / wgts)

    for L1_wt in 0, 0.5, 1:
        for alpha in 0, 1:
            mod1 = OLS(endog_a, exog_a)
            rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha)

            mod2 = WLS(endog_b, exog_b, weights=wgts)
            rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha)

            mod3 = GLS(endog_b, exog_b, sigma=sigma)
            rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha)

            assert_almost_equal(rslt1.params, rslt2.params, decimal=3)
            assert_almost_equal(rslt1.params, rslt3.params, decimal=3)

def test_regularized(self):
    import os
    from . import glmnet_r_results

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    data = np.loadtxt(os.path.join(cur_dir, "results", "lasso_data.csv"),
                      delimiter=",")

    tests = [x for x in dir(glmnet_r_results) if x.startswith("rslt_")]

    for test in tests:
        vec = getattr(glmnet_r_results, test)

        n = vec[0]
        p = vec[1]
        L1_wt = float(vec[2])
        lam = float(vec[3])
        params = vec[4:].astype(np.float64)

        endog = data[0:int(n), 0]
        exog = data[0:int(n), 1:(int(p) + 1)]

        # standardize endog and exog so the penalty scale is comparable
        # to the glmnet reference results
        endog = endog - endog.mean()
        endog /= endog.std(ddof=1)
        exog = exog - exog.mean(0)
        exog /= exog.std(0, ddof=1)

        mod = OLS(endog, exog)
        rslt = mod.fit_regularized(L1_wt=L1_wt, alpha=lam)
        assert_almost_equal(rslt.params, params, decimal=3)

        # Smoke test for summary
        smry = rslt.summary()

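# For reference, a hedged sketch of the elastic net objective that
# fit_regularized is documented to minimize (the same parameterization
# glmnet uses), evaluated directly; useful when cross-checking coefficients
# against the glmnet results above:
#     0.5 * RSS / n + alpha * (L1_wt * |b|_1 + 0.5 * (1 - L1_wt) * |b|_2^2)
def _elastic_net_objective(params, endog, exog, alpha, L1_wt):
    resid = endog - exog.dot(params)
    pen = alpha * (L1_wt * np.abs(params).sum()
                   + 0.5 * (1 - L1_wt) * np.sum(params ** 2))
    return 0.5 * np.sum(resid ** 2) / exog.shape[0] + pen
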
def test_empty_model(self):
    np.random.seed(742)
    n = 100
    endog = np.random.normal(size=n)
    exog = np.random.normal(size=(n, 3))

    model = OLS(endog, exog)
    result = model.fit_regularized(alpha=1000)
    assert_equal(result.params, 0.)

def test_regularized_refit():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    # covariates 0 and 2 matter
    yvec = xmat[:, 0] + xmat[:, 2] + np.random.normal(size=n)

    model1 = OLS(yvec, xmat)
    result1 = model1.fit_regularized(alpha=2., L1_wt=0.5, refit=True)

    model2 = OLS(yvec, xmat[:, [0, 2]])
    result2 = model2.fit()

    ii = [0, 2]
    assert_allclose(result1.params[ii], result2.params)
    assert_allclose(result1.bse[ii], result2.bse)

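# A minimal usage sketch of the refit semantics the test above exercises:
# refit=True should reproduce an unpenalized OLS fit restricted to the
# selected support. (Illustrative only; this helper is not part of the
# test suite.)
def _refit_by_hand(yvec, xmat, alpha, L1_wt):
    penalized = OLS(yvec, xmat).fit_regularized(alpha=alpha, L1_wt=L1_wt)
    support = np.flatnonzero(penalized.params)
    return OLS(yvec, xmat[:, support]).fit()
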
def test_ridge():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)

    v = np.ones(p)
    v[0] = 0

    for a in (0, 1, 10):
        for alpha in (a, a * np.ones(p), a * v):
            model1 = OLS(yvec, xmat)
            result1 = model1._fit_ridge(alpha=alpha)

            model2 = OLS(yvec, xmat)
            result2 = model2.fit_regularized(alpha=alpha, L1_wt=0)
            assert_allclose(result1.params, result2.params)

            model3 = OLS(yvec, xmat)
            result3 = model3.fit_regularized(alpha=alpha, L1_wt=1e-10)
            assert_allclose(result1.params, result3.params)

    fv1 = result1.fittedvalues
    fv2 = np.dot(xmat, result1.params)
    assert_allclose(fv1, fv2)

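# A cross-check sketch, assuming the penalized least squares
# parameterization 0.5*||y - Xb||^2/n + 0.5*alpha*||b||^2 with scalar
# alpha; under that assumption the ridge estimate has the closed form
# (X'X + n*alpha*I)^{-1} X'y. (Hypothetical helper, not part of the
# test suite.)
def _ridge_closed_form(yvec, xmat, alpha):
    n, p = xmat.shape
    return np.linalg.solve(xmat.T @ xmat + n * alpha * np.eye(p),
                           xmat.T @ yvec)
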
def test_ridge_alpha_variants():
    # variant of test_ridge covering scalar and vector alphas
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)

    for alpha in [1., np.ones(p), 10, 10 * np.ones(p)]:
        model1 = OLS(yvec, xmat)
        result1 = model1._fit_ridge(alpha=alpha)

        model2 = OLS(yvec, xmat)
        result2 = model2.fit_regularized(alpha=alpha, L1_wt=0)
        assert_allclose(result1.params, result2.params)

        fv1 = result1.fittedvalues
        fv2 = np.dot(xmat, result1.params)
        assert_allclose(fv1, fv2)

def _calc_nodewise_row(exog, idx, alpha):
    """calculates the nodewise_row values for the idxth variable, used to
    estimate approx_inv_cov.

    Parameters
    ----------
    exog : array-like
        The weighted design matrix for the current partition.
    idx : scalar
        Index of the current variable.
    alpha : scalar or array-like
        The penalty weight.  If a scalar, the same penalty weight
        applies to all variables in the model.  If a vector, it
        must have the same length as `params`, and contains a
        penalty weight for each coefficient.

    Returns
    -------
    An array-like object of length p-1

    Notes
    -----
    nodewise_row_i = arg min 1/(2n) ||exog_i - exog_-i gamma||_2^2
                             + alpha ||gamma||_1
    """

    p = exog.shape[1]
    ind = list(range(p))
    ind.pop(idx)

    # handle array alphas; `ind` must be defined before it is used here
    if not np.isscalar(alpha):
        alpha = alpha[ind]

    # lasso regression of column idx on all remaining columns
    tmod = OLS(exog[:, idx], exog[:, ind])
    nodewise_row = tmod.fit_regularized(alpha=alpha).params

    return nodewise_row

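# A minimal usage sketch: collect the nodewise rows for every variable.
# These rows are the building blocks of the approximate inverse covariance
# estimate mentioned in the docstring; the loop below is illustrative only,
# not the library's assembly routine.
def _all_nodewise_rows(exog, alpha):
    p = exog.shape[1]
    return [_calc_nodewise_row(exog, idx, alpha) for idx in range(p)]
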
def test_single_partition():
    # tests that the results make sense if we have a single partition
    np.random.seed(435265)
    N = 200
    p = 10
    m = 1

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    # test regularized OLS v. naive
    db_mod = DistributedModel(m)
    fitOLSdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0})

    nv_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0})

    ols_mod = OLS(y, X)
    # OLS.fit() takes no alpha argument; the unpenalized baseline is
    # fit_regularized with alpha=0
    fitOLS = ols_mod.fit_regularized(alpha=0)

    assert_allclose(fitOLSdb.params, fitOLS.params)
    assert_allclose(fitOLSnv.params, fitOLS.params)

    # test regularized
    nv_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.1})

    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)

    assert_allclose(fitOLSnv.params, fitOLS.params)

def test_non_zero_params():
    # tests that the thresholding does not cause any issues
    np.random.seed(435265)
    N = 200
    p = 10
    m = 5

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    db_mod = DistributedModel(m, join_kwds={"threshold": 0.13})
    fitOLSdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.1})

    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)

    nz_params_db = 1 * (fitOLSdb.params != 0)
    nz_params_ols = 1 * (fitOLS.params != 0)

    assert_allclose(nz_params_db, nz_params_ols)

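# A minimal sketch (hypothetical helper, not the statsmodels join) of the
# thresholding idea the test exercises: averaged coefficients whose
# magnitude falls below the threshold are set exactly to zero.
def _threshold_params(params, threshold):
    params = np.asarray(params, dtype=float).copy()
    params[np.abs(params) < threshold] = 0.0
    return params
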
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import datasets
from statsmodels.regression.linear_model import OLS

# sd_b, ts_b, p_values and params are computed earlier in the script and
# are not shown here.
ts_b = np.round(ts_b, 3)
p_values = np.round(p_values, 3)
params = np.round(params, 4)

myDF3 = pd.DataFrame()
myDF3["Coefficients"], myDF3["Standard Errors"], myDF3["t values"], \
    myDF3["Probabilities"] = [params, sd_b, ts_b, p_values]
print(myDF3)

diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target
X2 = sm.add_constant(X)  # X2 is used below, so the constant must be added

est = OLS(y, X2)
est2 = est.fit_regularized()
print(est2.summary())

###### Ridge
from sklearn.linear_model import Ridge

# X, y = make_regression(n_features=2, random_state=0)
X = enb_feature
y = enb_target

regr = Ridge(alpha=5.0)
regr.fit(X, y)
print(regr.coef_)
print(regr.intercept_)
