import numpy as np
from numpy.testing import assert_, assert_allclose, assert_equal
from statsmodels.regression.linear_model import OLS
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Binomial
from statsmodels.base.distributed_estimation import (
    DistributedModel, _est_regularized_naive, _join_naive)


def test_repeat_partition():
    # tests that if we use identical partitions the average is the same
    # as the estimate for the full data

    np.random.seed(435265)
    N = 200
    p = 10
    m = 1

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    def _rep_data_gen(endog, exog, partitions):
        """Yield the full dataset once for each partition."""
        n_exog = exog.shape[0]
        n_part = np.ceil(n_exog / partitions)
        ii = 0
        while ii < n_exog:
            yield endog, exog
            ii += int(n_part)

    nv_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_rep_data_gen(y, X, m), fit_kwds={"alpha": 0.1})

    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)

    assert_allclose(fitOLSnv.params, fitOLS.params)
def __init__(
    self,
    factor=None,
    num_partitions=None,
    model_class=None,
    init_kwds=None,
    estimation_method=None,
    estimation_kwds=None,
    join_method=None,
    join_kwds=None,
    results_class=None,
    results_kwds=None,
):
    # Thin constructor that stores the grouping factor and delegates all
    # estimation options to statsmodels' DistributedModel, defaulting to
    # 10 partitions when none is given.
    self._factor = factor
    self._sm_model = DistributedModel(
        num_partitions or 10,
        model_class=model_class,
        init_kwds=init_kwds,
        estimation_method=estimation_method,
        estimation_kwds=estimation_kwds,
        join_method=join_method,
        join_kwds=join_kwds,
        results_class=results_class,
        results_kwds=results_kwds,
    )
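# The tests below rely on a `_data_gen` helper that is not defined in this
# excerpt.  A minimal sketch of what such a partition generator could look
# like, assuming it yields consecutive (endog, exog) chunks of roughly equal
# size:
def _data_gen(endog, exog, partitions):
    """Partition the data into `partitions` consecutive (endog, exog) chunks."""
    n_exog = exog.shape[0]
    n_part = np.ceil(n_exog / partitions)
    ii = 0
    while ii < n_exog:
        jj = int(min(ii + n_part, n_exog))
        yield endog[ii:jj], exog[ii:jj, :]
        ii += int(n_part)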
def test_non_zero_params():
    # tests that the thresholding does not cause any issues

    np.random.seed(435265)
    N = 200
    p = 10
    m = 5

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    db_mod = DistributedModel(m, join_kwds={"threshold": 0.13})
    fitOLSdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.1})
    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)

    nz_params_db = 1 * (fitOLSdb.params != 0)
    nz_params_ols = 1 * (fitOLS.params != 0)

    assert_allclose(nz_params_db, nz_params_ols)
def test_single_partition():
    # tests that the results make sense if we have a single partition

    np.random.seed(435265)
    N = 200
    p = 10
    m = 1

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    # test unregularized (alpha=0) debiased and naive fits v. plain OLS
    db_mod = DistributedModel(m)
    fitOLSdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0})
    nv_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0})
    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit()
    assert_allclose(fitOLSdb.params, fitOLS.params)
    assert_allclose(fitOLSnv.params, fitOLS.params)

    # test regularized
    nv_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.1})
    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)
    assert_allclose(fitOLSnv.params, fitOLS.params)
def test_debiased_v_average():
    # tests that the debiased method performs better than the standard
    # average.  Does this for both OLS and GLM.

    np.random.seed(435265)
    N = 200
    p = 10
    m = 4

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    db_mod = DistributedModel(m)
    fitOLSdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.2})
    olsdb = np.linalg.norm(fitOLSdb.params - beta)
    n_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                             join_method=_join_naive)
    fitOLSn = n_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.2})
    olsn = np.linalg.norm(fitOLSn.params - beta)

    assert_(olsdb < olsn)

    prob = 1 / (1 + np.exp(-X.dot(beta) + np.random.normal(size=N)))
    y = 1. * (prob > 0.5)

    db_mod = DistributedModel(m, model_class=GLM,
                              init_kwds={"family": Binomial()})
    fitGLMdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.2})
    glmdb = np.linalg.norm(fitGLMdb.params - beta)
    n_mod = DistributedModel(m, model_class=GLM,
                             init_kwds={"family": Binomial()},
                             estimation_method=_est_regularized_naive,
                             join_method=_join_naive)
    fitGLMn = n_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.2})
    glmn = np.linalg.norm(fitGLMn.params - beta)

    assert_(glmdb < glmn)
def test_larger_p():
    # tests when p > N / m for the debiased and naive case

    np.random.seed(435265)
    N = 40
    p = 40
    m = 5

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    db_mod = DistributedModel(m)
    fitOLSdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.1})
    assert_equal(np.sum(np.isnan(fitOLSdb.params)), 0)

    nv_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.1})
    assert_equal(np.sum(np.isnan(fitOLSnv.params)), 0)
def test_fit_joblib():
    # tests that the results of all the intermediate steps
    # remain correct for the joblib fit, does this for OLS and GLM
    # and a variety of model sizes
    #
    # regression test

    np.random.seed(435265)
    X = np.random.normal(size=(50, 3))
    y = np.random.randint(0, 2, size=50)

    mod = DistributedModel(1, model_class=OLS)
    fit = mod.fit(_data_gen(y, X, 1), parallel_method="joblib",
                  fit_kwds={"alpha": 0.5})
    assert_allclose(fit.params, np.array([-0.191606, -0.012565, -0.351398]),
                    atol=1e-6, rtol=0)
    mod = DistributedModel(2, model_class=OLS)
    fit = mod.fit(_data_gen(y, X, 2), parallel_method="joblib",
                  fit_kwds={"alpha": 0.5})
    assert_allclose(fit.params, np.array([-0.157416, -0.029643, -0.471653]),
                    atol=1e-6, rtol=0)
    mod = DistributedModel(3, model_class=OLS)
    fit = mod.fit(_data_gen(y, X, 3), parallel_method="joblib",
                  fit_kwds={"alpha": 0.5})
    assert_allclose(fit.params, np.array([-0.124891, -0.050934, -0.403354]),
                    atol=1e-6, rtol=0)

    mod = DistributedModel(1, model_class=GLM,
                           init_kwds={"family": Binomial()})
    fit = mod.fit(_data_gen(y, X, 1), parallel_method="joblib",
                  fit_kwds={"alpha": 0.5})
    assert_allclose(fit.params, np.array([-0.164515, -0.412854, -0.223955]),
                    atol=1e-6, rtol=0)
    mod = DistributedModel(2, model_class=GLM,
                           init_kwds={"family": Binomial()})
    fit = mod.fit(_data_gen(y, X, 2), parallel_method="joblib",
                  fit_kwds={"alpha": 0.5})
    assert_allclose(fit.params, np.array([-0.142513, -0.360324, -0.295485]),
                    atol=1e-6, rtol=0)
    mod = DistributedModel(3, model_class=GLM,
                           init_kwds={"family": Binomial()})
    fit = mod.fit(_data_gen(y, X, 3), parallel_method="joblib",
                  fit_kwds={"alpha": 0.5})
    assert_allclose(fit.params, np.array([-0.110487, -0.306431, -0.243921]),
                    atol=1e-6, rtol=0)
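# A minimal sketch of how the joblib path could be cross-checked against the
# default sequential path.  It assumes the same `_data_gen` helper and the
# documented `parallel_method` options of DistributedModel.fit ("sequential"
# is the default, "joblib" is optional); `_sketch_joblib_v_sequential` is a
# hypothetical name, not part of the original suite.
def _sketch_joblib_v_sequential():
    np.random.seed(435265)
    X = np.random.normal(size=(50, 3))
    y = np.random.randint(0, 2, size=50)

    mod = DistributedModel(2, model_class=OLS)
    fit_seq = mod.fit(_data_gen(y, X, 2), parallel_method="sequential",
                      fit_kwds={"alpha": 0.5})
    fit_job = mod.fit(_data_gen(y, X, 2), parallel_method="joblib",
                      fit_kwds={"alpha": 0.5})
    # both paths run the same per-partition estimation, so the joined
    # parameters should agree to tight numerical tolerance
    assert_allclose(fit_seq.params, fit_job.params, atol=1e-6, rtol=0)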
from scipy.stats import norm


def _exog_gen(exog, partitions):
    """Partition the exog data into consecutive row blocks."""
    n_exog = exog.shape[0]
    n_part = np.ceil(n_exog / partitions)
    ii = 0
    while ii < n_exog:
        jj = int(min(ii + n_part, n_exog))
        yield exog[ii:jj, :]
        ii += int(n_part)


def _endog_gen(endog, partitions):
    """Partition the endog data into consecutive blocks."""
    n_endog = endog.shape[0]
    n_part = np.ceil(n_endog / partitions)
    ii = 0
    while ii < n_endog:
        jj = int(min(ii + n_part, n_endog))
        yield endog[ii:jj]
        ii += int(n_part)


# Next we generate some random data to serve as an example.
X = np.random.normal(size=(1000, 25))
beta = np.random.normal(size=25)
beta *= np.random.randint(0, 2, size=25)
y = norm.rvs(loc=X.dot(beta))
m = 5

# This is the most basic fit, showing all of the defaults, which are to
# use OLS as the model class, and the debiasing procedure.
debiased_OLS_mod = DistributedModel(m)
debiased_OLS_fit = debiased_OLS_mod.fit(
    zip(_endog_gen(y, m), _exog_gen(X, m)), fit_kwds={"alpha": 0.2})

# Then we run through a slightly more complicated example which uses the
# GLM model class.
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Gaussian

debiased_GLM_mod = DistributedModel(
    m, model_class=GLM, init_kwds={"family": Gaussian()})
debiased_GLM_fit = debiased_GLM_mod.fit(
    zip(_endog_gen(y, m), _exog_gen(X, m)), fit_kwds={"alpha": 0.2})

# We can also change the `estimation_method` and the `join_method`.
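# A minimal sketch of swapping in a different estimation and join method.
# It assumes the naive (simple averaging) helpers used in the test suite,
# `_est_regularized_naive` and `_join_naive`, importable from
# statsmodels.base.distributed_estimation.
from statsmodels.base.distributed_estimation import (
    _est_regularized_naive, _join_naive)

naive_OLS_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                                 join_method=_join_naive)
naive_OLS_fit = naive_OLS_mod.fit(
    zip(_endog_gen(y, m), _exog_gen(X, m)), fit_kwds={"alpha": 0.2})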