def setup_class(cls):
    sp = np.array([40491.3940640059, 232455.530262537])
    # s_scale is same as before
    cls.s_scale = s_scale = np.array([2.443955e-06, 0.007945455])

    cls.exog = patsy.dmatrix('fuel + drive', data=df_autos)

    x_spline = df_autos[['weight', 'hp']].values
    bs = BSplines(x_spline, df=[12, 10], degree=[3, 3],
                  variable_names=['weight', 'hp'],
                  constraints='center',
                  include_intercept=True)
    alpha0 = 1 / s_scale * sp / 2

    gam_bs = GLMGam.from_formula('city_mpg ~ fuel + drive', df_autos,
                                 smoother=bs, family=family.Poisson(),
                                 alpha=alpha0)

    cls.res1a = gam_bs.fit(use_t=False)
    cls.res1b = gam_bs.fit(method='newton', use_t=True)

    cls.res1 = cls.res1a._results
    cls.res2 = results_mpg_bs_poisson.mpg_bs_poisson

    cls.rtol_fitted = 1e-8
    cls.covp_corrfact = 1  # not needed
def test_partial_plot():
    # verify that the plot and the partial_values method agree;
    # the model has only one component, so the partial values are the same
    # as fittedvalues
    # generate a plot to visualize the result
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)
    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values
    se_from_mgcv = data_from_r.y_est_se  # noqa: F841
    df = [10]
    degree = [6]
    bsplines = BSplines(x, degree=degree, df=df)

    alpha = 0.03
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(maxiter=10000, method='bfgs')
    fig = res_glm_gam.plot_partial(0)
    xp, yp = fig.axes[0].get_children()[0].get_data()
    # Note: xp and yp are sorted by x
    sort_idx = np.argsort(x)
    hat_y, se = res_glm_gam.partial_values(0)
    # assert that the main plot line is the prediction
    assert_allclose(xp, x[sort_idx])
    assert_allclose(yp, hat_y[sort_idx])
def setup_class(cls):
    sp = np.array([0.830689464223685, 425.361212061649])
    cls.s_scale = s_scale = np.array([2.443955e-06, 0.007945455])

    x_spline = df_autos[['weight', 'hp']].values
    # We need asarray to remove the design_info.
    # If design_info is attached,
    # then exog_linear will also be transformed in predict.
    cls.exog = np.asarray(patsy.dmatrix('fuel + drive', data=df_autos))
    bs = BSplines(x_spline, df=[12, 10], degree=[3, 3],
                  variable_names=['weight', 'hp'],
                  constraints='center',
                  include_intercept=True)
    # TODO alpha needs to be list
    alpha0 = 1 / s_scale * sp / 2
    gam_bs = GLMGam(df_autos['city_mpg'], exog=cls.exog, smoother=bs,
                    alpha=(alpha0).tolist())

    cls.res1a = gam_bs.fit(use_t=True)
    cls.res1b = gam_bs.fit(method='newton', use_t=True)

    cls.res1 = cls.res1a._results
    cls.res2 = results_mpg_bs.mpg_bs

    cls.rtol_fitted = 1e-8
    cls.covp_corrfact = 1  # not needed
    # for checking that the alpha model attribute is unchanged, same as alpha0
    cls.alpha = [169947.78222669504, 26767.58046340008]
def test_partial_values2():
    np.random.seed(0)
    n = 1000
    x = np.random.uniform(0, 1, (n, 2))
    x = x - x.mean()
    y = x[:, 0] * x[:, 0] + np.random.normal(0, .01, n)
    y -= y.mean()
    alpha = 0.0
    # BUG: mask is incorrect if exog is not None, start_idx missing
    # bsplines = BSplines(x, degree=[3] * 2, df=[10] * 2)
    # glm_gam = GLMGam(y, exog=np.ones((len(y), 1)), smoother=bsplines,
    #                  alpha=alpha)
    bsplines = BSplines(x, degree=[3] * 2, df=[10] * 2,
                        include_intercept=[True, False])
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=0, maxiter=5000)
    glm = GLM(y, bsplines.basis)  # noqa: F841

    # case with constant column in exog is currently wrong
    # ex = np.column_stack((np.zeros((len(y), 1)),
    #                       bsplines.smoothers[0].basis,
    #                       np.zeros_like(bsplines.smoothers[1].basis)))
    ex = np.column_stack((bsplines.smoothers[0].basis,
                          np.zeros_like(bsplines.smoothers[1].basis)))

    y_est = res_glm_gam.predict(ex, transform=False)
    y_partial_est, se = res_glm_gam.partial_values(0)

    assert_allclose(y_est, y_partial_est, atol=0.05)
    assert se.min() < 100
def test_partial_values():
    # this test is only approximate because we do not use the same spline
    # basis functions (knots) as mgcv
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)
    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values
    se_from_mgcv = data_from_r.y_est_se
    df = [10]
    degree = [6]
    bsplines = BSplines(x, degree=degree, df=df, include_intercept=True)

    # TODO: alpha found by trial and error to pass the assert
    alpha = 0.025 / 115 * 500
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(maxiter=10000, method='bfgs')
    # TODO: if IRLS is used, res_glm_gam does not have partial_values

    univ_bsplines = bsplines.smoothers[0]  # noqa: F841
    hat_y, se = res_glm_gam.partial_values(0)

    assert_allclose(hat_y, data_from_r["y_est"], rtol=0, atol=0.008)
    # TODO: bug, missing scale
    bug_fact = np.sqrt(res_glm_gam.scale) * 0.976  # this is = 0.106
    assert_allclose(se, se_from_mgcv * bug_fact, rtol=0, atol=0.008)
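# ``test_gam_discrete`` below uses a ``sigmoid`` helper that is presumably
# defined at module level outside this excerpt; a minimal sketch of the
# assumed definition:
def sigmoid(x):
    # elementwise logistic function, the inverse of the logit link
    return 1 / (1 + np.exp(-x))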
def test_gam_discrete():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)
    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.ybin.values

    df = [10]
    degree = [5]
    bsplines = BSplines(x, degree=degree, df=df, include_intercept=True)
    # y_mgcv is obtained from R with the following code
    # g = gam(y~s(x, k = 10, bs = "cr"), data = data, scale = 80)
    y_mgcv = data_from_r.ybin_est

    alpha = 0.00002
    # gp = UnivariateGamPenalty(alpha=alpha, univariate_smoother=bsplines)
    # lg_gam = LogitGam(y, bsplines.basis, penal=gp)
    lg_gam = LogitGam(y, bsplines, alpha=alpha)
    res_lg_gam = lg_gam.fit(maxiter=10000)

    y_gam = np.dot(bsplines.basis, res_lg_gam.params)
    y_gam = sigmoid(y_gam)
    y_mgcv = sigmoid(y_mgcv)

    # plt.plot(x, y_gam, label='gam')
    # plt.plot(x, y_mgcv, label='mgcv')
    # plt.plot(x, y, '.', label='y')
    # plt.ylim(-0.4, 1.4)
    # plt.legend()
    # plt.show()

    assert_allclose(y_gam, y_mgcv, rtol=1.e-10, atol=1.e-1)
def test_multivariate_gam_cv():
    # SMOKE test: no result is checked, it only verifies that there is
    # no runtime error
    def cost(x1, x2):
        return np.linalg.norm(x1 - x2) / len(x1)

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)
    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values

    df = [10]
    degree = [5]
    bsplines = BSplines(x, degree=degree, df=df)
    # y_mgcv is obtained from R with the following code
    # g = gam(y~s(x, k = 10, bs = "cr"), data = data, scale = 80)

    alphas = [0.0251]
    alphas = [2]
    cv = KFold(3)

    gp = MultivariateGamPenalty(bsplines, alpha=alphas)  # noqa: F841
    gam_cv = MultivariateGAMCV(smoother=bsplines, alphas=alphas, gam=GLMGam,
                               cost=cost, endog=y, exog=None, cv_iterator=cv)
    gam_cv_res = gam_cv.fit()  # noqa: F841
def test_gam_glm():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)
    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values

    df = [10]
    degree = [3]
    bsplines = BSplines(x, degree=degree, df=df, include_intercept=True)
    # y_mgcv is obtained from R with the following code
    # g = gam(y~s(x, k = 10, bs = "cr"), data = data, scale = 80)
    y_mgcv = np.asarray(data_from_r.y_est)

    alpha = 0.1  # chosen by trial and error

    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='bfgs', max_start_irls=0,
                              disp=1, maxiter=10000, maxfun=5000)
    y_gam0 = np.dot(bsplines.basis, res_glm_gam.params)
    y_gam = np.asarray(res_glm_gam.fittedvalues)
    assert_allclose(y_gam, y_gam0, rtol=1e-10)

    # plt.plot(x, y_gam, '.', label='gam')
    # plt.plot(x, y_mgcv, '.', label='mgcv')
    # plt.plot(x, y, '.', label='y')
    # plt.legend()
    # plt.show()

    assert_allclose(y_gam, y_mgcv, atol=1.e-2)
def test_cov_params():
    np.random.seed(0)
    n = 1000
    x = np.random.uniform(0, 1, (n, 2))
    x = x - x.mean()
    y = x[:, 0] * x[:, 0] + np.random.normal(0, .01, n)
    y -= y.mean()

    bsplines = BSplines(x, degree=[3] * 2, df=[10] * 2,
                        constraints='center')
    alpha = [0, 0]
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=0, maxiter=5000)
    glm = GLM(y, bsplines.basis)
    res_glm = glm.fit()

    assert_allclose(res_glm.cov_params(), res_glm_gam.cov_params(),
                    rtol=0.0025)

    alpha = 1e-13
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=0, maxiter=5000)

    assert_allclose(res_glm.cov_params(), res_glm_gam.cov_params(),
                    atol=1e-10)

    res_glm_gam = glm_gam.fit(method='bfgs', max_start_irls=0,
                              disp=0, maxiter=5000, maxfun=5000)

    assert_allclose(res_glm.cov_params(), res_glm_gam.cov_params(),
                    rtol=1e-4, atol=1e-8)
def test_multivariate_gam_1d_data():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)
    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y

    df = [10]
    degree = [3]
    bsplines = BSplines(x, degree=degree, df=df)
    # y_mgcv is obtained from R with the following code
    # g = gam(y~s(x, k = 10, bs = "cr"), data = data, scale = 80)
    y_mgcv = data_from_r.y_est

    # alpha is adjusted manually to reduce the discrepancy in fittedvalues
    alpha = [0.0168 * 0.0251 / 2 * 500]
    gp = MultivariateGamPenalty(bsplines, alpha=alpha)  # noqa: F841

    glm_gam = GLMGam(y, exog=np.ones((len(y), 1)), smoother=bsplines,
                     alpha=alpha)
    # "nm" converges to different params; "bfgs" params are close to pirls
    # res_glm_gam = glm_gam.fit(method='nm', max_start_irls=0,
    #                           disp=1, maxiter=10000, maxfun=5000)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=1, maxiter=10000)
    y_gam = res_glm_gam.fittedvalues

    # plt.plot(x, y_gam, '.', label='gam')
    # plt.plot(x, y_mgcv, '.', label='mgcv')
    # plt.plot(x, y, '.', label='y')
    # plt.legend()
    # plt.show()

    assert_allclose(y_gam, y_mgcv, atol=0.01)
def test_multivariate_gam_cv_path():
    def sample_metric(y1, y2):
        return np.linalg.norm(y1 - y2) / len(y1)

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)

    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values
    se_from_mgcv = data_from_r.y_est_se  # noqa: F841
    y_mgcv = data_from_r.y_mgcv_gcv  # noqa: F841

    df = [10]
    degree = [6]

    bsplines = BSplines(x, degree=degree, df=df, include_intercept=True)

    gam = GLMGam
    alphas = [np.linspace(0, 2, 10)]
    k = 3
    cv = KFold(k_folds=k, shuffle=True)

    # Note: kfold cv uses random shuffle
    np.random.seed(123)
    gam_cv = MultivariateGAMCVPath(smoother=bsplines, alphas=alphas, gam=gam,
                                   cost=sample_metric, endog=y, exog=None,
                                   cv_iterator=cv)
    gam_cv_res = gam_cv.fit()  # noqa: F841

    glm_gam = GLMGam(y, smoother=bsplines, alpha=gam_cv.alpha_cv)
    res_glm_gam = glm_gam.fit(method='irls', max_start_irls=0,
                              disp=1, maxiter=10000)
    y_est = res_glm_gam.predict(bsplines.basis)

    # plt.plot(x, y, '.', label='y')
    # plt.plot(x, y_est, '.', label='y est')
    # plt.plot(x, y_mgcv, '.', label='y mgcv')
    # plt.legend()
    # plt.show()

    # The test compares to a result obtained with GCV and not KFOLD CV,
    # because MGCV does not support KFOLD CV.
    assert_allclose(data_from_r.y_mgcv_gcv, y_est, atol=1.e-1, rtol=1.e-1)

    # Note: kfold cv uses random shuffle
    np.random.seed(123)
    alpha_cv, res_cv = glm_gam.select_penweight_kfold(alphas=alphas,
                                                      k_folds=3)
    assert_allclose(alpha_cv, gam_cv.alpha_cv, rtol=1e-12)
def test_glm_pirls_compatibility():
    np.random.seed(0)

    n = 500
    x1 = np.linspace(-3, 3, n)
    x2 = np.random.rand(n)

    x = np.vstack([x1, x2]).T
    y1 = np.sin(x1) / x1
    y2 = x2 * x2
    y0 = y1 + y2
    y = y0 + np.random.normal(0, .3, n)
    y -= y.mean()
    y0 -= y0.mean()

    # TODO: we now have alphas == alphas_glm
    alphas = [5.75] * 2
    alphas_glm = [1.2] * 2  # noqa: F841
    # using constraints avoids singular exog
    cs = BSplines(x, df=[10, 10], degree=[3, 3], constraints='center')

    gam_pirls = GLMGam(y, smoother=cs, alpha=alphas)
    gam_glm = GLMGam(y, smoother=cs, alpha=alphas)

    gam_res_glm = gam_glm.fit(method='nm', max_start_irls=0,
                              disp=1, maxiter=20000, maxfun=10000)
    gam_res_glm = gam_glm.fit(start_params=gam_res_glm.params,
                              method='bfgs', max_start_irls=0,
                              disp=1, maxiter=20000, maxfun=10000)
    gam_res_pirls = gam_pirls.fit()

    y_est_glm = np.dot(cs.basis, gam_res_glm.params)
    y_est_glm -= y_est_glm.mean()
    y_est_pirls = np.dot(cs.basis, gam_res_pirls.params)
    y_est_pirls -= y_est_pirls.mean()

    # plt.plot(y_est_pirls)
    # plt.plot(y_est_glm)
    # plt.plot(y, '.')
    # plt.show()
    assert_allclose(gam_res_glm.params, gam_res_pirls.params,
                    atol=5e-5, rtol=5e-5)
    assert_allclose(y_est_glm, y_est_pirls, atol=5e-5)
def test_bsplines(x, df, degree):
    bspline = BSplines(x, df, degree)
    bspline.transform(x)
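# ``test_bsplines`` above receives x, df and degree from pytest
# parametrization that is not part of this excerpt. A hypothetical direct
# call for a single 1-d smoother (the inputs are made up for illustration):
def _example_test_bsplines():
    x_demo = np.linspace(0, 1, 100)
    # df and degree are given per column; one entry for the single column
    test_bsplines(x_demo, df=[6], degree=[3])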
def decompose(x, transform=True):
    """Decompose data into trend, seasonality and randomness.

    Accepts a pandas Series with a datetime index.
    """
    if transform and min(x.dropna()) >= 0:
        # Transform the data and find the lambda that maximizes the
        # log-likelihood. The R version has this method as well as one that
        # minimizes the coefficient of variation ("guerrero").
        x_transformed, var_lambda = boxcox(na_contiguous(x), lmbda=None)
        x_transformed = pd.Series(x_transformed,
                                  index=na_contiguous(x).index)
    else:
        x_transformed = x
        var_lambda = np.nan
        transform = False

    # Seasonal data
    # In the R code, the number of samples per unit time is computed below
    # (it should be 1 every time). Here we take the datetime index
    # differences, take their inverses, and store them in a list to be
    # evaluated:
    # https://stackoverflow.com/questions/36583859/compute-time-difference-of-datetimeindex
    idx = x_transformed.index
    # samples = np.unique([int(1 / (idx[n] - idx[n - 1]).days)
    #                      for n in range(1, len(idx))])
    # Filter out nulls for this exercise
    # samples = samples[~np.isnan(samples)]
    # if len(samples) == 1 and samples.item() > 1:

    # Just use the R code instead. The condition is supposed to be "> 1",
    # but all data results in a frequency of 1. All frequency results in R
    # equal 4, meaning that code block gets evaluated every time in R, so
    # this code block should always be evaluated here as well.
    if int(rstats.frequency(x_transformed).item()) == 1:
        # Decompose
        stl = sm.tsa.seasonal_decompose(na_contiguous(x_transformed))
        # stl = rstats.stl(na_contiguous(x_transformed), s_window='periodic')
        # When trying to use the function above, R reports:
        # R[write to console]: Error in (function (x, s.window,
        #     s.degree = 0, t.window = NULL, t.degree = 1, :
        #     series is not periodic or has less than two periods
        trend = stl.trend
        seasonality = stl.seasonal
        remainder = x_transformed - trend - seasonality
    else:
        # Nonseasonal data
        trend = pd.Series(np.nan, index=x_transformed.index)
        time_index = pd.Index(range(1, len(x_transformed) + 1))

        # Python specific
        bs = BSplines(time_index, df=[12, 10], degree=[3, 3])
        cs = CyclicCubicSplines(time_index, df=[3, 3])
        alpha = np.array([218.338888])
        gam = GLMGam(x_transformed, smoother=cs, alpha=alpha).fit()
        # trend.loc[~x_transformed.isnull()] = gam.fittedvalues

        # R code
        fmla = Formula('x ~ s(tt)')
        env = fmla.environment
        env['tt'] = time_index
        env['x'] = x_transformed
        trend.loc[~x_transformed.isnull()] = rstats.fitted(rmgcv.gam(fmla))

        seasonality = pd.Series(np.nan, index=x_transformed.index)
        remainder = x_transformed - trend

    return {
        'x': x_transformed,
        'trend': trend,
        'seasonality': seasonality,
        'remainder': remainder,
        'transform': transform,
        'lambda': var_lambda,
    }
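# A hypothetical usage sketch for ``decompose``. It assumes the surrounding
# module provides the pieces referenced above: ``na_contiguous``, the rpy2
# handles ``rstats``/``rmgcv``/``Formula``, and the usual imports (np, pd,
# boxcox, sm, GLMGam, CyclicCubicSplines). The monthly series below is made
# up for illustration only.
def _example_decompose():
    rng = pd.date_range('2000-01-01', periods=48, freq='MS')
    values = 10 + np.sin(np.arange(48) * 2 * np.pi / 12) \
        + np.random.uniform(0, 0.5, 48)
    series = pd.Series(values, index=rng)
    parts = decompose(series, transform=True)
    # the result is a dict with the components plus the Box-Cox lambda
    print(parts['lambda'])
    print(parts['trend'].dropna().head())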