def spline_poisson(data, column_name, integer=False):
    """Fit a Poisson GAM (B-spline over the row index) and append the
    smoothed values as `<column_name>_regression`."""
    if data.shape[0] < 1:
        # Empty frame: nothing to fit, echo the column back unchanged.
        column_regression = column_name + '_regression'
        data[column_regression] = data[column_name]
        return data
    else:
        # Remember the original index so it can be restored afterwards.
        index_origin = data.index.name
        data.index.name = 'init_index'
        data = data.reset_index()
        data['index'] = data.index
        x_spline = data[['index']]
        bs = BSplines(x_spline, df=[4], degree=[3])
        gam_bs = GLMGam.from_formula(f'{column_name} ~ index',
                                     data=data[['index', column_name]],
                                     smoother=bs,
                                     family=sm.families.Poisson())
        res_bs = gam_bs.fit()
        column_regression = column_name + '_regression'
        if integer:
            # Draw integer counts using the fitted means as Poisson rates.
            data[column_regression] = np.random.poisson(res_bs.predict())
        else:
            data[column_regression] = res_bs.predict()
        data = data.drop(columns='index')
        data = data.set_index('init_index')
        data.index.name = index_origin
        return data
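# A minimal usage sketch for spline_poisson (the toy frame and these
# module-level imports are assumptions; the function itself is defined above).
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.gam.api import GLMGam, BSplines

counts = pd.DataFrame({'events': np.random.poisson(5, size=100)})
smoothed = spline_poisson(counts, 'events')
print(smoothed[['events', 'events_regression']].head())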
def pred_monthly_population_gam(df):
    # Note: drops columns in place, so the caller's frame is mutated.
    df.drop(columns=["Sex", "Age_group", "Value"], inplace=True)
    # Select the row at position 1 and unpivot its monthly values into
    # (time, pop) pairs.
    df = df.iloc[1].transpose().reset_index()
    df.columns = ["time", "pop"]
    # Smooth over the integer time index; `df=[12]` here is the spline
    # degrees of freedom, not the DataFrame.
    bs = BSplines(df.index.values, df=[12], degree=[3])
    gam = GLMGam.from_formula("pop ~ 1", data=df, smoother=bs)
    res = gam.fit()
    return pd.Series(res.predict(), index=df["time"])
def fit(self, X, y):
    # Validate input and record n_features_in_ (sklearn convention).
    X, y = self._validate_data(X, y, y_numeric=True)
    # One smooth term per feature, all with the same df/degree.
    self.spline = BSplines(
        X,
        df=[self.df] * self.n_features_in_,
        degree=[self.degree] * self.n_features_in_,
        include_intercept=False,
    )
    # Intercept-only parametric part; all structure comes from the splines.
    gam = GLMGam(
        y,
        exog=np.ones(X.shape[0]),
        smoother=self.spline,
        alpha=self.alpha,
    )
    self.gam_predictor = gam.fit()
    return self
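# Hedged sketch of the estimator class the fit method above presumably
# lives in (class and hyperparameter names are assumptions).
# GLMGamResults.predict takes the parametric exog plus exog_smooth for
# the spline part; fit used an all-ones exog, so predict mirrors that.
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from statsmodels.gam.api import GLMGam, BSplines


class GAMRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, df=6, degree=3, alpha=1.0):
        self.df = df
        self.degree = degree
        self.alpha = alpha

    fit = fit  # reuse the fit method defined above

    def predict(self, X):
        return self.gam_predictor.predict(np.ones(X.shape[0]),
                                          exog_smooth=X)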
def memorize_chunk(self, x, bs, df=4, degree=3, return_penalty=False,
                   knot_kwds=None):
    # patsy stateful-transform hook: build the smoother from this chunk.
    assert bs in ("bs", "cc"), "Spline basis not defined!"
    if bs == "bs":
        # Forward the caller's knot_kwds instead of hard-coding None.
        self.s = BSplines(x, df=[df], degree=[degree],
                          include_intercept=True, knot_kwds=knot_kwds)
    elif bs == "cc":
        self.s = CyclicCubicSplines(x, df=[df])
    self.penalty_matrices = self.s.penalty_matrices
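# Hedged sketch of the companion methods a patsy stateful transform would
# need next to memorize_chunk above (method names follow patsy's
# stateful-transform protocol; the smoother's own transform() evaluates
# the memorized basis on new data).
def memorize_finish(self):
    # Single-chunk basis: nothing to aggregate across chunks.
    pass

def transform(self, x, bs, df=4, degree=3, return_penalty=False,
              knot_kwds=None):
    # Evaluate the memorized spline basis on new data.
    return self.s.transform(x)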
def fit(self, X: pd.DataFrame, y: pd.Series):
    if len(self._smooth_names) == 0:
        bs = None
    else:
        X_spline = X[self._smooth_names]
        bs = BSplines(
            X_spline,
            df=self._dfs,
            degree=[self._degree] * len(self._smooth_names),
            knot_kwds=[{
                "lower_bound":
                    None if self.lower_bound is None else self.lower_bound[i],
                "upper_bound":
                    None if self.upper_bound is None else self.upper_bound[i]
            } for i in range(len(self._smooth_names))])
    self._gam_bs = GLMGam(y,
                          X.iloc[:, ~X.columns.isin(self._smooth_names)],
                          smoother=bs,
                          alpha=self._alphas,
                          family=self._family)
    self._res_bs = self._gam_bs.fit()
    return self
# Assumed imports for this module (not shown in the original fragment):
# from typing import Tuple
# import pandas as pd
# import statsmodels.api as sm
# from statsmodels.gam.api import BSplines
# from statsmodels.genmod.generalized_linear_model import GLMResults
def td_prob(rush_att: pd.DataFrame) -> Tuple[GLMResults, BSplines]:
    """TD probability as a function of distance to goal"""
    rush_att["inv_yards"] = 1 / rush_att["yardline_100"]
    train_params = ["inv_yards"]
    # train_params = ["yardline_100"]
    y_data_train = rush_att["rush_touchdown"]
    x_data_train = rush_att[train_params]
    # TODO: These should be checked by X-val; this situation is more
    # complicated than FGs
    degree, df, alpha = 3, 4, 0.0
    bs = BSplines(x_data_train, df=[df], degree=[degree])
    model = sm.GLMGam(
        y_data_train,
        # Intercept-only parametric part: empty column selection plus a
        # constant.
        sm.add_constant(x_data_train[[]]),
        smoother=bs,
        alpha=alpha,
        family=sm.families.Binomial(),
    )
    fit = model.fit()
    print(fit.summary2())
    return fit, bs
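# Hedged usage sketch: evaluate the fitted TD-probability curve on a
# distance grid (the rush_att frame is assumed; passing the constant plus
# transformed basis mirrors the predict pattern used in basic_gam below).
fit, bs = td_prob(rush_att)
grid = pd.DataFrame({"inv_yards": 1.0 / np.arange(1, 100)})
exog = sm.add_constant(bs.transform(grid.to_numpy()))
td_probs = fit.predict(exog)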
def fit(self, X: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.Series, np.ndarray]):
    assert not (self.splines == "cyclic_cubic") or (self.degree == 3)
    df = self.df if isinstance(self.df, list) else [self.df] * X.shape[1]
    degree = (self.degree if isinstance(self.degree, list)
              else [self.degree] * X.shape[1])
    alpha = (self.alpha if isinstance(self.alpha, list)
             else [self.alpha] * X.shape[1])
    if self.splines == "cyclic_cubic":
        self.splines_ = CyclicCubicSplines(X, df=df)
    elif self.splines == "b":
        self.splines_ = BSplines(X, df=df, degree=degree)
    else:
        raise ValueError(self.splines)
    self.x_min_ = np.min(X, axis=0)
    self.x_max_ = np.max(X, axis=0)
    self.estimator_ = GLMGam(y, X, smoother=self.splines_,
                             family=self.family, alpha=alpha)
    self.res_ = self.estimator_.fit()
    return self
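# Hedged companion sketch: the stored x_min_/x_max_ suggest predictions
# clip new inputs to the training range before evaluating the spline basis
# (the clipping and method name are assumptions, not the original author's
# code). GLMGamResults.predict takes the parametric exog plus exog_smooth
# for the smoother part.
def predict(self, X):
    X = np.clip(X, self.x_min_, self.x_max_)
    return self.res_.predict(X, exog_smooth=X)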
def basic_gam(data: pd.DataFrame,
              save_pred: bool = False) -> Tuple[GLMResults, BSplines]:
    """Regression of success probability as a function of kick distance"""
    # TODO: treat blocked FGs separately
    data["fg_make"] = data["field_goal_result"].map(
        # {"made": True, "missed": False, "blocked": False}
        {
            "made": 1,
            "missed": 0,
            "blocked": 0
        })
    train_params = ["kick_distance"]
    # data_test, data_train = split_test_train(data)
    # x_data_train = data_train[train_params]
    # y_data_train = data_train["fg_make"]
    # x_data_test = data_test[train_params]
    # y_data_test = data_test["fg_make"]
    x_data_train = data[train_params]
    y_data_train = data["fg_make"]
    # These values should be tested by cross-validation
    # degree 3 has slightly better test loss than 2 and isn't noticeably
    # worse than 4
    degree = 3
    # df = 4 has the least test loss but is the minimum required for degree 3
    df = 4
    # alpha > 0 results in increased test loss. select_penweight() can be
    # used to choose in general.
    alpha = 0.0
    bs = BSplines(x_data_train, df=[df], degree=[degree])
    model = sm.GLMGam(
        y_data_train,
        sm.add_constant(x_data_train[[]]),
        smoother=bs,
        alpha=alpha,
        family=sm.families.Binomial(),
    )
    fit = model.fit()
    # test_loss = get_loss(
    #     fit,
    #     sm.add_constant(bs.transform(x_data_test.to_numpy())),
    #     y_data_test
    # )
    df_resid = fit.df_resid
    llf = fit.llf / df_resid
    deviance = fit.deviance / df_resid
    chi_sq_df = fit.pearson_chi2 / df_resid
    print(f"ll / ndf = {llf}")
    print(f"deviance = {deviance}")
    print(f"chi sq / ndf = {chi_sq_df}")
    print(f"AIC = {fit.aic}")
    if save_pred:
        data["fg_make_prob"] = fit.predict(
            sm.add_constant(bs.transform(data[train_params].to_numpy())))
    # defaults to 95% confidence interval (0.05 argument is alpha)
    # print(fit.conf_int(0.1))
    # standard error approximation (95% ~ 2*sigma, double-sided)
    # print(0.25 * (fit.conf_int()[1] - fit.conf_int()[0]))
    # print(dir(sm_model_fit))
    print(f"params = {fit.params}")
    return fit, bs
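# Hedged usage sketch (assumes a play-by-play frame `pbp` with
# field_goal_result and kick_distance columns, e.g. from nflfastR):
fit, bs = basic_gam(pbp, save_pred=True)
print(pbp[["kick_distance", "fg_make_prob"]].dropna().head())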
def harmonizationLearn(data, covars, eb=True, smooth_terms=[],
                       smooth_term_bounds=(None, None), return_s_data=False,
                       orig_model=None, seed=None):
    """
    Wrapper for neuroCombat function that returns the harmonization model.

    Arguments
    ---------
    data : a numpy array
        data to harmonize with ComBat, dimensions are N_samples x N_features

    covars : a pandas DataFrame
        contains covariates to control for during harmonization
        all covariates must be encoded numerically (no categorical variables)
        must contain a single column "SITE" with site labels for ComBat
        dimensions are N_samples x (N_covariates + 1)

    eb : bool, default True
        whether to use empirical Bayes estimates of site effects

    smooth_terms (Optional) : a list, default []
        names of columns in covars to include as smooth, nonlinear terms
        can be any or all columns in covars, except "SITE"
        if empty, ComBat is applied with a linear model of covariates
        if not empty, Generalized Additive Models (GAMs) are used
        will increase computation time due to search for optimal smoothing

    smooth_term_bounds (Optional) : tuple of float, default (None, None)
        feature to support custom boundaries of the smoothing terms
        useful when holdout data covers a different range than the training data
        specify the bounds as (minimum, maximum)
        currently not supported for models with multiple smooth terms

    return_s_data (Optional) : bool, default False
        whether to return s_data, the standardized data array
        can be useful for diagnostics but will be costly to save/load if large

    seed (Optional) : int, default None
        By default, this function is non-deterministic. Setting the optional
        argument `seed` will make the function deterministic.

    Returns
    -------
    model : a dictionary of estimated model parameters
        design, var_pooled, B_hat, grand_mean,
        gamma_star, delta_star, info_dict (a neuroCombat invention),
        gamma_hat, delta_hat, gamma_bar, t2, a_prior, b_prior,
        smooth_model

    bayes_data : a numpy array
        harmonized data, corrected for effects of SITE
        dimensions are N_samples x N_features

    s_data (Optional) : a numpy array
        standardized residuals after accounting for `covars` other than `SITE`
        set return_s_data=True to output the variable
    """
    # set optional random seed (only when one is provided, per the docstring)
    if seed is not None:
        np.random.seed(seed)
    if orig_model is not None:
        model = orig_model.copy()
    # transpose data as per ComBat convention
    data = data.T
    # prep covariate data
    covar_levels = list(covars.columns)
    batch_labels = np.unique(covars.SITE)
    batch_col = covars.columns.get_loc('SITE')
    if orig_model is not None:
        isTrainSite = covars['SITE'].isin(model['SITE_labels'])
        isTrainSiteLabel = set(model['SITE_labels'])
        isTrainSiteColumns = np.where((pd.DataFrame(np.unique(
            covars['SITE'])).isin(model['SITE_labels']).values).flat)
        isTrainSiteColumnsOrig = np.where((pd.DataFrame(
            model['SITE_labels']).isin(np.unique(covars['SITE'])).values).flat)
        isTestSiteColumns = np.where((~pd.DataFrame(np.unique(
            covars['SITE'])).isin(model['SITE_labels']).values).flat)
    cat_cols = []
    num_cols = [
        covars.columns.get_loc(c) for c in covars.columns if c != 'SITE'
    ]
    smooth_cols = [
        covars.columns.get_loc(c) for c in covars.columns if c in smooth_terms
    ]
    # maintain a dictionary of smoothing information
    smooth_model = {
        'perform_smoothing': len(smooth_terms) > 0,
        'smooth_terms': smooth_terms,
        'smooth_cols': smooth_cols,
        'bsplines_constructor': None,
        'formula': None,
        'df_gam': None
    }
    covars = np.array(covars, dtype='object')
    ### additional setup code from neuroCombat implementation:
    # convert batch col to integer
    covars[:, batch_col] = np.unique(covars[:, batch_col],
                                     return_inverse=True)[-1]
    # create dictionary that stores batch info
    (batch_levels, sample_per_batch) = np.unique(covars[:, batch_col],
                                                 return_counts=True)
    info_dict = {
        'batch_levels': batch_levels.astype('int'),
        'n_batch': len(batch_levels),
        'n_sample': int(covars.shape[0]),
        'sample_per_batch': sample_per_batch.astype('int'),
        'batch_info': [
            list(np.where(covars[:, batch_col] == idx)[0])
            for idx in batch_levels
        ]
    }
    ###
    design = make_design_matrix(covars, batch_col, cat_cols, num_cols)
    ### additional setup if smoothing is performed
    if smooth_model['perform_smoothing']:
        # create cubic spline basis for smooth terms
        X_spline = covars[:, smooth_cols].astype(float)
        if orig_model is None:
            bs = BSplines(X_spline,
                          df=[10] * len(smooth_cols),
                          degree=[3] * len(smooth_cols),
                          knot_kwds=[{
                              'lower_bound': smooth_term_bounds[0],
                              'upper_bound': smooth_term_bounds[1]
                          }])
            # construct formula and dataframe required for gam
            formula = 'y ~ '
            df_gam = {}
            for b in batch_levels:
                formula = formula + 'x' + str(b) + ' + '
                df_gam['x' + str(b)] = design[:, b]
            for c in num_cols:
                if c not in smooth_cols:
                    formula = formula + 'c' + str(c) + ' + '
                    df_gam['c' + str(c)] = covars[:, c].astype(float)
            formula = formula[:-2] + '- 1'
            df_gam = pd.DataFrame(df_gam)
            # for matrix operations, a modified design matrix is required
            design = np.concatenate((df_gam, bs.basis), axis=1)
            # store objects in dictionary
            smooth_model['bsplines_constructor'] = bs
            smooth_model['formula'] = formula
            smooth_model['df_gam'] = df_gam
        else:
            # reuse the spline basis learned by the original model
            bs_basis = model['smooth_model']['bsplines_constructor'].transform(
                X_spline)
            # construct formula and dataframe required for gam
            formula = 'y ~ '
            df_gam = {}
            for b in batch_levels:
                formula = formula + 'x' + str(b) + ' + '
                df_gam['x' + str(b)] = design[:, b]
            for c in num_cols:
                if c not in smooth_cols:
                    formula = formula + 'c' + str(c) + ' + '
                    df_gam['c' + str(c)] = covars[:, c].astype(float)
            formula = formula[:-2] + '- 1'
            df_gam = pd.DataFrame(df_gam)
            # for matrix operations, a modified design matrix is required
            design = np.concatenate((df_gam, bs_basis), axis=1)
    ###
    # run steps to perform ComBat
    if orig_model is None:
        s_data, stand_mean, var_pooled, B_hat, grand_mean = \
            standardizeAcrossFeatures(data, design, info_dict, smooth_model)
        LS_dict = fitLSModelAndFindPriors(s_data, design, info_dict, eb=eb)
        # optional: avoid EB estimates
        if eb:
            gamma_star, delta_star = find_parametric_adjustments(
                s_data, LS_dict, info_dict)
        else:
            gamma_star = LS_dict['gamma_hat']
            delta_star = np.array(LS_dict['delta_hat'])
        bayes_data = adjust_data_final(s_data, design, gamma_star, delta_star,
                                       stand_mean, var_pooled, info_dict)
        # save model parameters in single object
        model = {
            'design': design,
            'SITE_labels': batch_labels,
            'var_pooled': var_pooled,
            'B_hat': B_hat,
            'grand_mean': grand_mean,
            'gamma_star': gamma_star,
            'delta_star': delta_star,
            'info_dict': info_dict,
            'gamma_hat': LS_dict['gamma_hat'],
            'delta_hat': np.array(LS_dict['delta_hat']),
            'gamma_bar': LS_dict['gamma_bar'],
            't2': LS_dict['t2'],
            'a_prior': LS_dict['a_prior'],
            'b_prior': LS_dict['b_prior'],
            'smooth_model': smooth_model,
            'eb': eb,
            'SITE_labels_train': batch_labels,
            'Covariates': covar_levels
        }
        # transpose data to return to original shape
        bayes_data = bayes_data.T
    else:
        # Create train data
        (batch_levels, sample_per_batch) = np.unique(
            covars[isTrainSite, batch_col], return_counts=True)
        if batch_levels.size == 0:
            bayes_data_train = np.zeros(shape=(0, data.shape[0]))
            s_data_train = np.zeros(shape=(0, data.shape[0])).T
        else:
            info_dict_train = model['info_dict'].copy()
            info_dict_train['sample_per_batch'] = sample_per_batch.astype(
                'int')
            info_dict_train['batch_info'] = [
                list(np.where(covars[isTrainSite, batch_col] == idx)[0])
                for idx in batch_levels
            ]
            tmp = np.concatenate(
                (np.zeros(shape=(info_dict['n_sample'],
                                 len(model['SITE_labels']))),
                 design[:, len(batch_labels):]),
                axis=1)
            s_data_train, stand_mean_train, _ = \
                applyStandardizationAcrossFeatures(
                    data[:, isTrainSite], tmp[isTrainSite, :],
                    info_dict_train, model)
            design2 = tmp.copy()
            design2[:, isTrainSiteColumnsOrig[0]] = \
                design[:, isTrainSiteColumns[0]]
            bayes_data_train = adjust_data_final(
                s_data_train, design2[isTrainSite, :], model['gamma_star'],
                model['delta_star'], stand_mean_train, model['var_pooled'],
                info_dict_train)
            # transpose data to return to original shape
            bayes_data_train = bayes_data_train.T
        # Create test data (new SITE)
        (batch_levels, sample_per_batch) = np.unique(
            covars[~isTrainSite, batch_col], return_counts=True)
        if batch_levels.size == 0:
            bayes_data_test = np.zeros(shape=(0, data.shape[0]))
            s_data_test = np.zeros(shape=(0, data.shape[0])).T
        else:
            info_dict_test = {
                'batch_levels': batch_levels.astype('int'),
                'n_batch': len(batch_levels),
                'n_sample': int(covars[~isTrainSite, :].shape[0]),
                'sample_per_batch': sample_per_batch.astype('int'),
                'batch_info': [
                    list(np.where(covars[~isTrainSite, batch_col] == idx)[0])
                    for idx in batch_levels
                ]
            }
            design_tmp = np.concatenate(
                (design[:, isTestSiteColumns[0]],
                 design[:, len(batch_labels):]),
                axis=1)
            s_data_test, stand_mean_test, _ = \
                applyStandardizationAcrossFeatures(
                    data[:, ~isTrainSite], design_tmp[~isTrainSite, :],
                    info_dict_test, model)
            LS_dict = fitLSModelAndFindPriors(s_data_test,
                                              design_tmp[~isTrainSite, :],
                                              info_dict_test, eb=eb)
            if eb:
                gamma_star, delta_star = find_parametric_adjustments(
                    s_data_test, LS_dict, info_dict_test)
            else:
                gamma_star = LS_dict['gamma_hat']
                delta_star = np.array(LS_dict['delta_hat'])
            betas = []
            for i in range(info_dict_test['n_batch']):
                diff_mean = np.mean(
                    data[:, info_dict_test['batch_info'][i]] -
                    np.dot(design[info_dict_test['batch_info'][i],
                                  info_dict['n_batch']:],
                           model['B_hat'][len(model['SITE_labels']):, :]).T,
                    axis=1)
                betas.append(diff_mean)
            new_betas = np.array(betas)
            model['B_hat'] = np.concatenate(
                (model['B_hat'][:len(model['SITE_labels']), :], new_betas,
                 model['B_hat'][len(model['SITE_labels']):, :]))
            model['SITE_labels'] = np.append(
                model['SITE_labels'],
                list(set(batch_labels) - isTrainSiteLabel))
            model['gamma_star'] = np.append(model['gamma_star'], gamma_star,
                                            axis=0)
            model['delta_star'] = np.append(model['delta_star'], delta_star,
                                            axis=0)
            model['info_dict']['n_batch'] = len(model['SITE_labels'])
            bayes_data_test = adjust_data_final(
                s_data_test, design_tmp[~isTrainSite, :], gamma_star,
                delta_star, stand_mean_test, model['var_pooled'],
                info_dict_test)
            # transpose data to return to original shape
            bayes_data_test = bayes_data_test.T
        bayes_data = np.zeros(shape=data.T.shape)
        bayes_data[isTrainSite, :] = bayes_data_train
        bayes_data[~isTrainSite, :] = bayes_data_test
        s_data = np.zeros(shape=data.T.shape)
        s_data[isTrainSite, :] = s_data_train.T
        s_data[~isTrainSite, :] = s_data_test.T
    if return_s_data:
        return model, bayes_data, s_data.T
    else:
        return model, bayes_data
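# Hedged usage sketch for the extended harmonizationLearn above (the
# arrays feats/new_feats and the site/age vectors are assumptions):
covars = pd.DataFrame({'SITE': sites, 'AGE': ages})
model, harmonized = harmonizationLearn(feats, covars,
                                       smooth_terms=['AGE'], seed=0)
# apply the learned model to data that includes previously unseen sites
new_covars = pd.DataFrame({'SITE': new_sites, 'AGE': new_ages})
model2, harmonized_new = harmonizationLearn(new_feats, new_covars,
                                            smooth_terms=['AGE'],
                                            orig_model=model, seed=0)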
def harmonizationLearn(data, covars, eb=True, smooth_terms=[],
                       smooth_term_bounds=(None, None), return_s_data=False):
    """
    Wrapper for neuroCombat function that returns the harmonization model.

    Arguments
    ---------
    data : a numpy array
        data to harmonize with ComBat, dimensions are N_samples x N_features

    covars : a pandas DataFrame
        contains covariates to control for during harmonization
        all covariates must be encoded numerically (no categorical variables)
        must contain a single column "SITE" with site labels for ComBat
        dimensions are N_samples x (N_covariates + 1)

    eb : bool, default True
        whether to use empirical Bayes estimates of site effects

    smooth_terms (Optional) : a list, default []
        names of columns in covars to include as smooth, nonlinear terms
        can be any or all columns in covars, except "SITE"
        if empty, ComBat is applied with a linear model of covariates
        if not empty, Generalized Additive Models (GAMs) are used
        will increase computation time due to search for optimal smoothing

    smooth_term_bounds (Optional) : tuple of float, default (None, None)
        feature to support custom boundaries of the smoothing terms
        useful when holdout data covers a different range than the training data
        specify the bounds as (minimum, maximum)
        currently not supported for models with multiple smooth terms

    return_s_data (Optional) : bool, default False
        whether to return s_data, the standardized data array
        can be useful for diagnostics but will be costly to save/load if large

    Returns
    -------
    model : a dictionary of estimated model parameters
        design, var_pooled, B_hat, grand_mean,
        gamma_star, delta_star, info_dict (a neuroCombat invention),
        gamma_hat, delta_hat, gamma_bar, t2, a_prior, b_prior,
        smooth_model

    bayes_data : a numpy array
        harmonized data, dimensions are N_samples x N_features

    s_data (Optional) : a numpy array
        returned if return_s_data=True
    """
    # transpose data as per ComBat convention
    data = data.T
    # prep covariate data
    batch_col = covars.columns.get_loc('SITE')
    cat_cols = []
    num_cols = [
        covars.columns.get_loc(c) for c in covars.columns if c != 'SITE'
    ]
    smooth_cols = [
        covars.columns.get_loc(c) for c in covars.columns if c in smooth_terms
    ]
    # maintain a dictionary of smoothing information
    smooth_model = {
        'perform_smoothing': len(smooth_terms) > 0,
        'smooth_terms': smooth_terms,
        'smooth_cols': smooth_cols,
        'bsplines_constructor': None,
        'formula': None,
        'df_gam': None
    }
    covars = np.array(covars, dtype='object')
    ### additional setup code from neuroCombat implementation:
    # convert batch col to integer
    covars[:, batch_col] = np.unique(covars[:, batch_col],
                                     return_inverse=True)[-1]
    # create dictionary that stores batch info
    (batch_levels, sample_per_batch) = np.unique(covars[:, batch_col],
                                                 return_counts=True)
    info_dict = {
        'batch_levels': batch_levels.astype('int'),
        'n_batch': len(batch_levels),
        'n_sample': int(covars.shape[0]),
        'sample_per_batch': sample_per_batch.astype('int'),
        'batch_info': [
            list(np.where(covars[:, batch_col] == idx)[0])
            for idx in batch_levels
        ]
    }
    ###
    design = make_design_matrix(covars, batch_col, cat_cols, num_cols)
    ### additional setup if smoothing is performed
    if smooth_model['perform_smoothing']:
        # create cubic spline basis for smooth terms
        X_spline = covars[:, smooth_cols].astype(float)
        bs = BSplines(X_spline,
                      df=[10] * len(smooth_cols),
                      degree=[3] * len(smooth_cols),
                      knot_kwds=[{
                          'lower_bound': smooth_term_bounds[0],
                          'upper_bound': smooth_term_bounds[1]
                      }])
        # construct formula and dataframe required for gam
        formula = 'y ~ '
        df_gam = {}
        for b in batch_levels:
            formula = formula + 'x' + str(b) + ' + '
            df_gam['x' + str(b)] = design[:, b]
        for c in num_cols:
            if c not in smooth_cols:
                formula = formula + 'c' + str(c) + ' + '
                df_gam['c' + str(c)] = covars[:, c].astype(float)
        formula = formula[:-2] + '- 1'
        df_gam = pd.DataFrame(df_gam)
        # for matrix operations, a modified design matrix is required
        design = np.concatenate((df_gam, bs.basis), axis=1)
        # store objects in dictionary
        smooth_model['bsplines_constructor'] = bs
        smooth_model['formula'] = formula
        smooth_model['df_gam'] = df_gam
    ###
    # run steps to perform ComBat
    s_data, stand_mean, var_pooled, B_hat, grand_mean = \
        StandardizeAcrossFeatures(data, design, info_dict, smooth_model)
    LS_dict = fit_LS_model_and_find_priors(s_data, design, info_dict)
    # optional: avoid EB estimates
    if eb:
        gamma_star, delta_star = find_parametric_adjustments(
            s_data, LS_dict, info_dict)
    else:
        gamma_star = LS_dict['gamma_hat']
        delta_star = np.array(LS_dict['delta_hat'])
    bayes_data = adjust_data_final(s_data, design, gamma_star, delta_star,
                                   stand_mean, var_pooled, info_dict)
    # save model parameters in single object
    model = {
        'design': design,
        'var_pooled': var_pooled,
        'B_hat': B_hat,
        'grand_mean': grand_mean,
        'gamma_star': gamma_star,
        'delta_star': delta_star,
        'info_dict': info_dict,
        'gamma_hat': LS_dict['gamma_hat'],
        'delta_hat': np.array(LS_dict['delta_hat']),
        'gamma_bar': LS_dict['gamma_bar'],
        't2': LS_dict['t2'],
        'a_prior': LS_dict['a_prior'],
        'b_prior': LS_dict['b_prior'],
        'smooth_model': smooth_model,
        'eb': eb
    }
    # transpose data to return to original shape
    bayes_data = bayes_data.T
    if return_s_data:
        return model, bayes_data, s_data
    else:
        return model, bayes_data
def GAM_pt(pse_t, expr, smooth='BSplines', df=5, degree=3,
           family=sm.families.NegativeBinomial()):
    """\
    Fit a Generalized Additive Model with pseudo-time as the exog.
    A likelihood ratio test is performed to test the significance of
    pseudo-time in affecting the gene expression value.

    Parameters
    ----------
    pse_t
        pseudo-time
    expr
        expression value
    smooth
        choose between BSplines and CyclicCubicSplines
    df
        number of basis functions, or degrees of freedom
    degree
        degree of the spline function
    family
        distribution family to choose, default is negative binomial.

    Returns
    -------
    y_full
        predicted regressed values from the full model
    y_reduced
        predicted regressed values under the null hypothesis
    lr_pvalue
        p-value of the likelihood ratio test
    """
    from statsmodels.gam.api import GLMGam, BSplines, CyclicCubicSplines
    if smooth == 'BSplines':
        spline = BSplines(pse_t, df=[df], degree=[degree])
    elif smooth == 'CyclicCubicSplines':
        spline = CyclicCubicSplines(pse_t, df=[df])

    exog, endog = sm.add_constant(pse_t), expr
    # calculate full model
    model_full = sm.GLMGam(endog=endog, exog=exog, smoother=spline,
                           family=family)
    try:
        res_full = model_full.fit()
    except Exception:
        # print("The gene expression is mostly zero")
        return None, None, None
    else:
        # default is exog
        y_full = res_full.predict()
        # reduced model
        y_reduced = res_full.null
        # number of samples - number of params (res_full.df_resid)
        df_full_residual = expr.shape[0] - df
        df_reduced_residual = expr.shape[0] - 1
        # likelihood of full model
        llf_full = res_full.llf
        # likelihood of reduced (null) model
        llf_reduced = res_full.llnull
        lrdf = (df_reduced_residual - df_full_residual)
        lrstat = -2 * (llf_reduced - llf_full)
        lr_pvalue = stats.chi2.sf(lrstat, df=lrdf)
    return y_full, y_reduced, lr_pvalue
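# Hedged usage sketch: test a simulated gene's dependence on pseudotime
# with GAM_pt above (toy data; these module-level imports are assumed).
import numpy as np
import statsmodels.api as sm
from scipy import stats

rng = np.random.default_rng(0)
pse_t = np.sort(rng.uniform(0, 1, 200)).reshape(-1, 1)
expr = rng.poisson(np.exp(1 + np.sin(2 * np.pi * pse_t[:, 0])))
y_full, y_reduced, lr_pvalue = GAM_pt(pse_t, expr)
print(f"likelihood-ratio p-value: {lr_pvalue}")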
import statsmodels.api as sm
import numpy as np
from statsmodels.gam.api import GLMGam, BSplines
from statsmodels.gam.tests.test_penalized import df_autos

x_spline = df_autos[['weight', 'hp']]
bs = BSplines(x_spline, df=[12, 10], degree=[3, 3])

alpha = np.array([21833888.8, 6460.38479])
gam_bs = GLMGam.from_formula('city_mpg ~ fuel + drive', data=df_autos,
                             smoother=bs, alpha=alpha)
res_bs = gam_bs.fit()
print(res_bs.summary())

res_bs.plot_partial(0, cpr=True)
res_bs.plot_partial(1, cpr=True)

alpha = np.array([8283989284.5829611, 14628207.58927821])
gam_bs = GLMGam.from_formula('city_mpg ~ fuel + drive', data=df_autos,
                             smoother=bs, alpha=alpha,
                             family=sm.families.Poisson())
res_bs = gam_bs.fit()
print(res_bs.summary())

gam_bs.select_penweight()[0]
gam_bs.select_penweight_kfold()[0]
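# Hedged follow-up sketch: refit with the data-driven penalization from
# select_penweight (indexing [0] for the alpha array mirrors the lines
# above; the remaining return values are search diagnostics).
alpha_opt = gam_bs.select_penweight()[0]
res_opt = GLMGam.from_formula('city_mpg ~ fuel + drive', data=df_autos,
                              smoother=bs, alpha=alpha_opt,
                              family=sm.families.Poisson()).fit()
print(res_opt.summary())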
plt.plot(x, p(x), 'k-')

from patsy import bs

kts = [0, 0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 1]
z = sm.OLS(y, bs(x, knots=kts, include_intercept=True)).fit()
plt.scatter(x, y)
plt.plot(x, z.fittedvalues)

# ## Additive Models

from statsmodels.gam.api import GLMGam, BSplines

xmat = ethanol[['C', 'E']]
bs = BSplines(xmat, df=[4, 4], degree=[3, 3])
gamod = GLMGam.from_formula('NOx ~ C + E', ethanol, smoother=bs).fit()

fig = gamod.plot_partial(0, cpr=True)
fig = gamod.plot_partial(1, cpr=True)

# ## More Complex Models

# ## Exercises

# ## Packages Used