def run_GAM(X, Y, get_importance=False, n_splines=20, folds=10):
    """Fit a spline GAM to every column of ``Y`` and score it in and out of sample.

    For each target the model is tuned with ``gridsearch`` inside a
    ``BalancedKFold(folds)`` loop to build out-of-fold predictions, then tuned
    once more on the full data to obtain in-sample scores.

    Args:
        X: pandas DataFrame of predictors; one spline term is built per column.
        Y: pandas DataFrame of targets; each column is modelled separately.
        get_importance: when True, ``get_importances`` is called on every fold
            and its per-predictor values are collected.
        n_splines: number of splines passed to every ``s`` term.
        folds: number of folds handed to ``BalancedKFold``.

    Returns:
        dict keyed by target name. Each value holds ``'scores_cv'``,
        ``'scores_insample'``, ``'pred_vars'``, ``'importances'`` and
        ``'model'`` (the estimator tuned on the full data).
    """
    # set up GAM: one spline term per predictor column
    formula = s(0, n_splines)
    for i in range(1, X.shape[1]):
        formula = formula + s(i, n_splines)
    gam = LinearGAM(formula)
    # NOTE(review): this fits X against its own first column and the result is
    # not used for scoring; kept as-is — presumably a check that the formula
    # matches X. Confirm before removing.
    gam.fit(X, X.iloc[:, 0])

    # run full model
    GAM_results = {}
    # `.items()` instead of `.iteritems()`: the latter was removed in pandas 2.0
    for name, y in Y.items():
        print("\nFitting for %s\n" % name)
        CV = BalancedKFold(folds)
        importances = {k: [] for k in X.columns}
        pred = np.zeros(y.shape[0])
        for train, test in CV.split(X, y):
            Xtrain = X.iloc[train, :]
            ytrain = y.iloc[train]
            Xtest = X.iloc[test, :]
            ytest = y.iloc[test]
            gam = LinearGAM(formula)
            gam.gridsearch(Xtrain, ytrain)

            # out of fold
            p = gam.predict(Xtest)
            if len(p.shape) > 1:
                p = p[:, 0]
            pred[test] = p

            if get_importance:
                # get importances, defined as the predictive ability of each
                # variable on its own
                importance_out = get_importances(Xtrain, ytrain, Xtest, ytest)
                for k, v in importance_out.items():
                    importances[k].append(v)

        r_cv = np.corrcoef(y, pred)[0, 1]
        cv_scores = [{'r': r_cv,
                      'R2': r_cv ** 2,
                      'MAE': mean_absolute_error(y, pred)}]

        # insample: re-tunes the estimator left over from the last fold
        gam.gridsearch(X, y)
        in_pred = gam.predict(X)
        r_in = np.corrcoef(y, in_pred)[0, 1]
        in_scores = [{'r': r_in,
                      'R2': r_in ** 2,
                      'MAE': mean_absolute_error(y, in_pred)}]

        GAM_results[name] = {'scores_cv': cv_scores,
                             'scores_insample': in_scores,
                             'pred_vars': X.columns,
                             'importances': importances,
                             'model': gam}
    return GAM_results
def smoother_linearGAM(x,y,X,**kwargs):
    """Smooth ``y`` over ``x`` with a penalised-spline LinearGAM and sample it on ``X``.

    Args:
        x: 1-D list or array of abscissae used for fitting.
        y: list or array of responses.
        X: list or array of points at which to evaluate the smoother, or
            ``None`` to evaluate on ``x`` itself.
        **kwargs: accepted but not used by the current implementation.

    Returns:
        Predictions of the grid-searched model at ``X``.
    """
    from pygam import LinearGAM, l, s

    if isinstance(x, list):
        x = np.array(x)
    x = x.reshape(len(x), 1)
    if isinstance(y, list):
        y = np.array(y)
    if isinstance(X, list):
        X = np.array(X)

    # Evaluation grid as a single-column matrix; falls back to the fit points.
    grid = x.reshape(len(x), 1) if X is None else X.reshape(len(X), 1)

    # n_splines is deliberately left at the library default here.
    smoother = LinearGAM(terms=s(0, basis='ps')).gridsearch(x, y)

    # sample on the input grid
    return smoother.predict(grid)
def feature_selection_single(x, y, x_test, y_test):
    """Greedy forward feature selection scored on held-out explained deviance.

    In every round each remaining column is added in turn to the columns
    selected so far, a ``LinearGAM`` is grid-searched on the training data and
    scored on the test data. The best candidate is kept when it improves on
    the best score so far and ``check_significance`` accepts it; otherwise
    selection stops.

    Args:
        x: training DataFrame of candidate features.
        y: training response.
        x_test: test DataFrame with the same columns as ``x``.
        y_test: test response.

    Returns:
        Tuple ``(best_result, selected_cols, elapsed_seconds)``.
    """
    timestart = time.time()
    cols = list(x.columns)
    best_result = 0
    selected_cols = []

    # Looping on `cols` also covers the "no candidate columns" case, which
    # previously crashed in max() on an empty mapping.
    while cols:
        # Fresh scores each round so already-selected columns cannot be
        # picked up again from a previous round.
        iterationresult = {}
        for col in tqdm(cols, leave=False):
            testcols = selected_cols + [col]
            model = LinearGAM().gridsearch(x[testcols].values, y, progress=False)
            iterationresult[col] = model._estimate_r2(
                x_test[testcols].values, y_test)['explained_deviance']

        key = max(iterationresult, key=iterationresult.get)
        # `and` short-circuits: the significance check only runs for a
        # candidate that actually improves the score.
        accepted = (iterationresult[key] > best_result) and check_significance(
            x, y, x_test, selected_cols, key)
        if accepted:
            best_result = iterationresult[key]
            selected_cols.append(key)
            cols.remove(key)

        logging.info("{}: {}".format(selected_cols, best_result))
        if not accepted:
            break

    return best_result, selected_cols, time.time() - timestart
def BAM(X, y):
    """Grid-search a two-feature LinearGAM and return the tuned model.

    The model has one ``spline_order=3`` smooth per feature (columns 0 and 1)
    plus a tensor-product term ``te(0, 1)`` for their interaction.
    """
    # model implementation by PYGAM
    main_effects = s(0, spline_order=3) + s(1, spline_order=3)
    model = LinearGAM(main_effects + te(0, 1))
    model.gridsearch(X, y)
    return model
def tsSSE(self, model='linear'):
    """Return the total sum of squared errors of per-group models for ``B``.

    For every label ``i`` in ``range(self.m)`` the rows with
    ``self.dataLabel == i`` are selected, the design matrix is built from
    ``Xall`` plus the column ``Aall * self.model.decision_function(X)``, and a
    model for ``Ball`` is fitted and evaluated on those same rows.

    Args:
        model: ``'linear'`` for an OLS fit with intercept, ``'GAM'`` for a
            ``LinearGAM`` with default terms.

    Returns:
        Sum over all groups of the in-sample squared residuals.

    Raises:
        ValueError: if ``model`` is neither ``'linear'`` nor ``'GAM'``
            (previously this surfaced as a NameError or silently reused the
            predictions of the previous group).
    """
    sse = 0
    for i in range(self.m):
        # row positions belonging to group i
        index = np.concatenate(np.where(self.dataLabel == i))
        Xfit = self.Xall[index, :]
        Afit = self.Aall[index]
        Bfit = self.Ball[index]
        Af = Afit * self.model.decision_function(Xfit)
        Xmat = np.column_stack((Xfit, Af))

        if model == 'linear':
            # linear regression model for B
            design = sm.add_constant(Xmat)
            pred = sm.OLS(Bfit, design).fit().predict()
        elif model == 'GAM':
            # the GAM model can be specified differently
            pred = LinearGAM(fit_intercept=True).fit(Xmat, Bfit).predict(Xmat)
        else:
            raise ValueError(
                "model must be 'linear' or 'GAM', got {!r}".format(model))

        sse = sse + sum((Bfit[elem] - pred[elem]) ** 2
                        for elem in range(len(Bfit)))
    return sse
def __init__(self, m, idim=20):
    """Build the model selected by the integer code ``m``.

    Sets ``self.model`` to the constructed model and ``self.type`` to the
    numeric type code paired with it below. Codes outside 0-9 leave both
    attributes unset.

    Args:
        m: model code (0-9).
        idim: passed to the builder methods; not used for code 7.
    """
    # Position in the tuple == model code. Builders are wrapped in lambdas so
    # only the selected one is looked up and executed.
    catalogue = (
        (lambda: self.base(idim), 2),
        (lambda: self.base2(idim), 2),
        (lambda: self.returnSequential4(idim), 2),
        (lambda: self.returnSequential8(idim), 2),
        (lambda: self.returnSequential15_regularized(idim), 2),
        (lambda: self.multi_RNN(idim), 1),
        (lambda: self.multi_lstm(idim), 1),
        (lambda: LinearGAM(), 3),
        (lambda: self.RNN(idim), 1),
        (lambda: self.lstm(idim), 1),
    )
    for code, (build, kind) in enumerate(catalogue):
        if m == code:
            self.model = build()
            self.type = kind
            break
def GAM(X, Y):
    """Fit a spline LinearGAM as a 0/1 classifier and return its test accuracy.

    The data is split with ``train_test_split(X, Y, **options)``, standardised,
    expanded with ``PolynomialFeatures(1)``, and one ``GAM_spline`` term is
    created per resulting column. Predictions are thresholded at 0.5.

    Args:
        X: feature matrix.
        Y: binary target.

    Returns:
        Fraction of test samples whose thresholded prediction equals the label.
    """
    # --- splitting the dataset ---
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **options)

    # --- preprocessing ---
    # NB: No need for one-hot encoding – categorical columns are already binary!
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # --- creating a design matrix ---
    # Fit the expansion on the training split only and re-use it for the test
    # split, mirroring how the scaler is handled above.
    poly = PolynomialFeatures(1)
    X_train = poly.fit_transform(X_train)
    X_test = poly.transform(X_test)

    gam_input = None
    for n in range(X_train.shape[1]):
        if gam_input is None:
            gam_input = GAM_spline(n)
        else:
            gam_input += GAM_spline(n)

    gam = LinearGAM(gam_input).fit(X_train, Y_train)
    Y_predict = gam.predict(X_test)

    # threshold the continuous output into class labels
    Y_predict[Y_predict >= 0.5] = 1
    Y_predict[Y_predict < 0.5] = 0

    accuracy = (Y_predict.squeeze() == Y_test.squeeze()).astype(int)
    accuracy = np.sum(accuracy)/accuracy.shape[0]
    return accuracy
def smooth_gam(x, y, n_splines=100, lam=10):
    """Fit a one-term spline LinearGAM and return its curve on a generated grid.

    Args:
        x: feature data for the single spline term.
        y: response.
        n_splines: number of splines for the term.
        lam: smoothing penalty passed to ``LinearGAM``.

    Returns:
        Tuple ``(grid_x, mean, confidence_intervals)`` where ``grid_x`` is the
        first column of ``generate_X_grid(term=0)``.
    """
    from pygam import ExpectileGAM, LinearGAM, s, f

    spline_term = s(0, n_splines=n_splines)
    model = LinearGAM(spline_term, lam=lam).fit(x, y)

    grid = model.generate_X_grid(term=0)
    # confidence band of the mean (not a prediction interval)
    band = model.confidence_intervals(grid)
    mean_curve = model.predict_mu(grid)
    return grid[:, 0], mean_curve, band
def test_model_constructors():
    """Check constructor errors and two valid estimator configurations."""
    # the base model cannot be instantiated
    with pytest.raises(TypeError):
        BaseTEModel()

    # a learner needs some estimator
    with pytest.raises(ValueError):
        IFLearnerTE(None)

    # other configurations of base learners must construct without error
    IFLearnerTE(None, base_estimator=LinearGAM())
    IFLearnerTE(te_estimator=LinearGAM(), base_estimator=None)
def interp_gam(data):
    """Predict ``sample_z`` at stream distance ``(0, 0)`` with a tensor GAM.

    Rows whose first ``stream_dist`` column is not finite are discarded. A
    ``LinearGAM`` with two splines and their tensor interaction is then
    grid-searched on the remaining ``stream_dist`` rows.

    Args:
        data: object exposing ``stream_dist`` and ``sample_z``, each with a
            ``.values`` array.

    Returns:
        The prediction at ``[0, 0]``, or ``np.nan`` when no row is valid.
    """
    # NOTE(review): only column 0 is checked for finiteness — confirm column 1
    # cannot be NaN independently.
    valid = np.isfinite(data.stream_dist.values[:, 0])
    if not valid.any():
        # nothing to fit on; bail out before doing any further work
        return np.nan

    sample_st = data.stream_dist.values[valid]
    sample_z = data.sample_z.values[valid]

    gam = LinearGAM(
        s(0, n_splines=4) + s(1, n_splines=5) +
        te(0, 1, n_splines=4)).gridsearch(sample_st, sample_z)
    return gam.predict(np.array([[0, 0]]))[0]
def updateEmpTauX(self, bFit=True, mask=None):
    """Update the tau statistics from a smooth of squared error against ``X``.

    On the entries where ``mask`` is non-zero, a monotonically increasing
    spline of ``log(squared difference + NMF_VB.minVar)`` against
    ``log(X + 0.5)`` is (re)fitted when ``bFit`` is True. Its prediction is
    then used to fill ``self.betaTau``, ``self.expTau`` and
    ``self.expLogTau`` where ``mask == 1``.

    Args:
        bFit: refit ``self.gam`` before predicting; when False the existing
            ``self.gam`` is used.
        mask: array of shape ``(self.V, self.S)``; defaults to all ones.

    Returns:
        None. If the fit raises ``ValueError`` the method falls back to
        ``self.updateFixedTau(mask)`` and returns early.
    """
    if mask is None:
        mask = np.ones((self.V, self.S))

    square_diff_matrix = self.exp_square_diff_matrix()

    # Unmasked entries of X, flattened. Used both as the regressor for the
    # fit and as the prediction input, so it is computed once.
    X1D = np.ma.compressed(np.ma.masked_where(mask == 0, self.X))
    logX1D = np.log(0.5 + X1D)

    if bFit:
        mFitFit = np.ma.compressed(
            np.ma.masked_where(mask == 0, square_diff_matrix))
        logMFitFit = np.log(mFitFit + NMF_VB.minVar)
        try:
            self.gam = LinearGAM(
                s(0, n_splines=5,
                  constraints='monotonic_inc')).fit(logX1D, logMFitFit)
        except ValueError:
            print("Performing fixed tau")
            self.updateFixedTau(mask)
            return

    yest_sm = self.gam.predict(logX1D)

    mBetaTau = self.beta * (X1D + 0.5) + 0.5 * np.exp(yest_sm)
    np.place(self.betaTau, mask == 1, mBetaTau)

    mExpTau = (self.alpha + 0.5) / mBetaTau
    np.place(self.expTau, mask == 1, mExpTau)

    mLogTau = digamma(self.alpha + 0.5) - np.log(mBetaTau)
    np.place(self.expLogTau, mask == 1, mLogTau)
def test_if_learner():
    """End-to-end check of ``IFLearnerTE`` against a hand-built two-fold pipeline.

    The expected treatment effect is reproduced manually: surrogate outcome
    models are cross-fitted on the same two stratified folds the learner uses,
    turned into a pseudo-outcome, and regressed with a ``LinearGAM``. The
    learner's predictions must match; error, warning and binary-outcome paths
    are smoke-tested afterwards.
    """
    n = 200

    # get data without noise
    X, y, w, ite, p, bs = make_te_data(n=n, noise=False)

    # get surrogate predictions to compare against po predictions
    mu_0_plug, mu_1_plug = get_surrogate_predictions(X, y, w)

    # get surrogate predictions for two folds as inside the iflearner
    splitter = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    _, first_test_index = next(splitter.split(X, w))

    fold2_mask = np.zeros(n, dtype=bool)
    fold2_mask[first_test_index] = 1

    mu_0, mu_1 = np.zeros(n), np.zeros(n)
    mu_0[~fold2_mask], mu_1[~fold2_mask] = get_surrogate_predictions(
        X, y, w, pred_mask=~fold2_mask)
    mu_0[fold2_mask], mu_1[fold2_mask] = get_surrogate_predictions(
        X, y, w, pred_mask=fold2_mask)
    pseudo_outcome = eif_transformation_CATE(y, w, p, mu_0, mu_1)

    # make second stage model
    t_model = LinearGAM()
    t_model.fit(X, pseudo_outcome)
    te_debiased = t_model.predict(X)

    # fit if learner
    if_learner = IFLearnerTE(LinearGAM(), n_folds=2, random_state=42,
                             fit_base_model=True)
    if_learner.fit(X, y, w, p)
    te, mu_0, mu_1 = if_learner.predict(X, return_po=True)

    # test outcomes
    np.testing.assert_almost_equal(te, te_debiased)
    np.testing.assert_almost_equal(mu_0, mu_0_plug)
    np.testing.assert_almost_equal(mu_1, mu_1_plug)
    np.testing.assert_almost_equal(if_learner.predict(X), te_debiased)

    # predicting po when base model not fitted should not be possible.
    # Only the predict call sits inside the raises-block so that an unexpected
    # ValueError from construction or fit cannot make the test pass.
    if_learner = IFLearnerTE(LinearGAM(), n_folds=2, random_state=42)
    if_learner.fit(X, y, w, p)
    with pytest.raises(ValueError):
        if_learner.predict(X, return_po=True)

    with pytest.warns(UserWarning):
        # warning raised if only one fold?
        if_learner = IFLearnerTE(LinearGAM(), n_folds=1, random_state=42)
        if_learner.fit(X, y, w, p)

    # check that binary_y setting also works (smoketest)
    X, y, w, ite, p, bs = make_te_data(n=n,
                                       baseline_model=binary_gyorfi_baseline,
                                       noise=False, binary_y=True)
    if_learner = IFLearnerTE(base_estimator=LogisticGAM(),
                             te_estimator=LinearGAM(),
                             binary_y=True, setting=RR_NAME,
                             fit_base_model=True)
    if_learner.fit(X, y, w, p)
    te, mu_0, mu_1 = if_learner.predict(X, return_po=True)
def test_scores():
    """Check ``fit_and_score_te_oracle`` against a manually fitted IF-learner."""
    # get data
    X, y, w, ite, p, bs = make_te_data(n=200)
    train = list(range(100))
    test = list(range(100, 200))

    # reference score: fit the IF-learner outside of the scorer
    X_train = _safe_indexing(X, train)
    y_train = _safe_indexing(y, train)
    w_train = _safe_indexing(w, train)
    p_train = _safe_indexing(p, train)
    X_test = _safe_indexing(X, test)
    t_test = _safe_indexing(ite, test)

    reference = IFLearnerTE(LinearGAM())
    reference.fit(X_train, y_train, w_train, p_train)
    t_pred = reference.predict(X_test)
    neg_mse = -mean_squared_error(t_test, t_pred)

    # arguments shared by both scorer calls below
    scorer_kwargs = dict(train=train, test=test,
                         scorer='neg_mean_squared_error',
                         return_test_score_only=True,
                         error_score=np.nan)

    # score output
    score = fit_and_score_te_oracle(IFLearnerTE(LinearGAM()), X, y, w, p, ite,
                                    **scorer_kwargs)
    np.testing.assert_almost_equal(score, neg_mse)

    # smoke test: parameters can be passed through as well
    score = fit_and_score_te_oracle(IFLearnerTE(LinearGAM()), X, y, w, p, ite,
                                    parameters={'te_estimator': LinearGAM()},
                                    **scorer_kwargs)
    np.testing.assert_almost_equal(score, neg_mse)
def test_exceptions():
    """Check the error paths of ``fit_and_score_te_oracle``."""
    # get data
    X, y, w, ite, p, bs = make_te_data(n=200)
    train = list(range(100))
    test = list(range(100, 200))

    # arguments shared by every call below
    split_kwargs = dict(train=train, test=test,
                        scorer='neg_mean_squared_error')

    # pass incorrect type of estimator
    with pytest.raises(ValueError):
        fit_and_score_te_oracle(LinearGAM(), X, y, w, p, ite,
                                return_test_score_only=True, **split_kwargs)

    # fit should throw an error
    with pytest.raises(ValueError):
        fit_and_score_te_oracle(IFLearnerTE(LogisticGAM()), X, y, w, p, ite,
                                return_test_score_only=True,
                                error_score='raise', **split_kwargs)

    # fit should throw an error because error score is incorrect
    with pytest.raises(ValueError):
        fit_and_score_te_oracle(IFLearnerTE(LogisticGAM()), X, y, w, p, ite,
                                return_test_score_only=False,
                                error_score='asdfad', **split_kwargs)

    # assert we get error score otherwise
    score = fit_and_score_te_oracle(IFLearnerTE(LogisticGAM()), X, y, w, p,
                                    ite, return_test_score_only=True,
                                    error_score=np.nan, **split_kwargs)
    assert math.isnan(score)
def spline_fit(windspeed_column, power_column, n_splines=20):
    """
    Fit a wind speed / power curve with a pyGAM spline model.

    Args:
        windspeed_column (:obj:`pandas.Series`): feature column
        power_column (:obj:`pandas.Series`): response column
        n_splines (:obj:`int`): number of splines to use in the fit

    Returns:
        :obj:`function`: Python function of type (Array[float] -> Array[float])
        implementing the power curve.
    """
    # pyGAM expects a 2-D feature matrix: one row per sample, one column
    features = windspeed_column.values.reshape((windspeed_column.size, 1))
    response = power_column.values
    fitted = LinearGAM(n_splines=n_splines).gridsearch(features, response)

    # Closure over the fitted model; evaluates the curve for array-like input
    def pc_spline(xx):
        return fitted.predict(xx)

    return pc_spline
def get_surrogate_predictions(X, y, w, pred_mask=None):
    """Fit one LinearGAM per treatment arm and predict both potential outcomes.

    Args:
        X: 2-D feature array.
        y: outcome array.
        w: treatment indicator array (compared against 0 and 1).
        pred_mask: boolean mask of rows to predict for. The models are fitted
            on the complementary rows; when ``None`` all rows are used for
            both fitting and prediction.

    Returns:
        Tuple ``(mu_0_plug, mu_1_plug)`` of predictions on the ``pred_mask`` rows.
    """
    if pred_mask is None:
        pred_mask = np.ones(len(y), dtype=bool)
        fit_mask = pred_mask
    else:
        fit_mask = ~pred_mask

    def _predict_for_arm(arm):
        # fit on the rows of this arm only, predict on every requested row
        rows = fit_mask & (w == arm)
        outcome_model = LinearGAM()
        outcome_model.fit(X[rows, :], y[rows])
        return outcome_model.predict(X[pred_mask, :])

    # get surrogates (treated arm first, as in the original ordering)
    mu_1_plug = _predict_for_arm(1)
    mu_0_plug = _predict_for_arm(0)
    return mu_0_plug, mu_1_plug
def gam_3param(windspeed_column, winddir_column, airdens_column, power_column, n_splines=20):
    """
    Fit power to wind speed, wind direction and air density with a
    generalized additive model.

    Args:
        windspeed_column (:obj:`pandas.Series`): Wind speed feature column
        winddir_column (:obj:`pandas.Series`): Wind direction feature column
        airdens_column (:obj:`pandas.Series`): Air density feature column
        power_column (:obj:`pandas.Series`): Power response column
        n_splines (:obj:`int`): number of splines to use in the fit

    Returns:
        :obj:`function`: Python function taking wind speed, wind direction and
        air density columns and returning the predicted power.
    """
    def _as_frame(ws, wd, dens):
        # column layout must be identical for fitting and for prediction
        return pd.DataFrame({"ws": ws, "wd": wd, "dens": dens})

    design = _as_frame(windspeed_column, winddir_column, airdens_column)
    response = power_column.values
    fitted = LinearGAM(n_splines=n_splines).fit(design, response)

    # Wrap the prediction function in a closure to pack input variables
    def predict(windspeed_column, winddir_column, airdens_column):
        return fitted.predict(
            _as_frame(windspeed_column, winddir_column, airdens_column))

    return predict
def _fit_final_gam(self):
    """Regress the pseudo-outcome values on the original treatment values.

    Returns the fitted ``LinearGAM``: a single spline term on feature 0 with
    30 splines of order 3, smoothing penalty ``self.bandwidth`` and at most
    500 iterations.
    """
    treatment_spline = s(0, n_splines=30, spline_order=3)
    final_model = LinearGAM(treatment_spline, max_iter=500, lam=self.bandwidth)
    return final_model.fit(self.t_data, y=self.pseudo_out)
def fit_pygam_model(X_train: pandas.core.frame.DataFrame,
                    X_test: pandas.core.frame.DataFrame,
                    y_train: pandas.core.frame.DataFrame,
                    y_test: pandas.core.frame.DataFrame):
    '''
    Creates a general additive model LinearGAM (normally distributed errors)
    with grid search, reports fit metrics and plots each feature's partial
    dependence with its 95% confidence band.

    Hyperparameters searched: n_splines (3..19) and the lam regularization
    parameter (11 log-spaced values from 1e-3 to 1e3).

    Args:
        X_train: training features.
        X_test: test features.
        y_train: training response.
        y_test: test response; test metrics are skipped when it is empty.

    Returns:
        The grid-searched LinearGAM.
    '''
    from pygam import LinearGAM

    gam = LinearGAM().gridsearch(X_train.values, y_train,
                                 n_splines=np.arange(3, 20),
                                 lam=np.logspace(-3, 3, 11))
    # summary() prints the table itself; wrapping it in print() only added a
    # stray "None" line.
    gam.summary()

    y_train_predicted = gam.predict(X_train)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
    mae_train = mean_absolute_error(y_train, y_train_predicted)
    r2_train = r2_score(y_train, y_train_predicted)
    print("RMSE of training set is {}".format(rmse_train))
    # label fixed: this is the training MAE (was reported as "testing")
    print("MAE of training set is {}".format(mae_train))
    print("R2 score of training set is {}\n".format(r2_train))

    if len(y_test) > 0:
        # predict only when there is a test set to score against
        # NOTE(review): test predictions are floored while training
        # predictions are not — confirm this asymmetry is intended.
        y_test_predicted = np.floor(gam.predict(X_test))
        rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predicted))
        mae_test = mean_absolute_error(y_test, y_test_predicted)
        r2_test = r2_score(y_test, y_test_predicted)
        print("RMSE of testing set is {}".format(rmse_test))
        print("MAE of testing set is {}".format(mae_test))
        print("R2 score of testing set is {}\n".format(r2_test))

    # Visualize the feature significance and confidence intervals
    num_features = len(X_train.columns)
    fig = plt.figure(figsize=(18, 12))
    fig.subplots_adjust(hspace=0.4)
    p_values = gam.statistics_['p_values']
    for i in range(num_features):
        axs = fig.add_subplot(num_features, 1, i + 1)
        m = gam.generate_X_grid(term=i)
        # partial dependence of term i
        axs.plot(m[:, i], gam.partial_dependence(term=i, X=m))
        # 95% confidence band of term i
        axs.plot(m[:, i],
                 gam.partial_dependence(term=i, X=m, width=.95)[1],
                 c='r', ls='--')
        # p_values is ordered like the model terms (intercept last), so term i
        # is at index i; the previous i + 1 offset starred the wrong feature.
        axs.set_title(X_train.columns[i] + ('*' if p_values[i] < 0.05 else ''))

    return gam
def __init__(self, model_path, **kwargs):
    """Create a LinearGAM wrapper.

    Args:
        model_path: stored on the instance as ``self.model_path``.
        **kwargs: extra ``LinearGAM`` parameters; they are merged over the
            default ``{'n_splines': 25}`` and override it on conflict.
    """
    super().__init__()
    print('Using GeneralizedAdditive model.')
    self.model_params = {'n_splines': 25}
    self.model_path = model_path
    # caller-supplied settings take precedence over the default
    self.model_params.update(kwargs)
    self.model = LinearGAM(**self.model_params)
def find_parameters_evaluation(index_set, gene_expression, cell_count_aa):
    """Tune a LinearGAM once, then evaluate it with leave-one-out refits.

    The hyperparameters are grid-searched on the full data. The resulting
    model object is then refitted on every leave-one-out training split and
    used to predict the held-out sample.

    Args:
        index_set: positions of the ``gene_expression`` columns to use.
        gene_expression: DataFrame of features.
        cell_count_aa: response, indexed positionally.

    Returns:
        Tuple ``(mean lam, mean n_splines, spearman rho, p-value)`` where the
        means are taken over the per-split models and the correlation is
        between held-out values and their predictions.
    """
    predicted = []
    observed = []
    splines_per_fold = []
    lam_per_fold = []

    y = cell_count_aa
    X = gene_expression[gene_expression.columns[index_set]]

    loo = LeaveOneOut()
    loo.get_n_splits(X)

    # hyperparameter search on all samples
    gam = LinearGAM()
    gam = gam.gridsearch(X, y, n_splines=np.arange(10, 50),
                         lam=[0.4, 0.5, 0.6, 0.7, 0.8])

    # outer loop: one held-out sample per iteration
    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # refit the tuned model on the training split
        regr = gam.fit(X_train, y_train)

        # predict the held-out sample
        prediction_val = regr.predict(X_test)[0]
        predicted.append(prediction_val)
        observed.append(y_test[0])

        # record the parameters of the model used for this split
        splines_per_fold.append(regr.n_splines)
        lam_per_fold.append(regr.lam)

        print(test_index)
        print(str(prediction_val), " ", str(y_test[0]))

    # spearman correlation over all held-out predictions
    rho, pval = spearmanr(observed, predicted)

    lams_mean = np.array(lam_per_fold).mean()
    n_splines_mean = np.array(splines_per_fold).mean()
    return lams_mean, n_splines_mean, rho, pval
def GAM_model(df, feature_list):
    """Fit a six-spline LinearGAM to ``logerror`` and report in-sample fit.

    Features are min-max scaled to [0, 1] before fitting. The model uses one
    spline term for each of the first six feature columns.

    Args:
        df: DataFrame containing ``feature_list`` columns and ``logerror``.
        feature_list: names of the feature columns.

    Returns:
        Tuple ``(RMSE, R2, gam)`` with both metrics rounded to three decimals
        and computed on the training data.
    """
    from pygam import LinearGAM, s, f, te

    features = df[feature_list]
    target = df[['logerror']]

    scaler = MinMaxScaler(copy=True, feature_range=(0, 1)).fit(features)
    scaled_frame = pd.DataFrame(scaler.transform(features),
                                columns=features.columns.values)
    scaled_frame = scaled_frame.set_index([features.index.values])
    X_scaled = scaled_frame.to_numpy()
    y_train = target.to_numpy()

    # one spline term per feature column 0..5
    terms = s(0)
    for column in range(1, 6):
        terms = terms + s(column)
    gam = LinearGAM(terms)
    gam.gridsearch(X_scaled, y_train)

    comparison = pd.DataFrame(gam.predict(X_scaled))
    comparison['actual'] = y_train
    comparison.columns = ['predicted', 'actual']

    rmse_value = sqrt(mean_squared_error(comparison.actual,
                                         comparison.predicted))
    RMSE = float('{:.3f}'.format(rmse_value))
    R2 = float('{:.3f}'.format(r2_score(comparison.actual,
                                        comparison.predicted)))
    return RMSE, R2, gam
def _fit_gam(self):
    """Fit a GAM that predicts the outcome from the treatment and the GPS.

    The design matrix has the treatment values in column 0 and ``self.gps``
    in column 1; each gets its own spline term configured by
    ``self.n_splines`` and ``self.spline_order``.

    Returns:
        The fitted ``LinearGAM``.
    """
    design = np.column_stack((self.T.values, self.gps))
    outcome = np.asarray(self.y)

    def _spline(column):
        return s(column, n_splines=self.n_splines,
                 spline_order=self.spline_order)

    model = LinearGAM(
        _spline(0) + _spline(1),
        max_iter=self.max_iter,
        lam=self.lambda_,
    )
    return model.fit(design, outcome)
def get_importances(X, y, Xtest, ytest):
    """Score every predictor by how well it predicts the target on its own.

    For each column of ``X`` a single-spline ``LinearGAM`` without intercept
    is fitted and grid-searched on the training data, then evaluated on the
    matching column of ``Xtest``.

    Args:
        X: training DataFrame of predictors.
        y: training target.
        Xtest: test DataFrame with the same columns as ``X``.
        ytest: test target.

    Returns:
        dict mapping predictor name to the squared correlation between
        ``ytest`` and the single-predictor model's test prediction.
    """
    importances = {}
    # `.items()` instead of `.iteritems()`: the latter was removed in pandas 2.0
    for predictor, vals in X.items():
        gam = LinearGAM(s(0), fit_intercept=False)
        # NOTE(review): the plain fit precedes gridsearch in the original;
        # kept because an already-fitted model may take part in the search.
        gam.fit(vals, y)
        gam.gridsearch(vals, y)
        pred = gam.predict(Xtest[predictor])
        # define importances as the R2 for that factor alone
        R2 = np.corrcoef(ytest, pred)[0, 1]**2
        importances[predictor] = R2
    return importances
def fit_gam_plot_dependencies(df=None, features=None, target=None, basis_1=s, basis_2=False, summary=False):
    """Fit a LinearGAM with one or two terms and plot its partial dependencies.

    Args:
        df: DataFrame holding the feature and target columns.
        features: column name(s) selecting the features from ``df``.
        target: column name of the response.
        basis_1: term constructor for feature 0 (called as
            ``basis_1(0, lam=60)``).
        basis_2: optional term constructor for feature 1; falsy to omit it.
        summary: when True, print the model summary after fitting.

    Raises:
        ValueError: if ``basis_1`` is falsy. Previously a message was printed
            and the function then failed with a NameError on the undefined
            model.
    """
    if not basis_1:
        raise ValueError('no basis called for features.. error')

    X = df[features]
    y = df[target]

    terms = basis_1(0, lam=60)
    if basis_2:
        terms = terms + basis_2(1, lam=60)
    gam = LinearGAM(terms, fit_intercept=True).fit(X, y)

    if summary:
        # summary() prints the table itself; print() around it only emitted
        # an extra "None".
        gam.summary()
    plot_gam_partial_dependencies(gam, features, target)
def train(self, X, y, base_pred):
    """Train the ensemble GAM on the data and the base model predictions.

    The fitted estimator is stored in ``self.gam_model``.

    Args:
        X: (np.ndarray) Training features, shape (N, D)
        y: (np.ndarray) Training labels, shape (N, 1)
        base_pred: (dict of np.ndarray) Dictionary of base model predictions
            With keys (str) being model name, and values (np.ndarray) being
            predictions corresponds to X and y.
    """
    # ensemble design matrix and the matching GAM terms
    ensemble_features, gam_terms = self._build_ensemble_feature(X, base_pred)

    # define model
    self.gam_model = LinearGAM(gam_terms)

    # tune the smoothing penalty over a 100-point grid
    penalty_grid = self._build_lambda_grid(n_grid=100)
    self.gam_model.gridsearch(X=ens_feature_or(ensemble_features), y=y,
                              lam=penalty_grid, progress=False) if False else \
        self.gam_model.gridsearch(X=ensemble_features, y=y,
                                  lam=penalty_grid, progress=False)
def GAM(X, Y, factor = False):
    """Fit a mixed linear/spline LinearGAM and return train and test MSE.

    The data is split with ``train_test_split(X, Y, **options)``, standardised
    and expanded with ``PolynomialFeatures(1)``. Columns flagged ``'y'`` in the
    hard-coded ``linear`` list get a ``GAM_line`` term (plus a ``GAM_factor``
    term when ``factor`` is True); all other columns get a ``GAM_spline`` term.

    Args:
        X: feature matrix; after expansion it is expected to have one column
            per entry of the ``linear`` list (24).
        Y: target.
        factor: also add a factor term for every linear column.

    Returns:
        Tuple ``(MSE_train, MSE_test)``.
    """
    # --- splitting the dataset ---
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **options)

    # --- preprocessing ---
    # NB: No need for one-hot encoding – categorical columns are already binary!
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # --- creating a design matrix ---
    # Fit the expansion on the training split only and re-use it for the test
    # split, mirroring how the scaler is handled above.
    poly = PolynomialFeatures(1)
    X_train = poly.fit_transform(X_train)
    X_test = poly.transform(X_test)

    # Hand-picked per-column choice: 'y' = model linearly, 'n' = use a spline.
    linear = ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'y', 'n', 'y', 'n',
              'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
    # Direct comparison instead of rewriting the strings to '0'/'1' and
    # relying on a string-to-bool cast.
    linear = np.array(linear) == "y"

    gam_input = None
    for n, is_linear in enumerate(linear):
        if is_linear:
            terms = [GAM_line(n)]
            # NOTE(review): factor terms are only added for linear columns —
            # confirm this matches the intended model.
            if factor:
                terms.append(GAM_factor(n))
        else:
            terms = [GAM_spline(n)]
        for term in terms:
            if gam_input is None:
                gam_input = term
            else:
                gam_input += term

    gam = LinearGAM(gam_input, fit_intercept = False, max_iter = int(1E5))
    gam.fit(X_train, Y_train)

    Y_predict_train = gam.predict(X_train)
    Y_predict_test = gam.predict(X_test)

    # Flatten the targets so a column-shaped Y cannot broadcast against the
    # 1-D predictions into an (n, n) matrix.
    MSE_train = np.mean((Y_predict_train - np.ravel(Y_train))**2)
    MSE_test = np.mean((Y_predict_test - np.ravel(Y_test))**2)
    return MSE_train, MSE_test
def cleaner_linearGAM(x,y,**kwargs):
    """Flag points lying outside the 95% prediction interval of a spline fit.

    A penalised-spline ``LinearGAM`` is grid-searched on ``(x, y)`` and its
    prediction interval is evaluated at the input points.

    Args:
        x: 1-D list or array of abscissae.
        y: list or array of responses.
        **kwargs: accepted but not used by the current implementation.

    Returns:
        List of positions ``i`` for which ``y[i]`` is above the upper or
        below the lower bound.
    """
    from pygam import LinearGAM, l, s

    if isinstance(x, list):
        x = np.array(x)
    if isinstance(y, list):
        y = np.array(y)
    X = x.reshape(len(x), 1)

    # n_splines is deliberately left at the library default here.
    gam = LinearGAM(terms=s(0, basis='ps')).gridsearch(X, y)

    # interval evaluated on the input grid; column 0 = lower, 1 = upper
    bounds = gam.prediction_intervals(X, width=.95)

    outliers = []
    for i in range(len(y)):
        lower, upper = bounds[i, 0], bounds[i, 1]
        if y[i] > upper or y[i] < lower:
            outliers.append(i)
    return outliers
def get_gam_model(self, features: [Field], model_type=TYPE_LINEAR):
    """Build an unfitted GAM with one term per feature.

    Factor features get an ``f`` term; all others get an ``s`` term with
    ``self.num_splines`` splines.

    Args:
        features: the fields to model, in column order.
        model_type: ``TYPE_LINEAR`` or ``TYPE_LOGISTIC``.

    Returns:
        ``LinearGAM`` or ``LogisticGAM`` for the two known types, ``None``
        for any other ``model_type``.
    """
    def _term(position):
        if features[position].is_factor():
            return f(position)
        return s(position, n_splines=self.num_splines)

    model_spec = _term(0)
    for position in range(1, len(features)):
        model_spec += _term(position)

    if model_type == TYPE_LINEAR:
        return LinearGAM(model_spec)
    if model_type == TYPE_LOGISTIC:
        return LogisticGAM(model_spec)
    return None
def get_GAM_predictions(Xtrain, Ytrain, Xtest):
    """
    Grid-search a Linear GAM over the smoothing penalty and return its
    predictions for the test set.

    :param Xtrain: X values for training.
    :param Ytrain: Y values for training.
    :param Xtest: X values for validation.
    :return: Predictions from Linear GAM model for test dataset
    """
    # Create an array of lambda values to search
    lams = np.logspace(-3, 20, 35)

    # GAM search requires numpy arrays
    Xtrain_np = np.array(Xtrain, dtype=np.float64)
    Ytrain_np = np.array(Ytrain, dtype=np.float64)

    # Column indices per term type, in the order the terms enter the model.
    spline_cols = (99, 100)
    linear_cols = (3, 6, 8, 11, 7, 9, 12, 10, 14, 29, 15, 71, 17, 21, 107,
                   16, 68, 78, 61, 55, 31, 13, 37, 4, 5, 2)
    interaction_pairs = ((4, 5), (68, 78))

    pieces = [s(col) for col in spline_cols]
    pieces += [l(col) for col in linear_cols]
    pieces += [te(first, second) for first, second in interaction_pairs]

    terms = pieces[0]
    for piece in pieces[1:]:
        terms = terms + piece

    # Linear Generalised Additive Model
    model = LinearGAM(terms).gridsearch(Xtrain_np, Ytrain_np, lam=lams)
    return model.predict(Xtest)
def predict_gam(ad_group,date):
    """Tune a time-trend GAM for one ad group and print its prediction for ``date``.

    Reads ``data/ad_table.csv``, keeps the rows of ``ad_group``, and fits a
    ``LinearGAM`` of ``shown`` against the number of days since the group's
    first row, grid-searching ``lam`` and ``n_splines``. Results are printed;
    nothing is returned.

    Args:
        ad_group: value of the ``ad`` column to model.
        date: date (anything ``pd.to_datetime`` accepts) to predict for.
    """
    ads_file = 'data/ad_table.csv'
    df = pd.read_csv(ads_file, header=0, sep=',')
    # `infer_datetime_format` is deprecated in pandas 2.x and no longer needed
    df['date'] = pd.to_datetime(df['date'])
    splines = [5, 7, 10, 20, 30, 40, 45]
    lams = np.logspace(-3,3,7)

    if ad_group not in df['ad'].unique():
        print("Ad group does not exist")
        return

    df_ad_group_train = df[df['ad'] == ad_group]
    df_ad_group_train = df_ad_group_train.reset_index()
    # after reset_index, label 0 is the group's first row
    first_date = df_ad_group_train['date'][0]
    df_ad_group_train['time_period'] = (df_ad_group_train['date'] - first_date).dt.days
    X_train = df_ad_group_train[['time_period']].values
    y_train = df_ad_group_train['shown'].values

    # auto tuning
    gam = LinearGAM().gridsearch(X_train, y_train, lam=lams, n_splines=splines)
    print('==== Tuning for ad group %s - best generalized cross-validation %f ' % (ad_group, gam.statistics_['GCV']))
    tuning_result = (gam.lam[0][0], gam.n_splines[0], gam.statistics_['GCV'])
    predict_date = (pd.to_datetime(date) - first_date).days
    print("Auto tuning result=",tuning_result)
    print("Prediction for number of ads Shown for",ad_group,"on ",date,"=",gam.predict([[predict_date]]))
    print("Regression/Lambda value = ",gam.lam)
    print("n_splines=",gam.n_splines)
def get_importances(X, y, Xtest, ytest):
    """Score every predictor by how well it predicts the target on its own.

    For each column of ``X`` a single-spline ``LinearGAM`` without intercept
    is fitted and grid-searched on the training data, then evaluated on the
    matching column of ``Xtest``.

    Args:
        X: training DataFrame of predictors.
        y: training target.
        Xtest: test DataFrame with the same columns as ``X``.
        ytest: test target.

    Returns:
        dict mapping predictor name to the squared correlation between
        ``ytest`` and the single-predictor model's test prediction.
    """
    importances = {}
    # `.items()` instead of `.iteritems()`: the latter was removed in pandas 2.0
    for predictor, vals in X.items():
        gam = LinearGAM(s(0), fit_intercept=False)
        # NOTE(review): the plain fit precedes gridsearch in the original;
        # kept because an already-fitted model may take part in the search.
        gam.fit(vals, y)
        gam.gridsearch(vals, y)
        pred = gam.predict(Xtest[predictor])
        # define importances as the R2 for that factor alone
        R2 = np.corrcoef(ytest,pred)[0,1]**2
        importances[predictor] = R2
    return importances
import patsy as pt
import numpy as np
from plotly import tools
import plotly.offline as py
import plotly.graph_objs as go

# Script: fit a six-spline LinearGAM to the happiness dataset and build, for
# every predictor, plotly traces of its partial dependence and 95% band.
# NOTE(review): `pd`, `LinearGAM` and `s` are used below but not imported in
# the visible part of this snippet — presumably imported elsewhere; verify.

# Prep the dataset
data = pd.read_csv(
    "/home/dusty/Econ8310/DataSets/HappinessWorld.csv")

# Generate x and y matrices
# "-1" removes the intercept column from the patsy design matrix
eqn = """happiness ~ -1 + freedom + family + year + economy + health + trust"""
y,x = pt.dmatrices(eqn, data=data)

# Initialize and fit the model (one spline term per design-matrix column)
gam = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5))
gam = gam.gridsearch(np.asarray(x), y)

# Specify plot shape
titles = ['freedom', 'family', 'year', 'economy', 'health', 'trust']

fig = tools.make_subplots(rows=2, cols=3, subplot_titles=titles)
fig['layout'].update(height=800, width=1200, title='pyGAM', showlegend=False)

for i, title in enumerate(titles):
    # grid of values for term i, and the term's effect with its 95% band
    XX = gam.generate_X_grid(term=i)
    pdep, confi = gam.partial_dependence(term=i, width=.95)
    trace = go.Scatter(x=XX[:,i], y=pdep, mode='lines', name='Effect')
    ci1 = go.Scatter(x = XX[:,i], y=confi[:,0], line=dict(dash='dash', color='grey'), name='95% CI')
    ci2 = go.Scatter(x = XX[:,i], y=confi[:,1], line=dict(dash='dash', color='grey'), name='95% CI')
    # NOTE(review): the traces are not added to `fig` in the visible code —
    # the snippet appears to be cut off here.
from bokeh.plotting import figure, show
from bokeh.layouts import gridplot, row
import matplotlib.pyplot as plt

# Script: fit a LinearGAM of rock `area` on `peri`, `shape` and `perm` and
# compute the partial dependence with a 95% band.
# NOTE(review): `pd`, `np`, `LinearGAM` and `generate_X_grid` are used below
# but not imported in the visible part of this snippet; verify they are
# imported elsewhere.

# Importing data from the web
path = 'http://www.stat.cmu.edu/~larry/' \
       'all-of-nonpar/=data/rock.dat'
# columns are separated by runs of spaces, hence the regex separator and the
# python parsing engine
data = pd.read_csv(path, sep=' *', engine='python')

X = data[['peri','shape','perm']]
y = data['area']
# mean-centred response; only referenced by the commented-out plotting code
adjy = y - np.mean(y)

gam = LinearGAM(n_splines=10).gridsearch(X, y)
# NOTE(review): module-level generate_X_grid(gam) and partial_dependence(XX, ...)
# look like an older pyGAM call style than the term=... form — confirm against
# the installed version.
XX = generate_X_grid(gam)

# fig, axs = plt.subplots(1, 3)
titles = ['peri', 'shape', 'perm']
# for i, ax in enumerate(axs):
#     pdep, confi = gam.partial_dependence(XX, feature=i+1, width=.95)
#     ax.scatter(X[X.columns[i]], adjy, color='gray', edgecolors='none')
#     ax.plot(XX[:, i], pdep)
#     ax.plot(XX[:, i], confi[0], c='r', ls='--')
#     ax.set_title(titles[i])

pdep, confi = gam.partial_dependence(XX, width=.95)