import numpy as np
from pygam import LinearGAM, s


def gam_results(x, y, df, param, infection_time):
    """Fit a GAM to observed growth rates and compound them into level
    predictions, plus a 10-step forecast with 95% prediction-interval bounds."""
    gam = LinearGAM(s(0), lam=.5).fit(x, y)
    y_new = gam.predict(x)
    confi1 = gam.prediction_intervals(x, width=.95)

    # In-sample prediction: seed with the mean of the first three observations,
    # then compound the smoothed growth rate step by step.
    pred = np.zeros(x.shape[0])
    for i in np.arange(x.shape[0]):
        if i == 0:
            pred[i] = np.mean(df[param].iloc[0:3])
        else:
            # the same compounding applies before and after infection_time;
            # only the 'Positive' series gets the lagged shift below
            pred[i] = pred[i - 1] * y_new[i] + pred[i - 1]
    if param == 'Positive':
        pred = pred + np.concatenate(
            (np.zeros(infection_time),
             pred[0:(pred.shape[0] - infection_time)]), axis=0)

    # Out-of-sample forecast: compound the predicted growth rate and its 95%
    # prediction-interval bounds forward from the last observed value.
    x_forecast = np.arange(np.max(x), np.max(x) + 10)
    y_forecast = gam.predict(x_forecast)
    confi = gam.prediction_intervals(x_forecast, width=.95)
    forecast = np.zeros(x_forecast.shape[0])
    forecast_L = np.zeros(x_forecast.shape[0])
    forecast_U = np.zeros(x_forecast.shape[0])
    for i in np.arange(x_forecast.shape[0]):
        if i == 0:
            forecast[i] = df[param].iloc[-1]
            forecast_L[i] = forecast[i]
            forecast_U[i] = forecast[i]
        else:
            forecast[i] = forecast[i - 1] * y_forecast[i - 1] + forecast[i - 1]
            forecast_L[i] = forecast_L[i - 1] * confi[i - 1, 0] + forecast_L[i - 1]
            forecast_U[i] = forecast_U[i - 1] * confi[i - 1, 1] + forecast_U[i - 1]
    return [pred, forecast, forecast_L, forecast_U, y_new, confi1]
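# A minimal usage sketch (not from the original source): assumes x is a day
# index, y the observed daily growth rate of the tracked series, and df a
# frame holding the raw counts. The column name 'Positive' matches the branch
# above; the synthetic data and infection_time value are illustrative only.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
days = np.arange(60)
counts = np.cumsum(rng.poisson(5, size=60)) + 10         # toy cumulative counts
growth = np.diff(counts, prepend=counts[0]) / counts     # toy daily growth rate
toy_df = pd.DataFrame({'Positive': counts})

pred, fc, fc_lo, fc_hi, smooth, intervals = gam_results(
    x=days, y=growth, df=toy_df, param='Positive', infection_time=14)
print(fc[:3], fc_lo[:3], fc_hi[:3])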
import os
import numpy as np
from astropy.timeseries import LombScargle
from pygam import LinearGAM


def GAMfitter(indir, dat_st, T0=None):
    """Lomb-Scargle period search plus GAM fit to the phased light curve.
    Relies on helpers defined elsewhere in the module: get_index_of_max,
    get_index_of_min, get_phase_curve and phase_curve_extender."""
    fname = [i for i in os.listdir(indir) if dat_st in i]
    data = np.loadtxt(indir + fname[0])
    # Frequency grid covering periods of 2 to 1000 days
    frequency = np.linspace(1e-3, 0.5, int(1e6))
    power = LombScargle(data[:, 0], data[:, 1]).power(frequency=frequency)
    ind = get_index_of_max(power)  # best frequency

    if T0 is None:  # no preset T0: estimate an epoch of minimum brightness
        phs = get_phase_curve(data[:, 0], data[0, 0], 1 / frequency[ind])
        ext_phs, ext_mags = phase_curve_extender(phs, data[:, 1])
        gam = LinearGAM(n_splines=30).gridsearch(ext_phs, ext_mags)  # fit a GAM
        XX = gam.generate_X_grid(term=0, n=500)
        fit = gam.predict(XX)  # the fit on the grid
        minimal_val = max(fit)  # maximum magnitude = minimal brightness
        min_ind = get_index_of_min(abs(data[:, 1] - minimal_val))
        T0 = data[min_ind, 0]

    phs = get_phase_curve(data[:, 0], T0, 1 / frequency[ind])
    ext_phs, ext_mags = phase_curve_extender(phs, data[:, 1])
    gam = LinearGAM(n_splines=30).gridsearch(ext_phs, ext_mags)

    # Keep only points inside the 85% prediction interval, then redo the
    # period search and GAM fit on the filtered data.
    pred_int_vls = gam.prediction_intervals(phs, width=.85)
    cond = (data[:, 1] > pred_int_vls[:, 0]) & (data[:, 1] < pred_int_vls[:, 1])
    filtered_data = data[cond]
    power_f = LombScargle(filtered_data[:, 0],
                          filtered_data[:, 1]).power(frequency=frequency)
    ind_f = get_index_of_max(power_f)
    phs_f = get_phase_curve(filtered_data[:, 0], T0, 1 / frequency[ind_f])
    ext_phs, ext_mags = phase_curve_extender(phs_f, filtered_data[:, 1])
    gam_f = LinearGAM(n_splines=30).gridsearch(ext_phs, ext_mags)
    return filtered_data, gam_f, frequency[ind_f], T0
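# The four helpers GAMfitter calls are not included in this snippet. Below is
# a plausible minimal set, assuming phases are folded into [0, 1) and the
# extender pads one period on each side so the spline has support across the
# wrap-around at phase 0/1. Names match the calls above, but the behavior is
# inferred, not taken from the original source.
import numpy as np

def get_index_of_max(arr):
    return int(np.argmax(arr))

def get_index_of_min(arr):
    return int(np.argmin(arr))

def get_phase_curve(times, T0, period):
    # fold observation times into phases in [0, 1) relative to epoch T0
    return ((times - T0) / period) % 1.0

def phase_curve_extender(phs, mags):
    # repeat the curve at phase-1 and phase+1 to avoid edge effects
    ext_phs = np.concatenate([phs - 1.0, phs, phs + 1.0])
    ext_mags = np.tile(mags, 3)
    return ext_phs, ext_mags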
import numpy as np


def cleaner_linearGAM(x, y, **kwargs):
    """Return the indices of points falling outside the 95% prediction
    interval of a penalized-spline LinearGAM fit."""
    from pygam import LinearGAM, l, s

    if isinstance(x, list):
        x = np.array(x)
    if isinstance(y, list):
        y = np.array(y)
    X = x.reshape(len(x), 1)

    # A fixed n_splines (e.g. len(y)//5) was tried here because the automatic
    # gridsearch can over-smooth; the default is kept for now.
    gam = LinearGAM(terms=s(0, basis='ps')).gridsearch(X, y)

    # sample on the input grid and flag points outside the interval
    means = gam.predict(X)
    bounds = gam.prediction_intervals(X, width=.95)
    idx = [i for i in range(len(y))
           if (y[i] > bounds[i, 1] or y[i] < bounds[i, 0])]
    return idx
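# A short usage sketch on synthetic data with injected outliers (the data
# here are illustrative, not from the original source):
import numpy as np

rng = np.random.default_rng(42)
x = np.linspace(0, 10, 200)
y = np.sin(x) + rng.normal(0, 0.1, size=x.size)
y[[20, 80, 150]] += 3.0  # inject three obvious outliers

bad = cleaner_linearGAM(x, y)
x_clean = np.delete(x, bad)
y_clean = np.delete(y, bad)
print(f"flagged {len(bad)} of {len(y)} points")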
# In[5]:

gam = LinearGAM(s(0)).fit(X, Y)
gam.gridsearch(X, Y)

# Plot

# In[6]:

XX = gam.generate_X_grid(term=0)
pdep, confi = gam.partial_dependence(term=0, X=XX, width=0.95)

plt.figure()
plt.plot(XX, pdep)  # fit
plt.plot(XX, confi, c='r', ls='--')  # confidence interval
plt.plot(XX, gam.prediction_intervals(XX, width=.95),
         color='b', ls='--')  # 95% prediction interval
plt.scatter(X, Y, facecolor='gray', edgecolors='none', alpha=0.5)  # data
plt.xlabel('Age')
plt.ylabel('Brain feature')
plt.show()

# In[7]:

metric = 'jd'
X = df_pheno.loc[:, ['ageAtScan1_Years', 'mprage_antsCT_vol_TBV']]
Y = df_system.loc[:, metric]

# Estimate GAM with spline

# In[8]:
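# A sanity check one could run right after cell 6, before X and Y are
# reassigned in cell 7 (a small sketch, not part of the original notebook):
# the empirical coverage of the 95% prediction band, i.e. the fraction of
# observations that actually fall inside it.
import numpy as np

bounds = gam.prediction_intervals(X, width=.95)
inside = (Y >= bounds[:, 0]) & (Y <= bounds[:, 1])
print(f"empirical coverage: {np.mean(inside):.3f}  (nominal 0.95)")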
'''
# GAMs
# https://github.com/dswah/pyGAM
# https://codeburst.io/pygam-getting-started-with-generalized-additive-models-in-python-457df5b4705f
from pygam import LinearGAM, LogisticGAM

gam_model = LinearGAM().fit(d[['disp', 'wt']], d['mpg'])
print(gam_model.summary())

gam_predictions = gam_model.predict(d[['disp', 'wt']])
gam_mse = np.mean((gam_predictions - d['mpg'])**2)
print('MSE:', gam_mse)

# Plot the predictions with confidence intervals
plt.plot(list(d.index), gam_predictions, 'r--')
plt.plot(list(d.index),
         gam_model.prediction_intervals(d[['disp', 'wt']], width=.95),
         color='b', ls='--')
plt.scatter(list(d.index), d['mpg'], facecolor='gray', edgecolors='none')
plt.xlabel('Row Index')
plt.ylabel('mpg')
plt.title('GAM Prediction with 95% Confidence Interval')
plt.show()

# Plot with simulated posterior for response
for response in gam_model.sample(d[['disp', 'wt']], d['mpg'], quantity='y',
                                 n_draws=50, sample_at_X=d[['disp', 'wt']]):
    plt.scatter(list(d.index), response, alpha=0.03, color='k')
'''
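# The columns 'disp', 'wt' and 'mpg' suggest that d is the classic mtcars
# dataset; if so, it can be loaded for the block above roughly like this
# (the loader is an assumption, not part of the original):
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

# assumed data source: the R 'mtcars' dataset via statsmodels
d = sm.datasets.get_rdataset("mtcars", "datasets").data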
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn import metrics
from pygam import LinearGAM, s, l, f  # s, l and f are used by the eval'd term string


def GAMf(df, in_var, ex_vars, city, cut, pred_end='one_month',
         train_duration='all'):
    """
    Parameters
    ----------
    df: dataframe containing all variables of interest for the whole time of
        measurement
    in_var: independent variable
    ex_vars: list of explanatory variables
    city: name of specific city
    cut: string of the format '%m/%d/%Y' indicating the date where the
        training set ends & the test set starts
    pred_end: end of the prediction period; if 'one_month', pred_end is set
        to one month after the cut
    train_duration: int, indicating the number of months that should be used
        for training; defaults to 'all' -> all available data before the cut
        date will be used as training data

    Returns
    -------
    gam: fitted gam model instance
    model_statistics: vector containing the following information about the
        fitted model
        rmse: RMSE for the test set
        r_squared: pseudo R-squared for the fitted GAM model
        fac2: fraction of predictions that lie between 50% and 200% of the
            corresponding measurements
        test_len: number of observations in the test set
        train_len: number of observations in the training set
        ratio: ratio of prediction to true values for the test set
        avg_err: average error (mean of measured minus predicted values on
            the original scale)
    preds: a dataframe containing all explanatory variables, the independent
        variable, the predicted values & the absolute error divided by the
        average value of the pollution variables in the training set
    """
    # drop rows with NaN values for explanatory variables
    df = df.dropna(subset=ex_vars)
    # subset dataset to the given city
    df = df[df['city'] == city]
    # convert cut variable to a datetime object
    cut = datetime.strptime(cut, '%m/%d/%Y')

    # if pred_end has the default value, add one month to the cut date to get
    # the end of the test dataset; otherwise convert the given string
    if pred_end == 'one_month':
        pred_end = cut + relativedelta(months=+1)
    else:
        pred_end = datetime.strptime(pred_end, '%m/%d/%Y')

    # determine the subset used for training based on the training duration
    if train_duration == 'all':
        df_train = df[df.index < cut]
    else:
        train_start = cut - relativedelta(months=+train_duration)
        df_train = df[df.index < cut]
        df_train = df_train[df_train.index > train_start]
    df_train = df_train.replace([np.inf, -np.inf], np.nan)
    df_train = df_train.dropna(subset=ex_vars)

    # determine the subset used for testing
    df_test = df[df.index > cut]
    df_test = df_test[df_test.index < pred_end]

    # extract values for the independent and explanatory variables
    train_X = df_train[ex_vars].values
    train_y = np.log(df_train[in_var].values)
    test_X = df_test[ex_vars].values
    test_y = np.log(df_test[in_var].values)

    # check that the test and training sets contain observations
    if (len(test_y) != 0) and (len(train_y) != 0):
        # generate the term string for the GAM: factor terms for categorical
        # variables, linear terms for wind speed, penalized splines otherwise
        string = str()
        length = 1 if isinstance(ex_vars, str) else len(ex_vars)
        for i in range(0, length):
            if (ex_vars[i] in ['weekday', 'month', 'season', 'hour',
                               'new_year', 'daytime']) and (len(train_y) > 300):
                string = string + "+f(" + str(i) + ")"
            elif 'ws' in ex_vars[i]:
                string = string + '+l(' + str(i) + ')'
            else:
                string = string + '+s(' + str(i) + ", lam = 0.6, basis = 'ps')"
        string = string[1:]

        # specify and fit the GAM model
        gam = LinearGAM(eval(string))
        gam.fit(train_X, train_y)
        y_pred = gam.predict(test_X)

        # cap predictions at the maximum observed value in the training set
        max_value = train_y.max()
        y_pred[y_pred > max_value] = max_value

        # calculate model statistics
        ratio = np.mean(y_pred / test_y)
        rmse = np.sqrt(metrics.mean_squared_error(np.exp(test_y),
                                                  np.exp(y_pred)))
        avg_err = np.mean(np.exp(test_y) - np.exp(y_pred))
        r_squared = list(gam.statistics_['pseudo_r2'].items())[0][1]
        pred_ratio = y_pred / test_y
        fac2 = np.mean((pred_ratio > 0.5) & (pred_ratio < 2))

        # dataframe with independent & dependent variables, prediction,
        # prediction error and 95% prediction-interval bounds
        preds = df_test.copy()[ex_vars]
        preds['true'] = np.exp(test_y)
        preds['y_pred'] = np.exp(y_pred)
        preds['err'] = abs(preds['true'] - preds['y_pred']) / np.mean(train_y)
        confidence = gam.prediction_intervals(test_X)  # width defaults to 0.95
        preds['lower'] = np.exp(confidence[:, 0])
        preds['upper'] = np.exp(confidence[:, 1])
    else:
        # return NaN and give a warning if training or test data are missing
        print('Problem with test and/or training data length for the station '
              + city + ' in the month of ' + str(cut.month))
        print('Training Length: ' + str(len(train_y)) + ' Test Length: '
              + str(len(test_y)))
        rmse = gam = ratio = preds = avg_err = r_squared = fac2 = float("NaN")

    # calculate the length of the test & training sets
    test_len = len(test_X)
    train_len = len(train_X)
    model_statistics = [rmse, r_squared, fac2, test_len, train_len, ratio,
                        avg_err]
    return (gam, model_statistics, preds)
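# The eval'd term string above is only a convenience for composing pygam
# terms. For three hypothetical explanatory variables ['hour', 'ws_mean',
# 'temp'] (names invented for illustration), it expands to this equivalent
# explicit expression:
from pygam import LinearGAM, s, l, f

# 'hour' is categorical -> factor term; 'ws_mean' contains 'ws' -> linear
# term; 'temp' falls through to a penalized spline
terms = f(0) + l(1) + s(2, lam=0.6, basis='ps')
gam = LinearGAM(terms)  # same as LinearGAM(eval("f(0)+l(1)+s(2, lam = 0.6, basis = 'ps')"))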
import numpy as np
from pygam import LinearGAM, s, l, te
from pygam.terms import TermList


class GAMEnsemble(EnsembleModel):
    """Implements GAM ensemble in [1]."""

    def __init__(self, nonlinear_ensemble=False, residual_process=True):
        """Initializer.

        Args:
            nonlinear_ensemble: (bool) Whether to use a nonlinear term to
                transform the base models.
            residual_process: (bool) Whether to model the residual process.
        """
        model_name = (
            "Generalized Additive Ensemble" if residual_process
            else "{} Stacking".format(
                "Nonlinear" if nonlinear_ensemble else "Linear"))
        super().__init__(model_name)
        self.gam_model = None
        self.nonlinear_ensemble = nonlinear_ensemble
        self.model_residual = residual_process

    def train(self, X, y, base_pred):
        """Trains the ensemble model based on data and base predictions.

        Adds value to the class attribute "gam_model".

        Args:
            X: (np.ndarray) Training features, shape (N, D)
            y: (np.ndarray) Training labels, shape (N, 1)
            base_pred: (dict of np.ndarray) Dictionary of base model
                predictions, with keys (str) being model names and values
                (np.ndarray) being predictions corresponding to X and y.
        """
        # build feature array and gam terms
        ens_feature, feature_terms = self._build_ensemble_feature(X, base_pred)

        # define model
        self.gam_model = LinearGAM(feature_terms)

        # additional fine-tuning of the smoothing penalties
        lam_grid = self._build_lambda_grid(n_grid=100)
        self.gam_model.gridsearch(X=ens_feature, y=y,
                                  lam=lam_grid, progress=False)

    def predict(self, X, base_pred):
        """Predicts labels based on features and base models.

        Args:
            X: (np.ndarray) Training features, shape (N, D)
            base_pred: (dict of np.ndarray) Dictionary of base model
                predictions, with keys (str) being model names and values
                (np.ndarray) being predictions corresponding to X and y.

        Returns:
            (np.ndarray) ensemble prediction and variance

        Raises:
            (ValueError) If self.gam_model is empty.
        """
        if not self.gam_model:
            raise ValueError("Attribute gam_model empty. "
                             "Model was not trained properly.")

        # build feature array and gam terms
        ens_feature, _ = self._build_ensemble_feature(X, base_pred)

        # prediction; the variance is recovered from the half-width of the
        # 95% prediction interval (95% ~ mean +/- 2 sd)
        prediction = self.gam_model.predict(ens_feature)
        prediction_var = ((self.gam_model.prediction_intervals(
            ens_feature, width=.95)[:, 1] - prediction) / 2) ** 2

        return prediction, prediction_var

    def _build_ensemble_feature(self, X, base_pred):
        """Builds feature array and corresponding GAM TermList.

        Terms corresponding to X will be a summation of dimension-wise
        splines, plus a tensor-product term across all dimensions.
        """
        ensemble_term_func = s if self.nonlinear_ensemble else l

        ens_feature = np.asarray(list(base_pred.values())).T
        term_list = [ensemble_term_func(dim_index)
                     for dim_index in range(ens_feature.shape[1])]

        # optionally, add a residual process over X
        if self.model_residual:
            # build gam terms
            term_list += [s(dim_index)
                          for dim_index in range(
                              ens_feature.shape[1],
                              ens_feature.shape[1] + X.shape[1])]
            if X.shape[1] > 1:
                term_list += [te(*list(
                    ens_feature.shape[1] + np.array(range(X.shape[1]))))]
            # update features
            ens_feature = np.concatenate([ens_feature, X], axis=1)

        gam_feature_terms = TermList(*term_list)

        return ens_feature, gam_feature_terms

    def _build_lambda_grid(self, n_grid=100):
        # count the actual number of penalized terms in each nonlinear term
        # (e.g. te(0, 1) will actually have two terms)
        n_terms = np.sum([len(model_term._terms) if model_term.istensor else 1
                          for model_term in self.gam_model.terms])
        lam = np.random.rand(n_grid, n_terms)
        # rescale to between (0, 1), then map to a log-scale grid in (e^-3, e^3)
        lam_norm = (lam - np.min(lam)) / (np.max(lam) - np.min(lam))
        return np.exp((lam_norm - 0.5) * 6)
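# A rough usage sketch with synthetic data, assuming the surrounding module
# provides the EnsembleModel base class (the two "base models" here are
# stand-ins invented for illustration):
import numpy as np

rng = np.random.default_rng(0)
X = rng.uniform(size=(200, 2))
y = np.sin(X[:, 0]) + X[:, 1] + rng.normal(0, 0.1, 200)

# stand-in base model predictions keyed by model name
base_pred = {"model_a": y + rng.normal(0, 0.2, 200),
             "model_b": 0.8 * y}

ensemble = GAMEnsemble(nonlinear_ensemble=False, residual_process=True)
ensemble.train(X, y, base_pred)
mean, var = ensemble.predict(X, base_pred)
print(mean[:3], np.sqrt(var[:3]))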
lams = np.logspace(-4, 2, 10)  # candidate smoothing penalties (must be positive)
gam = LinearGAM(n_splines=20).gridsearch(X_train, y_train, lam=lams)
gam.summary()

# <div class="alert alert-block alert-warning"><b>Running the test:</b>
# Although 20% of the data was previously selected, for the purposes of this
# example and to make the plot easier to read, the prediction is made for 100
# observations of X_test. </div>

# In[16]:

"""Take 100 values to predict"""
predictions = gam.predict(X_test[:100])
xsa = range(len(predictions))

IC = 0.95  # prediction-interval width
plt.plot(xsa, predictions, 'r', xsa, y_test[:100], '--k', lw=1.5)
plt.plot(xsa, gam.prediction_intervals(X_test[0:100], width=IC),
         color='gray', ls='-.', lw=1)
plt.legend(('Prediction', 'Real', f'{IC*100:.2f}% prediction intervals'))
plt.title(f"{IC*100:.2f}% prediction interval")

# <div class="alert alert-block alert-warning"><b>Preparing results:</b>
# The information is arranged to show the prediction in a DataFrame, and the
# error column is added. </div>

# In[17]:

y_test = pd.DataFrame(y_test).reset_index(drop=True)
y_predictions = pd.DataFrame(predictions).reset_index(drop=True)
table_predict = pd.concat([y_test, y_predictions], axis=1)
table_predict.columns = ['Calidad_real', 'Calidad_predecida']
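# The markdown cell above promises an error column; a plausible completion
# (the column name and the absolute-error definition are assumptions):
table_predict['Error'] = (table_predict['Calidad_real']
                          - table_predict['Calidad_predecida']).abs()
table_predict.head()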
import os
import numpy as np
import matplotlib.pyplot as plt
from pygam import LinearGAM


def make_plot(plot_dir, site, df_flx, df_met, pft, fp):

    K_TO_C = 273.15

    fig = plt.figure(figsize=(14, 4))
    fig.subplots_adjust(hspace=0.1)
    fig.subplots_adjust(wspace=0.1)
    plt.rcParams['text.usetex'] = False
    plt.rcParams['font.family'] = "sans-serif"
    plt.rcParams['font.sans-serif'] = "Helvetica"
    plt.rcParams['axes.labelsize'] = 14
    plt.rcParams['font.size'] = 14
    plt.rcParams['legend.fontsize'] = 14
    plt.rcParams['xtick.labelsize'] = 14
    plt.rcParams['ytick.labelsize'] = 14

    almost_black = '#262626'
    # change the tick colors also to the almost black
    plt.rcParams['ytick.color'] = almost_black
    plt.rcParams['xtick.color'] = almost_black
    # change the text colors also to the almost black
    plt.rcParams['text.color'] = almost_black
    # Change the default axis colors from black to a slightly lighter black,
    # and a little thinner (0.5 instead of 1)
    plt.rcParams['axes.edgecolor'] = almost_black
    plt.rcParams['axes.labelcolor'] = almost_black

    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)

    # Mask bad-quality data
    df_met.where(df_flx.Qle_qc == 1, inplace=True)
    df_met.where(df_flx.Qh_qc == 1, inplace=True)
    df_flx.where(df_flx.Qle_qc == 1, inplace=True)
    df_flx.where(df_flx.Qh_qc == 1, inplace=True)

    # Mask dew
    df_met.where(df_flx.Qle > 0., inplace=True)
    df_flx.where(df_flx.Qle > 0., inplace=True)

    df_flx.dropna(inplace=True)
    df_met.dropna(inplace=True)

    if len(df_flx) > 0 and len(df_met) > 0:
        print(site, len(df_flx), len(df_met))

        alpha = 0.07

        # "Midday" data only
        df_flx = df_flx.between_time("09:00", "13:00")
        df_met = df_met.between_time("09:00", "13:00")

        ax1.plot(df_met.SWdown, df_flx.Qle, ls=" ", marker="o",
                 color="salmon", alpha=alpha)
        ax1.plot(df_met.SWdown, df_flx.Qh, ls=" ", marker="o",
                 color="royalblue", alpha=alpha)

        gam = LinearGAM(n_splines=20).gridsearch(df_met.SWdown, df_flx.Qle)
        XX = gam.generate_X_grid(term=0)
        ax1.plot(XX, gam.predict(XX), color="salmon", ls='-', lw=2.0,
                 label="Qle")
        ax1.plot(XX, gam.prediction_intervals(XX, width=.95),
                 color='salmon', ls='--')

        for ii in range(len(df_met)):
            print("%f,%f,%f,%f,%s" % (df_met.SWdown[ii], df_flx.Qle[ii],
                                      df_flx.Qh[ii],
                                      df_met.Tair[ii] - K_TO_C, pft), file=fp)

        gam = LinearGAM(n_splines=20).gridsearch(df_met.SWdown, df_flx.Qh)
        XX = gam.generate_X_grid(term=0)
        ax1.plot(XX, gam.predict(XX), color="royalblue", ls='-', lw=2.0,
                 label="Qh")
        ax1.plot(XX, gam.prediction_intervals(XX, width=.95),
                 color='royalblue', ls='--')

        ax2.plot(df_met.Tair - K_TO_C, df_flx.Qle, ls=" ", marker="o",
                 color="salmon", alpha=alpha, label="Qle")
        ax2.plot(df_met.Tair - K_TO_C, df_flx.Qh, ls=" ", marker="o",
                 color="royalblue", alpha=alpha, label="Qh")

        gam = LinearGAM(n_splines=20).gridsearch(df_met.Tair - K_TO_C,
                                                 df_flx.Qle)
        XX = gam.generate_X_grid(term=0)
        ax2.plot(XX, gam.predict(XX), color="salmon", ls='-', lw=2.0)
        ax2.plot(XX, gam.prediction_intervals(XX, width=.95),
                 color='salmon', ls='--')

        gam = LinearGAM(n_splines=20).gridsearch(df_met.Tair - K_TO_C,
                                                 df_flx.Qh)
        XX = gam.generate_X_grid(term=0)
        ax2.plot(XX, gam.predict(XX), color="royalblue", ls='-', lw=2.0)
        ax2.plot(XX, gam.prediction_intervals(XX, width=.95),
                 color='royalblue', ls='--')

        plt.setp(ax2.get_yticklabels(), visible=False)
        ax1.set_xlim(0, 1300)
        ax1.set_ylim(0, 1000)
        ax2.set_xlim(0, 45)
        ax2.set_ylim(0, 1000)
        ax1.set_xlabel("SW down (W m$^{-2}$)")
        ax2.set_xlabel("Tair (deg C)")
        ax1.set_ylabel("Daytime flux (W m$^{-2}$)")
        ax1.legend(numpoints=1, loc="best")

        # fig.savefig(os.path.join(plot_dir, "%s.pdf" % (site)),
        #             bbox_inches='tight', pad_inches=0.1)
        fig.savefig(os.path.join(plot_dir, "%s.png" % (site)),
                    bbox_inches='tight', pad_inches=0.1, dpi=100)
fig.update_traces(marker=dict(size=2,
                              line=dict(width=2, color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()

#%%
# pyGAM
# train: spline term constrained to be monotonically increasing
gam = LinearGAM(s(0, constraints="monotonic_inc"),
                n_splines=25).gridsearch(X_train.reshape((-1, 1)),
                                         y_train.reshape((-1, 1)))

# predict
XX = gam.generate_X_grid(term=0, n=500)
y = gam.predict(XX)
y_pred = gam.predict(X_test)
y_CI = gam.prediction_intervals(XX, width=.95)

#%%
# plot prediction and 95% prediction intervals
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=XX.reshape((-1, )),
               y=y,
               name="Prediction",
               line=dict(color="firebrick", width=1)))
fig.add_trace(
    go.Scatter(x=XX.reshape((-1, )),
               y=y_CI[:, 0],
               name="95% Confidence",
               line=dict(color="green", width=1, dash="dash")))
fig.add_trace(
    go.Scatter(x=XX.reshape((-1, )),
               y=y_CI[:, 1],
               name="95% Confidence",
               line=dict(color="green", width=1, dash="dash")))
fig.show()
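# Since the spline term is constrained with constraints="monotonic_inc", the
# fitted curve should be non-decreasing on the grid. A quick check (a sketch,
# reusing gam and XX from above; not part of the original):
import numpy as np

fitted = gam.predict(XX)
assert np.all(np.diff(fitted) >= -1e-8), "fit violates monotonic_inc"
print("fitted curve is non-decreasing across the grid")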
# -*- coding: utf-8 -*-
"""
@author: Christian Winkler
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pygam import LinearGAM

example_data = pd.read_csv("example_data.csv")

y = example_data['head'].values
X = example_data['age'].values

gam = LinearGAM(n_splines=4).fit(X, y)  # your fitted model

# change the resolution of the X grid
XX = gam.generate_X_grid(term=0, n=20)

plt.figure(figsize=(10, 8))
plt.scatter(X, y)
# 2.5%, 50% and 97.5% prediction quantiles
plt.plot(XX, gam.prediction_intervals(XX, quantiles=[.025, .5, .975]),
         color="k")
plt.savefig("pygam_example_2.png")
plt.show()
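# For reference: prediction_intervals accepts either width or explicit
# quantiles; width=.95 is equivalent to quantiles=[.025, .975], and adding .5
# as above also draws the median. A minimal self-contained check on synthetic
# data (illustrative only):
import numpy as np
from pygam import LinearGAM

rng = np.random.default_rng(1)
X = np.linspace(0, 1, 100)
y = X ** 2 + rng.normal(0, 0.05, 100)

gam = LinearGAM(n_splines=10).fit(X, y)
by_width = gam.prediction_intervals(X, width=.95)
by_quant = gam.prediction_intervals(X, quantiles=[.025, .975])
print(np.allclose(by_width, by_quant))  # True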