def run_GAM(X, Y, get_importance=False, n_splines=20, folds=10):
    # set up GAM
    formula = s(0, n_splines)
    for i in range(1, X.shape[1]):
        formula = formula + s(i, n_splines)
    gam = LinearGAM(formula)
    gam.fit(X, X.iloc[:, 0])  # dummy fit to validate the formula against X

    # run full model
    GAM_results = {}
    for name, y in Y.items():  # .iteritems() was removed in pandas 2.0
        print("\nFitting for %s\n" % name)
        CV = BalancedKFold(folds)
        importances = {k: [] for k in X.columns}
        pred = np.zeros(y.shape[0])
        for train, test in CV.split(X, y):
            Xtrain = X.iloc[train, :]
            ytrain = y.iloc[train]
            Xtest = X.iloc[test, :]
            ytest = y.iloc[test]
            gam = LinearGAM(formula)
            gam.gridsearch(Xtrain, ytrain)

            # out-of-fold predictions
            p = gam.predict(Xtest)
            if len(p.shape) > 1:
                p = p[:, 0]
            pred[test] = p

            if get_importance:
                # importances, defined as the predictive ability of each
                # variable on its own
                importance_out = get_importances(Xtrain, ytrain, Xtest, ytest)
                for k, v in importance_out.items():
                    importances[k].append(v)

        cv_scores = [{'r': np.corrcoef(y, pred)[0, 1],
                      'R2': np.corrcoef(y, pred)[0, 1]**2,
                      'MAE': mean_absolute_error(y, pred)}]

        # in-sample
        gam.gridsearch(X, y)
        in_pred = gam.predict(X)
        in_scores = [{'r': np.corrcoef(y, in_pred)[0, 1],
                      'R2': np.corrcoef(y, in_pred)[0, 1]**2,
                      'MAE': mean_absolute_error(y, in_pred)}]

        GAM_results[name] = {'scores_cv': cv_scores,
                             'scores_insample': in_scores,
                             'pred_vars': X.columns,
                             'importances': importances,
                             'model': gam}
    return GAM_results
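# A minimal usage sketch for run_GAM, left commented because it depends on the
# project-specific BalancedKFold and get_importances helpers used above; X is
# assumed to be a pandas DataFrame of predictors and Y a DataFrame with one
# column per outcome (hypothetical names):
# results = run_GAM(X, Y, get_importance=True, n_splines=10, folds=5)
# print(results['outcome_a']['scores_cv'][0]['R2'])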
def GAM(X, Y, factor=False):
    """SPLITTING THE DATASET"""
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **options)

    """PREPROCESSING"""
    # NB: No need for one-hot encoding – categorical columns are already binary!
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    """CREATING A DESIGN MATRIX"""
    poly = PolynomialFeatures(1)
    X_train = poly.fit_transform(X_train)
    X_test = poly.transform(X_test)

    linear = ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'y', 'n', 'y', 'n',
              'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
    # for feature in X_train.T:
    #     unique = np.unique(feature)
    #     if len(unique) < 6:
    #         linear.append("n")
    #     else:
    #         idx = np.argsort(feature)
    #         plt.plot(feature[idx], Y.squeeze()[idx])
    #         plt.show()
    #         linear.append(input("Linear?\t"))
    # boolean mask: True where the feature should enter the model linearly
    linear = np.array(linear) == "y"

    gam_input = None
    for n, is_linear in enumerate(linear):
        if gam_input is not None:
            if is_linear:
                gam_input += GAM_line(n)
                if factor:
                    gam_input += GAM_factor(n)
            else:
                gam_input += GAM_spline(n)
        else:
            if is_linear:
                gam_input = GAM_line(n)
                if factor:
                    gam_input += GAM_factor(n)
            else:
                gam_input = GAM_spline(n)

    gam = LinearGAM(gam_input, fit_intercept=False, max_iter=int(1E5))
    gam.fit(X_train, Y_train)

    Y_predict_train = gam.predict(X_train)
    Y_predict_test = gam.predict(X_test)

    MSE_train = np.mean((Y_predict_train - Y_train)**2)
    MSE_test = np.mean((Y_predict_test - Y_test)**2)
    return MSE_train, MSE_test
def fit_pygam_model(X_train: pandas.core.frame.DataFrame,
                    X_test: pandas.core.frame.DataFrame,
                    y_train: pandas.core.frame.DataFrame,
                    y_test: pandas.core.frame.DataFrame):
    '''
    Fits a LinearGAM (normally distributed errors) with a grid search over
    the n_splines and lam regularization hyperparameters, prints train/test
    metrics, and plots each feature's partial dependence with confidence
    intervals.
    '''
    from pygam import LinearGAM
    gam = LinearGAM().gridsearch(X_train.values, y_train,
                                 n_splines=np.arange(3, 20),
                                 lam=np.logspace(-3, 3, 11))
    gam.summary()  # summary() prints directly and returns None

    y_train_predicted = gam.predict(X_train)
    # floor the test predictions, presumably for a count-valued target
    y_test_predicted = np.floor(gam.predict(X_test))

    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
    mae_train = mean_absolute_error(y_train, y_train_predicted)
    r2_train = r2_score(y_train, y_train_predicted)
    print("RMSE of training set is {}".format(rmse_train))
    print("MAE of training set is {}".format(mae_train))
    print("R2 score of training set is {}\n".format(r2_train))

    if len(y_test) > 0:
        rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predicted))
        mae_test = mean_absolute_error(y_test, y_test_predicted)
        r2_test = r2_score(y_test, y_test_predicted)
        print("RMSE of testing set is {}".format(rmse_test))
        print("MAE of testing set is {}".format(mae_test))
        print("R2 score of testing set is {}\n".format(r2_test))

    '''Visualize the feature significance and confidence intervals'''
    num_features = len(X_train.columns)
    fig = plt.figure(figsize=(18, 12))
    fig.subplots_adjust(hspace=0.4)

    cnt = 1
    # one p-value per term, with the intercept term last
    p_values = gam.statistics_['p_values']

    for i in range(num_features):
        axs = fig.add_subplot(num_features, 1, cnt)
        m = gam.generate_X_grid(term=i)
        # partial-dependence curve for term i
        axs.plot(m[:, i], gam.partial_dependence(term=i, X=m))
        # 95% confidence intervals
        axs.plot(m[:, i], gam.partial_dependence(term=i, X=m, width=.95)[1],
                 c='r', ls='--')
        axs.set_title(X_train.columns[i] + ('*' if p_values[i] < 0.05 else ''))
        cnt += 1
def check_significance(x, y, x_test, cols, col_add):
    if len(cols) == 0:
        return True
    model = LinearGAM().gridsearch(x[cols].values, y, progress=False)
    predictions1 = model.predict(x_test[cols].values)
    model = LinearGAM().gridsearch(x[cols + [col_add]].values, y, progress=False)
    predictions2 = model.predict(x_test[cols + [col_add]].values)
    test_stats = wilcoxon(predictions1, predictions2)
    return test_stats.pvalue < 0.05
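# A hedged sketch of driving check_significance as a forward-selection loop on
# synthetic data (hypothetical names; assumes pygam.LinearGAM and
# scipy.stats.wilcoxon are imported, as the function above requires):
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
candidates = ['a', 'b', 'c']
x = pd.DataFrame(rng.normal(size=(100, 3)), columns=candidates)
y = x['a'].values + 0.1 * rng.normal(size=100)
x_test = pd.DataFrame(rng.normal(size=(50, 3)), columns=candidates)

selected = []
for col in candidates:
    # keep the column only if adding it shifts the test predictions
    # significantly (Wilcoxon signed-rank, p < 0.05)
    if check_significance(x, y, x_test, selected, col):
        selected.append(col)
print(selected)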
def get_surrogate_predictions(X, y, w, pred_mask=None):
    if pred_mask is None:
        pred_mask = np.ones(len(y), dtype=bool)
        fit_mask = pred_mask
    else:
        fit_mask = ~pred_mask

    # get surrogates
    model_1 = LinearGAM()
    model_1.fit(X[fit_mask & (w == 1), :], y[fit_mask & (w == 1)])
    mu_1_plug = model_1.predict(X[pred_mask, :])

    model_0 = LinearGAM()
    model_0.fit(X[fit_mask & (w == 0), :], y[fit_mask & (w == 0)])
    mu_0_plug = model_0.predict(X[pred_mask, :])

    return mu_0_plug, mu_1_plug
def smoother_linearGAM(x, y, X, **kwargs):
    from pygam import LinearGAM, l, s
    if isinstance(x, list):
        x = np.array(x)
    x = x.reshape(len(x), 1)
    if isinstance(y, list):
        y = np.array(y)
    if isinstance(X, list):
        X = np.array(X)
    if X is None:
        X = x.reshape(len(x), 1)
    else:
        X = X.reshape(len(X), 1)

    # An explicit n_splines (e.g. len(y) / 5) was used previously because the
    # automatic choice was too smooth; the gridsearch below replaces it.
    gam = LinearGAM(terms=s(0, basis='ps')).gridsearch(x, y)

    # sample on the input grid
    means = gam.predict(X)
    return means
def GAM(X, Y):
    """SPLITTING THE DATASET"""
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **options)

    """PREPROCESSING"""
    # NB: No need for one-hot encoding – categorical columns are already binary!
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    """CREATING A DESIGN MATRIX"""
    poly = PolynomialFeatures(1)
    X_train = poly.fit_transform(X_train)
    X_test = poly.transform(X_test)

    gam_input = None
    for n in range(X_train.shape[1]):
        if gam_input is not None:
            gam_input += GAM_spline(n)
        else:
            gam_input = GAM_spline(n)

    gam = LinearGAM(gam_input).fit(X_train, Y_train)
    Y_predict = gam.predict(X_test)
    # threshold the regression output at 0.5 for binary classification
    Y_predict[Y_predict >= 0.5] = 1
    Y_predict[Y_predict < 0.5] = 0

    accuracy = (Y_predict.squeeze() == Y_test.squeeze()).astype(int)
    accuracy = np.sum(accuracy) / accuracy.shape[0]
    return accuracy
def test_if_learner():
    # get data without noise
    X, y, w, ite, p, bs = make_te_data(n=200, noise=False)

    # get surrogate predictions to compare against po predictions
    mu_0_plug, mu_1_plug = get_surrogate_predictions(X, y, w)

    # get surrogate predictions for two folds as inside the iflearner
    splitter = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    idx_list = []
    for train_index, test_index in splitter.split(X, w):
        idx_list.append((train_index, test_index))

    fold2_mask = np.zeros(200, dtype=bool)
    fold2_mask[idx_list[0][1]] = 1
    mu_0, mu_1 = np.zeros(200), np.zeros(200)
    mu_0[~fold2_mask], mu_1[~fold2_mask] = get_surrogate_predictions(
        X, y, w, pred_mask=~fold2_mask)
    mu_0[fold2_mask], mu_1[fold2_mask] = get_surrogate_predictions(
        X, y, w, pred_mask=fold2_mask)

    pseudo_outcome = eif_transformation_CATE(y, w, p, mu_0, mu_1)

    # make second stage model
    t_model = LinearGAM()
    t_model.fit(X, pseudo_outcome)
    te_debiased = t_model.predict(X)

    # fit if learner
    if_learner = IFLearnerTE(LinearGAM(), n_folds=2, random_state=42,
                             fit_base_model=True)
    if_learner.fit(X, y, w, p)
    te, mu_0, mu_1 = if_learner.predict(X, return_po=True)

    # test outcomes
    np.testing.assert_almost_equal(te, te_debiased)
    np.testing.assert_almost_equal(mu_0, mu_0_plug)
    np.testing.assert_almost_equal(mu_1, mu_1_plug)
    np.testing.assert_almost_equal(if_learner.predict(X), te_debiased)

    with pytest.raises(ValueError):
        # predicting po when base model not fitted should not be possible
        if_learner = IFLearnerTE(LinearGAM(), n_folds=2, random_state=42)
        if_learner.fit(X, y, w, p)
        te, mu_0, mu_1 = if_learner.predict(X, return_po=True)

    with pytest.warns(UserWarning):
        # warning raised if only one fold
        if_learner = IFLearnerTE(LinearGAM(), n_folds=1, random_state=42)
        if_learner.fit(X, y, w, p)

    # check that binary_y setting also works (smoketest)
    X, y, w, ite, p, bs = make_te_data(n=200,
                                       baseline_model=binary_gyorfi_baseline,
                                       noise=False, binary_y=True)
    if_learner = IFLearnerTE(base_estimator=LogisticGAM(),
                             te_estimator=LinearGAM(),
                             binary_y=True, setting=RR_NAME,
                             fit_base_model=True)
    if_learner.fit(X, y, w, p)
    te, mu_0, mu_1 = if_learner.predict(X, return_po=True)
def get_importances(X, y, Xtest, ytest):
    importances = {}
    for predictor, vals in X.items():
        gam = LinearGAM(s(0), fit_intercept=False)
        gam.fit(vals, y)
        # gridsearch refits over the lam grid, superseding the initial fit
        gam.gridsearch(vals, y)
        pred = gam.predict(Xtest[predictor])
        # define importance as the R2 for that factor alone
        R2 = np.corrcoef(ytest, pred)[0, 1]**2
        importances[predictor] = R2
    return importances
def GAM_linear(X, y):
    X = X.to_numpy()
    y = y.to_numpy()
    from pygam import LinearGAM, s, f, te
    gam = LinearGAM(s(0) + s(1) + f(2))
    gam.gridsearch(X, y)
    y_pred = gam.predict(X)
    y_pred = pd.DataFrame(y_pred)
    y_pred['actual'] = y
    y_pred['residual'] = y_pred.actual - y_pred[0]
    # note: gam.summary() prints the summary and returns None
    return gam, gam.summary(), y_pred
def interp_gam(data):
    valid = np.isfinite(data.stream_dist.values[:, 0])
    sample_xy = data.sample_xy.values[valid]
    sample_st = data.stream_dist.values[valid]
    sample_z = data.sample_z.values[valid]

    if np.sum(valid) == 0:
        return np.nan

    gam = LinearGAM(s(0, n_splines=4) + s(1, n_splines=5)
                    + te(0, 1, n_splines=4)).gridsearch(sample_st, sample_z)
    z_pred = gam.predict(np.array([[0, 0]]))[0]
    return z_pred
def gam_results(x, y, df, param, infection_time):
    gam = LinearGAM(s(0), lam=.5).fit(x, y)
    y_new = gam.predict(x)
    confi1 = gam.prediction_intervals(x, width=.95)

    pred = np.zeros(x.shape[0])
    for i in np.arange(x.shape[0]):
        if i == 0:
            pred[i] = np.mean(df[param].iloc[0:3])
        else:
            # the same growth update applies before and after infection_time
            pred[i] = pred[i - 1] * y_new[i] + pred[i - 1]

    if param == 'Positive':
        pred = pred + np.concatenate(
            (np.zeros(infection_time),
             pred[0:(pred.shape[0] - infection_time)]), axis=0)

    x_forcast = np.arange(np.max(x), np.max(x) + 10)
    y_forcast = gam.predict(x_forcast)
    confi = gam.prediction_intervals(x_forcast, width=.95)

    forcast = np.zeros(x_forcast.shape[0])
    forcast_L = np.zeros(x_forcast.shape[0])
    forcast_U = np.zeros(x_forcast.shape[0])
    for i in np.arange(x_forcast.shape[0]):
        if i == 0:
            forcast[i] = df[param].iloc[-1]
            forcast_L[i] = forcast[i]
            forcast_U[i] = forcast[i]
        else:
            forcast[i] = forcast[i - 1] * y_forcast[i - 1] + forcast[i - 1]
            forcast_L[i] = forcast_L[i - 1] * confi[i - 1, 0] + forcast_L[i - 1]
            forcast_U[i] = forcast_U[i - 1] * confi[i - 1, 1] + forcast_U[i - 1]

    return [pred, forcast, forcast_L, forcast_U, y_new, confi1]
def AAM():
    gam = LinearGAM(s(0, n_splines=25, spline_order=3, constraints='concave',
                      penalties='auto', basis='cp', edge_knots=[147, 147])
                    + l(3)       # the last travel time
                    + te(0, 1)   # distance and departure_time
                    + te(2, 0)   # distance and isWeekend
                    + l(2),      # isWeekend
                    fit_intercept=True)
    gam.gridsearch(X1, y1).summary()  # summary() prints directly and returns None
    # print(gam.gridsearch(X1, y1).get_params(deep=True))

    # plt.scatter(X1[:, 0][0:56], y1[0:56], s=3, linewidth=1, label='data')
    # plt.plot(X1[:, 0][0:56], gam.predict(X1[0:56]), color='red',
    #          linewidth=1, label='prediction')
    # plt.legend()
    # plt.title('Extended Additive Model')
    # plt.show()

    # error calculation
    rmse_val = rmse(np.array(y1), np.array(gam.predict(X1)))
    print("RMSE is: " + str(rmse_val))
    mae = mean_absolute_error(y1, gam.predict(X1))
    print("MAE is: " + str(mae))
    mape = mean_absolute_percentage_error(np.array(y1),
                                          np.array(gam.predict(X1)))
    print("MAPE is: " + str(mape))
def predict_gam(ad_group, date):
    ads_file = 'data/ad_table.csv'
    df = pd.read_csv(ads_file, header=0, sep=',')
    df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)

    splines = [5, 7, 10, 20, 30, 40, 45]
    lams = np.logspace(-3, 3, 7)

    if ad_group in df['ad'].unique():
        df_ad_group_train = df[df['ad'] == ad_group]
        df_ad_group_train = df_ad_group_train.reset_index()
        df_ad_group_train['time_period'] = (
            df_ad_group_train['date'] - df_ad_group_train['date'][0]).dt.days
        X_train = df_ad_group_train[['time_period']].values
        y_train = df_ad_group_train['shown'].values

        # auto tuning
        gam = LinearGAM().gridsearch(X_train, y_train, lam=lams,
                                     n_splines=splines)
        predictions = gam.predict(X_train)
        print('==== Tuning for ad group %s - best generalized '
              'cross-validation %f' % (ad_group, gam.statistics_['GCV']))
        tuning_result = (gam.lam[0][0], gam.n_splines[0],
                         gam.statistics_['GCV'])

        predict_date = (pd.to_datetime(date) - df_ad_group_train['date'][0]).days
        print("Auto tuning result =", tuning_result)
        print("Prediction for number of ads shown for", ad_group, "on", date,
              "=", gam.predict([[predict_date]]))
        print("Regularization/lambda value =", gam.lam)
        print("n_splines =", gam.n_splines)
    else:
        print("Ad group does not exist")
def GAM_model(df, feature_list):
    X_train = df[feature_list]
    y_train = df[['logerror']]

    scaler = MinMaxScaler(copy=True, feature_range=(0, 1)).fit(X_train)
    X_scaled = pd.DataFrame(scaler.transform(X_train),
                            columns=X_train.columns.values).set_index(
                                [X_train.index.values])
    X_scaled = X_scaled.to_numpy()
    y_train = y_train.to_numpy()

    from pygam import LinearGAM, s, f, te
    gam = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5))
    gam.gridsearch(X_scaled, y_train)
    y_pred = gam.predict(X_scaled)
    y_pred = pd.DataFrame(y_pred)
    y_pred['actual'] = y_train
    y_pred.columns = ['predicted', 'actual']

    RMSE = float('{:.3f}'.format(
        sqrt(mean_squared_error(y_pred.actual, y_pred.predicted))))
    R2 = float('{:.3f}'.format(r2_score(y_pred.actual, y_pred.predicted)))
    return RMSE, R2, gam
def GAMfitter(indir, dat_st, T0=None):
    fname = [i for i in os.listdir(indir) if dat_st in i]
    data = np.loadtxt(indir + fname[0])
    # A range of frequencies (2- to 1000-day periods)
    frequency = np.linspace(1e-3, 0.5, int(1e6))
    # Get spectrum
    power = LombScargle(data[:, 0], data[:, 1]).power(frequency=frequency)
    ind = get_index_of_max(power)  # Best frequency

    if T0 is None:  # If we have no preset T0, try to get a minimum
        phs = get_phase_curve(data[:, 0], data[0, 0], 1 / frequency[ind])
        ext_phs, ext_mags = phase_curve_extender(phs, data[:, 1])
        gam = LinearGAM(n_splines=30).gridsearch(ext_phs, ext_mags)  # Fit a GAM
        XX = gam.generate_X_grid(term=0, n=500)
        fit = gam.predict(XX)  # This is the fit on the grid
        minimal_val = max(fit)  # Maximum magnitude (minimal brightness)
        min_ind = get_index_of_min(abs(data[:, 1] - minimal_val))
        T0 = data[min_ind, 0]

    phs = get_phase_curve(data[:, 0], T0, 1 / frequency[ind])
    ext_phs, ext_mags = phase_curve_extender(phs, data[:, 1])
    gam = LinearGAM(n_splines=30).gridsearch(ext_phs, ext_mags)
    # keep only points inside the 85% prediction interval
    pred_int_vls = gam.prediction_intervals(phs, width=.85)
    cond = (data[:, 1] > pred_int_vls[:, 0]) & (data[:, 1] < pred_int_vls[:, 1])
    filtered_data = data[cond]

    # recompute the spectrum on the filtered data
    power_f = LombScargle(filtered_data[:, 0],
                          filtered_data[:, 1]).power(frequency=frequency)
    ind_f = get_index_of_max(power_f)
    phs_f = get_phase_curve(filtered_data[:, 0], T0, 1 / frequency[ind_f])
    ext_phs, ext_mags = phase_curve_extender(phs_f, filtered_data[:, 1])
    gam_f = LinearGAM(n_splines=30).gridsearch(ext_phs, ext_mags)

    return filtered_data, gam_f, frequency[ind_f], T0
def get_GAM_predictions(Xtrain, Ytrain, Xtest):
    """
    Perform grid search, train a Linear GAM model and return predictions for
    the test set.
    :param Xtrain: X values for training.
    :param Ytrain: Y values for training.
    :param Xtest: X values for validation.
    :return: Predictions from Linear GAM model for test dataset.
    """
    # Create an array of lambda values to search
    lams = np.logspace(-3, 20, 35)

    # GAM search requires numpy arrays
    Xtrain_np = np.array(Xtrain, dtype=np.float64)
    Ytrain_np = np.array(Ytrain, dtype=np.float64)

    # Linear Generalised Additive Model
    model = LinearGAM(
        s(99) + s(100) + l(3) + l(6) + l(8) + l(11) + l(7) + l(9) + l(12) +
        l(10) + l(14) + l(29) + l(15) + l(71) + l(17) + l(21) + l(107) +
        l(16) + l(68) + l(78) + l(61) + l(55) + l(31) + l(13) + l(37) +
        l(4) + l(5) + l(2) + te(4, 5) + te(68, 78)).gridsearch(
            Xtrain_np, Ytrain_np, lam=lams)
    return model.predict(Xtest)
def cleaner_linearGAM(x, y, **kwargs):
    from pygam import LinearGAM, l, s
    if isinstance(x, list):
        x = np.array(x)
    if isinstance(y, list):
        y = np.array(y)
    X = x.reshape(len(x), 1)

    # An explicit n_splines (e.g. len(y) / 5) was used previously because the
    # automatic choice was too smooth; the gridsearch below replaces it.
    gam = LinearGAM(terms=s(0, basis='ps')).gridsearch(X, y)

    # sample on the input grid
    means = gam.predict(X)
    bounds = gam.prediction_intervals(X, width=.95)

    # indices of points outside the 95% prediction interval
    idx = [i for i in range(len(y))
           if (y[i] > bounds[i, 1] or y[i] < bounds[i, 0])]
    return idx
# Based on "Elements of causal inference" code snippet 4.14 #https://pygam.readthedocs.io/en/latest/notebooks/quick_start.html# import pygam from pygam import LinearGAM, s p66 import numpy as np np.random.seed(42) N = 200 X = np.random.randn(N) Y = np.power(X, 3) + np.random.randn(N) gam_fwd = LinearGAM(s(0)).fit(X, Y) Yhat = gam_fwd.predict(X) residuals_fwd = Y - Yhat loglik_fwd = -(np.log(np.var(X)) + np.log(np.var(residuals_fwd))) print(loglik_fwd) gam_back = LinearGAM(s(0)).fit(Y, X) Xhat = gam_fwd.predict(Y) residuals_back = X - Xhat loglik_back = -(np.log(np.var(Y)) + np.log(np.var(residuals_back))) print(loglik_back)
def explain_instance_with_data(self,
                               neighborhood_data,
                               neighborhood_labels,
                               distances,
                               label,
                               num_features,
                               feature_selection='auto',
                               model_regressor=None,
                               gam_type=None):
    """Takes perturbed data, labels and distances, returns explanation.

    Args:
        neighborhood_data: perturbed data, 2d array. first element is
            assumed to be the original data point.
        neighborhood_labels: corresponding perturbed labels. should have
            as many columns as the number of possible labels.
        distances: distances to original data point.
        label: label for which we want an explanation
        num_features: maximum number of features in explanation
        feature_selection: how to select num_features. options are:
            'forward_selection': iteratively add features to the model.
                This is costly when num_features is high
            'highest_weights': selects the features that have the highest
                product of absolute weight * original data point when
                learning with all the features
            'lasso_path': chooses features based on the lasso
                regularization path
            'none': uses all features, ignores num_features
            'auto': uses forward_selection if num_features <= 6, and
                'highest_weights' otherwise.
        model_regressor: sklearn regressor to use in explanation.
            Defaults to Ridge regression if None. Must have
            model_regressor.coef_ and 'sample_weight' as a parameter
            to model_regressor.fit()

    Returns:
        (metrics, linear_exp, gam_exp): metrics is a tuple of explained-
        variance scores for the ridge, GAM and decision-tree surrogates;
        linear_exp is a list of (feature id, local weight) tuples sorted
        by decreasing absolute weight; gam_exp is a list of
        (feature id, grid, partial dependence) tuples.
    """
    weights = self.kernel_fn(distances)
    labels_column = neighborhood_labels[:, label]
    used_features = self.feature_selection(neighborhood_data,
                                           labels_column,
                                           weights,
                                           num_features,
                                           feature_selection)

    X = neighborhood_data[:, used_features]
    y = neighborhood_labels[:, label]
    (X_train, X_test, y_train, y_test,
     train_weights, test_weights) = train_test_split(X, y, weights,
                                                     test_size=0.2)

    linear_model = Ridge(alpha=1, fit_intercept=True,
                         random_state=self.random_state)
    gam = LinearGAM()
    dt = DecisionTreeRegressor()

    linear_model.fit(X_train, y_train, sample_weight=train_weights)
    gam.fit(X_train, y_train, weights=train_weights)
    dt.fit(X_train, y_train, sample_weight=train_weights)

    # # plot
    # for i, term in enumerate(gam.terms):
    #     if term.isintercept:
    #         continue
    #     XX = gam.generate_X_grid(term=i)
    #     # pdep = gam.predict(XX)
    #     pdep = gam.partial_dependence(term=i, X=XX) + linear_model.intercept_
    #     # line = XX[:, term.feature] * linear_model.coef_[term.feature]
    #     line = linear_model.predict(XX)
    #     dect = dt.predict(XX)
    #     plt.figure()
    #     plt.plot(XX[:, term.feature], pdep)
    #     plt.plot(XX[:, term.feature], line)
    #     plt.plot(XX[:, term.feature], dect)
    #     plt.title(repr(term))
    #     plt.show()
    # exit()

    y_lr = linear_model.predict(X_test)
    y_gam = gam.predict(X_test)
    y_dt = dt.predict(X_test)
    # y_lr = linear_model.predict(X_train)
    # y_gam = gam.predict(X_train)
    # y_dt = dt.predict(X_train)

    # mse_lr = mean_squared_error(y_test, y_lr, sample_weight=test_weights)
    # mse_gam = mean_squared_error(y_test, y_gam, sample_weight=test_weights)
    # mse_dt = mean_squared_error(y_test, y_dt, sample_weight=test_weights)

    # explained variance (despite the mse_* names) on the held-out split
    mse_lr = explained_variance_score(y_test, y_lr, sample_weight=test_weights)
    mse_gam = explained_variance_score(y_test, y_gam, sample_weight=test_weights)
    mse_dt = explained_variance_score(y_test, y_dt, sample_weight=test_weights)
    # mse_lr = explained_variance_score(y_train, y_lr, sample_weight=train_weights)
    # mse_gam = explained_variance_score(y_train, y_gam, sample_weight=train_weights)
    # mse_dt = explained_variance_score(y_train, y_dt, sample_weight=train_weights)

    metrics = (mse_lr, mse_gam, mse_dt)

    prediction_score = linear_model.score(neighborhood_data[:, used_features],
                                          labels_column,
                                          sample_weight=weights)
    local_pred = linear_model.predict(
        neighborhood_data[0, used_features].reshape(1, -1))

    linear_exp = sorted(zip(used_features, linear_model.coef_),
                        key=lambda x: np.abs(x[1]), reverse=True)

    gam_exp = []
    for i, term in enumerate(gam.terms):
        if term.isintercept:
            continue
        XX = gam.generate_X_grid(term=i)
        y = gam.partial_dependence(term=i, X=XX)
        x = XX[:, i]
        gam_exp.append((used_features[i], x, y))

    if self.verbose:
        print('Intercept', linear_model.intercept_)
        print('Prediction_local', local_pred)
        print('Right:', neighborhood_labels[0, label])

    # return (linear_model.intercept_,
    #         sorted(zip(used_features, linear_model.coef_),
    #                key=lambda x: np.abs(x[1]), reverse=True),
    #         prediction_score, local_pred)
    return (metrics, linear_exp, gam_exp)
plt.ylabel('mpg')
plt.title('LOESS Smoothing')
plt.show()

'''
-------------------------------------------------------------------------------
------------------------Generalized Additive Models----------------------------
-------------------------------------------------------------------------------
'''
# GAMs
# https://github.com/dswah/pyGAM
# https://codeburst.io/pygam-getting-started-with-generalized-additive-models-in-python-457df5b4705f
from pygam import LinearGAM, LogisticGAM

gam_model = LinearGAM().fit(d[['disp', 'wt']], d['mpg'])
gam_model.summary()  # summary() prints directly and returns None

gam_predictions = gam_model.predict(d[['disp', 'wt']])
gam_mse = np.mean((gam_predictions - d['mpg'])**2)
print('MSE:', gam_mse)

# Plot the predictions with confidence intervals
plt.plot(list(d.index), gam_predictions, 'r--')
plt.plot(list(d.index),
         gam_model.prediction_intervals(d[['disp', 'wt']], width=.95),
         color='b', ls='--')
plt.scatter(list(d.index), d['mpg'], facecolor='gray', edgecolors='none')
plt.xlabel('Row Index')
plt.ylabel('mpg')
plt.title('GAM Prediction with 95% Confidence Interval')
plt.show()
# * We fit the model to our training dataset:

# In[9]:

model.gridsearch(X_train, y_train)

# #### Prediction

# In[10]:

# Model prediction
y_pred_validation = model.predict(X_validation)
y_pred_validation

# #### Evaluating our model:

# In[11]:

# we define a function for evaluation
def mean_absolute_percentage_error(y_train, y_pred_validation):
    return np.mean(np.abs((y_train - y_pred_validation) / y_train)) * 100

# In[12]:
def main():
    f = open('results.txt', 'w')
    f.write("Preprocessing data...\n\n")

    # pre-process data
    train_X, train_Y, train_idx, _, test_X, test_idx = load_data(
        config.data_path, config.test_path)
    names = list(train_X)
    types = train_X.dtypes
    floats = (types == np.float64)
    new_X_GAM, new_test_GAM = construct_features(train_X, train_Y, test_X,
                                                 have_poly=False)

    # feature selection
    f.write("Feature Selection\n")
    ridge_scores, ridge_X, ridge_test, ridge_names = select_features(
        train_X, train_Y, test_X, config.ridge_select, config.ridge_feats)
    lasso_scores, lasso_X, lasso_test, lasso_names = select_features(
        train_X, train_Y, test_X, config.lasso_select, config.lasso_feats)
    knn_scores, knn_X, knn_test, knn_names = select_features(
        train_X, train_Y, test_X, config.knn_select, config.knn_feats)
    rf_scores, rf_X, rf_test, rf_names = select_features(
        train_X, train_Y, test_X, config.rf_select, config.rf_feats)
    est_scores, est_X, est_test, est_names = select_features(
        train_X, train_Y, test_X, config.est_select, config.est_feats)

    write_selection_results(f, 'Ridge Regression', config.ridge_feats,
                            ridge_scores, ridge_names)
    write_selection_results(f, 'LASSO Regression', config.lasso_feats,
                            lasso_scores, lasso_names)
    write_selection_results(f, 'K-Nearest Neighbours', config.knn_feats,
                            knn_scores, knn_names)
    write_selection_results(f, 'Random Forest', config.rf_feats,
                            rf_scores, rf_names)
    write_selection_results(f, 'Gradient Boosting', config.est_feats,
                            est_scores, est_names)
    f.write('\n#######################################\n\n')

    # model selection
    f.write("Model Selection\n")
    ridge_scores = cross_valid(config.ridge_models, ridge_X, train_Y)
    lasso_scores = cross_valid(config.lasso_models, lasso_X, train_Y)
    knn_scores = cross_valid(config.knn_models, knn_X, train_Y)
    rf_scores = cross_valid(config.rf_models, rf_X, train_Y)
    est_scores = cross_valid(config.est_models, est_X, train_Y)

    write_model_results(f, 'Ridge Regression', config.ridge_models, ridge_scores)
    write_model_results(f, 'LASSO Regression', config.lasso_models, lasso_scores)
    write_model_results(f, 'K-Nearest Neighbours', config.knn_models, knn_scores)
    write_model_results(f, 'Random Forest', config.rf_models, rf_scores)
    write_model_results(f, 'Gradient Boosting', config.est_models, est_scores)
    f.write('\n#######################################\n\n')

    best_reg = config.lasso3
    best_tree = config.est3
    best_reg.fit(lasso_X, train_Y)
    predictions_reg = best_reg.predict(lasso_test)
    best_tree.fit(est_X, train_Y)
    predictions_tree = best_tree.predict(est_test)
    write_test_file(predictions_reg, test_idx, 'results_reg.csv')
    write_test_file(predictions_tree, test_idx, 'results_tree.csv')

    # valid_X = new_X[:200]
    valid_Y = train_Y[:200]
    # new_X = new_X[200:]
    train1_Y = train_Y[200:]
    # est.fit(new_X, train_Y)
    # preds = est.predict(new_test)

    err = []
    for i in range(90, 100, 10):
        sel = SelectPercentile(mutual_info_regression, percentile=i)
        new1_X = sel.fit_transform(new_X, train_Y)
        valid_X = new1_X[:200]
        train_X = new1_X[200:]
        est.fit(train_X, train1_Y)
        predictions = est.predict(valid_X)
        # preds = np.exp(predictions)
        # print(predictions)
        # print(preds)
        # write_test_file(preds, test_idx)
        err.append(np.sqrt(mean_squared_error(valid_Y, predictions)))
        print(explained_variance_score(valid_Y, predictions))
        print(r2_score(valid_Y, predictions))

    plt.scatter(valid_Y, predictions)
    x = [10.5, 11, 11.5, 12, 12.5, 13, 13.5]
    y = [10.5, 11, 11.5, 12, 12.5, 13, 13.5]
    plt.plot(x, y, '--')
    plt.ylabel("Predictions")
    plt.xlabel("Actual Y-values")
    plt.show()

    # plt.plot([10, 20, 30, 40, 50, 60, 70, 80, 90], err)
    # plt.xlabel("Percentage of Feature")
    # plt.ylabel("Validation MSE")
    # plt.show()
    # preds = np.exp(preds)
    # write_test_file(preds, test_idx)
    # new2_X = rfe2.fit_transform(new_X, train_Y)
    # print(new2_X.shape)
    # new3_X = rfe3.fit_transform(new_X, train_Y)
    # print(new3_X.shape)
    # new4_X = rfe4.fit_transform(new_X, train_Y)
    # print(new4_X.shape)
    # new5_X = rfe5.fit_transform(new_X, train_Y)
    # print(new5_X.shape)
    # new1_X = rfe1.fit_transform(new_X, train_Y)
    # new2_X = rfe2.fit_transform(new_X, train_Y)
    # new3_X = rfe3.fit_transform(new_X, train_Y)
    # new4_X = rfe4.fit_transform(new_X, train_Y)
    # pca1.fit(train_X, train_Y)
    # sel3.fit(train_X, train_Y)
    # new4_X = pca2.fit_transform(new_X, train_Y)
    # names1 = [new_names[i] for i in np.where(rfe1.support_ == True)[0]]
    # names2 = [new_names[i] for i in np.where(rfe2.support_ == True)[0]]
    # scores1 = cross_valid(models, new_X, train_Y)
    # scores2 = cross_valid([lasso2], new2_X, train_Y)
    # scores3 = cross_valid([lasso3], new3_X, train_Y)
    # scores4 = cross_valid([lasso4], new4_X, train_Y)
    # scores5 = cross_valid([lasso5], new5_X, train_Y)
    # scores5 = cross_valid(models, new3_X, train_Y)
    # scores6 = cross_valid(models, new4_X, train_Y)
    # print(sel_names)
    # print(new1_X.shape)
    # print(new2_X.shape)
    # print(new_X.shape)
    # print(scores1)
    # print(scores2)
    # print(scores3)
    # print(scores4)
    # print(scores5)

    # valid_X = new_X[:200]
    valid_Y = train_Y[:200]
    # train_X = new_X[200:]
    train_Y = train_Y[200:]
    # new_train = sel3.transform(train_X)
    # new_valid = sel3.transform(valid_X)
    # print(new_valid.shape)

    err = []
    for i in range(80, 90, 10):
        pca = PCA(n_components=i)
        new1_X = pca.fit_transform(new_X, train_Y)
        valid_X = new1_X[:200]
        train_X = new1_X[200:]
        gam = LinearGAM(n_splines=8).gridsearch(train_X, train_Y)
        predictions = gam.predict(valid_X)
        # preds = np.exp(predictions)
        # print(predictions)
        # print(preds)
        # write_test_file(preds, test_idx)
        err.append(np.sqrt(mean_squared_error(valid_Y, predictions)))
        print(explained_variance_score(valid_Y, predictions))
        print(r2_score(valid_Y, predictions))

    plt.scatter(valid_Y, predictions)
    x = [10.5, 11, 11.5, 12, 12.5, 13, 13.5]
    y = [10.5, 11, 11.5, 12, 12.5, 13, 13.5]
    plt.plot(x, y, '--')
    plt.ylabel("Predictions")
    plt.xlabel("Actual Y-values")
    plt.show()
# Change the default axis colors from black to a slightly lighter black,
# and a little thinner (0.5 instead of 1)
plt.rcParams['axes.edgecolor'] = almost_black
plt.rcParams['axes.labelcolor'] = almost_black

ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)

if plot_type == "GAM":
    nsplines = 20
    lct_1D = np.tile(np.arange(8), 22)
    gam1 = LinearGAM(n_splines=nsplines).fit(lct_1D,
                                             soilmoist_rn1.reshape(8 * 22))
    x_pred = np.linspace(0, 7, num=100)
    y_pred1 = gam1.predict(x_pred)
    y_int1 = gam1.confidence_intervals(x_pred, width=.95)

    np.savetxt('soilmoist_rn1.out', soilmoist_rn1.reshape(8 * 22),
               delimiter=',')
    np.savetxt('soilmoist_rn2.out', soilmoist_rn2.reshape(8 * 22),
               delimiter=',')
    np.savetxt('soilmoist_tdr_rn2.out', soilmoist_tdr_rn2.reshape(8 * 22),
               delimiter=',')

    gam2 = LinearGAM(n_splines=nsplines).fit(lct_1D,
                                             soilmoist_rn2.reshape(8 * 22))
    y_pred2 = gam2.predict(x_pred)
    y_int2 = gam2.confidence_intervals(x_pred, width=.95)
# Specify plot shape
titles = ['freedom', 'family', 'year', 'economy', 'health', 'trust']
fig = tools.make_subplots(rows=2, cols=3, subplot_titles=titles)
fig['layout'].update(height=800, width=1200, title='pyGAM', showlegend=False)

for i, title in enumerate(titles):
    XX = gam.generate_X_grid(term=i)
    pdep, confi = gam.partial_dependence(term=i, width=.95)

    trace = go.Scatter(x=XX[:, i], y=pdep, mode='lines', name='Effect')
    ci1 = go.Scatter(x=XX[:, i], y=confi[:, 0],
                     line=dict(dash='dash', color='grey'), name='95% CI')
    ci2 = go.Scatter(x=XX[:, i], y=confi[:, 1],
                     line=dict(dash='dash', color='grey'), name='95% CI')
    if i < 3:
        fig.append_trace(trace, 1, i + 1)
        fig.append_trace(ci1, 1, i + 1)
        fig.append_trace(ci2, 1, i + 1)
    else:
        fig.append_trace(trace, 2, i - 2)
        fig.append_trace(ci1, 2, i - 2)
        fig.append_trace(ci2, 2, i - 2)

py.plot(fig)

# Making a forecast: predicting the outcome for the UAE in 2015
gam.predict([[0.64, 1.13, 2015, 1.47, 0.81, 0.38]])
def GAMf(df, in_var, ex_vars, city, cut, pred_end='one_month',
         train_duration='all'):
    """
    Parameters
    ----------
    df: dataframe containing all variables of interest for the whole time of
        measurement
    in_var: independent variable
    ex_vars: list of explanatory variables
    city: name of specific city
    cut: string of the format '%m/%d/%Y' indicating the date where the
        training set ends & the test set starts
    pred_end: end of the prediction period; if 'one_month', pred_end is set
        to one month after the cut
    train_duration: int, indicating the number of months that should be used
        for training; defaults to 'all' -> all available data before the cut
        date will be used as training data

    Returns
    -------
    gam: fitted gam model instance
    model_statistics: vector containing the following information about the
        fitted model
        rmse: RMSE for the test set
        r_squared: pseudo R-squared for the fitted GAM model
        fac2: fraction of predictions that lies between 50% and 200% of the
            corresponding measurements
        test_len: number of observations in the test set
        train_len: number of observations in the training set
        ratio: ratio of prediction to true values for the test set
        avg_err: mean difference between measured and predicted values on
            the test set
    preds: a dataframe containing all explanatory variables, the independent
        variable, the predicted values & the absolute error divided by the
        average value of the pollution variables in the training set
    """
    # drop rows with NaN values for explanatory variables
    df = df.dropna(subset=ex_vars)
    # subset dataset to given city
    df = df[df['city'] == city]
    # convert cut variable to datetime object
    cut = datetime.strptime(cut, '%m/%d/%Y')

    # if pred_end has the default value, add one month to the cut date to get
    # the end of the test dataset; else convert the given string to datetime
    if pred_end == 'one_month':
        pred_end = cut + relativedelta(months=+1)
    else:
        pred_end = datetime.strptime(pred_end, '%m/%d/%Y')

    # determine the subset used for training based on the training duration
    if train_duration == 'all':
        df_train = df[df.index < cut]
    else:
        train_start = cut - relativedelta(months=+train_duration)
        df_train = df[df.index < cut]
        df_train = df_train[df_train.index > train_start]

    df_train = df_train.replace([np.inf, -np.inf], np.nan)
    df_train = df_train.dropna(subset=ex_vars)

    # determine the subset used for testing
    df_test = df[df.index > cut]
    df_test = df_test[df_test.index < pred_end]

    # extract values for independent and explanatory variables
    train_X = df_train[ex_vars].values
    train_y = np.log(df_train[in_var].values)
    test_X = df_test[ex_vars].values
    test_y = np.log(df_test[in_var].values)

    # check that test and training set contain sufficient observations
    if (len(test_y) != 0) and (len(train_y) != 0):
        # generate the TermList for the GAM: factor terms for categorical
        # time variables, linear terms for wind speed, splines otherwise
        string = str()
        if isinstance(ex_vars, str):
            length = 1
        else:
            length = len(ex_vars)
        for i in range(0, length):
            if (ex_vars[i] in ['weekday', 'month', 'season', 'hour',
                               'new_year', 'daytime']) and (len(train_y) > 300):
                string = string + "+f(" + str(i) + ")"
            elif 'ws' in ex_vars[i]:
                string = string + '+l(' + str(i) + ')'
            else:
                string = string + '+s(' + str(i) + ", lam = 0.6, basis = 'ps')"
        string = string[1:]

        # specify and fit GAM model
        gam = LinearGAM(eval(string))
        gam.fit(train_X, train_y)
        y_pred = gam.predict(test_X)

        # cap predictions at the maximum observed value in the training data
        max_value = train_y.max()
        y_pred[y_pred > max_value] = max_value

        # calculate model statistics
        ratio = np.mean(y_pred / test_y)
        rmse = np.sqrt(metrics.mean_squared_error(np.exp(test_y),
                                                  np.exp(y_pred)))
        avg_err = np.mean(np.exp(test_y) - np.exp(y_pred))
        r_squared = list(gam.statistics_['pseudo_r2'].items())[0][1]
        fac2 = np.mean(test_y / y_pred < 2)

        # dataframe with independent & dependent variables, prediction and
        # prediction error
        preds = df_test.copy()[ex_vars]
        preds['true'] = np.exp(test_y)
        preds['y_pred'] = np.exp(y_pred)
        preds['err'] = abs(preds['true'] - preds['y_pred']) / np.mean(train_y)
        confidence = gam.prediction_intervals(test_X)
        preds['lower'] = np.exp(confidence[:, 0])
        preds['upper'] = np.exp(confidence[:, 1])
    else:
        # return NaN and give a warning if the training set is very small
        print('Problem with test and/or training data length for the station '
              + city + ' in the month of ' + str(cut.month))
        print('Training Length: ' + str(len(train_y)) +
              ' Test Length: ' + str(len(test_y)))
        rmse = gam = ratio = preds = avg_err = r_squared = fac2 = float("NaN")

    # calculate length of test & training set
    test_len = len(test_X)
    train_len = len(train_X)

    model_statistics = [rmse, r_squared, fac2, test_len, train_len, ratio,
                        avg_err]
    return (gam, model_statistics, preds)
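# A hedged usage sketch for GAMf, left commented because it needs real data
# (hypothetical names: `air` is a datetime-indexed DataFrame with a 'city'
# column, an 'no2' pollution column and the listed explanatory variables):
# gam, stats, preds = GAMf(air, in_var='no2',
#                          ex_vars=['temp', 'ws_mean', 'weekday', 'hour'],
#                          city='Berlin', cut='03/01/2020')
# rmse, r_squared, fac2, test_len, train_len, ratio, avg_err = stats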
from sklearn import datasets
from pygam import LinearGAM
import pandas as pd

# note: load_boston was removed in scikit-learn 1.2; this snippet requires
# an older version
boston = datasets.load_boston()
X = boston.data
y = boston.target
features = boston.feature_names
boston_data = pd.DataFrame(X, columns=features)

gam = LinearGAM().fit(boston_data[boston.feature_names], y)
X_test_res = gam.predict(X)
print(X_test_res[:5])
class GAMEnsemble(EnsembleModel):
    """Implements GAM ensemble in [1]."""

    def __init__(self, nonlinear_ensemble=False, residual_process=True):
        """
        Initializer.

        Args:
            nonlinear_ensemble: (bool) Whether to use a nonlinear term to
                transform the base models.
            residual_process: (bool) Whether to model the residual process.
        """
        model_name = (
            "Generalized Additive Ensemble" if residual_process
            else "{} Stacking".format(
                "Nonlinear" if nonlinear_ensemble else "Linear"))

        super().__init__(model_name)

        self.gam_model = None
        self.nonlinear_ensemble = nonlinear_ensemble
        self.model_residual = residual_process

    def train(self, X, y, base_pred):
        """Trains ensemble model based on data and base predictions.

        Args:
            X: (np.ndarray) Training features, shape (N, D)
            y: (np.ndarray) Training labels, shape (N, 1)
            base_pred: (dict of np.ndarray) Dictionary of base model
                predictions, with keys (str) being model names and values
                (np.ndarray) being predictions corresponding to X and y.
        """
        # build feature and gam terms
        ens_feature, feature_terms = self._build_ensemble_feature(X, base_pred)

        # define model
        self.gam_model = LinearGAM(feature_terms)

        # additional fine-tuning
        lam_grid = self._build_lambda_grid(n_grid=100)
        self.gam_model.gridsearch(X=ens_feature, y=y, lam=lam_grid,
                                  progress=False)

    def predict(self, X, base_pred):
        """Predicts label based on feature and base models.

        Args:
            X: (np.ndarray) Training features, shape (N, D)
            base_pred: (dict of np.ndarray) Dictionary of base model
                predictions, with keys (str) being model names and values
                (np.ndarray) being predictions corresponding to X and y.

        Returns:
            (np.ndarray) ensemble prediction and variance

        Raises:
            (ValueError) If self.gam_model is empty.
        """
        if not self.gam_model:
            raise ValueError("Attribute gam_model empty. "
                             "Model was not trained properly.")

        # build feature and gam terms
        ens_feature, _ = self._build_ensemble_feature(X, base_pred)

        # prediction: treat a quarter of the 95% interval width as one
        # standard deviation, then square it for the variance
        prediction = self.gam_model.predict(ens_feature)
        prediction_var = ((self.gam_model.prediction_intervals(
            ens_feature, width=.95)[:, 1] - prediction) / 2) ** 2

        return prediction, prediction_var

    def _build_ensemble_feature(self, X, base_pred):
        """Builds feature array and corresponding GAM TermList.

        Terms corresponding to X will be a summation of dimension-wise
        splines, plus a tensor-product term across all dimensions.
        """
        ensemble_term_func = s if self.nonlinear_ensemble else l

        ens_feature = np.asarray(list(base_pred.values())).T
        term_list = [ensemble_term_func(dim_index)
                     for dim_index in range(ens_feature.shape[1])]

        # optionally, add residual process
        if self.model_residual:
            # build gam terms
            term_list += [s(dim_index)
                          for dim_index in range(
                              ens_feature.shape[1],
                              ens_feature.shape[1] + X.shape[1])]
            if X.shape[1] > 1:
                term_list += [te(*list(ens_feature.shape[1] +
                                       np.array(range(X.shape[1]))))]
            # update features
            ens_feature = np.concatenate([ens_feature, X], axis=1)

        gam_feature_terms = TermList(*term_list)

        return ens_feature, gam_feature_terms

    def _build_lambda_grid(self, n_grid=100):
        # count the actual number of terms in each nonlinear term
        # (e.g. te(0, 1) will actually have two terms)
        n_terms = np.sum([len(model_term._terms) if model_term.istensor else 1
                          for model_term in self.gam_model.terms])
        lam = np.random.rand(n_grid, n_terms)

        # rescale to between (0, 1)
        lam_norm = (lam - np.min(lam)) / (np.max(lam) - np.min(lam))
        return np.exp((lam_norm - 0.5) * 6)
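# A hedged usage sketch for GAMEnsemble, left commented because the
# EnsembleModel base class is defined elsewhere; `base_pred` maps base-model
# names to their prediction arrays (hypothetical data):
# X = np.random.rand(100, 2)
# y = np.random.rand(100, 1)
# base_pred = {'model_a': np.random.rand(100), 'model_b': np.random.rand(100)}
# ensemble = GAMEnsemble(nonlinear_ensemble=False, residual_process=True)
# ensemble.train(X, y, base_pred)
# mean, var = ensemble.predict(X, base_pred)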
class DeepModels:
    # Sequential 6-layer neural network
    def returnSequential6(self, idim=20):
        model = Sequential()
        model.add(Dense(50, input_dim=idim, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def returnSequential6_regularized(self, idim=20):
        model = Sequential()
        model.add(Dense(50, input_dim=idim, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear',
                        kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01)))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def returnSequential9(self, idim=20):
        model = Sequential()
        model.add(Dense(80, input_dim=idim, activation='relu'))
        model.add(Dense(70, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def returnSequential15(self, idim=20):
        model = Sequential()
        model.add(Dense(140, input_dim=idim, activation='relu'))
        model.add(Dense(130, activation='relu'))
        model.add(Dense(120, activation='relu'))
        model.add(Dense(110, activation='relu'))
        model.add(Dense(100, activation='relu'))
        model.add(Dense(90, activation='relu'))
        model.add(Dense(80, activation='relu'))
        model.add(Dense(70, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def returnSequential15_regularized(self, idim=20):
        model = Sequential()
        model.add(Dense(140, input_dim=idim, activation='relu'))
        model.add(Dense(130, activation='relu'))
        model.add(Dense(120, activation='relu'))
        model.add(Dense(110, activation='relu'))
        model.add(Dense(100, activation='relu'))
        model.add(Dense(90, activation='relu'))
        model.add(Dense(80, activation='relu'))
        model.add(Dense(70, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        # the output layer carries the L1/L2 penalty, mirroring
        # returnSequential6_regularized
        model.add(Dense(1, activation='linear',
                        kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01)))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def returnSequential21(self, idim=20):
        model = Sequential()
        model.add(Dense(200, input_dim=idim, activation='relu'))
        model.add(Dense(190, activation='relu'))
        model.add(Dense(180, activation='relu'))
        model.add(Dense(170, activation='relu'))
        model.add(Dense(160, activation='relu'))
        model.add(Dense(150, activation='relu'))
        model.add(Dense(140, activation='relu'))
        model.add(Dense(130, activation='relu'))
        model.add(Dense(120, activation='relu'))
        model.add(Dense(110, activation='relu'))
        model.add(Dense(100, activation='relu'))
        model.add(Dense(90, activation='relu'))
        model.add(Dense(80, activation='relu'))
        model.add(Dense(70, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def RNN(self, idim=20):
        model = Sequential()
        model.add(SimpleRNN(10, input_dim=idim))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def multi_RNN(self, idim=20):
        model = Sequential()
        model.add(SimpleRNN(14, input_dim=idim, activation='relu'))
        model.add(Dense(7, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def multi_RNN2(self, idim=20):
        model = Sequential()
        model.add(SimpleRNN(40, input_dim=idim))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def baseline(self, idim=20):
        # Create model
        model = Sequential()
        model.add(Dense(20, input_dim=idim, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        # Compile model
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['mean_absolute_error'])
        return model

    def lstm(self, idim=20):
        model = Sequential()
        model.add(LSTM(20, input_dim=idim))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='mean_absolute_error', optimizer='adam')
        return model

    def multi_lstm(self, idim=20):
        model = Sequential()
        model.add(LSTM(14, input_dim=idim, activation='relu'))
        model.add(Dense(7, input_dim=idim, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='mean_absolute_error', optimizer='adam')
        return model

    # Sequential 4-layer neural network
    def returnSequential4(self, idim=20):
        model = Sequential()
        model.add(Dense(20, activation='relu', input_dim=idim))
        model.add(Dense(units=15, activation='relu'))
        model.add(Dense(units=10, activation='relu'))
        model.add(Dense(units=5, activation='relu'))
        model.add(Dense(units=1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    # Sequential 8-layer neural network
    def returnSequential8(self, idim=20):
        model = Sequential()
        model.add(Dense(70, activation='relu', input_dim=idim))
        model.add(Dense(units=60, activation='relu'))
        model.add(Dense(units=50, activation='relu'))
        model.add(Dense(units=40, activation='relu'))
        model.add(Dense(units=30, activation='relu'))
        model.add(Dense(units=20, activation='relu'))
        model.add(Dense(units=10, activation='relu'))
        model.add(Dense(units=1, activation='linear',
                        kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01)))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def base(self, idim=20):
        model = Sequential()
        model.add(Dense(10, activation='relu', input_dim=idim))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def base2(self, idim=20):
        model = Sequential()
        model.add(Dense(14, activation='relu', input_dim=idim))
        model.add(Dense(7, activation='relu', input_dim=idim))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def __init__(self, m, idim=20):
        # type 1: recurrent models (need 3D input), type 2: feed-forward
        # Keras models, type 3: pygam LinearGAM
        if m == 0:
            self.model = self.base(idim)
            self.type = 2
        elif m == 1:
            self.model = self.base2(idim)
            self.type = 2
        elif m == 2:
            self.model = self.returnSequential4(idim)
            self.type = 2
        elif m == 3:
            self.model = self.returnSequential8(idim)
            self.type = 2
        elif m == 4:
            self.model = self.returnSequential15_regularized(idim)
            self.type = 2
        elif m == 5:
            self.model = self.multi_RNN(idim)
            self.type = 1
        elif m == 6:
            self.model = self.multi_lstm(idim)
            self.type = 1
        elif m == 7:
            self.model = LinearGAM()
            self.type = 3
        elif m == 8:
            self.model = self.RNN(idim)
            self.type = 1
        elif m == 9:
            self.model = self.lstm(idim)
            self.type = 1

    def returnModel(self):
        return self.model

    def train(self, X, y, bs=10, epochs=100):
        if self.type == 1:
            X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
        if self.type == 3:
            self.model.gridsearch(X, y)
        else:
            self.model.fit(X, y, batch_size=bs, epochs=epochs, shuffle=True,
                           verbose=0)

    def prediction(self, X):
        if self.type == 1:
            X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
        return self.model.predict(X)

    def cross_eval_with_plotting(self, city, X, y, bs=10, ep=100, k=3):
        scores = []
        multiplier = 0
        fig10, ax10 = plt.subplots()
        if self.type == 0:
            # random_state is only meaningful when shuffle=True
            kf = KFold(n_splits=k, shuffle=False)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep,
                               verbose=0)
                a, score = self.model.evaluate(X_test, y_test, verbose=0)
                predictions = self.model.predict(X_test)
                plt.plot(range(len(y_test) * multiplier,
                               len(y_test) + len(y_test) * multiplier),
                         y_test, 'm', alpha=0.4)
                plt.plot(range(len(y_test) * multiplier,
                               len(y_test) + len(y_test) * multiplier),
                         predictions, 'g')
                scores.append(score)
                multiplier = multiplier + 1
            plt.title('True vs. Predicted Cases {}'.format(city))
            plt.xlabel('Week')
            plt.ylabel('Cases of Dengue')
            plt.legend(['True', 'Predicted'])
            plt.show()
            return sum(scores) / len(scores)
        elif self.type == 1:
            kf = KFold(n_splits=k, shuffle=False)
            scores = []
            multiplier = 0
            fig10, ax10 = plt.subplots()
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                X_train = np.reshape(X_train,
                                     (X_train.shape[0], 1, X_train.shape[1]))
                X_test = np.reshape(X_test,
                                    (X_test.shape[0], 1, X_test.shape[1]))
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep,
                               verbose=0)
                predictions = self.model.predict(X_test)
                plt.plot(range(len(y_test) * multiplier,
                               len(y_test) + len(y_test) * multiplier),
                         y_test, 'm', alpha=0.4)
                plt.plot(range(len(y_test) * multiplier,
                               len(y_test) + len(y_test) * multiplier),
                         predictions, 'g')
                score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
                multiplier = multiplier + 1
            plt.title('True vs. Predicted Cases in {}'.format(city))
            plt.xlabel('Week')
            plt.ylabel('Cases of Dengue')
            plt.legend(['True', 'Predicted'])
            plt.show()
            return sum(scores) / len(scores)
        elif self.type == 2:
            multiplier = 0
            fig10, ax10 = plt.subplots()
            kf = KFold(n_splits=k, shuffle=False)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=10, epochs=300,
                               verbose=0)
                predictions = self.model.predict(X_test)
                plt.plot(range(len(y_test) * multiplier,
                               len(y_test) + len(y_test) * multiplier),
                         y_test, 'm', alpha=0.4)
                plt.plot(range(len(y_test) * multiplier,
                               len(y_test) + len(y_test) * multiplier),
                         predictions, 'g')
                score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
                multiplier = multiplier + 1
            plt.title('True vs. Predicted Cases in {}'.format(city))
            plt.xlabel('Week')
            plt.ylabel('Cases of Dengue')
            plt.legend(['True', 'Predicted'])
            plt.show()
            return sum(scores) / len(scores)
        elif self.type == 3:
            multiplier = 0
            fig10, ax10 = plt.subplots()
            kf = KFold(n_splits=k, shuffle=False)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.gridsearch(X_train, y_train)
                y_pre = self.model.predict(X_test)
                plt.plot(range(len(y_test) * multiplier,
                               len(y_test) + len(y_test) * multiplier),
                         y_test, 'm', alpha=0.4)
                plt.plot(range(len(y_test) * multiplier,
                               len(y_test) + len(y_test) * multiplier),
                         y_pre, 'g')
                scores.append(mean_absolute_error(y_pre, y_test))
            plt.title('True vs. Predicted Cases in {}'.format(city))
            plt.xlabel('Week')
            plt.ylabel('Cases of Dengue')
            plt.legend(['True', 'Predicted'])
            plt.show()
            return sum(scores) / len(scores)

    def cross_eval(self, X, y, bs=10, ep=100, k=3):
        scores = []
        if self.type == 0:
            kf = KFold(n_splits=k, shuffle=True, random_state=0)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep,
                               verbose=0)
                a, score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
            return sum(scores) / len(scores)
        elif self.type == 1:
            kf = KFold(n_splits=k, shuffle=False)
            scores = []
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                X_train = np.reshape(X_train,
                                     (X_train.shape[0], 1, X_train.shape[1]))
                X_test = np.reshape(X_test,
                                    (X_test.shape[0], 1, X_test.shape[1]))
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep,
                               verbose=0)
                score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
            return sum(scores) / len(scores)
        elif self.type == 2:
            kf = KFold(n_splits=k, shuffle=True, random_state=0)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=10, epochs=300,
                               verbose=0)
                score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
            return sum(scores) / len(scores)
        elif self.type == 3:
            kf = KFold(n_splits=k, shuffle=False)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.gridsearch(X_train, y_train)
                y_pre = self.model.predict(X_test)
                print(y_pre)
                scores.append(mean_absolute_error(y_pre, y_test))
            return sum(scores) / len(scores)
def main(flux_dir):
    K_TO_C = 273.15

    sites = ["AdelaideRiver", "Calperum", "CapeTribulation", "CowBay",
             "CumberlandPlains", "DalyPasture", "DalyUncleared",
             "DryRiver", "Emerald", "Gingin", "GreatWesternWoodlands",
             "HowardSprings", "Otway", "RedDirtMelonFarm", "RiggsCreek",
             "Samford", "SturtPlains", "Tumbarumba", "Whroo",
             "WombatStateForest", "Yanco"]
    pfts = ["SAV", "SHB", "TRF", "TRF", "EBF", "GRA", "SAV",
            "SAV", "NA", "EBF", "EBF",
            "SAV", "GRA", "NA", "GRA",
            "GRA", "GRA", "EBF", "EBF",
            "EBF", "GRA"]
    d = dict(zip(sites, pfts))
    id = dict(zip(sites, pd.factorize(pfts)[0]))

    plot_dir = "plots"
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)

    flux_files = sorted(glob.glob(os.path.join(flux_dir, "*_flux.nc")))
    met_files = sorted(glob.glob(os.path.join(flux_dir, "*_met.nc")))

    data_qle = []
    data_qh = []
    data_tair = []
    data_sw = []
    pft_ids = []

    # collect up data
    for flux_fn, met_fn in zip(flux_files, met_files):
        (site, df_flx, df_met) = open_file(flux_fn, met_fn)
        if d[site] != "NA":
            pft = d[site]
            colour_id = id[site]

            # Mask bad-quality data
            df_met.where(df_flx.Qle_qc == 1, inplace=True)
            df_met.where(df_flx.Qh_qc == 1, inplace=True)
            df_flx.where(df_flx.Qle_qc == 1, inplace=True)
            df_flx.where(df_flx.Qh_qc == 1, inplace=True)
            #df_flx.where(df_met.Tair_qc == 1, inplace=True)
            #df_flx.where(df_met.SWdown == 1, inplace=True)
            #df_met.where(df_met.SWdown == 1, inplace=True)
            #df_met.where(df_met.Tair_qc == 1, inplace=True)

            # Mask dew
            df_met.where(df_flx.Qle > 0., inplace=True)
            df_flx.where(df_flx.Qle > 0., inplace=True)

            df_flx.dropna(inplace=True)
            df_met.dropna(inplace=True)

            df_flx = df_flx.between_time("09:00", "13:00")
            df_met = df_met.between_time("09:00", "13:00")

            if len(df_flx) > 0 and len(df_met) > 0:
                #data_qle[pft].append(df_flx.Qle.values)
                #data_qh[pft].append(df_flx.Qh.values)
                #data_tair[pft].append(df_met.Tair.values - K_TO_C)
                #data_sw[pft].append(df_met.SWdown.values)
                data_qle.append(df_flx.Qle.values)
                data_qh.append(df_flx.Qh.values)
                data_tair.append(df_met.Tair.values - K_TO_C)
                data_sw.append(df_met.SWdown.values)
                pft_ids.append([pft] * len(df_flx))

    pft_ids = list(itertools.chain(*pft_ids))
    data_qle = list(itertools.chain(*data_qle))
    data_qh = list(itertools.chain(*data_qh))
    data_sw = list(itertools.chain(*data_sw))
    data_tair = list(itertools.chain(*data_tair))

    data_qle = np.asarray(data_qle)
    data_qh = np.asarray(data_qh)
    data_tair = np.asarray(data_tair)
    data_sw = np.asarray(data_sw)
    pft_ids = np.asarray(pft_ids)

    colours = ["red", "green", "blue", "yellow", "pink"]

    fig = plt.figure(figsize=(14, 4))
    fig.subplots_adjust(hspace=0.1)
    fig.subplots_adjust(wspace=0.1)
    plt.rcParams['text.usetex'] = False
    plt.rcParams['font.family'] = "sans-serif"
    plt.rcParams['font.sans-serif'] = "Helvetica"
    plt.rcParams['axes.labelsize'] = 14
    plt.rcParams['font.size'] = 14
    plt.rcParams['legend.fontsize'] = 14
    plt.rcParams['xtick.labelsize'] = 14
    plt.rcParams['ytick.labelsize'] = 14

    almost_black = '#262626'
    # change the tick colors also to the almost black
    plt.rcParams['ytick.color'] = almost_black
    plt.rcParams['xtick.color'] = almost_black
    # change the text colors also to the almost black
    plt.rcParams['text.color'] = almost_black
    # Change the default axis colors from black to a slightly lighter black,
    # and a little thinner (0.5 instead of 1)
    plt.rcParams['axes.edgecolor'] = almost_black
    plt.rcParams['axes.labelcolor'] = almost_black

    ax1 = fig.add_subplot(221)
    ax2 = fig.add_subplot(222)
    ax3 = fig.add_subplot(223)
    ax4 = fig.add_subplot(224)

    colour_id = 0
    for pft in np.unique(pfts):
        if pft != "NA":
            qle = data_qle[np.argwhere(pft_ids == pft)]
            qh = data_qh[np.argwhere(pft_ids == pft)]
            tair = data_tair[np.argwhere(pft_ids == pft)]
            sw = data_sw[np.argwhere(pft_ids == pft)]
            print(pft, len(qle), len(qh), len(tair), len(sw))

            gam = LinearGAM(n_splines=20).gridsearch(sw, qh)
            XX = generate_X_grid(gam)
            CI = gam.confidence_intervals(XX, width=.95)
            ax1.plot(XX, gam.predict(XX), color=colours[colour_id],
                     ls='-', lw=2.0)
            ax1.fill_between(XX[:, 0], CI[:, 0], CI[:, 1],
                             color=colours[colour_id], alpha=0.7)

            gam = LinearGAM(n_splines=20).gridsearch(sw, qle)
            XX = generate_X_grid(gam)
            CI = gam.confidence_intervals(XX, width=.95)
            ax2.plot(XX, gam.predict(XX), color=colours[colour_id],
                     ls='-', lw=2.0)
            ax2.fill_between(XX[:, 0], CI[:, 0], CI[:, 1],
                             color=colours[colour_id], alpha=0.7)

            gam = LinearGAM(n_splines=20).gridsearch(tair, qh)
            XX = generate_X_grid(gam)
            CI = gam.confidence_intervals(XX, width=.95)
            ax3.plot(XX, gam.predict(XX), color=colours[colour_id],
                     ls='-', lw=2.0)
            ax3.fill_between(XX[:, 0], CI[:, 0], CI[:, 1],
                             color=colours[colour_id], alpha=0.7)

            gam = LinearGAM(n_splines=20).gridsearch(tair, qle)
            XX = generate_X_grid(gam)
            CI = gam.confidence_intervals(XX, width=.95)
            ax4.plot(XX, gam.predict(XX), color=colours[colour_id],
                     ls='-', lw=2.0)
            ax4.fill_between(XX[:, 0], CI[:, 0], CI[:, 1],
                             color=colours[colour_id], alpha=0.7)

            colour_id += 1

    plt.setp(ax1.get_xticklabels(), visible=False)
    plt.setp(ax2.get_xticklabels(), visible=False)

    ax1.set_xlim(0, 1300)
    ax1.set_ylim(0, 1000)
    ax2.set_xlim(0, 45)
    ax2.set_ylim(0, 1000)
    ax3.set_xlabel("SW down (W m$^{-2}$)")
    ax4.set_xlabel("Tair ($^\circ$C)")
    ax1.set_ylabel("Qh flux (W m$^{-2}$)")
    ax2.set_ylabel("Qle flux (W m$^{-2}$)")
    #ax1.legend(numpoints=1, loc="best")

    #fig.savefig(os.path.join(plot_dir, "%s.pdf" % (site)),
    #            bbox_inches='tight', pad_inches=0.1)
    fig.savefig(os.path.join(plot_dir, "ozflux_by_pft.png"),
                bbox_inches='tight', pad_inches=0.1, dpi=150)
######################################################
# constraints

from pygam import LinearGAM, s
from pygam.datasets import hepatitis

X, y = hepatitis(return_X_y=True)
X.shape

gam1 = LinearGAM(s(0, constraints='monotonic_inc')).fit(X, y)
gam2 = LinearGAM(s(0, constraints='concave')).fit(X, y)

fig, ax = plt.subplots(1, 2)
ax[0].plot(X, y, label='data')
ax[0].plot(X, gam1.predict(X), label='monotonic fit')
ax[0].legend()
ax[1].plot(X, y, label='data')
ax[1].plot(X, gam2.predict(X), label='concave fit')
ax[1].legend()

######################################################
# api

from pygam import LogisticGAM, s, f
from pygam.datasets import toy_classification

X, y = toy_classification(return_X_y=True, n=5000)

gam = LogisticGAM(s(0) + s(1) + s(2) + s(3) + s(4) + f(5))
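# The snippet above only constructs the LogisticGAM; a minimal continuation
# fits it and reports in-sample accuracy (LogisticGAM provides an accuracy()
# helper):
gam.fit(X, y)
print(gam.accuracy(X, y))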