def tweedie_test(X_train, y_train, X_test, y_test, pwr, alf):
    '''Fits a Tweedie regressor on the training set and evaluates it on the test set.'''
    # Make model (power=0 corresponds to a normal distribution)
    tw = TweedieRegressor(power=pwr, alpha=alf)
    # Fit model
    tw.fit(X_train, y_train)
    # Make predictions
    tw_pred = tw.predict(X_test)
    # Compute mean absolute error
    tw_MAE = mean_absolute_error(y_test, tw_pred)
    return tw_MAE, tw, tw_pred
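# A hedged usage sketch (not from the original source): sweeping a few candidate
# power/alpha pairs with tweedie_test above. X_train, y_train, X_test, y_test are
# assumed to be already-prepared numeric splits; the candidate grids are illustrative.
def tweedie_sweep(X_train, y_train, X_test, y_test,
                  powers=(0, 1, 1.5), alphas=(0.001, 0.5, 1.0)):
    '''Returns the (power, alpha) pair with the lowest test MAE, plus all scores.'''
    results = {}
    for pwr in powers:
        for alf in alphas:
            mae, _, _ = tweedie_test(X_train, y_train, X_test, y_test, pwr, alf)
            results[(pwr, alf)] = mae
    best = min(results, key=results.get)
    return best, results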
def tweedie(X_train_scaled, y_train):
    '''Fits a Tweedie regressor and returns the in-sample RMSE.'''
    # Make model (power=0 corresponds to a normal distribution)
    tw = TweedieRegressor(power=0, alpha=.001)
    # Fit model
    tw.fit(X_train_scaled, y_train)
    # Make predictions on the training data
    tw_pred = tw.predict(X_train_scaled)
    # Compute root mean squared error
    tw_rmse = sqrt(mean_squared_error(y_train, tw_pred))
    return tw_rmse
def tweedie05(X_train_scaled, y_train):
    '''Fits a Tweedie regressor with alpha=0.5 and returns the in-sample MAE.'''
    # Make model (power=0 corresponds to a normal distribution)
    tw = TweedieRegressor(power=0, alpha=.5)
    # Fit model
    tw.fit(X_train_scaled, y_train)
    # Make predictions on the training data
    tw_pred = tw.predict(X_train_scaled)
    # Compute mean absolute error
    tw_MAE = mean_absolute_error(y_train, tw_pred)
    return tw_MAE
def tweedie_vt(X_train_scaled, X_validate_scaled, y_train, y_validate):
    '''Fits a Tweedie regressor on the training set and returns the RMSE on the validation set.'''
    # Make model (power=0 corresponds to a normal distribution)
    tw = TweedieRegressor(power=0, alpha=0.001)
    # Fit model on the training data
    tw.fit(X_train_scaled, y_train)
    # Make predictions on the validation data
    tw_pred = tw.predict(X_validate_scaled)
    # Compute root mean squared error
    tw_rmse = sqrt(mean_squared_error(y_validate, tw_pred))
    return tw_rmse
def test_tweedie_link_argument(name, link_class):
    """Test GLM link argument set as string."""
    y = np.array([0.1, 0.5])  # in range of all distributions
    X = np.array([[1], [2]])
    glm = TweedieRegressor(power=1, link=name).fit(X, y)
    assert isinstance(glm._base_loss.link, link_class)

    glm = TweedieRegressor(power=1, link="not a link")
    with pytest.raises(
        ValueError,
        match=re.escape("The link must be an element of ['auto', 'identity', 'log']"),
    ):
        glm.fit(X, y)
def hurdle(x, y, log=True, max_iter=1000):
    x, y = remove_nans(x, y)
    n_obs = len(x)
    # Classifier for the zero vs. non-zero part of the hurdle model
    clf = LogisticRegression(fit_intercept=True, penalty='none', max_iter=max_iter)
    if log:
        # Unpenalized log-link GLM for the positive part
        reg = TweedieRegressor(fit_intercept=True, power=0, link='log',
                               alpha=0, tol=1e-8, max_iter=max_iter)
    else:
        reg = LinearRegression(fit_intercept=True)
    clf.fit(x, y > 0)
    reg.fit(x[y > 0, :], y[y > 0])
    return HurdleModel(clf, reg, n_obs, log=log, x=x, y=y)
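# HurdleModel is referenced above but not defined in this snippet. A minimal
# sketch of what such a wrapper might look like (an assumption, not the original
# class): the expected value of a hurdle model is P(y > 0 | x) * E[y | y > 0, x].
class HurdleModel:
    def __init__(self, clf, reg, n_obs, log=True, x=None, y=None):
        self.clf, self.reg, self.n_obs, self.log = clf, reg, n_obs, log
        self.x, self.y = x, y

    def predict(self, x):
        # Probability that the response is non-zero, from the classifier
        p_nonzero = self.clf.predict_proba(x)[:, 1]
        # Conditional mean of the positive part, from the regressor
        positive_mean = self.reg.predict(x)
        return p_nonzero * positive_mean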
def sk_tweedie_regression(X_train, X_test, y_train, y_test, set_model='linear'):
    if set_model == 'Poisson':
        reg = TweedieRegressor(
            alpha=0,
            power=1,  # Poisson distribution
            link='log',
            fit_intercept=False,
            max_iter=300)
    elif set_model == 'linear':
        reg = TweedieRegressor(
            alpha=0,
            power=0,  # Normal distribution
            link='identity',
            fit_intercept=False,
            max_iter=300)
    else:
        print('Set the correct name.')
        return

    reg.fit(X_train, y_train)
    print('score: ', reg.score(X_test, y_test))

    # X and y below refer to the full (unsplit) feature and target arrays
    # defined in the enclosing scope, so the fit is plotted over all points.
    y_hat = reg.predict(X)

    fig = plt.figure(figsize=(6.0, 6.0))
    plt.plot(X, y, 'o')
    plt.plot(X, y_hat, '*', color='r')
    plt.xlabel('x (total_bill)')
    plt.ylabel('y (tips)')
    plt.xlim(0, 60)
    plt.ylim(0, 12)
    plt.show()
# print(gks_test)
gks_x = gks.iloc[:, :-1].values
gks_y = gks.iloc[:, -1].values
gks_x_test = gks_test.iloc[:, :-1].values
gks_y_test = gks_test.iloc[:, -1].values

scaler = StandardScaler()
gks_x = scaler.fit_transform(gks_x)

# reg = SVR(C=10, epsilon=0.2)
reg = TweedieRegressor(power=1, alpha=0.5, link='log')
reg.fit(gks_x, gks_y)

gks_x_test = scaler.transform(gks_x_test)
preds = reg.predict(gks_x_test)
print(mean_squared_error(gks_y_test, preds))

# print(gks_test_names)
with open('gks.csv', 'w') as file:
    for idx, val in enumerate(preds):
        file.write(gks_test_names.iloc[idx]['web_name'] + "," + str(val) + "," + str(gks_y_test[idx]))
        file.write('\n')
def all_models_info():
    '''Acquires and prepares the Zillow data, fits four regression models,
    and prints train/validate RMSE for each.'''
    # get data
    df = acquire.acquire_zillow()
    df = prepare.clean_zillow(df)
    df = prepare.focused_zillow(df)
    # pull from add_to_train
    train = evaluate.add_to_train()
    X_train, y_train, X_validate, y_validate, X_test, y_test = evaluate.xtrain_xval_xtest()

    # OLS model
    lm = LinearRegression(normalize=True)
    lm.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_lm'] = lm.predict(X_train)
    rmse_train_lm = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lm) ** (1 / 2)
    y_validate['appraised_value_pred_lm'] = lm.predict(X_validate)
    rmse_validate_lm = mean_squared_error(
        y_validate.appraised_value, y_validate.appraised_value_pred_lm) ** (1 / 2)

    # LassoLars model
    lars = LassoLars(alpha=1.0)
    lars.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_lars'] = lars.predict(X_train)
    rmse_train_lars = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lars) ** (1 / 2)
    y_validate['appraised_value_pred_lars'] = lars.predict(X_validate)
    rmse_validate_lars = mean_squared_error(
        y_validate.appraised_value, y_validate.appraised_value_pred_lars) ** (1 / 2)

    # GLM (Tweedie)
    glm = TweedieRegressor(power=1, alpha=0)
    glm.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_glm'] = glm.predict(X_train)
    rmse_train_glm = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_glm) ** (1 / 2)
    y_validate['appraised_value_pred_glm'] = glm.predict(X_validate)
    rmse_validate_glm = mean_squared_error(
        y_validate.appraised_value, y_validate.appraised_value_pred_glm) ** (1 / 2)

    # Polynomial features
    pf = PolynomialFeatures(degree=2)
    X_train_degree2 = pf.fit_transform(X_train)
    X_validate_degree2 = pf.transform(X_validate)
    X_test_degree2 = pf.transform(X_test)

    # OLS on the degree-2 features
    lm2 = LinearRegression(normalize=True)
    lm2.fit(X_train_degree2, y_train.appraised_value)
    y_train['appraised_value_pred_lm2'] = lm2.predict(X_train_degree2)
    rmse_train_lm2 = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lm2) ** (1 / 2)
    y_validate['appraised_value_pred_lm2'] = lm2.predict(X_validate_degree2)
    rmse_validate_lm2 = mean_squared_error(
        y_validate.appraised_value, y_validate.appraised_value_pred_lm2) ** (1 / 2)

    print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", rmse_train_lm,
          "\nValidation/Out-of-Sample: ", rmse_validate_lm)
    print("--------------------------------------------------------------")
    print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train_lars,
          "\nValidation/Out-of-Sample: ", rmse_validate_lars)
    print("--------------------------------------------------------------")
    print("RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ",
          rmse_train_glm, "\nValidation/Out-of-Sample: ", rmse_validate_glm)
    print("--------------------------------------------------------------")
    print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ", rmse_train_lm2,
          "\nValidation/Out-of-Sample: ", rmse_validate_lm2)
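# A hedged aside (not from the original source): the exponent fix above matters
# because `mse ** 1 / 2` parses as `(mse ** 1) / 2`, not the square root. A small
# helper avoids the pitfall entirely; depending on the scikit-learn version,
# mean_squared_error(..., squared=False) or root_mean_squared_error is also available.
def rmse(y_true, y_pred):
    '''Root mean squared error computed with an explicit, unambiguous exponent.'''
    return mean_squared_error(y_true, y_pred) ** 0.5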
# We will compare the performance of both approaches.
# To quantify the performance of both models, one can compute
# the mean deviance of the train and test data assuming a Compound
# Poisson-Gamma distribution of the total claim amount. This is equivalent to
# a Tweedie distribution with a `power` parameter between 1 and 2.
#
# The :func:`sklearn.metrics.mean_tweedie_deviance` depends on a `power`
# parameter. As we do not know the true value of the `power` parameter, we here
# compute the mean deviances for a grid of possible values, and compare the
# models side by side, i.e. we compare them at identical values of `power`.
# Ideally, we hope that one model will be consistently better than the other,
# regardless of `power`.
glm_pure_premium = TweedieRegressor(power=1.9, alpha=0.1, max_iter=10000)
glm_pure_premium.fit(X_train, df_train["PurePremium"],
                     sample_weight=df_train["Exposure"])

tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999]

scores_product_model = score_estimator(
    (glm_freq, glm_sev),
    X_train,
    X_test,
    df_train,
    df_test,
    target="PurePremium",
    weights="Exposure",
    tweedie_powers=tweedie_powers,
)
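# A hedged illustration (not part of the original example): how a grid of Tweedie
# powers can be used directly with sklearn.metrics.mean_tweedie_deviance, assuming
# y_true, y_pred, and sample_weight arrays for one model are already available.
from sklearn.metrics import mean_tweedie_deviance

def deviance_grid(y_true, y_pred, sample_weight, powers=(1.5, 1.7, 1.8, 1.9, 1.99)):
    '''Mean Tweedie deviance of one set of predictions at several power values.'''
    return {
        p: mean_tweedie_deviance(y_true, y_pred, sample_weight=sample_weight, power=p)
        for p in powers
    }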
# Extract eco data from Sep 2018 to Jan 2020
df_eco_sel = df_eco.loc['2018-09-01':'2020-01-31']

# put together eco and transaction counts for regression
df_all = pd.concat([df_eco_sel, df_period.set_index(df_eco_sel.index)], axis=1)
y_train = df_all['Transaction_Count'].values
X_train = df_all[['CPI', 'Exchange_Rate_USD', 'GDP', 'Unemployment_Rate', 'TSX']]

# generalized linear model with a Poisson distribution
glm = TweedieRegressor(power=1, alpha=0.5, link='log')
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
glm.fit(X_train_scaled, y_train)

# predict eco data for given year and month
df_future = pd.DataFrame(columns=['Date'])
for i, eco_var in enumerate(list(eco_vec_map.keys())):
    print("Forecasting " + eco_var + ' ' + str(Y) + ' ' +
          datetime.strptime(str(M), "%m").strftime("%b"))
    tmp = forecast_eco(df_eco, eco_var, Y, M)
    tmp = tmp[['ds', 'trend']]
    tmp.rename(columns={'ds': 'Date', 'trend': eco_var}, inplace=True)
    df_future = df_future.merge(tmp, on='Date', how='right')

# predict transaction count using the glm model
eco_forecast = df_future.tail(1)[['CPI', 'Exchange_Rate_USD', 'GDP', 'Unemployment_Rate', 'TSX']]
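# A hedged sketch of the final prediction step (an assumption; the original snippet
# ends before eco_forecast is used): the forecasted indicators are scaled with the
# same StandardScaler before being passed to the fitted GLM.
eco_forecast_scaled = scaler.transform(eco_forecast)
predicted_count = glm.predict(eco_forecast_scaled)
print("Predicted transaction count:", predicted_count[0])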
def tweedieregressor(self, X_train, X_test, y_train, y_test):
    # Fit a default TweedieRegressor on the supplied training split and return
    # predictions for the test split. (The original mixed the method arguments
    # with attributes on self; the arguments passed in are used here.)
    regressor = TweedieRegressor()
    regressor.fit(X_train, y_train)
    return regressor.predict(X_test)
axes.tick_params(width=4)
# change all spines
for axis in ['top', 'bottom', 'left', 'right']:
    axes.spines[axis].set_linewidth(6)

#%%
from sklearn.linear_model import TweedieRegressor

X = np.array(x).reshape(-1, 1)
Y = np.array(y)
pr = TweedieRegressor(power=1, alpha=0, fit_intercept=True)
y_pred_pr = pr.fit(X, Y).predict(X)

fig, axes = utils.plot_make(size_length=5)
sns.scatterplot(data=sc_vs_quickness_group_fill, x="sc_LR_mean",
                y="inverse_quickness", linewidth=0, s=100)
sns.lineplot(x=X.flatten(), y=y_pred_pr)
pr.score(X, Y)

#%%
X2 = sm.add_constant(X)
glm = sm.GLM(Y, X2, family=sm.families.Tweedie())
glm_fit = glm.fit()
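# A hedged aside (not in the original script): with alpha=0 the scikit-learn fit is
# unpenalized, so its coefficients should be directly comparable to the statsmodels
# GLM above, provided the Tweedie variance power and link of the two fits match.
print("sklearn:    intercept=%.4f, slope=%.4f" % (pr.intercept_, pr.coef_[0]))
print("statsmodels params:", glm_fit.params)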
def main_Calib(filename, output, mode, alg, basis, order, figure, verbose, offset, qt, pre, split):
    '''
    # main program
    # input: radius: %+.3f, 'str' (in makefile, str is default)
    #        path: file storage path, 'str'
    #        fout: file output name as .h5, 'str' (.h5 not included)
    #        cut_max: cut off of Legendre
    # output: the gathered result EventID, ChannelID, x, y, z
    '''
    if pre != 'r':
        print('begin reading file', flush=True)
        EventID, ChannelID, Q, PETime, photonTime, PulseTime, dETime, x, y, z = pub.ReadFile(filename)
        VertexTruth = (np.vstack((x, y, z)) / 1e3).T
        if offset:
            off = pub.LoadBase(offset)
        else:
            off = np.zeros_like(PMTPos[:, 0])
        print('total event: %d' % np.size(np.unique(EventID)), flush=True)
        print('begin processing legendre coeff', flush=True)

        # this part for the same vertex
        tmp = time.time()
        EventNo = np.size(np.unique(EventID))
        PMTNo = np.size(PMTPos[:, 0])
        if mode == 'PE':
            PMTPosRep = np.tile(PMTPos, (EventNo, 1))
            vertex = np.repeat(VertexTruth, PMTNo, axis=0)
        elif mode == 'time':
            counts = np.bincount(EventID)
            counts = counts[counts != 0]
            PMTPosRep = PMTPos[ChannelID]
            vertex = np.repeat(VertexTruth, counts, axis=0)
        elif mode == 'combined':
            PMTPosRep = np.tile(PMTPos, (EventNo, 1))
            vertex = np.repeat(VertexTruth, PMTNo, axis=0)

        if basis == 'Legendre':
            X, cos_theta = pub.LegendreCoeff(PMTPosRep, vertex, order, Legendre=True)
        elif basis == 'Zernike':
            from zernike import RZern
            cos_theta = pub.LegendreCoeff(PMTPosRep, vertex, order, Legendre=False)
            cart = RZern(order)
            nk = cart.nk
            m = cart.mtab
            n = cart.ntab
            rho = np.linalg.norm(vertex, axis=1) / 0.65
            theta = np.arccos(cos_theta)
            X = np.zeros((rho.shape[0], nk))
            for i in np.arange(nk):
                if not i % 5:
                    print(f'process {i}-th event')
                X[:, i] = cart.Zk(i, rho, theta)
            X = X[:, m >= 0]
            print(f'rank: {np.linalg.matrix_rank(X)}')
        print(f'use {time.time() - tmp} s')

        # which info should be used
        if mode == 'PE':
            y = Q
        elif mode == 'time':
            y = PulseTime
        elif mode == 'combined':
            # PulseTime = PulseTime - np.min(PulseTime)
            # PulseTime = (PulseTime - np.max(PulseTime)/2)/np.max(PulseTime)*2
            # print(np.min(PulseTime), np.max(PulseTime))
            PulseTime = (PulseTime - np.max(PulseTime) / 2) / np.max(PulseTime) * 2
            bins = np.arange(-1, 0.05, 0.1)
            N = 10
            # Legendre coeff
            x = pub.legval(bins, np.eye(N).reshape(N, N, 1))
            # 1st basis
            Y = np.tile(x, len(np.unique(EventID)) * len(np.unique(ChannelID))).T
            # 2nd basis
            X = np.repeat(X, bins.shape[0], axis=0)
            # output
            y = np.zeros((len(np.unique(EventID)), len(np.unique(ChannelID)), len(bins)))
            '''
            basis = np.zeros((X.shape[0], X.shape[1]*Y.shape[1]))
            for i_index, i in enumerate(np.arange(X.shape[1])):
                for j_index, j in enumerate(np.arange(Y.shape[1])):
                    total_index = i_index*Y.shape[1] + j_index
                    if not total_index % 10:
                        print(total_index)
                    basis[:, total_index] = X[:,i_index]*Y[:,j_index]
            X = basis
            '''
            split_index = np.unique(EventID).shape[0]
            for k_index, k in enumerate(np.unique(EventID)):
                # event begin with 1
                if k_index > split_index * split:
                    break
                if not k % 100:
                    print(k)
                index = EventID == k
                CID = ChannelID[index]
                Pulse_t = PulseTime[index]
                for i in np.unique(CID):
                    # PMT begin with 0
                    y[k_index, i, 1:], _ = np.histogram(Pulse_t[CID == i], bins=bins)
            y = np.reshape(y, (-1))

        if verbose:
            print(f'the basis shape is {X.shape}, and the dependent variable shape is {y.shape}')

    if pre == 'w':
        if split != 1:
            split_index = int(split * y.shape[0])
            X = X[:split_index]
            Y = Y[:split_index]
            y = y[:split_index]
        import pandas as pd
        import pyarrow as pa
        import pyarrow.parquet as pq
        y = np.atleast_2d(y).T
        # data = np.hstack((X, y, np.ones_like(y)))
        df_X = pd.DataFrame(X)
        X_names = []
        for i in df_X.columns:
            X_names.append('X' + str(i))
        df_X.columns = X_names

        df_Y = pd.DataFrame(Y)
        Y_names = []
        for i in df_Y.columns:
            Y_names.append('Y' + str(i))
        df_Y.columns = Y_names

        df_y = pd.DataFrame(y)
        df_y.columns = ['output']

        df = pd.concat([df_X, df_Y, df_y], axis=1)
        table = pa.Table.from_pandas(df)
        pq.write_table(table, 'test1.parquet')
        return

    if not pre:
        # Regression methods:
        if alg == 'sms':
            import statsmodels.api as sm
            if mode == 'PE':
                model = sm.GLM(y, X, family=sm.families.Poisson(), fit_intercept=False)
                result = model.fit()
                if verbose:
                    print(result.summary())
                AIC = result.aic
                coef_ = result.params
                std = result.bse
            elif mode == 'time':
                import pandas as pd
                data = pd.DataFrame(data=np.hstack((X, np.atleast_2d(y).T)))
                strs = 'y ~ '
                start = data.keys().start
                stop = data.keys().stop
                step = data.keys().step
                cname = []
                cname.append('X0')
                for i in np.arange(start + 1, stop, step):
                    if i == start + 1:
                        strs += 'X%d ' % i
                    elif i == stop - step:
                        pass
                    else:
                        strs += ' + X%d ' % i
                    if i == stop - step:
                        cname.append('y')
                    else:
                        cname.append('X%d' % i)
                data.columns = cname
                mod = sm.formula.quantreg(strs, data[cname])
                result = mod.fit(q=qt)
                coef_ = result.params
                AIC = np.zeros_like(coef_)
                std = np.zeros_like(coef_)
                print('Warning! No AIC and std value')
            elif mode == 'combined':
                # data = pd.DataFrame(data = np.hstack((basis, np.atleast_2d(y).T)))
                with h5py.File(output, 'w') as out:
                    out.create_dataset('X', data=X)
                    out.create_dataset('Y', data=y)
                print('begin...')
                model = sm.GLM(y, X, family=sm.families.Poisson())
                result = model.fit()
                if verbose:
                    print(result.summary())
                coef_ = result.params
                std = result.bse
                AIC = result.aic
            if verbose:
                print(result.summary())
        elif alg == 'custom':
            from scipy.optimize import minimize
            x0 = np.zeros_like(X[0])  # initial value (be careful of Zernike order)
            if mode == 'PE':
                x0[0] = 0.8 + np.log(2)  # intercept is much more important
                result = minimize(pub.CalibPE, x0=x0, method='SLSQP', args=(y, PMTPos, X))
            elif mode == 'time':
                x0[0] = np.mean(y)
                qt = 0.1
                ts = 2.6
                result = minimize(pub.CalibTime, x0=x0, method='SLSQP',
                                  args=(np.hstack((EventID, EventID)), y, X, qt, ts))
            elif mode == 'combined':
                x0 = np.zeros_like(X[0])
                x0[0] = 0.8 + np.log(2)  # intercept is much more important
                result = minimize(pub.CalibPE, x0=x0, method='SLSQP', args=(y, PMTPos, X))
            coef_ = np.array(result.x, dtype=float)
            if verbose:
                print(result.message)
            AIC = np.zeros_like(coef_)
            std = np.zeros_like(coef_)
            H = pub.MyHessian(result.x, pub.CalibPE, *(y, PMTPos, X))
            # H = pub.MyHessian(result.x, *(Q, PMTPos, X, pub.CalibTime))
            # std = 1/np.sqrt(-np.diag(np.linalg.pinv(H1)))
            print(coef_)
            # print(std)
            print('Warning! No AIC and std value, std is testing')
        elif alg == 'sk':
            from sklearn.linear_model import TweedieRegressor
            alpha = 0.001
            reg = TweedieRegressor(power=1, alpha=alpha, link='log',
                                   max_iter=1000, tol=1e-6, fit_intercept=False)
            reg.fit(X, y)
            # just for point data
            # pred = reg.predict(X[0:30,0:cut+1])
            print('coeff:\n', reg.coef_, '\n')
            coef_ = reg.coef_
            AIC = np.zeros_like(coef_)
            std = np.zeros_like(coef_)
            print('Warning! No AIC and std value')
        elif alg == 'h2o':
            import h2o
            from h2o.estimators.gbm import H2OGradientBoostingEstimator
            from h2o.estimators.glm import H2OGeneralizedLinearEstimator
            if mode != 'combined':
                y = np.atleast_2d(y).T
                data = np.hstack((X, y, np.ones_like(y)))
                h2o.init()
                hf = h2o.H2OFrame(data)
                predictors = hf.columns[0:-2]
                response_col = hf.columns[-2]
            if mode == 'PE':
                # offset_col = hf.columns[-1]
                glm_model = H2OGeneralizedLinearEstimator(family="poisson",
                                                          # offset_column = offset_col,
                                                          lambda_=0,
                                                          compute_p_values=True)
                glm_model.train(predictors, response_col, training_frame=hf)
                coef_table = glm_model._model_json['output']['coefficients_table']
                coef_ = glm_model.coef()
            elif mode == 'time':
                gbm = H2OGradientBoostingEstimator(distribution="quantile",
                                                   seed=1234,
                                                   stopping_metric="mse",
                                                   stopping_tolerance=1e-4)
                gbm.train(x=predictors, y=response_col, training_frame=hf)
                breakpoint()
                print(gbm)
                exit()
            elif mode == 'combined':
                y = np.atleast_2d(y).T
                data = np.hstack((X, Y, y, np.ones_like(y)))
                h2o.init()
                hf = h2o.H2OFrame(data)
                predictors = hf.columns[0:-2]
                response_col = hf.columns[-2]
            if verbose:
                print(coef_)
                if basis == 'Zernike':
                    print(f'Regression coef shape is {np.array(coef_).shape}, Zernike shape is {nk}')
            coef_ = coef_table['coefficients']
            std = coef_table['std_error']
            AIC = glm_model.aic()
            h2o.cluster().shutdown()
    elif pre == 'r':
        import h2o
        from h2o.estimators.gbm import H2OGradientBoostingEstimator
        from h2o.estimators.glm import H2OGeneralizedLinearEstimator
        h2o.init()
        hf = h2o.import_file("electron-1.parquet")
        pairs = []
        for i in hf.columns:
            for j in hf.columns:
                if i.startswith('Z') and j.startswith('L'):
                    if (i != 'X0') and (j != 'Y0'):
                        pairs.append((i, j))
        predictors = hf.columns[2:]
        response_col = hf.columns[0]
        print(predictors)
        print(response_col)
        print(pairs)
        if mode == 'PE':
            # offset_col = hf.columns[-1]
            glm_model = H2OGeneralizedLinearEstimator(family="poisson",
                                                      # offset_column = offset_col,
                                                      lambda_=0,
                                                      compute_p_values=True)
            glm_model.train(predictors, response_col, training_frame=hf)
        elif mode == 'combined':
            # offset_col = hf.columns[-1]
            glm_model = H2OGeneralizedLinearEstimator(family="poisson",
                                                      # offset_column = offset_col,
                                                      interaction_pairs=pairs,
                                                      lambda_=0,
                                                      # remove_collinear_columns = True,
                                                      compute_p_values=True)
            glm_model.train(predictors, response_col, training_frame=hf)
        breakpoint()
        coef_table = glm_model._model_json['output']['coefficients_table']
        coef_ = coef_table['coefficients']
        std = coef_table['std_error']
        AIC = glm_model.aic()
        print(f'Regression coef is {np.array(coef_)}')

    if figure == 'ON':
        import matplotlib.pyplot as plt
        L, K = 500, 500
        ddx = np.linspace(-1.0, 1.0, K)
        ddy = np.linspace(-1.0, 1.0, L)
        xv, yv = np.meshgrid(ddx, ddy)
        cart.make_cart_grid(xv, yv)
        # normal scale
        # im = plt.imshow(np.exp(cart.eval_grid(np.array(coef_), matrix=True)), origin='lower', extent=(-1, 1, -1, 1))
        # log scale
        im = plt.imshow(cart.eval_grid(np.array(coef_), matrix=True), origin='lower', extent=(-1, 1, -1, 1))
        plt.colorbar()
        plt.savefig('test.png')
    else:
        print('error regression algorithm')

    with h5py.File(output, 'w') as out:
        out.create_dataset('coeff' + str(order), data=coef_)
        out.create_dataset('std' + str(order), data=std)
        out.create_dataset('AIC' + str(order), data=AIC)
def tweedie_regression():
    reg = TweedieRegressor(power=1, alpha=0.5, link='log')
    reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2])
    print(reg.coef_)
    print(reg.intercept_)
print('R2 = {}'.format(R2))

# generalized linear model (not working)
from sklearn.linear_model import TweedieRegressor

list = []
for i in np.arange(5, 20):
    dfcorr = df[correlatedvar[:i]]
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler(feature_range=(1, 10))
    dfscal = scaler.fit_transform(dfcorr)
    Y = dfscal[:, 0]
    X = dfscal[:, 1:]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, shuffle=False)
    regr = TweedieRegressor(power=1, alpha=0.5, link='log')
    regr.fit(X_train, y_train)
    prediction = regr.predict(X_test)
    R2 = sklearn.metrics.r2_score(y_test, prediction)
    list.append(R2)
print('optimal amount of variables: {}, R2='.format(list.index(max(list)) + 5), R2)
# max in 12

# polynomial
from sklearn.preprocessing import PolynomialFeatures

list = []
for i in np.arange(2, 10):
    dfcorr = df[correlatedvar[:i]]
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler(feature_range=(1, 10))
    dfscal = scaler.fit_transform(dfcorr)
    Y = dfscal[:, 0]