def clad_ilpa(yvar, xvars, maxiter_ilpa=20, quiet=False, b=None):
    x = np.array(xvars)
    N, K, xvars = labels(x)
    y = np.array(yvar).reshape(N, 1)
    if np.mean(1 * (y > 0)) < 0.5:
        print('Error: More than half of observations are censored. Beta is unidentified')
        return np.nan * np.ones((K, 1))
    if b is None:
        # Use LAD estimates as starting values
        lad = sm.QuantReg(y, x).fit(q=0.5, p_tol=1e-05)
        b = np.array(lad.params).reshape(-1, 1)
    for i in range(maxiter_ilpa):
        b0 = b
        yhat = x @ b
        lad = sm.QuantReg(y[yhat > 0], x[yhat[:, 0] > 0, :]).fit(q=0.5, p_tol=1e-05)
        b = np.array(lad.params).reshape(-1, 1)
        if (b == b0).all():
            # Convergence achieved
            if not quiet:
                print('CLAD finished in %d iterations using ILPA' % i)
                print('Fraction of observations that are censored: ', np.mean(1 * (y == 0)))
                print(lad.summary())
            return np.array(b).reshape(-1, 1)
    return np.nan * np.ones((K, 1))
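# Illustrative usage sketch (not from the original source): estimate CLAD on
# simulated left-censored (Tobit-style) data. clad_ilpa() relies on a module
# level labels() helper; a minimal stand-in is defined here for the sketch.
import numpy as np
import statsmodels.api as sm

def labels(x):
    # Stand-in for the module's labels() helper: returns (N, K, column names).
    N, K = x.shape
    return N, K, ['x%d' % j for j in range(K)]

rng = np.random.default_rng(0)
N = 500
x = np.column_stack([np.ones(N), rng.normal(size=N)])
y_star = x @ np.array([0.5, 1.0]) + rng.normal(size=N)
y = np.maximum(y_star, 0.0)  # left-censoring at zero
beta_hat = clad_ilpa(y, x, quiet=True)
print(beta_hat)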
def qrTraining(horizon, inpEndo, inpExo, tar):
    '''
    Arguments:
    - horizon: The forecast horizon for which a model should be learned.
    - inpEndo: A pandas DataFrame containing the endogenous inputs.
    - inpExo: A pandas DataFrame containing the exogenous inputs.
    - tar: A pandas DataFrame containing the targets.

    Returns:
    - Stored .pickle files containing the QR parameters, one for each
      horizon and nominal probability.
    '''
    taus = np.arange(0.1, 0.91, 0.1)

    # Training months. Months 1, 7 and 8 were removed to speed things up and
    # because they are less relevant.
    tr_m = [2, 3, 5, 6]
    # Test month: April
    te_m = 4

    cols = ["{}_{}".format("t", horizon)]

    train = inpEndo[inpEndo.index.month.isin(tr_m)]
    train = train.join(inpExo[inpExo.index.month.isin(tr_m)], how="inner")
    train = train.join(tar[tar.index.month.isin(tr_m)], how="inner")

    test = inpEndo[inpEndo.index.month == te_m]
    test = test.join(inpExo[inpExo.index.month == te_m], how="inner")
    test = test.join(tar[tar.index.month == te_m], how="inner")

    feature_cols = inpEndo.filter(regex='y').columns.tolist()
    feature_cols_endo = inpEndo.filter(regex='y').columns.tolist()
    feature_cols.extend([
        "Temperature_{}".format(horizon),
        "TotalCloudCover_{}".format(horizon)
    ])  # ,"WindUMS_{}".format(horizon),"WindVMS_{}".format(horizon)

    train = train[cols + feature_cols].dropna(how="any")
    test = test[cols + feature_cols].dropna(how="any")

    train_X = train[feature_cols].values
    test_X = test[feature_cols].values
    # scaler = preprocessing.StandardScaler().fit(train_X)
    # train_X = scaler.transform(train_X)
    train_y = train[cols].values
    test_y = test[cols].values

    # Perhaps add some jitter (random noise) to avoid a singular matrix:
    # Xtra_jitter = np.random.normal(1 * Xtra, 0.01)

    quantreg = sm.QuantReg(train_y, train_X)
    tau = 1
    for q in taus:
        res = quantreg.fit(q=q, max_iter=10000)
        # res.save("{}_{}_{}_{}.{}".format("ForecastModels\\qr", horizon, "tau", tau, "pickle"))
        res.save(
            os.path.join(
                FORECASTMODELS,
                "{}_{}_{}_{}.{}".format("qr", horizon, "tau", tau, "pickle")))
        tau += 1
def QR_fit(y, q=0.5):
    # QR fit for dtrend: should be constant, i.e. no slope in dtrend
    # dtrend model: y = slope
    X = np.ones(len(y))
    res = sm.QuantReg(y, X).fit(q)
    slope = res.params[0]
    resid = res.resid
    std = np.std(resid)
    mean = np.mean(resid)
    print('std: ' + str(np.round(std, 4)) + ' mean: ' + str(np.round(mean, 4)))
    return slope
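# Quick sanity check (illustrative, not part of the original source): with an
# intercept-only design, the q-th quantile regression coefficient is
# approximately the q-th sample quantile of y, so QR_fit at q=0.5 should be
# close to the sample median.
import numpy as np

y = np.random.default_rng(1).normal(size=1000)
print(QR_fit(y, q=0.5))
print(np.median(y))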
def fit_model(tr_df, controls, ivs, dv, model_class):
    X = tr_df[['const'] + list(controls) + list(ivs)]
    y = tr_df[dv]
    try:
        if model_class == 'logreg':
            model = sm.Logit(y, X, missing='drop', hasconst=True)
            res = model.fit()
        elif model_class == 'ols':
            model = sm.OLS(y, X, missing='drop', hasconst=True)
            res = model.fit()
        elif model_class == 'qr':
            # Median regression
            model = sm.QuantReg(y, X, missing='drop', hasconst=True)
            res = model.fit(q=0.5)
        else:
            raise Exception('Do not recognize model "{}"'.format(model_class))
    except Exception as ex:
        print(ex)
        return None, None
    return res, model
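# Hypothetical call (the column names are made up for illustration). Note that
# the frame must already carry a 'const' column, since hasconst=True is passed.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'const': 1.0,
                   'age': rng.normal(40, 10, 200),
                   'treated': rng.integers(0, 2, 200)})
df['score'] = 2.0 + 0.1 * df['age'] + rng.normal(size=200)
res, model = fit_model(df, controls=['age'], ivs=['treated'], dv='score',
                       model_class='qr')
if res is not None:
    print(res.params)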
def stats(predictor, response, model):
    # Applies the statistical model named by `model` to the input variables;
    # the code for each supported model is shown in the chain of if statements.
    # Note: statsmodels expects (endog, exog), i.e. the response comes first.
    predictor = np.asarray(predictor)
    response = np.asarray(response)
    if model == 'logit':
        model = sm.Logit(response, predictor)
    elif model == 'lsr':
        model = sm.OLS(response, predictor)
    elif model == 'probit':
        model = sm.Probit(response, predictor)
    elif model == 'gls':
        model = sm.GLS(response, predictor)
    elif model == 'glsar':
        model = sm.GLSAR(response, predictor)
    elif model == 'quantreg':
        model = sm.QuantReg(response, predictor)
    else:
        raise ValueError('Unknown model code: {}'.format(model))
    model = model.fit()
    print(model.summary())
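# Illustrative call with synthetic data, using the (endog, exog) order fixed
# above: the design matrix goes in as `predictor`, the outcome as `response`.
import numpy as np

rng = np.random.default_rng(0)
x = np.column_stack([np.ones(100), rng.normal(size=100)])
y = x @ np.array([1.0, 2.0]) + rng.normal(size=100)
stats(x, y, 'quantreg')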
def cross_validation(features, data, y, model=None, quantile=None,
                     max_iter=1000, p_tol=1e-6, logistic=False):
    is_statsmodel = (model is None)
    yrs = data["Draft Year"].unique()
    result_df = pd.DataFrame()

    # Initialize empty containers for the model predictions and the actual Y.
    model_predicted = np.array([])
    actual_values = np.array([])
    if logistic:
        model_proba = np.array([])

    # Iterate through each year (leave-one-draft-year-out cross-validation)
    for yr in yrs:
        train = data[data["Draft Year"] != yr]
        test = data[data["Draft Year"] == yr]

        train_columns = train[features]
        X_train = np.array(train_columns.values.tolist())
        test_columns = test[features]
        X_test = np.array(test_columns.values.tolist())
        Y_train = np.array(train[[y]].values.tolist())
        Y_test = np.array(test[[y]].values.tolist())

        res = None
        if is_statsmodel:
            model = sm.QuantReg(Y_train, train_columns)
            res = model.fit(q=quantile, max_iter=max_iter, p_tol=p_tol)
            Y_pred = model.predict(res.params, exog=test_columns)
        else:
            model.fit(X_train, Y_train.ravel())
            Y_pred = model.predict(X_test)
            if logistic:
                Y_proba = model.predict_proba(X_test)
                Y_proba = [sample[1] for sample in Y_proba]

        # Append predictions
        model_predicted = np.append(model_predicted, Y_pred)
        if logistic:
            model_proba = np.append(model_proba, Y_proba)
        actual_values = np.append(actual_values, Y_test)

        # Append to our result dataframe to export later
        test = test[['Name', "Draft Year"] + features]
        # test["Model Projected Average PPR Points Per First 48 Games"] = inv_boxcox(Y_pred, value)
        if logistic:
            test["Model"] = Y_proba
            test["Actual"] = Y_test
        if not logistic:
            test["Model"] = Y_pred
            test["Residual"] = np.abs(np.subtract(Y_pred.flatten(), Y_test.flatten()))
        # DataFrame.append was removed in pandas 2.0, so use pd.concat instead
        result_df = pd.concat([result_df, test])

    # Compute overall metrics on the pooled predictions: adjusted r^2 and RMSE,
    # or, for the logistic case, log loss and F1 score (returned in the same slots)
    if logistic:
        model_r_2 = log_loss(actual_values, model_predicted)
        model_rmse = f1_score(actual_values, model_predicted)
    else:
        model_r_2 = 1 - (1 - r2_score(actual_values, model_predicted)) * (
            (len(model_predicted) - 1) / (len(model_predicted) - X_train.shape[1] - 1))
        model_rmse = mean_squared_error(actual_values, model_predicted, squared=False)

    return model_rmse, model_r_2, result_df, res
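# Hypothetical call with synthetic data (column names mirror what the function
# expects: 'Name', 'Draft Year', plus the feature columns); quantile=0.5 runs
# a leave-one-draft-year-out median regression.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 200
df = pd.DataFrame({'Name': ['Player %d' % i for i in range(n)],
                   'Draft Year': rng.integers(2015, 2020, n),
                   'const': 1.0,
                   'college_ppg': rng.normal(15, 5, n)})
df['nfl_ppr'] = 0.6 * df['college_ppg'] + rng.normal(scale=2.0, size=n)

rmse, r2, results, res = cross_validation(features=['const', 'college_ppg'],
                                           data=df, y='nfl_ppr', quantile=0.5)
print(rmse, r2)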
def qr_train_forecast(horizon):
    '''
    Reads the stored GBRT forecasts and observations for "horizon", fits a
    quantile regression for every nominal probability in `taus`, and writes
    the resulting quantile forecasts (and the matching observations) to
    tab-separated files.

    Arguments:
    - horizon: the forecast horizon.

    Returns:
    - Nothing; the forecasts, with shape (test time steps x len(taus)), are
      written to disk.
    '''
    taus = np.arange(0.05, 0.96, 0.05)
    # taus = np.linspace(0.001, 0.999, num=21)  # quantreg does not accept 0

    # Months used to train the QR on the uncalibrated GBRT forecasts
    tr_m = [5]
    # Months used to test the QR on the uncalibrated GBRT forecasts
    te_m = [4]

    preds = []

    # Select the GBRT forecasts
    gbrt_fc_str = os.path.join(FORECASTS, "{}_{}.{}".format("gbrt", horizon, "txt"))
    gbrt_fc = pd.read_csv(gbrt_fc_str, sep="\t", parse_dates=True)
    gbrt_fc['DateTime'] = pd.to_datetime(gbrt_fc['DateTime'])
    gbrt_fc = gbrt_fc.set_index(pd.DatetimeIndex(gbrt_fc['DateTime']))
    gbrt_fc = gbrt_fc.drop(['DateTime'], axis=1)

    # Select the matching observations
    gbrt_ob_str = os.path.join(OBSERVATIONS, "{}_{}.{}".format("obs", horizon, "txt"))
    gbrt_ob = pd.read_csv(gbrt_ob_str, sep="\t", parse_dates=True)
    gbrt_ob['DateTime'] = pd.to_datetime(gbrt_ob['DateTime'])
    gbrt_ob = gbrt_ob.set_index(pd.DatetimeIndex(gbrt_ob['DateTime']))
    gbrt_ob = gbrt_ob.drop(['DateTime'], axis=1)

    train = gbrt_fc[gbrt_fc.index.month.isin(tr_m)]
    train = train.join(gbrt_ob[gbrt_ob.index.month.isin(tr_m)], how="inner")
    test = gbrt_fc[gbrt_fc.index.month.isin(te_m)]
    test = test.join(gbrt_ob[gbrt_ob.index.month.isin(te_m)], how="inner")

    cols = ["{}_{}".format("t", horizon)]
    feature_cols = gbrt_fc.columns.tolist()  # Take the forecast columns as feature names

    train = train[cols + feature_cols].dropna(how="any")
    test = test[cols + feature_cols].dropna(how="any")

    train_X = train[feature_cols].values
    test_X = test[feature_cols].values
    train_y = train[cols].values
    test_y = test[cols]  # Keep as a pandas object for saving

    tau = 1
    test_pred = []
    quantreg = sm.QuantReg(train_y, train_X)
    for q in taus:
        model = quantreg.fit(q=q, max_iter=10000)
        test_pred.append(model.predict(test_X))
        tau += 1

    tmp = np.vstack(test_pred).T  # List to NumPy array
    fc_df = pd.DataFrame(data=tmp, index=test.index)  # To store as pandas DataFrame
    fc_df.to_csv(os.path.join(FORECASTS, "{}_{}.{}".format("qr", horizon, "txt")), sep="\t")
    test_y.to_csv(os.path.join(OBSERVATIONS, "{}_{}.{}".format("qr_obs", horizon, "txt")), sep="\t")
def _FittingFunctions(input_ax, input_X, input_Y, Pred_type, Clean_type,
                      Visualization, test_frac_size, color, random_Seed):
    pred_summary = {}
    pred_summary["y_pred"] = 0
    pred_summary["groundtruth"] = 0
    pred_summary["y_pred_95confi"] = None

    if Pred_type is None:
        if Visualization:
            input_X, input_Y, X_train, y_train, X_test, y_test = PredictOppoBehavior._utilityCleaning(
                input_X, input_Y, Clean_type, random_Seed, test_frac_size)
            input_ax.plot(input_X, input_Y, 'ok', ms=4, color=color, alpha=0.5)
        else:
            pass

    ## Linear regression
    elif Pred_type == "LR":
        ## Train/test split
        input_X, input_Y, X_train, y_train, X_test, y_test = PredictOppoBehavior._utilityCleaning(
            input_X, input_Y, Clean_type, random_Seed, test_frac_size)
        ## PolynomialFeatures with order 1
        features = PolynomialFeatures(degree=1, include_bias=True)
        X_feature = features.fit_transform(input_X[:-1])
        regress = LinearRegression()
        regress.fit(X_feature, input_Y[:-1])
        y_model = regress.predict(X_feature)
        ## Predict next utility
        next_X = np.atleast_2d(np.array(input_X[-1])).T
        next_X_feature = features.fit_transform(next_X)
        y_pred = regress.predict(next_X_feature)
        if Visualization:
            input_ax.plot(input_X[:-1], input_Y[:-1], 'ok', ms=4, color=color, alpha=0.5)
            input_ax.plot(input_X[-1], input_Y[-1], 'v', ms=8, color=color, alpha=0.5)
            input_ax.plot(input_X[:-1], y_model, c='k')
            input_ax.scatter(next_X, y_pred, color="black", s=50)
        pred_summary["y_pred"] = list(y_pred)[0]
        pred_summary["groundtruth"] = input_Y[-1]
        ## Save the prediction and 95% confidence prediction in pred_summary
        pred_summary = PredictOppoBehavior._save_pred_summary(
            pred_summary, input_Y[-1], list(y_pred)[0], None)
        ## Record the slope
        init_X = np.atleast_2d(np.array(input_X[0])).T
        init_X_feature = features.fit_transform(init_X)
        y_init = regress.predict(init_X_feature)
        slope = (y_pred - y_init) / (next_X - init_X)
        pred_summary["slope"] = slope

    ## Non-linear regression
    elif Pred_type == "NLR":
        input_X, input_Y, X_train, y_train, X_test, y_test = PredictOppoBehavior._utilityCleaning(
            input_X, input_Y, Clean_type, random_Seed, test_frac_size)
        ## PolynomialFeatures with order 3 (default; can be tuned for other projects)
        features = PolynomialFeatures(degree=3, include_bias=True)
        X_feature = features.fit_transform(input_X[:-1])
        regress = LinearRegression()
        regress.fit(X_feature, input_Y[:-1])
        y_model = regress.predict(X_feature)
        ## Predict next utility
        next_X = np.atleast_2d(np.array(input_X[-1])).T
        next_X_feature = features.fit_transform(next_X)
        y_pred = regress.predict(next_X_feature)
        if Visualization:
            input_ax.plot(input_X[:-1], input_Y[:-1], 'ok', ms=4, color=color, alpha=0.5)
            input_ax.plot(input_X[-1], input_Y[-1], 'v', ms=8, color=color, alpha=0.5)
            input_ax.plot(input_X[:-1], y_model, c='k')
            input_ax.scatter(next_X, y_pred, color="black", s=50)
        pred_summary["y_pred"] = list(y_pred)[0]
        pred_summary["groundtruth"] = input_Y[-1]
        ## Save the prediction and 95% confidence prediction in pred_summary
        pred_summary = PredictOppoBehavior._save_pred_summary(
            pred_summary, input_Y[-1], list(y_pred)[0], None)

    ## Gaussian process regression
    elif Pred_type == "GPR":
        input_X, input_Y, X_train, y_train, X_test, y_test = PredictOppoBehavior._utilityCleaning(
            input_X, input_Y, Clean_type, random_Seed, test_frac_size)
        ## The default kernel is RationalQuadratic (can be tuned for other projects)
        kernels = [
            1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-1, 10.0)),
            1.0 * RationalQuadratic(length_scale=1.0, alpha=0.1),
            1.0 * ExpSineSquared(length_scale=1.0, periodicity=3.0,
                                 length_scale_bounds=(0.1, 10.0),
                                 periodicity_bounds=(1.0, 10.0)),
            1.0 * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0), nu=1.5)
        ]
        ## Instantiate a Gaussian process model
        gp = GaussianProcessRegressor(kernel=kernels[1])
        ## Fit to data
        gp.fit(X_train, y_train)
        ## Make the prediction
        y_pred_fit, sigma = gp.predict(input_X, return_std=True)
        ## Predict next utility
        next_X = np.atleast_2d(np.array(input_X[-1])).T
        y_pred = gp.predict(next_X)
        if Visualization:
            input_ax.plot(input_X[-1], input_Y[-1], 'ok', ms=4, color=color, alpha=0.5)
            input_ax.plot(X_train, y_train, 'ok', ms=4, color="r", alpha=0.5, label="Observation")
            input_ax.plot(X_test, y_test, 'ok', ms=4, color=color, alpha=0.5)
            input_ax.plot(input_X, y_pred_fit, 'k--', alpha=0.5, label='Prediction')
            input_ax.fill(np.concatenate([input_X, input_X[::-1]]),
                          np.concatenate([y_pred_fit - 1.9600 * sigma,
                                          (y_pred_fit + 1.9600 * sigma)[::-1]]),
                          alpha=.3, fc=color, ec='None', label='95% confidence interval')
            input_ax.plot(X_train, y_train, 'ok', ms=4, color="r", alpha=0.5, label="Observation")
            input_ax.scatter(next_X, y_pred, color="black", s=50)
        ## Save the prediction and 95% confidence prediction in pred_summary
        pred_summary = PredictOppoBehavior._save_pred_summary(
            pred_summary, input_Y[-1], list(y_pred)[0], (y_pred + 1.9600 * sigma)[-1])

    ## Gaussian process regression with random noise
    elif Pred_type == "GPRN":
        input_X, input_Y, X_train, y_train, X_test, y_test = PredictOppoBehavior._utilityCleaning(
            input_X, input_Y, Clean_type, random_Seed, test_frac_size)
        ## The default kernel is RationalQuadratic (can be tuned for other projects)
        kernels = [
            1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-1, 10.0)),
            1.0 * RationalQuadratic(length_scale=1.0, alpha=0.1),
            1.0 * ExpSineSquared(length_scale=1.0, periodicity=3.0,
                                 length_scale_bounds=(0.1, 10.0),
                                 periodicity_bounds=(1.0, 10.0)),
            1.0 * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0), nu=1.5)
        ]
        ## Random noise in the range (0, 0.05) (default; can be tuned for other projects)
        dy = 0.0 + 0.05 * np.random.random(y_train.shape)
        noise = np.random.normal(0, dy)
        y_train += noise
        gp = GaussianProcessRegressor(kernel=kernels[1], alpha=dy**2)
        ## Fit to data
        gp.fit(X_train, y_train)
        ## Make the prediction
        y_pred_fit, sigma = gp.predict(input_X, return_std=True)
        ## Predict next utility
        next_X = np.atleast_2d(np.array(input_X[-1])).T
        y_pred = gp.predict(next_X)
        if Visualization:
            input_ax.plot(input_X[-1], input_Y[-1], 'ok', ms=4, color=color, alpha=0.5)
            input_ax.plot(X_train, y_train, 'ok', ms=4, color="r", alpha=0.5, label="Observation")
            input_ax.errorbar(X_train, y_train, dy, fmt='r.', markersize=4, label='Observations')
            input_ax.plot(X_test, y_test, 'ok', ms=4, color=color, alpha=0.5)
            input_ax.plot(input_X, y_pred_fit, 'k--', alpha=0.5, label='Prediction')
            input_ax.fill(np.concatenate([input_X, input_X[::-1]]),
                          np.concatenate([y_pred_fit - 1.9600 * sigma,
                                          (y_pred_fit + 1.9600 * sigma)[::-1]]),
                          alpha=.3, fc=color, ec='None', label='95% confidence interval')
            input_ax.plot(X_train, y_train, 'ok', ms=4, color="r", alpha=0.5, label="Observation")
            input_ax.scatter(next_X, y_pred, color="black", s=50)
        pred_summary = PredictOppoBehavior._save_pred_summary(
            pred_summary, input_Y[-1], list(y_pred)[0], (y_pred + 1.9600 * sigma)[-1])

    # elif Pred_type == "OLS":
    #     input_X, input_Y, X_train, y_train, X_test, y_test = PredictOppoBehavior._utilityCleaning(
    #         input_X, input_Y, Clean_type, random_Seed, test_frac_size)
    #     import statsmodels.api as sm
    #     from statsmodels.regression.quantile_regression import QuantReg
    #     from statsmodels.sandbox.regression.predstd import wls_prediction_std
    #     input_X = np.array(input_X)
    #     input_Y = np.array(input_Y)
    #     input_X2 = sm.add_constant(input_X)
    #     est = sm.OLS(input_Y[:-1], input_X2[:-1])
    #     results = est.fit()
    #     if Visualization:
    #         print(results.summary())
    #         input_ax.plot(input_X[:-1], input_Y[:-1], 'ok', ms=4, color=color, alpha=0.5)
    #         input_ax.plot(input_X[-1], input_Y[-1], 'v', ms=8, color=color, alpha=0.5)
    #         input_ax.plot(input_X[:-1], results.predict(), label='predicted', color='k')
    #         sdev, lower, upper = wls_prediction_std(results, exog=input_X2[:-1], alpha=0.05)
    #         input_ax.plot(input_X[:-1], upper, color='#888888', alpha=1)
    #         input_ax.plot(input_X[:-1], lower, color='#888888', alpha=1)

    ## Predictability check by quantile regression
    elif Pred_type == "QuantReg":
        input_X, input_Y, X_train, y_train, X_test, y_test = PredictOppoBehavior._utilityCleaning(
            input_X, input_Y, Clean_type, random_Seed, test_frac_size)
        import statsmodels.api as sm
        from statsmodels.regression.quantile_regression import QuantReg
        from statsmodels.sandbox.regression.predstd import wls_prediction_std
        input_X = np.array(input_X)
        input_Y = np.array(input_Y)
        input_X2 = sm.add_constant(input_X)
        est = sm.QuantReg(input_Y[:-1], input_X2[:-1])
        results = est.fit()
        ## Build the model for the other quantiles
        quantiles = np.array((0.1, 0.5, 0.9))
        models = []
        for qt in quantiles:
            res = est.fit(q=qt)
            models.append(res)
        y_pred1 = models[0].params[0] + models[0].params[1] * input_X[:-1]
        y_pred2 = models[1].params[0] + models[1].params[1] * input_X[:-1]
        y_pred3 = models[2].params[0] + models[2].params[1] * input_X[:-1]
        if Visualization:
            print(results.summary())
            input_ax.plot(input_X[:-1], input_Y[:-1], 'ok', ms=4, color=color, alpha=0.5)
            input_ax.plot(input_X[-1], input_Y[-1], 'v', ms=8, color=color, alpha=0.5)
            input_ax.scatter(input_X[-1], results.predict(input_X2[-1]), color="black", s=50)
            input_ax.plot(input_X[:-1], y_pred1, color='#888888', alpha=1, label='Q Reg : 0.1')
            input_ax.plot(input_X[:-1], y_pred2, color='k', alpha=1, label='Q Reg : 0.5')
            input_ax.plot(input_X[:-1], y_pred3, color='#888888', alpha=1, label='Q Reg : 0.9')
            input_ax.set_title("Predictability check with quantile regression: 0.1, 0.5 and 0.9")
        ## Record slopes for the predictability check
        pred_summary["slope"] = []
        upper_y_pred09 = models[2].params[0] + models[2].params[1] * input_X[-1]
        upper_y_init09 = models[2].params[0] + models[2].params[1] * input_X[0]
        slope09 = (upper_y_pred09 - upper_y_init09) / (input_X[-1] - input_X[0])
        pred_summary["slope"].append(slope09)
        upper_y_pred05 = models[1].params[0] + models[1].params[1] * input_X[-1]
        upper_y_init05 = models[1].params[0] + models[1].params[1] * input_X[0]
        slope05 = (upper_y_pred05 - upper_y_init05) / (input_X[-1] - input_X[0])
        pred_summary["slope"].append(slope05)
        pred_summary["y_pred"] = list(upper_y_pred05)[0]
        pred_summary["groundtruth"] = input_Y[-1]

    #### Exploratory data analysis (EDA)
    ## Moving average
    elif Pred_type == "MA" and Visualization:
        input_ax.plot(input_X[:-1], input_Y[:-1], 'ok', ms=1, color=color, alpha=0.1)
        input_ax.plot(input_X[-1], input_Y[-1], 'v', ms=8, color=color, alpha=0.5)
        input_Y_pd = pd.DataFrame(input_Y)
        plot_moving_average(input_ax, input_Y_pd, 90, color, plot_intervals=True, scale=1.96)

    ## Exponential smoothing
    elif Pred_type == "ES" and Visualization:
        input_ax.plot(input_X[:-1], input_Y[:-1], 'ok', ms=1, color=color, alpha=0.3)
        input_ax.plot(input_X[-1], input_Y[-1], 'v', ms=8, color=color, alpha=0.5)
        ## (default; can be tuned for other projects)
        plot_exponential_smoothing(input_ax, input_Y, [0.05])

    ## Double exponential smoothing
    elif Pred_type == "DES" and Visualization:
        input_ax.plot(input_X[:-1], input_Y[:-1], 'ok', ms=1, color=color, alpha=0.3)
        input_ax.plot(input_X[-1], input_Y[-1], 'v', ms=8, color=color, alpha=0.5)
        ## (default; can be tuned for other projects)
        plot_double_exponential_smoothing(input_ax, input_Y, alphas=[0.02], betas=[0.02])

    elif Pred_type == "PASS" and Visualization:
        pass

    return pred_summary
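# For context, a self-contained sketch of the quantile-regression
# "predictability check" used in the QuantReg branch above: fit the 0.1, 0.5
# and 0.9 quantile lines on synthetic data and read off their slopes. This is
# illustrative only and independent of the PredictOppoBehavior class.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
x = np.linspace(0, 1, 200)
y = 0.5 + 0.8 * x + rng.normal(scale=0.1, size=x.size)

est = sm.QuantReg(y, sm.add_constant(x))
for q in (0.1, 0.5, 0.9):
    res = est.fit(q=q)
    intercept, slope = res.params
    print('q=%.1f: intercept=%.3f, slope=%.3f' % (q, intercept, slope))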
X = lsmod.model.wexog
y = lsmod.model.wendog
rlmod = sm.RLM(y, X).fit()
rlmod.summary()
#
wts = rlmod.weights
wts[wts < 1]
#
l1mod = sm.QuantReg(y, X).fit()
l1mod.summary()
# ### High Breakdown Estimators
#
import faraway.datasets.star
star = faraway.datasets.star.load()
gs1 = smf.ols('light ~ temp', star).fit()
X = gs1.model.wexog
gs2 = sm.RLM(star.light, X, data=star).fit()
gs3 = smf.ols('light ~ temp', star.loc[star.temp > 3.6, :]).fit()
plt.scatter(star.temp, star.light, label=None)
xr = np.array([min(star.temp), max(star.temp)])
plt.plot(xr, gs1.params[0] + gs1.params[1] * xr, 'k-', label="OLS")
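# Side note on the l1mod line above: QuantReg.fit() defaults to q=0.5, so
# sm.QuantReg(y, X).fit() is the least absolute deviations (L1 / median
# regression) fit; an explicit sm.QuantReg(y, X).fit(q=0.5) is equivalent.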
def covar(banks, year_from, quarter_from, year_to, quarter_to):
    # Compute the portfolio system return
    psr = portfolio_system_return(banks, year_start=2000, year_end=2015)

    # Compute the B coefficients of Xsys = a + B * X
    # Build the y vector by filtering the quarters
    mask = (psr['Year'] == year_from) & (psr['Quarter'] == quarter_from)
    start_index = psr[mask].index[0]
    mask = (psr['Year'] == year_to) & (psr['Quarter'] == quarter_to)
    end_index = psr[mask].index[0]
    y = psr['PSR'].iloc[start_index:end_index + 1]
    y.reset_index(drop=True, inplace=True)
    y.name = 'PSR'

    # Build the X matrix
    X = pd.DataFrame()
    for b in banks:
        mask = (b.mva['Year'] == year_from) & (b.mva['Quarter'] == quarter_from)
        if mask.any():
            start_index = b.mva[mask].index[0]
            mask = (b.mva['Year'] == year_to) & (b.mva['Quarter'] == quarter_to)
            if mask.any():
                end_index = b.mva[mask].index[0]
                s = b.mva['DELTA_MVA'].iloc[start_index:end_index + 1]
                s.reset_index(drop=True, inplace=True)
                s.name = b.ticker
                X.reset_index(drop=True, inplace=True)
                X = pd.concat([X, s], axis=1)

    # Run the quantile regression for each bank
    covar_unc_matrix = pd.DataFrame(columns=['Ticker', 'Beta', 'COVAR', 'VAR_0.01', 'VAR_0.5'])
    for ticker in X.columns.values:
        x = X[ticker]
        if not x.isnull().values.sum():
            x = sm.add_constant(x)
            model = sm.QuantReg(y, x)
            res = model.fit(q=0.01)
            x_pred = [1, X[ticker].quantile(q=0.01)]
            # Compute the CoVaR (np.float was removed from NumPy, use float instead)
            covar = float(res.predict(x_pred)[0])
            # DataFrame.append was removed in pandas 2.0, so use pd.concat instead
            covar_unc_matrix = pd.concat([
                covar_unc_matrix,
                pd.DataFrame([{'Ticker': ticker,
                               'Beta': res.params[ticker],
                               'COVAR': covar,
                               'VAR_0.01': X[ticker].quantile(q=0.01),
                               'VAR_0.5': X[ticker].quantile(q=0.5)}])
            ], ignore_index=True)

    # Compute the unconditional delta CoVaR
    covar_unc_matrix['DELTA_COVAR_UNC'] = covar_unc_matrix['Beta'] * (
        covar_unc_matrix['VAR_0.01'] - covar_unc_matrix['VAR_0.5'])

    # Load the state variables
    states_variables = get_states_variable()

    # Build the part of X with the system (state) variables and call it X2
    mask = (states_variables['Year'] == year_from) & (states_variables['Quarter'] == quarter_from)
    start_index = states_variables[mask].index[0]
    mask = (states_variables['Year'] == year_to) & (states_variables['Quarter'] == quarter_to)
    end_index = states_variables[mask].index[0]
    start_index -= 1
    X2 = states_variables.iloc[start_index:end_index]
    X2 = X2[['V2X Index', 'SX7P Index', 'Spr_Liq_St', 'Incl_curv_rend',
             'var_t-bill_3M', 'credit_spread']]
    X2.reset_index(drop=True, inplace=True)

    # Initialize the CoVaR matrix
    covar_matrix = pd.DataFrame(columns=['Ticker', 'Beta', 'COVAR', 'VAR_0.01',
                                         'VAR_0.5', 'DELTA_COVAR'])

    # Run the OLS regression for each bank
    for ticker in X.columns.values:
        X1 = X[ticker]
        if not X1.isnull().values.sum():
            # Prepare the inputs for the OLS regression; y does not change
            # Prepare the X matrix
            X1_X2 = pd.concat([X1, X2], axis=1)
            X1_X2 = sm.add_constant(X1_X2)
            # Run the regression
            model = sm.OLS(y, X1_X2)
            results = model.fit()
            # Prepare X1 and X2 for prediction
            X1_pred = pd.Series(X[ticker].quantile(q=0.01), name=ticker)
            X2_pred = X2.iloc[-1]
            X2_pred = pd.DataFrame(X2_pred).transpose()
            X2_pred.reset_index(drop=True, inplace=True)
            X1_X2_pred = pd.concat([X1_pred, X2_pred], axis=1, ignore_index=True)
            X1_X2_pred = sm.add_constant(X1_X2_pred)
            # Compute the CoVaR
            covar = results.predict(X1_X2_pred)
            # Store the CoVaR
            covar_matrix = pd.concat([
                covar_matrix,
                pd.DataFrame([{'Ticker': ticker,
                               'COVAR': covar[0],
                               'Beta': results.params[ticker],
                               'VAR_0.01': X[ticker].quantile(q=0.01),
                               'VAR_0.5': X[ticker].quantile(q=0.5)}])
            ], ignore_index=True)

    covar_matrix['DELTA_COVAR'] = covar_matrix['Beta'] * (
        covar_matrix['VAR_0.01'] - covar_matrix['VAR_0.5'])
    return covar_unc_matrix, covar_matrix
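# Compact, self-contained sketch of the unconditional CoVaR step above on
# synthetic data (the bank/system series are stand-ins, not the original
# inputs): regress the system return on a bank's return at the 1% quantile,
# then evaluate at the bank's 1% and 50% VaR.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
n = 400
x_bank = rng.normal(size=n)                           # stand-in for DELTA_MVA
y_sys = 0.4 * x_bank + rng.normal(scale=0.5, size=n)  # stand-in for PSR

res = sm.QuantReg(y_sys, sm.add_constant(x_bank)).fit(q=0.01)
var_001, var_05 = np.quantile(x_bank, 0.01), np.quantile(x_bank, 0.5)
covar_unc = res.predict([1, var_001])[0]  # system quantile when the bank sits at its 1% VaR
delta_covar_unc = res.params[1] * (var_001 - var_05)
print(covar_unc, delta_covar_unc)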
def __init__(self, x, y, args):
    super(LinearQR, self).__init__()
    self.model = sm.QuantReg(y, x)
    self.alpha = args.alpha
    self.model_name = "LinearQR"
def construct_quantile_regression_models(self, years, week, lags=6, future_intervals=3):
    """Construct quantile regression models for each DUID, future interval and quantile"""

    # Construct dataset
    lagged, future = self.construct_dataset(years, week)

    # DUIDs
    duids = list(set([i.split('_future')[0] for i in future.columns]))
    duids.sort()

    # Container for quantile regression results
    results = {}

    # Run model for each DUID
    for duid in duids:
        results[duid] = {}

        # Lagged values
        x = pd.concat([lagged.loc[:, f'{duid}_lag_{i}'] for i in range(0, lags + 1)], axis=1)
        x = x.dropna()

        # For each future interval
        for f in range(1, future_intervals + 1):
            results[duid][f] = {}

            # Split independent and dependent variables
            y = future[f'{duid}_future_{f}']
            y = y.dropna()

            # Ensure the indices match
            new_index = y.index.intersection(x.index).sort_values()
            x = x.reindex(new_index)
            y = y.reindex(new_index)

            # Run model for each quantile
            for q in [0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9]:
                # print(f'Fitting model: duid={duid}, future_interval={f}, quantile={q}')
                try:
                    # Construct and fit model
                    m = sm.QuantReg(y, x)
                    res = m.fit(q=q)

                    # Make prediction for the last time point
                    last_observation = lagged.loc[:, [f'{duid}_lag_{i}'
                                                      for i in range(0, lags + 1)]].iloc[-1].values
                    pred = res.predict(last_observation)[0]
                    results[duid][f][q] = pred

                except ValueError:
                    results[duid][f][q] = None
                    # print(f'Failed for: duid={duid}, quantile={q}')

    return results
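# Illustrative standalone sketch of the per-quantile fit/predict step inside
# the loops above, using synthetic lagged features (the column names are made
# up for illustration; the real ones come from construct_dataset).
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
n, lags = 300, 3
x = pd.DataFrame(rng.normal(size=(n, lags + 1)),
                 columns=['UNIT1_lag_%d' % i for i in range(lags + 1)])
y = x.sum(axis=1) + rng.normal(scale=0.5, size=n)

preds = {}
last_observation = x.iloc[-1].values
for q in [0.1, 0.5, 0.9]:
    res = sm.QuantReg(y, x).fit(q=q)
    preds[q] = res.predict(last_observation)[0]
print(preds)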