def varmodel(self, maxlag=3):
    """Fit a VAR on ``self.mvdfg``: optionally validate on a train/test
    split, then forecast ``forecaststeps`` months past the last observation.

    maxlag : int, default 3
        Maximum lag order handed to ``VAR.fit`` (AIC selects the final
        order).  Generalized from the original hard-coded constant; the
        default preserves the old behavior.

    Returns (self.var_predicted, self.var_forecast) as DataFrames.

    NOTE(review): ``splitdf``, ``testdate`` and ``forecaststeps`` are read
    from enclosing/global scope -- confirm they are defined before calling.
    """
    self.mvdfg.index = pd.to_datetime(self.mvdfg.index)
    self.var_predicted = pd.DataFrame()
    self.var_forecast = pd.DataFrame()
    self.var_data_train = pd.DataFrame()
    self.var_data_test = pd.DataFrame()

    if splitdf.upper() == 'Y':
        # Validation model: fit on data up to ``testdate`` and predict the
        # remainder so predictions can be compared against actuals.
        # (The index is already datetime after the assignment above; the
        # extra to_datetime calls are retained from the original.)
        self.var_data_train = self.mvdfg[(pd.to_datetime(self.mvdfg.index)) <= testdate]
        self.var_data_test = self.mvdfg[(pd.to_datetime(self.mvdfg.index)) > testdate]
        var_model = VAR(self.var_data_train)
        results = var_model.fit(maxlags=maxlag, ic='aic')
        print(results.summary())
        lag_order = results.k_ar
        var_steps = len(self.var_data_test)
        pred_values = results.forecast(self.var_data_train.values[-lag_order:], var_steps)
        self.predicted = pd.DataFrame(pred_values,
                                      index=self.mvdfg.index[-var_steps:],
                                      columns=self.mvdfg.columns)
        self.var_predicted = self.predicted

    # Forecast: refit on the full history and project ``forecaststeps``
    # months ahead.
    # NOTE(review): nesting relative to the split branch was ambiguous in
    # the original layout; the forecast is assumed unconditional here.
    startdate = self.mvdfg.index.max() + pd.offsets.DateOffset(months=1)
    maxdate = self.mvdfg.index.max() + pd.offsets.DateOffset(months=forecaststeps + 1)
    # Month-end range snapped to first-of-month labels.
    var_fc_index = np.asarray((pd.date_range(startdate, maxdate, freq='m').strftime('%Y-%m-01')))
    var_fc_index = pd.to_datetime(var_fc_index)
    var_forecast_model = VAR(self.mvdfg)
    fc_results = var_forecast_model.fit(maxlags=maxlag, ic='aic')
    print(fc_results.summary())
    fc_lag_order = fc_results.k_ar
    fc_values = fc_results.forecast(self.mvdfg.values[-fc_lag_order:], forecaststeps)
    self.forecast = pd.DataFrame(fc_values, index=var_fc_index, columns=self.mvdfg.columns)
    self.var_forecast = self.forecast
    print(self.var_forecast)
    return self.var_predicted, self.var_forecast
def pca(self):
    '''pca estimation of DFM'''
    # Loadings and factors via (heteroskedasticity-robust) PCA.
    self.pca_loading, self.pca_factor = hetero_pca(self.observations, self.n_factor)
    self.n_factor = self.pca_factor.shape[0]
    # Common component and observation residuals.
    self.pca_common = self.pca_loading @ self.pca_factor
    obs_res = self.observations - self.pca_common
    # Covariance matrix of the observation residuals.
    self.obs_res_cov = np.cov(obs_res)
    # Factor dynamics: fit a VAR only when there is more than one factor.
    if self.n_factor > 1:
        model = VAR(self.pca_factor.T)
        # Select and store the lag order (AIC), never allowing zero lags.
        results = model.fit(maxlags=self.max_lag, ic='aic')
        self.lag = max(results.k_ar, 1)
        # Dynamic (transition) matrices and the fitted results object.
        self.pca_var_param = results.coefs
        self.pca_factor_var = results
        # Factor residuals and their covariance.
        factor_resid = results.resid  # (n_time - lag, n_factor)
        self.pca_factor_res_cov = np.cov(factor_resid.T)
def to_state_space_rep(self):
    '''Use the state-space representation to transfer VAR(p) to VAR(1).'''
    # Stack the factors together with their p-1 lags into one tall state
    # vector: row block k holds the factors shifted by k periods.
    lagged = np.array([
        np.roll(self.pca_factor, k, axis=1)[:, (self.lag - 1):]
        for k in range(self.lag)
    ])
    self.stacked_factor = lagged.reshape(self.lag * self.n_factor, -1)
    # self.observations = self.observations[:, (self.lag - 1):]
    # The stacked process follows a VAR(1); estimate its transition matrix.
    results = VAR(self.stacked_factor.T).fit(1)
    self.stacked_var_param = results.coefs[0]  # the single lag-1 block
    # Loading of the state-space model has the block form [Lambda 0 ... 0].
    zero_pad = np.zeros((self.pca_loading.shape[0],
                         self.pca_loading.shape[1] * (self.lag - 1)))
    self.stacked_loading = np.concatenate((self.pca_loading, zero_pad), axis=1)
    # Residue selection matrix G injects factor shocks into the top block
    # of the stacked state only.
    G = np.concatenate(
        (np.eye(self.n_factor),
         np.zeros((self.n_factor * (self.lag - 1), self.n_factor))),
        axis=0)
    # Covariance of the stacked factor residues.
    # NOTE(review): requires self.lag / self.pca_factor_res_cov, which are
    # only set when pca() fitted a VAR (n_factor > 1) -- confirm callers.
    self.stacked_factor_res_cov = G @ self.pca_factor_res_cov @ (G.T)
def predict_var(data):
    '''
    Predict continuous data using vector autoregression.
    Evaluation metric: RMSE
    As the Summary print is too long for some editors,
    it is stored as a Text file (var_res.txt)
    '''
    data.drop(['risk_premium'], axis=1, inplace=True)
    # TODO Statsmodels VAR only seems to work with 10 variables
    # I randomly chose to drop risk_premium, if you leave it in
    # you will get an error message.
    data = diff_n_times(data, 1)
    # TODO: You should probably not difference all columns
    # but selectively the once which are not stationary.
    # This code includes a number of evaluation functions to check for stationarity
    subsets = create_subsets_nolag(data)
    for j in subsets:
        # Combine the feature block with the target column.
        cmb = np.concatenate((j[0], np.expand_dims(j[1], 1)), axis=1)
        nobs = int(0.2 * len(j[1]))
        train = cmb[:-nobs]  # Simple train, validation split
        valid = cmb[-nobs:]
        train = pd.DataFrame(train, columns=list(j[0].columns) + ['log_return'])
        model = VAR(train)
        results = model.fit(2)
        # FIX: use a context manager so the file handle is closed even if a
        # write fails (the original opened/closed manually).
        with open("var_res.txt", "a") as res_file:
            res_file.write(j[2])
            res_file.write(str(results.summary()))
        print(j[2])
        get_durbin_watson(list(j[0].columns) + ['log_return'], results)
        # One-step-ahead forecasts over the validation window; each forecast
        # consumes the two preceding rows (lag order = 2).
        res = []
        for x, y in zip(valid[:-2], valid[1:-1]):
            pred = results.forecast([x, y], 1)
            res.append(pred)
        res = np.vstack(res)
        df_res = undiff_once(train, valid, res)
        print('RMSE: ')
        print(rmse_loss(df_res['log_return'], df_res['log_return_forecast']))
def make_var_model(data, lags=1, actual_plot=False):
    """Fit VAR models for every lag order from 1 to ``lags``.

    data : DataFrame accepted by statsmodels VAR
    lags : highest lag order to fit
    actual_plot : when True, additionally plot the fitted values

    Returns a dict mapping 'Lag_Order_<k>' -> fitted results object.

    FIXES vs original: Python-2 ``print`` statements converted to the
    print() calls used by the rest of this module, and the undefined name
    ``model_data`` replaced with the ``data`` argument it clearly meant.
    """
    model = VAR(data)
    result_dict = {}
    for lag in range(1, lags + 1):
        results = model.fit(maxlags=lag)
        print('Exogenous Variables for the model with Lag: %d \n ' % lag
              + str(results.exog_names))
        print(results.summary())
        if actual_plot == True:
            results.plot()
        fitted_values = results.fittedvalues
        lag_order = results.k_ar
        # 5-step forecast seeded with the last ``lag_order`` observations.
        forecast_values = pd.DataFrame(
            data=results.forecast(y=data.values[-lag_order:], steps=5),
            columns=results.names)
        results.forecast_interval(y=data.values[-lag_order:], steps=5)
        results.plot_forecast(steps=5, plot_stderr=False)
        result_dict['Lag_Order_{}'.format(lag)] = results
    return result_dict
def fit_model(data, p, mod):
    """
    The function that estimates the coefficients of the AR model.

    Input:
        - data: The loaded dataset
        - p: The order of the AR model
        - mod: A string "AR", "myVAR" or "VAR" that selects the model
    Returns:
        - A: The coefficients (scalar or matrix)
    Raises:
        - ValueError: if ``mod`` is not a supported model name (the
          original fell through and crashed with an opaque
          UnboundLocalError on ``A`` instead).
    """
    if mod == "AR":
        A = fit_ar(data, p)
    elif mod == "myVAR":
        # Collapse the first two axes so every (row, col) series becomes
        # one variable of a single multivariate series.
        data_vectorized = data.reshape((data.shape[0] * data.shape[1], data.shape[2]))
        A = estimate_matrix_coefficients(data_vectorized, p)
        A = A[1:]  # drop the leading intercept term
    elif mod == "VAR":
        data_vectorized = data.reshape((data.shape[0] * data.shape[1], data.shape[2]))
        model = VAR(data_vectorized.T)
        results = model.fit(p)
        A2 = results.coefs  # (p, k, k) stack of lag matrices
        # Unstack into a plain list of per-lag coefficient matrices.
        A = [A2[i, ...] for i in range(A2.shape[0])]
    else:
        raise ValueError("mod must be 'AR', 'myVAR' or 'VAR', got %r" % (mod,))
    return A
def fit_VAR_model(self, lags):
    """Fit a VAR(``lags``) on ``self.df``, print its summary, and cache and
    return the fitted results object."""
    self.model_fitted = VAR(self.df).fit(lags)
    banner = "\n**********model_fitted, lag: " + str(lags) + "***********\n"
    print(banner)
    print(self.model_fitted.summary())
    return self.model_fitted
def find_optimal_lag_length(
    self, cols, time, min_lag=1, max_lag=8, criterion="aic"
):
    """Search lag orders ``min_lag..max_lag`` and report the one minimising
    the chosen information criterion ("aic" or "bic").

    ``cols`` may be a single column name (AR model) or a sequence of names
    (VAR model); ``time`` names the index column when present.
    Returns a QueryResult wrapping the optimal lag (None on bad criterion).

    FIXES vs original: the two bare ``except:`` clauses now catch only
    Exception (so KeyboardInterrupt etc. propagate), and the argsort()[0]
    idiom is replaced by np.argmin.
    """
    # A scalar column name maps directly to a sheet; a sequence maps via
    # its first element (the failed scalar lookup signals the multi case).
    try:
        s = self.map_column_to_sheet(cols)
        multi = False
    except Exception:
        s = self.map_column_to_sheet(cols[0])
        multi = True
    df = s.df

    # Build the data matrix, indexing by ``time`` when that column exists;
    # fall back to the raw columns otherwise.
    if multi:
        try:
            args_vector = np.append(cols, time)
            data = df[args_vector]
            data = data.set_index(time)
        except Exception:
            data = df[cols]
        model = VAR(data)
    else:
        try:
            args_vector = np.array([cols, time])
            data = df[args_vector]
            data = data.set_index(time)
        except Exception:
            data = df[cols]
        model = s_ar.AR(data)

    # Score each candidate lag with the requested criterion.
    info_loss = np.zeros(max_lag - min_lag + 1)
    if criterion == "aic":
        for i in range(max_lag - min_lag + 1):
            info_loss[i] = model.fit(i + min_lag).aic
    elif criterion == "bic":
        for i in range(max_lag - min_lag + 1):
            info_loss[i] = model.fit(i + min_lag).bic
    else:
        print("ERROR: Criterion argument not supported.")
        return

    # np.argmin gives the same index as argsort()[0] in one pass.
    optimal = int(np.argmin(info_loss)) + min_lag

    utterance = (
        "The optimal lag length according to the "
        + str(criterion)
        + " criterion is "
    )
    utterance = utterance + str(optimal) + "."
    return QueryResult(optimal, utterance)
def forecast(data, lag, forcastStep):
    """Fit a VAR(``lag``) on the prepared ``data`` and produce a
    ``forcastStep``-step forecast plus a flattened forecast-error-variance
    decomposition (FEVD).

    Returns (mid, lower, upper, fevModel) where the first three come from
    createForcastModel and fevModel is a list of record dicts.
    """
    #with open('varModel.json') as f:
    #    data = json.load(f)
    mdata = prepareData(data)
    model = VAR(mdata)
    results = model.fit(lag)
    #lag_order = results.k_ar
    # FIX: the original called createForcastModel() here a first time and
    # discarded the result; the duplicate call is removed (assumed pure --
    # confirm createForcastModel has no needed side effects).
    fevd = results.fevd(forcastStep)
    # Flatten the (variable, period, component) decomposition into records.
    fevModel = []
    for val in range(len(fevd.names)):
        for i in range(fevd.periods):
            for j in range(len(fevd.names)):
                fevItem = {}
                fevItem["ind"] = val
                fevItem["period"] = i
                fevItem["compindex"] = j
                fevItem["val"] = fevd.decomp[val][i][j]
                fevModel.append(fevItem)
    #irf = results.irf(forcastStep)
    #print(irf.orth_irfs)
    #print(irf.svar_irfs)
    #print(irf.irfs)
    mid, lower, upper = createForcastModel(results, forcastStep)
    print(fevd.summary())
    return mid, lower, upper, fevModel
def VARPredict(pdData, steps: int):
    """
    Takes a pandas dataframe, then predicts steps ahead
    pdData: Pandas
    steps: int
    Returns a (steps, series) shaped ndarray of forecast
    """
    # Compute VAR
    model = VAR(pdData)
    # fit data
    # NOTE(review): ``fit_ic`` is read from enclosing scope -- confirm it is
    # defined; trend='nc' requires an older statsmodels (renamed to 'n').
    try:
        results = model.fit(maxlags=15, ic=fit_ic, trend='nc')
    except Exception as e:
        # FIX: log the context, then re-raise.  The original called
        # exit(1), which kills the whole interpreter from inside a library
        # function and discards the traceback.
        print(e)
        print(pdData)
        raise
    #print(results.summary())
    #results.plot()
    #plt.show()
    #results.plot_acorr()
    #plt.show()
    # Forecast diffs, seeded with the last k_ar observations.
    forecast = results.forecast(pdData.values[-results.k_ar:], steps)
    #results.plot_forecast(10)
    #plt.show()
    return forecast
def load_data():
    """Build training arrays: VAR(1) fitted returns are subtracted from the
    raw features as residuals, and targets become next-step AR residuals.

    Returns (x, y, X, Y): numpy arrays and the underlying DataFrames.

    FIXES vs original: positional ``axis`` in DataFrame.drop (keyword-only
    in modern pandas) and the removed ``.as_matrix()`` API (-> to_numpy()).
    """
    # import data
    X, Y = data.import_data(set='train')
    # do not plug in returns, but residuals
    # plug in residuals
    VAR_model = VAR(X)
    results = VAR_model.fit(1)
    ar_returns = results.fittedvalues
    # columns to drop from dataframe
    columns = [
        'XMRspread', 'XMRvolume', 'XMRbasevolume', 'XRPspread', 'XRPvolume',
        'XRPbasevolume', 'LTCspread', 'LTCvolume', 'LTCbasevolume',
        'DASHspread', 'DASHvolume', 'DASHbasevolume', 'ETHspread',
        'ETHvolume', 'ETHbasevolume'
    ]
    ar_returns.drop(columns, axis=1, inplace=True)
    X = X.loc[ar_returns.index]
    x_returns = X[ar_returns.columns]
    residual_df = x_returns - ar_returns
    X = X.join(residual_df, how='inner', rsuffix='residual')
    y_ar_returns = ar_returns
    y_ar_returns.columns = Y.columns
    # Target: next-period return minus the AR prediction (shift aligns the
    # prediction with the following observation).
    Y = (Y.loc[X.index] - y_ar_returns.shift(-1)).dropna()
    X = X.loc[Y.index]
    x = X.to_numpy()
    y = Y.to_numpy()
    return x, y, X, Y
def gc_graph(X, p=2, signif=0.01):
    '''
    X should be a pandas dataframe ready to be consumed by VAR(-)
    p is the model order we will use.

    We then produce a granger causality graph, where GC is tested
    w.r.t. the whole information set.
    '''
    G = nx.DiGraph()
    G.add_nodes_from(X.columns.values)
    model = VAR(X)
    results = model.fit(p)

    # Suppress output from test_causality.
    # FIXES vs original: use os.devnull (portable; '/dev/null' breaks on
    # Windows), close the sink file, and restore sys.stdout in a finally
    # block so an exception can no longer leave stdout redirected for the
    # remainder of the process.
    import sys, os
    stdout_real = sys.stdout
    sink = open(os.devnull, 'w')
    sys.stdout = sink
    try:
        # itertools product over all ordered column pairs
        for e in product(X.columns.values, X.columns.values):
            gc = results.test_causality(*e, signif=signif)
            if gc['conclusion'] == 'reject':
                G.add_edge(e[0], e[1])
    finally:
        sys.stdout = stdout_real
        sink.close()
    return G
def test_gc(data, index, maxlag, header, alpha):
    """Run the automated Granger-causality sweep on ``data`` and return the
    raw result rows (also printed, sorted by strength)."""
    # Patch in the adjusted causality test before fitting.
    VARResults.test_causality = a_test_causality
    # g = Digraph('G', filename='granger_all_new.gv', strict=True)
    # edgegranger = []
    model = VAR(data)
    result = {}
    lag_dic = {}
    res_output = []
    Granger_automated(maxlag, model, lag_dic, res_output, result, header,
                      alpha, index)
    print(result)
    print(res_output)
    if res_output:  # anything found?
        output_df = pd.DataFrame(res_output)
        output_df.columns = [
            'Effect-Node', 'Cause-Node', 'Time-Lag', 'Strength', 'Method',
            'Partition'
        ]
        output_df = output_df.sort_values(by=['Strength'])
        print(output_df.head(20))
        # print(g)
        # print(g.view())
        # g
        # output_df.to_csv("gc_baseline_out.csv", header=False, index=False)
        # numpy_output = output_df.to_numpy
        # print(numpy_output)
    return res_output
def stats(self, p):
    '''
    VaR Model from statsmodel for testing
    p: lag

    Returns a CRESULT carrying summary, aic/bic, coefficients, residual
    correlation and a per-lag eigenvalue-modulus table (stability check).
    '''
    Var_result = CRESULT()
    varmodel = VAR(self.data)
    results = varmodel.fit(p)
    Var_result.summary = results.summary()
    # AIC and BIC
    Var_result.aic = results.aic
    Var_result.bic = results.bic
    # Coefficient: label it only in the single-lag case, where the one
    # (k x k) matrix maps cleanly onto the column names.
    if p == 1:
        Var_result.coefs = pd.DataFrame(results.coefs[0],
                                        index=self.data.columns,
                                        columns='Lag_' + self.data.columns)
    else:
        Var_result.coefs = results.coefs
    # Correlation of the residuals
    Var_result.corr = pd.DataFrame(results.resid_corr,
                                   index=self.data.columns,
                                   columns=self.data.columns)
    # Stable: the process is stable when every eigenvalue modulus < 1.
    # np.linalg.eigvals replaces eig()[0]: same values, skips the unused
    # eigenvector computation.
    eignval_list = [np.abs(np.linalg.eigvals(i)) for i in results.coefs]
    eignval_df = pd.DataFrame(eignval_list).T
    eignval_df.columns = ['lag' + str(i) for i in range(1, p + 1)]
    Var_result.stable = eignval_df
    return Var_result
def VAR_IRF(df, n=10, future=20):
    """Print lag-order diagnostics for up to ``n`` lags, fit a VAR at an
    interactively chosen order, print the summary and plot 10-step impulse
    responses.  NOTE: blocks on input(); ``future`` is currently unused."""
    m = VAR(df)
    m.select_order(n)
    # The user inspects the criterion table and types the order to use.
    n = int(input('order:'))
    model = m.fit(maxlags=n)
    print('\n\n', model.summary())
    model.irf(10).plot()
def run_VAR(data, param):
    """Rolling one-step VAR evaluation over the final ``testsize`` fraction
    of ``data``; returns a dict with accuracy, NRMSE and average per-round
    fit+forecast time."""
    p = param['p']
    testsize = param['testsize']
    T = data.shape[-1]
    T_test = int((T * testsize) // 1)
    result_full = np.zeros([data.shape[0], T_test])
    total_time = 0
    n_round = 0
    for i in range(T_test):
        # Sliding window ending just before the i-th test point.
        ts = data[..., i:T - T_test + i].copy()
        n_round += 1
        model = VAR(ts)
        start = time.time()
        result = model.fit(p).forecast(ts, 1)
        end = time.time()
        total_time += end - start
        result_full[..., i] = result[..., -1]
    label = data[..., -T_test:]
    stat = {
        'acc': get_acc(result_full, label),
        'nrmse': nrmse(result_full, label),
        'ave_time': total_time / n_round,
    }
    return (stat)
def forecast(df_train, number_of_forecast_points, forecast_index,
             lag_order=5, diff=2):
    """
    Learn and forecast with VAR model (Max : 2 differencing)

    df_train (dataframe) : input data
    number_of_forecast_points (int) : number of time step that want to predict
    forecast_index (list) : index name of each predicted value
    lag_order (int) : window size of input (How many previous timestep will
        be used as input)
    diff (int 0, 1, 2) : number of differencing

    return
    real_forecast (dataframe) : dataframe with predicted value
    model (Object) : fitted model

    Raises ValueError on an unsupported ``diff`` (the original used
    ``assert``, which silently disappears under ``python -O``).
    """
    if diff not in (0, 1, 2):
        raise ValueError('diff = 1 or 2 only')

    # Apply ``diff`` rounds of first differencing.
    df_differenced = df_train
    for _ in range(diff):
        df_differenced = df_differenced.diff().dropna()

    model = VAR(df_differenced)
    model_fitted = model.fit(lag_order)
    # Seed the forecast with the last ``lag_order`` differenced rows.
    forecast_input = df_differenced.values[-lag_order:]
    fc = model_fitted.forecast(y=forecast_input, steps=number_of_forecast_points)

    if diff == 0:
        real_forecast = pd.DataFrame(fc, index=forecast_index,
                                     columns=df_train.columns + '_forecast')
    elif diff == 1:
        df_forecast = pd.DataFrame(fc, index=forecast_index,
                                   columns=df_train.columns + '_1d')
        real_forecast = invert_transformation(df_train, df_forecast,
                                              second_diff=False)
    else:  # diff == 2
        df_forecast = pd.DataFrame(fc, index=forecast_index,
                                   columns=df_train.columns + '_2d')
        real_forecast = invert_transformation(df_train, df_forecast,
                                              second_diff=True)
    return real_forecast, model_fitted
def predict(dataset=dataset_orig, future=1):
    """Iteratively forecast ``future`` steps ahead with a VAR fitted on
    log-differenced data; returns the predicted rows.

    NOTE(review): both parameters are immediately overwritten from the
    command-line arguments, so the defaults are effectively dead -- kept
    only for interface compatibility.
    """
    args = parse_args()
    # FIX: close the pickle file deterministically; the original leaked
    # the open handle.
    with open(args.path, 'rb') as f:
        dataset = pickle.load(f)
    future = args.future
    data = np.zeros((len(dataset), 4))
    data[:] = dataset
    for step in range(future):
        # Stationarise: log-transform, then first-difference (row 0 keeps
        # the initial log level so shapes line up).
        data_st2 = np.zeros((len(data), 4))
        data_st = np.log(data)
        data_st2[0] = data_st[0]
        data_st2[1:] = np.diff(data_st, axis=0)
        model = VAR(data_st2)
        results = model.fit()
        #print(results.summary())
        prediction_st2 = results.forecast(data_st2, 1)
        # Undo the differencing (cumulative sum from the last log level)
        # and the log transform.
        prediction_st = np.zeros((2, 4))
        prediction_st[0] = data_st[-1]
        prediction_st[1:] = prediction_st2
        prediction = np.cumsum(prediction_st, axis=0)[1:]
        prediction = np.exp(prediction)
        # Append the new prediction so the next iteration conditions on it.
        data = np.append(data, prediction, 0)
    print(data[-future:])
    return data[-future:]
def generate_forecast_1(date='2003-01-01', n_steps=6):
    """Forecast inflation ``n_steps`` months ahead (how far into the future
    you look) from a VAR(4) fitted on first-differenced sentiment/CPI data
    cropped at ``date``.

    Returns (prev, infl_results, first_YOY): the differenced history of the
    inflation column, its forecast, and the first raw YOY level (needed to
    undo the differencing downstream).
    """
    # Crop the data depending on n_steps and date; if date is the most
    # recent then test is empty.
    neg_YOY_CPI = load_sentiment_YOY_CPI()
    train, test = crop_data(neg_YOY_CPI, date, n_steps)
    # First difference, remembering the first row so levels can be rebuilt.
    first_row = train.iloc[0]
    train_1 = train.diff().dropna()
    first_YOY = first_row['YOY']
    prev = train_1.values[:, 1]
    results = VAR(train_1, freq='MS').fit(4)  # fixed lag of 4 for now
    lag_order = results.k_ar
    prediction_input = train_1.values[-lag_order:]
    # Column 1 holds the inflation series.
    infl_results = results.forecast(prediction_input, n_steps)[:, 1]
    return prev, infl_results, first_YOY
def stoc_simulate(getfit_data, N=5000, nlag=8):
    """Fit a VAR(``nlag``) to the fitted curve parameters (tau on log scale)
    and return the ingredients for simulating ``N`` stochastic scenarios."""
    # Transform tau (4th parameter) to log scale.
    fit_par_VAR = getfit_data['fit_par'].iloc[:, 0:3]
    fit_par_VAR.insert(3, '3', np.log(getfit_data['fit_par'].iloc[:, 3]))
    # Stochastic VAR fitting.
    results = VAR(fit_par_VAR).fit(nlag)
    # Correlated residual draws: Cholesky factor of the residual
    # correlation, rescaled by each series' residual std deviation.
    u_L = np.linalg.cholesky(results.resid_corr)
    u_std = np.std(results.resid, axis=0)
    u_rand = np.random.normal(size=[fit_par_VAR.shape[1], N])
    u = np.dot(u_L.conj(), u_rand)
    Var_Rand = np.dot(u.transpose(), np.diag(u_std))
    return {
        'Var_Rand': Var_Rand,
        'Var_Betas': results.coefs,
        'Var_C': results.intercept,
        'nlag': nlag
    }
def VARprocess(df, log=False):
    """Fit a VAR on ``df`` (optionally log-diff transformed), shrinking the
    attribute set until statsmodels accepts it, then fit using the
    lag/criterion pair favoured by select_order().

    FIXES vs original: Python-2 ``print`` statements -> print(), the
    removed ``df.ix`` indexer -> ``df.iloc``, ``dict.iteritems()`` ->
    ``dict.items()``, and the bare except narrowed to Exception.  All are
    required for the Python 3 / modern pandas used by the rest of the file.
    """
    # Log transformation, relative difference and drop NULL values
    if (log):
        df = np.log(df + 0.1).diff().dropna()
    # Vector Autoregression Process generation
    maxAttr = len(df.columns)
    # Find the right lag order, dropping problem columns on failure.
    orderFound = False
    while orderFound != True:
        try:
            model = VAR(df.iloc[:, 0:maxAttr])
            order = model.select_order()
            orderFound = True
        except Exception:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            # "already contains a constant" -> drop the last column;
            # otherwise the message names the offending column index.
            if str(exc_obj) == "data already contains a constant.":
                maxAttr = maxAttr - 1
            else:
                maxAttr = int(str(exc_obj).split("-th")[0]) - 1
            print("Exception, reducing to n_attributes ", maxAttr)
            orderFound = False
    # Pick the criterion/lag with the highest reported order.
    n_lags = max(order.items(), key=operator.itemgetter(1))[1]
    method = max(order.items(), key=operator.itemgetter(1))[0]
    print("n_lags ", n_lags)
    print("method ", method)
    results = model.fit(maxlags=n_lags, ic=method)
    return results
def compute_pair_metrics(security, candidates):
    """Pair-trading diagnostics of one security against candidate hedges:
    spread drift and volatility, level/return correlations, plus Johansen
    and Engle-Granger cointegration statistics, joined into one frame."""
    # Normalise every series to start at 1.
    security = security.div(security.iloc[0])
    ticker = security.name
    candidates = candidates.div(candidates.iloc[0])
    spreads = candidates.sub(security, axis=0)
    n, m = spreads.shape
    # OLS of each spread on a linear time trend: the slope is the drift.
    design = np.ones(shape=(n, 2))
    design[:, 1] = np.arange(1, n + 1)
    drift = ((np.linalg.inv(design.T @ design) @ design.T @ spreads)
             .iloc[1].to_frame('drift'))
    vol = spreads.std().to_frame('vol')
    corr_ret = (candidates.pct_change()
                .corrwith(security.pct_change())
                .to_frame('corr_ret'))
    corr = candidates.corrwith(security).to_frame('corr')
    metrics = drift.join(vol).join(corr).join(corr_ret).assign(n=n)
    rows = []
    for candidate, prices in candidates.items():
        pair = pd.DataFrame({'s1': security, 's2': prices})
        # Select the VAR lag order by AIC for the Johansen test.
        lags = VAR(pair.values).select_order()
        k_ar_diff = lags.selected_orders['aic']
        # Johansen test with constant term and the estimated lag order.
        cj0 = coint_johansen(pair, det_order=0, k_ar_diff=k_ar_diff)
        # Engle-Granger tests in both directions.
        t1, p1 = coint(security, prices, trend='c')[:2]
        t2, p2 = coint(prices, security, trend='c')[:2]
        rows.append([ticker, candidate, t1, p1, t2, p2, k_ar_diff, *cj0.lr1])
    columns = ['s1', 's2', 't1', 'p1', 't2', 'p2', 'k_ar_diff',
               'trace0', 'trace1']
    tests = pd.DataFrame(rows, columns=columns).set_index('s2')
    return metrics.join(tests)
def rolling_forecast(trainset, testset, lags):
    """Expanding-window one-step-ahead VAR forecasts over ``testset``.

    Returns (Pmse, forecastreturn, accuracys): squared one-step errors,
    forecasts, and sign-hit indicators for the first series.

    FIX vs original: ``DataFrame.append`` was removed in pandas 2.0;
    ``pd.concat`` builds the identical expanding window.
    """
    Pmse = []
    forecastreturn = []
    accuracys = []
    ntest = len(testset)
    for i in range(0, ntest):
        # Train on everything observed before test point i.
        if i == 0:
            X_in = trainset
        else:
            X_in = pd.concat([trainset, testset.iloc[:i, :]])
        X_out = testset.iloc[i, 0]
        # building model
        model = VAR(X_in)
        results = model.fit(lags)
        forecasttest = results.forecast(results.y, steps=1)[0][0]
        # Directional accuracy: did the forecast match the sign?
        if (forecasttest * X_out) > 0:
            accuracy = 1
        else:
            accuracy = 0
        accuracys.append(accuracy)
        forecastreturn.append(forecasttest)
        Pmse.append(np.square(forecasttest - X_out))
    return (Pmse, forecastreturn, accuracys)
def best_lag_dw(self, df, threshold=0.2):
    """Grid-search lag orders 1..15, keeping the lowest-AIC fit whose first
    two residual series pass a Durbin-Watson autocorrelation check
    (|2 - DW| <= threshold).  Returns (best_aic, best_lag, best_dw).

    FIXES vs original: the magic 99999 sentinel (beatable by a large AIC)
    is now +inf, and a run where no lag passes raises a clear ValueError
    instead of crashing later on ``zip(df.columns, None)``.
    """
    model = VAR(df, freq="MS")  # Assumes stationary data.
    best_aic = float("inf")
    best_lag = None
    best_dw = None
    # Searching for best lag order.
    for i in range(1, 16):
        result = model.fit(i)
        #print("Lag order: ", i, " AIC: ", result.aic)
        # Checking with Durbin-Watson test for autocorrelation as well.
        dw_out = durbin_watson(result.resid)
        #print("DW test: ", dw_out)
        #print(abs(2.0-dw_out[0]))
        if ((result.aic < best_aic)
                and (abs(2.0 - round(dw_out[0], 2)) <= threshold)
                and (abs(2.0 - round(dw_out[1], 2)) <= threshold)):
            #print("ENTRA")
            best_aic = result.aic
            best_lag = i
            best_dw = dw_out
    if best_lag is None:
        raise ValueError("No lag order passed the Durbin-Watson threshold.")
    print("Best lag order: ", best_lag, " with an AIC score of: ", best_aic)
    print("Durbin-Watson results:")
    for col, val in zip(df.columns, best_dw):
        print(col, ':', round(val, 2))
    print("-------------------------------------------------")
    return best_aic, best_lag, best_dw
def var_prediction(df, train_perc, incidence_file, window=18, diff=True):
    """Train a VAR on the first ``train_perc`` of ``df`` (optionally
    first-differenced, standardised) and roll one-step forecasts over the
    validation stretch with injected incidences.

    Returns (ys, yhats, incidencias): actual rows, predicted rows, and the
    incidence flags aligned to them.
    """
    # Strip bookkeeping columns before modelling.
    df_aux = df.drop('Unnamed: 0', axis=1)
    df_aux = df_aux.drop('tref_start', axis=1)
    X = df_aux.values[:, :]
    if diff:
        X = np.diff(X, axis=0)
    # Standardise using training data only.
    v = int(len(X) * train_perc)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X[:v])
    # Fit the model on the standardised training block.
    results = VAR(X_train).fit(window)
    # Validation frame with injected incidences.
    inc = get_working_incidence(incidence_file)
    df = df.iloc[v:]
    df = generar_incidencias(df, inc).sort_values(by=['tref_start'])
    # Incidence flags aligned past the warm-up window.
    incidencias = df['incidencia'].values[window:]
    df = df.drop('incidencia', axis=1)
    # Real network values for the validation stretch.
    df = df.drop('Unnamed: 0', axis=1)
    df = df.drop('tref_start', axis=1)
    X = df.values[:, :]
    if diff:
        # Differencing shortens the series by one; drop the first flag too.
        # NOTE(review): nesting of this line was ambiguous in the original
        # layout; it is assumed to belong inside the diff branch.
        X = np.diff(X, axis=0)
        incidencias = incidencias[1:]
    X = scaler.transform(X)
    # One-step-ahead prediction from each sliding window.
    ys = X[window:]
    yhats = [results.forecast(X[i - window:i], 1)[0]
             for i in range(window, len(X))]
    return ys, yhats, incidencias
def fit(self, alpha=0.05):
    """
    :param alpha: threshold of F-test
    :return: granger causality dependencies (d x d DataFrame; 2 marks a
        detected causal link)
    """
    # Full model: VAR over all d series, AIC-selected lag up to self.p.
    model_full_fit = VAR(self.X).fit(maxlags=self.p, ic='aic')
    # Prediction and per-series residual variance of the full model.
    x_hat = self.predict(model_full_fit, self.X)
    err_full = np.subtract(x_hat.values, self.X.values[self.p:])
    var_full = list(np.var(err_full, axis=0))
    # Restricted models: drop one candidate cause j at a time.
    for j in range(self.d):
        x_temp = self.X.drop(columns=[self.names[j]])
        model_rest_fit = VAR(x_temp).fit(maxlags=self.p, ic='aic')
        x_hat = self.predict(model_rest_fit, x_temp)
        err_rest = np.subtract(x_hat.values, x_temp.values[self.p:])
        var_rest = list(np.var(err_rest, axis=0))
        # F test (extremely sensitive to non-normality of X and Y):
        # compare full-model variances (column j removed) with restricted.
        var_full_rest = var_full.copy()
        del var_full_rest[j]
        m = x_hat.shape[0]
        for i in range(len(x_hat.columns.values)):
            p_value = self.f_test(var_rest[i], var_full_rest[i], m)
            if p_value < alpha:
                # Series j Granger-causes series i.
                self.pa[x_hat.columns.values[i]].append(self.names[j])
    # Encode discovered parents into a d x d matrix (2 = causal link).
    res_df = pd.DataFrame(np.ones([self.d, self.d]),
                          columns=self.names, index=self.names)
    for e in self.pa.keys():
        for c in self.pa[e]:
            res_df[e].loc[c] = 2
            # NOTE(review): res_df starts at all-ones, so this branch looks
            # unreachable -- preserved verbatim from the original.
            if res_df[c].loc[e] == 0:
                res_df[c].loc[e] = 1
    return res_df
def forecast_DNS_VAR(ts, pred):
    """Fit a VAR(1) to the DNS factors and forecast ``pred`` steps ahead.

    IMPORTANT: ts has undergone the DNS_OLS function previously; pred is
    the date pred months after the last entry of the time series ts.
    See Diebold and Rudebusch -- all use VAR(1).
    """
    fitted = VAR(ts).fit(1, method='mle')
    k = fitted.k_ar
    # Seed the forecast with the last k observations.
    return fitted.forecast(ts.values[-k:], pred)
def impact_value(self, data, lag):
    """Sum the lag-1..lag coefficients linking the second column of the
    VAR parameter table to the first, normalised by |lag|."""
    results = VAR(data).fit(lag)
    first_col = results.params.columns.values[0]
    second_col = results.params.columns.values[1]
    total = 0
    for k in range(1, lag + 1):
        # Row labels follow the statsmodels 'L<k>.<name>' convention.
        total += results.params[first_col]['L' + str(k) + '.' + second_col]
    return total / np.abs(lag)
def time_series(data, future_forcast, location):
    """Average people/violations per hour at ``location``, fit a VAR on the
    hourly profile, and forecast ahead.

    data: [[people, violations, time, location], ...]
    Returns the first two rows of the forecast array (as in the original).

    FIXES vs original: (1) after filtering by location the index had gaps,
    so the positional loops over ``df['time'][i]`` raised KeyError --
    reset_index repairs that; (2) assignment now goes through .loc instead
    of chained indexing; (3) ``steps`` was a float (division) which
    forecast() rejects -- it is cast to int (the two hour terms it added
    and subtracted were identical and cancelled exactly).
    """
    columns = ["people", "violations", "time", "location"]
    df = pd.DataFrame(data=data, columns=columns)
    df = df[df["location"] == location]
    df = df.reset_index(drop=True)
    df['time'] = pd.to_datetime(df['time'])
    # Reduce each timestamp to its hour.
    for i in range(len(df)):
        df.loc[i, 'time'] = df['time'][i].hour
    # Group readings by hour.
    dict_p = {}
    dict_v = {}
    for i in range(len(df)):
        if (df['time'][i] not in dict_p.keys()):
            dict_p[df['time'][i]] = [df["people"][i]]
        else:
            dict_p[df['time'][i]].append(df["people"][i])
        if (df['time'][i] not in dict_v.keys()):
            dict_v[df['time'][i]] = [df["violations"][i]]
        else:
            dict_v[df['time'][i]].append(df["violations"][i])
    # Hourly means, anchored on an arbitrary fixed date so they sort as
    # timestamps.
    people = []
    violations = []
    times = []
    for k, v in dict_p.items():
        people.append(sum(v) / float(len(v)))
        timet = pd.Timestamp(year=2000, month=1, day=1, hour=k,
                             minute=0, second=0)
        times.append(timet)
    for k, v in dict_v.items():
        violations.append(sum(v) / float(len(v)))
    n_df = pd.DataFrame(columns=["people", "violations", "time"])
    n_df["people"] = people
    n_df["violations"] = violations
    n_df["time"] = times
    n_df = n_df.sort_values(by=['time'])
    n_df.time = pd.DatetimeIndex(n_df.time).to_period('H')
    data1 = n_df[["people", 'violations']]
    data1.index = n_df["time"]
    print(data1)
    model = VAR(data1)
    model_fit = model.fit()
    # Average hourly spacing of the profile.
    # NOTE(review): (first - last)/(len-1) is negative for an ascending
    # profile -- preserved from the original, but worth confirming.
    freq = (n_df["time"][0].hour - n_df["time"][len(n_df) - 1].hour) / (len(n_df) - 1)
    steps = int(future_forcast / freq)
    pred = model_fit.forecast(model_fit.y, steps)
    return pred[0], pred[1]
def decide_degree_best(self):
    """Print VAR lag-order diagnostics for up to 15 lags, then the summary
    of the AIC-selected fit.

    FIX vs original: select_order(15) was called twice with the first
    result discarded; it is now computed once and printed.
    """
    model = VAR(self.X)
    # Determine the optimal VAR model order using AIC.
    order_diag = model.select_order(15)
    print(order_diag)
    results = model.fit(maxlags=15, ic='aic')
    print(results.summary())