def grangerTest(exog, endog):
    MAX_LAG = 30
    ARaic = ar_model.AR(exog.tolist()).fit(maxlag=MAX_LAG, ic="aic")
    ARbic = ar_model.AR(exog.tolist()).fit(maxlag=MAX_LAG, ic="bic")
    # select the fewer number of parameters between both criteria
    # (note: len(params) counts the fitted constant as well as the lag coefficients).
    numExog = len(ARaic.params) if len(ARaic.params) < len(ARbic.params) else len(ARbic.params)
    print("Optimal number of lags for exog data is " + str(numExog))

    ARaic = ar_model.AR(endog.tolist()).fit(maxlag=MAX_LAG, ic="aic")
    ARbic = ar_model.AR(endog.tolist()).fit(maxlag=MAX_LAG, ic="bic")
    # select the fewer number of parameters between both criteria.
    numEndog = len(ARaic.params) if len(ARaic.params) < len(ARbic.params) else len(ARbic.params)
    print("Optimal number of lags for endog data is " + str(numEndog))

    # Now that I know the optimal number of parameters, I can call the
    # Granger causality function of statsmodels.
    data = pd.concat([endog, exog], axis=1)
    print("\nGranger causality results of indep onto dep")
    results = stattools.grangercausalitytests(data, maxlag=numEndog)

    data = pd.concat([exog, endog], axis=1)
    print("\nGranger causality results of dep onto indep")
    results = stattools.grangercausalitytests(data, maxlag=numExog)

    regr = results[2][1]
    print(regr[0].params)
    print(regr[1].params)
    print(regr[1].pvalues)
def testForCointegration(self, assets):
    x = np.random.normal(0, 1, 1000)
    y = np.random.normal(0, 1, 1000)
    x = np.array(x)
    y = np.array(y)
    c = np.column_stack((x, y))
    a = grangercausalitytests(c, -1, verbose=True)
    return a
def cluster_vs_cluster_granger(C, X, lags=4, thresh=0.01):
    Xc = [np.average(X[:, c], 1) for c in C]
    R = []
    for i in range(len(C)):
        x1 = Xc[i]
        for j in range(i + 1, len(C)):
            x2 = Xc[j]
            result = stattools.grangercausalitytests(np.transpose([x1, x2]), lags)
            for l in range(lags):
                pv = result[l + 1][0]['ssr_ftest'][1]
                if pv < thresh:
                    R.append((pv, (i, j, l + 1)))
            result = stattools.grangercausalitytests(np.transpose([x2, x1]), lags)
            for l in range(lags):
                pv = result[l + 1][0]['ssr_ftest'][1]
                if pv < thresh:
                    R.append((pv, (j, i, l + 1)))
    return sorted(R)
def granger(x1, x2):
    if len(x1) == 1:
        x1[0][0] += .00000000001
    else:
        x1[0] += .00000000001
    if len(x2) == 1:
        x2[0][0] += .00000000001
    else:
        x2[0] += .00000000001
    res = sts.grangercausalitytests(np.vstack((x1, x2)).T, 2, verbose=False)[1][0]['params_ftest'][0]
    return res
def test_grangercausality(self):
    # some example data
    mdata = macrodata.load().data
    mdata = mdata[['realgdp', 'realcons']]
    data = mdata.view((float, 2))
    data = np.diff(np.log(data), axis=0)

    # R: lmtest:grangertest
    r_result = [0.243097, 0.7844328, 195, 2]  # f_test
    gr = grangercausalitytests(data[:, 1::-1], 2, verbose=False)
    assert_almost_equal(r_result, gr[2][0]['ssr_ftest'], decimal=7)
    assert_almost_equal(gr[2][0]['params_ftest'], gr[2][0]['ssr_ftest'],
                        decimal=7)
def cluster_vs_meta_granger(c, X, M, Ml, lags=7, thresh=0.05):
    x1 = np.average(X[:, c], 1)
    R = []
    for x2 in M.T:
        have_values = np.isfinite(x2)
        result = stattools.grangercausalitytests(
            np.transpose([x1[have_values], x2[have_values]]), lags)
        R.append([(result[i + 1][0]['ssr_ftest'][1], i + 1)
                  for i in range(lags)
                  if result[i + 1][0]['ssr_ftest'][1] < thresh])
    RM = []
    for i, r in enumerate(R):
        if r:
            avgLag = np.average([x[1] for x in r])
            avgPvalue = np.average([x[0] for x in r])
            RM.append((avgPvalue, Ml[i], avgLag))
    return sorted(RM)
def granger_causality(data, max_lag=10, alpha=.05, *, callback=None):
    """
    Return results of Granger-causality tests.

    Parameters
    ----------
    data : Timeseries
        A table of features to compute Granger causality between.
    max_lag : int
        The maximum lag to compute Granger-causality for.
    alpha : float in (0, 1)
        Confidence of test is 1 - alpha.
    callback : callable
        A callback to call in each iteration with ratio of completion.

    Returns
    -------
    res : list of lists
        Each internal list is [lag, antecedent, consequent] where lag is the
        minimum lag at which antecedent feature in data is Granger-causal for
        the consequent feature in data.
    """
    from statsmodels.tsa.stattools import grangercausalitytests
    from Orange.data import Table, Domain
    # TODO: use VAR Granger causality
    # TODO: consider CCM instead of / in addition to GC:
    #   https://en.wikipedia.org/wiki/Convergent_cross_mapping
    # http://statsmodels.sourceforge.net/devel/generated/statsmodels.tsa.vector_ar.var_model.VARResults.test_causality.html
    # http://statsmodels.sourceforge.net/devel/vector_ar.html#granger-causality
    data = data.interp()
    domain = [var for var in data.domain.variables if var.is_continuous]
    res = []

    for row_attr in domain:
        for col_attr in domain:
            if row_attr == col_attr or data.time_variable in (row_attr, col_attr):
                continue
            X = Table(Domain([], [], [col_attr, row_attr], data.domain), data).metas
            try:
                tests = grangercausalitytests(X, max_lag, verbose=False)
                lag = next((lag for lag in range(1, 1 + max_lag)
                            if tests[lag][0]['ssr_ftest'][1] < alpha), 0)
            except ValueError:
                lag = 0
            if lag:
                res.append([lag, row_attr.name, col_attr.name])
            if callback:
                callback(1 / ((len(domain) - 1)**2 - len(domain)))
    return res
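# A minimal, self-contained sketch (my addition, not part of the Orange source) of the same
# "smallest significant lag" idea using plain statsmodels on a NumPy array; the column order
# and the alpha threshold mirror the function above, and the toy data are an assumption.
import numpy as np
from statsmodels.tsa.stattools import grangercausalitytests

def min_significant_lag(x, max_lag=10, alpha=0.05):
    # x is an (n, 2) array; this asks whether column 2 Granger-causes column 1.
    tests = grangercausalitytests(x, max_lag, verbose=False)
    return next((lag for lag in range(1, max_lag + 1)
                 if tests[lag][0]['ssr_ftest'][1] < alpha), 0)

rng = np.random.default_rng(0)
cause = rng.normal(size=200)
effect = np.roll(cause, 2) + rng.normal(scale=0.1, size=200)  # effect trails cause by 2 steps
print(min_significant_lag(np.column_stack([effect, cause])))  # expected: a small lag such as 2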
tempData = web.DataReader(theTickers[i], "yahoo", start, end)
lastPrice = tempData['Close'][len(tempData)-1:len(tempData)][0]
tempData['retClose'] = np.log(tempData['Adj Close'].astype(float)) - np.log(tempData['Adj Close'].astype(float).shift(1))
tempData['ret'] = np.log(tempData['Close'].astype(float)) - np.log(tempData['Open'].astype(float))
tempData = tempData.dropna()
tempData = tempData[['retClose', 'ret', 'Close']]
tempData.reindex()
tempData = pd.merge(tempBase, tempData, how='outer', left_index=True, right_index=True)
tempData['diff'] = tempData['closeBase'] - tempData['Close']
tempData = tempData.dropna()
tempData = tempData[['retBase', 'ret', 'retClose', 'diff']]
theCor = pearsonr(tempData['retBase'], tempData['retClose'])
# second position --> first position
gCause = ts.grangercausalitytests(tempData[['retClose', 'retBase']], 1, verbose=False)[1][0]['params_ftest'][1]
#if(theCor[1] <= statSig and (theCor[0] >= corThresh or theCor[0] <= -corThresh) and gCause <= statSig):
if gCause <= statSig and lastPrice >= priceThresh:
    tempData['rollMean'] = tempData['ret'].rolling(window=theWindow).mean()
    tempData['rollMeanBase'] = tempData['retBase'].rolling(window=theWindow).mean()
    # rolling correlation (pd.rolling_corr has been removed from pandas)
    tempData['rollCor'] = tempData['retBase'].rolling(theWindow).corr(tempData['ret'])
    tempData = tempData.dropna()
    theLen = len(tempData)
    trainLen = int(round(testSize*theLen, 0))
    testLen = int(theLen - trainLen)
    try:
        y = tempData['ret']  # [0:theLen-2] next day asset return
        X = tempData[['retBase', 'rollCor', 'rollMeanBase', 'rollMean', 'diff']]  # [1:theLen-1] event day features
def find_granger_causality(x1_df, x2_df, max_lags=5):
    data = pd.concat([x1_df, x2_df], axis=1).dropna()
    # x1 causes x2
    lag_1_2, causes_1_2 = is_granger_caused(
        grangercausalitytests(data[[0, 1]], max_lags, verbose=False))
    # x2 causes x1
    lag_2_1, causes_2_1 = is_granger_caused(
        grangercausalitytests(data[[1, 0]], max_lags, verbose=False))
    # x2 only Granger-causes x1 if, and only if, x1 does NOT cause x2 and x2 DOES cause x1
    return (lag_2_1, causes_2_1 and not causes_1_2)
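# is_granger_caused is not defined in this snippet; a plausible, hypothetical implementation
# (my assumption, not the original author's code) scans the statsmodels result dict for the
# first lag whose ssr F-test p-value clears a threshold and returns (lag, flag), matching the
# unpacking used above.
def is_granger_caused(gc_result, alpha=0.05):
    # gc_result is the dict returned by grangercausalitytests:
    # {lag: (test_dict, regression_results), ...}
    for lag in sorted(gc_result):
        if gc_result[lag][0]['ssr_ftest'][1] < alpha:
            return lag, True
    return None, False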
emotion['anger'].append(0)
emotion['happiness'].append(0)
emotion['surprise'].append(0)
emotion['sadness'].append(0)
emotion['fear'].append(0)
emotion['disgust'].append(0)
n_price.append(post['price'])
# n_return.append(post['return'])
# n_dates.append(date)
date = date + datetime.timedelta(days=1)

for key in emotion.keys():
    granger_array = np.array([np.array(n_price), np.array(emotion[key])]).T
    granger_results = st.grangercausalitytests(granger_array, 4, addconst=True, verbose=False)
    print(key.upper())
    results.write('\t' + key.upper() + '\t\t\t\tLag1\t\t\tLag2\t\t\tLag3\t\t\tLag4')
    results.write('\n')
    results.write('\tssr_chi2test\t\t' + str(granger_results[1][0]['ssr_chi2test'][1]) + '\t\t'
                  + str(granger_results[2][0]['ssr_chi2test'][1]) + '\t\t'
                  + str(granger_results[3][0]['ssr_chi2test'][1]) + '\t\t'
                  + str(granger_results[4][0]['ssr_chi2test'][1]) + '\n')
    results.write('\tssr_ftest\t\t' + str(granger_results[1][0]['ssr_ftest'][1]) + '\t\t'
                  + str(granger_results[2][0]['ssr_ftest'][1]) + '\t\t'
                  + str(granger_results[3][0]['ssr_ftest'][1]) + '\t\t'
                  + str(granger_results[4][0]['ssr_ftest'][1]) + '\n')

exit()
def test_granger_fails_on_nobs_check(self):
    # Test that if maxlag is too large, Granger Test raises a clear error.
    X = np.random.rand(10, 2)
    grangercausalitytests(X, 2, verbose=False)  # This should pass.
    assert_raises(ValueError, grangercausalitytests, X, 3, verbose=False)
# for i in range(len(price7)):
#     dst.write(str(i) + ',' + str(price1[i]) + ',' + str(n_price[i]) + ',' + str(returnn1[i]) + ',' +
#               str(n_return[i]) + ',' + str(pos[i]) + ',' + str(neg[i]) + ',' + str(bind[i]) + ',' +
#               str(tis[i]) + ',' + str(rtis[i]) + ',' + str(returnn2[i]) + ',' + str(returnn7[i]) + ',' +
#               str(price2[i]) + ',' + str(price7[i]) + '\n')
# dst.close()
# exit()

granger_array_positive = np.array([np.array(n_price), np.array(n_emotion)]).T
# print len(n_price)
# print len(n_negativos)
# print granger_array_positive
granger_positive = st.grangercausalitytests(granger_array_positive, 4, addconst=True, verbose=True)
print(granger_positive[1][0])

granger_array_negative = np.array([np.array(n_price), np.array(n_negativos)]).T
# print granger_array_negative
granger_negative = st.grangercausalitytests(granger_array_negative, 4, addconst=True, verbose=True)

# results.write('\tPositivos\n')
# results.write('\t\t\t\tLag1\t\t\tLag2\t\t\tLag3\t\t\tLag4')
# results.write('\n')
results.write(ticker.upper() + ' & ' + ('%.4f' % granger_positive[1][0]['ssr_ftest'][1]) + ' & ' +
              ('%.4f' % granger_positive[2][0]['ssr_ftest'][1]) + ' & ' +
# In[22]:

from statsmodels.tsa.ar_model import AR

model = AR(a)
model_fit = model.fit()
print('Lag: %s' % model_fit.k_ar)
maxlag = model_fit.k_ar


# In[23]:

addconst = True
verbose = True


# In[24]:

result = stm.grangercausalitytests(x, maxlag, addconst, verbose)

optimal_lag = -1
F_test = -1.0
for key in result.keys():
    _F_test_ = result[key][0]['params_ftest'][0]
    if _F_test_ > F_test:
        F_test = _F_test_
        optimal_lag = key


# In[25]:

print("{} {}".format("We are going to look into the GC with Optimal Lag of", optimal_lag))

# We consider the p-value of the test as a measure for Granger causality:
# rejecting ℋ0 (p < 0.03) signifies Granger causality; failing to reject it
# means non-causality.
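# A short alternative sketch (my addition, not from the notebook): rather than keeping the lag
# with the largest F statistic, pick the lag with the smallest params_ftest p-value and compare
# it against the 0.03 threshold mentioned above. `result` is the dict computed in In[24].
p_values = {lag: result[lag][0]['params_ftest'][1] for lag in result}
best_lag = min(p_values, key=p_values.get)
if p_values[best_lag] < 0.03:  # the notebook's threshold
    print("Granger causality suggested at lag", best_lag, "with p =", p_values[best_lag])
else:
    print("No lag clears the 0.03 threshold; no evidence of Granger causality.")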
fig = plt.figure()

ax = plt.subplot(241)
ax.plot(t, x1, label='time series 1')
ax.plot(t, x2, label='time series 2')
# ax.plot(t, lm.predict(t.reshape(-1, 1)), color='red', label='linear regression on ts2')
ax.set_xlabel('t')
ax.set_ylabel('x')
ax.legend()

lags = 5
mat = np.zeros((n, 2))
mat[:, 0] = x1
mat[:, 1] = x2
gr = grangercausalitytests(mat, lags, verbose=False)
p_values = [gr[x][0]['params_ftest'][1] for x in gr.keys()]

ax = plt.subplot(245)
ax.plot(range(1, lags + 1), list(p_values), label='granger causality')
ax.set_xlabel('lags')
ax.set_ylabel('p-value')

ax = plt.subplot(242)
ax.plot(t, x1)
ax.plot(t, x3)
# ax.plot(t, lm.predict(t.reshape(-1, 1)), color='red', label='linear regression on ts2')
ax.set_xlabel('t')

mat = np.zeros((n, 2))
mat[:, 0] = x1
def test_granger_fails_on_nobs_check(self, reset_randomstate):
    # Test that if maxlag is too large, Granger Test raises a clear error.
    x = np.random.rand(10, 2)
    grangercausalitytests(x, 2, verbose=False)  # This should pass.
    with pytest.raises(ValueError):
        grangercausalitytests(x, 3, verbose=False)
def compute_relationship(
        v1: np.ndarray,
        v2: np.ndarray,
        v1_label: Text = 'v1',
        v2_label: Text = 'v2',
        maxlag: int = 4,
        fname: Text = '',
        verbose: bool = True) -> dict:
    """Computes the relationship between two vectors.

    Granger causality tests whether the time series in the 2nd column Granger
    causes the time series in the 1st column. Here that means: does v2 Granger
    cause v1?

    Args:
        v1: First array of numbers.
        v2: Second array of numbers.
        v1_label: The string label for v1.
        v2_label: The string label for v2.
        maxlag: Maximum lag in the Granger causality test.
        fname: File name. If empty string, it does not save it.
        verbose: If we want the function to print the full report.

    Returns:
        Dictionary of correlation p-value, r-value and causality report.

    Raises:
        If there were insufficient observations for the given lag.
    """
    # Correlation test.
    rval, pval = pearsonr(v1, v2)
    if verbose:
        significant = ''
        if pval < 0.05:
            significant = 'yay!!!!'
        print('r-val: {}\np-val: {} \t{}'.format(rval, pval, significant))

    # Scatter plot.
    f = plt.figure()
    sns.scatterplot(v2, v1)
    # plt.plot((min(v1), max(v2)), (max(v1), min(v2)), 'r')
    plt.plot(np.linspace(min(v2), max(v2)), np.linspace(min(v1), max(v1)), 'r')
    plt.xlabel(v2_label)
    plt.ylabel(v1_label)
    plt.show()
    if fname:
        f.savefig('{}.png'.format(fname), bbox_inches='tight')
        f.savefig('{}.pdf'.format(fname), bbox_inches='tight')

    # Causality test.
    causality_res = grangercausalitytests(
        np.column_stack((v1, v2)), maxlag=maxlag, verbose=verbose)

    return {'rval': rval, 'pval': pval, 'causality': causality_res}
def read_csv_by_Pandas(file):
    df = (pd.read_csv(file, sep=",", header=0))[st:ed]
    dfIt = df['iteration']
    dfS_1 = df['1']
    dfS_2 = df['2']
    dfS_3 = df['3']
    #dfS_4 = df['4']
    #dfS_5 = df['5']

    print('\n\n1 eats 2?')
    grangercausalitytests(df[['1', '2']], maxlag=[maxlag])
    print('\n\n2 eats 1?')
    grangercausalitytests(df[['2', '1']], maxlag=[maxlag])
    print('\n\n1 eats 3?')
    grangercausalitytests(df[['1', '3']], maxlag=[maxlag])
    print('\n\n3 eats 1?')
    grangercausalitytests(df[['3', '1']], maxlag=[maxlag])
    print('\n\n2 eats 3?')
    grangercausalitytests(df[['2', '3']], maxlag=[maxlag])
    print('\n\n3 eats 2?')
    grangercausalitytests(df[['3', '2']], maxlag=[maxlag])

    #print('\n\n5 eats 3?')
    #grangercausalitytests(df[['5', '3']], maxlag=[50])
    #print('\n\n3 eats 5?')
    #grangercausalitytests(df[['3', '5']], maxlag=[50])
    #print('\n\n1 eats 4?')
    #grangercausalitytests(df[['1', '4']], maxlag=[100])
    #print('\n\n4 eats 1?')
    #grangercausalitytests(df[['4', '1']], maxlag=[100])
    #print('\n\n2 eats 4?')
    #grangercausalitytests(df[['2', '4']], maxlag=[100])
    #print('\n\n4 eats 2?')
    #grangercausalitytests(df[['4', '2']], maxlag=[100])

    ax = plt.gca()
    df.plot(x='iteration', y='1', kind='line', ax=ax)
    df.plot(x='iteration', y='2', kind='line', ax=ax)
    df.plot(x='iteration', y='3', kind='line', ax=ax)
    #df.plot(x='iteration', y='4', kind='line', ax=ax)
    #df.plot(x='iteration', y='5', kind='line', ax=ax)
    plt.show()
from statsmodels.tsa.stattools import grangercausalitytests
import pandas as pd

df = pd.read_excel('data12.xlsx', usecols=[1, 2])
grangercausalitytests(df, maxlag=3)
#%%
from statsmodels.tsa.stattools import grangercausalitytests

# Drop rows with NaN values, because the Granger causality test cannot be run
# on data containing NaN or inf.
wmt.dropna(subset=['Vader Sentiment', 'returns'], inplace=True)

import statsmodels
statsmodels.tsa.stattools.adfuller(wmt.returns, regression='ct')  # no unit root

# According to the results, we believe there is no Granger causality from
# sentiment to returns: all p-values are above .05, so we fail to reject the
# null hypothesis that the time series in the second column (x2) does NOT
# Granger-cause the time series in the first column (x1).
grangercausalitytests(wmt[['returns', 'Vader Sentiment']], maxlag=4)


# ## Regress
#
# $$
# r_{i, t}=\alpha+\beta_{1} \Delta s_{i, t}+\beta_{2} \Delta s_{i, t-1}+\epsilon_{t}
# $$
#

wmt['sentiment_lag'] = wmt['Vader Sentiment'].shift(1)
wmt['L_s1'] = wmt['Vader Sentiment'].pct_change(1)
wmt['L_s2'] = wmt['sentiment_lag'].pct_change(1)
wmt.head()

import statsmodels.formula.api as smf
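# A small sketch (my addition, not from the notebook) of checking the same conclusion
# programmatically instead of reading the printed report; the column order follows the
# call above (sentiment in the second column is the candidate cause).
gc = grangercausalitytests(wmt[['returns', 'Vader Sentiment']], maxlag=4, verbose=False)
pvals = {lag: res[0]['ssr_ftest'][1] for lag, res in gc.items()}
print(pvals)
print("any lag significant at 5%?", any(p < 0.05 for p in pvals.values()))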
import pymongo
import datetime
import math
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
import statsmodels.tsa.stattools as st

client = pymongo.MongoClient()
db = client.vix

n_vix = []
n_fear = []
for post in db['vix'].find().sort('date', 1):
    n_vix.append(post['vnx'])
    n_fear.append(post['fear_ip'])

granger_array = np.array([np.array(n_vix), np.array(n_fear)]).T
granger_positive = st.grangercausalitytests(granger_array, 7, addconst=True, verbose=True)
print(granger_positive[1][0])
def Granger(self, arr, max_lag=12, addconst=True, disp=True):
    x = np.vstack((self.arr, arr)).T
    return tsa_tools.grangercausalitytests(x, maxlag=max_lag, addconst=addconst, verbose=disp)
from statsmodels.compat.python import iteritems
import numpy as np
from numpy.testing import assert_almost_equal
from statsmodels.datasets import macrodata
import statsmodels.tsa.stattools as tsa_stats

# some example data
mdata = macrodata.load_pandas().data
mdata = mdata[['realgdp', 'realcons']].values
data = mdata
data = np.diff(np.log(data), axis=0)

# R: lmtest:grangertest
r_result = [0.243097, 0.7844328, 195, 2]  # f_test
gr = tsa_stats.grangercausalitytests(data[:, 1::-1], 2, verbose=False)
assert_almost_equal(r_result, gr[2][0]['ssr_ftest'], decimal=7)
assert_almost_equal(gr[2][0]['params_ftest'], gr[2][0]['ssr_ftest'], decimal=7)

lag = 2
print('\nTest Results for %d lags' % lag)
print()
print('\n'.join(['%-20s statistic: %f6.4 p-value: %f6.4' % (k, res[0], res[1])
                 for k, res in iteritems(gr[lag][0])]))

print('\n Results for auxiliary restricted regression with two lags')
print()
print(gr[lag][1][0].summary())

print('\n Results for auxiliary unrestricted regression with two lags')
# ### Granger Causality Test
# We have run the Granger causality test to figure out which variable's lags impact the
# other variables. Clearly the export and import data are independent, but CPI and FX are
# to some extent dependent on the other variables.

# In[23]:

gc_matrix = pd.DataFrame(np.zeros([len(adj_data.columns), len(adj_data.columns)]),
                         columns=adj_data.columns, index=adj_data.columns)


# In[24]:

for y in gc_matrix.columns:
    for x in gc_matrix.index:
        res = grangercausalitytests(adj_data[[y, x]], maxlag=6, verbose=False)
        p_values = [round(res[i + 1][0]["ssr_chi2test"][1], 4) for i in range(6)]
        min_values = np.min(p_values)
        gc_matrix.loc[x, y] = min_values


# In[25]:

gc_matrix

# ### Cointegration Test
# Let's see if the variables we considered are cointegrated. For that we have used the
# Johansen test. The result shows no cointegration for the four variables, but if we take
# the first difference of the series, it becomes cointegrated.

# In[26]:
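# A short follow-up sketch (my addition): since each cell of gc_matrix holds the minimum
# p-value over lags 1-6, a simple threshold turns it into a directional indicator matrix.
# If I read the loop above correctly, the row variable is the candidate cause and the
# column variable is the effect.
causes = gc_matrix < 0.05  # True where the row variable Granger-causes the column variable
print(causes)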
# twitter dataset
twitter_data = pd.read_csv('C:/Users/Smit/combined_csv1.csv')
dfVolume = twitter_data.groupby(['Date'])['Date'].count()
dfT = pd.Series.to_frame(dfVolume)
dfT['id'] = list(dfT.index)
dfT.columns = ['Count', 'DateTime']
dfT.to_csv('C:/Users/Smit/TWvol.csv')

dfGC = pd.DataFrame()
dfGC['StockPrice'] = df.Close
dfGC['TweetsCount'] = dfT.Count
dfGC['TweetsCount'] = dfGC['TweetsCount'].ffill()
#dfGC1 = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/a10.csv', parse_dates=['date'])
#df['month'] = df.date.dt.month
grangercausalitytests(dfGC[['TweetsCount', 'StockPrice']], maxlag=3)

#######################################################################
# twitter dataset
dfPosRatio = pd.read_csv('C:/Users/Smit/PosRatioTweets.csv', index_col=1)
dfGC1 = pd.DataFrame()
dfGC1['StockPrice'] = df.Close
dfGC1['Ratio'] = dfPosRatio.Ratio
dfGC1['Ratio'] = dfGC1['Ratio'].ffill()
grangercausalitytests(dfGC1[['Ratio', 'StockPrice']], maxlag=3)

#######################################################################
# stock price vs crude oil
dfSP = pd.read_csv("C:/Users/Smit/Dataset/yahoo/stockMarket.csv")
X = pd.DataFrame(datas_minus_signal.iloc[:, k]).apply(
    lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
figure_count = 1
plt.figure(figure_count)
figure_count += 1
plt.plot(y, 'k-', label='%s' % j + '_' + '%s' % stage)
plt.plot(X, 'b-', label='%s' % i + '_' + '%s' % stage + '%s' % columns_name)
plt.legend(loc='upper right')
plt.legend(prop=zhfont1)
plt.savefig(para.path_results + '%s' % i + '_' + '%s' % j + '_' + '%s' % stage
            + '%s' % columns_name + '%d.jpg' % (k))
plt.close()

# Step 4: time-series test: Granger causality test
x_value = datas_minus_signal.iloc[:, k].diff().dropna().values  # the original code ran the Granger test on the period-over-period changes
y = datas_minus_signal.iloc[:, 0]
y_value = y.loc[datas_minus_signal.iloc[:, k].dropna().index].pct_change().dropna()
gr_result = grangercausalitytests(np.array([x_value, y_value]).T, maxlag=period,
                                  addconst=True, verbose=False)
dict_roll[k] = [gr_result[j + 1][0]['lrtest'][1] for j in range(period)]

# Step 6: write the results to Excel
y = pd.DataFrame(datas_minus_signal.iloc[:, 0])
X = pd.DataFrame(datas_minus_signal.iloc[:, k])
df = pd.merge(y, X, left_index=True, right_index=True)
# a = (k - 1) * 5   a, '%'
print(stage, '%s' % j + ':%s' % i + ':%s' % columns_name, round(df.corr().iloc[1, 0], 4))
X_name = 'percentile' + '%s' % columns_name + '%'
df.columns = ['close price', str(X_name)]
df.index = range(df.shape[0])
name = '%s' % j + '_' + '%s' % stage + '%' + '%s' % stage + '.xlsx'
# To write the file out, uncomment the next line
df.to_excel(para.path_results + name)
def granger_causality_tests(
    ts_cause: TimeSeries,
    ts_effect: TimeSeries,
    maxlag: int,
    addconst: bool = True,
    verbose: bool = True,
) -> None:
    """
    Provides four tests for granger non causality of 2 time series using
    :func:`statsmodels.tsa.stattools.grangercausalitytests`.
    See [1]_.

    Parameters
    ----------
    ts_cause
        A univariate deterministic time series. The statistical test determines if this time series
        'Granger causes' the time series ts_effect (second parameter). Missing values are not supported.
        If H_0 (non-causality) is rejected (p near 0), then there is a 'Granger causality'.
    ts_effect
        Univariate time series 'Granger caused' by ts_cause.
    maxlag
        If an integer, computes the test for all lags up to maxlag.
        If an iterable, computes the tests only for the lags in maxlag.
    addconst
        Include a constant in the model.
    verbose
        Print results.

    Returns
    -------
    Dict
        All test results, dictionary keys are the number of lags. For each lag the values are a tuple,
        with the first element a dictionary with test statistic, pvalues, degrees of freedom, the second
        element are the OLS estimation results for the restricted model, the unrestricted model and the
        restriction (contrast) matrix for the parameter f_test.

    References
    ----------
    .. [1] https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.grangercausalitytests.html
    """
    ts_cause._assert_univariate()
    ts_effect._assert_univariate()
    ts_cause._assert_deterministic()
    ts_effect._assert_deterministic()
    raise_if_not(
        ts_cause.freq == ts_effect.freq,
        "ts_cause and ts_effect must have the same frequency.",
    )

    if not ts_cause.has_same_time_as(ts_effect):
        logger.warning(
            "ts_cause and ts_effect time series have different time index. "
            "We will slice-intersect ts_cause with ts_effect.")

    ts_cause = ts_cause.slice_intersect(ts_effect)
    ts_effect = ts_effect.slice_intersect(ts_cause)

    if not stationarity_tests(ts_cause):
        logger.warning(
            "ts_cause doesn't seem to be stationary. Please review granger causality validity in your problem context."
        )
    if not stationarity_tests(ts_effect):
        logger.warning(
            "ts_effect doesn't seem to be stationary. Please review granger causality validity in your problem context."
        )

    return grangercausalitytests(
        np.concatenate(
            (ts_effect.values(copy=False), ts_cause.values(copy=False)),
            axis=1),
        maxlag,
        addconst,
        verbose,
    )
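# A hedged usage sketch (my addition, not from the darts source). It assumes this helper is
# exposed as darts.utils.statistics.granger_causality_tests and that TimeSeries.from_values
# builds a univariate deterministic series; the synthetic data are purely illustrative.
import numpy as np
from darts import TimeSeries
from darts.utils.statistics import granger_causality_tests  # assumed import path

rng = np.random.default_rng(0)
cause_vals = rng.normal(size=300)
effect_vals = np.roll(cause_vals, 1) + 0.1 * rng.normal(size=300)  # effect trails cause by one step

ts_cause = TimeSeries.from_values(cause_vals)
ts_effect = TimeSeries.from_values(effect_vals)
res = granger_causality_tests(ts_cause, ts_effect, maxlag=3, verbose=False)
print(res[1][0]['ssr_ftest'])  # (F statistic, p-value, df_denom, df_num) at lag 1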
# In[89]:

from statsmodels.tsa.stattools import grangercausalitytests

for region in region_names:
    region_to_test = region
    HPcolumn_name = region_to_test + '_HP'
    granger_tests = []
    for idx, reg_name in enumerate(region_names):
        if reg_name == region_to_test:
            granger_tests.append(None)
        else:
            test = grangercausalitytests(
                ripple_matrices[idx]['2007':][['AveragePrice', HPcolumn_name]],
                maxlag=24, verbose=False)
            granger_tests.append(test)
    for idx2, test in enumerate(granger_tests):
        if test is not None:
            if test[24][0]['ssr_ftest'][1] < 0.01:
                print(region, 'causing', region_names[idx2] + ': \n')
                print('SSR_test:', test[24][0]['ssr_ftest'], 'LR_test:', test[24][0]['lrtest'])


# In[174]:

from statsmodels.tsa.stattools import grangercausalitytests

granger_dict1 = {}
def test_granger_causality_exceptions(dataset):
    with pytest.raises(InfeasibleTestError):
        grangercausalitytests(dataset, 4)
from statsmodels.compat.python import iteritems
import numpy as np
from numpy.testing import assert_almost_equal
from statsmodels.datasets import macrodata
import statsmodels.tsa.stattools as tsa_stats

# some example data
mdata = macrodata.load_pandas().data
mdata = mdata[['realgdp', 'realcons']].values
data = mdata
data = np.diff(np.log(data), axis=0)

# R: lmtest:grangertest
r_result = [0.243097, 0.7844328, 195, 2]  # f_test
gr = tsa_stats.grangercausalitytests(data[:, 1::-1], 2, verbose=False)
assert_almost_equal(r_result, gr[2][0]['ssr_ftest'], decimal=7)
assert_almost_equal(gr[2][0]['params_ftest'], gr[2][0]['ssr_ftest'], decimal=7)

lag = 2
print('\nTest Results for %d lags' % lag)
print()
print('\n'.join([
    '%-20s statistic: %f6.4 p-value: %f6.4' % (k, res[0], res[1])
    for k, res in iteritems(gr[lag][0])
]))

print('\n Results for auxiliary restricted regression with two lags')
print()
print(gr[lag][1][0].summary())
def test_granger_fails_on_finite_check(self, reset_randomstate):
    x = np.random.rand(1000, 2)
    x[500, 0] = np.nan
    x[750, 1] = np.inf
    with pytest.raises(ValueError, match="x contains NaN"):
        grangercausalitytests(x, 2)
line2 = ax.plot(data_in_fn, c='orange', alpha=0.9)
line3 = ax2.plot(btc, c='red', alpha=0.7)

ax.set_ylabel('Normalised TextBlob Sentiment', color='b', labelpad=10)
ax2.set_ylabel('BTC Price', color='r', labelpad=10)

plt.savefig(
    f'../plots/btc_textblob_sentiment/rolling_blob_btc_{file_key}.png',
    dpi=150,
    transparent=True)


### Run plotting functions ###
plotter(bbc_per_fn, bbc_per_month, 'bbc')
plotter(cnn_per_fn, cnn_per_month, 'cnn')
plotter(nyt_per_fn, nyt_per_month, 'nyt')
plotter(reuters_per_fn, reuters_per_month, 'reuters')
plotter(agg_per_fn, agg_per_month, 'agg')

### Correlation of time series ###
agg_per_fn = agg_per_fn.iloc[1:-1]
agg_per_fn.corr(btc['price'])

### Granger causality of time series ###
test_frame_01 = pd.concat([btc['price'], agg_per_fn], axis=1)
grangercausalitytests(test_frame_01, 70, addconst=True, verbose=True)

test_frame_02 = pd.concat([agg_per_fn, btc['price']], axis=1)
grangercausalitytests(test_frame_02, 70, addconst=True, verbose=True)
tmp_d = add_months(d, -(j))
index_list.append(tmp_d)

data = pd.DataFrame(list(zip(leader_list, sum_list)), index=index_list, columns=['leader', 'sum'])
rawData = data.copy(deep=True)

# ADF test
X1sta = 0
X2sta = 0
X1 = np.array(data['leader'])
X1 = X1[~np.isnan(X1)]
result = adfuller(X1)
for key, value in result[4].items():
    if result[0] < value:
        # stationary
        X1sta = 1
        break

X2 = np.array(data['sum'])
X2 = X2[~np.isnan(X2)]
result = adfuller(X2)
for key, value in result[4].items():
    if result[0] < value:
        # stationary
        X2sta = 1
        break

if X1sta == 0:
    data['leader'] = data['leader'] - data['leader'].shift(1)
if X2sta == 0:
    data['sum'] = data['sum'] - data['sum'].shift(1)
data = data.dropna()

res = grangercausalitytests(data[['sum', 'leader']], maxlag=1)
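# The block above checks stationarity with adfuller, first-differences any non-stationary
# series, then runs the Granger test. A compact, self-contained version of that workflow
# (my sketch; using the ADF p-value instead of comparing the statistic to critical values):
import pandas as pd
from statsmodels.tsa.stattools import adfuller, grangercausalitytests

def difference_if_nonstationary(s, alpha=0.05):
    # adfuller returns (statistic, p-value, ...); a small p-value suggests stationarity.
    pvalue = adfuller(s.dropna())[1]
    return s if pvalue < alpha else s.diff()

def granger_after_adf(df, cause_col, effect_col, maxlag=1):
    # statsmodels tests whether the second column Granger-causes the first.
    prepared = pd.concat(
        [difference_if_nonstationary(df[effect_col]),
         difference_if_nonstationary(df[cause_col])], axis=1).dropna()
    return grangercausalitytests(prepared, maxlag=maxlag, verbose=False)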
def granger_causality(df, time_key, causality_key):
    return grangercausalitytests(df[[time_key, causality_key]], maxlag=maxlag)
dfout = pd.Series(dftest[0:4],
                  index=['ADF Test Statistics', 'p-value', '#lags used', '# Observations'])
for key, val in dftest[4].items():
    dfout[f'critical value ({key})'] = val

dftest1 = adfuller(df2['Births'])
dfout1 = pd.Series(dftest1[0:4],
                   index=['ADF Test Statistics', 'p-value', '#lags used', '# Observations'])

df3 = pd.read_csv('samples.csv', index_col=0, parse_dates=True)
df3.index.freq = 'MS'

from statsmodels.tsa.stattools import grangercausalitytests

grangercausalitytests(df3[['a', 'd']], maxlag=3)
grangercausalitytests(df3[['b', 'd']], maxlag=3)

np.random.seed(42)
df = pd.DataFrame(np.random.randint(20, 30, (50, 2)), columns=['test', 'predictions'])
def granger_run(plsa_file, df_all_normalized):
    # This runs a Granger test to find relevant topics based on an external time series.
    min_probability = 1.0
    df_plsa = pd.read_csv(plsa_file, error_bad_lines=False)
    # df_plsa = df_plsa.drop(df_plsa.columns[3], axis=1)
    # df_plsa = df_plsa.drop(df_plsa.columns[3], axis=1)

    # We find all unique topics and add each topic as a key to a dictionary:
    # df_topics_collection. The value for each key is a dataframe containing the dates
    # and probabilities for that topic.
    topics = df_plsa.topic.unique()
    df_topics_collection = {}
    for topic in topics:
        df_topics_collection[topic] = df_plsa.loc[df_plsa['topic'] == topic]

    # Iterate through each topic and create stationary time series for the probabilities
    # and the stock data. These are then used in a Granger test, whose results are manually
    # inspected to find relevant topics. These topics are added to a dictionary, relevent_topic.
    relevent_topic = {}
    for topic in topics:
        df = df_topics_collection[topic]
        df = df.rename(columns={"date": "Date"})
        df = pd.merge(df, df_all_normalized, on="Date")
        temp_df = df.loc[(df['Contract'] == 'Dem')]
        temp_df['Probablity_stationary'] = temp_df['probability'] - temp_df['probability'].shift(1)
        temp_df['NormalizedPrice_stationary'] = temp_df['NormalizedPrice'] - temp_df['NormalizedPrice'].shift(1)
        temp_df = temp_df.dropna()
        try:
            res = grangercausalitytests(temp_df[['Probablity_stationary', 'NormalizedPrice_stationary']], maxlag=5)
            relevent_topic[topic] = res
            print(res)
        except:
            continue

    # This is a Pearson test to find relevant words within a topic based on an external CSV.
    # We find relevant words for each topic, and create a new CSV 'words_per_topic.csv'.
    # We also create a new CSV 'word_frequency.csv' with each word and its frequency throughout
    # each day's files. These files are loaded into a data frame.
    word_retriever.retrieve_words_per_topic_and_frequency('plsa_without_prior.csv')
    df_word_per_topic = pd.read_csv('words_per_topic.csv', error_bad_lines=False, header=None)
    df_word_frequency = pd.read_csv('word_frequency.csv', error_bad_lines=False)

    # We find the positively and negatively correlated words for each topic.
    # We run a Pearson coefficient test on these topics and then find all topics that add up
    # to our probability mass. This creates two new topics which are reported in two new CSVs
    # '[topic]_[positive/negative].csv' written to disk.
    for index, row in df_word_per_topic.iterrows():
        topic = row[0]
        words = row[1].split(",")
        pearson_results = {}
        prob_mass = 0.75
        for word in words:
            df_word_freq = df_word_frequency[df_word_frequency.word == word]
            df_word_freq = df_word_freq.rename(columns={"date": "Date"})
            df_word_freq = pd.merge(df_word_freq, df_all_normalized, on="Date")
            df_word_freq = df_word_freq.loc[(df_word_freq['Contract'] == 'Dem')]
            # df_word_freq['frequency_stationary'] = df_word_freq['frequency'] - df_word_freq['frequency'].shift(1)
            df_word_freq['NormalizedPrice_stationary'] = df_word_freq['NormalizedPrice'] - df_word_freq['NormalizedPrice'].shift(1)
            # print(df_word_freq)
            df_word_freq = df_word_freq.dropna()
            corr, _ = pearsonr(df_word_freq['frequency'], df_word_freq['NormalizedPrice'])
            if not math.isnan(corr):
                pearson_results[word] = corr

        sorted_ascending = sorted(pearson_results.items(), key=operator.itemgetter(1))
        temp_mass = prob_mass
        negative_words = []
        for key in sorted_ascending:
            if key[1] > 0:
                break
            elif temp_mass + key[1] > 0:
                negative_words.append([key[0], key[1]])
                temp_mass = temp_mass + key[1]
        df_neg = pd.DataFrame(negative_words, columns=['Word', 'Probability'])
        file_name = "../CourseProject/prior_csvs/" + topic + "_negative.csv"
        df_neg.to_csv(file_name)

        sorted_descending = sorted(pearson_results.items(), key=operator.itemgetter(1), reverse=True)
        temp_mass = prob_mass
        positive_words = []
        min_probability = 1.0
        for key in sorted_descending:
            if key[1] < 0:
                break
            elif temp_mass - key[1] > 0:
                positive_words.append([key[0], key[1]])
                temp_mass = temp_mass - key[1]
                min_probability = min(min_probability, key[1])
        print(temp_mass)
        df_pos = pd.DataFrame(positive_words, columns=['Word', 'Probability'])
        file_name = "../CourseProject/prior_csvs/" + topic + "_positive.csv"
        df_pos.to_csv(file_name)

    return min_probability
def RegressionAnalysis(df, Independent, Explanatory, Indicators, prefix=None): """ This function performs regression models, comparaison between series Arguments: ---------- - df: Pandas DataFrame Contains the data to be analyzed - Independent: str The name of column in df for the Independent variable data - Explanatory: str or list The name of the column in df for the Explanatory variable data. In case of a multivariate analysis, needed to pass a list object of all column names. - Indicators: list The list of the indicators/models names to compute Return: ---------- - df: Pandas DataFrame - Contains the initial df and all series indicators are added like the Residuals or the Fitted Values - OneValueIndicators: Pandas DataFrame - Contains all the indicators calculated with only one value like the FTest or the TTest """ if Indicators == None: Indicators = [ "OLS", "GLSAR", "RecursiveLS", "Yule Walker Order 1", "Yule Walker Order 2", "Yule Walker Order 3", "Burg Order 1", "Burg Order 2", "Burg Order 3", "QuantReg", "GLM Binomial", "GLM Gamma", "GLM Gaussian", "GLM Inverse Gaussian", "GLM Negative Binomial", "GLM Poisson", "GLM Tweedie" "AR", "ARMA", "ARIMA", "Granger Causality", "Levinson Durbin", "Cointegration" ] # Pre-processing Independent = df[Independent] Independent = pd.DataFrame(Independent) Explanatory = df[Explanatory] Explanatory = pd.DataFrame(Explanatory) y_sm = np.array(Independent).reshape((-1, 1)) x_sm = np.array(Explanatory) x_sm = sm.add_constant(x_sm) NumDecimal = 3 # Number of decimals for rounding numbers OneValueIndicators = {} if prefix == None: prefix = "" ################################################## ##### PART 1: Linear Regression ################################################## """ ########## Section 1: OLS """ name = "OLS" if name in Indicators: name = prefix + name model = sm.OLS(y_sm, x_sm) results = model.fit() ### One Value Indicators OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators( OneValueIndicators, name, results, Explanatory, NumDecimal) ### Time Series Indicators # Fitted Values df = Statsmodels_FittedValues(df, results, name) # Residuals df = Statsmodels_LR_Residuals(df, results, name) """ ########## Section 2: WLS """ ### Not Implemented """ ########## Section 3: GLS """ ### Not Implemented """ ########## Section 4: GLSAR """ name = "GLSAR" if name in Indicators: name = prefix + name model = sm.GLSAR(y_sm, x_sm, 1) results = model.iterative_fit(1) ### One Value Indicators OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators( OneValueIndicators, name, results, Explanatory, NumDecimal) ### Time Series Indicators # Fitted Values df = Statsmodels_FittedValues(df, results, name) # Residuals df = Statsmodels_LR_Residuals(df, results, name) """ ########## Section 5: RLS """ name = "RecursiveLS" if name in Indicators: name = prefix + name model = sm.RecursiveLS(y_sm, x_sm) results = model.fit() ### One Value Indicators OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators( OneValueIndicators, name, results, Explanatory, NumDecimal) OneValueIndicators[name + " Z Value"] = results.zvalues ### Time Series Indicators # Fitted Values df = Statsmodels_FittedValues(df, results, name) # Residuals df = Statsmodels_LR_Residuals(df, results, name) # Cumsum # Not Implemented """ ########## Section 6: Yule Walker ORder 1 """ name = "Yule Walker Order 1" if name in Indicators and len(Explanatory.columns) == 1: name = prefix + name rho, sigma = statsmodels.regression.linear_model.yule_walker( x_sm[:, 1].flatten(), order=1) 
### One Value Indicators # Rho OneValueIndicators[name + " Rho"] = round(rho[0], NumDecimal) # Sigma OneValueIndicators[name + " Sigma"] = round(sigma, NumDecimal) """ ########## Section 7: Yule Walker ORder 2 """ name = "Yule Walker Order 2" if name in Indicators and len(Explanatory.columns) == 1: name = prefix + name rho, sigma = statsmodels.regression.linear_model.yule_walker( x_sm[:, 1].flatten(), order=2) ### One Value Indicators # Rho OneValueIndicators[name + " Rho"] = round(rho[0], NumDecimal) # Sigma2 OneValueIndicators[name + " Sigma"] = round(sigma, NumDecimal) """ ########## Section 8: Yule Walker ORder 3 """ name = "Yule Walker Order 3" if name in Indicators and len(Explanatory.columns) == 1: name = prefix + name rho, sigma = statsmodels.regression.linear_model.yule_walker( x_sm[:, 1].flatten(), order=3) ### One Value Indicators # Rho OneValueIndicators[name + " Rho"] = round(rho[0], NumDecimal) # Sigma OneValueIndicators[name + " Sigma"] = round(sigma, NumDecimal) """ ########## Section 9: Burg's AR(p) ORder 1 """ name = "Burg Order 1" if name in Indicators and len(Explanatory.columns) == 1: name = prefix + name rho, sigma2 = statsmodels.regression.linear_model.burg( x_sm[:, 1].flatten(), order=1) ### One Value Indicators # Rho OneValueIndicators[name + " Rho"] = round(rho[0], NumDecimal) # Sigma2 OneValueIndicators[name + " Sigma2"] = round(sigma2, NumDecimal) """ ########## Section 10: Burg's AR(p) ORder 2 """ name = "Burg Order 2" if name in Indicators and len(Explanatory.columns) == 1: name = prefix + name rho, sigma2 = statsmodels.regression.linear_model.burg( x_sm[:, 1].flatten(), order=2) ### One Value Indicators # Rho OneValueIndicators[name + " Rho"] = round(rho[0], NumDecimal) # Sigma2 OneValueIndicators[name + " Sigma2"] = round(sigma2, NumDecimal) """ ########## Section 11: Burg's AR(p) ORder 3 """ name = "Burg Order 3" if name in Indicators and len(Explanatory.columns) == 1: name = prefix + name rho, sigma2 = statsmodels.regression.linear_model.burg( x_sm[:, 1].flatten(), order=3) ### One Value Indicators # Rho OneValueIndicators[name + " Rho"] = round(rho[0], NumDecimal) # Sigma2 OneValueIndicators[name + " Sigma2"] = round(sigma2, NumDecimal) """ ########## Section 12: Quantile Regression """ name = "QuantReg" if name in Indicators: name = prefix + name model = sm.QuantReg(y_sm, x_sm) results = model.fit() ### One Value Indicators OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators( OneValueIndicators, name, results, Explanatory, NumDecimal) ### Time Series Indicators # Fitted Values df = Statsmodels_FittedValues(df, results, name) # Residuals df = Statsmodels_LR_Residuals(df, results, name) ################################################## ##### PART 2: Generalized Linear Models ################################################## """ ########## Section 1: GLM Binomial """ name = "GLM Binomial" if name in Indicators: name = prefix + name model = sm.GLM(y_sm, x_sm, family=sm.families.Binomial()) results = model.fit() ### One Value Indicators OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators( OneValueIndicators, name, results, Explanatory, NumDecimal) OneValueIndicators["Pearson chi2"] = round(results.pearson_chi2, NumDecimal) ### Time Series Indicators # Fitted Values df = Statsmodels_FittedValues(df, results, name) # Residuals df = Statsmodels_LR_Residuals(df, results, name) """ ########## Section 2: GLM Gamma """ name = "GLM Gamma" if name in Indicators: name = prefix + name model = sm.GLM(y_sm, x_sm, family=sm.families.Gamma()) 
results = model.fit() ### One Value Indicators OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators( OneValueIndicators, name, results, Explanatory, NumDecimal) OneValueIndicators["Pearson chi2"] = round(results.pearson_chi2, NumDecimal) ### Time Series Indicators # Fitted Values df = Statsmodels_FittedValues(df, results, name) # Residuals df = Statsmodels_LR_Residuals(df, results, name) """ ########## Section 3: GLM Gaussian """ name = "GLM Gaussian" if name in Indicators: name = prefix + name model = sm.GLM(y_sm, x_sm, family=sm.families.Gaussian()) results = model.fit() ### One Value Indicators OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators( OneValueIndicators, name, results, Explanatory, NumDecimal) OneValueIndicators["Pearson chi2"] = round(results.pearson_chi2, NumDecimal) ### Time Series Indicators # Fitted Values df = Statsmodels_FittedValues(df, results, name) # Residuals df = Statsmodels_LR_Residuals(df, results, name) """ ########## Section 3: GLM InverseGaussian """ name = "GLM Inverse Gaussian" if name in Indicators: name = prefix + name model = sm.GLM(y_sm, x_sm, family=sm.families.InverseGaussian()) results = model.fit() ### One Value Indicators OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators( OneValueIndicators, name, results, Explanatory, NumDecimal) OneValueIndicators["Pearson chi2"] = round(results.pearson_chi2, NumDecimal) ### Time Series Indicators # Fitted Values df = Statsmodels_FittedValues(df, results, name) # Residuals df = Statsmodels_LR_Residuals(df, results, name) """ ########## Section 4: GLM NegativeBinomial """ name = "GLM Negative Binomial" if name in Indicators: name = prefix + name model = sm.GLM(y_sm, x_sm, family=sm.families.NegativeBinomial()) results = model.fit() ### One Value Indicators OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators( OneValueIndicators, name, results, Explanatory, NumDecimal) OneValueIndicators["Pearson chi2"] = round(results.pearson_chi2, NumDecimal) ### Time Series Indicators # Fitted Values df = Statsmodels_FittedValues(df, results, name) # Residuals df = Statsmodels_LR_Residuals(df, results, name) """ ########## Section 5: GLM Poisson """ name = "GLM Poisson" if name in Indicators: name = prefix + name model = sm.GLM(y_sm, x_sm, family=sm.families.Poisson()) results = model.fit() ### One Value Indicators OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators( OneValueIndicators, name, results, Explanatory, NumDecimal) OneValueIndicators["Pearson chi2"] = round(results.pearson_chi2, NumDecimal) ### Time Series Indicators # Fitted Values df = Statsmodels_FittedValues(df, results, name) # Residuals df = Statsmodels_LR_Residuals(df, results, name) """ ########## Section 6: GLM Tweedie """ name = "GLM Tweedie" if name in Indicators: name = prefix + name model = sm.GLM(y_sm, x_sm, family=sm.families.Tweedie()) results = model.fit() ### One Value Indicators OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators( OneValueIndicators, name, results, Explanatory, NumDecimal) OneValueIndicators["Pearson chi2"] = round(results.pearson_chi2, NumDecimal) ### Time Series Indicators # Fitted Values df = Statsmodels_FittedValues(df, results, name) # Residuals df = Statsmodels_LR_Residuals(df, results, name) ################################################## ##### PART 3: Robust Linear Models ################################################## ################################################## ##### PART 4: AR models 
################################################## name = "AR" if name in Indicators: name = prefix + name model = statsmodels.tsa.ar_model.AR(Independent) results = model.fit() ### One Value Indicators OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators( OneValueIndicators, name, results, Explanatory, NumDecimal) OneValueIndicators[name + " Final Prediction Error"] = results.fpe OneValueIndicators[ name + " Hannan-Quinn Information Criterion"] = results.hqic OneValueIndicators[name + " Roots"] = results.roots ### Time Series Indicators # Fitted Values df = Statsmodels_FittedValues(df, results, name) # Residuals df = Statsmodels_LR_Residuals(df, results, name) ################################################## ##### PART 5: ARMA ################################################## name = "ARMA" if name in Indicators: name = prefix + name model = statsmodels.tsa.arima_model.ARMA(y_sm, (5, 5), x_sm) results = model.fit() ### One Value Indicators OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators( OneValueIndicators, name, results, Explanatory, NumDecimal) OneValueIndicators[name + " AR Params"] = results.arparams OneValueIndicators[name + " AR Roots"] = results.arroots OneValueIndicators[name + " AR Freq"] = results.arfreq OneValueIndicators[ name + " Hannan-Quinn Information Criterion"] = results.hqic OneValueIndicators[name + " MA Params"] = results.maparams try: OneValueIndicators[name + " MA Roots"] = results.maroots except: pass try: OneValueIndicators[name + " MA Freq"] = results.mafreq except: pass OneValueIndicators[name + " Sigma2"] = results.sigma2 ### Time Series Indicators # Fitted Values df = Statsmodels_FittedValues(df, results, name) # Residuals df = Statsmodels_LR_Residuals(df, results, name) ################################################## ##### PART 6: ARIMA ################################################## name = "ARIMA" if name in Indicators: name = prefix + name model = statsmodels.tsa.arima_model.ARIMA(Independent, (2, 2, 2), Explanatory) results = model.fit() ### One Value Indicators OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators( OneValueIndicators, name, results, Explanatory, NumDecimal) OneValueIndicators[name + " AR Params"] = results.arparams OneValueIndicators[name + " AR Roots"] = results.arroots OneValueIndicators[name + " AR Freq"] = results.arfreq OneValueIndicators[ name + " Hannan-Quinn Information Criterion"] = results.hqic OneValueIndicators[name + " MA Params"] = results.maparams OneValueIndicators[name + " MA Roots"] = results.maroots OneValueIndicators[name + " MA Freq"] = results.mafreq OneValueIndicators[name + " Sigma2"] = results.sigma2 ### Time Series Indicators # Fitted Values df = Statsmodels_FittedValues(df, results, name) # Residuals df = Statsmodels_LR_Residuals(df, results, name) ################################################## ##### PART 7: Univariate Analysis ################################################## # Granger Causality name = "Granger Causality" name = prefix + name if name in Indicators: OneValueIndicators[name] = ts.grangercausalitytests( Independent.merge(Explanatory, how="inner", left_index=True, right_index=True), maxlag=10) # Levinson Durbin name = "Levinson Durbin" name = prefix + name if name in Indicators: OneValueIndicators[name] = ts.levinson_durbin(Independent) # Cointegration name = "Cointegration" name = prefix + name if name in Indicators: OneValueIndicators[name] = ts.coint(Independent, Explanatory, trend="ct", return_results=False) 
    ##################################################
    ##### Not Implemented
    ##################################################

    # BDS Statistic (residuals analysis)
    # Not Implemented

    # Return’s Ljung-Box Q Statistic (AR)
    # Not Implemented

    OneValueIndicators = pd.DataFrame.from_dict(OneValueIndicators, orient="index")

    return df, OneValueIndicators
print("Optimal number of lags for click data is " + str(numLagsclick))

ARaic = ar_model.AR(encountersPerDay.tolist()).fit(maxlag=MAX_LAG, ic="aic")
ARbic = ar_model.AR(encountersPerDay.tolist()).fit(maxlag=MAX_LAG, ic="bic")
# select the fewer number of parameters between both criteria.
numLagsEnc = len(ARaic.params) if len(ARaic.params) < len(ARbic.params) else len(ARbic.params)
print("Optimal number of lags for encounter data is " + str(numLagsEnc))

# 2. Now that I know the optimal number of parameters, I can call the
#    Granger causality function of statsmodels.
data = pd.concat([encountersPerDay, clicksPerDay], axis=1)
print("\nGranger causality results of clicks onto encounters")
results = stattools.grangercausalitytests(data, maxlag=numLagsEnc)

data = pd.concat([clicksPerDay, encountersPerDay], axis=1)
print("\nGranger causality results of encounters onto clicks")
results = stattools.grangercausalitytests(data, maxlag=numLagsclick)

# Ideally, I would implement this myself; however, statsmodels is buggy and
# does not deal with exogenous variables well, meaning I would have to
# implement an AR fitting algorithm, which is non-ideal.
MAX_LAG = 3
endog = encountersPerDay.tolist()
exog = clicksPerDay.tolist()
x2 = np.array(abortion_data.iloc[1:-1, -1])

# Crime rates per year
x1 = np.array(crime_data.loc[5])
x1 = np.delete(x1, [0, 1, 2])
for i in range(len(x1)):
    x1[i] = x1[i].replace(',', '')

x1 = x1.astype(float)
x2 = x2.astype(float)

x = np.array([x1, x2])
x = np.transpose(x)
res1 = st.grangercausalitytests(x, maxlag=10, verbose=True)

print("{:<8} {:<15} {:<10}".format('LAG', 'F-value', 'p-value'))
for k, v in res1.items():
    d = v[0]
    print("{:<8} {:<15} {:<10}".format(k, d['params_ftest'][0], d['params_ftest'][1],
                                       d['params_ftest'][2]))

x = np.array([x2, x1])
x = np.transpose(x)
res2 = st.grangercausalitytests(x, maxlag=10, verbose=True)
for k, v in res2.items():
    d = v[0]
    print("{:<8} {:<15} {:<10}".format(k, d['params_ftest'][0], d['params_ftest'][1],
def test_granger_causality_exceptions(dataset):
    with pytest.raises(InfeasibleTestError):
        with pytest.warns(FutureWarning, match="verbose"):
            grangercausalitytests(dataset, 4, verbose=False)
period_summaries = db['period_summary']
query = period_summaries.aggregate([{
    '$group': {
        '_id': 'date_range',
        'min_date': {
            '$min': '$period_start'
        },
        'max_date': {
            '$max': '$period_end'
        }
    }
}]).next()

# r = grangercausalitytests(np.random.random((100, 2)), 5, addconst=True, verbose=True)
# example: r[1][0]['lrtest']
# r = {
#     1: ({'lrtest': (2.5498146592526325, 0.11030719462362751, 1),
#          'params_ftest': (2.5046637818898794, 0.11679877991627048, 96.0, 1),
#          'ssr_ftest': (2.5046637818898669, 0.1167987799162722, 96.0, 1),
#          'ssr_chi2test': (2.5829345250739255, 0.1080212350855112, 1)}, ...),
#     2: ({'lrtest': ...
#     3: ...
#     4: ...
#     5: ...
# }
# >>> r[1][0]['lrtest']
# only one df variable..
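# A short sketch (my addition) of pulling numbers out of that nested structure rather than
# eyeballing it: each r[lag] is a (tests_dict, regressions) pair, and each test entry is a
# tuple beginning with (statistic, p-value, ...).
import numpy as np
from statsmodels.tsa.stattools import grangercausalitytests

r = grangercausalitytests(np.random.random((100, 2)), 5, addconst=True, verbose=False)
for lag, (tests, _regressions) in r.items():
    stat, pvalue = tests['lrtest'][0], tests['lrtest'][1]
    print(f"lag {lag}: LR statistic {stat:.3f}, p-value {pvalue:.3f}")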
def test_granger_causality_exception_maxlag(gc_data):
    with pytest.raises(ValueError, match="maxlag must be"):
        grangercausalitytests(gc_data, maxlag=-1)
    with pytest.raises(NotImplementedError):
        grangercausalitytests(gc_data, 3, addconst=False)
def relate_term_n_stock_ts(term_panel, stocks_panel):
    # Both term_df and stock_df must be stationary.
    # For all stock dfs do:
    #     r = grangercausalitytests(stock, max_lags, verbose=False)
    pass  # body not implemented in this snippet
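# The stub above only sketches the intent; a hypothetical completion (my assumption that
# term_panel/stocks_panel are DataFrames of already-stationary series, and the helper name
# relate_term_to_stocks is mine) could pair the term series with each stock column:
import pandas as pd
from statsmodels.tsa.stattools import grangercausalitytests

def relate_term_to_stocks(term_series, stocks_df, max_lags=5, alpha=0.05):
    related = {}
    for ticker in stocks_df.columns:
        # Second column (the term series) is the candidate cause of the first (the stock).
        pair = pd.concat([stocks_df[ticker], term_series], axis=1).dropna()
        r = grangercausalitytests(pair, max_lags, verbose=False)
        pvals = [r[lag][0]['ssr_ftest'][1] for lag in range(1, max_lags + 1)]
        if min(pvals) < alpha:
            related[ticker] = min(pvals)
    return related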
def test_granger_causality_verbose(gc_data):
    with pytest.warns(FutureWarning, match="verbose"):
        grangercausalitytests(gc_data, 3, verbose=True)
def test_granger_fails_on_nobs_check(self):
    # Test that if maxlag is too large, Granger Test raises a clear error.
    X = np.random.rand(10, 2)
    grangercausalitytests(X, 2)  # This should pass.
    assert_raises(ValueError, grangercausalitytests, X, 3)
def granger_causality(primary_features, secondary_features, secondary_feature_names,
                      change_points_primary, change_points_secondary, p_value=0.01):
    '''Given change points and representations of two perspectives, this function calculates
    the cause-effect relationships.

    A reduced primary and an unreduced secondary perspective, along with the names of the
    secondary features, have to be provided. This function filters the change points of the
    secondary perspective that precede a change point in the primary perspective and tests
    whether there are Granger-causal features, given the lag between drifts.

    args:
        primary_perspective_reduced: Reduced time series retrieved by the previously executed
            dimensionality reduction.
        secondary_features: Feature representation for the secondary perspective. Retrieved
            when constructing the feature representation for the secondary perspective.
        secondary_feature_names: List of the feature names, retrieved when constructing the
            features.
        change_points_primary: List of primary change points
        change_points_secondary: List of secondary change points
        p_value: Maximum p-value
    '''
    tmp = np.array(primary_features)
    transpose = tmp.T
    primary_features = transpose.tolist()

    tmp = np.array(secondary_features)
    transpose = tmp.T
    secondary_features = transpose.tolist()

    results = []
    if not isinstance(primary_features[0], list):
        primary_features = [primary_features]
    for cp_1 in change_points_primary:
        for cp_2 in change_points_secondary:
            if cp_2 < cp_1:
                k = cp_1 - cp_2
                feature_set = {}
                p = p_value
                for i in range(0, len(secondary_features)):
                    f = secondary_features[i]
                    for f_2 in primary_features:
                        granger_data = pd.DataFrame(f_2)
                        granger_data[secondary_feature_names[i]] = f
                        granger_data = granger_data.dropna()
                        try:
                            gc_res = grangercausalitytests(granger_data, [k], verbose=False)
                            # Increase the margin by 1% (or even less) to account for
                            # numeric approximation errors.
                            if gc_res[k][0]['params_ftest'][1] < p * 1.01:
                                p_feat = primary_features.index(f_2)
                                if p_feat not in feature_set.keys():
                                    feature_set[p_feat] = []
                                if secondary_feature_names[i] not in feature_set[p_feat]:
                                    feature_set[p_feat].append(secondary_feature_names[i])
                        except ValueError:
                            pass
                results.append((cp_1, cp_2, feature_set, p))
    return results
openStockPrice = reader.Open

# changes of stock price
changesOfStockPrice = openStockPrice - closeStockPrice

# find the overlapping days in the two lists
newlist = set(day) & set(day2)
newlist = list(newlist)

priceChange = []
avgSenti = []
# join the price changes and averaged sentiment into a new 2D array over the common days
for i in range(0, len(newlist)):
    x = newlist[i]
    if x in day:
        if x in day2:
            x = str(x)
            i = day.index(x)
            j = day2.index(x)
            priceChange.append(changesOfStockPrice[i])
            avgSenti.append(avgSentiment[j])

X = np.vstack(([np.array(priceChange).T], [np.array(avgSenti).T])).T

# lagged operations from 1 to 7, as 8 causes a ValueError
maxlag = 7
# perform a causality test and print
d = stat.grangercausalitytests(X, maxlag, True, True)
def compute_casuality(data=placeholder_insights, data2=placeholder_activity):
    merged = pd.merge(data, data2, on=['ds', 'ds'])
    m = merged[['y', 'activity_time']].to_numpy()  # .as_matrix() was removed from pandas
    result = grangercausalitytests(m, maxlag=5)
    return result[2][0]['lrtest'][1]
else:
    caus_xy = G.causality_xy
    caus_yx = G.causality_yx
    caus_sim = G.simultaneous_causality
g1 = np.mean(caus_xy, -1)
g2 = np.mean(caus_yx, -1)
g3 = np.mean(caus_sim, -1)
Gs.append((g1[0, 1], g2[0, 1], g3[0, 1]))
Gs

maxlag = 10
rois
rois.shape
gcs_val = np.zeros((rois.shape[0], rois.shape[0], maxlag))
for i in range(rois.shape[0]):
    for j in range(rois.shape[0]):
        # grangercausalitytests returns a dict keyed by lag; taking the entry at maxlag here
        # is an assumption -- the original "test, reg = grangercausalitytests(...)" unpacking
        # does not work on that dict.
        test, reg = grangercausalitytests(rois[[j, i]], maxlag, verbose=False)[maxlag]
        ssrEig = reg[0].ssr
        ssrBeid = reg[1].ssr
        gcs_val[i, j] = np.log(ssrEig / ssrBeid)
rois.shape
plt.plot(rois.T);
plt.show()

rois = rois.T
gcs_val = np.zeros((rois.shape[0], rois.shape[0], maxlag))
for i in range(rois.shape[0]):
    for j in range(rois.shape[0]):
        test, reg = grangercausalitytests(rois[[j, i]], maxlag, verbose=False)[maxlag]
        ssrEig = reg[0].ssr
        ssrBeid = reg[1].ssr
        gcs_val[i, j] = np.log(ssrEig / ssrBeid)
rois.shape
def thePar(theTickers, baseTicker, start, end, postThresh, theWindow, testSize, numTickers, fileName, thePath, mongoInsert): #variables statSig = .05 #obvious #postThresh = .7 #posterior probability threshold #corThresh = .15 #arbitrary p-vaue for pearson correl #theWindow = 10 #arbitrary rolling window rollRet = float(0) totalLong = 0 totalShort = 0 rollLongRight = 0 rollShortRight = 0 #testSize = .90 priceThresh = 5.0 tempBase = web.DataReader(baseTicker,"yahoo",start,end) tempBase['retBase'] = np.log(tempBase['Adj Close'].astype(float)) - np.log(tempBase['Adj Close'].astype(float).shift(1)) #tempBase['retBase'] = np.log(tempBase['Close'].astype(float)) - np.log(tempBase['Open'].astype(float)) tempBase = tempBase[['retBase','Close']] tempBase.columns = ['retBase','closeBase'] tempBase.reindex() tempBase = tempBase.dropna() thePerf = list() finalData = pd.DataFrame() todayData = pd.DataFrame() for i in range(0,numTickers): try: tempData = web.DataReader(theTickers[i],"yahoo",start,end) lastPrice = tempData['Close'][len(tempData)-1:len(tempData)][0] tempData['retClose'] = np.log(tempData['Adj Close'].astype(float)) - np.log(tempData['Adj Close'].astype(float).shift(1)) tempData['ret'] = np.log(tempData['Close'].astype(float)) - np.log(tempData['Open'].astype(float)) tempData = tempData.dropna() tempData = tempData[['retClose','ret','Close']] tempData.reindex() tempData = pd.merge(tempBase,tempData,how='outer', left_index=True, right_index=True) tempData['diff'] = tempData['closeBase'] - tempData['Close'] tempData = tempData.dropna() tempData = tempData[['retBase','ret','retClose','diff']] theCor = pearsonr(tempData['retBase'],tempData['retClose']) gCause = ts.grangercausalitytests(tempData[['retClose','retBase']],1,verbose=False)[1][0]['params_ftest'][1] #second position --> first position #if(theCor[1] <= statSig and (theCor[0] >= corThresh or theCor[0] <= -corThresh) and gCause <= statSig): if(gCause <= statSig and lastPrice >= priceThresh): tempData['rollMean'] = tempData['ret'].rolling(window=theWindow).mean() tempData['rollMeanBase'] = tempData['retBase'].rolling(window=theWindow).mean() tempData['rollCor'] = pd.rolling_corr(tempData['retBase'],tempData['ret'],theWindow) #rollCorrelation tempData = tempData.dropna() theLen = len(tempData) trainLen = int(round(testSize*theLen,0)) testLen = int(theLen - trainLen) try: y = tempData['ret']#[0:theLen-2] #next day assset return X = tempData[['retBase','rollCor','rollMeanBase','rollMean','diff']]#[1:theLen-1] #event day features trainY = y[1:trainLen-1] testY = y[trainLen:theLen-1] trainX = X[0:trainLen-2] testX = X[trainLen-1:theLen-2] tDate = list(trainY.index.values) startTrainDate = str(tDate[0])[:10] endTrainDate = str(tDate[len(tDate)-1])[:10] tDate = list(testY.index.values) startTestDate = str(tDate[0])[:10] endTestDate = str(tDate[len(tDate)-1])[:10] tDate = list(tempData.index.values) simDay = str(tDate[len(tDate)-1])[:10] model = RandomForestClassifier(n_estimators=25,random_state=42) #model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=300,learning_rate=1,algorithm="SAMME") #model = linear_model.LogisticRegression(C=1e5) #model = SVC(kernel='rbf', class_weight=None) #model = GaussianNB() model.fit(trainX,np.sign(trainY)) postProbs = model.predict_proba(testX) todayPostProbs = model.predict_proba(X[theLen-1:theLen]) theClasses = model.classes_ #[-1. 0. 1.] 
neg = int(np.where(theClasses == -1.0)[0]) pos = int(np.where(theClasses == 1.0)[0]) if(todayPostProbs[0][pos] >= postThresh): tempStr = pd.DataFrame({'ticker': [theTickers[i]],' Position': ['Long'],' Confidence': [todayPostProbs[0][pos]]}) todayData = todayData.append(tempStr) print("GO LONG ON: " + theTickers[i] + " Confidence: " + str(todayPostProbs[0][pos])) mongoInsert.insert({"theDate": simDay, "ticker": theTickers[i], "position": 1}) if(todayPostProbs[0][neg] >= postThresh): tempStr = pd.DataFrame({'ticker': [theTickers[i]],' Position': ['Short'],' Confidence': [todayPostProbs[0][neg]]}) todayData = todayData.append(tempStr) print("GO SHORT ON: " + theTickers[i] + " Confidence: " + str(todayPostProbs[0][neg])) mongoInsert.insert({"theDate": simDay, "ticker": theTickers[i], "position": -1}) theLongs = np.where(postProbs[:,pos] >= postThresh)[0] #LONG POSITIONS theShorts = np.where(postProbs[:,neg] >= postThresh)[0] #SHORT POSITIONS numPos = len(theLongs) totalLong = totalLong + numPos numNeg = len(theShorts) totalShort = totalShort + numNeg corLong = np.where(np.sign(testY[theLongs]) == 1)[0] longRet = np.sum(testY[theLongs]) rollLongRight = rollLongRight + len(corLong) corShort = np.where(np.sign(testY[theShorts]) == -1)[0] shortRet = np.sum(testY[theShorts]) rollShortRight = rollShortRight + len(corShort) theRet = round(float(longRet) - float(shortRet),8) rollRet = round(float(rollRet) + float(theRet),8) thePerf.append(theRet) tempStr = pd.DataFrame({'ticker': [theTickers[i]],'theRet': [theRet],'rollret': [rollRet],'RollShortTrd': [totalShort],'RollLongTrd': [totalLong], 'RollLongRight': [rollLongRight],'RollShortRight': [rollShortRight],'startTrainDate': [startTrainDate],'startTestDate': [startTestDate]}) finalData = finalData.append(tempStr) #print(theTickers[i] + " Ret: " + str(theRet) + " Roll Ret: " + str(rollRet) + " Short Cnt: " + str(numNeg) + " Long Cnt: " #+ str(numPos) + " Strt Tr: " + startTrainDate + " Strt Test: " + startTestDate + " sRollCnt: " + str(totalShort) + " lRollCnt: " + str(totalLong)) except: pass except: pass finalData.to_csv(thePath + time.strftime("%Y-%m-%d") + "_" + baseTicker + "_" + fileName + "_finalData.csv",index=False) todayData.to_csv(thePath + time.strftime("%Y-%m-%d") + "_" + baseTicker + "_" + fileName + "_todayData.csv",index=False) print("Sharpe: " + str(((np.mean(thePerf)/np.std(thePerf))*math.sqrt(252)))) temp = np.where(np.asarray(thePerf) < 0) print("Sortino: " + str(np.mean(thePerf)/np.std(np.asarray(thePerf)[temp])*math.sqrt(252)))