Example #1
def grangerTest(exog, endog):
    MAX_LAG = 30
    ARaic = ar_model.AR(exog.tolist()).fit(maxlag=MAX_LAG, ic="aic")
    ARbic = ar_model.AR(exog.tolist()).fit(maxlag=MAX_LAG, ic="bic")
    # select the smaller number of parameters between the two criteria.
    numExog = len(ARaic.params) if len(ARaic.params) < len(ARbic.params) else len(ARbic.params)

    print ("Optimal number of lags for exog data is " + str(numExog))

    ARaic = ar_model.AR(endog.tolist()).fit(maxlag=MAX_LAG, ic="aic")
    ARbic = ar_model.AR(endog.tolist()).fit(maxlag=MAX_LAG, ic="bic")
    # select the smaller number of parameters between the two criteria.
    numEndog = len(ARaic.params) if len(ARaic.params) < len(ARbic.params) else len(ARbic.params)

    print ("Optimal number of lags for endog data is " + str(numEndog))

    # now that I know the optimal number of parameters, I can call the
    # granger causality function of statsmodels.
    data = pd.concat([endog, exog], axis=1)
    print ("\nGranger causality results of indep onto dep")
    results = stattools.grangercausalitytests(data, maxlag=numEndog)

    data = pd.concat([exog, endog], axis=1)
    print ("\nGranger causality results of dep onto indep")
    results = stattools.grangercausalitytests(data, maxlag=numExog)
    regr = results[2][1]
    print (regr[0].params)
    print (regr[1].params)
    print (regr[1].pvalues)
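# A minimal usage sketch (an assumption, not from the original source): two synthetic pandas
# Series plus the imports the fragment above omits. grangerTest relies on ar_model.AR, so this
# assumes an older statsmodels release where that class is still available.
import numpy as np
import pandas as pd
from statsmodels.tsa import ar_model, stattools

rng = np.random.default_rng(0)
exog = pd.Series(rng.normal(size=300))
endog = pd.Series(0.5 * exog.shift(1).fillna(0.0) + rng.normal(size=300))
grangerTest(exog, endog)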
 def testForCointegration(self, assets):
     x = np.random.normal(0,1, 1000)
     y = np.random.normal(0,1, 1000)
     x = np.array(x)
     y = np.array(y)
     c = np.column_stack((x, y))
     a = grangercausalitytests(c, maxlag=2, verbose=True)  # maxlag must be a positive integer
     return a
def cluster_vs_cluster_granger(C,X,lags=4,thresh=0.01):
	Xc = [np.average(X[:,c],1) for c in C]
	R = []
	for i in range(len(C)):
		x1 = Xc[i]
		for j in range(i+1,len(C)):
			x2 = Xc[j]
			result = stattools.grangercausalitytests(np.transpose([x1,x2]),lags)
			for l in range(lags):
				pv = result[l+1][0]['ssr_ftest'][1]
				if pv < thresh:
					R.append((pv,(i,j,l+1)))
			result = stattools.grangercausalitytests(np.transpose([x2,x1]),lags)
			for l in range(lags):
				pv = result[l+1][0]['ssr_ftest'][1]
				if pv < thresh:
					R.append((pv,(j,i,l+1)))
	return sorted(R)
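# Hypothetical usage sketch (assumed, not from the original source): X is an
# (n_samples, n_series) array and C is a list of clusters, each given as a list of
# column indices into X. The fragment's own import of stattools is assumed.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(300, 6))
C = [[0, 1, 2], [3, 4, 5]]
print(cluster_vs_cluster_granger(C, X, lags=3, thresh=0.05))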
def granger(x1, x2):
	if len(x1) == 1:
		x1[0][0] += .00000000001
	else:
		x1[0]  += .00000000001
	if len(x2) == 1:
		x2[0][0] += .00000000001
	else:
		x2[0]  += .00000000001
	res = sts.grangercausalitytests(np.vstack((x1, x2)).T, 2, verbose=False)[1][0]['params_ftest'][0]
	return res
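# Hypothetical usage sketch (not from the original source); the imports below are the ones the
# fragment above assumes (numpy as np and statsmodels.tsa.stattools as sts). granger() returns
# the lag-1 params_ftest F statistic.
import numpy as np
import statsmodels.tsa.stattools as sts

rng = np.random.default_rng(0)
a = rng.normal(size=120)
b = 0.5 * np.roll(a, 1) + rng.normal(scale=0.3, size=120)
print(granger(a, b))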
Example #5
    def test_grangercausality(self):
        # some example data
        mdata = macrodata.load().data
        mdata = mdata[['realgdp', 'realcons']]
        data = mdata.view((float, 2))
        data = np.diff(np.log(data), axis=0)

        #R: lmtest:grangertest
        r_result = [0.243097, 0.7844328, 195, 2]  # f_test
        gr = grangercausalitytests(data[:, 1::-1], 2, verbose=False)
        assert_almost_equal(r_result, gr[2][0]['ssr_ftest'], decimal=7)
        assert_almost_equal(gr[2][0]['params_ftest'], gr[2][0]['ssr_ftest'], decimal=7)
def cluster_vs_meta_granger(c,X,M,Ml,lags=7,thresh=0.05):
	x1 = np.average(X[:,c],1)
	R = []
	for x2 in M.T:
		have_values = np.isfinite(x2)
		result = stattools.grangercausalitytests(np.transpose([x1[have_values],x2[have_values]]),lags)
		R.append([(result[i+1][0]['ssr_ftest'][1],i+1) for i in range(lags) if result[i+1][0]['ssr_ftest'][1] < thresh])
	RM = []
	for i,r in enumerate(R):
		if r:
			avgLag = np.average([x[1] for x in r])
			avgPvalue = np.average([x[0] for x in r])
			RM.append((avgPvalue,Ml[i],avgLag))
	return sorted(RM)
Example #7
def granger_causality(data, max_lag=10, alpha=.05, *, callback=None):
    """
    Return results of Granger-causality tests.

    Parameters
    ----------
    data : Timeseries
        A table of features to compute Granger causality between.
    max_lag : int
        The maximum lag to compute Granger-causality for.
    alpha : float in (0, 1)
        Confidence of test is 1 - alpha.
    callback : callable
        A callback to call in each iteration with ratio of completion.

    Returns
    -------
    res : list of lists
        Each internal list is [lag, antecedent, consequent] where
        lag is the minimum lag at which antecedent feature in data is
        Granger-causal for the consequent feature in data.
    """
    from statsmodels.tsa.stattools import grangercausalitytests
    from Orange.data import Table, Domain
    # TODO: use VAR Granger causality
    # TODO: consider CCM instead of / in addition to GC: https://en.wikipedia.org/wiki/Convergent_cross_mapping
    # http://statsmodels.sourceforge.net/devel/generated/statsmodels.tsa.vector_ar.var_model.VARResults.test_causality.html
    # http://statsmodels.sourceforge.net/devel/vector_ar.html#granger-causality

    data = data.interp()
    domain = [var for var in data.domain.variables if var.is_continuous]
    res = []

    for row_attr in domain:
        for col_attr in domain:
            if row_attr == col_attr or data.time_variable in (row_attr, col_attr):
                continue
            X = Table(Domain([], [], [col_attr, row_attr], data.domain), data).metas
            try:
                tests = grangercausalitytests(X, max_lag, verbose=False)
                lag = next((lag for lag in range(1, 1 + max_lag)
                            if tests[lag][0]['ssr_ftest'][1] < alpha), 0)
            except ValueError:
                lag = 0
            if lag:
                res.append([lag, row_attr.name, col_attr.name])
            if callback:
                callback(1 / ((len(domain) - 1)**2 - len(domain)))
    return res
Example #8
        tempData = web.DataReader(theTickers[i],"yahoo",start,end)
        lastPrice = tempData['Close'].iloc[-1]
        tempData['retClose'] = np.log(tempData['Adj Close'].astype(float)) - np.log(tempData['Adj Close'].astype(float).shift(1))
        tempData['ret'] = np.log(tempData['Close'].astype(float)) - np.log(tempData['Open'].astype(float))
        tempData = tempData.dropna()
        tempData = tempData[['retClose','ret','Close']]
        tempData.reindex()
        tempData = pd.merge(tempBase,tempData,how='outer', left_index=True, right_index=True)
        tempData['diff'] = tempData['closeBase'] - tempData['Close']
        tempData = tempData.dropna()

        tempData = tempData[['retBase','ret','retClose','diff']]
        
        theCor = pearsonr(tempData['retBase'],tempData['retClose'])        
        
        gCause = ts.grangercausalitytests(tempData[['retClose','retBase']],1,verbose=False)[1][0]['params_ftest'][1] #second position --> first position
        #if(theCor[1] <= statSig and (theCor[0] >= corThresh or theCor[0] <= -corThresh) and gCause <= statSig):
        if(gCause <= statSig and lastPrice >= priceThresh):
            tempData['rollMean'] = tempData['ret'].rolling(window=theWindow).mean()
            tempData['rollMeanBase'] = tempData['retBase'].rolling(window=theWindow).mean()
            tempData['rollCor'] = tempData['retBase'].rolling(window=theWindow).corr(tempData['ret']) #rolling correlation
            tempData = tempData.dropna()
            
            theLen = len(tempData)
            
            trainLen = int(round(testSize*theLen,0))
            testLen = int(theLen  - trainLen)
            try:
                y = tempData['ret']#[0:theLen-2] #next day assset return
                X = tempData[['retBase','rollCor','rollMeanBase','rollMean','diff']]#[1:theLen-1] #event day features
                
Example #9
def find_granger_causality(x1_df, x2_df, max_lags=5):
    data = pd.concat([x1_df, x2_df], axis=1).dropna()
    lag_1_2, causes_1_2 = is_granger_caused(grangercausalitytests(data[[0, 1]], max_lags, verbose=False)) # x1 causes x2
    lag_2_1, causes_2_1 = is_granger_caused(grangercausalitytests(data[[1, 0]], max_lags, verbose=False)) # x2 causes x1
    # report that x2 Granger-causes x1 only if x2 causes x1 AND x1 does not cause x2
    return (lag_2_1, causes_2_1 and not causes_1_2)
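# The helper is_granger_caused is not shown in this snippet. A minimal sketch of what it might
# look like (an assumption, not the original implementation): return the first lag whose
# ssr F-test p-value falls below 0.05, plus a flag saying whether any lag qualified.
def is_granger_caused(gc_results, alpha=0.05):
    for lag in sorted(gc_results):
        p_value = gc_results[lag][0]['ssr_ftest'][1]
        if p_value < alpha:
            return lag, True
    return None, False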
			emotion['anger'].append(0)
			emotion['happiness'].append(0)
			emotion['surprise'].append(0)
			emotion['sadness'].append(0)
			emotion['fear'].append(0)
			emotion['disgust'].append(0)

		n_price.append(post['price'])
		# n_return.append(post['return'])
		# n_dates.append(date)

		date = date + datetime.timedelta(days=1)

	for key in emotion.keys():
		granger_array = np.array([np.array(n_price), np.array(emotion[key])]).T
		granger_results = st.grangercausalitytests(granger_array, 4, addconst=True, verbose=False)
		print(key.upper())
		results.write('\t' + key.upper() + '\t\t\t\tLag1\t\t\tLag2\t\t\tLag3\t\t\tLag4')
		results.write('\n')
		results.write('\tssr_chi2test\t\t' + str(granger_results[1][0]['ssr_chi2test'][1]) + '\t\t' + 
			str(granger_results[2][0]['ssr_chi2test'][1]) + '\t\t' + 
			str(granger_results[3][0]['ssr_chi2test'][1]) + '\t\t' + 
			str(granger_results[4][0]['ssr_chi2test'][1]) + '\n')

		results.write('\tssr_ftest\t\t' + str(granger_results[1][0]['ssr_ftest'][1]) + '\t\t' + 
			str(granger_results[2][0]['ssr_ftest'][1]) + '\t\t' + 
			str(granger_results[3][0]['ssr_ftest'][1]) + '\t\t' + 
			str(granger_results[4][0]['ssr_ftest'][1]) + '\n')


exit()
Example #11
 def test_granger_fails_on_nobs_check(self):
     # Test that if maxlag is too large, Granger Test raises a clear error.
     X = np.random.rand(10, 2)
     grangercausalitytests(X, 2, verbose=False)  # This should pass.
     assert_raises(ValueError, grangercausalitytests, X, 3, verbose=False)
	# for i in range(len(price7)):
	# 	dst.write(str(i) + ',' + str(price1[i]) + ',' + str(n_price[i]) + ',' + str(returnn1[i]) + ',' + 
	# 		str(n_return[i]) + ',' + str(pos[i]) + ',' + str(neg[i]) + ',' + str(bind[i]) + ',' + 
	# 		str(tis[i]) + ',' + str(rtis[i]) + ',' + str(returnn2[i]) + ',' + str(returnn7[i]) + ',' +
	# 		str(price2[i]) + ',' + str(price7[i]) + '\n')

	# dst.close()

	# exit()

	granger_array_positive = np.array([np.array(n_price), np.array(n_emotion)]).T
	# print len(n_price)
	# print len(n_negativos)
	# print granger_array_positive

	granger_positive = st.grangercausalitytests(granger_array_positive, 4, addconst=True, verbose=True)
	print(granger_positive[1][0])
	


	granger_array_negative = np.array([np.array(n_price), np.array(n_negativos)]).T
	# print granger_array_negative
	granger_negative = st.grangercausalitytests(granger_array_negative, 4, addconst=True, verbose=True)


	# results.write('\tPositivos\n')
	# results.write('\t\t\t\tLag1\t\t\tLag2\t\t\tLag3\t\t\tLag4')
	# results.write('\n')

	results.write(ticker.upper() + ' & ' + ('%.4f' % granger_positive[1][0]['ssr_ftest'][1]) + ' & ' + 
		('%.4f' % granger_positive[2][0]['ssr_ftest'][1]) + ' & ' + 
# In[22]:

from statsmodels.tsa.ar_model import AR
model = AR(a)
model_fit = model.fit()
print('Lag: %s' % model_fit.k_ar)
maxlag = model_fit.k_ar

# In[23]:

addconst = True
verbose = True

# In[24]:

result = stm.grangercausalitytests(x, maxlag, addconst, verbose)
optimal_lag = -1
F_test = -1.0
for key in result.keys():
    _F_test_ = result[key][0]['params_ftest'][0]
    if _F_test_ > F_test:
        F_test = _F_test_
        optimal_lag = key

# In[25]:

print("{} {}".format("We are going to look into the GC with Optimal Lag of",
                     optimal_lag))

# We consider the p-value of the test as a measure of Granger causality: rejection of ℋ0 (p < 0.03) signifies Granger causality; failure to reject indicates non-causality.
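# Sketch of that check (an assumed continuation of the cells above, reusing `result` and `optimal_lag`):
p_at_optimal_lag = result[optimal_lag][0]['params_ftest'][1]
print("p-value at the optimal lag:", p_at_optimal_lag,
      "-> Granger causality" if p_at_optimal_lag < 0.03 else "-> no Granger causality")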
fig = plt.figure()

ax = plt.subplot(241)
ax.plot(t, x1, label='time series 1')
ax.plot(t, x2, label='time series 2')
# ax.plot(t, lm.predict(t.reshape(-1, 1)), color='red', label='linear regression on ts2')
ax.set_xlabel('t')
ax.set_ylabel('x')
ax.legend()

lags = 5
mat = np.zeros((n, 2))
mat[:, 0] = x1
mat[:, 1] = x2
gr = grangercausalitytests(mat, lags, verbose=False)
p_values = [gr[x][0]['params_ftest'][1] for x in gr.keys()]

ax = plt.subplot(245)
ax.plot(range(1, lags + 1), list(p_values), label='granger causality')
ax.set_xlabel('lags')
ax.set_ylabel('p-value')

ax = plt.subplot(242)
ax.plot(t, x1)
ax.plot(t, x3)
# ax.plot(t, lm.predict(t.reshape(-1, 1)), color='red', label='linear regression on ts2')
ax.set_xlabel('t')

mat = np.zeros((n, 2))
mat[:, 0] = x1
Example #15
 def test_granger_fails_on_nobs_check(self, reset_randomstate):
     # Test that if maxlag is too large, Granger Test raises a clear error.
     x = np.random.rand(10, 2)
     grangercausalitytests(x, 2, verbose=False)  # This should pass.
     with pytest.raises(ValueError):
         grangercausalitytests(x, 3, verbose=False)
Example #16
def compute_relationship(
        v1: np.ndarray,
        v2: np.ndarray,
        v1_label: Text = 'v1',
        v2_label: Text = 'v2',
        maxlag: int = 4,
        fname: Text = '',
        verbose: bool = True) -> dict:
    """Computes the relationship between two vectors.

    Granger causality tests whether the time series in the 2nd column Granger-
    causes the time series in the 1st column. Here, it means whether v2
    Granger-causes v1 or not.

    Args:
        v1: First array of numbers.

        v2: Second array of numbers.

        v1_label: The string label for v1.

        v2_label: The string label for v2.

        maxlag: Maximum lag in the Granger causality test.

        fname: File name. If empty string, it does not save it.
    
        verbose: Whether the function should print the full report.

    Returns:
        Dictionary of correlation p-value, r-value and causality report.

    Raises:
        ValueError: If there are insufficient observations for the given lag.
    """
    # Correlation test.
    rval, pval = pearsonr(v1, v2)

    if verbose:
        significant = ''
        if pval < 0.05:
            significant = 'yay!!!!'
        print('r-val: {}\np-val: {} \t{}'.format(rval, pval, significant))

        # Scatter plot.
        f = plt.figure()
        sns.scatterplot(v2, v1)
        # plt.plot((min(v1), max(v2)), (max(v1), min(v2)), 'r')
        plt.plot(np.linspace(min(v2), max(v2)), np.linspace(min(v1), max(v1)), 'r')
        plt.xlabel(v2_label)
        plt.ylabel(v1_label)
        plt.show()
        if fname:
            f.savefig('{}.png'.format(fname), bbox_inches='tight')
            f.savefig('{}.pdf'.format(fname), bbox_inches='tight')

    # Causality test.
    causality_res = grangercausalitytests(
        np.column_stack((v1, v2)),
        maxlag=maxlag,
        verbose=verbose)
    return {'rval': rval, 'pval': pval, 'causality': causality_res}
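# Hypothetical usage sketch (not from the original source); it reuses the fragment's assumed
# module-level imports (pearsonr, grangercausalitytests) and disables plotting via verbose=False.
import numpy as np

rng = np.random.default_rng(1)
v2 = rng.normal(size=200)
v1 = 0.6 * np.roll(v2, 1) + rng.normal(scale=0.5, size=200)
out = compute_relationship(v1, v2, v1_label='effect', v2_label='driver', maxlag=3, verbose=False)
print(out['rval'], out['pval'])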
def read_csv_by_Pandas(file):
    df = (pd.read_csv(file, sep=",", header=0))[st:ed]
    dfIt = df['iteration']
    dfS_1 = df['1']
    dfS_2 = df['2']
    dfS_3 = df['3']
    #dfS_4 = df['4']
    #dfS_5 = df['5']

    print('\n\n1 eats 2?')
    grangercausalitytests(df[['1', '2']], maxlag=[maxlag])
    print('\n\n2 eats 1?')
    grangercausalitytests(df[['2', '1']], maxlag=[maxlag])
    print('\n\n1 eats 3?')
    grangercausalitytests(df[['1', '3']], maxlag=[maxlag])
    print('\n\n3 eats 1?')
    grangercausalitytests(df[['3', '1']], maxlag=[maxlag])
    print('\n\n2 eats 3?')
    grangercausalitytests(df[['2', '3']], maxlag=[maxlag])
    print('\n\n3 eats 2?')
    grangercausalitytests(df[['3', '2']], maxlag=[maxlag])
    #print('\n\n5 eats 3?')
    #grangercausalitytests(df[['5', '3']], maxlag=[50])
    #print('\n\n3 eats 5?')
    #grangercausalitytests(df[['3', '5']], maxlag=[50])
    #print('\n\n1 eats 4?')
    #grangercausalitytests(df[['1', '4']], maxlag=[100])
    #print('\n\n4 eats 1?')
    #grangercausalitytests(df[['4', '1']], maxlag=[100])
    #print('\n\n2 eats 4?')
    #grangercausalitytests(df[['2', '4']], maxlag=[100])
    #print('\n\n4 eats 2?')
    #grangercausalitytests(df[['4', '2']], maxlag=[100])

    ax = plt.gca()

    df.plot(x='iteration', y='1', kind='line', ax=ax)
    df.plot(x='iteration', y='2', kind='line', ax=ax)
    df.plot(x='iteration', y='3', kind='line', ax=ax)
    #df.plot(x='iteration', y='4', kind='line', ax=ax)
    #df.plot(x='iteration', y='5', kind='line', ax=ax)
    plt.show()
Example #18
from statsmodels.tsa.stattools import grangercausalitytests
import pandas as pd
df = pd.read_excel('data12.xlsx', usecols=[1, 2])
grangercausalitytests(df, maxlag=3)
Example #19
#%%
from statsmodels.tsa.stattools import grangercausalitytests

# Drop rows with NaN values,
# because the Granger causality test cannot be run on values containing NaN or inf.

wmt.dropna(subset=['Vader Sentiment', 'returns'], inplace=True)

import statsmodels

statsmodels.tsa.stattools.adfuller(wmt.returns, regression='ct')
#non unit root

# According to the results, there is no Granger causality from sentiment to returns: all p-values are above .05, so we fail to reject the null hypothesis that the time series in the second column, x2, does NOT Granger-cause the time series in the first column, x1.

grangercausalitytests(wmt[['returns', 'Vader Sentiment']], maxlag=4)
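# A possible follow-up (an assumption, not in the original notebook): capture the returned dict
# and check the per-lag ssr F-test p-values behind the statement above.
gc_res = grangercausalitytests(wmt[['returns', 'Vader Sentiment']], maxlag=4, verbose=False)
pvals = {lag: res[0]['ssr_ftest'][1] for lag, res in gc_res.items()}
print(pvals)  # all above 0.05 -> fail to reject non-causality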

# ## Regress
#
# $$
# r_{i, t}=\alpha+\beta_{1} \Delta s_{1, t}+\beta_{2} \Delta s_{i, t-1}+\epsilon_{t}
# $$
#

wmt['sentiment_lag'] = wmt['Vader Sentiment'].shift(1)

wmt['L_s1'] = wmt['Vader Sentiment'].pct_change(1)
wmt['L_s2'] = wmt['sentiment_lag'].pct_change(1)
wmt.head()

import statsmodels.formula.api as smf
import pymongo
import datetime
import math

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook


import statsmodels.tsa.stattools as st

client = pymongo.MongoClient()
db = client.vix

n_vix = []
n_fear = []

for post in db['vix'].find().sort('date', 1):

	n_vix.append(post['vnx'])
	n_fear.append(post['fear_ip'])

granger_array = np.array([np.array(n_vix), np.array(n_fear)]).T

granger_positive = st.grangercausalitytests(granger_array, 7, addconst=True, verbose=True)
print(granger_positive[1][0])
Example #21
 def Granger(self,arr,max_lag=12,addconst=True,disp=True):
     x=np.vstack((self.arr,arr)).T
     return tsa_tools.grangercausalitytests(x,maxlag=max_lag,addconst=addconst,verbose=disp)
Example #22
from statsmodels.compat.python import iteritems
import numpy as np
from numpy.testing import assert_almost_equal
from statsmodels.datasets import macrodata

import statsmodels.tsa.stattools as tsa_stats

# some example data
mdata = macrodata.load_pandas().data
mdata = mdata[['realgdp','realcons']].values
data = mdata
data = np.diff(np.log(data), axis=0)

#R: lmtest:grangertest
r_result = [0.243097, 0.7844328, 195, 2]  #f_test
gr = tsa_stats.grangercausalitytests(data[:,1::-1], 2, verbose=False)
assert_almost_equal(r_result, gr[2][0]['ssr_ftest'], decimal=7)
assert_almost_equal(gr[2][0]['params_ftest'], gr[2][0]['ssr_ftest'],
                    decimal=7)

lag = 2
print('\nTest Results for %d lags' % lag)
print()
print('\n'.join(['%-20s statistic: %6.4f   p-value: %6.4f' % (k, res[0], res[1])
                 for k, res in iteritems(gr[lag][0]) ]))

print('\n Results for auxiliary restricted regression with two lags')
print()
print(gr[lag][1][0].summary())

print('\n Results for auxiliary unrestricted regression with two lags')
Example #23
# ### Grangers Causality Test
# We ran the Granger causality test to figure out which variable's lags impact the other variables. Clearly the export and import data are independent, but CPI and FX depend to some extent on the other variables.

# In[23]:

gc_matrix = pd.DataFrame(np.zeros(
    [len(adj_data.columns), len(adj_data.columns)]),
                         columns=adj_data.columns,
                         index=adj_data.columns)

# In[24]:

for y in gc_matrix.columns:
    for x in gc_matrix.index:
        res = grangercausalitytests(adj_data[[y, x]], maxlag=6, verbose=False)
        p_values = [
            round(res[i + 1][0]["ssr_chi2test"][1], 4) for i in range(6)
        ]
        min_values = np.min(p_values)
        gc_matrix.loc[x, y] = min_values

# In[25]:

gc_matrix

# ### Cointegration Test
# Let's see whether the variables we considered are cointegrated. For that we used the Johansen test. The result shows no cointegration among the four variables, but if we take the first difference of the series, they become cointegrated.

# In[26]:
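# A possible sketch of the Johansen test mentioned above (an assumption; the original cell is not shown).
from statsmodels.tsa.vector_ar.vecm import coint_johansen

johansen_res = coint_johansen(adj_data, det_order=0, k_ar_diff=1)
print(johansen_res.lr1)  # trace statistics
print(johansen_res.cvt)  # critical values at 90% / 95% / 99%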
Example #24
#twitter dataset
twitter_data = pd.read_csv('C:/Users/Smit/combined_csv1.csv')
dfVolume = twitter_data.groupby(['Date'])['Date'].count()
dfT = pd.Series.to_frame(dfVolume)
dfT['id'] = list(dfT.index)
dfT.columns = ['Count', 'DateTime']
dfT.to_csv('C:/Users/Smit/TWvol.csv')

dfGC = pd.DataFrame()
dfGC['StockPrice'] = df.Close
dfGC['TweetsCount'] = dfT.Count
dfGC['TweetsCount'] = dfGC['TweetsCount'].ffill()
#dfGC1 = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/a10.csv', parse_dates=['date'])
#df['month'] = df.date.dt.month
grangercausalitytests(dfGC[['TweetsCount', 'StockPrice']], maxlag=3)

#######################################################################

#twitter dataset
dfPosRatio = pd.read_csv('C:/Users/Smit/PosRatioTweets.csv', index_col=1)

dfGC1 = pd.DataFrame()
dfGC1['StockPrice'] = df.Close
dfGC1['Ratio'] = dfPosRatio.Ratio
dfGC1['Ratio'] = dfGC1['Ratio'].ffill()
grangercausalitytests(dfGC1[['Ratio', 'StockPrice']], maxlag=3)

#######################################################################
# stock price vs crude oil
dfSP = pd.read_csv("C:/Users/Smit/Dataset/yahoo/stockMarket.csv")
                X = pd.DataFrame(datas_minus_signal.iloc[:, k]).apply(
                    lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
                figure_count = 1
                plt.figure(figure_count)
                figure_count += 1
                plt.plot(y, 'k-', label='%s' % j+ '_' + '%s' % stage)
                plt.plot(X, 'b-', label='%s' % i + '_' + '%s' % stage+ '%s' % columns_name)
                plt.legend(loc='upper right')
                plt.legend(prop=zhfont1)
                plt.savefig(para.path_results + '%s' % i + '_' +'%s' % j + '_' + '%s' % stage + '%s' % columns_name+ '%d.jpg' % (k))
                plt.close()
                # Step 4: time-series test: Granger causality test
                x_value = datas_minus_signal.iloc[:, k].diff().dropna().values  # the original code ran the Granger causality test on period-over-period (chained) data
                y = datas_minus_signal.iloc[:, 0]
                y_value = y.loc[datas_minus_signal.iloc[:, k].dropna().index].pct_change().dropna()
                gr_result = grangercausalitytests(np.array([x_value, y_value]).T, maxlag=period, addconst=True,
                                                  verbose=False)
                dict_roll[k] = [gr_result[j + 1][0]['lrtest'][1] for j in range(period)]

                # Step 6: write the output to Excel
                y = pd.DataFrame(datas_minus_signal.iloc[:, 0])
                X = pd.DataFrame(datas_minus_signal.iloc[:, k])
                df = pd.merge(y, X, left_index=True, right_index=True)
                #a = (k - 1) * 5    a, '%'
                print(stage, '%s' % j + ':%s' % i + ':%s'% columns_name, round(df.corr().iloc[1, 0],4))
                
                X_name = 'percentile' + '%s' % columns_name + '%'
                df.columns = ['close price', str(X_name)]
                df.index = range(df.shape[0])
                name = '%s' % j + '_' + '%s' % stage + '%' + '%s' % stage+'.xlsx'
                # If you need to generate the file, uncomment the line below
                df.to_excel(para.path_results + name)
Example #26
def granger_causality_tests(
    ts_cause: TimeSeries,
    ts_effect: TimeSeries,
    maxlag: int,
    addconst: bool = True,
    verbose: bool = True,
) -> dict:
    """
    Provides four tests for Granger non-causality of two time series using
    :func:`statsmodels.tsa.stattools.grangercausalitytests`.
    See [1]_.


    Parameters
    ----------
    ts_cause
        A univariate deterministic time series. The statistical test determines whether this time series
        'Granger causes' the time series ts_effect (second parameter). Missing values are not supported.
        If H_0 (non-causality) is rejected (p near 0), then there is 'Granger causality'.
    ts_effect
        Univariate time series 'Granger caused' by ts_cause.
    maxlag
        If an integer, computes the test for all lags up to maxlag.
        If an iterable, computes the tests only for the lags in maxlag.
    addconst
        Include a constant in the model.
    verbose
        Print results.

    Returns
    -------
    Dict
        All test results, dictionary keys are the number of lags. For each lag the values are a tuple,
        with the first element a dictionary with test statistic, pvalues, degrees of freedom, the second element are
        the OLS estimation results for the restricted model, the unrestricted model and the restriction (contrast)
        matrix for the parameter f_test.

    References
    ----------
    .. [1] https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.grangercausalitytests.html
    """

    ts_cause._assert_univariate()
    ts_effect._assert_univariate()

    ts_cause._assert_deterministic()
    ts_effect._assert_deterministic()

    raise_if_not(
        ts_cause.freq == ts_effect.freq,
        "ts_cause and ts_effect must have the same frequency.",
    )

    if not ts_cause.has_same_time_as(ts_effect):
        logger.warning(
            "ts_cause and ts_effect time series have different time index. "
            "We will slice-intersect ts_cause with ts_effect.")

    ts_cause = ts_cause.slice_intersect(ts_effect)
    ts_effect = ts_effect.slice_intersect(ts_cause)

    if not stationarity_tests(ts_cause):
        logger.warning(
            "ts_cause doesn't seem to be stationary. Please review granger causality validity in your problem context."
        )
    if not stationarity_tests(ts_effect):
        logger.warning(
            "ts_effect doesn't seem to be stationary. Please review granger causality validity in your problem context."
        )

    return grangercausalitytests(
        np.concatenate(
            (ts_effect.values(copy=False), ts_cause.values(copy=False)),
            axis=1),
        maxlag,
        addconst,
        verbose,
    )
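# Hypothetical usage sketch (assumed, not from the darts source): wrap two NumPy arrays as
# darts TimeSeries and call the function defined above. The darts import path and the
# module-level helpers this fragment omits (logger, raise_if_not, stationarity_tests) are assumed.
import numpy as np
from darts import TimeSeries

rng = np.random.default_rng(0)
cause = rng.normal(size=200)
effect = 0.7 * np.roll(cause, 1) + rng.normal(scale=0.5, size=200)
ts_cause = TimeSeries.from_values(cause)
ts_effect = TimeSeries.from_values(effect)
granger_causality_tests(ts_cause, ts_effect, maxlag=4, verbose=False)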
Example #27

# In[89]:


from statsmodels.tsa.stattools import grangercausalitytests
for region in region_names:
    region_to_test = region
    HPcolumn_name = region_to_test +'_HP'

    granger_tests = []
    for idx, reg_name in enumerate(region_names):
        if reg_name ==region_to_test:
            granger_tests.append(None)
        else:
            test = grangercausalitytests(ripple_matrices[idx]['2007':][['AveragePrice',HPcolumn_name]],maxlag = 24, verbose = False)
            granger_tests.append(test)
            
    for idx2, test in enumerate(granger_tests):
        if not test == None:
            if test[24][0]['ssr_ftest'][1] <0.01:
                print(region, 'causing', region_names[idx2] +': \n')
                print('SSR_test:',test[24][0]['ssr_ftest'], 'LR_test:', test[24][0]['lrtest']) 


# In[174]:


from statsmodels.tsa.stattools import grangercausalitytests

granger_dict1 = {}
Example #28
def test_granger_causality_exceptions(dataset):
    with pytest.raises(InfeasibleTestError):
        grangercausalitytests(dataset, 4)
from statsmodels.compat.python import iteritems
import numpy as np
from numpy.testing import assert_almost_equal
from statsmodels.datasets import macrodata

import statsmodels.tsa.stattools as tsa_stats

# some example data
mdata = macrodata.load_pandas().data
mdata = mdata[['realgdp', 'realcons']].values
data = mdata
data = np.diff(np.log(data), axis=0)

#R: lmtest:grangertest
r_result = [0.243097, 0.7844328, 195, 2]  #f_test
gr = tsa_stats.grangercausalitytests(data[:, 1::-1], 2, verbose=False)
assert_almost_equal(r_result, gr[2][0]['ssr_ftest'], decimal=7)
assert_almost_equal(gr[2][0]['params_ftest'], gr[2][0]['ssr_ftest'], decimal=7)

lag = 2
print('\nTest Results for %d lags' % lag)
print()
print('\n'.join([
    '%-20s statistic: %6.4f   p-value: %6.4f' % (k, res[0], res[1])
    for k, res in iteritems(gr[lag][0])
]))

print('\n Results for auxiliary restricted regression with two lags')
print()
print(gr[lag][1][0].summary())
Example #30
 def test_granger_fails_on_finite_check(self, reset_randomstate):
     x = np.random.rand(1000, 2)
     x[500, 0] = np.nan
     x[750, 1] = np.inf
     with pytest.raises(ValueError, match="x contains NaN"):
         grangercausalitytests(x, 2)
    line2 = ax.plot(data_in_fn, c='orange', alpha=0.9)
    line3 = ax2.plot(btc, c='red', alpha=0.7)

    ax.set_ylabel('Normalised TextBlob Sentiment', color='b', labelpad=10)
    ax2.set_ylabel('BTC Price', color='r', labelpad=10)
    plt.savefig(
        f'../plots/btc_textblob_sentiment/rolling_blob_btc_{file_key}.png',
        dpi=150,
        transparent=True)


### Run plotting functions ###
plotter(bbc_per_fn, bbc_per_month, 'bbc')
plotter(cnn_per_fn, cnn_per_month, 'cnn')
plotter(nyt_per_fn, nyt_per_month, 'nyt')
plotter(reuters_per_fn, reuters_per_month, 'reuters')
plotter(agg_per_fn, agg_per_month, 'agg')

### Correlation of time series ###

agg_per_fn = agg_per_fn.iloc[1:-1]
agg_per_fn.corr(btc['price'])

### Correlation of time series ###

test_frame_01 = pd.concat([btc['price'], agg_per_fn], axis=1)
grangercausalitytests(test_frame_01, 70, addconst=True, verbose=True)

test_frame_02 = pd.concat([agg_per_fn, btc['price']], axis=1)
grangercausalitytests(test_frame_02, 70, addconst=True, verbose=True)
        tmp_d=add_months(d,-(j))
        index_list.append(tmp_d)
    data = pd.DataFrame(list(zip(leader_list, sum_list)), index =index_list ,columns =['leader', 'sum']) 
    rawData = data.copy(deep=True)

    # adf test
    X1sta=0
    X2sta=0
    X1 = np.array(data['leader'])
    X1 = X1[~np.isnan(X1)]
    result = adfuller(X1)
    for key, value in result[4].items():
        if result[0]<value: #stationary
            X1sta=1
            break

    X2 = np.array(data['sum'])
    X2 = X2[~np.isnan(X2)]
    result = adfuller(X2)
    for key, value in result[4].items():
        if result[0]<value: #stationary
            X2sta=1
            break

    if X1sta==0:
        data['leader'] = data['leader'] - data['leader'].shift(1)
    if X2sta==0:
        data['sum'] = data['sum'] - data['sum'].shift(1)
    data = data.dropna()
    res = grangercausalitytests(data[['sum','leader']], maxlag=1)
Example #33
def granger_causality(df, time_key, causality_key):
    # Note: relies on a module-level `maxlag` defined elsewhere in the original script.
    return grangercausalitytests(df[[time_key, causality_key]], maxlag=maxlag)
dfout = pd.Series(dftest[0:4], index =['ADF Test Statistics','p-value','#lags used','# Observations'] )
                  
                  
for key, val in dftest[4].items():
    dfout[f'critical value ({key})'] = val

dftest1 = adfuller(df2['Births'])  

dfout1 = pd.Series(dftest1[0:4], index =['ADF Test Statistics','p-value','#lags used','# Observations'] )

    

df3 = pd.read_csv('samples.csv', index_col = 0, parse_dates= True)
    
df3.index.freq = 'MS' 

from statsmodels.tsa.stattools import grangercausalitytests

grangercausalitytests(df3[['a','d']], maxlag=3)

grangercausalitytests(df3[['b','d']], maxlag=3)

np.random.seed(42)

df = pd.DataFrame(np.random.randint(20,30,(50,2)),columns = ['test','predictions']);
    


    
    
                                      
Example #35
def granger_run(plsa_file, df_all_normalized):
    
    # This runs a Granger test to find relevant topics based on an external time series.

    min_probability = 1.0
    df_plsa = pd.read_csv(plsa_file, error_bad_lines=False)
    # df_plsa = df_plsa.drop(df_plsa.columns[3], axis=1)
    # df_plsa = df_plsa.drop(df_plsa.columns[3], axis=1)

    # We find all unique topics and add each topic as a key to a dictionary: df_topics_collection.
    # The value for each key is a dataframe containing the dates and probabilities for that topic.
    topics = df_plsa.topic.unique()
    df_topics_collection = {}
    for topic in topics:
        df_topics_collection[topic] = df_plsa.loc[df_plsa['topic'] == topic]

    # This block iterates through each topic and creates stationary time series for the probabilities
    # and the stock data. These are then used in a Granger test, whose results are manually inspected to find
    # relevant topics. These topics are added to a dictionary, relevent_topic.
    relevent_topic = {}
    for topic in topics:
        df = df_topics_collection[topic]
        df = df.rename(columns={"date": "Date"})
        df = pd.merge(df, df_all_normalized, on="Date")
        temp_df = df.loc[(df['Contract'] == 'Dem')]
        temp_df['Probablity_stationary'] = temp_df['probability'] - temp_df['probability'].shift(1)
        temp_df['NormalizedPrice_stationary'] = temp_df['NormalizedPrice'] - temp_df['NormalizedPrice'].shift(1)
        temp_df = temp_df.dropna()
        try:
            res = grangercausalitytests(temp_df[['Probablity_stationary', 'NormalizedPrice_stationary']], maxlag=5)
            relevent_topic[topic] = res
            print(res)
        except:
            continue
            
    # This is a pearson test to find relevant words within a topic based on an external CSV 

    # We find relevant words for each topic, and create a new CSV 'words_per_topic.csv'.
    # We also create a new CSV 'word_frequency.csv' with each word and its frequency throughout each day's files.
    # These files are loaded into a data frame.
    word_retriever.retrieve_words_per_topic_and_frequency('plsa_without_prior.csv')
    df_word_per_topic = pd.read_csv('words_per_topic.csv', error_bad_lines=False, header=None)
    df_word_frequency = pd.read_csv('word_frequency.csv', error_bad_lines=False)

    # We find the positive and negatively correlated words for each topic.
    # We run a pearson coefficient test on these topics and then find all topics that add up to our probability mass.
    # This then creates two new topics which are reported in two new CSVs '[topic]_[positive/negative].csv'
    # which are written to disk.
    for index, row in df_word_per_topic.iterrows():
        topic = row[0]
        words = row[1].split(",")
        pearson_results = {}
        prob_mass = 0.75
        for word in words:
            df_word_freq = df_word_frequency[df_word_frequency.word == word]
            df_word_freq = df_word_freq.rename(columns={"date": "Date"})
            df_word_freq = pd.merge(df_word_freq, df_all_normalized, on="Date")
            df_word_freq = df_word_freq.loc[(df_word_freq['Contract'] == 'Dem')]
            # df_word_freq['frequency_stationary'] = df_word_freq['frequency']-df_word_freq['frequency'].shift(1)
            df_word_freq['NormalizedPrice_stationary'] = df_word_freq['NormalizedPrice'] - df_word_freq[
                'NormalizedPrice'].shift(1)
            # print(df_word_freq)
            df_word_freq = df_word_freq.dropna()
            corr, _ = pearsonr(df_word_freq['frequency'], df_word_freq['NormalizedPrice'])
            if not math.isnan(corr):
                pearson_results[word] = corr

        sorted_ascending = sorted(pearson_results.items(), key=operator.itemgetter(1))
        temp_mass = prob_mass
        negative_words = []
        for key in sorted_ascending:
            if key[1] > 0:
                break
            elif temp_mass + key[1] > 0:
                negative_words.append([key[0], key[1]])
                temp_mass = temp_mass + key[1]
        df_neg = pd.DataFrame(negative_words, columns=['Word', 'Probability'])
        file_name = "../CourseProject/prior_csvs/" + topic + "_negative.csv"
        df_neg.to_csv(file_name)

        sorted_descending = sorted(pearson_results.items(), key=operator.itemgetter(1), reverse=True)
        temp_mass = prob_mass
        positive_words = []
        min_probability = 1.0
        for key in sorted_descending:
            if key[1] < 0:
                break
            elif temp_mass - key[1] > 0:
                positive_words.append([key[0], key[1]])
                temp_mass = temp_mass - key[1]
                min_probability = min(min_probability, key[1])
                print(temp_mass)
        df_pos = pd.DataFrame(positive_words, columns=['Word', 'Probability'])
        file_name = "../CourseProject/prior_csvs/" + topic + "_positive.csv"
        df_pos.to_csv(file_name)

    return min_probability
Example #36
def RegressionAnalysis(df, Independent, Explanatory, Indicators, prefix=None):
    """
    This function fits regression models and compares series.

    Arguments:
    ----------
        - df: Pandas DataFrame
            Contains the data to be analyzed
        - Independent: str
            The name of column in df for the Independent variable data
        - Explanatory: str or list
            The name of the column in df for the Explanatory variable data. For a multivariate analysis, pass a list of all column names.
        - Indicators: list
            The list of the indicators/models names to compute
    Return:
    ----------
        - df: Pandas DataFrame
            - Contains the initial df with all the series indicators added, like the Residuals or the Fitted Values
        - OneValueIndicators: Pandas DataFrame
            - Contains all the indicators that have a single value, like the FTest or the TTest

    """

    if Indicators == None:
        Indicators = [
            "OLS", "GLSAR", "RecursiveLS", "Yule Walker Order 1",
            "Yule Walker Order 2", "Yule Walker Order 3", "Burg Order 1",
            "Burg Order 2", "Burg Order 3", "QuantReg", "GLM Binomial",
            "GLM Gamma", "GLM Gaussian", "GLM Inverse Gaussian",
            "GLM Negative Binomial", "GLM Poisson", "GLM Tweedie"
            "AR", "ARMA", "ARIMA", "Granger Causality", "Levinson Durbin",
            "Cointegration"
        ]

    # Pre-processing
    Independent = df[Independent]
    Independent = pd.DataFrame(Independent)

    Explanatory = df[Explanatory]
    Explanatory = pd.DataFrame(Explanatory)

    y_sm = np.array(Independent).reshape((-1, 1))

    x_sm = np.array(Explanatory)
    x_sm = sm.add_constant(x_sm)

    NumDecimal = 3  # Number of decimals for rounding numbers

    OneValueIndicators = {}

    if prefix == None:
        prefix = ""

    ##################################################
    ##### PART 1: Linear Regression
    ##################################################
    """
    ########## Section 1: OLS
    """
    name = "OLS"

    if name in Indicators:
        name = prefix + name

        model = sm.OLS(y_sm, x_sm)
        results = model.fit()

        ### One Value Indicators

        OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators(
            OneValueIndicators, name, results, Explanatory, NumDecimal)

        ### Time Series Indicators

        # Fitted Values
        df = Statsmodels_FittedValues(df, results, name)

        # Residuals
        df = Statsmodels_LR_Residuals(df, results, name)
    """
    ########## Section 2: WLS
    """

    ### Not Implemented
    """
    ########## Section 3: GLS
    """

    ### Not Implemented
    """
    ########## Section 4: GLSAR
    """

    name = "GLSAR"

    if name in Indicators:
        name = prefix + name

        model = sm.GLSAR(y_sm, x_sm, 1)
        results = model.iterative_fit(1)

        ### One Value Indicators

        OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators(
            OneValueIndicators, name, results, Explanatory, NumDecimal)

        ### Time Series Indicators

        # Fitted Values
        df = Statsmodels_FittedValues(df, results, name)

        # Residuals
        df = Statsmodels_LR_Residuals(df, results, name)
    """
    ########## Section 5: RLS
    """

    name = "RecursiveLS"

    if name in Indicators:
        name = prefix + name

        model = sm.RecursiveLS(y_sm, x_sm)
        results = model.fit()

        ### One Value Indicators

        OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators(
            OneValueIndicators, name, results, Explanatory, NumDecimal)

        OneValueIndicators[name + " Z Value"] = results.zvalues

        ### Time Series Indicators

        # Fitted Values
        df = Statsmodels_FittedValues(df, results, name)

        # Residuals
        df = Statsmodels_LR_Residuals(df, results, name)

        # Cumsum
        # Not Implemented
    """
    ########## Section 6: Yule Walker Order 1
    """
    name = "Yule Walker Order 1"

    if name in Indicators and len(Explanatory.columns) == 1:
        name = prefix + name

        rho, sigma = statsmodels.regression.linear_model.yule_walker(
            x_sm[:, 1].flatten(), order=1)

        ### One Value Indicators

        # Rho
        OneValueIndicators[name + " Rho"] = round(rho[0], NumDecimal)

        # Sigma
        OneValueIndicators[name + " Sigma"] = round(sigma, NumDecimal)
    """
    ########## Section 7: Yule Walker Order 2
    """
    name = "Yule Walker Order 2"

    if name in Indicators and len(Explanatory.columns) == 1:
        name = prefix + name

        rho, sigma = statsmodels.regression.linear_model.yule_walker(
            x_sm[:, 1].flatten(), order=2)

        ### One Value Indicators

        # Rho
        OneValueIndicators[name + " Rho"] = round(rho[0], NumDecimal)

        # Sigma
        OneValueIndicators[name + " Sigma"] = round(sigma, NumDecimal)
    """
    ########## Section 8: Yule Walker Order 3
    """
    name = "Yule Walker Order 3"

    if name in Indicators and len(Explanatory.columns) == 1:
        name = prefix + name

        rho, sigma = statsmodels.regression.linear_model.yule_walker(
            x_sm[:, 1].flatten(), order=3)

        ### One Value Indicators

        # Rho
        OneValueIndicators[name + " Rho"] = round(rho[0], NumDecimal)

        # Sigma
        OneValueIndicators[name + " Sigma"] = round(sigma, NumDecimal)
    """
    ########## Section 9: Burg's AR(p) Order 1
    """

    name = "Burg Order 1"

    if name in Indicators and len(Explanatory.columns) == 1:
        name = prefix + name

        rho, sigma2 = statsmodels.regression.linear_model.burg(
            x_sm[:, 1].flatten(), order=1)

        ### One Value Indicators

        # Rho
        OneValueIndicators[name + " Rho"] = round(rho[0], NumDecimal)

        # Sigma2
        OneValueIndicators[name + " Sigma2"] = round(sigma2, NumDecimal)
    """
    ########## Section 10: Burg's AR(p) Order 2
    """

    name = "Burg Order 2"

    if name in Indicators and len(Explanatory.columns) == 1:
        name = prefix + name

        rho, sigma2 = statsmodels.regression.linear_model.burg(
            x_sm[:, 1].flatten(), order=2)

        ### One Value Indicators

        # Rho
        OneValueIndicators[name + " Rho"] = round(rho[0], NumDecimal)

        # Sigma2
        OneValueIndicators[name + " Sigma2"] = round(sigma2, NumDecimal)
    """
    ########## Section 11: Burg's AR(p) Order 3
    """

    name = "Burg Order 3"

    if name in Indicators and len(Explanatory.columns) == 1:
        name = prefix + name

        rho, sigma2 = statsmodels.regression.linear_model.burg(
            x_sm[:, 1].flatten(), order=3)

        ### One Value Indicators

        # Rho
        OneValueIndicators[name + " Rho"] = round(rho[0], NumDecimal)

        # Sigma2
        OneValueIndicators[name + " Sigma2"] = round(sigma2, NumDecimal)
    """
    ########## Section 12: Quantile Regression
    """

    name = "QuantReg"

    if name in Indicators:
        name = prefix + name

        model = sm.QuantReg(y_sm, x_sm)
        results = model.fit()

        ### One Value Indicators

        OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators(
            OneValueIndicators, name, results, Explanatory, NumDecimal)

        ### Time Series Indicators

        # Fitted Values
        df = Statsmodels_FittedValues(df, results, name)

        # Residuals
        df = Statsmodels_LR_Residuals(df, results, name)

    ##################################################
    ##### PART 2: Generalized Linear Models
    ##################################################
    """
    ########## Section 1: GLM Binomial
    """

    name = "GLM Binomial"

    if name in Indicators:
        name = prefix + name

        model = sm.GLM(y_sm, x_sm, family=sm.families.Binomial())
        results = model.fit()

        ### One Value Indicators

        OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators(
            OneValueIndicators, name, results, Explanatory, NumDecimal)

        OneValueIndicators["Pearson chi2"] = round(results.pearson_chi2,
                                                   NumDecimal)

        ### Time Series Indicators

        # Fitted Values
        df = Statsmodels_FittedValues(df, results, name)

        # Residuals
        df = Statsmodels_LR_Residuals(df, results, name)
    """
    ########## Section 2: GLM Gamma
    """

    name = "GLM Gamma"

    if name in Indicators:
        name = prefix + name

        model = sm.GLM(y_sm, x_sm, family=sm.families.Gamma())
        results = model.fit()

        ### One Value Indicators

        OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators(
            OneValueIndicators, name, results, Explanatory, NumDecimal)

        OneValueIndicators["Pearson chi2"] = round(results.pearson_chi2,
                                                   NumDecimal)

        ### Time Series Indicators

        # Fitted Values
        df = Statsmodels_FittedValues(df, results, name)

        # Residuals
        df = Statsmodels_LR_Residuals(df, results, name)
    """
    ########## Section 3: GLM Gaussian
    """

    name = "GLM Gaussian"

    if name in Indicators:
        name = prefix + name

        model = sm.GLM(y_sm, x_sm, family=sm.families.Gaussian())
        results = model.fit()

        ### One Value Indicators

        OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators(
            OneValueIndicators, name, results, Explanatory, NumDecimal)

        OneValueIndicators["Pearson chi2"] = round(results.pearson_chi2,
                                                   NumDecimal)

        ### Time Series Indicators

        # Fitted Values
        df = Statsmodels_FittedValues(df, results, name)

        # Residuals
        df = Statsmodels_LR_Residuals(df, results, name)
    """
    ########## Section 3: GLM InverseGaussian
    """

    name = "GLM Inverse Gaussian"

    if name in Indicators:
        name = prefix + name

        model = sm.GLM(y_sm, x_sm, family=sm.families.InverseGaussian())
        results = model.fit()

        ### One Value Indicators

        OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators(
            OneValueIndicators, name, results, Explanatory, NumDecimal)

        OneValueIndicators["Pearson chi2"] = round(results.pearson_chi2,
                                                   NumDecimal)

        ### Time Series Indicators

        # Fitted Values
        df = Statsmodels_FittedValues(df, results, name)

        # Residuals
        df = Statsmodels_LR_Residuals(df, results, name)
    """
    ########## Section 4: GLM NegativeBinomial
    """

    name = "GLM Negative Binomial"

    if name in Indicators:
        name = prefix + name

        model = sm.GLM(y_sm, x_sm, family=sm.families.NegativeBinomial())
        results = model.fit()

        ### One Value Indicators

        OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators(
            OneValueIndicators, name, results, Explanatory, NumDecimal)

        OneValueIndicators["Pearson chi2"] = round(results.pearson_chi2,
                                                   NumDecimal)

        ### Time Series Indicators

        # Fitted Values
        df = Statsmodels_FittedValues(df, results, name)

        # Residuals
        df = Statsmodels_LR_Residuals(df, results, name)
    """
    ########## Section 5: GLM Poisson
    """

    name = "GLM Poisson"

    if name in Indicators:
        name = prefix + name

        model = sm.GLM(y_sm, x_sm, family=sm.families.Poisson())
        results = model.fit()

        ### One Value Indicators

        OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators(
            OneValueIndicators, name, results, Explanatory, NumDecimal)

        OneValueIndicators["Pearson chi2"] = round(results.pearson_chi2,
                                                   NumDecimal)

        ### Time Series Indicators

        # Fitted Values
        df = Statsmodels_FittedValues(df, results, name)

        # Residuals
        df = Statsmodels_LR_Residuals(df, results, name)
    """
    ########## Section 6: GLM Tweedie
    """

    name = "GLM Tweedie"

    if name in Indicators:
        name = prefix + name

        model = sm.GLM(y_sm, x_sm, family=sm.families.Tweedie())
        results = model.fit()

        ### One Value Indicators

        OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators(
            OneValueIndicators, name, results, Explanatory, NumDecimal)

        OneValueIndicators["Pearson chi2"] = round(results.pearson_chi2,
                                                   NumDecimal)

        ### Time Series Indicators

        # Fitted Values
        df = Statsmodels_FittedValues(df, results, name)

        # Residuals
        df = Statsmodels_LR_Residuals(df, results, name)

    ##################################################
    ##### PART 3: Robust Linear Models
    ##################################################

    ##################################################
    ##### PART 4: AR models
    ##################################################

    name = "AR"

    if name in Indicators:
        name = prefix + name

        model = statsmodels.tsa.ar_model.AR(Independent)
        results = model.fit()

        ### One Value Indicators

        OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators(
            OneValueIndicators, name, results, Explanatory, NumDecimal)

        OneValueIndicators[name + " Final Prediction Error"] = results.fpe

        OneValueIndicators[
            name + " Hannan-Quinn Information Criterion"] = results.hqic

        OneValueIndicators[name + " Roots"] = results.roots

        ### Time Series Indicators

        # Fitted Values
        df = Statsmodels_FittedValues(df, results, name)

        # Residuals
        df = Statsmodels_LR_Residuals(df, results, name)

    ##################################################
    ##### PART 5: ARMA
    ##################################################

    name = "ARMA"

    if name in Indicators:

        name = prefix + name

        model = statsmodels.tsa.arima_model.ARMA(y_sm, (5, 5), x_sm)
        results = model.fit()

        ### One Value Indicators

        OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators(
            OneValueIndicators, name, results, Explanatory, NumDecimal)

        OneValueIndicators[name + " AR Params"] = results.arparams

        OneValueIndicators[name + " AR Roots"] = results.arroots

        OneValueIndicators[name + " AR Freq"] = results.arfreq

        OneValueIndicators[
            name + " Hannan-Quinn Information Criterion"] = results.hqic

        OneValueIndicators[name + " MA Params"] = results.maparams

        try:
            OneValueIndicators[name + " MA Roots"] = results.maroots
        except:
            pass

        try:
            OneValueIndicators[name + " MA Freq"] = results.mafreq
        except:
            pass

        OneValueIndicators[name + " Sigma2"] = results.sigma2

        ### Time Series Indicators

        # Fitted Values
        df = Statsmodels_FittedValues(df, results, name)

        # Residuals
        df = Statsmodels_LR_Residuals(df, results, name)

    ##################################################
    ##### PART 6: ARIMA
    ##################################################

    name = "ARIMA"

    if name in Indicators:

        name = prefix + name

        model = statsmodels.tsa.arima_model.ARIMA(Independent, (2, 2, 2),
                                                  Explanatory)
        results = model.fit()

        ### One Value Indicators

        OneValueIndicators = Statsmodels_Regression_All_OneValueIndicators(
            OneValueIndicators, name, results, Explanatory, NumDecimal)

        OneValueIndicators[name + " AR Params"] = results.arparams

        OneValueIndicators[name + " AR Roots"] = results.arroots

        OneValueIndicators[name + " AR Freq"] = results.arfreq

        OneValueIndicators[
            name + " Hannan-Quinn Information Criterion"] = results.hqic

        OneValueIndicators[name + " MA Params"] = results.maparams

        OneValueIndicators[name + " MA Roots"] = results.maroots

        OneValueIndicators[name + " MA Freq"] = results.mafreq

        OneValueIndicators[name + " Sigma2"] = results.sigma2

        ### Time Series Indicators

        # Fitted Values
        df = Statsmodels_FittedValues(df, results, name)

        # Residuals
        df = Statsmodels_LR_Residuals(df, results, name)

        ##################################################
        ##### PART 7: Univariate Analysis
        ##################################################

        # Granger Causality
        name = "Granger Causality"
        name = prefix + name
        if name in Indicators:
            OneValueIndicators[name] = ts.grangercausalitytests(
                Independent.merge(Explanatory,
                                  how="inner",
                                  left_index=True,
                                  right_index=True),
                maxlag=10)

        # Levinson Durbin
        name = "Levinson Durbin"
        name = prefix + name
        if name in Indicators:
            OneValueIndicators[name] = ts.levinson_durbin(Independent)

        # Cointegration
        name = "Cointegration"
        name = prefix + name
        if name in Indicators:
            OneValueIndicators[name] = ts.coint(Independent,
                                                Explanatory,
                                                trend="ct",
                                                return_results=False)

    ##################################################
    ##### Not Implemented
    ##################################################

    # BDS Statistic (residuals analysis)
    # Not Implemented

    # Return’s Ljung-Box Q Statistic (AR)
    # Not Implemented
    OneValueIndicators = pd.DataFrame.from_dict(OneValueIndicators,
                                                orient="index")

    return df, OneValueIndicators
print("Optimal number of lags for click data is "+str(numLagsclick))

ARaic = ar_model.AR(encountersPerDay.tolist()).fit(maxlag=MAX_LAG, ic="aic")
ARbic = ar_model.AR(encountersPerDay.tolist()).fit(maxlag=MAX_LAG, ic="bic")
#select the smaller number of parameters between the two criteria.
numLagsEnc = len(ARaic.params) if len(ARaic.params) < len(ARbic.params) else len(ARbic.params)

print("Optimal number of lags for encounter data is "+str(numLagsEnc))


#2. now that I know the optimal number of parameters, I can call the
#granger causality function of statsmodels.
data = pd.concat([encountersPerDay, clicksPerDay], axis=1)
print("\nGranger causality results of clicks onto encounters")
results = stattools.grangercausalitytests(data, maxlag=numLagsEnc)

data = pd.concat([clicksPerDay, encountersPerDay], axis=1)
print("\nGranger causality results of encounters onto clicks")
results = stattools.grangercausalitytests(data, maxlag=numLagsclick)

#ideally, I would implement this myself, however statsmodels is buggy and
#does not deal with exogenous variables well, meaning I would have to
#implement an AR fitting algorithm, which is non-ideal.



MAX_LAG = 3

endog = encountersPerDay.tolist()
exog = clicksPerDay.tolist()
Example #38
x2 = np.array(abortion_data.iloc[1:-1, -1])

# Crime rates per year
x1 = np.array(crime_data.loc[5])

x1 = np.delete(x1, [0, 1, 2])
for i in range(len(x1)):
    x1[i] = x1[i].replace(',', '')

x1 = x1.astype(float)
x2 = x2.astype(float)

x = np.array([x1, x2])
x = np.transpose(x)

res1 = st.grangercausalitytests(x, maxlag=10, verbose=True)
print("{:<8} {:<15} {:<10}".format('LAG', 'F-value', 'p-value'))
for k, v in res1.items():
    d = v[0]
    print("{:<8} {:<15} {:<10}".format(k, d['params_ftest'][0],
                                       d['params_ftest'][1],
                                       d['params_ftest'][2]))

x = np.array([x2, x1])
x = np.transpose(x)

res2 = st.grangercausalitytests(x, maxlag=10, verbose=True)
for k, v in res2.items():
    d = v[0]
    print("{:<8} {:<15} {:<10}".format(k, d['params_ftest'][0],
                                       d['params_ftest'][1],
Example #39
def test_granger_causality_exceptions(dataset):
    with pytest.raises(InfeasibleTestError):
        with pytest.warns(FutureWarning, match="verbose"):
            grangercausalitytests(dataset, 4, verbose=False)
예제 #40
0

period_summaries = db['period_summary']
query = period_summaries.aggregate([{
      '$group': {
        '_id': 'date_range',
        'min_date': { '$min': '$period_start' },
        'max_date': { '$max': '$period_end' }
      }
    }]).next()





r = grangercausalitytests(np.random.random((100,2)), 5, addconst=True, verbose=True)

example: r[1][0]['lrtest']

r is a dict keyed by lag, e.g.:

r = {
    1: ({'lrtest': (2.5498146592526325, 0.11030719462362751, 1),
         'params_ftest': (2.5046637818898794, 0.11679877991627048, 96.0, 1),
         'ssr_ftest': (2.5046637818898669, 0.1167987799162722, 96.0, 1),
         'ssr_chi2test': (2.5829345250739255, 0.1080212350855112, 1)}, ...),
    2: ({'lrtest': ...
    3: ...
    4: ...
    5: ...
}

>>> r[1][0]['lrtest']

The last element of the lrtest (and ssr_chi2test) tuple is its single
degrees-of-freedom value; the two F-tests additionally report df_denom and df_num.
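A minimal sketch of walking that structure to collect per-lag statistics (the random input is only there to give the call some data; variable names are illustrative):

import numpy as np
from statsmodels.tsa.stattools import grangercausalitytests

r = grangercausalitytests(np.random.random((100, 2)), 5)
for lag, (tests, regressions) in sorted(r.items()):
    lr_stat, lr_pvalue, lr_df = tests['lrtest']
    f_stat, f_pvalue, df_denom, df_num = tests['ssr_ftest']
    print(lag, round(lr_pvalue, 4), round(f_pvalue, 4))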
예제 #41
0
def test_granger_causality_exception_maxlag(gc_data):
    with pytest.raises(ValueError, match="maxlag must be"):
        grangercausalitytests(gc_data, maxlag=-1)
    with pytest.raises(NotImplementedError):
        grangercausalitytests(gc_data, 3, addconst=False)
예제 #42
0
def relate_term_n_stock_ts(term_panel, stocks_panel, max_lags=5):
    # both term_panel and stocks_panel must be stationary
    # run the test for every stock series against the term series
    results = {}
    for col in stocks_panel.columns:
        pair = pd.concat([stocks_panel[col], term_panel], axis=1).dropna()
        results[col] = grangercausalitytests(pair, max_lags, verbose=False)
    return results
예제 #43
0
def test_granger_causality_verbose(gc_data):
    with pytest.warns(FutureWarning, match="verbose"):
        grangercausalitytests(gc_data, 3, verbose=True)
예제 #44
0
def test_granger_fails_on_nobs_check(self):
    # Test that if maxlag is too large, Granger Test raises a clear error.
    X = np.random.rand(10, 2)
    grangercausalitytests(X, 2)  # This should pass.
    assert_raises(ValueError, grangercausalitytests, X, 3)
예제 #45
0
def granger_causality(primary_features,
                      secondary_features,
                      secondary_feature_names,
                      change_points_primary,
                      change_points_secondary,
                      p_value=0.01):
    '''Given change points and feature representations of two perspectives,
    this function calculates cause-effect relationships.
    A reduced primary and an unreduced secondary perspective, along with the
    names of the secondary features, have to be provided. This function filters
    the change points of the secondary perspective that precede a change point
    in the primary perspective and tests whether there are Granger-causal
    features, given the lag between the drifts.
    args:
        primary_features: Reduced time series retrieved by the previously
          executed dimensionality reduction.
        secondary_features: Feature representation of the secondary perspective,
          retrieved when constructing the feature representation for it.
        secondary_feature_names: List of the feature names, retrieved when
          constructing the features.
        change_points_primary: List of primary change points.
        change_points_secondary: List of secondary change points.
        p_value: Maximum p-value for a feature to count as Granger-causal.
    '''
    tmp = np.array(primary_features)
    transpose = tmp.T
    primary_features = transpose.tolist()

    tmp = np.array(secondary_features)
    transpose = tmp.T
    secondary_features = transpose.tolist()

    results = []
    if not isinstance(primary_features[0], list):
        primary_features = [primary_features]
    for cp_1 in change_points_primary:
        for cp_2 in change_points_secondary:
            if cp_2 < cp_1:
                k = cp_1 - cp_2
                feature_set = {}
                p = p_value
                for i in range(0, len(secondary_features)):
                    f = secondary_features[i]
                    for f_2 in primary_features:
                        granger_data = pd.DataFrame(f_2)
                        granger_data[secondary_feature_names[i]] = f
                        granger_data = granger_data.dropna()
                        try:
                            gc_res = grangercausalitytests(granger_data, [k],
                                                           verbose=False)
                            #Increase the margin by 1% (or even less) to account for numeric approximation errors
                            if gc_res[k][0]['params_ftest'][1] < p * 1.01:
                                p_feat = primary_features.index(f_2)
                                if p_feat not in feature_set.keys():
                                    feature_set[p_feat] = []
                                if secondary_feature_names[
                                        i] not in feature_set[p_feat]:
                                    feature_set[p_feat].append(
                                        secondary_feature_names[i])
                        except ValueError:
                            pass
                results.append((cp_1, cp_2, feature_set, p))
    return results
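A minimal usage sketch with synthetic inputs (the shapes, feature names, and change points below are made up purely to show how the function is called; real inputs come from the drift detection and feature construction steps named in the docstring):

import numpy as np

rng = np.random.default_rng(0)
primary = rng.normal(size=(200, 2)).tolist()      # 200 time steps, 2 primary features
secondary = rng.normal(size=(200, 3)).tolist()    # 200 time steps, 3 secondary features
names = ["feat_a", "feat_b", "feat_c"]

# hypothetical change points: a secondary drift at t=50 precedes a primary drift
# at t=60, so Granger causality is tested at lag k = 10
relations = granger_causality(primary, secondary, names,
                              change_points_primary=[60],
                              change_points_secondary=[50],
                              p_value=0.01)
for cp_1, cp_2, feature_set, p in relations:
    print(cp_1, cp_2, feature_set)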
openStockPrice = reader.Open
#change of the stock price between open and close
changesOfStockPrice = openStockPrice - closeStockPrice

#find overlap days in two list
newlist = set(day) & set(day2)
newlist = list(newlist)

priceChange = []
avgSenti = []
#join changes of prices and average sentiment into a new 2D array, matching by day
for i in range(0,len(newlist)):
    x = newlist[i]
    if x in day:
        if x in day2:
            x = str(x)
            i = day.index(x)
            j = day2.index(x)
            priceChange.append(changesOfStockPrice[i])           
            avgSenti.append(avgSentiment[j])
			
X= np.vstack(([np.array(priceChange).T], [np.array(avgSenti).T])).T
#lags from 1 to 7; lag 8 raises a ValueError (too few overlapping observations)
maxlag = 7
#perform the causality test and print the results
d = stat.grangercausalitytests(X, maxlag, addconst=True, verbose=True)
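Because the usable maxlag is capped by how many overlapping days survive the join, a small guard like this sketch avoids the ValueError the comment above refers to; the 3*lag + 1 margin is inferred from the 10-observation test case earlier in this collection (maxlag 2 passes, 3 fails), so treat it as an assumption rather than a documented contract:

nobs = X.shape[0]
feasible_maxlag = max(1, (nobs - 2) // 3)  # largest lag with nobs > 3*lag + 1
d = stat.grangercausalitytests(X, min(maxlag, feasible_maxlag), addconst=True, verbose=True)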

    
    
         
예제 #47
0
def compute_casuality(data=placeholder_insights, data2=placeholder_activity):
    merged = pd.merge(data, data2, on='ds')
    m = merged[['y', 'activity_time']].to_numpy()
    result = grangercausalitytests(m, maxlag=5)
    # p-value of the likelihood-ratio test at lag 2
    return result[2][0]['lrtest'][1]
예제 #48
0
    else:
        caus_xy = G.causality_xy
        caus_yx = G.causality_yx
        caus_sim = G.simultaneous_causality
    g1 = np.mean(caus_xy, -1)
    g2 = np.mean(caus_yx, -1)
    g3 = np.mean(caus_sim, -1)
    Gs.append((g1[0, 1], g2[0, 1], g3[0, 1]))
Gs
maxlag = 10
rois
rois.shape
gcs_val = np.zeros((rois.shape[0], rois.shape[0], maxlag))
for i in range(rois.shape[0]):
    for j in range(rois.shape[0]):
        res = grangercausalitytests(rois[[j, i]], maxlag)
        for lag in range(1, maxlag + 1):
            reg = res[lag][1]
            ssrEig = reg[0].ssr   # restricted model: own lags only
            ssrBeid = reg[1].ssr  # unrestricted model: both series
            gcs_val[i, j, lag - 1] = np.log(ssrEig / ssrBeid)
rois.shape
plt.plot(rois.T);
plt.show()
rois = rois.T
gcs_val = np.zeros((rois.shape[0], rois.shape[0], maxlag))
for i in range(rois.shape[0]):
    for j in range(rois.shape[0]):
        res = grangercausalitytests(rois[[j, i]], maxlag)
        for lag in range(1, maxlag + 1):
            reg = res[lag][1]
            ssrEig = reg[0].ssr   # restricted model: own lags only
            ssrBeid = reg[1].ssr  # unrestricted model: both series
            gcs_val[i, j, lag - 1] = np.log(ssrEig / ssrBeid)
rois.shape
예제 #49
0
def thePar(theTickers, baseTicker, start, end, postThresh, theWindow, testSize, numTickers, fileName, thePath, mongoInsert):
        
    #variables
    statSig = .05 #significance level for the Granger-causality p-value
    #postThresh = .7 #posterior probability threshold
    #corThresh = .15 #arbitrary p-value for pearson correl
    #theWindow = 10 #arbitrary rolling window
    rollRet = float(0)
    totalLong = 0
    totalShort = 0
    rollLongRight = 0
    rollShortRight = 0
    #testSize = .90
    priceThresh = 5.0
    
    tempBase = web.DataReader(baseTicker,"yahoo",start,end)
    tempBase['retBase'] = np.log(tempBase['Adj Close'].astype(float)) - np.log(tempBase['Adj Close'].astype(float).shift(1))
    #tempBase['retBase'] = np.log(tempBase['Close'].astype(float)) - np.log(tempBase['Open'].astype(float))
    tempBase = tempBase[['retBase','Close']]
    tempBase.columns = ['retBase','closeBase']
    tempBase.reindex()
    tempBase = tempBase.dropna()
    
    thePerf = list()
    finalData = pd.DataFrame()
    todayData = pd.DataFrame()
    for i in range(0,numTickers):
        try:
            tempData = web.DataReader(theTickers[i],"yahoo",start,end)
            lastPrice = tempData['Close'][len(tempData)-1:len(tempData)][0]
            tempData['retClose'] = np.log(tempData['Adj Close'].astype(float)) - np.log(tempData['Adj Close'].astype(float).shift(1))
            tempData['ret'] = np.log(tempData['Close'].astype(float)) - np.log(tempData['Open'].astype(float))
            tempData = tempData.dropna()
            tempData = tempData[['retClose','ret','Close']]
            tempData.reindex()
            tempData = pd.merge(tempBase,tempData,how='outer', left_index=True, right_index=True)
            tempData['diff'] = tempData['closeBase'] - tempData['Close']
            tempData = tempData.dropna()
    
            tempData = tempData[['retBase','ret','retClose','diff']]
            
            theCor = pearsonr(tempData['retBase'],tempData['retClose'])        
            
            gCause = ts.grangercausalitytests(tempData[['retClose','retBase']],1,verbose=False)[1][0]['params_ftest'][1] #p-value: does retBase (second column) Granger-cause retClose (first column)?
            #if(theCor[1] <= statSig and (theCor[0] >= corThresh or theCor[0] <= -corThresh) and gCause <= statSig):
            if(gCause <= statSig and lastPrice >= priceThresh):
                tempData['rollMean'] = tempData['ret'].rolling(window=theWindow).mean()
                tempData['rollMeanBase'] = tempData['retBase'].rolling(window=theWindow).mean()
                tempData['rollCor'] = tempData['retBase'].rolling(theWindow).corr(tempData['ret']) #rolling correlation
                tempData = tempData.dropna()
                
                theLen = len(tempData)
                
                trainLen = int(round(testSize*theLen,0))
                testLen = int(theLen  - trainLen)
                try:
                    y = tempData['ret']#[0:theLen-2] #next day asset return
                    X = tempData[['retBase','rollCor','rollMeanBase','rollMean','diff']]#[1:theLen-1] #event day features
                    
                    trainY = y[1:trainLen-1]
                    testY = y[trainLen:theLen-1]
    
                    trainX = X[0:trainLen-2]
                    testX = X[trainLen-1:theLen-2]
                
                    tDate = list(trainY.index.values)
                    startTrainDate = str(tDate[0])[:10]
                    endTrainDate = str(tDate[len(tDate)-1])[:10]
                    
                    tDate = list(testY.index.values)
                    startTestDate = str(tDate[0])[:10]
                    endTestDate = str(tDate[len(tDate)-1])[:10]
                    
                    tDate = list(tempData.index.values)
                    simDay = str(tDate[len(tDate)-1])[:10]
    
                    model = RandomForestClassifier(n_estimators=25,random_state=42)
                    #model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=300,learning_rate=1,algorithm="SAMME")
                    #model = linear_model.LogisticRegression(C=1e5)
                    #model = SVC(kernel='rbf', class_weight=None)
                    #model = GaussianNB()
    
                    model.fit(trainX,np.sign(trainY))
    
                    postProbs = model.predict_proba(testX)
    
                    todayPostProbs = model.predict_proba(X[theLen-1:theLen])
                    
                    theClasses = model.classes_ #[-1.  0.  1.]
                    neg = int(np.where(theClasses == -1.0)[0])
                    pos = int(np.where(theClasses == 1.0)[0])
    
                    if(todayPostProbs[0][pos] >= postThresh):
                        tempStr = pd.DataFrame({'ticker': [theTickers[i]],' Position': ['Long'],' Confidence': [todayPostProbs[0][pos]]})
                        todayData = todayData.append(tempStr)
                        print("GO LONG ON: " + theTickers[i] + " Confidence: " + str(todayPostProbs[0][pos]))
                        mongoInsert.insert({"theDate": simDay, "ticker": theTickers[i], "position": 1})
                    if(todayPostProbs[0][neg] >= postThresh):
                        tempStr = pd.DataFrame({'ticker': [theTickers[i]],' Position': ['Short'],' Confidence': [todayPostProbs[0][neg]]})
                        todayData = todayData.append(tempStr)
                        print("GO SHORT ON: " + theTickers[i] + " Confidence: " + str(todayPostProbs[0][neg])) 
                        mongoInsert.insert({"theDate": simDay, "ticker": theTickers[i], "position": -1})
    
                    theLongs = np.where(postProbs[:,pos] >= postThresh)[0] #LONG POSITIONS
                    theShorts = np.where(postProbs[:,neg] >= postThresh)[0] #SHORT POSITIONS
                    
                    numPos = len(theLongs)
                    totalLong = totalLong + numPos
                    numNeg = len(theShorts)
                    totalShort = totalShort + numNeg
                    
                    corLong = np.where(np.sign(testY[theLongs]) == 1)[0]
                    longRet = np.sum(testY[theLongs])
                    rollLongRight = rollLongRight + len(corLong)
                    
                    corShort = np.where(np.sign(testY[theShorts]) == -1)[0] 
                    shortRet = np.sum(testY[theShorts])
                    rollShortRight = rollShortRight + len(corShort)
                    
                    theRet = round(float(longRet) - float(shortRet),8)
                    rollRet = round(float(rollRet) + float(theRet),8)
                    thePerf.append(theRet)
                    tempStr = pd.DataFrame({'ticker': [theTickers[i]],'theRet': [theRet],'rollret': [rollRet],'RollShortTrd': [totalShort],'RollLongTrd': [totalLong],
                    'RollLongRight': [rollLongRight],'RollShortRight': [rollShortRight],'startTrainDate': [startTrainDate],'startTestDate': [startTestDate]})
                    finalData = finalData.append(tempStr)
                    #print(theTickers[i] + " Ret: " + str(theRet) + " Roll Ret: " + str(rollRet) + " Short Cnt: " + str(numNeg) + " Long Cnt: " 
                    #+ str(numPos) + " Strt Tr: " + startTrainDate + " Strt Test: " + startTestDate + " sRollCnt: " + str(totalShort) + " lRollCnt: " +  str(totalLong))
                except:        
                    pass
        except:         
            pass      
    
    finalData.to_csv(thePath + time.strftime("%Y-%m-%d") + "_" + baseTicker + "_" + fileName + "_finalData.csv",index=False)
    todayData.to_csv(thePath + time.strftime("%Y-%m-%d") + "_" + baseTicker + "_" + fileName + "_todayData.csv",index=False)
    print("Sharpe: " + str(((np.mean(thePerf)/np.std(thePerf))*math.sqrt(252))))
    temp = np.where(np.asarray(thePerf) < 0)
    print("Sortino: " + str(np.mean(thePerf)/np.std(np.asarray(thePerf)[temp])*math.sqrt(252)))