def adf_test(x):
    """Run the Augmented Dickey-Fuller test on ``x`` and tabulate the outcome.

    The lag order is selected automatically with the Akaike Information
    Criterion (``autolag='AIC'``).

    Returns a ``pd.Series`` labelled 'Test Statistic', 'p-value', '# of lags'
    and '# of observations', followed by one ``Critical Value(<level>)``
    entry for each critical value reported by ``smt.adfuller``.
    """
    outcome = smt.adfuller(x, autolag='AIC')
    labels = ['Test Statistic', 'p-value', '# of lags', '# of observations']
    summary = pd.Series(outcome[:4], index=labels)
    critical_values = outcome[4]
    for level in critical_values:
        summary[f'Critical Value({level})'] = critical_values[level]
    return summary
def cointegrated(all_pairs, X_train):
    """Return the symbol pairs whose spread passes an ADF stationarity test.

    ``all_pairs`` is iterated as a collection of groups, each group being an
    iterable of two-element pairs.  ``X_train`` is indexed with
    ``str(symbol)`` to obtain the series of each symbol.

    For every pair the hedge ratio is the OLS slope of ``pair[1]`` regressed
    on ``pair[0]``; the spread ``pair[1] - slope * pair[0]`` is then tested
    with ``adfuller`` (maxlag 1).

    Returns a list of ``[pair[0], pair[1]]`` entries, one for each pair whose
    ADF statistic lies below the 1%, 5% or 10% critical value.
    """
    selected = []
    # (critical-value key, confidence label used in the printed message),
    # checked from the strictest level to the loosest.
    levels = (('1%', '99%'), ('5%', '95%'), ('10%', '90%'))
    for group in all_pairs:
        for pair in group:
            x_series = X_train[str(pair[0])]
            y_series = X_train[str(pair[1])]
            # scipy's linregress signature is linregress(x, y).  The spread
            # below is y - slope * x, so y has to be regressed on x.
            slope = linregress(x_series, y_series)[0]
            spread = y_series - (slope * x_series)
            cadf = adfuller(spread, 1)
            statistic, critical_values = cadf[0], cadf[4]
            for key, confidence in levels:
                if statistic < critical_values[key]:
                    print(f'Pair Cointegrated at {confidence} Confidence Interval')
                    # list.append takes exactly one argument: store the
                    # pair as a single [x, y] list at every level.
                    selected.append([pair[0], pair[1]])
                    break
            else:
                print('Pair Not Cointegrated ')

    return selected
Пример #3
0
def test_stationarity(timeseries, maxlag=2, regression='c', autolag=None,
                      window=None, plot=False, verbose=False):
    """
    Check unit root stationarity of a time series with the Augmented
    Dickey-Fuller (ADF) test.

    Null hypothesis: the series is non-stationary.
    If p >= alpha (0.05), the series is reported as non-stationary.
    If p < alpha, the null hypothesis is rejected (reported as stationary).

    Parameters
    ----------
    timeseries : array-like, pd.Series or pd.DataFrame
        Data to test. A DataFrame is flattened with ``.values.ravel()``.
    maxlag, regression, autolag
        Passed through to ``smt.adfuller``; ``regression=None`` means 'c'.
    window : int, optional
        Rolling window, only used for plotting. Default=4.
    plot : bool
        Plot the series with its rolling mean and rolling standard deviation.
    verbose : bool
        Print the test parameters and the full result table.

    Returns
    -------
    pd.Series or None
        Test statistic, p-value, lags used, observations used and the
        critical values; ``None`` when the test raised an error.

    Original source: http://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/
    Function: http://statsmodels.sourceforge.net/devel/generated/statsmodels.tsa.stattools.adfuller.html
    """
    # isinstance also covers DataFrame subclasses
    if isinstance(timeseries, pd.DataFrame):
        print('modifying time series dataframe into an array to test')
        timeseries = timeseries.values.ravel()
    if regression is None:
        regression = 'c'
    if verbose:
        print('Running Augmented Dickey-Fuller test with paramters:')
        print('maxlag: {}'.format(maxlag))
        print('regression: {}'.format(regression))
        print('autolag: {}'.format(autolag))
    alpha = 0.05
    if plot:
        if window is None:
            window = 4
        # A raveled DataFrame (or any plain array/list) has no .rolling();
        # wrapping in a Series works for those and keeps a Series' own index.
        rolling = pd.Series(timeseries).rolling(window=window, center=False)
        rolmean = rolling.mean()
        rolstd = rolling.std()
        # Plot rolling statistics:
        plt.plot(timeseries, color='blue', label='Original')
        plt.plot(rolmean, color='red', label='Rolling Mean ({})'.format(window))
        plt.plot(rolstd, color='black', label='Rolling Std ({})'.format(window))
        plt.legend(loc='best')
        plt.title('Rolling Mean & Standard Deviation')
        plt.show(block=False)
    # Perform Augmented Dickey-Fuller test:
    try:
        dftest = smt.adfuller(timeseries, maxlag=maxlag, regression=regression, autolag=autolag)
        dfoutput = pd.Series(dftest[0:4], index=['Test Statistic',
                                                 'p-value',
                                                 '#Lags Used',
                                                 'Number of Observations Used',
                                                 ])
        for key, value in dftest[4].items():
            dfoutput['Critical Value (%s)' % key] = value
    except Exception as exc:
        # Still best-effort (returns None), but no longer swallows
        # KeyboardInterrupt/SystemExit and now reports the actual cause.
        print('Augment Dickey-Fuller test gives an error')
        print('    {}: {}'.format(type(exc).__name__, exc))
        return None
    if verbose:
        print('Results of Augmented Dickey-Fuller Test:')
        print(dfoutput)
    if dftest[1] >= alpha:
        print(' this series is non-stationary')
    else:
        print(' this series is stationary')
    return dfoutput
Пример #4
0
    def ARIMA_GARCH(self, data, dataType, dataSize):
        """Run the external ``./arima_garch.r`` script and load its forecasts.

        An ADF test on ``data[0]`` is printed for information, then
        ``Rscript`` is invoked with ``dataType``, ``dataSize``, the number of
        series and the differencing flag.  The files
        ``forecasts_mean_<dataType>_<dataSize>.csv`` and
        ``forecasts_variance_<dataType>_<dataSize>.csv`` are then read with
        ``genfromtxt`` and cleaned in place.

        Returns
        -------
        (forecasts_mean, forecasts_variance)
            The two loaded arrays after zero / NaN correction.
        """
        # Performs an Augmented Dickey-Fuller test to check for stationarity.
        result = smt.adfuller(data[0])
        pvalue = result[1]
        if pvalue < 0.2:
            print('p-value = ' + str(pvalue) +
                  ' The series is likely stationary.')
        else:
            print('p-value = ' + str(pvalue) +
                  ' The series is likely NON-stationary.')
        # Differencing is switched off for both outcomes (the original code
        # had the "Once" option commented out for the non-stationary case).
        differentiation = "None"
        print('ADF Statistic: %f' % result[0])
        print('p-value: %f' % result[1])
        print('Critical Values:')
        for key, value in result[4].items():
            print('\t%s: %.3f' % (key, value))

        # Convert once so that the command line and the result file names
        # agree even when the caller passes non-string values.
        data_type = str(dataType)
        data_size = str(dataSize)

        # Starts a subprocess of the ./arima_garch.r program (written in R).
        nr_of_series = len(data)
        print("Starting subprocess")
        exit_code = subprocess.call([
            "Rscript", "--vanilla", "./arima_garch.r",
            data_type,
            data_size,
            str(nr_of_series), differentiation
        ])
        if exit_code != 0:
            # Do not abort: result files from the run may still be readable.
            print("Rscript exited with status " + str(exit_code))
        print("Ended subprocess")

        # Load the results that were created by the R program, from file.
        suffix = data_type + "_" + data_size + '.csv'
        forecasts_mean = genfromtxt('forecasts_mean_' + suffix,
                                    skip_header=0,
                                    delimiter=' ',
                                    dtype=float)
        forecasts_variance = genfromtxt('forecasts_variance_' + suffix,
                                        skip_header=0,
                                        delimiter=' ',
                                        dtype=float)

        # Correcting for NaN's and 0's.  Index loops are kept on purpose:
        # each fix reads the previous column, which may itself just have
        # been corrected.
        for row in range(len(forecasts_mean)):
            for col in range(len(forecasts_mean[row])):
                if forecasts_mean[row][col] == 0 and col > 0:
                    forecasts_mean[row][col] = forecasts_mean[row][col - 1]
                if np.isnan(forecasts_variance[row][col]):
                    if col > 0:
                        forecasts_variance[row][col] = forecasts_variance[row][
                            col - 1]
                    else:
                        # no previous column to copy from
                        forecasts_variance[row][col] = 1

        return forecasts_mean, forecasts_variance
Пример #5
0
    def analysis(self):
        """http://necochan.com/2014/06/07/python-for-economist-6/

        Fit an OLS model of ``self.criterion_column`` on
        ``self.explanatory_columns`` using ``self.data``, store the fitted
        result in ``self.rm`` and print its summary.  The residuals are then
        passed to an ADF test, an autocorrelation plot and an ACF
        computation, and a VIF table is built for every regressor except the
        first column of the design matrix.  Those diagnostics are computed
        but not returned.
        """
        rhs = "+".join(self.explanatory_columns)
        formula = self.criterion_column + "~" + rhs
        self.rm = smf.ols(formula=formula, data=self.data).fit()
        print(self.rm.summary())

        # ADF test on the residuals, H0: non-stationary.
        # NOTE(review): newer statsmodels releases appear to spell the
        # no-constant option 'n' instead of 'nc' - confirm against the
        # installed version.
        tsa.adfuller(self.rm.resid, regression='nc')

        # Autocorrelation of the residuals: plot it, then keep the values.
        autocorrelation_plot(self.rm.resid)
        resid_acf = tsa.acf(self.rm.resid)

        # Multicollinearity check via VIF (values > 10 deserve attention).
        # Column 0 of the design matrix is skipped, matching the names below.
        exog = self.rm.model.exog
        vif_values = []
        for column in range(1, exog.shape[1]):
            vif_values.append(oti.variance_inflation_factor(exog, column))
        vif_table = pd.DataFrame(vif_values,
                                 index=self.rm.model.exog_names[1:],
                                 columns=['VIF'])
Пример #6
0
def has_unit_root(X, debug=True):  # = not able to reject null hypothesis
    """Return whether the ADF test fails to reject a unit root in ``X`` at 5%.

    Null hypothesis: ``X`` has a unit root (= is not stationary, might be
    trend stationary).

    Parameters
    ----------
    X : series passed unchanged to ``smt.adfuller``.
    debug : when true, print the statistic, p-value, used lag and critical
        values.

    Returns
    -------
    True when the ADF statistic is above the 5% critical value, i.e. the
    null hypothesis cannot be rejected.
    """
    adf_stat, p_value, used_lag, _nobs, critical_values, _icbest = smt.adfuller(
        X)
    if debug:
        print(
            '-' * 10,
            ' ADF ',
            '-' * 10,
        )
        print(f'{adf_stat}, {p_value}, {used_lag}, {critical_values}')
    # The ADF test is left-tailed: only a statistic MORE NEGATIVE than the
    # critical value rejects the null.  Comparing absolute values would
    # wrongly report "no unit root" for a large positive statistic.
    return adf_stat > critical_values['5%']
Пример #7
0
def unit_root(ser):
    """Print unit-root verdicts for ``ser`` against three critical-value tables.

    The ADF statistic of ``ser`` (``adfuller`` with ``maxlag=40``) is compared
    with the 1%, 5% and 10% entries of each table in ``t_boundary``; one
    message is printed per comparison, labelled "equation" 3, 2, 1 in table
    order.  Finally the critical values reported by ``adfuller`` itself are
    printed.  Returns ``None``.
    """
    # Reference critical values, one dict per "equation".
    # NOTE(review): these look like the asymptotic Dickey-Fuller tables for
    # the trend+constant / constant / no-constant specifications - confirm.
    t_boundary = [
        {
            '1%': -3.96,
            '5%': -3.41,
            '10%': -3.12
        },
        {
            '1%': -3.43,
            '5%': -2.86,
            '10%': -2.57
        },
        {
            '1%': -2.58,
            '5%': -1.95,
            '10%': -1.61
        },
    ]
    # Critical values of the t distribution (n = infinity), for reference:
    #   1%  -2.33
    #   5%  -1.65
    #   10% -1.28
    result = adfuller(ser, maxlag=40, store=False, regresults=False)
    # The quantity to test is the ADF statistic, result[0].  The original
    # code compared result[4][...] - adfuller's own critical values - with
    # the table, which made the verdict independent of the data.
    # NOTE(review): the statistic comes from adfuller's default regression
    # and is compared with all three tables - confirm this is intended.
    t_stat = result[0]

    cannot_reject = "计算式%s,%s显著水平下t>临界值%s,不能拒绝原假设,存在单位根,时间序列数据不平稳"
    # Rejecting the null means there is NO unit root; the message now says so.
    reject = "计算式%s,%s显著水平下t<临界值%s,拒绝原假设,不存在单位根,时间序列数据平稳"
    levels = (('1%', '0.01'), ('5%', '0.05'), ('10%', '0.10'))

    for i, row in enumerate(t_boundary):
        for key, alpha in levels:
            template = cannot_reject if t_stat > row[key] else reject
            # single-argument print(...) is valid in both Python 2 and 3
            print(template % (3 - i, alpha, row[key]))

    print(result[4])
Пример #8
0
    def __adf(self, residuals: array):
        '''
        Run the ADF test on ``residuals``.

        Returns a ``(test statistic, critical values)`` tuple taken from
        positions 0 and 4 of the ``adfuller`` result.  The critical values
        are a dictionary of the following form:
            {'1%': -3.4304385694773387,
             '5%': -2.8615791461685034,
             '10%': -2.566790836162312}
        '''
        outcome = adfuller(residuals)
        return outcome[0], outcome[4]
Пример #9
0
def test_for_stationarity(y):
    """Run the ADF test on ``y`` (AIC lag selection) and report it via ``cout``.

    The first four result fields and every critical value are rounded to six
    decimals for display; the verdict and description come from
    ``is_stationary``.  Returns the raw ``smt.adfuller`` result.
    """
    cout("Results of Augmented Dickey-Fuller test:")
    dftest = smt.adfuller(y, autolag='AIC')

    labels = ['test statistic', 'p-value', '# of lags', '# of observations']
    headline = [round(item, 6) for item in dftest[0:4]]
    dfoutput = pd.Series(headline, index=labels)
    for level, critical in dftest[4].items():
        dfoutput['Critical Value ({})'.format(level)] = round(critical, 6)
    cout(dfoutput)

    verdict, desc = is_stationary(dftest)
    verdict_text = "TRUE " if verdict else "false"
    cout("                 Stationary? {0}   '{1}'".format(verdict_text, desc))
    return dftest
Пример #10
0
def stationary(TS):
    """
    Print the Augmented Dickey-Fuller test results for ``TS``.

    Null Hypothesis (H0): [if p-value > 0.05, non-stationary]
    >   Failing to reject it suggests the time series has a unit root,
    >   meaning it is non-stationary (it has some time-dependent structure).
    Alternate Hypothesis (H1): [if p-value <= 0.05, stationary]
    >   The null hypothesis is rejected; it suggests the time series does not
    >   have a unit root, meaning it is stationary (no time-dependent
    >   structure).

    Prints the ADF statistic, the p-value and each critical value.
    Returns None.
    """
    adf_result = smt.adfuller(TS)
    statistic, p_value = adf_result[0], adf_result[1]
    critical_values = adf_result[4]

    print(f'[ADF Statistic] : {statistic}')
    print(f'[p-value] : {p_value}')
    for level in critical_values:
        print(f'[Critical Values {level} ] : {critical_values[level]}')
Пример #11
0
def check_each_var_for_stationarity(time_df, autolag, verbose=0):
    """Run the ADF test on every column of ``time_df``.

    Parameters
    ----------
    time_df : DataFrame whose columns are tested one by one.
    autolag : forwarded to ``smt.adfuller``.
    verbose : when >= 2, print a per-column summary.

    Returns
    -------
    int
        1 when every column has an ADF p-value below 0.05 (also when there
        are no columns), otherwise 0.
    """
    alpha = 0.05
    all_vars = 1
    for each_var in time_df.columns.tolist():
        timeseries = time_df[each_var].values
        dftest = smt.adfuller(timeseries, autolag=autolag)
        # Decide once, on the unrounded p-value, so that the printed verdict
        # and the returned flag can never disagree.
        rejects_null = dftest[1] < alpha
        if verbose >= 2:
            ############################ Print Summary #####################
            output = {
                'test_statistic': round(dftest[0], 4),
                'pvalue': round(dftest[1], 4),
                'n_lags': round(dftest[2], 4),
                'n_obs': dftest[3]
            }
            p_value = output['pvalue']  # rounded, for display only
            print(f'    Augmented Dickey-Fuller Test on "{each_var}"', "\n   ",
                  '-' * 47)
            print(' Null Hypothesis: Data has unit root. Non-Stationary.')
            print(f' Significance Level    = {alpha}')
            print(f' Test Statistic        = {output["test_statistic"]}')
            print(f' No. Lags Chosen       = {output["n_lags"]}')

            for key, val in dftest[4].items():
                print(f' Critical value {adjust(key)} = {round(val, 3)}')

            if rejects_null:
                print(f" => P-Value = {p_value}. Rejecting Null Hypothesis.")
                print(" => Series is Stationary.")
            else:
                print(
                    f" => P-Value = {p_value}. Weak evidence to reject the Null Hypothesis."
                )
                print(" => Series is Non-Stationary.")
            ####################################################################
        if not rejects_null:
            # a single non-stationary column makes the overall result 0
            all_vars = 0
    return all_vars


##################################################################################
Пример #12
0
    def run(self):
        """Fit the model on the requested date range and print a forecast.

        Reads ``self._args`` ('date_from', 'date_to', 'days_to_predict'),
        fetches data through ``self._data_service.get_data``, reindexes the
        'close' column onto a daily range, back-fills gaps, chooses the model
        order from an ADF test on the training part and fits with
        ``self._model_fit``.  The model summary and forecast are printed.

        Returns
        -------
        None when ``self._args`` is empty, otherwise the string 'object'.
        """
        if not self._args:
            return None

        data = self._data_service.get_data(self._args)

        # .bfill() replaces fillna(method='bfill'), whose `method` keyword is
        # deprecated in recent pandas releases.
        original_data = data.bfill()

        ran = pd.date_range(self._args['date_from'],
                            self._args['date_to'],
                            freq='D')
        # Reindex onto every calendar day; missing days become NaN and are
        # back-filled on the next line.
        original_data = pd.Series(original_data['close'], index=ran)
        original_data = original_data.bfill()

        horizon = int(self._args['days_to_predict'])
        split = len(original_data) - horizon
        train_data = original_data[:split]

        ADF = namedtuple('ADF', 'adf pvalue usedlag nobs critical icbest')
        stationarity_results = ADF(*smt.adfuller(train_data))._asdict()
        significance_level = 0.01

        # if the series are stationary, no need for an integrated order
        order = (1, 0, 1)
        if stationarity_results['pvalue'] > significance_level:
            order = (1, 2, 1)

        # The model is fit on the full series (not only the training part),
        # so the forecast starts after the last observed date.
        result = self._model_fit(original_data, order)
        print(result.summary())

        forecast = result.forecast(steps=horizon)[0]
        print(forecast)

        return 'object'
Пример #13
0
def ad_fuller_test(timeseries: pd.Series):
    """
    Run the Augmented Dickey-Fuller test and return its statistic and p-value.

    Documentation of the underlying function:
    https://www.statsmodels.org/stable/generated/statsmodels.tsa.stattools.adfuller.html#statsmodels.tsa.stattools.adfuller

    The test checks for a unit root in a univariate process in the presence
    of serial correlation.

    Null hypothesis:
    there is a unit root

    Alternative hypothesis:
    there is no unit root, in other words the process is stationary.

    A series with a unit root does not regress to the mean, whereas a
    stationary process does.

    Returns an ``AdFullerResult(statistic, pvalue)`` named tuple.
    """
    AdFullerResult = namedtuple('AdFullerResult', ['statistic', 'pvalue'])
    statistic, pvalue, *_ = adfuller(timeseries)
    return AdFullerResult(statistic, pvalue)
Пример #14
0
def adf_stationary_test(df: pd.DataFrame,
                        alpha: float = 0.05,
                        criterion: str = 'AIC') -> bool:
    """
    Decide whether ``df`` is stationary with the Augmented Dickey-Fuller
    (ADF) test from statsmodels, and print the test results.
    Source: https://www.insightsbot.com/augmented-dickey-fuller-test-in-python/

    Parameters
    ----------
    df : The pd.DataFrame to test for stationarity. Currently must be univariate.
    alpha : Significance level, i.e. (1 - confidence level). Default 0.05 (95%).
    criterion : Information criterion used to select the lag automatically.
                Default 'AIC' (Akaike information criterion).

    Returns
    -------
    stationary : True when the ADF p-value is below ``alpha``.

    """
    adf_test = adfuller(df, autolag=criterion)

    # Result table: the four headline numbers, then the critical values.
    headline_labels = [
        '      ADF Test Statistic', '      P-Value',
        '      # Lags Used', '      # Observations Used'
    ]
    results = pd.Series(adf_test[0:4], index=headline_labels)
    critical_values = adf_test[4]
    for level in critical_values:
        results[f'      Critical Value ({level})'] = critical_values[level]

    print("    - Augmented Dickey-Fuller Test Results:\n")
    print(results.to_string() + "\n")

    # Return a plain Python bool, as the if/else of the original did.
    return bool(adf_test[1] < alpha)
Пример #15
0
def get_cointegrated(all_pairs, training_df):
    """Return the symbol pairs whose spread passes an ADF stationarity test.

    ``all_pairs`` is an iterable of two-element pairs; ``training_df`` is
    indexed with ``str(symbol)`` to obtain each series.  For every pair the
    hedge ratio is the OLS slope of ``pair[1]`` regressed on ``pair[0]``, and
    the spread ``pair[1] - slope * pair[0]`` is tested with ``adfuller``
    (maxlag 1).

    Pairs whose symbols are missing, or whose test raises, are reported on
    stdout and skipped.

    Returns a list of ``[pair[0], pair[1]]`` entries, one for each pair whose
    ADF statistic lies below the 1%, 5% or 10% critical value.
    """
    found = []
    # (critical-value key, confidence label used in the printed message),
    # checked from the strictest level to the loosest.
    levels = (('1%', '99%'), ('5%', '95%'), ('10%', '90%'))

    for pair in all_pairs:
        try:
            x_series = training_df[str(pair[0])]
            y_series = training_df[str(pair[1])]
            # scipy's linregress signature is linregress(x, y).  The spread
            # below is y - slope * x, so y has to be regressed on x.
            slope = linregress(x_series, y_series)[0]
            spread = y_series - (slope * x_series)
            cadf = adfuller(spread, 1)
        except KeyError:
            print('Exception: Symbol not in Dataframe')
            continue
        except Exception as exc:
            # Keep the best-effort behaviour, but do not mislabel other
            # failures as a missing symbol.
            print('Exception: could not test pair {}: {}'.format(pair, exc))
            continue

        statistic, critical_values = cadf[0], cadf[4]
        for key, confidence in levels:
            if statistic < critical_values[key]:
                print(f'Pair Cointegrated at {confidence} Confidence Interval')
                # list.append takes exactly one argument.  The original
                # two-argument call at the 90% level raised TypeError, which
                # the bare except swallowed, silently dropping those pairs.
                found.append([pair[0], pair[1]])
                break
        else:
            print('Pair Not Cointegrated ')

    return found
Пример #16
0
def example_3():
	"""Walk through pandas / statsmodels time-series features.

	Downloads GS quotes with pandas_datareader and demonstrates date slicing
	and resampling.  It then builds a monthly flight-count series (flight
	data is downloaded when the local HDF5 store is missing) and models it
	with OLS and SARIMAX, plotting diagnostics and forecasts.  The
	rolling-window and "grab bag" sections are disabled with ``if False``.

	Finishes with ``plt.show()``; returns None.
	"""
	import pandas_datareader as pdr

	gs = pdr.data.DataReader("GS", data_source='yahoo', start='2006-01-01', end='2010-01-01')
	print(gs.head().round(2))
	print(gs.loc[pd.Timestamp('2006-01-01'):pd.Timestamp('2006-12-31')].head())
	print(gs.loc['2006'].head())

	#--------------------
	# Resampling.
	if True:
		print(gs.resample("5d").mean().head())
		print(gs.resample("W").agg(['mean', 'sum']).head())

		# You can up-sample to convert to a higher frequency. The new points are filled with NaNs.
		print(gs.resample("6h").mean().head())

	#--------------------
	# Rolling, expanding, exponential weighted (EW).
	if False:
		gs.Close.plot(label='Raw')
		gs.Close.rolling(28).mean().plot(label='28D MA')
		gs.Close.expanding().mean().plot(label='Expanding Average')
		gs.Close.ewm(alpha=0.03).mean().plot(label='EWMA($\\alpha=.03$)')

		plt.legend(bbox_to_anchor=(1.25, .5))
		plt.tight_layout()
		plt.ylabel("Close ($)")
		sns.despine()

		# Each of .rolling, .expanding, and .ewm return a deferred object, similar to a GroupBy.
		roll = gs.Close.rolling(30, center=True)

		m = roll.agg(['mean', 'std'])
		plt.figure()
		ax = m['mean'].plot()
		ax.fill_between(m.index, m['mean'] - m['std'], m['mean'] + m['std'], alpha=.25)
		plt.tight_layout()
		plt.ylabel("Close ($)")
		sns.despine()

	#--------------------
	# Grab bag.
	if False:
		# Offsets.
		#	These are similar to dateutil.relativedelta, but works with arrays.
		print(gs.index + pd.DateOffset(months=3, days=-2))

		# Holiday calendars.
		from pandas.tseries.holiday import USColumbusDay
		print(USColumbusDay.dates('2015-01-01', '2020-01-01'))

		# Timezones.
		# tz naive -> tz aware..... to desired UTC
		print(gs.tz_localize('US/Eastern').tz_convert('UTC').head())

	#--------------------
	# Modeling time series.
	if True:
		from collections import namedtuple
		import statsmodels.formula.api as smf
		import statsmodels.tsa.api as smt
		import statsmodels.api as sm
		from modern_pandas_utils import download_timeseries

		def download_many(start, end):
			'''Download one timeseries file per month between start and end.'''
			months = pd.period_range(start, end=end, freq='M')
			# We could easily parallelize this loop.
			for month in months:
				download_timeseries(month)

		def time_to_datetime(df, columns):
			'''
			Combine all time items into datetimes.
			2014-01-01,1149.0 -> 2014-01-01T11:49:00
			'''
			def converter(col):
				# The pattern is a regular expression: use a raw string and
				# request regex matching explicitly, because newer pandas
				# treats the pattern as a literal string by default.
				timepart = (col.astype(str)
					.str.replace(r'\.0$', '', regex=True)  # NaNs force float dtype
					.str.pad(4, fillchar='0'))
				return pd.to_datetime(df['fl_date'] + ' ' + timepart.str.slice(0, 2) + ':' + timepart.str.slice(2, 4), errors='coerce')
			df[columns] = df[columns].apply(converter)
			return df

		def unzip_one(fp):
			'''Extract the first member of the zip archive fp and return its path.'''
			try:
				zf = zipfile.ZipFile(fp)
				csv = zf.extract(zf.filelist[0])
				return csv
			except zipfile.BadZipFile as ex:
				print('zipfile.BadZipFile raised in {}: {}.'.format(fp, ex))
				raise

		def read_one(fp):
			'''Read one CSV file, lower-case its columns and parse the time columns.'''
			df = (pd.read_csv(fp, encoding='latin1')
				.rename(columns=str.lower)
				.drop('unnamed: 6', axis=1)
				.pipe(time_to_datetime, ['dep_time', 'arr_time', 'crs_arr_time', 'crs_dep_time'])
				.assign(fl_date=lambda x: pd.to_datetime(x['fl_date'])))
			return df

		store = './modern_pandas_data/ts.hdf5'

		# Build the HDF5 store on first use, otherwise just load it.
		if not os.path.exists(store):
			download_many('2000-01-01', '2016-01-01')

			zips = glob.glob(os.path.join('modern_pandas_data', 'timeseries', '*.zip'))
			csvs = [unzip_one(fp) for fp in zips]
			dfs = [read_one(fp) for fp in csvs]
			df = pd.concat(dfs, ignore_index=True)

			df['origin'] = df['origin'].astype('category')
			df.to_hdf(store, 'ts', format='table')
		else:
			df = pd.read_hdf(store, 'ts')

		with pd.option_context('display.max_rows', 100):
			print(df.dtypes)

		daily = df.fl_date.value_counts().sort_index()
		y = daily.resample('MS').mean()
		print(y.head())

		ax = y.plot()
		ax.set(ylabel='Average Monthly Flights')
		sns.despine()

		# y plus its first five lags, as columns y, L1..L5.
		X = (pd.concat([y.shift(i) for i in range(6)], axis=1, keys=['y'] + ['L%s' % i for i in range(1, 6)]).dropna())
		print(X.head())

		mod_lagged = smf.ols('y ~ trend + L1 + L2 + L3 + L4 + L5', data=X.assign(trend=np.arange(len(X))))
		res_lagged = mod_lagged.fit()
		res_lagged.summary()

		sns.heatmap(X.corr())

		ax = res_lagged.params.drop(['Intercept', 'trend']).plot.bar(rot=0)
		plt.ylabel('Coefficeint')
		sns.despine()

		# Autocorrelation.
		# 'Results.resid' is a series of residuals: y - ŷ.
		mod_trend = sm.OLS.from_formula('y ~ trend', data=y.to_frame(name='y').assign(trend=np.arange(len(y))))
		res_trend = mod_trend.fit()

		def tsplot(y, lags=None, figsize=(10, 8)):
			'''Plot a series with its ACF and PACF; return the three axes.'''
			fig = plt.figure(figsize=figsize)
			layout = (2, 2)
			ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
			acf_ax = plt.subplot2grid(layout, (1, 0))
			pacf_ax = plt.subplot2grid(layout, (1, 1))
			
			y.plot(ax=ts_ax)
			smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
			smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
			# plain loop: the calls are made only for their side effect
			for corr_ax in (acf_ax, pacf_ax):
				corr_ax.set_xlim(1.5)
			sns.despine()
			plt.tight_layout()
			return ts_ax, acf_ax, pacf_ax

		tsplot(res_trend.resid, lags=36)

		y.to_frame(name='y').assign(Δy=lambda x: x.y.diff()).plot(subplots=True)
		sns.despine()

		ADF = namedtuple("ADF", "adf pvalue usedlag nobs critical icbest")

		#ADF(*smt.adfuller(y))._asdict()
		ADF(*smt.adfuller(y.dropna()))._asdict()
		ADF(*smt.adfuller(y.diff().dropna()))._asdict()

		data = (y.to_frame(name='y').assign(Δy=lambda df: df.y.diff()).assign(LΔy=lambda df: df.Δy.shift()))
		mod_stationary = smf.ols('Δy ~ LΔy', data=data.dropna())
		res_stationary = mod_stationary.fit()

		tsplot(res_stationary.resid, lags=24)

		# Seasonality.
		#smt.seasonal_decompose(y).plot()
		# .ffill() replaces fillna(method='ffill'), whose `method` keyword is
		# deprecated in recent pandas releases.
		smt.seasonal_decompose(y.ffill()).plot()

		# ARIMA.
		mod = smt.SARIMAX(y, trend='c', order=(1, 1, 1))
		res = mod.fit()
		tsplot(res.resid[2:], lags=24)

		res.summary()

		mod_seasonal = smt.SARIMAX(y, trend='c', order=(1, 1, 2), seasonal_order=(0, 1, 2, 12), simple_differencing=False)
		res_seasonal = mod_seasonal.fit()

		res_seasonal.summary()

		tsplot(res_seasonal.resid[12:], lags=24)

		# Forecasting.
		pred = res_seasonal.get_prediction(start='2001-03-01')
		pred_ci = pred.conf_int()

		plt.figure()
		ax = y.plot(label='observed')
		pred.predicted_mean.plot(ax=ax, label='Forecast', alpha=.7)
		ax.fill_between(pred_ci.index, pred_ci.iloc[:, 0], pred_ci.iloc[:, 1], color='k', alpha=.2)
		ax.set_ylabel("Monthly Flights")
		plt.legend()
		sns.despine()

		pred_dy = res_seasonal.get_prediction(start='2002-03-01', dynamic='2013-01-01')
		pred_dy_ci = pred_dy.conf_int()

		plt.figure()
		ax = y.plot(label='observed')
		pred_dy.predicted_mean.plot(ax=ax, label='Forecast')
		ax.fill_between(pred_dy_ci.index, pred_dy_ci.iloc[:, 0], pred_dy_ci.iloc[:, 1], color='k', alpha=.25)
		ax.set_ylabel("Monthly Flights")

		# Highlight the forecast area.
		ax.fill_betweenx(ax.get_ylim(), pd.Timestamp('2013-01-01'), y.index[-1], alpha=.1, zorder=-1)
		ax.annotate('Dynamic $\\longrightarrow$', (pd.Timestamp('2013-02-01'), 550))

		plt.legend()
		sns.despine()

	plt.show()
Пример #17
0

def test_stationarity(timeseries):
    """Plot rolling statistics of ``timeseries`` and print its ADF test results.

    The rolling mean and standard deviation use a 12-observation window and
    are drawn together with the original series.  The Dickey-Fuller test is
    run with AIC lag selection and its results are printed as a table.
    Returns None.
    """
    # Rolling statistics over a 12-observation window
    rolling = timeseries.rolling(window=12, center=False)
    rolmean = rolling.mean()
    rolstd = rolling.std()

    # Plot the series and its rolling statistics
    curves = (
        (timeseries, 'blue', 'Original'),
        (rolmean, 'red', 'Rolling Mean'),
        (rolstd, 'black', 'Rolling Std'),
    )
    for values, colour, label in curves:
        plt.plot(values, color=colour, label=label)
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Dickey-Fuller test
    print('Results of Dickey-Fuller Test:')
    dftest = smt.adfuller(timeseries, autolag='AIC')
    labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    dfoutput = pd.Series(dftest[0:4], index=labels)
    critical_values = dftest[4]
    for level in critical_values:
        dfoutput['Critical Value (%s)' % level] = critical_values[level]
    print(dfoutput)
    

>>> test_stationarity(ts)  
# Variation in the standard deviation is small, but the mean is clearly increasing with time.
# p-value > 0.05, hence we fail to reject H0 (= the time series is non-stationary).
Results of Dickey-Fuller Test:
Test Statistic                    0.411488
p-value                           0.981920
#Lags Used                       14.000000
Number of Observations Used    4370.000000
Critical Value (10%)             -2.567122
Пример #18
0
# Interactive scratch work: residual diagnostics for a fitted model, followed
# by a forecast-versus-actual plot.
plt.subplot
help(plt.subplot)
ax1=plt.subplot()
# NOTE(review): the next line is an incomplete statement (nothing follows the
# attribute dot) and is a syntax error as written - intent unknown.
ax2 = ax1.
# Join observed values and predictions on their index; after the rename,
# column "y" is the observed value and "yhat" the prediction.
df_merge = pd.merge(df_log, pred_df, left_index=True, right_index=True)
df_merge.columns = ["y", "yhat"]
df_merge["resid"] = df_merge["y"] - df_merge["yhat"]
df_merge[["y", "yhat"]].plot()
# Mean squared error of the residuals, and its square root.
mse = pow(df_merge["resid"],2).sum()/df_merge["resid"].__len__()
rmse = np.sqrt(pow(df_merge["resid"],2).sum()/df_merge["resid"].__len__())

df_merge["resid"]
# Exponentiate the first two columns ("y" and "yhat") and take the residual
# of the exponentiated values.
# presumably this undoes a log transform (the source frame is named df_log) - confirm
df_merge["x"] = np.exp(df_merge.iloc[:,0])
df_merge["xhat"] = np.exp(df_merge.iloc[:,1])
df_merge["x_resid"] = df_merge["x"] - df_merge["xhat"]
# ADF test and ACF / PACF plots of the residuals.
tsa.adfuller(df_merge["resid"])
df_merge["resid"].plot()
tsa.graphics.plot_acf(df_merge["resid"])
tsa.graphics.plot_pacf(df_merge["resid"])


# Prediction: forecast over the index range of test_df with dynamic=True,
# then plot train, forecast and test together.
pred = best["model"].predict(start=test_df.index[0], end=test_df.index[-1], dynamic= True)
plt.figure(figsize=(22,10))
plt.plot(train_df.index, train_df, label="Train")
plt.plot(pred.index, pred, label="SARIMA", color="r")
plt.plot(test_df.index, test_df, label="Test", color="k")
plt.legend(loc="best", fontsize="xx-large")
plt.show()

Пример #19
0
# is the **Augmented Dickey-Fuller** test.
#
# The null hypothesis in the test
# is that the data is non-stationary,
# and therefore needs to be differenced.
#
# The alternative hypothesis is that it is stationary,
# and therefore does not need to be differenced.
#
# This test is available as `smt.adfuller` in statsmodels.

# +
from collections import namedtuple

# Named view over the 6-tuple returned by smt.adfuller.
ADF = namedtuple("ADF", ["adf", "pvalue", "usedlag", "nobs", "critical", "icbest"])
ADF._make(smt.adfuller(y))._asdict()
# -

# So here we failed to reject the null hypothesis.
# Difference it and try again.

ADF._make(smt.adfuller(y.diff().dropna()))._asdict()

# Now fit another OLS model.

# Δy is the first difference of y; LΔy is Δy lagged by one period.
data = y.to_frame(name="y")
data = data.assign(Δy=lambda df: df.y.diff())
data = data.assign(LΔy=lambda df: df.Δy.shift())
mod_stationary = smf.ols("Δy ~ LΔy", data=data.dropna())
Пример #20
0
Summaryで特筆すべきはDurbin-Watson比率です。これが2よりも十分に大きい時は負の系列相関。2より十分小さいときには正の系列相関が疑われます。経済時系列データを使った回帰分析では、系列相関が頻繁に生じますから、特に注意が必要です。

ちなみに、系列相関をはじめ、古典的な回帰モデルの診断手続きは経済企画庁[1988]が詳しいです。だいぶ昔のレポートですが、線形回帰モデルは古典的な方法ですので、その基本は変わっていません。http://www.esri.go.jp/jp/archive/bun/bun112/bun112a.pdf

誤差項に系列相関が残っている場合、トレンドも含めて、モデルに含まれていない要因が大きい影響を持っている可能性がありますので、思い当たる説明変数を加えてみたり、タイム・トレンドやラグ項を足したり、変分を取るなりして、コントロールしたほうがよいでしょう。

このような系列相関のチェックには、ADF検定によって誤差項の定常性を確認するのも有効だと思います。

"""




# ADF test on the regression residuals, H0: non-stationary.
# NOTE(review): newer statsmodels releases appear to spell the no-constant
# option 'n' instead of 'nc' - confirm against the installed version.
residuals = rlt.resid
tsa.adfuller(residuals, regression='nc')


# Autocorrelation of the residuals
autocorrelation_plot(residuals)  # show the ACF of the residuals
ACF_resid = tsa.acf(residuals)  # keep the ACF of the residuals

"""
誤差項が定常であれば、モデル内の説明変数と被説明変数との間に安定した(一時的に外れても帰ってくるような)関係があることが保証されます。また、多くの経済変数はそもそも非定常ですので、残差が定常の場合、重要な要因がモデルから脱落している可能性も低くなります。

系列相関以外に大切なのは、多重共線性(マルチコリニアリティ)のチェックでしょう。これは、説明変数の間に強い相関がある場合に生じるもので、推定される係数の符号が反転してしまったりしますので厄介です。

以下のようにVIF統計量を計算して、10を大きく上回っていなければ、ひとまず安心と考えます。また、VIFを参照して機械的に判定しなくても、想定される符号と逆の符号を持った説明変数が現れれば、経験的にマルチコに気づくと思います。もっとも、マルチコへの対処法は、強相関している説明変数のどれかを取り除くくらいしかありません。

リッジ回帰など、パラメター空間を制約するやり方は、そもそもパラメターの不偏性を犠牲にして推定値を歪める上に、必ずしもマルチコを解消できる保証がないため、計量経済学では推奨されていません。
Пример #21
0
def _adf_summary(values, maxlag, regression, autolag):
    """Run the ADF test on *values* and label its leading outputs.

    Returns a ``(dftest, dfoutput)`` pair: the raw tuple returned by
    ``smt.adfuller`` and a ``pd.Series`` holding the test statistic, p-value,
    lags used, number of observations, plus one ``Critical Value (<level>)``
    entry for every item of ``dftest[4]``.
    """
    dftest = smt.adfuller(values,
                          maxlag=maxlag,
                          regression=regression,
                          autolag=autolag)
    dfoutput = pd.Series(dftest[0:4],
                         index=[
                             'Test Statistic',
                             'p-value',
                             '#Lags Used',
                             'Number of Observations Used',
                         ],
                         name='Dickey-Fuller Augmented Test')
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    return dftest, dfoutput


def test_stationarity(time_df,
                      maxlag=31,
                      regression='c',
                      autolag='BIC',
                      window=None,
                      plot=False,
                      verbose=False,
                      var_only=False):
    """
    Check unit root stationarity of a time series array or an entire dataframe.

    Null hypothesis: the series is non-stationary.
    If p >= alpha (0.05), the series is treated as non-stationary.
    If p < alpha, the null hypothesis is rejected (series is stationary).

    Original source: http://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/
    Function: http://statsmodels.sourceforge.net/devel/generated/statsmodels.tsa.stattools.adfuller.html

    Parameters
    ----------
    time_df : array-like, pd.Series or pd.DataFrame
        Data to test. In the single-series mode a DataFrame is flattened with
        ``.values.ravel()`` before testing, so it should hold one column.
    maxlag : int
        Passed to ``smt.adfuller``. Replaced by 5 when the sample has at most
        ``int(1.5 * maxlag)`` observations.
    regression : str or None
        Passed to ``smt.adfuller``; ``None`` is replaced by ``'c'``.
    autolag : str or None
        Passed to ``smt.adfuller`` and to ``check_each_var_for_stationarity``.
    window : int or None
        Rolling window used only for plotting. Default=4.
    plot : bool
        Plot the data with its rolling mean and rolling standard deviation
        before testing.
    verbose : bool
        Print the test parameters and the result tables.
    var_only : bool
        If True, test every variable of ``time_df`` through
        ``check_each_var_for_stationarity`` (VAR-model use case).

    Returns
    -------
    int
        When ``var_only`` is True: the number of differencing steps (0, 1 or 2)
        after which the data tested as stationary; 0 is also returned when the
        data is still non-stationary after two differencing steps.
    bool
        When ``var_only`` is False: True if the series, or its first
        difference, tests as stationary; False otherwise.
    None
        When ``plot`` is True and plotting fails.
    """
    time_df = copy.deepcopy(time_df)
    if len(time_df) <= int(1.5 * maxlag):
        maxlag = 5  ## set it to a low number
    # 1-D view of the data for the single-series test. It is always bound so
    # that the plotting and testing code below can rely on it.
    if isinstance(time_df, pd.DataFrame):
        timeseries = time_df.values.ravel()
    else:
        timeseries = time_df
    if regression is None:
        regression = 'c'
    if verbose:
        print('\nRunning Augmented Dickey-Fuller test with paramters:')
        print('    maxlag: {}'.format(maxlag),
              'regression: {}'.format(regression),
              'autolag: {}'.format(autolag))
    alpha = 0.05
    if plot:
        try:
            if window is None:
                window = 4
            # `.rolling` needs a pandas object: keep pandas input as is (its
            # index is preserved) and wrap anything else in a Series.
            if isinstance(time_df, (pd.Series, pd.DataFrame)):
                plot_data = time_df
            else:
                plot_data = pd.Series(timeseries)
            # Determine rolling statistics
            rolmean = plot_data.rolling(window=window, center=False).mean()
            rolstd = plot_data.rolling(window=window, center=False).std()
            # Plot rolling statistics:
            plt.plot(plot_data, color='blue', label='Original')
            plt.plot(rolmean,
                     color='red',
                     label='Rolling Mean ({})'.format(window))
            plt.plot(rolstd,
                     color='black',
                     label='Rolling Std ({})'.format(window))
            plt.legend(loc='best')
            plt.title('Rolling Mean & Standard Deviation')
            plt.show(block=False)
        except Exception:
            # Best effort: a plotting failure aborts the call with a message
            # rather than raising.
            print('Data must have date-time as index to plot!')
            return
    # Perform Augmented Dickey-Fuller test:
    if var_only:
        ### In VAR models, check all_vars for stationarity.
        ### Test the data as given, then after one and after two differencings.
        outcomes = (
            'Data is already stationary',
            'Data is stationary after one differencing',
            'Data is stationary after two differencing',
        )
        last_round = len(outcomes) - 1
        for diff_limit, message in enumerate(outcomes):
            if check_each_var_for_stationarity(time_df, autolag, verbose):
                print(message)
                return diff_limit
            if diff_limit < last_round:
                time_df = time_df.diff(1).dropna()
        print(
            'Alert! Data is not stationary even after two differencing. Continuing...'
        )
        return 0

    ### In non-VAR models you need to test only the target variable for stationarity ##
    dftest, dfoutput = _adf_summary(timeseries, maxlag, regression, autolag)
    if verbose:
        print('Results of Augmented Dickey-Fuller Test:')
        pretty_print_table(dfoutput)
    if dftest[1] >= alpha:
        print(
            ' this series is non-stationary. Trying test again after differencing...'
        )
        timeseries = pd.Series(timeseries).diff(1).dropna().values
        dftest, dfoutput = _adf_summary(timeseries, maxlag, regression,
                                        autolag)
        if verbose:
            print(
                'After differencing=1, results of Augmented Dickey-Fuller Test:'
            )
            pretty_print_table(dfoutput)
        if dftest[1] >= alpha:
            print(' this series is not stationary')
            return False
    print(' this series is stationary')
    return True
Пример #22
0

■ 推計結果の診断: 系列相関と多重共線性

Summaryで特筆すべきはDurbin-Watson比率です。これが2よりも十分に大きい時は負の系列相関。2より十分小さいときには正の系列相関が疑われます。経済時系列データを使った回帰分析では、系列相関が頻繁に生じますから、特に注意が必要です。

ちなみに、系列相関をはじめ、古典的な回帰モデルの診断手続きは経済企画庁[1988]が詳しいです。だいぶ昔のレポートですが、線形回帰モデルは古典的な方法ですので、その基本は変わっていません。http://www.esri.go.jp/jp/archive/bun/bun112/bun112a.pdf

誤差項に系列相関が残っている場合、トレンドも含めて、モデルに含まれていない要因が大きい影響を持っている可能性がありますので、思い当たる説明変数を加えてみたり、タイム・トレンドやラグ項を足したり、変分を取るなりして、コントロールしたほうがよいでしょう。

このような系列相関のチェックには、ADF検定によって誤差項の定常性を確認するのも有効だと思います。

"""

# ADF test on the residuals of `rlt`; H0: Non-stationary
# NOTE(review): assuming `tsa` is the statsmodels time-series API, 'nc' is the
# no-constant specification; newer releases appear to spell it 'n' — confirm
# against the installed version.
tsa.adfuller(rlt.resid, regression='nc')

# Autocorrelation of the residuals: plot it, then keep the values.
autocorrelation_plot(rlt.resid)  # Show ACF of residuals
ACF_resid = tsa.acf(rlt.resid)  # Keep ACF of residuals
"""
誤差項が定常であれば、モデル内の説明変数と被説明変数との間に安定した(一時的に外れても帰ってくるような)関係があることが保証されます。また、多くの経済変数はそもそも非定常ですので、残差が定常の場合、重要な要因がモデルから脱落している可能性も低くなります。

系列相関以外に大切なのは、多重共線性(マルチコリニアリティ)のチェックでしょう。これは、説明変数の間に強い相関がある場合に生じるもので、推定される係数の符号が反転してしまったりしますので厄介です。

以下のようにVIF統計量を計算して、10を大きく上回っていなければ、ひとまず安心と考えます。また、VIFを参照して機械的に判定しなくても、想定される符号と逆の符号を持った説明変数が現れれば、経験的にマルチコに気づくと思います。もっとも、マルチコの解決策は、強く相関している説明変数のどれかを取り除くくらいしかありません。

リッジ回帰など、パラメター空間を制約するやり方は、そもそもパラメターの不偏性を犠牲にして推定値を歪める上に、必ずしもマルチコを解消できる保証もないため、計量経済学では推奨されていません。

"""