def test_variances_and_selection(self):
    """
    Check SPA's variance estimates and its column selection rule.

    The kernel-weighted variance of the demeaned loss differentials is
    rebuilt by hand and compared with ``spa._loss_diff_var``; the selection
    threshold derived from it is compared with ``spa._valid_columns``.
    A second SPA instance created with ``nested=True`` is then compared
    against a variance obtained directly from its bootstrap.
    """
    shifted_models = self.models + linspace(-2, 0.5, self.k)
    spa = SPA(self.benchmark, shifted_models, block_size=10, reps=10)
    spa.seed(23456)
    spa.compute()

    expected_vars = spa._loss_diff_var
    diffs = spa._loss_diff
    centered = spa._loss_diff - diffs.mean(0)
    nobs = diffs.shape[0]

    # NOTE(review): p looks like 1 / block_size (block_size=10) -- confirm.
    p = 1 / 10.0
    weights = np.zeros(nobs)
    for lag in range(1, nobs):
        weights[lag] = ((1.0 - (lag / nobs)) * ((1 - p) ** lag)) + (
            (lag / nobs) * ((1 - p) ** (nobs - lag)))

    # Lag-0 term first, then twice the weighted cross products of each lag.
    manual_vars = (centered ** 2).sum(0) / nobs
    for lag in range(1, nobs):
        cross = (centered[:nobs - lag, :] * centered[lag:, :]).sum(0)
        manual_vars += 2 * weights[lag] * cross / nobs
    assert_allclose(manual_vars, expected_vars)

    threshold = -1.0 * np.sqrt(
        (manual_vars / nobs) * 2 * np.log(np.log(nobs)))
    keep = diffs.mean(0) >= threshold
    assert_equal(keep, spa._valid_columns)

    # Bootstrap variances
    spa = SPA(self.benchmark, self.models, block_size=10, reps=100,
              nested=True)
    spa.seed(23456)
    spa.compute()
    spa.reset()
    bs = spa.bootstrap.clone(centered)
    expected_vars = spa._loss_diff_var
    boot_vars = nobs * bs.var(lambda x: x.mean(0), reps=100, recenter=True)
    assert_allclose(boot_vars, expected_vars)
def test_seed_reset(self):
    """
    Check that ``reset()`` clears the p-values and that the bootstrap's
    random state afterwards equals the state captured right after seeding.
    """
    seed_value = 23456
    spa = SPA(self.benchmark, self.models, reps=10)
    spa.seed(seed_value)

    # Snapshot taken before any bootstrap draws are made.
    state_after_seeding = spa.bootstrap.random_state
    assert_equal(spa.bootstrap._seed, seed_value)

    spa.compute()
    spa.reset()

    assert_equal(spa._pvalues, None)
    assert_equal(spa.bootstrap.random_state, state_after_seeding)
def test_equivalence(self):
    """
    Check that NumPy inputs and pandas inputs give the same p-values when
    the SPA is configured and seeded identically.
    """

    def seeded_pvalues(benchmark, models):
        # Build, seed and run one SPA; only the input containers differ.
        spa = SPA(benchmark, models, block_size=10, reps=100)
        spa.seed(23456)
        spa.compute()
        return spa.pvalues

    numpy_pvalues = seeded_pvalues(self.benchmark, self.models)
    pandas_pvalues = seeded_pvalues(self.benchmark_df, self.models_df)
    assert_series_equal(numpy_pvalues, pandas_pvalues)
def test_pvalues_and_critvals(self):
    """
    Check SPA p-values and critical values against a direct computation.

    The p-values are recomputed as the share of simulated maxima that are
    at least as large as the largest mean loss differential, and the 10%
    critical values as the 90th percentile of the simulated maxima; both
    are compared with what the SPA object reports.
    """
    spa = SPA(self.benchmark, self.models, reps=100)
    # Seed *before* computing: seeding after compute() (as this test used
    # to do) cannot influence the bootstrap draws, which left the test
    # running on unseeded randomness.
    spa.seed(23456)
    spa.compute()

    simulated_vals = spa._simulated_vals
    max_stats = np.max(simulated_vals, 0)
    max_loss_diff = np.max(spa._loss_diff.mean(0), 0)

    labels = ['lower', 'consistent', 'upper']
    pvalues = np.mean(max_loss_diff <= max_stats, 0)
    pvalues = pd.Series(pvalues, index=labels)
    assert_series_equal(pvalues, spa.pvalues)

    # A 10% test size corresponds to the 90th percentile of the maxima.
    crit_vals = np.percentile(max_stats, 90.0, axis=0)
    crit_vals = pd.Series(crit_vals, index=labels)
    assert_series_equal(spa.critical_values(0.10), crit_vals)
def test_variances_and_selection(self):
    """
    Validate the loss-differential variances and the selected columns.

    First the variance reported by SPA (``_loss_diff_var``) is reproduced
    from the demeaned loss differentials with explicit kernel weights, and
    the resulting selection threshold is checked against
    ``_valid_columns``.  Then a ``nested=True`` SPA is checked against a
    variance computed from a clone of its bootstrap.
    """
    adjusted = self.models + linspace(-2, 0.5, self.k)
    spa = SPA(self.benchmark, adjusted, block_size=10, reps=10)
    spa.seed(23456)
    spa.compute()

    reported = spa._loss_diff_var
    loss_diffs = spa._loss_diff
    demeaned = spa._loss_diff - loss_diffs.mean(0)
    t = loss_diffs.shape[0]

    # NOTE(review): presumably 1 / block_size -- confirm against SPA.
    p = 1 / 10.0
    direct = (demeaned ** 2).sum(0) / t
    for i in range(1, t):
        # Kernel weight applied to the lag-i cross products.
        w = ((1.0 - (i / t)) * ((1 - p) ** i)) + (
            (i / t) * ((1 - p) ** (t - i)))
        lagged = demeaned[:t - i, :] * demeaned[i:, :]
        direct += 2 * w * lagged.sum(0) / t
    assert_allclose(direct, reported)

    scaled = (direct / t) * 2 * np.log(np.log(t))
    cutoff = -1.0 * np.sqrt(scaled)
    assert_equal(loss_diffs.mean(0) >= cutoff, spa._valid_columns)

    # Bootstrap variances
    spa = SPA(self.benchmark, self.models, block_size=10, reps=100,
              nested=True)
    spa.seed(23456)
    spa.compute()
    spa.reset()
    bs = spa.bootstrap.clone(demeaned)
    reported = spa._loss_diff_var

    def column_means(x):
        return x.mean(0)

    bootstrapped = t * bs.var(column_means, reps=100, recenter=True)
    assert_allclose(bootstrapped, reported)
def exp_symbols_statistics(fout_path=os.path.join(
        DATA_DIR, 'exp_symbols_statistics.xlsx')):
    """
    Summarise the ROI series of every experiment symbol in an xlsx file.

    The pickled panel of the experiment symbols is loaded, restricted to
    2005-01-03 .. 2014-12-31, and for each symbol in EXP_SYMBOLS a set of
    descriptive, risk, normality, unit-root and SPA statistics of its
    'simple_roi' series is computed.  The resulting table (one row per
    symbol) is written to the sheet 'stats' of the workbook.

    Parameters
    ----------
    fout_path : str
        Path of the xlsx file to write.
    """
    started = time()
    panel_path = os.path.join(SYMBOLS_PKL_DIR,
                              'TAIEX_2005_largest50cap_panel.pkl')
    # shape: (n_exp_period, n_stock, ('simple_roi', 'close_price'))
    panel = pd.read_pickle(panel_path)
    assert panel.major_axis.tolist() == EXP_SYMBOLS

    first_day, last_day = date(2005, 1, 3), date(2014, 12, 31)
    panel = panel.loc[first_day:last_day]
    # the roi in the first experiment date is zero
    panel.loc[first_day, :, 'simple_roi'] = 0.

    # Row labels of the statistics table.  Their order fixes the Excel
    # column letters used for formatting at the end of this function.
    basic_stats = ('start_date', 'end_date', 'n_exp_period',
                   'n_period_up', 'n_period_down')
    roi_stats = ('cum_roi', 'daily_roi', 'daily_mean_roi', 'daily_std_roi',
                 'daily_skew_roi', 'daily_kurt_roi')
    risk_stats = ('sharpe', 'sortino_full', 'sortino_full_semi_std',
                  'sortino_partial', 'sortino_partial_semi_std',
                  'max_abs_drawdown')
    normality_stats = ('JB', 'JB_pvalue')
    unit_root_stats = (
        'ADF_c', 'ADF_c_pvalue', 'ADF_ct', 'ADF_ct_pvalue',
        'ADF_ctt', 'ADF_ctt_pvalue', 'ADF_nc', 'ADF_nc_pvalue',
        'DFGLS_c', 'DFGLS_c_pvalue', 'DFGLS_ct', 'DFGLS_ct_pvalue',
        'PP_c', 'PP_c_pvalue', 'PP_ct', 'PP_ct_pvalue',
        'PP_nc', 'PP_nc_pvalue',
        'KPSS_c', 'KPSS_c_pvalue', 'KPSS_ct', 'KPSS_ct_pvalue')
    performance_stats = ('SPA_l_pvalue', 'SPA_c_pvalue', 'SPA_u_pvalue')
    stat_indices = (basic_stats + roi_stats + risk_stats + normality_stats
                    + unit_root_stats + performance_stats)

    stat_df = pd.DataFrame(np.zeros((len(stat_indices), len(EXP_SYMBOLS))),
                           index=stat_indices,
                           columns=EXP_SYMBOLS)

    # Tests exposing `.stat` / `.pvalue`, listed in the order they are run.
    stat_pvalue_tests = (
        ('DFGLS', DFGLS, ('c', 'ct')),
        ('PP', PhillipsPerron, ('c', 'ct', 'nc')),
        ('KPSS', KPSS, ('c', 'ct')),
    )

    for position, symbol in enumerate(EXP_SYMBOLS, 1):
        symbol_started = time()
        rois = panel[:, symbol, 'simple_roi']
        n_periods = len(rois)

        # basic
        stat_df.loc['start_date', symbol] = rois.index[0].strftime(
            "%Y/%b/%d")
        stat_df.loc['end_date', symbol] = rois.index[-1].strftime(
            "%Y/%b/%d")
        stat_df.loc['n_exp_period', symbol] = n_periods
        stat_df.loc['n_period_up', symbol] = (rois > 0).sum()
        stat_df.loc['n_period_down', symbol] = (rois < 0).sum()

        # roi
        gross_return = (rois + 1.).prod()
        stat_df.loc['cum_roi', symbol] = gross_return - 1
        stat_df.loc['daily_roi', symbol] = np.power(
            gross_return, 1. / n_periods) - 1
        stat_df.loc['daily_mean_roi', symbol] = rois.mean()
        stat_df.loc['daily_std_roi', symbol] = rois.std()
        stat_df.loc['daily_skew_roi', symbol] = rois.skew()
        stat_df.loc['daily_kurt_roi', symbol] = rois.kurt()  # excess

        # roi/risk indices
        stat_df.loc['sharpe', symbol] = sharpe(rois)
        full_ratio, full_semi_std = sortino_full(rois)
        stat_df.loc['sortino_full', symbol] = full_ratio
        stat_df.loc['sortino_full_semi_std', symbol] = full_semi_std
        partial_ratio, partial_semi_std = sortino_partial(rois)
        stat_df.loc['sortino_partial', symbol] = partial_ratio
        stat_df.loc['sortino_partial_semi_std', symbol] = partial_semi_std
        stat_df.loc['max_abs_drawdown', symbol] = maximum_drawdown(rois)

        # normal tests
        jb = jarque_bera(rois)
        stat_df.loc['JB', symbol] = jb[0]
        stat_df.loc['JB_pvalue', symbol] = jb[1]

        # uniroot tests
        for trend in ('c', 'ct', 'ctt', 'nc'):
            adf = adfuller(rois, regression=trend)
            stat_df.loc['ADF_{}'.format(trend), symbol] = adf[0]
            stat_df.loc['ADF_{}_pvalue'.format(trend), symbol] = adf[1]

        for prefix, test_cls, trends in stat_pvalue_tests:
            for trend in trends:
                fitted = test_cls(rois, trend=trend)
                statistic, pvalue = fitted.stat, fitted.pvalue
                label = '{}_{}'.format(prefix, trend)
                stat_df.loc[label, symbol] = statistic
                stat_df.loc[label + '_pvalue', symbol] = pvalue

        # performance
        spa = SPA(rois, np.zeros(n_periods), reps=5000)
        spa.seed(np.random.randint(0, 2 ** 31 - 1))
        spa.compute()
        for slot, label in enumerate(performance_stats):
            stat_df.loc[label, symbol] = spa.pvalues[slot]

        print("[{}/{}] {} roi statistics OK, {:.3f} secs".format(
            position, len(EXP_SYMBOLS), symbol, time() - symbol_started))

    # write to excel
    writer = pd.ExcelWriter(fout_path, engine='xlsxwriter')
    stat_df.T.to_excel(writer, sheet_name='stats')

    # Get the xlsxwriter workbook and worksheet objects.
    workbook = writer.book
    worksheet = writer.sheets['stats']

    # set header
    header_fmt = workbook.add_format()
    header_fmt.set_text_wrap()
    worksheet.set_row(0, 15, header_fmt)

    # set date
    date_fmt = workbook.add_format({'num_format': 'yy/mmm/dd'})
    date_fmt.set_align('right')
    worksheet.set_column('B:C', 12, date_fmt)

    # set percentage
    percent_fmt = workbook.add_format({'num_format': '0.00%'})
    percent_ranges = (
        'G:J', 'M:Q', 'T:T', 'V:V', 'X:X', 'Z:Z', 'AB:AB', 'AD:AD',
        'AF:AF', 'AH:AH', 'AJ:AJ', 'AL:AL', 'AN:AN', 'AP:AP', 'AQ:AS')
    for column_range in percent_ranges:
        worksheet.set_column(column_range, 8, percent_fmt)

    writer.save()
    print("all roi statistics OK, {:.3f} secs".format(time() - started))
def exp_symbols_statistics(fout_path=os.path.join(
        DATA_DIR, 'exp_symbols_statistics.xlsx')):
    """
    Compute per-symbol ROI statistics and export them to an xlsx workbook.

    Reads the pickled panel of experiment symbols, keeps the dates from
    2005-01-03 to 2014-12-31 and, for every symbol in EXP_SYMBOLS, fills
    one column of a statistics table from its 'simple_roi' series.  The
    transposed table is saved to the sheet 'stats', with header, date and
    percentage formats applied to fixed column ranges.

    Parameters
    ----------
    fout_path : str
        Location of the xlsx file that is written.
    """
    t_begin = time()
    fin_path = os.path.join(SYMBOLS_PKL_DIR,
                            'TAIEX_2005_largest50cap_panel.pkl')
    # shape: (n_exp_period, n_stock, ('simple_roi', 'close_price'))
    panel = pd.read_pickle(fin_path)
    assert panel.major_axis.tolist() == EXP_SYMBOLS
    panel = panel.loc[date(2005, 1, 3):date(2014, 12, 31)]
    # the roi in the first experiment date is zero
    panel.loc[date(2005, 1, 3), :, 'simple_roi'] = 0.

    stat_indices = (
        # basic information
        'start_date', 'end_date', 'n_exp_period',
        'n_period_up', 'n_period_down',
        # roi
        'cum_roi', 'daily_roi', 'daily_mean_roi',
        'daily_std_roi', 'daily_skew_roi', 'daily_kurt_roi',
        # roi/risk indices
        'sharpe', 'sortino_full', 'sortino_full_semi_std',
        'sortino_partial', 'sortino_partial_semi_std',
        'max_abs_drawdown',
        # normal tests
        'JB', 'JB_pvalue',
        # uni-root tests
        'ADF_c', 'ADF_c_pvalue',
        'ADF_ct', 'ADF_ct_pvalue',
        'ADF_ctt', 'ADF_ctt_pvalue',
        'ADF_nc', 'ADF_nc_pvalue',
        'DFGLS_c', 'DFGLS_c_pvalue',
        'DFGLS_ct', 'DFGLS_ct_pvalue',
        'PP_c', 'PP_c_pvalue',
        'PP_ct', 'PP_ct_pvalue',
        'PP_nc', 'PP_nc_pvalue',
        'KPSS_c', 'KPSS_c_pvalue',
        'KPSS_ct', 'KPSS_ct_pvalue',
        # performance
        'SPA_l_pvalue', 'SPA_c_pvalue', 'SPA_u_pvalue',
    )
    table_shape = (len(stat_indices), len(EXP_SYMBOLS))
    stat_df = pd.DataFrame(np.zeros(table_shape),
                           index=stat_indices, columns=EXP_SYMBOLS)

    def store(sym, pairs):
        # Write (statistic name, value) pairs, in order, into column `sym`.
        for stat_name, value in pairs:
            stat_df.loc[stat_name, sym] = value

    # (row label, test class, trend) for the tests that expose
    # `.stat` and `.pvalue`; kept in the order the tests are executed.
    trend_tests = (
        ('DFGLS_c', DFGLS, 'c'),
        ('DFGLS_ct', DFGLS, 'ct'),
        ('PP_c', PhillipsPerron, 'c'),
        ('PP_ct', PhillipsPerron, 'ct'),
        ('PP_nc', PhillipsPerron, 'nc'),
        ('KPSS_c', KPSS, 'c'),
        ('KPSS_ct', KPSS, 'ct'),
    )

    for rdx, symbol in enumerate(EXP_SYMBOLS):
        t_symbol = time()
        rois = panel[:, symbol, 'simple_roi']

        # basic
        store(symbol, [
            ('start_date', rois.index[0].strftime("%Y/%b/%d")),
            ('end_date', rois.index[-1].strftime("%Y/%b/%d")),
            ('n_exp_period', len(rois)),
            ('n_period_up', (rois > 0).sum()),
            ('n_period_down', (rois < 0).sum()),
        ])

        # roi
        store(symbol, [
            ('cum_roi', (rois + 1.).prod() - 1),
            ('daily_roi',
             np.power((rois + 1.).prod(), 1. / len(rois)) - 1),
            ('daily_mean_roi', rois.mean()),
            ('daily_std_roi', rois.std()),
            ('daily_skew_roi', rois.skew()),
            ('daily_kurt_roi', rois.kurt()),  # excess
        ])

        # roi/risk indices
        store(symbol, [('sharpe', sharpe(rois))])
        sortino_f, semi_std_f = sortino_full(rois)
        store(symbol, [('sortino_full', sortino_f),
                       ('sortino_full_semi_std', semi_std_f)])
        sortino_p, semi_std_p = sortino_partial(rois)
        store(symbol, [('sortino_partial', sortino_p),
                       ('sortino_partial_semi_std', semi_std_p)])
        store(symbol, [('max_abs_drawdown', maximum_drawdown(rois))])

        # normal tests
        jb = jarque_bera(rois)
        store(symbol, [('JB', jb[0]), ('JB_pvalue', jb[1])])

        # uniroot tests
        for regression in ('c', 'ct', 'ctt', 'nc'):
            adf = adfuller(rois, regression=regression)
            adf_label = 'ADF_' + regression
            store(symbol, [(adf_label, adf[0]),
                           (adf_label + '_pvalue', adf[1])])

        for label, test_cls, trend in trend_tests:
            instance = test_cls(rois, trend=trend)
            stat_value, p_value = (instance.stat, instance.pvalue)
            store(symbol, [(label, stat_value),
                           (label + '_pvalue', p_value)])

        # performance
        spa = SPA(rois, np.zeros(len(rois)), reps=5000)
        spa.seed(np.random.randint(0, 2**31 - 1))
        spa.compute()
        stat_df.loc['SPA_l_pvalue', symbol] = spa.pvalues[0]
        stat_df.loc['SPA_c_pvalue', symbol] = spa.pvalues[1]
        stat_df.loc['SPA_u_pvalue', symbol] = spa.pvalues[2]

        elapsed = time() - t_symbol
        print("[{}/{}] {} roi statistics OK, {:.3f} secs".format(
            rdx + 1, len(EXP_SYMBOLS), symbol, elapsed))

    # write to excel
    writer = pd.ExcelWriter(fout_path, engine='xlsxwriter')
    by_symbol = stat_df.T
    by_symbol.to_excel(writer, sheet_name='stats')

    # Get the xlsxwriter workbook and worksheet objects.
    workbook = writer.book
    worksheet = writer.sheets['stats']

    # set header
    header_fmt = workbook.add_format()
    header_fmt.set_text_wrap()
    worksheet.set_row(0, 15, header_fmt)

    # set date
    date_fmt = workbook.add_format({'num_format': 'yy/mmm/dd'})
    date_fmt.set_align('right')
    worksheet.set_column('B:C', 12, date_fmt)

    # set percentage
    percent_fmt = workbook.add_format({'num_format': '0.00%'})
    # Ranges first, then the single p-value columns T, V, ..., AP.
    for cols in ('G:J', 'M:Q'):
        worksheet.set_column(cols, 8, percent_fmt)
    for col in ('T', 'V', 'X', 'Z', 'AB', 'AD', 'AF', 'AH',
                'AJ', 'AL', 'AN', 'AP'):
        worksheet.set_column('{0}:{0}'.format(col), 8, percent_fmt)
    worksheet.set_column('AQ:AS', 8, percent_fmt)

    writer.save()
    print("all roi statistics OK, {:.3f} secs".format(time() - t_begin))