def test_inter_bar_feature_values(self):
    """
    Test entropy, misc, inter-bar feature generation.

    Builds volume/returns quantile encodings from the raw trades file,
    compresses ticks into volume bars, generates microstructural features
    and checks individual feature values (avg tick size, tick rule sum,
    VWAP, Kyle/Amihud/Hasbrouck lambdas, entropy estimates) against
    known reference values.
    """
    # Encode volumes and pct changes
    df_trades = pd.read_csv(self.trades_path, parse_dates=[0])
    # NOTE: a trailing .dropna() here was a no-op — column assignment
    # re-aligns on the index, reintroducing the NaN first row — so it
    # has been removed.
    df_trades['log_ret'] = np.log(df_trades.Price / df_trades.Price.shift(1))
    unique_volumes = df_trades.Volume.drop_duplicates()
    non_null_log_ret = df_trades[df_trades.log_ret != 0].log_ret.dropna()

    volume_mapping = quantile_mapping(unique_volumes, num_letters=10)
    returns_mapping = quantile_mapping(non_null_log_ret, num_letters=10)

    # Compress bars from ticks
    compressed_bars = get_volume_bars(self.trades_path, threshold=20, verbose=False)
    compressed_bars.set_index('date_time', inplace=True)
    compressed_bars.index = pd.to_datetime(compressed_bars.index)

    gen = MicrostructuralFeaturesGenerator(self.trades_path, compressed_bars.tick_num,
                                           volume_encoding=volume_mapping,
                                           pct_encoding=returns_mapping)
    features = gen.get_features(to_csv=False, verbose=False)
    features.set_index('date_time', inplace=True)

    # Check individual feature values.
    # `features` now has a DatetimeIndex, so positional access must go
    # through .iloc — integer-key fallback (features.col[3]) was
    # deprecated and removed in pandas 2.x.

    # Avg tick size
    self.assertAlmostEqual(features.avg_tick_size.max(), 8.0, delta=1e-1)
    self.assertAlmostEqual(features.avg_tick_size.mean(), 3.1931, delta=1e-4)
    self.assertAlmostEqual(features.avg_tick_size.iloc[3], 1.6153, delta=1e-3)

    # Tick rule sum
    self.assertAlmostEqual(features.tick_rule_sum.max(), 7.0, delta=1e-1)
    self.assertAlmostEqual(features.tick_rule_sum.mean(), -3.4, delta=1e-4)
    self.assertAlmostEqual(features.tick_rule_sum.iloc[3], -11.0, delta=1e-3)

    # VWAP
    self.assertAlmostEqual(features.vwap.max(), 1311.663, delta=1e-1)
    self.assertAlmostEqual(features.vwap.mean(), 1304.94542, delta=1e-4)
    self.assertAlmostEqual(features.vwap.iloc[3], 1304.5119, delta=1e-3)

    # Kyle lambda
    self.assertAlmostEqual(features.kyle_lambda.max(), 197.958, delta=1e-1)
    self.assertAlmostEqual(features.kyle_lambda.mean(), 23.13859, delta=1e-4)
    self.assertAlmostEqual(features.kyle_lambda.iloc[3], 0.007936, delta=1e-3)

    # Amihud lambda
    self.assertAlmostEqual(features.amihud_lambda.max(), 8.291e-5, delta=1e-7)
    self.assertAlmostEqual(features.amihud_lambda.mean(), 1.001e-5, delta=1e-8)
    self.assertAlmostEqual(features.amihud_lambda.iloc[3], 4.663786e-9, delta=1e-11)

    # Hasbrouck lambda
    self.assertAlmostEqual(features.hasbrouck_lambda.max(), 0.0025621, delta=1e-5)
    self.assertAlmostEqual(features.hasbrouck_lambda.mean(), 0.00018253, delta=1e-5)
    self.assertAlmostEqual(features.hasbrouck_lambda.iloc[3], 2.42e-11, delta=1e-13)

    # Tick rule entropy shannon
    self.assertAlmostEqual(features.tick_rule_entropy_shannon.max(), 1.52192, delta=1e-4)
    self.assertAlmostEqual(features.tick_rule_entropy_shannon.mean(), 0.499, delta=1e-4)
    self.assertAlmostEqual(features.tick_rule_entropy_shannon.iloc[3], 0.39124, delta=1e-4)

    # Volume entropy plug-in
    self.assertAlmostEqual(features.volume_entropy_plug_in.max(), 1.92192, delta=1e-4)
    self.assertAlmostEqual(features.volume_entropy_plug_in.mean(), 1.052201, delta=1e-5)
    self.assertAlmostEqual(features.volume_entropy_plug_in.iloc[3], 0.41381, delta=1e-4)

    # Volume entropy Lempel-Ziv
    self.assertAlmostEqual(features.volume_entropy_lempel_ziv.max(), 1.0, delta=1e-4)
    self.assertAlmostEqual(features.volume_entropy_lempel_ziv.mean(), 0.5904612, delta=1e-4)
    self.assertAlmostEqual(features.volume_entropy_lempel_ziv.iloc[3], 0.46153, delta=1e-4)

    # Pct entropy Lempel-Ziv
    self.assertAlmostEqual(features.pct_entropy_lempel_ziv.max(), 0.8, delta=1e-4)
    self.assertAlmostEqual(features.pct_entropy_lempel_ziv.mean(), 0.56194, delta=1e-5)
    self.assertAlmostEqual(features.pct_entropy_lempel_ziv.iloc[3], 0.46153, delta=1e-5)

    # Pct entropy Konto
    self.assertAlmostEqual(features.pct_entropy_konto.max(), 1.361, delta=1e-4)
    self.assertAlmostEqual(features.pct_entropy_konto.mean(), 0.83039791, delta=1e-5)
    self.assertAlmostEqual(features.pct_entropy_konto.iloc[3], 1.067022, delta=1e-5)
# ['low', 'open', 'close', 'period_volume', 'number_trades'], axis=1) #data.columns = ['date_time', 'price', 'total_volume'] # CREATING STANDARD BARS # Create dollar bar print('Creating Dollar Bars') dollar_bar = ds.get_dollar_bars(data, threshold=28000, batch_size=10000000, verbose=True) # Create volume bar print('Creating Volume Bars') volume_bar = ds.get_volume_bars(data, threshold=28000, batch_size=1000000, verbose=True) # Create tick bar print('Creating Tick Bars') tick_bar = ds.get_tick_bars(data, threshold=28000, batch_size=1000000, verbose=True) # CREATING EMA IMBALANCE BARS # Create EMA Dollar Imbalance bar print('Creating EMA Dollar Imbalance Bar') ema_dollar_imbalance_bar = ds.get_ema_dollar_imbalance_bars( data,
def test_feature_generator_function(self):
    """
    Test validity of MicrostructuralFeaturesGenerator.

    Checks that a None input raises ValueError, that omitting encodings
    drops the volume/pct entropy columns, that batch size and DataFrame
    input do not affect results, and that CSV output round-trips to the
    same feature values.
    """
    # Encode volumes and pct changes
    trades = pd.read_csv(self.trades_path, parse_dates=[0])
    trades['log_ret'] = np.log(trades.Price / trades.Price.shift(1)).dropna()
    nonzero_log_ret = trades[trades.log_ret != 0].log_ret.dropna()

    # Take unique volumes only
    vol_map = quantile_mapping(trades.Volume.drop_duplicates(), num_letters=10)
    ret_map = quantile_mapping(nonzero_log_ret, num_letters=10)

    # Compress bars from ticks
    bars = get_volume_bars(self.trades_path, threshold=20, verbose=False)
    bars.set_index('date_time', inplace=True)
    bars.index = pd.to_datetime(bars.index)
    bar_index = bars.index

    # A None trades input must raise a ValueError
    with self.assertRaises(ValueError):
        MicrostructuralFeaturesGenerator(None, bars.tick_num,
                                         volume_encoding=vol_map,
                                         pct_encoding=ret_map)

    make = MicrostructuralFeaturesGenerator
    gen = make(self.trades_path, bars.tick_num,
               volume_encoding=vol_map, pct_encoding=ret_map)
    gen_no_entropy = make(self.trades_path, bars.tick_num,
                          volume_encoding=None, pct_encoding=None)
    gen_csv = make(self.trades_path, bars.tick_num,
                   volume_encoding=vol_map, pct_encoding=ret_map)
    gen_1 = make(self.trades_path, bars.tick_num,
                 volume_encoding=vol_map, pct_encoding=ret_map, batch_size=1)
    gen_20 = make(self.trades_path, bars.tick_num,
                  volume_encoding=vol_map, pct_encoding=ret_map, batch_size=20)
    gen_df = make(trades, bars.tick_num,
                  volume_encoding=vol_map, pct_encoding=ret_map, batch_size=20)

    features = gen.get_features(to_csv=False, verbose=False)
    features_1 = gen_1.get_features(to_csv=False, verbose=False)
    features_20 = gen_20.get_features(to_csv=False, verbose=False)
    features_from_df = gen_df.get_features(to_csv=False, verbose=False)
    features_no_entropy = gen_no_entropy.get_features(verbose=False)

    # Without encodings there must be no volume/pct entropy columns
    with self.assertRaises(KeyError):
        features['tick_rule_entropy'] += features_no_entropy['volume_entropy_plug_in']
    with self.assertRaises(KeyError):
        features['tick_rule_entropy'] = features_no_entropy['pct_entropy_plug_in']

    gen_csv.get_features(to_csv=True, output_path='features.csv')
    features_from_csv = pd.read_csv('features.csv', parse_dates=[0])

    # Batch size and DataFrame input must not change the results
    self.assertTrue((features.dropna().values == features_1.dropna().values).all())
    self.assertTrue((features.dropna().values == features_20.dropna().values).all())
    self.assertTrue((features.dropna().values == features_from_df.dropna().values).all())

    # CSV round-trip must reproduce the in-memory features
    features.set_index('date_time', inplace=True)
    features_from_csv.set_index('date_time', inplace=True)
    self.assertAlmostEqual((features - features_from_csv).sum().sum(), 0, delta=1e-6)

    # One feature row per compressed bar, indices aligned
    self.assertEqual(bar_index.shape[0], features.shape[0])
    self.assertEqual(bars.loc[features.index].shape[0], bars.shape[0])

    os.remove('features.csv')