# Exemplo n.º 1
# 0
    def test_encoding_schemes(self):
        """
        Check that quantile- and sigma-based encodings produce dictionaries
        of the expected size with known codewords, and that an over-fine
        sigma step is rejected.
        """
        sample = np.arange(0, 1000, 1)
        step_size = 20

        encoded_quantiles = quantile_mapping(sample, num_letters=10)
        encoded_sigmas = sigma_mapping(sample, step=step_size)

        # One letter per quantile bucket, with a codeword spot-check
        self.assertEqual(10, len(encoded_quantiles))
        self.assertEqual('\x02', encoded_quantiles[229.77])

        # Sigma encoding spans the value range in fixed-size steps
        expected_buckets = np.ceil((max(sample) - min(sample)) / step_size)
        self.assertEqual(expected_buckets, len(encoded_sigmas))
        self.assertEqual('\x05', encoded_sigmas[100])

        # A step of 1 needs more letters than the ASCII table can supply
        with self.assertRaises(ValueError):
            sigma_mapping(sample, step=1)
    def test_inter_bar_feature_values(self):
        """
        Test entropy, misc, inter-bar feature generation by spot-checking
        the max, mean, and fourth value of each generated feature column.
        """
        # Build symbolic encodings (volumes and pct changes) from raw ticks
        df_trades = pd.read_csv(self.trades_path, parse_dates=[0])
        df_trades['log_ret'] = np.log(df_trades.Price /
                                      df_trades.Price.shift(1)).dropna()
        unique_volumes = df_trades.Volume.drop_duplicates()
        non_null_log_ret = df_trades[df_trades.log_ret != 0].log_ret.dropna()

        volume_mapping = quantile_mapping(unique_volumes, num_letters=10)
        returns_mapping = quantile_mapping(non_null_log_ret, num_letters=10)

        # Compress raw ticks into volume bars
        compressed_bars = get_volume_bars(self.trades_path, threshold=20,
                                          verbose=False)
        compressed_bars.set_index('date_time', inplace=True)
        compressed_bars.index = pd.to_datetime(compressed_bars.index)

        gen = MicrostructuralFeaturesGenerator(self.trades_path,
                                               compressed_bars.tick_num,
                                               volume_encoding=volume_mapping,
                                               pct_encoding=returns_mapping)

        features = gen.get_features(to_csv=False, verbose=False)
        features.set_index('date_time', inplace=True)

        # (column, expected max, mean, value at index 3,
        #  and the tolerance for each of those three checks)
        expected_values = [
            ('avg_tick_size',
             8.0, 3.1931, 1.6153, 1e-1, 1e-4, 1e-3),
            ('tick_rule_sum',
             7.0, -3.4, -11.0, 1e-1, 1e-4, 1e-3),
            ('vwap',
             1311.663, 1304.94542, 1304.5119, 1e-1, 1e-4, 1e-3),
            ('kyle_lambda',
             197.958, 23.13859, 0.007936, 1e-1, 1e-4, 1e-3),
            ('amihud_lambda',
             8.291e-5, 1.001e-5, 4.663786e-9, 1e-7, 1e-8, 1e-11),
            ('hasbrouck_lambda',
             0.0025621, 0.00018253, 2.42e-11, 1e-5, 1e-5, 1e-13),
            ('tick_rule_entropy_shannon',
             1.52192, 0.499, 0.39124, 1e-4, 1e-4, 1e-4),
            ('volume_entropy_plug_in',
             1.92192, 1.052201, 0.41381, 1e-4, 1e-5, 1e-4),
            ('volume_entropy_lempel_ziv',
             1.0, 0.5904612, 0.46153, 1e-4, 1e-4, 1e-4),
            ('pct_entropy_lempel_ziv',
             0.8, 0.56194, 0.46153, 1e-4, 1e-5, 1e-5),
            ('pct_entropy_konto',
             1.361, 0.83039791, 1.067022, 1e-4, 1e-5, 1e-5),
        ]

        for (column, exp_max, exp_mean, exp_third,
             delta_max, delta_mean, delta_third) in expected_values:
            series = features[column]
            self.assertAlmostEqual(series.max(), exp_max, delta=delta_max)
            self.assertAlmostEqual(series.mean(), exp_mean, delta=delta_mean)
            self.assertAlmostEqual(series[3], exp_third, delta=delta_third)
    def test_feature_generator_function(self):
        """
        Test validity of MicrostructuralFeaturesGenerator: None input is
        rejected, batch size and input form (path vs DataFrame) do not
        change the output, entropy columns are absent without encodings,
        and the CSV round trip reproduces the in-memory features.
        """
        # Build symbolic encodings (volumes and pct changes) from raw ticks
        df_trades = pd.read_csv(self.trades_path, parse_dates=[0])
        df_trades['log_ret'] = np.log(df_trades.Price /
                                      df_trades.Price.shift(1)).dropna()
        nonzero_returns = df_trades[df_trades.log_ret != 0].log_ret.dropna()

        # Take unique volumes only
        volume_map = quantile_mapping(df_trades.Volume.drop_duplicates(),
                                      num_letters=10)
        returns_map = quantile_mapping(nonzero_returns, num_letters=10)

        # Compress raw ticks into volume bars
        bars = get_volume_bars(self.trades_path, threshold=20, verbose=False)
        bars.set_index('date_time', inplace=True)
        bars.index = pd.to_datetime(bars.index)
        bar_index = bars.index

        # A None data source must raise a ValueError up front
        with self.assertRaises(ValueError):
            MicrostructuralFeaturesGenerator(None,
                                             bars.tick_num,
                                             volume_encoding=volume_map,
                                             pct_encoding=returns_map)

        make = MicrostructuralFeaturesGenerator
        generator = make(self.trades_path, bars.tick_num,
                         volume_encoding=volume_map,
                         pct_encoding=returns_map)
        generator_no_entropy = make(self.trades_path, bars.tick_num,
                                    volume_encoding=None,
                                    pct_encoding=None)
        generator_csv = make(self.trades_path, bars.tick_num,
                             volume_encoding=volume_map,
                             pct_encoding=returns_map)
        generator_batch_1 = make(self.trades_path, bars.tick_num,
                                 volume_encoding=volume_map,
                                 pct_encoding=returns_map,
                                 batch_size=1)
        generator_batch_20 = make(self.trades_path, bars.tick_num,
                                  volume_encoding=volume_map,
                                  pct_encoding=returns_map,
                                  batch_size=20)
        generator_from_df = make(df_trades, bars.tick_num,
                                 volume_encoding=volume_map,
                                 pct_encoding=returns_map,
                                 batch_size=20)

        features = generator.get_features(to_csv=False, verbose=False)
        features_batch_1 = generator_batch_1.get_features(to_csv=False,
                                                          verbose=False)
        features_batch_20 = generator_batch_20.get_features(to_csv=False,
                                                            verbose=False)
        features_from_df = generator_from_df.get_features(to_csv=False,
                                                          verbose=False)
        features_no_entropy = generator_no_entropy.get_features(verbose=False)

        # Volume/pct entropy columns must be absent without encodings
        with self.assertRaises(KeyError):
            features['tick_rule_entropy'] += features_no_entropy[
                'volume_entropy_plug_in']

        with self.assertRaises(KeyError):
            features['tick_rule_entropy'] = features_no_entropy[
                'pct_entropy_plug_in']

        generator_csv.get_features(to_csv=True, output_path='features.csv')
        features_from_csv = pd.read_csv('features.csv', parse_dates=[0])

        # Batch size and input form must not change the generated features
        for alternative in (features_batch_1,
                            features_batch_20,
                            features_from_df):
            self.assertTrue(
                (features.dropna().values ==
                 alternative.dropna().values).all())

        features.set_index('date_time', inplace=True)
        features_from_csv.set_index('date_time', inplace=True)

        # CSV round trip should reproduce the in-memory result exactly
        self.assertAlmostEqual((features - features_from_csv).sum().sum(),
                               0,
                               delta=1e-6)

        # One feature row per compressed bar, indexed identically
        self.assertEqual(bar_index.shape[0], features.shape[0])
        self.assertEqual(bars.loc[features.index].shape[0], bars.shape[0])

        os.remove('features.csv')