Пример #1
0
    def test_value_error_raise(self):
        """
        Check that get_ind_matrix raises ValueError when it is given the raw label end times
        (``self.meta_labeled_events.t1``) together with ``self.data``.
        """

        # Callable form of assertRaises: the call is made by unittest and must raise ValueError
        self.assertRaises(ValueError, get_ind_matrix,
                          self.meta_labeled_events.t1, self.data)
Пример #2
0
 def test_value_error_raise(self):
     """
     Check that get_ind_matrix raises ValueError when a label end time is missing
     """
     # Work on a copy so the shared fixture is not polluted for other tests
     corrupted_info_sets = self.samples_info_sets.copy()
     # Inject a missing end time at a fixed timestamp
     corrupted_info_sets.loc[pd.Timestamp(2019, 1, 1), 't1'] = None
     self.assertRaises(ValueError, get_ind_matrix,
                       corrupted_info_sets.t1, self.price_bars)
Пример #3
0
    def __init__(self,
                 samples_info_sets,
                 price_bars,
                 base_estimator=None,
                 n_estimators=10,
                 max_samples=1.0,
                 max_features=1.0,
                 bootstrap_features=False,
                 oob_score=False,
                 warm_start=False,
                 n_jobs=None,
                 random_state=None,
                 verbose=0):
        """
        Initialise the parent ensemble and precompute the indicator matrix.

        All bagging hyper-parameters are forwarded unchanged to the parent constructor,
        with ``bootstrap`` always set to True.

        :param samples_info_sets: Passed to ``get_ind_matrix``; its index is also used as the
            index of ``timestamp_int_index_mapping``.
        :param price_bars: Passed to ``get_ind_matrix`` together with ``samples_info_sets``.
        :param base_estimator: Forwarded to the parent constructor.
        :param n_estimators: Forwarded to the parent constructor.
        :param max_samples: Forwarded to the parent constructor.
        :param max_features: Forwarded to the parent constructor.
        :param bootstrap_features: Forwarded to the parent constructor.
        :param oob_score: Forwarded to the parent constructor.
        :param warm_start: Forwarded to the parent constructor.
        :param n_jobs: Forwarded to the parent constructor.
        :param random_state: Forwarded to the parent constructor.
        :param verbose: Forwarded to the parent constructor.
        """
        parent_params = dict(base_estimator=base_estimator,
                             n_estimators=n_estimators,
                             bootstrap=True,
                             max_samples=max_samples,
                             max_features=max_features,
                             bootstrap_features=bootstrap_features,
                             oob_score=oob_score,
                             warm_start=warm_start,
                             n_jobs=n_jobs,
                             random_state=random_state,
                             verbose=verbose)
        super().__init__(**parent_params)

        self.samples_info_sets = samples_info_sets
        self.price_bars = price_bars
        self.ind_mat = get_ind_matrix(samples_info_sets, price_bars)

        # Maps each sample timestamp to its integer column position in the indicator matrix;
        # used to take an indicator-matrix subsample during cross-validation
        n_columns = self.ind_mat.shape[1]
        self.timestamp_int_index_mapping = pd.Series(
            data=range(n_columns), index=samples_info_sets.index)

        # pylint: disable=invalid-name
        self.X_time_index = None  # Timestamp index of X_train
Пример #4
0
    def test_seq_bootstrap(self):
        """
        Test the indicator matrix (shape, selected values, agreement with the book implementation),
        the length of sequentially bootstrapped samples, and that sequential bootstrapping gives
        at least the average uniqueness of standard bootstrapping in a Monte-Carlo run.
        """

        non_nan_meta_labels = self.meta_labeled_events.dropna()
        ind_mat = get_ind_matrix(non_nan_meta_labels, self.data)

        label_endtime = non_nan_meta_labels.t1
        # Price bars restricted to the span covered by the labels
        in_label_span = ((self.data.index >= non_nan_meta_labels.index.min()) &
                         (self.data.index <= label_endtime.max()))
        trimmed_price_bars_index = self.data[in_label_span].index
        # Index for the reference indicator matrix: label starts, label ends and trimmed price bars,
        # de-duplicated and sorted
        bar_index = sorted(set(non_nan_meta_labels.index) |
                           set(label_endtime) |
                           set(trimmed_price_bars_index))
        ind_mat_book_implementation = book_ind_mat_implementation(bar_index, label_endtime)

        # np.array_equal also compares shapes, so a size mismatch fails cleanly instead of broadcasting
        self.assertTrue(np.array_equal(ind_mat_book_implementation.values, ind_mat))
        # Indicator matrix shape should be (unique(meta_label_index+t1+price_bars_index), t1)
        self.assertEqual(ind_mat.shape, (782, 7))

        # Check indicator matrix values for specific labels
        self.assertTrue(np.array_equal(ind_mat[:100, 0], np.ones(100)))
        self.assertTrue(np.array_equal(ind_mat[191:340, 2], np.ones(149)))
        self.assertTrue(np.array_equal(ind_mat[341:420, 2], np.zeros(79)))
        self.assertTrue(np.array_equal(ind_mat[406:412, 4], np.ones(6)))
        self.assertTrue(np.array_equal(ind_mat[662:, 6], np.ones(120)))

        bootstrapped_samples = seq_bootstrap(ind_mat, compare=False, verbose=True, warmup_samples=None)
        bootstrapped_samples_100 = seq_bootstrap(ind_mat, compare=True, sample_length=100)
        # Without sample_length the number of draws is expected to equal the number of labels
        self.assertEqual(len(bootstrapped_samples), non_nan_meta_labels.shape[0])
        self.assertEqual(len(bootstrapped_samples_100), 100)

        # Test sequential bootstrapping on example from a book.
        # Built directly as an int64 array: filling an empty (object-dtype) DataFrame column by column
        # keeps object dtype on pandas >= 2.0, which would leak an object array into the functions below.
        ind_mat = np.array([[1, 0, 0],
                            [1, 0, 0],
                            [1, 1, 0],
                            [0, 1, 0],
                            [0, 0, 1],
                            [0, 0, 1]], dtype=np.int64)

        seq_bootstrap(ind_mat, sample_length=3, verbose=True, warmup_samples=[1])  # Show printed probabilities

        # Perform Monte-Carlo test
        n_trials = 1000
        standard_unq_array = np.full(n_trials, np.nan)
        seq_unq_array = np.full(n_trials, np.nan)
        for i in range(n_trials):
            bootstrapped_samples = seq_bootstrap(ind_mat, sample_length=3)
            random_samples = np.random.choice(ind_mat.shape[1], size=3)

            standard_unq_array[i] = get_ind_mat_average_uniqueness(ind_mat[:, random_samples])
            seq_unq_array[i] = get_ind_mat_average_uniqueness(ind_mat[:, bootstrapped_samples])

        self.assertGreaterEqual(np.mean(seq_unq_array), np.mean(standard_unq_array))
        self.assertGreaterEqual(np.median(seq_unq_array), np.median(standard_unq_array))
Пример #5
0
    def setUp(self):
        """
        Load the sample dollar bars, build triple barrier meta-labelled events and generate the
        synthetic feature / label train-test split used by the tests.

        Sets ``self.path``, ``self.data``, ``self.X_train``, ``self.X_test``, ``self.y_train_clf``,
        ``self.y_test_clf``, ``self.y_train_reg``, ``self.y_test_reg``, ``self.samples_info_sets``
        and ``self.price_bars_trim``.
        """
        project_path = os.path.dirname(__file__)
        # os.path.join keeps the path relative when dirname(__file__) is empty; plain string
        # concatenation would yield the root-absolute '/test_data/...'
        self.path = os.path.join(project_path, 'test_data', 'dollar_bar_sample.csv')
        self.data = pd.read_csv(self.path, index_col='date_time')
        self.data.index = pd.to_datetime(self.data.index)

        # Compute moving averages
        self.data['fast_mavg'] = self.data['close'].rolling(
            window=20, min_periods=20, center=False).mean()
        self.data['slow_mavg'] = self.data['close'].rolling(
            window=50, min_periods=50, center=False).mean()

        # Compute sides: rows where either average is still NaN match neither mask and stay NaN
        self.data['side'] = np.nan

        long_signals = self.data['fast_mavg'] >= self.data['slow_mavg']
        short_signals = self.data['fast_mavg'] < self.data['slow_mavg']
        self.data.loc[long_signals, 'side'] = 1
        self.data.loc[short_signals, 'side'] = -1

        # Remove Look ahead bias by lagging the signal
        self.data['side'] = self.data['side'].shift(1)

        daily_vol = get_daily_vol(close=self.data['close'], lookback=50) * 0.5
        cusum_events = cusum_filter(self.data['close'], threshold=0.005)
        vertical_barriers = add_vertical_barrier(t_events=cusum_events,
                                                 close=self.data['close'],
                                                 num_hours=2)
        meta_labeled_events = get_events(
            close=self.data['close'],
            t_events=cusum_events,
            pt_sl=[1, 4],
            target=daily_vol,
            min_ret=5e-5,
            num_threads=3,
            vertical_barrier_times=vertical_barriers,
            side_prediction=self.data['side'])
        meta_labeled_events.dropna(inplace=True)
        labels = get_bins(meta_labeled_events, self.data['close'])

        # Generate data set which shows the power of SB Bagging vs Standard Bagging
        ind_mat = get_ind_matrix(meta_labeled_events.t1, self.data.close)

        unique_samples = _get_synthetic_samples(ind_mat, 0.5, 0.1)

        # Get synthetic data set with drawn samples
        X = self.data.loc[labels.index, :].iloc[unique_samples].dropna()
        labels = labels.loc[X.index, :]
        X.loc[labels.index, 'y'] = labels.bin

        # Generate features (some of them are informative, others are just noise).
        # Series.items() replaces iteritems(), which was removed in pandas 2.0.
        # The probability order is kept so helper calls happen in the same sequence as before.
        label_probs = (0.6, 0.5, 0.3, 0.2, 0.1)
        for index, value in X.y.items():
            for prob in label_probs:
                X.loc[index, 'label_prob_{}'.format(prob)] = _generate_label_with_prob(value, prob)

        # Raw generated columns that are used directly as features
        features = ['label_prob_0.6', 'label_prob_0.2', 'label_prob_0.1']
        # Add short moving averages of the generated columns as further features
        for prob in [0.5, 0.3, 0.2, 0.1]:
            for window in [2, 5]:
                sma_name = 'label_prob_{}_sma_{}'.format(prob, window)
                X[sma_name] = X['label_prob_{}'.format(prob)].rolling(window=window).mean()
                features.append(sma_name)
        # Rolling means leave NaN in the first rows of each window; drop those rows
        X.dropna(inplace=True)
        y = X.pop('y')

        # shuffle=False keeps the chronological order of the samples
        self.X_train, self.X_test, self.y_train_clf, self.y_test_clf = train_test_split(
            X[features], y, test_size=0.4, random_state=1, shuffle=False)
        # Regression targets are the classification labels shifted by one
        self.y_train_reg = (1 + self.y_train_clf)
        self.y_test_reg = (1 + self.y_test_clf)

        self.samples_info_sets = meta_labeled_events.loc[self.X_train.index, 't1']
        # Close prices restricted to the time span of the training samples
        in_train_span = ((self.data.index >= self.X_train.index.min()) &
                         (self.data.index <= self.X_train.index.max()))
        self.price_bars_trim = self.data[in_train_span].close
Пример #6
0
### Bagging, Bootstrapping and Random Forest

# Bagging is an ensemble learning technique that samples with replacement. The goal here is to
# randomly choose data samples that are unique and non-concurrent for each decision tree.
# With sequential bootstrapping the goal is to select samples such that each iteration
# maximizes the average uniqueness of the subsamples.

# Toy indicator matrix with 6 rows and 3 columns.
# The data is passed to the constructor so the columns are int64 on every pandas version;
# filling an empty (object-dtype) frame column by column keeps object dtype on pandas >= 2.0.
ind_mat = pd.DataFrame(
    {
        0: [1, 1, 1, 0, 0, 0],
        1: [0, 0, 1, 1, 0, 0],
        2: [0, 0, 0, 0, 1, 1],
    },
    index=range(0, 6),
)
ind_mat  # Notebook cell display; has no effect when run as a script
print(ind_mat)

# Get triple barrier method indicator matrix
triple_barrier_ind_mat = get_ind_matrix(barrier_events,
                                        price_bars=close_prices['close'])
print(triple_barrier_ind_mat)

# TODO: check back after fixing duplicate T values
ind_mat_uniqueness = get_ind_mat_average_uniqueness(triple_barrier_ind_mat)
print(ind_mat_uniqueness)

# NOTE: this is an alias of ind_mat_uniqueness, not a copy
first_sample = ind_mat_uniqueness
# Mean over strictly positive entries; the value is only displayed in a notebook
first_sample[first_sample > 0].mean()

# Jupyter notebook output
# av_unique.loc[0]

# Get the values
ind_mat = ind_mat.values
Пример #7
0
    def test_seq_bootstrap(self):
        """
        Test the indicator matrix (shape and selected columns), the length of sequentially
        bootstrapped samples, and that sequential bootstrapping gives at least the average
        uniqueness of standard bootstrapping in a Monte-Carlo run.
        """

        non_nan_meta_labels = self.meta_labeled_events.dropna()
        ind_mat = get_ind_matrix(non_nan_meta_labels)
        # Indicator matrix shape should be (meta_label_index+t1, t1)
        self.assertEqual(ind_mat.shape, (13, 7))

        # Check indicator matrix values for specific labels (column -> expected values)
        expected_columns = (
            (0, [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
            (2, [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]),
            (4, [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]),
            (6, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]),
        )
        for column, expected in expected_columns:
            # np.array_equal also compares lengths, so a size mismatch fails cleanly
            self.assertTrue(np.array_equal(ind_mat[:, column], expected))

        bootstrapped_samples = seq_bootstrap(ind_mat,
                                             compare=False,
                                             verbose=True,
                                             warmup_samples=None)
        bootstrapped_samples_100 = seq_bootstrap(ind_mat,
                                                 compare=True,
                                                 sample_length=100)
        # Without sample_length the number of draws is expected to equal the number of labels
        self.assertEqual(len(bootstrapped_samples), non_nan_meta_labels.shape[0])
        self.assertEqual(len(bootstrapped_samples_100), 100)

        # Test sequential bootstrapping on example from a book.
        # Built directly as an int64 array: filling an empty (object-dtype) DataFrame column by
        # column keeps object dtype on pandas >= 2.0, which would leak an object array downstream.
        ind_mat = np.array([[1, 0, 0],
                            [1, 0, 0],
                            [1, 1, 0],
                            [0, 1, 0],
                            [0, 0, 1],
                            [0, 0, 1]], dtype=np.int64)

        seq_bootstrap(ind_mat,
                      sample_length=3,
                      verbose=True,
                      warmup_samples=[1])  # Show printed probabilities

        # Perform Monte-Carlo test
        n_trials = 1000
        standard_unq_array = np.full(n_trials, np.nan)
        seq_unq_array = np.full(n_trials, np.nan)
        for i in range(n_trials):
            bootstrapped_samples = seq_bootstrap(ind_mat, sample_length=3)
            random_samples = np.random.choice(ind_mat.shape[1], size=3)

            # Average only the strictly positive uniqueness entries of each draw
            random_unq = get_ind_mat_average_uniqueness(
                ind_mat[:, random_samples])
            standard_unq_array[i] = random_unq[random_unq > 0].mean()

            sequential_unq = get_ind_mat_average_uniqueness(
                ind_mat[:, bootstrapped_samples])
            seq_unq_array[i] = sequential_unq[sequential_unq > 0].mean()

        self.assertGreaterEqual(np.mean(seq_unq_array), np.mean(standard_unq_array))
        self.assertGreaterEqual(np.median(seq_unq_array), np.median(standard_unq_array))