Пример #1
0
    def test_seq_bootstrap(self):
        """
        Test the indicator matrix and the samples drawn by sequential bootstrapping.

        The test:
        - builds the indicator matrix from the non-NaN meta-labeled events and the price bars
          and compares it with the result of ``book_ind_mat_implementation``;
        - checks the matrix shape and a few column slices against the values expected for the fixture;
        - checks the number of samples returned by ``seq_bootstrap`` with and without an
          explicit ``sample_length``;
        - runs a Monte-Carlo comparison on a 6x3 indicator matrix: mean and median uniqueness of
          sequentially bootstrapped samples must not be lower than for ``np.random.choice`` samples.
        """

        non_nan_meta_labels = self.meta_labeled_events.dropna()
        ind_mat = get_ind_matrix(non_nan_meta_labels, self.data)

        label_endtime = non_nan_meta_labels.t1
        # Keep only the price bars between the earliest event index and the latest t1
        in_label_span = ((self.data.index >= non_nan_meta_labels.index.min()) &
                         (self.data.index <= label_endtime.max()))
        trimmed_price_bars_index = self.data[in_label_span].index

        # Index for the reference indicator matrix: event index + t1 + trimmed price bars index
        bar_index = list(non_nan_meta_labels.index)
        bar_index.extend(label_endtime)
        bar_index.extend(trimmed_price_bars_index)
        bar_index = sorted(set(bar_index))  # Drop duplicates and sort
        ind_mat_book_implementation = book_ind_mat_implementation(bar_index, label_endtime)

        self.assertTrue((ind_mat_book_implementation.values == ind_mat).all())
        # Indicator matrix shape should be (unique(meta_label_index+t1+price_bars_index), t1)
        self.assertEqual(ind_mat.shape, (782, 7))

        # Check indicator matrix values for specific labels
        self.assertTrue((ind_mat[:100, 0] == np.ones(100)).all())
        self.assertTrue((ind_mat[191:340, 2] == np.ones(149)).all())
        self.assertTrue((ind_mat[341:420, 2] == np.zeros(79)).all())
        self.assertTrue((ind_mat[406:412, 4] == np.ones(6)).all())
        self.assertTrue((ind_mat[662:, 6] == np.ones(120)).all())

        # Without sample_length the number of samples is expected to match the number of labels
        bootstrapped_samples = seq_bootstrap(ind_mat, compare=False, verbose=True, warmup_samples=None)
        bootstrapped_samples_100 = seq_bootstrap(ind_mat, compare=True, sample_length=100)
        self.assertEqual(len(bootstrapped_samples), non_nan_meta_labels.shape[0])
        self.assertEqual(len(bootstrapped_samples_100), 100)

        # Test sequential bootstrapping on example from a book
        ind_mat = pd.DataFrame(index=range(0, 6), columns=range(0, 3))
        ind_mat.loc[:, 0] = [1, 1, 1, 0, 0, 0]
        ind_mat.loc[:, 1] = [0, 0, 1, 1, 0, 0]
        ind_mat.loc[:, 2] = [0, 0, 0, 0, 1, 1]
        ind_mat = ind_mat.values

        seq_bootstrap(ind_mat, sample_length=3, verbose=True, warmup_samples=[1])  # Show printed probabilities

        # Perform Monte-Carlo test: compare uniqueness of sequential vs. plain random sampling
        n_trials = 1000
        standard_unq_array = np.full(n_trials, np.nan)
        seq_unq_array = np.full(n_trials, np.nan)
        for i in range(n_trials):
            bootstrapped_samples = seq_bootstrap(ind_mat, sample_length=3)
            random_samples = np.random.choice(ind_mat.shape[1], size=3)

            random_unq = get_ind_mat_average_uniqueness(ind_mat[:, random_samples])
            sequential_unq = get_ind_mat_average_uniqueness(ind_mat[:, bootstrapped_samples])

            standard_unq_array[i] = random_unq
            seq_unq_array[i] = sequential_unq

        self.assertGreaterEqual(np.mean(seq_unq_array), np.mean(standard_unq_array))
        self.assertGreaterEqual(np.median(seq_unq_array), np.median(standard_unq_array))
Пример #2
0
def _generate_bagging_indices(random_state, bootstrap_features, n_features,
                              max_features, max_samples, ind_mat):
    """Draw the feature and sample indices for one bagging draw.

    Feature indices are produced by ``_generate_random_features``; sample
    indices are produced by ``seq_bootstrap`` run on ``ind_mat`` with
    ``max_samples`` as the sample length. Both draws use the same random
    state object, features first.

    :param random_state: value passed through ``check_random_state`` before use
    :param bootstrap_features: forwarded to ``_generate_random_features``
    :param n_features: forwarded to ``_generate_random_features``
    :param max_features: forwarded to ``_generate_random_features``
    :param max_samples: passed to ``seq_bootstrap`` as ``sample_length``
    :param ind_mat: indicator matrix passed to ``seq_bootstrap``
    :return: tuple ``(feature_indices, sample_indices)``
    """
    rng = check_random_state(random_state)

    # Features are drawn before samples so the random state is consumed in a fixed order.
    drawn_features = _generate_random_features(
        rng, bootstrap_features, n_features, max_features)
    drawn_samples = seq_bootstrap(
        ind_mat, sample_length=max_samples, random_state=rng)

    return drawn_features, drawn_samples
Пример #3
0
    uniqueness_array[i] = (label_uniqueness[label_uniqueness > 0].mean())
prob_array = uniqueness_array / sum(uniqueness_array)

# Column indexes already selected (presumably the samples drawn so far -- confirm against
# the earlier part of the script).
phi = [1, 2, 0]
# Object-dtype placeholder array, one slot per candidate column.
uniqueness_array = np.array([None, None, None])
for i in range(0, 3):
    # Indicator matrix restricted to the already selected columns plus candidate column i.
    ind_mat_reduced = ind_mat[:, phi + [i]]
    label_uniqueness = get_ind_mat_average_uniqueness(
        ind_mat_reduced)  # [-1] TODO: fix this
    # Average only the strictly positive uniqueness values.
    uniqueness_array[i] = (label_uniqueness[label_uniqueness > 0].mean())
# Normalise so that the values sum to 1.
prob_array = uniqueness_array / sum(uniqueness_array)

print(prob_array)

# Run seq_bootstrap on the same matrix with column 1 as the warm-up sample; verbose=True is
# presumably what makes it print its intermediate probabilities -- confirm.
samples = seq_bootstrap(ind_mat,
                        sample_length=4,
                        warmup_samples=[1],
                        verbose=True)
print(samples)

### Monte-Carlo experiment (checks how sequential bootstrapping improves average label uniqueness)

standard_unq_array = np.zeros(
    10000) * np.nan  # Array of random sampling uniqueness (NaN-initialised)
seq_unq_array = np.zeros(
    10000) * np.nan  # Array of sequential bootstrapping uniqueness (NaN-initialised)
for i in range(0, 10000):
    # Draw 3 column indexes with seq_bootstrap and 3 with plain random choice
    bootstrapped_samples = seq_bootstrap(ind_mat, sample_length=3)
    random_samples = np.random.choice(ind_mat.shape[1], size=3)

    # Uniqueness of the randomly chosen columns, averaged over strictly positive values
    random_unq = get_ind_mat_average_uniqueness(ind_mat[:, random_samples])
    random_unq_mean = random_unq[random_unq > 0].mean()
Пример #4
0
    def test_seq_bootstrap(self):
        """
        Test the indicator matrix and the samples drawn by sequential bootstrapping.

        The test:
        - builds the indicator matrix from the non-NaN meta-labeled events and checks its
          shape and four of its columns against the values expected for the fixture;
        - checks the number of samples returned by ``seq_bootstrap`` with and without an
          explicit ``sample_length``;
        - runs a Monte-Carlo comparison on a 6x3 indicator matrix: mean and median uniqueness of
          sequentially bootstrapped samples must not be lower than for ``np.random.choice`` samples.
        """

        non_nan_meta_labels = self.meta_labeled_events.dropna()
        ind_mat = get_ind_matrix(non_nan_meta_labels)
        # Indicator matrix shape should be (meta_label_index+t1, t1)
        self.assertEqual(ind_mat.shape, (13, 7))

        # Check indicator matrix values for specific labels
        expected_columns = (
            (0, [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
            (2, [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]),
            (4, [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]),
            (6, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]),
        )
        for column, expected in expected_columns:
            self.assertTrue((ind_mat[:, column] == expected).all())

        # Without sample_length the number of samples is expected to match the number of labels
        bootstrapped_samples = seq_bootstrap(ind_mat,
                                             compare=False,
                                             verbose=True,
                                             warmup_samples=None)
        bootstrapped_samples_100 = seq_bootstrap(ind_mat,
                                                 compare=True,
                                                 sample_length=100)
        self.assertEqual(len(bootstrapped_samples),
                         non_nan_meta_labels.shape[0])
        self.assertEqual(len(bootstrapped_samples_100), 100)

        # Test sequential bootstrapping on example from a book
        ind_mat = pd.DataFrame(index=range(0, 6), columns=range(0, 3))
        ind_mat.loc[:, 0] = [1, 1, 1, 0, 0, 0]
        ind_mat.loc[:, 1] = [0, 0, 1, 1, 0, 0]
        ind_mat.loc[:, 2] = [0, 0, 0, 0, 1, 1]
        ind_mat = ind_mat.values

        seq_bootstrap(ind_mat,
                      sample_length=3,
                      verbose=True,
                      warmup_samples=[1])  # Show printed probabilities

        # Perform Monte-Carlo test: compare uniqueness of sequential vs. plain random sampling
        n_trials = 1000
        standard_unq_array = np.full(n_trials, np.nan)
        seq_unq_array = np.full(n_trials, np.nan)
        for i in range(n_trials):
            bootstrapped_samples = seq_bootstrap(ind_mat, sample_length=3)
            random_samples = np.random.choice(ind_mat.shape[1], size=3)

            # Uniqueness is averaged over strictly positive entries only
            random_unq = get_ind_mat_average_uniqueness(
                ind_mat[:, random_samples])
            random_unq_mean = random_unq[random_unq > 0].mean()

            sequential_unq = get_ind_mat_average_uniqueness(
                ind_mat[:, bootstrapped_samples])
            sequential_unq_mean = sequential_unq[sequential_unq > 0].mean()

            standard_unq_array[i] = random_unq_mean
            seq_unq_array[i] = sequential_unq_mean

        self.assertGreaterEqual(np.mean(seq_unq_array),
                                np.mean(standard_unq_array))
        self.assertGreaterEqual(np.median(seq_unq_array),
                                np.median(standard_unq_array))