def test_classification_balanced_simple(num_classes):
    """Check that evenly distributed classes come back from the sampler untouched.

    The target cycles through ``num_classes`` labels over 1000 rows, and the
    test expects the indices returned by ``fit_resample`` to select the
    original frame and series unchanged.
    """
    n_rows = 1000
    X = pd.DataFrame({"a": list(range(n_rows))})
    y = pd.Series([row % num_classes for row in range(n_rows)])

    sampler = BalancedClassificationSampler()
    kept = sampler.fit_resample(X, y)

    # Selecting by the returned labels must reproduce the inputs exactly.
    pd.testing.assert_frame_equal(X, X.loc[kept])
    pd.testing.assert_series_equal(y, y.loc[kept])
def test_sampler_ratio_dictionary_multiclass(sampling_ratio_dict, expected):
    """Check per-class counts after sampling with an explicit ``sampling_ratio_dict``.

    The input has 200 / 800 / 200 rows for classes 0 / 1 / 2; the class counts
    of the resampled target must equal the ``expected`` mapping.
    """
    labels = [0] * 200 + [1] * 800 + [2] * 200
    X = pd.DataFrame({"a": list(range(1200))})
    y = pd.Series(labels)

    sampler = BalancedClassificationSampler(sampling_ratio_dict=sampling_ratio_dict)
    kept = sampler.fit_resample(X, y)

    # The returned indices are applied positionally here (``iloc``).
    counts_after = y.iloc[kept].value_counts().to_dict()
    assert counts_after == expected
# Example #3
0
def test_classification_severely_imbalanced_multiclass_simple():
    """Check that a severely imbalanced three-class target is left unchanged.

    With default settings the test expects ``fit_resample`` to return indices
    that reproduce the original frame and series.
    """
    n_rows = 1000
    X = pd.DataFrame({"a": list(range(n_rows))})
    # Rows at the 19 multiples of 55 get a rare label: even rows become class 1
    # (10 rows), odd rows become class 2 (9 rows); every other row is class 0.
    y = pd.Series([(1 + row % 2) if row % 55 == 0 else 0 for row in range(n_rows)])

    sampler = BalancedClassificationSampler()
    kept = sampler.fit_resample(X, y)

    pd.testing.assert_frame_equal(X, X.loc[kept])
    pd.testing.assert_series_equal(y, y.loc[kept])
# Example #4
0
def test_classification_severely_imbalanced_binary_simple():
    """Check that a severely imbalanced binary target is left unchanged.

    With default settings the test expects ``fit_resample`` to return indices
    that reproduce the original frame and series.
    """
    n_rows = 1000
    X = pd.DataFrame({"a": list(range(n_rows))})
    # Only the 5 rows at multiples of 200 are class 0; the other 995 are class 1.
    y = pd.Series([0 if row % 200 == 0 else 1 for row in range(n_rows)])

    sampler = BalancedClassificationSampler()
    kept = sampler.fit_resample(X, y)

    pd.testing.assert_frame_equal(X, X.loc[kept])
    pd.testing.assert_series_equal(y, y.loc[kept])
def test_dict_overrides_ratio():
    """Check that ``sampling_ratio_dict`` wins when ``sampling_ratio`` is also given.

    The dictionary requests the original 200 / 800 class counts, so the
    resampled target must show exactly those counts even though
    ``sampling_ratio=0.1`` is passed as well.
    """
    requested_counts = {0: 200, 1: 800}
    X = pd.DataFrame({"a": list(range(1000))})
    y = pd.Series([0] * 200 + [1] * 800)

    sampler = BalancedClassificationSampler(
        sampling_ratio=0.1,
        sampling_ratio_dict=requested_counts,
    )
    kept = sampler.fit_resample(X, y)

    # The returned indices are applied positionally here (``iloc``).
    counts_after = y.iloc[kept].value_counts().to_dict()
    assert counts_after == requested_counts
# Example #6
0
def test_classification_imbalanced_multiple_multiclass():
    """Check resampling when two large classes dominate one small class.

    Three sampler configurations are run against the same 4900 / 4900 / 200
    target: ``min_samples=201`` (expected to leave the data alone), the
    defaults (expected 800 / 800 / 200) and ``balanced_ratio=3`` (expected
    600 / 600 / 200).
    """
    X = pd.DataFrame({"a": list(range(10000))})
    # Class 2 is the minority at 2% of the data.
    y = pd.Series([0] * 4900 + [1] * 4900 + [2] * 200)

    def resample_with(sampler):
        # Fit the sampler and return the selected rows of X and y.
        kept = sampler.fit_resample(X, y)
        return X.loc[kept], y.loc[kept]

    # Severe imbalance case: nothing should be resampled.
    X_out, y_out = resample_with(BalancedClassificationSampler(min_samples=201))
    pd.testing.assert_frame_equal(X, X_out)
    pd.testing.assert_series_equal(y, y_out)

    # Defaults: each majority class is expected to shrink to 800 rows.
    X_out, y_out = resample_with(BalancedClassificationSampler())
    assert len(X_out) == 1800
    assert all(y_out.value_counts().values == [800, 800, 200])
    assert y_out.value_counts()[2] == 200

    # balanced_ratio=3: classes 0 and 1 are each expected at 3:1 against class 2.
    X_out, y_out = resample_with(BalancedClassificationSampler(balanced_ratio=3))
    assert len(X_out) == 1400
    assert all(y_out.value_counts().values == [600, 600, 200])
    assert y_out.value_counts()[2] == 200
def test_classification_imbalanced_custom_indices(index):
    """Check that resampling works when X and y carry a custom ``index``.

    The 900 / 100 target is expected to be reduced to 400 / 100, and the
    returned labels must all come from the original index and line up between
    the selected X and y.
    """
    X = pd.DataFrame({"a": list(range(1000))}, index=index)
    y = pd.Series([0] * 900 + [1] * 100, index=index)

    sampler = BalancedClassificationSampler()
    kept = sampler.fit_resample(X, y)
    X_out = X.loc[kept]
    y_out = y.loc[kept]

    assert len(X_out) == 500
    # Raw (non-normalized) class counts.
    assert all(y_out.value_counts(normalize=False).values == [400, 100])
    assert all(y_out.index.values == X_out.index.values)
    # Every selected label is distinct and present in the original index.
    shared_labels = set(y_out.index.values) & set(y.index.values)
    assert len(shared_labels) == len(y_out)
# Example #8
0
def test_classification_data_drop():
    """Check that only classes above the goal size lose rows.

    Per the original author's note, this exercises the sampler's
    ``max(0, counts[k] - goal_value)`` clamp: class 2 (120 rows) should be
    downsampled, while classes already at or below the goal must not be
    "downsampled" by a negative amount.
    """
    class_sizes = [90, 100, 120, 40, 70]  # rows for classes 0..4, 420 in total
    labels = []
    for label, count in enumerate(class_sizes):
        labels.extend([label] * count)
    X = pd.DataFrame({"a": list(range(420))})
    y = pd.Series(labels)

    sampler = BalancedClassificationSampler(balanced_ratio=1, min_percentage=0.01)
    kept = sampler.fit_resample(X, y)
    X_out = X.loc[kept]
    y_out = y.loc[kept]

    # 20 rows dropped in total, and the largest remaining class has 100 rows.
    assert len(X_out) == 400
    assert y_out.value_counts().values[0] == 100
# Example #9
0
def test_classification_imbalanced_data_type(data_type, make_data_type):
    """Check that the sampler accepts inputs converted by ``make_data_type``.

    A 900 / 100 target is converted to ``data_type`` before sampling; 500
    indices are expected back. For the ``'pd'`` and ``'np'`` types the
    selected class counts are checked as well.
    """
    X = make_data_type(data_type, pd.DataFrame({"a": list(range(1000))}))
    y = make_data_type(data_type, pd.Series([0] * 900 + [1] * 100))

    sampler = BalancedClassificationSampler()
    kept = sampler.fit_resample(X, y)
    assert len(kept) == 500

    if data_type not in ('pd', 'np'):
        return
    # NOTE(review): ``.loc`` is used for 'np' too, which presumes the fixture
    # still hands back a pandas object for that type -- confirm.
    counts = y.loc[kept].value_counts()
    assert all(counts.values == [400, 100])
    assert counts[1] == 100
def test_classification_imbalanced_severe_imbalance_binary(min_samples, min_percentage):
    """Check the severe-imbalance cut-off on an 850 / 150 binary target.

    When both ``min_samples >= 200`` and ``min_percentage >= 0.2`` the data
    is expected to be returned unchanged; otherwise the majority class is
    expected to be cut to ``max(min_samples, 2 * 150)`` rows.
    """
    minority_size = 150  # 15% of the 1000 rows
    X = pd.DataFrame({"a": list(range(1000))})
    y = pd.Series([0] * 850 + [1] * minority_size)

    sampler = BalancedClassificationSampler(
        sampling_ratio=0.5,
        min_samples=min_samples,
        min_percentage=min_percentage,
    )
    kept = sampler.fit_resample(X, y)
    X_out = X.loc[kept]
    y_out = y.loc[kept]

    if min_samples >= 200 and min_percentage >= 0.2:
        # Severe imbalance: the sampler should do nothing.
        pd.testing.assert_frame_equal(X_out, X)
        return

    # Not severe: balance 2:1, but never go below min_samples.
    majority_target = max(min_samples, 2 * minority_size)
    assert len(X_out) == minority_size + majority_target
    assert y_out.value_counts().values[0] == majority_target
def test_classification_imbalanced_normal_imbalance_binary(min_samples, sampling_ratio):
    """Check ratio-driven resampling on an 850 / 150 binary target.

    For ``sampling_ratio < 0.2`` the data is expected back unchanged;
    otherwise the majority class is expected at
    ``max(min_samples, int(150 / sampling_ratio))`` rows.
    """
    minority_size = 150  # 15% of the rows; per the original note, never severe
    X = pd.DataFrame({"a": list(range(1000))})
    y = pd.Series([0] * 850 + [1] * minority_size)

    sampler = BalancedClassificationSampler(
        sampling_ratio=sampling_ratio,
        min_samples=min_samples,
    )
    kept = sampler.fit_resample(X, y)
    X_out = X.loc[kept]
    y_out = y.loc[kept]

    if sampling_ratio < 0.2:
        # Already balanced enough for this ratio: nothing to do.
        pd.testing.assert_frame_equal(X_out, X)
        return

    # Rebalance according to the ratio, bounded below by min_samples.
    majority_target = max(min_samples, int(minority_size / sampling_ratio))
    assert len(X_out) == minority_size + majority_target
    assert y_out.value_counts().values[0] == majority_target
# Example #12
0
def test_balanced_classification_errors():
    """Check that invalid constructor arguments raise ``ValueError``.

    Each case pairs the offending keyword arguments with the pattern the
    error message is expected to match.
    """
    invalid_cases = [
        ({"balanced_ratio": -1}, "balanced_ratio must be"),
        ({"min_samples": 0}, "min_sample must be"),
        ({"min_percentage": 0}, "min_percentage must be"),
        ({"min_percentage": 0.6}, "min_percentage must be"),
        ({"min_percentage": -1.3}, "min_percentage must be"),
    ]
    for bad_kwargs, message in invalid_cases:
        with pytest.raises(ValueError, match=message):
            BalancedClassificationSampler(**bad_kwargs)
def test_classification_imbalanced_normal_imbalance_multiclass(data_type, min_samples, sampling_ratio):
    """Check ratio-driven resampling on an 800 / 100 / 100 three-class target.

    ``data_type == 'n'`` uses integer labels, anything else uses string
    labels. For ``sampling_ratio < 0.2`` the data is expected back unchanged;
    otherwise the majority class is expected at
    ``max(min_samples, int(100 / sampling_ratio))`` rows.
    """
    if data_type == 'n':
        class_names = [0, 1, 2]
    else:
        class_names = ["class_1", "class_2", "class_3"]
    majority, minority_a, minority_b = class_names

    X = pd.DataFrame({"a": list(range(1000))})
    # Each minority class is 10% of the rows.
    y = pd.Series([majority] * 800 + [minority_a] * 100 + [minority_b] * 100)

    sampler = BalancedClassificationSampler(
        sampling_ratio=sampling_ratio,
        min_samples=min_samples,
    )
    kept = sampler.fit_resample(X, y)
    X_out = X.loc[kept]
    y_out = y.loc[kept]

    if sampling_ratio < 0.2:
        # Already balanced enough for this ratio: nothing to do.
        pd.testing.assert_frame_equal(X_out, X)
        return

    # Rebalance according to the ratio, bounded below by min_samples.
    majority_target = max(min_samples, int(100 / sampling_ratio))
    assert len(X_out) == 200 + majority_target
    assert y_out.value_counts().values[0] == majority_target
def test_classification_imbalanced_min_percentage(num_classes, min_percentage):
    """Check how ``min_percentage`` decides whether an imbalance is severe.

    Binary data is 950 / 50, three-class data is 820 / 90 / 90. With
    ``min_percentage <= 0.05`` the majority class is expected to be cut to
    100 rows; above that the data is expected back unchanged.
    """
    if num_classes == 2:
        labels = [0] * 950 + [1] * 50
    else:
        labels = [0] * 820 + [1] * 90 + [2] * 90
    X = pd.DataFrame({"a": list(range(1000))})
    y = pd.Series(labels)

    sampler = BalancedClassificationSampler(sampling_ratio=1, min_percentage=min_percentage)
    kept = sampler.fit_resample(X, y)
    X_out = X.loc[kept]
    y_out = y.loc[kept]

    if min_percentage > 0.05:
        # Severe imbalance: the sampler should do nothing.
        pd.testing.assert_frame_equal(X_out, X)
        return

    # Not severe: balance 1:1, with the majority held at 100 rows
    # (per the original note, min_samples == 100 here).
    expected_rows = {2: 150, 3: 280}
    assert len(X_out) == expected_rows[num_classes]
    assert y_out.value_counts().values[0] == 100
# Example #15
0
def test_balanced_classification_init(ratio, samples, percentage, seed):
    """Check that constructor arguments are stored on same-named attributes."""
    settings = {
        "balanced_ratio": ratio,
        "min_samples": samples,
        "min_percentage": percentage,
        "random_seed": seed,
    }
    sampler = BalancedClassificationSampler(**settings)
    for attribute, value in settings.items():
        assert getattr(sampler, attribute) == value
def test_classification_imbalanced_sampling_ratio(num_classes, sampling_ratio):
    """Check how ``sampling_ratio`` controls the size of the majority class.

    Binary data is 750 / 250, three-class data is 600 / 200 / 200, so the
    minority-to-majority ratio is 1/3 in both cases. For
    ``sampling_ratio <= 1 / 3`` the data is expected back unchanged;
    otherwise the majority class is expected at
    ``int(minority_size / sampling_ratio)`` rows.
    """
    X = pd.DataFrame({"a": list(range(1000))})
    if num_classes == 2:
        y = pd.Series([0] * 750 + [1] * 250)
    else:
        y = pd.Series([0] * 600 + [1] * 200 + [2] * 200)

    bcs = BalancedClassificationSampler(sampling_ratio=sampling_ratio)
    indices = bcs.fit_resample(X, y)
    X2 = X.loc[indices]
    y2 = y.loc[indices]

    if sampling_ratio <= 1 / 3:
        # The classes are considered balanced: nothing to do.
        pd.testing.assert_frame_equal(X, X2)
        pd.testing.assert_series_equal(y, y2)
    else:
        # Some majority rows are removed; minority rows are all kept.
        minority_size = {2: 250, 3: 200}[num_classes]
        minority_total = {2: 250, 3: 400}[num_classes]
        # One shared expectation for the majority size keeps the length check
        # and the class-count check consistent. The previous
        # ``int(1 / sampling_ratio) * minority_size`` disagreed with the
        # length check whenever 1 / sampling_ratio was not an integer.
        majority_size = int(minority_size / sampling_ratio)
        assert len(X2) == minority_total + majority_size
        assert len(y2) == len(X2)
        assert y2.value_counts().values[0] == majority_size
def test_classification_imbalanced_min_samples(num_classes, min_samples):
    """Check how ``min_samples`` bounds 1:1 balancing.

    Binary data is 900 / 100, three-class data is 799 / 101 / 100. With
    ``min_samples <= 100`` the largest class is expected at 100 rows; above
    that the majority class is expected at ``min_samples`` rows with the
    minority classes added in full.
    """
    if num_classes == 2:
        labels = [0] * 900 + [1] * 100
    else:
        labels = [0] * 799 + [1] * 101 + [2] * 100
    X = pd.DataFrame({"a": list(range(1000))})
    y = pd.Series(labels)

    sampler = BalancedClassificationSampler(sampling_ratio=1, min_samples=min_samples)
    kept = sampler.fit_resample(X, y)
    X_out = X.loc[kept]
    y_out = y.loc[kept]
    largest_class = y_out.value_counts().values[0]

    if min_samples <= 100:
        # 1:1 balancing does not conflict with min_samples.
        assert len(X_out) == {2: 200, 3: 300}[num_classes]
        assert largest_class == 100
    else:
        # 1:1 is not reachable: the majority is held at min_samples and all
        # minority rows (100 for binary, 101 + 100 otherwise) are added.
        minority_rows = 100 if num_classes == 2 else 201
        assert len(X_out) == min_samples + minority_rows
        assert largest_class == min_samples
# Example #18
0
def test_classification_imbalanced_severe_imbalance_multiclass(
        data_type, min_samples, min_percentage):
    """Check the severe-imbalance cut-off on an 800 / 100 / 100 target.

    ``data_type == 'n'`` uses integer labels, anything else uses string
    labels. When both ``min_samples >= 200`` and ``min_percentage >= 0.2``
    the data is expected back unchanged; otherwise the majority class is
    expected at ``max(min_samples, 2 * 100)`` rows.
    """
    if data_type == 'n':
        class_names = [0, 1, 2]
    else:
        class_names = ["class_1", "class_2", "class_3"]
    majority, minority_a, minority_b = class_names

    X = pd.DataFrame({"a": list(range(1000))})
    # Each minority class is 10% of the rows.
    y = pd.Series([majority] * 800 + [minority_a] * 100 + [minority_b] * 100)

    sampler = BalancedClassificationSampler(
        balanced_ratio=2,
        min_samples=min_samples,
        min_percentage=min_percentage,
    )
    kept = sampler.fit_resample(X, y)
    X_out = X.loc[kept]
    y_out = y.loc[kept]

    if min_samples >= 200 and min_percentage >= 0.2:
        # Severe imbalance: the sampler should do nothing.
        pd.testing.assert_frame_equal(X_out, X)
        return

    # Not severe: balance 2:1, but never go below min_samples.
    majority_target = max(min_samples, 2 * 100)
    assert len(X_out) == 200 + majority_target
    assert y_out.value_counts().values[0] == majority_target
def test_classification_imbalanced_random_seed(random_seed, sampling_ratio):
    """Check that two samplers built with the same seed select the same rows.

    The 800 / 200 target is sampled by two identically configured samplers.
    Besides the usual size expectations, the selected frames and series from
    both runs must be equal.
    """
    X = pd.DataFrame({"a": list(range(1000))})
    y = pd.Series([0] * 800 + [1] * 200)

    first = BalancedClassificationSampler(sampling_ratio=sampling_ratio, random_seed=random_seed)
    second = BalancedClassificationSampler(sampling_ratio=sampling_ratio, random_seed=random_seed)

    idx_first = first.fit_resample(X, y)
    X_first, y_first = X.loc[idx_first], y.loc[idx_first]
    idx_second = second.fit_resample(X, y)
    X_second, y_second = X.loc[idx_second], y.loc[idx_second]

    if sampling_ratio > 0.25:
        majority_target = int(200 / sampling_ratio)
        assert len(X_second) == 200 + majority_target
        assert y_second.value_counts().values[0] == majority_target
    else:
        # Data already counts as balanced: nothing should be removed.
        pd.testing.assert_frame_equal(X_first, X)

    # Same seed, same configuration: identical selections.
    pd.testing.assert_frame_equal(X_first, X_second)
    pd.testing.assert_series_equal(y_first, y_second)
# Example #20
0
def test_classification_imbalanced_small_dataset(size):
    """Check resampling on small 80 / 20 binary datasets of ``size`` rows.

    Two samplers are exercised with ``balanced_ratio=1``: one with the
    default ``min_samples`` (data expected unchanged for ``size == 100``,
    otherwise a majority of 100 rows) and one with ``min_samples=40``
    (majority expected at 100 rows for ``size == 500``, otherwise 40).
    """
    # Integer arithmetic keeps the 80/20 split exact, guarantees
    # len(y) == len(X) and avoids comparing lengths against floats.
    n_minority = size // 5
    n_majority = size - n_minority
    X = pd.DataFrame({"a": list(range(size))})
    y = pd.Series([0] * n_majority + [1] * n_minority)

    bcs = BalancedClassificationSampler(balanced_ratio=1)
    indices = bcs.fit_resample(X, y)
    X2 = X.loc[indices]
    if size == 100:
        pd.testing.assert_frame_equal(X2, X)
    else:
        assert len(X2) == n_minority + 100

    bcs2 = BalancedClassificationSampler(balanced_ratio=1, min_samples=40)
    indices = bcs2.fit_resample(X, y)
    X2 = X.loc[indices]
    y2 = y.loc[indices]
    if size == 500:
        # Resulting majority size is 100, matching the 100 minority rows.
        assert len(X2) == 200
        assert y2.value_counts(normalize=True).values[0] == 0.5
    else:
        assert len(X2) == n_minority + 40
        assert y2.value_counts().values[0] == 40