def test_give_classifier_wrong_obj():
    """BalanceCascade must reject a non-estimator passed as ``estimator``."""
    bad_estimator = 2  # an int is clearly not a classifier
    cascade = BalanceCascade(ratio='auto',
                             random_state=RND_SEED,
                             return_indices=True,
                             estimator=bad_estimator)
    # Fitting should fail with an explicit validation error.
    with raises(ValueError, match="Invalid parameter `estimator`"):
        cascade.fit_sample(X, Y)
# Example no. 2
# 0
def test_give_classifier_wrong_obj():
    """BalanceCascade must reject a non-estimator passed as ``estimator``
    (``sampling_strategy`` API variant).
    """
    bad_estimator = 2  # an int is clearly not a classifier
    cascade = BalanceCascade(sampling_strategy='auto',
                             random_state=RND_SEED,
                             return_indices=True,
                             estimator=bad_estimator)
    # Fitting should fail with an explicit validation error.
    with raises(ValueError, match="Invalid parameter `estimator`"):
        cascade.fit_sample(X, Y)
# Example no. 3
# 0
def test_give_classifier_obj():
    """Check fit_sample with a RandomForestClassifier instance passed as
    ``estimator`` and ``return_indices=False``.

    The expected subset below is pinned from a run seeded with RND_SEED.
    """
    ratio = 'auto'
    classifier = RandomForestClassifier(random_state=RND_SEED)
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
                        return_indices=False, estimator=classifier)
    X_resampled, y_resampled = bc.fit_sample(X, Y)
    # Expected output: one balanced subset of 16 samples (8 per class).
    X_gt = np.array([[[1.15514042, 0.0129463],
                      [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052],
                      [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342],
                      [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981],
                      [-1.11515198, -0.93689695],
                      [0.11622591, -0.0317206],
                      [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976],
                      [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207],
                      [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653],
                      [1.70580611, -0.11219234]]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
# Example no. 4
# 0
def test_fit_sample_half():
    """Check fit_sample with a float ``ratio`` of 0.8.

    The expected subset below is pinned from a run seeded with RND_SEED.
    """
    ratio = 0.8
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)
    X_resampled, y_resampled = bc.fit_sample(X, Y)
    # Expected output: one subset of 18 samples (10 labelled 1, 8 labelled 0).
    X_gt = np.array([[[1.15514042, 0.0129463],
                      [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052],
                      [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342],
                      [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981],
                      [-1.11515198, -0.93689695],
                      [0.9281014, 0.53085498],
                      [0.3084254, 0.33299982],
                      [0.11622591, -0.0317206],
                      [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976],
                      [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207],
                      [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653],
                      [1.70580611, -0.11219234]]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
# Example no. 5
# 0
def test_fit_sample_auto():
    """Check fit_sample with ``sampling_strategy='auto'`` and
    ``return_indices=True``.

    Two balanced subsets are expected; values pinned from a run seeded
    with RND_SEED.
    """
    sampling_strategy = 'auto'
    bc = BalanceCascade(sampling_strategy=sampling_strategy,
                        random_state=RND_SEED,
                        return_indices=True)
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)
    # Expected output: two subsets of 16 samples each (8 per class).
    X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
                      [0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234]],
                     [[0.28893132, -0.38761769], [0.83680821, 1.72827342],
                      [0.3084254, 0.33299982], [0.70472253, -0.73309052],
                      [-0.14374509, 0.27370049], [0.77481731, 0.60935141],
                      [-0.18410027, -0.45194484], [1.15514042, 0.0129463],
                      [0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
    # Indices of the selected samples in the original X/Y arrays.
    idx_gt = np.array(
        [[10, 18, 8, 16, 6, 14, 5, 13, 0, 2, 3, 4, 11, 12, 17, 19],
         [9, 6, 7, 8, 16, 1, 14, 10, 0, 2, 3, 4, 11, 12, 17, 19]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_fit_sample_half():
    """Check fit_sample with a dict ``ratio`` giving explicit per-class
    target counts (8 samples of class 0, 10 of class 1).

    The expected subset below is pinned from a run seeded with RND_SEED.
    """
    ratio = {0: 8, 1: 10}
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)
    X_resampled, y_resampled = bc.fit_sample(X, Y)
    # Expected output: one subset of 18 samples (8 class 0, 10 class 1).
    X_gt = np.array([[[-0.41635887, -0.38299653],
                      [0.53366841, -0.30312976],
                      [1.25192108, -0.22367336],
                      [1.70580611, -0.11219234],
                      [1.52091956, -0.49283504],
                      [0.11622591, -0.0317206],
                      [1.31301027, -0.92648734],
                      [0.88407872, 0.35454207],
                      [0.3084254, 0.33299982],
                      [0.08711622, 0.93259929],
                      [-0.28162401, -2.10400981],
                      [-0.14374509, 0.27370049],
                      [0.9281014, 0.53085498],
                      [-0.18410027, -0.45194484],
                      [0.77481731, 0.60935141],
                      [1.15514042, 0.0129463],
                      [-1.11515198, -0.93689695],
                      [0.70472253, -0.73309052]]])
    y_gt = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_fit_sample_auto_gradient_boosting():
    """Check fit/sample with ratio='auto' and the 'gradient-boosting'
    classifier alias, comparing against .npy fixtures.
    """
    # Build the sampler with the string alias for gradient boosting.
    cascade = BalanceCascade(ratio='auto',
                             random_state=RND_SEED,
                             return_indices=True,
                             classifier='gradient-boosting')

    # Generate the balanced subsets.
    X_res, y_res, indices = cascade.fit_sample(X, Y)

    # Ground-truth subsets are stored as .npy fixtures next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    X_expected = np.load(os.path.join(data_dir, 'bc_x_gb.npy'))
    y_expected = np.load(os.path.join(data_dir, 'bc_y_gb.npy'))
    idx_expected = np.load(os.path.join(data_dir, 'bc_idx_gb.npy'))

    # Compare every generated subset against its fixture.
    for subset in range(X_expected.size):
        assert_array_equal(X_res[subset], X_expected[subset])
        assert_array_equal(y_res[subset], y_expected[subset])
        assert_array_equal(indices[subset], idx_expected[subset])
def test_give_classifier_obj():
    """Check fit_sample with a RandomForestClassifier instance passed as
    ``estimator`` and ``return_indices=False``.

    The expected subset below is pinned from a run seeded with RND_SEED.
    """
    ratio = 'auto'
    estimator = RandomForestClassifier(random_state=RND_SEED)
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
                        return_indices=False, estimator=estimator)
    X_resampled, y_resampled = bc.fit_sample(X, Y)
    # Expected output: one balanced subset of 16 samples (8 per class).
    X_gt = np.array([[[1.15514042, 0.0129463],
                      [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052],
                      [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342],
                      [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981],
                      [-1.11515198, -0.93689695],
                      [0.11622591, -0.0317206],
                      [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976],
                      [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207],
                      [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653],
                      [1.70580611, -0.11219234]]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
# Example no. 9
# 0
def test_fit_sample_auto_linear_svm():
    """Check fit_sample with ratio='auto' and the 'linear-svm' classifier
    alias.

    Two balanced subsets are expected; values pinned from a run seeded
    with RND_SEED.
    """
    ratio = 'auto'
    classifier = 'linear-svm'
    bc = BalanceCascade(ratio=ratio,
                        random_state=RND_SEED,
                        return_indices=False,
                        classifier=classifier)
    X_resampled, y_resampled = bc.fit_sample(X, Y)
    # Expected output: two subsets of 16 samples each (8 per class).
    X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
                      [0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234]],
                     [[1.15514042, 0.0129463], [0.9281014, 0.53085498],
                      [0.3084254, 0.33299982], [0.28893132, -0.38761769],
                      [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                      [0.70472253, -0.73309052], [0.77481731, 0.60935141],
                      [0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_fit_sample_auto_early_stop():
    """Check fit/sample with ratio='auto' and a capped number of subsets
    (``n_max_subset=4``), comparing against .npy fixtures.
    """
    # Build the sampler with an explicit subset cap.
    cascade = BalanceCascade(ratio='auto',
                             random_state=RND_SEED,
                             return_indices=True,
                             n_max_subset=4)

    # Generate the balanced subsets.
    X_res, y_res, indices = cascade.fit_sample(X, Y)

    # Ground-truth subsets are stored as .npy fixtures next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    X_expected = np.load(os.path.join(data_dir, 'bc_x_n_sub.npy'))
    y_expected = np.load(os.path.join(data_dir, 'bc_y_n_sub.npy'))
    idx_expected = np.load(os.path.join(data_dir, 'bc_idx_n_sub.npy'))

    # Compare every generated subset against its fixture.
    for subset in range(X_expected.size):
        assert_array_equal(X_res[subset], X_expected[subset])
        assert_array_equal(y_res[subset], y_expected[subset])
        assert_array_equal(indices[subset], idx_expected[subset])
def test_fit_sample_auto_early_stop():
    """Test the fit and sample routine with auto ratio with 1 subset."""

    # Define the ratio parameter
    ratio = 'auto'
    n_subset = 1

    # Create the sampling object
    bc = BalanceCascade(
        ratio=ratio,
        random_state=RND_SEED,
        return_indices=True,
        n_max_subset=n_subset)

    # Get the different subset
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)

    # Expected output: a single subset of 16 samples (8 per class),
    # pinned from a run seeded with RND_SEED.
    X_gt = np.array([[[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]])

    y_gt = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]])
    # Indices of the selected samples in the original X/Y arrays.
    idx_gt = np.array(
        [[0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, 13]])
    # Check each array
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
# Example no. 12
# 0
def test_fit_sample_auto_early_stop():
    """Test the fit and sample routine with auto ratio with 1 subset."""

    # Define the ratio parameter
    ratio = 'auto'
    n_subset = 1

    # Create the sampling object
    bc = BalanceCascade(ratio=ratio,
                        random_state=RND_SEED,
                        return_indices=True,
                        n_max_subset=n_subset)

    # Get the different subset
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)

    # Expected output: a single subset of 16 samples (8 per class),
    # pinned from a run seeded with RND_SEED.
    X_gt = np.array([[[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]])

    y_gt = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]])
    # Indices of the selected samples in the original X/Y arrays.
    idx_gt = np.array(
        [[0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, 13]])
    # Check each array
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
# Example no. 13
# 0
def deep_ensemble_merged(smote=None):
    """Build one merged, shuffled training set from all BalanceCascade
    subsets.

    Parameters
    ----------
    smote : sampler or None, optional
        Optional over-sampler (e.g. SMOTE) applied after merging to
        generate additional synthetic samples.

    Returns
    -------
    Xs, ys : ndarray
        Shuffled feature matrix and 2-class one-hot label matrix.
    """
    dt = DecisionTreeClassifier(max_features=0.2, random_state=KFOLD_SEED)
    ensembler = BalanceCascade(estimator=dt,
                               n_max_subset=10,
                               random_state=KFOLD_SEED)

    print("fitting sample")
    X_res, y_res = ensembler.fit_sample(features, labels_1d)
    print(X_res.shape, y_res.shape)

    print("training")

    # Merge all sample batches in a single call instead of concatenating
    # pairwise inside a loop (which re-copies the accumulator each time).
    Xs = np.concatenate([np.array(batch) for batch in X_res])
    ys = np.concatenate([np.array(batch) for batch in y_res])

    print(Xs.shape, ys.shape)
    # BUG FIX: sklearn.utils.shuffle returns shuffled copies rather than
    # shuffling in place; the original discarded the return value, so the
    # data was never actually shuffled.  (Assumes `shuffle` is
    # sklearn.utils.shuffle — a two-argument call rules out random.shuffle.)
    Xs, ys = shuffle(Xs, ys)

    # Generate more synthetic samples
    if smote is not None:
        Xs, ys = smote.fit_sample(Xs, ys)

    Xs, ys = shuffle(Xs, ys)
    ys = to_categorical(ys, 2)

    return Xs, ys
# Example no. 14
# 0
def ensemble_adaboost(feat, label):
    """Resample ``feat``/``label`` with BalanceCascade using the
    'adaboost' estimator alias and return the resampled pair.
    """
    # Log the label type and class distribution before resampling.
    print(type(label))
    print(Counter(label))
    cascade = BalanceCascade(random_state=19, estimator='adaboost')
    feat_resampled, label_resampled = cascade.fit_sample(feat, label)
    print(label_resampled.shape)
    return feat_resampled, label_resampled
def test_rf_wth_bootstrap():
    """Test the fit and sample routine with auto ratio with a random
    forest."""

    # Define the ratio parameter
    ratio = 'auto'
    classifier = RandomForestClassifier(random_state=RND_SEED)

    # Create the sampling object
    # NOTE(review): despite the test name, bootstrap is disabled here —
    # confirm the name/parameter mismatch is intentional.
    bc = BalanceCascade(
        ratio=ratio,
        random_state=RND_SEED,
        return_indices=True,
        estimator=classifier,
        bootstrap=False)

    # Get the different subset
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)

    # Expected output: two subsets of different lengths (16 and 13
    # samples), hence the ragged object array.
    X_gt = np.array(
        [
            np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]),
            np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.77481731, 0.60935141],
                      [0.3084254, 0.33299982], [0.28893132, -0.38761769],
                      [0.9281014, 0.53085498]])
        ],
        dtype=object)
    y_gt = np.array(
        [
            np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]),
            np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
        ],
        dtype=object)
    # Indices of the selected samples in the original X/Y arrays.
    idx_gt = np.array(
        [
            np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5,
                      13]),
            np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 1, 7, 9, 15])
        ],
        dtype=object)

    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
        assert_array_equal(idx_under[idx], idx_gt[idx])
def test_rf_wth_bootstrap():
    """Check fit_sample with a RandomForestClassifier estimator and
    ``bootstrap=False`` (ground truth pinned from a run with RND_SEED).
    """
    # Define the ratio parameter
    ratio = 'auto'
    classifier = RandomForestClassifier(random_state=RND_SEED)

    # Create the sampling object
    # NOTE(review): despite the test name, bootstrap is disabled here —
    # confirm the name/parameter mismatch is intentional.
    bc = BalanceCascade(
        ratio=ratio,
        random_state=RND_SEED,
        return_indices=True,
        estimator=classifier,
        bootstrap=False)

    # Get the different subset
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)

    # Expected output: two subsets of different lengths (16 and 13
    # samples), hence the ragged object array.
    X_gt = np.array(
        [
            np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]),
            np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.77481731, 0.60935141],
                      [0.3084254, 0.33299982], [0.28893132, -0.38761769],
                      [0.9281014, 0.53085498]])
        ],
        dtype=object)
    y_gt = np.array(
        [
            np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]),
            np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
        ],
        dtype=object)
    # Indices of the selected samples in the original X/Y arrays.
    idx_gt = np.array(
        [
            np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5,
                      13]),
            np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 1, 7, 9, 15])
        ],
        dtype=object)

    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
        assert_array_equal(idx_under[idx], idx_gt[idx])
def test_fit_sample_auto_early_stop_2():
    """Test the fit and sample routine with auto ratio with a 2 subsets."""

    # Define the ratio parameter
    ratio = 'auto'
    n_subset = 2

    # Create the sampling object
    bc = BalanceCascade(ratio=ratio,
                        random_state=RND_SEED,
                        return_indices=True,
                        n_max_subset=n_subset)

    # Get the different subset
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)

    # Expected output: two subsets of different lengths (16 and 15
    # samples), hence the ragged object array.
    X_gt = np.array([
        np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                  [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                  [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                  [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                  [1.15514042, 0.0129463], [0.08711622, 0.93259929],
                  [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                  [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                  [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]),
        np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                  [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                  [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                  [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                  [1.15514042, 0.0129463], [0.70472253, -0.73309052],
                  [-0.18410027, -0.45194484], [0.77481731, 0.60935141],
                  [0.3084254, 0.33299982], [0.28893132, -0.38761769],
                  [0.9281014, 0.53085498]])
    ],
                    dtype=object)
    y_gt = np.array([
        np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]),
        np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])
    ],
                    dtype=object)
    # Indices of the selected samples.
    idx_gt = np.array([
        np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 11, 4, 10, 2, 8, 1, 7]),
        np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 4, 8, 0, 3, 5, 9])
    ],
                      dtype=object)

    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
        assert_array_equal(idx_under[idx], idx_gt[idx])
# Example no. 18
# 0
def unbalanceProcess(params, X_train, y_train):
    """Undersample the majority class with BalanceCascade and return the
    first generated subset.
    """
    # Count each class; class 0 is down-sampled, class 1 kept in full.
    n_class0 = np.sum(y_train == 0)
    n_class1 = y_train.shape[0] - n_class0
    # Target counts: keep 20% of class 0 and 100% of class 1.
    target_counts = {0: int(n_class0 * 0.2), 1: int(n_class1 * 1)}

    y_train = y_train.astype("int")
    cascade = BalanceCascade(
        sampling_strategy=target_counts,  # replacement=True,
        random_state=params['random-state'],
        n_max_subset=10,
        estimator=LogisticRegression(solver='sag', max_iter=200,
                                     random_state=0))

    X_subsets, y_subsets = cascade.fit_sample(X_train, y_train)
    # Only the first balanced subset is used downstream.
    return X_subsets[0], y_subsets[0]
# Example no. 19
# 0
def cross_validation_ensenble(name):
    """Run 5-fold cross-validation of a BalanceCascade-based XGBoost
    ensemble and print averaged precision, recall, and F-value.

    Parameters
    ----------
    name : str
        Suffix of the pickled training-data file to load.
    """
    with open('../data/conv_pred/train_data2_' + name + '.pickle', 'rb') as f:
        data = pickle.load(f)
    v = DictVectorizer()
    X = v.fit_transform(data['X'])
    y = np.array(data['y'])

    cv = 5
    kf = KFold(n_splits=cv)
    fscore = 0
    ftscore = 0
    all_f_value = 0
    for train_index, test_index in tqdm(kf.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        ensenble_num = 1

        # One balanced subset per ensemble member.
        bc = BalanceCascade(estimator=LogisticRegression(),
                            n_max_subset=ensenble_num)
        bc_x, bc_y = bc.fit_sample(X_train, y_train)
        # BUG FIX: the original appended a single model outside the
        # per-subset loop, raising IndexError for any ensenble_num > 1.
        # Create one model per balanced subset instead (identical
        # behavior at ensenble_num == 1).
        models = [xgb.XGBClassifier(n_estimators=500, max_delta_step=1)
                  for _ in range(ensenble_num)]
        for i in range(ensenble_num):
            models[i].fit(bc_x[i], bc_y[i])
        predicts = [models[i].predict_proba(X_test)
                    for i in range(ensenble_num)]
        # Average the per-model class probabilities for each test sample.
        final_result = []
        for i in range(len(predicts[0])):
            result = [0, 0]
            for j in range(ensenble_num):
                result[0] += predicts[j][i][0] / ensenble_num
                result[1] += predicts[j][i][1] / ensenble_num
            final_result.append(result)
        # NOTE(review): `eval` shadows the builtin — presumably a
        # project-local metric function returning (precision, recall,
        # f_value, _); confirm and consider renaming.
        precision, recall, f_value, _ = eval(y_test, final_result)
        fscore += precision
        ftscore += recall
        all_f_value += f_value
    print('\n')
    print('final precision : ', str(fscore / cv))
    print('final recall : ', str(ftscore / cv))
    print('final f-value : ', str(all_f_value / cv))
def test_fit_sample_auto():
    """Check fit_sample with ``ratio='auto'`` and ``return_indices=True``.

    Two balanced subsets are expected; values pinned from a run seeded
    with RND_SEED.
    """
    ratio = 'auto'
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
                        return_indices=True)
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)
    # Expected output: two subsets of 16 samples each (8 per class).
    X_gt = np.array([[[1.15514042, 0.0129463],
                      [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052],
                      [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342],
                      [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981],
                      [-1.11515198, -0.93689695],
                      [0.11622591, -0.0317206],
                      [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976],
                      [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207],
                      [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653],
                      [1.70580611, -0.11219234]],
                     [[0.28893132, -0.38761769],
                      [0.83680821, 1.72827342],
                      [0.3084254, 0.33299982],
                      [0.70472253, -0.73309052],
                      [-0.14374509, 0.27370049],
                      [0.77481731, 0.60935141],
                      [-0.18410027, -0.45194484],
                      [1.15514042, 0.0129463],
                      [0.11622591, -0.0317206],
                      [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976],
                      [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207],
                      [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653],
                      [1.70580611, -0.11219234]]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
    # Indices of the selected samples in the original X/Y arrays.
    idx_gt = np.array(
        [[10, 18, 8, 16, 6, 14, 5, 13, 0, 2, 3, 4, 11, 12, 17, 19],
         [9, 6, 7, 8, 16, 1, 14, 10, 0, 2, 3, 4, 11, 12, 17, 19]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
# Example no. 21
# 0
def test_fit_sample_half():
    """Check fit/sample with a fixed 0.5 ratio against .npy fixtures."""
    # Sample with a float ratio of 0.5.
    cascade = BalanceCascade(ratio=0.5, random_state=RND_SEED)
    X_res, y_res = cascade.fit_sample(X, Y)

    # Ground-truth subsets are stored as .npy fixtures next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    X_expected = np.load(os.path.join(data_dir, 'bc_x_05.npy'))
    y_expected = np.load(os.path.join(data_dir, 'bc_y_05.npy'))

    # Compare every generated subset against its fixture.
    for subset in range(X_expected.size):
        assert_array_equal(X_res[subset], X_expected[subset])
        assert_array_equal(y_res[subset], y_expected[subset])
def test_fit_sample_half():
    """Test the fit and sample routine with 0.8 ratio and no bootstrap."""

    # Define the ratio parameter
    ratio = 0.8

    # Create the sampling object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, bootstrap=False)

    # Get the different subset
    X_resampled, y_resampled = bc.fit_sample(X, Y)

    # Expected output: two subsets of different lengths (18 and 13
    # samples), hence the ragged object array.
    X_gt = np.array(
        [
            np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
                      [0.9281014, 0.53085498], [0.3084254, 0.33299982]]),
            np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.70472253, -0.73309052],
                      [-0.18410027, -0.45194484], [0.77481731, 0.60935141],
                      [0.28893132, -0.38761769]])
        ],
        dtype=object)

    y_gt = np.array(
        [
            np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
            np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
        ],
        dtype=object)
    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
# Example no. 23
# 0
def test_fit_sample_auto_early_stop():
    """Check fit_sample with a LinearSVC estimator and a single subset
    (``n_max_subset=1``).

    The expected subset below is pinned from a run seeded with RND_SEED.
    """
    sampling_strategy = 'auto'
    estimator = LinearSVC(random_state=RND_SEED)
    bc = BalanceCascade(sampling_strategy=sampling_strategy,
                        random_state=RND_SEED,
                        return_indices=False,
                        estimator=estimator,
                        n_max_subset=1)
    X_resampled, y_resampled = bc.fit_sample(X, Y)
    # Expected output: one balanced subset of 16 samples (8 per class).
    X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
                      [0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
# Example no. 24
# 0
def smot(train_x, train_y, feature_columns):
    """Resample ``train_x``/``train_y`` with BalanceCascade and rebuild
    them as a DataFrame/Series pair.

    NOTE(review): BalanceCascade.fit_sample returns one array per subset;
    this code appears to assume a single flat subset — the indexing
    ``x[0]`` below and passing ``X_res`` straight to ``pd.DataFrame``
    need confirming against the installed imblearn version.
    """
    from imblearn.ensemble import BalanceCascade
    from sklearn.ensemble import RandomForestClassifier

    #sm = RandomOverSampler(ratio='majority')
    #from imblearn.ensemble import BalanceCascade

    # `classifier=` is the old-API keyword for the internal estimator.
    sm = BalanceCascade(random_state=42, classifier=RandomForestClassifier())

    print('Détail du nombre par CLASSE  Y {}'.format(Counter(train_y)))
    X_res, y_res = sm.fit_sample(train_x, train_y)

    # Take the first element of each row of y_res.
    # NOTE(review): `map` returns a lazy iterator in Python 3 — verify
    # the installed pandas accepts an iterator for Series construction.
    my_list = map(lambda x: x[0], y_res)
    train_y = pd.Series(my_list)
    print(' Détail du nombre par CLASSE Y  {}'.format(Counter(train_y)))

    # reconstitution DATAFRAME
    train_x = pd.DataFrame(X_res, columns=feature_columns)

    return train_x, train_y
# Example no. 25
# 0
def test_fit_sample_half():
    """Test the fit and sample routine with a 0.8 ratio.

    NOTE(review): the function name says "half" but the ratio used (and the
    fixtures below) correspond to 0.8 — the docstring previously said 0.5.
    """

    # Define the ratio parameter
    ratio = 0.8

    # Create the sampling object (no bootstrap: sample without replacement)
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, bootstrap=False)

    # Get the different subsets; each entry of the result is one subset
    X_resampled, y_resampled = bc.fit_sample(X, Y)

    X_gt = np.array([
        np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                  [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                  [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                  [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                  [1.15514042, 0.0129463], [0.08711622, 0.93259929],
                  [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                  [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                  [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
                  [0.9281014, 0.53085498], [0.3084254, 0.33299982]]),
        np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                  [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                  [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                  [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                  [1.15514042, 0.0129463], [0.70472253, -0.73309052],
                  [-0.18410027, -0.45194484], [0.77481731, 0.60935141],
                  [0.28893132, -0.38761769]])
    ],
                    dtype=object)

    y_gt = np.array([
        np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
        np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    ],
                    dtype=object)
    # Check each array (subsets have different sizes, so compare one by one)
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
def test_fit_sample_auto():
    """Fit/sample with ratio='auto' and compare against stored fixtures."""
    cascade = BalanceCascade(ratio='auto', random_state=RND_SEED,
                             return_indices=True)
    X_res, y_res, indices = cascade.fit_sample(X, Y)

    # Ground-truth arrays live next to this test module.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    X_gt = np.load(os.path.join(data_dir, 'bc_x.npy'))
    y_gt = np.load(os.path.join(data_dir, 'bc_y.npy'))
    idx_gt = np.load(os.path.join(data_dir, 'bc_idx.npy'))

    # Subsets have varying sizes, so compare them one at a time.
    for subset in range(X_gt.size):
        assert_array_equal(X_res[subset], X_gt[subset])
        assert_array_equal(y_res[subset], y_gt[subset])
        assert_array_equal(indices[subset], idx_gt[subset])
Exemplo n.º 27
0
from sklearn.linear_model import LassoCV

from src.common.my_data import Data

data = Data()

# BUGFIX: drop(..., inplace=True) returns None, which left X1 = None and made
# the pd.concat below fail. Drop without inplace and keep the result.
X1 = pd.read_table(data.output.sorted_train_agg_have_log_usr).drop('USRID', axis=1)
X2 = pd.read_table(data.feature.tf_idf_have_log_usr_evt)
Y = pd.read_table(data.output.sorted_train_flg_have_log_usr)['FLAG']
X = pd.concat([X1, X2], axis=1)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
print('Original dataset shape {}'.format(Counter(y_train)))

# BalanceCascade yields one balanced subset per iteration; train on the first.
bc = BalanceCascade(random_state=40, estimator ='adaboost')
x_res, y_res = bc.fit_sample(x_train, y_train)
print('Resampled dataset shape {}'.format(Counter(y_res[0])))

x_res_train = x_res[0]
y_res_train = y_res[0]

# Support vector machine
svc = LinearSVC(dual=False)
svc.fit(x_res_train, y_res_train)
result_svc = svc.predict(x_test)

# Logistic regression
lgc = LogisticRegressionCV()
lgc.fit(x_res_train, y_res_train)
result_lgc = lgc.predict(x_test)
Exemplo n.º 28
0
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn_pandas import DataFrameMapper

now = time.time()
testData = pd.read_pickle(r'E:\tencent\input\noonehot\train')
testData = testData.fillna(0)
testData_x = testData.iloc[:, 3:]
# BUGFIX: iloc[:, :1] returns a one-column DataFrame, so Counter(testData_y)
# counted the column *name* rather than the labels, and fit_sample received a
# column vector for y. Select the label column as a Series instead.
testData_y = testData.iloc[:, 0]

#x,y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)

print('Original dataset shape {}'.format(Counter(testData_y)))
# Resample: BalanceCascade returns one balanced subset per iteration
bc = BalanceCascade(random_state=42)
X_res, y_res = bc.fit_sample(testData_x, testData_y)
#print(X_res)
#print(y_res)
print('Resampled dataset shape {}'.format(Counter(y_res[0])))
print(time.time() - now)
# Build the training set from the first balanced subset
#trains = train_data.drop('f1')
labels = pd.DataFrame(y_res[0])
trains = pd.DataFrame(X_res[0])
#trains.columns = ['creativeID', 'userID', 'positionID', 'connectionType', 'telecomsOperator', 'age', 'gender', 'education', 'marriageStatus', 'haveBaby', 'hometown', 'residence', 'advertiserID', 'appID', 'appPlatform', 'sitesetID', 'positionType']
'''
mapper =DataFrameMapper([
            ('education',LabelBinarizer()),
            (['positionID','connectionType','telecomsOperator','gender','education','marriageStatus',
              'haveBaby','appPlatform','sitesetID','positionType'],OneHotEncoder()),
           # ('hometown',[FunctionTransformer(lambda x: x%100),MultiLabelBinarizer()]), 
print(__doc__)

# Generate a 2-class imbalanced dataset (weights 0.3 / 0.7)
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.3, 0.7],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the Balance Cascade method; each entry of X_resampled/y_resampled
# is one balanced subset produced by the cascade
bc = BalanceCascade()
X_resampled, y_resampled = bc.fit_sample(X, y)
X_res_vis = []
for X_res in X_resampled:
    # project every subset into the same 2D PCA space as the original data
    X_res_vis.append(pca.transform(X_res))

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5)
ax1.set_title('Original set')

# Right panel: class #0 once, then one scatter per resampled subset of class #1
ax2.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5)
for iy, e in enumerate(X_res_vis):
    ax2.scatter(e[y_resampled[iy] == 1, 0], e[y_resampled[iy] == 1, 1],
                label="Class #1 - set #{}".format(iy), alpha=0.5)
train = pd.concat([train, dummied], axis=1)

# 70/30 train/test split with a fixed seed for reproducibility
train_x, test_x, train_y, test_y = train_test_split(train,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=1)
# NOTE(review): value_counts() is sorted by frequency, so [0] is the majority
# class count and [1] the minority count — confirm that matches the labels
# printed below.
print("正负样本数:正样本数:{}\n负样本数:{}\n正负样本比例:{}".format(
    train_y.value_counts()[0],
    train_y.value_counts()[1],
    train_y.value_counts()[0] / train_y.value_counts()[1]))

# For the imbalanced data, resample to generate 15 new balanced datasets

n_subset = 15
BC = BalanceCascade(ratio='auto', n_max_subset=n_subset, random_state=123)
train_xBC, train_yBC = BC.fit_sample(train_x, train_y)
#for ii in np.arange(n_subset):
#    print(pd.value_counts(train_yBC[ii,:]))

# Each subset is balanced (1308 samples per class in the original run); next,
# train one boosted classifier per subset with randomly drawn hyperparameters.
lgbmodels = []
n_estimator = np.arange(500, 800, 50)
n_estimator = np.random.choice(n_estimator, n_subset)
max_depth = np.arange(6, 10, 1)
max_depth = np.random.choice(max_depth, n_subset)
num_leaves = np.arange(40, 70, 5)
num_leaves = np.random.choice(num_leaves, n_subset)
reg_alpha = np.arange(0, 2, 0.5)
reg_alpha = np.random.choice(reg_alpha, n_subset)
feature_fraction = [0.6, 0.7, 0.8, 0.9]
feature_fraction = np.random.choice(feature_fraction, n_subset)
Exemplo n.º 31
0
def test_init_wrong_classifier():
    """An unknown classifier string must raise NotImplementedError on fit."""
    cascade = BalanceCascade(classifier='rnd')
    with raises(NotImplementedError):
        cascade.fit_sample(X, Y)
       'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle',
       'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType',
       'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars',
       'BasePolicy', 'FraudFound_P'])
sns.set_style('whitegrid')
sns.countplot(x = 'FraudFound_P', data = d, palette = 'RdBu_r')

'''
# # Balance Cascade

# In[5]:


# Iteratively under-sample the majority class; each entry of X_res / y_res is
# one balanced subset produced by the cascade.
from imblearn.ensemble import BalanceCascade
bc = BalanceCascade(random_state=42)
X_res, y_res = bc.fit_sample(X_train, y_train)


# In[7]:


#p = np.c_[X_res,y_res]
'''d = pd.DataFrame(p, columns = ['WeekOfMonth', 'WeekOfMonthClaimed', 'Age',
       'PolicyNumber', 'RepNumber', 'Deductible', 'DriverRating', 'Year',
       'Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed',
       'MonthClaimed', 'Sex', 'MaritalStatus', 'Fault', 'PolicyType',
       'VehicleCategory', 'VehiclePrice', 'Days_Policy_Accident',
       'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle',
       'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType',
       'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars',
       'BasePolicy', 'FraudFound_P'])
Exemplo n.º 33
0
# BUGFIX: DataFrame.ix was deprecated and removed from pandas; use iloc for
# positional indexing to take the last (label) column.
y_train = y_train.iloc[:, -1]

# Split features and labels, e.g.:
# X_train, y_train = data15.iloc[:, 0:-1], data15.iloc[:, -1]
# X_test, y_test = data21.iloc[:, 0:-1], data21.iloc[:, -1]

# Standardize features to zero mean / unit variance
zscore = preprocessing.StandardScaler()
X_train = zscore.fit_transform(X_train)
# X_test = zscore.fit_transform(X_test)

# Build the cascade of balanced subsets (original note: n_max_subset=4),
# guided by an AdaBoost estimator and sampling with replacement
bc = BalanceCascade(random_state=0,
                    estimator=ADB(random_state=0),
                    bootstrap=True)
# bc = BalanceCascade(random_state=0,estimator=ADB(random_state=0),n_max_subset=15)
x_resampled, y_resampled = bc.fit_sample(X_train, y_train)
# x_resampled, y_resampled = bc.fit_sample(X_test, y_test )

print(x_resampled.shape)  # overview of the resampled feature subsets
print(y_resampled.shape)  # overview of the resampled label subsets

index_num = 1  # index of the subset to inspect
# To inspect one subset, wrap it in DataFrames and group by label, e.g.:
#   x_t = pd.DataFrame(x_resampled[index_num], columns=[...feature names...])
#   y_t = pd.DataFrame(y_resampled[index_num], columns=['label'])
#   print(pd.concat([x_t, y_t], axis=1).groupby('label').count())
Exemplo n.º 34
0
data = Data()

# BUGFIX: drop(..., inplace=True) returns None, which left X1 = None and made
# the pd.concat below fail. Drop without inplace and keep the result.
X1 = pd.read_table(data.output.sorted_train_agg_have_log_usr).drop(
    'USRID', axis=1)
X2 = pd.read_table(data.feature.tf_idf_have_log_usr_evt)
Y = pd.read_table(data.output.sorted_train_flg_have_log_usr)['FLAG']
X = pd.concat([X1, X2], axis=1)

# x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)
# print('Original dataset shape {}'.format(Counter(y_train)))
test_agg = pd.read_table(data.input.test_agg)
# NOTE(review): `result` was used without being initialised (NameError);
# start from an empty frame so the USRID column can be assigned.
result = pd.DataFrame()
result["USRID"] = test_agg['USRID']
test_noID = test_agg.drop('USRID', axis=1)

# One balanced subset per cascade iteration; train on the first one below.
bc = BalanceCascade(random_state=40, estimator='adaboost')
x_res, y_res = bc.fit_sample(X, Y)
print('Resampled dataset shape {}'.format(Counter(y_res[0])))

# Support vector machine
# svc = LinearSVC(dual=False)
# svc.fit(x_train,y_train)
# result_svc = svc.predict(x_test)
#
# Logistic regression
# lgc = LogisticRegressionCV()
# lgc.fit(x_train,y_train)
# result_lgc = lgc.predict(x_test)

# Linear regression (Lasso with built-in cross-validation)
lasso = LassoCV()
lasso.fit(x_res[0], y_res[0])
Exemplo n.º 35
0
from imblearn.ensemble import BalanceCascade

# Generate a strongly imbalanced 2-class dataset (weights 0.1 / 0.9)
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the Balance Cascade method; each entry of X_resampled/y_resampled
# is one balanced subset produced by the cascade
bc = BalanceCascade()
X_resampled, y_resampled = bc.fit_sample(X, y)
X_res_vis = []
for X_res in X_resampled:
    # project every subset into the same 2D PCA space as the original data
    X_res_vis.append(pca.transform(X_res))

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
Exemplo n.º 36
0
from imblearn.ensemble import BalanceCascade
from sklearn.svm import SVC

# Load the features, labels and test set serialized earlier.
with open('values.pkl', 'rb') as f:
    x_train = pickle.load(f)
    y_train = pickle.load(f)
    test = pickle.load(f)

x_train = np.array(x_train)
y_train = np.array(y_train)
test = np.array(test)

# Under-sample into at most 5 balanced subsets, guided by an SVC estimator.
bc = BalanceCascade(estimator=SVC(gamma='auto'),
                    random_state=100,
                    n_max_subset=5)
x_train_resam, y_train_resam = bc.fit_sample(x_train, y_train)

with open('values_undersampling.pkl', 'wb') as f:
    pickle.dump(x_train_resam, f)
    pickle.dump(y_train_resam, f)
    pickle.dump(test, f)

# In[13]:

with open('values_undersampling.pkl', 'rb') as f:
    x_train_resam = pickle.load(f)
    y_train_resam = pickle.load(f)
    # BUGFIX: was `test = pickle(f)` — the pickle *module* is not callable.
    test = pickle.load(f)

# ### Xgbosst