def test_give_classifier_wrong_obj():
    """An integer is not a valid `estimator`: fit_sample must raise ValueError."""
    balancer = BalanceCascade(ratio='auto', random_state=RND_SEED,
                              return_indices=True, estimator=2)
    with raises(ValueError, match="Invalid parameter `estimator`"):
        balancer.fit_sample(X, Y)
def test_give_classifier_wrong_obj():
    """Passing a non-estimator via `estimator` must raise ValueError (new
    `sampling_strategy` API)."""
    balancer = BalanceCascade(sampling_strategy='auto',
                              random_state=RND_SEED,
                              return_indices=True,
                              estimator=2)
    with raises(ValueError, match="Invalid parameter `estimator`"):
        balancer.fit_sample(X, Y)
# NOTE(review): collapsed one-liner (statements lack separators; not valid
# Python as written).  Verifies that a RandomForestClassifier passed via
# `estimator` reproduces one hard-coded balanced subset (8 majority + 8
# minority samples).
def test_give_classifier_obj(): ratio = 'auto' classifier = RandomForestClassifier(random_state=RND_SEED) bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, return_indices=False, estimator=classifier) X_resampled, y_resampled = bc.fit_sample(X, Y) X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [-0.18410027, -0.45194484], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], [0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]]) y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt)
# NOTE(review): collapsed one-liner.  Checks fit_sample with a float
# ratio=0.8 against one hard-coded subset (10 minority + 8 majority).
# Same name as other `test_fit_sample_half` variants in this file; if kept
# in one module the later definition shadows this one.
def test_fit_sample_half(): ratio = 0.8 bc = BalanceCascade(ratio=ratio, random_state=RND_SEED) X_resampled, y_resampled = bc.fit_sample(X, Y) X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [-0.18410027, -0.45194484], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], [0.9281014, 0.53085498], [0.3084254, 0.33299982], [0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]]) y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt)
# NOTE(review): collapsed one-liner.  `sampling_strategy='auto'` with
# return_indices=True: expects exactly two balanced subsets plus the
# matching indices into the original X.
def test_fit_sample_auto(): sampling_strategy = 'auto' bc = BalanceCascade(sampling_strategy=sampling_strategy, random_state=RND_SEED, return_indices=True) X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [-0.18410027, -0.45194484], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], [0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234]], [[0.28893132, -0.38761769], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.77481731, 0.60935141], [-0.18410027, -0.45194484], [1.15514042, 0.0129463], [0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]]) y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) idx_gt = np.array( [[10, 18, 8, 16, 6, 14, 5, 13, 0, 2, 3, 4, 11, 12, 17, 19], [9, 6, 7, 8, 16, 1, 14, 10, 0, 2, 3, 4, 11, 12, 17, 19]]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) assert_array_equal(idx_under, idx_gt)
# NOTE(review): collapsed one-liner.  Uses a dict sampling strategy
# ({0: 8, 1: 10}) rather than a float; name is misleading (no "half") and
# duplicates other `test_fit_sample_half` definitions in this file.
def test_fit_sample_half(): ratio = {0: 8, 1: 10} bc = BalanceCascade(ratio=ratio, random_state=RND_SEED) X_resampled, y_resampled = bc.fit_sample(X, Y) X_gt = np.array([[[-0.41635887, -0.38299653], [0.53366841, -0.30312976], [1.25192108, -0.22367336], [1.70580611, -0.11219234], [1.52091956, -0.49283504], [0.11622591, -0.0317206], [1.31301027, -0.92648734], [0.88407872, 0.35454207], [0.3084254, 0.33299982], [0.08711622, 0.93259929], [-0.28162401, -2.10400981], [-0.14374509, 0.27370049], [0.9281014, 0.53085498], [-0.18410027, -0.45194484], [0.77481731, 0.60935141], [1.15514042, 0.0129463], [-1.11515198, -0.93689695], [0.70472253, -0.73309052]]]) y_gt = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt)
def test_fit_sample_auto_gradient_boosting():
    """Fit/sample with ratio='auto' and a gradient-boosting classifier.

    Ground-truth arrays live in the `data/` directory next to this module.
    """
    sampler = BalanceCascade(ratio='auto', random_state=RND_SEED,
                             return_indices=True,
                             classifier='gradient-boosting')
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    X_gt = np.load(os.path.join(data_dir, 'bc_x_gb.npy'))
    y_gt = np.load(os.path.join(data_dir, 'bc_y_gb.npy'))
    idx_gt = np.load(os.path.join(data_dir, 'bc_idx_gb.npy'))
    # Compare every generated subset against its stored reference.
    for subset in range(X_gt.size):
        assert_array_equal(X_resampled[subset], X_gt[subset])
        assert_array_equal(y_resampled[subset], y_gt[subset])
        assert_array_equal(idx_under[subset], idx_gt[subset])
# NOTE(review): collapsed one-liner; duplicate of the L3 variant except the
# local is named `estimator` instead of `classifier`.  Same hard-coded
# single-subset ground truth.
def test_give_classifier_obj(): ratio = 'auto' estimator = RandomForestClassifier(random_state=RND_SEED) bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, return_indices=False, estimator=estimator) X_resampled, y_resampled = bc.fit_sample(X, Y) X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [-0.18410027, -0.45194484], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], [0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]]) y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt)
# NOTE(review): collapsed one-liner.  Uses the string classifier
# 'linear-svm' (legacy `classifier=` keyword) and expects two hard-coded
# balanced subsets.
def test_fit_sample_auto_linear_svm(): ratio = 'auto' classifier = 'linear-svm' bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, return_indices=False, classifier=classifier) X_resampled, y_resampled = bc.fit_sample(X, Y) X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [-0.18410027, -0.45194484], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], [0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234]], [[1.15514042, 0.0129463], [0.9281014, 0.53085498], [0.3084254, 0.33299982], [0.28893132, -0.38761769], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.70472253, -0.73309052], [0.77481731, 0.60935141], [0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]]) y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt)
def test_fit_sample_auto_early_stop():
    """Fit/sample with ratio='auto' and a fixed maximum of 4 subsets.

    Ground truth is loaded from .npy files in the `data/` directory.
    """
    sampler = BalanceCascade(ratio='auto', random_state=RND_SEED,
                             return_indices=True, n_max_subset=4)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    X_gt = np.load(os.path.join(data_dir, 'bc_x_n_sub.npy'))
    y_gt = np.load(os.path.join(data_dir, 'bc_y_n_sub.npy'))
    idx_gt = np.load(os.path.join(data_dir, 'bc_idx_n_sub.npy'))
    # Each produced subset must match its stored reference.
    for subset in range(X_gt.size):
        assert_array_equal(X_resampled[subset], X_gt[subset])
        assert_array_equal(y_resampled[subset], y_gt[subset])
        assert_array_equal(idx_under[subset], idx_gt[subset])
# NOTE(review): collapsed one-liner.  n_max_subset=1 must yield exactly one
# balanced subset with the hard-coded samples and indices below.
def test_fit_sample_auto_early_stop(): """Test the fit and sample routine with auto ratio with 1 subset.""" # Define the ratio parameter ratio = 'auto' n_subset = 1 # Create the sampling object bc = BalanceCascade( ratio=ratio, random_state=RND_SEED, return_indices=True, n_max_subset=n_subset) # Get the different subset X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) X_gt = np.array([[[0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234], [1.15514042, 0.0129463], [0.08711622, 0.93259929], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [-0.18410027, -0.45194484], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]]) y_gt = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]) idx_gt = np.array( [[0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, 13]]) # Check each array assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) assert_array_equal(idx_under, idx_gt)
# NOTE(review): collapsed one-liner; byte-for-byte duplicate of the L11
# variant (only the BalanceCascade call was wrapped differently).  One of
# the two copies is redundant.
def test_fit_sample_auto_early_stop(): """Test the fit and sample routine with auto ratio with 1 subset.""" # Define the ratio parameter ratio = 'auto' n_subset = 1 # Create the sampling object bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, return_indices=True, n_max_subset=n_subset) # Get the different subset X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) X_gt = np.array([[[0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234], [1.15514042, 0.0129463], [0.08711622, 0.93259929], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [-0.18410027, -0.45194484], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]]) y_gt = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]) idx_gt = np.array( [[0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, 13]]) # Check each array assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) assert_array_equal(idx_under, idx_gt)
def deep_ensemble_merged(smote=None):
    """Run BalanceCascade on the module-level `features`/`labels_1d`, merge
    every generated subset into one training set, and return it.

    Parameters
    ----------
    smote : sampler or None, optional
        An over-sampler with a ``fit_sample(X, y)`` method (e.g. SMOTE)
        applied to the merged data to generate extra synthetic samples.

    Returns
    -------
    Xs : ndarray of merged (shuffled) feature rows.
    ys : ndarray of one-hot (2-class) labels via ``to_categorical``.
    """
    dt = DecisionTreeClassifier(max_features=0.2, random_state=KFOLD_SEED)
    ensembler = BalanceCascade(estimator=dt, n_max_subset=10,
                               random_state=KFOLD_SEED)
    print("fitting sample")
    X_res, y_res = ensembler.fit_sample(features, labels_1d)
    print(X_res.shape, y_res.shape)
    print("training")
    # Merge sample batches
    Xs = None
    ys = None
    for i in range(len(X_res)):
        if Xs is None:
            Xs = np.array(X_res[i])
            ys = np.array(y_res[i])
        else:
            Xs = np.concatenate((Xs, np.array(X_res[i])))
            ys = np.concatenate((ys, np.array(y_res[i])))
        print(Xs.shape, ys.shape)
    # BUG FIX: sklearn.utils.shuffle returns shuffled *copies* and does not
    # shuffle in place; the original call discarded the return value, so the
    # merged data was never actually shuffled.
    Xs, ys = shuffle(Xs, ys)
    # Generate more synthetic samples
    if smote is not None:
        Xs, ys = smote.fit_sample(Xs, ys)
        Xs, ys = shuffle(Xs, ys)  # same fix as above
    ys = to_categorical(ys, 2)
    return Xs, ys
def ensemble_adaboost(feat, label):
    """Rebalance `feat`/`label` with BalanceCascade using the built-in
    'adaboost' estimator and return the resampled subsets."""
    print(type(label))
    print(Counter(label))
    cascade = BalanceCascade(random_state=19, estimator='adaboost')
    feat_balanced, label_balanced = cascade.fit_sample(feat, label)
    print(label_balanced.shape)
    return feat_balanced, label_balanced
# NOTE(review): collapsed one-liner.  Random forest with bootstrap=False:
# subsets have different lengths, hence the dtype=object ragged arrays for
# the ground truth.
def test_rf_wth_bootstrap(): """Test the fit and sample routine with auto ratio with a random forest.""" # Define the ratio parameter ratio = 'auto' classifier = RandomForestClassifier(random_state=RND_SEED) # Create the sampling object bc = BalanceCascade( ratio=ratio, random_state=RND_SEED, return_indices=True, estimator=classifier, bootstrap=False) # Get the different subset X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) X_gt = np.array( [ np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234], [1.15514042, 0.0129463], [0.08711622, 0.93259929], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [-0.18410027, -0.45194484], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]), np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234], [1.15514042, 0.0129463], [0.77481731, 0.60935141], [0.3084254, 0.33299982], [0.28893132, -0.38761769], [0.9281014, 0.53085498]]) ], dtype=object) y_gt = np.array( [ np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]), np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) ], dtype=object) idx_gt = np.array( [ np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, 13]), np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 1, 7, 9, 15]) ], dtype=object) # Check each array for idx in range(X_gt.size): assert_array_equal(X_resampled[idx], X_gt[idx]) assert_array_equal(y_resampled[idx], y_gt[idx]) assert_array_equal(idx_under[idx], idx_gt[idx])
# NOTE(review): collapsed one-liner; duplicate of the L15 variant minus the
# docstring.  One of the two copies is redundant.
def test_rf_wth_bootstrap(): # Define the ratio parameter ratio = 'auto' classifier = RandomForestClassifier(random_state=RND_SEED) # Create the sampling object bc = BalanceCascade( ratio=ratio, random_state=RND_SEED, return_indices=True, estimator=classifier, bootstrap=False) # Get the different subset X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) X_gt = np.array( [ np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234], [1.15514042, 0.0129463], [0.08711622, 0.93259929], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [-0.18410027, -0.45194484], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]), np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234], [1.15514042, 0.0129463], [0.77481731, 0.60935141], [0.3084254, 0.33299982], [0.28893132, -0.38761769], [0.9281014, 0.53085498]]) ], dtype=object) y_gt = np.array( [ np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]), np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) ], dtype=object) idx_gt = np.array( [ np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, 13]), np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 1, 7, 9, 15]) ], dtype=object) # Check each array for idx in range(X_gt.size): assert_array_equal(X_resampled[idx], X_gt[idx]) assert_array_equal(y_resampled[idx], y_gt[idx]) assert_array_equal(idx_under[idx], idx_gt[idx])
# NOTE(review): collapsed one-liner.  n_max_subset=2 must yield exactly two
# (ragged, dtype=object) subsets; second subset has 15 samples.
def test_fit_sample_auto_early_stop_2(): """Test the fit and sample routine with auto ratio with a 2 subsets.""" # Define the ratio parameter ratio = 'auto' n_subset = 2 # Create the sampling object bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, return_indices=True, n_max_subset=n_subset) # Get the different subset X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) X_gt = np.array([ np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234], [1.15514042, 0.0129463], [0.08711622, 0.93259929], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [-0.18410027, -0.45194484], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]), np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234], [1.15514042, 0.0129463], [0.70472253, -0.73309052], [-0.18410027, -0.45194484], [0.77481731, 0.60935141], [0.3084254, 0.33299982], [0.28893132, -0.38761769], [0.9281014, 0.53085498]]) ], dtype=object) y_gt = np.array([ np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]), np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) ], dtype=object) idx_gt = np.array([ np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 11, 4, 10, 2, 8, 1, 7]), np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 4, 8, 0, 3, 5, 9]) ], dtype=object) # Check each array for idx in range(X_gt.size): assert_array_equal(X_resampled[idx], X_gt[idx]) assert_array_equal(y_resampled[idx], y_gt[idx]) assert_array_equal(idx_under[idx], idx_gt[idx])
def unbalanceProcess(params, X_train, y_train):
    """Under-sample an imbalanced training set with BalanceCascade.

    Keeps 20% of class 0 and all of class 1 per subset, driven by a SAG
    logistic regression, and returns only the first generated subset.
    """
    pos_num = np.sum(y_train == 0)
    neg_num = y_train.shape[0] - pos_num
    # Target per-class sample counts for each cascade subset.
    strategy = {0: int(pos_num * 0.2), 1: int(neg_num * 1)}
    y_train = y_train.astype("int")
    cascade = BalanceCascade(
        sampling_strategy=strategy,  # replacement=True,
        random_state=params['random-state'],
        n_max_subset=10,
        estimator=LogisticRegression(solver='sag', max_iter=200,
                                     random_state=0))
    X_subsets, y_subsets = cascade.fit_sample(X_train, y_train)
    # Downstream code consumes a single (2-D) training set.
    return X_subsets[0], y_subsets[0]
# NOTE(review): collapsed one-liner.  5-fold CV where each fold trains an
# ensemble of XGBoost models on BalanceCascade subsets and averages their
# predicted probabilities.  With ensenble_num=1 only one model is appended,
# matching the range(ensenble_num) loops.  `eval` here shadows the builtin
# -- presumably a project metric function returning (precision, recall,
# f_value, _); verify against its definition.
def cross_validation_ensenble(name): with open('../data/conv_pred/train_data2_' + name + '.pickle', 'rb') as f: data = pickle.load(f) v = DictVectorizer() X = v.fit_transform(data['X']) y = np.array(data['y']) cv = 5 kf = KFold(n_splits=cv) fscore = 0 ftscore = 0 all_f_value = 0 for train_index, test_index in tqdm(kf.split(X)): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] ensenble_num = 1 bc = BalanceCascade(estimator=LogisticRegression(), n_max_subset=ensenble_num) bc_x, bc_y = bc.fit_sample(X_train, y_train) models = [] predicts = [] final_result = [] models.append(xgb.XGBClassifier(n_estimators=500, max_delta_step=1)) for i in range(ensenble_num): models[i].fit(bc_x[i], bc_y[i]) for i in range(ensenble_num): predicts.append(models[i].predict_proba(X_test)) for i in range(len(predicts[0])): result = [0, 0] for j in range(ensenble_num): result[0] += predicts[j][i][0] / ensenble_num result[1] += predicts[j][i][1] / ensenble_num final_result.append(result) precision, recall, f_value, _ = eval(y_test, final_result) fscore += precision ftscore += recall all_f_value += f_value # pprint(sorted( # zip(np.mean([est.steps[1][1].feature_importances_ for est in model.estimators_], axis=0), v.feature_names_), # key=lambda x: x[0], reverse=True)) print('\n') print('final precision : ', str(fscore / cv)) print('final recall : ', str(ftscore / cv)) print('final f-value : ', str(all_f_value / cv))
# NOTE(review): collapsed one-liner; same expectations as the L5 variant
# but using the legacy `ratio=` keyword instead of `sampling_strategy=`.
def test_fit_sample_auto(): ratio = 'auto' bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, return_indices=True) X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [-0.18410027, -0.45194484], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], [0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234]], [[0.28893132, -0.38761769], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.77481731, 0.60935141], [-0.18410027, -0.45194484], [1.15514042, 0.0129463], [0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]]) y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) idx_gt = np.array( [[10, 18, 8, 16, 6, 14, 5, 13, 0, 2, 3, 4, 11, 12, 17, 19], [9, 6, 7, 8, 16, 1, 14, 10, 0, 2, 3, 4, 11, 12, 17, 19]]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) assert_array_equal(idx_under, idx_gt)
def test_fit_sample_half():
    """Test the fit and sample routine with 0.5 ratio.

    Ground truth is loaded from .npy files in the `data/` directory.
    """
    sampler = BalanceCascade(ratio=0.5, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    X_gt = np.load(os.path.join(data_dir, 'bc_x_05.npy'))
    y_gt = np.load(os.path.join(data_dir, 'bc_y_05.npy'))
    # Each produced subset must match its stored reference.
    for subset in range(X_gt.size):
        assert_array_equal(X_resampled[subset], X_gt[subset])
        assert_array_equal(y_resampled[subset], y_gt[subset])
# NOTE(review): collapsed one-liner.  Docstring says "0.5 ratio" but the
# code uses ratio=0.8 with bootstrap=False -- the docstring is stale.
# Ragged dtype=object ground truth because the two subsets differ in size.
def test_fit_sample_half(): """Test the fit and sample routine with 0.5 ratio.""" # Define the ratio parameter ratio = 0.8 # Create the sampling object bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, bootstrap=False) # Get the different subset X_resampled, y_resampled = bc.fit_sample(X, Y) X_gt = np.array( [ np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234], [1.15514042, 0.0129463], [0.08711622, 0.93259929], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [-0.18410027, -0.45194484], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], [0.9281014, 0.53085498], [0.3084254, 0.33299982]]), np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234], [1.15514042, 0.0129463], [0.70472253, -0.73309052], [-0.18410027, -0.45194484], [0.77481731, 0.60935141], [0.28893132, -0.38761769]]) ], dtype=object) y_gt = np.array( [ np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) ], dtype=object) # Check each array for idx in range(X_gt.size): assert_array_equal(X_resampled[idx], X_gt[idx]) assert_array_equal(y_resampled[idx], y_gt[idx])
# NOTE(review): collapsed one-liner.  LinearSVC estimator with
# n_max_subset=1: one hard-coded balanced subset expected (new
# `sampling_strategy` API, no indices requested).
def test_fit_sample_auto_early_stop(): sampling_strategy = 'auto' estimator = LinearSVC(random_state=RND_SEED) bc = BalanceCascade(sampling_strategy=sampling_strategy, random_state=RND_SEED, return_indices=False, estimator=estimator, n_max_subset=1) X_resampled, y_resampled = bc.fit_sample(X, Y) X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [-0.18410027, -0.45194484], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], [0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]]) y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt)
# NOTE(review): collapsed one-liner.  Rebalances with BalanceCascade, then
# rebuilds a DataFrame/Series.  Caution: BalanceCascade returns one array
# *per subset*; `map(lambda x: x[0], y_res)` takes the first label of each
# subset, and `pd.DataFrame(X_res, ...)` is fed the stacked subsets --
# presumably `X_res[0]`/`y_res[0]` were intended; verify against callers.
def smot(train_x, train_y, feature_columns): from imblearn.ensemble import BalanceCascade from sklearn.ensemble import RandomForestClassifier #sm = RandomOverSampler(ratio='majority') #from imblearn.ensemble import BalanceCascade sm = BalanceCascade(random_state=42, classifier=RandomForestClassifier()) print('Détail du nombre par CLASSE Y {}'.format(Counter(train_y))) X_res, y_res = sm.fit_sample(train_x, train_y) my_list = map(lambda x: x[0], y_res) train_y = pd.Series(my_list) print(' Détail du nombre par CLASSE Y {}'.format(Counter(train_y))) # reconstitution DATAFRAME train_x = pd.DataFrame(X_res, columns=feature_columns) return train_x, train_y
# NOTE(review): collapsed one-liner; duplicate of the L22 variant (only the
# np.array call wrapping differs).  Docstring still incorrectly says "0.5
# ratio" while the code uses 0.8.
def test_fit_sample_half(): """Test the fit and sample routine with 0.5 ratio.""" # Define the ratio parameter ratio = 0.8 # Create the sampling object bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, bootstrap=False) # Get the different subset X_resampled, y_resampled = bc.fit_sample(X, Y) X_gt = np.array([ np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234], [1.15514042, 0.0129463], [0.08711622, 0.93259929], [0.70472253, -0.73309052], [-0.14374509, 0.27370049], [0.83680821, 1.72827342], [-0.18410027, -0.45194484], [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], [0.9281014, 0.53085498], [0.3084254, 0.33299982]]), np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-0.41635887, -0.38299653], [1.70580611, -0.11219234], [1.15514042, 0.0129463], [0.70472253, -0.73309052], [-0.18410027, -0.45194484], [0.77481731, 0.60935141], [0.28893132, -0.38761769]]) ], dtype=object) y_gt = np.array([ np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) ], dtype=object) # Check each array for idx in range(X_gt.size): assert_array_equal(X_resampled[idx], X_gt[idx]) assert_array_equal(y_resampled[idx], y_gt[idx])
def test_fit_sample_auto():
    """Test the fit and sample routine with auto ratio.

    Ground truth (samples, labels, indices) is loaded from .npy files in
    the `data/` directory next to this module.
    """
    sampler = BalanceCascade(ratio='auto', random_state=RND_SEED,
                             return_indices=True)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    X_gt = np.load(os.path.join(data_dir, 'bc_x.npy'))
    y_gt = np.load(os.path.join(data_dir, 'bc_y.npy'))
    idx_gt = np.load(os.path.join(data_dir, 'bc_idx.npy'))
    # Each produced subset must match its stored reference.
    for subset in range(X_gt.size):
        assert_array_equal(X_resampled[subset], X_gt[subset])
        assert_array_equal(y_resampled[subset], y_gt[subset])
        assert_array_equal(idx_under[subset], idx_gt[subset])
from sklearn.linear_model import LassoCV
from src.common.my_data import Data

data = Data()
# BUG FIX: `.drop(..., inplace=True)` returns None, so X1 was None and the
# pd.concat below would fail.  Drop without `inplace` to keep the result.
X1 = pd.read_table(data.output.sorted_train_agg_have_log_usr).drop('USRID',
                                                                   axis=1)
X2 = pd.read_table(data.feature.tf_idf_have_log_usr_evt)
Y = pd.read_table(data.output.sorted_train_flg_have_log_usr)['FLAG']
X = pd.concat([X1, X2], axis=1)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
print('Original dataset shape {}'.format(Counter(y_train)))
# Rebalance the training split; only the first cascade subset is used.
bc = BalanceCascade(random_state=40, estimator='adaboost')
x_res, y_res = bc.fit_sample(x_train, y_train)
print('Resampled dataset shape {}'.format(Counter(y_res[0])))
x_res_train = x_res[0]
y_res_train = y_res[0]
# Support vector machine
svc = LinearSVC(dual=False)
svc.fit(x_res_train, y_res_train)
result_svc = svc.predict(x_test)
# Logistic regression
lgc = LogisticRegressionCV()
lgc.fit(x_res_train, y_res_train)
result_lgc = lgc.predict(x_test)
# NOTE(review): collapsed one-liner script.  Loads a pickled DataFrame,
# splits features/label, and rebalances with BalanceCascade.  Caution:
# `Counter(testData_y)` on a DataFrame iterates column names, not label
# values -- presumably a Series (e.g. testData.iloc[:, 0]) was intended;
# verify against the original notebook.
from sklearn.preprocessing import FunctionTransformer from sklearn.preprocessing import MultiLabelBinarizer from sklearn_pandas import DataFrameMapper now = time.time() testData = pd.read_pickle(r'E:\tencent\input\noonehot\train') testData = testData.fillna(0) testData_x = testData.iloc[:, 3:] testData_y = testData.iloc[:, :1] #x,y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) print('Original dataset shape {}'.format(Counter(testData_y))) #采样 bc = BalanceCascade(random_state=42) X_res, y_res = bc.fit_sample(testData_x, testData_y) #print(X_res) #print(y_res) print('Resampled dataset shape {}'.format(Counter(y_res[0]))) print(time.time() - now) #获取训练集L #trains = train_data.drop('f1') labels = pd.DataFrame(y_res[0]) trains = pd.DataFrame(X_res[0]) #trains.columns = ['creativeID', 'userID', 'positionID', 'connectionType', 'telecomsOperator', 'age', 'gender', 'education', 'marriageStatus', 'haveBaby', 'hometown', 'residence', 'advertiserID', 'appID', 'appPlatform', 'sitesetID', 'positionType'] ''' mapper =DataFrameMapper([ ('education',LabelBinarizer()), (['positionID','connectionType','telecomsOperator','gender','education','marriageStatus', 'haveBaby','appPlatform','sitesetID','positionType'],OneHotEncoder()), # ('hometown',[FunctionTransformer(lambda x: x%100),MultiLabelBinarizer()]),
# NOTE(review): collapsed one-liner script (imbalanced-learn example).
# Generates a 30/70 synthetic dataset, projects to 2D with PCA for
# visualisation, applies BalanceCascade, and scatter-plots the original set
# and each resampled subset.  Variables (f, ax1, ax2, X_res_vis) appear to
# be used further down, past this excerpt.
print(__doc__) # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.3, 0.7], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=200, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply Balance Cascade method bc = BalanceCascade() X_resampled, y_resampled = bc.fit_sample(X, y) X_res_vis = [] for X_res in X_resampled: X_res_vis.append(pca.transform(X_res)) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5) ax1.set_title('Original set') ax2.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) for iy, e in enumerate(X_res_vis): ax2.scatter(e[y_resampled[iy] == 1, 0], e[y_resampled[iy] == 1, 1], label="Class #1 - set #{}".format(iy), alpha=0.5)
# NOTE(review): collapsed one-liner script.  Splits train/test, prints the
# class imbalance (Chinese strings: "positive/negative sample counts" and
# "ratio"), builds 15 balanced subsets with BalanceCascade, then randomly
# draws one LightGBM hyper-parameter set per subset.  Note: once collapsed,
# everything after the first `#` is dead text; the original was multi-line.
train = pd.concat([train, dummied], axis=1) train_x, test_x, train_y, test_y = train_test_split(train, Y, test_size=0.3, random_state=1) print("正负样本数:正样本数:{}\n负样本数:{}\n正负样本比例:{}".format( train_y.value_counts()[0], train_y.value_counts()[1], train_y.value_counts()[0] / train_y.value_counts()[1])) #针对不平衡数据,采样生成新的15个数据集 n_subset = 15 BC = BalanceCascade(ratio='auto', n_max_subset=n_subset, random_state=123) train_xBC, train_yBC = BC.fit_sample(train_x, train_y) #for ii in np.arange(n_subset): # print(pd.value_counts(train_yBC[ii,:])) #可以发现每个数据集中每类均为1308个样本。下面针对每一对数据集,训练一个xgboost分类器。 lgbmodels = [] n_estimator = np.arange(500, 800, 50) n_estimator = np.random.choice(n_estimator, n_subset) max_depth = np.arange(6, 10, 1) max_depth = np.random.choice(max_depth, n_subset) num_leaves = np.arange(40, 70, 5) num_leaves = np.random.choice(num_leaves, n_subset) reg_alpha = np.arange(0, 2, 0.5) reg_alpha = np.random.choice(reg_alpha, n_subset) feature_fraction = [0.6, 0.7, 0.8, 0.9] feature_fraction = np.random.choice(feature_fraction, n_subset)
def test_init_wrong_classifier():
    """An unknown classifier string must raise NotImplementedError at fit."""
    cascade = BalanceCascade(classifier='rnd')
    with raises(NotImplementedError):
        cascade.fit_sample(X, Y)
# NOTE(review): collapsed fragment of a notebook export.  Starts mid-way
# through a column-name list belonging to a DataFrame constructor defined
# before this excerpt; the live code here is only the BalanceCascade
# resampling of X_train/y_train -- the rest is commented-out plotting and a
# triple-quoted column list.
'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType', 'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'BasePolicy', 'FraudFound_P']) sns.set_style('whitegrid') sns.countplot(x = 'FraudFound_P', data = d, palette = 'RdBu_r') ''' # # Balance Cascade # In[5]: from imblearn.ensemble import BalanceCascade bc = BalanceCascade(random_state=42) X_res, y_res = bc.fit_sample(X_train, y_train) # In[7]: #p = np.c_[X_res,y_res] '''d = pd.DataFrame(p, columns = ['WeekOfMonth', 'WeekOfMonthClaimed', 'Age', 'PolicyNumber', 'RepNumber', 'Deductible', 'DriverRating', 'Year', 'Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 'MonthClaimed', 'Sex', 'MaritalStatus', 'Fault', 'PolicyType', 'VehicleCategory', 'VehiclePrice', 'Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType', 'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'BasePolicy', 'FraudFound_P'])
# NOTE(review): collapsed one-liner script.  Standardises X_train with a
# z-score scaler, then resamples with BalanceCascade driven by an AdaBoost
# estimator (bootstrap=True).  Chinese comments translate roughly as:
# "take features and labels", "split datasets", "print shape of the
# ensemble-resampled x/y sets", "set the sample-subset index", and the
# trailing commented block reassembles one subset into a DataFrame and
# prints its per-label counts.  Uses the long-removed pandas `.ix` indexer
# -- this code targets an old pandas version.
y_train = y_train.ix[:, -1] # 取特征和标签 # X_train, y_train = data15.ix[:, 0:-1], data15.ix[:, -1] # X_test, y_test = data21.ix[:, 0:-1], data21.ix[:, -1] zscore = preprocessing.StandardScaler() X_train = zscore.fit_transform(X_train) # X_test = zscore.fit_transform(X_test) # 划分数据集n_max_subset=4 bc = BalanceCascade(random_state=0, estimator=ADB(random_state=0), bootstrap=True) # bc = BalanceCascade(random_state=0,estimator=ADB(random_state=0),n_max_subset=15) x_resampled, y_resampled = bc.fit_sample(X_train, y_train) # x_resampled, y_resampled = bc.fit_sample(X_test, y_test ) print(x_resampled.shape) # 打印输出集成方法处理后的x样本集概况 print(y_resampled.shape) # 打印输出集成方法处理后的y标签集概况 index_num = 1 # 设置抽样样本集索引 # x_resampled_t =pd.DataFrame(x_resampled[index_num],columns=['wind_speed','generator_speed','power','wind_direction','wind_direction_mean','yaw_position','yaw_speed','pitch1_angle','pitch2_angle','pitch3_angle','pitch1_speed','pitch2_speed','pitch3_speed','pitch1_moto_tmp','pitch2_moto_tmp','pitch3_moto_tmp','acc_x','acc_y','environment_tmp','int_tmp','pitch1_ng5_tmp','pitch2_ng5_tmp','pitch3_ng5_tmp','pitch1_ng5_DC','pitch2_ng5_DC','pitch3_ng5_DC']) # # 将数据转换为数据框并命名列名 # y_resampled_t =pd.DataFrame(y_resampled[index_num],columns=['label']) # 将数据转换为数据框并命名列名 # # # 按列合并数据框 # EasyEnsemble_resampled = pd.concat([x_resampled_t,y_resampled_t], axis = 1) # # 对label做分类汇总 # groupby_data_EasyEnsemble =EasyEnsemble_resampled.groupby('label').count() # # 打印输出经过处理后的数据集样本分类分布 # print (groupby_data_EasyEnsemble)
data = Data()
# BUG FIX: `.drop(..., inplace=True)` returns None, so X1 was None and the
# pd.concat below would fail.  Drop without `inplace` to keep the result.
X1 = pd.read_table(data.output.sorted_train_agg_have_log_usr).drop(
    'USRID', axis=1)
X2 = pd.read_table(data.feature.tf_idf_have_log_usr_evt)
Y = pd.read_table(data.output.sorted_train_flg_have_log_usr)['FLAG']
X = pd.concat([X1, X2], axis=1)
# x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)
# print('Original dataset shape {}'.format(Counter(y_train)))
test_agg = pd.read_table(data.input.test_agg)
# `result` is assumed to be defined earlier in the file -- TODO confirm.
result["USRID"] = test_agg['USRID']
test_noID = test_agg.drop('USRID', axis=1)
# Rebalance the full training data; only the first subset is used below.
bc = BalanceCascade(random_state=40, estimator='adaboost')
x_res, y_res = bc.fit_sample(X, Y)
print('Resampled dataset shape {}'.format(Counter(y_res[0])))
# Support vector machine (kept disabled, as in the original)
# svc = LinearSVC(dual=False)
# svc.fit(x_train,y_train)
# result_svc = svc.predict(x_test)
#
# Logistic regression (kept disabled, as in the original)
# lgc = LogisticRegressionCV()
# lgc.fit(x_train,y_train)
# result_lgc = lgc.predict(x_test)
# Linear regression (LassoCV)
lasso = LassoCV()
lasso.fit(x_res[0], y_res[0])
# NOTE(review): collapsed one-liner script (older imbalanced-learn example).
# Same structure as the L29 example but with 10/90 weights, 5000 samples and
# explicit scatter styling; `almost_black` and `palette` are defined before
# this excerpt, and the plotting continues past it.
from imblearn.ensemble import BalanceCascade # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply Balance Cascade method bc = BalanceCascade() X_resampled, y_resampled = bc.fit_sample(X, y) X_res_vis = [] for X_res in X_resampled: X_res_vis.append(pca.transform(X_res)) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
from imblearn.ensemble import BalanceCascade
from sklearn.svm import SVC

# Load the pickled train/label/test arrays (written in this same order).
with open('values.pkl', 'rb') as f:
    x_train = pickle.load(f)
    y_train = pickle.load(f)
    test = pickle.load(f)
x_train = np.array(x_train)
y_train = np.array(y_train)
test = np.array(test)
# Under-sample with at most 5 cascade subsets driven by an SVC.
bc = BalanceCascade(estimator=SVC(gamma='auto'), random_state=100,
                    n_max_subset=5)
x_train_resam, y_train_resam = bc.fit_sample(x_train, y_train)
with open('values_undersampling.pkl', 'wb') as f:
    pickle.dump(x_train_resam, f)
    pickle.dump(y_train_resam, f)
    pickle.dump(test, f)

# In[13]:

with open('values_undersampling.pkl', 'rb') as f:
    x_train_resam = pickle.load(f)
    y_train_resam = pickle.load(f)
    # BUG FIX: the original read `test = pickle(f)`, which calls the pickle
    # *module* and raises TypeError; pickle.load(f) was clearly intended.
    test = pickle.load(f)

# ### Xgbosst