def test_oss_with_object():
    """Check ``n_neighbors`` given as an estimator object or as an int.

    ``OneSidedSelection`` is run once with a ``KNeighborsClassifier``
    instance and once with the integer ``1``; both runs must yield the
    same expected resampled samples and labels.
    """
    expected_X = np.array([
        [-0.3879569, 0.6894251], [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021], [1.06446472, -1.09279772],
        [0.30543283, -0.02589502], [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189], [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378], [0.20246714, -0.34727125],
    ])
    expected_y = np.array([0] * 6 + [1] * 6)

    # First the estimator object, then the equivalent integer.
    for neighbors in (KNeighborsClassifier(n_neighbors=1), 1):
        sampler = OneSidedSelection(random_state=RND_SEED,
                                    n_neighbors=neighbors)
        X_out, y_out = sampler.fit_resample(X, Y)
        assert_array_equal(X_out, expected_X)
        assert_array_equal(y_out, expected_y)
def test_oss_with_object():
    """Verify OneSidedSelection with an estimator and an int ``n_neighbors``.

    Both a ``KNeighborsClassifier(n_neighbors=1)`` object and the integer
    ``1`` are passed as ``n_neighbors``; each run is compared against the
    same ground-truth arrays.
    """
    X_expected = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ])
    y_expected = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

    def run_and_check(n_neighbors):
        # Resample with the given neighbours spec and compare to ground truth.
        selector = OneSidedSelection(random_state=RND_SEED,
                                     n_neighbors=n_neighbors)
        X_res, y_res = selector.fit_resample(X, Y)
        assert_array_equal(X_res, X_expected)
        assert_array_equal(y_res, y_expected)

    run_and_check(KNeighborsClassifier(n_neighbors=1))
    run_and_check(1)
示例#3
0
def one_sided_selection(X,
                        y,
                        visualize=False,
                        pca2d=True,
                        pca3d=True,
                        tsne=True,
                        pie_evr=True):
    """Under-sample ``(X, y)`` with One-Sided Selection, optionally plotting.

    Parameters
    ----------
    X, y
        Passed unchanged to ``OneSidedSelection.fit_resample``.
    visualize : bool
        When truthy, ``hist_over_and_undersampling`` and ``pca_general`` are
        called on the resampled data.
    pca2d, pca3d, pie_evr
        Forwarded to ``pca_general`` as ``d2``, ``d3`` and ``pie_evr``.
    tsne
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` as returned by ``fit_resample``.
    """
    sampler = OneSidedSelection(random_state=42)
    X_res, y_res = sampler.fit_resample(X, y)
    # Truthiness check instead of ``== True`` (PEP 8).
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
示例#4
0
def test_oss_fit_resample_with_indices():
    """Check samples, labels and indices returned with ``return_indices``.

    With ``return_indices=True`` the sampler's ``fit_resample`` is expected
    to return three values; each one is compared to its ground truth.
    """
    sampler = OneSidedSelection(return_indices=True, random_state=RND_SEED)
    X_out, y_out, kept_idx = sampler.fit_resample(X, Y)

    expected = (
        np.array([
            [-0.3879569, 0.6894251],
            [0.91542919, -0.65453327],
            [-0.65571327, 0.42412021],
            [1.06446472, -1.09279772],
            [0.30543283, -0.02589502],
            [-0.00717161, 0.00318087],
            [-0.09322739, 1.28177189],
            [-0.77740357, 0.74097941],
            [-0.43877303, 1.07366684],
            [-0.85795321, 0.82980738],
            [-0.30126957, -0.66268378],
            [0.20246714, -0.34727125],
        ]),
        np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]),
        np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 8, 11]),
    )
    # Compared in the same order as before: X, then y, then indices.
    for actual, wanted in zip((X_out, y_out, kept_idx), expected):
        assert_array_equal(actual, wanted)
def test_oss_fit_resample_with_indices():
    """Verify the three outputs of ``fit_resample`` with ``return_indices``.

    The resampled samples, the resampled labels and the indices returned
    alongside them are each checked against hard-coded ground truth.
    """
    expected_X = np.array([
        [-0.3879569, 0.6894251], [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021], [1.06446472, -1.09279772],
        [0.30543283, -0.02589502], [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189], [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378], [0.20246714, -0.34727125],
    ])
    expected_y = np.array([0] * 6 + [1] * 6)
    expected_idx = np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 8, 11])

    selector = OneSidedSelection(return_indices=True, random_state=RND_SEED)
    X_res, y_res, idx_res = selector.fit_resample(X, Y)

    assert_array_equal(X_res, expected_X)
    assert_array_equal(y_res, expected_y)
    assert_array_equal(idx_res, expected_idx)
示例#6
0
	def under_sampling(self, data, label, n_neighbors=5, method=None):
		"""Optionally under-sample ``data``/``label`` with One-Sided Selection.

		Parameters (as described by the original author):
			data: 2D array data (im_height*im_width, num of band)
			label: 1D array label (0, 1, 2, ...) per each data
			n_neighbors: number of neighbors used in OSS
			method: name of the under-sampling method to apply (OSS)

		Nothing happens unless ``method`` is listed in ``self.under_method``.
		Resampling itself is only performed for 'OSS' / 'OneSidedSelection';
		the label counts are printed before and after.

		Returns the (possibly under-sampled) data and label.
		"""
		# Unknown or missing method: hand the inputs back untouched.
		if method not in self.under_method:
			return data, label

		print("Before sampling label proportion: ", Counter(label))
		if method in ('OSS', 'OneSidedSelection'):
			sampler = OneSidedSelection(n_neighbors=n_neighbors, n_seeds_S=200)
			data, label = sampler.fit_resample(data, label)
		print("After sampling label proportion: ", Counter(label))

		return data, label
# undersample and plot imbalanced dataset with One-Sided Selection
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import OneSidedSelection
from plotDataset import plot_dataset

if __name__ == '__main__':
    # Synthetic two-feature, two-cluster problem with a 0.99 weight on the
    # first class, i.e. a heavily imbalanced dataset.
    X, y = make_classification(
        n_samples=10000,
        n_features=2,
        n_redundant=0,
        n_clusters_per_class=1,
        weights=[0.99],
        flip_y=0,
        random_state=1,
    )

    # Class counts and plot before under-sampling.
    counter = Counter(y)
    print(counter)
    plot_dataset(X, y, counter)

    # n_seeds_S sets the number of samples in set S
    undersample = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
    X, y = undersample.fit_resample(X, y)

    # Class counts and plot after under-sampling.
    counter = Counter(y)
    print(counter)
    plot_dataset(X, y, counter)
示例#8
0
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    """Resample five one-vs-all training sets with one imbalance technique.

    A fresh sampler of the selected kind is created for each of the AA, AI,
    AW, CC and QA training sets, and ``fit_resample`` is applied to each
    ``(X, y)`` pair in that order.

    Parameters
    ----------
    imb_technique : str
        One of "ADASYN", "ALLKNN", "CNN", "ENN", "IHT", "NCR", "NM", "OSS",
        "RENN", "SMOTE", "BSMOTE", "SMOTEENN", "SMOTETOMEK", "TOMEK", "ROS"
        or "RUS". The value is printed before any resampling happens.
    AA_ova_X_train, AA_ova_y_train, ..., QA_ova_X_train, QA_ova_y_train
        Feature / label pairs handed to ``fit_resample``. For "NCR" the
        labels of each set are first recoded to 0 (the set's own class
        label) and 1 (anything else).

    Returns
    -------
    tuple
        ``(AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res,
        CC_X_res, CC_y_res, QA_X_res, QA_y_res)``.

    Raises
    ------
    ValueError
        If ``imb_technique`` is not a supported name. Previously this case
        surfaced as an ``UnboundLocalError`` at the ``return`` statement.
    """
    print(imb_technique)

    # Lambdas defer the name lookup, so only the sampler class that is
    # actually selected has to be importable in the enclosing module.
    sampler_factories = {
        "ADASYN": lambda: ADASYN(),
        "ALLKNN": lambda: AllKNN(),
        "CNN": lambda: CondensedNearestNeighbour(),
        "ENN": lambda: EditedNearestNeighbours(),
        "IHT": lambda: InstanceHardnessThreshold(),
        "NCR": lambda: NeighbourhoodCleaningRule(),
        "NM": lambda: NearMiss(),
        "OSS": lambda: OneSidedSelection(),
        "RENN": lambda: RepeatedEditedNearestNeighbours(),
        "SMOTE": lambda: SMOTE(),
        "BSMOTE": lambda: BorderlineSMOTE(),
        "SMOTEENN": lambda: SMOTEENN(),
        "SMOTETOMEK": lambda: SMOTETomek(),
        "TOMEK": lambda: TomekLinks(),
        "ROS": lambda: RandomOverSampler(),
        "RUS": lambda: RandomUnderSampler(),
    }
    make_sampler = sampler_factories.get(imb_technique)
    if make_sampler is None:
        raise ValueError(
            "Unsupported imbalance technique: %r" % (imb_technique,))

    # (features, labels, class label recoded to 0 in the NCR branch)
    training_sets = [
        (AA_ova_X_train, AA_ova_y_train, "Accepted/Assigned"),
        (AI_ova_X_train, AI_ova_y_train, "Accepted/In Progress"),
        (AW_ova_X_train, AW_ova_y_train, "Accepted/Wait"),
        (CC_ova_X_train, CC_ova_y_train, "Completed/Closed"),
        (QA_ova_X_train, QA_ova_y_train, "Queued/Awaiting Assignment"),
    ]

    # One independent sampler per training set, all created up front.
    samplers = [make_sampler() for _ in training_sets]

    resampled = []
    for sampler, (X_train, y_train, own_label) in zip(samplers,
                                                      training_sets):
        if imb_technique == "NCR":
            y_train = [0 if i == own_label else 1 for i in y_train]
        X_res, y_res = sampler.fit_resample(X_train, y_train)
        resampled.extend((X_res, y_res))
    return tuple(resampled)
# Example script fragment: under-sample a synthetic imbalanced dataset with
# One-Sided Selection and scatter-plot kept vs. removed samples in the plane
# of the first two principal components.
print(__doc__)

# Generate the dataset: 200 samples, 20 features, two classes weighted
# 0.1 / 0.9.
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply One-Sided Selection; with return_indices=True a third value is
# unpacked and used below as the indices of the retained samples.
# NOTE(review): return_indices is believed to be deprecated in later
# imbalanced-learn releases in favour of ``sample_indices_`` -- confirm
# against the installed version.
oss = OneSidedSelection(return_indices=True)
X_resampled, y_resampled, idx_resampled = oss.fit_resample(X, y)
# Project the retained samples with the PCA fitted on the full dataset.
X_res_vis = pca.transform(X_resampled)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Indices of the original samples that are absent from idx_resampled.
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                   idx_resampled)

# Boolean mask over the resampled labels; its negation selects the rest.
idx_class_0 = y_resampled == 0
plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
            alpha=.8, label='Class #0')
plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
            alpha=.8, label='Class #1')
plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
            alpha=.8, label='Removed samples')
示例#10
0
# Scaling: the scaler is fitted on the training features only (every column
# except 'plan') and then re-used for the test features and the_users_test.
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(train_set.drop('plan', axis=1))

scaled_X_test = scaler.transform(test_set.drop('plan', axis=1))

scaled_test_users = scaler.transform(the_users_test)

# Wrap the scaled values in a DataFrame with the original column names.
scaled_test_users = pd.DataFrame(scaled_test_users,
                                 columns=the_users_test.columns)

# Under-sampling and/or over-sampling of the training data

# Under-sample the train set with One-Sided Selection
under = OneSidedSelection()
X_train_res, y_train_res = under.fit_resample(scaled_X_train, y_train)

# Over-sample the already under-sampled train set with SMOTE
sm = SMOTE()
X_train_res, y_train_res = sm.fit_resample(X_train_res, y_train_res)

X_train_res = pd.DataFrame(X_train_res,
                           columns=train_set.drop('plan', axis=1).columns)

# Create the final train and test sets for modeling
# NOTE(review): pd.concat(axis=1) aligns rows on the index and needs pandas
# objects; presumably y_train_res is a Series with a matching 0..n-1 index
# here -- confirm for the imbalanced-learn version in use.
train_set = pd.concat([X_train_res, y_train_res], axis=1)

scaled_X_test = pd.DataFrame(scaled_X_test,
                             columns=test_set.drop('plan', axis=1).columns)

test_set = pd.concat([scaled_X_test, test_set['plan'].reset_index(drop=True)],
示例#11
0
def one_sided_selection(X, y):
    """Under-sample ``(X, y)`` with ``OneSidedSelection(random_state=42)``.

    Returns the resampled features and labels as a 2-tuple.
    """
    sampler = OneSidedSelection(random_state=42)
    resampled_X, resampled_y = sampler.fit_resample(X, y)
    return resampled_X, resampled_y
def resampling_assigner(imb_technique, AP_ova_X_train, AP_ova_y_train,
                        PM_ova_X_train, PM_ova_y_train, SC_ova_X_train,
                        SC_ova_y_train):
    """Resample three one-vs-all training sets with one imbalance technique.

    A fresh sampler of the selected kind is created for each of the AP, PM
    and SC training sets, and ``fit_resample`` is applied to each ``(X, y)``
    pair in that order.

    Parameters
    ----------
    imb_technique : str
        One of "ADASYN", "ALLKNN", "CNN", "ENN", "IHT", "NCR", "NM", "OSS",
        "RENN", "SMOTE", "BSMOTE", "SMOTEENN", "SMOTETOMEK", "TOMEK", "ROS"
        or "RUS". The value is printed before any resampling happens.
    AP_ova_X_train, AP_ova_y_train, ..., SC_ova_X_train, SC_ova_y_train
        Feature / label pairs handed to ``fit_resample``. For "NCR" the
        labels of each set are first recoded to 0 (the set's own class
        label) and 1 (anything else).

    Returns
    -------
    tuple
        ``(AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res)``.

    Raises
    ------
    ValueError
        If ``imb_technique`` is not a supported name. Previously this case
        surfaced as an ``UnboundLocalError`` at the ``return`` statement.
    """
    print(imb_technique)

    # Lambdas defer the name lookup, so only the sampler class that is
    # actually selected has to be importable in the enclosing module.
    sampler_factories = {
        "ADASYN": lambda: ADASYN(),
        "ALLKNN": lambda: AllKNN(),
        "CNN": lambda: CondensedNearestNeighbour(),
        "ENN": lambda: EditedNearestNeighbours(),
        "IHT": lambda: InstanceHardnessThreshold(),
        "NCR": lambda: NeighbourhoodCleaningRule(),
        "NM": lambda: NearMiss(),
        "OSS": lambda: OneSidedSelection(),
        "RENN": lambda: RepeatedEditedNearestNeighbours(),
        "SMOTE": lambda: SMOTE(),
        "BSMOTE": lambda: BorderlineSMOTE(),
        "SMOTEENN": lambda: SMOTEENN(),
        "SMOTETOMEK": lambda: SMOTETomek(),
        "TOMEK": lambda: TomekLinks(),
        "ROS": lambda: RandomOverSampler(),
        "RUS": lambda: RandomUnderSampler(),
    }
    make_sampler = sampler_factories.get(imb_technique)
    if make_sampler is None:
        raise ValueError(
            "Unsupported imbalance technique: %r" % (imb_technique,))

    # (features, labels, class label recoded to 0 in the NCR branch)
    training_sets = [
        (AP_ova_X_train, AP_ova_y_train, "Add penalty"),
        (PM_ova_X_train, PM_ova_y_train, "Payment"),
        (SC_ova_X_train, SC_ova_y_train, "Send for Credit Collection"),
    ]

    # One independent sampler per training set, all created up front. The
    # old NCR branch bound its samplers to *_iht names but then called
    # *_ncr, which raised NameError; a single list avoids that mismatch.
    samplers = [make_sampler() for _ in training_sets]

    resampled = []
    for sampler, (X_train, y_train, own_label) in zip(samplers,
                                                      training_sets):
        if imb_technique == "NCR":
            y_train = [0 if i == own_label else 1 for i in y_train]
        X_res, y_res = sampler.fit_resample(X_train, y_train)
        resampled.extend((X_res, y_res))
    return tuple(resampled)
# Drop the first six columns, then select two feature columns and the label
# column by position.
df = d.iloc[:, 6:]
X_all = df.iloc[:, [0, 2]].values
y_all = df.iloc[:, 10].values

# Under-sample the imbalanced data with One-Sided Selection and plot it
from collections import Counter
from imblearn.under_sampling import OneSidedSelection
from numpy import where

# class counts before resampling
counter = Counter(y_all)
print(counter)

# configure the sampler, then resample
undersample = OneSidedSelection(
    n_neighbors=1,
    n_seeds_S=600000,
    n_jobs=-1,
    random_state=42,
)
X, y = undersample.fit_resample(X_all, y_all)

# class counts after resampling
counter = Counter(y)
print(counter)
# Output recorded by the author from an earlier run:
#   Old: Counter({0.0: 7319966, 1.0: 972038})
#   New: Counter({0.0: 3857552, 1.0: 972038})

# one scatter series per class label
plt.figure(figsize=[13, 8])
plt.ylim([0, 7])
for label in counter:
    row_ix = where(y == label)[0]
    class_points = X[row_ix]
    plt.scatter(class_points[:, 0], class_points[:, 1],
                label=str(label), s=16, marker='.')
plt.legend()
plt.show()
def one_sided_selection(x, y):
    """Announce and apply One-Sided Selection with default settings.

    Prints a banner, resamples ``(x, y)`` with a default
    ``OneSidedSelection`` and returns the resampled features and labels.
    """
    print("----One Sided Selection----")
    resampled_x, resampled_y = OneSidedSelection().fit_resample(x, y)
    return resampled_x, resampled_y
示例#15
0
def all_imblearn(xx, yy):
    """Resample ``(xx, yy)`` with every configured imbalanced-learn sampler.

    The samplers are built and applied one after another through
    ``fit_resample``: over-samplers first, then under-samplers, then the
    combined over-/under-samplers.

    Parameters
    ----------
    xx
        Feature matrix, passed unchanged to each sampler's ``fit_resample``.
    yy
        Target labels, passed unchanged to each sampler's ``fit_resample``.

    Returns
    -------
    list
        One ``[X_resampled, y_resampled, label]`` entry per sampler, in the
        order the samplers were run.
    """
    # Imported locally so this function stays self-contained: the
    # ``SMOTE(kind=...)`` spelling used previously was deprecated and then
    # removed from imbalanced-learn; these dedicated classes replace it.
    from imblearn.over_sampling import BorderlineSMOTE, SVMSMOTE

    # (label, factory) pairs. Factories are called lazily inside the loop so
    # each sampler is constructed right before it is fitted.
    sampler_factories = [
        # ---- over-sampling ----
        ('random over sampler', lambda: RandomOverSampler(random_state=0)),
        ('smote', lambda: SMOTE()),
        ('smote borderline1', lambda: BorderlineSMOTE(kind='borderline-1')),
        ('smote borderline2', lambda: BorderlineSMOTE(kind='borderline-2')),
        ('smote svm', lambda: SVMSMOTE()),
        # Columns 0 and 2 of ``xx`` are declared categorical for SMOTENC.
        ('smotenc',
         lambda: SMOTENC(categorical_features=[0, 2], random_state=0)),
        # NOTE: ADASYN was disabled in the original code (it was invoked on
        # the class instead of an instance); it stays out so the returned
        # list keeps the same entries.

        # ---- under-sampling ----
        ('cluster centroids', lambda: ClusterCentroids(random_state=0)),
        ('random under sampler', lambda: RandomUnderSampler(random_state=0)),
        ('near miss 1', lambda: NearMiss(version=1)),
        ('near miss 2', lambda: NearMiss(version=2)),
        ('near miss 3', lambda: NearMiss(version=3)),
        ('edited nearest neighbours', lambda: EditedNearestNeighbours()),
        ('repeated edited nearest neighbours',
         lambda: RepeatedEditedNearestNeighbours()),
        ('allKNN', lambda: AllKNN()),
        ('Condensed Nearest Neighbour',
         lambda: CondensedNearestNeighbour(random_state=0)),
        ('One Sided Selection', lambda: OneSidedSelection(random_state=0)),
        ('Neighbourhood Cleaning Rule', lambda: NeighbourhoodCleaningRule()),

        # ---- combined over- and under-sampling ----
        ('SMOTEENN', lambda: SMOTEENN(random_state=0)),
        ('SMOTETomek', lambda: SMOTETomek(random_state=0)),
    ]

    imblearnlist = []
    for label, make_sampler in sampler_factories:
        X_resampled, y_resampled = make_sampler().fit_resample(xx, yy)
        imblearnlist.append([X_resampled, y_resampled, label])

    return imblearnlist
    
    
    
    
    
    
    
示例#16
0
# Resample with the sampler `nm` (created before this excerpt) and show
# which rows of X it removed.
X_resampled, y_resampled = nm.fit_resample(X, y)

# Per-class sample counts after resampling.
print(np.bincount(y_resampled))

plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c=y_resampled)

# Removed rows = every row index of X minus the indices the sampler kept.
ind = nm.sample_indices_
deleted_ind = np.setdiff1d(np.arange(len(X)), ind)

plt.scatter(X[deleted_ind, 0], X[deleted_ind, 1], c=y[deleted_ind],
            marker='x', alpha=0.2)
plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c=y_resampled)

from imblearn.under_sampling import OneSidedSelection

# Repeat the same inspection with One-Sided Selection.
oss = OneSidedSelection(random_state=0, n_neighbors=1, n_seeds_S=1)

X_resampled, y_resampled = oss.fit_resample(X, y)

print(np.bincount(y_resampled))

# Refresh the kept indices from the OSS run; reusing the previous sampler's
# indices would mark the wrong rows as removed.
ind = oss.sample_indices_
deleted_ind = np.setdiff1d(np.arange(len(X)), ind)

plt.scatter(X[deleted_ind, 0], X[deleted_ind, 1], c=y[deleted_ind],
            marker='x', alpha=0.2)
plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c=y_resampled)

# Same data with the emphasis flipped: kept samples in faint gray, removed
# samples drawn as solid crosses.
plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c="gray", alpha=0.2)
plt.scatter(X[deleted_ind, 0], X[deleted_ind, 1], c=y[deleted_ind],
            marker='x')

# Explicit viridis colours for the removed samples.
# NOTE(review): the division by 2 presumably scales labels into [0, 1] for
# the colormap -- confirm against the label range of `y`.
colors = plt.cm.viridis(y[deleted_ind] / 2)
plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c="gray", alpha=0.2)
plt.scatter(X[deleted_ind, 0], X[deleted_ind, 1], c=colors, marker='x')
示例#17
0
# Print `tol` (assigned by code that precedes this excerpt) divided by 10,
# using one-decimal float formatting, followed by the per-class counts of `y`.
# NOTE(review): the /10 presumably averages over the 10 CV folds -- confirm.
np.set_printoptions(formatter={'float': '{:.1f}'.format})
print(tol/10)
print("\n")
print("Number of cases in each class")
print(Counter(y))

################### 4. Under-sampling: Selected Class (Compare Each Technique) ###################
########## one sided selection ##########
# Switch to two-decimal float formatting for this section's output.
np.set_printoptions(formatter={'float': '{:.2f}'.format})
# Features are every column of `principal_4_Df` except the last; the last
# column is the target.
X = np.array(principal_4_Df.iloc[:,:-1])
y = np.array(principal_4_Df.iloc[:,-1])
model = KNeighborsClassifier(n_neighbors=5)
# 10-fold stratified, shuffled splitter with a fixed seed.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
us =  OneSidedSelection(random_state=0, n_neighbors=8)
# Chain the sampler and the classifier into a single estimator.
# NOTE(review): a sampler step presumably requires imblearn's make_pipeline
# rather than scikit-learn's -- confirm which one is imported.
pipeline = make_pipeline(us, model)
# Resample the full dataset once; `X_res` / `y_res` are not used again in
# the visible part of the script.
X_res , y_res = us.fit_resample(X, y)

# Per-run accumulators. Only `overall` and `trial` are updated in the
# visible loop; `recall`, `spe` and `tol` are just (re)initialised here.
overall = []
recall = np.zeros((1,13))
spe = np.zeros((1,13))
tol = np.zeros((13,13))
trial = 0
for train_index, test_index in skf.split(X,y):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  model.fit(X_train, y_train)
  # NOTE(review): both calls below run a further `skf` cross-validation of
  # `pipeline` inside this fold's *test* split, so the fit on the training
  # split above does not appear to influence `y_pred` or `score` -- confirm
  # this nesting is intended.
  y_pred = cross_val_predict(pipeline, X_test, y_test, cv=skf)
  score = cross_val_score(pipeline, X_test, y_test, cv=skf).mean() 
  overall.append(score)
  trial+=1
  # Banner printed once per fold.
  print("KNN  k=5  4 dim PCA  One Sided Selection")
def test_oss_with_wrong_object():
    """Passing the string ``'rnd'`` as ``n_neighbors`` must fail on resample.

    The sampler is expected to raise ``ValueError`` with a message matching
    ``"has to be a int"`` when ``fit_resample`` is called.
    """
    invalid_neighbors = 'rnd'
    sampler = OneSidedSelection(
        random_state=RND_SEED,
        n_neighbors=invalid_neighbors,
    )
    with raises(ValueError, match="has to be a int"):
        sampler.fit_resample(X, Y)
示例#19
0
def test_oss_with_wrong_object():
    """Check that an unsupported ``n_neighbors`` value is rejected.

    ``n_neighbors='rnd'`` is accepted by the constructor; the test expects
    ``fit_resample`` to raise ``ValueError`` matching ``"has to be a int"``.
    """
    sampler = OneSidedSelection(random_state=RND_SEED, n_neighbors='rnd')
    with raises(ValueError, match="has to be a int"):
        sampler.fit_resample(X, Y)
示例#20
0
                           random_state=seed,
                           n_jobs=-1)
# Apply the over-sampler configured just above and overwrite the training
# features/labels with the resampled versions.
X_train_full_fs, y_train_full = oversample_smoten.fit_resample(
    X_train_full_fs, y_train_full)
counter = Counter(y_train_full)
print("After oversampling, the class distribution is:")
print(counter)

# Undersample with One-Sided Selection (Tomek Links + Condensed Nearest Neighbor)
print("Undersampling...")
# n_seeds_S is the number of majority-class samples added to set C; C then
# serves as the reference set for a kNN over the remaining majority samples
# that are not in C. It is set here to the post-oversampling count of
# class 1 (`counter[1]`).
undersample_oss = OneSidedSelection(n_neighbors=1,
                                    n_seeds_S=counter[1],
                                    n_jobs=-1,
                                    random_state=seed)
X_train_full_fs, y_train_full = undersample_oss.fit_resample(
    X_train_full_fs, y_train_full)
counter = Counter(y_train_full)
print("After OSS undersampling, the class distribution is:")
print(counter)
# Second under-sampling pass, applied on top of the OSS result, using the
# Neighbourhood Cleaning Rule.
undersample_ncr = NeighbourhoodCleaningRule(n_neighbors=3,
                                            threshold_cleaning=0.5,
                                            n_jobs=-1)
X_train_full_fs, y_train_full = undersample_ncr.fit_resample(
    X_train_full_fs, y_train_full)
counter = Counter(y_train_full)
print("After NCR undersampling, the class distribution is:")
print(counter)

# Saving to Local
print("Saving to Local in csv...")
# Write the resampled features without the index column.
# NOTE(review): relies on the samplers returning an object with `.to_csv`
# (presumably a pandas DataFrame) -- confirm the input type.
X_train_full_fs.to_csv("./data/X_train.csv", index=False)