def test_iht_fit_sample_half():
    """Test the fit sample routine with a 0.7 ratio"""
    # Resample the data
    ratio = 0.7
    iht = InstanceHardnessThreshold(ESTIMATOR, ratio=ratio,
                                    random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251],
                     [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941],
                     [0.91542919, -0.65453327],
                     [-0.03852113, 0.40910479],
                     [-0.43877303, 1.07366684],
                     [-0.85795321, 0.82980738],
                     [-0.18430329, 0.52328473],
                     [-0.30126957, -0.66268378],
                     [-0.65571327, 0.42412021],
                     [-0.28305528, 0.30284991],
                     [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502],
                     [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""
    # Resample the data
    iht = InstanceHardnessThreshold(ESTIMATOR, return_indices=True,
                                    random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = iht.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251],
                     [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941],
                     [0.91542919, -0.65453327],
                     [-0.43877303, 1.07366684],
                     [-0.85795321, 0.82980738],
                     [-0.18430329, 0.52328473],
                     [-0.65571327, 0.42412021],
                     [-0.28305528, 0.30284991],
                     [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502],
                     [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
    idx_gt = np.array([0, 1, 2, 3, 5, 6, 7, 9, 10, 12, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
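# These tests reference module-level fixtures (X, Y, ESTIMATOR, RND_SEED)
# that are not shown in this section. A minimal sketch follows, reconstructed
# from the ground-truth arrays and indices asserted above; treat the exact
# seed value as an assumption rather than the canonical fixture.
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

RND_SEED = 0  # assumed seed; any fixed integer keeps the tests deterministic
X = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189],
              [-0.77740357, 0.74097941], [0.91542919, -0.65453327],
              [-0.03852113, 0.40910479], [-0.43877303, 1.07366684],
              [-0.85795321, 0.82980738], [-0.18430329, 0.52328473],
              [-0.30126957, -0.66268378], [-0.65571327, 0.42412021],
              [-0.28305528, 0.30284991], [0.20246714, -0.34727125],
              [1.06446472, -1.09279772], [0.30543283, -0.02589502],
              [-0.00717161, 0.00318087]])
Y = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0])
ESTIMATOR = GradientBoostingClassifier(random_state=RND_SEED)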
def under_sampling(self):
    from sklearn.linear_model import LogisticRegression
    from imblearn.under_sampling import InstanceHardnessThreshold
    iht = InstanceHardnessThreshold(
        random_state=0,
        estimator=LogisticRegression(solver='lbfgs', multi_class='auto'))
    self.X_resampled, self.y_resampled = iht.fit_resample(self.X, self.y)
def test_iht_fit_resample_wrong_class_obj():
    from sklearn.cluster import KMeans
    est = KMeans()
    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
    with pytest.raises(ValueError, match="Invalid parameter `estimator`"):
        iht.fit_resample(X, Y)
def test_iht_wrong_estimator():
    ratio = 0.7
    est = 'rnd'
    iht = InstanceHardnessThreshold(estimator=est, ratio=ratio,
                                    random_state=RND_SEED)
    with raises(NotImplementedError):
        iht.fit_sample(X, Y)
def test_iht_fit_resample_half():
    sampling_strategy = {0: 3, 1: 3}
    iht = InstanceHardnessThreshold(
        NB(), sampling_strategy=sampling_strategy, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_resample(X, Y)
    assert X_resampled.shape == (6, 2)
    assert y_resampled.shape == (6, )
def test_iht_fit_resample_half():
    sampling_strategy = {0: 6, 1: 8}
    iht = InstanceHardnessThreshold(
        ESTIMATOR, sampling_strategy=sampling_strategy,
        random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_resample(X, Y)
    assert X_resampled.shape == (14, 2)
    assert y_resampled.shape == (14, )
def test_iht_sample_wrong_X():
    """Test that an error is raised when X differs between fit and sample"""
    # Create the object
    iht = InstanceHardnessThreshold(random_state=RND_SEED)
    iht.fit(X, Y)
    assert_raises(RuntimeError, iht.sample,
                  np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
def sample_func():
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=1000, random_state=10)
    print('Original dataset shape %s' % Counter(y))
    cnv_vec = np.vectorize(convert_neg_class)
    y = cnv_vec(y)
    iht = InstanceHardnessThreshold(random_state=42)
    X_res, y_res = iht.fit_resample(X, y)
    print('Resampled dataset shape %s' % Counter(y_res))
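# sample_func relies on a convert_neg_class helper that is not defined in
# this section. A minimal sketch under the assumption that it relabels the
# 0 class as -1 (a common convention for binary classifiers); both the name
# and the mapping are assumptions, not part of the original source.
def convert_neg_class(label):
    """Map the 0 class label to -1, leaving other labels unchanged."""
    return -1 if label == 0 else label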
def test_iht_reproducibility():
    from sklearn.datasets import load_digits
    X_digits, y_digits = load_digits(return_X_y=True)
    idx_sampled = []
    for seed in range(5):
        est = RandomForestClassifier(n_estimators=10, random_state=seed)
        iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
        iht.fit_resample(X_digits, y_digits)
        idx_sampled.append(iht.sample_indices_.copy())
    for idx_1, idx_2 in zip(idx_sampled, idx_sampled[1:]):
        assert_array_equal(idx_1, idx_2)
def test_iht_fit_sample():
    """Test the fit sample routine"""
    # Resample the data
    iht = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def instance_hardness_threshold(X, y, visualize=False, pca2d=True,
                                pca3d=True, tsne=True, pie_evr=True):
    iht = InstanceHardnessThreshold(random_state=42)
    X_res, y_res = iht.fit_resample(X, y)
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def test_iht_fit_sample():
    iht = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)
    X_gt = np.array([[-0.3879569, 0.6894251],
                     [0.91542919, -0.65453327],
                     [-0.65571327, 0.42412021],
                     [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502],
                     [-0.00717161, 0.00318087],
                     [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941],
                     [-0.43877303, 1.07366684],
                     [-0.85795321, 0.82980738],
                     [-0.18430329, 0.52328473],
                     [-0.28305528, 0.30284991]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_linear_svm():
    """Test the fit sample routine with linear SVM"""
    # Resample the data
    est = 'linear-svm'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x_svm.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y_svm.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_gradient_boosting():
    """Test the fit sample routine with gradient boosting"""
    # Resample the data
    est = 'gradient-boosting'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x_gb.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y_gb.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit():
    """Test the fitting method"""
    # Create the object
    iht = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    # Fit the data
    iht.fit(X, Y)

    # Check that the class statistics have been computed
    assert_equal(iht.min_c_, 0)
    assert_equal(iht.maj_c_, 1)
    assert_equal(iht.stats_c_[0], 500)
    assert_equal(iht.stats_c_[1], 4500)
def test_multiclass_error():
    """Test that a warning is raised when the target is not binary"""
    # continuous case
    y = np.linspace(0, 1, 5000)
    iht = InstanceHardnessThreshold(random_state=RND_SEED)
    assert_warns(UserWarning, iht.fit, X, y)

    # multiclass case
    y = np.array([0] * 2000 + [1] * 2000 + [2] * 1000)
    iht = InstanceHardnessThreshold(random_state=RND_SEED)
    assert_warns(UserWarning, iht.fit, X, y)
def test_iht_fit_sample_decision_tree():
    """Test the fit sample routine with decision-tree"""
    # Resample the data
    est = 'decision-tree'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x_dt.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y_dt.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_resample():
    iht = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_resample(X, Y)
    X_gt = np.array([[-0.3879569, 0.6894251],
                     [0.91542919, -0.65453327],
                     [-0.65571327, 0.42412021],
                     [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502],
                     [-0.00717161, 0.00318087],
                     [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941],
                     [-0.43877303, 1.07366684],
                     [-0.85795321, 0.82980738],
                     [-0.18430329, 0.52328473],
                     [-0.28305528, 0.30284991]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_knn():
    est = 'knn'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)
    X_gt = np.array([[-0.3879569, 0.6894251],
                     [0.91542919, -0.65453327],
                     [-0.65571327, 0.42412021],
                     [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502],
                     [-0.00717161, 0.00318087],
                     [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941],
                     [-0.43877303, 1.07366684],
                     [-0.85795321, 0.82980738],
                     [-0.30126957, -0.66268378],
                     [0.20246714, -0.34727125]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_linear_svm():
    # Resample the data
    est = 'linear-svm'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)
    X_gt = np.array([[-0.3879569, 0.6894251],
                     [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941],
                     [0.91542919, -0.65453327],
                     [-0.03852113, 0.40910479],
                     [-0.43877303, 1.07366684],
                     [-0.18430329, 0.52328473],
                     [-0.65571327, 0.42412021],
                     [-0.28305528, 0.30284991],
                     [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502],
                     [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""
    # Resample the data
    iht = InstanceHardnessThreshold(ESTIMATOR, return_indices=True,
                                    random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = iht.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'iht_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def iht(X, Y):
    from sklearn.linear_model import LogisticRegression
    from imblearn.under_sampling import InstanceHardnessThreshold
    iht = InstanceHardnessThreshold(
        random_state=0,
        estimator=LogisticRegression(solver='lbfgs', multi_class='auto'))
    Y = np.array(Y, dtype=int)
    iht.fit_resample(X, Y)
    indexes = iht.sample_indices_
    # Build a 0/1 mask flagging the samples kept by the resampler
    nobj = len(Y)
    mask = np.zeros(nobj, dtype=int)
    mask[indexes] = 1
    return True, mask
def get_sampler(self):
    sampler = None
    if self.sampler == 'random-over-sampler':
        sampler = RandomOverSampler(random_state=self.random_seed)
    elif self.sampler == 'adasyn':
        sampler = ADASYN(random_state=self.random_seed, n_jobs=self.njobs)
    elif self.sampler == 'smote':
        sampler = SMOTE(random_state=self.random_seed, n_jobs=self.njobs)
    elif self.sampler == 'svm-smote':
        sampler = SVMSMOTE(random_state=self.random_seed, n_jobs=self.njobs)
    elif self.sampler == 'random-under-sampler':
        sampler = RandomUnderSampler(random_state=self.random_seed)
    elif self.sampler == 'tomek-links':
        sampler = TomekLinks(n_jobs=self.njobs)
    elif self.sampler == 'near-miss':
        sampler = NearMiss(n_jobs=self.njobs)
    elif self.sampler == 'instance-hardness':
        sampler = InstanceHardnessThreshold(random_state=self.random_seed,
                                            n_jobs=self.njobs)
    return sampler
def iht(X, Y):
    from sklearn.linear_model import LogisticRegression
    from imblearn.under_sampling import InstanceHardnessThreshold
    iht = InstanceHardnessThreshold(
        random_state=0,
        estimator=LogisticRegression(solver='lbfgs', multi_class='auto'))
    Y = np.array(Y, dtype=int)
    iht.fit_resample(X, Y)
    indexes = iht.sample_indices_
    # Build a 0/1 mask flagging the samples kept by the resampler
    mask = [1 if i in indexes else 0 for i in range(len(X))]
    return True, np.asarray(mask)
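# The two mask-building variants above test membership in sample_indices_
# once per sample. A vectorized equivalent with np.isin, as a sketch; the
# helper name is an assumption, the sample_indices_ semantics are those used
# by the functions above.
def sample_mask(iht, n_samples):
    """Return a 0/1 array marking which of n_samples survived resampling."""
    mask = np.isin(np.arange(n_samples), iht.sample_indices_)
    return mask.astype(int)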
def under_sample(X, y, sampler="RandomUnderSampler"):
    # list of all samplers, in case you want to iterate over all of them
    samplers_list = ['RandomUnderSampler', 'ClusterCentroids', 'NearMiss',
                     'InstanceHardnessThreshold', 'CondensedNearestNeighbour',
                     'EditedNearestNeighbours',
                     'RepeatedEditedNearestNeighbours', 'AllKNN',
                     'NeighbourhoodCleaningRule', 'OneSidedSelection']
    print(samplers_list)

    # no sampler parameters are exposed for now; this dict lets the caller
    # choose a resampler by name (the default is random under-sampling)
    samplers = {
        "RandomUnderSampler": RandomUnderSampler(),
        "ClusterCentroids": ClusterCentroids(),
        "NearMiss": NearMiss(),
        "InstanceHardnessThreshold": InstanceHardnessThreshold(),
        "CondensedNearestNeighbour": CondensedNearestNeighbour(),
        "EditedNearestNeighbours": EditedNearestNeighbours(),
        "RepeatedEditedNearestNeighbours": RepeatedEditedNearestNeighbours(),
        "AllKNN": AllKNN(),
        "NeighbourhoodCleaningRule": NeighbourhoodCleaningRule(),
        "OneSidedSelection": OneSidedSelection(),
    }
    sampler = samplers[sampler]

    # print the class counts before and after resampling
    print("before", sorted(Counter(y).items()))
    # to resample, simply call the fit_resample method of the sampler
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    print("after", sorted(Counter(y_resampled).items()))
    print('===' * 4, 'under_sample finished')
    return X_resampled, y_resampled
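# A usage sketch for under_sample on a synthetic imbalanced dataset; the
# make_classification parameters and variable names are illustrative
# assumptions, not part of the original source.
from collections import Counter
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_classes=2, weights=[0.1, 0.9],
                                     n_samples=1000, random_state=10)
X_bal, y_bal = under_sample(X_demo, y_demo,
                            sampler="InstanceHardnessThreshold")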
def test_iht_init():
    sampling_strategy = 'auto'
    iht = InstanceHardnessThreshold(
        ESTIMATOR, sampling_strategy=sampling_strategy,
        random_state=RND_SEED)
    assert iht.sampling_strategy == sampling_strategy
    assert iht.random_state == RND_SEED
def test_iht_sample_wt_fit():
    """Test that an error is raised when sample is called before fitting"""
    # Create the object
    iht = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    assert_raises(RuntimeError, iht.sample, X, Y)
def test_iht_wrong_estimator():
    # Resample the data
    ratio = 0.7
    est = 'rnd'
    iht = InstanceHardnessThreshold(
        estimator=est, ratio=ratio, random_state=RND_SEED)
    assert_raises(NotImplementedError, iht.fit_sample, X, Y)
class ResamplingAlgorithms(Enum):
    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', InstanceHardnessThreshold(random_state=1))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        filtered_algos = filter(lambda ra: ra.value[0] == name,
                                ResamplingAlgorithms)
        return next(filtered_algos, ResamplingAlgorithms.RO)
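# A lookup sketch for the enum above: fetch a resampler by its display name
# and resample with it; unknown names fall back to random over-sampling.
# The variable names are illustrative assumptions.
display_name, resampler = ResamplingAlgorithms.get_algorithm_by_name('IHT').value
X_res, y_res = resampler.fit_resample(X, Y)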
def test_iht_fit_sample_wrong_class_obj():
    # Resample the data
    from sklearn.cluster import KMeans
    est = KMeans()
    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
    assert_raises_regex(ValueError, "Invalid parameter `estimator`",
                        iht.fit_sample, X, Y)
def test_iht_fit_sample_class_obj():
    """Test the fit sample routine passing a ClassifierMixin object"""
    # Resample the data
    est = GradientBoostingClassifier(random_state=RND_SEED)
    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)
    X_gt = np.array([[-0.3879569, 0.6894251],
                     [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941],
                     [0.91542919, -0.65453327],
                     [-0.43877303, 1.07366684],
                     [-0.85795321, 0.82980738],
                     [-0.18430329, 0.52328473],
                     [-0.65571327, 0.42412021],
                     [-0.28305528, 0.30284991],
                     [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502],
                     [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_gradient_boosting():
    """Test the fit sample routine with gradient boosting"""
    # Resample the data
    est = 'gradient-boosting'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)
    X_gt = np.array([[-0.3879569, 0.6894251],
                     [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941],
                     [0.91542919, -0.65453327],
                     [-0.43877303, 1.07366684],
                     [-0.85795321, 0.82980738],
                     [-0.18430329, 0.52328473],
                     [-0.65571327, 0.42412021],
                     [-0.28305528, 0.30284991],
                     [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502],
                     [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_linear_svm():
    """Test the fit sample routine with linear SVM"""
    # Resample the data
    est = 'linear-svm'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)
    X_gt = np.array([[-0.3879569, 0.6894251],
                     [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941],
                     [0.91542919, -0.65453327],
                     [-0.03852113, 0.40910479],
                     [-0.43877303, 1.07366684],
                     [-0.18430329, 0.52328473],
                     [-0.65571327, 0.42412021],
                     [-0.28305528, 0.30284991],
                     [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502],
                     [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_knn():
    """Test the fit sample routine with knn"""
    # Resample the data
    est = 'knn'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)
    X_gt = np.array([[-0.3879569, 0.6894251],
                     [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941],
                     [0.91542919, -0.65453327],
                     [-0.43877303, 1.07366684],
                     [-0.85795321, 0.82980738],
                     [-0.30126957, -0.66268378],
                     [-0.65571327, 0.42412021],
                     [0.20246714, -0.34727125],
                     [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502],
                     [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_init():
    ratio = 'auto'
    iht = InstanceHardnessThreshold(ESTIMATOR, ratio=ratio,
                                    random_state=RND_SEED)
    assert iht.ratio == ratio
    assert iht.random_state == RND_SEED
def test_iht_fit_resample_wrong_class_obj():
    from sklearn.cluster import KMeans
    est = KMeans()
    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
    with raises(ValueError, match="Invalid parameter `estimator`"):
        iht.fit_resample(X, Y)
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Four subplots in a 2x2 grid, flattened so we can iterate over the axes
f, axs = plt.subplots(2, 2)
axs = [a for ax in axs for a in ax]
for ax, ratio in zip(axs, [0.0, 0.1, 0.3, 0.5]):
    if ratio == 0.0:
        ax.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                   alpha=0.5, edgecolor=almost_black, facecolor=palette[0],
                   linewidth=0.15)
        ax.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                   alpha=0.5, edgecolor=almost_black, facecolor=palette[2],
                   linewidth=0.15)
        ax.set_title('Original set')
    else:
        iht = InstanceHardnessThreshold(ratio=ratio)
        X_res, y_res = iht.fit_sample(X, y)
        X_res_vis = pca.transform(X_res)
        ax.scatter(X_res_vis[y_res == 0, 0], X_res_vis[y_res == 0, 1],
                   label="Class #0", alpha=.5, edgecolor=almost_black,
                   facecolor=palette[0], linewidth=0.15)
        ax.scatter(X_res_vis[y_res == 1, 0], X_res_vis[y_res == 1, 1],
                   label="Class #1", alpha=.5, edgecolor=almost_black,
                   facecolor=palette[2], linewidth=0.15)
        ax.set_title('Instance Hardness Threshold ({})'.format(ratio))

plt.show()
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Four subplots in a 2x2 grid, flattened so we can iterate over the axes
f, axs = plt.subplots(2, 2)
axs = [a for ax in axs for a in ax]
for ax, sampling_strategy in zip(axs, (0,
                                       {1: 25, 0: 10},
                                       {1: 14, 0: 10},
                                       {1: 10, 0: 10})):
    if sampling_strategy == 0:
        c0, c1 = plot_resampling(ax, X_vis, y, 'Original set')
    else:
        iht = InstanceHardnessThreshold(sampling_strategy=sampling_strategy,
                                        estimator=LogisticRegression(),
                                        return_indices=True)
        X_res, y_res, idx_res = iht.fit_resample(X, y)
        X_res_vis = pca.transform(X_res)
        plot_resampling(ax, X_res_vis, y_res,
                        'Instance Hardness Threshold ({})'
                        .format(sampling_strategy))
        # plot samples which have been removed
        idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                           idx_res)
        c3 = ax.scatter(X_vis[idx_samples_removed, 0],
                        X_vis[idx_samples_removed, 1],
                        alpha=.2, label='Removed samples')

plt.figlegend((c0, c1, c3), ('Class #0', 'Class #1', 'Removed samples'),
              loc='lower center', ncol=3, labelspacing=0.)
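# The script above calls a plot_resampling helper that is not defined in this
# section. A minimal sketch that returns the two scatter handles the
# figlegend expects; the exact styling is an assumption.
def plot_resampling(ax, X, y, title):
    """Scatter the two classes on ax and return the artist handles."""
    c0 = ax.scatter(X[y == 0, 0], X[y == 0, 1], label="Class #0", alpha=0.5)
    c1 = ax.scatter(X[y == 1, 0], X[y == 1, 1], label="Class #1", alpha=0.5)
    ax.set_title(title)
    return c0, c1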