def test_enn_sample_wrong_X():
    """Check that ``sample`` rejects data that differ from the fitted data.

    The sampler is fitted on the module-level ``X``/``Y`` and then asked to
    sample a random (100, 40) array; a ``RuntimeError`` is expected.
    """
    sampler = EditedNearestNeighbours(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Data unrelated to the fitted set: 100 random rows, 50 labels per class.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def test_enn_fit_sample():
    """Check ``fit_sample`` with default parameters against fixed output."""
    expected_X = np.array([[-0.10903849, -0.12085181],
                           [0.01936241, 0.17799828],
                           [2.59928271, 0.93323465],
                           [1.92365863, 0.82718767],
                           [0.25738379, 0.95564169],
                           [0.78318102, 2.59153329],
                           [0.52726792, -0.38735648]])
    expected_y = np.array([0, 0, 1, 1, 2, 2, 2])

    sampler = EditedNearestNeighbours()
    res_X, res_y = sampler.fit_sample(X, Y)

    assert_array_equal(res_X, expected_X)
    assert_array_equal(res_y, expected_y)
def test_enn_fit_sample():
    """Compare ``fit_sample`` output with the arrays stored under ``data/``."""
    sampler = EditedNearestNeighbours(random_state=RND_SEED)
    res_X, res_y = sampler.fit_sample(X, Y)

    # Ground-truth arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    expected_X = np.load(os.path.join(data_dir, 'enn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'enn_y.npy'))

    assert_array_equal(res_X, expected_X)
    assert_array_equal(res_y, expected_y)
def test_enn_fit():
    """Check the class statistics exposed by the sampler after ``fit``."""
    sampler = EditedNearestNeighbours(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Minority / majority labels recorded during fitting.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    # Per-class sample counts recorded during fitting.
    for label, expected_count in ((0, 500), (1, 4500)):
        assert_equal(sampler.stats_c_[label], expected_count)
def test_enn_fit_sample_mode():
    """Check ``fit_sample`` with ``kind_sel='mode'`` against fixed output."""
    expected_X = np.array([[-0.10903849, -0.12085181],
                           [0.01936241, 0.17799828],
                           [2.59928271, 0.93323465],
                           [1.42772181, 0.526027],
                           [1.92365863, 0.82718767],
                           [0.25738379, 0.95564169],
                           [-0.284881, -0.62730973],
                           [0.57062627, 1.19528323],
                           [0.78318102, 2.59153329],
                           [0.35831463, 1.33483198],
                           [-0.14313184, -1.0412815],
                           [-0.09816301, -0.74662486],
                           [0.52726792, -0.38735648],
                           [0.2821046, -0.07862747]])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2])

    sampler = EditedNearestNeighbours(kind_sel='mode')
    res_X, res_y = sampler.fit_sample(X, Y)

    assert_array_equal(res_X, expected_X)
    assert_array_equal(res_y, expected_y)
def test_enn_fit_sample_with_indices():
    """Compare ``fit_sample`` output, including indices, with stored arrays."""
    sampler = EditedNearestNeighbours(return_indices=True,
                                      random_state=RND_SEED)
    res_X, res_y, res_idx = sampler.fit_sample(X, Y)

    # Ground-truth arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    expected_X = np.load(os.path.join(data_dir, 'enn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'enn_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'enn_idx.npy'))

    assert_array_equal(res_X, expected_X)
    assert_array_equal(res_y, expected_y)
    assert_array_equal(res_idx, expected_idx)
def test_multiclass_fit_sample():
    """Check the per-class counts after resampling a three-class target."""
    # Turn the target into a multiclass one by relabelling the first 1000.
    y = Y.copy()
    y[:1000] = 2

    sampler = EditedNearestNeighbours(random_state=RND_SEED)
    _, y_res = sampler.fit_sample(X, y)

    class_counts = Counter(y_res)
    for label, expected_count in ((0, 400), (1, 1836), (2, 5)):
        assert_equal(class_counts[label], expected_count)
def test_enn_fit_sample_with_indices():
    """Check ``fit_sample`` with ``return_indices=True`` against fixed output."""
    expected_X = np.array([[-0.10903849, -0.12085181],
                           [0.01936241, 0.17799828],
                           [2.59928271, 0.93323465],
                           [1.92365863, 0.82718767],
                           [0.25738379, 0.95564169],
                           [0.78318102, 2.59153329],
                           [0.52726792, -0.38735648]])
    expected_y = np.array([0, 0, 1, 1, 2, 2, 2])
    expected_idx = np.array([4, 11, 0, 3, 1, 8, 15])

    sampler = EditedNearestNeighbours(return_indices=True,
                                      random_state=RND_SEED)
    res_X, res_y, res_idx = sampler.fit_sample(X, Y)

    assert_array_equal(res_X, expected_X)
    assert_array_equal(res_y, expected_y)
    assert_array_equal(res_idx, expected_idx)
def test_enn_fit_sample_with_nn_object():
    """Check ``fit_sample`` when ``n_neighbors`` is a ``NearestNeighbors``."""
    expected_X = np.array([[-0.10903849, -0.12085181],
                           [0.01936241, 0.17799828],
                           [2.59928271, 0.93323465],
                           [1.42772181, 0.526027],
                           [1.92365863, 0.82718767],
                           [0.25738379, 0.95564169],
                           [-0.284881, -0.62730973],
                           [0.57062627, 1.19528323],
                           [0.78318102, 2.59153329],
                           [0.35831463, 1.33483198],
                           [-0.14313184, -1.0412815],
                           [-0.09816301, -0.74662486],
                           [0.52726792, -0.38735648],
                           [0.2821046, -0.07862747]])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2])

    # An estimator instance is passed where an integer is usually given.
    neighbours = NearestNeighbors(n_neighbors=4)
    sampler = EditedNearestNeighbours(n_neighbors=neighbours,
                                      random_state=RND_SEED,
                                      kind_sel='mode')
    res_X, res_y = sampler.fit_sample(X, Y)

    assert_array_equal(res_X, expected_X)
    assert_array_equal(res_y, expected_y)
def fit(self, X, y, by, random_state=None, visualize=False):
    """Resample ``(X, y)`` with the strategy named by ``by``, then fit.

    Parameters
    ----------
    X, y
        Training data handed to the sampler's ``fit_resample`` and then to
        ``self.base_estimator.fit``.
    by : str
        Resampling method. Supported values: 'RUS', 'CNN', 'ENN', 'NCR',
        'Tomek', 'ALLKNN', 'OSS', 'NM', 'CC', 'SMOTE', 'ADASYN',
        'BorderSMOTE', 'SMOTEENN', 'SMOTETomek', and 'ORG' (no resampling).
    random_state
        Forwarded only to the samplers constructed with it below; 'ENN',
        'NCR', 'Tomek', 'ALLKNN' and 'NM' are built with their defaults.
    visualize
        Accepted but not used by this method.

    Notes
    -----
    NOTE(review): ``Error`` is not defined in the visible source; confirm it
    exists in this module, otherwise an unknown ``by`` surfaces as NameError.
    """
    # Each factory builds its sampler lazily so only the requested one is
    # instantiated.
    factories = {
        'RUS': lambda: RandomUnderSampler(random_state=random_state),
        'CNN': lambda: CondensedNearestNeighbour(random_state=random_state),
        'ENN': lambda: EditedNearestNeighbours(),
        'NCR': lambda: NeighbourhoodCleaningRule(),
        'Tomek': lambda: TomekLinks(),
        'ALLKNN': lambda: AllKNN(),
        'OSS': lambda: OneSidedSelection(random_state=random_state),
        'NM': lambda: NearMiss(),
        'CC': lambda: ClusterCentroids(random_state=random_state),
        'SMOTE': lambda: SMOTE(random_state=random_state),
        'ADASYN': lambda: ADASYN(random_state=random_state),
        'BorderSMOTE': lambda: BorderlineSMOTE(random_state=random_state),
        'SMOTEENN': lambda: SMOTEENN(random_state=random_state),
        'SMOTETomek': lambda: SMOTETomek(random_state=random_state),
    }

    if by == 'ORG':
        # Original data: train on the input untouched.
        X_train, y_train = X, y
    elif by in factories:
        X_train, y_train = factories[by]().fit_resample(X, y)
    else:
        raise Error('Unexpected \'by\' type {}'.format(by))

    self.base_estimator.fit(X_train, y_train)
def under_sampling_algs():
    """Return the under-sampling configurations as ``(sampler, label)`` pairs.

    The first entry is a pair of plain strings standing for the
    "no re-sampling" case; every other entry pairs a sampler instance with
    its short label.
    """
    return [
        ("No Rs Undersampling case", "No Re-sampling"),
        (RandomUnderSampler(random_state=1), 'RU'),
        (ClusterCentroids(random_state=1), 'CC'),
        (TomekLinks(), 'TL'),
        (NearMiss(version=1), 'NM1'),
        (NearMiss(version=2), 'NM2'),
        (NearMiss(version=3), 'NM3'),
        (CondensedNearestNeighbour(random_state=1), 'CNN'),
        (OneSidedSelection(random_state=1), 'OSS'),
        (EditedNearestNeighbours(), 'ENN'),
        (NeighbourhoodCleaningRule(), 'NCL'),
        (InstanceHardnessThreshold(random_state=1), 'IHT'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (AllKNN(), 'AllKNN'),
    ]
def get_models():
    """Return two parallel lists: sampler instances and their short names."""
    labelled = [
        ('TL', TomekLinks()),
        ('ENN', EditedNearestNeighbours()),
        ('RENN', RepeatedEditedNearestNeighbours()),
        ('OSS', OneSidedSelection()),
        ('NCR', NeighbourhoodCleaningRule()),
    ]
    models = [sampler for _, sampler in labelled]
    names = [label for label, _ in labelled]
    return models, names
def test_validate_estimator_init():
    """Check ``SMOTEENN`` when explicit SMOTE and ENN objects are supplied."""
    expected_X = np.array([[0.11622591, -0.0317206],
                           [1.25192108, -0.22367336],
                           [0.53366841, -0.30312976],
                           [1.52091956, -0.49283504],
                           [0.88407872, 0.35454207],
                           [1.31301027, -0.92648734],
                           [-0.41635887, -0.38299653],
                           [1.70580611, -0.11219234],
                           [0.29307743, -0.14670439],
                           [0.84976473, -0.15570176],
                           [0.61319159, -0.11571668],
                           [0.66052536, -0.28246517],
                           [-0.28162401, -2.10400981],
                           [0.83680821, 1.72827342],
                           [0.08711622, 0.93259929]])
    expected_y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])

    # Create the SMOTE and ENN objects handed to the combined sampler.
    over_sampler = SMOTE(random_state=RND_SEED)
    cleaner = EditedNearestNeighbours(random_state=RND_SEED)
    combined = SMOTEENN(smote=over_sampler, enn=cleaner,
                        random_state=RND_SEED)
    res_X, res_y = combined.fit_sample(X, Y)

    assert_allclose(res_X, expected_X, rtol=R_TOL)
    assert_array_equal(res_y, expected_y)
def resample(X, Y, rate=0.9, strategy='hybrid'):
    """Balance a dataset with a sampling-based method.

    Args:
        X (pd.DataFrame): Main dataset with the variables.
        Y (pd.Series): Target variable.
        rate (float): Passed as ``sampling_strategy`` to the 'hybrid'
            (SMOTEENN) and 'over_sampling' (SMOTE) samplers. The
            'under_sampling' sampler is built without it.
        strategy ('hybrid' | 'over_sampling' | 'under_sampling'): Which
            sampler to use. Any other value raises ``KeyError``.

    Returns:
        tuple: The resampled features as a ``pd.DataFrame`` carrying the
        column labels of ``X``, and the resampled target as returned by the
        sampler.
    """
    builders = {
        'hybrid': lambda: SMOTEENN(sampling_strategy=rate),
        'over_sampling': lambda: SMOTE(sampling_strategy=rate),
        'under_sampling': lambda: EditedNearestNeighbours(),
    }
    sampler = builders[strategy]()

    column_labels = X.columns
    X_balanced, Y_balanced = sampler.fit_resample(X, Y)
    balanced_frame = pd.DataFrame(data=X_balanced, columns=column_labels)
    return balanced_frame, Y_balanced
def test_sample_regular_pass_smote_enn():
    """Check ``SMOTEENN`` with user-supplied SMOTE and ENN estimators."""
    expected_X = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    over_sampler = SMOTE(sampling_strategy="auto", random_state=RND_SEED)
    cleaner = EditedNearestNeighbours(sampling_strategy="all")
    combined = SMOTEENN(
        smote=over_sampler,
        enn=cleaner,
        random_state=RND_SEED,
    )
    res_X, res_y = combined.fit_resample(X, Y)

    assert_allclose(res_X, expected_X, rtol=R_TOL)
    assert_array_equal(res_y, expected_y)
def get_samplers():
    """Return a dict mapping a short name to a freshly built sampler.

    Entries are grouped as under-samplers, over-samplers and combined
    samplers, inserted in that order. The two combined samplers restrict
    their cleaning step to the majority class.
    """
    under_samplers = {
        'RandomUn': RandomUnderSampler(),
        'TL': TomekLinks(),
        # 'ENN': EditedNearestNeighbours(),
        'RENN': RepeatedEditedNearestNeighbours(),
        'OSS': OneSidedSelection(),
        'NCR': NeighbourhoodCleaningRule(),
        'IHT': InstanceHardnessThreshold(),
    }
    over_samplers = {
        'RandomOv': RandomOverSampler(),
        'SMOTE': SMOTE(),
        'SMOTESVM': SVMSMOTE(),
        # 'SMOTEKMeans': KMeansSMOTE(),
        'ADASYN': ADASYN(),
    }
    combined_samplers = {
        'SMOTEENN': SMOTEENN(
            enn=EditedNearestNeighbours(sampling_strategy='majority')),
        'SMOTETomek': SMOTETomek(
            tomek=TomekLinks(sampling_strategy='majority')),
    }

    samplers = dict(under_samplers)
    samplers.update(over_samplers)
    samplers.update(combined_samplers)
    return samplers
def create_sampler(sampler_name, random_state=None):
    """Build the sampler identified by ``sampler_name``.

    Names are matched case-insensitively. ``None`` and the string 'None'
    (in any casing, consistently with the other names) mean "no sampler".

    Parameters
    ----------
    sampler_name : str or None
        One of 'randomundersampler', 'tomeklinks', 'enn', 'ncl',
        'randomoversampler', 'smote', 'smotetomek', 'smoteenn', or
        ``None`` / 'None'.
    random_state
        Forwarded as ``random_state`` to the sampler constructor.

    Returns
    -------
    The constructed sampler, or ``None`` when no sampler is requested.

    Raises
    ------
    ValueError
        If ``sampler_name`` is not one of the supported names.
    """
    if sampler_name is None:
        return None

    # Normalise once instead of lowering the name for every comparison.
    key = sampler_name.lower()
    if key == 'none':
        return None
    if key == 'randomundersampler':
        return RandomUnderSampler(random_state=random_state)
    if key == 'tomeklinks':
        return TomekLinks(random_state=random_state)
    if key == 'enn':
        return EditedNearestNeighbours(random_state=random_state)
    if key == 'ncl':
        return NeighbourhoodCleaningRule(random_state=random_state)
    if key == 'randomoversampler':
        return RandomOverSampler(random_state=random_state)
    if key == 'smote':
        return SMOTE(random_state=random_state)
    if key == 'smotetomek':
        return SMOTETomek(random_state=random_state)
    if key == 'smoteenn':
        return SMOTEENN(random_state=random_state)
    raise ValueError('Unsupported value \'%s\' for sampler' % sampler_name)
def resample_data(predictors, target, df_data, method):
    """Resample a training dataset prior to training models.

    Parameters
    ----------
    predictors : list
        Column labels of ``df_data`` used as features; also used as the
        column labels of the returned feature frame.
    target : str
        Column label of ``df_data`` holding the target.
    df_data : pd.DataFrame
        Data to resample.
    method : str
        Name of the resampling technique; see the branches below for the
        accepted values.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Resampled features (columns ``predictors``) and resampled target
        (single column ``target``).

    Raises
    ------
    ValueError
        If ``method`` is not a supported name. Previously this case fell
        through and failed with ``UnboundLocalError`` on ``util``.
    """
    if method == 'adasyn':
        util = ADASYN()
    elif method == 'random-over-sampler':
        util = RandomOverSampler()
    elif method == 'smote':
        util = SMOTE(kind='borderline2')
    elif method == 'smote-tomek':
        util = SMOTETomek()
    elif method == 'smote-enn':
        util = SMOTEENN()
    elif method == 'edited-nn':
        util = EditedNearestNeighbours()
    elif method == 'repeated-edited-nn':
        util = RepeatedEditedNearestNeighbours()
    elif method == 'all-knn':
        util = AllKNN()
    elif method == 'one-sided-selection':
        util = OneSidedSelection()
    elif method == 'cluster-centroids':
        util = ClusterCentroids()
    elif method == 'random-under-sampler':
        util = RandomUnderSampler()
    elif method == 'neighbourhood-cleaning-rule':
        util = NeighbourhoodCleaningRule()
    elif method == 'condensed-nearest-neighbour':
        util = CondensedNearestNeighbour()
    elif method == 'near-miss':
        util = NearMiss(version=1)
    elif method == 'instance-hardness-threshold':
        util = InstanceHardnessThreshold()
    else:
        # Fail early with a clear message instead of an unbound local.
        raise ValueError(
            'Unsupported resampling method: {!r}'.format(method))

    x_resampled, y_resampled = util.fit_sample(df_data[predictors],
                                               df_data[target])
    # Restore the column labels on the sampler output.
    x_resampled = pd.DataFrame(x_resampled, columns=predictors)
    y_resampled = pd.DataFrame(y_resampled, columns=[target])
    return x_resampled, y_resampled
def build_loaders(titles, labels, batch_size, under_sample=False, over_sample=False):
    """Split the data, optionally resample the training part, build loaders.

    10% of the data is held out; of that hold-out, 1% becomes the test split
    and the rest the validation split. When requested, the training split is
    under-sampled with ``EditedNearestNeighbours(n_neighbors=2)`` and/or
    over-sampled with ``SMOTE(sampling_strategy=1)``, in that order.

    Returns the loaders in the order ``(train, test, validation)``. Every
    loader shuffles and drops the last incomplete batch.
    """
    train_titles, holdout_titles, train_labels, holdout_labels = \
        train_test_split(titles, labels, test_size=0.1)
    val_titles, test_titles, val_labels, test_labels = \
        train_test_split(holdout_titles, holdout_labels, test_size=0.01)

    steps = []
    if under_sample:
        steps.append(("Under", EditedNearestNeighbours(n_neighbors=2)))
    if over_sample:
        steps.append(("Over", SMOTE(sampling_strategy=1)))
    # ``steps`` is non-empty exactly when at least one flag was set.
    if steps:
        train_titles, train_labels = Pipeline(steps=steps).fit_resample(
            train_titles, train_labels)

    # Report the class ratio of each split.
    for header, split_labels in (("Train:", train_labels),
                                 ("Validation:", val_labels),
                                 ("Test:", test_labels)):
        print(header)
        calc_ratio(split_labels)

    def _as_dataset(split_titles, split_labels):
        return TensorDataset(torch.from_numpy(split_titles),
                             torch.from_numpy(split_labels))

    def _as_loader(dataset):
        return DataLoader(dataset, shuffle=True, batch_size=batch_size,
                          drop_last=True)

    train_set = _as_dataset(train_titles, train_labels)
    val_set = _as_dataset(val_titles, val_labels)
    test_set = _as_dataset(test_titles, test_labels)

    train_loader = _as_loader(train_set)
    test_loader = _as_loader(test_set)
    val_loader = _as_loader(val_set)
    return train_loader, test_loader, val_loader
def __init__(self):
    """Build the registries of imbalanced-learn estimators, keyed by name.

    Three dicts are stored on the instance: ``oversamplers``,
    ``undersamplers`` and ``ensemblesamplers``. Every value is a
    default-constructed instance of the class named by its key.
    """
    # Imported locally so imbalanced-learn is only required once an
    # instance is created.
    from imblearn.over_sampling import (
        SMOTE, ADASYN, SVMSMOTE, BorderlineSMOTE, RandomOverSampler)
    from imblearn.under_sampling import (
        ClusterCentroids, RandomUnderSampler, InstanceHardnessThreshold,
        NearMiss, TomekLinks, EditedNearestNeighbours,
        RepeatedEditedNearestNeighbours, AllKNN, OneSidedSelection,
        CondensedNearestNeighbour, NeighbourhoodCleaningRule)
    from imblearn.ensemble import (
        EasyEnsemble, EasyEnsembleClassifier, BalancedBaggingClassifier,
        BalancedRandomForestClassifier, BalanceCascade, RUSBoostClassifier)

    self.oversamplers = {
        'ADASYN': ADASYN(),
        'RandomOverSampler': RandomOverSampler(),
        'SMOTE': SMOTE(),
        'BorderlineSMOTE': BorderlineSMOTE(),
        'SVMSMOTE': SVMSMOTE()
    }
    self.undersamplers = {
        'ClusterCentroids': ClusterCentroids(),
        'RandomUnderSampler': RandomUnderSampler(),
        'InstanceHardnessThreshold': InstanceHardnessThreshold(),
        'NearMiss': NearMiss(),
        'TomekLinks': TomekLinks(),
        'EditedNearestNeighbours': EditedNearestNeighbours(),
        'RepeatedEditedNearestNeighbours': RepeatedEditedNearestNeighbours(),
        'AllKNN': AllKNN(),
        'OneSidedSelection': OneSidedSelection(),
        'CondensedNearestNeighbour': CondensedNearestNeighbour(),
        'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule()
    }
    self.ensemblesamplers = {
        'EasyEnsemble': EasyEnsemble(),
        'EasyEnsembleClassifier': EasyEnsembleClassifier(),
        'BalancedBaggingClassifier': BalancedBaggingClassifier(),
        'BalanceCascade': BalanceCascade(),
        # Instantiated like every other entry; the bare class was stored
        # here before, so callers got a type instead of an estimator.
        'BalancedRandomForestClassifier': BalancedRandomForestClassifier(),
        'RUSBoostClassifier': RUSBoostClassifier()
    }
def test_enn_init():
    """Check the default constructor values exposed as attributes."""
    sampler = EditedNearestNeighbours()
    expected_defaults = (
        ('n_neighbors', 3),
        ('kind_sel', 'all'),
        ('n_jobs', 1),
    )
    for attribute, expected in expected_defaults:
        assert getattr(sampler, attribute) == expected
def test_enn_not_good_object():
    """Check that an invalid ``n_neighbors`` object is rejected on sampling."""
    # A plain string is passed as ``n_neighbors``.
    invalid_neighbours = 'rnd'
    sampler = EditedNearestNeighbours(n_neighbors=invalid_neighbours,
                                      kind_sel='mode')
    with raises(ValueError, match="has to be one of"):
        sampler.fit_sample(X, Y)
oversampler = SMOTE(ratio=0.2, random_state=np.random.randint(100), kind='regular', n_jobs=-1) os_X_train, os_y_train = oversampler.fit_sample(X_train.fillna(0), y_train) ##ADASYN 运行起来很慢### X_resampled_adasyn, y_resampled_adasyn = ADASYN( sampling_strategy=0.2, n_jobs=-1).fit_sample(train.loc[:, feature].fillna(0).values, train["y"].values.astype('int')) ###删除边界的一些噪声点### from imblearn.under_sampling import EditedNearestNeighbours enn = EditedNearestNeighbours(random_state=0) X_resampled, y_resampled = enn.fit_sample(X, y) dtrain = xgb.DMatrix(data=train.loc[:, feature].astype('float'), label=train['y'].astype('int')) dval = xgb.DMatrix(data=val.loc[:, feature].astype('float'), label=val['y'].astype('int')) train.loc[:, feature].info(null_counts=True) params = { 'booster': 'gbtree', 'objective': 'binary:logistic', 'eval_metric': 'auc', 'max_depth': 6, 'subsample': 0.8, 'colsample_bytree': 0.8,
import numpy as np
from sklearn.svm import SVC
from hyperopt import hp
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import EditedNearestNeighbours
from config import random_seed
from utils.python_utils import quniform_int

# Pipeline: clean the training data with Edited Nearest Neighbours, then fit
# a linear-kernel SVC with probability estimates enabled.
steps = [('undersampler', EditedNearestNeighbours(random_state=random_seed)),
         ('SVC', SVC(C=1, kernel='linear', random_state=random_seed,
                     probability=True))]
model = Pipeline(steps=steps)

# Search-space keys follow the pipeline convention "<step name>__<param>".
# The classifier step is named 'SVC', so the key must be 'SVC__C'; the former
# 'svm__C' did not address any step of ``model``.
params_space = {'SVC__C': hp.quniform('C', 1, 100, 5)}
# # # In[135]: # # # smotenc+enn X_smote = np.array(df_smotenc[[ 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21' ]]) Y_smote = list(df_smotenc['click']) # from imblearn.under_sampling import EditedNearestNeighbours enn = EditedNearestNeighbours() X_resampled, y_resampled = enn.fit_sample(X_smotenc, y_smotenc) # # # In[52]: # # # df_smotenc = pd.DataFrame(X_smotenc, # columns=column1) # df_smotenc = pd.concat([df_smotenc, pd.DataFrame(y_smotenc, columns=['click'])], axis=1) # for i in column1: # df_smotenc[i] = df_smotenc[i].astype(int) # # # In[53]: # # # df_smX_resampledotenc.head()
print(spe/10) print("\n") print("Overall Confusion Matrix: ") np.set_printoptions(formatter={'float': '{:.1f}'.format}) print(tol/10) print("\n") print("Number of cases in each class") print(Counter(y_res)) ########## Edited Nearest Neighbour ########## np.set_printoptions(formatter={'float': '{:.2f}'.format}) X = np.array(principal_4_Df.iloc[:,:-1]) y = np.array(principal_4_Df.iloc[:,-1]) model = KNeighborsClassifier(n_neighbors=8) skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0) us = EditedNearestNeighbours(random_state=0) pipeline = make_pipeline(us, model) X_res , y_res = us.fit_resample(X, y) overall = [] recall = np.zeros((1,13)) spe = np.zeros((1,13)) tol = np.zeros((13,13)) trial = 0 for train_index, test_index in skf.split(X,y): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] model.fit(X_train, y_train) y_pred = cross_val_predict(pipeline, X_test, y_test, cv=skf) score = cross_val_score(pipeline, X_test, y_test, cv=skf).mean() overall.append(score)
print("---------------") print("ratio", i) results['ratio'][a] = i print("neighbors", j) results['neighbors'][a] = j b = a a = a + 1 results['Class'][b] = 0 results['Class'][a] = 1 results['Datasize'][b] = datasize[0] results['Datasize'][a] = datasize[1] results['Training Datasize'][b] = trainingdatasize[0] results['Training Datasize'][a] = trainingdatasize[1] results['Testing Datasize'][b] = testingdatasize[0] results['Testing Datasize'][a] = testingdatasize[1] enn = EditedNearestNeighbours(random_state=5, n_neighbors=j) X_train_sampled, y_train_sampled = enn.fit_sample( X_train_sampled1, y_train_sampled1) samplingdatasize = collections.Counter(y_train_sampled) print("sampled training data size", samplingdatasize) results['After sampling'][b] = samplingdatasize[0] results['After sampling'][a] = samplingdatasize[1] #random forest clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0, oob_score=True) clf.fit(X_train_sampled, y_train_sampled) y_pred = clf.predict(X_test) y_test_arr = np.array(y_test['Outcome'])
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Four subplots on a single row, unpack the axes array immediately
f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4)

# First panel: the original data, one scatter per class.
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Apply the ENN
print('ENN')
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_sample(X, y)
# Project the resampled data with the PCA fitted on the original data.
X_res_vis = pca.transform(X_resampled)
# Share of samples removed by the sampler. A plain '%' is used: the former
# '\%' is not a valid escape sequence and printed a literal backslash.
print('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) / len(X))))
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax2.set_title('Edited nearest neighbours')

# Apply the RENN
print('RENN')
renn = RepeatedEditedNearestNeighbours()
def test_deprecation_random_state():
    """Check that passing ``random_state`` warns when ``fit_sample`` runs."""
    expected_message = "'random_state' is deprecated from 0.4"
    sampler = EditedNearestNeighbours(random_state=0)
    with warns(DeprecationWarning, match=expected_message):
        sampler.fit_sample(X, Y)
def test_enn_not_good_object():
    """Check that an invalid ``n_neighbors`` object is rejected on resampling."""
    # A plain string is passed as ``n_neighbors``.
    invalid_neighbours = 'rnd'
    sampler = EditedNearestNeighbours(n_neighbors=invalid_neighbours,
                                      kind_sel='mode')
    with raises(ValueError, match="has to be one of"):
        sampler.fit_resample(X, Y)
n_samples=100, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Three subplots, unpack the axes array immediately f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2) c0, c1 = plot_resampling(ax1, X_vis, y, 'Original set') # Apply the ENN print('ENN') enn = EditedNearestNeighbours(return_indices=True) X_resampled, y_resampled, idx_resampled = enn.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled) reduction_str = ('Reduced {:.2f}%'.format( 100 * (1 - float(len(X_resampled)) / len(X)))) print(reduction_str) c3 = ax2.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1], alpha=.2, label='Removed samples', c='g') plot_resampling(ax2, X_res_vis, y_resampled, 'ENN - ' + reduction_str) # Apply the RENN print('RENN')
import numpy as np import xgboost as xgb from hyperopt import hp from imblearn.pipeline import Pipeline from imblearn.under_sampling import EditedNearestNeighbours from config import random_seed from utils.python_utils import quniform_int steps = [('undersampler', EditedNearestNeighbours(random_state=random_seed, n_neighbors=3)), ('xgb', xgb.XGBClassifier(n_estimators=6450, colsample_bytree=0.9, learning_rate=0.0271311414499, min_child_weight=4, subsample=0.917109565217, max_depth=25, gamma=0.0100121777578, silent=True, nthread=3, seed=random_seed))] model = Pipeline(steps=steps) params_space = { 'undersampler__n_neighbors': quniform_int('n_neighbors', 2, 10, 1), 'xgb__max_depth': quniform_int('max_depth', 10, 30, 1), 'xgb__min_child_weight':
# model eval recall = recall_score(Y_test,predictions) report = classification_report(Y_test ,predictions) print(f'Recall Logistic Regression {recall: .2f}') print(report) print(balanced_accuracy_score(Y_test, predictions)) t1 = pl.time.time() - t0 print("Time taken: {:.0f} min {:.0f} secs".format(*divmod(t1, 60))) print("best parameters",LR_model.best_params_) plot_confusion_matrix(confusion_matrix(Y_test,predictions),['Dolphin','Non-Dolphin']) #seventh Test----------------------Logestic Regression--------------------------------undersampling with ENN from imblearn.under_sampling import EditedNearestNeighbours Xtrain_tomek, Ytrain_tomek = EditedNearestNeighbours().fit_sample(X_train, Y_train) t0 = pl.time.time() LR = LogisticRegression(max_iter=4000, random_state=49, n_jobs=1, class_weight='balanced') # for liblinear n_jobs is +1. parameters = {"penalty": ['l1', 'l2'],'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], "solver":['liblinear','sag','saga']} LR_model = GridSearchCV(LR, parameters, scoring="precision", cv=3) # fit the classifier LR_model.fit(Xtrain_tomek,Ytrain_tomek.values.ravel()) # get the prediction predictions = LR_model.predict(X_test)
X_train, y_train, cv=10, scoring=('roc_auc', 'average_precision')) scores['test_roc_auc'].mean(), scores['test_average_precision'].mean() # (0.9518183780276207, 0.6767076447148238) ######### Edited Nearest Neighbor ######### # removes all samples that are misclassified by KNN from the training data (`mode`) # Or if have any point from other class as neighbor (`all`) # So basically, what you're doing here is you clean up outliers and boundaries. from imblearn.under_sampling import EditedNearestNeighbours enn = EditedNearestNeighbours(n_neighbors=5) X_train_enn, y_train_enn = enn.fit_sample(X_train, y_train) enn_mode = EditedNearestNeighbours(kind_sel="mode", n_neighbors=5) X_train_enn_mode, y_train_enn_mode = enn_mode.fit_sample(X_train, y_train) print(X_train_enn_mode.shape) print(np.bincount(y_train_enn_mode)) ### Pipeline method enn_pipe = make_imb_pipeline(EditedNearestNeighbours(n_neighbors=5), LogisticRegression()) scores = cross_validate(enn_pipe, X_train, y_train,
alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') # Apply the ENN print('ENN') enn = EditedNearestNeighbours() X_resampled, y_resampled = enn.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1], label="Class #1", alpha=.5, edgecolor=almost_black,
def _fit_and_score(model, X_fit, y_fit, X_test, y_test):
    """Fit ``model`` and summarise its performance on the test split."""
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_test)
    # Only the second column of ``predict_proba`` is kept.
    probas = [row[1] for row in model.predict_proba(X_test)]
    report = metrics.classification_report(
        y_test, predictions,
        target_names=['negative', 'positive', 'mixed'],
        output_dict=True)
    macro = report['macro avg']
    return {
        'w_precision': macro['precision'],
        'w_recall': macro['recall'],
        'w_fscore': macro['f1-score'],
        'predictions': np.array(predictions),
        'probas': probas
    }


def _technique_name(sampler):
    """Return the sampler's class name, suffixed with the NearMiss version."""
    technique = sampler.__class__.__name__
    if technique == 'NearMiss':
        technique += str(sampler.version)
    return technique


def _resample_as_frame(sampler, X_train, y_train, columns):
    """Resample the training data and return the features as a DataFrame."""
    X_resampled, y_resampled = sampler.fit_sample(X_train, y_train)
    X_resampled = pd.DataFrame(X_resampled)
    # ``columns`` may be a list, an ndarray or a pandas Index; testing its
    # truth value directly raises for the latter two, so test explicitly.
    if columns is not None and len(columns) > 0:
        X_resampled.columns = columns
    else:
        X_resampled.columns = X_train.columns
    return X_resampled, y_resampled


def model_resampling_pipeline(X_train, X_test, y_train, y_test, model, b=0.5, name='', eval_show=True, columns=None):
    """Evaluate ``model`` under several class-balancing strategies.

    The model is fitted and scored on the test split (macro-averaged
    precision, recall and F1) in four settings: no balancing,
    ``class_weight='balanced'`` (only when the model exposes a
    ``class_weight`` parameter), each over-sampling technique and each
    under-sampling technique.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test split. ``X_train.columns`` is read when ``columns`` is
        not given.
    model
        Estimator to evaluate. It is wrapped in
        ``CalibratedClassifierCV(model, cv=3)`` when it has no
        ``predict_proba``.
    b
        Accepted but not used by this function.
    name : str
        Prefix of the title of the under-sampling evaluation plot.
    eval_show : bool
        When true, ``evaluate_method`` is called for the under- and
        over-sampling results.
    columns
        Optional column labels for the resampled feature frame.

    Returns
    -------
    dict
        Keys 'ordinary', 'class_weight', 'oversample', 'undersample'. The
        first two hold one score dict (``'class_weight'`` stays empty when
        the model has no such parameter); the last two map a technique name
        to its score dict.

    Notes
    -----
    Once ``class_weight='balanced'`` has been set on the model it stays set
    for the over- and under-sampling fits that follow, as in the original
    implementation.
    """
    if not hasattr(model, 'predict_proba'):
        model = CalibratedClassifierCV(model, cv=3)

    results = {
        'ordinary': {},
        'class_weight': {},
        'oversample': {},
        'undersample': {}
    }

    # ------ No balancing ------
    results['ordinary'] = _fit_and_score(
        model, X_train, y_train, X_test, y_test)

    # ------ Class weight ------
    if 'class_weight' in model.get_params().keys():
        model.set_params(class_weight='balanced')
        results['class_weight'] = _fit_and_score(
            model, X_train, y_train, X_test, y_test)

    # ------------ OVERSAMPLING TECHNIQUES ------------
    for sampler in [RandomOverSampler(), SMOTE(), ADASYN()]:
        X_resampled, y_resampled = _resample_as_frame(
            sampler, X_train, y_train, columns)
        results['oversample'][_technique_name(sampler)] = _fit_and_score(
            model, X_resampled, y_resampled, X_test, y_test)

    # ------------ UNDERSAMPLING TECHNIQUES ------------
    under_samplers = [
        RandomUnderSampler(),
        NearMiss(version=1),
        NearMiss(version=2),
        TomekLinks(),
        EditedNearestNeighbours()
    ]
    for sampler in under_samplers:
        X_resampled, y_resampled = _resample_as_frame(
            sampler, X_train, y_train, columns)
        results['undersample'][_technique_name(sampler)] = _fit_and_score(
            model, X_resampled, y_resampled, X_test, y_test)

    if eval_show:
        evaluate_method(results, y_test, 'undersample',
                        title=name + '\nUndersampled')
        evaluate_method(results, y_test, 'oversample', title='Oversampled')

    return results
def test_enn_not_good_object():
    """Check that an invalid ``n_neighbors`` object is rejected on sampling."""
    # A plain string is passed as ``n_neighbors``.
    invalid_neighbours = 'rnd'
    sampler = EditedNearestNeighbours(n_neighbors=invalid_neighbours,
                                      random_state=RND_SEED,
                                      kind_sel='mode')
    assert_raises_regex(ValueError, "has to be one of",
                        sampler.fit_sample, X, Y)
# dados_completo.drop('index', axis=1, inplace=True)
# Show the per-class sample counts for each flag column before balancing.
mostrar_quantidade_por_classe(dados_completo, 'dirtiness')
mostrar_quantidade_por_classe(dados_completo, 'white_bgd')
mostrar_quantidade_por_classe(dados_completo, 'viable')
mostrar_quantidade_por_classe(dados_completo, 'not_viable')
# mostrar_quantidade_por_classe(dados_completo, 5)
print(dados_completo.shape)
print(dados_completo.describe(include=['number']))

n_jobs = 5
# classes_balancear = list([2, 3])
# balanceador = EditedNearestNeighbours(n_jobs=n_jobs, n_neighbors=5)
# balanceador = SMOTE(n_jobs=n_jobs, random_state=random_state)
# The neighbourhood size is spelled out (5, the value used so far) instead of
# reusing ``n_jobs``, so changing the worker count cannot silently change the
# resampling result.
balanceador = SMOTEENN(enn=EditedNearestNeighbours(n_jobs=n_jobs,
                                                   n_neighbors=5),
                       smote=SMOTE(n_jobs=n_jobs),
                       random_state=random_state)
X_treino, Y_treino = balanceador.fit_resample(
    dados_completo.drop('classe', axis=1), dados_completo['classe'])
# Rebuild DataFrames with the original column labels on the sampler output.
X_treino = pd.DataFrame(data=X_treino,
                        columns=dados_completo.drop(['classe'],
                                                    axis=1).columns)
Y_treino = pd.DataFrame(data=Y_treino, columns=['classe'])
# X_treino.to_csv('../input/DadosCompletoTransformadoMLBalanceadoX.csv', encoding='utf-8', sep='\t')
# Y_treino.to_csv('../input/DadosCompletoTransformadoMLBalanceadoY.csv', encoding='utf-8', sep='\t')
#
# exit()
# X_treino = pd.read_csv('../input/DadosCompletoTransformadoMLBalanceadoX.csv', encoding='utf-8', delimiter='\t')
# X_treino.drop(X_treino.columns[0], axis=1, inplace=True)
# Y_treino = pd.read_csv('../input/DadosCompletoTransformadoMLBalanceadoY.csv', encoding='utf-8', delimiter='\t')
# their class differ from the one of their nearest-neighbors. This sieve can be # repeated which is the principle of the # ``RepeatedEditedNearestNeighbours``. ``AllKNN`` is slightly different from # the ``RepeatedEditedNearestNeighbours`` by changing the :math:`k` parameter # of the internal nearest neighors algorithm, increasing it at each iteration. fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(15, 25)) X, y = create_dataset(n_samples=500, weights=(0.2, 0.3, 0.5), class_sep=0.8) ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6)) for ax, sampler in zip( ax_arr, ( EditedNearestNeighbours(), RepeatedEditedNearestNeighbours(), AllKNN(allow_minority=True), ), ): clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) plot_decision_function(X, y, clf, ax[0]) ax[0].set_title(f"Decision function for {sampler.__class__.__name__}") plot_resampling(X, y, sampler, ax[1]) ax[1].set_title(f"Resampling using {sampler.__class__.__name__}") fig.tight_layout() ############################################################################### # ``CondensedNearestNeighbour`` makes use of a 1-NN to iteratively decide if a # sample should be kept in a dataset or not. The issue is that
def validateFitModel(X_train, y_train, X_test=None, y_test=None, cv=False, target=None):
    """Select training rows with ENN, then validate or fit an XGBoost model.

    The training features are scaled (RobustScaler followed by MinMaxScaler)
    only to pick the rows kept by ``EditedNearestNeighbours``. The kept
    indices are shuffled and pickled to ``sampled_dfs_<target>.bin`` under
    ``MODELS_PATH``. The model itself is trained on the unscaled
    ``X_train[sampled_indexes]``.

    Parameters
    ----------
    X_train, y_train
        Training data; both are indexed with an integer index array.
    X_test, y_test
        Forwarded to ``validate`` when ``cv`` is true.
    cv : bool
        When true, run ``validate`` with a parameter grid. Otherwise fit the
        model and pickle it to ``<target>_fitted_classifier.bin``.
    target
        Label used in the output file names and forwarded to ``validate``.

    Returns
    -------
    None
    """
    # NOTE(review): RobustScaler's ``quantile_range`` is documented in
    # percent (0-100); (0.1, 0.90) looks like it was meant as fractions.
    # Left unchanged because it affects which rows are selected.
    rs = RobustScaler(quantile_range=(0.1, 0.90))
    mms = MinMaxScaler()
    X_train_mms = mms.fit_transform(rs.fit_transform(X_train))

    enn = EditedNearestNeighbours(n_neighbors=1, sampling_strategy=[7, 10],
                                  random_state=42, return_indices=True)
    _, _, indexes = enn.fit_resample(X_train_mms, y_train)

    # Shuffle the kept indices (a full-length sample without replacement).
    resampling_index = random.sample(range(len(indexes)), len(indexes))
    sampled_indexes = indexes[resampling_index]

    # The ``with`` block closes the file; no explicit ``close`` is needed.
    with open(os.path.join(MODELS_PATH, 'sampled_dfs_%s.bin' % target),
              'wb') as f:
        pickle.dump(sampled_indexes, f)

    model = XGBClassifier(verbosity=2,
                          n_estimators=100,
                          objective='multi:softprob',
                          learning_rate=0.125,
                          min_child_weight=1,
                          max_depth=13,
                          gamma=0.6,
                          max_delta_step=0,
                          subsample=1,
                          colsample_bytree=0.9,
                          reg_lambda=2,
                          scale_pos_weight=0.05)

    if cv:
        param_grid = {
            'n_estimators': [10],
            'objective': ['multi:softprob'],
            'learning_rate': [0.125],
            # Was misspelled 'min_child_weigth', which does not name the
            # ``min_child_weight`` parameter set on the model above.
            'min_child_weight': [1],
            'max_depth': [13],
            'gamma': [0.6],
            'max_delta_step': [0],
            'subsample': [1],
            'colsample_bytree': [0.9],
            'reg_lambda': [2],
            'scale_pos_weight': [0.05]
        }
        validate(X_train[sampled_indexes],
                 y_train[sampled_indexes],
                 X_test,
                 y_test,
                 target=target,
                 model=model,
                 parameters=param_grid,
                 model_name='XGB')
    else:
        model.fit(X_train[sampled_indexes], y_train[sampled_indexes])
        with open(
                os.path.join(MODELS_PATH,
                             '%s_fitted_classifier.bin' % target),
                'wb') as f:
            pickle.dump(model, f)
'MLPClassifier(hidden_layer_sizes=(5), solver="lbfgs", max_iter=1000, random_state=42)' ), ("KerasNN_3neurons", 'KerasNN_not_fitted(n_neurons=3, init="he_normal")'), ("KerasNN_12neurons", 'KerasNN_not_fitted(n_neurons=12,init="he_normal")') ] scalers = [("StandardScaler", StandardScaler()), ("RobustScaler", RobustScaler()), ("MinMaxScaler", MinMaxScaler()), ("Normalizer", Normalizer()), ("None", None)] samplers = [ ("RandomOverSampler_0.2", RandomOverSampler(random_state=42, ratio=0.2)), ("RandomOverSampler_0.5", RandomOverSampler(random_state=42, ratio=0.5)), ("RandomOverSampler_0.5", RandomOverSampler(random_state=42, ratio=0.35)), ("TomekLinks", TomekLinks(random_state=42)), ("EditedNN", EditedNearestNeighbours(random_state=42, n_neighbors=3)), ("SMOTE", SMOTE(random_state=42, ratio=0.5)), ("SMOTETomek", SMOTETomek(random_state=42, ratio=0.8)), ("None", None) ] pre_processing_pipelines = [ ("Joris_Pipeline", preprocessing.joris_preprocessing_pipeline), ("Morten_Pipeline", preprocessing.morten_preprocessing_pipeline), ("Bin it!", preprocessing.bin_it_preprocessing_pipeline), ("simple_pipeline", preprocessing.simple_pipeline), ("chop_off", preprocessing.chop_off), ("pca_chopoff", preprocessing.pca_chopoff), ("box_cox_pipeline", preprocessing.box_cox_pipeline), ("feature_engineered", preprocessing.feature_engineered) ] seed = [1]
def test_deprecation_random_state():
    """Check that passing ``random_state`` warns when ``fit_resample`` runs."""
    expected_message = "'random_state' is deprecated from 0.4"
    sampler = EditedNearestNeighbours(random_state=0)
    with warns(DeprecationWarning, match=expected_message):
        sampler.fit_resample(X, Y)