def test_cnn_fit_resample_with_object():
    """Check CNN resampling with ``n_neighbors`` given as an estimator or an int.

    A ``KNeighborsClassifier(n_neighbors=1)`` instance and the plain integer
    ``1`` are both passed as ``n_neighbors``; each must reproduce the same
    reference resampled data set.
    """
    expected_X = np.array(
        [
            [-0.10903849, -0.12085181],
            [0.01936241, 0.17799828],
            [0.05230552, 0.09043907],
            [-1.25020462, -0.40402054],
            [0.70524765, 0.39816382],
            [0.35831463, 1.33483198],
            [-0.284881, -0.62730973],
            [0.03394306, 0.03986753],
            [-0.01252787, 0.34102657],
            [0.15198585, 0.12512646],
        ]
    )
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])

    # First the estimator object, then its integer equivalent.
    for neighbours in (KNeighborsClassifier(n_neighbors=1), 1):
        sampler = CondensedNearestNeighbour(
            random_state=RND_SEED, n_neighbors=neighbours
        )
        X_resampled, y_resampled = sampler.fit_resample(X, Y)
        assert_array_equal(X_resampled, expected_X)
        assert_array_equal(y_resampled, expected_y)
def cnn_test(data_set: pd.DataFrame, metric: str, k: int, weights='uniform'):
    """Train a k-NN classifier on CNN-resampled data and report its accuracy.

    The first two columns of ``data_set`` are used as features and the
    remaining columns as the target. The data is split 70/30 with a fixed
    seed, the training part is resampled with ``CondensedNearestNeighbour``,
    a ``KNeighborsClassifier`` is fitted on the resampled data, the test
    accuracy is printed and the decision boundaries are plotted over the
    original (un-resampled) training data.

    Parameters
    ----------
    data_set : pd.DataFrame
        Input table; columns 0-1 are features, columns 2+ are the target.
    metric : str
        Distance metric forwarded to ``KNeighborsClassifier``.
    k : int
        Number of neighbours used by both the resampler and the classifier.
    weights : str, optional
        Weighting scheme forwarded to ``KNeighborsClassifier``.
    """
    features = np.array(data_set.iloc[:, 0:2])
    target = np.array(data_set.iloc[:, 2:])

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=10
    )

    resampler = CondensedNearestNeighbour(n_neighbors=k, sampling_strategy="all")
    X_train_re, y_train_re = resampler.fit_resample(X_train, y_train)

    clf = neighbors.KNeighborsClassifier(k, metric=metric, weights=weights)
    clf.fit(X_train_re, y_train_re.ravel())

    print(accuracy_score(clf.predict(X_test), y_test))
    plot_decisions_boundaries(X_train, y_train, clf=clf)
def condensed_nearest_neighbour(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True, pie_evr=True):
    """Undersample ``(X, y)`` with Condensed Nearest Neighbour.

    Parameters
    ----------
    X, y
        Features and labels passed to ``CondensedNearestNeighbour.fit_resample``.
    visualize : bool, optional
        When truthy, plot the resampled class histogram and the PCA views.
    pca2d, pca3d, pie_evr : bool, optional
        Forwarded to ``pca_general`` as ``d2``, ``d3`` and ``pie_evr``.
    tsne : bool, optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(X_res, y_res)``, the resampled features and labels.
    """
    sampler = CondensedNearestNeighbour(random_state=42)
    X_res, y_res = sampler.fit_resample(X, y)

    # Plain truthiness test instead of comparing against ``True``.
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)

    return X_res, y_res
def test_cnn_fit_resample_with_indices():
    """Check the samples, labels and indices returned with ``return_indices=True``."""
    expected_X = np.array(
        [
            [-0.10903849, -0.12085181],
            [0.01936241, 0.17799828],
            [0.05230552, 0.09043907],
            [-1.25020462, -0.40402054],
            [0.70524765, 0.39816382],
            [0.35831463, 1.33483198],
            [-0.284881, -0.62730973],
            [0.03394306, 0.03986753],
            [-0.01252787, 0.34102657],
            [0.15198585, 0.12512646],
        ]
    )
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    expected_idx = np.array([4, 11, 17, 12, 19, 9, 5, 7, 14, 18])

    sampler = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED)
    outputs = sampler.fit_resample(X, Y)
    # Exactly three outputs are expected: samples, labels, selected indices.
    X_resampled, y_resampled, idx_under = outputs

    for actual, expected in (
        (X_resampled, expected_X),
        (y_resampled, expected_y),
        (idx_under, expected_idx),
    ):
        assert_array_equal(actual, expected)
def get_data(force_reload=False, strategy='oversampling', test_size=0.15):
    """Return train/validation arrays, resampled according to ``strategy``.

    Results are cached as ``.npy`` files in ``DATA_DIR``, keyed by strategy
    name. Unless ``force_reload`` is set, an existing complete cache is
    loaded; otherwise ``train.csv`` is read, split, the training part is
    resampled and all four arrays are written back to the cache.

    Parameters
    ----------
    force_reload : bool, optional
        Ignore any cached files and rebuild from ``train.csv``.
    strategy : str, optional
        One of ``'oversampling'``, ``'combine'``, ``'undersampling'`` or
        ``'condensed-undersampling'``; any other value skips resampling.
    test_size : float, optional
        Validation fraction forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)``.
    """
    # Order matters: train data, train labels, val data, val labels.
    cache_files = [
        os.path.join(DATA_DIR, template.format(strategy))
        for template in (
            'train_data.{}.npy',
            'train_labels.{}.npy',
            'val_data.{}.npy',
            'val_labels.{}.npy',
        )
    ]
    cache_complete = all([os.path.exists(path) for path in cache_files])

    if not force_reload and cache_complete:
        X_train, y_train, X_val, y_val = [np.load(path) for path in cache_files]
        return X_train, X_val, y_train, y_val

    train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
    X, y = to_data_format(train_df)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size)
    print('Shapes before: {}, {}'.format(X_train.shape, y_train.shape))

    # Pick the resampler for the requested strategy (none for unknown names).
    if strategy == 'oversampling':
        resampler = SMOTE(n_jobs=n_jobs)
    elif strategy == 'combine':
        smote = SMOTE(n_jobs=n_jobs)
        enn = EditedNearestNeighbours(n_jobs=n_jobs)
        resampler = SMOTEENN(smote=smote, enn=enn)
    elif strategy == 'undersampling':
        resampler = EditedNearestNeighbours(n_jobs=n_jobs)
    elif strategy == 'condensed-undersampling':
        resampler = CondensedNearestNeighbour(n_jobs=n_jobs, n_neighbors=3)
    else:
        resampler = None

    if resampler is not None:
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    print('Shapes after: {}, {}'.format(X_train.shape, y_train.shape))

    for path, array in zip(cache_files, (X_train, y_train, X_val, y_val)):
        np.save(path, array)
    return X_train, X_val, y_train, y_val
def readFile(path, y_label, method, encode_features=None, skew_exempted=None,
             training_ratio=0.7, shuffle=True, needSkew=False, fea_eng=True):
    """Load a CSV, preprocess it, optionally rebalance it and split it.

    Parameters
    ----------
    path
        CSV file passed to ``pd.read_csv``.
    y_label
        Name of the target column.
    method : str
        ``'OverSample'`` rebalances with ``ADASYN``, ``'UnderSample'`` with
        ``CondensedNearestNeighbour``; any other value leaves the data as is.
    encode_features : list, optional
        Columns to one-hot encode. ``None`` (default) means no columns.
    skew_exempted : list, optional
        Numeric columns excluded from the skewness correction. ``None``
        (default) means no exemptions.
    training_ratio : float, optional
        Forwarded to ``split``.
    shuffle : bool, optional
        Shuffle the rows before any other processing.
    needSkew : bool, optional
        Apply ``log1p`` to non-object columns whose skewness exceeds 0.75.
    fea_eng : bool, optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by ``split``.
    """
    # Avoid mutable default arguments. An empty list (not None) must reach
    # get_dummies: ``columns=None`` would encode every object column.
    encode_features = [] if encode_features is None else encode_features
    skew_exempted = [] if skew_exempted is None else skew_exempted

    raw = pd.read_csv(path)
    if shuffle:
        raw = raw.sample(frac=1).reset_index(drop=True)

    if needSkew:
        numeric_columns = raw.dtypes[raw.dtypes != "object"].index.drop(skew_exempted)
        skewness = raw[numeric_columns].apply(lambda col: skew(col.dropna()))
        skewed = skewness[skewness > 0.75].index
        raw[skewed] = np.log1p(raw[skewed])  # reduce skewness

    raw = pd.get_dummies(raw, columns=encode_features)  # encode categorical features
    raw = raw.fillna(raw.mean())

    X = raw.drop(y_label, axis=1)
    y = raw[y_label]

    if method == 'OverSample':
        X, y = ADASYN(random_state=42).fit_resample(X, y)
    elif method == 'UnderSample':
        X, y = CondensedNearestNeighbour(random_state=42).fit_resample(X, y)

    X_train, X_test, y_train, y_test = split(X, y, training_ratio)
    return X_train, X_test, y_train, y_test
def resampling_assigner(imb_technique, AP_ova_X_train, AP_ova_y_train,
                        PM_ova_X_train, PM_ova_y_train, SC_ova_X_train,
                        SC_ova_y_train):
    """Resample the three one-vs-all training sets with the chosen technique.

    A fresh sampler, built with default arguments, is used for each of the
    AP, PM and SC data sets. For ``"NCR"`` the labels are first binarised:
    0 for the data set's own class name, 1 for everything else.

    Parameters
    ----------
    imb_technique : str
        One of ``"ADASYN"``, ``"ALLKNN"``, ``"CNN"``, ``"ENN"``, ``"IHT"``,
        ``"NCR"``, ``"NM"``, ``"OSS"``, ``"RENN"``, ``"SMOTE"``,
        ``"BSMOTE"``, ``"SMOTEENN"``, ``"SMOTETOMEK"``, ``"TOMEK"``,
        ``"ROS"`` or ``"RUS"``.
    AP_ova_X_train, AP_ova_y_train, PM_ova_X_train, PM_ova_y_train,
    SC_ova_X_train, SC_ova_y_train
        Features and labels of the three one-vs-all problems.

    Returns
    -------
    tuple
        ``(AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res)``.

    Raises
    ------
    ValueError
        If ``imb_technique`` is not one of the supported names.
    """
    print(imb_technique)

    # Lambdas keep the class lookup lazy: only the selected sampler's name
    # has to be importable, as in a plain if/elif chain.
    sampler_factories = {
        "ADASYN": lambda: ADASYN(),
        "ALLKNN": lambda: AllKNN(),
        "CNN": lambda: CondensedNearestNeighbour(),
        "ENN": lambda: EditedNearestNeighbours(),
        "IHT": lambda: InstanceHardnessThreshold(),
        "NCR": lambda: NeighbourhoodCleaningRule(),
        "NM": lambda: NearMiss(),
        "OSS": lambda: OneSidedSelection(),
        "RENN": lambda: RepeatedEditedNearestNeighbours(),
        "SMOTE": lambda: SMOTE(),
        "BSMOTE": lambda: BorderlineSMOTE(),
        "SMOTEENN": lambda: SMOTEENN(),
        "SMOTETOMEK": lambda: SMOTETomek(),
        "TOMEK": lambda: TomekLinks(),
        "ROS": lambda: RandomOverSampler(),
        "RUS": lambda: RandomUnderSampler(),
    }
    if imb_technique not in sampler_factories:
        raise ValueError(
            "Unknown resampling technique {!r}; expected one of {}".format(
                imb_technique, sorted(sampler_factories)
            )
        )
    make_sampler = sampler_factories[imb_technique]

    # (features, labels, class name mapped to 0 when binarising for NCR)
    problems = (
        (AP_ova_X_train, AP_ova_y_train, "Add penalty"),
        (PM_ova_X_train, PM_ova_y_train, "Payment"),
        (SC_ova_X_train, SC_ova_y_train, "Send for Credit Collection"),
    )

    resampled = []
    for X_train, y_train, own_class in problems:
        if imb_technique == "NCR":
            y_train = [0 if label == own_class else 1 for label in y_train]
        X_res, y_res = make_sampler().fit_resample(X_train, y_train)
        resampled.extend((X_res, y_res))

    return tuple(resampled)
X_best_train = f_classif_select.fit_transform(X_train, y_train) X_best_test = f_classif_select.fit_transform(X_test, y_test) knn.fit(X_best_train, y_train) y_pred = knn.predict(X_best_test) scores[i].append(metric(y_test, y_pred)) for dataset_score in scores: print(np.mean(dataset_score)) if not SKIP_CNN: scores = [[] for _ in range(len(datasets))] for i, dataset in enumerate(datasets): X, y = dataset X, y = cnn.fit_resample(X, y) for train_index, test_index in rskf.split(X, y): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] f_classif_select = SelectKBest(k=K_BEST) X_best_train = f_classif_select.fit_transform(X_train, y_train) X_best_test = f_classif_select.fit_transform(X_test, y_test) knn.fit(X_best_train, y_train) y_pred = knn.predict(X_best_test) scores[i].append(metric(y_test, y_pred)) for dataset_score in scores: print(np.mean(dataset_score))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) X_res, y_res = ros.fit_resample(X_train, y_train) train_and_measure(clf, 'ros', X_res, y_res, X_test, y_test) # SMOTE + ENN print('SMOTEENN') for i in range(N): clf = get_model() smnn = SMOTEENN() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) X_res, y_res = smnn.fit_resample(X_train, y_train) train_and_measure(clf, 'smoteenn', X_res, y_res, X_test, y_test) # CNN print('CNN') for i in range(N): clf = get_model() cnn = CondensedNearestNeighbour() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) X_res, y_res = cnn.fit_resample(X_train, y_train) train_and_measure(clf, 'cnn', X_res, y_res, X_test, y_test) # RUS print('RUS') for i in range(N): clf = get_model() rus = RandomUnderSampler() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) X_res, y_res = rus.fit_resample(X_train, y_train) train_and_measure(clf, 'rus', X_res, y_res, X_test, y_test)
plotCounts() # unfortunately we can clearly see that the data is imbalanced by having about 84% of the asteroids # as not hazardous and about 16% as hazardous # HANDLING IMBALANCED DATA # to handle the imbalanced data we will use and compare multiple techniques and algorithms # from the graph above we can clearly see that all Hazardous data points and condensed in a small region, # i don't think using oversampling here is a good idea, instead we could just use undersampling # or try cost-sensitive down-weighting # CondensedNearestNeighbour technique for undersampling from imblearn.under_sampling import CondensedNearestNeighbour cnn = CondensedNearestNeighbour(n_neighbors=5, n_seeds_S=55) cnn_X, cnn_y = cnn.fit_resample(X, y) plotData2D(cnn_X, cnn_y) # CondensedNearestNeighbour removes too many instances we will not use it # NeighbourhoodCleaningRule technique for undersampling from imblearn.under_sampling import NeighbourhoodCleaningRule ncr = NeighbourhoodCleaningRule(sampling_strategy='majority', n_neighbors=5, kind_sel='mode') ncr_X, ncr_y = ncr.fit_resample(X, y) plotData2D(ncr_X, ncr_y) # NeighbourhoodCleaningRule also doesn't work for this dataset because it removes many data points # in just one region and that will mess up our decision boundary and make our predictions worse.
def all_imblearn(xx, yy):
    """Resample ``(xx, yy)`` with a battery of imbalanced-learn samplers.

    Each sampler is fitted on the same input in a fixed order.

    Parameters
    ----------
    xx, yy
        Features and labels passed to every sampler's ``fit_resample``.

    Returns
    -------
    list
        One ``[X_resampled, y_resampled, label]`` entry per sampler, in the
        order over-sampling, under-sampling, combined methods.
    """
    # SMOTE no longer accepts ``kind``; the borderline and SVM variants are
    # separate classes. Imported locally so this function is self-contained.
    from imblearn.over_sampling import BorderlineSMOTE, SVMSMOTE

    # NOTE: ADASYN is deliberately absent; it was disabled in the original
    # implementation.
    samplers = [
        # --- over-sampling ---
        (RandomOverSampler(random_state=0), 'random over sampler'),
        (SMOTE(), 'smote'),
        (BorderlineSMOTE(kind='borderline-1'), 'smote borderline1'),
        (BorderlineSMOTE(kind='borderline-2'), 'smote borderline2'),
        (SVMSMOTE(), 'smote svm'),
        # NOTE(review): categorical columns 0 and 2 are hard-coded; confirm
        # they match the layout of ``xx``.
        (SMOTENC(categorical_features=[0, 2], random_state=0), 'smotenc'),
        # --- under-sampling ---
        (ClusterCentroids(random_state=0), 'cluster centroids'),
        (RandomUnderSampler(random_state=0), 'random under sampler'),
        (NearMiss(version=1), 'near miss 1'),
        (NearMiss(version=2), 'near miss 2'),
        (NearMiss(version=3), 'near miss 3'),
        (EditedNearestNeighbours(), 'edited nearest neighbours'),
        (RepeatedEditedNearestNeighbours(), 'repeated edited nearest neighbours'),
        (AllKNN(), 'allKNN'),
        (CondensedNearestNeighbour(random_state=0), 'Condensed Nearest Neighbour'),
        (OneSidedSelection(random_state=0), 'One Sided Selection'),
        (NeighbourhoodCleaningRule(), 'Neighbourhood Cleaning Rule'),
        # --- combined over- and under-sampling ---
        (SMOTEENN(random_state=0), 'SMOTEENN'),
        (SMOTETomek(random_state=0), 'SMOTETomek'),
    ]

    imblearnlist = []
    for sampler, label in samplers:
        X_resampled, y_resampled = sampler.fit_resample(xx, yy)
        imblearnlist.append([X_resampled, y_resampled, label])
    return imblearnlist
def test_cnn_fit_resample_with_wrong_object():
    """Check that an invalid ``n_neighbors`` value is rejected on resampling."""
    invalid_neighbours = 'rnd'
    sampler = CondensedNearestNeighbour(
        random_state=RND_SEED, n_neighbors=invalid_neighbours
    )
    # The error is expected from fit_resample, not from the constructor.
    with raises(ValueError, match="has to be a int or an "):
        sampler.fit_resample(X, Y)
# Demo: undersample an imbalanced dataset with the Condensed Nearest Neighbour
# rule and plot the result.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import CondensedNearestNeighbour
from matplotlib import pyplot
from numpy import where

# Build a synthetic two-feature dataset with a 99% majority class.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

# Class distribution before undersampling.
counter = Counter(y)
print(counter)

# Undersample with a 1-nearest-neighbour condensing rule.
undersample = CondensedNearestNeighbour(n_neighbors=1)
X, y = undersample.fit_resample(X, y)

# Class distribution after undersampling.
counter = Counter(y)
print(counter)

# One scatter series per class label.
for label in counter:
    row_ix = where(y == label)[0]
    first_feature = X[row_ix, 0]
    second_feature = X[row_ix, 1]
    pyplot.scatter(first_feature, second_feature, label=str(label))
pyplot.legend()
pyplot.show()
def fit(self, X, y):
    """Fit the ensemble on bootstrap bags of a binary data set.

    For each estimator a bag is drawn with replacement from each class
    separately (same size as the class). The bag is optionally rebalanced
    according to ``self.oversampler`` (``"ROS"``, ``"B2"``, ``"RUS"`` or
    ``"CNN"``; any other value disables resampling) and the estimator is
    fitted on it.

    Parameters
    ----------
    X, y
        Training data, validated with ``check_X_y``. Samples with label 1
        are treated as the minority class and label 0 as the majority.

    Returns
    -------
    self
    """
    X, y = check_X_y(X, y)
    self.classes_ = unique_labels(y)
    self.X_ = X
    self.y_ = y

    minority_X = X[y == 1]
    majority_X = X[y == 0]
    n_minority = minority_X.shape[0]
    n_majority = majority_X.shape[0]

    # NOTE(review): ``self.estimators_`` is appended to but not created here,
    # so it must already exist and repeated fits keep adding to it. Confirm.
    for _ in range(self.ensemble_size):
        self.estimators_.append(base.clone(self.base_estimator))

    for n, estimator in enumerate(self.estimators_):
        # Per-estimator seed: distinct bags that stay reproducible.
        seed = self.random_state + (n * 2)
        np.random.seed(seed)

        # Draw order (minority first, then majority) fixes the random stream.
        bagXminority = minority_X[
            np.random.choice(n_minority, n_minority, replace=True), :]
        bagXmajority = majority_X[
            np.random.choice(n_majority, n_majority, replace=True), :]
        bagyminority = np.ones(n_minority).astype('int')
        bagymajority = np.zeros(n_majority).astype('int')

        train_X = np.concatenate((bagXmajority, bagXminority))
        train_y = np.concatenate((bagymajority, bagyminority))

        if self.oversampler == "ROS":
            sampler = RandomOverSampler(random_state=seed)
        elif self.oversampler == "B2":
            sampler = BorderlineSMOTE(random_state=seed, kind='borderline-2')
        elif self.oversampler == "RUS":
            sampler = RandomUnderSampler(random_state=seed)
        elif self.oversampler == "CNN":
            sampler = CondensedNearestNeighbour(random_state=seed)
        else:
            sampler = None

        if sampler is not None:
            try:
                train_X, train_y = sampler.fit_resample(train_X, train_y)
            except Exception:
                # Best effort: if resampling fails, train on the raw bag.
                # ``Exception`` lets KeyboardInterrupt/SystemExit propagate.
                pass

        estimator.fit(train_X, train_y)

    # Return the classifier
    return self
def partial_fit(self, X, y, classes=None):
    """Update the ensemble with one data chunk.

    The chunk is stored as ``X_``/``y_``. A resampled copy is stored as
    ``dsel_X_``/``dsel_y_`` according to ``self.oversampled`` (``"None"``,
    ``"ROS"``, ``"B2"``, ``"RUS"`` or ``"CNN"``); any other value leaves the
    ``dsel_*`` attributes untouched. A new clone of ``base_estimator`` is
    fitted on the raw chunk and appended. When the ensemble exceeds
    ``n_estimators``, the member scoring lowest on this chunk is removed.

    Parameters
    ----------
    X, y
        Data chunk, validated with ``check_X_y``.
    classes : array-like, optional
        Known class labels; inferred from ``y`` when ``None``.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the chunk's feature count differs from the previous chunk's.
    """
    X, y = check_X_y(X, y)
    if not hasattr(self, "ensemble_"):
        self.ensemble_ = []
        self.ensemble_base_ = []

    # Check feature consistency
    if hasattr(self, "X_") and self.X_.shape[1] != X.shape[1]:
        raise ValueError("number of features does not match")

    self.X_, self.y_ = X, y

    # Choose the resampler used for the dsel_* copy of the chunk.
    if self.oversampled == "ROS":
        sampler = RandomOverSampler(random_state=42)
    elif self.oversampled == "B2":
        sampler = BorderlineSMOTE(random_state=42, kind='borderline-2')
    elif self.oversampled == "RUS":
        sampler = RandomUnderSampler(random_state=42)
    elif self.oversampled == "CNN":
        sampler = CondensedNearestNeighbour(random_state=42)
    else:
        sampler = None

    if self.oversampled == "None":
        self.dsel_X_, self.dsel_y_ = self.X_, self.y_
    elif sampler is not None:
        try:
            self.dsel_X_, self.dsel_y_ = sampler.fit_resample(self.X_, self.y_)
        except Exception:
            # Best effort: fall back to the raw chunk when resampling fails.
            # ``Exception`` lets KeyboardInterrupt/SystemExit propagate.
            self.dsel_X_, self.dsel_y_ = self.X_, self.y_

    # Check classes
    self.classes_ = classes
    if self.classes_ is None:
        self.classes_, _ = np.unique(y, return_inverse=True)

    # Append new estimator
    self.candidate_ = clone(self.base_estimator).fit(self.X_, self.y_)
    self.ensemble_.append(self.candidate_)
    self.ensemble_base_.extend(self.candidate_.estimators_)

    # Remove the worst when ensemble becomes too large
    if len(self.ensemble_) > self.n_estimators:
        self.prune_index_ = np.argmin(
            [self.metric(y, clf.predict(X)) for clf in self.ensemble_])
        del self.ensemble_[self.prune_index_]
        # NOTE(review): assumes every candidate contributes exactly 10 base
        # estimators to ``ensemble_base_``. Confirm against base_estimator.
        a = self.prune_index_ * 10
        b = a + 10
        del self.ensemble_base_[a:b]

    return self
def sample_dataset(X_train, y_train):
    """Undersample the training set with Condensed Nearest Neighbour.

    Prints the shapes of the original and resampled feature matrices and
    returns ``(X_resampled, y_resampled)``. The sampler is seeded with the
    module-level ``seed``.
    """
    resampled = CondensedNearestNeighbour(random_state=seed).fit_resample(X_train, y_train)
    X_resampled, y_resampled = resampled
    shape_report = "Shape del dataset original: {}. Shape del dataset procesado{} ".format(
        X_train.shape, X_resampled.shape
    )
    print(shape_report)
    return X_resampled, y_resampled
X_train, y_train, X_test, y_test = input.read_train_test_data( config['patch_size'], conv3d=True) #X_test, y_test, X_train, y_train = input.read_data(config['patch_size']) if validation_set: X_train, X_val, y_train, y_val = \ train_test_split(X_train, y_train, test_size=0.5, random_state=42, stratify=y_train) if undersampling: X, y, _, _ = input.read_data(patch_size=1) X_reshaped = X.reshape(X.shape[0], input.bands) print("Elements before undersampling: %i" % len(X)) print(sorted(Counter(y).items())) enn = CondensedNearestNeighbour(n_jobs=8) enn.fit_resample(X_reshaped, y_train) print("Elements after undersampling: %i" % len(enn.sample_indices_)) X_test, y_test = np.delete(X_train, enn.sample_indices_, axis=0), np.delete(y_train, enn.sample_indices_, axis=0) X_train, y_train = np.take(X_train, enn.sample_indices_, axis=0), np.take(y_train, enn.sample_indices_, axis=0) print(sorted(Counter(y_train).items())) if oversampling: X_train, y_train = input.oversample_data(X_train, y_train, config['patch_size'])
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    """Resample the five one-vs-all training sets with the chosen technique.

    A fresh sampler, built with default arguments, is used for each of the
    AA, AI, AW, CC and QA data sets. For ``"NCR"`` the labels are first
    binarised: 0 for the data set's own class name, 1 for everything else.

    Parameters
    ----------
    imb_technique : str
        One of ``"ADASYN"``, ``"ALLKNN"``, ``"CNN"``, ``"ENN"``, ``"IHT"``,
        ``"NCR"``, ``"NM"``, ``"OSS"``, ``"RENN"``, ``"SMOTE"``,
        ``"BSMOTE"``, ``"SMOTEENN"``, ``"SMOTETOMEK"``, ``"TOMEK"``,
        ``"ROS"`` or ``"RUS"``.
    AA_ova_X_train, AA_ova_y_train, ..., QA_ova_X_train, QA_ova_y_train
        Features and labels of the five one-vs-all problems.

    Returns
    -------
    tuple
        ``(AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res,
        CC_X_res, CC_y_res, QA_X_res, QA_y_res)``.

    Raises
    ------
    ValueError
        If ``imb_technique`` is not one of the supported names.
    """
    print(imb_technique)

    # Lambdas keep the class lookup lazy: only the selected sampler's name
    # has to be importable, as in a plain if/elif chain.
    sampler_factories = {
        "ADASYN": lambda: ADASYN(),
        "ALLKNN": lambda: AllKNN(),
        "CNN": lambda: CondensedNearestNeighbour(),
        "ENN": lambda: EditedNearestNeighbours(),
        "IHT": lambda: InstanceHardnessThreshold(),
        "NCR": lambda: NeighbourhoodCleaningRule(),
        "NM": lambda: NearMiss(),
        "OSS": lambda: OneSidedSelection(),
        "RENN": lambda: RepeatedEditedNearestNeighbours(),
        "SMOTE": lambda: SMOTE(),
        "BSMOTE": lambda: BorderlineSMOTE(),
        "SMOTEENN": lambda: SMOTEENN(),
        "SMOTETOMEK": lambda: SMOTETomek(),
        "TOMEK": lambda: TomekLinks(),
        "ROS": lambda: RandomOverSampler(),
        "RUS": lambda: RandomUnderSampler(),
    }
    if imb_technique not in sampler_factories:
        raise ValueError(
            "Unknown resampling technique {!r}; expected one of {}".format(
                imb_technique, sorted(sampler_factories)
            )
        )
    make_sampler = sampler_factories[imb_technique]

    # (features, labels, class name mapped to 0 when binarising for NCR)
    problems = (
        (AA_ova_X_train, AA_ova_y_train, "Accepted/Assigned"),
        (AI_ova_X_train, AI_ova_y_train, "Accepted/In Progress"),
        (AW_ova_X_train, AW_ova_y_train, "Accepted/Wait"),
        (CC_ova_X_train, CC_ova_y_train, "Completed/Closed"),
        (QA_ova_X_train, QA_ova_y_train, "Queued/Awaiting Assignment"),
    )

    resampled = []
    for X_train, y_train, own_class in problems:
        if imb_technique == "NCR":
            y_train = [0 if label == own_class else 1 for label in y_train]
        X_res, y_res = make_sampler().fit_resample(X_train, y_train)
        resampled.extend((X_res, y_res))

    return tuple(resampled)
def condensed_nearest_neighbour(X, y):
    """Resample ``(X, y)`` with a seeded ``CondensedNearestNeighbour``.

    Parameters
    ----------
    X : feature data, passed unchanged to ``fit_resample``.
    y : target labels, passed unchanged to ``fit_resample``.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` -- the two values produced by ``fit_resample``.
    """
    # random_state is pinned to 42 so repeated calls use the same seed.
    sampler = CondensedNearestNeighbour(random_state=42)
    features, labels = sampler.fit_resample(X, y)
    return features, labels
# # # 8 ---------- Glass Identification # dataset = pd.read_csv('data/glass.txt') # X_data = dataset.iloc[:, 1:9].values # y_data = dataset.iloc[:, 10].values # ---------- ABALONE ----- # dataset = pd.read_csv('data/abalone.txt') # X_data = dataset.iloc[:, 0:].values # y_data = dataset.iloc[:, 8].values print(X_data.shape) print('-------') # ------- CNN -------- cnn = CondensedNearestNeighbour() X_cnn, y_cnn = cnn.fit_resample(X_data, y_data) print(X_cnn.shape) # ------- ENN -------- enn = EditedNearestNeighbours() X_enn, y_enn = enn.fit_resample(X_data, y_data) print(X_enn.shape) # ------- RENN -------- renn = RepeatedEditedNearestNeighbours() X_renn, y_renn = renn.fit_resample(X_data, y_data) print(X_renn.shape) # ------- Tomek -------- tl = TomekLinks() X_t, y_t = tl.fit_resample(X_data, y_data)
n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=200, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply Condensed Nearest Neighbours cnn = CondensedNearestNeighbour(return_indices=True) X_resampled, y_resampled, idx_resampled = cnn.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) fig = plt.figure() ax = fig.add_subplot(1, 1, 1) idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled) idx_class_0 = y_resampled == 0 plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1], alpha=.8, label='Class #0') plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1], alpha=.8,
def sample_data_by_cnn(X, y):
    """Resample ``(X, y)`` with ``CondensedNearestNeighbour`` seeded at 42.

    Parameters
    ----------
    X : feature data, passed unchanged to ``fit_resample``.
    y : target labels, passed unchanged to ``fit_resample``.

    Returns
    -------
    Whatever ``fit_resample`` returns, passed through untouched.
    """
    # Fixed seed so repeated calls construct the sampler identically.
    sampler = CondensedNearestNeighbour(random_state=42)
    resampled = sampler.fit_resample(X, y)
    return resampled
def get_uds_CNN(data_list, label):
    """Resample ``data_list`` / ``label`` with ``CondensedNearestNeighbour``.

    Parameters
    ----------
    data_list : feature data, passed unchanged to ``fit_resample``.
    label : target labels, passed unchanged to ``fit_resample``.

    NOTE(review): the visible body ends after computing ``X_res`` and
    ``y_res`` without returning them -- confirm whether the definition
    continues beyond this point or whether a ``return`` is missing.
    """
    # random_state is pinned to 42 so repeated calls use the same seed.
    cnn = CondensedNearestNeighbour(random_state=42)
    X_res, y_res = cnn.fit_resample(data_list, label)