def test_tl_fit_resample():
    """TomekLinks.fit_resample must keep exactly the known non-link samples."""
    sampler = TomekLinks()
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    # Ground truth computed once by hand for the module-level fixture (X, Y).
    expected_X = np.array([
        [0.31230513, 0.1216318],
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.2184254, 0.24299982],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [0.97407872, 0.44454207],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
    ])
    expected_y = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_deprecation_random_state():
    """Passing ``random_state`` to TomekLinks must emit a DeprecationWarning."""
    sampler = TomekLinks(random_state=0)
    with warns(DeprecationWarning, match="'random_state' is deprecated from 0.4"):
        sampler.fit_resample(X, Y)
# Remove passenger car samples (class 4) randomly, down to 75k.
desiredSampleCounts = {4: 75000}
rus = RandomUnderSampler(sampling_strategy=desiredSampleCounts)
X_undersampled, y_undersampled = rus.fit_resample(X, y)
# Remove tractor samples (class 6) randomly, down to 75k.
desiredSampleCounts = {6: 75000}
rus = RandomUnderSampler(sampling_strategy=desiredSampleCounts)
X_undersampled, y_undersampled = rus.fit_resample(X_undersampled, y_undersampled)
print(np.bincount(y_undersampled))
# Remove BOTH sides of every Tomek pair ('all'), across all classes.
underSampleObj = TomekLinks(sampling_strategy='all', n_jobs=5)
X_undersampledTomek, y_undersampledTomek = underSampleObj.fit_resample(
    X_undersampled, y_undersampled)
print(np.bincount(y_undersampledTomek))
# Over-sample minority classes to match the majority classes; columns 2 and
# 6-12 are categorical, hence SMOTENC instead of plain SMOTE.
overSampleObj = SMOTENC(categorical_features=[2, 6, 7, 8, 9, 10, 11, 12],
                        n_jobs=6)
X_final, y_final = overSampleObj.fit_resample(X_undersampledTomek,
                                              y_undersampledTomek)
print(np.bincount(y_final))
# BUG FIX: time.clock() was removed in Python 3.8; time.perf_counter() is the
# documented replacement. NOTE(review): assumes ``start`` (taken earlier in
# the file) was captured with the same clock — confirm.
print(time.perf_counter() - start)
# =============================================================================
# # Reconstruct full data frame to include BasicCategory
# # =============================================================================
from plots import Plot

# Load data.
train_data = pd.read_csv('./input/train.csv')
train_labels = pd.read_csv('./input/train_labels.csv')
#print(train_labels.head(3))

# Split train data 70/30.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, train_labels, test_size=0.3, random_state=0)
unique, count = np.unique(y_train, return_counts=True)
print('counts of labels before undersampling: ', unique, count)

# BUG FIX: TomekLinks is deterministic; its ``random_state`` parameter was
# deprecated in imbalanced-learn 0.4 and removed in 0.6, so passing it now
# raises TypeError. Dropping it does not change the resampling result.
tl = TomekLinks()
X_train_res, y_train_res = tl.fit_resample(X_train, y_train.values.ravel())
unique1, count1 = np.unique(y_train_res, return_counts=True)
print('counts of labels after undersampling: ', unique1, count1)

# Fit a 15-NN classifier on the cleaned training data and score it on the
# (uncleaned) split training data.
clf = neighbors.KNeighborsClassifier(15)
clf.fit(X_train_res, y_train_res)
Z = clf.predict(X_train)
acc = clf.score(X_train, y_train)
print('Accuracy on split training data: ' + str(acc))
# Put the result into a confusion matrix
def tomek_links(X, y):
    """Clean ``(X, y)`` by removing Tomek links; returns the resampled pair."""
    #print(sorted(Counter(y).items()))
    sampler = TomekLinks()
    resampled_X, resampled_y = sampler.fit_resample(X, y)
    #print(sorted(Counter(resampled_y).items()))
    return resampled_X, resampled_y
# NOTE(review): this chunk begins mid-call — these are the trailing arguments
# of a train_test_split(bank_X, bank_y, ...) whose opening is outside this view.
bank_y, stratify=bank_y, train_size=0.7, random_state=0)
#classifier = svm.SVC(kernel = 'rbf',C=1000,gamma=0.001)
# Plain logistic regression used as the white-box classifier.
classifier = LogisticRegression(max_iter=10000, C=0.1)
#easy_ensemble = imblearn.ensemble.EasyEnsembleClassifier(n_estimators=35, base_estimator=classifier, sampling_strategy='majority', n_jobs=-1)
# Over-sample the minority class up to a 1:2 ratio with Borderline-SMOTE,
# then clean the majority class by removing its side of every Tomek link.
oversample = BorderlineSMOTE(sampling_strategy=0.5, n_jobs=-1, kind='borderline-1')
x_train, y_train = oversample.fit_resample(x_train, y_train)
tom_lin = TomekLinks(sampling_strategy='majority', n_jobs=-1)
x_train, y_train = tom_lin.fit_resample(x_train, y_train)
# Train, predict, and report (h / interpr are project helpers).
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
h.printResults2(y_test, y_pred)
h.plotConfusionMatrix(y_test, y_pred, norm=True)
h.plotConfusionMatrix(y_test, y_pred, norm=False)
# White-box explanation of the fitted coefficients.
feature_names = bank_X.columns.values
interpr.plotFeaturesCoefficientGlobal(classifier, feature_names)
# Relabel the training set with the model's own predictions — presumably to
# train a surrogate model on them; confirm against downstream use.
new_x_train = x_train
new_y_train = classifier.predict(x_train)
# Undersample an imbalanced dataset with Tomek links and plot the result.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import TomekLinks
from matplotlib import pyplot
from numpy import where

# Two-feature dataset with a 99:1 class imbalance.
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.99], flip_y=0,
                           random_state=1)
# Class distribution before cleaning.
counter = Counter(y)
print(counter)
# Remove the majority-class member of every Tomek link (default strategy).
undersample = TomekLinks()
X, y = undersample.fit_resample(X, y)
# ...and after cleaning.
counter = Counter(y)
print(counter)
# Scatter plot, one series per class label.
for label in counter:
    row_ix = where(y == label)[0]
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
pyplot.legend()
pyplot.show()
print(__doc__)

rng = np.random.RandomState(0)
n_samples_1 = 500
n_samples_2 = 50
# Synthetic 2-D data: a broad majority blob and a tight minority blob at (2, 2).
X_syn = np.r_[1.5 * rng.randn(n_samples_1, 2),
              0.5 * rng.randn(n_samples_2, 2) + [2, 2]]
y_syn = np.array([0] * (n_samples_1) + [1] * (n_samples_2))
X_syn, y_syn = shuffle(X_syn, y_syn)
X_syn_train, X_syn_test, y_syn_train, y_syn_test = train_test_split(X_syn, y_syn)

# remove Tomek links
# BUG FIX: ``return_indices=True`` was deprecated in imbalanced-learn 0.4 and
# removed in 0.6; the kept-sample indices are now exposed on the fitted
# sampler as ``sample_indices_``.
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_resample(X_syn, y_syn)
idx_resampled = tl.sample_indices_

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# Removed samples = all indices minus those that survived resampling.
idx_samples_removed = np.setdiff1d(np.arange(X_syn.shape[0]), idx_resampled)
idx_class_0 = y_resampled == 0
plt.scatter(X_resampled[idx_class_0, 0], X_resampled[idx_class_0, 1],
            alpha=.8, label='Class #0')
plt.scatter(X_resampled[~idx_class_0, 0], X_resampled[~idx_class_0, 1],
            alpha=.8, label='Class #1')
plt.scatter(X_syn[idx_samples_removed, 0], X_syn[idx_samples_removed, 1],
            alpha=.8, label='Removed samples')
# make nice plotting
sampling_strategy = 'not majority'
ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(X, y)
print('Information of the iris data set after making it '
      'balanced by over-sampling: \n sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# With **cleaning method**, the number of samples in each class will not be
# equalized even if targeted.

sampling_strategy = 'not minority'
# FIX: pass ``sampling_strategy`` by keyword — imbalanced-learn deprecated and
# then disallowed positional arguments for samplers, and the keyword form is
# unambiguous. Behaviour is otherwise unchanged.
tl = TomekLinks(sampling_strategy=sampling_strategy)
X_res, y_res = tl.fit_resample(X, y)
print('Information of the iris data set after making it '
      'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# ``sampling_strategy`` as a ``dict``
# ...................................
#
# When ``sampling_strategy`` is a ``dict``, the keys correspond to the targeted
# classes. The values correspond to the desired number of samples for each
# targeted class. This is working for both **under- and over-sampling**
# algorithms but not for the **cleaning algorithms**. Use a ``list`` instead.
# Save the RUS-resampled matrix M; the file name depends on whether this is
# the neural-network variant (NN and M are defined earlier in the file).
if NN:
    np.savetxt('../4_learningData/RUStestNN.csv', M, delimiter=',', fmt='%d')
else:
    np.savetxt('../4_learningData/RUStest.csv', M, delimiter=',', fmt='%d')

##########################
##     Tomek Links      ##
##########################

# Create the Tomek-links undersampler (TL is an import alias for TomekLinks).
sampler2 = TL()
# Tomek-link undersampling of the full (x, y) dataset.
xTL, yTL = sampler2.fit_resample(x,y)
if DEBUG:
    print(xTL.shape)
    print(yTL.shape)
    print(sorted(Counter(yTL).items()))
# Split into training and test data — 70/30 split (tts aliases train_test_split).
xTLtrain, xTLtest, yTLtrain, yTLtest = tts(xTL, yTL, test_size=TEST_SIZE, random_state=6375)
# Re-concatenate train + test so the exported frame carries every sample,
# with the label (contact_type) as the first column.
x_pr = np.concatenate([xTLtrain, xTLtest])
y_pr = np.concatenate([yTLtrain, yTLtest])
dfProto = np.column_stack([y_pr,x_pr])
df = pd.DataFrame(data=dfProto, columns=['contact_type', 'contact_class_score_diff', 'contact_id','counter', 'delay'])
import plotly.express as px
def tomeklinks(X, y):
    """Remove Tomek links from ``(X, y)`` and return the cleaned arrays."""
    return TomekLinks().fit_resample(X, y)
def resampling_assigner(imb_technique, AP_ova_X_train, AP_ova_y_train, PM_ova_X_train, PM_ova_y_train, SC_ova_X_train, SC_ova_y_train):
    """Resample the three one-vs-all training sets with the chosen technique.

    Parameters
    ----------
    imb_technique : str
        One of "ADASYN", "ALLKNN", "CNN", "ENN", "IHT", "NCR", "NM", "OSS",
        "RENN", "SMOTE", "BSMOTE", "SMOTEENN", "SMOTETOMEK", "TOMEK", "ROS",
        "RUS".
    AP_* / PM_* / SC_* : array-like
        Features and labels of the "Add penalty", "Payment" and
        "Send for Credit Collection" one-vs-all splits.

    Returns
    -------
    tuple
        (AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res).

    Raises
    ------
    KeyError
        If ``imb_technique`` is unknown (the original fell through its
        if/elif chain and raised NameError at the return instead).
    """
    print(imb_technique)
    # One factory per technique; a fresh sampler instance is created for each
    # of the three datasets, exactly as the branch-per-technique original did.
    samplers = {
        "ADASYN": ADASYN,
        "ALLKNN": AllKNN,
        "CNN": CondensedNearestNeighbour,
        "ENN": EditedNearestNeighbours,
        "IHT": InstanceHardnessThreshold,
        "NCR": NeighbourhoodCleaningRule,
        "NM": NearMiss,
        "OSS": OneSidedSelection,
        "RENN": RepeatedEditedNearestNeighbours,
        "SMOTE": SMOTE,
        "BSMOTE": BorderlineSMOTE,
        "SMOTEENN": SMOTEENN,
        "SMOTETOMEK": SMOTETomek,
        "TOMEK": TomekLinks,
        "ROS": RandomOverSampler,
        "RUS": RandomUnderSampler,
    }
    factory = samplers[imb_technique]
    if imb_technique == "NCR":
        # BUG FIX: the original NCR branch instantiated ``AP_iht``/``PM_iht``/
        # ``SC_iht`` but then called ``AP_ncr.fit_resample`` etc., which raised
        # NameError. The label binarisation (0 = the branch's own class,
        # 1 = everything else) is preserved unchanged.
        AP_ova_y_train = [
            0 if i == "Add penalty" else 1 for i in AP_ova_y_train
        ]
        PM_ova_y_train = [0 if i == "Payment" else 1 for i in PM_ova_y_train]
        SC_ova_y_train = [
            0 if i == "Send for Credit Collection" else 1 for i in SC_ova_y_train
        ]
    AP_X_res, AP_y_res = factory().fit_resample(AP_ova_X_train, AP_ova_y_train)
    PM_X_res, PM_y_res = factory().fit_resample(PM_ova_X_train, PM_ova_y_train)
    SC_X_res, SC_y_res = factory().fit_resample(SC_ova_X_train, SC_ova_y_train)
    return AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res
def tomek_links(x, y):  # use with other resampler
    """Drop Tomek links from ``(x, y)``; prints a banner for log tracing."""
    print("----TOMEK----")
    cleaned_x, cleaned_y = TomekLinks().fit_resample(x, y)
    return cleaned_x, cleaned_y
# class will be removed. If ``sampling_strategy='all'`` both samples will be # removed. sampler = TomekLinks() fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6)) ax_arr = (ax1, ax2) title_arr = ('Removing only majority samples', 'Removing all samples') for ax, title, sampler in zip(ax_arr, title_arr, [TomekLinks(sampling_strategy='auto'), TomekLinks(sampling_strategy='all')]): X_res, y_res = sampler.fit_resample(np.vstack((X_minority, X_majority)), np.array([0] * X_minority.shape[0] + [1] * X_majority.shape[0])) ax.scatter(X_res[y_res == 0][:, 0], X_res[y_res == 0][:, 1], label='Minority class', s=200, marker='_') ax.scatter(X_res[y_res == 1][:, 0], X_res[y_res == 1][:, 1], label='Majority class', s=200, marker='+') # highlight the samples of interest ax.scatter([X_minority[-1, 0], X_majority[1, 0]], [X_minority[-1, 1], X_majority[1, 1]], label='Tomek link', s=200, alpha=0.3) ax.set_title(title) make_plot_despine(ax) fig.tight_layout()
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train, AI_ova_X_train, AI_ova_y_train, AW_ova_X_train, AW_ova_y_train, CC_ova_X_train, CC_ova_y_train, QA_ova_X_train, QA_ova_y_train):
    """Resample the five one-vs-all training sets with the chosen technique.

    Parameters
    ----------
    imb_technique : str
        One of "ADASYN", "ALLKNN", "CNN", "ENN", "IHT", "NCR", "NM", "OSS",
        "RENN", "SMOTE", "BSMOTE", "SMOTEENN", "SMOTETOMEK", "TOMEK", "ROS",
        "RUS".
    AA_* / AI_* / AW_* / CC_* / QA_* : array-like
        Features and labels of the "Accepted/Assigned", "Accepted/In Progress",
        "Accepted/Wait", "Completed/Closed" and "Queued/Awaiting Assignment"
        one-vs-all splits.

    Returns
    -------
    tuple
        (AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res,
         CC_X_res, CC_y_res, QA_X_res, QA_y_res).

    Raises
    ------
    KeyError
        If ``imb_technique`` is unknown (the original fell through its
        if/elif chain and raised NameError at the return instead).
    """
    print(imb_technique)
    # CONSISTENCY/IDIOM FIX: the original repeated near-identical five-line
    # branches for 16 techniques; a dispatch table removes the duplication.
    # A fresh sampler instance is created per dataset, as in the original.
    samplers = {
        "ADASYN": ADASYN,
        "ALLKNN": AllKNN,
        "CNN": CondensedNearestNeighbour,
        "ENN": EditedNearestNeighbours,
        "IHT": InstanceHardnessThreshold,
        "NCR": NeighbourhoodCleaningRule,
        "NM": NearMiss,
        "OSS": OneSidedSelection,
        "RENN": RepeatedEditedNearestNeighbours,
        "SMOTE": SMOTE,
        "BSMOTE": BorderlineSMOTE,
        "SMOTEENN": SMOTEENN,
        "SMOTETOMEK": SMOTETomek,
        "TOMEK": TomekLinks,
        "ROS": RandomOverSampler,
        "RUS": RandomUnderSampler,
    }
    factory = samplers[imb_technique]
    if imb_technique == "NCR":
        # NCR works on binarised labels here: 0 for the branch's own class,
        # 1 for everything else — preserved from the original.
        AA_ova_y_train = [
            0 if i == "Accepted/Assigned" else 1 for i in AA_ova_y_train
        ]
        AI_ova_y_train = [
            0 if i == "Accepted/In Progress" else 1 for i in AI_ova_y_train
        ]
        AW_ova_y_train = [
            0 if i == "Accepted/Wait" else 1 for i in AW_ova_y_train
        ]
        CC_ova_y_train = [
            0 if i == "Completed/Closed" else 1 for i in CC_ova_y_train
        ]
        QA_ova_y_train = [
            0 if i == "Queued/Awaiting Assignment" else 1 for i in QA_ova_y_train
        ]
    AA_X_res, AA_y_res = factory().fit_resample(AA_ova_X_train, AA_ova_y_train)
    AI_X_res, AI_y_res = factory().fit_resample(AI_ova_X_train, AI_ova_y_train)
    AW_X_res, AW_y_res = factory().fit_resample(AW_ova_X_train, AW_ova_y_train)
    CC_X_res, CC_y_res = factory().fit_resample(CC_ova_X_train, CC_ova_y_train)
    QA_X_res, QA_y_res = factory().fit_resample(QA_ova_X_train, QA_ova_y_train)
    return AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res, CC_X_res, CC_y_res, QA_X_res, QA_y_res
def train_and_eval2(cross_tuples, classifiers, classifier_kwargs, missing_feature_strategy="intersection", undersample=False, save=None, save_rate=20):
    """Train every classifier on every cross tuple under each sampling strategy.

    cross_tuples: A list of tuples with the following shape:
        (*[Training DataFrames], *[Testing DataFrames], name : string)
    classifiers: list of classifier classes
    classifier_kwargs: list of dictionaries that will be used as keyword
        arguments for the classifier. If the kwargs includes a key 'param_grid'
        with a dictionary of value ranges, the optimum hyperparameters will be
        searched for using a GridSearch.
    missing_feature_strategy: Either intersection or substitution.
        Intersection will remove features not in common. Substitution will
        substitute the prediction of the missing tool with a 0.
    undersample: Boolean. Indicates whether to try undersampling.
    save: String. Path to which to save the pickled dataframe. This function
        may be useful as the dataframe includes the objects of the
        classifiers, which may become useful to store to analyze later
        (beta coefficients, weights, etc.)
    save_rate: The rate of save, in number of models trained. Every N models,
        the results are saved.

    Returns a DataFrame with one row per trained model.
    """
    reports = []
    classifiers = list(zip(classifiers, classifier_kwargs))
    sampling_strategies = ["oversample"]
    n_models = 0
    if undersample:
        sampling_strategies.extend(["undersample"])
    for (training_dfs, testing_dfs, name) in tqdm(cross_tuples, desc="Cross Tuples"):
        X, y, measures = numpify_merge_dataframes(training_dfs, testing_dfs,
                                                  missing_feature_strategy)
        for sampl_stg in tqdm(sampling_strategies, desc="Sampling Strategy",
                              leave=False):
            # "oversample" synthesizes minority samples with SMOTE;
            # "undersample" cleans the data by removing Tomek links.
            if sampl_stg == "oversample":
                sm = SMOTE(random_state=42, n_jobs=-1)
                X_sampled, y_sampled = sm.fit_resample(X, y)
            else:
                tl = TomekLinks(n_jobs=-1)
                X_sampled, y_sampled = tl.fit_resample(X, y)
            for (classifier, kwargs) in tqdm(classifiers, desc="Classifiers",
                                             leave=False):
                # BUG FIX: best_score was assigned only in the grid-search
                # branch but read unconditionally when building report_data,
                # so plain classifiers raised NameError (or silently reused a
                # stale value from a previous iteration).
                best_score = None
                if "param_grid" in kwargs:
                    # FIX: catch TypeError specifically (raised when the
                    # classifier does not accept random_state) instead of a
                    # bare except that hid unrelated errors.
                    try:
                        clf = classifier(random_state=42)
                    except TypeError:
                        clf = classifier()
                    grid_search = GridSearchCV(
                        clf,
                        kwargs["param_grid"],
                        n_jobs=-1,
                        cv=10,
                        refit="f1",
                        scoring=["f1", "precision", "recall", "accuracy"],
                        return_train_score=True,
                    )
                    grid_search.fit(X_sampled, y_sampled)
                    clf = grid_search.best_estimator_
                    best_score = grid_search.best_score_
                else:
                    try:
                        clf = classifier(**kwargs, random_state=42)
                    except TypeError:
                        clf = classifier(**kwargs)
                    clf.fit(X_sampled, y_sampled)
                # Evaluate on the held-out testing frames.
                df_test = pd.concat(testing_dfs)
                X_test, y_test, _ = numpify_merge_dataframes(
                    [df_test[measures + ["label"]]], [], "intersection")
                y_pred = clf.predict(X_test)
                f1 = f1_score(y_test, y_pred)
                acc = accuracy_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred)
                report_data = {
                    "name": [name],
                    'measures': [measures],
                    "classifier": [clf],
                    "training_df": [training_dfs],
                    "testing_df": [df_test],
                    "sampling_strategy": [sampl_stg],
                    "f1_train": [best_score],
                    "f1_test": [f1],
                    "acc_test": [acc],
                    "recall_test": [recall],
                    "precision_test": [precision],
                }
                if "param_grid" in kwargs:
                    report_data["grid_search"] = [grid_search]
                reports.append(pd.DataFrame(data=report_data))
                n_models += 1
                # Periodic checkpoint of the accumulated reports.
                if save is not None and n_models % save_rate == 0:
                    df = pd.concat(reports, ignore_index=True)
                    df = df.sort_values('f1_test', ascending=False)
                    with open(save, "wb") as fw:
                        pickle.dump(df, fw, pickle.HIGHEST_PROTOCOL)
    # NOTE(review): checkpoints sort by 'f1_test' but the final frame sorts by
    # 'f1_train' — kept as in the original; confirm which ordering downstream
    # consumers expect.
    df = pd.concat(reports, ignore_index=True)
    df = df.sort_values('f1_train', ascending=False)
    if save is not None:
        with open(save, "wb") as fw:
            pickle.dump(df, fw, pickle.HIGHEST_PROTOCOL)
    return df
random_state=10, alpha=.0001, loss='squared_hinge', max_iter=200, penalty="l2", early_stopping=True, learning_rate='adaptive', eta0=0.1, verbose=0, n_jobs=-1) #smote_tomek = SMOTETomek(random_state=0) #X_train, y_train = smote_tomek.fit_resample(X, y) print("Iniciando resample...") tl = TomekLinks(return_indices=False, ratio='majority') X_train, y_train = tl.fit_resample(X, y) print("Fim resample...") print(X_test.shape, y_test.shape) print(X_train.shape, y_train.shape) print(type(y_test)) print(type(y_train)) import scipy.sparse as sp new_X = sp.vstack((X_train, X_test)) new_y = np.concatenate((y_train, y_test)) X_train = new_X y_train = new_y #print(new_X.shape,new_y.shape) print("Iniciando Treino...") clf.fit(X_train, y_train)
# one hot encode categorical, normalize numerical ct = ColumnTransformer([('c', OneHotEncoder(), cat_ix), ('n', MinMaxScaler(), num_ix)]) # wrap the model in a pipeline pipeline = Pipeline(steps=[('t', ct), ('m', models[i])]) # evaluate the model and store results scores = evaluate_model(X_train, y_train, pipeline) train_results.append(scores) #Plot the results on a box and whisker plot plt.boxplot(train_results, labels=newnames, showmeans=True) plt.show() #Perform Sampling sampler1 = TomekLinks(sampling_strategy='majority') X_enn, y_enn = sampler1.fit_resample(X_train, y_train) print('TomekLinks counters') print(Counter(y_enn)) sampler2 = NearMiss(version=1, n_neighbors=3) X_nearmiss, y_nearmiss = sampler2.fit_resample(X_train, y_train) print('Near miss counters') print(Counter(y_nearmiss)) #spot check algorithms models, names = get_models_for_sampling() newnames = list() train_results = list() test_results = list() for i in range(len(models)):
ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(X, y)
print(
    f"Information of the iris data set after making it "
    f"balanced by over-sampling: \n sampling_strategy={sampling_strategy} \n "
    f"y: {Counter(y_res)}")
plot_pie(y_res)

###############################################################################
# With **cleaning method**, the number of samples in each class will not be
# equalized even if targeted.

sampling_strategy = "not minority"
# FIX: pass ``sampling_strategy`` by keyword — imbalanced-learn deprecated and
# then disallowed positional arguments for samplers. Behaviour is unchanged.
tl = TomekLinks(sampling_strategy=sampling_strategy)
X_res, y_res = tl.fit_resample(X, y)
print(
    f"Information of the iris data set after making it "
    f"balanced by cleaning sampling: \n sampling_strategy={sampling_strategy} \n "
    f"y: {Counter(y_res)}")
plot_pie(y_res)

###############################################################################
# ``sampling_strategy`` as a ``dict``
# ...................................
#
# When ``sampling_strategy`` is a ``dict``, the keys correspond to the targeted
# classes. The values correspond to the desired number of samples for each
# targeted class. This is working for both **under- and over-sampling**
# algorithms but not for the **cleaning algorithms**. Use a ``list`` instead.
# Load the split test set plus the raw test file used by the p_data helper.
df_test = pd.read_csv('data/treino_dividido/new_TEST.csv')
test_aux = pd.read_csv('data/test.csv')
df_answer = pd.DataFrame()
df_train, df_test, df_answer = p_data(df_train, df_test, test_aux, df_answer)
# Separate the target column from the features.
label = df_train['NU_NOTA_MT']
df_train.drop(['NU_NOTA_MT'], axis=1, inplace=True)
# NOTE(review): NU_NOTA_MT looks like a continuous exam score, yet TomekLinks
# is a classification undersampler — it will treat every distinct score as
# its own class. Confirm this is intended.
tl = TomekLinks()
newX, newY = tl.fit_resample(df_train.values, label.values)
# Compare sizes before/after cleaning.
print(len(df_train), len(newX))
# model = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
#                               max_features='auto', max_leaf_nodes=None,
#                               min_impurity_decrease=0.0, min_impurity_split=None,
#                               min_samples_leaf=4, min_samples_split=5,
#                               min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-2,
#                               oob_score=False, random_state=None, verbose=0, warm_start=False)
# model.fit(df_train, label)
# #predictions = model.predict(df_test)
# df_answer['NU_NOTA_MT'] = np.around(predictions, 2)
# NOTE(review): this chunk reads like a notebook transcript — the first few
# scatter calls reference ``deleted_ind`` before it is computed further down,
# so the original cell order is likely scrambled; the statement order is kept.
# BUG FIX: the first scatter call was missing its closing parenthesis
# (SyntaxError in the original).
plt.scatter(X[deleted_ind,0], X[deleted_ind,1], c=y[deleted_ind], marker='x', alpha=0.2)
plt.scatter(X_resampled[:,0], X_resampled[:,1], c=y_resampled)
plt.scatter(X_resampled[:,0], X_resampled[:,1], c="gray", alpha=0.2)
plt.scatter(X[deleted_ind,0], X[deleted_ind,1], c=y[deleted_ind], marker='x')
# BUG FIX: the colormap is ``viridis`` — ``plt.cm.virdis`` raises
# AttributeError.
colors = plt.cm.viridis(y[deleted_ind]/2)
plt.scatter(X_resampled[:,0], X_resampled[:,1], c="gray", alpha=0.2)
plt.scatter(X[deleted_ind,0], X[deleted_ind,1], c=colors, marker='x')

# Remove BOTH sides of every Tomek link, then mark the deleted points.
from imblearn.under_sampling import TomekLinks
tl = TomekLinks(sampling_strategy="all")
X_resampled, y_resampled = tl.fit_resample(X, y)
# NOTE(review): ``ind`` is not defined in this chunk — presumably the kept
# indices (tl.sample_indices_) from an earlier cell; confirm.
deleted_ind = np.setdiff1d(np.arange(len(X)), ind)
colors = plt.cm.viridis(y[deleted_ind]/2)
plt.scatter(X_resampled[:,0], X_resampled[:,1], c='gray', alpha=0.2)
plt.scatter(X[deleted_ind,0], X[deleted_ind,1], c=colors, marker='x')

# Fit a logistic regression and build a grid for a decision-boundary plot.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X, y)
xmin, xmax, ymin, ymax = X[:,0].min(), X[:,0].max(), X[:,1].min(), X[:,1].max()
xx, yy = np.meshgrid(np.linspace(xmin-0.5, xmax+0.5, 100),
                     np.linspace(ymin-0.5, ymax+0.5, 100))
zz = np.c_[xx.ravel(), yy.ravel()]
Y = df["fetal_health"] #Step by Step "Fetal Health" Prediction-Detailed - ekshghsh gia standard scaler std_scale = StandardScaler() X_sc = std_scale.fit_transform(X) X_train, X_test, y_train,y_test = train_test_split(X_sc, Y, test_size=0.25, random_state=42) print("There are total "+str(len(X_train))+" rows in training dataset") print("There are total "+str(len(X_test))+" rows in test dataset") smt = SMOTE() X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train) tl = TomekLinks() X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train) nm = NearMiss(version = 1) X_train_nm, y_train_nm = nm.fit_resample(X_train, y_train) nm2 = NearMiss(version = 2) X_train_nm2, y_train_nm2 = nm2.fit_resample(X_train, y_train) nm3 = NearMiss(version = 3) X_train_nm3, y_train_nm3 = nm3.fit_resample(X_train, y_train) def evaluate_model(clf, X_test, y_test, model_name, oversample_type): print('--------------------------------------------') print('Model ', model_name) print('Data Type ', oversample_type) y_pred = clf.predict(X_test)
# Apply each undersampler to the same (X_data, y_data) and report the
# resulting shapes (``cnn`` is fitted just before this chunk).
X_cnn, y_cnn = cnn.fit_resample(X_data, y_data)
print(X_cnn.shape)
# ------- ENN --------
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X_data, y_data)
print(X_enn.shape)
# ------- RENN --------
renn = RepeatedEditedNearestNeighbours()
X_renn, y_renn = renn.fit_resample(X_data, y_data)
print(X_renn.shape)
# ------- Tomek --------
tl = TomekLinks()
X_t, y_t = tl.fit_resample(X_data, y_data)
print(X_t.shape)
# ------- RUS --------
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_data, y_data)
print(X_rus.shape)
print('\n\n')

# One entry per resampling variant; "KNN" here labels the untouched data.
# NOTE(review): this list literal is cut off at the chunk boundary.
datasets = [{
    "X": X_data,
    "y": y_data,
    "name": "KNN"
}, {
    "X": X_cnn,
def workflow_70_inos(self, num_ADASYN, train_p, train_n, new_samples_all_clusters, remove_tomeklinks, model_name):
    """Assemble an augmented training set (cluster samples + optional ADASYN),
    optionally strip Tomek links, train a model, and evaluate it.

    NOTE(review): this method was a mangled one-line paste; it has been
    reformatted. Fixes: `remove_tomeklinks == True` replaced by a truthiness
    test, dead `else: pass` removed, and the `self.eval_model(...)` call —
    which appeared behind a stray '#' in the paste but is required for the
    return statement to work — re-enabled.

    Parameters
    ----------
    num_ADASYN : int, number of ADASYN samples to add (0 disables ADASYN).
    train_p, train_n : positive/negative training data, n_features x n_samples
        (they are transposed before use — see below).
    new_samples_all_clusters : per-cluster arrays of newly generated samples.
    remove_tomeklinks : bool, drop Tomek links from the combined set if true.
    model_name : identifier forwarded to self.build_model.

    Returns
    -------
    (f1_score, precision, recall) from self.eval_model on the test split.
    """
    # Format the new samples (one DataFrame per cluster) and collect them.
    new_samples_pd_list = []
    for cluster_index in range(len(new_samples_all_clusters)):
        new_samples_per_cluster = pd.DataFrame(np.real(new_samples_all_clusters[cluster_index]))
        print("debug, shape of new samples for cluster %d" % cluster_index)
        print(new_samples_per_cluster.shape)
        # Now the list contains as many dataframes as number of clusters.
        new_samples_pd_list.append(new_samples_per_cluster)
    # Concat new samples across clusters (single-cluster case needs no concat).
    if len(new_samples_all_clusters) == 1:
        new_samples_concated = new_samples_per_cluster
    else:
        new_samples_concated = pd.concat([i for i in new_samples_pd_list], axis=0)
    print(new_samples_concated.shape)  # concatenated new samples, n_samples * n_features

    train_x_expanded, train_y_binary = self.pre_process(test_data=False)
    inos_p_old = train_x_expanded[train_y_binary == 1]
    inos_n = train_x_expanded[train_y_binary == 0]
    print("debug, shape of inos_p_old, inos_n")
    print(inos_p_old.shape, inos_n.shape)

    #################################
    # generate 30% ADASYN samples
    #################################
    # Prepare data to run ADASYN: ADASYN trains on the entire original
    # training data, in n_samples x n_features layout.
    X = pd.concat((train_p.transpose(), train_n.transpose()), axis=0)
    y_p = np.ones(train_p.shape[1])
    y_n = np.zeros(train_n.shape[1])
    y = np.concatenate((y_p, y_n))
    # Generate as many minority samples as there are majority samples.
    majority_sample_cnt = train_n.shape[1]

    if num_ADASYN != 0:
        ada = ADASYN(sampling_strategy=1.0, n_neighbors=3)
        X_res, y_res = ada.fit_resample(X, y)
        # In X_res: first segment = original minority samples, second segment
        # = original majority samples, last segment = synthesized minority
        # samples. We only want the last segment.
        num_adasyn_samples_generated = X_res.shape[0] - train_p.shape[1] - train_n.shape[1]
        starting_index = X_res.shape[0] - num_adasyn_samples_generated
        if num_ADASYN >= num_adasyn_samples_generated:
            X_adasyn = X_res.iloc[starting_index:X_res.shape[0], :]
        elif num_ADASYN < num_adasyn_samples_generated:
            X_adasyn = X_res.iloc[starting_index:(starting_index + num_ADASYN)]
        print("debug, X_adasyn shape")
        print(X_adasyn.shape)
        # Combine all positive samples: original + cluster-generated + ADASYN.
        inos_p = pd.concat([inos_p_old, new_samples_concated, X_adasyn], axis=0)
    else:
        inos_p = pd.concat([inos_p_old, new_samples_concated], axis=0)

    # Combine positives and negatives, and build matching labels.
    x_res = pd.concat([inos_p, inos_n], axis=0)
    y_res_p = np.ones(inos_p.shape[0])
    y_res_n = np.zeros(inos_n.shape[0])
    y_res = np.concatenate([y_res_p, y_res_n])

    if remove_tomeklinks:
        tl = TomekLinks()
        x_res, y_res = tl.fit_resample(x_res, y_res)

    tmo = self.build_model(x_res, y_res, model_name)

    # Evaluate performance on the test split.
    x_test, y_test_binary = self.pre_process(test_data=True)
    f1_score, precision, recall = self.eval_model(tmo, x_test, y_test_binary)
    return f1_score, precision, recall
def resampling(X, Y, r):
    """Undersample (X, Y) by removing Tomek links.

    Parameters
    ----------
    X, Y : feature matrix and label vector accepted by imblearn samplers.
    r : unused; kept only for backward compatibility with existing callers.

    Returns
    -------
    (X_resampled, y_resampled) with Tomek-link samples removed.
    """
    # NOTE(review): the local was previously named `smote_enn`, which was
    # misleading — the sampler is TomekLinks, not SMOTE-ENN.
    tl = TomekLinks()
    X_resampled, y_resampled = tl.fit_resample(X, Y)
    return X_resampled, y_resampled
def tomek_undersample(X_train, y_train):
    """Remove Tomek links from the training data.

    Returns the (X, y) pair produced by imblearn's TomekLinks sampler.
    """
    sampler = TomekLinks()
    resampled = sampler.fit_resample(X_train, y_train)
    return resampled