예제 #1
0
def test_tl_fit_resample():
    """TomekLinks must drop exactly the expected rows from (X, Y)."""
    sampler = TomekLinks()
    X_res, y_res = sampler.fit_resample(X, Y)

    # Ground truth after removing the majority half of each Tomek link.
    expected_X = np.array([
        [0.31230513, 0.1216318],
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.2184254, 0.24299982],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [0.97407872, 0.44454207],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
    ])
    expected_y = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0])
    assert_array_equal(X_res, expected_X)
    assert_array_equal(y_res, expected_y)
예제 #2
0
def test_deprecation_random_state():
    """Passing ``random_state`` to TomekLinks must raise a DeprecationWarning."""
    sampler = TomekLinks(random_state=0)
    expected = "'random_state' is deprecated from 0.4"
    with warns(DeprecationWarning, match=expected):
        sampler.fit_resample(X, Y)
예제 #3
0
# Remove passenger-car samples (class 4) randomly, keeping 75000 of them.
desiredSampleCounts = {4: 75000}
rus = RandomUnderSampler(sampling_strategy=desiredSampleCounts)
X_undersampled, y_undersampled = rus.fit_resample(X, y)

# Remove tractor samples (class 6) randomly, keeping 75000 of them.
desiredSampleCounts = {6: 75000}
rus = RandomUnderSampler(sampling_strategy=desiredSampleCounts)
X_undersampled, y_undersampled = rus.fit_resample(X_undersampled,
                                                  y_undersampled)
print(np.bincount(y_undersampled))

# Remove Tomek pairs from every class ('all') to clean the class boundary.
underSampleObj = TomekLinks(sampling_strategy='all', n_jobs=5)
X_undersampledTomek, y_undersampledTomek = underSampleObj.fit_resample(
    X_undersampled, y_undersampled)
print(np.bincount(y_undersampledTomek))

# Over-sample minority classes up to the majority count; the listed column
# indices are categorical features SMOTENC must not interpolate numerically.
overSampleObj = SMOTENC(categorical_features=[2, 6, 7, 8, 9, 10, 11, 12],
                        n_jobs=6)
X_final, y_final = overSampleObj.fit_resample(X_undersampledTomek,
                                              y_undersampledTomek)
print(np.bincount(y_final))

# BUG FIX: time.clock() was removed in Python 3.8; time.perf_counter() is
# the documented replacement.  NOTE(review): assumes `start` (defined
# outside this view) was captured with perf_counter() as well — confirm.
print(time.perf_counter() - start)
# =============================================================================
#
# Reconstruct full data frame to include BasicCategory
#
# =============================================================================
from plots import Plot

# Load the raw training data and its labels from disk.
train_data = pd.read_csv('./input/train.csv')
train_labels = pd.read_csv('./input/train_labels.csv')

# Hold out 30% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    train_data, train_labels, test_size=0.3, random_state=0)

unique, count = np.unique(y_train, return_counts=True)
print('counts of labels before undersampling: ', unique, count)

# BUG FIX: TomekLinks is deterministic; its `random_state` argument was
# deprecated in imbalanced-learn 0.4 and later removed, so passing it
# raises a TypeError on current versions.
tl = TomekLinks()
X_train_res, y_train_res = tl.fit_resample(X_train, y_train.values.ravel())

unique1, count1 = np.unique(y_train_res, return_counts=True)
print('counts of labels after undersampling: ', unique1, count1)

# Fit a 15-nearest-neighbours classifier on the cleaned training set.
clf = neighbors.KNeighborsClassifier(15)
clf.fit(X_train_res, y_train_res)

# Evaluate on the original (un-resampled) training split.
Z = clf.predict(X_train)
acc = clf.score(X_train, y_train)
print('Accuracy on split training data: ' + str(acc))

# Put the result into a confusion matrix
def tomek_links(X, y):
    """Remove Tomek-link samples from (X, y) and return the cleaned pair."""
    sampler = TomekLinks()
    return sampler.fit_resample(X, y)
예제 #6
0
                                                    bank_y,
                                                    stratify=bank_y,
                                                    train_size=0.7,
                                                    random_state=0)

#classifier = svm.SVC(kernel = 'rbf',C=1000,gamma=0.001)
# Linear model; large max_iter so lbfgs converges, C=0.1 for regularisation.
classifier = LogisticRegression(max_iter=10000, C=0.1)

#easy_ensemble = imblearn.ensemble.EasyEnsembleClassifier(n_estimators=35, base_estimator=classifier, sampling_strategy='majority', n_jobs=-1)

# Over-sample the minority class up to a 1:2 ratio using only samples near
# the class boundary (borderline-1 variant) ...
oversample = BorderlineSMOTE(sampling_strategy=0.5,
                             n_jobs=-1,
                             kind='borderline-1')
x_train, y_train = oversample.fit_resample(x_train, y_train)
# ... then clean the majority class by removing its half of each Tomek link.
tom_lin = TomekLinks(sampling_strategy='majority', n_jobs=-1)
x_train, y_train = tom_lin.fit_resample(x_train, y_train)

classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# Report metrics plus normalised and raw-count confusion matrices.
h.printResults2(y_test, y_pred)
h.plotConfusionMatrix(y_test, y_pred, norm=True)
h.plotConfusionMatrix(y_test, y_pred, norm=False)

#White-box explanation
# Global explanation from the linear model's coefficients.
feature_names = bank_X.columns.values
interpr.plotFeaturesCoefficientGlobal(classifier, feature_names)

# Relabel the training set with the model's own predictions — presumably
# for a surrogate/distillation step downstream; confirm with caller.
new_x_train = x_train
new_y_train = classifier.predict(x_train)
# undersample and plot imbalanced dataset with Tomek Links
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import TomekLinks
from matplotlib import pyplot
from numpy import where
# Build a synthetic 2-feature binary dataset with 99% of the samples in the
# majority class and no label noise.
X, y = make_classification(n_samples=10000,
                           n_features=2,
                           n_redundant=0,
                           n_clusters_per_class=1,
                           weights=[0.99],
                           flip_y=0,
                           random_state=1)
# Class counts before cleaning.
counter = Counter(y)
print(counter)
# Undersample by dropping the majority-class member of each Tomek link.
undersample = TomekLinks()
X, y = undersample.fit_resample(X, y)
# Class counts after cleaning.
counter = Counter(y)
print(counter)
# Scatter plot, one series per class label.
for label in counter:
    idx = where(y == label)[0]
    pyplot.scatter(X[idx, 0], X[idx, 1], label=str(label))
pyplot.legend()
pyplot.show()
예제 #8
0
print(__doc__)

# Synthetic imbalanced 2-D data: 500 majority samples around the origin and
# 50 minority samples shifted to centre (2, 2).
rng = np.random.RandomState(0)
n_samples_1 = 500
n_samples_2 = 50
X_syn = np.r_[1.5 * rng.randn(n_samples_1, 2),
              0.5 * rng.randn(n_samples_2, 2) + [2, 2]]
y_syn = np.array([0] * (n_samples_1) + [1] * (n_samples_2))
X_syn, y_syn = shuffle(X_syn, y_syn)
X_syn_train, X_syn_test, y_syn_train, y_syn_test = train_test_split(X_syn,
                                                                    y_syn)

# remove Tomek links
# BUG FIX: `return_indices=True` was deprecated in imbalanced-learn 0.4 and
# removed in 0.6; `fit_resample` now returns only (X, y) and the retained
# indices are exposed via the fitted sampler's `sample_indices_` attribute.
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_resample(X_syn, y_syn)
idx_resampled = tl.sample_indices_

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Indices of the samples that the cleaning step removed.
idx_samples_removed = np.setdiff1d(np.arange(X_syn.shape[0]),
                                   idx_resampled)
idx_class_0 = y_resampled == 0
plt.scatter(X_resampled[idx_class_0, 0], X_resampled[idx_class_0, 1],
            alpha=.8, label='Class #0')
plt.scatter(X_resampled[~idx_class_0, 0], X_resampled[~idx_class_0, 1],
            alpha=.8, label='Class #1')
plt.scatter(X_syn[idx_samples_removed, 0], X_syn[idx_samples_removed, 1],
            alpha=.8, label='Removed samples')

# make nice plotting
sampling_strategy = 'not majority'

# Over-sample every class except the majority until all classes are balanced.
ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(X, y)
print('Information of the iris data set after making it '
      'balanced by over-sampling: \n sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# With **cleaning method**, the number of samples in each class will not be
# equalized even if targeted.

sampling_strategy = 'not minority'
# BUG FIX: imbalanced-learn sampler parameters are keyword-only, so the
# strategy must be passed as `sampling_strategy=...`; passing it
# positionally raises a TypeError on current versions.
tl = TomekLinks(sampling_strategy=sampling_strategy)
X_res, y_res = tl.fit_resample(X, y)
print('Information of the iris data set after making it '
      'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# ``sampling_strategy`` as a ``dict``
# ...................................
#
# When ``sampling_strategy`` is a ``dict``, the keys correspond to the targeted
# classes. The values correspond to the desired number of samples for each
# targeted class. This is working for both **under- and over-sampling**
# algorithms but not for the **cleaning algorithms**. Use a ``list`` instead.

예제 #10
0
# Persist the random-undersampled matrix M; the NN flag selects the filename.
if NN:
    np.savetxt('../4_learningData/RUStestNN.csv', M, delimiter=',', fmt='%d')
else:
    np.savetxt('../4_learningData/RUStest.csv', M, delimiter=',', fmt='%d')


##########################
##     Tomek Links      ##
##########################

# Build the Tomek-links undersampler and clean the dataset with it.
sampler2 = TL()
xTL, yTL = sampler2.fit_resample(x, y)

if DEBUG:
    print(xTL.shape)
    print(yTL.shape)
    print(sorted(Counter(yTL).items()))

# 70/30 train/test split with a fixed seed for reproducibility.
xTLtrain, xTLtest, yTLtrain, yTLtest = tts(
    xTL, yTL, test_size=TEST_SIZE, random_state=6375)

# Re-assemble train+test with the label as the leading column.
x_pr = np.concatenate([xTLtrain, xTLtest])
y_pr = np.concatenate([yTLtrain, yTLtest])
dfProto = np.column_stack([y_pr, x_pr])
df = pd.DataFrame(data=dfProto,
                  columns=['contact_type', 'contact_class_score_diff',
                           'contact_id', 'counter', 'delay'])
import plotly.express as px
예제 #11
0
def tomeklinks(X, y):
    """Apply Tomek-links undersampling and return the resampled (X, y)."""
    return TomekLinks().fit_resample(X, y)
def resampling_assigner(imb_technique, AP_ova_X_train, AP_ova_y_train,
                        PM_ova_X_train, PM_ova_y_train, SC_ova_X_train,
                        SC_ova_y_train):
    """Resample the three one-vs-all training sets with the chosen technique.

    Parameters
    ----------
    imb_technique : str
        One of "ADASYN", "ALLKNN", "CNN", "ENN", "IHT", "NCR", "NM", "OSS",
        "RENN", "SMOTE", "BSMOTE", "SMOTEENN", "SMOTETOMEK", "TOMEK", "ROS",
        "RUS".
    AP_ova_X_train, AP_ova_y_train
        Features/labels of the "Add penalty" one-vs-all problem (PM_* =
        "Payment" and SC_* = "Send for Credit Collection" likewise).

    Returns
    -------
    tuple
        (AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res).

    Raises
    ------
    ValueError
        If ``imb_technique`` is not a recognised technique name.
    """
    print(imb_technique)
    # One sampler factory per supported technique; a fresh instance is
    # created for each of the three one-vs-all problems, as the original
    # per-branch code did.
    factories = {
        "ADASYN": ADASYN,
        "ALLKNN": AllKNN,
        "CNN": CondensedNearestNeighbour,
        "ENN": EditedNearestNeighbours,
        "IHT": InstanceHardnessThreshold,
        "NCR": NeighbourhoodCleaningRule,
        "NM": NearMiss,
        "OSS": OneSidedSelection,
        "RENN": RepeatedEditedNearestNeighbours,
        "SMOTE": SMOTE,
        "BSMOTE": BorderlineSMOTE,
        "SMOTEENN": SMOTEENN,
        "SMOTETOMEK": SMOTETomek,
        "TOMEK": TomekLinks,
        "ROS": RandomOverSampler,
        "RUS": RandomUnderSampler,
    }
    # ROBUSTNESS FIX: the original if/elif chain silently fell through for an
    # unknown technique and then returned unbound names; fail loudly instead.
    if imb_technique not in factories:
        raise ValueError("Unknown imbalance technique: %r" % (imb_technique,))

    # The NCR branch binarises the string labels first (0 = the branch's own
    # class, 1 = everything else), exactly as the original did.
    # BUG FIX: the original NCR branch instantiated AP_iht/PM_iht/SC_iht but
    # then called the undefined names AP_ncr/PM_ncr/SC_ncr (NameError).
    if imb_technique == "NCR":
        AP_ova_y_train = [
            0 if i == "Add penalty" else 1 for i in AP_ova_y_train
        ]
        PM_ova_y_train = [0 if i == "Payment" else 1 for i in PM_ova_y_train]
        SC_ova_y_train = [
            0 if i == "Send for Credit Collection" else 1
            for i in SC_ova_y_train
        ]

    factory = factories[imb_technique]
    AP_X_res, AP_y_res = factory().fit_resample(AP_ova_X_train,
                                                AP_ova_y_train)
    PM_X_res, PM_y_res = factory().fit_resample(PM_ova_X_train,
                                                PM_ova_y_train)
    SC_X_res, SC_y_res = factory().fit_resample(SC_ova_X_train,
                                                SC_ova_y_train)
    return AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res
def tomek_links(x, y):  # use with other resampler
    """Clean (x, y) of Tomek-link samples and return the resampled pair."""
    print("----TOMEK----")
    cleaner = TomekLinks()
    X_clean, y_clean = cleaner.fit_resample(x, y)
    return X_clean, y_clean
예제 #14
0
# class will be removed. If ``sampling_strategy='all'`` both samples will be
# removed.

# Default sampler (immediately shadowed by the loop below; kept for parity
# with the surrounding narrative text).
sampler = TomekLinks()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

ax_arr = (ax1, ax2)
title_arr = ('Removing only majority samples',
             'Removing all samples')
# Same stacked data for both panels, so build it once outside the loop.
points = np.vstack((X_minority, X_majority))
labels = np.array([0] * X_minority.shape[0] + [1] * X_majority.shape[0])
samplers = [TomekLinks(sampling_strategy='auto'),
            TomekLinks(sampling_strategy='all')]
for ax, title, sampler in zip(ax_arr, title_arr, samplers):
    X_res, y_res = sampler.fit_resample(points, labels)
    minority_mask = y_res == 0
    ax.scatter(X_res[minority_mask][:, 0], X_res[minority_mask][:, 1],
               label='Minority class', s=200, marker='_')
    ax.scatter(X_res[~minority_mask][:, 0], X_res[~minority_mask][:, 1],
               label='Majority class', s=200, marker='+')

    # Highlight the two samples that form the Tomek link of interest.
    ax.scatter([X_minority[-1, 0], X_majority[1, 0]],
               [X_minority[-1, 1], X_majority[1, 1]],
               label='Tomek link', s=200, alpha=0.3)

    ax.set_title(title)
    make_plot_despine(ax)
fig.tight_layout()
예제 #15
0
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    """Resample the five one-vs-all training sets with the chosen technique.

    Parameters
    ----------
    imb_technique : str
        One of "ADASYN", "ALLKNN", "CNN", "ENN", "IHT", "NCR", "NM", "OSS",
        "RENN", "SMOTE", "BSMOTE", "SMOTEENN", "SMOTETOMEK", "TOMEK", "ROS",
        "RUS".
    AA_ova_X_train, AA_ova_y_train
        Features/labels of the "Accepted/Assigned" one-vs-all problem
        (AI_* = "Accepted/In Progress", AW_* = "Accepted/Wait",
        CC_* = "Completed/Closed", QA_* = "Queued/Awaiting Assignment").

    Returns
    -------
    tuple
        (AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res,
         CC_X_res, CC_y_res, QA_X_res, QA_y_res).

    Raises
    ------
    ValueError
        If ``imb_technique`` is not a recognised technique name.
    """
    print(imb_technique)
    # One sampler factory per supported technique; a fresh instance is
    # created for each one-vs-all problem, as the original per-branch
    # code did.
    factories = {
        "ADASYN": ADASYN,
        "ALLKNN": AllKNN,
        "CNN": CondensedNearestNeighbour,
        "ENN": EditedNearestNeighbours,
        "IHT": InstanceHardnessThreshold,
        "NCR": NeighbourhoodCleaningRule,
        "NM": NearMiss,
        "OSS": OneSidedSelection,
        "RENN": RepeatedEditedNearestNeighbours,
        "SMOTE": SMOTE,
        "BSMOTE": BorderlineSMOTE,
        "SMOTEENN": SMOTEENN,
        "SMOTETOMEK": SMOTETomek,
        "TOMEK": TomekLinks,
        "ROS": RandomOverSampler,
        "RUS": RandomUnderSampler,
    }
    # ROBUSTNESS FIX: the original if/elif chain silently fell through for an
    # unknown technique and then returned unbound names (UnboundLocalError);
    # fail loudly with a clear message instead.
    if imb_technique not in factories:
        raise ValueError("Unknown imbalance technique: %r" % (imb_technique,))

    # The NCR branch binarises the string labels first (0 = the branch's own
    # class, 1 = everything else), exactly as the original did.
    if imb_technique == "NCR":
        AA_ova_y_train = [
            0 if i == "Accepted/Assigned" else 1 for i in AA_ova_y_train
        ]
        AI_ova_y_train = [
            0 if i == "Accepted/In Progress" else 1 for i in AI_ova_y_train
        ]
        AW_ova_y_train = [
            0 if i == "Accepted/Wait" else 1 for i in AW_ova_y_train
        ]
        CC_ova_y_train = [
            0 if i == "Completed/Closed" else 1 for i in CC_ova_y_train
        ]
        QA_ova_y_train = [
            0 if i == "Queued/Awaiting Assignment" else 1
            for i in QA_ova_y_train
        ]

    factory = factories[imb_technique]
    AA_X_res, AA_y_res = factory().fit_resample(AA_ova_X_train,
                                                AA_ova_y_train)
    AI_X_res, AI_y_res = factory().fit_resample(AI_ova_X_train,
                                                AI_ova_y_train)
    AW_X_res, AW_y_res = factory().fit_resample(AW_ova_X_train,
                                                AW_ova_y_train)
    CC_X_res, CC_y_res = factory().fit_resample(CC_ova_X_train,
                                                CC_ova_y_train)
    QA_X_res, QA_y_res = factory().fit_resample(QA_ova_X_train,
                                                QA_ova_y_train)
    return AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res, CC_X_res, CC_y_res, QA_X_res, QA_y_res
예제 #16
0
def train_and_eval2(cross_tuples,
                    classifiers,
                    classifier_kwargs,
                    missing_feature_strategy="intersection",
                    undersample=False,
                    save=None,
                    save_rate=20):
    """
    Train every classifier on every cross tuple under each sampling strategy
    and collect evaluation reports into one DataFrame.

    cross_tuples: A list of tuples with the following shape:
        (*[Training DataFrames], *[Testing DataFrames], name : string)
    classifiers: list of classifier classes
    classifier_kwargs: list of dictionaries that will be used as keyword arguments for the classifier.
                    If the kwargs includes a key 'param_grid' with a dictionary of value ranges,
                    the optimum hyperparameters will be searched for using a GridSearch.
    missing_feature_strategy: Either intersection or substitution. Intersection will remove
                    features not in common. Substitution will substitute the prediction
                    of the missing tool with a 0.
    undersample: Boolean. Indicates whether to try undersampling.
    save: String. Path to which to save the pickled dataframe.
        This function may be useful as the dataframe includes the objects of the classifiers, which may
        become useful to store to analyze later (beta coefficients, weights, etc.)
    save_rate: The rate of save, in number of models trained. Every N models, the results are saved.

    Returns: pandas DataFrame with one row per trained model, sorted by
        'f1_train' descending ('f1_train' is None for classifiers trained
        without a grid search).
    """

    reports = []

    classifiers = list(zip(classifiers, classifier_kwargs))
    sampling_strategies = ["oversample"]

    n_models = 0

    if undersample:
        sampling_strategies.append("undersample")
    for (training_dfs, testing_dfs, name) in tqdm(cross_tuples,
                                                  desc="Cross Tuples"):
        X, y, measures = numpify_merge_dataframes(training_dfs, testing_dfs,
                                                  missing_feature_strategy)
        for sampl_stg in tqdm(sampling_strategies,
                              desc="Sampling Strategy",
                              leave=False):
            # Oversample minority class with SMOTE, or clean class overlap
            # with Tomek links for the undersampling strategy.
            if sampl_stg == "oversample":
                sm = SMOTE(random_state=42, n_jobs=-1)
                X_sampled, y_sampled = sm.fit_resample(X, y)
            else:
                tl = TomekLinks(n_jobs=-1)
                X_sampled, y_sampled = tl.fit_resample(X, y)
            for (classifier, kwargs) in tqdm(classifiers,
                                             desc="Classifiers",
                                             leave=False):
                if "param_grid" in kwargs:
                    # Some estimators do not accept random_state; fall back to
                    # default construction. (Was a bare `except`, which would
                    # also hide unrelated constructor errors.)
                    try:
                        clf = classifier(random_state=42)
                    except TypeError:
                        clf = classifier()
                    grid_search = GridSearchCV(
                        clf,
                        kwargs["param_grid"],
                        n_jobs=-1,
                        cv=10,
                        refit="f1",
                        scoring=["f1", "precision", "recall", "accuracy"],
                        return_train_score=True,
                    )
                    grid_search.fit(X_sampled, y_sampled)
                    clf = grid_search.best_estimator_
                    best_score = grid_search.best_score_
                else:
                    try:
                        clf = classifier(**kwargs, random_state=42)
                    except TypeError:
                        clf = classifier(**kwargs)
                    clf.fit(X_sampled, y_sampled)
                    # BUG FIX: best_score was only assigned in the grid-search
                    # branch, so a classifier without 'param_grid' either raised
                    # NameError (first iteration) or silently reused a stale
                    # score from a previous model.
                    best_score = None

                df_test = pd.concat(testing_dfs)

                X_test, y_test, _ = numpify_merge_dataframes(
                    [df_test[measures + ["label"]]], [], "intersection")

                y_pred = clf.predict(X_test)
                f1 = f1_score(y_test, y_pred)
                acc = accuracy_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred)

                report_data = {
                    "name": [name],
                    'measures': [measures],
                    "classifier": [clf],
                    "training_df": [training_dfs],
                    "testing_df": [df_test],
                    "sampling_strategy": [sampl_stg],
                    "f1_train": [best_score],
                    "f1_test": [f1],
                    "acc_test": [acc],
                    "recall_test": [recall],
                    "precision_test": [precision],
                }

                if "param_grid" in kwargs:
                    report_data["grid_search"] = [grid_search]

                reports.append(pd.DataFrame(data=report_data))

                n_models += 1

                # Periodic checkpoint of the accumulated reports.
                if save is not None and n_models % save_rate == 0:
                    df = pd.concat(reports, ignore_index=True)
                    df = df.sort_values('f1_test', ascending=False)
                    with open(save, "wb") as fw:
                        pickle.dump(df, fw, pickle.HIGHEST_PROTOCOL)

    # NOTE(review): the periodic checkpoint above sorts by 'f1_test' while the
    # final result sorts by 'f1_train' — kept as-is, but confirm it's intended.
    df = pd.concat(reports, ignore_index=True)
    df = df.sort_values('f1_train', ascending=False)
    if save is not None:
        with open(save, "wb") as fw:
            pickle.dump(df, fw, pickle.HIGHEST_PROTOCOL)

    return df
                    random_state=10,
                    alpha=.0001,
                    loss='squared_hinge',
                    max_iter=200,
                    penalty="l2",
                    early_stopping=True,
                    learning_rate='adaptive',
                    eta0=0.1,
                    verbose=0,
                    n_jobs=-1)

#smote_tomek = SMOTETomek(random_state=0)
#X_train, y_train = smote_tomek.fit_resample(X, y)
print("Iniciando resample...")
# FIX: `ratio` and `return_indices` were deprecated in imbalanced-learn 0.4
# and removed in 0.6; `sampling_strategy='majority'` is the modern equivalent
# of `ratio='majority'`, and the indices were not used here anyway.
tl = TomekLinks(sampling_strategy='majority')
X_train, y_train = tl.fit_resample(X, y)
print("Fim resample...")
print(X_test.shape, y_test.shape)
print(X_train.shape, y_train.shape)
print(type(y_test))
print(type(y_train))
import scipy.sparse as sp

# NOTE(review): the test split is stacked back onto the (resampled) training
# data and the result is used as the new training set — that leaks test data
# into training; confirm this is intentional.
new_X = sp.vstack((X_train, X_test))
new_y = np.concatenate((y_train, y_test))
X_train = new_X
y_train = new_y
#print(new_X.shape,new_y.shape)

print("Iniciando Treino...")
clf.fit(X_train, y_train)
    # one hot encode categorical, normalize numerical
    ct = ColumnTransformer([('c', OneHotEncoder(), cat_ix),
                            ('n', MinMaxScaler(), num_ix)])
    # wrap the model in a pipeline
    pipeline = Pipeline(steps=[('t', ct), ('m', models[i])])
    # evaluate the model and store results
    scores = evaluate_model(X_train, y_train, pipeline)
    train_results.append(scores)

# Plot the per-model evaluation scores as a box-and-whisker plot.
plt.boxplot(train_results, labels=newnames, showmeans=True)
plt.show()

# Perform sampling: compare two under-sampling strategies on the training set.
# TomekLinks drops majority-class points that form Tomek links (nearest
# neighbours of opposite classes), cleaning the class boundary.
# NOTE(review): the variables are named X_enn/y_enn but this is TomekLinks
# output, not EditedNearestNeighbours — consider renaming.
sampler1 = TomekLinks(sampling_strategy='majority')
X_enn, y_enn = sampler1.fit_resample(X_train, y_train)
print('TomekLinks counters')
print(Counter(y_enn))

# NearMiss v1 keeps majority samples closest (on average) to the 3 nearest
# minority samples.
sampler2 = NearMiss(version=1, n_neighbors=3)
X_nearmiss, y_nearmiss = sampler2.fit_resample(X_train, y_train)
print('Near miss counters')
print(Counter(y_nearmiss))

# Spot-check the algorithms again, now on the resampled data.
models, names = get_models_for_sampling()
newnames = list()
train_results = list()
test_results = list()
for i in range(len(models)):
예제 #19
0
# Over-sample according to the `sampling_strategy` defined above.
ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(X, y)
print(
    f"Information of the iris data set after making it "
    f"balanced by over-sampling: \n sampling_strategy={sampling_strategy} \n "
    f"y: {Counter(y_res)}")
plot_pie(y_res)

###############################################################################
# With **cleaning method**, the number of samples in each class will not be
# equalized even if targeted.

sampling_strategy = "not minority"
# FIX: `sampling_strategy` is a keyword-only parameter of TomekLinks in
# current imbalanced-learn; passing it positionally raises TypeError.
tl = TomekLinks(sampling_strategy=sampling_strategy)
X_res, y_res = tl.fit_resample(X, y)
print(
    f"Information of the iris data set after making it "
    f"balanced by cleaning sampling: \n sampling_strategy={sampling_strategy} \n "
    f"y: {Counter(y_res)}")
plot_pie(y_res)

###############################################################################
# ``sampling_strategy`` as a ``dict``
# ...................................
#
# When ``sampling_strategy`` is a ``dict``, the keys correspond to the targeted
# classes. The values correspond to the desired number of samples for each
# targeted class. This is working for both **under- and over-sampling**
# algorithms but not for the **cleaning algorithms**. Use a ``list`` instead.
    # Load the pre-split test CSV and the raw test file.
    df_test = pd.read_csv('data/treino_dividido/new_TEST.csv')

    test_aux = pd.read_csv('data/test.csv')

    df_answer = pd.DataFrame()

    # p_data (defined elsewhere in this file) preprocesses the frames;
    # df_train is assumed to be loaded before this point — confirm upstream.
    df_train, df_test, df_answer = p_data(df_train, df_test, test_aux,
                                          df_answer)

    # Target column: the math score.
    label = df_train['NU_NOTA_MT']

    df_train.drop(['NU_NOTA_MT'], axis=1, inplace=True)

    # NOTE(review): NU_NOTA_MT looks like a continuous score, but TomekLinks
    # is a classification resampler — confirm the target is categorical here.
    tl = TomekLinks()

    newX, newY = tl.fit_resample(df_train.values, label.values)

    # Show how many rows the Tomek-link cleaning removed.
    print(len(df_train), len(newX))

    # model = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
    #                               max_features='auto', max_leaf_nodes=None,
    #                               min_impurity_decrease=0.0, min_impurity_split=None,
    #                               min_samples_leaf=4, min_samples_split=5,
    #                               min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-2,
    #                               oob_score=False, random_state=None, verbose=0, warm_start=False)

    # model.fit(df_train, label)

    # #predictions = model.predict(df_test)

    # df_answer['NU_NOTA_MT'] = np.around(predictions, 2)
예제 #21
0
# NOTE(review): this chunk reads like a saved interactive session —
# `deleted_ind`, `X_resampled`, `y_resampled` and `ind` are used before they
# are (re)defined further down; the statement order needs review.
plt.scatter(X[deleted_ind, 0], X[deleted_ind, 1], c=y[deleted_ind], marker='x', alpha=0.2)  # FIX: closing ')' was missing
plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c=y_resampled)

plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c="gray", alpha=0.2)
plt.scatter(X[deleted_ind, 0], X[deleted_ind, 1], c=y[deleted_ind], marker='x')

colors = plt.cm.viridis(y[deleted_ind] / 2)  # FIX: typo 'virdis' -> 'viridis'
plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c="gray", alpha=0.2)
plt.scatter(X[deleted_ind, 0], X[deleted_ind, 1], c=colors, marker='x')

from imblearn.under_sampling import TomekLinks

# Remove Tomek-link samples from both classes.
tl = TomekLinks(sampling_strategy="all")

X_resampled, y_resampled = tl.fit_resample(X, y)

# Indices dropped by the resampler (`ind` is assumed to come from earlier
# in the session — confirm).
deleted_ind = np.setdiff1d(np.arange(len(X)), ind)
colors = plt.cm.viridis(y[deleted_ind] / 2)

plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c='gray', alpha=0.2)
plt.scatter(X[deleted_ind, 0], X[deleted_ind, 1], c=colors, marker='x')

from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression boundary on the original data and build a grid
# of points for plotting the decision surface.
clf = LogisticRegression()
clf.fit(X, y)

xmin, xmax, ymin, ymax = X[:, 0].min(), X[:, 0].max(), X[:, 1].min(), X[:, 1].max()
xx, yy = np.meshgrid(np.linspace(xmin - 0.5, xmax + 0.5, 100), np.linspace(ymin - 0.5, ymax + 0.5, 100))
zz = np.c_[xx.ravel(), yy.ravel()]
예제 #22
0
Y = df["fetal_health"]

# Standardize the features (from "Step by Step Fetal Health Prediction-
# Detailed" — explanation of the standard scaler).
std_scale = StandardScaler()
X_sc = std_scale.fit_transform(X)


# Hold out 25% of the scaled data for testing.
X_train, X_test, y_train, y_test = train_test_split(X_sc, Y, test_size=0.25, random_state=42)
print(f"There are total {len(X_train)} rows in training dataset")
print(f"There are total {len(X_test)} rows in test dataset")

# Build several resampled variants of the training set for comparison.
# SMOTE: synthetic minority over-sampling.
smt = SMOTE()
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

# Tomek links: remove borderline majority samples.
tl = TomekLinks()
X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)

# NearMiss v1-v3: distance-based under-sampling heuristics.
nm = NearMiss(version=1)
X_train_nm, y_train_nm = nm.fit_resample(X_train, y_train)
nm2 = NearMiss(version=2)
X_train_nm2, y_train_nm2 = nm2.fit_resample(X_train, y_train)
nm3 = NearMiss(version=3)
X_train_nm3, y_train_nm3 = nm3.fit_resample(X_train, y_train)



def evaluate_model(clf, X_test, y_test, model_name, oversample_type):
  print('--------------------------------------------')
  print('Model ', model_name)
  print('Data Type ', oversample_type)
  y_pred = clf.predict(X_test)
예제 #23
0
# ------- CNN --------
# NOTE(review): `cnn` is constructed before this chunk; presumably
# CondensedNearestNeighbour — confirm in the surrounding code.
X_cnn, y_cnn = cnn.fit_resample(X_data, y_data)
print(X_cnn.shape)

# ------- ENN: Edited Nearest Neighbours --------
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X_data, y_data)
print(X_enn.shape)

# ------- RENN: Repeated Edited Nearest Neighbours --------
renn = RepeatedEditedNearestNeighbours()
X_renn, y_renn = renn.fit_resample(X_data, y_data)
print(X_renn.shape)

# ------- Tomek: remove Tomek-link pairs --------
tl = TomekLinks()
X_t, y_t = tl.fit_resample(X_data, y_data)
print(X_t.shape)

# ------- RUS: random under-sampling --------
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_data, y_data)
print(X_rus.shape)

print('\n\n')

datasets = [{
    "X": X_data,
    "y": y_data,
    "name": "KNN"
}, {
    "X": X_cnn,
	def workflow_70_inos(self, num_ADASYN, train_p, train_n, new_samples_all_clusters, remove_tomeklinks, model_name):
		"""Assemble an augmented training set and train/evaluate a model.

		num_ADASYN: number of ADASYN-synthesized minority samples to append
			(0 disables the ADASYN step).
		train_p, train_n: positive/negative training data laid out as
			n_features x n_samples (transposed before use).
		new_samples_all_clusters: list with one array of generated samples
			per cluster.
		remove_tomeklinks: when truthy, clean the combined set with TomekLinks.
		model_name: forwarded to self.build_model.

		Returns (f1_score, precision, recall) measured on the test split.
		"""

		# format the new samples (convert each cluster's array to a pandas
		# dataframe, n_samples x n_features) and collect them
		new_samples_pd_list = []
		for cluster_index in range(len(new_samples_all_clusters)):
			# keep only the real part in case the generator produced complex values
			new_samples_per_cluster = pd.DataFrame(np.real(new_samples_all_clusters[cluster_index]))

			print("debug, shape of new samples for cluster %d" % cluster_index)
			print(new_samples_per_cluster.shape)
			# the list now contains one dataframe per cluster
			new_samples_pd_list.append(new_samples_per_cluster)

		# concat new samples for each cluster
		if len(new_samples_all_clusters) == 1:
			# single cluster: index the list instead of relying on the loop
			# variable surviving the loop
			new_samples_concated = new_samples_pd_list[0]
		else:
			new_samples_concated = pd.concat(new_samples_pd_list, axis=0)

		print("debug, shape of concatenated new samples for %d clusters:" % len(new_samples_all_clusters))
		print(new_samples_concated.shape)

		# concatenated new samples are in shape n_samples * n_features

		train_x_expanded, train_y_binary = self.pre_process(test_data=False)

		# split the preprocessed training data into positive / negative parts
		inos_p_old = train_x_expanded[train_y_binary == 1]
		inos_n = train_x_expanded[train_y_binary == 0]
		print("debug, shape of inos_p_old, inos_n")
		print(inos_p_old.shape, inos_n.shape)
		#################################
		# generate 30% ADASYN samples
		#################################
		# prepare data to run ADASYN: ADASYN trains on entire original training data
		X = pd.concat((train_p.transpose(), train_n.transpose()), axis=0)
		# create y: 1 for positives, 0 for negatives
		y_p = np.ones(train_p.shape[1])
		y_n = np.zeros(train_n.shape[1])
		y = np.concatenate((y_p, y_n))

		if num_ADASYN != 0:
			# sampling_strategy=1.0: synthesize minority samples until the
			# classes are balanced
			ada = ADASYN(sampling_strategy=1.0, n_neighbors=3)
			# X contains all data, should be in format of n_samples*n_features
			X_res, y_res = ada.fit_resample(X, y)
			# In X_res the first segment is the original minority samples, the
			# 2nd segment is the original majority samples, and the last
			# segment is the synthesized minority samples — we only want the
			# last segment, capped at num_ADASYN rows.
			num_adasyn_samples_generated = X_res.shape[0] - train_p.shape[1] - train_n.shape[1]
			starting_index = X_res.shape[0] - num_adasyn_samples_generated
			if num_ADASYN >= num_adasyn_samples_generated:
				X_adasyn = X_res.iloc[starting_index:X_res.shape[0], :]
			else:
				X_adasyn = X_res.iloc[starting_index:(starting_index + num_ADASYN)]
			print("debug, X_adasyn shape")
			print(X_adasyn.shape)
			# combine original positives, cluster-generated samples and ADASYN samples
			inos_p = pd.concat([inos_p_old, new_samples_concated, X_adasyn], axis=0)
		else:
			inos_p = pd.concat([inos_p_old, new_samples_concated], axis=0)
		# combine p and n
		x_res = pd.concat([inos_p, inos_n], axis=0)
		# create matching labels
		y_res_p = np.ones(inos_p.shape[0])
		y_res_n = np.zeros(inos_n.shape[0])
		y_res = np.concatenate([y_res_p, y_res_n])

		if remove_tomeklinks:
			# drop Tomek-link pairs to clean the class boundary
			tl = TomekLinks()
			x_res, y_res = tl.fit_resample(x_res, y_res)

		tmo = self.build_model(x_res, y_res, model_name)
		# evaluate performance on the held-out test data
		x_test, y_test_binary = self.pre_process(test_data=True)

		f1_score, precision, recall = self.eval_model(tmo, x_test, y_test_binary)

		return f1_score, precision, recall
예제 #25
0
def resampling(X, Y, r):
    """Clean the data set by removing Tomek links.

    X, Y: features and labels to resample.
    r: unused; kept only for backward compatibility with existing callers.

    Returns the resampled (X, Y).
    """
    # Renamed from the misleading `smote_enn` — this is a TomekLinks
    # under-sampler, not a SMOTE-ENN combination.
    tl = TomekLinks()
    X_resampled, y_resampled = tl.fit_resample(X, Y)
    return X_resampled, y_resampled
예제 #26
0
def tomek_undersample(X_train, y_train):
    """Return (X, y) with Tomek-link samples removed from the training data."""
    sampler = TomekLinks()
    resampled = sampler.fit_resample(X_train, y_train)
    return resampled