Example #1
def test_nm1_fit_sample_half():
    """Test fit and sample routines with .7 ratio"""

    # Define the parameter for the under-sampling
    ratio = .7

    # Create the object
    nm1 = NearMiss(ratio=ratio, random_state=RND_SEED,
                   version=VERSION_NEARMISS)

    # Fit and sample
    X_resampled, y_resampled = nm1.fit_sample(X, Y)

    X_gt = np.array([[0.91464286, 1.61369212],
                     [-0.80809175, -1.09917302],
                     [-0.20497017, -0.26630228],
                     [-0.05903827, 0.10947647],
                     [0.03142011, 0.12323596],
                     [-0.60413357, 0.24628718],
                     [1.17737838, -0.2002118],
                     [0.50701028, -0.17636928],
                     [0.4960075, 0.86130762],
                     [0.45713638, 1.31069295],
                     [0.99272351, -0.11631728]])
    y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
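Note: the float `ratio` and the `random_state` arguments were later removed from NearMiss. A minimal sketch of the modern equivalent, assuming imbalanced-learn >= 0.6 and hypothetical toy data standing in for the X, Y fixtures that the snippet does not show:

from collections import Counter

import numpy as np
from imblearn.under_sampling import NearMiss

# Hypothetical stand-ins for the module-level test fixtures X, Y.
rng = np.random.RandomState(0)
X_demo = rng.normal(size=(15, 2))
y_demo = np.array([0] * 3 + [1] * 6 + [2] * 6)

# A dict `sampling_strategy` replaces the deprecated float `ratio`: it maps
# each class to the number of samples to keep after under-sampling.
nm1 = NearMiss(sampling_strategy={0: 3, 1: 4, 2: 4}, version=1)
X_res, y_res = nm1.fit_resample(X_demo, y_demo)
print(Counter(y_res))  # three classes kept at 3, 4 and 4 samples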
Example #2
def test_nm_fit_resample_auto():
    sampling_strategy = 'auto'
    X_gt = [
        np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302],
                  [-0.20497017, -0.26630228], [-0.05903827, 0.10947647],
                  [0.03142011, 0.12323596], [-0.60413357, 0.24628718],
                  [0.50701028, -0.17636928], [0.4960075, 0.86130762],
                  [0.45713638, 1.31069295]]),
        np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302],
                  [-0.20497017, -0.26630228], [-0.05903827, 0.10947647],
                  [0.03142011, 0.12323596], [-0.60413357, 0.24628718],
                  [0.50701028, -0.17636928], [0.4960075, 0.86130762],
                  [0.45713638, 1.31069295]]),
        np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302],
                  [-0.20497017, -0.26630228], [1.17737838, -0.2002118],
                  [-0.60413357, 0.24628718], [0.03142011, 0.12323596],
                  [1.15157493, -1.2981518], [-0.54619583, 1.73009918],
                  [0.99272351, -0.11631728]])
    ]
    y_gt = [
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    ]
    for version_idx, version in enumerate(VERSION_NEARMISS):
        nm = NearMiss(sampling_strategy=sampling_strategy, version=version)
        X_resampled, y_resampled = nm.fit_resample(X, Y)
        assert_array_equal(X_resampled, X_gt[version_idx])
        assert_array_equal(y_resampled, y_gt[version_idx])
Example #3
def test_nm3_fit_sample_nn_obj():
    """Test fit-sample with nn object"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    nn = NearestNeighbors(n_neighbors=3)
    nn3 = NearestNeighbors(n_neighbors=3)
    nm3 = NearMiss(
        ratio=ratio,
        random_state=RND_SEED,
        version=VERSION_NEARMISS,
        return_indices=True,
        n_neighbors=nn,
        n_neighbors_ver3=nn3)

    # Fit and sample
    X_resampled, y_resampled, idx_under = nm3.fit_sample(X, Y)

    X_gt = np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302],
                     [-0.20497017, -0.26630228], [1.17737838, -0.2002118],
                     [-0.60413357, 0.24628718], [0.03142011, 0.12323596],
                     [1.15157493, -1.2981518], [-0.54619583, 1.73009918],
                     [0.99272351, -0.11631728]])
    y_gt = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    idx_gt = np.array([3, 10, 11, 0, 2, 3, 5, 1, 4])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
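In current imbalanced-learn the `return_indices` flag is gone; the kept indices are exposed on the fitted sampler as `sample_indices_`. A minimal sketch, assuming imbalanced-learn >= 0.6 and hypothetical toy data in place of the test fixtures:

import numpy as np
from sklearn.neighbors import NearestNeighbors
from imblearn.under_sampling import NearMiss

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(15, 2))
y_demo = np.array([0] * 3 + [1] * 6 + [2] * 6)

nn = NearestNeighbors(n_neighbors=3)
nn3 = NearestNeighbors(n_neighbors=3)
nm3 = NearMiss(version=3, n_neighbors=nn, n_neighbors_ver3=nn3)
X_res, y_res = nm3.fit_resample(X_demo, y_demo)
idx_under = nm3.sample_indices_  # indices of the rows kept from X_demo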
Example #4
def test_nm1_fit_sample_nn_obj():
    """Test fit-sample with nn object"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    nn = NearestNeighbors(n_neighbors=3)
    nm1 = NearMiss(ratio=ratio, random_state=RND_SEED,
                   version=VERSION_NEARMISS, return_indices=True,
                   n_neighbors=nn)

    # Fit and sample
    X_resampled, y_resampled, idx_under = nm1.fit_sample(X, Y)

    X_gt = np.array([[0.91464286, 1.61369212],
                     [-0.80809175, -1.09917302],
                     [-0.20497017, -0.26630228],
                     [-0.05903827, 0.10947647],
                     [0.03142011, 0.12323596],
                     [-0.60413357, 0.24628718],
                     [0.50701028, -0.17636928],
                     [0.4960075, 0.86130762],
                     [0.45713638, 1.31069295]])
    y_gt = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    idx_gt = np.array([3, 10, 11, 2, 8, 5, 9, 1, 6])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #5
def test_nm1_sample_wrong_X():
    """Test that an error is raised when X differs between fitting
    and sampling"""

    # Create the object
    nm1 = NearMiss(random_state=RND_SEED)
    nm1.fit(X, Y)
    assert_raises(RuntimeError, nm1.sample, np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
Example #6
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""

    # Make y to be multiclass
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    nm = NearMiss(random_state=RND_SEED, version=VERSION_NEARMISS)
    X_resampled, y_resampled = nm.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled)
    assert_equal(count_y_res[0], 400)
    assert_equal(count_y_res[1], 166)
    assert_equal(count_y_res[2], 144)
Example #7
def test_nm2_fit():
    """Test the fitting method"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    nm2 = NearMiss(ratio=ratio, random_state=RND_SEED,
                   version=VERSION_NEARMISS)
    # Fit the data
    nm2.fit(X, Y)

    # Check that the class statistics have been computed
    assert_equal(nm2.min_c_, 0)
    assert_equal(nm2.maj_c_, 1)
    assert_equal(nm2.stats_c_[0], 500)
    assert_equal(nm2.stats_c_[1], 4500)
Example #8
def test_nm2_fit_sample_half():
    """Test fit and sample routines with .5 ratio"""

    # Define the parameter for the under-sampling
    ratio = .5

    # Create the object
    nm2 = NearMiss(ratio=ratio, random_state=RND_SEED,
                   version=VERSION_NEARMISS)

    # Fit and sample
    X_resampled, y_resampled = nm2.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'nm2_x_05.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'nm2_y_05.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #9
def test_nm2_fit_sample_auto_indices():
    """Test fit and sample routines with auto ratio and indices support"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    nm2 = NearMiss(ratio=ratio, random_state=RND_SEED,
                   version=VERSION_NEARMISS, return_indices=True)

    # Fit and sample
    X_resampled, y_resampled, idx_under = nm2.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'nm2_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'nm2_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'nm2_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #10
def test_nm3_fit_sample_auto():
    """Test fit and sample routines with auto ratio"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    nm3 = NearMiss(
        ratio=ratio, random_state=RND_SEED, version=VERSION_NEARMISS)

    # Fit and sample
    X_resampled, y_resampled = nm3.fit_sample(X, Y)

    X_gt = np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302],
                     [-0.20497017, -0.26630228], [1.17737838, -0.2002118],
                     [-0.60413357, 0.24628718], [0.03142011, 0.12323596],
                     [1.15157493, -1.2981518], [-0.54619583, 1.73009918],
                     [0.99272351, -0.11631728]])
    y_gt = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #11
def test_nm_wrong_nn_obj():
    sampling_strategy = 'auto'
    nn = 'rnd'
    nm = NearMiss(
        sampling_strategy=sampling_strategy,
        version=VERSION_NEARMISS,
        return_indices=True,
        n_neighbors=nn)
    with raises(ValueError, match="has to be one of"):
        nm.fit_resample(X, Y)
    nn3 = 'rnd'
    nn = NearestNeighbors(n_neighbors=3)
    nm3 = NearMiss(
        sampling_strategy=sampling_strategy,
        version=3,
        return_indices=True,
        n_neighbors=nn,
        n_neighbors_ver3=nn3)
    with raises(ValueError, match="has to be one of"):
        nm3.fit_resample(X, Y)
Example #12
    undersample_ytrain, undersample_ytest = undersample_y.iloc[train_index], undersample_y.iloc[test_index]
    
undersample_Xtrain = undersample_Xtrain.values
undersample_Xtest = undersample_Xtest.values
undersample_ytrain = undersample_ytrain.values
undersample_ytest = undersample_ytest.values 

undersample_accuracy = []
undersample_precision = []
undersample_recall = []
undersample_f1 = []
undersample_auc = []

# Implementing NearMiss Technique (or Undersampling)
# Distribution of NearMiss (just to see how it distributes the labels; we won't use these variables)
X_nearmiss, y_nearmiss = NearMiss().fit_sample(undersample_X.values, undersample_y.values)
print('NearMiss Label Distribution: {}'.format(Counter(y_nearmiss)))
# Cross Validating 

for train, test in sss.split(undersample_Xtrain, undersample_ytrain):
    undersample_pipeline = imbalanced_make_pipeline(NearMiss(sampling_strategy='majority'), log_reg) # resampling happens during cross-validation, not before
    undersample_model = undersample_pipeline.fit(undersample_Xtrain[train], undersample_ytrain[train])
    undersample_prediction = undersample_model.predict(undersample_Xtrain[test])
    
    # evaluate on the same undersampled fold that was predicted
    undersample_accuracy.append(undersample_pipeline.score(undersample_Xtrain[test], undersample_ytrain[test]))
    undersample_precision.append(precision_score(undersample_ytrain[test], undersample_prediction))
    undersample_recall.append(recall_score(undersample_ytrain[test], undersample_prediction))
    undersample_f1.append(f1_score(undersample_ytrain[test], undersample_prediction))
    undersample_auc.append(roc_auc_score(undersample_ytrain[test], undersample_prediction))
    
    print("Train:", train_index, "Validation:", test_index)
    X1_train, X1_test = X.iloc[train_index], X.iloc[test_index]
    y1_train, y1_test = y.iloc[train_index], y.iloc[test_index]
    clf.fit(X1_train, y1_train)
    prediction = clf.predict(X1_test)
    score = accuracy_score(prediction, y1_test)
    accuracy.append(score)

print(accuracy)
np.array(accuracy).mean()

# In[110]:

# using an under-sampling technique:
from imblearn.under_sampling import NearMiss
nm = NearMiss()
X_undersample, y_undersample = nm.fit_sample(X, y.ravel())

# In[111]:

X_undersample.shape, y_undersample.shape

# In[112]:

from collections import Counter
print('original shape {}'.format(Counter(y)))
print('Resampled shape {}'.format(Counter(y_undersample)))

# In[113]:

#split into 70:30 ratio
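The cell was cut off here; a plausible completion of the announced 70:30 split (a sketch, assuming the usual scikit-learn helper, with names chosen for illustration):

from sklearn.model_selection import train_test_split

X_train_us, X_test_us, y_train_us, y_test_us = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0)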
Example #14
def test_deprecation_random_state():
    nm = NearMiss(random_state=0)
    with warns(DeprecationWarning,
               match="'random_state' is deprecated from 0.4"):
        nm.fit_resample(X, Y)
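Since NearMiss selects samples deterministically, the warning disappears by simply not passing `random_state` (the parameter was removed entirely in later releases). A one-line sketch using the same module-level X, Y fixtures:

nm = NearMiss()  # no random_state: NearMiss is deterministic
X_res, y_res = nm.fit_resample(X, Y)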
Example #15
print("original training data size", traindatasize)
numofmaj = traindatasize[0]
numofmin = traindatasize[1]
y_train_arr = np.array(y_train['Class'])
X_train_arr = np.array(X_train)
A = [
    0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05,
    0.04, 0.03, 0.02, 0.01
]
for i in A:
    rat = numofmin / (numofmaj * i)
    print("---------------")
    print("ratio", i)
    print("sampling_strategy", rat)
    nm = NearMiss(sampling_strategy=rat,
                  version=1,
                  random_state=5,
                  n_neighbors=3)
    # ratio after sampling = Nmin / (Nmaj * i); n_neighbors sets how many neighbours are considered at a time
    X_train_sampled, y_train_sampled = nm.fit_sample(X_train_arr, y_train_arr)
    #print("original data size class 1: ",len(y.loc[y['Class'] == 1]))
    #print("original data size class 0: ",len(y.loc[y['Class'] == 0]))
    print("sampled training data size", collections.Counter(y_train_sampled))

    #random forest
    clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
    clf.fit(X_train_sampled, y_train_sampled)
    X_test_arr = np.array(X_test)
    y_pred = clf.predict(X_test_arr)
    print("predicted")
    print(y_pred)
    print("actual")
    )
    return model


def test_balanced_batch_generator_class_no_return_indices(data):
    with pytest.raises(ValueError, match="needs to have an attribute"):
        BalancedBatchGenerator(*data, sampler=ClusterCentroids(), batch_size=10)


@pytest.mark.filterwarnings("ignore:`wait_time` is not used")  # keras 2.2.4
@pytest.mark.parametrize(
    "sampler, sample_weight",
    [
        (None, None),
        (RandomOverSampler(), None),
        (NearMiss(), None),
        (None, np.random.uniform(size=120)),
    ],
)
def test_balanced_batch_generator_class(data, sampler, sample_weight):
    X, y = data
    model = _build_keras_model(y.shape[1], X.shape[1])
    training_generator = BalancedBatchGenerator(
        X,
        y,
        sample_weight=sample_weight,
        sampler=sampler,
        batch_size=10,
        random_state=42,
    )
    model.fit_generator(generator=training_generator, epochs=10)
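`fit_generator` is deprecated in newer Keras/TensorFlow 2 releases, where `model.fit` accepts a `Sequence` such as `BalancedBatchGenerator` directly; a hedged equivalent of the last line:

model.fit(training_generator, epochs=10)  # TF2-style equivalent (assumption)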
Example #17
x = result1.drop(['tripid', 'pickup_time', 'drop_time', 'label'], axis=1)
y = result1['label']
codes = {'correct': 1, 'incorrect': 0}
y = y.map(codes)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=42)
# scaler = StandardScaler()
# x_train_scaled=scaler.fit_transform(x_train)
# x_test_scaled=scaler.fit_transform(x_test)

sm = SMOTE(random_state=2)
x_train_smote, y_train_smote = sm.fit_sample(x_train, y_train)

nr = NearMiss()
x_train_near_miss, y_train_near_miss = nr.fit_sample(x_train, y_train)

log_regression = LogisticRegression(solver='lbfgs')
# log_regression_smote = LogisticRegression(solver='lbfgs')
# log_regression_near_miss = LogisticRegression(solver='lbfgs')
log_regression.fit(x_train, y_train)
# log_regression_smote.fit(x_train_smote,y_train_smote)
# log_regression_near_miss.fit(x_train_near_miss,y_train_near_miss)
y_pred_log_regression = log_regression.predict(x_test)
# y_pred_log_regression_smote=log_regression_smote.predict(x_test)
# y_pred_log_regression_near_miss=log_regression_near_miss.predict(x_test)
y_predict_log_regression_test_data = log_regression.predict(test)
# y_predict_log_regression_smote_test_data=log_regression_smote.predict(test)
# y_predict_log_regression_near_miss_test_data=log_regression_near_miss.predict(test)
accuracy_log_regression = accuracy_score(y_test, y_pred_log_regression)
Example #18
def sampling(**kwargs):
    X, y = kwargs['ti'].xcom_pull(task_ids='split_dataset')
    print("Under sampling")
    nm = NearMiss(random_state=42)
    x_res, y_res = nm.fit_sample(X, y)
    return x_res, y_res
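This callable would typically be registered as a task in a DAG; a hedged sketch using the classic Airflow 1.x operator (the task id and the `dag` object are illustrative assumptions):

from airflow.operators.python_operator import PythonOperator

# Hypothetical wiring; `dag` is assumed to be defined elsewhere in the file.
sampling_task = PythonOperator(task_id='sampling',
                               python_callable=sampling,
                               provide_context=True,
                               dag=dag)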
Example #19
print(__doc__)

RANDOM_STATE = 42

# Load the dataset and make it imbalanced
iris = load_iris()
X, y = make_imbalance(iris.data,
                      iris.target,
                      sampling_strategy={
                          0: 25,
                          1: 50,
                          2: 50
                      },
                      random_state=RANDOM_STATE)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=RANDOM_STATE)

print('Training target statistics: {}'.format(Counter(y_train)))
print('Testing target statistics: {}'.format(Counter(y_test)))

# Create a pipeline
pipeline = make_pipeline(NearMiss(version=2),
                         LinearSVC(random_state=RANDOM_STATE))
pipeline.fit(X_train, y_train)

# Classify and report the results
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))
Example #20
def load_dataset(db_name,
                 root_dir,
                 db_version,
                 sampler_algo,
                 batch_size,
                 num_epochs,
                 buffer_size,
                 size_cub,
                 prefetch_batchs=2000,
                 random_state=42,
                 device=""):
    # get data and predict
    if device:
        db_filename = os.path.join(root_dir,
                                   '%s%s%s' % (db_name, db_version, device))
    else:
        db_filename = os.path.join(root_dir, '%s%s' % (db_name, db_version))
    print('use dataset location: ' + db_filename)

    with bz2.BZ2File(db_filename, 'r') as sfile:
        data, labels = pickle.load(sfile)

    print('nr of samples coughing: %d' % labels.count(1))
    print('nr of samples NOT coughing: %d' % labels.count(0))
    print('nr of samples in total: %d' % len(labels))

    if sampler_algo is not None:
        if sampler_algo == "randomOverSampling":
            from imblearn.over_sampling import RandomOverSampler
            sampler = RandomOverSampler(random_state=random_state)
        elif sampler_algo == "randomUnderSampling":
            from imblearn.under_sampling import RandomUnderSampler
            sampler = RandomUnderSampler(random_state=random_state)
        elif sampler_algo == "smote":
            from imblearn.over_sampling import SMOTE
            sampler = SMOTE(random_state=random_state)
        elif sampler_algo == "nearmiss":
            from imblearn.under_sampling import NearMiss
            sampler = NearMiss(random_state=random_state)
        elif sampler_algo == "tomek":
            from imblearn.under_sampling import TomekLinks
            sampler = TomekLinks(random_state=random_state)
        elif sampler_algo == "enn":
            from imblearn.under_sampling import EditedNearestNeighbours
            sampler = EditedNearestNeighbours(random_state=random_state)
        else:
            raise Exception("unknown sampler %s" % sampler_algo)

        data, labels = sampler.fit_sample(data, labels)

    data = tf.convert_to_tensor(np.asarray(data, np.float32))
    labels = tf.convert_to_tensor(
        np.asarray(labels, np.int32).reshape((-1, 1)))

    # feed data
    print("build data pipeline")
    dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    print("cache")
    dataset = dataset.cache(filename='./' + db_name + 'cache.tf-data')
    print("shuffle")
    dataset = dataset.shuffle(buffer_size=buffer_size)
    print("repeat", num_epochs)
    dataset = dataset.repeat(count=num_epochs)
    print("batch")
    dataset = dataset.batch(batch_size)
    print("prefetch")
    dataset = dataset.prefetch(prefetch_batchs)
    iterator = dataset.make_one_shot_iterator()
    print("get next batch")
    features, labels = iterator.get_next()
    features = tf.reshape(features, [-1, 16, size_cub])
    labels = tf.reshape(labels, [-1])
    return features, labels, iterator
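Note that `make_one_shot_iterator()` is TensorFlow 1.x API. Under TensorFlow 2 the equivalent tail of this function would iterate the dataset eagerly; a sketch (an assumption, not part of the original):

# TF2-style sketch replacing the iterator tail above:
for features, labels in dataset:
    features = tf.reshape(features, [-1, 16, size_cub])
    labels = tf.reshape(labels, [-1])
    # ...consume one batch here...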
Example #21

# In[94]:


confusion_matrix(ytest,b)


# # Oversampling

# In[75]:


from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import NearMiss
nm = NearMiss()


# In[76]:


ros = RandomOverSampler()


# In[77]:


xnew, ynew = ros.fit_sample(X, y)


# In[78]:
Example #22
                           weights=[0.1, 0.9],
                           n_informative=3,
                           n_redundant=1,
                           flip_y=0,
                           n_features=20,
                           n_clusters_per_class=1,
                           n_samples=5000,
                           random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Nearmiss 1
nm1 = NearMiss(version=1)
X_resampled, y_resampled = nm1.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0],
            X_vis[y == 0, 1],
            label="Class #0",
            alpha=0.5,
            edgecolor=almost_black,
            facecolor=palette[0],
            linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0],
            X_vis[y == 1, 1],
Example #23
    pipeline = Pipeline(steps=[('t', ct), ('m', models[i])])
    # evaluate the model and store results
    scores = evaluate_model(X_train, y_train, pipeline)
    train_results.append(scores)

#Plot the results on a box and whisker plot
plt.boxplot(train_results, labels=newnames, showmeans=True)
plt.show()

#Perform Sampling
sampler1 = TomekLinks(sampling_strategy='majority')
X_tomek, y_tomek = sampler1.fit_resample(X_train, y_train)
print('TomekLinks counters')
print(Counter(y_tomek))

sampler2 = NearMiss(version=1, n_neighbors=3)
X_nearmiss, y_nearmiss = sampler2.fit_resample(X_train, y_train)
print('Near miss counters')
print(Counter(y_nearmiss))

#spot check algorithms
models, names = get_models_for_sampling()
newnames = list()
train_results = list()
test_results = list()

for i in range(len(models)):
    # evaluate the model and store results
    scores = evaluate_model(X_tomek, y_tomek, models[i])
    train_results.append(scores)
    # summarize and store
Example #24
X_train_sfs = X_train[top_features]
X_test_sfs = X_test[top_features]

X_train_sfs_scaled = X_train_sfs
X_test_sfs_scaled = X_test_sfs

#Import performance metrics, imbalanced rectifiers
from sklearn.metrics import confusion_matrix, classification_report, matthews_corrcoef
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
np.random.seed(42)  # for reproducibility since SMOTE and NearMiss use randomization

smt = SMOTE()
nr = NearMiss()


def compute_performance(model, X_train, y_train, X_test, y_test):
    start_time = timeit.default_timer()
    scores = cross_val_score(model, X_train, y_train, cv=3,
                             scoring='accuracy').mean()
    print('Accuracy: ', scores)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix: ', cm)
    cr = classification_report(y_test, y_pred)
    print('Classification Report: ', cr)
    mcc = matthews_corrcoef(y_test, y_pred)
    print('Matthews Correlation Coefficient: ', mcc)
Example #26
from feature_creation import selector, idx, df_reduced_train
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import TomekLinks, ClusterCentroids, NearMiss, CondensedNearestNeighbour, RandomUnderSampler
from imblearn.under_sampling import OneSidedSelection, InstanceHardnessThreshold
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

imbalances = [
    RandomUnderSampler(),
    TomekLinks(),
    ClusterCentroids(),
    NearMiss(version=1),
    NearMiss(version=2),
    NearMiss(version=3),
    CondensedNearestNeighbour(size_ngh=3, n_seeds_S=51),
    OneSidedSelection(size_ngh=5, n_seeds_S=51),
    InstanceHardnessThreshold(),
    RandomOverSampler(ratio='auto'),
    SMOTE(ratio='auto', kind='regular'),
    SMOTE(ratio='auto', kind='borderline1'),
    SMOTE(ratio='auto', kind='borderline2'),
    SMOTETomek(ratio='auto'),
    SMOTEENN(ratio='auto')
]

classifiers = [
    LogisticRegression(),
Example #27
                           weights=[0.1, 0.9],
                           n_informative=3,
                           n_redundant=1,
                           flip_y=0,
                           n_features=20,
                           n_clusters_per_class=1,
                           n_samples=5000,
                           random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Nearmiss 2
nm2 = NearMiss(version=2)
X_resampled, y_resampled = nm2.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0],
            X_vis[y == 0, 1],
            label="Class #0",
            alpha=0.5,
            edgecolor=almost_black,
            facecolor=palette[0],
            linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0],
            X_vis[y == 1, 1],
Example #28
        train_index], undersample_y.iloc[test_index]

undersample_Xtrain = undersample_Xtrain.values
undersample_Xtest = undersample_Xtest.values
undersample_ytrain = undersample_ytrain.values
undersample_ytest = undersample_ytest.values

undersample_accuracy = []
undersample_precision = []
undersample_recall = []
undersample_f1 = []
undersample_auc = []

# Implementing NearMiss Technique
# Distribution of NearMiss (just to see how it distributes the labels; we won't use these variables)
X_nearmiss, y_nearmiss = NearMiss().fit_sample(undersample_X.values,
                                               undersample_y.values)
print('NearMiss Label Distribution: {}'.format(Counter(y_nearmiss)))
# Cross Validating the right way

for train, test in sss.split(undersample_Xtrain, undersample_ytrain):
    undersample_pipeline = imbalanced_make_pipeline(
        NearMiss(sampling_strategy='majority'),
        log_reg)  # resampling happens during cross-validation, not before
    undersample_model = undersample_pipeline.fit(undersample_Xtrain[train],
                                                 undersample_ytrain[train])
    undersample_prediction = undersample_model.predict(
        undersample_Xtrain[test])

    # evaluate on the same undersampled fold that was predicted
    undersample_accuracy.append(
        undersample_pipeline.score(undersample_Xtrain[test],
                                   undersample_ytrain[test]))
Example #29
def test_nearmiss_wrong_version():
    """Test that an error is raised when the version is unknown."""

    version = 1000
    nm2 = NearMiss(version=version, random_state=RND_SEED)
    assert_raises(ValueError, nm2.fit_sample, X, Y)
Example #30
def sample_data(model):
    r"""Sample the training data.

    Sampling is configured in the ``model.yml`` file (data:sampling:method)
    You can learn more about resampling techniques here [IMB]_.

    Parameters
    ----------
    model : alphapy.Model
        The model object describing the data.

    Returns
    -------
    model : alphapy.Model
        The model object with the sampled data.

    """

    logger.info("Sampling Data")

    # Extract model parameters.

    sampling_method = model.specs['sampling_method']
    sampling_ratio = model.specs['sampling_ratio']
    target = model.specs['target']
    target_value = model.specs['target_value']

    # Extract model data.

    X_train = model.X_train
    y_train = model.y_train

    # Calculate the sampling ratio if one is not provided.

    if sampling_ratio > 0.0:
        ratio = sampling_ratio
    else:
        uv, uc = np.unique(y_train, return_counts=True)
        target_index = np.where(uv == target_value)[0][0]
        nontarget_index = np.where(uv != target_value)[0][0]
        ratio = (uc[nontarget_index] / uc[target_index]) - 1.0
    logger.info("Sampling Ratio for target %s [%r]: %f", target, target_value,
                ratio)

    # Choose the sampling method.

    if sampling_method == SamplingMethod.under_random:
        sampler = RandomUnderSampler()
    elif sampling_method == SamplingMethod.under_tomek:
        sampler = TomekLinks()
    elif sampling_method == SamplingMethod.under_cluster:
        sampler = ClusterCentroids()
    elif sampling_method == SamplingMethod.under_nearmiss:
        sampler = NearMiss(version=1)
    elif sampling_method == SamplingMethod.under_ncr:
        sampler = NeighbourhoodCleaningRule(size_ngh=51)
    elif sampling_method == SamplingMethod.over_random:
        sampler = RandomOverSampler(ratio=ratio)
    elif sampling_method == SamplingMethod.over_smote:
        sampler = SMOTE(ratio=ratio, kind='regular')
    elif sampling_method == SamplingMethod.over_smoteb:
        sampler = SMOTE(ratio=ratio, kind='borderline1')
    elif sampling_method == SamplingMethod.over_smotesv:
        sampler = SMOTE(ratio=ratio, kind='svm')
    elif sampling_method == SamplingMethod.overunder_smote_tomek:
        sampler = SMOTETomek(ratio=ratio)
    elif sampling_method == SamplingMethod.overunder_smote_enn:
        sampler = SMOTEENN(ratio=ratio)
    elif sampling_method == SamplingMethod.ensemble_easy:
        sampler = EasyEnsemble()
    elif sampling_method == SamplingMethod.ensemble_bc:
        sampler = BalanceCascade()
    else:
        raise ValueError("Unknown Sampling Method %s" % sampling_method)

    # Get the newly sampled features.

    X, y = sampler.fit_sample(X_train, y_train)

    logger.info("Original Samples : %d", X_train.shape[0])
    logger.info("New Samples      : %d", X.shape[0])

    # Store the new features in the model.

    model.X_train = X
    model.y_train = y

    return model
Example #31
                           initial_size=initial_size,
                           step_size=step_size)
        evl.compute_metrics()
        evl.save_to_csv_metrics()
        print("End", stream_name, method_name, time.time() - start)
    except Exception as ex:
        print(str(ex))
        traceback.print_exc()
        print("Exception in ", stream_name, method_name)


cores = open('/proc/cpuinfo').read().count('processor\t:')

methods = [
    DeterministicSamplingClassifier(oversampling=SMOTE(),
                                    undersampling=NearMiss()),
    DeterministicSamplingClassifier(),
    KMeanClustering(),
    LearnppCDS(),
    LearnppNIE(),
    REA(),
    OUSE(),
    MLPClassifier(),
]

names = [
    "DSC-S",
    "DSC-R",
    "KMeanClustering",
    "LearnppCDS",
    "LearnppNIE",
Example #32
    # define the keras model
    model = Sequential()
    model.add(Embedding(X.shape[1], 128, input_length=None))
    model.add(LSTM(128))
    model.add(Dense(y_count, activation='softmax'))

    # compile the keras model with an explicit Adam optimizer; the labels
    # are one-hot encoded (softmax output), so use categorical cross-entropy
    adam = optimizers.Adam(lr=0.02)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam, metrics=['accuracy'])

    # fit the keras model on the dataset
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # batch only the training fold so the held-out fold is not leaked
    training_generator = BalancedBatchGenerator(
        x_train, y_train, sampler=NearMiss(), batch_size=8, random_state=42)
    model.fit_generator(generator=training_generator, epochs=8, verbose=1)
    cvscores.append(model.evaluate(x_test, y_test))
    print('Model evaluation ', cvscores[-1])
    print('\n')
    cfm = confusion_matrix(np.argmax(y_test, axis=1),
                           model.predict_classes(x_test),
                           labels=[i for i in range(y_count)]
                           )
    cfms[n::] = cfm
    n += 1
    cfm = pd.DataFrame(cfm, col, col)
    print(cfm)


print('\n')
Example #33
def test_nearmiss_error(nearmiss_params, err_msg):
    nm = NearMiss(**nearmiss_params)
    with pytest.raises(ValueError, match=err_msg):
        nm.fit_resample(X, Y)
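The `nearmiss_params` / `err_msg` pairs come from a `@pytest.mark.parametrize` decorator that was cut from the snippet; a plausible reconstruction (the exact pairs are assumptions based on the error messages seen elsewhere on this page):

import pytest
from imblearn.under_sampling import NearMiss

@pytest.mark.parametrize(
    "nearmiss_params, err_msg",
    [({"version": 1000}, "must be 1, 2 or 3"),
     ({"version": 1, "n_neighbors": "rnd"}, "has to be one of")],
)
def test_nearmiss_error(nearmiss_params, err_msg):
    nm = NearMiss(**nearmiss_params)
    with pytest.raises(ValueError, match=err_msg):
        nm.fit_resample(X, Y)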
Example #34
# Import under sampling functions
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import CondensedNearestNeighbour

# Import over sampling functions
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler

# Instantiate the under sampling techniques
rus = RandomUnderSampler(random_state=42)
nm1 = NearMiss(version=1)
nm2 = NearMiss(version=2)
nm3 = NearMiss(version=3)
tl = TomekLinks()
cnn = CondensedNearestNeighbour(random_state=42)

# Instantiate the over sampling techniques
sm = SMOTE(random_state=42)
blSMOTE = BorderlineSMOTE(random_state=42)
smotenc = SMOTENC(random_state=42, categorical_features=[0,1])
adasyn = ADASYN(random_state=42)
ros = RandomOverSampler(random_state=42)

# Create a list with the resampling techniques
techniques = [rus, nm1, nm2, nm3, tl, cnn, sm, blSMOTE, smotenc, adasyn, ros]
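The `techniques` list would typically be consumed by resampling the same training set with each sampler and comparing class counts; a minimal sketch, assuming `X_train` and `y_train` exist and satisfy each sampler's requirements (e.g., SMOTENC expects columns 0 and 1 to be categorical):

from collections import Counter

# Hypothetical driver loop, not part of the original snippet.
for technique in techniques:
    X_res, y_res = technique.fit_resample(X_train, y_train)
    print(type(technique).__name__, Counter(y_res))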
Example #35
def main(args):
    # Get the connection engine
    engine = preprocessing.get_connector(user=args.user,
                                         password=args.password,
                                         host=args.host,
                                         port=args.port,
                                         database=args.database,
                                         protocol=args.protocol)
    # Fetch the tables needed for training, then merge and preprocess them all.
    table_names_train = [
        'posts_train', 'post_shared_train', 'post_comment_created_train',
        'post_liked_train', 'post_collected_train'
    ]
    tables_train = preprocessing.get_tables(engine, table_names_train)
    total_df_train = preprocessing.merge_tables(tables_train,
                                                table_names_train,
                                                how='left')
    total_df_train = preprocessing.preprocess_total_df(
        total_df_train, has_like_count_36_hour=True)
    # Start training the model
    preprocessing.print_info("TRAINING START!")
    # STEP 1: build the Pipeline
    cachedir = mkdtemp()
    pipe = Pipeline(
        steps=[
            ('resampler', 'passthrough'),
            # ('columntransformer', 'passthrough'),
            ('classifier', 'passthrough')
        ],
        memory=cachedir)
    # poly_cols = ['shared_count', 'comment_count', 'liked_count', 'collected_count']
    # col_trans = make_column_transformer((OneHotEncoder(dtype='int'), ['weekday']),
    #                                     (PolynomialFeatures(include_bias=False), poly_cols),
    #                                     remainder='passthrough')
    # STEP 2: define the hyperparameter space and scoring metrics, build the GridSearchCV
    param_grid_ada = {
        'resampler': ['passthrough', SMOTE(),
                      NearMiss()],
        # 'columntransformer': ['passthrough', col_trans],
        'classifier': [AdaBoostClassifier()],
        'classifier__n_estimators': [90, 100, 110, 120],
        'classifier__base_estimator': [
            DecisionTreeClassifier(max_depth=1),
            DecisionTreeClassifier(max_depth=2),
            DecisionTreeClassifier(max_depth=3)
        ]
    }
    param_grid_gb = {
        'resampler': ['passthrough', SMOTE(),
                      NearMiss()],
        # 'columntransformer': ['passthrough', col_trans],
        'classifier': [GradientBoostingClassifier(),
                       XGBClassifier()],
        'classifier__n_estimators': [90, 100, 110, 120],
        'classifier__learning_rate': [0.025, 0.05, 0.1]
    }
    param_grid = [param_grid_ada, param_grid_gb]
    scoring = {
        'precision': 'precision',
        'recall': 'recall',
        'specificity': make_scorer(specificity_score),
        'balanced_accuracy': 'balanced_accuracy',
        'f1_score': 'f1',
    }
    grid_search = GridSearchCV(pipe,
                               param_grid=param_grid,
                               scoring=scoring,
                               refit='f1_score',
                               n_jobs=-1,
                               cv=3,
                               return_train_score=True)
    # STEP 3: search for the best hyperparameter combination and print the elapsed time
    start_time = time()
    grid_search.fit(total_df_train.drop('is_trending', axis=1),
                    total_df_train['is_trending'])
    preprocessing.print_info(f"GRID SEARCH: {time()-start_time:.2f} secs")
    # STEP 4: save the best model and the cross-validation results
    output_path = PurePath(f'{args.output_path}')
    model_name, results_name = PurePath('best_model.h5'), PurePath(
        'cv_results.csv')
    dump(grid_search.best_estimator_, str(output_path / model_name))
    cv_results = pd.DataFrame(grid_search.cv_results_)
    cv_results.to_csv(str(output_path / results_name),
                      index=False,
                      encoding='utf8')
    rmtree(cachedir)
    preprocessing.print_info("DONE!")
Example #36
from imblearn.under_sampling import NearMiss, CondensedNearestNeighbour
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from sklearn.decomposition import PCA
#from sklearn.decomposition import TruncatedSVD as PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, PowerTransformer

from utils import generate_domain_space

PROTOTYPE = {
    "rebalance": [None, NearMiss(), CondensedNearestNeighbour(), SMOTE()],
    "normalizer": [None, StandardScaler(), PowerTransformer(), MinMaxScaler(), RobustScaler()],
    "features": [None, PCA(), SelectKBest(), FeatureUnion([("pca", PCA()), ("selectkbest", SelectKBest())])]
}

DOMAIN_SPACE = generate_domain_space(PROTOTYPE)

def get_baseline():
    baseline = {}
    for k in PROTOTYPE.keys():
        baseline[k] = ('{}_NoneType'.format(k), {})
    return baseline

def pipeline_conf_to_full_pipeline(args, algorithm, seed, algo_config):
        if args == {}:
            args = get_baseline()
        op_to_class = {'pca': PCA, 'selectkbest': SelectKBest}
        parts = ['rebalance', 'normalizer', 'features']
Example #37
palette = sns.color_palette()


# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Nearmiss 2
nm2 = NearMiss(version=2)
X_resampled, y_resampled = nm2.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
Example #38
X_resampled, y_resampled, idx_resampled = rus.fit_sample(X, Y)
X_resampled = pd.DataFrame(X_resampled)
X_resampled.columns = [
    'is_static', 'is_enum', 'uses_variables', 'call_method', 'is_interface',
    'is_local_class', 'call_external_method'
]
y_resampled = pd.DataFrame(y_resampled)
y_resampled.columns = ['is_code_smell']
undersampled_data = pd.concat([X_resampled, y_resampled], axis=1)
print("InstanceHardnessThreshold")
print(undersampled_data.describe())
undersampled_data.to_csv('../../dataset/LIC/LIC_InstanceHardnessThreshold.csv',
                         index=False)

#NearMiss
rus = NearMiss(return_indices=True)
X_resampled, y_resampled, idx_resampled = rus.fit_sample(X, Y)
X_resampled = pd.DataFrame(X_resampled)
X_resampled.columns = [
    'is_static', 'is_enum', 'uses_variables', 'call_method', 'is_interface',
    'is_local_class', 'call_external_method'
]
y_resampled = pd.DataFrame(y_resampled)
y_resampled.columns = ['is_code_smell']
undersampled_data = pd.concat([X_resampled, y_resampled], axis=1)
print("NearMiss")
print(undersampled_data.describe())
undersampled_data.to_csv('../../dataset/LIC/LIC_NearMiss.csv', index=False)

#OneSidedSelection
rus = OneSidedSelection(return_indices=True)
     
Example #39
# Dealing with imbalanced classes
if imb_class == 0:
    pass
elif imb_class == 1:
    # Oversample with SMOTE
    print('Balanced Classes Turned On')
    for i in range(0, len(skpipes)):
        skpipes[i].append(('smote'+str(i), SMOTE(random_state = rand_st)))
        
elif imb_class == 2:
    # Undersample using NearMiss
    print('Balanced Classes Turned On')
    for i in range(0, len(skpipes)):
        skpipes[i].append(('NearMiss'+str(i), NearMiss(version=3)))
    
elif imb_class == 3:
    # Undersample using RandomUnderSampler
    print('Balanced Classes Turned On')
    for i in range(0, len(skpipes)):
        skpipes[i].append(('undersample'+str(i), RandomUnderSampler()))
    

# %%
#############################################################################
#
# Feature Selection
#
##########################################
Example #40
import numpy as np
import xgboost as xgb
from hyperopt import hp
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import NearMiss

from config import random_seed
from utils.python_utils import quniform_int

steps = [
    ('undersampler', NearMiss(random_state = random_seed)),
    ('pca', PCA(n_components=50,  random_state=random_seed)),
    ('xgb', xgb.XGBClassifier(n_estimators=1000, silent=True, nthread=3, seed=random_seed))
]

model = Pipeline(steps=steps)

params_space = {
    'pca__n_components': quniform_int('n_components', 20, 200, 10),
    'xgb__max_depth': quniform_int('max_depth', 10, 30, 1),
    'xgb__min_child_weight': hp.quniform('min_child_weight', 1, 20, 1),
    'xgb__subsample': hp.uniform('subsample', 0.8, 1),
    'xgb__n_estimators': quniform_int('n_estimators', 1000, 10000, 50),
    'xgb__learning_rate': hp.loguniform('learning_rate', np.log(0.0001), np.log(0.5)) - 0.0001,
    'xgb__gamma': hp.loguniform('gamma', np.log(0.0001), np.log(5)) - 0.0001,
    'xgb__colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05)
}
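`params_space` is normally consumed by hyperopt's `fmin`; a minimal sketch of an objective built around this pipeline, assuming `X_train` and `y_train` exist (the scoring metric and `max_evals` are illustrative choices):

from hyperopt import STATUS_OK, Trials, fmin, tpe
from sklearn.model_selection import cross_val_score

def objective(params):
    # hyperopt samples quniform values as floats; cast the integer-valued
    # parameters back before handing them to the estimators.
    params = {k: (int(v) if k.endswith(('n_components', 'max_depth',
                                        'n_estimators')) else v)
              for k, v in params.items()}
    model.set_params(**params)
    score = cross_val_score(model, X_train, y_train, cv=3,
                            scoring='roc_auc').mean()
    return {'loss': -score, 'status': STATUS_OK}

trials = Trials()
best = fmin(objective, params_space, algo=tpe.suggest,
            max_evals=50, trials=trials)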
Example #41
def test_nearmiss_wrong_version():
    version = 1000
    nm = NearMiss(version=version)
    with raises(ValueError, match="must be 1, 2 or 3"):
        nm.fit_resample(X, Y)
Example #42
from imblearn.under_sampling import NearMiss


def under_sampling_nm(x, y):
	nm = NearMiss(version=3)
	x_train, y_train = nm.fit_resample(x, y)
	return x_train, y_train
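Usage is straightforward; a small hedged example on synthetic data (names are illustrative):

from collections import Counter
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(weights=[0.9, 0.1], n_samples=500,
                                     random_state=0)
X_bal, y_bal = under_sampling_nm(X_demo, y_demo)
print(Counter(y_demo), '->', Counter(y_bal))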
Example #43
# Step by Step "Fetal Health" Prediction-Detailed - explanation for the standard scaler
std_scale = StandardScaler()
X_sc = std_scale.fit_transform(X)


X_train, X_test, y_train,y_test = train_test_split(X_sc, Y, test_size=0.25, random_state=42)
print("There are total "+str(len(X_train))+" rows in training dataset")
print("There are total "+str(len(X_test))+" rows in test dataset")

smt = SMOTE()
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

tl = TomekLinks()
X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)

nm = NearMiss(version = 1)
X_train_nm, y_train_nm = nm.fit_resample(X_train, y_train)
nm2 = NearMiss(version = 2)
X_train_nm2, y_train_nm2 = nm2.fit_resample(X_train, y_train)
nm3 = NearMiss(version = 3)
X_train_nm3, y_train_nm3 = nm3.fit_resample(X_train, y_train)



def evaluate_model(clf, X_test, y_test, model_name, oversample_type):
  print('--------------------------------------------')
  print('Model ', model_name)
  print('Data Type ', oversample_type)
  y_pred = clf.predict(X_test)

  f1 = f1_score(y_test, y_pred, average='weighted')
Example #44
# 3. Under-sampling
print("> 3. Under Sampling")

print(">> 3.1 Prototype Generation")
cc = ClusterCentroids(random_state=0)
X_resampled, y_resampled = cc.fit_sample(X, y)

print(sorted(Counter(y_resampled).items()))

print(">> 3.2 Prototype Selection")
print(">>> 3.2.1 Controlled under-sampling techniques")
print(
    ">>>> 3.2.1.1 Controlled under-sampling techniques: Random Under Sampler")
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_sample(X, y)

print(sorted(Counter(y_resampled).items()))

print(">>>> 3.2.1.2 Controlled under-sampling techniques: Bootstrap")
np.vstack({tuple(row) for row in X_resampled}).shape  # number of unique resampled rows

rus = RandomUnderSampler(random_state=0, replacement=True)
X_resampled, y_resampled = rus.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

print(">>>> 3.2.1.3 Controlled under-sampling techniques: Bootstrap")
nm1 = NearMiss(random_state=0, version=1)
X_resampled_nm1, y_resampled = nm1.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))