def test_nm1_fit_sample_half():
    """Test fit and sample routines with a .7 ratio"""

    # Expected resampled data for the ratio under test
    expected_X = np.array([
        [0.91464286, 1.61369212],
        [-0.80809175, -1.09917302],
        [-0.20497017, -0.26630228],
        [-0.05903827, 0.10947647],
        [0.03142011, 0.12323596],
        [-0.60413357, 0.24628718],
        [1.17737838, -0.2002118],
        [0.50701028, -0.17636928],
        [0.4960075, 0.86130762],
        [0.45713638, 1.31069295],
        [0.99272351, -0.11631728],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])

    # Build the sampler and resample the module-level fixture
    sampler = NearMiss(ratio=.7, random_state=RND_SEED,
                       version=VERSION_NEARMISS)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_nm_fit_resample_auto():
    """Check ``fit_resample`` with the 'auto' strategy for each version
    listed in ``VERSION_NEARMISS``."""
    # The first two versions are expected to select the same samples.
    expected_X_v1_v2 = np.array([
        [0.91464286, 1.61369212],
        [-0.80809175, -1.09917302],
        [-0.20497017, -0.26630228],
        [-0.05903827, 0.10947647],
        [0.03142011, 0.12323596],
        [-0.60413357, 0.24628718],
        [0.50701028, -0.17636928],
        [0.4960075, 0.86130762],
        [0.45713638, 1.31069295],
    ])
    expected_X_v3 = np.array([
        [0.91464286, 1.61369212],
        [-0.80809175, -1.09917302],
        [-0.20497017, -0.26630228],
        [1.17737838, -0.2002118],
        [-0.60413357, 0.24628718],
        [0.03142011, 0.12323596],
        [1.15157493, -1.2981518],
        [-0.54619583, 1.73009918],
        [0.99272351, -0.11631728],
    ])
    expected_X = [expected_X_v1_v2, expected_X_v1_v2, expected_X_v3]

    # Every version is expected to return three samples per class.
    balanced_target = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    expected_y = [balanced_target, balanced_target, balanced_target]

    for position, nm_version in enumerate(VERSION_NEARMISS):
        sampler = NearMiss(sampling_strategy='auto', version=nm_version)
        X_resampled, y_resampled = sampler.fit_resample(X, Y)
        assert_array_equal(X_resampled, expected_X[position])
        assert_array_equal(y_resampled, expected_y[position])
def test_nm3_fit_sample_nn_obj():
    """Test fit-sample when both neighbour arguments are estimator objects"""

    # Expected samples, targets and selected indices
    expected_X = np.array([
        [0.91464286, 1.61369212],
        [-0.80809175, -1.09917302],
        [-0.20497017, -0.26630228],
        [1.17737838, -0.2002118],
        [-0.60413357, 0.24628718],
        [0.03142011, 0.12323596],
        [1.15157493, -1.2981518],
        [-0.54619583, 1.73009918],
        [0.99272351, -0.11631728],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    expected_idx = np.array([3, 10, 11, 0, 2, 3, 5, 1, 4])

    # Pass NearestNeighbors instances instead of integers
    neighbours = NearestNeighbors(n_neighbors=3)
    neighbours_ver3 = NearestNeighbors(n_neighbors=3)
    sampler = NearMiss(
        ratio='auto',
        random_state=RND_SEED,
        version=VERSION_NEARMISS,
        return_indices=True,
        n_neighbors=neighbours,
        n_neighbors_ver3=neighbours_ver3)

    # Fit and sample
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_nm1_fit_sample_nn_obj():
    """Test fit-sample when ``n_neighbors`` is an estimator object"""

    # Expected samples, targets and selected indices
    expected_X = np.array([
        [0.91464286, 1.61369212],
        [-0.80809175, -1.09917302],
        [-0.20497017, -0.26630228],
        [-0.05903827, 0.10947647],
        [0.03142011, 0.12323596],
        [-0.60413357, 0.24628718],
        [0.50701028, -0.17636928],
        [0.4960075, 0.86130762],
        [0.45713638, 1.31069295],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    expected_idx = np.array([3, 10, 11, 2, 8, 5, 9, 1, 6])

    # Pass a NearestNeighbors instance instead of an integer
    neighbours = NearestNeighbors(n_neighbors=3)
    sampler = NearMiss(
        ratio='auto',
        random_state=RND_SEED,
        version=VERSION_NEARMISS,
        return_indices=True,
        n_neighbors=neighbours)

    # Fit and sample
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_nm1_sample_wrong_X():
    """Test that sampling raises an error when X differs from the X used
    at fitting"""

    # Fit on the module-level fixture
    sampler = NearMiss(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Data that does not match what the sampler was fitted on
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)

    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def test_multiclass_fit_sample():
    """Test the fit-sample method with a multiclass target"""

    # Turn the target into a multiclass one by relabelling the first
    # 1000 entries as class 2
    multiclass_y = Y.copy()
    multiclass_y[:1000] = 2

    # Resample the data
    sampler = NearMiss(random_state=RND_SEED, version=VERSION_NEARMISS)
    _, y_resampled = sampler.fit_sample(X, multiclass_y)

    # Check the number of samples kept for each class
    class_counts = Counter(y_resampled)
    assert_equal(class_counts[0], 400)
    assert_equal(class_counts[1], 166)
    assert_equal(class_counts[2], 144)
def test_nm2_fit():
    """Test that fitting computes the class statistics"""

    # Create the sampler with the 'auto' ratio and fit it
    sampler = NearMiss(ratio='auto', random_state=RND_SEED,
                       version=VERSION_NEARMISS)
    sampler.fit(X, Y)

    # Minority / majority labels found during fitting
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)

    # Number of samples recorded for each class
    class_stats = sampler.stats_c_
    assert_equal(class_stats[0], 500)
    assert_equal(class_stats[1], 4500)
def test_nm2_fit_sample_half():
    """Test fit and sample routines with .5 ratio"""

    # Create the sampler, then fit and sample
    sampler = NearMiss(ratio=.5, random_state=RND_SEED,
                       version=VERSION_NEARMISS)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # The expected output is stored in the 'data' folder next to this file
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    expected_X = np.load(os.path.join(data_dir, 'nm2_x_05.npy'))
    expected_y = np.load(os.path.join(data_dir, 'nm2_y_05.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_nm2_fit_sample_auto_indices():
    """Test fit and sample routines with auto ratio, also returning the
    selected indices"""

    # Create the sampler, then fit and sample
    sampler = NearMiss(ratio='auto', random_state=RND_SEED,
                       version=VERSION_NEARMISS, return_indices=True)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # The expected output is stored in the 'data' folder next to this file
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    expected_X = np.load(os.path.join(data_dir, 'nm2_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'nm2_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'nm2_idx.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_nm3_fit_sample_auto():
    """Test fit and sample routines with the 'auto' ratio"""

    # Expected resampled data
    expected_X = np.array([
        [0.91464286, 1.61369212],
        [-0.80809175, -1.09917302],
        [-0.20497017, -0.26630228],
        [1.17737838, -0.2002118],
        [-0.60413357, 0.24628718],
        [0.03142011, 0.12323596],
        [1.15157493, -1.2981518],
        [-0.54619583, 1.73009918],
        [0.99272351, -0.11631728],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])

    # Create the sampler, then fit and sample
    sampler = NearMiss(ratio='auto', random_state=RND_SEED,
                       version=VERSION_NEARMISS)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_nm_wrong_nn_obj():
    """Check that a ValueError is raised when a neighbours argument is
    the string 'rnd'."""
    strategy = 'auto'
    invalid_nn = 'rnd'
    expected_message = "has to be one of"

    # Invalid value passed as `n_neighbors`.
    sampler = NearMiss(
        sampling_strategy=strategy,
        version=VERSION_NEARMISS,
        return_indices=True,
        n_neighbors=invalid_nn)
    with raises(ValueError, match=expected_message):
        sampler.fit_resample(X, Y)

    # Valid `n_neighbors`, invalid `n_neighbors_ver3`, with version=3.
    sampler_v3 = NearMiss(
        sampling_strategy=strategy,
        version=3,
        return_indices=True,
        n_neighbors=NearestNeighbors(n_neighbors=3),
        n_neighbors_ver3=invalid_nn)
    with raises(ValueError, match=expected_message):
        sampler_v3.fit_resample(X, Y)
undersample_ytrain, undersample_ytest = undersample_y.iloc[train_index], undersample_y.iloc[test_index] undersample_Xtrain = undersample_Xtrain.values undersample_Xtest = undersample_Xtest.values undersample_ytrain = undersample_ytrain.values undersample_ytest = undersample_ytest.values undersample_accuracy = [] undersample_precision = [] undersample_recall = [] undersample_f1 = [] undersample_auc = [] # Implementing NearMiss Technique (or Undersampling) # Distribution of NearMiss (Just to see how it distributes the labels we won't use these variables) X_nearmiss, y_nearmiss = NearMiss().fit_sample(undersample_X.values, undersample_y.values) print('NearMiss Label Distribution: {}'.format(Counter(y_nearmiss))) # Cross Validating for train, test in sss.split(undersample_Xtrain, undersample_ytrain): undersample_pipeline = imbalanced_make_pipeline(NearMiss(sampling_strategy='majority'), log_reg) # SMOTE happens during Cross Validation not before.. undersample_model = undersample_pipeline.fit(undersample_Xtrain[train], undersample_ytrain[train]) undersample_prediction = undersample_model.predict(undersample_Xtrain[test]) undersample_accuracy.append(undersample_pipeline.score(original_Xtrain[test], original_ytrain[test])) undersample_precision.append(precision_score(original_ytrain[test], undersample_prediction)) undersample_recall.append(recall_score(original_ytrain[test], undersample_prediction)) undersample_f1.append(f1_score(original_ytrain[test], undersample_prediction)) undersample_auc.append(roc_auc_score(original_ytrain[test], undersample_prediction))
print("Train:", train_index, "Validation:", test_index) X1_train, X1_test = X.iloc[train_index], X.iloc[test_index] y1_train, y1_test = y.iloc[train_index], y.iloc[test_index] clf.fit(X1_train, y1_train) prediction = clf.predict(X1_test) score = accuracy_score(prediction, y1_test) accuracy.append(score) print(accuracy) np.array(accuracy).mean() # In[110]: #using under Sampling technique : from imblearn.under_sampling import NearMiss nm = NearMiss() X_undersample, y_undersample = nm.fit_sample(X, y.ravel()) # In[111]: X_undersample.shape, y_undersample.shape # In[112]: from collections import Counter print('original shape {}'.format(Counter(y))) print('Resampled shape {}'.format(Counter(y_undersample))) # In[113]: #split into 70:30 ratio
def test_deprecation_random_state():
    """Check that resampling with ``random_state`` set emits a
    DeprecationWarning."""
    expected_message = "'random_state' is deprecated from 0.4"
    sampler = NearMiss(random_state=0)
    with warns(DeprecationWarning, match=expected_message):
        sampler.fit_resample(X, Y)
print("original training data size", traindatasize) numofmaj = traindatasize[0] numofmin = traindatasize[1] y_train_arr = np.array(y_train['Class']) X_train_arr = np.array(X_train) A = [ 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01 ] for i in A: rat = numofmin / (numofmaj * i) print("---------------") print("ratio", i) print("sampling_strategy", rat) nm = NearMiss(sampling_strategy=rat, version=1, random_state=5, n_neighbors=3) #ratio after sampling Nmin/Mmaj n_neighbors=5 number of neighbour to be taken in consideration at a time X_train_sampled, y_train_sampled = nm.fit_sample(X_train_arr, y_train_arr) #print("original data size class 1: ",len(y.loc[y['Class'] == 1])) #print("original data size class 0: ",len(y.loc[y['Class'] == 0])) print("sampled training data size", collections.Counter(y_train_sampled)) #random forest clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0) clf.fit(X_train_sampled, y_train_sampled) X_test_arr = np.array(X_test) y_pred = clf.predict(X_test_arr) print("predicted") print(y_pred) print("actual")
) return model def test_balanced_batch_generator_class_no_return_indices(data): with pytest.raises(ValueError, match="needs to have an attribute"): BalancedBatchGenerator(*data, sampler=ClusterCentroids(), batch_size=10) @pytest.mark.filterwarnings("ignore:`wait_time` is not used") # keras 2.2.4 @pytest.mark.parametrize( "sampler, sample_weight", [ (None, None), (RandomOverSampler(), None), (NearMiss(), None), (None, np.random.uniform(size=120)), ], ) def test_balanced_batch_generator_class(data, sampler, sample_weight): X, y = data model = _build_keras_model(y.shape[1], X.shape[1]) training_generator = BalancedBatchGenerator( X, y, sample_weight=sample_weight, sampler=sampler, batch_size=10, random_state=42, ) model.fit_generator(generator=training_generator, epochs=10)
x = result1.drop(['tripid', 'pickup_time', 'drop_time', 'label'], axis=1) y = result1['label'] codes = {'correct': 1, 'incorrect': 0} y = y.map(codes) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42) # scaler = StandardScaler() # x_train_scaled=scaler.fit_transform(x_train) # x_test_scaled=scaler.fit_transform(x_test) sm = SMOTE(random_state=2) x_train_smote, y_train_smote = sm.fit_sample(x_train, y_train) nr = NearMiss() x_train_near_miss, y_train_near_miss = nr.fit_sample(x_train, y_train) log_regression = LogisticRegression(solver='lbfgs') # log_regression_smote = LogisticRegression(solver='lbfgs') # log_regression_near_miss = LogisticRegression(solver='lbfgs') log_regression.fit(x_train, y_train) # log_regression_smote.fit(x_train_smote,y_train_smote) # log_regression_near_miss.fit(x_train_near_miss,y_train_near_miss) y_pred_log_regression = log_regression.predict(x_test) # y_pred_log_regression_smote=log_regression_smote.predict(x_test) # y_pred_log_regression_near_miss=log_regression_near_miss.predict(x_test) y_predict_log_regression_test_data = log_regression.predict(test) # y_predict_log_regression_smote_test_data=log_regression_smote.predict(test) # y_predict_log_regression_near_miss_test_data=log_regression_near_miss.predict(test) accuracy_log_regression = accuracy_score(y_test, y_pred_log_regression)
def sampling(**kwargs):
    """Under-sample the dataset produced by the 'split_dataset' task.

    Parameters
    ----------
    **kwargs
        Must contain a 'ti' entry exposing ``xcom_pull``
        (presumably an Airflow task instance -- confirm against caller).

    Returns
    -------
    tuple
        The resampled features and targets.
    """
    task_instance = kwargs['ti']
    X, y = task_instance.xcom_pull(task_ids='split_dataset')
    print("Under sampling")

    under_sampler = NearMiss(random_state=42)
    resampled_X, resampled_y = under_sampler.fit_sample(X, y)
    return resampled_X, resampled_y
print(__doc__)

RANDOM_STATE = 42

# Load the iris dataset and make it imbalanced by keeping only 25 samples
# of class 0 (50 for each of the other two classes).
iris = load_iris()
X, y = make_imbalance(
    iris.data,
    iris.target,
    sampling_strategy={0: 25, 1: 50, 2: 50},
    random_state=RANDOM_STATE,
)

# Hold out part of the data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE)

print('Training target statistics: {}'.format(Counter(y_train)))
print('Testing target statistics: {}'.format(Counter(y_test)))

# Chain the NearMiss-2 under-sampler with a linear SVM and train it.
pipeline = make_pipeline(
    NearMiss(version=2),
    LinearSVC(random_state=RANDOM_STATE),
)
pipeline.fit(X_train, y_train)

# Classify the held-out data and report the results.
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))
def _make_sampler(sampler_algo, random_state):
    """Instantiate the imblearn sampler named by ``sampler_algo``.

    The imblearn classes are imported lazily so that only the requested
    sampler's module is loaded. Raises ``Exception`` for an unknown name.
    """
    if sampler_algo == "randomOverSampling":
        from imblearn.over_sampling import RandomOverSampler
        return RandomOverSampler(random_state=random_state)
    if sampler_algo == "randomUnderSampling":
        from imblearn.under_sampling import RandomUnderSampler
        return RandomUnderSampler(random_state=random_state)
    if sampler_algo == "smote":
        from imblearn.over_sampling import SMOTE
        return SMOTE(random_state=random_state)
    if sampler_algo == "nearmiss":
        from imblearn.under_sampling import NearMiss
        return NearMiss(random_state=random_state)
    if sampler_algo == "tomek":
        from imblearn.under_sampling import TomekLinks
        return TomekLinks(random_state=random_state)
    if sampler_algo == "enn":
        from imblearn.under_sampling import EditedNearestNeighbours
        return EditedNearestNeighbours(random_state=random_state)
    raise Exception("unknown sampler %s" % sampler_algo)


def load_dataset(db_name,
                 root_dir,
                 db_version,
                 sampler_algo,
                 batch_size,
                 num_epochs,
                 buffer_size,
                 size_cub,
                 prefetch_batchs=2000,
                 random_state=42,
                 device=""):
    """Load a pickled dataset, optionally resample it, and build a
    ``tf.data`` input pipeline over it.

    Parameters
    ----------
    db_name, db_version, device
        Concatenated (``device`` only when truthy) to form the file name
        looked up inside ``root_dir``.
    root_dir
        Directory containing the bz2-compressed pickle.
    sampler_algo
        ``None`` for no resampling, otherwise one of "randomOverSampling",
        "randomUnderSampling", "smote", "nearmiss", "tomek" or "enn".
    batch_size, num_epochs, buffer_size, prefetch_batchs
        Forwarded to the dataset's ``batch``, ``repeat``, ``shuffle`` and
        ``prefetch`` calls respectively.
    size_cub
        Last dimension used when reshaping features to ``[-1, 16, size_cub]``.
    random_state
        Seed handed to the sampler.

    Returns
    -------
    tuple
        ``(features, labels, iterator)`` where ``features`` and ``labels``
        come from ``iterator.get_next()`` after reshaping.

    Raises
    ------
    Exception
        If ``sampler_algo`` is not ``None`` and not a known sampler name.
    """
    # Locate the dataset file.
    file_stem = ('%s%s%s' % (db_name, db_version, device) if device
                 else '%s%s' % (db_name, db_version))
    db_filename = os.path.join(root_dir, file_stem)
    print('use dataset location: ' + db_filename)

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use this with dataset files from a trusted source.
    with bz2.BZ2File(db_filename, 'r') as sfile:
        data, labels = pickle.load(sfile)

    print('nr of samples coughing: %d' % labels.count(1))
    print('nr of samples NOT coughing: %d' % labels.count(0))
    print('nr of samples in total: %d' % len(labels))

    # Optional class re-balancing.
    if sampler_algo is not None:
        sampler = _make_sampler(sampler_algo, random_state)
        data, labels = sampler.fit_sample(data, labels)

    data = tf.convert_to_tensor(np.asarray(data, np.float32))
    label_column = np.asarray(labels, np.int32).reshape((-1, 1))
    labels = tf.convert_to_tensor(label_column)

    # Feed the data through a tf.data pipeline.
    print("build data pipeline")
    dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    print("cache")
    dataset = dataset.cache(filename='./' + db_name + 'cache.tf-data')
    print("shuffle")
    dataset = dataset.shuffle(buffer_size=buffer_size)
    print("repeat", num_epochs)
    dataset = dataset.repeat(count=num_epochs)
    print("batch")
    dataset = dataset.batch(batch_size)
    print("prefetch")
    dataset = dataset.prefetch(prefetch_batchs)

    iterator = dataset.make_one_shot_iterator()
    print("get next batch")
    features, labels = iterator.get_next()
    features = tf.reshape(features, [-1, 16, size_cub])
    labels = tf.reshape(labels, [-1])
    return features, labels, iterator
# In[94]: confusion_matrix(ytest,b) # # Oversampling # In[75]: from imblearn.over_sampling import RandomOverSampler from imblearn.under_sampling import NearMiss rs=NearMiss() # In[76]: nm=RandomOverSampler() # In[77]: xnew,ynew=nm.fit_sample(X,y) # In[78]:
weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply Nearmiss 1 nm1 = NearMiss(version=1) X_resampled, y_resampled = nm1.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1],
pipeline = Pipeline(steps=[('t', ct), ('m', models[i])]) # evaluate the model and store results scores = evaluate_model(X_train, y_train, pipeline) train_results.append(scores) #Plot the results on a box and whisker plot plt.boxplot(train_results, labels=newnames, showmeans=True) plt.show() #Perform Sampling sampler1 = TomekLinks(sampling_strategy='majority') X_enn, y_enn = sampler1.fit_resample(X_train, y_train) print('TomekLinks counters') print(Counter(y_enn)) sampler2 = NearMiss(version=1, n_neighbors=3) X_nearmiss, y_nearmiss = sampler2.fit_resample(X_train, y_train) print('Near miss counters') print(Counter(y_nearmiss)) #spot check algorithms models, names = get_models_for_sampling() newnames = list() train_results = list() test_results = list() for i in range(len(models)): # evaluate the model and store results scores = evaluate_model(X_enn, y_enn, models[i]) train_results.append(scores) # summarize and store
X_train_sfs = X_train[top_features] X_test_sfs = X_test[top_features] X_train_sfs_scaled = X_train_sfs X_test_sfs_scaled = X_test_sfs #Import performance metrics, imbalanced rectifiers from sklearn.metrics import confusion_matrix, classification_report, matthews_corrcoef from imblearn.over_sampling import SMOTE from imblearn.under_sampling import NearMiss np.random.seed( 42) #for reproducibility since SMOTE and Near Miss use randomizations smt = SMOTE() nr = NearMiss() def compute_performance(model, X_train, y_train, X_test, y_test): start_time = timeit.default_timer() scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean() 'Accuracy: ', scores model.fit(X_train, y_train) y_pred = model.predict(X_test) cm = confusion_matrix(y_test, y_pred) 'Confusion Matrix: ', cm cr = classification_report(y_test, y_pred) 'Classification Report: ', cr mcc = matthews_corrcoef(y_test, y_pred) 'Matthews Correlation Coefficient: ', mcc
def test_deprecation_random_state():
    """Check that resampling with ``random_state`` set emits a
    DeprecationWarning."""
    expected_message = "'random_state' is deprecated from 0.4"
    sampler = NearMiss(random_state=0)
    with warns(DeprecationWarning, match=expected_message):
        sampler.fit_resample(X, Y)
from feature_creation import selector, idx, df_reduced_train from imblearn.over_sampling import RandomOverSampler, SMOTE from imblearn.under_sampling import TomekLinks, ClusterCentroids, NearMiss, CondensedNearestNeighbour, RandomUnderSampler from imblearn.under_sampling import OneSidedSelection, InstanceHardnessThreshold from imblearn.combine import SMOTEENN, SMOTETomek from sklearn.linear_model import LogisticRegression, SGDClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors import KNeighborsClassifier imbalances = [ RandomUnderSampler(), TomekLinks(), ClusterCentroids(), NearMiss(version=1), NearMiss(version=2), NearMiss(version=3), CondensedNearestNeighbour(size_ngh=3, n_seeds_S=51), OneSidedSelection(size_ngh=5, n_seeds_S=51), InstanceHardnessThreshold(), RandomOverSampler(ratio='auto'), SMOTE(ratio='auto', kind='regular'), SMOTE(ratio='auto', kind='borderline1'), SMOTE(ratio='auto', kind='borderline2'), SMOTETomek(ratio='auto'), SMOTEENN(ratio='auto') ] classifiers = [ LogisticRegression(),
weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply Nearmiss 2 nm2 = NearMiss(version=2) X_resampled, y_resampled = nm2.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1],
train_index], undersample_y.iloc[test_index] undersample_Xtrain = undersample_Xtrain.values undersample_Xtest = undersample_Xtest.values undersample_ytrain = undersample_ytrain.values undersample_ytest = undersample_ytest.values undersample_accuracy = [] undersample_precision = [] undersample_recall = [] undersample_f1 = [] undersample_auc = [] # Implementing NearMiss Technique # Distribution of NearMiss (Just to see how it distributes the labels we won't use these variables) X_nearmiss, y_nearmiss = NearMiss().fit_sample(undersample_X.values, undersample_y.values) print('NearMiss Label Distribution: {}'.format(Counter(y_nearmiss))) # Cross Validating the right way for train, test in sss.split(undersample_Xtrain, undersample_ytrain): undersample_pipeline = imbalanced_make_pipeline( NearMiss(sampling_strategy='majority'), log_reg) # SMOTE happens during Cross Validation not before.. undersample_model = undersample_pipeline.fit(undersample_Xtrain[train], undersample_ytrain[train]) undersample_prediction = undersample_model.predict( undersample_Xtrain[test]) undersample_accuracy.append( undersample_pipeline.score(original_Xtrain[test], original_ytrain[test]))
def test_nearmiss_wrong_version():
    """Test that a ValueError is raised when the version is unknown."""

    # 1000 is not a version NearMiss accepts
    sampler = NearMiss(version=1000, random_state=RND_SEED)
    assert_raises(ValueError, sampler.fit_sample, X, Y)
def _select_sampler(sampling_method, ratio):
    """Return the sampler instance matching ``sampling_method``.

    ``ratio`` is only forwarded to the over-sampling and combined
    samplers. Raises ``ValueError`` for an unrecognised method.
    """
    # Under-sampling
    if sampling_method == SamplingMethod.under_random:
        return RandomUnderSampler()
    if sampling_method == SamplingMethod.under_tomek:
        return TomekLinks()
    if sampling_method == SamplingMethod.under_cluster:
        return ClusterCentroids()
    if sampling_method == SamplingMethod.under_nearmiss:
        return NearMiss(version=1)
    if sampling_method == SamplingMethod.under_ncr:
        return NeighbourhoodCleaningRule(size_ngh=51)
    # Over-sampling
    if sampling_method == SamplingMethod.over_random:
        return RandomOverSampler(ratio=ratio)
    if sampling_method == SamplingMethod.over_smote:
        return SMOTE(ratio=ratio, kind='regular')
    if sampling_method == SamplingMethod.over_smoteb:
        return SMOTE(ratio=ratio, kind='borderline1')
    if sampling_method == SamplingMethod.over_smotesv:
        return SMOTE(ratio=ratio, kind='svm')
    # Combined over- and under-sampling
    if sampling_method == SamplingMethod.overunder_smote_tomek:
        return SMOTETomek(ratio=ratio)
    if sampling_method == SamplingMethod.overunder_smote_enn:
        return SMOTEENN(ratio=ratio)
    # Ensemble methods
    if sampling_method == SamplingMethod.ensemble_easy:
        return EasyEnsemble()
    if sampling_method == SamplingMethod.ensemble_bc:
        return BalanceCascade()
    raise ValueError("Unknown Sampling Method %s" % sampling_method)


def sample_data(model):
    r"""Resample the model's training data.

    The sampling method and ratio are read from ``model.specs``; they are
    configured in the ``model.yml`` file (data:sampling:method).
    You can learn more about resampling techniques here [IMB]_.

    Parameters
    ----------
    model : alphapy.Model
        The model object describing the data.

    Returns
    -------
    model : alphapy.Model
        The same model object, with ``X_train`` and ``y_train`` replaced
        by the sampled data.

    Raises
    ------
    ValueError
        If the configured sampling method is not recognised.

    """

    logger.info("Sampling Data")

    # Extract model parameters.
    sampling_method = model.specs['sampling_method']
    sampling_ratio = model.specs['sampling_ratio']
    target = model.specs['target']
    target_value = model.specs['target_value']

    # Extract model data.
    X_train = model.X_train
    y_train = model.y_train

    # Use the configured ratio when positive; otherwise derive one from the
    # class counts of the target value and the first non-target value.
    if sampling_ratio > 0.0:
        ratio = sampling_ratio
    else:
        classes, counts = np.unique(y_train, return_counts=True)
        target_index = np.where(classes == target_value)[0][0]
        nontarget_index = np.where(classes != target_value)[0][0]
        ratio = (counts[nontarget_index] / counts[target_index]) - 1.0
    logger.info("Sampling Ratio for target %s [%r]: %f",
                target, target_value, ratio)

    # Choose the sampling method and resample the features.
    sampler = _select_sampler(sampling_method, ratio)
    X, y = sampler.fit_sample(X_train, y_train)
    logger.info("Original Samples : %d", X_train.shape[0])
    logger.info("New Samples : %d", X.shape[0])

    # Store the new features in the model.
    model.X_train = X
    model.y_train = y
    return model
initial_size=initial_size, step_size=step_size) evl.compute_metrics() evl.save_to_csv_metrics() print("End", stream_name, method_name, time.time() - start) except Exception as ex: print(str(ex)) traceback.print_exc() print("Exception in ", stream_name, method_name) cores = open('/proc/cpuinfo').read().count('processor\t:') methods = [ DeterministicSamplingClassifier(oversampling=SMOTE(), undersampling=NearMiss()), DeterministicSamplingClassifier(), KMeanClustering(), LearnppCDS(), LearnppNIE(), REA(), OUSE(), MLPClassifier(), ] names = [ "DSC-S", "DSC-R", "KMeanClustering", "LearnppCDS", "LearnppNIE",
# define the keras model model = Sequential() model.add(Embedding(X.shape[1], 128, input_length=None)) model.add(LSTM(128)) model.add(Dense(y_count, activation='softmax')) # compile the keras model sgd = optimizers.adam(lr=0.02) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # fit the keras model on the dataset x_train, x_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] training_generator = BalancedBatchGenerator( X, y, sampler=NearMiss(), batch_size=8, random_state=42) model.fit_generator(generator=training_generator, epochs=8, verbose=1) cvscores.append(model.evaluate(x_test, y_test)) print('Model evaluation ', cvscores[-1]) print('\n') cfm = confusion_matrix(np.argmax(y_test, axis=1), model.predict_classes(x_test), labels=[i for i in range(y_count)] ) cfms[n::] = cfm n += 1 cfm = pd.DataFrame(cfm, col, col) print(cfm) print('\n')
def test_nearmiss_error(nearmiss_params, err_msg):
    """Check that ``fit_resample`` raises a ValueError matching ``err_msg``
    when NearMiss is built from ``nearmiss_params``."""
    sampler = NearMiss(**nearmiss_params)
    with pytest.raises(ValueError, match=err_msg):
        sampler.fit_resample(X, Y)
# Import the under-sampling classes
from imblearn.under_sampling import (
    CondensedNearestNeighbour,
    NearMiss,
    RandomUnderSampler,
    TomekLinks,
)

# Import the over-sampling classes
from imblearn.over_sampling import (
    ADASYN,
    BorderlineSMOTE,
    RandomOverSampler,
    SMOTE,
    SMOTENC,
)

# Instantiate the under-sampling techniques (one NearMiss per version)
rus = RandomUnderSampler(random_state=42)
nm1, nm2, nm3 = NearMiss(version=1), NearMiss(version=2), NearMiss(version=3)
tl = TomekLinks()
cnn = CondensedNearestNeighbour(random_state=42)

# Instantiate the over-sampling techniques
sm = SMOTE(random_state=42)
blSMOTE = BorderlineSMOTE(random_state=42)
smotenc = SMOTENC(random_state=42, categorical_features=[0, 1])
adasyn = ADASYN(random_state=42)
ros = RandomOverSampler(random_state=42)

# Collect every resampling technique: under-samplers first, then
# over-samplers
techniques = [
    rus, nm1, nm2, nm3, tl, cnn,
    sm, blSMOTE, smotenc, adasyn, ros,
]
def main(args):
    """Train the classifiers with a grid search and save the results.

    The training tables are fetched from the database, merged and
    preprocessed, then a ``GridSearchCV`` over resampler / classifier
    combinations is fitted. The best estimator and the cross-validation
    results are written under ``args.output_path``.

    Parameters
    ----------
    args
        Object exposing the attributes ``user``, ``password``, ``host``,
        ``port``, ``database``, ``protocol`` and ``output_path``.
    """
    # Get the connection engine.
    engine = preprocessing.get_connector(user=args.user,
                                         password=args.password,
                                         host=args.host,
                                         port=args.port,
                                         database=args.database,
                                         protocol=args.protocol)

    # Fetch the tables needed for the training phase, then merge and
    # preprocess all of them.
    table_names_train = [
        'posts_train', 'post_shared_train', 'post_comment_created_train',
        'post_liked_train', 'post_collected_train'
    ]
    tables_train = preprocessing.get_tables(engine, table_names_train)
    total_df_train = preprocessing.merge_tables(tables_train,
                                                table_names_train,
                                                how='left')
    total_df_train = preprocessing.preprocess_total_df(
        total_df_train, has_like_count_36_hour=True)

    # Start training the model.
    preprocessing.print_info("TRAINING START!")

    # STEP 1: build the Pipeline.
    # The pipeline caches into a temporary directory; the try/finally
    # guarantees the directory is removed even when the search or the
    # saving step fails.
    cachedir = mkdtemp()
    try:
        pipe = Pipeline(
            steps=[
                ('resampler', 'passthrough'),
                # ('columntransformer', 'passthrough'),
                ('classifier', 'passthrough')
            ],
            memory=cachedir)

        # poly_cols = ['shared_count', 'comment_count', 'liked_count', 'collected_count']
        # col_trans = make_column_transformer((OneHotEncoder(dtype='int'), ['weekday']),
        #                                     (PolynomialFeatures(include_bias=False), poly_cols),
        #                                     remainder='passthrough')

        # STEP 2: define the hyper-parameter space and the metrics, then
        # build the GridSearchCV.
        param_grid_ada = {
            'resampler': ['passthrough', SMOTE(), NearMiss()],
            # 'columntransformer': ['passthrough', col_trans],
            'classifier': [AdaBoostClassifier()],
            'classifier__n_estimators': [90, 100, 110, 120],
            'classifier__base_estimator': [
                DecisionTreeClassifier(max_depth=1),
                DecisionTreeClassifier(max_depth=2),
                DecisionTreeClassifier(max_depth=3)
            ]
        }
        param_grid_gb = {
            'resampler': ['passthrough', SMOTE(), NearMiss()],
            # 'columntransformer': ['passthrough', col_trans],
            'classifier': [GradientBoostingClassifier(), XGBClassifier()],
            'classifier__n_estimators': [90, 100, 110, 120],
            'classifier__learning_rate': [0.025, 0.05, 0.1]
        }
        param_grid = [param_grid_ada, param_grid_gb]
        scoring = {
            'precision': 'precision',
            'recall': 'recall',
            'specificity': make_scorer(specificity_score),
            'balanced_accuracy': 'balanced_accuracy',
            'f1_score': 'f1',
        }
        grid_search = GridSearchCV(pipe,
                                   param_grid=param_grid,
                                   scoring=scoring,
                                   refit='f1_score',
                                   n_jobs=-1,
                                   cv=3,
                                   return_train_score=True)

        # STEP 3: search for the best hyper-parameter combination and
        # print the time it took.
        start_time = time()
        grid_search.fit(total_df_train.drop('is_trending', axis=1),
                        total_df_train['is_trending'])
        preprocessing.print_info(f"GRID SEARCH: {time()-start_time:.2f} secs")

        # STEP 4: save the best model and the cross-validation results.
        output_path = PurePath(f'{args.output_path}')
        model_name, results_name = PurePath('best_model.h5'), PurePath(
            'cv_results.csv')
        dump(grid_search.best_estimator_, str(output_path / model_name))
        cv_results = pd.DataFrame(grid_search.cv_results_)
        cv_results.to_csv(str(output_path / results_name),
                          index=False,
                          encoding='utf8')
    finally:
        rmtree(cachedir)
    preprocessing.print_info("DONE!")
from imblearn.under_sampling import NearMiss, CondensedNearestNeighbour
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from sklearn.decomposition import PCA
# Alternative kept for reference: use TruncatedSVD under the PCA name.
#from sklearn.decomposition import TruncatedSVD as PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, PowerTransformer

from utils import generate_domain_space

# Candidate operators for each pipeline stage. ``None`` is the first entry of
# every stage, i.e. the stage can be left out.
PROTOTYPE = {
    "rebalance": [None, NearMiss(), CondensedNearestNeighbour(), SMOTE()],
    "normalizer": [None, StandardScaler(), PowerTransformer(), MinMaxScaler(), RobustScaler()],
    "features": [None, PCA(), SelectKBest(), FeatureUnion([("pca", PCA()), ("selectkbest", SelectKBest())])]
}

# Search space derived from PROTOTYPE by the project helper.
DOMAIN_SPACE = generate_domain_space(PROTOTYPE)


def get_baseline():
    """Return the baseline configuration: every stage mapped to its no-op.

    Returns:
        dict: one entry per PROTOTYPE key, each a tuple
        ``('<stage>_NoneType', {})`` (operator label, empty parameter dict).
    """
    baseline = {}
    for k in PROTOTYPE.keys():
        baseline[k] = ('{}_NoneType'.format(k), {})
    return baseline


def pipeline_conf_to_full_pipeline(args, algorithm, seed, algo_config):
    # An empty configuration falls back to the all-no-op baseline.
    if args == {}:
        args = get_baseline()
    # Lookup from the short operator names to their classes.
    op_to_class = {'pca': PCA, 'selectkbest': SelectKBest}
    # Stage names, matching the keys of PROTOTYPE.
    parts = ['rebalance', 'normalizer', 'features']
palette = sns.color_palette()

# Build a synthetic, imbalanced two-class dataset
X, y = make_classification(
    n_samples=5000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.1, 0.9],
    class_sep=2,
    flip_y=0,
    random_state=10,
)

# Project onto two principal components purely for plotting
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Under-sample with NearMiss version 2, then reuse the fitted projection
nm2 = NearMiss(version=2)
X_resampled, y_resampled = nm2.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# One row, two panels: original data on the left, resampled on the right
f, (ax1, ax2) = plt.subplots(1, 2)

# Marker styling shared by every scatter call in this section
_marker_style = dict(alpha=0.5, edgecolor=almost_black, linewidth=0.15)

# Left panel: both classes of the original set
for _class_id, _class_label, _class_color in (
        (0, "Class #0", palette[0]),
        (1, "Class #1", palette[2]),
):
    _in_class = y == _class_id
    ax1.scatter(X_vis[_in_class, 0], X_vis[_in_class, 1],
                label=_class_label, facecolor=_class_color, **_marker_style)
ax1.set_title('Original set')

# Right panel: class 0 of the resampled set
_in_class = y_resampled == 0
ax2.scatter(X_res_vis[_in_class, 0], X_res_vis[_in_class, 1],
            label="Class #0", facecolor=palette[0], **_marker_style)
def _label_resampled(features, target):
    """Wrap resampled arrays in labelled DataFrames.

    Returns the feature frame, the target frame and their column-wise
    concatenation, in that order.
    """
    features = pd.DataFrame(features)
    features.columns = [
        'is_static', 'is_enum', 'uses_variables', 'call_method',
        'is_interface', 'is_local_class', 'call_external_method'
    ]
    target = pd.DataFrame(target)
    target.columns = ['is_code_smell']
    return features, target, pd.concat([features, target], axis=1)


# Export the result of the sampler configured just above
X_resampled, y_resampled, idx_resampled = rus.fit_sample(X, Y)
X_resampled, y_resampled, undersampled_data = _label_resampled(
    X_resampled, y_resampled)
print("InstanceHardnessThreshold")
print(undersampled_data.describe())
undersampled_data.to_csv(
    '../../dataset/LIC/LIC_InstanceHardnessThreshold.csv', index=False)

# NearMiss
rus = NearMiss(return_indices=True)
X_resampled, y_resampled, idx_resampled = rus.fit_sample(X, Y)
X_resampled, y_resampled, undersampled_data = _label_resampled(
    X_resampled, y_resampled)
print("NearMiss")
print(undersampled_data.describe())
undersampled_data.to_csv('../../dataset/LIC/LIC_NearMiss.csv', index=False)

# OneSidedSelection
rus = OneSidedSelection(return_indices=True)
# Handle class imbalance by appending a resampling step to every pipeline.
# imb_class selects the strategy; 0 leaves the pipelines untouched.
if imb_class == 0:
    pass
elif imb_class == 1:
    # Over-sample with SMOTE
    print('Balanced Classes Turned On')
    for i, pipe_steps in enumerate(skpipes):
        pipe_steps.append(('smote' + str(i), SMOTE(random_state=rand_st)))
elif imb_class == 2:
    # Under-sample with NearMiss (version 3)
    print('Balanced Classes Turned On')
    for i, pipe_steps in enumerate(skpipes):
        pipe_steps.append(('NearMiss' + str(i), NearMiss(version=3)))
elif imb_class == 3:
    # Under-sample with RandomUnderSampler
    print('Balanced Classes Turned On')
    for i, pipe_steps in enumerate(skpipes):
        pipe_steps.append(('undersample' + str(i), RandomUnderSampler()))

# %%
#############################################################################
#
# Feature Selection
#
##########################################
import numpy as np
import xgboost as xgb
from hyperopt import hp
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import NearMiss

from config import random_seed
from utils.python_utils import quniform_int

# Pipeline stages: NearMiss under-sampling -> PCA -> XGBoost classifier.
steps = [
    ('undersampler', NearMiss(random_state=random_seed)),
    ('pca', PCA(random_state=random_seed, n_components=50)),
    (
        'xgb',
        xgb.XGBClassifier(
            seed=random_seed,
            n_estimators=1000,
            nthread=3,
            silent=True,
        ),
    ),
]

model = Pipeline(steps=steps)

# hyperopt search space, keyed by ``<step>__<parameter>``.
params_space = dict(
    pca__n_components=quniform_int('n_components', 20, 200, 10),
    xgb__max_depth=quniform_int('max_depth', 10, 30, 1),
    xgb__min_child_weight=hp.quniform('min_child_weight', 1, 20, 1),
    xgb__subsample=hp.uniform('subsample', 0.8, 1),
    xgb__n_estimators=quniform_int('n_estimators', 1000, 10000, 50),
    # The log-uniform draws are shifted down by their lower bound.
    xgb__learning_rate=hp.loguniform(
        'learning_rate', np.log(0.0001), np.log(0.5)) - 0.0001,
    xgb__gamma=hp.loguniform('gamma', np.log(0.0001), np.log(5)) - 0.0001,
    xgb__colsample_bytree=hp.quniform('colsample_bytree', 0.5, 1, 0.05),
)
def test_nearmiss_wrong_version():
    """Resampling with an unsupported ``version`` must raise ValueError."""
    sampler = NearMiss(version=1000)
    with raises(ValueError, match="must be 1, 2 or 3"):
        sampler.fit_resample(X, Y)
def under_sampling_nm(x, y):
    """Under-sample ``(x, y)`` with NearMiss version 3.

    Returns:
        The resampled features and the resampled targets, as a 2-tuple.
    """
    sampler = NearMiss(version=3)
    resampled_x, resampled_y = sampler.fit_resample(x, y)
    return resampled_x, resampled_y
#Step by Step "Fetal Health" Prediction-Detailed - ekshghsh gia standard scaler std_scale = StandardScaler() X_sc = std_scale.fit_transform(X) X_train, X_test, y_train,y_test = train_test_split(X_sc, Y, test_size=0.25, random_state=42) print("There are total "+str(len(X_train))+" rows in training dataset") print("There are total "+str(len(X_test))+" rows in test dataset") smt = SMOTE() X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train) tl = TomekLinks() X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train) nm = NearMiss(version = 1) X_train_nm, y_train_nm = nm.fit_resample(X_train, y_train) nm2 = NearMiss(version = 2) X_train_nm2, y_train_nm2 = nm2.fit_resample(X_train, y_train) nm3 = NearMiss(version = 3) X_train_nm3, y_train_nm3 = nm3.fit_resample(X_train, y_train) def evaluate_model(clf, X_test, y_test, model_name, oversample_type): print('--------------------------------------------') print('Model ', model_name) print('Data Type ', oversample_type) y_pred = clf.predict(X_test) f1 = f1_score(y_test, y_pred, average='weighted')
# 3. Under sampling
# Each step resamples (X, y) with one under-sampler and prints the resulting
# per-class sample counts.
# NOTE(review): ``fit_sample`` and ``NearMiss(random_state=...)`` belong to
# older imbalanced-learn releases - confirm against the pinned version.
print("> 3. Under Sampling")

print(">> 3.1 Prototype Generation")
cc = ClusterCentroids(random_state=0)
X_resampled, y_resampled = cc.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

print(">> 3.2 Prototype Selection")
print(">>> 3.2.1 Controlled under-sampling techniques")

print(
    ">>>> 3.2.1.1 Controlled under-sampling techniques: Random Under Sampler")
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

print(">>>> 3.2.1.2 Controlled under-sampling techniques: Bootstrap")
# Shape of the distinct rows drawn without replacement. The original passed a
# set to np.vstack (not a sequence) and discarded the result.
print(np.unique(X_resampled, axis=0).shape)
rus = RandomUnderSampler(random_state=0, replacement=True)
X_resampled, y_resampled = rus.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

print(">>>> 3.2.1.3 Controlled under-sampling techniques: NearMiss")
nm1 = NearMiss(random_state=0, version=1)
X_resampled_nm1, y_resampled = nm1.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))