def test_sample_kmeans_density_estimation(data, density_exponent,
                                          cluster_balance_threshold):
    X, y = data
    smote = KMeansSMOTE(random_state=42,
                        density_exponent=density_exponent,
                        cluster_balance_threshold=cluster_balance_threshold)
    smote.fit_sample(X, y)
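The extra arguments suggest the excerpt dropped a pytest.mark.parametrize decorator. A hedged sketch of such a decorator follows; the ("auto", "auto") and (2, 0.8) value pairs are illustrative assumptions, not the library's actual parametrization.

# Hypothetical parametrization sketch for the test above; the value pairs
# are illustrative assumptions, not the original values.
import pytest

@pytest.mark.parametrize(
    "density_exponent, cluster_balance_threshold",
    [("auto", "auto"), (2, 0.8)],
)
def test_density_params_sketch(density_exponent, cluster_balance_threshold):
    # Each pair should be accepted by KMeansSMOTE's constructor.
    from imblearn.over_sampling import KMeansSMOTE
    KMeansSMOTE(density_exponent=density_exponent,
                cluster_balance_threshold=cluster_balance_threshold)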
def test_sample_kmeans_not_enough_clusters():
    rng = np.random.RandomState(42)
    X = rng.randn(30, 2)
    y = np.array([1] * 20 + [0] * 10)
    smote = KMeansSMOTE(random_state=42, kmeans_estimator=30, k_neighbors=2)
    with pytest.raises(RuntimeError):
        smote.fit_sample(X, y)
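KMeansSMOTE raises RuntimeError when no cluster contains enough minority samples to oversample; here 30 clusters over 30 points leaves one point per cluster, so nothing qualifies. A minimal sketch of a common fallback, assuming plain SMOTE as the stand-in sampler (not from the original tests):

# Sketch (assumed pattern): fall back to plain SMOTE when KMeansSMOTE
# finds no cluster suitable for oversampling.
import numpy as np
from imblearn.over_sampling import KMeansSMOTE, SMOTE

rng = np.random.RandomState(42)
X = rng.randn(30, 2)
y = np.array([1] * 20 + [0] * 10)

try:
    X_res, y_res = KMeansSMOTE(random_state=42, kmeans_estimator=30,
                               k_neighbors=2).fit_sample(X, y)
except RuntimeError:
    # With one point per cluster, no cluster is minority-dense enough.
    X_res, y_res = SMOTE(random_state=42, k_neighbors=2).fit_sample(X, y)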
def test_sample_kmeans_custom(data, k_neighbors, kmeans_estimator):
    X, y = data
    kmeans_smote = KMeansSMOTE(random_state=42,
                               kmeans_estimator=kmeans_estimator,
                               k_neighbors=k_neighbors)
    X_resampled, y_resampled = kmeans_smote.fit_sample(X, y)
    assert X_resampled.shape == (24, 2)
    assert y_resampled.shape == (24,)
    assert kmeans_smote.nn_k_.n_neighbors == 3
    assert kmeans_smote.kmeans_estimator_.n_clusters == 3
def test_kmeans_smote(data):
    X, y = data
    kmeans_smote = KMeansSMOTE(kmeans_estimator=1,
                               random_state=42,
                               cluster_balance_threshold=0.0,
                               k_neighbors=5)
    smote = SMOTE(random_state=42)

    X_res_1, y_res_1 = kmeans_smote.fit_sample(X, y)
    X_res_2, y_res_2 = smote.fit_sample(X, y)

    assert_allclose(X_res_1, X_res_2)
    assert_array_equal(y_res_1, y_res_2)

    assert kmeans_smote.nn_k_.n_neighbors == 6
    assert kmeans_smote.kmeans_estimator_.n_clusters == 1
    assert 'batch_size' in kmeans_smote.kmeans_estimator_.get_params()
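The tests above assume a pytest fixture named data. A minimal stand-in is sketched below; the values are illustrative assumptions, not the library's actual fixture, sized at 12 majority and 8 minority samples so that full balancing yields the (24, 2) shape asserted in test_sample_kmeans_custom.

# Illustrative stand-in for the `data` fixture assumed by the tests above.
# Two separated 2-D blobs: 12 majority samples (class 1) and 8 minority
# samples (class 0); balancing to 12 + 12 gives 24 samples.
import numpy as np
import pytest


@pytest.fixture
def data():
    rng = np.random.RandomState(42)
    X = np.vstack([rng.randn(12, 2) + [2, 2],   # majority blob
                   rng.randn(8, 2) - [2, 2]])   # minority blob
    y = np.array([1] * 12 + [0] * 8)
    return X, y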
X_train = imp.fit_transform(X_train)  # impute the training set
X_test = imp.transform(X_test)        # impute the test set

prep = StandardScaler()
X_train = prep.fit_transform(X_train)
X_test = prep.transform(X_test)

ops_ada = ADASYN(random_state=10)
ops_bsmote = BorderlineSMOTE(random_state=10)
ops_ksmote = KMeansSMOTE(random_state=10)
ops_rs = RandomOverSampler(random_state=10)
ops_s = SMOTE(random_state=10)

X_train_ada, y_train_ada = ops_ada.fit_sample(X_train, y_train)
X_train_bsmote, y_train_bsmote = ops_bsmote.fit_sample(X_train, y_train)
X_train_ksmote, y_train_ksmote = ops_ksmote.fit_sample(X_train, y_train)
X_train_rs, y_train_rs = ops_rs.fit_sample(X_train, y_train)
X_train_s, y_train_s = ops_s.fit_sample(X_train, y_train)

dic_ = {
    'ADASYN': [X_train_ada, y_train_ada],
    'BorderlineSMOTE': [X_train_bsmote, y_train_bsmote],
    'RandomOverSampler': [X_train_rs, y_train_rs],
    'SMOTE': [X_train_s, y_train_s]
}

for t in dic_.keys():
    print('over sampler: %s \n' % t)
    X_ = dic_[t][0]
    y_ = dic_[t][1]
    X_t = X_test
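The loop body above is cut off in the excerpt. A hypothetical sketch of how such a comparison typically continues (the classifier choice, y_test, and the scoring are assumptions, not the original code):

# Hypothetical continuation sketch: fit a classifier on each resampled
# training set and score it on the untouched test set. Assumes y_test
# exists alongside X_test from the earlier split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

for t, (X_, y_) in dic_.items():
    clf = LogisticRegression(max_iter=1000, random_state=10).fit(X_, y_)
    print('over sampler: %s' % t)
    print(classification_report(y_test, clf.predict(X_test)))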
res = adaBoost.predict(features[test_index])
bl_smote_scores['AB'] += metrics.f1_score(target[test_index], res)
bl_smote_con_mat['AB'] += confusion_matrix(y_true=target[test_index], y_pred=res)

# Gradient Boost Classifier
gradBoost = GradientBoostingClassifier(random_state=0)
gradBoost.fit(X_train, y_train)
res = gradBoost.predict(features[test_index])
bl_smote_scores['GB'] += metrics.f1_score(target[test_index], res)
bl_smote_con_mat['GB'] += confusion_matrix(y_true=target[test_index], y_pred=res)

# K-Means SMOTE
km_smote = KMeansSMOTE(random_state=0)
X_train, y_train = km_smote.fit_sample(features[train_index], target[train_index])
# unique, counts = np.unique(y_train, return_counts=True)
# print("Kmeans uni, count:", np.asarray((unique, counts)).T)

# Logistic Regression
logistic = LogisticRegression(random_state=0)
logistic.fit(X_train, y_train)
res = logistic.predict(features[test_index])
km_scores['LR'] += metrics.f1_score(target[test_index], res)
km_con_mat['LR'] += confusion_matrix(y_true=target[test_index], y_pred=res)

# Ada Boost Classifier
adaBoost = AdaBoostClassifier(random_state=0)
adaBoost.fit(X_train, y_train)
res = adaBoost.predict(features[test_index])
km_scores['AB'] += metrics.f1_score(target[test_index], res)
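The += accumulation implies this snippet runs inside a cross-validation loop. A self-contained sketch of that scaffold under stated assumptions (the toy dataset, fold count, and classifier are illustrative, not the original code):

# Hedged sketch of the assumed cross-validation scaffold: accumulate F1
# scores and confusion matrices per fold, then average. Dataset parameters
# are chosen so default KMeansSMOTE settings find usable clusters.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import KMeansSMOTE

features, target = make_classification(
    n_samples=400, n_features=2, n_informative=2, n_redundant=0,
    n_clusters_per_class=1, weights=[0.85], class_sep=2.0, random_state=0)

n_splits = 5
km_scores = {'LR': 0.0}
km_con_mat = {'LR': np.zeros((2, 2))}

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
for train_index, test_index in skf.split(features, target):
    X_tr, y_tr = KMeansSMOTE(random_state=0).fit_sample(
        features[train_index], target[train_index])
    clf = LogisticRegression(random_state=0).fit(X_tr, y_tr)
    pred = clf.predict(features[test_index])
    km_scores['LR'] += f1_score(target[test_index], pred)
    km_con_mat['LR'] += confusion_matrix(target[test_index], pred)

km_scores['LR'] /= n_splits  # mean F1 across folds
print(km_scores)
print(km_con_mat['LR'])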
y = np.ravel(y)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42)

# forest.fit(X_train, y_train)
# print("Original set\n{}".format(classification_report(y_test, forest.predict(X_test))))

pca = PCA(n_components=2)
# Fit and transform X_train to visualise inside a 2D feature space
X_vis = pca.fit_transform(X_train)

# Apply KMeansSMOTE over-sampling
ada = KMeansSMOTE(random_state=42)
X_resampled, y_resampled = ada.fit_sample(X_train, y_train)
y_resampled = np.ravel(y_resampled)
forest.fit(X_resampled, y_resampled)
print(Counter(y_resampled))
print(y_resampled.shape)
X_res_vis = pca.transform(X_resampled)

print("KMeansSMOTE\n{}".format(
    classification_report(y_test, forest.predict(X_test))))

f, (ax1, ax2) = plt.subplots(1, 2)
c0 = ax1.scatter(X_vis[y_train == 0, 0], X_vis[y_train == 0, 1],
                 label="Class #0", alpha=0.5)
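The figure code stops after the first scatter call. A hypothetical completion of the two-panel comparison, assuming the variables defined above and mirroring the snippet's plotting style (not the original code):

# Assumed continuation: finish the two-panel plot comparing the original
# and the KMeansSMOTE-resampled training sets in PCA space.
c1 = ax1.scatter(X_vis[y_train == 1, 0], X_vis[y_train == 1, 1],
                 label="Class #1", alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=0.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=0.5)
ax2.set_title('KMeansSMOTE')

f.legend((c0, c1), ('Class #0', 'Class #1'), loc='lower center', ncol=2)
plt.tight_layout()
plt.show()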
from imblearn.over_sampling import KMeansSMOTE


def kmeans_smote(x, y):
    print("----KMeans SMOTE----")
    sampler = KMeansSMOTE(random_state=42)
    X, y = sampler.fit_sample(x, y)
    return X, y
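A short usage sketch for the helper above; the toy dataset and its parameters are illustrative assumptions, not from the original project.

# Usage sketch: generate separated imbalanced blobs, then balance them
# with the kmeans_smote() helper defined above.
from collections import Counter
from sklearn.datasets import make_classification

x, y = make_classification(n_samples=500, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1,
                           weights=[0.9], class_sep=2.0, random_state=42)
print(Counter(y))        # imbalanced, roughly 9:1
X_bal, y_bal = kmeans_smote(x, y)
print(Counter(y_bal))    # classes balanced after resampling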