Example #1
import numpy as np
import pytest
from imblearn.over_sampling import KMeansSMOTE


def test_sample_kmeans_not_enough_clusters():
    # Asking for 30 clusters on only 30 samples leaves no cluster with
    # enough minority samples, so resampling must fail with a RuntimeError.
    rng = np.random.RandomState(42)
    X = rng.randn(30, 2)
    y = np.array([1] * 20 + [0] * 10)

    smote = KMeansSMOTE(random_state=42, kmeans_estimator=30, k_neighbors=2)
    with pytest.raises(RuntimeError):
        smote.fit_resample(X, y)
Example #2
def test_kmeans_smote_param_error(data, density_exponent, cluster_balance_threshold):
    # `data` is a pytest fixture; density_exponent and cluster_balance_threshold
    # come from a pytest.mark.parametrize decorator stripped from this snippet.
    # Any string other than 'auto' for these parameters must raise a ValueError.
    X, y = data
    kmeans_smote = KMeansSMOTE(
        density_exponent=density_exponent,
        cluster_balance_threshold=cluster_balance_threshold,
    )
    with pytest.raises(ValueError, match="should be 'auto' when a string"):
        kmeans_smote.fit_resample(X, y)
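Several of these snippets take a `data` fixture and extra parametrized arguments whose decorators were lost when the examples were extracted. The sketch below shows what that scaffolding could look like; the fixture values and the parameter combinations are illustrative assumptions, not the originals, so the exact shape assertions in later snippets still depend on the real fixture.

import numpy as np
import pytest


@pytest.fixture
def data():
    # Illustrative small, imbalanced toy dataset; the original test module
    # defines its own hard-coded values for this fixture.
    rng = np.random.RandomState(0)
    X = rng.randn(30, 2)
    y = np.array([1] * 20 + [0] * 10)
    return X, y


# The parametrized arguments would be supplied by a decorator such as:
# @pytest.mark.parametrize(
#     "density_exponent, cluster_balance_threshold",
#     [("typo", "auto"), ("auto", "typo")],  # invalid strings trigger the ValueError
# )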
Example #3
def test_sample_kmeans_density_estimation(data, density_exponent,
                                          cluster_balance_threshold):
    # Parametrized smoke test: with valid density_exponent and
    # cluster_balance_threshold values, fit_resample must simply succeed.
    X, y = data
    smote = KMeansSMOTE(
        random_state=42,
        density_exponent=density_exponent,
        cluster_balance_threshold=cluster_balance_threshold,
    )
    smote.fit_resample(X, y)
Example #4
def test_sample_kmeans_density_estimation(density_exponent, cluster_balance_threshold):
    # Same smoke test on a larger imbalanced dataset generated on the fly
    # instead of a fixture; the parameters are again supplied by parametrize.
    X, y = make_classification(
        n_samples=10_000, n_classes=2, weights=[0.3, 0.7], random_state=42
    )
    smote = KMeansSMOTE(
        random_state=0,
        density_exponent=density_exponent,
        cluster_balance_threshold=cluster_balance_threshold,
    )
    smote.fit_resample(X, y)
Example #5
def sample(xtrain, ytrain):
    # Oversample the training data with KMeansSMOTE and restore the original
    # column names on the resampled feature matrix.
    sm = KMeansSMOTE(random_state=42)
    x_res, y_res = sm.fit_resample(xtrain, ytrain)
    x_res = pd.DataFrame(x_res)
    x_res.columns = xtrain.columns
    # y_res could likewise be wrapped in a DataFrame with a "Leak_type"
    # column, but is returned unchanged here.
    return x_res, y_res
Example #6
def over_sample_data(matrix, y_train):
    # Oversample every class except the majority one with KMeansSMOTE,
    # logging the class distribution before and after resampling.
    add_to_log('Over Sampling')
    add_to_log('Sample distribution %s' % Counter(y_train))
    b_line = KMeansSMOTE(k_neighbors=5,
                         sampling_strategy='not majority',
                         n_jobs=-1,
                         random_state=3,
                         kmeans_estimator=100)
    matrix_resampled, y_resampled = b_line.fit_resample(matrix, y_train)
    add_to_log('Resample distribution %s' % Counter(y_resampled))
    return matrix_resampled, y_resampled
Example #7
def test_sample_kmeans_custom(data, k_neighbors, kmeans_estimator):
    # k_neighbors and kmeans_estimator are parametrized as either integers or
    # estimator instances; both must be resolved into the fitted nn_k_ and
    # kmeans_estimator_ attributes after resampling.
    X, y = data
    kmeans_smote = KMeansSMOTE(
        random_state=42,
        kmeans_estimator=kmeans_estimator,
        k_neighbors=k_neighbors,
    )
    X_resampled, y_resampled = kmeans_smote.fit_resample(X, y)
    assert X_resampled.shape == (24, 2)
    assert y_resampled.shape == (24,)

    assert kmeans_smote.nn_k_.n_neighbors == 3
    assert kmeans_smote.kmeans_estimator_.n_clusters == 3
Example #8
def keans_smote(X,
                y,
                visualize=False,
                pca2d=True,
                pca3d=True,
                tsne=True,
                pie_evr=True):
    # Resample with KMeansSMOTE and optionally plot the new class histogram
    # plus PCA projections of the resampled data (the tsne flag is accepted
    # but unused in this snippet).
    sm = KMeansSMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
Example #9
def test_kmeans_smote(data):
    # With a single cluster and a zero balance threshold, KMeansSMOTE reduces
    # to plain SMOTE, so both samplers must produce identical output.
    X, y = data
    kmeans_smote = KMeansSMOTE(
        kmeans_estimator=1,
        random_state=42,
        cluster_balance_threshold=0.0,
        k_neighbors=5,
    )
    smote = SMOTE(random_state=42)

    X_res_1, y_res_1 = kmeans_smote.fit_resample(X, y)
    X_res_2, y_res_2 = smote.fit_resample(X, y)

    assert_allclose(X_res_1, X_res_2)
    assert_array_equal(y_res_1, y_res_2)

    assert kmeans_smote.nn_k_.n_neighbors == 6
    assert kmeans_smote.kmeans_estimator_.n_clusters == 1
    assert "batch_size" in kmeans_smote.kmeans_estimator_.get_params()
Example #10
def test_sample_kmeans_not_enough_clusters(data):
    # A cluster_balance_threshold of 10 can never be met (class proportions
    # are at most 1), so no valid cluster exists and a RuntimeError is raised.
    X, y = data
    smote = KMeansSMOTE(cluster_balance_threshold=10, random_state=42)
    with pytest.raises(RuntimeError):
        smote.fit_resample(X, y)
Example #11
    # Fragment from a larger training routine; the commented-out lines preserve
    # alternative oversamplers from the same script.
    #
    # border_sm = BorderlineSMOTE(k_neighbors=27, random_state=91, sampling_strategy=1)
    #
    # sm = SVMSMOTE(random_state=91, k_neighbors=2, sampling_strategy=1, svm_estimator=SVM_smote)
    #
    # ada = ADASYN(random_state=91, n_neighbors=27, sampling_strategy=1, n_jobs=6)

    Kmeans = KMeansSMOTE(random_state=91,
                         k_neighbors=2,
                         sampling_strategy=1,
                         n_jobs=6,
                         kmeans_estimator=MiniBatchKMeans(n_clusters=20))
    '''Synthetic sampling'''

    # Xtrain, ytrain = SMOTE().fit_resample(Xtrain, ytrain)
    Xtrain, ytrain = Kmeans.fit_resample(Xtrain, ytrain)

    X_train, X_val, y_train, y_val = train_test_split(Xtrain,
                                                      ytrain,
                                                      test_size=0.33,
                                                      random_state=42)
    '''Feature selection'''

    # rel_MI = SelectKBest(score_func=score_func, k=num_features)
    # Xtrain = rel_MI.fit_transform(Xtrain, ytrain)
    # Xtest = rel_MI.transform(Xtest)
    # rel_MI_support = rel_MI.get_support()
    # rel_MI_feature = X_frame.loc[:, rel_MI_support].columns.tolist()
    # rel_MI_scores = rel_MI.scores_[rel_MI_support].tolist()
    # feature_selection_df = pd.DataFrame({'Feature': rel_MI_feature, 'Score':rel_MI_scores})
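The fragment above assumes Xtrain, ytrain, MiniBatchKMeans, and train_test_split are defined earlier in the original script. Below is a self-contained sketch of the same resample-then-split step on assumed toy data; the dataset is illustrative, not the original one.

from imblearn.over_sampling import KMeansSMOTE
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Toy stand-in for the Xtrain / ytrain built earlier in the original script.
Xtrain, ytrain = make_classification(
    n_samples=10_000, n_classes=2, weights=[0.3, 0.7], random_state=42
)

kmeans = KMeansSMOTE(random_state=91,
                     k_neighbors=2,
                     sampling_strategy=1,
                     kmeans_estimator=MiniBatchKMeans(n_clusters=20))
# Oversample the minority class to a 1:1 ratio, then hold out a validation set.
Xtrain, ytrain = kmeans.fit_resample(Xtrain, ytrain)
X_train, X_val, y_train, y_val = train_test_split(Xtrain,
                                                  ytrain,
                                                  test_size=0.33,
                                                  random_state=42)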
Example #12
def keans_smote(X, y):
    # Minimal wrapper: resample X / y with KMeansSMOTE at its defaults.
    sm = KMeansSMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    return X_res, y_res
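A minimal usage sketch for the wrapper above, assuming its definition as shown; the make_classification parameters mirror the ones used in the density-estimation test earlier on this page.

from collections import Counter
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=10_000, n_classes=2, weights=[0.3, 0.7], random_state=42
)
print(Counter(y))                 # imbalanced: roughly 7,000 vs 3,000
X_res, y_res = keans_smote(X, y)  # wrapper defined above
print(Counter(y_res))             # minority class oversampled to (about) the majority count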