Example #1
def test_ncr_wrong_nn_obj():
    nn = 'rnd'
    ncr = NeighbourhoodCleaningRule(return_indices=True,
                                    random_state=RND_SEED,
                                    n_neighbors=nn)
    with raises(ValueError, match="has to be one of"):
        ncr.fit_sample(X, Y)
from imblearn.under_sampling import NeighbourhoodCleaningRule


def neighbourhood_cleaning_rule(feature_list_of_all_instances,
                                class_list_of_all_instances, neighbours):
    # Apply the neighbourhood cleaning rule and report the class balance
    # before and after resampling.
    c1 = sum(1 for label in class_list_of_all_instances if label == 1)
    c2 = sum(1 for label in class_list_of_all_instances if label == 0)
    other = len(class_list_of_all_instances) - c1 - c2
    print("     Class 1:", c1, ", class 0:", c2, ", other classes:", other)

    ncl = NeighbourhoodCleaningRule(n_neighbors=neighbours, n_jobs=4)
    X_resampled, y_resampled = ncl.fit_sample(feature_list_of_all_instances,
                                              class_list_of_all_instances)
    print("     Cleaned",
          len(feature_list_of_all_instances) - len(X_resampled),
          "points", end='')

    c1 = sum(1 for label in y_resampled if label == 1)
    c2 = sum(1 for label in y_resampled if label == 0)
    print(" leaving class 1:", c1, ", class 0:", c2,
          "for", neighbours, "neighbours")

    return X_resampled, y_resampled
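A minimal driver sketch for the function above, sweeping the neighbour count; features and labels are hypothetical names for data loaded elsewhere.

# Hypothetical usage: try several neighbour counts and compare the cleaning.
for n in range(5, 200, 5):
    X_res, y_res = neighbourhood_cleaning_rule(features, labels, n)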
Example #3
def test_ncr_fit_sample():
    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    X_gt = np.array([[-1.20809175, -1.49917302], [-0.60497017, -0.66630228],
                     [-0.91735824, 0.93110278], [0.35967591, 2.61186964],
                     [-1.55581933, 1.09609604], [1.55157493, -1.6981518]])
    y_gt = np.array([0, 0, 1, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ncr_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ncr_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ncr_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #6
def test_ncr_fit_sample_mode():
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718], [0.35967591, 2.61186964],
                     [0.90701028, -0.57636928], [-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228], [1.39272351, -0.51631728],
                     [-1.55581933, 1.09609604], [1.55157493, -1.6981518]])
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ncr_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    ncr = NeighbourhoodCleaningRule(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ncr_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ncr_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'ncr_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #8
def test_ncr_fit_sample_with_indices():
    ncr = NeighbourhoodCleaningRule(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718], [0.35967591, 2.61186964],
                     [0.90701028, -0.57636928], [-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228], [1.39272351, -0.51631728],
                     [-1.55581933, 1.09609604], [1.55157493, -1.6981518]])
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    idx_gt = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_ncr_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    X_gt = np.array([[-1.20809175, -1.49917302], [-0.60497017, -0.66630228],
                     [-0.91735824, 0.93110278], [-0.20413357, 0.64628718],
                     [0.35967591, 2.61186964], [-1.55581933, 1.09609604],
                     [1.55157493, -1.6981518]])
    y_gt = np.array([0, 0, 1, 1, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #11
def test_ncr_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    ncr = NeighbourhoodCleaningRule(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    X_gt = np.array([[-1.20809175, -1.49917302], [-0.60497017, -0.66630228],
                     [-0.91735824, 0.93110278], [-0.20413357, 0.64628718],
                     [0.35967591, 2.61186964], [-1.55581933, 1.09609604],
                     [1.55157493, -1.6981518]])
    y_gt = np.array([0, 0, 1, 1, 2, 1, 2])
    idx_gt = np.array([10, 11, 3, 5, 7, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #12
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""

    # Make y to be multiclass
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    ncr = NeighbourhoodCleaningRule(random_state=RND_SEED)
    X_resampled, y_resampled = ncr.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled)
    assert_equal(count_y_res[0], 400)
    assert_equal(count_y_res[1], 2268)
    assert_equal(count_y_res[2], 42)
Example #13
def test_ncr_fit_sample_nn_obj():
    # Resample the data
    nn = NearestNeighbors(n_neighbors=3)
    ncr = NeighbourhoodCleaningRule(return_indices=True,
                                    random_state=RND_SEED,
                                    n_neighbors=nn)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    X_gt = np.array([[-1.20809175, -1.49917302], [-0.60497017, -0.66630228],
                     [-0.91735824, 0.93110278], [0.35967591, 2.61186964],
                     [-1.55581933, 1.09609604], [1.55157493, -1.6981518]])
    y_gt = np.array([0, 0, 1, 2, 1, 2])
    idx_gt = np.array([10, 11, 3, 7, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_ncr_fit_sample_mode():
    ncr = NeighbourhoodCleaningRule(kind_sel='mode')
    X_resampled, y_resampled = ncr.fit_sample(X, Y)

    X_gt = np.array([[0.34096173, 0.50947647],
                     [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718],
                     [0.35967591, 2.61186964],
                     [0.90701028, -0.57636928],
                     [-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228],
                     [1.39272351, -0.51631728],
                     [-1.55581933, 1.09609604],
                     [1.55157493, -1.6981518]])
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ncr_fit_sample_with_indices():
    ncr = NeighbourhoodCleaningRule(return_indices=True)
    X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y)

    X_gt = np.array([[0.34096173, 0.50947647],
                     [-0.91735824, 0.93110278],
                     [-0.20413357, 0.64628718],
                     [0.35967591, 2.61186964],
                     [0.90701028, -0.57636928],
                     [-1.20809175, -1.49917302],
                     [-0.60497017, -0.66630228],
                     [1.39272351, -0.51631728],
                     [-1.55581933, 1.09609604],
                     [1.55157493, -1.6981518]])
    y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2])
    idx_gt = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #16
    rus = NeighbourhoodCleaningRule(sampling_strategy='all')
    # rus = TomekLinks(sampling_strategy='all')
    # rus = RandomUnderSampler(sampling_strategy="not minority")
    # rus = OneSidedSelection(sampling_strategy='all',n_seeds_S=1000)

    # for i in unique_labels:
    #     print("Before Class" + str(i) +  "number of samples: ", len(crops_only_flatten[crops_only_flatten==i]))
    #     file.write("Before Class" + str(i) +  "number of samples: " + str(len(crops_only_flatten[crops_only_flatten==i])) + "\n")

    # print("Before Class 0 number of samples: ", len(crops_only_flatten[crops_only_flatten==0]))
    # print("Before Class 1 number of samples: ", len(crops_only_flatten[crops_only_flatten==1]))
    # print("Before Class 2 number of samples: ", len(crops_only_flatten[crops_only_flatten==2]))
    # print("Before Class 10 number of samples: ", len(crops_only_flatten[crops_only_flatten==10]))
    # print()
    start_train = time.time()
    X_rus, y_rus = rus.fit_sample(data_array_combined_flatten,
                                  crops_only_flatten)
    end_train = time.time()
    print("Balancing time: ", end_train - start_train)
    # print()
    # print("After Class 0 number of samples: ", len(y_rus[y_rus==0]))
    # print("After Class 1 number of samples: ", len(y_rus[y_rus==1]))
    # print("After Class 2 number of samples: ", len(y_rus[y_rus==2]))
    # print("After Class 10 number of samples: ", len(y_rus[y_rus==10]))
    # print()

    file.write("Balancing time: " + str(end_train - start_train) + "\n\n")

    for i in unique_labels:
        print("Before Class " + str(i) + " number of samples: ",
              len(crops_only_flatten[crops_only_flatten == i]))
        file.write("Before Class " + str(i) + " number of samples: " +
                           n_informative=3,
                           n_redundant=1,
                           flip_y=0,
                           n_features=20,
                           n_clusters_per_class=1,
                           n_samples=200,
                           random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply neighbourhood cleaning rule
ncl = NeighbourhoodCleaningRule(return_indices=True)
X_resampled, y_resampled, idx_resampled = ncl.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled)

idx_class_0 = y_resampled == 0
plt.scatter(X_res_vis[idx_class_0, 0],
            X_res_vis[idx_class_0, 1],
            alpha=.8,
            label='Class #0')
plt.scatter(X_res_vis[~idx_class_0, 0],
            X_res_vis[~idx_class_0, 1],
            alpha=.8,
            label='Class #1')
Example #18
File: abalone.py Project: tqtifnypmb/ML
def undersampling(X, Y):
    rus = NeighbourhoodCleaningRule(ratio='majority')
    x_new, y_new = rus.fit_sample(X, Y)
    return (x_new, y_new)
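A quick usage sketch for the helper above, assuming X and Y already hold the abalone features and labels from this project's loader.

# Hypothetical call site: balance the abalone data before training.
x_new, y_new = undersampling(X, Y)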
from imblearn.under_sampling import NeighbourhoodCleaningRule

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply neighbourhood cleaning rule
ncl = NeighbourhoodCleaningRule()
X_resampled, y_resampled = ncl.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
def test_deprecation_random_state():
    ncr = NeighbourhoodCleaningRule(random_state=0)
    with warns(DeprecationWarning,
               match="'random_state' is deprecated from 0.4"):
        ncr.fit_sample(X, Y)
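The deprecation tested above tracks the 0.4 API change; for reference, a minimal sketch of the call without the deprecated parameter (fit_resample is the name that later replaced fit_sample).

# Post-0.4 style: no random_state, and fit_resample instead of fit_sample.
ncr = NeighbourhoodCleaningRule()
X_res, y_res = ncr.fit_resample(X, Y)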
Example #22
def sampling(algorithm, x_train, y_train):

    if (algorithm == 'standard'):

        print('\nUsing Standard Scaler.\n')

        scaler = StandardScaler().fit(x_train)
        X_resampled = scaler.transform(x_train)
        y_resampled = y_train

    elif(algorithm == 'undersampling'):
        # Instantiate a PCA object for the sake of easy visualisation
        pca = PCA(n_components=2)
        # Fit and transform x to visualise inside a 2D feature space
        X_vis = pca.fit_transform(x_train)

        print('\nUsing Random Under Sampling.\n')

        rus = RandomUnderSampler(return_indices=True)
        X_resampled, y_resampled, idx_resampled = rus.fit_sample(x_train, y_train)
        X_res_vis = pca.transform(X_resampled)

        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)

        idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                        idx_resampled)

        idx_class_0 = y_resampled == 0
        plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
                    alpha=.8, label='Class #0')
        plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
                    alpha=.8, label='Class #1')
        plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
                    alpha=.8, label='Removed samples')
        
        # make nice plotting
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.spines['left'].set_position(('outward', 10))
        ax.spines['bottom'].set_position(('outward', 10))
        ax.set_xlim([-6, 6])
        ax.set_ylim([-6, 6])

        plt.title('Under-sampling using random under-sampling')
        plt.legend()
        plt.tight_layout()
        plt.show()

    elif(algorithm == 'smote'):

        print('\nUsing SMOTE.\n')

        # Instantiate a PCA object for the sake of easy visualisation
        pca = PCA(n_components=2)
        # Fit and transform x to visualise inside a 2D feature space
        X_vis = pca.fit_transform(x_train)

        kinds = ['regular', 'borderline1', 'borderline2', 'svm']
        kind = [kinds[int(sys.argv[2])] if len(sys.argv) >= 3 else 'regular']
        print(kind)
        sm = [SMOTE(kind=k) for k in kind]
        X_resampled = []
        y_resampled = []
        X_res_vis = []
        for method in sm:
            X_res, y_res = method.fit_sample(x_train, y_train)
            X_resampled.append(X_res)
            y_resampled.append(y_res)
            X_res_vis.append(pca.transform(X_res))

        f, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2)
        ax2.axis('off')
        ax_res = [ax3, ax4, ax5, ax6]

        c0, c1 = plot_resampling(ax1, X_vis, y_train, 'Original set')
        for i in range(len(kind)):
            plot_resampling(ax_res[i], X_res_vis[i], y_resampled[i],
                            'SMOTE {}'.format(kind[i]))

        ax2.legend((c0, c1), ('Class #0', 'Class #1'), loc='center',
                ncol=1, labelspacing=0.)
        plt.tight_layout()
        plt.show()

    elif(algorithm == 'neighbourhood'):

        print('\nUsing Neighbourhood Cleaning Rule.\n')

        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        ncl = NeighbourhoodCleaningRule(return_indices=True)
        X_resampled, y_resampled, idx_resampled = ncl.fit_sample(x_train, y_train)
        X_res_vis = pca.transform(X_resampled)

        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)

        idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                        idx_resampled)

        idx_class_0 = y_resampled == 0
        plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
                    alpha=.8, label='Class #0')
        plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
                    alpha=.8, label='Class #1')
        plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
                    alpha=.8, label='Removed samples')

        # make nice plotting
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.spines['left'].set_position(('outward', 10))
        ax.spines['bottom'].set_position(('outward', 10))
        ax.set_xlim([-6, 6])
        ax.set_ylim([-6, 6])

        plt.title('Under-sampling using neighbourhood cleaning rule')
        plt.legend()
        plt.tight_layout()
        plt.show()

    elif(algorithm == 'ENN'):

        print('\nUsing ENN.\n')

        enn = EditedNearestNeighbours(return_indices=True)
        X_resampled, y_resampled, idx_resampled = enn.fit_sample(x_train, y_train)
        reduction_str = ('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) /
                                                        len(x_train))))
        print(reduction_str)

    elif(algorithm == 'RENN'):

        print('\nUsing RENN.\n')

        renn = RepeatedEditedNearestNeighbours(return_indices=True)
        X_resampled, y_resampled, idx_resampled = renn.fit_sample(x_train, y_train)
        reduction_str = ('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) /
                                                        len(x_train))))
        print(reduction_str)

    elif(algorithm == 'AllKNN'):

        print('\nUsing AllKNN.\n')

        allknn = AllKNN(return_indices=True)
        X_resampled, y_resampled, idx_resampled = allknn.fit_sample(x_train, y_train)
        reduction_str = ('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) /
                                                        len(x_train))))
        print(reduction_str)

    elif(algorithm == 'centroids'):

        print('\nUsing Cluster Centroids.\n')

        # Apply Cluster Centroids
        cc = ClusterCentroids()
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)

    elif(algorithm == 'centroidshard'):

        print('\nUsing Cluster Centroids with Hard Voting.\n')

        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)
        # Apply Cluster Centroids
        cc = ClusterCentroids()
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)
        X_res_vis_soft = pca.transform(X_resampled)

        # Use hard voting instead of soft voting
        cc = ClusterCentroids(voting='hard')
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)
        X_res_vis_hard = pca.transform(X_resampled)

        # Two subplots, unpack the axes array immediately
        f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

        c0 = ax1.scatter(X_vis[y_train == 0, 0], X_vis[y_train == 0, 1], label="Class #0",
                        alpha=0.5)
        c1 = ax1.scatter(X_vis[y_train == 1, 0], X_vis[y_train == 1, 1], label="Class #1",
                        alpha=0.5)
        ax1.set_title('Original set')

        ax2.scatter(X_res_vis_soft[y_resampled == 0, 0],
                    X_res_vis_soft[y_resampled == 0, 1],
                    label="Class #0", alpha=.5)
        ax2.scatter(X_res_vis_soft[y_resampled == 1, 0],
                    X_res_vis_soft[y_resampled == 1, 1],
                    label="Class #1", alpha=.5)
        ax2.scatter(X_vis[y_train == 1, 0],
                        X_vis[y_train == 1, 1], label="Original #1",
                        alpha=0.2)
        ax2.set_title('Cluster centroids with soft voting')

        ax3.scatter(X_res_vis_hard[y_resampled == 0, 0],
                    X_res_vis_hard[y_resampled == 0, 1],
                    label="Class #0", alpha=.5)
        ax3.scatter(X_res_vis_hard[y_resampled == 1, 0],
                    X_res_vis_hard[y_resampled == 1, 1],
                    label="Class #1", alpha=.5)
        ax3.scatter(X_vis[y_train == 1, 0],
                    X_vis[y_train == 1, 1],
                    alpha=0.2)
        ax3.set_title('Cluster centroids with hard voting')

        # make nice plotting
        for ax in (ax1, ax2, ax3):
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.get_xaxis().tick_bottom()
            ax.get_yaxis().tick_left()
            ax.spines['left'].set_position(('outward', 10))
            ax.spines['bottom'].set_position(('outward', 10))
            ax.set_xlim([-6, 8])
            ax.set_ylim([-6, 6])

        plt.figlegend((c0, c1), ('Class #0', 'Class #1'),
                      loc='lower center', ncol=2, labelspacing=0.)
        plt.tight_layout(pad=3)
        plt.show()

    else:

        # Unknown algorithm: return the training data unchanged.
        return x_train, y_train

    return X_resampled, y_resampled
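A usage sketch for the sampling helper above; x_train and y_train are assumed to come from an earlier train/test split.

# Hypothetical call: clean the training split with the neighbourhood rule.
X_resampled, y_resampled = sampling('neighbourhood', x_train, y_train)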
Example #23
X_resampled, y_resampled = cnn.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
# Clearly, CondensedNearestNeighbour is sensitive to noisy data and tends to add noisy samples into the set C.
# Therefore, OneSidedSelection uses the TomekLinks method to strip out the noisy data (majority-class samples) instead.
from imblearn.under_sampling import OneSidedSelection
oss = OneSidedSelection(random_state=0)
X_resampled, y_resampled = oss.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

'''
NeighbourhoodCleaningRule focuses on cleaning the data rather than condensing it.
It therefore removes the union of the samples rejected by EditedNearestNeighbours
and those rejected by a 3-NN classifier.
'''
from imblearn.under_sampling import NeighbourhoodCleaningRule
ncr = NeighbourhoodCleaningRule(random_state=0)
X_resampled, y_resampled = ncr.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
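A rough sketch of the rule just described, recomputing for the same X and y the union of the samples rejected by EditedNearestNeighbours and the majority-class samples misclassified by a 3-NN classifier (an approximation of the idea, not imblearn's exact internals).

import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import EditedNearestNeighbours

# Indices kept by ENN; everything outside them was rejected.
enn = EditedNearestNeighbours(return_indices=True)
_, _, idx_kept = enn.fit_sample(X, y)
rejected_by_enn = np.setdiff1d(np.arange(len(y)), idx_kept)

# Majority-class samples that a 3-NN classifier misclassifies.
knn = KNeighborsClassifier(n_neighbors=3).fit(X, y)
majority_class = np.bincount(y).argmax()
misclassified = np.where((knn.predict(X) != y) & (y == majority_class))[0]

# NCR removes (approximately) the union of the two sets.
to_remove = np.union1d(rejected_by_enn, misclassified)
print('samples flagged for removal:', len(to_remove))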

# InstanceHardnessThreshold is a rather special method: it fits a classifier on the data and removes the samples whose predicted probability falls below a threshold.
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import InstanceHardnessThreshold
iht = InstanceHardnessThreshold(random_state=0,
                                estimator=LogisticRegression())
X_resampled, y_resampled = iht.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
#[(0, 64), (1, 64), (2, 64)]


'''
Combining over-sampling with under-sampling:
in the SMOTE method above, over-sampling by interpolating between boundary samples and other samples can easily generate noisy data.
'''
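The snippet breaks off here; a minimal sketch of the combined approach it is introducing, assuming imblearn's SMOTEENN (SMOTE over-sampling followed by ENN cleaning) is the method the text was heading toward.

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN

# Fresh toy binary dataset (new names, to avoid clobbering X and y above).
X_toy, y_toy = make_classification(n_classes=2, weights=[0.1, 0.9],
                                   n_samples=1000, random_state=0)
# SMOTE over-sampling followed by ENN cleaning of the noisy interpolations.
smote_enn = SMOTEENN(random_state=0)
X_comb, y_comb = smote_enn.fit_sample(X_toy, y_toy)
print(sorted(Counter(y_comb).items()))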
Example #24
    def fit(self, X, y):
        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
            raise NotImplementedError('Multilabel and multi-output'
                                      ' classification is not supported.')

        if self.estimators is None or len(self.estimators) == 0:
            raise AttributeError('Invalid `estimators` attribute, `estimators`'
                                 ' should be a list of (string, estimator)'
                                 ' tuples')
        cv_predictions = []
        targets = []

        # clone the base estimators
        self.estimators_ = [
            clone(estimator) for _, estimator in self.estimators
        ]

        # add AdaBoost classifiers
        for clf in self.estimators_ada:
            self.clfs.append(
                AdaBoostClassifier(clone(clf), n_estimators=self.n_estimators))

        # add Bagging classifiers
        for clf in self.estimators_bag:
            self.clfs.append(
                BaggingClassifier(clone(clf),
                                  n_estimators=100,
                                  max_samples=0.9))

        self.clfs.append(
            StackingClassifier(classifiers=self.estimators_,
                               meta_classifier=LogisticRegression()))
        self.clfs.append(clf_expert(self.estimators))
        # evaluate the classifiers
        for clf in self.clfs:
            testpredict, testtarget = cross_val_pred2ict(clf,
                                                         X,
                                                         y,
                                                         cv=self.n_folds,
                                                         n_jobs=1)
            cv_predictions.append((testpredict))
            targets.append(testtarget)

        skf = StratifiedKFold(n_splits=2, random_state=self.random_st)

        # train and evaluate the classifiers on SMOTE- and NCR-resampled data
        for clf in self.clfs:
            for method, name in zip(self.methoda, self.name_met):
                metodaa = SMOTE(k_neighbors=3, random_state=self.random_st)
                metodaj = NeighbourhoodCleaningRule(
                    n_neighbors=3, random_state=self.random_st)

                predict_re = []
                targets_re = []
                for train_index, test_index in skf.split(X, y):

                    if method == 0:
                        data_re, tar_re = metodaa.fit_sample(
                            np.asarray(X[train_index]),
                            np.asarray(y[train_index]))
                    else:
                        data_re, tar_re = metodaj.fit_sample(
                            np.asarray(X[train_index]),
                            np.asarray(y[train_index]))

                    clf_ = clone(clf)

                    # train
                    clf_.fit(data_re, tar_re)

                    # test
                    predict_re.append(clf_.predict(X[test_index]))
                    targets_re.append(y[test_index])
                cv_predictions.append((predict_re))
                targets.append(targets_re)

        # select the two best experts
        for idx, (prediction, target) in enumerate(zip(cv_predictions,
                                                       targets)):

            matrixes1 = []
            matrixes2 = []
            for pred, tar in zip(prediction, target):
                matrixes1.append(simplefunctions.confusion_matrix(tar, pred))
            for matrix in matrixes1:
                matrixes2.append(
                    np.array([[matrix[1, 1], matrix[1, 0]],
                              [matrix[0, 1], matrix[0, 0]]]))
            fun_cmp = getattr(simplefunctions,
                              self.function_compare)(matrixes1)

            if fun_cmp > self.max_g[0]:
                self.clf_id[1] = self.clf_id[0]
                self.clf_id[0] = idx
                self.max_g[1] = self.max_g[0]
                self.max_g[0] = fun_cmp
            elif fun_cmp > self.max_g[1]:
                self.clf_id[2] = self.clf_id[1]
                self.clf_id[1] = idx
                self.max_g[2] = self.max_g[1]
                self.max_g[1] = fun_cmp
            elif fun_cmp > self.max_g[2]:
                self.clf_id[2] = idx
                self.max_g[2] = fun_cmp
        for clf_id in self.clf_id:
            if clf_id > len(self.estimators_ada) + len(self.estimators_bag):
                if clf_id % 2 == 0:
                    met = self.methods[0]
                    data_re, tar_re = met.fit_sample(X, y)
                    clf_ = clone(self.clfs[(clf_id - 7) // 2])
                    self.ensemble_.append(clf_.fit(data_re, tar_re))
                else:
                    met = self.methods[1]
                    data_re, tar_re = met.fit_sample(X, y)
                    clf_ = clone(self.clfs[(clf_id - 7) // 2])
                    self.ensemble_.append(clf_.fit(data_re, tar_re))
            else:
                clf_ = clone(self.clfs[clf_id])
                self.ensemble_.append(clf_.fit(X, y))

        meta_features = self._predict_meta_features(X)
        self.meta_clf_.fit(meta_features, y)
Example #25
# CREATE TRAIN TEST DATASETS FOR ML CLASSIFIERS
train_test_ratio = 0.2

if clean_dataset == 'yes':
    file.write("UnderSampling: " + str(clean_dataset) + "\n\n")
    total_elements = len(labels)
    background_elements = len(labels[labels != 0])
    resample_dict = {0: int(background_elements)}

    rus = NeighbourhoodCleaningRule(sampling_strategy='all')
    # rus = TomekLinks(sampling_strategy='all')
    # rus = RandomUnderSampler(sampling_strategy="not minority")
    # rus = OneSidedSelection(sampling_strategy='all',n_seeds_S=1000)

    start_train = time.time()
    X_rus, y_rus = rus.fit_sample(data, labels)
    end_train = time.time()
    print("Balancing time: ", end_train - start_train)

    file.write("Balancing time: " + str(end_train - start_train) + "\n\n")

    for i in labels_nums:
        print("Before Class " + str(i) + " number of samples: ",
              len(labels[labels == i]))
        file.write("Before Class " + str(i) + " number of samples: " +
                   str(len(labels[labels == i])) + "\n")
    file.write("\n")
    for i in labels_nums:
        print("After Class " + str(i) + " number of samples: ",
              len(y_rus[y_rus == i]))
        file.write("After Class " + str(i) + " number of samples: " +
def test_ncr_wrong_nn_obj():
    nn = 'rnd'
    ncr = NeighbourhoodCleaningRule(
        return_indices=True, n_neighbors=nn)
    with raises(ValueError, match="has to be one of"):
        ncr.fit_sample(X, Y)
Example #27
            rows[i].append(score)
        print("----------")
        print(str(clf))
        print_scores(testpredict, testtarget)

        # NCR
        ncr_ = NeighbourhoodCleaningRule(random_state=random_st, n_neighbors=3)

        scores = []

        # repeat X times and compute the average
        for iteration in range(iterations):
            predict_re = []
            targets_re = []
            for train_index, test_index in skf.split(db.data, db.target):
                data_re, tar_re = ncr_.fit_sample(db.data[train_index], db.target[train_index])
                clf_ = clone(clf)

                # train
                clf_.fit(data_re, tar_re)

                # test
                predict_re.append(clf_.predict(db.data[test_index]))
                targets_re.append(db.target[test_index])
            # compute the cross-validation score
            scores.append(accsespf1g(predict_re, targets_re))
            print("NCR")
            print(str(clf))
            print_scores(predict_re, targets_re)

        avgscores = avgaccsespf1g(scores)