def test_renn_fit_sample_mode():
    """Check ``fit_sample`` when RENN is configured with ``kind_sel='mode'``."""

    # Build the sampler around an explicit 4-neighbour estimator.
    neighbour_estimator = NearestNeighbors(n_neighbors=4)
    sampler = RepeatedEditedNearestNeighbours(
        n_neighbors=neighbour_estimator,
        random_state=RND_SEED,
        kind_sel='mode',
    )
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference output for the module-level X / Y fixture.
    expected_X = np.array([
        [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
        [-0.12840393, 0.66446571], [1.02956816, 0.36061601],
        [1.12202806, 0.33811558], [-0.35946678, 0.72510189],
        [2.94290565, -0.13986434], [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195], [-0.28479268, 0.70459548],
        [1.84864913, 0.14729596], [0.50307437, 0.498805],
        [0.84929742, 0.41042894], [0.62649535, 0.46600596],
        [1.67314371, 0.19231498], [0.98382284, 0.37184502],
        [0.69804044, 0.44810796], [1.32319756, -0.13181616],
        [0.04296502, -0.37981873], [0.28294738, -1.00125525],
        [0.34218094, -0.58781961], [0.2096964, -0.61814058],
        [1.59068979, -0.96622933], [0.73418199, -0.02222847],
        [0.79270821, -0.41386668], [1.16606871, -0.25641059],
        [1.0304995, -0.16955962], [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
        [0.80541964, -0.34465185], [0.1732627, -1.61323172],
    ])
    # 4 samples of class 0, 15 of class 1 and 15 of class 2.
    expected_y = np.array([0] * 4 + [1] * 15 + [2] * 15)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_renn_fit_sample():
    """Check ``fit_sample`` of RENN with its default neighbour settings."""

    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference output for the module-level X / Y fixture.
    expected_X = np.array([
        [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601], [1.12202806, 0.33811558],
        [0.73489726, 0.43915195], [0.50307437, 0.498805],
        [0.84929742, 0.41042894], [0.62649535, 0.46600596],
        [0.98382284, 0.37184502], [0.69804044, 0.44810796],
        [0.04296502, -0.37981873], [0.28294738, -1.00125525],
        [0.34218094, -0.58781961], [0.2096964, -0.61814058],
        [1.59068979, -0.96622933], [0.73418199, -0.02222847],
        [0.79270821, -0.41386668], [1.16606871, -0.25641059],
        [1.0304995, -0.16955962], [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
        [0.80541964, -0.34465185], [0.1732627, -1.61323172],
    ])
    # 4 samples of class 0, 8 of class 1 and 14 of class 2.
    expected_y = np.array([0] * 4 + [1] * 8 + [2] * 14)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_renn_fit_sample_mode():
    """Resample with ``kind_sel='mode'`` and compare against stored output."""
    knn = NearestNeighbors(n_neighbors=4)
    resampler = RepeatedEditedNearestNeighbours(
        n_neighbors=knn, random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = resampler.fit_sample(X, Y)

    # Expected samples, listed two per row.
    X_expected = np.array([
        [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
        [-0.12840393, 0.66446571], [1.02956816, 0.36061601],
        [1.12202806, 0.33811558], [-0.35946678, 0.72510189],
        [2.94290565, -0.13986434], [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195], [-0.28479268, 0.70459548],
        [1.84864913, 0.14729596], [0.50307437, 0.498805],
        [0.84929742, 0.41042894], [0.62649535, 0.46600596],
        [1.67314371, 0.19231498], [0.98382284, 0.37184502],
        [0.69804044, 0.44810796], [1.32319756, -0.13181616],
        [0.04296502, -0.37981873], [0.28294738, -1.00125525],
        [0.34218094, -0.58781961], [0.2096964, -0.61814058],
        [1.59068979, -0.96622933], [0.73418199, -0.02222847],
        [0.79270821, -0.41386668], [1.16606871, -0.25641059],
        [1.0304995, -0.16955962], [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
        [0.80541964, -0.34465185], [0.1732627, -1.61323172],
    ])
    y_expected = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])

    assert_array_equal(X_resampled, X_expected)
    assert_array_equal(y_resampled, y_expected)
def test_renn_fit_sample_with_indices():
    """Check the samples, labels and kept indices when ``return_indices=True``."""
    sampler = RepeatedEditedNearestNeighbours(
        return_indices=True,
        random_state=RND_SEED,
    )
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601], [1.12202806, 0.33811558],
        [0.73489726, 0.43915195], [0.50307437, 0.498805],
        [0.84929742, 0.41042894], [0.62649535, 0.46600596],
        [0.98382284, 0.37184502], [0.69804044, 0.44810796],
        [0.04296502, -0.37981873], [0.28294738, -1.00125525],
        [0.34218094, -0.58781961], [0.2096964, -0.61814058],
        [1.59068979, -0.96622933], [0.73418199, -0.02222847],
        [0.79270821, -0.41386668], [1.16606871, -0.25641059],
        [1.0304995, -0.16955962], [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
        [0.80541964, -0.34465185], [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])
    # Indices are listed class by class, in the same grouping as expected_y.
    expected_idx = np.array([
        6, 13, 32, 39,
        4, 5, 16, 22, 23, 24, 30, 37,
        2, 11, 12, 17, 20, 21, 25, 26, 28, 31, 33, 34, 35, 36,
    ])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
示例#5
0
文件: sampler.py 项目: jooglyp/ml101
    def rnn_undersampling(
            self, x: pandas.DataFrame, y: numpy.ndarray,
            neighbors: int) -> typing.Tuple[numpy.ndarray, numpy.ndarray]:
        """
        Under-sample with Repeated Edited Nearest Neighbours.

        Args:
            x: X training covariates for the ML model; first passed through
                ``self.check_id``.
            y: y training binary outcomes of the ML model.
            neighbors: value forwarded to the sampler as ``n_neighbors``.

        Returns: the resampled covariates and outcomes produced by the
            sampler's ``fit_sample``; the indices it also returns are dropped.

        """
        checked_x = self.check_id(x)

        sampler_options = dict(
            random_state=82,
            n_neighbors=neighbors,
            return_indices=True,
            kind_sel="mode",
            max_iter=400,
            ratio="majority",
        )
        sampler = RepeatedEditedNearestNeighbours(**sampler_options)

        # The sampler is fitted on deep copies of both inputs.
        X_resampled, y_resampled, _ = sampler.fit_sample(
            copy.deepcopy(checked_x), copy.deepcopy(y))

        # Log each resampled array followed by its length.
        reports = (
            (X_resampled,
             "RNN undersampling yielded {} number of X_resampled observations"),
            (y_resampled,
             "RNN undersampling yielded {} number of y_resampled observations"),
        )
        for resampled, message in reports:
            LOGGER.info(resampled)
            LOGGER.info(message.format(len(resampled)))

        return X_resampled, y_resampled
def test_renn_fit_sample_with_indices():
    """Verify RENN output, including the indices of the retained samples."""

    # Ask the sampler to return the kept indices as a third value.
    renn_sampler = RepeatedEditedNearestNeighbours(
        return_indices=True, random_state=RND_SEED)
    resampled = renn_sampler.fit_sample(X, Y)
    X_resampled, y_resampled, idx_under = resampled

    X_reference = np.array([
        [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601], [1.12202806, 0.33811558],
        [0.73489726, 0.43915195], [0.50307437, 0.498805],
        [0.84929742, 0.41042894], [0.62649535, 0.46600596],
        [0.98382284, 0.37184502], [0.69804044, 0.44810796],
        [0.04296502, -0.37981873], [0.28294738, -1.00125525],
        [0.34218094, -0.58781961], [0.2096964, -0.61814058],
        [1.59068979, -0.96622933], [0.73418199, -0.02222847],
        [0.79270821, -0.41386668], [1.16606871, -0.25641059],
        [1.0304995, -0.16955962], [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
        [0.80541964, -0.34465185], [0.1732627, -1.61323172],
    ])
    # 4 samples of class 0, 8 of class 1 and 14 of class 2.
    y_reference = np.array([0] * 4 + [1] * 8 + [2] * 14)
    idx_reference = np.array([
        6, 13, 32, 39, 4, 5, 16, 22, 23, 24, 30, 37, 2,
        11, 12, 17, 20, 21, 25, 26, 28, 31, 33, 34, 35, 36,
    ])

    assert_array_equal(X_resampled, X_reference)
    assert_array_equal(y_resampled, y_reference)
    assert_array_equal(idx_under, idx_reference)
示例#7
0
def test_renn_fit_sample():
    """Compare RENN ``fit_sample`` output with the stored ``.npy`` references."""

    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    here = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(here, 'data')
    expected_X = np.load(os.path.join(data_dir, 'renn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'renn_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_renn_fit_sample():
    """Resample with RENN and check the result against saved arrays."""

    resampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = resampler.fit_sample(X, Y)

    # Load the references in the same order as the original: X first, then y.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    x_path = os.path.join(module_dir, 'data', 'renn_x.npy')
    X_reference = np.load(x_path)
    y_path = os.path.join(module_dir, 'data', 'renn_y.npy')
    y_reference = np.load(y_path)

    assert_array_equal(X_resampled, X_reference)
    assert_array_equal(y_resampled, y_reference)
示例#9
0
def test_renn_fit_sample_with_indices():
    """Check samples, labels and indices against the stored ``.npy`` files."""

    sampler = RepeatedEditedNearestNeighbours(
        return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'renn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'renn_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'renn_idx.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_renn_fit_sample_with_indices():
    """Verify RENN with ``return_indices=True`` against saved references."""

    resampler = RepeatedEditedNearestNeighbours(
        return_indices=True,
        random_state=RND_SEED,
    )
    X_resampled, y_resampled, idx_under = resampler.fit_sample(X, Y)

    module_dir = os.path.dirname(os.path.abspath(__file__))

    def load_reference(file_name):
        """Load one reference array from the sibling ``data`` directory."""
        return np.load(os.path.join(module_dir, 'data', file_name))

    X_reference = load_reference('renn_x.npy')
    y_reference = load_reference('renn_y.npy')
    idx_reference = load_reference('renn_idx.npy')

    assert_array_equal(X_resampled, X_reference)
    assert_array_equal(y_resampled, y_reference)
    assert_array_equal(idx_under, idx_reference)
def test_multiclass_fit_sample():
    """Check per-class counts after RENN resampling of a three-class target."""

    # Relabel the first 1000 targets as class 2 on a copy of the fixture.
    multiclass_y = Y.copy()
    multiclass_y[0:1000] = 2

    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, multiclass_y)

    # Expected number of samples left in each class.
    class_counts = Counter(y_resampled)
    for label, expected_count in ((0, 378), (1, 1828), (2, 5)):
        assert_equal(class_counts[label], expected_count)
def test_renn_fit_sample():
    """Check RENN ``fit_sample`` output when no ``random_state`` is given."""
    sampler = RepeatedEditedNearestNeighbours()
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601], [1.12202806, 0.33811558],
        [0.73489726, 0.43915195], [0.50307437, 0.498805],
        [0.84929742, 0.41042894], [0.62649535, 0.46600596],
        [0.98382284, 0.37184502], [0.69804044, 0.44810796],
        [0.04296502, -0.37981873], [0.28294738, -1.00125525],
        [0.34218094, -0.58781961], [0.2096964, -0.61814058],
        [1.59068979, -0.96622933], [0.73418199, -0.02222847],
        [0.79270821, -0.41386668], [1.16606871, -0.25641059],
        [1.0304995, -0.16955962], [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
        [0.80541964, -0.34465185], [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
示例#13
0
def test_renn_iter_wrong():
    """``fit_sample`` must raise ``ValueError`` when ``max_iter`` is -1."""
    # The sampler is built outside the context manager, so only the
    # ``fit_sample`` call is expected to raise.
    sampler = RepeatedEditedNearestNeighbours(max_iter=-1)
    with raises(ValueError):
        sampler.fit_sample(X, Y)
            continue
        b_ref_dict[B] = [c_id, c_score, ref_dict_B]
    ref_by_b = selectTopKref(b_ref_dict)
    ref_overlap_A = overlapWithA(ref_by_b, ref_dict_A)
    return ref_by_b, ref_overlap_A, ref_dict_A


# 14608(0.7,yes) 14895(0.047,no)
# Load the persisted gensim dictionary and TF-IDF model from the working
# directory, then the "no" / "yes" feature tables.
dictionary = gensim.corpora.Dictionary.load("dictionary")
tf_idf = gensim.models.TfidfModel.load("tf_idf")
X_no, y_no = getXY("features_no.csv")
X_yes, y_yes = getXY("features_yes.csv")

training_testing_variation = 0.3

wiki_As, train_yes, train_no = getTrainingTestingData("features9.csv",
                                                      len(X_yes),
                                                      scale=1)

# Each message is one pre-formatted string, so the call prints the same text
# under Python 2 and Python 3 (the former print statements were Python 2 only).
print("No. yes %d" % len(train_yes))
print("No. no %d" % len(train_no))
print("Test set of A's %d" % len(wiki_As))

# Positive examples first, then negatives, labelled 'Y' / 'N' in that order.
X_train = train_yes + train_no
y_train = ['Y'] * len(train_yes) + ['N'] * len(train_no)

renn = RepeatedEditedNearestNeighbours(random_state=0)
X_resampled, y_resampled = renn.fit_sample(X_train, y_train)

# Persist the resampled training set.
save_pickle(X_resampled, "X_resampled_renn_9_1234567")
save_pickle(y_resampled, "y_resampled_renn_9_1234567")
示例#15
0
# Apply the previously configured ENN sampler and project the kept samples
# with the PCA fitted earlier in the script.
X_resampled, y_resampled = enn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)
# str.format needs no escaping for '%'; the former '\%' printed a literal
# backslash and is an invalid escape sequence.
print('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) / len(X))))

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax2.set_title('Edited nearest neighbours')

# Apply the RENN
print('RENN')
renn = RepeatedEditedNearestNeighbours()
X_resampled, y_resampled = renn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)
print('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) / len(X))))

ax3.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax3.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax3.set_title('Repeated Edited nearest neighbours')

# Apply the AllKNN
print('AllKNN')
allknn = AllKNN()
X_resampled, y_resampled = allknn.fit_sample(X, y)
示例#16
0
def _style_axes(ax, xlim, ylim):
    """Hide the top/right spines, offset the other two and fix the limits."""
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)


def _plot_under_sampling(X_vis, X_res_vis, y_resampled, idx_resampled, title):
    """Scatter the kept samples per class plus the samples that were removed."""
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # Every original row whose index the sampler did not keep.
    idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                       idx_resampled)

    idx_class_0 = y_resampled == 0
    plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
                alpha=.8, label='Class #0')
    plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
                alpha=.8, label='Class #1')
    plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
                alpha=.8, label='Removed samples')

    _style_axes(ax, [-6, 6], [-6, 6])

    plt.title(title)
    plt.legend()
    plt.tight_layout()
    plt.show()


def _print_reduction(n_before, n_after):
    """Print the percentage of samples removed by a sampler."""
    print('Reduced {:.2f}%'.format(100 * (1 - float(n_after) / n_before)))


def sampling(algorithm, x_train, y_train):
    """Scale or resample a training set with the algorithm named by a string.

    Parameters
    ----------
    algorithm : str
        One of 'standard', 'undersampling', 'smote', 'neighbourhood', 'ENN',
        'RENN', 'AllKNN', 'centroids' or 'centroidshard'. Any other value
        returns the inputs unchanged.
    x_train, y_train
        Training features and labels handed to the chosen scaler / sampler.
        The plotting branches index ``y_train`` with ``== 0`` / ``== 1``
        boolean masks.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``. For 'smote' both entries are
        one-element lists holding the arrays for the selected SMOTE kind;
        for 'centroidshard' they are the hard-voting result.

    Notes
    -----
    The 'undersampling', 'smote', 'neighbourhood' and 'centroidshard'
    branches also open a matplotlib figure via ``plt.show()``. The 'smote'
    branch reads an optional integer index into the list of SMOTE kinds from
    ``sys.argv[2]``.
    """
    if algorithm == 'standard':

        print('\nUsing Standard Scaler.\n')

        scaler = StandardScaler().fit(x_train)
        X_resampled = scaler.transform(x_train)
        y_resampled = y_train

    elif algorithm == 'undersampling':
        # Project onto two principal components purely for visualisation.
        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        print('\nUsing Random Under Sampling.\n')

        rus = RandomUnderSampler(return_indices=True)
        X_resampled, y_resampled, idx_resampled = rus.fit_sample(x_train,
                                                                 y_train)
        _plot_under_sampling(X_vis, pca.transform(X_resampled), y_resampled,
                             idx_resampled,
                             'Under-sampling using random under-sampling')

    elif algorithm == 'smote':

        print('\nUsing SMOTE.\n')

        # Project onto two principal components purely for visualisation.
        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        kinds = ['regular', 'borderline1', 'borderline2', 'svm']
        # The optional third command-line argument is an index into `kinds`.
        # The old fallback passed the string 'regular' to int(), which raised
        # ValueError whenever the argument was absent; default to index 0.
        kind_index = int(sys.argv[2]) if len(sys.argv) >= 3 else 0
        kind = [kinds[kind_index]]
        print(kind)
        sm = [SMOTE(kind=k) for k in kind]
        # NOTE(review): unlike the other branches, these stay lists (one entry
        # per SMOTE kind) and are returned as such - confirm callers expect it.
        X_resampled = []
        y_resampled = []
        X_res_vis = []
        for method in sm:
            X_res, y_res = method.fit_sample(x_train, y_train)
            X_resampled.append(X_res)
            y_resampled.append(y_res)
            X_res_vis.append(pca.transform(X_res))

        f, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2)
        ax2.axis('off')
        ax_res = [ax3, ax4, ax5, ax6]

        c0, c1 = plot_resampling(ax1, X_vis, y_train, 'Original set')
        for ax, vis, labels, name in zip(ax_res, X_res_vis, y_resampled, kind):
            plot_resampling(ax, vis, labels, 'SMOTE {}'.format(name))

        ax2.legend((c0, c1), ('Class #0', 'Class #1'), loc='center',
                   ncol=1, labelspacing=0.)
        plt.tight_layout()
        plt.show()

    elif algorithm == 'neighbourhood':

        print('\nUsing Neighbourhood Cleaning Rule.\n')

        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        ncl = NeighbourhoodCleaningRule(return_indices=True)
        X_resampled, y_resampled, idx_resampled = ncl.fit_sample(x_train,
                                                                 y_train)
        _plot_under_sampling(X_vis, pca.transform(X_resampled), y_resampled,
                             idx_resampled,
                             'Under-sampling using neighbourhood cleaning rule')

    elif algorithm == 'ENN':

        print('\nUsing ENN.\n')

        enn = EditedNearestNeighbours(return_indices=True)
        X_resampled, y_resampled, _ = enn.fit_sample(x_train, y_train)
        _print_reduction(len(x_train), len(X_resampled))

    elif algorithm == 'RENN':

        print('\nUsing RENN.\n')

        renn = RepeatedEditedNearestNeighbours(return_indices=True)
        X_resampled, y_resampled, _ = renn.fit_sample(x_train, y_train)
        _print_reduction(len(x_train), len(X_resampled))

    elif algorithm == 'AllKNN':

        print('\nUsing AllKNN.\n')

        allknn = AllKNN(return_indices=True)
        X_resampled, y_resampled, _ = allknn.fit_sample(x_train, y_train)
        _print_reduction(len(x_train), len(X_resampled))

    elif algorithm == 'centroids':

        print('\nUsing Cluster Centroids.\n')

        cc = ClusterCentroids()
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)

    elif algorithm == 'centroidshard':

        print('\nUsing Cluster Centroids with Hard Voting.\n')

        pca = PCA(n_components=2)
        X_vis = pca.fit_transform(x_train)

        # Soft voting: kept only for the comparison plot. Its labels get their
        # own name so the plot is not masked with the hard-voting labels.
        cc = ClusterCentroids()
        X_soft, y_soft = cc.fit_sample(x_train, y_train)
        X_res_vis_soft = pca.transform(X_soft)

        # Hard voting: this is the result returned to the caller.
        cc = ClusterCentroids(voting='hard')
        X_resampled, y_resampled = cc.fit_sample(x_train, y_train)
        X_res_vis_hard = pca.transform(X_resampled)

        f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

        c0 = ax1.scatter(X_vis[y_train == 0, 0], X_vis[y_train == 0, 1],
                         label="Class #0", alpha=0.5)
        c1 = ax1.scatter(X_vis[y_train == 1, 0], X_vis[y_train == 1, 1],
                         label="Class #1", alpha=0.5)
        ax1.set_title('Original set')

        ax2.scatter(X_res_vis_soft[y_soft == 0, 0],
                    X_res_vis_soft[y_soft == 0, 1],
                    label="Class #0", alpha=.5)
        ax2.scatter(X_res_vis_soft[y_soft == 1, 0],
                    X_res_vis_soft[y_soft == 1, 1],
                    label="Class #1", alpha=.5)
        # Keep this handle: it backs the third figure-legend entry.
        c2 = ax2.scatter(X_vis[y_train == 1, 0],
                         X_vis[y_train == 1, 1], label="Original #1",
                         alpha=0.2)
        ax2.set_title('Cluster centroids with soft voting')

        ax3.scatter(X_res_vis_hard[y_resampled == 0, 0],
                    X_res_vis_hard[y_resampled == 0, 1],
                    label="Class #0", alpha=.5)
        ax3.scatter(X_res_vis_hard[y_resampled == 1, 0],
                    X_res_vis_hard[y_resampled == 1, 1],
                    label="Class #1", alpha=.5)
        ax3.scatter(X_vis[y_train == 1, 0],
                    X_vis[y_train == 1, 1],
                    alpha=0.2)
        ax3.set_title('Cluster centroids with hard voting')

        for ax in (ax1, ax2, ax3):
            _style_axes(ax, [-6, 8], [-6, 6])

        # Three labels need three handles (previously only two were passed).
        plt.figlegend((c0, c1, c2),
                      ('Class #0', 'Class #1', 'Original Class #1'),
                      loc='lower center',
                      ncol=3, labelspacing=0.)
        plt.tight_layout(pad=3)
        plt.show()

    else:
        # Unknown algorithm: hand the data back untouched. The PCA that used
        # to be fitted here was never used, so it has been dropped.
        return x_train, y_train

    return X_resampled, y_resampled
示例#17
0
    def pre_process(train_index, test_index):
        """Split one fold, balance its training part and select features.

        Reads ``X_train_all``, ``y_train``, ``X_df``, ``p_val_thresh``,
        ``class_balance_method`` and ``feature_selection_method`` from the
        enclosing scope.

        Returns ``(train_x, train_y, test_x, test_y, selected_features,
        feature_scores)``; ``feature_scores`` stays ``'N/A'`` unless a
        relief-based selector ran. An unrecognised class-balance method
        leaves the training split untouched; an unrecognised feature
        selection method leaves ``selected_features`` unassigned.
        """
        train_x = X_train_all[train_index]
        test_x = X_train_all[test_index]
        train_y = y_train[train_index]
        test_y = y_train[test_index]

        def apply_enn(features, labels):
            """Run Edited Nearest Neighbours on the given split."""
            cleaner = EditedNearestNeighbours(n_neighbors=5,
                                              random_state=0,
                                              n_jobs=1)
            return cleaner.fit_sample(features, labels)

        def apply_renn(features, labels):
            """Run Repeated Edited Nearest Neighbours on the given split."""
            cleaner = RepeatedEditedNearestNeighbours(n_neighbors=5,
                                                      random_state=0,
                                                      n_jobs=1)
            return cleaner.fit_sample(features, labels)

        # --- Class balance on the training split -------------------------
        balance = class_balance_method
        if balance == 'rand_under':
            under_sampler = RandomUnderSampler(sampling_strategy='majority',
                                               random_state=0)
            train_x, train_y = under_sampler.fit_sample(train_x, train_y)
        elif balance == 'enn':
            train_x, train_y = apply_enn(train_x, train_y)
        elif balance == 'renn':
            train_x, train_y = apply_renn(train_x, train_y)
        elif balance in ('tomek', 'tomek_enn', 'tomek_renn'):
            # Tomek links are removed first; ENN / RENN optionally follows.
            tomek = TomekLinks(random_state=0)
            train_x, train_y = tomek.fit_sample(train_x, train_y)
            if balance == 'tomek_enn':
                train_x, train_y = apply_enn(train_x, train_y)
            elif balance == 'tomek_renn':
                train_x, train_y = apply_renn(train_x, train_y)

        # --- Feature selection on the training split ---------------------
        # Only the relief-based selectors overwrite this placeholder.
        feature_scores = 'N/A'

        selection = feature_selection_method
        if selection == 'no':
            selected_features = X_df.columns
        elif selection in ('chi2', 'anovaF'):
            univariate_fs = chi2_fs if selection == 'chi2' else anova_fs
            selected_features, _, train_x, test_x = univariate_fs(
                X_df, train_x, test_x, train_y, p_val_thresh)
        elif selection in ('reliefF', 'multisurf'):
            relief_fs = relieff_fs if selection == 'reliefF' else multisurf_fs
            selected_features, feature_scores, train_x, test_x = relief_fs(
                X_df, train_x, test_x, train_y)
        elif selection in ('chi2_reliefF', 'chi2_multisurf',
                           'anova_reliefF', 'anova_multisurf'):
            # Two stages: a univariate filter, then a relief-based selector
            # applied to the filter's outputs.
            univariate_fs = (chi2_fs if selection.startswith('chi2')
                             else anova_fs)
            relief_fs = (relieff_fs if selection.endswith('reliefF')
                         else multisurf_fs)
            _, filtered_df, filtered_train, filtered_test = univariate_fs(
                X_df, train_x, test_x, train_y, p_val_thresh)
            selected_features, feature_scores, train_x, test_x = relief_fs(
                filtered_df, filtered_train, filtered_test, train_y)

        return (train_x, train_y, test_x, test_y, selected_features,
                feature_scores)
示例#18
0
def test_deprecation_random_state():
    """``fit_sample`` must warn that ``random_state`` is deprecated."""
    expected_message = "'random_state' is deprecated from 0.4"
    sampler = RepeatedEditedNearestNeighbours(random_state=0)
    # Only the fit_sample call sits inside the warning check.
    with warns(DeprecationWarning, match=expected_message):
        sampler.fit_sample(X, Y)
# Build the feature matrix from selected UserData columns.
# NOTE(review): this first assignment (with 'wk6' and 'wk8') is immediately
# overwritten by the narrower column selection below.
x = np.array(
    UserData.
    loc[:, ['wk2', 'wk4', 'wk6', 'wk8', 'Nchans', 'Nusers', 'chanScore']])
x = np.array(UserData.loc[:, ['wk2', 'wk4', 'Nchans', 'Nusers', 'chanScore']])

# Replace NaN / infinite entries with finite numbers in features and targets.
x = np.nan_to_num(x)
y = np.nan_to_num(y)

#~~~~~~~~~~~~~~over sampling to deal with class imbalance ~~~~~~~~~~~~~~~~~~~
sm = SMOTE(kind='svm')
tm = TomekLinks()
renn = RepeatedEditedNearestNeighbours()

# NOTE(review): every sampler is fitted on the original (x, y) and each result
# overwrites the previous one, so only the RENN output reaches the code below.
# Confirm whether chaining the samplers was intended.
x_res, y_res = sm.fit_sample(x, y)
x_res, y_res = tm.fit_sample(x, y)
x_res, y_res = renn.fit_sample(x, y)

# NOTE(review): this split of the raw data is overwritten by the split of the
# resampled data right after it.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2)

train_x, test_x, train_y, test_y = train_test_split(x_res,
                                                    y_res,
                                                    test_size=0.2)

# Correlation coefficients between the columns of x (hence the transpose).
C = np.corrcoef(x.T)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  random forest
# Fit on the training split, then predict and score on the held-out split.
RF = RandomForestClassifier(min_samples_leaf=5)
RF = RF.fit(train_x, train_y)
y_pred = RF.predict(test_x)
y_score = RF.predict_proba(test_x)
print(RF.score(test_x, test_y))
示例#20
0
           'sentiment'] = X_test.loc[:,
                                     'body_polarity'] + X_test.loc[:,
                                                                   'title_polarity'] + X_test.loc[:,
                                                                                                  'body_subjectivity'] + X_test.loc[:,
                                                                                                                                    'title_subjectivity']
# Drop the same columns from both splits, in place.
X_train.drop(dropcols, axis=1, inplace=True)
X_test.drop(dropcols, axis=1, inplace=True)

# Call head() so the first rows are shown; printing the bare attribute only
# displayed the bound method object.
print(X_train.head())

from imblearn.under_sampling import (AllKNN, EditedNearestNeighbours,
                                     RepeatedEditedNearestNeighbours)

print('RENN')
# NOTE: despite its name, `enn` holds a RepeatedEditedNearestNeighbours sampler.
enn = RepeatedEditedNearestNeighbours(return_indices=True)
X_res, Y_res, idx_res = enn.fit_sample(X_train, Y_train)
# Percentage of training rows removed by the sampler.
reduction_str = ('Reduced {:.2f}%'.format(
    100 * (1 - float(len(X_res)) / len(X_train))))
print(reduction_str)

print(X_res.shape, Y_res.shape)
print(Y_res.sum(), Y_train.sum())

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import xgboost as xgb

X_rs_trn, X_rs_val, Y_res_trn, Y_res_val = train_test_split(X_res,
                                                            Y_res,
                                                            test_size=0.075,
                                                            shuffle=True,
示例#21
0
        try:
            X_resampled, Y_resampled = nm1.fit_sample(X, Y)
        except Exception, e:
            print str(e)
            X_resampled, Y_resampled = X, Y
    elif index == 5:
        enn = EditedNearestNeighbours(random_state=0)
        try:
            X_resampled, Y_resampled = enn.fit_sample(X, Y)
        except Exception, e:
            print str(e)
            X_resampled, Y_resampled = X, Y
    elif index == 6:
        renn = RepeatedEditedNearestNeighbours(random_state=0)
        try:
            X_resampled, Y_resampled = renn.fit_sample(X, Y)
        except Exception, e:
            print str(e)
            X_resampled, Y_resampled = X, Y
    elif index == 7:
        allknn = AllKNN(random_state=0)
        try:
            X_resampled, Y_resampled = allknn.fit_sample(X, Y)
        except Exception, e:
            print str(e)
            X_resampled, Y_resampled = X, Y

    return X_resampled, Y_resampled


# Short string identifiers of the algorithms handled by this script.
# NOTE(review): presumably classifier abbreviations (decision tree, Gaussian
# naive Bayes, ...) -- confirm against the code that consumes this list.
algo_list = ['dt', 'GaNB', 'linear_svc', 'logistic', 'nn', 'rf', 'svc']
示例#22
0
    ]
    return classifier_list, classifier_name_list


def print_evaluation_metrics(trained_model, trained_model_name, X_test,
                             y_test):
    """Print the classification report and accuracy of a fitted model.

    Every message is built as one string and passed to a single-argument
    ``print(...)`` call, so the function runs unchanged on Python 2 and 3
    (the print-statement form is a SyntaxError on Python 3) and emits the
    same text, including the doubled spaces the statement form produced.

    Args:
        trained_model: Fitted estimator; only its ``predict`` method is used.
        trained_model_name: Label shown in the banner line.
        X_test: Samples passed to ``trained_model.predict``.
        y_test: Reference labels compared against the predictions.
    """
    banner = '--------- For Model :  %s  ---------------\n'
    print(banner % (trained_model_name,))
    predicted_values = trained_model.predict(X_test)
    print(metrics.classification_report(y_test, predicted_values))
    accuracy = metrics.accuracy_score(y_test, predicted_values)
    print("Accuracy Score :  %s" % (accuracy,))
    print("---------------------------------------\n")


# Load the data set and split the 'Class' column off as the label list;
# pop returns the column and removes it from the frame in one step.
filename = 'creditcard.csv'
credit_frame = pd.read_csv(filename)
class_labels = list(credit_frame.pop('Class').values)

# Three samplers are constructed, but only rnn is applied below.
random_under = RandomUnderSampler(random_state=42)
rnn = RepeatedEditedNearestNeighbours(random_state=42)
tomek = TomekLinks(random_state=42)

# Resample the whole data set, then hold out 20% of it for evaluation.
X, y = rnn.fit_sample(credit_frame.values, class_labels)
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Fit every model returned by get_ensemble_models and report its metrics
# on the held-out part.
classifier_list, classifier_name_list = get_ensemble_models()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print_evaluation_metrics(classifier, classifier_name, X_test, y_test)
def test_renn_not_good_object():
    """fit_sample must raise ValueError when n_neighbors is the string 'rnd'."""
    sampler = RepeatedEditedNearestNeighbours(
        n_neighbors='rnd',
        kind_sel='mode',
    )
    with raises(ValueError):
        sampler.fit_sample(X, Y)
def test_deprecation_random_state():
    """fit_sample must warn that 'random_state' is deprecated from 0.4."""
    expected_message = "'random_state' is deprecated from 0.4"
    sampler = RepeatedEditedNearestNeighbours(random_state=0)
    with warns(DeprecationWarning, match=expected_message):
        sampler.fit_sample(X, Y)
def test_renn_iter_wrong():
    """fit_sample must raise ValueError when max_iter is -1."""
    sampler = RepeatedEditedNearestNeighbours(max_iter=-1)
    with raises(ValueError):
        sampler.fit_sample(X, Y)
示例#26
0
'''
# Class distribution before resampling, as sorted (label, count) pairs.
original_counts = Counter(y)
print(sorted(original_counts.items()))

from imblearn.under_sampling import EditedNearestNeighbours

# Resample with EditedNearestNeighbours and show the new distribution.
enn = EditedNearestNeighbours(random_state=0)
X_resampled, y_resampled = enn.fit_sample(X, y)
resampled_counts = Counter(y_resampled)
print(sorted(resampled_counts.items()))
'''
[(0, 64), (1, 262), (2, 4674)]
[(0, 64), (1, 213), (2, 4568)]
'''
'''
在此基础上, 延伸出了RepeatedEditedNearestNeighbours算法, 重复基础的EditedNearestNeighbours算法多次
'''
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

# Resample with RepeatedEditedNearestNeighbours and show the sorted
# (label, count) pairs of the result.
renn = RepeatedEditedNearestNeighbours(random_state=0)
X_resampled, y_resampled = renn.fit_sample(X, y)
resampled_counts = Counter(y_resampled)
print(sorted(resampled_counts.items()))
# Recorded output: [(0, 64), (1, 208), (2, 4551)]
# Unlike RepeatedEditedNearestNeighbours, the AllKNN algorithm increases the
# number of nearest neighbours at every iteration.
from imblearn.under_sampling import AllKNN

allknn = AllKNN(random_state=0)
X_resampled, y_resampled = allknn.fit_sample(X, y)
resampled_counts = Counter(y_resampled)
print(sorted(resampled_counts.items()))
# Recorded output: [(0, 64), (1, 220), (2, 4601)]
#Condensed nearest neighbors and derived algorithms
'''
CondensedNearestNeighbour使用1近邻的方法来进行迭代,来判断一个样本是应该保留还是剔除,具体的实现步骤如下:
集合C:所有的少数类样本;
选择一个多数类样本(需要下采样)加入集合C,其他的这类样本放入集合S;
使用集合S训练一个1-NN的分类器,对集合S中的样本进行分类;
将集合S中错分的样本加入集合C;
示例#27
0
def test_renn_not_good_object():
    """fit_sample must raise ValueError when n_neighbors is the string 'rnd'."""
    sampler_kwargs = {'n_neighbors': 'rnd', 'kind_sel': 'mode'}
    sampler = RepeatedEditedNearestNeighbours(**sampler_kwargs)
    with raises(ValueError):
        sampler.fit_sample(X, Y)