Example #1
def tomek_links():
    # minority class
    X_minority = np.transpose([[1.4, 1.3, 1.15, 0.8, 0.8, 0.6, 0.55],
                               [0.4, 1.5, 1.7, 2.5, 2.0, 1.2, 0.55]])
    # majority class
    X_majority = np.transpose(
        [[2.1, 1.5, 2.12, 2.13, 2.14, 2.2, 2.3, 2.5, 2.45, 3.00, 3.1, 1.5],
         [1.5, 2.2, 2.1, 2.7, 0.9, 1.0, 1.4, 2.4, 2.9, 1.00, 2.0, 0.3]])
    #
    # fig, ax = plt.subplots(1, 1, figsize=(6, 6))
    # ax.scatter(X_majority[:, 0], X_majority[:, 1],
    #            label='Negative class', s=200, marker='_')
    #
    # ax.scatter(X_minority[:, 0], X_minority[:, 1],
    #            label='Positive class', s=200, marker='+')
    #
    # # highlight the samples of interest
    # ax.scatter([X_minority[-1, 0], X_majority[1, 0]],
    #            [X_minority[-1, 1], X_majority[1, 1]],
    #            label='Tomek link', s=200, alpha=0.3)
    # ax.set_title('Illustration of a Tomek link')
    # make_plot_despine(ax)
    # fig.tight_layout()

    sampler = TomekLinks()

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

    ax_arr = (ax1, ax2)
    title_arr = ('Removing only majority samples', 'Removing all samples')
    for ax, title, sampler in zip(ax_arr, title_arr, [
            TomekLinks(sampling_strategy='auto'),
            TomekLinks(sampling_strategy='all')
    ]):
        X_res, y_res = sampler.fit_resample(
            np.vstack((X_majority, X_minority)),
            np.array([0] * X_majority.shape[0] + [1] * X_minority.shape[0]))
        ax.scatter(X_res[y_res == 1][:, 0],
                   X_res[y_res == 1][:, 1],
                   label='Minority class',
                   s=200,
                   marker='+')
        ax.scatter(X_res[y_res == 0][:, 0],
                   X_res[y_res == 0][:, 1],
                   label='Majority class',
                   s=200,
                   marker='_')

        # highlight the samples of interest
        ax.scatter([X_minority[-1, 0], X_majority[1, 0]],
                   [X_minority[-1, 1], X_majority[1, 1]],
                   label='Tomek link',
                   s=200,
                   alpha=0.3)

        ax.set_title(title)
        make_plot_despine(ax)
    fig.tight_layout()

    plt.show()
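The two sampling_strategy values compared above differ only in which end of each Tomek link is dropped: 'auto' (the default) removes only the majority-class samples, while 'all' removes both samples of every link. A minimal sketch on hypothetical synthetic data that just prints the resulting class counts:

from collections import Counter

from sklearn.datasets import make_classification
from imblearn.under_sampling import TomekLinks

# toy imbalanced two-class data (illustrative only)
X, y = make_classification(n_samples=500, n_features=2, n_informative=2,
                           n_redundant=0, weights=[0.9, 0.1],
                           class_sep=0.5, random_state=0)
print('original:', Counter(y))

for strategy in ('auto', 'all'):
    X_res, y_res = TomekLinks(sampling_strategy=strategy).fit_resample(X, y)
    print(strategy, '->', Counter(y_res))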
Example #2
def getData(splitData=True, useImbalancer=False, useStratify=False):
    global standard_scaler
    data = pd.read_csv(filepath_or_buffer="DataSource/binary.csv")
    X = data.values[:, 1:-1]
    rank_dummy = pd.get_dummies(data['rank'], drop_first=True).to_numpy()
    X = np.concatenate((X, rank_dummy), axis=1)
    y = data.values[:, 0].reshape(-1, 1)
    if useStratify:
        stratify = y
    else:
        stratify = None
    if splitData:
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=101,
                                                            shuffle=True,
                                                            stratify=stratify)
    else:
        X_train = X
        y_train = y
    if useImbalancer and splitData:
        tl = TomekLinks(sampling_strategy='majority')
        X_train, y_train = tl.fit_sample(X=X_train, y=y_train)
        # print("After 1st pass: "******"After 2nd pass: "******"After 3rd pass: "******"After 4th pass: "******"After 5th pass: "******"After 6th pass: "******"y_train\n", np.asarray((unique, counts)).T)
    if splitData:
        unique, counts = np.unique(y_test, return_counts=True)
    # print("y_test\n", np.asarray((unique, counts)).T)
    if splitData:
        return X_train, X_test, y_train.ravel(), y_test.ravel()
    else:
        return X_train, y_train.ravel()
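Note that fit_sample, used above, is the older name for the resampling entry point; newer imbalanced-learn releases call it fit_resample. A minimal sketch of the same idea, cleaning Tomek links from the training split only, on hypothetical synthetic data:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import TomekLinks

X, y = make_classification(n_samples=400, weights=[0.85, 0.15], random_state=101)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101, shuffle=True, stratify=y)

tl = TomekLinks(sampling_strategy='majority')
# resample only the training data; the test split is left untouched
X_train_res, y_train_res = tl.fit_resample(X_train, y_train)
print(X_train.shape, '->', X_train_res.shape)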
Example #3
def test_multiclass_error():
    """Test that a warning is raised when the target is not binary."""

    # continuous case
    y = np.linspace(0, 1, 20)
    tl = TomekLinks(random_state=RND_SEED)
    assert_warns(UserWarning, tl.fit, X, y)

    # multiclass case
    y = np.array([0] * 3 + [1] * 7 + [2] * 10)
    tl = TomekLinks(random_state=RND_SEED)
    assert_warns(UserWarning, tl.fit, X, y)
Example #4
def resample():
    test_switch = np.load('data/test_switch_w_64_f_20.npy')
    test_non_switch = np.load('data/test_non_switch_w_64_f_20.npy')
    train_switch = np.load('data/train_switch_w_64_f_20.npy')
    train_non_switch = np.load('data/train_non_switch_w_64_f_20.npy')

    resample_train = SMOTETomek(sampling_strategy='all',
                                smote=SMOTE(n_jobs=4),
                                tomek=TomekLinks(n_jobs=4))
    resample_test = SMOTETomek(sampling_strategy='all',
                               smote=SMOTE(n_jobs=4),
                               tomek=TomekLinks(n_jobs=4))

    print('Beginning train resample...')
    X = np.concatenate((train_switch, train_non_switch))
    y = np.concatenate(
        (np.zeros(train_switch.shape[0]), np.ones(train_non_switch.shape[0])))
    X_res, y_res = resample_train.fit_resample(X, y)

    train_switch = []
    train_non_switch = []
    for i in range(X_res.shape[0]):
        if y_res[i] == 0:
            train_switch.append(X_res[i])
        else:
            train_non_switch.append(X_res[i])

    np.save('data/train_switch_w_64_f_20_samp.npy', np.array(train_switch))
    np.save('data/train_non_switch_w_64_f_20_samp.npy',
            np.array(train_non_switch))

    print('Beginning test resample...')
    X = np.concatenate((test_switch, test_non_switch))
    y = np.concatenate(
        (np.zeros(test_switch.shape[0]), np.ones(test_non_switch.shape[0])))
    X_res, y_res = resample_test.fit_resample(X, y)

    test_switch = []
    test_non_switch = []
    for i in range(X_res.shape[0]):
        if y_res[i] == 0:
            test_switch.append(X_res[i])
        else:
            test_non_switch.append(X_res[i])

    np.save('data/test_switch_w_64_f_20_samp.npy', np.array(test_switch))
    np.save('data/test_non_switch_w_64_f_20_samp.npy',
            np.array(test_non_switch))
    return
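SMOTETomek, as used above, chains SMOTE over-sampling with a Tomek-links cleaning pass, so the resampled set is roughly balanced and stripped of borderline pairs. A small sketch on hypothetical toy data that reports the class counts before and after:

from collections import Counter

from sklearn.datasets import make_classification
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

X, y = make_classification(n_samples=600, weights=[0.9, 0.1], random_state=0)
resampler = SMOTETomek(smote=SMOTE(random_state=0), tomek=TomekLinks())
X_res, y_res = resampler.fit_resample(X, y)
print('before:', Counter(y), 'after:', Counter(y_res))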
Example #5
def test_tl_sample_wt_fit():
    """Test either if an error is raised when sample is called before
    fitting"""

    # Create the object
    tl = TomekLinks(random_state=RND_SEED)
    assert_raises(RuntimeError, tl.sample, X, Y)
Example #6
def test_tl_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    tl = TomekLinks(random_state=RND_SEED)
    X_resampled, y_resampled = tl.fit_sample(X, Y)

    X_gt = np.array([[0.31230513, 0.1216318],
                     [0.68481731, 0.51935141],
                     [1.34192108, -0.13367336],
                     [0.62366841, -0.21312976],
                     [1.61091956, -0.40283504],
                     [-0.37162401, -2.19400981],
                     [0.74680821, 1.63827342],
                     [0.2184254, 0.24299982],
                     [0.61472253, -0.82309052],
                     [0.19893132, -0.47761769],
                     [0.97407872, 0.44454207],
                     [1.40301027, -0.83648734],
                     [-1.20515198, -1.02689695],
                     [-0.23374509, 0.18370049],
                     [-0.32635887, -0.29299653],
                     [-0.00288378, 0.84259929],
                     [1.79580611, -0.02219234]])
    y_gt = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #7
def test_validate_estimator_init():
    """Test right processing while passing objects as initialization"""

    # Create a SMOTE and Tomek object
    smote = SMOTE(random_state=RND_SEED)
    tomek = TomekLinks(random_state=RND_SEED)

    smt = SMOTETomek(smote=smote, tomek=tomek, random_state=RND_SEED)

    X_resampled, y_resampled = smt.fit_sample(X, Y)

    X_gt = np.array([[0.20622591, 0.0582794], [0.68481731, 0.51935141],
                     [1.34192108, -0.13367336], [0.62366841, -0.21312976],
                     [1.61091956, -0.40283504], [-0.37162401, -2.19400981],
                     [0.74680821, 1.63827342], [0.61472253, -0.82309052],
                     [0.19893132, -0.47761769], [0.97407872, 0.44454207],
                     [1.40301027, -0.83648734], [-1.20515198, -1.02689695],
                     [-0.23374509, 0.18370049], [-0.32635887, -0.29299653],
                     [-0.00288378, 0.84259929], [1.79580611, -0.02219234],
                     [0.38307743, -0.05670439], [0.93976473, -0.06570176],
                     [0.70319159, -0.02571668], [0.75052536, -0.19246517]])
    y_gt = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #8
    def _validate_estimator(self):
        """
        Private function to validate the SMOTE and TomekLinks objects
        :return:
        """

        if self.smote is not None:
            if isinstance(self.smote, SMOTE):
                self.smote_ = self.smote
            else:
                raise ValueError('smote needs to be a SMOTE object.'
                                 'Got {} instead.'.format(type(self.smote)))
        else:
            self.smote_ = SMOTE(ratio=self.ratio,
                                k_neighbors=3,
                                random_state=self.random_state)

        if self.tomek is not None:
            if isinstance(self.tomek, TomekLinks):
                self.tomek_ = self.tomek
            else:
                raise ValueError('tomek needs to be a TomekLinks object.'
                                 'Got {} instead.'.format(type(self.tomek)))
        else:
            self.tomek_ = TomekLinks(ratio="all",
                                     random_state=self.random_state)
Example #9
def handleImbalancedDatset(method, x, y):
    X_resampled = []
    Y_resampled = []
    seed = 123

    if method.lowercase == "smote":
        sm = SMOTE(sampling_strategy='auto', random_state=seed)
        X_resampled, Y_resampled = sm.fit_resample(x, y)

    if method.lowercase == "adasyn":
        adas = ADASYN()
        X_resampled, Y_resampled = adas.fit_resample(x, y)

    if method.lowercase == "enn":
        enn = EditedNearestNeighbours()
        X_resampled, Y_resampled = enn.fit_resample(x, y)

    if method.lowercase == "cnn":
        cnn = CondensedNearestNeighbour()
        X_resampled, Y_resampled = cnn.fit_resample(x, y)

    if method.lowercase == "oss":
        oss = OneSidedSelection()
        X_resampled, Y_resampled = oss.fit_resample(x, y)

    if method.lowercase == "nm":
        nm = NearMiss(version=3, n_neighbors_ver3=n)
        X_resampled, Y_resampled = nm.fit_resample(x, y)

    if method.lowercase == "smotetomek":
        smotetomek = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))
        X_resampled, Y_resampled = smotetomek.fit_resample(x, y)

    return X_resampled, Y_resampled
Example #10
	def workflow_no_oversampling(self, remove_tomeklinks, model_name):
		"""
		This function performs the workflow of classification without any oversampling
		:return: f1 score without oversampling
		"""
		train_x_expanded, train_y_binary = self.pre_process(test_data=False)
		inos_p_old = train_x_expanded[train_y_binary == 1]
		inos_n = train_x_expanded[train_y_binary == 0]
		print("debug, shape of inos_p_old, inos_n")
		print(inos_p_old.shape, inos_n.shape)
		x_res = pd.concat([inos_p_old, inos_n], axis=0)
		# create y_res
		y_res_p = np.ones(inos_p_old.shape[0])
		y_res_n = np.zeros(inos_n.shape[0])
		y_res = np.concatenate([y_res_p, y_res_n])
		print("debug, shape of training data:")
		print(x_res.shape)
		print(y_res.shape)
		if remove_tomeklinks == True:
			tl = TomekLinks()
			x_res, y_res = tl.fit_resample(x_res, y_res)
			print("shape of training data after removing tomek links:")
			print(x_res.shape)
			print(y_res.shape)
		else:
			pass
		tmo = self.build_model(x_res, y_res, model_name)
		# evaluates performance
		x_test, y_test_binary = self.pre_process(test_data=True)
		#
		f1_score, precision, recall = self.eval_model(tmo, x_test, y_test_binary)

		return f1_score, precision, recall
Example #11
def undersample(X, y, bal_strategy):
    print('Shape of X: ', X.shape)
    print('Shape of y_Train: ', y.shape)

    if bal_strategy == "RANDOM" or bal_strategy == "ALL":
        # apply random under-sampling
        rus = RandomUnderSampler()
        X_sampled, y_sampled = rus.fit_sample(X, y)

        print('Shape of X_sampled: ', X_sampled.shape)
        print('Shape of y_sampled: ', y_sampled.shape)

    elif bal_strategy == "TOMEK" or bal_strategy == "ALL":
        # Apply Tomek Links cleaning
        # (NOTE: 'ALL' already matches the first branch, so it never reaches here)
        tl = TomekLinks()
        X_sampled, y_sampled = tl.fit_sample(X, y)

        print('Shape of X_sampled: ', X_sampled.shape)
        print('Shape of y_sampled: ', y_sampled.shape)

    elif bal_strategy == 'NONE':
        X_sampled = X
        y_sampled = y

        print('Shape of X_sampled: ', X_sampled.shape)
        print('Shape of y_sampled: ', y_sampled.shape)

    else:
        print('bal_strategy not in ALL, RANDOM, TOMEK, NONE')
        sys.exit(1)

    return (X_sampled, y_sampled)
Example #12
    def getsampler(self, type):
        if type == 'none':
            sampler = NoSampler()
        elif type == 'randomunder':
            sampler = RandomUnderSampler()
        elif type == 'nearmiss':
            sampler = NearMiss()
        elif type == 'allknn':
            sampler = AllKNN()
        elif type == 'condensednn':
            sampler = CondensedNearestNeighbour()
        elif type == 'editednn':
            sampler = EditedNearestNeighbours()
        elif type == 'repeatededitednn':
            sampler = RepeatedEditedNearestNeighbours()
        elif type == 'tomeklinks':
            sampler = TomekLinks()
        elif type == 'randomover':
            sampler = RandomOverSampler()
        elif type == 'smote':
            sampler = SMOTE()
        elif type == 'adasyn':
            sampler = ADASYN()
        elif type == 'smotenc':
            # NOTE: SMOTENC also requires a `categorical_features` argument
            sampler = SMOTENC()
        elif type == 'quality':  # and self.quality_model_selection_type == 'extended':
            sampler = QualitySampler(self.n_init)
        else:
            print("Unsupported sampler %s" % type)
            exit(1)
        if type != 'none' and type != 'quality' and 'random_state' in sampler.get_params(
        ).keys():
            sampler.set_params(random_state=self.random_state)
        return sampler
Example #13
def undersample_tomek_link(X,
                           y,
                           label='Tomek links under-sampling',
                           plot=False):
    tl = TomekLinks(return_indices=True, ratio='all')
    X_tl, y_tl, id_tl = tl.fit_sample(X, y)
    X_tl = pd.DataFrame(X_tl, columns=X.columns)
    y_tl = pd.Series(y_tl, name=y.name)
    if plot:
        #print('Removed indexes:', id_tl)
        # plotting using pca
        pca = PCA(n_components=2)
        X_pca = pd.DataFrame(pca.fit_transform(X_tl))
        colors = ['#1F77B4', '#FF7F0E']
        markers = ['o', 's']
        for l, c, m in zip(np.unique(y_tl), colors, markers):
            plt.scatter(
                X_pca.loc[y_tl == l, 0],  # pc 1
                X_pca.loc[y_tl == l, 1],  # pc 2
                c=c,
                label=l,
                marker=m)
        plt.title(label)
        plt.legend(loc='upper right')
        plt.show()
    return X_tl, y_tl, tl, id_tl
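This example relies on the older return_indices=True and ratio='all' arguments. In newer imbalanced-learn releases the kept indices are exposed through the sample_indices_ attribute and ratio has become sampling_strategy; a rough equivalent, sketched on hypothetical toy data:

import numpy as np
from sklearn.datasets import make_classification
from imblearn.under_sampling import TomekLinks

X, y = make_classification(n_samples=300, weights=[0.8, 0.2], random_state=0)
tl = TomekLinks(sampling_strategy='all')
X_tl, y_tl = tl.fit_resample(X, y)
kept = tl.sample_indices_                        # indices of the retained samples
removed = np.setdiff1d(np.arange(len(y)), kept)
print('removed indexes:', removed)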
Example #14
    def get_sampler(self):
        sampler = None
        if self.sampler == 'random-over-sampler':
            sampler = RandomOverSampler(random_state=self.random_seed)

        elif self.sampler == 'adasyn':
            sampler = ADASYN(random_state=self.random_seed, n_jobs=self.njobs)

        elif self.sampler == 'smote':
            sampler = SMOTE(random_state=self.random_seed, n_jobs=self.njobs)

        elif self.sampler == 'svm-smote':
            sampler = SVMSMOTE(random_state=self.random_seed,
                               n_jobs=self.njobs)

        elif self.sampler == 'random-under-sampler':
            sampler = RandomUnderSampler(random_state=self.random_seed)

        elif self.sampler == 'tomek-links':
            sampler = TomekLinks(n_jobs=self.njobs)

        elif self.sampler == 'near-miss':
            sampler = NearMiss(n_jobs=self.njobs)

        elif self.sampler == 'instance-hardness':
            sampler = InstanceHardnessThreshold(random_state=self.random_seed,
                                                n_jobs=self.njobs)

        return sampler
Example #15
def Balance_classes(X_train, y_train, Sampling_Function):
    if Sampling_Function == 'RandomUnderSampler':
        us = RandomUnderSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'NearMiss1':
        us = NearMiss(ratio=0.5, random_state=1, version=1, size_ngh=3)
    elif Sampling_Function == 'NearMiss2':
        us = NearMiss(ratio=0.5, random_state=1, version=2, size_ngh=3)
    elif Sampling_Function == 'NearMiss3':
        us = NearMiss(ratio=0.5, random_state=1, version=3, ver3_samp_ngh=3)
    elif Sampling_Function == 'CondensedNearestNeighbour':
        us = CondensedNearestNeighbour(random_state=1)
    elif Sampling_Function == 'EditedNearestNeighbours':
        us = EditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'RepeatedEditedNearestNeighbours':
        us = RepeatedEditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'TomekLinks':
        us = TomekLinks(random_state=1)
    elif Sampling_Function == 'RandomOverSampler':
        us = RandomOverSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'SMOTE':
        us = SMOTE(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTETomek':
        us = SMOTETomek(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTEENN':
        us = SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5)
    elif Sampling_Function == 'EasyEnsemble':
        us = EasyEnsemble()
    elif Sampling_Function == 'BalanceCascade_rf':
        us = BalanceCascade(classifier='random-forest', random_state=1)
    elif Sampling_Function == 'BalanceCascade_svm':
        us = BalanceCascade(classifier='linear-svm', random_state=1)

    X_train_res, y_train_res = us.fit_sample(X_train, y_train)

    return X_train_res, y_train_res
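Several keyword arguments above (ratio, size_ngh, ver3_samp_ngh, k, and random_state on the cleaning samplers) date from early imbalanced-learn releases. A hedged sketch of roughly equivalent constructors under the newer parameter names, where ratio became sampling_strategy and the neighbour counts are n_neighbors-style arguments:

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import (EditedNearestNeighbours, NearMiss,
                                     RandomUnderSampler, TomekLinks)

us_random = RandomUnderSampler(sampling_strategy=0.5, random_state=1)
us_nearmiss = NearMiss(sampling_strategy=0.5, version=1, n_neighbors=3)
us_enn = EditedNearestNeighbours(n_neighbors=5)   # cleaning samplers take no float ratio
us_tomek = TomekLinks()                           # deterministic, so no random_state
os_random = RandomOverSampler(sampling_strategy=0.5, random_state=1)
os_smote = SMOTE(sampling_strategy=0.5, k_neighbors=5, random_state=1)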
Example #16
def sampler(name, ratio, random_state=0, return_indices=True, **kwargs):
    if name == "rus":
        sampler = RandomUnderSampler(
            ratio=ratio,
            return_indices=return_indices,
            random_state=random_state,
            **kwargs,
        )
    elif name == "nm":
        sampler = NearMiss(
            ratio=ratio,
            return_indices=return_indices,
            random_state=random_state,
            **kwargs,
        )
    elif name == "enn":
        sampler = EditedNearestNeighbours(return_indices=return_indices,
                                          random_state=random_state,
                                          **kwargs)
    elif name == "renn":
        sampler = RepeatedEditedNearestNeighbours(
            return_indices=return_indices, random_state=random_state, **kwargs)
    elif name == "allknn":
        sampler = AllKNN(return_indices=return_indices,
                         random_state=random_state,
                         **kwargs)
    elif name == "tl":
        sampler = TomekLinks(return_indices=return_indices,
                             random_state=random_state,
                             **kwargs)
    else:
        raise ValueError
    return sampler
Example #17
def resampling(train_data, train_labels, resampling_type, resampling_stragey):
    train_data_new = np.reshape(train_data,
                                (train_data.shape[0], train_data.shape[1] *
                                 train_data.shape[2] * train_data.shape[3]))
    if resampling_type == 'SMOTE':
        train_data_resampled, train_labels_resampled = SMOTE(
            random_state=42).fit_resample(train_data_new, train_labels.values)

    elif resampling_type == 'over_sampling':
        over_sampler = RandomOverSampler(sampling_strategy=resampling_stragey)
        train_data_resampled, train_labels_resampled = over_sampler.fit_resample(
            train_data_new, train_labels.values)

    elif resampling_type == 'under_sampling':
        under_sampler = RandomUnderSampler(
            sampling_strategy=resampling_stragey)
        train_data_resampled, train_labels_resampled = under_sampler.fit_resample(
            train_data_new, train_labels.values)

    elif resampling_type == 'tomelinks':
        t1 = TomekLinks(sampling_strategy=resampling_stragey)
        train_data_resampled, train_labels_resampled = t1.fit_resample(
            train_data_new, train_labels.values)

    elif resampling_type == 'near_miss_neighbors':
        undersample = NearMiss(version=1, n_neighbors=3)
        train_data_resampled, train_labels_resampled = undersample.fit_resample(
            train_data_new, train_labels.values)

    elif resampling_type == 'one_sided_selection':
        undersample = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
        train_data_resampled, train_labels_resampled = undersample.fit_resample(
            train_data_new, train_labels.values)

    return train_data_resampled, train_labels_resampled
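Because the samplers expect 2-D input, the 4-D batches above are flattened before resampling; if the downstream model needs the original layout, the resampled array can be reshaped back. A small sketch of that round trip with made-up dimensions:

import numpy as np
from imblearn.under_sampling import TomekLinks

train_data = np.random.rand(200, 8, 8, 3)             # hypothetical 4-D batch
train_labels = np.random.randint(0, 2, size=200)

flat = train_data.reshape(train_data.shape[0], -1)     # (n_samples, 8*8*3)
X_res, y_res = TomekLinks().fit_resample(flat, train_labels)
restored = X_res.reshape(-1, *train_data.shape[1:])    # back to (n, 8, 8, 3)
print(train_data.shape, '->', restored.shape)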
Example #18
def test_validate_estimator_init():
    smote = SMOTE(random_state=RND_SEED)
    tomek = TomekLinks(sampling_strategy="all")
    smt = SMOTETomek(smote=smote, tomek=tomek, random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_resample(X, Y)
    X_gt = np.array([
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
        [0.38307743, -0.05670439],
        [0.70319159, -0.02571667],
        [0.75052536, -0.19246518],
    ])
    y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example #19
class ResamplingAlgorithms(Enum):
    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', (InstanceHardnessThreshold(random_state=1)))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        filtered_algos = filter(lambda ra: ra.value[0] == name,
                                ResamplingAlgorithms)
        return next(filtered_algos, ResamplingAlgorithms.RO)
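A short usage sketch for the enum above: get_algorithm_by_name matches on the display name stored as the first element of each value tuple and falls back to random over-sampling when nothing matches.

algo = ResamplingAlgorithms.get_algorithm_by_name('TomekLinks')
display_name, sampler = algo.value
# sampler is the ready-made imblearn object; use it as usual:
# X_res, y_res = sampler.fit_resample(X, y)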
Example #20
def make_clf(usx, usy, clf, clf_name, sampling, normalize=False):
    '''
    Function for the classification task - trains and tests the classifier clf using 10-fold cross-validation.
    If the normalize flag is True, the data are normalised.
    The sampling parameter sets the type of sampling to be used.
    '''
    print('----------{} with {}----------'.format(clf_name, sampling))
    totalTP, totalFP, totalFN, totalTN = 0, 0, 0, 0
    plot_ind = randint(0, 9)
    j = 0
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    for train_index, test_index in skf.split(usx, usy):
        x_train, x_test = usx[train_index], usx[test_index]
        y_train, y_test = usy[train_index], usy[test_index]

        if sampling == 'SMOTE':
            x_train, y_train = SMOTE(sampling_strategy=0.3).fit_resample(x_train, y_train)
        elif sampling == 'ADASYN':
            x_train, y_train = ADASYN(sampling_strategy=0.3).fit_resample(x_train, y_train)
        elif sampling == 'ENN':
            x_train, y_train = EditedNearestNeighbours().fit_resample(x_train, y_train)
        elif sampling == 'Tomek':
            x_train, y_train = TomekLinks().fit_resample(x_train, y_train)
        elif sampling == 'SMOTETomek':
            x_train, y_train = SMOTETomek(sampling_strategy=0.3).fit_resample(x_train, y_train)
        elif sampling == 'SMOTEENN':
            x_train, y_train = SMOTEENN(sampling_strategy=0.3).fit_resample(x_train, y_train)
        elif sampling == 'NCR':
            x_train, y_train = NeighbourhoodCleaningRule().fit_resample(x_train, y_train)
        elif sampling == 'OSS':
            x_train, y_train = OneSidedSelection().fit_resample(x_train, y_train)

        if normalize:
            scaler = StandardScaler().fit(x_train)
            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)

        clf.fit(x_train, y_train)

        # if plot_ind == j and clf_name == 'DecisionTreeClassifier':
        #     plot_decision_tree(clf)

        y_predict = clf.predict(x_test)

        for i in range(len(y_predict)):
            if y_test[i] and y_predict[i]:
                totalTP += 1
            if not y_test[i] and y_predict[i]:
                totalFP += 1
            if y_test[i] and not y_predict[i]:
                totalFN += 1
            if not y_test[i] and not y_predict[i]:
                totalTN += 1
        j += 1

    print('TOTAL TP: ' + str(totalTP))
    print('TOTAL FP: ' + str(totalFP))
    print('TOTAL FN: ' + str(totalFN))
    print('TOTAL TN: ' + str(totalTN))
Example #21
def smoteTomek(X, y):
    smote = SMOTE(k_neighbors=3, m_neighbors=10)
    tomek = TomekLinks()
    sm = SMOTETomek(smote=smote, tomek=tomek)

    X_resampled, y_resampled = sm.fit_sample(X, y)

    return X_resampled, y_resampled
Example #22
def test_tl_init():
    """Test the initialisation of the object"""

    # Define a ratio
    tl = TomekLinks(random_state=RND_SEED)

    assert_equal(tl.n_jobs, -1)
    assert_equal(tl.random_state, RND_SEED)
Example #23
    def _tomek_data(self):
        """Performs Tomek links under-sampling. Cannot handle nominal values."""

        if self.cols_nominal.size > 0:
            print("Skipping Tomek Links. Cannot perform with raw categorical data. Create dummies to use.")
            return
        tl = TomekLinks()
        self.X_train, self.y_train = tl.fit_sample(self.X_train, self.y_train)
Example #24
    def get_sampler(self):
        sampler = None

        if self.sampler == 'tomek-links':
            sampler = TomekLinks(random_state=self.random_seed,
                                 n_jobs=self.njobs)

        return sampler
Example #25
def analyze_resampled_class_train_data():
    parent_dir = Path.cwd().parent
    pickle_dir = parent_dir.joinpath('default_results',
                                     'pickle_files_feat_eng')

    for i, emotion in Dictionaries.emo_dict.items():
        for vect_name, vectorizer in Dictionaries.vectorizer_dict.items():
            print('\n\nResampled data  - EMOTION: ' + emotion +
                  ', VECTORIZER: ' + vect_name)
            preprocess_train_df, feat_transformed_train_df = df, df
            # Fit transform the vectorizer with the corresponding preprocessed training data
            if os.path.exists(
                    pickle_dir.joinpath(emotion +
                                        '_c_train_preprocess_df.pkl')):
                preprocess_train_df = pd.read_pickle(
                    pickle_dir.joinpath(emotion +
                                        '_c_train_preprocess_df.pkl'))
                train_vect = vectorizer.fit_transform(
                    preprocess_train_df['preprocessed_text'].values)
                print(emotion + ' vectorized features: ', train_vect.shape)
                train_vect_df = pd.DataFrame(
                    train_vect.toarray(),
                    columns=vectorizer.get_feature_names())
            if os.path.exists(
                    pickle_dir.joinpath(emotion +
                                        '_c_train_feat_transform_df.pkl')):
                feat_transformed_train_df = pd.read_pickle(
                    pickle_dir.joinpath(emotion +
                                        '_c_train_feat_transform_df.pkl'))
                print(emotion + ' transformed features: ',
                      feat_transformed_train_df.shape)
            else:
                # If the files don't exist, exit the program with instructions
                print(
                    '\nRequired files do not exist.\n\n Please train the models first by running > Modelling.py'
                )
                sys.exit(1)
            features_df = pd.concat([train_vect_df, feat_transformed_train_df],
                                    axis=1)
            print(emotion + ' merged features: ', features_df.shape)

            #Resample the training data using SMOTE, Tomek links and SMOTETomek
            smote_X_train, smote_y_train = SMOTE(
                random_state=42, sampling_strategy='minority',
                n_jobs=-1).fit_resample(
                    features_df, preprocess_train_df['Affect Dimension'])
            tomek_X_train, tomek_y_train = TomekLinks(
                random_state=42, sampling_strategy='majority',
                n_jobs=-1).fit_resample(
                    features_df, preprocess_train_df['Affect Dimension'])
            smotetomek_X_train, smotetomek_y_train = SMOTETomek(
                random_state=42).fit_resample(
                    features_df, preprocess_train_df['Affect Dimension'])
            print('Data in SMOTE, Tomek links, SMOTETomek')
            print(smote_X_train.shape[0], tomek_X_train.shape[0],
                  smotetomek_X_train.shape[0])
            print(Counter(smote_y_train), Counter(tomek_y_train),
                  Counter(smotetomek_y_train))
Example #26
def Resampling(train_x, train_y, resampling_method):
    train_y.data = LabelEncoder().fit_transform(train_y.data)
    # summarize distribution

    # uncomment the following line to display a pie chart of the class distribution before resampling
    #plotGraphics.piePlot(train_y, "Before Resampling")

    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)

    if resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)

    if resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7,
                                           kind_sel='mode',
                                           n_jobs=-1)

    if resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7,
                                                   kind_sel='mode',
                                                   n_jobs=-1)

    if resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7,
                          kind_sel='mode',
                          allow_minority=True,
                          n_jobs=-1)

    if resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)

    if resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')

    if resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)

    if resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)

    # ---- OVER-SAMPLING ------ #
    if resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)

    if resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)

    if resampling_method == "RandomUnderSampler":
        resample = RandomOverSampler(random_state=42)

    if resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)

    # transform the dataset
    train_x.data, train_y.data = resample.fit_resample(train_x.data,
                                                       train_y.data)
Example #27
def test_tl_fit_single_class():
    """Test either if an error when there is a single class"""

    # Create the object
    tl = TomekLinks(random_state=RND_SEED)
    # Resample the data
    # Create a wrong y
    y_single_class = np.zeros((X.shape[0], ))
    assert_warns(RuntimeWarning, tl.fit, X, y_single_class)
Example #28
def test_tl_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    tl = TomekLinks(random_state=RND_SEED)
    tl.fit(X, Y)
    assert_raises(RuntimeError, tl.sample, np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
Example #29
def get_tomeklinks_under_sampled_dataset():
    tl = TomekLinks(return_indices=True, ratio='majority')
    X_tl, y_tl, id_tl = tl.fit_sample(X_train, y_train)

    print('Removed indexes:', id_tl)
    shuffle(X_tl)
    y_tl = X_tl[target]

    return X_tl, y_tl
Example #30
def get_binary_Tomek_Links_cleaned_data(id_df, X_df, y_df):
    tLinks = TomekLinks()
    a = y_df.iloc[:, 0]
    tLinks.fit_sample(X_df, y_df.iloc[:, 0])
    sample_indices = tLinks.sample_indices_
    id_df_cleaned = id_df.iloc[sample_indices]
    X_df_cleaned = X_df.iloc[sample_indices]
    y_df_cleaned = y_df.iloc[sample_indices]
    return id_df_cleaned, X_df_cleaned, y_df_cleaned