예제 #1
0
def tomek_links():
    """Plot the effect of ``TomekLinks`` on a small hand-made 2-D data set.

    Two panels are drawn side by side, one per sampler configuration
    (``sampling_strategy='auto'`` and ``sampling_strategy='all'``). Each
    panel shows the resampled minority and majority points and highlights
    the last minority point together with the second majority point, which
    the legend labels as the Tomek link. The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    # minority class
    X_minority = np.transpose([[1.4, 1.3, 1.15, 0.8, 0.8, 0.6, 0.55],
                               [0.4, 1.5, 1.7, 2.5, 2.0, 1.2, 0.55]])
    # majority class
    X_majority = np.transpose(
        [[2.1, 1.5, 2.12, 2.13, 2.14, 2.2, 2.3, 2.5, 2.45, 3.00, 3.1, 1.5],
         [1.5, 2.2, 2.1, 2.7, 0.9, 1.0, 1.4, 2.4, 2.9, 1.00, 2.0, 0.3]])

    # Both samplers receive the same input, so build it once.
    # Majority rows come first (label 0), minority rows after (label 1).
    X_all = np.vstack((X_majority, X_minority))
    y_all = np.array([0] * X_majority.shape[0] + [1] * X_minority.shape[0])

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

    panels = (
        (ax1, 'Removing only majority samples',
         TomekLinks(sampling_strategy='auto')),
        (ax2, 'Removing all samples',
         TomekLinks(sampling_strategy='all')),
    )
    for ax, title, sampler in panels:
        X_res, y_res = sampler.fit_resample(X_all, y_all)

        minority_mask = y_res == 1
        majority_mask = y_res == 0
        ax.scatter(X_res[minority_mask][:, 0],
                   X_res[minority_mask][:, 1],
                   label='Minority class',
                   s=200,
                   marker='+')
        ax.scatter(X_res[majority_mask][:, 0],
                   X_res[majority_mask][:, 1],
                   label='Majority class',
                   s=200,
                   marker='_')

        # highlight the samples of interest (taken from the original,
        # un-resampled arrays so they stay visible in both panels)
        ax.scatter([X_minority[-1, 0], X_majority[1, 0]],
                   [X_minority[-1, 1], X_majority[1, 1]],
                   label='Tomek link',
                   s=200,
                   alpha=0.3)

        ax.set_title(title)
        make_plot_despine(ax)
    fig.tight_layout()

    plt.show()
예제 #2
0
def undersample(X, y, bal_strategy):
	"""Under-sample ``(X, y)`` according to ``bal_strategy``.

	``"RANDOM"`` and ``"ALL"`` apply ``RandomUnderSampler``, ``"TOMEK"``
	applies ``TomekLinks`` and ``'NONE'`` returns the inputs unchanged.
	Any other value prints an error message and terminates the process via
	``sys.exit(1)``. The shapes of the inputs and of the result are printed.

	Returns the tuple ``(X_sampled, y_sampled)``.
	"""
	# Single-argument, parenthesised prints produce the same text on
	# Python 2 and Python 3.
	print('Shape of X:  %s' % (X.shape,))
	print('Shape of y_Train:  %s' % (y.shape,))

	if bal_strategy in ("RANDOM", "ALL"):
		# apply random under-sampling
		# NOTE(review): "ALL" has always been served by this branch; the old
		# `or bal_strategy == "ALL"` test on the TOMEK branch could never be
		# reached and was dropped. Confirm whether "ALL" was meant to chain
		# both samplers.
		X_sampled, y_sampled = RandomUnderSampler().fit_sample(X, y)
	elif bal_strategy == "TOMEK":
		# Apply Tomek Links cleaning
		X_sampled, y_sampled = TomekLinks().fit_sample(X, y)
	elif bal_strategy == 'NONE':
		X_sampled, y_sampled = X, y
	else:
		print('bal_strategy not in ALL, RANDOM, TOMEK, NONE')
		sys.exit(1)

	print('Shape of X_sampled:  %s' % (X_sampled.shape,))
	print('Shape of y_sampled:  %s' % (y_sampled.shape,))

	return (X_sampled, y_sampled)
	def workflow_no_oversampling(self, remove_tomeklinks, model_name):
		"""
		Run the classification workflow on the training data without oversampling.

		The training rows are re-ordered so that rows labelled 1 come first and
		rows labelled 0 follow; Tomek links are optionally removed before the
		model is built and evaluated on the test data.
		:param remove_tomeklinks: when equal to True, clean the training set with TomekLinks
		:param model_name: forwarded to self.build_model
		:return: tuple (f1_score, precision, recall) as returned by self.eval_model
		"""
		features, labels = self.pre_process(test_data=False)
		positives = features[labels == 1]
		negatives = features[labels == 0]
		print("debug, shape of inos_p_old, inos_n")
		print(positives.shape, negatives.shape)
		x_res = pd.concat([positives, negatives], axis=0)
		# labels rebuilt to match the positives-then-negatives row order
		y_res = np.concatenate(
			[np.ones(positives.shape[0]), np.zeros(negatives.shape[0])])
		print("debug, shape of training data:")
		print(x_res.shape)
		print(y_res.shape)
		if remove_tomeklinks == True:  # noqa: E712 - comparison kept exactly as before
			x_res, y_res = TomekLinks().fit_resample(x_res, y_res)
			print("shape of training data after removing tomek links:")
			print(x_res.shape)
			print(y_res.shape)
		model = self.build_model(x_res, y_res, model_name)
		# evaluate on the held-out data
		x_test, y_test_binary = self.pre_process(test_data=True)
		f1_score, precision, recall = self.eval_model(model, x_test, y_test_binary)

		return f1_score, precision, recall
예제 #4
0
def test_tl_fit_sample():
    """Check that fit_sample on the module-level (X, Y) yields the expected set."""
    sampler = TomekLinks(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # ground truth: the 17 rows expected after resampling, with their labels
    expected_X = np.array([
        [0.31230513, 0.1216318],
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.2184254, 0.24299982],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [0.97407872, 0.44454207],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
    ])
    expected_y = np.array(
        [1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def resampling(train_data, train_labels, resampling_type, resampling_stragey):
    """Flatten ``train_data`` per sample and resample it with the chosen method.

    Parameters
    ----------
    train_data : array
        Every axis after the first is flattened so that each sample becomes
        one row (the original 4-D case gives the same shape as before).
    train_labels : object exposing ``.values``
        ``train_labels.values`` is handed to the sampler as the target.
    resampling_type : str
        One of ``'SMOTE'``, ``'over_sampling'``, ``'under_sampling'``,
        ``'tomelinks'``, ``'near_miss_neighbors'``, ``'one_sided_selection'``.
    resampling_stragey : object
        Forwarded as ``sampling_strategy`` to the random over-/under-samplers
        and to ``TomekLinks``; not used by the other methods.

    Returns
    -------
    tuple
        ``(train_data_resampled, train_labels_resampled)``.

    Raises
    ------
    ValueError
        If ``resampling_type`` is not one of the supported names. Previously
        this surfaced as an ``UnboundLocalError`` at the return statement.
    """
    if resampling_type == 'SMOTE':
        chosen = SMOTE(random_state=42)
    elif resampling_type == 'over_sampling':
        chosen = RandomOverSampler(sampling_strategy=resampling_stragey)
    elif resampling_type == 'under_sampling':
        chosen = RandomUnderSampler(sampling_strategy=resampling_stragey)
    elif resampling_type == 'tomelinks':
        chosen = TomekLinks(sampling_strategy=resampling_stragey)
    elif resampling_type == 'near_miss_neighbors':
        chosen = NearMiss(version=1, n_neighbors=3)
    elif resampling_type == 'one_sided_selection':
        chosen = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
    else:
        raise ValueError(
            'Unknown resampling_type {!r}; expected one of: SMOTE, '
            'over_sampling, under_sampling, tomelinks, near_miss_neighbors, '
            'one_sided_selection'.format(resampling_type))

    # Samplers work on 2-D input: keep the sample axis, merge all the others.
    n_samples = train_data.shape[0]
    n_features = int(np.prod(train_data.shape[1:]))
    train_data_new = np.reshape(train_data, (n_samples, n_features))

    train_data_resampled, train_labels_resampled = chosen.fit_resample(
        train_data_new, train_labels.values)
    return train_data_resampled, train_labels_resampled
예제 #6
0
def test_tl_fit_sample():
    """Check that fit_sample on the module-level (X, Y) yields the expected set."""
    sampler = TomekLinks(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # ground truth: the 17 rows expected after resampling, with their labels
    expected_X = np.array([
        [0.31230513, 0.1216318],
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.2184254, 0.24299982],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [0.97407872, 0.44454207],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
    ])
    expected_y = np.array(
        [1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
예제 #7
0
def undersample_tomek_link(X,
                           y,
                           label='Tomek links under-sampling',
                           plot=False):
    """Resample ``(X, y)`` with TomekLinks and optionally plot a 2-D PCA view.

    ``X`` must expose ``.columns`` and ``y`` must expose ``.name``; both are
    reused to rebuild a DataFrame / Series from the resampled output. When
    ``plot`` equals True the resampled data is projected on two principal
    components and scattered per class; because the colour and marker lists
    hold two entries, at most two classes are drawn.

    Returns ``(X_tl, y_tl, tl, id_tl)``: the resampled frame and series, the
    fitted sampler and the third value returned by ``fit_sample``.
    """
    tl = TomekLinks(return_indices=True, ratio='all')
    X_arr, y_arr, id_tl = tl.fit_sample(X, y)
    X_tl = pd.DataFrame(X_arr, columns=X.columns)
    y_tl = pd.Series(y_arr, name=y.name)
    if plot == True:  # noqa: E712 - comparison kept exactly as before
        # project on the first two principal components for plotting
        projector = PCA(n_components=2)
        projected = pd.DataFrame(projector.fit_transform(X_tl))
        palette = ['#1F77B4', '#FF7F0E']
        shapes = ['o', 's']
        for cls, colour, shape in zip(np.unique(y_tl), palette, shapes):
            members = y_tl == cls
            plt.scatter(projected.loc[members, 0],  # pc 1
                        projected.loc[members, 1],  # pc 2
                        c=colour,
                        label=cls,
                        marker=shape)
        plt.title(label)
        plt.legend(loc='upper right')
        plt.show()
    return X_tl, y_tl, tl, id_tl
예제 #8
0
    def _validate_estimator(self):
        """
        Private function to validate the SMOTE and TomekLinks objects.

        Sets ``self.smote_`` and ``self.tomek_``: a user-supplied estimator is
        used as-is after a type check, otherwise a default one is created.
        :raises ValueError: if ``self.smote`` is set but is not a SMOTE
            instance, or ``self.tomek`` is set but is not a TomekLinks instance
        :return: None
        """

        if self.smote is None:
            self.smote_ = SMOTE(ratio=self.ratio,
                                k_neighbors=3,
                                random_state=self.random_state)
        elif isinstance(self.smote, SMOTE):
            self.smote_ = self.smote
        else:
            # Adjacent literals are concatenated verbatim, so the separating
            # space has to be part of the first literal.
            raise ValueError('smote needs to be a SMOTE object. '
                             'Got {} instead.'.format(type(self.smote)))

        if self.tomek is None:
            self.tomek_ = TomekLinks(ratio="all",
                                     random_state=self.random_state)
        elif isinstance(self.tomek, TomekLinks):
            self.tomek_ = self.tomek
        else:
            raise ValueError('tomek needs to be a TomekLinks object. '
                             'Got {} instead.'.format(type(self.tomek)))
    def _tomek_data(self):
        """Performs tomek links. Can not handle nominal values."""

        if self.cols_nominal.size > 0:
            print("Skipping Tomek Links. Cannot perform with raw categorical data. Create dummies to use.")
            return
        tl = TomekLinks()
        self.X_train, self.y_train = tl.fit_sample(self.X_train, self.y_train)
예제 #10
0
def test_tl_sample_wrong_X():
    """Sampling data that differs from the fitted data must raise
    RuntimeError."""

    fitted = TomekLinks(random_state=RND_SEED)
    fitted.fit(X, Y)

    # random 100 x 40 data with 50/50 labels, unrelated to the fitted (X, Y)
    sample_call = fitted.sample
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sample_call, other_X, other_y)
예제 #11
0
def get_tomeklinks_under_sampled_dataset():
    """Resample the module-level ``X_train`` / ``y_train`` with TomekLinks.

    Prints the third value returned by ``fit_sample`` and returns
    ``(X_tl, y_tl)`` where ``y_tl`` is read back from ``X_tl[target]``.
    """
    cleaner = TomekLinks(return_indices=True, ratio='majority')
    resampled = cleaner.fit_sample(X_train, y_train)
    X_tl, y_tl, id_tl = resampled

    print('Removed indexes:', id_tl)
    # NOTE(review): the return value of shuffle() is discarded; whether this
    # shuffles in place depends on which `shuffle` is imported - confirm.
    shuffle(X_tl)
    # NOTE(review): the resampled labels are replaced by X_tl[target]; this
    # presumes the target column is part of X_train - verify against caller.
    y_tl = X_tl[target]

    return X_tl, y_tl
예제 #12
0
def get_binary_Tomek_Links_cleaned_data(id_df, X_df, y_df):
    """Keep only the rows that TomekLinks retains, in all three frames.

    The sampler is fitted on ``X_df`` and the first column of ``y_df``; its
    ``sample_indices_`` are then used to select rows positionally from
    ``id_df``, ``X_df`` and ``y_df``.

    Returns ``(id_df_cleaned, X_df_cleaned, y_df_cleaned)``.
    """
    cleaner = TomekLinks()
    first_label_column = y_df.iloc[:, 0]
    cleaner.fit_sample(X_df, first_label_column)
    kept_rows = cleaner.sample_indices_
    return id_df.iloc[kept_rows], X_df.iloc[kept_rows], y_df.iloc[kept_rows]
def test_tl_sample_wrong_X():
    """Sampling data that differs from the fitted data must raise
    RuntimeError."""

    fitted = TomekLinks(random_state=RND_SEED)
    fitted.fit(X, Y)

    # random 100 x 40 data with 50/50 labels, unrelated to the fitted (X, Y)
    sample_call = fitted.sample
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sample_call, other_X, other_y)
예제 #14
0
    def sample_all(self, nb_data_to_load):
        """Load rows labelled '0', clean them with TomekLinks and save the result.

        Rows are read from every ``*.txt`` file under ``self.file_to_load``
        and from the fixed ``sampled_random700.txt`` file, converted to
        NumPy arrays, resampled with ``TomekLinks``, rounded column-wise and
        passed to ``self.save_data`` together with ``self.file_to_save``.

        ``nb_data_to_load`` is accepted but not used by this method.
        """

        X = []
        y = []

        # Fetch the files in the input folder one by one.
        file_list = glob(self.file_to_load + '/*.txt')

        for filepath in file_list:
            print(filepath)

            # Collect every tab-separated line whose first character is '0'.
            index = 0
            with open(filepath, mode='r') as f:
                for line in f:
                    if not line.startswith('0'):
                        continue
                    curr_data = line.strip().split('\t')
                    # Third field: keep only the character at position 1,
                    # converted to int.
                    curr_data[2] = int(curr_data[2][1])
                    X.append(curr_data[1:])
                    y.append(curr_data[0])
                    index += 1
                    if index % 1000 == 0:
                        print(index)

        # Load random_700 data

        random_700 = '/workspace/peter/sampled/sampled_random700.txt'

        with open(random_700, mode='r') as f:
            # The file has to be iterated; previously the stale `i` / `line`
            # left over from the loop above were tested exactly once.
            # NOTE(review): `random_index` is not defined in this method and
            # must be provided by the enclosing module - confirm.
            # NOTE(review): unlike above, the third field is not converted
            # here - confirm this file already holds numeric values.
            for i, line in enumerate(f):
                if i in random_index and line.startswith('0'):
                    curr_data = line.strip().split('\t')
                    X.append(curr_data[1:])
                    y.append(curr_data[0])

        # Possible type conversion required for sampling methods.
        # `np.int` was only an alias of the builtin `int` and has been
        # removed from NumPy, so the builtin is used directly.
        X_np = np.array(X).astype(np.float64)
        y_np = np.array(y).astype(int)

        # Undersampling with Tomeklinks
        undersampler = TomekLinks()

        X_resampled, y_resampled = undersampler.fit_resample(X_np, y_np)

        # Round datetime, stage and temperature
        X_resampled[:, 0] = X_resampled[:, 0].round()
        X_resampled[:, 1] = X_resampled[:, 1].round()
        X_resampled[:, 2] = X_resampled[:, 2].round(1)

        self.save_data(self.file_to_save, X_resampled, y_resampled)
예제 #15
0
def test_tl_fit_sample():
    """Compare fit_sample output with the arrays stored under ``data/``."""

    sampler = TomekLinks(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # reference arrays live in the `data` folder next to this file
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'tl_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'tl_y.npy'))
    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
예제 #16
0
def undersample_Tomeks_Link(df, debug=True):
    """Resample a DataFrame with TomekLinks, treating the last column as target.

    All columns but the last are the features; the last column is cast to
    ``int`` and used as the label. When ``debug`` is truthy the class counts
    before and after resampling are printed.

    Returns a new DataFrame with the original column names.
    """
    features = df.values[:, :-1]
    target = df.values[:, -1].astype(int)
    if debug:
        print('Original dataset shape %s' % Counter(target))
    X_res, y_res = TomekLinks().fit_resample(features, target)
    feature_names = df.columns[:-1]
    target_name = df.columns[-1]
    df_resampled = pd.DataFrame(X_res, columns=feature_names)
    # append the label column at the end, under its original name
    df_resampled.insert(len(df_resampled.columns), target_name, y_res)
    if debug:
        print('Resampled dataset shape %s' % Counter(y_res))
    return df_resampled
예제 #17
0
def imbalanced_resampling(method, x, y):
    """Resample ``(x, y)`` with the sampler selected by ``method``.

    ``"under"`` uses ``TomekLinks``, ``"over"`` uses ``SMOTE`` and
    ``"combined"`` uses ``SMOTETomek``. Any other value returns the inputs
    unchanged.

    Returns the tuple ``(X, Y)``.
    """
    if method == "under":
        sampling = TomekLinks(sampling_strategy="auto")
    elif method == "over":
        # Use `sampling_strategy` here too: the function already relies on
        # it for TomekLinks, and `ratio` is its deprecated predecessor.
        sampling = SMOTE(sampling_strategy='auto')
    elif method == "combined":
        sampling = SMOTETomek()
    else:
        return x, y

    # `fit_resample` is the method name that goes with `sampling_strategy`.
    X, Y = sampling.fit_resample(x, y)
    return X, Y
def test_tl_fit_sample():
    """Compare fit_sample output with the arrays stored under ``data/``."""

    sampler = TomekLinks(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # reference arrays live in the `data` folder next to this file
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'tl_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'tl_y.npy'))
    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
예제 #19
0
def getData(splitData=True, useImbalancer=False, useStratify=False):
    """Load ``DataSource/binary.csv`` and return feature / target arrays.

    The target is the first column. The features are every column except
    the first and the last, followed by the dummy-encoded ``'rank'`` column
    (first level dropped).

    Parameters
    ----------
    splitData : bool
        When truthy, split 80/20 with ``random_state=101`` and return
        ``(X_train, X_test, y_train, y_test)``; otherwise return ``(X, y)``.
    useImbalancer : bool
        When truthy (and ``splitData`` is truthy), clean the training split
        with ``TomekLinks(sampling_strategy='majority')``.
    useStratify : bool
        When truthy, stratify the split on the target.

    Target arrays are returned flattened to one dimension.
    """
    data = pd.read_csv(filepath_or_buffer="DataSource/binary.csv")
    X = data.values[:, 1:-1]
    rank_dummy = pd.get_dummies(data['rank'], drop_first=True).to_numpy()
    X = np.concatenate((X, rank_dummy), axis=1)
    y = data.values[:, 0].reshape(-1, 1)

    if not splitData:
        return X, y.ravel()

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=101,
        shuffle=True,
        stratify=y if useStratify else None)
    if useImbalancer:
        tl = TomekLinks(sampling_strategy='majority')
        # `fit_resample` is the method name that goes with
        # `sampling_strategy`.
        X_train, y_train = tl.fit_resample(X=X_train, y=y_train)
    return X_train, X_test, y_train.ravel(), y_test.ravel()
예제 #20
0
def trainModelWithResults(model, X, y, rd_state=None, autoscale=1, usetomeklinks=1):
    """Split, optionally scale and clean, fit ``model`` and print its scores.

    The data is split 75/25, stratified on ``y``. With ``autoscale == 1`` a
    ``StandardScaler`` fitted on the training part is applied to both parts;
    with ``usetomeklinks == 1`` the training part is resampled with
    ``TomekLinks``. The confusion matrix and classification report for the
    test part are printed. Nothing is returned.
    """
    # stratify the split because we have unbalanced target
    parts = train_test_split(
        X, y, test_size=0.25, stratify=y, random_state=rd_state)
    X_train, X_test, y_train, y_test = parts
    if autoscale == 1:
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
    if usetomeklinks == 1:
        cleaner = TomekLinks(return_indices=False)
        X_train, y_train = cleaner.fit_sample(X_train, y_train)
    fitted_model = model.fit(X_train, y_train)
    predicted = fitted_model.predict(X_test)
    print(confusion_matrix(y_test, predicted))
    print(classification_report(y_test, predicted))
예제 #21
0
def tomeklinks(X,
               y,
               visualize=False,
               pca2d=True,
               pca3d=True,
               tsne=True,
               pie_evr=True):
    """Resample ``(X, y)`` with TomekLinks and optionally visualise the result.

    When ``visualize`` equals True, ``hist_over_and_undersampling`` and
    ``pca_general`` are called on the resampled data; ``pca2d``, ``pca3d``
    and ``pie_evr`` are forwarded to ``pca_general``. ``tsne`` is accepted
    but not used.

    Returns ``(X_res, y_res)``.
    """
    cleaner = TomekLinks()
    resampled = cleaner.fit_resample(X, y)
    X_res, y_res = resampled
    if visualize == True:  # noqa: E712 - comparison kept exactly as before
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
예제 #22
0
def test_tl_fit():
    """Fitting must record the minority/majority classes and their counts."""

    sampler = TomekLinks(random_state=RND_SEED)
    sampler.fit(X, Y)

    # class 0 is expected to be the minority, class 1 the majority
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    # per-class sample counts
    for class_label, expected_count in ((0, 500), (1, 4500)):
        assert_equal(sampler.stats_c_[class_label], expected_count)
def test_multiclass_error():
    """Fitting on a non-binary target must emit a UserWarning."""

    # continuous case
    continuous_target = np.linspace(0, 1, 20)
    sampler = TomekLinks(random_state=RND_SEED)
    assert_warns(UserWarning, sampler.fit, X, continuous_target)

    # multiclass case: three classes with 3, 7 and 10 samples
    multiclass_target = np.array([0] * 3 + [1] * 7 + [2] * 10)
    sampler = TomekLinks(random_state=RND_SEED)
    assert_warns(UserWarning, sampler.fit, X, multiclass_target)
예제 #24
0
    def oversample(self, train, labels):
        """
            Over-samples the data with SMOTE, then resamples the result
            with TomekLinks. Returns (train_res, labels_res).
        """
        # step 1: SMOTE over-sampling
        smote = SMOTE(random_state=2)
        smote_X, smote_y = smote.fit_sample(train, labels)

        # step 2: clear noise points that emerged from oversampling
        cleaner = TomekLinks(random_state=42)
        train_res, labels_res = cleaner.fit_sample(smote_X, smote_y)

        return train_res, labels_res
def test_tl_fit():
    """Fitting must record the minority/majority classes and their counts."""

    sampler = TomekLinks(random_state=RND_SEED)
    sampler.fit(X, Y)

    # class 0 is expected to be the minority, class 1 the majority
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    # per-class sample counts
    for class_label, expected_count in ((0, 7), (1, 13)):
        assert_equal(sampler.stats_c_[class_label], expected_count)
def Tomek_us(X_train, Y_train, seed, sampling_strategy):
    """Resample the training data with TomekLinks, then shuffle it.

    The sorted class counts are printed before and after resampling. The
    resampled data is passed through ``shuffle_dataset`` with ``seed``.

    Returns ``(X_train_resampled, Y_train_resampled)``.
    """
    cleaner = TomekLinks(random_state=seed,
                         n_jobs=-1,
                         sampling_strategy=sampling_strategy)
    counts_before = sorted(Counter(Y_train).items())
    print('Before Tomek undersampling : ', counts_before)
    X_clean, Y_clean = cleaner.fit_resample(X_train, Y_train)
    counts_after = sorted(Counter(Y_clean).items())
    print('After Tomek undersampling : ', counts_after)

    X_train_resampled, Y_train_resampled = shuffle_dataset(
        X_clean, Y_clean, seed)

    return X_train_resampled, Y_train_resampled
def test_tl_fit_sample_with_indices():
    """Compare fit_sample output, indices included, with the stored arrays."""

    sampler = TomekLinks(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # reference arrays live in the `data` folder next to this file
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'tl_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'tl_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'tl_idx.npy'))
    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
예제 #28
0
def test_tl_fit_sample_with_indices():
    """Compare fit_sample output, indices included, with the stored arrays."""

    sampler = TomekLinks(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # reference arrays live in the `data` folder next to this file
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'tl_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'tl_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'tl_idx.npy'))
    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def undersampling(df):
    """Return the rows of ``df`` whose positions TomekLinks reports as kept.

    ``add2list`` fills the feature and label lists from ``df``; the indices
    returned by ``fit_sample`` are turned into a boolean row mask. The
    result has a fresh 0-based index.
    """
    cleaner = TomekLinks(ratio='all', n_jobs=16, return_indices=True)
    features = []
    labels = []
    add2list(df, features, labels)

    features, labels, kept_positions = cleaner.fit_sample(features, labels)

    # boolean mask over the rows of df: True for every reported position
    keep_row = [False] * len(df)
    for position in kept_positions:
        keep_row[position] = True

    return df[keep_row].reset_index(drop=True)
예제 #30
0
def dataset_sampling(X,y):
	"""Resample ``(X, y)`` with ``TomekLinks(ratio='majority')``.

	Returns ``(X_res, y_res)``.
	"""
	tl = TomekLinks(return_indices=True, ratio='majority')
	# With return_indices=True a third value is returned; it is not needed.
	X_res, y_res, _ = tl.fit_sample(X, y)
	return X_res,y_res
예제 #31
0
def _smote_tomek_split(switch, non_switch):
    """Resample two classes with SMOTETomek and return them split by label again."""
    sampler = SMOTETomek(sampling_strategy='all',
                         smote=SMOTE(n_jobs=4),
                         tomek=TomekLinks(n_jobs=4))
    X = np.concatenate((switch, non_switch))
    # label 0 = switch rows, label 1 = non-switch rows
    y = np.concatenate(
        (np.zeros(switch.shape[0]), np.ones(non_switch.shape[0])))
    X_res, y_res = sampler.fit_resample(X, y)

    switch_rows = [row for row, label in zip(X_res, y_res) if label == 0]
    other_rows = [row for row, label in zip(X_res, y_res) if label != 0]
    return np.array(switch_rows), np.array(other_rows)


def resample():
    """Resample the stored switch / non-switch arrays and save the results.

    The four ``data/*_w_64_f_20.npy`` arrays are loaded, the train pair and
    the test pair are each resampled with SMOTETomek, and each class is
    written back to ``data/*_w_64_f_20_samp.npy``. Returns ``None``.
    """
    test_switch = np.load('data/test_switch_w_64_f_20.npy')
    test_non_switch = np.load('data/test_non_switch_w_64_f_20.npy')
    train_switch = np.load('data/train_switch_w_64_f_20.npy')
    train_non_switch = np.load('data/train_non_switch_w_64_f_20.npy')

    print('Beginning train resample...')
    train_switch, train_non_switch = _smote_tomek_split(
        train_switch, train_non_switch)
    np.save('data/train_switch_w_64_f_20_samp.npy', train_switch)
    np.save('data/train_non_switch_w_64_f_20_samp.npy', train_non_switch)

    # The test sampler was previously bound to a misspelled name
    # (`resampe_test`), so this step raised NameError.
    # NOTE(review): resampling the test split changes the evaluation
    # distribution - confirm this is intended.
    print('Beginning test resample...')
    test_switch, test_non_switch = _smote_tomek_split(
        test_switch, test_non_switch)
    np.save('data/test_switch_w_64_f_20_samp.npy', test_switch)
    np.save('data/test_non_switch_w_64_f_20_samp.npy', test_non_switch)
    return
예제 #32
0
def under_sample_data(matrix, y_train):
    """Resample with TomekLinks, then with EditedNearestNeighbours.

    Both samplers use ``sampling_strategy='majority'``. The class counts are
    logged through ``add_to_log`` before resampling and after each step.

    Returns ``(X_res, y_res)`` from the second step.
    """
    add_to_log('Under Sampling')
    add_to_log('Sample distribution %s' % Counter(y_train))

    # step 1: clean proximity samples using TomekLinks
    tomek = TomekLinks(random_state=11, sampling_strategy='majority', n_jobs=-1)
    X_tomek, y_tomek = tomek.fit_resample(matrix, y_train)
    add_to_log('TomekLinks distribution %s' % Counter(y_tomek))

    # step 2: edited nearest neighbours on the output of step 1
    enn = EditedNearestNeighbours(random_state=7,
                                  sampling_strategy='majority',
                                  n_jobs=-1)
    X_res, y_res = enn.fit_resample(X_tomek, y_tomek)
    add_to_log('EditedNearestNeighbours distribution %s' % Counter(y_res))

    return X_res, y_res
예제 #33
0
def test_tl_sample_wt_fit():
    """Calling sample() on an unfitted TomekLinks must raise
    RuntimeError."""

    unfitted = TomekLinks(random_state=RND_SEED)
    sample_call = unfitted.sample
    assert_raises(RuntimeError, sample_call, X, Y)
예제 #34
0
def sampler(name, ratio, random_state=0, return_indices=True, **kwargs):
    """Build an under-sampler from its short name.

    Parameters
    ----------
    name : str
        ``"rus"`` (RandomUnderSampler), ``"nm"`` (NearMiss), ``"enn"``
        (EditedNearestNeighbours), ``"renn"``
        (RepeatedEditedNearestNeighbours), ``"allknn"`` (AllKNN) or ``"tl"``
        (TomekLinks).
    ratio : object
        Forwarded as ``ratio`` to ``"rus"`` and ``"nm"`` only.
    random_state, return_indices :
        Forwarded to every sampler.
    **kwargs :
        Extra keyword arguments passed to the sampler constructor.

    Returns
    -------
    The constructed sampler.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported short names.
    """
    if name == "rus":
        return RandomUnderSampler(
            ratio=ratio,
            return_indices=return_indices,
            random_state=random_state,
            **kwargs
        )
    if name == "nm":
        return NearMiss(
            ratio=ratio,
            return_indices=return_indices,
            random_state=random_state,
            **kwargs
        )
    if name == "enn":
        return EditedNearestNeighbours(return_indices=return_indices,
                                       random_state=random_state,
                                       **kwargs)
    if name == "renn":
        return RepeatedEditedNearestNeighbours(
            return_indices=return_indices, random_state=random_state, **kwargs)
    if name == "allknn":
        return AllKNN(return_indices=return_indices,
                      random_state=random_state,
                      **kwargs)
    if name == "tl":
        return TomekLinks(return_indices=return_indices,
                          random_state=random_state,
                          **kwargs)
    # Same exception type as before, now with a message naming the bad value.
    raise ValueError(
        "Unknown sampler name {!r}; expected one of "
        "'rus', 'nm', 'enn', 'renn', 'allknn', 'tl'.".format(name))
예제 #35
0
def smote_tomek(x_train, y_train):
    """Over-sample with BorderlineSMOTE, then resample with TomekLinks.

    The over-sampler uses ``sampling_strategy=0.5`` and ``random_state=0``;
    TomekLinks uses ``sampling_strategy='majority'``.

    Returns ``(X, y)`` from the TomekLinks step.
    """
    smote = BorderlineSMOTE(sampling_strategy=0.5,
                            random_state=0,
                            k_neighbors=5,
                            m_neighbors=10,
                            n_jobs=-1,
                            kind='borderline-1')
    X_over, y_over = smote.fit_resample(x_train, y_train)

    cleaner = TomekLinks(sampling_strategy='majority', n_jobs=-1)
    X, y = cleaner.fit_resample(X_over, y_over)
    return X, y
예제 #36
0
def test_tl_fit_sample_with_indices():
    """Check fit_sample output, indices included, against inline ground truth."""
    sampler = TomekLinks(return_indices=True)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # ground truth: the 17 expected rows, their labels and their indices
    expected_X = np.array([
        [0.31230513, 0.1216318],
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.2184254, 0.24299982],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [0.97407872, 0.44454207],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
    ])
    expected_y = np.array(
        [1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0])
    expected_idx = np.array(
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16, 17, 18, 19])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
sampling_strategy = 'not majority'

ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(X, y)
print('Information of the iris data set after making it '
      'balanced by over-sampling: \n sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# With **cleaning method**, the number of samples in each class will not be
# equalized even if targeted.

sampling_strategy = 'not minority'
# Pass the strategy by keyword, like RandomOverSampler above, so the call
# does not depend on the position of the parameter in the constructor.
tl = TomekLinks(sampling_strategy=sampling_strategy)
X_res, y_res = tl.fit_resample(X, y)
print('Information of the iris data set after making it '
      'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# ``sampling_strategy`` as a ``dict``
# ...................................
#
# When ``sampling_strategy`` is a ``dict``, the keys correspond to the targeted
# classes. The values correspond to the desired number of samples for each
# targeted class. This is working for both **under- and over-sampling**
# algorithms but not for the **cleaning algorithms**. Use a ``list`` instead.
예제 #38
0
def test_deprecation_random_state():
    """Resampling with ``random_state`` set must emit the 0.4 deprecation warning."""
    sampler = TomekLinks(random_state=0)
    expected_message = "'random_state' is deprecated from 0.4"
    with warns(DeprecationWarning, match=expected_message):
        sampler.fit_resample(X, Y)
           label='Majority class', s=200, marker='+')

# highlight the samples of interest
ax.scatter([X_minority[-1, 0], X_majority[1, 0]],
           [X_minority[-1, 1], X_majority[1, 1]],
           label='Tomek link', s=200, alpha=0.3)
ax.set_title('Illustration of a Tomek link')
make_plot_despine(ax)
fig.tight_layout()

###############################################################################
# We can run the ``TomekLinks`` sampling to remove the corresponding
# samples. If ``ratio='auto'`` only the sample from the majority class will be
# removed. If ``ratio='all'`` both samples will be removed.

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

ax_arr = (ax1, ax2)
title_arr = ('Removing only majority samples',
             'Removing all samples')
# Both samplers are fitted on the same data, so build it once:
# minority rows first (label 0), majority rows after (label 1).
X_all = np.vstack((X_minority, X_majority))
y_all = np.array([0] * X_minority.shape[0] +
                 [1] * X_majority.shape[0])
for ax, title, sampler in zip(ax_arr,
                              title_arr,
                              [TomekLinks(ratio='auto', random_state=0),
                               TomekLinks(ratio='all', random_state=0)]):
    X_res, y_res = sampler.fit_sample(X_all, y_all)
    ax.scatter(X_res[y_res == 0][:, 0], X_res[y_res == 0][:, 1],
               label='Minority class', s=200, marker='_')
예제 #40
0
from imblearn.under_sampling import TomekLinks

# Generate the dataset: 5000 samples, 20 features, two classes with
# weights 0.1 / 0.9
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit the PCA on X and project X into a 2D feature space for plotting
X_vis = pca.fit_transform(X)

# Apply Tomek Links cleaning
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_sample(X, y)
# Project the resampled data with the PCA already fitted on the original X,
# so both plots share the same axes
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: the original data, one scatter call per class
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Right panel: class 0 of the resampled data
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
예제 #41
0
from imblearn.under_sampling import TomekLinks

print(__doc__)

# Synthetic two-class data: 500 points for class 0 (scaled by 1.5) and
# 50 points for class 1 (scaled by 0.5 and shifted by [2, 2])
rng = np.random.RandomState(0)
n_samples_1 = 500
n_samples_2 = 50
X_syn = np.r_[1.5 * rng.randn(n_samples_1, 2),
              0.5 * rng.randn(n_samples_2, 2) + [2, 2]]
y_syn = np.array([0] * (n_samples_1) + [1] * (n_samples_2))
X_syn, y_syn = shuffle(X_syn, y_syn)
# NOTE(review): the train/test split is not used in the visible part of this
# script; the resampling below runs on the full X_syn / y_syn
X_syn_train, X_syn_test, y_syn_train, y_syn_test = train_test_split(X_syn,
                                                                    y_syn)

# remove Tomek links
tl = TomekLinks(return_indices=True)
X_resampled, y_resampled, idx_resampled = tl.fit_sample(X_syn, y_syn)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Row positions of X_syn that are absent from idx_resampled
idx_samples_removed = np.setdiff1d(np.arange(X_syn.shape[0]),
                                   idx_resampled)
# Boolean mask over the resampled rows: True where the label is 0
idx_class_0 = y_resampled == 0
plt.scatter(X_resampled[idx_class_0, 0], X_resampled[idx_class_0, 1],
            alpha=.8, label='Class #0')
plt.scatter(X_resampled[~idx_class_0, 0], X_resampled[~idx_class_0, 1],
            alpha=.8, label='Class #1')
plt.scatter(X_syn[idx_samples_removed, 0], X_syn[idx_samples_removed, 1],
            alpha=.8, label='Removed samples')