def tomek_links():
    """Plot a toy data set after two Tomek-link removal strategies.

    A hand-made two-class data set is resampled once with
    ``TomekLinks(sampling_strategy='auto')`` and once with
    ``TomekLinks(sampling_strategy='all')``. Each result is drawn in its own
    subplot, the same pair of points is highlighted as the Tomek link in both
    panels, and the figure is displayed with ``plt.show()``.
    """
    # minority class
    X_minority = np.transpose([[1.4, 1.3, 1.15, 0.8, 0.8, 0.6, 0.55],
                               [0.4, 1.5, 1.7, 2.5, 2.0, 1.2, 0.55]])
    # majority class
    X_majority = np.transpose(
        [[2.1, 1.5, 2.12, 2.13, 2.14, 2.2, 2.3, 2.5, 2.45, 3.00, 3.1, 1.5],
         [1.5, 2.2, 2.1, 2.7, 0.9, 1.0, 1.4, 2.4, 2.9, 1.00, 2.0, 0.3]])

    # Both samplers see the same input, so build it once outside the loop.
    X = np.vstack((X_majority, X_minority))
    y = np.array([0] * X_majority.shape[0] + [1] * X_minority.shape[0])

    # Coordinates of the pair highlighted as the Tomek link in every panel.
    link_x = [X_minority[-1, 0], X_majority[1, 0]]
    link_y = [X_minority[-1, 1], X_majority[1, 1]]

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    panels = (
        ('Removing only majority samples',
         TomekLinks(sampling_strategy='auto')),
        ('Removing all samples', TomekLinks(sampling_strategy='all')),
    )
    for ax, (title, sampler) in zip(axes, panels):
        X_res, y_res = sampler.fit_resample(X, y)
        minority = X_res[y_res == 1]
        majority = X_res[y_res == 0]
        ax.scatter(minority[:, 0], minority[:, 1],
                   label='Minority class', s=200, marker='+')
        ax.scatter(majority[:, 0], majority[:, 1],
                   label='Majority class', s=200, marker='_')
        # highlight the samples of interest
        ax.scatter(link_x, link_y, label='Tomek link', s=200, alpha=0.3)
        ax.set_title(title)
        make_plot_despine(ax)
    fig.tight_layout()
    plt.show()
# getData: load "DataSource/binary.csv" and return feature/target arrays.
#
# From the visible code: the features are columns 1..-2 of the CSV plus the
# dummy-encoded 'rank' column (first level dropped); the target is column 0.
# With splitData=True an 80/20 train/test split is made (random_state=101,
# optionally stratified on the target when useStratify=True), and with
# useImbalancer=True as well the training part is cleaned with
# TomekLinks(sampling_strategy='majority'). Returns
# (X_train, X_test, y_train, y_test) when splitData is true, otherwise
# (X_train, y_train); the targets are flattened with ravel().
#
# NOTE(review): this definition is collapsed onto one line and contains
# "******" redaction marks, so part of the body (apparently further Tomek
# passes and their debug prints) is not recoverable here. Restore it from
# version control before editing the logic.
# NOTE(review): `global standard_scaler` is declared but no visible statement
# uses it; confirm whether the hidden part does.
# NOTE(review): `fit_sample` is the legacy imbalanced-learn spelling of
# `fit_resample`; confirm the installed version still provides it.
def getData(splitData=True, useImbalancer=False, useStratify=False): global standard_scaler data = pd.read_csv(filepath_or_buffer="DataSource/binary.csv") X = data.values[:, 1:-1] rank_dummy = pd.get_dummies(data['rank'], drop_first=True).to_numpy() X = np.concatenate((X, rank_dummy), axis=1) y = data.values[:, 0].reshape(-1, 1) if useStratify: stratify = y else: stratify = None if splitData: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101, shuffle=True, stratify=stratify) else: X_train = X y_train = y if useImbalancer and splitData: tl = TomekLinks(sampling_strategy='majority') X_train, y_train = tl.fit_sample(X=X_train, y=y_train) # print("After 1st pass: "******"After 2nd pass: "******"After 3rd pass: "******"After 4th pass: "******"After 5th pass: "******"After 6th pass: "******"y_train\n", np.asarray((unique, counts)).T) if splitData: unique, counts = np.unique(y_test, return_counts=True) # print("y_test\n", np.asarray((unique, counts)).T) if splitData: return X_train, X_test, y_train.ravel(), y_test.ravel() else: return X_train, y_train.ravel()
def test_multiclass_error():
    """Check that fitting on a non-binary target emits a ``UserWarning``."""
    non_binary_targets = (
        np.linspace(0, 1, 20),  # continuous case
        np.array([0] * 3 + [1] * 7 + [2] * 10),  # multiclass case
    )
    for target in non_binary_targets:
        # A fresh sampler per case, as each case is an independent check.
        sampler = TomekLinks(random_state=RND_SEED)
        assert_warns(UserWarning, sampler.fit, X, target)
def _resample_and_save(sampler, switch, non_switch, switch_path,
                       non_switch_path):
    """Resample one switch/non-switch pair and save each class to its file."""
    X = np.concatenate((switch, non_switch))
    # Label 0 marks rows from the "switch" array, label 1 the "non-switch" ones.
    y = np.concatenate(
        (np.zeros(switch.shape[0]), np.ones(non_switch.shape[0])))
    X_res, y_res = sampler.fit_resample(X, y)
    is_switch = y_res == 0
    np.save(switch_path, X_res[is_switch])
    np.save(non_switch_path, X_res[~is_switch])


def resample():
    """Resample the train and test sets with SMOTETomek and save the results.

    Loads the four ``data/*_w_64_f_20.npy`` arrays, resamples the train pair
    and the test pair independently (each with its own ``SMOTETomek``
    instance), and writes every class to the matching ``*_samp.npy`` file.

    Returns:
        None. The results are written to disk.
    """
    test_switch = np.load('data/test_switch_w_64_f_20.npy')
    test_non_switch = np.load('data/test_non_switch_w_64_f_20.npy')
    train_switch = np.load('data/train_switch_w_64_f_20.npy')
    train_non_switch = np.load('data/train_non_switch_w_64_f_20.npy')

    def make_sampler():
        return SMOTETomek(sampling_strategy='all',
                          smote=SMOTE(n_jobs=4),
                          tomek=TomekLinks(n_jobs=4))

    print('Beginning train resample...')
    _resample_and_save(make_sampler(), train_switch, train_non_switch,
                       'data/train_switch_w_64_f_20_samp.npy',
                       'data/train_non_switch_w_64_f_20_samp.npy')

    # The sampler used to be bound to the misspelled name `resampe_test`, so
    # this step failed with NameError once the train set had been written.
    print('Beginning test resample...')
    _resample_and_save(make_sampler(), test_switch, test_non_switch,
                       'data/test_switch_w_64_f_20_samp.npy',
                       'data/test_non_switch_w_64_f_20_samp.npy')
def test_tl_sample_wt_fit():
    """Check that sampling with an unfitted sampler raises ``RuntimeError``."""
    unfitted = TomekLinks(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted.sample, X, Y)
def test_tl_fit_sample():
    """Check ``fit_sample`` against the stored expected samples and labels."""
    expected_X = np.array([
        [0.31230513, 0.1216318], [0.68481731, 0.51935141],
        [1.34192108, -0.13367336], [0.62366841, -0.21312976],
        [1.61091956, -0.40283504], [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342], [0.2184254, 0.24299982],
        [0.61472253, -0.82309052], [0.19893132, -0.47761769],
        [0.97407872, 0.44454207], [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695], [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653], [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
    ])
    expected_y = np.array(
        [1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0])

    # Resample the data
    sampler = TomekLinks(random_state=RND_SEED)
    actual_X, actual_y = sampler.fit_sample(X, Y)

    assert_array_equal(actual_X, expected_X)
    assert_array_equal(actual_y, expected_y)
def test_validate_estimator_init():
    """Check ``SMOTETomek`` when SMOTE and TomekLinks objects are passed in."""
    expected_X = np.array([
        [0.20622591, 0.0582794], [0.68481731, 0.51935141],
        [1.34192108, -0.13367336], [0.62366841, -0.21312976],
        [1.61091956, -0.40283504], [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342], [0.61472253, -0.82309052],
        [0.19893132, -0.47761769], [0.97407872, 0.44454207],
        [1.40301027, -0.83648734], [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049], [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929], [1.79580611, -0.02219234],
        [0.38307743, -0.05670439], [0.93976473, -0.06570176],
        [0.70319159, -0.02571668], [0.75052536, -0.19246517],
    ])
    expected_y = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0])

    # Build the combined sampler from explicit SMOTE and Tomek objects.
    combined = SMOTETomek(smote=SMOTE(random_state=RND_SEED),
                          tomek=TomekLinks(random_state=RND_SEED),
                          random_state=RND_SEED)
    actual_X, actual_y = combined.fit_sample(X, Y)

    assert_array_almost_equal(actual_X, expected_X)
    assert_array_equal(actual_y, expected_y)
def _validate_estimator(self):
    """Validate the SMOTE and TomekLinks estimators and store them on self.

    ``self.smote`` and ``self.tomek`` are used as given when they are
    instances of ``SMOTE`` / ``TomekLinks``. When either is ``None`` a default
    estimator is built from ``self.ratio`` and ``self.random_state``. The
    results are stored as ``self.smote_`` and ``self.tomek_``.

    Raises:
        ValueError: If ``self.smote`` is not a ``SMOTE`` instance or
            ``self.tomek`` is not a ``TomekLinks`` instance.
    """
    if self.smote is None:
        self.smote_ = SMOTE(ratio=self.ratio,
                            k_neighbors=3,
                            random_state=self.random_state)
    elif isinstance(self.smote, SMOTE):
        self.smote_ = self.smote
    else:
        # The two literals are concatenated, so the first needs the space.
        raise ValueError('smote needs to be a SMOTE object. '
                         'Got {} instead.'.format(type(self.smote)))

    if self.tomek is None:
        self.tomek_ = TomekLinks(ratio="all",
                                 random_state=self.random_state)
    elif isinstance(self.tomek, TomekLinks):
        self.tomek_ = self.tomek
    else:
        raise ValueError('tomek needs to be a TomekLinks object. '
                         'Got {} instead.'.format(type(self.tomek)))
def handleImbalancedDatset(method, x, y, n_neighbors_ver3=None):
    """Resample ``(x, y)`` with the imbalance-handling method named *method*.

    Args:
        method: Case-insensitive method name. One of "smote", "adasyn",
            "enn", "cnn", "oss", "nm" or "smotetomek".
        x: Feature matrix passed to the sampler's ``fit_resample``.
        y: Target vector passed to the sampler's ``fit_resample``.
        n_neighbors_ver3: Value for ``NearMiss(n_neighbors_ver3=...)`` when
            *method* is "nm". When omitted, the module-level name ``n`` is
            used, as in the original call.

    Returns:
        ``(X_resampled, Y_resampled)``, or two empty lists when *method* is
        not a recognised name.
    """
    seed = 123
    # Lambdas keep construction lazy: only the chosen sampler is built.
    factories = {
        "smote": lambda: SMOTE(sampling_strategy='auto', random_state=seed),
        "adasyn": lambda: ADASYN(),
        "enn": lambda: EditedNearestNeighbours(),
        "cnn": lambda: CondensedNearestNeighbour(),
        "oss": lambda: OneSidedSelection(),
        "nm": lambda: NearMiss(
            version=3,
            n_neighbors_ver3=(n if n_neighbors_ver3 is None
                              else n_neighbors_ver3)),
        "smotetomek": lambda: SMOTETomek(
            tomek=TomekLinks(sampling_strategy='majority')),
    }
    # `str` has no `lowercase` attribute; `lower()` is the intended call.
    factory = factories.get(method.lower())
    if factory is None:
        return [], []
    X_resampled, Y_resampled = factory().fit_resample(x, y)
    return X_resampled, Y_resampled
def workflow_no_oversampling(self, remove_tomeklinks, model_name):
    """Train and evaluate a classifier without any over-sampling.

    The pre-processed training data is regrouped so that positive rows come
    first, optionally cleaned with ``TomekLinks``, and passed to
    ``self.build_model``. The model is then scored on the pre-processed test
    data with ``self.eval_model``.

    :param remove_tomeklinks: when equal to ``True``, Tomek links are removed
        from the training data before the model is built
    :param model_name: forwarded to ``self.build_model``
    :return: the ``(f1_score, precision, recall)`` triple from
        ``self.eval_model``
    """
    features, labels = self.pre_process(test_data=False)
    positives = features[labels == 1]
    negatives = features[labels == 0]
    print("debug, shape of inos_p_old, inos_n")
    print(positives.shape, negatives.shape)

    x_res = pd.concat([positives, negatives], axis=0)
    # Targets follow the same order: all positives, then all negatives.
    y_res = np.concatenate(
        [np.ones(positives.shape[0]), np.zeros(negatives.shape[0])])
    print("debug, shape of training data:")
    print(x_res.shape)
    print(y_res.shape)

    if remove_tomeklinks == True:  # noqa: E712 - equality test kept as-is
        x_res, y_res = TomekLinks().fit_resample(x_res, y_res)
        print("shape of training data after removing tomek links:")
        print(x_res.shape)
        print(y_res.shape)

    model = self.build_model(x_res, y_res, model_name)

    # evaluates performance
    x_test, y_test_binary = self.pre_process(test_data=True)
    f1, precision, recall = self.eval_model(model, x_test, y_test_binary)
    return f1, precision, recall
def undersample(X, y, bal_strategy):
    """Under-sample ``(X, y)`` according to *bal_strategy*.

    Args:
        X: Feature matrix (anything with a ``shape`` attribute).
        y: Target vector (anything with a ``shape`` attribute).
        bal_strategy: "RANDOM" or "ALL" for random under-sampling, "TOMEK"
            for Tomek-link cleaning, "NONE" to return the input unchanged.

    Returns:
        The tuple ``(X_sampled, y_sampled)``.

    Any other *bal_strategy* prints a message and exits the process with
    status 1.
    """
    print('Shape of X: ', X.shape)
    print('Shape of y_Train: ', y.shape)

    if bal_strategy in ("RANDOM", "ALL"):
        # apply random under-sampling
        X_sampled, y_sampled = RandomUnderSampler().fit_sample(X, y)
    elif bal_strategy == "TOMEK":
        # Apply Tomek Links cleaning
        # NOTE(review): the original also tested for "ALL" here, but that
        # could never match because the branch above catches it first. "ALL"
        # therefore still means random under-sampling only; confirm whether
        # it was meant to apply both steps.
        X_sampled, y_sampled = TomekLinks().fit_sample(X, y)
    elif bal_strategy == 'NONE':
        X_sampled, y_sampled = X, y
    else:
        print('bal_stragegy not in ALL, RANDOM, TOMEK, NONE')
        sys.exit(1)

    print('Shape of X_sampled: ', X_sampled.shape)
    print('Shape of y_sampled: ', y_sampled.shape)
    return (X_sampled, y_sampled)
def getsampler(self, type):
    """Build the sampler registered under the name *type*.

    An unknown name prints a message and exits the process with status 1.
    Every sampler except 'none' and 'quality' is seeded with
    ``self.random_state`` when it exposes a ``random_state`` parameter.
    """
    # Lambdas keep construction lazy: only the requested sampler is built.
    factories = {
        'none': lambda: NoSampler(),
        'randomunder': lambda: RandomUnderSampler(),
        'nearmiss': lambda: NearMiss(),
        'allknn': lambda: AllKNN(),
        'condensednn': lambda: CondensedNearestNeighbour(),
        'editednn': lambda: EditedNearestNeighbours(),
        'repeatededitednn': lambda: RepeatedEditedNearestNeighbours(),
        'tomeklinks': lambda: TomekLinks(),
        'randomover': lambda: RandomOverSampler(),
        'smote': lambda: SMOTE(),
        'adasyn': lambda: ADASYN(),
        'smotenc': lambda: SMOTENC(),
        'quality': lambda: QualitySampler(self.n_init),
    }
    make_sampler = factories.get(type)
    if make_sampler is None:
        print("Unsupported sampler %s" % type)
        exit(1)

    sampler = make_sampler()
    seedable = type not in ('none', 'quality')
    if seedable and 'random_state' in sampler.get_params():
        sampler.set_params(random_state=self.random_state)
    return sampler
def undersample_tomek_link(X, y, label='Tomek links under-sampling',
                           plot=False):
    """Under-sample ``(X, y)`` with Tomek links and optionally plot the result.

    Args:
        X: Feature table; its ``columns`` are reused for the output frame.
        y: Target; its ``name`` is reused for the output series.
        label: Title of the optional scatter plot.
        plot: When equal to ``True``, project the resampled features onto two
            PCA components and show a scatter plot coloured by class.

    Returns:
        ``(X_tl, y_tl, tl, id_tl)``: the resampled frame and series, the
        fitted sampler, and the indices returned by ``fit_sample``.
    """
    tl = TomekLinks(return_indices=True, ratio='all')
    raw_X, raw_y, id_tl = tl.fit_sample(X, y)
    X_tl = pd.DataFrame(raw_X, columns=X.columns)
    y_tl = pd.Series(raw_y, name=y.name)

    if plot == True:  # noqa: E712 - equality test kept as-is
        # plotting using pca
        projected = pd.DataFrame(PCA(n_components=2).fit_transform(X_tl))
        palette = ['#1F77B4', '#FF7F0E']
        shapes = ['o', 's']
        for cls, colour, shape in zip(np.unique(y_tl), palette, shapes):
            in_class = y_tl == cls
            plt.scatter(
                projected.loc[in_class, 0],  # pc 1
                projected.loc[in_class, 1],  # pc 2
                c=colour,
                label=cls,
                marker=shape)
        plt.title(label)
        plt.legend(loc='upper right')
        plt.show()

    return X_tl, y_tl, tl, id_tl
def get_sampler(self):
    """Return the sampler named by ``self.sampler``, or ``None`` if unknown.

    Samplers are configured with ``self.random_seed`` and/or ``self.njobs``,
    depending on which of the two each constructor is given.
    """
    # Lambdas defer both the construction and the attribute reads, so only
    # the selected entry touches ``self.random_seed`` / ``self.njobs``.
    factories = {
        'random-over-sampler':
            lambda: RandomOverSampler(random_state=self.random_seed),
        'adasyn':
            lambda: ADASYN(random_state=self.random_seed, n_jobs=self.njobs),
        'smote':
            lambda: SMOTE(random_state=self.random_seed, n_jobs=self.njobs),
        'svm-smote':
            lambda: SVMSMOTE(random_state=self.random_seed,
                             n_jobs=self.njobs),
        'random-under-sampler':
            lambda: RandomUnderSampler(random_state=self.random_seed),
        'tomek-links':
            lambda: TomekLinks(n_jobs=self.njobs),
        'near-miss':
            lambda: NearMiss(n_jobs=self.njobs),
        'instance-hardness':
            lambda: InstanceHardnessThreshold(random_state=self.random_seed,
                                              n_jobs=self.njobs),
    }
    build = factories.get(self.sampler)
    return None if build is None else build()
def Balance_classes(X_train, y_train, Sampling_Function):
    """Resample the training data with the sampler named *Sampling_Function*.

    Args:
        X_train: Training features passed to the sampler's ``fit_sample``.
        y_train: Training targets passed to the sampler's ``fit_sample``.
        Sampling_Function: Name of the sampler, e.g. 'RandomUnderSampler',
            'NearMiss1', 'TomekLinks', 'SMOTE', 'SMOTETomek', 'EasyEnsemble'.

    Returns:
        ``(X_train_res, y_train_res)`` as returned by ``fit_sample``.

    Raises:
        ValueError: If *Sampling_Function* is not a supported name.
    """

    def _repeated_enn():
        # This name used to build a plain EditedNearestNeighbours.
        # Imported locally because the original block never referenced it.
        from imblearn.under_sampling import RepeatedEditedNearestNeighbours
        return RepeatedEditedNearestNeighbours(random_state=1, size_ngh=5)

    # Lambdas keep construction lazy: only the requested sampler is built.
    factories = {
        'RandomUnderSampler':
            lambda: RandomUnderSampler(ratio=0.5, random_state=1),
        'NearMiss1':
            lambda: NearMiss(ratio=0.5, random_state=1, version=1,
                             size_ngh=3),
        'NearMiss2':
            lambda: NearMiss(ratio=0.5, random_state=1, version=2,
                             size_ngh=3),
        'NearMiss3':
            lambda: NearMiss(ratio=0.5, random_state=1, version=3,
                             ver3_samp_ngh=3),
        'CondensedNearestNeighbour':
            lambda: CondensedNearestNeighbour(random_state=1),
        'EditedNearestNeighbours':
            lambda: EditedNearestNeighbours(random_state=1, size_ngh=5),
        'RepeatedEditedNearestNeighbours': _repeated_enn,
        'TomekLinks':
            lambda: TomekLinks(random_state=1),
        'RandomOverSampler':
            lambda: RandomOverSampler(ratio=0.5, random_state=1),
        'SMOTE':
            lambda: SMOTE(ratio=0.5, k=5, random_state=1),
        'SMOTETomek':
            lambda: SMOTETomek(ratio=0.5, k=5, random_state=1),
        'SMOTEENN':
            lambda: SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5),
        'EasyEnsemble':
            lambda: EasyEnsemble(),
        'BalanceCascade_rf':
            lambda: BalanceCascade(classifier='random-forest',
                                   random_state=1),
        'BalanceCascade_svm':
            lambda: BalanceCascade(classifier='linear-svm', random_state=1),
    }
    if Sampling_Function not in factories:
        # Previously this fell through to an UnboundLocalError on `us`.
        raise ValueError(
            'Unknown Sampling_Function {!r}; expected one of: {}'.format(
                Sampling_Function, ', '.join(sorted(factories))))

    us = factories[Sampling_Function]()
    X_train_res, y_train_res = us.fit_sample(X_train, y_train)
    return X_train_res, y_train_res
def sampler(name, ratio, random_state=0, return_indices=True, **kwargs):
    """Build the under-sampler registered under the short name *name*.

    Args:
        name: One of "rus", "nm", "enn", "renn", "allknn" or "tl".
        ratio: Forwarded to the "rus" and "nm" samplers only.
        random_state: Forwarded to every sampler.
        return_indices: Forwarded to every sampler.
        **kwargs: Extra keyword arguments forwarded to the chosen sampler.

    Raises:
        ValueError: If *name* is not one of the supported names.
    """
    # Keyword arguments every sampler receives.
    shared = dict(return_indices=return_indices,
                  random_state=random_state,
                  **kwargs)

    if name == "rus":
        return RandomUnderSampler(ratio=ratio, **shared)
    if name == "nm":
        return NearMiss(ratio=ratio, **shared)
    if name == "enn":
        return EditedNearestNeighbours(**shared)
    if name == "renn":
        return RepeatedEditedNearestNeighbours(**shared)
    if name == "allknn":
        return AllKNN(**shared)
    if name == "tl":
        return TomekLinks(**shared)
    raise ValueError
def resampling(train_data, train_labels, resampling_type, resampling_stragey):
    """Flatten the samples and resample them with the requested method.

    Args:
        train_data: Array whose first axis indexes samples. All remaining
            axes are flattened into one feature axis.
        train_labels: Object exposing the targets as ``.values``.
        resampling_type: One of 'SMOTE', 'over_sampling', 'under_sampling',
            'tomelinks' (legacy spelling, 'tomeklinks' is also accepted),
            'near_miss_neighbors' or 'one_sided_selection'.
        resampling_stragey: ``sampling_strategy`` for the over-sampling,
            under-sampling and Tomek-link methods; ignored by the others.

    Returns:
        ``(train_data_resampled, train_labels_resampled)``.

    Raises:
        ValueError: If *resampling_type* is not a supported name.
    """
    # Explicit product (not -1) so an empty first axis still reshapes.
    n_features = int(np.prod(train_data.shape[1:]))
    train_data_new = np.reshape(train_data,
                                (train_data.shape[0], n_features))

    if resampling_type == 'SMOTE':
        resampler = SMOTE(random_state=42)
    elif resampling_type == 'over_sampling':
        resampler = RandomOverSampler(sampling_strategy=resampling_stragey)
    elif resampling_type == 'under_sampling':
        resampler = RandomUnderSampler(sampling_strategy=resampling_stragey)
    elif resampling_type in ('tomelinks', 'tomeklinks'):
        resampler = TomekLinks(sampling_strategy=resampling_stragey)
    elif resampling_type == 'near_miss_neighbors':
        resampler = NearMiss(version=1, n_neighbors=3)
    elif resampling_type == 'one_sided_selection':
        resampler = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
    else:
        # Previously this fell through to an UnboundLocalError at `return`.
        raise ValueError(
            'Unknown resampling_type: {!r}'.format(resampling_type))

    train_data_resampled, train_labels_resampled = resampler.fit_resample(
        train_data_new, train_labels.values)
    return train_data_resampled, train_labels_resampled
def test_validate_estimator_init():
    """Check ``SMOTETomek`` when SMOTE and TomekLinks objects are passed in."""
    expected_X = np.array([
        [0.68481731, 0.51935141], [1.34192108, -0.13367336],
        [0.62366841, -0.21312976], [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981], [0.74680821, 1.63827342],
        [0.61472253, -0.82309052], [0.19893132, -0.47761769],
        [1.40301027, -0.83648734], [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049], [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234], [0.38307743, -0.05670439],
        [0.70319159, -0.02571667], [0.75052536, -0.19246518],
    ])
    expected_y = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])

    combined = SMOTETomek(smote=SMOTE(random_state=RND_SEED),
                          tomek=TomekLinks(sampling_strategy="all"),
                          random_state=RND_SEED)
    actual_X, actual_y = combined.fit_resample(X, Y)

    assert_allclose(actual_X, expected_X, rtol=R_TOL)
    assert_array_equal(actual_y, expected_y)
class ResamplingAlgorithms(Enum):
    """Resampling algorithms, each stored as ``(display name, sampler)``.

    The sampler instances are created once, when the class body is executed.
    """

    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', (InstanceHardnessThreshold(random_state=1)))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the member whose display name is *name*, else ``RO``."""
        for algorithm in ResamplingAlgorithms:
            if algorithm.value[0] == name:
                return algorithm
        return ResamplingAlgorithms.RO
def make_clf(usx, usy, clf, clf_name, sampling, normalize=False):
    """Cross-validate *clf* and print the accumulated confusion counts.

    The data is split with a shuffled 10-fold ``StratifiedKFold``. In every
    fold the training part is optionally resampled and standardised before
    *clf* is fitted, and the predictions on the test part are added to running
    TP/FP/FN/TN totals, which are printed at the end.

    Args:
        usx: Feature array, indexable by an array of row positions.
        usy: Target array aligned with *usx*; truthy values count as positive.
        clf: Classifier exposing ``fit`` and ``predict``.
        clf_name: Name used in the printed header.
        sampling: One of 'SMOTE', 'ADASYN', 'ENN', 'Tomek', 'SMOTETomek',
            'SMOTEENN', 'NCR' or 'OSS'. Any other value disables resampling.
        normalize: When true, fit a ``StandardScaler`` on the training part
            and apply it to both parts.

    Returns:
        None. The totals are printed.
    """
    print('----------{} with {}----------'.format(clf_name, sampling))
    # Lambdas keep construction lazy: only the requested sampler is built.
    sampler_factories = {
        'SMOTE': lambda: SMOTE(sampling_strategy=0.3),
        'ADASYN': lambda: ADASYN(sampling_strategy=0.3),
        'ENN': lambda: EditedNearestNeighbours(),
        'Tomek': lambda: TomekLinks(),
        'SMOTETomek': lambda: SMOTETomek(sampling_strategy=0.3),
        'SMOTEENN': lambda: SMOTEENN(sampling_strategy=0.3),
        'NCR': lambda: NeighbourhoodCleaningRule(),
        'OSS': lambda: OneSidedSelection(),
    }
    make_sampler = sampler_factories.get(sampling)

    totalTP, totalFP, totalFN, totalTN = 0, 0, 0, 0
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    for train_index, test_index in skf.split(usx, usy):
        x_train, x_test = usx[train_index], usx[test_index]
        y_train, y_test = usy[train_index], usy[test_index]

        # Resample the training part only; the test part stays untouched.
        if make_sampler is not None:
            x_train, y_train = make_sampler().fit_resample(x_train, y_train)

        if normalize:
            scaler = StandardScaler().fit(x_train)
            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)

        clf.fit(x_train, y_train)
        y_predict = clf.predict(x_test)

        for actual, predicted in zip(y_test, y_predict):
            if actual and predicted:
                totalTP += 1
            elif predicted:
                totalFP += 1
            elif actual:
                totalFN += 1
            else:
                totalTN += 1

    print('TOTAL TP: ' + str(totalTP))
    print('TOTAL FP: ' + str(totalFP))
    print('TOTAL FN: ' + str(totalFN))
    print('TOTAL TN: ' + str(totalTN))
def smoteTomek(X, y):
    """Resample ``(X, y)`` with SMOTE followed by Tomek-link cleaning.

    Returns:
        ``(X_resampled, y_resampled)`` as produced by
        ``SMOTETomek.fit_sample``.
    """
    combined = SMOTETomek(smote=SMOTE(k_neighbors=3, m_neighbors=10),
                          tomek=TomekLinks())
    resampled_X, resampled_y = combined.fit_sample(X, y)
    return resampled_X, resampled_y
def test_tl_init():
    """Check the attributes set by the ``TomekLinks`` constructor."""
    sampler = TomekLinks(random_state=RND_SEED)
    expected_attributes = (('n_jobs', -1), ('random_state', RND_SEED))
    for attribute, expected in expected_attributes:
        assert_equal(getattr(sampler, attribute), expected)
def _tomek_data(self): """Performs tomek links. Can not handle nominal values.""" if self.cols_nominal.size > 0: print("Skipping Tomek Links. Cannot perform with raw categorical data. Create dummies to use.") return tl = TomekLinks() self.X_train, self.y_train = tl.fit_sample(self.X_train, self.y_train)
def get_sampler(self):
    """Return a ``TomekLinks`` sampler for 'tomek-links', else ``None``.

    The sampler is configured with ``self.random_seed`` and ``self.njobs``.
    """
    if self.sampler != 'tomek-links':
        return None
    return TomekLinks(random_state=self.random_seed, n_jobs=self.njobs)
def analyze_resampled_class_train_data():
    """Print class distributions after SMOTE, Tomek links and SMOTETomek.

    For every emotion in ``Dictionaries.emo_dict`` and every vectorizer in
    ``Dictionaries.vectorizer_dict``, the pickled pre-processed training frame
    is vectorised, merged column-wise with the pickled transformed-feature
    frame, and resampled with each of the three methods. The resulting row
    counts and label counts are printed.

    Both pickle files are required. If either is missing, a message is printed
    and the process exits with status 1.
    """
    pickle_dir = Path.cwd().parent.joinpath('default_results',
                                            'pickle_files_feat_eng')

    for emotion in Dictionaries.emo_dict.values():
        preprocess_path = pickle_dir.joinpath(
            emotion + '_c_train_preprocess_df.pkl')
        feat_transform_path = pickle_dir.joinpath(
            emotion + '_c_train_feat_transform_df.pkl')

        for vect_name, vectorizer in Dictionaries.vectorizer_dict.items():
            print('\n\nResampled data - EMOTION: ' + emotion +
                  ', VECTORIZER: ' + vect_name)

            # Check both inputs up front so that neither frame can be left
            # unbound (or carried over from a previous iteration) below.
            if not (os.path.exists(preprocess_path)
                    and os.path.exists(feat_transform_path)):
                # If the file doesnt exist, exit the program with instructions
                print(
                    '\nRequired files does not exist.\n\n Please, train the models first by running > Modelling.py'
                )
                sys.exit(1)

            # Fit transform the vectorizer with the corresponding
            # preprocessed training data
            preprocess_train_df = pd.read_pickle(preprocess_path)
            train_vect = vectorizer.fit_transform(
                preprocess_train_df['preprocessed_text'].values)
            print(emotion + ' vectorized features: ', train_vect.shape)
            train_vect_df = pd.DataFrame(
                train_vect.toarray(),
                columns=vectorizer.get_feature_names())

            feat_transformed_train_df = pd.read_pickle(feat_transform_path)
            print(emotion + ' transformed features: ',
                  feat_transformed_train_df.shape)

            features_df = pd.concat(
                [train_vect_df, feat_transformed_train_df], axis=1)
            print(emotion + ' merged features: ', features_df.shape)

            labels = preprocess_train_df['Affect Dimension']

            # Resample the training data using SMOTE, Tomek links and
            # SMOTETomek
            smote_X_train, smote_y_train = SMOTE(
                random_state=42, sampling_strategy='minority',
                n_jobs=-1).fit_resample(features_df, labels)
            tomek_X_train, tomek_y_train = TomekLinks(
                random_state=42, sampling_strategy='majority',
                n_jobs=-1).fit_resample(features_df, labels)
            smotetomek_X_train, smotetomek_y_train = SMOTETomek(
                random_state=42).fit_resample(features_df, labels)

            print('Data in SMOTE, Tomek links, SMOTETomek')
            print(smote_X_train.shape[0], tomek_X_train.shape[0],
                  smotetomek_X_train.shape[0])
            print(Counter(smote_y_train), Counter(tomek_y_train),
                  Counter(smotetomek_y_train))
def Resampling(train_x, train_y, resampling_method):
    """Label-encode the targets and resample the data set in place.

    Args:
        train_x: Object whose ``data`` attribute holds the features. It is
            overwritten with the resampled features.
        train_y: Object whose ``data`` attribute holds the targets. It is
            overwritten with the label-encoded, resampled targets.
        resampling_method: Name of the imbalanced-learn sampler class to use,
            e.g. "TomekLinks", "RandomUnderSampler", "RandomOverSampler" or
            "SMOTE".

    Raises:
        ValueError: If *resampling_method* is not a supported name. Nothing
            is modified in that case.
    """
    # Lambdas keep construction lazy: only the requested sampler is built.
    factories = {
        # ---- UNDER-SAMPLING ------ #
        "ClusterCentroids":
            lambda: ClusterCentroids(voting='hard', random_state=42),
        "CondensedNearestNeighbour":
            lambda: CondensedNearestNeighbour(n_neighbors=7, random_state=42),
        "EditedNearestNeighbours":
            lambda: EditedNearestNeighbours(n_neighbors=7, kind_sel='mode',
                                            n_jobs=-1),
        "RepeatedEditedNearestNeighbours":
            lambda: RepeatedEditedNearestNeighbours(n_neighbors=7,
                                                    kind_sel='mode',
                                                    n_jobs=-1),
        "AllKNN":
            lambda: AllKNN(n_neighbors=7, kind_sel='mode',
                           allow_minority=True, n_jobs=-1),
        "NearMiss":
            lambda: NearMiss(n_neighbors=7, n_jobs=-1),
        "NeighbourhoodCleaningRule":
            lambda: NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all'),
        "RandomUnderSampler":
            lambda: RandomUnderSampler(random_state=42),
        "TomekLinks":
            lambda: TomekLinks(n_jobs=-1),
        # ---- OVER-SAMPLING ------ #
        "BorderlineSMOTE":
            lambda: BorderlineSMOTE(random_state=42, n_jobs=-1),
        "KMeansSMOTE":
            lambda: KMeansSMOTE(random_state=42),
        # This entry used to be keyed "RandomUnderSampler" a second time, so
        # the under-sampler was silently replaced by the over-sampler.
        "RandomOverSampler":
            lambda: RandomOverSampler(random_state=42),
        "SMOTE":
            lambda: SMOTE(random_state=42, n_jobs=-1),
    }
    try:
        make_resampler = factories[resampling_method]
    except KeyError:
        raise ValueError(
            'Unknown resampling_method: {!r}'.format(resampling_method)
        ) from None

    train_y.data = LabelEncoder().fit_transform(train_y.data)

    # summarize distribution
    # Uncomment the following line to show a pie chart of the class
    # distribution before resampling.
    #plotGraphics.piePlot(train_y, "Before Resampling")

    # transform the dataset
    train_x.data, train_y.data = make_resampler().fit_resample(
        train_x.data, train_y.data)
def test_tl_fit_single_class():
    """Check that fitting on a single-class target warns."""
    # A deliberately wrong target: every sample carries the same label.
    single_class_target = np.zeros((X.shape[0], ))
    sampler = TomekLinks(random_state=RND_SEED)
    assert_warns(RuntimeWarning, sampler.fit, X, single_class_target)
def test_tl_sample_wrong_X():
    """Check that sampling data other than the fitted data raises an error."""
    fitted = TomekLinks(random_state=RND_SEED)
    fitted.fit(X, Y)

    # Data that was never seen during fitting.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, fitted.sample, other_X, other_y)
def get_tomeklinks_under_sampled_dataset():
    """Under-sample the module-level training data with Tomek links.

    Reads the module-level ``X_train``, ``y_train`` and ``target``.

    Returns:
        ``(X_tl, y_tl)``: the resampled features after ``shuffle`` has been
        called on them, and ``X_tl[target]``.
    """
    sampler = TomekLinks(return_indices=True, ratio='majority')
    # The resampled labels are not used: the returned target is re-read from
    # the resampled features below.
    X_tl, _resampled_labels, id_tl = sampler.fit_sample(X_train, y_train)
    print('Removed indexes:', id_tl)

    # NOTE(review): the return value of `shuffle` is ignored, so this only
    # has an effect if `shuffle` works in place - confirm which one it is.
    shuffle(X_tl)

    # NOTE(review): presumably expects X_tl to support lookup by `target`;
    # verify against the type `fit_sample` returns.
    return X_tl, X_tl[target]
def get_binary_Tomek_Links_cleaned_data(id_df, X_df, y_df):
    """Keep only the rows that survive Tomek-link cleaning.

    The sampler is fitted on ``X_df`` with the first column of ``y_df`` as
    target. The positions in its ``sample_indices_`` are then used to select
    the same rows from all three frames.

    Returns:
        ``(id_df_cleaned, X_df_cleaned, y_df_cleaned)``.
    """
    tLinks = TomekLinks()
    # Called only to populate `sample_indices_`; its return value is unused.
    tLinks.fit_sample(X_df, y_df.iloc[:, 0])
    kept_rows = tLinks.sample_indices_
    return id_df.iloc[kept_rows], X_df.iloc[kept_rows], y_df.iloc[kept_rows]