def test_ada_fit_sample_half():
    """Check ``fit_sample`` when ADASYN is configured with ``ratio=0.8``.

    The resampled output is compared against a 20-row reference set.
    """
    sampler = ADASYN(ratio=0.8, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
    ])
    expected_y = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])

    assert_array_almost_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_ada_fit_sample_nn_obj():
    """Check ``fit_sample`` when ``n_neighbors`` is a NearestNeighbors object."""
    neighbors = NearestNeighbors(n_neighbors=6)
    sampler = ADASYN(random_state=RND_SEED, n_neighbors=neighbors)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.94899098, -0.30508981],
        [0.28204936, -0.13953426],
        [1.58028868, -0.04089947],
        [0.66117333, -0.28009063],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def test_ada_fit_sample_nn_obj():
    """Check ``fit_sample`` when ``n_neighbors`` is a NearestNeighbors object."""
    neighbors = NearestNeighbors(n_neighbors=6)
    sampler = ADASYN(random_state=RND_SEED, n_neighbors=neighbors)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.29427267, 0.21740707],
        [0.68118697, -0.25220353],
        [1.37180201, 0.37279378],
        [-0.59243851, -0.80715327],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_array_almost_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_sample_wrong_X():
    """Expect RuntimeError when ``sample`` gets data other than what was fitted."""
    sampler = ADASYN(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Data with a different shape from the module-level fixtures.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def test_ada_fit_sample():
    """Compare ``fit_sample`` output with the reference arrays stored on disk."""
    sampler = ADASYN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'ada_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'ada_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_ada_fit():
    """Check the class statistics that ``fit`` stores on the sampler."""
    sampler = ADASYN(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Minority / majority labels and the per-class counts.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    assert_equal(sampler.stats_c_[0], 8)
    assert_equal(sampler.stats_c_[1], 12)
def oversample(X, y, bal_strategy):
    """Oversample ``(X, y)`` with the strategy named by ``bal_strategy``.

    Supported values:

    * ``"SMOTESVN"`` or ``"ALL"`` -- ``SMOTE(kind='svm')``
    * ``"SMOTE"``    -- ``SMOTE(kind='regular')``
    * ``"ADASYN"``   -- ``ADASYN()``
    * ``"NONE"``     -- return the inputs unchanged

    ``"ALL"`` has always been served by the first branch only (the later
    ``or == "ALL"`` tests could never be reached), so it is kept as an alias
    for the SVM variant.

    Any other value prints a message and terminates the process through
    ``sys.exit(1)``.

    Returns:
        Tuple ``(X_sampled, y_sampled)``.
    """
    if bal_strategy in ("SMOTESVN", "ALL"):
        X_sampled, y_sampled = SMOTE(kind='svm').fit_sample(X, y)
    elif bal_strategy == "SMOTE":
        X_sampled, y_sampled = SMOTE(kind='regular').fit_sample(X, y)
    elif bal_strategy == "ADASYN":
        X_sampled, y_sampled = ADASYN().fit_sample(X, y)
    elif bal_strategy == 'NONE':
        X_sampled, y_sampled = X, y
    else:
        print('bal_strategy not in SMOTESVN, SMOTE, ADASYN, ALL, NONE')
        sys.exit(1)

    # A single parenthesised %-formatted argument prints the same text under
    # Python 2 and Python 3 (the two spaces reproduce the old
    # ``print 'label: ', value`` output).
    print('Shape of X_sampled:  %s' % (X_sampled.shape,))
    print('Shape of y_sampled:  %s' % (y_sampled.shape,))
    return (X_sampled, y_sampled)
def main():
    """Run the rainfall classification experiment end to end.

    Reads the CSV, treats the last column as the label, splits 80/20 without
    shuffling, balances only the training part with ADASYN, then evaluates
    the predictions returned by ``pipe_selkbest_RF`` for each entry of
    ``n_components_to_test``.
    """
    dataset = np.asarray(pd.read_csv(".././DataSets/Lead_10_Hist_10.csv"))
    label_col = dataset.shape[1] - 1
    print(dataset.shape)

    # Predictors are every column but the last; the last one is the predictand.
    X = dataset[:, :label_col]
    Y = dataset[:, label_col]

    print("\nSamples of each rainfall class in overall set: ",
          collections.Counter(Y))

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, shuffle=False)
    print("\nSamples in training set: ", collections.Counter(Y_train))

    # Resampling touches the training set only; the test set stays untouched.
    X_balanced, Y_balanced = ADASYN().fit_resample(X_train, Y_train)
    print("\nADASYN:", sorted(collections.Counter(Y_balanced).items()))

    predictions = pipe_selkbest_RF(X_balanced, Y_balanced, X_test)
    for position, n_features in enumerate(n_components_to_test):
        print("\n Reduced number of features: ", n_features)
        calculateEvaluationMetrics(Y_test, predictions[position])
def plot_data(X, Y):
    """Scatter-plot the original data next to five resampled versions.

    Panels, left to right: original data, SMOTE, Borderline-SMOTE
    (borderline-1), ENN cleaning followed by SMOTE, ADASYN, and the
    DBSCAN-based oversampler. Only the first two feature columns are drawn,
    coloured by label.

    The previous version used a 1x5 grid and placed both the ENN+SMOTE and
    the ADASYN panels at index 4, so two datasets shared one cell. The grid
    is now sized from the number of panels so each gets its own axes.

    Args:
        X: 2-D feature array (indexed as ``X[:, 0]`` and ``X[:, 1]``).
        Y: Labels used as scatter colours.
    """
    def _enn_then_smote():
        # Clean with Edited Nearest Neighbours first, then oversample.
        X_clean, Y_clean = EditedNearestNeighbours().fit_sample(X, Y)
        return SMOTE(k_neighbors=5).fit_sample(X_clean, Y_clean)

    # Callables are evaluated lazily in the loop, so the samplers still run
    # in the same order as before, interleaved with the drawing.
    panels = [
        lambda: (X, Y),                                                  # original
        lambda: SMOTE().fit_sample(X, Y),                                # SMOTE
        lambda: BorderlineSMOTE(kind='borderline-1').fit_sample(X, Y),   # Borderline-SMOTE
        _enn_then_smote,                                                 # ENN + SMOTE
        lambda: ADASYN(n_neighbors=3).fit_sample(X, Y),                  # ADASYN
        lambda: dbscan_based.MultiDbscanBasedOverSample(
            eps=0.3, min_pts=5).fit_sample(X, Y),                        # MC-ODG
    ]

    plt.rcParams['figure.figsize'] = (27.0, 5.0)
    fig = plt.figure()
    for position, make_panel in enumerate(panels, start=1):
        X_plot, Y_plot = make_panel()
        ax = fig.add_subplot(1, len(panels), position)
        ax.scatter(X_plot[:, 0], X_plot[:, 1], c=Y_plot)
        # These act on the axes just created (the current axes).
        plt.axis('off')
        plt.xticks([])
        plt.yticks([])
    plt.show()
def test_ada_fit_sample():
    """Check ``fit_sample`` with default ADASYN settings and a fixed seed."""
    sampler = ADASYN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.29427267, 0.21740707],
        [0.68118697, -0.25220353],
        [1.37180201, 0.37279378],
        [-0.59243851, -0.80715327],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def use_debug_parameters(self, reduced_selected_features):
    """Return a small debug parameter grid as a one-element list of dicts.

    The grid is wrapped in a list so that several grids could be supplied
    when different optimisations need different parameters.

    Args:
        reduced_selected_features: Sequence of candidate feature selections;
            only its first two entries are used for ``feat__cols``.
    """
    debug_grid = {}
    debug_grid['scaler'] = [StandardScaler()]
    debug_grid['sampling'] = [
        modelutil.Nosampler(),
        SMOTE(),
        SMOTEENN(),
        ADASYN(),
    ]
    debug_grid['feat__cols'] = reduced_selected_features[0:2]
    debug_grid['model__var_smoothing'] = np.logspace(0, -9, num=100)
    return [debug_grid]
def use_debug_parameters(self, reduced_selected_features):
    """Return a small debug parameter grid as a one-element list of dicts.

    The grid is wrapped in a list so that several grids could be supplied
    when different optimisations need different parameters.

    Args:
        reduced_selected_features: Sequence of candidate feature selections;
            only its first two entries are used for ``feat__cols``.
    """
    debug_grid = {}
    debug_grid['scaler'] = [StandardScaler()]
    debug_grid['sampling'] = [
        modelutil.Nosampler(),
        SMOTE(),
        SMOTEENN(),
        ADASYN(),
    ]
    debug_grid['feat__cols'] = reduced_selected_features[0:2]
    debug_grid['model__n_neighbors'] = [3, 5]
    debug_grid['model__weights'] = ['uniform', 'distance']
    return [debug_grid]
def tackle_data_imbalance(self, X, Y):
    """Rebalance ``(X, Y)`` with ADASYN followed by random under-sampling.

    Every class is given the same ADASYN target: three times the current
    number of samples, divided evenly between the classes. The result is then
    passed through ``RandomUnderSampler(sampling_strategy="auto")``.

    Returns:
        The resampled ``(X, Y)`` pair.
    """
    growth_factor = 3
    class_counts = Counter(Y)

    target_total = sum(class_counts.values()) * growth_factor
    per_class_target = int(target_total / len(class_counts))
    targets = {label: per_class_target for label in class_counts}

    oversampler = ADASYN(n_neighbors=1, sampling_strategy=targets)
    undersampler = RandomUnderSampler(sampling_strategy="auto")

    X, Y = oversampler.fit_resample(X, Y)
    X, Y = undersampler.fit_resample(X, Y)
    return X, Y
def initializeSamplers(self):
    """Create the five samplers, run each through ``runSampler``, then load.

    All samplers share ``sampling_strategy=0.1`` and ``random_state=42``. The
    instances and their display names are stored in ``self.samplers`` and
    ``self.names`` in matching order.
    """
    self.makeDataDirectory()

    shared = {'sampling_strategy': 0.1, 'random_state': 42}
    named_samplers = [
        ("RandomSample", RandomOverSampler(**shared)),
        ("SMOTE", SMOTE(**shared)),
        ("ADASYN", ADASYN(**shared)),
        ("SMOTETomek", SMOTETomek(**shared)),
        ("SMOTEEnn", SMOTEENN(**shared)),
    ]
    self.samplers = [instance for _, instance in named_samplers]
    self.names = [label for label, _ in named_samplers]

    for instance, label in zip(self.samplers, self.names):
        self.runSampler(instance, label)
    self.loadAll()
def compare_different_oversample_method(model, sample_method, X, Y):
    """Cross-validate ``model`` with an optional oversampling step per fold.

    Runs a shuffled 5-fold stratified split. In each fold the training part is
    resampled according to ``sample_method``, the model is fitted, and the
    metrics from ``cal_multi_class_matrics`` are accumulated.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        sample_method: One of ``'SMOTE_ENN'``, ``'smote'``,
            ``'borderline_smote'``, ``'adasyn'``; any other truthy value is
            used as a sampler object (its ``fit_sample`` is called); a falsy
            value disables resampling.
        X: Feature array, indexable by integer index arrays.
        Y: Label array, indexable by integer index arrays.

    Returns:
        The four accumulated metrics averaged over the folds.
    """
    n_split = 5
    skf = StratifiedKFold(n_splits=n_split, shuffle=True)
    res_list = np.zeros(4)

    for fold, (train_indices, test_indices) in enumerate(skf.split(X, Y), start=1):
        print('正在进行第{}次交叉验证'.format(fold))
        train_X, train_Y = X[train_indices], Y[train_indices]
        test_X, test_Y = X[test_indices], Y[test_indices]

        # Neighbour-based samplers need at least one other sample of the
        # smallest class. The cap must come from the class *counts*;
        # ``min(Counter(...))`` alone would return the smallest label.
        max_neighbors = min(Counter(train_Y).values()) - 1
        can_interpolate = max_neighbors > 0

        if sample_method == 'SMOTE_ENN':
            train_X, train_Y = EditedNearestNeighbours().fit_sample(train_X, train_Y)
            if can_interpolate:
                smo = SMOTE(k_neighbors=min(3, max_neighbors))
                train_X, train_Y = smo.fit_sample(train_X, train_Y)
        elif sample_method == 'smote':
            if can_interpolate:
                smo = SMOTE(k_neighbors=min(3, max_neighbors))
                train_X, train_Y = smo.fit_sample(train_X, train_Y)
        elif sample_method == 'borderline_smote':
            if can_interpolate:
                smo = BorderlineSMOTE(kind='borderline-1',
                                      k_neighbors=min(3, max_neighbors))
                train_X, train_Y = smo.fit_sample(train_X, train_Y)
        elif sample_method == 'adasyn':
            if can_interpolate:
                ada = ADASYN(n_neighbors=min(2, max_neighbors))
                train_X, train_Y = ada.fit_sample(train_X, train_Y)
        elif sample_method:
            # Caller supplied a ready-made sampler object.
            train_X, train_Y = sample_method.fit_sample(train_X, train_Y)

        model.fit(train_X, train_Y)
        y_score = model.predict(test_X)
        # Second column of predict_proba is used as the score.
        y_score_prob = model.predict_proba(test_X)[:, 1]
        res_list += cal_multi_class_matrics(test_Y, y_score, y_score_prob)

    return res_list / n_split
def resample_to_csv(X, y, random_state, path, method):
    """Resample a dataset with the chosen method and write the result to CSV.

    The resampled labels are stored in a ``'BK'`` column on the resampled
    features before writing. An unrecognised ``method`` does nothing.

    See the imbalanced-learn documentation for the samplers:
    https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.BorderlineSMOTE.html

    :param X: Original features
    :param y: Original labels
    :param random_state: Seed passed to the sampler
    :param path: Output location and name of the CSV
    :param method: ``'SMOTE-NN'`` (SMOTEENN), ``'borderline'``
        (BorderlineSMOTE), ``'adasyn'`` (ADASYN) or ``'tomek'`` (SMOTETomek)
    :return: none
    """
    if method == 'SMOTE-NN':
        sampler_cls = SMOTEENN
    elif method == 'borderline':
        sampler_cls = BorderlineSMOTE
    elif method == 'adasyn':
        sampler_cls = ADASYN
    elif method == 'tomek':
        sampler_cls = SMOTETomek
    else:
        return

    sampler = sampler_cls(random_state=random_state)
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    X_resampled['BK'] = y_resampled
    X_resampled.to_csv(path)
def over_sample(X, y, sampler="SMOTE"):
    """Oversample ``(X, y)`` with the sampler registered under ``sampler``.

    Args:
        X: Features.
        y: Labels.
        sampler: Key selecting the sampler: ``"RandomOverSampler"``,
            ``"ADASYN"``, ``"SMOTE"``, ``"BorderlineSMOTE"``, ``"SVMSMOTE"``
            or ``"SMOTENC"``.

    Returns:
        Tuple ``(X_resampled, y_resampled)``.

    Raises:
        KeyError: If ``sampler`` is not one of the keys above.
    """
    available = {
        "RandomOverSampler": RandomOverSampler(),
        "ADASYN": ADASYN(),
        "SMOTE": SMOTE(),
        "BorderlineSMOTE": BorderlineSMOTE(),
        "SVMSMOTE": SVMSMOTE(),
        "SMOTENC": SMOTENC(categorical_features=[]),
    }
    chosen = available[sampler]
    X_resampled, y_resampled = chosen.fit_resample(X, y)
    return X_resampled, y_resampled
def test_ada_fit_resample():
    """Check ``fit_resample`` with default ADASYN settings and a fixed seed."""
    sampler = ADASYN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.88161986, -0.2829741],
        [0.35681689, -0.18814597],
        [1.4148276, 0.05308106],
        [0.3136591, -0.31327875],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def retrieve_data(self, ml_cfg):
    """Yield one ADASYN-balanced ``(data, labels)`` pair for training.

    All batches produced by ``self.data_in.retrieve_data`` are concatenated,
    NaNs in the data are replaced via ``np.nan_to_num``, the ``'target'``
    column is converted with ``df_to_one_hot``, and the result is balanced
    with ADASYN. The resampled labels are then expanded through
    ``np.eye(2)``, cast to ``int16`` and squeezed.

    Args:
        ml_cfg: Configuration forwarded to the data source; also stored on
            ``self.ml_cfg``.

    Yields:
        A single ``(data_resp, meta_resp)`` tuple.
    """
    self.ml_cfg = ml_cfg
    data_list = []
    meta_df = pd.DataFrame()

    # Start at zero so the reported figure equals the number of batches
    # actually consumed (it used to start at 1 and over-report by one).
    batch_count = 0
    for data, meta in self.data_in.retrieve_data(self.ml_cfg):
        data_list.extend(data)
        meta_df = pd.concat([meta_df, meta])
        batch_count += 1
    print("Concatenated {} training batches".format(batch_count))

    data_resp = np.nan_to_num(np.array(data_list))
    meta_resp = df_to_one_hot(meta_df, 'target', 2)
    data_resp, meta_resp = ADASYN().fit_resample(data_resp, meta_resp)
    # Index the 2x2 identity with the resampled labels to rebuild indicator rows.
    meta_resp = np.squeeze(np.eye(2)[meta_resp].astype('int16'))
    yield data_resp, meta_resp
def synthetic_sampling_ADASYN(self, dataset):
    """Oversample ``dataset`` with ADASYN and return it as one DataFrame.

    Features are all columns except the last two; labels are the last column.
    NOTE(review): the second-to-last column is dropped from the features and
    not used as a label either -- confirm this is intended.

    :param dataset: DataFrame-like object supporting ``.iloc``
    :return: DataFrame with the resampled features followed by the resampled
        labels, or ``None`` if any exception occurred (the exception is
        printed)
    """
    try:
        features = dataset.iloc[:, :-2]
        labels = dataset.iloc[:, -1]
        X_resampled, y_resampled = ADASYN().fit_sample(features, labels)
        resampled_parts = [
            pd.DataFrame(X_resampled),
            pd.DataFrame(y_resampled),
        ]
        return pd.concat(resampled_parts, axis=1)
    except Exception as e:
        print(e)
def runRandomUnderSample(train, test, seed):
    """Run ``baseModel`` five times on ADASYN-resampled training data.

    Despite the name, each round oversamples ``train`` with ADASYN, rebuilds a
    DataFrame with columns ``vars + ['wtbz']`` and hands a
    GradientBoostingClassifier configured from ``models['defaultGBM']`` to
    ``baseModel``.

    Returns:
        The unchanged ``(train, test)`` pair.
    """
    for round_no in range(5):
        X_resampled, y_resampled = ADASYN().fit_sample(train[vars], train.wtbz)
        print(len(X_resampled))

        # Put the labels next to the features as the last column.
        stacked = np.concatenate(
            (X_resampled, y_resampled.reshape(-1, 1)), axis=1)
        trained = pd.DataFrame(stacked)
        trained.columns = vars + ['wtbz']

        gbm_cfg = models['defaultGBM']
        classifier = GradientBoostingClassifier(
            n_estimators=gbm_cfg['n_estimators'],
            learning_rate=gbm_cfg['learning_rate'],
            max_depth=gbm_cfg['max_depth'],
            max_features=gbm_cfg['max_features'],
            random_state=seed + 29)
        baseModel(classifier, vars, "rfbase" + str(round_no), trained, test, seed)
    return train, test
def setup_data(data, resample=True):
    """Build flat train/validation feature matrices from a nested data dict.

    For each of ``data['train']`` and ``data['valid']`` the features are the
    concatenation (along the last axis) of ``'b'`` with index 0 of axis 1 of
    ``'x'`` and of ``'a'``; the target is column 0 of ``'ys_seq'``.

    Args:
        data: Dict with ``'train'`` and ``'valid'`` entries, each holding
            ``'b'``, ``'x'``, ``'a'`` and ``'ys_seq'`` arrays; ``'valid'``
            also needs ``'ce'``. ``'train'`` may carry ``'subtype'`` and
            ``'subtype_oh'``.
        resample: When true, the training set is balanced with
            ``RandomOverSampler(random_state=0)``.

    Returns:
        Tuple ``(X, Y, S, S_oh, X_valid, Y_valid, CE_valid)``; ``S`` and
        ``S_oh`` are ``None`` when the training split has no ``'subtype'``.
    """
    def _first_step_features(split):
        # Static block 'b' plus slice 0 along axis 1 of 'x' and 'a'.
        return np.concatenate(
            [split['b'], split['x'][:, 0, :], split['a'][:, 0, :]], axis=-1)

    train = data['train']
    X = _first_step_features(train)
    Y = train['ys_seq'][:, 0]
    if resample:
        # Only random over-sampling is applied; the SMOTE/ADASYN instances
        # that used to be created here were never used.
        print('resampling...')
        X, Y = RandomOverSampler(random_state=0).fit_resample(X, Y)

    valid = data['valid']
    X_valid = _first_step_features(valid)
    Y_valid = valid['ys_seq'][:, 0]
    CE_valid = valid['ce']

    S, S_oh = None, None
    if 'subtype' in train:
        S = train['subtype']
        S_oh = train['subtype_oh']
    return X, Y, S, S_oh, X_valid, Y_valid, CE_valid
def fit(self, X, y, by, random_state=None, visualize=False):
    '''Resample the training data with method ``by`` and fit the base estimator.

    by: String
        The method used to perform re-sampling
        support: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS',
            'NM', 'CC', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN',
            'SMOTETomek', 'ORG']
        'ORG' fits on the data as given, without resampling.
    random_state:
        Forwarded to the samplers that accept it.
    visualize:
        Accepted but not used by this method.
    '''
    # Builders are lazy so only the requested sampler is instantiated.
    builders = {
        'RUS': lambda: RandomUnderSampler(random_state=random_state),
        'CNN': lambda: CondensedNearestNeighbour(random_state=random_state),
        'ENN': lambda: EditedNearestNeighbours(),
        'NCR': lambda: NeighbourhoodCleaningRule(),
        'Tomek': lambda: TomekLinks(),
        'ALLKNN': lambda: AllKNN(),
        'OSS': lambda: OneSidedSelection(random_state=random_state),
        'NM': lambda: NearMiss(),
        'CC': lambda: ClusterCentroids(random_state=random_state),
        'SMOTE': lambda: SMOTE(random_state=random_state),
        'ADASYN': lambda: ADASYN(random_state=random_state),
        'BorderSMOTE': lambda: BorderlineSMOTE(random_state=random_state),
        'SMOTEENN': lambda: SMOTEENN(random_state=random_state),
        'SMOTETomek': lambda: SMOTETomek(random_state=random_state),
        'ORG': lambda: None,
    }
    if by not in builders:
        raise Error('Unexpected \'by\' type {}'.format(by))

    sampler = builders[by]()
    if sampler is None:
        X_train, y_train = X, y
    else:
        X_train, y_train = sampler.fit_resample(X, y)
    self.base_estimator.fit(X_train, y_train)
def balance_smote(X, y, logger_ins, freq_dct, method, seed_num, n_cluster=5):
    """Oversample ``(X, y)`` with SMOTE or ADASYN towards ``freq_dct``.

    Args:
        X: Features.
        y: Labels.
        logger_ins: Logger used for progress messages; expected to follow the
            standard ``logging`` call convention (format string + arguments).
        freq_dct: Passed as ``sampling_strategy`` to the sampler.
        method: ``"SMOTE"`` selects SMOTE; any other value selects ADASYN.
        seed_num: Passed as ``random_state``.
        n_cluster: Neighbour count (``k_neighbors`` for SMOTE,
            ``n_neighbors`` for ADASYN).

    Returns:
        Tuple ``(X_res, y_res)``.
    """
    # Logging interpolates extra positional arguments into the message with
    # %-formatting, so every argument needs a matching placeholder; without
    # one the record fails to format and the message is lost.
    if method == "SMOTE":
        logger_ins.info("The sampling method is %s", method)
        imbl = SMOTE(sampling_strategy=freq_dct,
                     random_state=seed_num,
                     k_neighbors=n_cluster)
    else:
        imbl = ADASYN(sampling_strategy=freq_dct,
                      random_state=seed_num,
                      n_neighbors=n_cluster)
        logger_ins.info(
            "The sampling method is %s using %s as the number of clusters",
            method, n_cluster)
    # str() up front: a lone dict argument would otherwise be treated by
    # logging as a mapping of named format fields.
    logger_ins.info("frequencies should match %s", str(freq_dct))

    X_res, y_res = imbl.fit_resample(X, y)
    logger_ins.info('Resampled dataset shape %s' % Counter(y_res))
    return X_res, y_res
def makeOverSamplesADASYN(X, y):
    """
    Purpose
    ----------
    Increase the number of observations of the minority class using ADASYN
    (``random_state=7``).

    Parameters
    ----------
    X: Independent variables in DataFrame format
    y: Dependent variable in Pandas DataFrame format

    Returns:
    ----------
    Tuple ``(X_resampled, y_resampled)`` holding the independent and dependent
    variables after resampling the minority class
    """
    # Imported after the docstring: a string placed after another statement
    # is not registered as the function's docstring.
    from imblearn.over_sampling import ADASYN

    X_resampled, y_resampled = ADASYN(random_state=7).fit_sample(X, y)
    return (X_resampled, y_resampled)
def oversample(X, y, method="smote", pos_neg_frac=0.5, plot=False):
    """Oversample until the positive/negative ratio reaches ``pos_neg_frac``.

    Args:
        X: Features; normalised together with ``y`` by ``verify_pandas``.
        y: Labels; ``np.sum(y) / np.sum(~y)`` is used as the current
            positive/negative ratio (presumably boolean -- TODO confirm).
        method: ``"smote"``, ``"adasyn"``, ``"randomoversampler"``,
            ``"smoteenn"``, ``"smotetomek"``, or ``None`` for no resampling.
        pos_neg_frac: Target ratio passed as ``sampling_strategy``.
        plot: When true, call ``plot_imbalance`` and ``plot_reduced_dim``
            with the data before and after resampling.

    Returns:
        ``(X_res, y_res)`` as a DataFrame and Series, or the verified
        ``(X, y)`` when no resampling is performed.

    Raises:
        ValueError: If ``method`` is not recognised.
    """
    from imblearn.over_sampling import SMOTE
    from imblearn.over_sampling import ADASYN
    from imblearn.over_sampling import RandomOverSampler
    from imblearn.combine import SMOTEENN
    from imblearn.combine import SMOTETomek

    X, y = verify_pandas(X, y)

    current_frac = np.sum(y) / np.sum(~y)
    if pos_neg_frac <= current_frac:
        print(
            "Oversampling isn't need since Pos/Neg current = %.3f is greater than passed Pos/Neg ratio = %.3f"
            % (current_frac, pos_neg_frac))
        return X, y

    cols = X.columns
    if method is None:
        return X, y

    sampler_types = {
        "smote": SMOTE,
        "adasyn": ADASYN,
        "randomoversampler": RandomOverSampler,
        "smoteenn": SMOTEENN,
        "smotetomek": SMOTETomek,
    }
    if method not in sampler_types:
        raise ValueError("Over sampler not found")
    sampler = sampler_types[method](sampling_strategy=pos_neg_frac)

    # Resample deep copies so the caller's objects are left untouched.
    X_res, y_res = sampler.fit_resample(X.copy(deep=True), y.copy(deep=True))
    X_res = pd.DataFrame(X_res, columns=cols)
    y_res = pd.Series(y_res)

    if plot:
        print("=" * 100 + "\nPlotting Imbalance and Noise after Oversampling")
        plot_imbalance(y, y_res)
        plot_reduced_dim(X, y, X_res, y_res,
                         title1="Before Oversampling",
                         title2="After Oversampling")
    return X_res, y_res
def sample(self, nb_data_to_load, mode='combine'):
    """Load data, resample it and save the rounded result.

    Args:
        nb_data_to_load: Forwarded to ``self.load_data``.
        mode: ``'over'`` (ADASYN), ``'under'`` (TomekLinks) or ``'combine'``
            (SMOTETomek).

    Raises:
        ValueError: If ``mode`` is not one of the supported values. This is
            checked before any data is loaded; previously an unknown mode
            surfaced as an AttributeError on ``None`` after the load.
    """
    supported_modes = ('over', 'under', 'combine')
    if mode not in supported_modes:
        raise ValueError(
            "Unknown sampling mode {!r}; expected one of {}".format(
                mode, ', '.join(supported_modes)))

    X, y = self.load_data(nb_data_to_load)

    sampler_types = {
        'over': ADASYN,
        'under': TomekLinks,
        'combine': SMOTETomek,
    }
    sampler = sampler_types[mode]()
    X_resampled, y_resampled = sampler.fit_resample(X, y)

    # Columns 0 and 1 are rounded to whole numbers and column 2 to one
    # decimal (datetime, stage and temperature per the original note).
    X_resampled[:, 0] = X_resampled[:, 0].round()
    X_resampled[:, 1] = X_resampled[:, 1].round()
    X_resampled[:, 2] = X_resampled[:, 2].round(1)

    self.save_data(self.file_to_save, X_resampled, y_resampled)
def load_data(mode: str, normalize: bool = True):
    """Load the dataset, optionally resample it, and optionally standardise.

    Args:
        mode: ``'vanilla'`` (no resampling), ``'smote'``, ``'adasyn'``,
            ``'bordersmote'``, ``'randomover'``, ``'randomunder'``,
            ``'tomek'`` or ``'knn'``. Any other value leaves the data as is.
        normalize: When true, centre each feature column and divide by its
            standard deviation.

    Returns:
        Tuple ``(x, y, hidden)``. ``hidden`` is re-indexed only for the modes
        whose sampler returns the selected indices; the synthetic samplers
        leave it unchanged.
    """
    df, hidden_df = __load_data_first_time()

    # Split off the integer target, keep the rest as float features.
    y = np.array(df['earnings'].to_numpy(), dtype=int)
    del df['earnings']
    x = np.array(df.to_numpy(), dtype=float)
    hidden = hidden_df.to_numpy()

    # Samplers that create new rows.
    synthetic = {
        'smote': lambda: SMOTE(),
        'adasyn': lambda: ADASYN(),
        'bordersmote': lambda: BorderlineSMOTE(),
    }
    # Samplers that select existing rows and report which ones.
    index_tracking = {
        'randomover': lambda: RandomOverSampler(return_indices=True),
        'randomunder': lambda: RandomUnderSampler(return_indices=True),
        'tomek': lambda: TomekLinks(return_indices=True),
        'knn': lambda: CondensedNearestNeighbour(return_indices=True,
                                                 n_neighbors=3),
    }

    if mode in synthetic:
        x, y = synthetic[mode]().fit_sample(x, y)
    elif mode in index_tracking:
        x, y, selected = index_tracking[mode]().fit_sample(x, y)
        hidden = hidden[selected]

    if normalize:
        x -= np.mean(x, axis=0)
        x /= np.std(x, axis=0)
    return x, y, hidden
def get_oversampling_models():
    """Return the oversamplers to compare and their short names.

    Returns:
        Tuple ``(models, names)`` of two lists in matching order:
        RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE and ADASYN.
    """
    catalogue = [
        ('ROS', RandomOverSampler()),
        ('SMOTE', SMOTE()),
        ('BLSMOTE', BorderlineSMOTE()),
        ('SVMSMOTE', SVMSMOTE()),
        ('ADASYN', ADASYN()),
    ]
    models = [instance for _, instance in catalogue]
    names = [label for label, _ in catalogue]
    return models, names
def over_sample(self, features, method="BorderLine", sampling_strategy="minority", random_state=42, k_neighbors=5, n_neighbors=10, kind="borderline-1"):
    """
    Over-sampling helper.

    :param features: list, feature column names
    :param method: str, option: ADASYN, BorderLine, Random, SVM
    :param sampling_strategy: str or dict, option: 'minority', 'not majority',
        'all', 'auto', {1:n, 0:m}
    :param random_state: int
    :param k_neighbors: int, also passed to ADASYN as ``n_neighbors``
    :param n_neighbors: int, passed as ``m_neighbors`` to BorderLine and SVM
    :param kind: str, borderline-1, borderline-2
    :return: df, a new DataFrame with the resampled features and target; for
        an unsupported ``method`` a message is printed and ``self._df`` is
        returned unchanged
    """
    X = self._df[features].values
    y = self._df[self._target].values
    print("Original label shape {}".format(Counter(y)))

    common = {'sampling_strategy': sampling_strategy,
              'random_state': random_state}
    if method == "ADASYN":
        overSm = ADASYN(n_neighbors=k_neighbors, **common)
    elif method == "BorderLine":
        overSm = BorderlineSMOTE(k_neighbors=k_neighbors,
                                 m_neighbors=n_neighbors,
                                 kind=kind,
                                 **common)
    elif method == "Random":
        overSm = RandomOverSampler(**common)
    elif method == "SVM":
        overSm = SVMSMOTE(k_neighbors=k_neighbors,
                          m_neighbors=n_neighbors,
                          out_step=0.5,
                          **common)
    else:
        print("不支持{}该抽样方法".format(method))
        return self._df

    X_res, y_res = overSm.fit_resample(X, y)
    print("overSample label shape {}".format(Counter(y_res)))

    # Append the labels as the last column before rebuilding the frame.
    label_column = y_res.reshape(len(X_res), 1)
    _data = np.concatenate([X_res, label_column], axis=1)
    return pd.DataFrame(data=_data, columns=features + [self._target])
def classify(over_sampl, tf_idf, use_idf, pca, alphas, neighbors, slack, estimators, portion):
    """Train and compare the classifiers for one preprocessing configuration.

    input:
        over_sampl: string naming the oversampling method
            ("RandomOverSampler", "SMOTE" or "ADASYN"; anything else means
            no oversampling)
        tf_idf: boolean, whether to use tf(-idf) features; False means
            Word2Vec
        use_idf: boolean, whether to use idf
        pca: int, PCA dimension (<=0 means no PCA)
        alphas: NB tuning parameter (not used in this function)
        neighbors: KNN tuning parameter (not used in this function)
        slack: SVM tuning parameter (not used in this function)
        estimators: GradientBoosting, AdaBoost tuning parameter (not used in
            this function)
        portion: which airline data to work with (None means all airlines)

    Returns None. The Word2Vec + PCA combination is skipped immediately.
    """
    # The message is built with str.format so that ``portion=None`` (all
    # airlines) no longer fails on string concatenation.
    if not tf_idf:
        if pca > 0:
            return None
        message = (
            "Preprocessing used is Word2Vec & Over Sampling method is {}"
            " data portion {}".format(over_sampl, portion))
    else:
        weighting = "tf-idf" if use_idf else "tf"
        message = (
            "Preprocessing used is {} & Over Sampling method is {}"
            " PCA dimension = {} data portion {}".format(
                weighting, over_sampl, pca, portion))

    # load dataset
    ds = get_dataset()
    X_train, X_test, Y_train, Y_test = ds.load_data(
        tf_idf=tf_idf, use_idf=use_idf, use_pca=pca, airway_name=portion)

    # Only the training split is oversampled.
    if over_sampl == "RandomOverSampler":
        X_train, Y_train = RandomOverSampler().fit_sample(X_train, Y_train)
    elif over_sampl == "SMOTE":
        X_train, Y_train = SMOTE().fit_sample(X_train, Y_train)
    elif over_sampl == "ADASYN":
        X_train, Y_train = ADASYN().fit_sample(X_train, Y_train)

    clas = classifier()
    print(message)
    SVM_result, GB_result, AB_result, KNN_result, NB_result = clas.classify(
        X_train, X_test, Y_train, Y_test)
    compare_performance(SVM_result, GB_result, AB_result, KNN_result,
                        NB_result, message)
def get_sampling_technique():
    """Return the resampling techniques to compare and their display names.

    All samplers are seeded with ``random_state=123``.

    Returns:
        Tuple ``(sampling_technique, sampling_name)`` of two lists in
        matching order.
    """
    seed = 123
    sampling_technique = []
    sampling_name = []

    def _register(name, technique):
        sampling_technique.append(technique)
        sampling_name.append(name)

    _register('RandomOverSampler', RandomOverSampler(random_state=seed))
    _register('SMOTE', SMOTE(random_state=seed))
    _register('ADASYN', ADASYN(random_state=seed))
    _register('RandomUnderSampler', RandomUnderSampler(random_state=seed))
    _register('SMOTEENN', SMOTEENN(random_state=seed))
    _register('SMOTETomek', SMOTETomek(random_state=seed))

    # Random over-sampling followed by random under-sampling.
    over = RandomOverSampler(random_state=seed)
    under = RandomUnderSampler(random_state=seed)
    _register('Over-Under Resampling Combination',
              Pipeline(steps=[('o', over), ('u', under)]))

    # SMOTE followed by random under-sampling. The 'smote' step now holds the
    # SMOTE instance; it previously reused the RandomOverSampler by mistake.
    smote = SMOTE(random_state=seed)
    under = RandomUnderSampler(random_state=seed)
    _register('SMOTE-Under Resampling Combination',
              Pipeline(steps=[('smote', smote), ('u', under)]))

    return sampling_technique, sampling_name
def minority_oversample(X_train, Y_train, algorithm='random_oversample'):
    '''
    Oversample the minority class using the specified algorithm.

    :param X_train: Training set features
    :param Y_train: Training set labels
    :param algorithm: The oversampling algorithm to use. One of
        {"random_oversample", "smote", "adasyn"}; any other value falls back
        to random oversampling
    :return: A new training set containing oversampled examples
    '''
    if algorithm == 'smote':
        sampler_cls = SMOTE
    elif algorithm == 'adasyn':
        sampler_cls = ADASYN
    else:
        # Covers 'random_oversample' and every unrecognised name.
        sampler_cls = RandomOverSampler

    # The seed is drawn from NumPy's global random state on each call.
    sampler = sampler_cls(random_state=np.random.randint(0, high=1000))
    X_resampled, Y_resampled = sampler.fit_resample(X_train, Y_train)
    print("Train set shape before oversampling: ", X_train.shape,
          " Train set shape after resampling: ", X_resampled.shape)
    return X_resampled, Y_resampled
def get_data_for_prediction(train, test, best_feature, sampling_strategy=0.4, random_state=21):
    """Return scaled, ADASYN-balanced training data and scaled test features.

    The scaler is fitted on the selected training columns only and then
    reused for the test set.

    Args:
        train: DataFrame with ``user_id`` and ``is_churned`` columns.
        test: DataFrame with a ``user_id`` column.
        best_feature: Column selection applied to both frames.
        sampling_strategy: Passed to ADASYN.
        random_state: Passed to ADASYN.

    Returns:
        Tuple ``(x_train_bal, y_train_bal, x_test)``.
    """
    scaler = StandardScaler()

    train_features = train.drop(['user_id', 'is_churned'], axis=1)[best_feature]
    x_train = scaler.fit_transform(train_features)
    y_train = train['is_churned']

    balancer = ADASYN(random_state=random_state,
                      sampling_strategy=sampling_strategy)
    x_train_bal, y_train_bal = balancer.fit_sample(x_train, y_train)

    test_features = test.drop(['user_id'], axis=1)[best_feature]
    x_test = scaler.transform(test_features)
    return x_train_bal, y_train_bal, x_test
def get_best_feature(data, n_feature, random_state=21, sampling_strategy=0.4):
    """Return the names of the ``n_feature`` most important feature columns.

    The features (every column except ``user_id`` and ``is_churned``) are
    standardised, balanced with ADASYN, and used to fit an XGBoost
    classifier; columns are then ranked by the classifier's
    ``feature_importances_``, highest first.

    The previous implementation indexed the importance array as
    ``importances[0][:n_feature]``, which subscripts a single element rather
    than selecting the top features.

    Args:
        data: DataFrame with ``user_id`` and ``is_churned`` columns.
        n_feature: Number of column names to return.
        random_state: Seed for ADASYN and the classifier.
        sampling_strategy: Passed to ADASYN.

    Returns:
        List of column names ordered by decreasing importance.
    """
    features = data.drop(['user_id', 'is_churned'], axis=1)
    x = StandardScaler().fit_transform(features)
    y = data['is_churned']

    x_bal, y_bal = ADASYN(
        random_state=random_state,
        sampling_strategy=sampling_strategy).fit_sample(x, y)

    clf = xgb.XGBClassifier(n_estimators=100,
                            learning_rate=0.1,
                            random_state=random_state,
                            n_jobs=-1)
    clf.fit(x_bal, y_bal)

    # Scaling keeps the column order, so importances line up with
    # ``features.columns``. The sort is stable, so ties keep column order.
    ranked = sorted(zip(features.columns, clf.feature_importances_),
                    key=lambda pair: pair[1],
                    reverse=True)
    return [name for name, _ in ranked[:n_feature]]
def over_sampling_shift(x, y, delta=0.5, mode='smote', n_neighbors=5):
    """Under-sample with ``under_sampling_shift``, then oversample back.

    The class counts of the original ``y`` are recorded first and used as the
    ``sampling_strategy`` of the oversampler applied to the under-sampled
    data.

    Args:
        x: Features.
        y: Labels (squeezed before counting).
        delta: Forwarded to ``under_sampling_shift``.
        mode: ``'smote'`` or ``'adasyn'``.
        n_neighbors: Upper bound on the neighbour count; capped at the size
            of the smallest class after under-sampling minus one.

    Returns:
        Tuple ``(x_resampled, y_resampled)``.

    Raises:
        ValueError: If ``mode`` is not supported. This replaces an ``assert``,
            which is skipped when Python runs with ``-O``.
    """
    if mode not in ('smote', 'adasyn'):
        raise ValueError(
            "mode must be 'smote' or 'adasyn', got {!r}".format(mode))

    y_counts = Counter(np.squeeze(y))
    x_resampled, y_resampled = under_sampling_shift(x, y, delta=delta)

    n_min_samples = np.min(list(Counter(y_resampled).values()))
    n_neighbors = min(n_neighbors, n_min_samples - 1)

    if mode == 'smote':
        sampler = SMOTE(sampling_strategy=y_counts, k_neighbors=n_neighbors)
    else:
        sampler = ADASYN(sampling_strategy=y_counts, n_neighbors=n_neighbors)
    x_resampled, y_resampled = sampler.fit_resample(x_resampled, y_resampled)
    return x_resampled, y_resampled
def _check_imbalance(self, method: str = 'SMOTE', random_seed: int = 1769) -> dict:
    """Oversample the raw data and report class counts before and after.

    Side effects: sets ``self.feature_columns`` and pickles the resampled
    features and output to ``./data/output/feature.pkl`` and
    ``./data/output/output.pkl``.

    :param method: Type of oversampling. ``'SMOTE'`` or ``'ADASYN'``; any
        other value uses ``RandomOverSampler``.
    :param random_seed: Seed passed to the sampler.
    :return: dict with keys ``'before_sampling_counter'``,
        ``'after_sampling_counter'``, ``'feature_data'`` and
        ``'output_resampled'``.
    """
    target = self.output_column
    # ``set("label")`` would produce a set of characters and leave the target
    # among the features, so a single column name is wrapped explicitly.
    if isinstance(target, (list, tuple, set, frozenset)):
        target_names = set(target)
    else:
        target_names = {target}

    output = self.raw_data[target]
    # Keep the DataFrame's own column order so results are reproducible.
    self.feature_columns = [
        column for column in self.raw_data.columns
        if column not in target_names
    ]
    features = self.raw_data[self.feature_columns]
    before_sampling = Counter(output)

    if method == 'SMOTE':
        sampler = SMOTE(sampling_strategy='auto',
                        random_state=random_seed,
                        n_jobs=-1)
    elif method == 'ADASYN':
        sampler = ADASYN(sampling_strategy='auto',
                         random_state=random_seed,
                         n_jobs=-1)
    else:
        sampler = RandomOverSampler(sampling_strategy='auto',
                                    random_state=random_seed)

    features_resampled, output_resampled = sampler.fit_resample(
        features, output)

    # Context managers ensure the files are flushed and closed.
    with open("./data/output/feature.pkl", 'wb') as feature_file:
        pickle.dump(features_resampled, feature_file)
    with open("./data/output/output.pkl", 'wb') as output_file:
        pickle.dump(output_resampled, output_file)

    after_sampling = Counter(output_resampled)
    return {
        'before_sampling_counter': before_sampling,
        'after_sampling_counter': after_sampling,
        'feature_data': features_resampled,
        'output_resampled': output_resampled
    }
def test_ada_fit_sampling_strategy_error():
    """Expect ValueError for the ``{0: 9, 1: 12}`` sampling strategy."""
    targets = {0: 9, 1: 12}
    sampler = ADASYN(sampling_strategy=targets, random_state=RND_SEED)
    with raises(ValueError, match="No samples will be generated."):
        sampler.fit_resample(X, Y)
from imblearn.over_sampling import ADASYN # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random over-sampling ada = ADASYN() X_resampled, y_resampled = ada.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
def test_adasyn_error(adasyn_params, err_msg):
    """Expect ValueError matching ``err_msg`` for the given ADASYN settings.

    Args:
        adasyn_params: Keyword arguments used to construct ADASYN.
        err_msg: Pattern the raised message must match.
    """
    sampler = ADASYN(**adasyn_params)
    with pytest.raises(ValueError, match=err_msg):
        sampler.fit_resample(X, Y)
horizontalalignment="center", color="white" if cm[i, j] > thresh else "black") plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') #define X y X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) #define the size of test #sklearn.model_selection.train_test_split随机划分训练集与测试集 #train_test_split(train_data,train_target,test_size=数字, random_state=0) #ADASYN ada = ADASYN() os_X,os_y = ada.fit_sample(X_train,y_train) os_X = pd.DataFrame(os_X) os_y = pd.DataFrame(os_y) #logistic best_c = printing_Kfold_scores(os_X,os_y) clf_l = LogisticRegression(C = best_c, penalty = 'l1') clf_l.fit(os_X,os_y.values.ravel()) y_pred = clf_l.predict(X_test) #调用ravel()函数将矩阵转变成一维数组 #(ravel()函数与flatten()的区别) # 两者所要实现的功能是一致的(将多维数组降为一维), # 两者的区别在于返回拷贝(copy)还是返回视图(view), # numpy.flatten() 返回一份拷贝,对拷贝所做的修改不会影响(reflects)原始矩阵, # 而numpy.ravel()返回的是视图(view),会影响(reflects)原始矩阵。
def test_ada_wrong_nn_obj():
    """Expect ValueError when ``n_neighbors`` is the string ``'rnd'``."""
    invalid_neighbors = 'rnd'
    sampler = ADASYN(random_state=RND_SEED, n_neighbors=invalid_neighbors)
    with raises(ValueError, match="has to be one of"):
        sampler.fit_sample(X, Y)
def test_ada_fit_ratio_error():
    """Expect ValueError for the ``{0: 9, 1: 12}`` ratio."""
    targets = {0: 9, 1: 12}
    sampler = ADASYN(ratio=targets, random_state=RND_SEED)
    with raises(ValueError, match="No samples will be generated."):
        sampler.fit_sample(X, Y)
def test_ada_fit():
    """Check the ``ratio_`` mapping that ``fit`` stores on the sampler."""
    sampler = ADASYN(random_state=RND_SEED)
    sampler.fit(X, Y)

    expected_ratio = {0: 4, 1: 0}
    assert sampler.ratio_ == expected_ratio