def __init__(self, window_size=6, training_ratio=.7, seq="sequence", pos="label"):
    """Initialise empty data holders and the classifier/resampler/vectoriser registries.

    Args:
        window_size: Stored unchanged on ``self.window_size``.
        training_ratio: Float fraction of the data used for training.
        seq: Stored unchanged on ``self.seq``.
        pos: Stored unchanged on ``self.pos``.
    """
    self.training_ratio = training_ratio  # fraction of the data used for training
    self.features = []
    self.labels = []
    self.words = []
    self.window_size = window_size

    # Ready-made estimator instances, looked up by short name.
    classifier_registry = {
        "forest": RandomForestClassifier(n_jobs=4),
        "mlp_adam": MLPClassifier(),
        "svc": svm.SVC(verbose=1),
        "xgb": XGBClassifier(max_delta_step=5),
        "bagging": BaggingClassifier(),
        "one_class_svm": OneClassSVM(kernel="rbf"),
    }
    self.supervised_classifiers = classifier_registry

    # Ready-made resampler instances; "pass" maps to the sentinel -1
    # instead of a sampler object.
    resampler_registry = {
        "easy_ensemble": EasyEnsemble(),
        "SMOTEENN": SMOTEENN(),
        "SMOTETomek": SMOTETomek(),
        "ADASYN": ADASYN(),
        "random_under_sample": RandomUnderSampler(),
        "ncl": NeighbourhoodCleaningRule(),
        "near_miss": NearMiss(),
        "pass": -1,
    }
    self.imbalance_functions = resampler_registry

    self.seq = seq
    self.pos = pos
    self.random_data = 0
    self.test_results = 0

    # Vectoriser lookup; the "w2v" entry is the plain string "w2v",
    # not a callable like the others.
    vectorisers = {
        "sequence": sequence_vector,
        "chemical": chemical_vector,
        "binary": binary_vector,
        "w2v": "w2v",
    }
    self.vecs = vectorisers

    self.vector = 0
    self.features_labels = {}
    self.test_cv = 0
    self.benchmark_mcc = 0
    self.mcc_scorer = make_scorer(matthews_corrcoef)
def Balance_classes(X_train, y_train, Sampling_Function):
    """Resample a training set with the sampler named by ``Sampling_Function``.

    Args:
        X_train: Training features, passed straight to the sampler.
        y_train: Training labels, passed straight to the sampler.
        Sampling_Function: Name of the resampling strategy. One of the keys
            of ``sampler_factories`` below.

    Returns:
        The ``(X_train_res, y_train_res)`` pair returned by the sampler's
        ``fit_sample``.

    Raises:
        ValueError: If ``Sampling_Function`` is not a supported name.
            (Previously this surfaced as an ``UnboundLocalError``.)
    """
    # Factories are lambdas so that only the requested sampler is built.
    sampler_factories = {
        'RandomUnderSampler':
            lambda: RandomUnderSampler(ratio=0.5, random_state=1),
        'NearMiss1':
            lambda: NearMiss(ratio=0.5, random_state=1, version=1, size_ngh=3),
        'NearMiss2':
            lambda: NearMiss(ratio=0.5, random_state=1, version=2, size_ngh=3),
        'NearMiss3':
            lambda: NearMiss(ratio=0.5, random_state=1, version=3,
                             ver3_samp_ngh=3),
        'CondensedNearestNeighbour':
            lambda: CondensedNearestNeighbour(random_state=1),
        'EditedNearestNeighbours':
            lambda: EditedNearestNeighbours(random_state=1, size_ngh=5),
        # NOTE(review): this name maps to the plain EditedNearestNeighbours
        # sampler, exactly as before. It looks like a copy/paste slip for
        # RepeatedEditedNearestNeighbours -- confirm and switch if that class
        # is importable here.
        'RepeatedEditedNearestNeighbours':
            lambda: EditedNearestNeighbours(random_state=1, size_ngh=5),
        'TomekLinks':
            lambda: TomekLinks(random_state=1),
        'RandomOverSampler':
            lambda: RandomOverSampler(ratio=0.5, random_state=1),
        'SMOTE':
            lambda: SMOTE(ratio=0.5, k=5, random_state=1),
        'SMOTETomek':
            lambda: SMOTETomek(ratio=0.5, k=5, random_state=1),
        'SMOTEENN':
            lambda: SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5),
        'EasyEnsemble':
            lambda: EasyEnsemble(),
        'BalanceCascade_rf':
            lambda: BalanceCascade(classifier='random-forest', random_state=1),
        'BalanceCascade_svm':
            lambda: BalanceCascade(classifier='linear-svm', random_state=1),
    }

    if Sampling_Function not in sampler_factories:
        raise ValueError(
            "Unknown Sampling_Function %r; expected one of: %s"
            % (Sampling_Function, ", ".join(sorted(sampler_factories))))

    us = sampler_factories[Sampling_Function]()
    X_train_res, y_train_res = us.fit_sample(X_train, y_train)
    return X_train_res, y_train_res
def buildModel(clf, X, y, cv_nums=10, is_random=False):
    """Print the mean cross-validated F1 of ``clf`` under several resampling schemes.

    The classifier is scored on the original data, on randomly over-sampled
    data, on ClusterCentroids under-sampled data and on SMOTE data. An
    EasyEnsemble resampling is computed last.

    Args:
        clf: Estimator handed to ``cross_val_score``.
        X: Feature matrix.
        y: Target vector.
        cv_nums: Number of cross-validation folds.
        is_random: When truthy, each sampler gets a random seed drawn from
            ``[0, 1000)``; otherwise every sampler uses seed 0. Any falsy
            value now selects the fixed seeds (previously a value equal to
            neither ``True`` nor ``False`` left the seed list unbound).
    """
    # Whether to randomise the seeds used by the four samplers.
    if is_random:
        random_lst = list(np.random.randint(0, 1000, 4))
    else:
        random_lst = [0] * 4

    def mean_f1(features, target):
        # Mean F1 over ``cv_nums`` cross-validation folds.
        return np.mean(
            cross_val_score(clf, features, target, scoring='f1', cv=cv_nums))

    # Header line: results of the imbalance-handling methods, reported as the
    # mean F1 over cv_nums-fold cross-validation.
    print('----------各种类别不平衡处理方法结果, 为' + str(cv_nums) + '折交叉验证的f1均值----------')

    # No treatment: predict on the original data set.
    print('原始数据集: ', mean_f1(X, y))

    # Random over-sampling.
    ros = RandomOverSampler(random_state=random_lst[0])
    X_oversampled, y_oversampled = ros.fit_sample(X, y)
    print('过采样: ', mean_f1(X_oversampled, y_oversampled))

    # Under-sampling with ClusterCentroids.
    cc = ClusterCentroids(random_state=random_lst[1])
    X_undersampled, y_undersampled = cc.fit_sample(X, y)
    print('欠采样: ', mean_f1(X_undersampled, y_undersampled))

    # SMOTE.
    sm = SMOTE(random_state=random_lst[2])
    X_smote, y_smote = sm.fit_sample(X, y)
    print('SMOTE: ', mean_f1(X_smote, y_smote))

    # Split the majority class into several subsets, one per learner: each
    # learner sees under-sampled data, but globally no important information
    # is lost. E.g. with the negative class split into 10 parts and a single
    # positive part, 10 learners are trained, each on one negative part plus
    # the shared positive part.
    ee = EasyEnsemble(random_state=random_lst[3], n_subsets=10)
    # NOTE(review): the EasyEnsemble subsets are computed but not used by
    # anything visible in this function.
    X_ee, y_ee = ee.fit_sample(X, y)
def ensemble_train(X, y, working_dir, n, name, svm=True, retrain=True):
    """Train one classifier per EasyEnsemble subset and pickle the list.

    Args:
        X: Feature matrix.
        y: Target vector.
        working_dir: Directory holding ``<name>.pkl``.
        n: Number of EasyEnsemble subsets, and therefore of classifiers.
        name: Model name; used for the pickle file name and in log output.
        svm: If truthy, train linear ``SVC`` models with probability
            estimates; otherwise train ``AdaBoostClassifier(n_estimators=20)``.
        retrain: If true (the default, matching the previous always-retrain
            behaviour), ignore any pickled model and train from scratch. If
            false, return the pickled model when it can be loaded and only
            train when it cannot.

    Returns:
        List of fitted classifiers, one per subset (or the unpickled object
        when ``retrain`` is false and the cache loads).
    """
    model_path = working_dir + "/" + name + '.pkl'

    if not retrain:
        # SECURITY: pickle.load can execute arbitrary code. Only point
        # ``working_dir`` at files this program wrote itself.
        try:
            with open(model_path, 'rb') as f1:
                return pickle.load(f1)
        except (IOError, OSError, EOFError, pickle.UnpicklingError):
            # Cache missing or unreadable: fall through and train.
            pass

    ees = EasyEnsemble(random_state=557, n_subsets=n)
    X_res, y_res = ees.fit_sample(X, y)

    clf = []
    for X_sub, y_sub in zip(X_res, y_res):
        print(Counter(y_sub))
        if svm:
            clfi = SVC(kernel="linear", probability=True)
        else:
            clfi = AdaBoostClassifier(n_estimators=20)
        clfi.fit(X_sub, y_sub)
        clf.append(clfi)
        # Report 4-fold ROC AUC on the same subset the model was fitted on.
        scores = cross_val_score(clfi, X_sub, y_sub, cv=4, scoring="roc_auc")
        print("Name %s ROC_AUC: %0.2f (+/- %0.2f)"
              % (name, scores.mean(), scores.std() * 2))

    with open(model_path, 'wb') as f1:
        pickle.dump(clf, f1)
    return clf
def test_fit_sample_half():
    """fit_sample with a per-class dict sampling_strategy yields the expected subsets."""
    # Requested number of samples per class.
    per_class_counts = {0: 2, 1: 3, 2: 3}

    sampler = EasyEnsemble(sampling_strategy=per_class_counts,
                           random_state=RND_SEED,
                           n_subsets=3)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Ground truth: three subsets of eight 2-D points each.
    expected_X = np.array([
        [[-0.58539673, 0.62515052],
         [0.85117925, 1.0185556],
         [1.35269503, 0.44812421],
         [-1.23195149, 0.15427291],
         [0.5220963, 0.11349303],
         [1.10915364, 0.05718352],
         [0.59091459, 0.40692742],
         [0.22039505, 0.26469445]],
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [1.35269503, 0.44812421],
         [-2.10724436, 0.70263997],
         [-1.23195149, 0.15427291],
         [0.59091459, 0.40692742],
         [0.22039505, 0.26469445],
         [1.10915364, 0.05718352]],
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [-1.23195149, 0.15427291],
         [0.5220963, 0.11349303],
         [1.35269503, 0.44812421],
         [1.10915364, 0.05718352],
         [0.59091459, 0.40692742],
         [0.22039505, 0.26469445]],
    ])
    # Every subset carries the same label layout.
    expected_y = np.array([[0, 0, 1, 1, 1, 2, 2, 2]] * 3)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def __init__(self, base_model, n_subsets):
    """Store the base model and build the EasyEnsemble sampler.

    Args:
        base_model: Stored on ``self.base_model``.
        n_subsets: Number of subsets the EasyEnsemble sampler produces.
            This is now forwarded to the sampler; previously the sampler
            was always built with a hard-coded ``n_subsets=4`` regardless
            of this argument.
    """
    self.base_model = base_model
    self.n_subsets = n_subsets
    self.easy_ensemble = EasyEnsemble('auto',
                                      random_state=RAND_SEED,
                                      n_subsets=n_subsets)
    # Starts empty; nothing in this constructor fills it.
    self.trained_based_models = []
def test_fit_sample_auto():
    """fit_sample with ratio='auto' returns the expected subsets and indices."""
    sampler = EasyEnsemble(ratio='auto',
                           random_state=RND_SEED,
                           return_indices=True,
                           n_subsets=3)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Ground truth: three subsets of six 2-D points each.
    expected_X = np.array([
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [1.35269503, 0.44812421],
         [0.5220963, 0.11349303],
         [1.10915364, 0.05718352],
         [0.22039505, 0.26469445]],
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [-1.23195149, 0.15427291],
         [-2.10724436, 0.70263997],
         [0.22039505, 0.26469445],
         [1.10915364, 0.05718352]],
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [-1.23195149, 0.15427291],
         [0.5220963, 0.11349303],
         [1.10915364, 0.05718352],
         [0.59091459, 0.40692742]],
    ])
    expected_y = np.array([[0, 0, 1, 1, 2, 2]] * 3)
    expected_idx = np.array([
        [5, 9, 4, 0, 2, 3],
        [5, 9, 8, 6, 3, 2],
        [5, 9, 8, 0, 2, 1],
    ])

    comparisons = (
        (X_resampled, expected_X),
        (y_resampled, expected_y),
        (idx_under, expected_idx),
    )
    for actual, expected in comparisons:
        assert_array_equal(actual, expected)
def ezensemble(X_train, y_train):
    """Under-sample with EasyEnsemble and return one subset as DataFrames.

    Args:
        X_train: Training features. Iterating it must yield the column
            labels (as iterating a pandas DataFrame does).
        y_train: Training target.

    Returns:
        ``(X_resampled, y_resampled)`` DataFrames built from the subset at
        index 1 of the 10 generated. The target frame has the single
        column ``'Target'``.
    """
    column_names = list(X_train)
    ee = EasyEnsemble(random_state=0, n_subsets=10)
    # fit_sample fits the sampler itself, so the separate ee.fit() call that
    # used to precede it only repeated the same work.
    X_resampled, y_resampled = ee.fit_sample(X_train, y_train)
    # NOTE(review): only the subset at index 1 is kept; the other nine are
    # discarded -- confirm this is intended.
    X_resampled = pd.DataFrame(X_resampled[1], columns=column_names)
    y_resampled = pd.DataFrame(y_resampled[1], columns=['Target'])
    return X_resampled, y_resampled
def test_continuous_error():
    """Fitting on a continuous target emits a UserWarning."""
    # Ten evenly spaced floats in [0, 1] stand in for a continuous target.
    continuous_target = np.linspace(0, 1, 10)
    sampler = EasyEnsemble(random_state=RND_SEED)
    assert_warns(UserWarning, sampler.fit, X, continuous_target)
def test_ee_fit_invalid_ratio():
    """Fitting raises RuntimeError when the requested ratio is smaller than
    the ratio already present in the data."""
    tiny_ratio = 1. / 10000.
    sampler = EasyEnsemble(ratio=tiny_ratio, random_state=RND_SEED)
    assert_raises(RuntimeError, sampler.fit, X, Y)
def easy_ensemble(train_set, train_label):
    """Resample ``train_set``/``train_label`` with EasyEnsemble.

    The number of subsets comes from the module-level ``easy_ensemble_num``.
    The sampler also returns the selected indices, which are discarded here.

    Returns:
        ``(X_resampled, y_resampled)`` as returned by ``fit_sample``.
    """
    sampler_options = dict(
        ratio='auto',
        return_indices=True,
        random_state=None,
        replacement=False,
        n_subsets=easy_ensemble_num,
    )
    sampler = EasyEnsemble(**sampler_options)
    X_resampled, y_resampled, _indices = sampler.fit_sample(train_set,
                                                            train_label)
    return X_resampled, y_resampled
def test_ee_init():
    """The constructor stores its arguments and applies the documented defaults."""
    ratio = 1.
    sampler = EasyEnsemble(ratio=ratio, random_state=RND_SEED)

    expected_attributes = (
        ('ratio', ratio),
        ('replacement', False),
        ('n_subsets', 10),
        ('random_state', RND_SEED),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(sampler, attribute), expected)
def test_ee_init():
    """The constructor stores ratio/random_state and defaults to
    replacement=False, n_subsets=10."""
    requested_ratio = 1.
    sampler = EasyEnsemble(ratio=requested_ratio, random_state=RND_SEED)

    # Values passed explicitly.
    assert sampler.ratio == requested_ratio
    # Defaults.
    assert sampler.replacement is False
    assert sampler.n_subsets == 10
    # Value passed explicitly.
    assert sampler.random_state == RND_SEED
def EasySample(data):
    """Resample ``data`` with EasyEnsemble and print the resulting shapes.

    Features are the first two columns of ``data``; the target is its
    second-to-last column. Nothing is returned.
    """
    features = data.iloc[:, 0:2]
    target = data.iloc[:, -2]

    # Handle the imbalanced samples with the EasyEnsemble ensemble method.
    sampler = EasyEnsemble()
    resampled_features, resampled_target = sampler.fit_sample(features, target)

    # Report the shape of the resampled feature set, then of the label set.
    print(resampled_features.shape)
    print(resampled_target.shape)
def test_random_state_none():
    """fit_sample runs to completion when random_state is None."""
    sampler = EasyEnsemble(sampling_strategy='auto', random_state=None)
    # Only checks that the call does not raise; the output is not inspected.
    X_resampled, y_resampled = sampler.fit_sample(X, Y)
def test_sample_wrong_X():
    """Sampling data that differs from the fitted data raises RuntimeError."""
    sampler = EasyEnsemble(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Fresh random data, unrelated to what the sampler was fitted on.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def test_sample_wt_fit():
    """Calling sample before fit raises RuntimeError."""
    unfitted_sampler = EasyEnsemble(ratio='auto', random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted_sampler.sample, X, Y)
def test_ee_init():
    """The constructor stores sampling_strategy/random_state and defaults to
    replacement=False, n_subsets=10."""
    strategy = 1.
    sampler = EasyEnsemble(sampling_strategy=strategy, random_state=RND_SEED)

    # Values passed explicitly.
    assert sampler.sampling_strategy == strategy
    # Defaults.
    assert sampler.replacement is False
    assert sampler.n_subsets == 10
    # Value passed explicitly.
    assert sampler.random_state == RND_SEED
def test_random_state_none():
    """fit_sample runs to completion when random_state is None."""
    sampler = EasyEnsemble(ratio='auto', random_state=None)
    # Only checks that the call does not raise; the output is not inspected.
    X_resampled, y_resampled = sampler.fit_sample(X, Y)
def test_ee_init():
    """The constructor stores its arguments and applies the defaults."""
    ratio = 1.
    sampler = EasyEnsemble(ratio=ratio, random_state=RND_SEED)

    expected_attributes = (
        ('ratio', ratio),
        ('replacement', False),
        ('n_subsets', 10),
        ('random_state', RND_SEED),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(sampler, attribute), expected)
def generate_data(self, random_=1, random_ratio=2, random_test=0):
    """Build shuffled training/test splits, optionally padded with random negatives.

    Positive features are labelled 1 and negative features 0. Each resampler
    named in ``self.imba`` is applied in turn. Random-sequence features, when
    generated, are always labelled 0.

    Args:
        random_: Random features are generated only when this equals 1 (and
            ``random_ratio`` is positive).
        random_ratio: Number of random features, as a multiple of the total
            number of positive plus negative features.
        random_test: When 0, random features are appended to the training
            split only, after shuffling and splitting. Otherwise they are
            mixed into the data before shuffling and so may land in either
            split.

    Sets ``self.training_features``, ``self.training_labels``,
    ``self.test_features`` and ``self.test_labels``.
    """
    # NOTE: the "smote" key maps to SMOTEENN, not plain SMOTE.
    imb_fun = {
        "smote": SMOTEENN(),
        "under": RandomUnderSampler(),
        "adasyn": ADASYN(),
        "ee": EasyEnsemble(),
        "smotetomek": SMOTETomek()
    }
    rand_features = []
    neg_labels = [0] * len(self.neg_features)
    pos_labels = [1] * len(self.pos_features)
    features = self.pos_features + self.neg_features
    labels = pos_labels + neg_labels

    if self.imba:
        for sampler_name in self.imba:
            features, labels = imb_fun[sampler_name].fit_sample(features,
                                                                labels)

    if random_ == 1 and random_ratio > 0:
        n_random = int(
            (len(self.pos_features) + len(self.neg_features)) * random_ratio)
        for _ in range(n_random):
            sequence = random_seq(locked=self.pos_seq,
                                  wing_size=self.window,
                                  center=self.amino_acid)
            rand_features.append(
                featurify(ProteinAnalysis(sequence), (2 * self.window + 1)))

    mix_random_before_split = random_test != 0
    if mix_random_before_split:
        features = features + rand_features
        labels = labels + [0] * len(rand_features)

    # Shuffle features and labels together so pairs stay aligned.
    paired = list(zip(features, labels))
    random.shuffle(paired)
    features, labels = zip(*paired)

    training_slice = int(self.training_ratio * len(labels))
    train_features = list(features[:training_slice])
    train_labels = list(labels[:training_slice])
    if not mix_random_before_split:
        # Random negatives go to the training split only.
        train_features = train_features + rand_features
        train_labels = train_labels + [0] * len(rand_features)

    self.training_features = train_features
    self.training_labels = train_labels
    self.test_features = features[training_slice:]
    self.test_labels = labels[training_slice:]
def test_ee_fit_single_class():
    """Fitting on a target with a single class emits a UserWarning."""
    sampler = EasyEnsemble(ratio='auto', random_state=RND_SEED)
    # All-zero target: one label per row of X, all the same class.
    single_class_target = np.zeros((X.shape[0], ))
    assert_warns(UserWarning, sampler.fit, X, single_class_target)
def test_ee_bad_ratio():
    """Fitting raises ValueError for every unsupported ``ratio`` value."""
    invalid_ratios = (
        -1.0,      # negative
        100.0,     # greater than 1
        'rnd',     # unknown string
        [.5, .5],  # a list, which is not supported
    )
    for bad_ratio in invalid_ratios:
        sampler = EasyEnsemble(ratio=bad_ratio)
        assert_raises(ValueError, sampler.fit, X, Y)
def ensemble_model(X_train, y_train):
    """Over-sample with BorderlineSMOTE, then under-sample with EasyEnsemble.

    Returns:
        Element 0 of each of the two objects returned by the pipeline's
        ``fit_resample``.
    """
    # Step 'o' over-samples, step 'u' under-samples.
    resampling_pipeline = Pipeline(steps=[
        ('o', BorderlineSMOTE(k_neighbors=7, kind="borderline-1")),
        ('u', EasyEnsemble(random_state=1)),
    ])
    resampled = resampling_pipeline.fit_resample(X_train, y_train)
    new_X_train, new_y_train = resampled
    return new_X_train[0], new_y_train[0]
def fit(self, train_x, train_y):
    """Fit one clone of the base classifier on each EasyEnsemble subset.

    The sampler draws ``self._no_of_estimators`` subsets with replacement.
    The fitted clones replace any previous content of ``self._estimators``.

    Returns:
        ``self``.
    """
    self._estimators = []
    sampler = EasyEnsemble(replacement=True,
                           n_subsets=self._no_of_estimators)
    subsets_X, subsets_y = sampler.fit_sample(train_x, train_y)

    for subset_idx in range(self._no_of_estimators):
        subset_X = subsets_X[subset_idx, :, :]
        subset_y = subsets_y[subset_idx, :]
        member = clone(self._base_classifier)
        member.fit(subset_X, subset_y)
        self._estimators.append(member)
    return self
def _load_features_and_flags(path):
    """Load a ``.npy`` array once; return (all columns but the last, last column)."""
    raw = np.load(path)
    return raw[:, :-1], raw[:, -1]


def get_downsampling_data(train_pth="data/train_data.npy",
                          val_pth="data/val_data.npy",
                          test_pth="data/test_data.npy"):
    """Load the train/val/test arrays and under-sample the training split.

    In each file the last column is the flag and the remaining columns are
    the data. Only the training split is resampled, using EasyEnsemble with
    10 subsets and seed 0. All flags are cast to ``int``.

    Args:
        train_pth: Path to the training ``.npy`` file.
        val_pth: Path to the validation ``.npy`` file.
        test_pth: Path to the test ``.npy`` file.

    Returns:
        ``(train_data, train_flag, val_data, val_flag, test_data, test_flag)``.
    """
    train_data, train_flag = _load_features_and_flags(train_pth)
    ee = EasyEnsemble(random_state=0, n_subsets=10)
    train_data, train_flag = ee.fit_sample(train_data, train_flag)
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24, so use ``int`` directly (same resulting dtype).
    train_flag = np.array(train_flag, dtype=int)

    val_data, val_flag = _load_features_and_flags(val_pth)
    val_flag = np.array(val_flag, dtype=int)

    test_data, test_flag = _load_features_and_flags(test_pth)
    test_flag = np.array(test_flag, dtype=int)

    return train_data, train_flag, val_data, val_flag, test_data, test_flag
def test_ee_fit():
    """fit records the minority/majority classes and the per-class counts."""
    sampler = EasyEnsemble(ratio='auto', random_state=RND_SEED)
    sampler.fit(X, Y)

    # Class identities computed during fitting.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    # Per-class sample counts.
    for label, expected_count in ((0, 500), (1, 4500)):
        assert_equal(sampler.stats_c_[label], expected_count)
def test_multiclass_fit_sample():
    """fit_sample on a three-class target balances every class to 400 samples."""
    # Turn the target into a multiclass one by relabelling the first 1000 rows.
    multiclass_y = Y.copy()
    multiclass_y[0:1000] = 2

    sampler = EasyEnsemble(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, multiclass_y)

    # Inspect the class counts of the first subset only.
    first_subset_counts = Counter(y_resampled[0])
    for label in (0, 1, 2):
        assert_equal(first_subset_counts[label], 400)
def test_ee_init():
    """The constructor stores its arguments, applies the defaults and starts
    with empty fit statistics."""
    ratio = 1.
    verbose = True
    sampler = EasyEnsemble(ratio=ratio,
                           random_state=RND_SEED,
                           verbose=verbose)

    expected_attributes = (
        ('ratio', ratio),
        ('replacement', False),
        ('n_subsets', 10),
        ('random_state', RND_SEED),
        ('verbose', verbose),
        # Fit statistics are unset until fit() is called.
        ('min_c_', None),
        ('maj_c_', None),
        ('stats_c_', {}),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(sampler, attribute), expected)
def easy_ensemble_classifier(clf, x_train, y_train, x_test, nsubs, repl):
    """Average the predictions of ``clf`` fitted on each EasyEnsemble subset.

    ``clf`` itself is refitted in place on every subset, so after the call it
    is left fitted on the last one.

    Args:
        clf: Estimator with ``fit`` and ``predict``.
        x_train: Training features.
        y_train: Training labels.
        x_test: Samples to predict.
        nsubs: Number of EasyEnsemble subsets; also the divisor of the
            summed predictions.
        repl: Passed to EasyEnsemble as ``replacement``.

    Returns:
        Array of shape ``(1, n_test)`` holding the summed predictions
        divided by ``nsubs``.
    """
    sampler = EasyEnsemble(n_subsets=nsubs, replacement=repl)
    X_train_res, y_train_res = sampler.fit_sample(x_train, y_train)

    # Running sum of the per-subset predictions.
    summed_predictions = np.zeros([1, np.shape(x_test)[0]])
    for subset_idx, subset_X in enumerate(X_train_res):
        clf.fit(subset_X, y_train_res[subset_idx])
        summed_predictions = np.add(summed_predictions, clf.predict(x_test))
    return np.divide(summed_predictions, nsubs)