def create_metric(soft, metric, release, fold=3, boderlinesmote=False):
    """Write shuffled per-fold training files for every release of a project.

    For each release ``1..release`` the release CSV is handed to
    ``getAutoSpearmanMetric``.  The first value it returns is collected;
    the second (a DataFrame) is written ``fold`` times, each time
    optionally rebalanced with Borderline-SMOTE and then shuffled, as a
    space-separated file without header or index.

    :param soft: project name, used in directory and file names
    :param metric: metric-set name, used as a directory name
    :param release: number of releases to process
    :param fold: number of shuffled copies written per release
    :param boderlinesmote: when true, rebalance the data before each write
    :return: list with the first value returned by
        ``getAutoSpearmanMetric`` for each release
    """
    collected_metrics = []
    for release_idx in range(release):
        release_tag = str(release_idx + 1)
        csv_path = ('F:\\orca-master\\exampledata\\mData\\ordinalRegressionData\\Three severity\\'
                    + metric + '\\' + soft + '\\' + release_tag
                    + '_code&network_metrics&bugs.csv')
        selected, frame = getAutoSpearmanMetric(csv_path)
        collected_metrics.append(selected)

        release_name = soft + release_tag
        for fold_idx in range(fold):
            if boderlinesmote:
                # Rebalance with Borderline-SMOTE.  The resampled frame
                # replaces ``frame`` and is therefore the input of the
                # next fold as well.
                frame = frame.dropna(axis=1)
                features = frame.iloc[:, 0:-1]
                target = frame.iloc[:, -1:]
                balancer = BorderlineSMOTE(random_state=16,
                                           kind="borderline-1")
                features_res, target_res = balancer.fit_resample(features,
                                                                 target)
                frame = pd.merge(features_res, target_res, how='left',
                                 left_index=True, right_index=True)
            save_path = ('F:\\orca-master\\exampledata\\' + metric + '\\'
                         + soft + '\\' + str(fold) + '-fold\\' + release_name
                         + '\\matlab\\' + 'train_' + release_name
                         + '.' + str(fold_idx))
            shuffled = shuffle(frame)
            shuffled.to_csv(save_path, header=None, index=False, sep=" ")
    return collected_metrics
def main(path, began, is_cent_data=0, iterations=30, temperature=5,
         attractive_force=1, repulsive_force=0.4, speed=0.02, k=0.5):
    """Compare training on ``fr``-transformed data against the raw data.

    The data set is loaded through ``tool.read_KEEL_data`` and
    ``tool.unitilize_data`` (normalised data as a DataFrame, according to
    the original comments).  It is then transformed with ``fr``, split
    70/30, the training part is balanced with ``B_SMOTE`` and passed to
    ``train``.  The same split/balance/train sequence is repeated on the
    untransformed data.

    :param path: location of the data file
    :param began: forwarded to ``tool.read_KEEL_data``
    :param is_cent_data: when non-zero, ``get_cent_point`` is applied
        before ``fr``; otherwise the data is used as a plain array
    :param iterations, temperature, attractive_force, repulsive_force,
        speed, k: forwarded unchanged to ``fr``
    """
    my_data = tool.unitilize_data(tool.read_KEEL_data(path, began))

    # Optional centring step before the fr transformation.
    fr_input = get_cent_point(my_data) if is_cent_data != 0 else np.array(my_data)
    fr_frame = pd.DataFrame(fr(fr_input, iterations, temperature,
                               attractive_force, repulsive_force, speed, k))

    # Hold out 30% for testing, then balance only the training part.
    X_train, X_test, Y_train, Y_test = train_test_split(
        fr_frame.iloc[:, 0:-1], fr_frame.iloc[:, -1],
        test_size=0.3, random_state=42)
    balancer = B_SMOTE()
    balanced_x, balanced_y = balancer.fit_sample(X_train, Y_train)

    # Processed data
    print("经过处理的数据:")
    train(balanced_x, balanced_y, X_test, Y_test)

    # Unprocessed data, same split settings and the same balancer instance
    X_train_org, X_test_org, Y_train_org, Y_test_org = train_test_split(
        my_data.iloc[:, 0:-1], my_data.iloc[:, -1],
        test_size=0.3, random_state=42)
    raw_x, raw_y = balancer.fit_sample(X_train_org, Y_train_org)
    print("未处理的数据:")
    train(raw_x, raw_y, X_test_org, Y_test_org)
def del_set_smote_data(self):
    """Oversample the training data and store the result back on ``self``.

    ``self.y_train`` is treated as a 0/1 target.  With at least six
    positive samples Borderline-SMOTE is used; below that (presumably
    because the default neighbour count cannot be satisfied) the data is
    randomly oversampled so that both classes reach the current number
    of negative samples.

    Uses ``fit_resample`` / ``sampling_strategy``; the older
    ``fit_sample`` / ``ratio`` spellings are no longer available in
    current imbalanced-learn releases.
    """
    # With few positives, fall back to the simpler sampler.
    positive_count_train = self.y_train.sum()
    negative_count_train = len(self.y_train) - positive_count_train
    print("check y_train value 0:" + str(negative_count_train) + " 1:" +
          str(positive_count_train))

    if positive_count_train >= 6:
        sampler = BorderlineSMOTE()
    else:
        print("----- RandomOverSampler ----- ")
        sampler = RandomOverSampler(
            sampling_strategy={
                1: negative_count_train,
                0: negative_count_train,
            },
            random_state=71)

    # Apply to the training data.
    self.X_train, self.y_train = sampler.fit_resample(self.X_train,
                                                      self.y_train)
    print("-- after sampling: " +
          str(np.unique(self.y_train, return_counts=True)))
def oversample_with_smote(x_train, y_train, iterator=10):
    """Oversample the data with Borderline-SMOTE over several rounds.

    The data is balanced once.  Then, ``iterator`` times, a subset is
    drawn in random order that keeps every sample of class ``i % 2`` and
    at most one tenth of the balanced set from the remaining samples;
    that subset is balanced again and appended to the output.

    :param x_train: model input data (indexed positionally after the
        first resampling)
    :param y_train: target the model predicts
    :param iterator: number of sampling rounds
    :return: oversampled X and Y as lists
    """
    sm = BorderlineSMOTE()
    # ``fit_sample`` no longer exists in current imbalanced-learn.
    x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)

    # Cap on samples taken from the non-favoured class; loop-invariant.
    max_cnt = len(y_train_sm) // 10

    x_train_fin = []
    y_train_fin = []
    for i in range(iterator):
        favoured_class = i % 2
        indexes = list(range(len(y_train_sm)))
        random.shuffle(indexes)

        temp_x = []
        temp_y = []
        cnt = 0
        for j in indexes:
            x = x_train_sm[j]
            y = y_train_sm[j]
            if y == favoured_class:
                temp_x.append(x)
                temp_y.append(y)
            elif cnt < max_cnt:
                temp_x.append(x)
                temp_y.append(y)
                cnt += 1

        x_sm_new, y_sm_new = sm.fit_resample(temp_x, temp_y)
        x_train_fin.extend(x_sm_new)
        y_train_fin.extend(y_sm_new)
    return x_train_fin, y_train_fin
def train(self, gridsearch=False):
    """Fit the model on Borderline-SMOTE-balanced, preprocessed features.

    The feature pipeline is built via ``set_pipeline`` and fitted on
    ``self.X_train``; the balanced result is stored in
    ``self.X_train_smote`` / ``self.y_train_smote``.  The elapsed time in
    whole seconds is logged as ``train_time``.

    :param gridsearch: when true, run a randomized hyper-parameter search
        over ``self.model_params`` and keep its best estimator; otherwise
        fit the plain estimator from ``get_estimator``
    """
    started_at = time.time()
    self.set_pipeline()
    features = self.pipeline_feature.fit_transform(self.X_train)

    balancer = BorderlineSMOTE(random_state=2,
                               sampling_strategy='minority',
                               k_neighbors=1,
                               m_neighbors=20)
    self.X_train_smote, self.y_train_smote = balancer.fit_resample(
        features, self.y_train)

    if not gridsearch:
        self.model = self.get_estimator()
        self.model.fit(self.X_train_smote, self.y_train_smote)
        self.mlflow_log_metric("train_time", int(time.time() - started_at))
        return

    search = RandomizedSearchCV(
        estimator=self.get_estimator(),
        param_distributions=self.model_params,
        n_iter=10,
        cv=2,
        verbose=5,
        random_state=42,
        n_jobs=None,
    )
    # The search object is exposed as the model until the best estimator
    # replaces it below.
    self.model = search
    search.fit(self.X_train_smote, self.y_train_smote)
    self.mlflow_log_metric("train_time", int(time.time() - started_at))
    print(colored(f'best score: {search.best_score_}', "blue"))
    print(colored(f'best params: {search.best_params_}', "blue"))
    self.model = search.best_estimator_
def oversample(x, y, method):
    """Oversample ``(x, y)`` with the technique named by ``method``.

    Supported names: ``'No Sample'`` (returns the input unchanged),
    ``'random'``, ``'SMOTE'``, ``'Sparse SMOTE'``, ``'SMOTEBorderline-1'``,
    ``'SMOTEBorderline-2'``, ``'SVMSMOTE'``, ``'ADASYN'`` and ``'mwmote'``.

    All samplers except MWMOTE read ``sampling_strategy`` from the
    enclosing module scope (it is not a parameter of this function) and
    use a fixed random state of 42.

    :param x: feature data
    :param y: target labels
    :param method: name of the oversampling technique
    :return: ``(X_resampled, y_resampled)``
    :raises ValueError: if ``method`` is not one of the supported names
        (previously this surfaced as an UnboundLocalError at ``return``)
    """
    randomstate = 42
    if method == 'No Sample':
        # No resampling
        return x, y
    elif method == 'random':
        # Random oversampling
        ros = RandomOverSampler(sampling_strategy=sampling_strategy,
                                random_state=randomstate)
        X_resampled, y_resampled = ros.fit_resample(x, y)
    elif method == 'SMOTE':
        X_resampled, y_resampled = SMOTE(
            sampling_strategy=sampling_strategy,
            random_state=randomstate).fit_resample(x, y)
    elif method == 'Sparse SMOTE':
        X_resampled, y_resampled = SparseSMOTE(
            sampling_strategy=sampling_strategy,
            random_state=randomstate).fit_resample(x, y)
    elif method == 'SMOTEBorderline-1':
        X_resampled, y_resampled = BorderlineSMOTE(
            sampling_strategy=sampling_strategy,
            kind='borderline-1',
            random_state=randomstate).fit_resample(x, y)
    elif method == 'SMOTEBorderline-2':
        X_resampled, y_resampled = BorderlineSMOTE(
            sampling_strategy=sampling_strategy,
            kind='borderline-2',
            random_state=randomstate).fit_resample(x, y)
    elif method == 'SVMSMOTE':
        X_resampled, y_resampled = SVMSMOTE(
            sampling_strategy=sampling_strategy,
            random_state=randomstate).fit_resample(x, y)
    elif method == 'ADASYN':
        X_resampled, y_resampled = ADASYN(
            sampling_strategy=sampling_strategy,
            random_state=randomstate).fit_resample(x, y)
    elif method == 'mwmote':
        X_resampled, y_resampled = MWMOTE.MWMOTE(x, y, N=1000,
                                                 return_mode='append')
    else:
        raise ValueError('Unsupported oversampling method: %r' % (method,))
    return X_resampled, y_resampled
def Smote_bd(data, label):
    """Balance ``(data, label)`` with Borderline-SMOTE (random_state=0).

    Borderline-SMOTE synthesises new minority samples around "danger"
    samples, i.e. minority samples for which at least half of the nearest
    neighbours belong to another class.

    :param data: feature data
    :param label: class labels
    :return: ``(data_resampled, label_resampled)``
    """
    from imblearn.over_sampling import BorderlineSMOTE

    smote = BorderlineSMOTE(random_state=0)
    # ``fit_sample`` no longer exists in current imbalanced-learn.
    data_smote_bd, label_smote_bd = smote.fit_resample(data, label)
    return data_smote_bd, label_smote_bd
def bordersmote(x, y):
    """Resample ``(x, y)`` with Borderline-SMOTE at a 1:1 target ratio.

    Both neighbourhood sizes are set to one percent of ``sum(y)``,
    rounded up (for 0/1 labels ``sum(y)`` is the positive count —
    assumption, confirm against callers).

    :return: the ``fit_resample`` result for ``(x, y)``
    """
    neighbour_count = math.ceil(sum(y) * 0.01)
    sampler = BorderlineSMOTE(sampling_strategy=1,
                              k_neighbors=neighbour_count,
                              m_neighbors=neighbour_count)
    return sampler.fit_resample(x, y)
def oversample_remainingSet(self, instances, labels, kind='borderline-1'):
    """Oversample the remaining set (BorderlineSMOTE) after a drift is detected.

    Resampling only happens when ``labels`` contains at least two classes
    and the least frequent class has more than ``self.n_neighbors``
    samples; otherwise the inputs are returned untouched.

    :param instances: feature array
    :param labels: numpy array of class labels
    :param kind: Borderline-SMOTE variant passed to the sampler
    :return: ``(instances, labels)``, resampled when the conditions hold
    """
    if len(np.unique(labels)) >= 2:
        # Least frequent label is last in ``most_common()``.
        minority_class = collections.Counter(labels.tolist()).most_common()[-1][0]
        if np.sum(labels == minority_class) > self.n_neighbors:
            oversample = BorderlineSMOTE(k_neighbors=self.n_neighbors,
                                         m_neighbors=5,
                                         kind=kind,
                                         random_state=self.random_state)
            # ``fit_sample`` no longer exists in current imbalanced-learn.
            instances, labels = oversample.fit_resample(instances, labels)
    return instances, labels
def over_under_sampling(x, y):
    """Oversample ``x`` against the labels encoded column-wise in ``y``.

    ``y`` is reduced to one label per row (the name of its largest
    column), the data is balanced with Borderline-SMOTE, and the labels
    are expanded back to indicator columns.  Despite the function name,
    no under-sampling step is active.

    :return: ``(x_resampled, y_resampled_as_dummies)``
    """
    print('Generating synthetic samples...')
    balancer = BorderlineSMOTE()
    row_labels = y.idxmax(axis=1)
    x_balanced, labels_balanced = balancer.fit_resample(x, row_labels)
    return x_balanced, pd.get_dummies(labels_balanced)
def test_borderline_smote(kind, data):
    """A default sampler and one given explicit neighbour estimators agree.

    The second sampler receives ``NearestNeighbors`` objects with 6 and
    11 neighbours for ``k_neighbors`` / ``m_neighbors``; both samplers
    must produce the same resampled arrays.
    """
    shared = {'kind': kind, 'random_state': 42}
    default_sampler = BorderlineSMOTE(**shared)
    explicit_sampler = BorderlineSMOTE(
        k_neighbors=NearestNeighbors(n_neighbors=6),
        m_neighbors=NearestNeighbors(n_neighbors=11),
        **shared)

    X_default, y_default = default_sampler.fit_resample(*data)
    X_explicit, y_explicit = explicit_sampler.fit_resample(*data)

    assert_allclose(X_default, X_explicit)
    assert_array_equal(y_default, y_explicit)
def up_sampling(X_train, y_train, ratio=2):
    """Oversample class ``1`` to ``ratio`` times its current size.

    Missing feature values are replaced by 0 on a copy before
    resampling; the caller's ``X_train`` is left unmodified (the earlier
    in-place fill mutated the argument, and only when positives existed).

    :param X_train: feature DataFrame
    :param y_train: labels; samples equal to ``1`` are the class to grow
    :param ratio: multiplier applied to the current positive count
    :return: ``(X_train, y_train)``; the inputs themselves when there is
        no positive sample
    """
    pos_num = (y_train == 1).sum()
    if pos_num == 0:
        # Nothing to synthesise from.
        return X_train, y_train

    pos_sap_num = int(pos_num * ratio)
    X_filled = X_train.fillna(0)
    smo = BorderlineSMOTE(sampling_strategy={1: pos_sap_num},
                          random_state=2019,
                          n_jobs=8)
    X_res, y_res = smo.fit_resample(X_filled, y_train)
    return X_res, y_res
def borderline_smote(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True, pie_evr=True):
    """Balance ``(X, y)`` with Borderline-SMOTE and optionally plot it.

    :param X: feature data
    :param y: class labels
    :param visualize: when truthy, draw the class histogram and the PCA
        plots of the resampled data
    :param pca2d: forwarded to ``pca_general`` as ``d2``
    :param pca3d: forwarded to ``pca_general`` as ``d3``
    :param tsne: accepted for interface compatibility; not used here
    :param pie_evr: forwarded to ``pca_general``
    :return: ``(X_res, y_res)``
    """
    sm = BorderlineSMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)

    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def resample(X, Y, resampling):
    """Resample ``(X, Y)`` with the strategy named by ``resampling``.

    ``'oversampling'`` uses ``RandomOverSampler``, ``'undersampling'``
    uses ``ClusterCentroids`` and ``'smote'`` uses ``BorderlineSMOTE``;
    any other value leaves the data as it is.  In every case missing
    values in both results are replaced by 0.

    :return: ``(X_resampled, y_resampled)`` after ``fillna(0)``
    """
    chosen = None
    if resampling == 'oversampling':
        from imblearn.over_sampling import RandomOverSampler
        chosen = RandomOverSampler(random_state=0)
    elif resampling == 'undersampling':
        from imblearn.under_sampling import ClusterCentroids
        chosen = ClusterCentroids(random_state=0)
    elif resampling == 'smote':
        from imblearn.over_sampling import BorderlineSMOTE
        chosen = BorderlineSMOTE()

    if chosen is None:
        X_resampled, y_resampled = X, Y
    else:
        X_resampled, y_resampled = chosen.fit_resample(X, Y)
    return X_resampled.fillna(0), y_resampled.fillna(0)
def generate_oversamplers(factor):
    """Generate a list of oversamplers that pre-apply undersampling.

    Each entry is a ``(name, sampler, param_grid)`` tuple.  With
    ``factor=None`` only the benchmark placeholder (no sampler) is
    returned; otherwise every oversampler is wrapped in an
    ``UnderOverSampler`` configured with ``factor``.
    """
    if factor is None:
        return [('BENCHMARK METHOD', None, {})]

    def wrap(oversampler):
        # Pair the given oversampler with the requested undersampling factor.
        return UnderOverSampler(oversampler=oversampler, factor=factor)

    entries = []
    entries.append(('NO OVERSAMPLING', wrap(None), {}))
    entries.append(('RANDOM OVERSAMPLING', wrap(RandomOverSampler()), {}))
    entries.append(('SMOTE', wrap(SMOTE()), {
        'oversampler__k_neighbors': [3, 5]
    }))
    entries.append(('BORDERLINE SMOTE', wrap(BorderlineSMOTE()), {
        'oversampler__k_neighbors': [3, 5]
    }))
    entries.append(('G-SMOTE', wrap(GeometricSMOTE()), {
        'oversampler__k_neighbors': [3, 5],
        'oversampler__selection_strategy':
        ['combined', 'minority', 'majority'],
        'oversampler__truncation_factor':
        [-1.0, -0.5, .0, 0.25, 0.5, 0.75, 1.0],
        'oversampler__deformation_factor':
        [.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0]
    }))
    return entries
def test_combine_results_multiple():
    """Test the combination of experimental results for different
    datasets, oversamplers and classifiers."""
    # First experiment: custom oversampler/classifier on all but the
    # last dataset.
    configured = clone(EXPERIMENT).set_params(
        oversamplers=[('bsmote', BorderlineSMOTE(), {
            'k_neighbors': [2, 5]
        })],
        classifiers=[('gbc', GradientBoostingClassifier(), {})],
        scoring=['accuracy', 'f1'],
    )
    experiment1 = configured.fit(DATASETS[:-1])

    # Second experiment: default components on the last dataset only.
    experiment2 = clone(EXPERIMENT).set_params(
        scoring=['accuracy', 'f1']).fit(DATASETS[-1:])

    # Extract combined results
    combined_results = combine_results(experiment1.results_,
                                       experiment2.results_)
    results = combined_results.reset_index()

    # Assertions
    assert set(results.Dataset) == {'A', 'B', 'C'}
    assert set(results.Oversampler) == {'random', 'smote', 'bsmote'}
    assert set(results.Classifier) == {'dtc', 'knc', 'gbc'}
    scorer_names = {scorer[0] for scorer in combined_results.columns}
    assert scorer_names == {'accuracy', 'f1'}
    expected = pd.concat([experiment1.results_,
                          experiment2.results_]).sort_index()
    pd.testing.assert_frame_equal(combined_results, expected)
def perform_smote_undersample(x, y, smote_type='regular', strategy='auto', seed=16, binary=False):
    """Oversample ``(x, y)`` with SMOTE or Borderline-SMOTE.

    :param x: feature array
    :param y: label array; when it has more than one dimension, column 1
        is used as the label vector
    :param smote_type: ``'regular'`` (SMOTE, 3 neighbours) or
        ``'borderline'`` (Borderline-SMOTE, 5 neighbours)
    :param strategy: forwarded as ``sampling_strategy``
    :param seed: seed for numpy's global generator and the sampler
    :param binary: when true, return the labels as a two-column one-hot
        array instead of an int8 vector
    :return: ``(x_resampled, y_resampled)``
    :raises ValueError: if ``smote_type`` is not a supported name
        (previously this surfaced as an UnboundLocalError)
    """
    _np.random.seed(seed)
    if smote_type == 'regular':
        sm = SMOTE(random_state=seed, k_neighbors=3,
                   sampling_strategy=strategy, n_jobs=14)
    elif smote_type == 'borderline':
        sm = BorderlineSMOTE(random_state=seed, k_neighbors=5,
                             sampling_strategy=strategy, n_jobs=14)
    else:
        raise ValueError(
            "Unsupported smote_type %r; expected 'regular' or 'borderline'"
            % (smote_type,))

    if len(y.shape) > 1:
        x, y = sm.fit_resample(x, y[:, 1].reshape(-1))
    else:
        x, y = sm.fit_resample(x, y)
    y = y.reshape(-1).astype(_np.int8)

    if binary:
        # One-hot encode the (two-class) labels.
        y_binary = _np.zeros((y.shape[0], 2))
        y_binary[_np.arange(y.shape[0]), y] = 1
        # NOTE(review): only the labels are shuffled here, ``x`` is not.
        # Presumably the caller re-seeds with the same value and shuffles
        # ``x`` to obtain the same permutation — confirm, otherwise the
        # rows of ``x`` and ``y`` no longer correspond.
        _np.random.seed(seed)
        _np.random.shuffle(y_binary)
        y = y_binary
    return x, y
def upper_region():
    """Cross-validate and persist the top-level (region) classifier.

    Features are every column of the module-level ``data_frame`` except
    the two target columns; the label is ``TOP_LEVEL_TARGET``.  A
    Borderline-SMOTE + linear SVC pipeline is evaluated with 10-fold
    stratified cross-validation, the mean timings and scores are
    printed, and the classifier is written to
    ``../resources/models/parent_classifier.pkl``.

    Changes from the earlier version:

    * ``StratifiedKFold`` no longer receives ``random_state`` together
      with ``shuffle=False``; the seed has no effect without shuffling
      and current scikit-learn rejects the combination.
    * The pipeline is fitted on the full data before the classifier is
      dumped.  ``cross_validate`` works on clones, so the dumped ``clf``
      used to be unfitted.
    """
    X = data_frame.drop([TOP_LEVEL_TARGET, SECOND_LEVEL_TARGET], axis=1)  # Features - drop region, class
    y = data_frame[TOP_LEVEL_TARGET]  # Labels
    print("Region Count ", (pd.DataFrame(y)).groupby(TOP_LEVEL_TARGET).size())

    kfold = StratifiedKFold(n_splits=10, shuffle=False)

    # Candidate models.  Only ``clf`` is used in the pipeline below; the
    # others are kept as alternatives (the tuning helpers are still
    # called as before).
    model = tune_random_forest(RandomForestClassifier(random_state=42), X, y)
    clf = svm.SVC(kernel='linear', C=1, random_state=0)
    knn = KNeighborsClassifier(n_neighbors=10)
    gb = tune_gb(X, y)
    lr = LogisticRegression(multi_class='ovr')
    rf = RandomForestClassifier(n_estimators=200, max_depth=20)

    pipeline = Pipeline(
        [
            ('ROS', BorderlineSMOTE()),
            ('model', clf)
        ]
    )
    scoring = ['accuracy', 'f1_micro', 'precision_micro', 'recall_micro']
    cv_results = cross_validate(pipeline, X, y, cv=kfold, scoring=scoring)

    print(sorted(cv_results.keys()))
    print(cv_results['fit_time'].mean())
    print(cv_results['score_time'].mean())
    print(cv_results['test_accuracy'].mean())
    print(cv_results['test_f1_micro'].mean())
    print(cv_results['test_precision_micro'].mean())
    print(cv_results['test_recall_micro'].mean())

    # Fit on all data so that the persisted classifier can predict.
    pipeline.fit(X, y)
    joblib.dump(clf, filename='../resources/models/parent_classifier.pkl')
def sampler(X, y, over_pct=0.1, under_pct=0.2):
    """Oversample with Borderline-SMOTE, then randomly under-sample.

    :param X: feature data
    :param y: class labels
    :param over_pct: ``sampling_strategy`` of the oversampling step
    :param under_pct: ``sampling_strategy`` of the under-sampling step
    :return: ``(X, y)`` after both steps
    """
    balancing = Pipeline(steps=[
        ('o', BorderlineSMOTE(random_state=42, sampling_strategy=over_pct)),
        ('u', RandomUnderSampler(random_state=42,
                                 sampling_strategy=under_pct)),
    ])
    X_out, y_out = balancing.fit_resample(X, y)
    return X_out, y_out
def oversample_borderline_SMOTE(df, variant=1, debug=True):
    """Return a Borderline-SMOTE-balanced copy of ``df``.

    The last column of ``df`` is the integer class label, all other
    columns are features.

    :param df: input DataFrame
    :param variant: ``1`` selects ``borderline-1``; any other value
        selects ``borderline-2``
    :param debug: when true, print the class counts before and after
    :return: new DataFrame with the same column names as ``df``
    """
    raw = df.values
    X = raw[:, :-1]
    y = raw[:, -1].astype(int)
    if debug:
        print('Original dataset shape %s' % Counter(y))

    kind = "borderline-1" if variant == 1 else "borderline-2"
    X_res, y_res = BorderlineSMOTE(random_state=0,
                                   kind=kind).fit_resample(X, y)

    label_column = df.columns[-1]
    df_resampled = pd.DataFrame(X_res, columns=df.columns[:-1])
    # Append the label as the last column again.
    df_resampled.insert(len(df_resampled.columns), label_column, y_res)
    if debug:
        print('Resampled dataset shape %s' % Counter(y_res))
    return df_resampled
def smote_tomek(x_train, y_train):
    """Oversample with Borderline-SMOTE, then clean with Tomek links.

    The oversampling step uses ``sampling_strategy=0.5`` with the
    ``borderline-1`` variant; the Tomek-link step is configured with
    ``sampling_strategy='majority'``.

    :return: ``(X, y)`` after both steps
    """
    grow = BorderlineSMOTE(sampling_strategy=0.5,
                           random_state=0,
                           k_neighbors=5,
                           m_neighbors=10,
                           n_jobs=-1,
                           kind='borderline-1')
    X_grown, y_grown = grow.fit_resample(x_train, y_train)

    clean = TomekLinks(sampling_strategy='majority', n_jobs=-1)
    X_clean, y_clean = clean.fit_resample(X_grown, y_grown)
    return X_clean, y_clean
def __init__(self, lemmatization=False):
    """Configure sampler, extraction pipeline and classifier.

    :param lemmatization: forwarded to ``BugCoupleModel.__init__``
    """
    BugCoupleModel.__init__(self, lemmatization)

    self.sampler = BorderlineSMOTE(random_state=0)
    self.calculate_importance = False

    # Text clean-up steps applied by the extractor.
    cleanup_functions = [
        feature_cleanup.responses(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]

    extractor_step = (
        "bug_extractor",
        bug_features.BugExtractor([], cleanup_functions),
    )
    union_step = (
        "union",
        ColumnTransformer([("text", self.text_vectorizer(), "text")]),
    )
    self.extraction_pipeline = Pipeline([extractor_step, union_step])

    self.clf = LinearSVCWithLabelEncoding(LinearSVC())
def Borderline_DBSCAN(train_data, label, eps=20.1, min_samples=5):
    """Collect borderline samples of ``label`` and their DBSCAN neighbourhoods.

    ``train_data`` is a DataFrame whose last column holds the class and
    which has a column named ``'label'``.  Rows whose last column equals
    ``label`` are treated as the "min" samples, all others as "max"
    samples.

    NOTE(review): ``boSMOTE.sample`` and ``DBSCAN(...).danger_fit`` are
    not part of the stock imbalanced-learn / scikit-learn API; they
    appear to come from locally modified versions — confirm.

    :param train_data: input DataFrame (features + class in last column)
    :param label: class of interest; ``'c'`` selects index 1 of
        ``boSMOTE.sample``, ``'b'`` (and anything else) index 0
    :param eps: forwarded to ``DBSCAN``
    :param min_samples: forwarded to ``DBSCAN``
    :return: ``(min_sample, mergeSample, new_sample_index, num_sample)``:
        the rows of class ``label``, the other rows concatenated with the
        samples taken from ``boSMOTE.sample``, the de-duplicated
        neighbour indices, and the number of samples taken from
        ``boSMOTE.sample``
    """
    label_index = 0
    if label == 'c':
        label_index = 1
    if label == 'b':
        label_index = 0
    print(train_data['label'].value_counts())
    boSMOTE = BorderlineSMOTE(kind='borderline-1')
    # The resampled output itself is not used; the call is needed to
    # populate ``boSMOTE.sample``.
    x, y = boSMOTE.fit_resample(train_data.iloc[:, :-1], train_data.iloc[:, -1])
    BMG_sample = boSMOTE.sample[label_index][1]
    BMG_sample = pd.DataFrame(BMG_sample, columns=train_data.columns.values.tolist()[:-1])
    BMG_sample['label'] = label
    max_sample = []
    min_sample = []
    # Split the rows by whether their class equals ``label``.
    for temp in range(train_data.shape[0]):
        if train_data.iloc[temp, -1] == label:
            min_sample.append(train_data.iloc[temp, :].values)
        else:
            max_sample.append(train_data.iloc[temp, :].values)
    max_sample = pd.DataFrame(max_sample, columns=train_data.columns.values.tolist())
    min_sample = pd.DataFrame(min_sample, columns=train_data.columns.values.tolist())
    # Original indices are kept (ignore_index=False), so index values can repeat.
    mergeSample = pd.concat([max_sample, BMG_sample], ignore_index=False)
    dbsc = DBSCAN(eps=eps, min_samples=min_samples).danger_fit(X=mergeSample, danger_sample=BMG_sample)
    array_neighborhoods = dbsc.neighborhoods
    neighborhoods_index = []
    array_n_neighbors = dbsc.n_neighbors
    # Keep the neighbours of every entry that has at least 5 of them.
    for temp in range(len(array_n_neighbors)):
        if array_n_neighbors[temp] >= 5:
            for i in range(array_n_neighbors[temp]):
                neighborhoods_index.append(array_neighborhoods[temp][i])
    new_sample_index = list(set(neighborhoods_index))
    num_sample = BMG_sample.shape[0]
    return min_sample, mergeSample, new_sample_index, num_sample
def __init__(self, lemmatization=False):
    """Configure sampler, extraction pipeline and XGBoost classifier.

    :param lemmatization: forwarded to ``BugModel.__init__``
    """
    BugModel.__init__(self, lemmatization)

    self.sampler = BorderlineSMOTE(random_state=0)
    self.calculate_importance = False

    feature_extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.product(),
        # TODO: We would like to use the component at the time of filing too,
        # but we can't because the rollback script doesn't support changes to
        # components yet.
        # bug_features.component(),
        bug_features.num_words_title(),
        bug_features.num_words_comments(),
        bug_features.keywords(),
    ]

    cleanup_functions = [
        feature_cleanup.fileref(),
        feature_cleanup.url(),
        feature_cleanup.synonyms(),
    ]

    extractor_step = (
        "bug_extractor",
        bug_features.BugExtractor(
            feature_extractors, cleanup_functions, rollback=True
        ),
    )
    union_step = (
        "union",
        ColumnTransformer(
            [
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(min_df=0.0001), "title"),
                ("comments", self.text_vectorizer(min_df=0.0001), "comments"),
            ]
        ),
    )
    self.extraction_pipeline = Pipeline([extractor_step, union_step])

    classifier = xgboost.XGBClassifier(n_jobs=16)
    self.clf = classifier
    classifier.set_params(predictor="cpu_predictor")
def __init__(self, lemmatization=False, historical=False):
    """Configure sampler, extraction pipeline and one-vs-rest classifier.

    :param lemmatization: forwarded to ``BugModel.__init__``
    :param historical: accepted but not read in this constructor
    """
    BugModel.__init__(self, lemmatization)

    self.sampler = BorderlineSMOTE(random_state=0)

    feature_extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        # Ignore keywords that would make the ML completely skewed
        # (we are going to use them as 100% rules in the evaluation phase).
        bug_features.keywords(set(keyword_dict.keys())),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.title(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]

    cleanup_functions = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]

    extractor_step = (
        "bug_extractor",
        bug_features.BugExtractor(feature_extractors, cleanup_functions),
    )
    union_step = (
        "union",
        ColumnTransformer([
            ("data", DictVectorizer(), "data"),
            ("title", self.text_vectorizer(min_df=0.001), "title"),
            ("first_comment", self.text_vectorizer(min_df=0.001),
             "first_comment"),
            ("comments", self.text_vectorizer(min_df=0.001), "comments"),
        ]),
    )
    self.extraction_pipeline = Pipeline([extractor_step, union_step])

    self.clf = OneVsRestClassifier(xgboost.XGBClassifier(n_jobs=16))
def classification(self, X, Y):
    """Train and evaluate a TF-IDF + multinomial naive Bayes text classifier.

    The data is split 70/30, vectorised with TF-IDF fitted on the
    training part only, the training part is balanced with
    Borderline-SMOTE, and the predictions, elapsed time, classification
    report and ROC AUC for the test part are printed.

    ``start_time`` is read from the enclosing module scope.

    :param X: raw text documents
    :param Y: class labels
    """
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    sm = BorderlineSMOTE()
    # ``fit_sample`` no longer exists in current imbalanced-learn.
    X_res, Y_res = sm.fit_resample(X_train_tfidf, y_train)

    clf = MultinomialNB()
    clf.fit(X_res, Y_res)
    prediction = clf.predict(X_test_tfidf)
    print(prediction)

    # Elapsed time is "now minus start"; the operands were reversed
    # before, which printed a negative duration.
    final_time = datetime.datetime.now() - start_time
    print(final_time)

    print(metrics.classification_report(y_test, prediction))
    print(metrics.roc_auc_score(y_test, prediction))
def imbalanced_sampler(input_data, input_labels, method='SMOTE'):
    """Resample the data and pickle the samples of each class to disk.

    For every class ``i`` in the exported range two files are written to
    the current directory: ``<method>_Class_<i>_data_samples.pkl`` and
    ``<method>_Class_<i>_label_samples.pkl``.

    :param input_data: feature array
    :param input_labels: integer class labels
    :param method: ``'SMOTE'`` (Borderline-SMOTE) or ``'Near Miss'``;
        any other value prints a message and exits the process with
        status 0
    :return: ``(x_sampled, y_sampled)``
    """
    if method == 'SMOTE':
        sampler = BorderlineSMOTE(n_jobs=4, random_state=RANDOM_STATE)
    elif method == 'Near Miss':
        sampler = NearMiss(n_jobs=4, random_state=RANDOM_STATE)
    else:
        print('Invalid sampler type. Only `SMOTE` (Borderline) and `Near Miss` are supported...')
        sys.exit(0)

    # TODO save samples by class to reduce file size
    max_class_num = np.max(input_labels)
    # NOTE(review): this range excludes both class 0 and the largest
    # class label — confirm that this is intended.
    class_range = np.arange(1, max_class_num)

    x_sampled, y_sampled = sampler.fit_resample(input_data, input_labels)
    for i in class_range:
        idx = np.argwhere(y_sampled == i)
        prefix = method + '_Class_' + str(i)
        # Context managers make sure each file is flushed and closed.
        with open(prefix + '_data_samples.pkl', 'wb') as data_file:
            pickle.dump(x_sampled[idx][:], data_file)
        with open(prefix + '_label_samples.pkl', 'wb') as label_file:
            pickle.dump(y_sampled[idx], label_file)
    return x_sampled, y_sampled
def fit(self, x, y, sampling="under", show_info=False):
    """Train the ensemble, one differently resampled set per classifier.

    The imbalance ratio ``IR`` is the number of samples labelled 1
    divided by the number labelled 0.

    * ``sampling == "under"``: each classifier ``i`` is trained on data
      from ``myRandomSampler().under_sampling`` with a sampling rate that
      decreases with ``i``.
    * any other value: each classifier ``i`` is trained on data where
      class 0 is oversampled with Borderline-SMOTE to a size that grows
      with ``i``.

    :param x: samples
    :param y: labels (0/1 array supporting boolean-mask indexing)
    :param sampling: ``"under"`` for under-sampling, otherwise oversampling
    :param show_info: when true, print the sampling rates and ratios
    """
    ones_total = len(y[y == 1])
    zeros_total = len(y[y == 0])
    imbalance = ones_total / zeros_total

    if sampling == "under":
        # Under-sampling
        step = 1 / (imbalance * np.log2(imbalance))  # sampling interval
        balanced_rate = 1 / imbalance  # rate that balances the classes
        first_rate = balanced_rate + step
        if show_info:
            print("下采样")
            print("采样前 IR=%.2f" % imbalance)
            print("平衡采样率 %.4f 采样间隔 %.4f" % (balanced_rate, step))

        for i in range(self.n_estimator):
            # The rate shrinks with i, so fewer samples are kept.
            rate = first_rate - pow(2, i + 1) / pow(
                2, self.n_estimator) * step
            # Random under-sampling of the majority class.
            x_train, y_train = myRandomSampler().under_sampling(x, y, rate)
            if show_info:
                print("当前采样率:%.4f" % rate)
                ratio_after = len(y_train[y_train == 1]) / len(
                    y_train[y_train == 0])
                print("采样后 IR=%.2f" % ratio_after)
            self.classifiers[i].fit(x_train, y_train)
        return

    # Oversampling
    gap = imbalance - 1
    if show_info:
        print("上采样")
        print("采样前 IR=%.2f" % imbalance)

    for i in range(self.n_estimator):
        rate = 1 + math.log(i + 1, self.n_estimator) * gap
        target_size = int(rate * zeros_total)
        booster = BorderlineSMOTE(sampling_strategy={0: target_size})
        x_train, y_train = booster.fit_resample(x, y)
        if show_info:
            print("当前采样率 %.4f" % rate)
            ratio_after = len(y_train[y_train == 1]) / len(
                y_train[y_train == 0])
            print("采样后 IR=%.2f" % ratio_after)
        self.classifiers[i].fit(x_train, y_train)
def Resampling(train_x, train_y, resampling_method):
    """Label-encode the target and resample the training data in place.

    ``train_x.data`` and ``train_y.data`` are overwritten with the
    resampled values; nothing is returned.

    Supported under-samplers: ``ClusterCentroids``,
    ``CondensedNearestNeighbour``, ``EditedNearestNeighbours``,
    ``RepeatedEditedNearestNeighbours``, ``AllKNN``, ``NearMiss``,
    ``NeighbourhoodCleaningRule``, ``RandomUnderSampler``,
    ``TomekLinks``.  Supported over-samplers: ``BorderlineSMOTE``,
    ``KMeansSMOTE``, ``RandomOverSampler``, ``SMOTE``.

    Fixes: ``"RandomUnderSampler"`` used to be overridden by a second,
    mislabelled test that installed ``RandomOverSampler``, while
    ``"RandomOverSampler"`` itself was unreachable.

    :param train_x: object whose ``data`` attribute holds the features
    :param train_y: object whose ``data`` attribute holds the labels
    :param resampling_method: name of the sampler, see above
    :raises ValueError: if ``resampling_method`` is not supported
    """
    train_y.data = LabelEncoder().fit_transform(train_y.data)

    # Uncomment the next line to show the pie chart of the class
    # distribution before resampling.
    # plotGraphics.piePlot(train_y, "Before Resampling")

    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)
    elif resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)
    elif resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7, kind_sel='mode', n_jobs=-1)
    elif resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7, kind_sel='mode', n_jobs=-1)
    elif resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7, kind_sel='mode', allow_minority=True, n_jobs=-1)
    elif resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)
    elif resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')
    elif resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)
    elif resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)
    # ---- OVER-SAMPLING ------ #
    elif resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)
    elif resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)
    elif resampling_method == "RandomOverSampler":
        resample = RandomOverSampler(random_state=42)
    elif resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)
    else:
        raise ValueError(
            'Unsupported resampling method: %r' % (resampling_method,))

    # transform the dataset
    train_x.data, train_y.data = resample.fit_resample(train_x.data, train_y.data)
def over_sample(self, method="BorderLine", sampling_strategy="minority", random_state=42, k_neighbors=5, n_neighbors=10, kind="borderline-1"):
    """Oversample ``self._df`` and return the result as a new DataFrame.

    Features are all columns except ``"id"`` and the target column.

    :param method: str, one of ADASYN, BorderLine, KMeans, Random, SVM
    :param sampling_strategy: str or dict, e.g. 'minority',
        'not majority', 'all', 'auto', {1: n, 0: m}
    :param random_state: int
    :param k_neighbors: int
    :param n_neighbors: int, used as ``m_neighbors`` where applicable
    :param kind: str, borderline-1 or borderline-2
    :return: resampled DataFrame (features followed by the target), or
        ``self._df`` unchanged when ``method`` is not supported
    """
    feature_name = self._df.columns.difference(["id", self._target]).tolist()
    X = self._df[feature_name].values
    y = self._df[self._target].values
    print("Original label shape {}".format(Counter(y)))

    common = {
        "sampling_strategy": sampling_strategy,
        "random_state": random_state,
    }
    # Factories are lazy so that only the selected sampler is built.
    factories = {
        "ADASYN": lambda: ADASYN(n_neighbors=k_neighbors, **common),
        "BorderLine": lambda: BorderlineSMOTE(k_neighbors=k_neighbors,
                                              m_neighbors=n_neighbors,
                                              kind=kind, **common),
        "KMeans": lambda: KMeansSMOTE(k_neighbors=k_neighbors, **common),
        "Random": lambda: RandomOverSampler(**common),
        "SVM": lambda: SVMSMOTE(k_neighbors=k_neighbors,
                                m_neighbors=n_neighbors,
                                out_step=0.5, **common),
    }
    make_sampler = factories.get(method)
    if make_sampler is None:
        print("不支持{}该抽样方法".format(method))
        return self._df

    X_res, y_res = make_sampler().fit_resample(X, y)
    print("overSample label shape {}".format(Counter(y_res)))

    stacked = np.concatenate([X_res, y_res.reshape(len(X_res), 1)], axis=1)
    return pd.DataFrame(data=stacked, columns=feature_name + [self._target])
def use_parameters(self, X_train, selected_features):
    """Build the default hyper-parameter grid.

    The grid varies the scaler, the sampling method, the feature columns
    and ``C`` of a linear-kernel model.  When ``X_train`` contains
    missing values, the imputer strategies are added to the grid.

    Fix: the grid is a list of dicts, so the imputer strategies are now
    added to each dict; the earlier ``parameters['imputer__strategy']``
    assignment raised ``TypeError`` whenever missing values were present.

    :param X_train: training DataFrame, only inspected for missing values
    :param selected_features: value used for the ``feat__cols`` entry
    :return: list with one parameter-grid dict
    """
    test_scaler = [
        StandardScaler(),
        RobustScaler(),
        QuantileTransformer(),
        Normalizer(),
    ]
    test_sampling = [
        modelutil.Nosampler(),
        ClusterCentroids(),
        RandomUnderSampler(),
        # NearMiss(version=1),
        # EditedNearestNeighbours(),
        # AllKNN(),
        # CondensedNearestNeighbour(random_state=0),
        # InstanceHardnessThreshold(random_state=0,
        #   estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
        RandomOverSampler(random_state=0),
        SMOTE(),
        BorderlineSMOTE(),
        SMOTEENN(),
        SMOTETomek(),
        ADASYN(),
    ]
    test_C_linear = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]

    parameters = [{
        'scaler': test_scaler,
        'sampling': test_sampling,
        'feat__cols': selected_features,
        'model__C': test_C_linear,  # default C=1
        'model__kernel': ['linear'],
    }]

    # If no missing values, only one imputer strategy shall be used
    if X_train.isna().sum().sum() > 0:
        for grid in parameters:
            grid['imputer__strategy'] = ['mean', 'median', 'most_frequent']
        print("Missing values used. Test different imputer strategies")
    else:
        print("No missing values. No imputer necessary")

    print("Selected Parameters: ", parameters)
    print("Parameters defined in the input: ", parameters)
    return parameters
def test_borderline_smote_wrong_kind(data):
    """An unknown ``kind`` is rejected with a ValueError on resampling."""
    sampler = BorderlineSMOTE(kind='rand')
    expected_message = 'The possible "kind" of algorithm'
    with pytest.raises(ValueError, match=expected_message):
        sampler.fit_resample(*data)