def create_model_from_training_data(self):
    """Train the classifier on ``self.training_data``.

    Every training item's ``text`` is run through ``preprocess_text`` and
    vectorised with TF-IDF; the resulting dense matrix is oversampled with
    SVMSMOTE and the estimator returned by ``self.get_classifier()`` is
    fitted on the resampled data.

    Side effects:
        Stores the fitted vectorizer on ``self.vectorizer``.

    Returns:
        The fitted classifier.
    """
    training_comments = []
    training_ratings = []
    print("Training classifier model..")
    for sentidata in self.training_data:
        training_comments.append(preprocess_text(sentidata.text))
        training_ratings.append(sentidata.rating)

    # Discard stopwords, apply stemming, and discard words present in fewer
    # than 3 comments.
    self.vectorizer = TfidfVectorizer(
        tokenizer=tokenize_and_stem,
        sublinear_tf=True,
        max_df=0.5,
        stop_words=mystop_words,
        min_df=3,
    )
    X_train = self.vectorizer.fit_transform(training_comments).toarray()
    Y_train = np.array(training_ratings)

    # Apply SMOTE to improve the ratio of the minority class.
    # ``n_jobs`` is intentionally not passed: 1 job is the default and the
    # argument is deprecated in recent imbalanced-learn releases.
    smote_model = SVMSMOTE(
        sampling_strategy=0.5,
        random_state=None,
        k_neighbors=15,
        m_neighbors=15,
        out_step=.0001,
        svm_estimator=None,
    )
    # ``fit_sample`` was removed from imbalanced-learn; ``fit_resample`` is
    # the supported name for the same operation.
    X_resampled, Y_resampled = smote_model.fit_resample(X_train, Y_train)

    model = self.get_classifier()
    model.fit(X_resampled, Y_resampled)
    return model
def data_oversample(self):
    """Split the data and oversample the train and validation parts.

    The split from ``self.data_sample_split()`` is redrawn until every class
    label in ``0..4`` appears in ``y_train``. The train and validation sets
    are then resampled with SVMSMOTE; the test set is returned untouched.

    Returns:
        ``(x_train, x_val, x_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: if no split containing every class is obtained within
            the retry budget.
    """
    expected_classes = (0, 1, 2, 3, 4)
    max_attempts = 10  # bound the retries so a deterministic split cannot loop forever

    for _ in range(max_attempts):
        x_train, x_val, x_test, y_train, y_val, y_test = self.data_sample_split()
        missing = [label for label in expected_classes if label not in y_train]
        if not missing:
            break
        for label in missing:
            # str() is required: the label is an int.
            print("lesion " + str(label) + " not in y_train. Redoing sample split...")
    else:
        raise ValueError(
            "could not obtain a train split containing every class after "
            "%d attempts" % max_attempts
        )

    print("Presampled train dataset: %s" % Counter(y_train))

    resample = SVMSMOTE(random_state=42)
    x_train, y_train = resample.fit_resample(x_train, y_train)
    # NOTE(review): the validation set is oversampled too, as in the
    # original code; confirm this is intended for evaluation.
    x_val, y_val = resample.fit_resample(x_val, y_val)

    print("Resampled train dataset: %s" % Counter(y_train))
    return x_train, x_val, x_test, y_train, y_val, y_test
def getData(splitData=True, useImbalancer=False, useStratify=False): global standard_scaler data = pd.read_csv(filepath_or_buffer="DataSource/binary.csv") X = data.values[:, 1:-1] rank_dummy = pd.get_dummies(data['rank'], drop_first=True).to_numpy() X = np.concatenate((X, rank_dummy), axis=1) y = data.values[:, 0].reshape(-1, 1) if useStratify: stratify = y else: stratify = None if splitData: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101, shuffle=True, stratify=stratify) else: X_train = X y_train = y if useImbalancer and splitData: tl = TomekLinks(sampling_strategy='majority') X_train, y_train = tl.fit_sample(X=X_train, y=y_train) # print("After 1st pass: "******"After 2nd pass: "******"After 3rd pass: "******"After 4th pass: "******"After 5th pass: "******"After 6th pass: "******"y_train\n", np.asarray((unique, counts)).T) if splitData: unique, counts = np.unique(y_test, return_counts=True) # print("y_test\n", np.asarray((unique, counts)).T) if splitData: return X_train, X_test, y_train.ravel(), y_test.ravel() else: return X_train, y_train.ravel()
def svm_smote(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True, pie_evr=True):
    """Oversample ``(X, y)`` with SVMSMOTE and optionally plot the result.

    Args:
        X: feature matrix.
        y: class labels.
        visualize: when equal to ``True``, draw the class histogram and the
            PCA plots of the resampled data.
        pca2d: forwarded to ``pca_general`` as ``d2``.
        pca3d: forwarded to ``pca_general`` as ``d3``.
        tsne: accepted but not used by this function.
        pie_evr: forwarded to ``pca_general`` as ``pie_evr``.

    Returns:
        ``(X_res, y_res)``, the resampled features and labels.
    """
    X_res, y_res = SVMSMOTE(random_state=42).fit_resample(X, y)

    if visualize == True:  # noqa: E712 - comparison kept exactly as before
        plot_options = {"d2": pca2d, "d3": pca3d, "pie_evr": pie_evr}
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, **plot_options)

    return X_res, y_res
def svm_smote(X, y):
    """Balance a training set with SVMSMOTE.

    Args:
        X: training features, without the class target.
        y: training class target.

    Returns:
        ``(X, y)`` after resampling.
    """
    balancer = SVMSMOTE(random_state=42)
    X_balanced, y_balanced = balancer.fit_resample(X, y)
    print('after balancing:', X_balanced.shape)
    return X_balanced, y_balanced
def Predict(data, mode):
    """Fit an SVM pipeline on query/title TF-IDF features and predict the test set.

    Args:
        data: ``(train, test)`` pair of data frames. Both are read for the
            ``query_preprocessed`` and ``product_title_preprocessed``
            columns; ``train`` is also read for ``median_relevance`` and
            ``test`` for ``id``.
        mode: ``'eda'`` for the plain pipeline, or ``'sampling'`` to insert
            an SVMSMOTE step before the final SVM.

    Returns:
        ``(submission, submission_probas)``: a frame with the columns ``id``
        and ``prediction``, and a frame of class probabilities indexed by id.

    Raises:
        ValueError: if ``mode`` is neither ``'eda'`` nor ``'sampling'``.
    """
    # Fail before the expensive vectorisation instead of hitting an unbound
    # ``clf`` afterwards.
    if mode not in ('eda', 'sampling'):
        raise ValueError(
            "mode must be 'eda' or 'sampling', got %r" % (mode,)
        )

    train, test = data
    idx = test.id.values.astype(int)
    y = train.median_relevance.values

    def column_as_text(frame, column):
        # Row-wise formatting kept from the original implementation.
        return list(frame.apply(lambda row: '%s' % row[column], axis=1))

    train_query = column_as_text(train, 'query_preprocessed')
    train_title = column_as_text(train, 'product_title_preprocessed')
    test_query = column_as_text(test, 'query_preprocessed')
    test_title = column_as_text(test, 'product_title_preprocessed')

    # NOTE(review): the original first built a stop-word set extended with
    # HTML/markup tokens and digits, then immediately overwrote it with the
    # set below. Only the effective assignment is kept; confirm whether the
    # two sets were meant to be combined.
    stop_words = text.ENGLISH_STOP_WORDS.union(set(stopwords.words('english')))

    tfv = text.TfidfVectorizer(
        min_df=7,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 3),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        stop_words=stop_words,
    )
    tfv.fit(train_query + train_title)
    X_train = hstack([tfv.transform(train_query), tfv.transform(train_title)])
    X_test = hstack([tfv.transform(test_query), tfv.transform(test_title)])

    sim = similarlity_stack()
    svd = TruncatedSVD(n_components=200)
    scl = StandardScaler(with_mean=False)
    svm = SVC(C=10, gamma="auto", kernel="rbf", class_weight=None, probability=True)

    # Both modes share every step; 'sampling' only adds the oversampler
    # right before the classifier.
    steps = [
        ('FeatureUnion', FeatureUnion([('svd', svd), ('sim', sim)])),
        ('scl', scl),
    ]
    if mode == 'sampling':
        steps.append(('sampling', SVMSMOTE(svm_estimator=svm, k_neighbors=4)))
    steps.append(('svm', svm))
    clf = Pipeline(steps)

    clf.fit(X_train, y)
    preds = clf.predict(X_test)
    pred_probas = clf.predict_proba(X_test)

    submission = pd.DataFrame({"id": idx, "prediction": preds})
    submission_probas = pd.DataFrame(pred_probas, index=idx)
    return submission, submission_probas
def borderline_smoth_func(train_x, train_y, target):
    """Oversample the training data with SVMSMOTE, logging class counts.

    Args:
        train_x: training features.
        train_y: training labels; indexed with ``target`` for the logged
            value counts.
        target: name of the label column inside ``train_y``.

    Returns:
        ``(train_x, train_y)`` after resampling, or ``None`` when any step
        raises (the error is logged, not re-raised).
    """
    try:
        logger.info(
            f"counter before border line SMOTH is: {train_y[target].value_counts()}"
        )
        sampler = SVMSMOTE()
        resampled_x, resampled_y = sampler.fit_resample(train_x, train_y)
        # Summarise the new class distribution.
        logger.info(
            f"counter after borderline SMOTH is: {resampled_y[target].value_counts()}"
        )
    except Exception as err:
        logger.error(f"failed to run borderline_smoth_func due to: {err}")
        return None
    return resampled_x, resampled_y
def test_svm_smote_not_svm(data):
    """An estimator without a fitted ``support_`` attribute must be rejected.

    ``LogisticRegression`` does not expose ``support_``, so resampling with
    it as ``svm_estimator`` is expected to raise ``RuntimeError``.
    """
    expected_message = (
        "`svm_estimator` is required to exposed a `support_` fitted attribute."
    )
    with pytest.raises(RuntimeError, match=expected_message):
        sampler = SVMSMOTE(svm_estimator=LogisticRegression())
        sampler.fit_resample(*data)
def logistic_regression(lr_params, train_feat, train_label, model, test_feat,
                        test_label, vec_params=None, random_state=42):
    """Fit a count-vectorizer + resampler + logistic-regression pipeline.

    Args:
        lr_params: keyword arguments for ``LogisticRegression``.
        train_feat: training documents.
        train_label: training labels.
        model: resampling strategy, ``'svmsmote'`` (oversampling) or
            ``'rus'`` (random undersampling).
        test_feat: test documents.
        test_label: test labels, used for the confusion matrix.
        vec_params: keyword arguments for ``CountVectorizer``; ``None``
            means the vectorizer defaults.
        random_state: seed passed to the resampler.

    Returns:
        ``(pipe, pipe_fit, y_pred, cnf_matrix)``.

    Raises:
        ValueError: if ``model`` is not a supported strategy.
    """
    if model == 'svmsmote':
        sampler_cls = SVMSMOTE
    elif model == 'rus':
        sampler_cls = RandomUnderSampler
    else:
        # Previously this fell through and failed later on an unbound ``pipe``.
        raise ValueError(
            "model must be 'svmsmote' or 'rus', got %r" % (model,)
        )

    # ``vec_params`` defaults to None, which cannot be splatted directly.
    vectorizer = CountVectorizer(**(vec_params or {}))
    pipe = make_pipeline(
        vectorizer,
        sampler_cls(random_state=random_state),
        LogisticRegression(**lr_params),
    )

    pipe_fit = pipe.fit(train_feat, train_label)
    y_pred = pipe_fit.predict(test_feat)
    cnf_matrix = confusion_matrix(test_label, y_pred)
    return pipe, pipe_fit, y_pred, cnf_matrix
def oversample(x, y, method):
    """Oversample ``(x, y)`` with the named method.

    Args:
        x: feature matrix.
        y: labels.
        method: one of ``'No Sample'``, ``'random'``, ``'SMOTE'``,
            ``'Sparse SMOTE'``, ``'SMOTEBorderline-1'``,
            ``'SMOTEBorderline-2'``, ``'SVMSMOTE'``, ``'ADASYN'`` or
            ``'mwmote'``.

    Returns:
        ``(X_resampled, y_resampled)``; the inputs unchanged for
        ``'No Sample'``.

    Raises:
        ValueError: if ``method`` is not one of the supported names.

    Note:
        ``sampling_strategy`` is read from the enclosing module scope.
    """
    randomstate = 42

    if method == 'No Sample':  # no resampling
        return x, y

    if method == 'mwmote':  # MWMOTE algorithm
        X_resampled, y_resampled = MWMOTE.MWMOTE(x, y, N=1000, return_mode='append')
        return X_resampled, y_resampled

    # Factories are lazy so only the requested sampler is constructed.
    sampler_factories = {
        # random oversampling
        'random': lambda: RandomOverSampler(
            sampling_strategy=sampling_strategy, random_state=randomstate),
        # SMOTE
        'SMOTE': lambda: SMOTE(
            sampling_strategy=sampling_strategy, random_state=randomstate),
        # Sparse SMOTE
        'Sparse SMOTE': lambda: SparseSMOTE(
            sampling_strategy=sampling_strategy, random_state=randomstate),
        # Borderline SMOTE, variant borderline-1
        'SMOTEBorderline-1': lambda: BorderlineSMOTE(
            sampling_strategy=sampling_strategy, kind='borderline-1',
            random_state=randomstate),
        # Borderline SMOTE, variant borderline-2
        'SMOTEBorderline-2': lambda: BorderlineSMOTE(
            sampling_strategy=sampling_strategy, kind='borderline-2',
            random_state=randomstate),
        # SVM SMOTE
        'SVMSMOTE': lambda: SVMSMOTE(
            sampling_strategy=sampling_strategy, random_state=randomstate),
        # ADASYN
        'ADASYN': lambda: ADASYN(
            sampling_strategy=sampling_strategy, random_state=randomstate),
    }

    factory = sampler_factories.get(method)
    if factory is None:
        # Previously an unknown name surfaced as an unbound-local error at
        # the return statement.
        supported = ['No Sample', 'mwmote'] + sorted(sampler_factories)
        raise ValueError(
            "unknown oversampling method %r; expected one of %s"
            % (method, supported)
        )

    X_resampled, y_resampled = factory().fit_resample(x, y)
    return X_resampled, y_resampled
def get_sampler(self):
    """Build the resampler selected by ``self.sampler``.

    Returns:
        A new sampler instance configured from ``self.random_seed`` and
        ``self.njobs`` where the sampler accepts them, or ``None`` when
        ``self.sampler`` is not a recognised name.
    """
    # Lazy factories: attributes are read and objects built only for the
    # sampler that was actually requested.
    builders = {
        'random-over-sampler': lambda: RandomOverSampler(
            random_state=self.random_seed),
        'adasyn': lambda: ADASYN(
            random_state=self.random_seed, n_jobs=self.njobs),
        'smote': lambda: SMOTE(
            random_state=self.random_seed, n_jobs=self.njobs),
        'svm-smote': lambda: SVMSMOTE(
            random_state=self.random_seed, n_jobs=self.njobs),
        'random-under-sampler': lambda: RandomUnderSampler(
            random_state=self.random_seed),
        'tomek-links': lambda: TomekLinks(n_jobs=self.njobs),
        'near-miss': lambda: NearMiss(n_jobs=self.njobs),
        'instance-hardness': lambda: InstanceHardnessThreshold(
            random_state=self.random_seed, n_jobs=self.njobs),
    }
    build = builders.get(self.sampler)
    if build is None:
        return None
    return build()
def svmsampler(X, y, over_pct=0.1, under_pct=1):
    """Oversample with SVMSMOTE, then randomly undersample.

    Args:
        X: feature matrix.
        y: labels.
        over_pct: ``sampling_strategy`` for the SVMSMOTE step.
        under_pct: ``sampling_strategy`` for the undersampling step.

    Returns:
        ``(X, y)`` after both resampling steps.
    """
    pipeline = Pipeline(steps=[
        ('o', SVMSMOTE(random_state=42, sampling_strategy=over_pct)),
        ('u', RandomUnderSampler(random_state=42, sampling_strategy=under_pct)),
    ])
    X_balanced, y_balanced = pipeline.fit_resample(X, y)
    return X_balanced, y_balanced
def run_upsample(json_file_path, fmt_file_path):
    """Upsample the hot-encoded CSV with SVMSMOTE and write the result.

    Nothing happens unless the JSON configuration reports the upsample
    status as ``True``. Otherwise the hot-encoded CSV is loaded, its last
    column is used as the label and the remaining columns as features, the
    data is resampled, and the result is written as CSV to the upsampled
    folder.

    Args:
        json_file_path: path of the JSON configuration for ``JsonManager``.
        fmt_file_path: path of a text file whose first line is the
            ``fmt`` argument for ``np.savetxt``.
    """
    json_manager = JsonManager(json_file_path)
    # The explicit comparison is kept as-is: the return type of
    # get_upsample_status() is not visible here.
    if json_manager.get_upsample_status() == True:  # noqa: E712
        print(f"Upsampling started using {json_file_path} and {fmt_file_path}")
        upsampled_path = json_manager.get_upsampled_path()
        constants.remove_folder_if_exists(
            constants.UPSAMPLED_CSV_FOLDER_NAME, upsampled_path)

        hot_encoded_folder = os.fsdecode(os.path.join(
            json_manager.get_hot_encoded_path(),
            constants.HOT_ENCODED_CSV_FOLDER_NAME))
        hot_encoded_file = os.fsdecode(os.path.join(
            hot_encoded_folder,
            constants.HOT_ENCODED_CSV_FILENAME))

        # Parse the CSV once and slice it, rather than re-reading the file
        # for the features and again for the label.
        hotEncoded_data = pd.read_csv(hot_encoded_file)
        features_data = hotEncoded_data.iloc[:, :-1]  # everything except label
        labels_data = hotEncoded_data.iloc[:, [-1]]   # label, kept as a frame

        sm = SVMSMOTE(random_state=json_manager.get_random_state())
        X_res, y_res = sm.fit_resample(features_data, labels_data)
        csv_ready = np.append(X_res, y_res, axis=constants.COLUMN_AXIS)

        upsampled_folder = constants.add_folder_to_directory(
            constants.UPSAMPLED_CSV_FOLDER_NAME, upsampled_path)
        upsampled_file_path = os.fsdecode(os.path.join(
            upsampled_folder, constants.UPSAMPLED_CSV_FILENAME))
        if os.path.exists(upsampled_file_path):
            os.remove(upsampled_file_path)

        # Context manager so the handle is closed even if reading fails.
        with open(fmt_file_path, "r") as f:
            fmt = f.readline()

        header = ','.join(str(i) for i in hotEncoded_data.columns)
        np.savetxt(upsampled_file_path, csv_ready,
                   fmt=fmt,
                   delimiter=constants.CSV_DELIMITER,
                   header=header,
                   comments='')
        print(f"Upsampling finished, results in {upsampled_file_path}")
def fit(self, X, Y):
    """Fit the model by solving one linear system for ``alpha``, ``beta``, ``b``.

    Depending on ``self.databalance`` the training data is first balanced:
    ``'LowSampling'`` draws random rows of the ``Y == 1`` class (with
    replacement) to match the size of the remaining rows,
    ``'UpSampling'`` oversamples with SVMSMOTE, and any other value leaves
    the data unchanged.

    Args:
        X: feature matrix (2-D array).
        Y: label vector (1-D array).

    Side effects:
        Sets ``self.Y``, ``self.alpha``, ``self.beta``, ``self.b``,
        ``self.K`` and ``self.kernelMat``; prints the bias ``b``.

    Raises:
        ValueError: if ``self.kernel_dict['type']`` is not ``'RBF'``,
            ``'LINEAR'`` or ``'POLY'``.
    """
    kernel_type = self.kernel_dict['type']
    if kernel_type not in ('RBF', 'LINEAR', 'POLY'):
        # Previously an unknown type left ``K`` unbound further down.
        raise ValueError("unsupported kernel type: %r" % (kernel_type,))

    train_data = np.append(X, Y.reshape(len(Y), 1), axis=1)
    if self.databalance == 'LowSampling':
        data_maj = train_data[Y == 1]  # rows treated as the majority class
        data_min = train_data[Y != 1]
        index = np.random.randint(len(data_maj), size=len(data_min))
        lower_data_maj = data_maj[list(index)]
        train_data = np.append(lower_data_maj, data_min, axis=0)
        X = train_data[:, :-1]
        Y = train_data[:, -1]
    elif self.databalance == 'UpSampling':
        # ``fit_sample`` was removed from imbalanced-learn; use ``fit_resample``.
        X, Y = SVMSMOTE(random_state=42).fit_resample(
            train_data[:, :-1], np.asarray(train_data[:, -1]))
    self.Y = Y

    m = Y.shape[0]

    # Kernel
    if kernel_type == 'RBF':
        K = Kernel.RBF(m, self.kernel_dict['sigma'])
    elif kernel_type == 'LINEAR':
        K = Kernel.LINEAR(m)
    else:  # 'POLY'
        K = Kernel.POLY(m, self.kernel_dict['d'])
    K.calculate(X)

    # Assemble the (2m + 1) x (2m + 1) system L @ [alpha, beta, b] = R.
    tmp1 = np.hstack((np.ones((1, 2 * m)), [[0]]))
    M_BR = K.kernelMat + np.eye(m) / (self.C * self.m_value)
    tmp2 = np.hstack((M_BR, K.kernelMat, np.ones((m, 1))))
    M_BL = K.kernelMat + np.eye(m) / (self.C * (1 - self.m_value))
    tmp3 = np.hstack((K.kernelMat, M_BL, np.ones((m, 1))))
    L = np.vstack((tmp1, tmp2, tmp3))

    R = np.ones(2 * m + 1)
    R[0] = 0
    R[m + 1:] = -1

    # solve
    solution = LA.solve(L, R)
    b = solution[-1]
    alpha = solution[:m]
    beta = solution[m:2 * m]
    print('b', b)

    self.beta = beta
    self.alpha = alpha
    self.b = b
    self.K = K
    self.kernelMat = K.kernelMat
def fit(self, X, Y):
    """Fit the model by solving one linear system for ``b`` and ``alpha``.

    Depending on ``self.databalance`` the training data is first balanced:
    ``'LowSampling'`` draws random rows of the ``Y == 1`` class (with
    replacement) to match the size of the remaining rows,
    ``'UpSampling'`` oversamples with SVMSMOTE, and any other value leaves
    the data unchanged.

    Args:
        X: feature matrix (2-D array).
        Y: label vector (1-D array).

    Returns:
        ``(alpha, b, e)`` where ``e = alpha / self.C``.

    Side effects:
        Sets ``self.Y``, ``self.alpha``, ``self.b``, ``self.K`` and
        ``self.kernelMat``.

    Raises:
        ValueError: if ``self.kernel_dict['type']`` is not ``'RBF'``,
            ``'LINEAR'`` or ``'POLY'``.
    """
    kernel_type = self.kernel_dict['type']
    if kernel_type not in ('RBF', 'LINEAR', 'POLY'):
        # Previously an unknown type left ``K`` unbound further down.
        raise ValueError("unsupported kernel type: %r" % (kernel_type,))

    train_data = np.append(X, Y.reshape(len(Y), 1), axis=1)
    if self.databalance == 'LowSampling':
        data_maj = train_data[Y == 1]  # rows treated as the majority class
        data_min = train_data[Y != 1]
        index = np.random.randint(len(data_maj), size=len(data_min))
        lower_data_maj = data_maj[list(index)]
        train_data = np.append(lower_data_maj, data_min, axis=0)
        X = train_data[:, :-1]
        Y = train_data[:, -1]
    elif self.databalance == 'UpSampling':
        # ``fit_sample`` was removed from imbalanced-learn; use ``fit_resample``.
        X, Y = SVMSMOTE(random_state=42).fit_resample(
            train_data[:, :-1], np.asarray(train_data[:, -1]))
    self.Y = Y

    m = len(Y)

    # Kernel
    if kernel_type == 'RBF':
        K = Kernel.RBF(m, self.kernel_dict['sigma'])
    elif kernel_type == 'LINEAR':
        K = Kernel.LINEAR(m)
    else:  # 'POLY'
        K = Kernel.POLY(m, self.kernel_dict['d'])
    K.calculate(X)

    # Label-weighted kernel matrix plus the 1/C term on the diagonal.
    H = np.multiply(np.dot(np.matrix(Y).T, np.matrix(Y)), K.kernelMat)
    M_BR = H + np.eye(m) / (self.C)

    # Concatenate the blocks into the (m + 1) x (m + 1) system matrix.
    L_L = np.concatenate((np.matrix(0), np.matrix(Y).T), axis=0)
    L_R = np.concatenate((np.matrix(Y), M_BR), axis=0)
    L = np.concatenate((L_L, L_R), axis=1)
    R = np.ones(m + 1)
    R[0] = 0

    # solve
    b_a = LA.solve(L, R)
    b = b_a[0]
    alpha = b_a[1:]
    e = alpha / self.C

    self.alpha = alpha
    self.b = b
    self.K = K
    self.kernelMat = K.kernelMat
    return self.alpha, self.b, e
def roc_curves(df, number_of_matches):
    """Compute one-vs-rest ROC curves and export the class-2 curve as CSV.

    The first rows of ``df`` are used as played matches. The features are
    split 80/20, the training part is oversampled with SVMSMOTE and scaled
    with ``RobustScaler``, and a one-vs-rest elastic-net logistic
    regression is fitted. ROC curves and AUC are computed per class and
    micro-averaged; the FPR/TPR points of class index 2 are written to
    ``reticulate1.csv``.

    Args:
        df: match data with the feature columns listed below and a
            ``home_result`` column.
        number_of_matches: number of played matches; converted with ``int``.
    """
    from imblearn.over_sampling import SVMSMOTE

    number_of_matches = int(number_of_matches)
    # NOTE(review): this slice keeps number_of_matches - 1 rows; confirm
    # that dropping the last played match is intended.
    df_played_matches = df.iloc[0:number_of_matches-1]

    classifier = LogisticRegression(max_iter=300, multi_class='multinomial',
                                    solver='saga', penalty='elasticnet',
                                    l1_ratio=.95)
    classifier = OneVsRestClassifier(classifier)

    Data = df_played_matches[['home_pos', 'visitor_pos', 'spi1', 'spi2',
                              'draw%', 'home_form', 'visitor_form',
                              'importance1', 'importance2', 'xG1', 'xG2']]
    Target = df_played_matches['home_result']

    y = LabelEncoder().fit_transform(np.asarray(Target))
    X = np.asarray(Data)
    n_classes = 3

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.2, random_state=5)

    # Oversample the training part only. The sampler gets its own name so
    # the class is not shadowed, and ``fit_resample`` replaces the removed
    # ``fit_sample``.
    oversampler = SVMSMOTE()
    columns = Data.columns
    up_sampled_X, up_sampled_y = oversampler.fit_resample(X_train, y_train)
    up_sampled_X = pd.DataFrame(data=up_sampled_X, columns=columns)
    up_sampled_y = pd.DataFrame(data=up_sampled_y, columns=['home_result'])

    scaler = RobustScaler()
    scaler.fit(up_sampled_X)
    X_train = scaler.transform(up_sampled_X)
    X_test = scaler.transform(X_test)

    y_train = label_binarize(np.asarray(up_sampled_y), classes=[0, 1, 2])
    y_test = label_binarize(np.asarray(y_test), classes=[0, 1, 2])
    y_score = classifier.fit(X_train, y_train).predict_proba(X_test)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Only the curve of class index 2 is exported.
    dataset = pd.DataFrame({'FPR': fpr[2], 'TPR': tpr[2]})
    dataset.to_csv("reticulate1.csv")
def over_sample(self, method="BorderLine", sampling_strategy="minority", random_state=42, k_neighbors=5,
                n_neighbors=10, kind="borderline-1"):
    """
    Oversample the data frame held in ``self._df``.

    :param method: str, one of: ADASYN, BorderLine, KMeans, Random, SVM
    :param sampling_strategy: str or dict, one of: 'minority', 'not majority', 'all', 'auto', {1:n,0:m}
    :param random_state: int
    :param k_neighbors: int
    :param n_neighbors: int
    :param kind: str, borderline-1 or borderline-2
    :return: a new data frame with the resampled features and target, or
        ``self._df`` unchanged when ``method`` is not supported
    """
    feature_name = self._df.columns.difference(["id", self._target]).tolist()
    X = self._df[feature_name].values
    y = self._df[self._target].values
    print("Original label shape {}".format(Counter(y)))

    # Lazy factories so only the requested sampler is constructed.
    sampler_factories = {
        "ADASYN": lambda: ADASYN(
            sampling_strategy=sampling_strategy, random_state=random_state,
            n_neighbors=k_neighbors),
        "BorderLine": lambda: BorderlineSMOTE(
            sampling_strategy=sampling_strategy, random_state=random_state,
            k_neighbors=k_neighbors, m_neighbors=n_neighbors, kind=kind),
        "KMeans": lambda: KMeansSMOTE(
            sampling_strategy=sampling_strategy, random_state=random_state,
            k_neighbors=k_neighbors),
        "Random": lambda: RandomOverSampler(
            sampling_strategy=sampling_strategy, random_state=random_state),
        "SVM": lambda: SVMSMOTE(
            sampling_strategy=sampling_strategy, random_state=random_state,
            k_neighbors=k_neighbors, m_neighbors=n_neighbors, out_step=0.5),
    }
    make_sampler = sampler_factories.get(method)
    if make_sampler is None:
        print("不支持{}该抽样方法".format(method))
        return self._df

    X_res, y_res = make_sampler().fit_resample(X, y)
    print("overSample label shape {}".format(Counter(y_res)))

    # Append the target as the last column before rebuilding the frame.
    stacked = np.concatenate([X_res, y_res.reshape(len(X_res), 1)], axis=1)
    return pd.DataFrame(data=stacked, columns=feature_name + [self._target])
def random_forest_param_selection(x: DataFrame, y: DataFrame = None, cv=DEFAULT_CV,
                                  metric: str = DEFAULT_METRIC, jobs: int = DEFAULT_THREAD,
                                  random_state: int = DEFAULT_RANDOM_STATE,
                                  refit: bool = DEFAULT_REFIT):
    """
    Grid-search a random forest that is trained on SVMSMOTE-resampled data.

    :param x: feature frame
    :param y: target frame
    :param cv: cross-validation setting passed to ``GridSearchCV``
    :param metric: scoring name passed to ``GridSearchCV``
    :param jobs: ``n_jobs`` for the sampler, the forest and the search
    :param random_state: seed for the sampler and the forest
    :param refit: ``refit`` setting passed to ``GridSearchCV``
    :return: ``best_estimator_`` of the fitted grid search
    """
    forest_grid = {
        'criterion': ['entropy', 'gini'],
        'max_depth': [80, 90],
        'max_features': ['log2', 'sqrt'],
        'min_samples_leaf': [2, 5],
        'n_estimators': [10, 150, 300, 600]
    }
    # Prefix every key with the pipeline step name of the forest.
    pipeline_grid = {}
    for name, values in forest_grid.items():
        pipeline_grid['rf__' + name] = values

    sampler = SVMSMOTE(svm_estimator=SVC(), k_neighbors=5, m_neighbors=5,
                       n_jobs=jobs, random_state=random_state)
    forest = RandomForestClassifier(random_state=random_state, warm_start=True,
                                    n_jobs=jobs)
    upsampling_model = Pipeline([('svmsmote', sampler), ('rf', forest)])

    grid_search = ms.GridSearchCV(
        upsampling_model,
        param_grid=pipeline_grid,
        scoring=metric,
        cv=cv,
        refit=refit,
        n_jobs=jobs,
        verbose=Tuning.DEFAULT_VERBOSE
    )
    grid_search.fit(x, y)

    print("Best parameters:")
    print()
    print(grid_search.best_params_)
    print()
    print("Grid scores:")
    print()
    results = grid_search.cv_results_
    rows = zip(results['mean_test_score'], results['std_test_score'], results['params'])
    for mean, std, params in rows:
        print("%0.4f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    return grid_search.best_estimator_
def GradientBoost():
    """Train a gradient-boosting classifier on SVMSMOTE-resampled data.

    Reads ``x_train``, ``y_train``, ``x_test`` and ``y_test`` from the
    enclosing module scope, fits a default ``GradientBoostingClassifier``
    on the oversampled training data, and passes the test predictions to
    ``Precision.precision``.
    """
    print('Gradient Boost')
    # ``fit_sample`` was removed from imbalanced-learn; ``fit_resample`` is
    # the supported name for the same operation.
    X, Y = SVMSMOTE(random_state=42).fit_resample(x_train, y_train)
    clf = GradientBoostingClassifier()
    clf.fit(X, Y)
    ypred = clf.predict(x_test)
    Precision.precision(ypred, y_test)
def __get_smote(self):
    """Return the SMOTE variant selected by ``self.algorithm``.

    Recognised names are ``'Borderline'``, ``'KMeans'``, ``'SVM'`` and
    ``'Tomek'``; any other value yields a plain ``SMOTE``. Every sampler is
    seeded with ``RANDOM_STATE``.
    """
    # Lazy factories so only the selected sampler is constructed.
    variants = {
        'Borderline': lambda: BorderlineSMOTE(random_state=RANDOM_STATE),
        'KMeans': lambda: KMeansSMOTE(random_state=RANDOM_STATE,
                                      kmeans_estimator=KMeans(n_clusters=20)),
        'SVM': lambda: SVMSMOTE(random_state=RANDOM_STATE),
        'Tomek': lambda: SMOTETomek(random_state=RANDOM_STATE),
    }
    make_sampler = variants.get(self.algorithm)
    if make_sampler is not None:
        return make_sampler()
    return SMOTE(random_state=RANDOM_STATE)
def fit(self, X, y):
    """Train ``self.n_estimator`` LS-FSVM models on subsamples and pickle them.

    The training data is optionally balanced first (``'LowSampling'`` or
    ``'UpSampling'``). For every estimator a subsample with ratio 0.8 is
    drawn from the balanced training data, the hyper-parameters are
    selected with ``GridSearch_parametre.LS_FSVM_best``, and an
    ``LS_FSVM.LSFSVM`` is fitted. All fitted models are written
    sequentially to ``LSFsvm_bagging.pkl``.

    Args:
        X: feature matrix (2-D array).
        y: label vector (1-D array).

    Raises:
        ValueError: if ``self.kernel_dict_type`` is not ``'LINEAR'``,
            ``'RBF'`` or ``'POLY'``.
    """
    kernel_type = self.kernel_dict_type
    if kernel_type not in ('LINEAR', 'RBF', 'POLY'):
        # Previously an unknown type left ``C`` and ``kernel_dict`` unbound.
        raise ValueError("unsupported kernel type: %r" % (kernel_type,))

    train_data = np.append(X, y.reshape(len(y), 1), axis=1)

    # NOTE(review): ``databalance`` is read as a bare name here, not as
    # ``self.databalance``; confirm it is defined at module level.
    if databalance == 'LowSampling':
        data_maj = train_data[y == 1]  # rows treated as the majority class
        data_min = train_data[y != 1]
        index = np.random.randint(len(data_maj), size=len(data_min))
        lower_data_maj = data_maj[list(index)]
        train_data = np.append(lower_data_maj, data_min, axis=0)
    elif databalance == 'UpSampling':
        # ``fit_sample`` was removed from imbalanced-learn; use ``fit_resample``.
        x_train, y_train = SVMSMOTE(random_state=42).fit_resample(
            train_data[:, :-1], np.asarray(train_data[:, -1]))
        train_data = np.append(x_train, y_train.reshape(len(y_train), 1), axis=1)

    clf = []
    for _ in range(self.n_estimator):
        # Draw every bag from the full balanced training set. The previous
        # code rebound ``train_data`` to the subsample, so each estimator
        # was trained on a subsample of the previous estimator's subsample.
        sample = np.array(self.subsample(dataset=train_data, ratio=0.8))
        x_train = sample[:, :-1]
        y_train = sample[:, -1]

        best = GridSearch_parametre.LS_FSVM_best(
            x_train, y_train, kernel_type, self.param_grid, self.judgment,
            self.fuzzyvalue, self.r_max, self.r_min)
        if kernel_type == 'LINEAR':
            C = best
            kernel_dict = {'type': 'LINEAR'}
        elif kernel_type == 'RBF':
            C, sigma = best
            kernel_dict = {'type': 'RBF', 'sigma': sigma}
        else:  # 'POLY'
            C, d = best
            kernel_dict = {'type': 'POLY', 'd': d}

        estimator = LS_FSVM.LSFSVM(C, kernel_dict, self.fuzzyvalue, 'origine',
                                   self.r_max, self.r_min)
        estimator._mvalue(x_train, y_train)
        estimator.fit(x_train, y_train)
        clf.append(estimator)

    with open('LSFsvm_bagging.pkl', 'wb') as f:
        for estimator in clf:
            pickle.dump(estimator, f, pickle.HIGHEST_PROTOCOL)
def get_oversampler(sampler_name, **add_params):
    """Create an over-sampler by (case-insensitive) name.

    Args:
        sampler_name: ``'adasyn'``, ``'smote'``, ``'smotenc'`` or
            ``'svmsmote'``, in any letter case.
        **add_params: keyword arguments forwarded to the sampler.

    Returns:
        The sampler instance, or ``None`` (after printing a hint) when the
        name is not recognised.
    """
    # Lazy factories so only the requested sampler is constructed.
    factories = {
        'adasyn': lambda: ADASYN(**add_params),
        'smote': lambda: SMOTE(**add_params),
        'smotenc': lambda: SMOTENC(**add_params),
        'svmsmote': lambda: SVMSMOTE(**add_params),
    }
    factory = factories.get(sampler_name.lower())
    if factory is None:
        print('Choose one of predefined over-samplers')
        return None
    return factory()
def _SMOTE_SVM(self):
    """Oversample ``self.x_train`` / ``self.y_train`` in place with SVMSMOTE.

    Side effects:
        Stores the raw resampler output on ``self.X_train_smote`` and
        ``self.y_train_smote``, then replaces ``self.x_train`` (keeping its
        column names) and ``self.y_train`` (single column
        ``"Local Relapse Y(1) /N(0)"``) with data frames built from it.
        Prints shapes and counts along the way.
    """
    label_column = "Local Relapse Y(1) /N(0)"

    # Oversampling - SMOTE - Synthetic Minority Over-sampling Technique
    print("before SMOTE df", self.x_train.shape)
    smote = SVMSMOTE(k_neighbors=5, m_neighbors=5, random_state=self.seed)
    # ``fit_sample`` was removed from imbalanced-learn; use ``fit_resample``.
    self.X_train_smote, self.y_train_smote = smote.fit_resample(
        self.x_train, self.y_train
    )

    # Depending on the imbalanced-learn version the output is either plain
    # arrays or pandas objects; normalise to arrays so the positional row
    # access and the frame construction below work in both cases.
    X_values = pd.DataFrame(self.X_train_smote).values
    y_values = pd.DataFrame(self.y_train_smote).values.ravel()

    print("X_train_SMOTE:\n", X_values[1])
    self.x_train = pd.DataFrame(X_values, columns=self.x_train.columns)
    self.y_train = pd.DataFrame(y_values, columns=[label_column])

    print("len new x_train after smote: \n", len(self.x_train))
    number_pos_x = self.y_train.loc[self.y_train[label_column] == 1]
    print("number positive responses y_train:\n", len(number_pos_x))
def fit(self, X, y):
    """Fit an ensemble of base estimators on resampled bags.

    For every ensemble member a bag is drawn with replacement from the
    ``y == 1`` rows (labelled 1) and from the ``y == 0`` rows (labelled 0),
    optionally oversampled according to ``self.oversampled`` (``"ROS"``,
    ``"SMOTE"``, ``"SVMSMOTE"`` or ``"B2SMOTE"``), and used to fit a clone
    of ``self.base_estimator``.

    Args:
        X: feature matrix.
        y: binary labels (0 / 1).

    Returns:
        ``self``.

    Side effects:
        Sets ``classes_``, ``X_``, ``y_`` and ``estimators_``; reseeds
        NumPy's global random generator for every ensemble member.
    """
    X, y = check_X_y(X, y)
    self.classes_ = unique_labels(y)

    self.X_ = X
    self.y_ = y

    minority_X = self.X_[self.y_ == 1]
    minority_y = self.y_[self.y_ == 1]
    majority_X = self.X_[self.y_ == 0]
    majority_y = self.y_[self.y_ == 0]

    # Start from a fresh list on every call: appending to an existing list
    # made a second fit() accumulate estimators on top of the old ones.
    self.estimators_ = [
        base.clone(self.base_estimator) for _ in range(self.ensemble_size)
    ]

    for n, estimator in enumerate(self.estimators_):
        np.random.seed(self.random_state + (n * 2))
        # NOTE(review): indices are drawn from the first half of each class
        # only (range ``round(n_rows / 2)``); confirm this is intended.
        bagXminority = minority_X[np.random.choice(
            round(minority_X.shape[0] / 2), len(minority_y), replace=True), :]
        bagXmajority = majority_X[np.random.choice(
            round(majority_X.shape[0] / 2), len(majority_y), replace=True), :]

        bagyminority = np.ones(len(minority_y)).astype('int')
        bagymajority = np.zeros(len(majority_y)).astype('int')

        train_X = np.concatenate((bagXmajority, bagXminority))
        train_y = np.concatenate((bagymajority, bagyminority))

        if self.oversampled == "ROS":
            ovs = RandomOverSampler(random_state=self.random_state)
        elif self.oversampled == "SMOTE":
            ovs = SMOTE(random_state=self.random_state)
        elif self.oversampled == "SVMSMOTE":
            ovs = SVMSMOTE(random_state=self.random_state)
        elif self.oversampled == "B2SMOTE":
            ovs = BorderlineSMOTE(random_state=self.random_state,
                                  kind="borderline-2")
        else:
            ovs = None  # any other value: train on the bag as drawn

        if ovs is not None:
            train_X, train_y = ovs.fit_resample(train_X, train_y)

        estimator.fit(train_X, train_y)

    # Return the classifier
    return self
def hyper_paramytize_optimization():
    """Grid-search an SVMSMOTE + XGBoost pipeline and log the results.

    Reads the data ``X``, ``y`` and the open report file ``f`` from the
    enclosing module scope. Every report line is printed to ``f``.
    """
    print("model with no experience with Smote STSRCOM", file=f)
    print(
        "--------------------------------------------------------------------",
        file=f)

    counter = Counter(y)
    # estimate scale_pos_weight value
    estimate = counter[0] / counter[1]
    print('Estimate: %.3f' % estimate, file=f)
    print(counter[0], file=f)
    print(counter[1], file=f)

    model = XGBClassifier(objective='binary:logistic', eval_metric='logloss')

    # define grid
    learning_rates = [0.1, 0.05, 0.01]
    max_depths = [1, 2, 3, 5, 8, 10, 14, 18]
    n_estimator = range(60, 220, 40)
    weights = [1, 10, 25, 50, 75, 99, 100, 1000]
    param_grid = {
        'xgbclassifier__max_depth': max_depths,
        'xgbclassifier__learning_rate': learning_rates,
        'xgbclassifier__n_estimators': n_estimator,
        # The key must be exactly "<step>__<parameter>"; the former
        # "...scale_pos_weight=weights" is not a parameter of the estimator
        # and is rejected by the grid search.
        'xgbclassifier__scale_pos_weight': weights
    }
    print(param_grid, file=f)

    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)

    # define grid search
    pipeline = Pipeline([('sample', SVMSMOTE()), ('xgbclassifier', model)])
    grid = GridSearchCV(estimator=pipeline,
                        param_grid=param_grid,
                        n_jobs=-1,
                        cv=cv,
                        scoring='roc_auc')

    # execute the grid search
    grid_result = grid.fit(X, y)

    # report the best configuration
    print(grid_result, file=f)
    print("Best: %f using %s" %
          (grid_result.best_score_, grid_result.best_params_),
          file=f)

    # report all configurations
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param), file=f)
def equalize_training_dataset_with_SVMSMOTE(x_train, y_train):
    """Oversample every non-majority class of an N-dimensional data set.

    The samples are flattened to 2-D for SVMSMOTE and reshaped back to the
    original per-sample shape afterwards. The sorted class counts after
    resampling are printed.

    Args:
        x_train: array whose first axis indexes the samples.
        y_train: labels, one per sample.

    Returns:
        ``(x_resampled, y_resampled)``, with ``x_resampled`` in the original
        per-sample shape.
    """
    from imblearn.over_sampling import SVMSMOTE

    n_samples = x_train.shape[0]
    sample_shape = tuple(x_train.shape[1:])

    # The sampler needs one flat feature vector per sample.
    flat_x = np.reshape(x_train, (n_samples, -1))
    sampler = SVMSMOTE(sampling_strategy='not majority', n_jobs=8)
    flat_resampled, y_resampled = sampler.fit_resample(flat_x, y_train)
    print(sorted(Counter(y_resampled).items()))

    # Restore the per-sample shape; only the sample count has changed.
    restored_shape = (flat_resampled.shape[0],) + sample_shape
    x_resampled = np.reshape(flat_resampled, restored_shape)
    return x_resampled, y_resampled
def over_sample(X, y, sampler="SMOTE"):
    """Resample ``(X, y)`` with the over-sampler registered under ``sampler``.

    Args:
        X: feature matrix.
        y: labels.
        sampler: one of ``"RandomOverSampler"``, ``"ADASYN"``, ``"SMOTE"``,
            ``"BorderlineSMOTE"``, ``"SVMSMOTE"`` or ``"SMOTENC"``.

    Returns:
        ``(X_resampled, y_resampled)``.

    Raises:
        KeyError: if ``sampler`` is not a registered name.
    """
    available = {
        "RandomOverSampler": RandomOverSampler(),
        "ADASYN": ADASYN(),
        "SMOTE": SMOTE(),
        "BorderlineSMOTE": BorderlineSMOTE(),
        "SVMSMOTE": SVMSMOTE(),
        "SMOTENC": SMOTENC(categorical_features=[]),
    }
    chosen = available[sampler]
    # Every sampler exposes the same fit_resample entry point.
    X_resampled, y_resampled = chosen.fit_resample(X, y)
    return X_resampled, y_resampled
def runSmote(X, y, algorithm='default', split_synthetic=False, verbose=True):
    """Oversample ``(X, y)`` with the selected SMOTE variant.

    Args:
        X: feature matrix.
        y: binary labels; the counts of ``y == 1`` and ``y == 0`` determine
            how many trailing rows are reported as synthetic.
        algorithm: ``'Borderline'``, ``'KMeans'``, ``'SVM'`` or ``'Tomek'``;
            any other value selects plain SMOTE.
        split_synthetic: when true, return the original data and the
            synthetic rows separately.
        verbose: when true, log progress and data set sizes.

    Returns:
        ``(X, y, synthetic_X, synthetic_y)`` when ``split_synthetic`` is
        true, otherwise ``(X_novo, y_novo, None, None)``.

    Note:
        ``random_state`` and ``log`` are read from the enclosing module
        scope.
    """
    if verbose:
        log.info("Data before oversampling")
        log.info("Dataset: {0}, {1}".format(X.shape, len(y)))

    n_casos = np.count_nonzero(y == 1)
    n_controles = np.count_nonzero(y == 0)
    # Class-size gap, used as the number of trailing synthetic rows.
    N = abs(n_casos - n_controles)

    if algorithm == 'Borderline':
        if verbose:
            log.info("Running Borderline Smote")
        X_novo, y_novo = BorderlineSMOTE(
            random_state=random_state).fit_resample(X, y)
    elif algorithm == 'KMeans':
        if verbose:
            log.info("Running KMeans Smote")
        X_novo, y_novo = KMeansSMOTE(
            random_state=random_state,
            kmeans_estimator=KMeans(n_clusters=20)).fit_resample(X, y)
    elif algorithm == 'SVM':
        if verbose:
            log.info("Running SVM Smote")
        X_novo, y_novo = SVMSMOTE(random_state=random_state).fit_resample(X, y)
    elif algorithm == 'Tomek':
        if verbose:
            log.info("Running Smote Tomek")
        X_novo, y_novo = SMOTETomek(random_state=random_state).fit_resample(
            X, y)
    else:
        if verbose:
            log.info("Running default Smote")
        X_novo, y_novo = SMOTE(random_state=random_state).fit_resample(X, y)

    if verbose:
        log.info("Data after oversampling")
        log.info("Dataset: {0}, {1}".format(X_novo.shape, len(y_novo)))

    if split_synthetic:
        # Slice from an explicit start index: ``[-N:]`` with N == 0 (already
        # balanced classes) would return the whole data set rather than an
        # empty synthetic part.
        start = max(len(y_novo) - N, 0)
        synthetic_X = X_novo[start:]
        synthetic_y = y_novo[start:]
        return X, y, synthetic_X, synthetic_y

    return X_novo, y_novo, None, None
def test_svm_smote(data):
    """Default SVMSMOTE must match one configured with explicit estimators.

    The second sampler receives ``NearestNeighbors`` objects for
    ``k_neighbors`` / ``m_neighbors`` and an explicit ``SVC``; both samplers
    share the same seed and are expected to produce identical output.
    """
    default_sampler = SVMSMOTE(random_state=42)
    explicit_sampler = SVMSMOTE(
        random_state=42,
        k_neighbors=NearestNeighbors(n_neighbors=6),
        m_neighbors=NearestNeighbors(n_neighbors=11),
        svm_estimator=SVC(gamma='scale', random_state=42),
    )

    X_default, y_default = default_sampler.fit_resample(*data)
    X_explicit, y_explicit = explicit_sampler.fit_resample(*data)

    assert_allclose(X_default, X_explicit)
    assert_array_equal(y_default, y_explicit)
def get_oversampling_models():
    """Return the over-samplers to compare, with their short names.

    Returns:
        ``(models, names)``: two parallel lists holding a default-configured
        sampler instance and its label, in the order ROS, SMOTE, BLSMOTE,
        SVMSMOTE, ADASYN.
    """
    catalogue = [
        ('ROS', RandomOverSampler()),
        ('SMOTE', SMOTE()),
        ('BLSMOTE', BorderlineSMOTE()),
        ('SVMSMOTE', SVMSMOTE()),
        ('ADASYN', ADASYN()),
    ]
    models = [model for _, model in catalogue]
    names = [name for name, _ in catalogue]
    return models, names