def _fit_resample(self, X, y):
    """Over-sample ``X``/``y`` after bucketing a continuous target by z-score.

    The target is standardised, every sample receives a surrogate class
    label derived from the configured thresholds, and a
    ``RandomOverSampler`` is fitted on those surrogate labels. The indices
    selected by the sampler are then used to index the *original* ``X`` and
    ``y`` (the surrogate labels are never returned).

    Surrogate labels:
        * ``0``  -- z-score ``>  self.negative_thres``
        * ``1``  -- z-score ``<= self.positive_thres`` (wins if both match,
          because it is assigned last)
        * ``-1`` -- everything else

    Parameters
    ----------
    X : array-like exposing ``shape``
        Feature matrix; only ``X.shape[0]`` is read directly here.
    y : array-like exposing ``mean()`` / ``std()`` and arithmetic
        Continuous target values.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``; ``sample_indices`` is appended as a
        third element when ``self.return_indices`` is truthy.

    Side effects
    ------------
    Sets ``self.sample_indices_`` and prints the class counts before
    sampling together with the shape of the selected index array.
    """
    n_samples = X.shape[0]

    # convert y to z_score
    y_z = (y - y.mean()) / y.std()

    # Build the masks before y_z is overwritten with surrogate labels.
    # Boolean masks replace the former per-element ``x not in array`` scan,
    # which was quadratic in the number of samples.
    negative_mask = np.asarray(y_z > self.negative_thres)
    positive_mask = np.asarray(y_z <= self.positive_thres)
    unclassified_mask = ~(negative_mask | positive_mask)

    # Same assignment order as before: positives override negatives.
    y_z[negative_mask] = 0
    y_z[positive_mask] = 1
    y_z[unclassified_mask] = -1

    ros = RandomOverSampler(
        sampling_strategy=self.sampling_strategy,
        random_state=self.random_state,
        ratio=self.ratio)
    # Only the chosen indices matter; the resampled arrays are discarded.
    ros.fit_resample(X, y_z)
    sample_indices = ros.sample_indices_

    print("Before sampler: %s. Total after: %s" %
          (Counter(y_z), sample_indices.shape))

    self.sample_indices_ = np.array(sample_indices)

    X_resampled = safe_indexing(X, sample_indices)
    y_resampled = safe_indexing(y, sample_indices)
    if self.return_indices:
        return (X_resampled, y_resampled, sample_indices)
    return (X_resampled, y_resampled)
def test_multiclass_fit_resample():
    """Check that each of three classes ends up with five samples."""
    target = Y.copy()
    # Relabel two samples to create a third class.
    for position in (5, 6):
        target[position] = 2

    sampler = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, target)

    per_class = Counter(y_resampled)
    for label in (0, 1, 2):
        assert per_class[label] == 5
def test_random_over_sampling_heterogeneous_data():
    """Check over-sampling of an object-dtype array mixing str/int/float.

    With targets ``[0, 0, 1]`` the test expects four rows after
    resampling, an unchanged ``object`` dtype, and a last row whose first
    column is one of the original string values.
    """
    # ``np.object`` was a deprecated alias (removed in NumPy 1.24); the
    # builtin ``object`` is the supported spelling and works on all versions.
    X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
                        dtype=object)
    y = np.array([0, 0, 1])
    ros = RandomOverSampler(random_state=RND_SEED)
    X_res, y_res = ros.fit_resample(X_hetero, y)

    assert X_res.shape[0] == 4
    assert y_res.shape[0] == 4
    assert X_res.dtype == object
    assert X_res[-1, 0] in X_hetero[:, 0]
def test_ros_fit_resample_half():
    """Compare a dict ``sampling_strategy`` run against fixed arrays."""
    expected_X = np.array([
        [0.04352327, -0.20515826],
        [0.92923648, 0.76103773],
        [0.20792588, 1.49407907],
        [0.47104475, 0.44386323],
        [0.22950086, 0.33367433],
        [0.15490546, 0.3130677],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.13347175, 0.12167502],
        [0.094035, -2.55298982],
    ])
    expected_y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1])

    # Requested number of samples per class.
    targets_per_class = {0: 3, 1: 7}
    sampler = RandomOverSampler(
        sampling_strategy=targets_per_class, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    assert_allclose(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_random_over_sampling_return_indices():
    """Check the three-tuple returned when ``return_indices=True``.

    Besides comparing the resampled arrays with fixed expectations, the
    test verifies that the unique returned indices are exactly
    ``0 .. len(X) - 1``.
    """
    expected_X = np.array([
        [0.04352327, -0.20515826],
        [0.92923648, 0.76103773],
        [0.20792588, 1.49407907],
        [0.47104475, 0.44386323],
        [0.22950086, 0.33367433],
        [0.15490546, 0.3130677],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.13347175, 0.12167502],
        [0.094035, -2.55298982],
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
    ])
    expected_y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])

    sampler = RandomOverSampler(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, sample_indices = sampler.fit_resample(X, Y)

    assert_allclose(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)

    distinct_indices = np.sort(np.unique(sample_indices))
    assert_array_equal(distinct_indices, np.arange(len(X)))
SEED=0xDEADBEEF y_col = 'add' X_cols = ['pct_contrib','turnover','VWAP','vol','VWMC','SPTSXComp'] all_cols = X_cols + [y_col] X = filtered[X_cols] y = filtered[y_col] X_test, X_train, y_test, y_train = sk.model_selection.train_test_split(X.values, y.values, test_size=0.2, random_state=SEED) filtered[all_cols].to_sql('model_inputs', conn, if_exists='replace', index=False) #oversampler = SMOTE(random_state=SEED) oversampler = RandomOverSampler(random_state=SEED) X_train_resample, y_train_resamle = oversampler.fit_resample(X_train, y_train) print(len(X_train), len(X_test)) #log_clf = LogisticRegression()# #log_clf = RandomForestClassifier() #log_clf = xgb.XGBClassifier(max_depth=4, min_child_weight=50, learning_rate=0.01, n_estimators=50, gamma=1) log_clf = svm.LinearSVC() ##LogisticRegression() log_clf.fit(X_train_resample, y_train_resamle) print(log_clf.score(X_train, y_train)) y_pred = log_clf.predict(X_test) try: y_pred_prob = log_clf.predict_proba(X_test)
# Under-sample with the float ``sampling_strategy`` and report the result.
X_res, y_res = rus.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an under-sampling method: \n '
      'sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# For **over-sampling methods**, it corresponds to the ratio
# :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{M}`
# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the
# minority class after resampling and the number of samples in the majority
# class, respectively.

ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an over-sampling method: \n '
      'sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# ``sampling_strategy`` has a ``str``
# ...................................
#
# ``sampling_strategy`` can be given as a string which specifies the class
# targeted by the resampling. With under- and over-sampling, the number of
# samples will be equalized.
#
# Note that we are using multiple classes from now on.
# Evaluate the previously fitted model on the held-out test set.
y_pred = logreg.predict(X_test)
print('Accuracy :{0:0.5f}'.format(accuracy_score(y_test , y_pred)))
print('AUC : {0:0.5f}'.format(roc_auc_score(y_test , y_pred)))
print('Precision : {0:0.5f}'.format(precision_score(y_test , y_pred)))
print('Recall : {0:0.5f}'.format(recall_score(y_test , y_pred)))
print('F1 : {0:0.5f}'.format(f1_score(y_test , y_pred)))

# Section marker string kept verbatim (its text means "Oversampling").
"""# オーバーサンプリング"""

from imblearn.over_sampling import RandomOverSampler
print('Original dataset shape %s' % Counter(y))

# Split BEFORE resampling. Random over-sampling duplicates minority rows, so
# resampling the full data set first would place copies of the same row in
# both the training and the test partition and inflate every metric below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=my_random_state)

ros = RandomOverSampler(random_state=my_random_state)
X_res2, y_res2 = ros.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res2))
# Keep the established names: the training partition is now the balanced
# one, the test partition keeps its original class distribution.
X_train, y_train = X_res2, y_res2

# Oversampling with Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy :{0:0.5f}'.format(accuracy_score(y_test , y_pred)))
print('AUC : {0:0.5f}'.format(roc_auc_score(y_test , y_pred)))
print('Precision : {0:0.5f}'.format(precision_score(y_test , y_pred)))
print('Recall : {0:0.5f}'.format(recall_score(y_test , y_pred)))
# Tail of a function whose header lies above this excerpt (presumably
# `switch_val`, which is applied just below -- TODO confirm).
        return 1
    else:
        return 0

# Recode the 'Approval' column through switch_val.
df['Approval'] = df['Approval'].apply(switch_val)

## TRAIN/TEST SPLIT
# NOTE(review): 'Approval' is recoded on `df`, but features and target are
# taken from `dtm` -- confirm that `dtm` carries the recoded column.
X = dtm.loc[:, dtm.columns != 'Approval']
y = dtm['Approval']
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=100)

## RESAMPLE DATASET
# Only the training partition is oversampled; test_X/test_y stay untouched.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(train_X, train_y)

## SGDClassifier GridSearch
# Only the uncommented keys are searched; the others are kept for reference.
grid = {
    #'alpha': [1e-4, 1e-3, 1e-2, 1e-1],
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    #'l1_ratio': [0.15, 0.30, 0.45, 0.60, 0.75, 0.90]
    #'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    #'class_weight': ['balanced']
}
paramGrid = ParameterGrid(grid)

# Fit on the resampled training data, score on the original test partition.
bestModel, bestScore, allModels, allScores = pf.bestFit(SGDClassifier, paramGrid, X_resampled, y_resampled, test_X, test_y, metric=roc_auc_score, scoreLabel="AUC", n_jobs=1)
#scaling sdscaler = StandardScaler() sdscaler.fit(data) sdscaler_data = sdscaler.transform(data) sdscaler_pd = pd.DataFrame(sdscaler_data, columns=data.columns) #성능 비교를 위한 test set설정 X_train, X_test, Y_train, Y_test = train_test_split(sdscaler_pd, label, test_size=0.1, shuffle=True, random_state=5) ros = RandomOverSampler(random_state=2019) rus = RandomUnderSampler(random_state=2019) oversampled_data, oversampled_label = ros.fit_resample(X_train, Y_train) undersampled_data, undersampled_label = rus.fit_resample(X_train, Y_train) oversampled_data = pd.DataFrame(oversampled_data, columns=data.columns) undersampled_data = pd.DataFrame(undersampled_data, columns=data.columns) print('원본 데이터의 클래스 비율\n{}'.format(pd.get_dummies(Y_train).sum())) print('\nOversampled_data 클래스 비율 \n{}'.format( pd.get_dummies(oversampled_label).sum())) print('\nUndersampled_data 클래스 비율 결과 \n{}'.format( pd.get_dummies(undersampled_label).sum())) #성능 비교 def train_and_test(model, X_train, Y_train, X_test, Y_test): model.fit(X_train, Y_train) pred = model.predict(X_test)
####### X_ = np.array(feature)[:, 1:] X = scale(X_) y = np.array(label.values.ravel()) # choose the method option = sys.argv[1] # the input sequence file = sys.argv[2] if (option == "1"): #Random over sampling method ros = RandomOverSampler(random_state=0) X_resampled, y_resampled = ros.fit_resample(X, y) csv_X = pd.DataFrame(data=X_resampled) csv_y = pd.DataFrame(data=y_resampled) csv_X.to_csv('ros_feature.csv', header=False, index=False) csv_y.to_csv('ros_label.csv', header=False, index=False) if (option == "2"): #SMOTE method X_resampled, y_resampled = SMOTE().fit_resample(X, y) csv_X = pd.DataFrame(data=X_resampled) csv_y = pd.DataFrame(data=y_resampled) csv_X.to_csv('ros_feature.csv', header=False, index=False) csv_y.to_csv('ros_label.csv', header=False, index=False) if (option == "3"): #ADASYN method
base_col_names = col_names[0:13]  # for baseline model: bank data + morning/noon/evening data
df_fillna = df.fillna(0)  # fill NA with 0: no spending is counted as 0

X = df_fillna[col_names]
y = df_fillna.default_geq_1  # Target variable
X_base = df_fillna[base_col_names]
y_base = df_fillna.default_geq_1  # Target variable

random_state = 1234
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=random_state)
# With random_state=None (the default) a random seed is picked, so every run
# produces a different split. Passing the same random_state lets anyone who
# re-runs the code obtain exactly the same split and reproduce the results.
#
# The baseline split now receives the same seed. Previously it had none, so
# the baseline model was evaluated on a different, non-reproducible set of
# rows than the full model. Both frames come from df_fillna and have the
# same number of rows, so an identical seed selects the same rows for both.
X_base_train, X_base_test, y_base_train, y_base_test = train_test_split(
    X_base, y_base, test_size=0.30, random_state=random_state)

# Oversample the training partitions only; the test partitions are untouched.
ros = RandomOverSampler(random_state=0)
X_train, y_train = ros.fit_resample(X_train, y_train)
X_base_train, y_base_train = ros.fit_resample(X_base_train, y_base_train)

min_max_scaler = MinMaxScaler()
#X_train = min_max_scaler.fit_transform(X_train)
#X_test = min_max_scaler.fit_transform(X_test)
#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#X_test = sc.fit_transform(X_test)
#numerical_columns=['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle', 'homeOwnership']

#Specifying the parameter
n_estimators = 100
learning_rate = 0.1
max_depth = 6
num_leaves = 16
pyplot.savefig("Plot_PRT.jpeg", dpi = 800, facecolor = "white") ##### Rebalance data with over-sampling ------------------------------------------------------------------- # Fit to training data and evaluate performance on test data (no rebalancing) clfLrgC = LogisticRegression(solver = "lbfgs", class_weight = {0:0.01, 1:0.99}) model_cv_bal(clfLrgC, 1000) # Oversample minority class at 1:1 ratio # This block of code does not influence models, and just shows how over-sampling works os = RandomOverSampler(sampling_strategy = "minority") os_x, os_y = os.fit_resample(train_x, train_y) print(Counter(train_y)) print(Counter(os_y)) # Fit to training data and evaluate performance on test data (1:1 rebalancing) pipeline = Pipeline([("samp", RandomOverSampler(sampling_strategy = "minority")), ("model", LogisticRegression(solver = "lbfgs", class_weight = {0:0.01, 1:0.99}))]) model_cv_bal(pipeline, 1000) # Oversample minority class at 1:4 ratio # This block of code does not influence models, and just shows how over-sampling works os = RandomOverSampler(sampling_strategy = 0.25) os_x, os_y = os.fit_resample(train_x, train_y) print(Counter(train_y)) print(Counter(os_y))
def oversampling(x, lbl, random_state=87):
    """Randomly over-sample ``x``/``lbl`` and shuffle the result.

    Parameters
    ----------
    x : array-like
        Feature data passed to ``RandomOverSampler.fit_resample``.
    lbl : array-like
        Labels passed to ``RandomOverSampler.fit_resample``.
    random_state : int, optional
        Seed used for both the sampler and the shuffle. Defaults to 87,
        the value that was previously hard-coded, so existing callers get
        identical results.

    Returns
    -------
    The resampled features and labels, shuffled consistently, exactly as
    returned by ``sklearn.utils.shuffle``.
    """
    # apply imblearn pack.
    from imblearn.over_sampling import RandomOverSampler
    from sklearn.utils import shuffle

    ros = RandomOverSampler(random_state=random_state)
    x_res, y_res = ros.fit_resample(x, lbl)
    # The sampler's output is shuffled before being returned.
    return shuffle(x_res, y_res, random_state=random_state)
# NOTE(review): the next three statements look like the tail of a
# per-document loop whose header lies above this excerpt -- confirm their
# indentation against the surrounding code.
cln = [
    lem.lemmatize(word) for word in cln
    if word not in stopwords.words('english')
]
cln = ' '.join(cln)
corpus.append(cln)

# Vectorise the cleaned corpus and one-hot encode the target.
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
X = vec.fit_transform(corpus).toarray()
Y = pd.get_dummies(Y, drop_first=True)

# Split BEFORE oversampling. Random over-sampling duplicates minority rows;
# resampling the whole data set first would put copies of the same row in
# both partitions and make the report below overly optimistic.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)

# The sampler was previously bound to the name `os`, which shadows the
# standard-library module of that name; it is now called `ros`.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
X_train, Y_train = ros.fit_resample(X_train, Y_train)

from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train, Y_train)
pred = mnb.predict(X_test)

# Evaluate on the untouched test partition.
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(Y_test, pred))
print("\n")
print(classification_report(Y_test, pred))

filename = 'spam-classifier.pkl'
def main(n_comp, n_samples_per_cat, Threshold_prob):
    """Run the load -> vectorise -> oversample -> reduce -> classify pipeline.

    Arguments:
        n_comp -- forwarded to dimensionality_reduction
        n_samples_per_cat -- forwarded to load_data and
            cleaning_data_from_topics
        Threshold_prob -- forwarded to classification

    Returns:
        (acc_valid, acc_train, seconds) where acc_valid / acc_train come
        from efficiency_estimation and seconds is the time.time() value
        captured when the function started (not an elapsed duration).

    Side effects:
        Reads 'Simplified_smuto.pkl', writes then re-reads 'Estimated.pkl',
        and prints progress and class-distribution messages.
    """
    # Start timestamp; returned unchanged at the end.
    seconds = time.time()
    file_test = "bridged_10k.csv"
    file_train = "train.csv"
    file_valid = "valid.csv"
    # presumably load_data produces 'Simplified_smuto.pkl', which is read
    # right below -- TODO confirm against load_data
    load_data(file_test, file_train, file_valid, n_samples_per_cat)
    with open('Simplified_smuto.pkl', 'rb') as fl:
        tt = pickle.load(fl)
    X_train, Y_train, X_test, Y_test, X_valid, Y_valid, Training_map, Validation_map = tt
    X_train, Y_train, X_valid, Y_valid, Training_map, Validation_map = cleaning_data_from_topics(
        X_train, Y_train, X_valid, Y_valid, Training_map, Validation_map, n_samples_per_cat)
    # Validation = Part of training !!!
    # Validation_map = Training_map
    # X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, random_state=42)

    # Vectorization
    model = load_Glove_Model()
    X_train_vect, X_train, Y_train = vect(X_train, Y_train, model)
    X_test_vect, X_test, Y_test = vect(X_test, Y_test, model)
    X_valid_vect, X_valid, Y_valid = vect(X_valid, Y_valid, model)

    # Making vectors positive
    # A constant offset is added to every vector (NMF is used further down;
    # presumably it needs non-negative input -- TODO confirm that 3.1 is a
    # sufficient bound for the embeddings in use).
    min_val = 3.1
    X_train_vect = [x + min_val for x in X_train_vect]
    X_test_vect = [x + min_val for x in X_test_vect]
    X_valid_vect = [x + min_val for x in X_valid_vect]
    print('Vectorization is done')

    # # SMOTE
    # sm = SMOTE(random_state=42)
    # X_train_vect, Y_train = sm.fit_resample(X_train_vect, Y_train)
    # Random over-sampling of the training vectors only. The messages below
    # still say "Smote" although RandomOverSampler is what actually runs.
    ros = RandomOverSampler(random_state=42)
    X_train_vect, Y_train = ros.fit_resample(X_train_vect, Y_train)
    unique, counts = np.unique(Y_train, return_counts=True)
    print('Train data distribution after Smote:{}'.format(
        dict(zip(np.array(unique), np.array(counts)))))
    # The validation data is not resampled; this only reports its counts.
    unique, counts = np.unique(Y_valid, return_counts=True)
    print('Valid data distribution after Smote:{}'.format(
        dict(zip(np.array(unique), np.array(counts)))))

    # Dimensionality reduction using NMF
    X_train_vect, X_test_vect, X_valid_vect = dimensionality_reduction(
        X_train_vect, X_test_vect, X_valid_vect, Y_train, n_comp)
    print('Dimensionality reduction is done')

    # Classification using SVC
    Y_pred, Y_train_pred, Y_valid_pred, Y_pred_prob, Y_train_pred_prob, Y_valid_pred_prob = classification(
        X_train_vect, X_test_vect, X_valid_vect, Y_train, Threshold_prob)

    # Persist all intermediate results, then immediately read them back.
    data_total = [
        X_train, Y_train, X_test, Y_test, X_valid, Y_valid, Y_pred,
        Y_train_pred, Y_valid_pred, Y_pred_prob, Y_train_pred_prob,
        Y_valid_pred_prob, Training_map, Validation_map
    ]
    with open('Estimated.pkl', 'wb') as fs:
        pickle.dump(data_total, fs)
    with open('Estimated.pkl', 'rb') as fl:
        tt = pickle.load(fl)
    X_train, Y_train, X_test, Y_test, X_valid, Y_valid, Y_pred, Y_train_pred, Y_valid_pred,\
        Y_pred_prob, Y_train_pred_prob, Y_valid_pred_prob, Training_map, Validation_map = tt
    # print(np.unique(Y_train))
    # Diction = keys_output(Y_test, Y_pred, Y_pred_prob, Threshold=0.026)
    # final_excel(Diction)

    acc_valid, acc_train = efficiency_estimation(Y_valid_pred, Y_train_pred,
                                                 X_valid, X_train,
                                                 Training_map, Validation_map)

    unique, counts = np.unique(Y_valid, return_counts=True)
    print('Validation data distribution:{}'.format(
        dict(zip(np.array(unique), np.array(counts)))))

    # Placeholder string with one character per prediction, passed to
    # flattening() together with the predictions; only the flattened
    # predictions are used afterwards.
    X = "i" * len(Y_valid_pred)
    X, Y_valid_pred = flattening(X, Y_valid_pred)
    unique, counts = np.unique(Y_valid_pred, return_counts=True)
    print('Predicted Validation data distribution:{}'.format(
        dict(zip(np.array(unique), np.array(counts)))))
    print('Predicted categories amount: ', len(Y_valid_pred))
    print('True categories amount: ', len(Y_valid))

    # Binarise true and predicted validation labels with a binarizer fitted
    # on the true labels. The results feed only the commented-out confusion
    # matrix below and are not returned.
    mlb = MultiLabelBinarizer()
    mlb = mlb.fit(Y_valid)
    Y_valid = mlb.transform(Y_valid)
    Y_valid_pred = mlb.transform(Y_valid_pred)
    # matrix = confusion_matrix(Y_valid.argmax(axis =1), Y_valid_pred.argmax(axis =1))
    # print('Validation confusion matrix:'.format(matrix))
    return acc_valid, acc_train, seconds
# # 11.7 过采样和欠采样 # 11.7.1 过采样 import pandas as pd data = pd.read_excel("信用卡数据.xlsx") data.head() X = data.drop(columns='分类') y = data['分类'] from collections import Counter Counter(y) # (1)随机过采样 from imblearn.over_sampling import RandomOverSampler ros = RandomOverSampler(random_state=0) X_oversampled, y_oversampled = ros.fit_resample(X, y) print(Counter(y_oversampled)) print(X_oversampled.shape) # (2)SMOTE过采样 from imblearn.over_sampling import SMOTE smote = SMOTE(random_state=0) X_smotesampled, y_smotesampled = smote.fit_resample(X, y) print(Counter(y_smotesampled)) # 11.7.2 欠采样 from imblearn.under_sampling import RandomUnderSampler rus = RandomUnderSampler(random_state=0)
print(__doc__) # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=200, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random over-sampling ros = RandomOverSampler() X_resampled, y_resampled = ros.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1], label="Class #1", alpha=.5)
class DataProcessor:
    """Builder-style preprocessing helper for train / validation / test data.

    Refactor of processing.py: the scaler is fitted once on the training
    data and then reused when other datasets are scaled.
    """

    def __init__(self, train_data, **kwargs):
        """Store the training data with its ID column removed.

        Arguments:
            train_data -- DataFrame holding the full, unprocessed training
                data
            labels -- optional keyword; when given it is written into the
                target column, for callers that keep data and labels apart
        """
        self.target = 'target'
        self.id_column = "ID_code"
        self.scaler = None
        self.sampler = None
        self.train_data = self.__remove_columns(train_data, self.id_column)
        if "labels" in kwargs:
            self.train_data[self.target] = kwargs["labels"]

    def with_scaling(self):
        """Enable scaling; the scaler is fitted on the training features.

        Returns:
            This processor, so calls can be chained
        """
        scaler = StandardScaler()
        self.scaler = scaler
        features, _ = self.__xy_split(self.train_data)
        scaler.fit(features)
        return self

    def with_undersampling(self, seed=0):
        """Enable random under-sampling, replacing any sampler set before.

        Returns:
            This processor, so calls can be chained
        """
        self.sampler = RandomUnderSampler(random_state=seed)
        return self

    def with_oversampling(self, seed=0):
        """Enable random over-sampling, replacing any sampler set before.

        Returns:
            This processor, so calls can be chained
        """
        self.sampler = RandomOverSampler(random_state=seed)
        return self

    def process_train(self):
        """Apply the enabled steps (resampling, then scaling) to the train set.

        Returns:
            Processed feature data + array of training labels + input size
            (number of feature columns, excluding target and ID columns)
        """
        features, labels = self.__xy_split(self.train_data)
        feature_names = features.columns
        if self.sampler is not None:
            features, labels = self.sampler.fit_resample(features, labels)
            features = pd.DataFrame(features, columns=feature_names)
            labels = np.array(labels, dtype=np.float64)
        features = self.__scaled(features, feature_names)
        return features, labels, len(feature_names)

    def process_data(self, data, **kwargs):
        """Apply scaling (never resampling) to a labelled dataset.

        Intended for, e.g., the validation part obtained after splitting
        the training data.

        Arguments:
            data -- DataFrame with unprocessed data; must contain the
                labels unless they are passed separately
            labels -- optional keyword; when given it is written into the
                target column

        Returns:
            Processed feature data + array of labels + input size (number
            of feature columns, excluding target and ID columns)
        """
        frame = self.__remove_columns(data, self.id_column)
        if "labels" in kwargs:
            frame[self.target] = kwargs["labels"]
        features, labels = self.__xy_split(frame)
        feature_names = features.columns
        features = self.__scaled(features, feature_names)
        return features, labels, len(feature_names)

    def process_test(self, data):
        """Apply scaling to an unlabelled test dataset (e.g. from kaggle).

        Arguments:
            data -- DataFrame with unprocessed data; it should NOT contain
                any labels

        Returns:
            Processed feature data + None (placeholder for the labels, kept
            for compatibility) + input size (number of feature columns,
            excluding the ID column)
        """
        features = self.__remove_columns(data, self.id_column)
        feature_names = features.columns
        features = self.__scaled(features, feature_names)
        return features, None, len(feature_names)

    def __scaled(self, features, feature_names):
        # Return the features unchanged when scaling is disabled; otherwise
        # transform them and re-wrap the result with the original columns.
        if self.scaler is None:
            return features
        transformed = self.scaler.transform(features)
        return pd.DataFrame(transformed, columns=feature_names)

    def __remove_columns(self, data, columns):
        # drop() returns a new frame here; the caller's DataFrame is untouched.
        return data.drop(columns=columns)

    def __xy_split(self, data):
        # Features are everything but the target; labels become float64.
        features = self.__remove_columns(data, self.target)
        labels = np.array(data[self.target], dtype=np.float64)
        return features, labels