def test_balanced_bagging_classifier_samplers(sampler, n_samples_bootstrap):
    # check that we can pass any kind of sampler to a bagging classifier
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = BalancedBaggingClassifier(
        base_estimator=CountDecisionTreeClassifier(),
        n_estimators=2,
        sampler=sampler,
        random_state=0,
    )
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    # check that the classes are balanced with the right per-class sample
    # counts, depending on the sampling strategy
    assert_array_equal(
        list(clf.estimators_[0][-1].class_counts_.values()), n_samples_bootstrap
    )
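# The test above references a `CountDecisionTreeClassifier` that is not defined
# in this section. A minimal sketch of such a helper (an assumption, not
# necessarily the library's exact test fixture): a DecisionTreeClassifier that
# records the class counts it was fitted on, so the test can inspect how each
# bootstrap sample was balanced.
from collections import Counter
from sklearn.tree import DecisionTreeClassifier

class CountDecisionTreeClassifier(DecisionTreeClassifier):
    """DecisionTreeClassifier that stores the per-class counts seen in fit."""
    def fit(self, X, y, sample_weight=None):
        self.class_counts_ = Counter(y)
        return super().fit(X, y, sample_weight=sample_weight)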
def ranking_borda_BalancedBagging(self):
    rankings = np.zeros(len(self.X.columns),)
    std = np.zeros(len(self.X.columns),)
    for _ in range(self.loops):
        # Split the train/validation set with a seed generated randomly each loop.
        seed = randint(0, 10000)
        X_train, X_fr, y_train, y_fr = train_test_split(
            self.X, self.y, test_size=0.30, random_state=seed)
        # Initialize a balanced bagging ensemble.
        rf = BalancedBaggingClassifier(n_estimators=50, random_state=0)
        # Fit the ensemble and compute the baseline Matthews score.
        rf.fit(X_train, y_train)
        mattheworiginal = matthews_corrcoef(y_fr, rf.predict(X_fr))
        # Initialize two lists to collect values from the inner loop.
        matthewscores = []
        columnsrf = []
        for col in self.X.columns:
            X_train, X_fr, y_train, y_fr = train_test_split(
                self.X, self.y, test_size=0.30, random_state=seed)
            # Drop a different column on each iteration.
            X_train = X_train.drop([col], axis=1)
            X_fr = X_fr.drop([col], axis=1)
            # Refit the ensemble, this time with the training set lacking one feature.
            rf.fit(X_train, y_train)
            matthew = matthews_corrcoef(y_fr, rf.predict(X_fr))
            # Record the dropped column ...
            columnsrf.append(col)
            # ... and the drop (or gain) in Matthews score when the feature was missing.
            matthewscores.append(mattheworiginal - matthew)
        outcome = np.array(list(zip(columnsrf, matthewscores)))
        outcomepd = pd.DataFrame(data=outcome, columns=['Variables', 'r2-punish'])
        outcomepd['ranking'] = outcomepd['r2-punish'].rank(ascending=False)
        rankings = np.add(outcomepd['ranking'].to_numpy(), rankings)
        # Stack each loop's rankings vertically to get a 2d numpy array.
        std = np.vstack((outcomepd['ranking'].to_numpy(), std))
    std = np.delete(std, -1, axis=0)
    std = np.std(std, axis=0)
    std = np.dstack((columnsrf, std))
    featuresranks = np.dstack((columnsrf, rankings))
    std = pd.DataFrame(data=np.squeeze(std, axis=0), columns=['Categories', 'STD'])
    borda = pd.DataFrame(data=np.squeeze(featuresranks, axis=0),
                         columns=['Categories', 'Borda-Score'])
    borda = borda.merge(std, on='Categories')
    borda['Borda-Score'] = pd.to_numeric(borda['Borda-Score'])
    borda['Borda-Average'] = borda['Borda-Score'] / self.loops
    borda['ranking'] = borda['Borda-Score'].rank(ascending=True)
    borda.sort_values(by='Borda-Score', inplace=True)
    return borda
def ranking_by_matthew_punishment_rf(self):
    std = np.zeros(len(self.X.columns),)
    rankings = np.zeros(len(self.X.columns),)
    for _ in range(self.loops):
        # Split the train/validation set with a seed generated randomly each loop.
        seed = randint(0, 10000)
        X_train, X_fr, y_train, y_fr = train_test_split(
            self.X, self.y, test_size=0.30, random_state=seed)
        # Initialize a balanced bagging ensemble.
        rf = BalancedBaggingClassifier(n_estimators=50, random_state=0)
        # Fit the ensemble and compute the baseline Matthews score.
        rf.fit(X_train, y_train)
        r2original = matthews_corrcoef(y_fr, rf.predict(X_fr))
        # Initialize two lists to collect values from the inner loop.
        r2fr = []
        columnsrf = []
        for col in self.X.columns:
            X_train, X_fr, y_train, y_fr = train_test_split(
                self.X, self.y, test_size=0.30, random_state=seed)
            # Drop a different column on each iteration.
            X_train = X_train.drop([col], axis=1)
            X_fr = X_fr.drop([col], axis=1)
            # Refit the ensemble, this time with the training set lacking one feature.
            rf.fit(X_train, y_train)
            r2 = matthews_corrcoef(y_fr, rf.predict(X_fr))
            # Record the dropped column ...
            columnsrf.append(col)
            # ... and the drop (or gain) in Matthews score when the feature was missing.
            r2fr.append(r2original - r2)
        outcome = np.array(r2fr)
        rankings = np.add(outcome, rankings)
        std = np.vstack((outcome, std))
    rankings = np.true_divide(rankings, self.loops)
    std = np.delete(std, -1, axis=0)
    std = np.std(std, axis=0)
    std = np.dstack((columnsrf, std))
    std = pd.DataFrame(data=np.squeeze(std, axis=0),
                       columns=['Categories', 'SD_of_matt_punishment'])
    featuresranks = np.dstack((columnsrf, rankings))
    borda = pd.DataFrame(data=np.squeeze(featuresranks, axis=0),
                         columns=['Categories', 'average-mtt-punishment'])
    borda['ranking'] = borda['average-mtt-punishment'].rank(ascending=False)
    borda = borda.merge(std, on='Categories')
    borda.sort_values(by='average-mtt-punishment', inplace=True, ascending=False)
    return borda
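# The two methods above implement drop-column (leave-one-feature-out)
# importance, retraining the ensemble once per feature per loop. A cheaper,
# related alternative (a sketch, not part of the class above) is
# scikit-learn's permutation_importance, which shuffles one feature at a time
# on the validation set instead of retraining:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import train_test_split

def permutation_ranking(X, y, random_state=0):
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.30, random_state=random_state)
    clf = BalancedBaggingClassifier(n_estimators=50, random_state=0)
    clf.fit(X_train, y_train)
    result = permutation_importance(
        clf, X_val, y_val, scoring=make_scorer(matthews_corrcoef),
        n_repeats=10, random_state=random_state)
    # importances_mean / importances_std play the role of the Matthews
    # "punishment" and its standard deviation in the methods above.
    return result.importances_mean, result.importances_std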
def test_single_estimator():
    # Check singleton ensembles.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf1 = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=0,
    ).fit(X_train, y_train)
    clf2 = make_pipeline(
        RandomUnderSampler(
            random_state=clf1.estimators_[0].steps[0][1].random_state),
        KNeighborsClassifier(),
    ).fit(X_train, y_train)
    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
def balanced_bragging(X_train, y_train, X_test, y_test, X_train_res, y_train_res):
    bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    bagging.fit(X_train, y_train.values.ravel())
    y_train_bc = bagging.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_bc)
    # Note: this quantity is the recall (sensitivity) of class 1, not overall accuracy.
    without = 100 * cnf_matrix_tra[1, 1] / (cnf_matrix_tra[1, 0] + cnf_matrix_tra[1, 1])
    print("Unbalanced (bagging): {}%".format(without))
    print(cnf_matrix_tra[0, 0], cnf_matrix_tra[1, 1])

    bagging_oversampling = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    bagging_oversampling.fit(X_train_res, y_train_res.ravel())
    y_train_bc = bagging_oversampling.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_bc)
    with_oversampling = 100 * cnf_matrix_tra[1, 1] / (cnf_matrix_tra[1, 0] + cnf_matrix_tra[1, 1])
    print("With oversampling (bagging): {}%".format(with_oversampling))
    print(cnf_matrix_tra[0, 0], cnf_matrix_tra[1, 1])

    balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    balanced_bagging.fit(X_train, y_train.values.ravel())
    y_train_bbc = balanced_bagging.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_bbc)
    within = 100 * cnf_matrix_tra[1, 1] / (cnf_matrix_tra[1, 0] + cnf_matrix_tra[1, 1])
    print("Balanced (bagging): {}%".format(within))
    print(cnf_matrix_tra[0, 0], cnf_matrix_tra[1, 1])

    objects = ('Bagging', 'Bagging with SMOTE oversampling',
               'Bagging with random undersampling')
    y_pos = np.arange(len(objects))
    performance = [without, with_oversampling, within]
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Accuracy percent')
    plt.title('Bagging accuracy')
    plt.show()
    return without, within
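# The row-wise ratio above is just the recall of the positive class; assuming
# binary labels with positive class 1, the equivalent library call is a
# one-liner (sketch, using the names from the function above):
from sklearn.metrics import recall_score
# sensitivity_percent = 100 * recall_score(y_test, y_train_bbc, pos_label=1)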
def Model_3(train, test):
    '''
    Trains the model and saves the predictions in a CSV file.
    train : training set
    test : test set
    '''
    # Preprocessing
    X_train = [DPC(i) for i in train['Sequence']]
    X_test = [DPC(i) for i in test['Sequence']]
    Y_train = train['label']
    # Training
    clf = BalancedBaggingClassifier(
        base_estimator=RandomForestClassifier(
            bootstrap=False, n_estimators=450, random_state=6),
        n_estimators=25,
        n_jobs=-1,
        random_state=6,
        verbose=1)
    clf.fit(X_train, Y_train)
    # Predicting
    Y_pred = clf.predict(X_test)
    Y_prob = [x[1] for x in clf.predict_proba(X_test)]
    result = pd.DataFrame()
    result["ID"] = test["ID"]
    result["Label"] = Y_prob
    result.to_csv("Submission_3.csv", index=False)
    result["Label"] = Y_pred
    result.to_csv("Predictions_3.csv", index=False)
def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
    clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True,
                                    random_state=83)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # modify X to nonsense values, this should not change anything
    X_train += 1.
    assert_warns_message(UserWarning,
                         "Warm-start fitting without increasing n_estimators"
                         " does not",
                         clf.fit, X_train, y_train)
    assert_array_equal(y_pred, clf.predict(X_test))
def imblearn_(classifier, X_train, y_train, X_test, y_test):
    clf = BalancedBaggingClassifier(base_estimator=classifier,
                                    ratio='auto',
                                    random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    printStats(y_test, y_pred)
    return clf, y_pred
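# Note: `ratio` is the old imbalanced-learn parameter name; it was deprecated
# in favour of `sampling_strategy` (around 0.4) and later removed, and
# `base_estimator` was likewise renamed to `estimator` in recent releases.
# A sketch of the same wrapper against a current imbalanced-learn API
# (version-dependent; adjust the keyword names to your installed release):
from imblearn.ensemble import BalancedBaggingClassifier

def imblearn_current(classifier, X_train, y_train, X_test, y_test):
    clf = BalancedBaggingClassifier(estimator=classifier,
                                    sampling_strategy='auto',
                                    random_state=0)
    clf.fit(X_train, y_train)
    return clf, clf.predict(X_test)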
def impute_by_model(df, df_test, impList, classifier):
    # convert ' ?' to NaN so that those values become -1 when transformed to numerical
    df, df_test = unknown_to_NAN(df, df_test)
    # create a new df by dropping all rows having NaN values;
    # used only to build the model for imputation
    dropna_df = df.dropna(how='any').reset_index(drop=True)
    # before converting both df and df_test to numerical, replace the value below
    # with its column mode so that native_country gets the same numerical value
    # for each country (the result must be assigned back; replace is not in-place)
    df['native_country'] = df['native_country'].replace(
        ' Holand-Netherlands', ' United-States')
    # convert to numerical
    num_dropna_df = df2num(dropna_df, headers)
    num_df_test = df2num(df_test, headers)
    num_df = df2num(df, headers)
    # learn the model on the dataset from which rows with missing values were dropped
    Xtr_train = num_dropna_df[impList[0]].values
    ytr_train = num_dropna_df[impList[1]].values
    # column with missing values from the training data, used to impute the training set
    Xtr_test = num_df[impList[0]].values
    # column with missing values from the test data, used to impute the test set
    Xt_test = num_df_test[impList[0]].values
    clf = BalancedBaggingClassifier(base_estimator=classifier,
                                    ratio='auto',
                                    random_state=0)
    clf.fit(Xtr_train, ytr_train)
    # impute training data
    ytr_pred = clf.predict(Xtr_test)
    lst = df.loc[num_df[impList[1]] == -1, impList[1]].index.tolist()
    num_df.loc[lst, impList[1]] = ytr_pred[lst]
    # impute test data
    yt_pred = clf.predict(Xt_test)
    lstt = df_test.loc[num_df_test[impList[1]] == -1, impList[1]].index.tolist()
    num_df_test.loc[lstt, impList[1]] = yt_pred[lstt]
    return df, df_test
class Classifier(BaseEstimator):
    def __init__(self):
        self.reg = BalancedBaggingClassifier(n_estimators=50, random_state=42)

    def fit(self, X, y):
        self.reg.fit(X, y)
        return self  # return self so the estimator chains like any sklearn fit

    def predict(self, X):
        return self.reg.predict(X)
def test_warm_start_equivalence():
    # warm started classifier with 5+5 estimators should be equivalent to
    # one classifier with 10 estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
    clf_ws = BalancedBaggingClassifier(n_estimators=5, warm_start=True,
                                       random_state=3141)
    clf_ws.fit(X_train, y_train)
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X_train, y_train)
    y1 = clf_ws.predict(X_test)
    clf = BalancedBaggingClassifier(n_estimators=10, warm_start=False,
                                    random_state=3141)
    clf.fit(X_train, y_train)
    y2 = clf.predict(X_test)
    assert_array_almost_equal(y1, y2)
def buildModel(X, y):
    # X = np.reshape(X, (X.shape[0], X.shape[1] * X.shape[2]))
    print(X.shape, y.shape)
    scaler = StandardScaler()
    print(scaler.fit(X))
    scaled_train_x = scaler.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        scaled_train_x, y, random_state=19, test_size=0.3)
    bag = BalancedBaggingClassifier(n_estimators=200, random_state=19)
    svm = SVC(class_weight='balanced', random_state=19,
              decision_function_shape='ovo')
    neural = MLPClassifier(max_iter=500, random_state=19, solver='lbfgs',
                           alpha=1e-5, hidden_layer_sizes=(49, 8, 4))
    ada = AdaBoostClassifier(n_estimators=100, random_state=19)
    logistic = LogisticRegression(solver='lbfgs', max_iter=500)
    bag.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    neural.fit(X_train, y_train)
    ada.fit(X_train, y_train)
    logistic.fit(X_train, y_train)
    # joblib.dump(bag, 'bag.pkl')
    # joblib.dump(scaler, 'scaler.pkl')
    y_pred = bag.predict(X_test)
    y_pred2 = svm.predict(X_test)
    y_pred3 = neural.predict(X_test)
    y_pred4 = ada.predict(X_test)
    y_pred5 = logistic.predict(X_test)
    print(matthews_corrcoef(y_test, y_pred))
    print(matthews_corrcoef(y_test, y_pred2))
    print(matthews_corrcoef(y_test, y_pred3))
    print(matthews_corrcoef(y_test, y_pred4))
    print(matthews_corrcoef(y_test, y_pred5))
    print(confusion_matrix(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred2))
    print(confusion_matrix(y_test, y_pred3))
    print(confusion_matrix(y_test, y_pred4))
    print(confusion_matrix(y_test, y_pred5))
    print(classification_report_imbalanced(y_test, y_pred))
    print(classification_report_imbalanced(y_test, y_pred2))
    print(classification_report_imbalanced(y_test, y_pred3))
    print(classification_report_imbalanced(y_test, y_pred4))
    print(classification_report_imbalanced(y_test, y_pred5))
def clf_wrapper(classifier, X_train, y_train, X_test, y_test):
    clf = BalancedBaggingClassifier(base_estimator=classifier,
                                    ratio='auto',
                                    replacement=False,
                                    random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    cfm = confusion_matrix(y_test, y_pred)
    # With sklearn's convention (rows = true labels, columns = predictions),
    # these row-wise ratios are the per-class recalls rather than true
    # predictive values; actual PPV/NPV would divide by the column sums.
    PPV = cfm[0, 0] / (cfm[0, 0] + cfm[0, 1])
    NPV = cfm[1, 1] / (cfm[1, 0] + cfm[1, 1])
    ACR = (cfm[0, 0] + cfm[1, 1]) / (cfm[0, 0] + cfm[1, 1] + cfm[1, 0] + cfm[0, 1])
    return (PPV + NPV + ACR) / 3
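# A sketch of the column-based computation if actual predictive values are
# wanted (assuming class 0 is treated as "positive", as above):
# PPV = cfm[0, 0] / (cfm[0, 0] + cfm[1, 0])  # of everything predicted 0, the fraction truly 0
# NPV = cfm[1, 1] / (cfm[0, 1] + cfm[1, 1])  # of everything predicted 1, the fraction truly 1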
def classifier_imblearn_SVM_training(_X, _Y, _weight):
    X_train, X_test, Y_train, Y_test, w_train, w_test = train_test_split(
        _X, _Y, _weight, test_size=0.2, random_state=0xdeadbeef)
    bbc = BalancedBaggingClassifier(
        base_estimator=SVC(kernel="rbf", gamma="auto"),
        n_estimators=10,
        sampling_strategy="auto",
        max_samples=80,
        replacement=False,
        random_state=0xdeadbeef)
    bbc.fit(X_train, Y_train)
    y_pred = bbc.predict(X_test)
    print("Result from bagging labeled SVM:")
    print("tn, fp, fn, tp =", confusion_matrix(Y_test, y_pred).ravel())
def Model_Building():
    X = pd.read_csv(r'C:\Users\Dell\Desktop\Tookitaki\Train.csv', engine='python')
    Y_train = X['Bad_label'].values
    X.drop(['customer_no', 'Bad_label'], axis=1, inplace=True)
    X_train = X.values
    X = pd.read_csv(r'C:\Users\Dell\Desktop\Tookitaki\Test.csv', engine='python')
    Y_test = X['Bad_label'].values
    X.drop(['customer_no', 'Bad_label'], axis=1, inplace=True)
    X_test = X.values
    imp1.fit(X_train)
    X_train = imp1.transform(X_train).astype(float)
    imp2.fit(X_test)
    X_test = imp2.transform(X_test).astype(float)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    # Transform the test set with the scaler fitted on the training data;
    # refitting on the test set would leak test statistics.
    X_test = scaler.transform(X_test)
    print(X_train.shape)
    bbc = BalancedBaggingClassifier(
        base_estimator=RandomForestClassifier(n_estimators=100),
        ratio='auto',
        replacement=False,
        random_state=0,
        bootstrap_features=False)
    clf = SelectKBest(mutual_info_classif, k=49)
    X_train = clf.fit_transform(X_train, Y_train)
    X_test = clf.transform(X_test)
    bbc.fit(X_train, Y_train)
    y_pred = bbc.predict(X_test)
    print(confusion_matrix(Y_test, y_pred))
    print(classification_report(Y_test, y_pred))
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_pred, pos_label=1)
    auc_score = metrics.auc(fpr, tpr)
    print('auc score =', auc_score)
    print('gini score =', 2 * auc_score - 1)
def test_single_estimator():
    # Check singleton ensembles.
    X, y = make_imbalance(iris.data, iris.target,
                          ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf1 = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=0).fit(X_train, y_train)
    clf2 = make_pipeline(
        RandomUnderSampler(
            random_state=clf1.estimators_[0].steps[0][1].random_state),
        KNeighborsClassifier()).fit(X_train, y_train)
    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
    bootstrap=True)
classifier_5.fit(X_train, Y_train)

# Fitting Decision Tree to the training data: Model 6
from sklearn.tree import DecisionTreeClassifier
classifier_6 = DecisionTreeClassifier()
classifier_6.fit(X_train, Y_train)

# In[ ]:

# Predicting the results
y_pred_1 = classifier_1.predict(X_test)
y_pred_2 = classifier_2.predict(X_test)
y_pred_3 = classifier_3.predict(X_test)
y_pred_4 = classifier_4.predict(X_test)
y_pred_5 = classifier_5.predict(X_test)
y_pred_6 = classifier_6.predict(X_test)

# Creating the confusion matrix
from sklearn.metrics import confusion_matrix
cm_1 = confusion_matrix(Y_test, y_pred_1)
accuracy_1 = (cm_1[0, 0] + cm_1[1, 1]) / len(Y_test)
cm_2 = confusion_matrix(Y_test, y_pred_2)
accuracy_2 = (cm_2[0, 0] + cm_2[1, 1]) / len(Y_test)
cm_3 = confusion_matrix(Y_test, y_pred_3)
accuracy_3 = (cm_3[0, 0] + cm_3[1, 1]) / len(Y_test)
cm_4 = confusion_matrix(Y_test, y_pred_4)
accuracy_4 = (cm_4[0, 0] + cm_4[1, 1]) / len(Y_test)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

gbm_params2 = {'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
               'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
               'n_estimators': [50, 100, 500, 1000, 1500],
               'min_samples_leaf': [5, 10, 15]}
rf = GradientBoostingClassifier()
# Pass the grid defined above; `param_grid` was undefined here.
grid = GridSearchCV(rf, gbm_params2, refit=True, verbose=2)
grid.fit(X_res, y_res)
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, grid_predictions))
print(grid.best_params_)

# # BalancedBaggingClassifier

# In[ ]:

from imblearn.ensemble import BalancedBaggingClassifier
bbc = BalancedBaggingClassifier(random_state=42)
bbc.fit(X_train, y_train)
predictions = bbc.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))
# Note: this reports the GBM grid search's best params, not anything about bbc.
print(grid.best_params_)
print(clf_rf.score(x_val, y_val))
print(recall_score(y_val, clf_rf.predict(x_val)))
print(precision_score(y_val, clf_rf.predict(x_val)))
print('\nTest Results')
print(clf_rf.score(data_features_test, data_labels_test))
print(recall_score(data_labels_test, clf_rf.predict(data_features_test)))
print(precision_score(data_labels_test, clf_rf.predict(data_features_test)))
print("END")

bbc = BalancedBaggingClassifier(random_state=12)
bbc.fit(x_train, np.array(y_train.iloc[:, 0]))
print('Validation Results')
print(bbc.score(x_val, y_val))
print(recall_score(y_val, bbc.predict(x_val)))
print(precision_score(y_val, bbc.predict(x_val)))
print('\nTest Results')
print(bbc.score(data_features_test, data_labels_test))
print(recall_score(data_labels_test, bbc.predict(data_features_test)))
print(precision_score(data_labels_test, bbc.predict(data_features_test)))

clf_xg = GradientBoostingClassifier(learning_rate=0.15, n_estimators=70,
                                    min_samples_split=0.5,
                                    min_samples_leaf=45, max_depth=8,
                                    max_features='sqrt', subsample=0.8)
clf_xg.fit(x_train_res, y_train_res)
for i in range(8, len(Residues) - 8):
    r = (Residues[i - 8:i + 9]).upper()  # Converting sequences to patterns of size 17
    t = []
    for j in r:  # Binary encoding of patterns
        t = t + Encoding[j]
    Predictors.append(t)

Average_Predictions = [0 for i in range(len(Predictors))]  # Average of 5 random runs
for run in range(5):  # renamed from `i` to avoid shadowing the inner index below
    print("> Run:", run + 1)
    SVM = svm.SVC(kernel="rbf", gamma=0.1, C=2)
    BBC = BalancedBaggingClassifier(base_estimator=SVM)
    BBC.fit(Patterns, Labels)
    P = BBC.predict(Predictors)
    for i in range(len(P)):
        Average_Predictions[i] += P[i]

# Majority vote: with labels in {-1, +1}, the sign of the summed predictions decides the class.
for i in range(len(Average_Predictions)):
    if Average_Predictions[i] < 0:
        Average_Predictions[i] = -1
    else:
        Average_Predictions[i] = 1

Result = pd.DataFrame()  # Exporting predictions
Result["ID"] = Test["ID"]
Result["Lable"] = Average_Predictions
Result.to_csv("2018022_AVG_SVM_BBC.txt", index=False)
print(Result)
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.ensemble import RandomForestClassifier

# Create an object of the classifier.
bbc = BalancedBaggingClassifier(base_estimator=RandomForestClassifier(),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)
y_train = train['m13']
X_train = train.drop(['m13'], axis=1)

# Train the classifier.
bbc.fit(X_train, y_train)
# Note: this predicts on the training data itself; the commented-out metrics
# below expect a held-out y_test/pred split that is not defined here.
pred_y_1 = bbc.predict(X_train)
# print(accuracy_score(y_test, pred_y_1))
# print(recall_score(y_test, pred_y_1))
# confusion_matrix(y_test, pred_y_1)

# In[3]:

from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Create an object of the classifier.
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)
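# A sketch of evaluating on held-out data instead of the training set
# (hypothetical split; `train` and the 'm13' target column come from the
# snippet above):
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score

X_tr, X_te, y_tr, y_te = train_test_split(
    train.drop(['m13'], axis=1), train['m13'],
    random_state=0, stratify=train['m13'])
bbc.fit(X_tr, y_tr)
pred = bbc.predict(X_te)
print(accuracy_score(y_te, pred), recall_score(y_te, pred))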
bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

print('Class distribution of the test set: {}'.format(Counter(y_test)))

print('Classification results using a bagging classifier on imbalanced data')
y_pred_bagging = bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_bagging))
cm_bagging = confusion_matrix(y_test, y_pred_bagging)
plt.figure()
plot_confusion_matrix(cm_bagging,
                      classes=np.unique(ozone.target),
                      title='Confusion matrix using BaggingClassifier')

print('Classification results using a bagging classifier on balanced data')
y_pred_balanced_bagging = balanced_bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_balanced_bagging))
cm_balanced_bagging = confusion_matrix(y_test, y_pred_balanced_bagging)
plt.figure()
plot_confusion_matrix(cm_balanced_bagging,
                      classes=np.unique(ozone.target),
                      title='Confusion matrix using BalancedBaggingClassifier')

###############################################################################
# Turning the balanced bagging classifier into a balanced random forest
###############################################################################
# It is possible to turn the ``BalancedBaggingClassifier`` into a balanced
# random forest by using a ``DecisionTreeClassifier`` with
# ``max_features='auto'``. We illustrate such changes below.

balanced_random_forest = BalancedBaggingClassifier(
    base_estimator=DecisionTreeClassifier(max_features='auto'),
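# Aside: since imbalanced-learn 0.4 the same idea is available directly as a
# dedicated estimator, so the hand-rolled construction above (truncated here)
# can usually be replaced by a one-liner; a sketch with illustrative
# hyper-parameters:
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)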
y_pred = model.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Oversampling')

# Strategy: combine resampling with SMOTE-Tomek
# Next we try a widely used technique that applies an undersampling and an
# oversampling algorithm to the dataset at the same time. Here we use SMOTE
# for oversampling: it looks for nearby neighbouring points and adds points
# "in a straight line" between them. And we use Tomek for undersampling,
# which removes nearest-neighbour pairs of opposite classes, exposing the
# decision boundary (the border zone between our classes) more clearly.
os_us = SMOTETomek(sampling_strategy=0.5)
X_train_res, y_train_res = os_us.fit_resample(X_train, y_train)

print(f'Distribution before resampling {Counter(y_train)}')
print(f'Distribution after resampling {Counter(y_train_res)}')

model = run_model(X_train_res, X_test, y_train_res, y_test)
y_pred = model.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Smote-Tomek')

# Strategy: model ensemble with balancing
# For this strategy we use an ensemble classifier based on bagging, with a
# DecisionTree as the base model. Let's see how it behaves:
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)

# Train the classifier
bbc.fit(X_train, y_train)
y_pred = bbc.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Ensamble BBC')
    Y, test_size=0.2, shuffle=Y)  # note: `shuffle` expects a bool; `stratify=Y` was likely intended

cl = BalancedBaggingClassifier(
    base_estimator=QuadraticDiscriminantAnalysis(reg_param=0.11),
    n_estimators=50,
    max_samples=0.6,
    max_features=0.7,
    n_jobs=-1,
    bootstrap_features=True,
    oob_score=False)
cl.fit(X_train, Y_train)
predictions = cl.predict(X_train)
# print(X_train.shape, Y_train.shape, predictions.shape)
# print(list(zip(Y_train, predictions)))
print('\n\nModel Train: f1 = {0} '.format(
    f1_score(Y_train, predictions, average='micro')))
predictions = cl.predict(X_test)
print('\nModel Test: f1 = {0} '.format(
    f1_score(Y_test, predictions, average='micro')))
# exit()

cl = BalancedBaggingClassifier(
    base_estimator=QuadraticDiscriminantAnalysis(reg_param=0.11),
    n_estimators=10,
    max_samples=0.8,
# -------------------------------------------------------------
# ----------------------------- algorithm comparison chart -----------------------------
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# -----------------------------------------------------
# ------------------------------- TESTING OUR MODEL -------------------------------
balbag = BalancedBaggingClassifier()
balbag.fit(X_train_res, Y_train_res)
predictions = balbag.predict(X_test)
accur = "Accuracy of test data:" + str(accuracy_score(Y_test, predictions) * 100)
popupmsg(accur)
print(confusion_matrix(Y_test, predictions))

balbag = BalancedBaggingClassifier(RandomForestClassifier())
# The classifier must be fitted before predicting; calling predict on a
# freshly constructed estimator raises NotFittedError.
balbag.fit(X_train_res, Y_train_res)
predictions2 = balbag.predict(X2)
popupmsg("Accuracy of unseen data:" + str(accuracy_score(Y2, predictions2) * 100))
predictions2 = predictions2.astype(int)
output = ''
c = 1
for i in predictions2:
class Models(object):
    def __init__(self, feature_engineer=False):
        '''
        @description: initialize class, e.g. model
        @param {type} :
            feature_engineer: whether to use feature engineering; if `False`,
                              compare common ML models
            res_model: res network model
            resnext_model: resnext network model
            wide_model: wide res network model
            bert: bert model
            ml_data: new mldata class
        @return: No return
        '''
        # 1. initialize a resnet152 model with torchvision
        # 2. initialize a resnext101_32x8d model with torchvision
        # 3. initialize a wide_resnet101_2 model with torchvision
        # 4. load the bert model
        print("load")
        self.res_model = torchvision.models.resnet152(pretrained=False)
        self.res_model.load_state_dict(
            torch.load(config.root_path +
                       '/model/resnet150/resnet152-b121ed2d.pth'))
        self.res_model = self.res_model.to(config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True)
        self.resnext_model = self.resnext_model.to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(pretrained=True)
        self.wide_model = self.wide_model.to(config.device)
        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path + '/model/bert')
        self.bert = self.bert.to(config.device)
        self.ml_data = MLData(debug_mode=True)
        if feature_engineer:
            self.model = lgb.LGBMClassifier(objective='multiclass',
                                            device='gpu',
                                            n_jobs=10,
                                            num_class=33,
                                            num_leaves=30,
                                            reg_alpha=10,
                                            reg_lambda=200,
                                            max_depth=3,
                                            learning_rate=0.05,
                                            n_estimators=2000,
                                            bagging_freq=1,
                                            bagging_fraction=0.9,
                                            feature_fraction=0.8,
                                            seed=1440)
        else:
            self.models = [
                RandomForestClassifier(n_estimators=500,
                                       max_depth=5,
                                       random_state=0),
                LogisticRegression(solver='liblinear', random_state=0),
                MultinomialNB(),
                SVC(),
                lgb.LGBMClassifier(objective='multiclass',
                                   n_jobs=10,
                                   num_class=33,
                                   num_leaves=30,
                                   reg_alpha=10,
                                   reg_lambda=200,
                                   max_depth=3,
                                   learning_rate=0.05,
                                   n_estimators=2000,
                                   bagging_freq=1,
                                   bagging_fraction=0.8,
                                   feature_fraction=0.8),
            ]

    def feature_engineer(self):
        '''
        @description: This function builds all kinds of features
        @param {type} None
        @return:
            X_train, features of the train set
            X_test, features of the test set
            y_train, labels of the train set
            y_test, labels of the test set
        '''
        logger.info("generate embedding feature ")
        train_tfidf, test_tfidf, train, test = get_embedding_feature(
            self.ml_data)

        logger.info("generate basic feature ")
        # 1. get the basic NLP features
        train = get_basic_feature(train)
        test = get_basic_feature(test)
        print(test.loc[0])

        logger.info("generate modal feature ")
        cover = os.listdir(config.root_path + '/data/book_cover/')
        train['cover'] = train.title.progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test.title.progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')

        # 1. get the modal embeddings from the three big CV models
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        print(len(test.loc[0, 'res_embedding']))
        # train['resnext_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x, self.resnext_model))
        # test['resnext_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x, self.resnext_model))
        # train['wide_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x, self.wide_model))
        # test['wide_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x, self.wide_model))

        logger.info("generate bert feature ")
        # 1. get the bert embedding
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        print(test.loc[0])

        logger.info("generate lda feature ")
        # 1. get the lda feature
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        print(test['queryCutRMStopWord'])
        print(test['bow'])
        # get the lda embedding on top of the bag of words
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                test['bow']))
        print(test['lda'])
        print(test.loc[0])

        logger.info("formate data")
        print(test)
        print(test_tfidf)
        train, test = formate_data(train, test, train_tfidf, test_tfidf)
        print(test)
        print(test.loc[0])
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        print(cols)
        X_train = train[cols]
        X_test = test[cols]
        print(X_test)
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        print(y_test)
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        '''
        @description: use param search techniques to find the best params
        @param {type} search_method: two options, grid or bayesian optimization
        @return: None
        '''
        if search_method == 'grid':
            logger.info("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            logger.info("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            logger.info("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        '''
        @description: handle unbalanced data, then search for the best params
        @param {type}
            imbalance_method, three options: under_sampling for
                ClusterCentroids, SMOTE for over_sampling, ensemble for
                BalancedBaggingClassifier
            search_method: two options, grid or bayesian optimization
        @return: None
        '''
        logger.info("get all feature")
        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer()
        model_name = None
        if imbalance_method == 'over_sampling':
            logger.info("Use SMOTE deal with unbalance data ")
            # 1. use over_sampling to handle the class-imbalance problem
            print(self.y_train)
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            print(self.y_train)
            # Note: this resamples the "test" set from the training data,
            # which leaks training samples into evaluation; normally the
            # test set would be left untouched.
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_train, self.y_train)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            logger.info("Use ClusterCentroids deal with unbalance data ")
            # 1. use under_sampling to handle the class-imbalance problem
            print(self.X_train)
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            print(self.X_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        logger.info('search best param')
        if imbalance_method != 'ensemble':
            param = self.param_search(search_method=search_method)
            param['params']['num_leaves'] = int(param['params']['num_leaves'])
            param['params']['max_depth'] = int(param['params']['max_depth'])
            self.model = self.model.set_params(**param['params'])
        logger.info('fit model ')
        self.model.fit(self.X_train, self.y_train)
        # 1. predict the labels of the test set
        # 2. predict the labels of the train set
        # 3. compute precision, accuracy, recall, f1_score
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)
        # accuracy on the train set
        logger.info('Train accuracy %s' % per)
        # accuracy on the test set
        logger.info('test accuracy %s' % acc)
        # recall
        logger.info('test recall %s' % recall)
        # F1-score
        logger.info('test F1_score %s' % f1)
        self.save(model_name)

    def model_select(self,
                     X_train,
                     X_test,
                     y_train,
                     y_test,
                     feature_method='tf-idf'):
        '''
        @description: train common ML models with different embedding features
        @param {type}
            X_train, features of the train set
            X_test, features of the test set
            y_train, labels of the train set
            y_test, labels of the test set
            feature_method, three options: tfidf, word2vec and fasttext
        @return: None
        '''
        for model in self.models:
            model_name = model.__class__.__name__
            print(model_name)
            clf = model.fit(X_train, y_train)
            Test_predict_label = clf.predict(X_test)
            Train_predict_label = clf.predict(X_train)
            per, acc, recall, f1 = get_score(y_train, y_test,
                                             Train_predict_label,
                                             Test_predict_label)
            # accuracy on the train set
            logger.info(model_name + '_' + 'Train accuracy %s' % per)
            # accuracy on the test set
            logger.info(model_name + '_' + ' test accuracy %s' % acc)
            # recall
            logger.info(model_name + '_' + 'test recall %s' % recall)
            # F1-score
            logger.info(model_name + '_' + 'test F1_score %s' % f1)

    def predict(self, title, desc):
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        self.model = joblib.load(path)
def model_baseline3(x_train, y_train, x_test, y_test):
    bagging = BaggingClassifier(random_state=0)
    balanced_bagging = BalancedBaggingClassifier(random_state=0)
    bagging.fit(x_train, y_train)
    balanced_bagging.fit(x_train, y_train)
    prob = bagging.predict_proba(x_test)[:, 1]
    predict_score = [float('%.2f' % x) for x in prob]
    loss_val = log_loss(y_test, predict_score)
    y_pred = [1 if x > 0.5 else 0 for x in predict_score]
    fpr, tpr, thresholds = roc_curve(y_test, predict_score)
    mean_fpr = np.linspace(0, 1, 100)
    # np.interp replaces the deprecated scipy.interp
    mean_tpr = np.interp(mean_fpr, fpr, tpr)
    x_auc = auc(fpr, tpr)
    fig = plt.figure('Bagging')
    ax = fig.add_subplot(1, 1, 1)
    name = 'base_Bagging'
    plt.plot(mean_fpr, mean_tpr, linestyle='--',
             label='{} (area = %0.2f, logloss = %0.2f)'.format(name) %
             (x_auc, loss_val),
             lw=2)
    y_pred_bagging = bagging.predict(x_test)
    cm_bagging = confusion_matrix(y_test, y_pred_bagging)
    cm1 = plt.figure()
    plot_confusion_matrix(cm_bagging,
                          classes=[0, 1],
                          title='Confusion matrix of BaggingClassifier')
    # balanced_bagging
    prob = balanced_bagging.predict_proba(x_test)[:, 1]
    predict_score = [float('%.2f' % x) for x in prob]
    loss_val = log_loss(y_test, predict_score)
    fpr, tpr, thresholds = roc_curve(y_test, predict_score)
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.interp(mean_fpr, fpr, tpr)
    x_auc = auc(fpr, tpr)
    plt.figure('Bagging')  # select the figure
    name = 'base_Balanced_Bagging'
    plt.plot(mean_fpr, mean_tpr, linestyle='--',
             label='{} (area = %0.2f, logloss = %0.2f)'.format(name) %
             (x_auc, loss_val),
             lw=2)
    y_pred_balanced_bagging = balanced_bagging.predict(x_test)
    cm_balanced_bagging = confusion_matrix(y_test, y_pred_balanced_bagging)
    cm2 = plt.figure()
    plot_confusion_matrix(cm_balanced_bagging,
                          classes=[0, 1],
                          title='Confusion matrix of BalancedBagging')
    plt.figure('Bagging')  # select the figure
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', label='Luck')
    # make nice plotting
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()
    return cm1, cm2, fig
from sklearn.tree import DecisionTreeClassifier

# Create an object of the classifier
bbc = BalancedBaggingClassifier(
    base_estimator=DecisionTreeClassifier(criterion='entropy', random_state=0),
    sampling_strategy='auto',
    replacement=False,
    random_state=0)

Y_train = train['Taxable.Income']
X_train = train.drop(['Taxable.Income'], axis=1)
X_test = test.drop(['Taxable.Income'], axis=1)
Y_test = test['Taxable.Income']

# Train the classifier
bbc.fit(X_train, Y_train)
preds = bbc.predict(X_test)
pd.Series(preds).value_counts()

# Confusion matrix
pd.crosstab(Y_test, preds)

# Accuracy
np.mean(preds == Y_test)  # 58% with entropy [gini gives 57%]

# Cross validation K-fold
X = final_data.iloc[:, 1:].values
Y = final_data.iloc[:, 0].values
from sklearn.model_selection import KFold
kf = KFold(n_splits=2)
kf.get_n_splits(X)
print(kf)
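# The snippet stops after constructing KFold; a sketch of the evaluation loop
# it presumably leads into (continuing with the bbc classifier from above):
import numpy as np

fold_acc = []
for train_idx, test_idx in kf.split(X):
    bbc.fit(X[train_idx], Y[train_idx])
    fold_acc.append(np.mean(bbc.predict(X[test_idx]) == Y[test_idx]))
print('mean CV accuracy:', np.mean(fold_acc))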
# ### Fit the model

# Fit the best model based on tuned parameters
GBM_clf = ensemble.GradientBoostingClassifier(learning_rate=0.05,
                                              max_depth=3,
                                              n_estimators=100)
best_clf = BalancedBaggingClassifier(base_estimator=GBM_clf,
                                     ratio='auto',
                                     replacement=False,
                                     random_state=0)

# Fit the model and check the confusion matrix
best_clf.fit(X_train, y_train)

# Check R-style confusionMatrix
# change type from object to list; the confusion matrix cannot be created otherwise
y_pred = best_clf.predict(X_test).tolist()
confusionMatrix(y_pred, y_test).show()  # show the confusion matrix

# Classification report
print('Classification Report:\n',
      classification_report(y_test, y_pred, target_names=["AS", "PsA", "RA"]))

### prepare input for ROC
# number of indications: if 2 then n_classes=1, if >2 then the number of indications
n_classes = len(y_train.unique())
y_score = best_clf.fit(X_train, y_train).decision_function(X_test)
y_test2 = pd.get_dummies(y_test)

ROC(n_classes, y_score, y_test2)
PRC(n_classes, y_test2, y_score)
AUC_model3(best_clf, X_train, y_train, X_test, y_test, n_classes)
class Models(object):
    def __init__(self, model_path=None, feature_engineer=False,
                 train_mode=True):
        '''
        @description: initialize class, e.g. model
        @param {type} :
            feature_engineer: whether to use feature engineering; if `False`,
                              compare common ML models
            res_model: res network model
            resnext_model: resnext network model
            wide_model: wide res network model
            bert: bert model
            ml_data: new mldata class
        @return: No return
        '''
        # Load the image models (resnet, resnext, wide resnet); move them to
        # cuda if it is available.
        ###########################################
        #          TODO: module 2 task 2.1        #
        ###########################################
        self.res_model = torchvision.models.resnet152(
            pretrained=True)  # res model for modal feature [1 * 1000]
        self.res_model = self.res_model.to(config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True)
        self.resnext_model = self.resnext_model.to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(pretrained=True)
        self.wide_model = self.wide_model.to(config.device)
        # Load the bert model; move it to cuda if it is available.
        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path + '/model/bert')
        self.bert = self.bert.to(config.device)
        # Initialize the MLdataset class; debug_mode=True uses a subset of the
        # data, train_mode indicates whether we are training.
        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)
        # If not training, load a trained model and use it for prediction.
        if train_mode:
            self.model = lgb.LGBMClassifier(objective='multiclass',
                                            n_jobs=10,
                                            num_class=33,
                                            num_leaves=30,
                                            reg_alpha=10,
                                            reg_lambda=200,
                                            max_depth=3,
                                            learning_rate=0.05,
                                            n_estimators=2000,
                                            bagging_freq=1,
                                            bagging_fraction=0.9,
                                            feature_fraction=0.8,
                                            seed=1440)
        else:
            self.load(model_path)
            labelNameToIndex = json.load(
                open(config.root_path + '/data/label2id.json',
                     encoding='utf-8'))
            self.ix2label = {v: k for k, v in labelNameToIndex.items()}

    def feature_engineer(self):
        '''
        @description: This function builds all kinds of features
        @param {type} None
        @return:
            X_train, features of the train set
            X_test, features of the test set
            y_train, labels of the train set
            y_test, labels of the test set
        '''
        logger.info("generate embedding feature ")
        # Get the tfidf and word2vec features; word2vec is not aggregated here.
        ###########################################
        #          TODO: module 3 task 1.1        #
        ###########################################
        train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                                   self.ml_data.em.tfidf,
                                                   self.ml_data.em.w2v)
        test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                                 self.ml_data.em.tfidf,
                                                 self.ml_data.em.w2v)

        logger.info("generate autoencoder feature ")
        # Get the autoencoder embedding, taken from the encoder, not the decoder.
        train_ae = get_autoencoder_feature(
            train,
            self.ml_data.em.ae.max_features,
            self.ml_data.em.ae.max_len,
            self.ml_data.em.ae.encoder,
            tokenizer=self.ml_data.em.ae.tokenizer)
        test_ae = get_autoencoder_feature(
            test,
            self.ml_data.em.ae.max_features,
            self.ml_data.em.ae.max_len,
            self.ml_data.em.ae.encoder,
            tokenizer=self.ml_data.em.ae.tokenizer)

        logger.info("generate basic feature ")
        # Get the basic NLP features.
        train = get_basic_feature(train)
        test = get_basic_feature(test)

        logger.info("generate modal feature ")
        # List the book-cover files.
        cover = os.listdir(config.root_path + '/data/book_cover/')
        # Match book covers by title.
        train['cover'] = train['title'].progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test['title'].progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')
        # Get the embedding of each cover.
        ###########################################
        #          TODO: module 3 task 1.2        #
        ###########################################
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        train['resnext_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        test['resnext_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        train['wide_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        test['wide_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        logger.info("generate bert feature ")
        ###########################################
        #          TODO: module 3 task 1.3        #
        ###########################################
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))

        logger.info("generate lda feature ")
        ###########################################
        #          TODO: module 3 task 1.4        #
        ###########################################
        # Build bag-of-words data.
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        # Get the lda embedding on top of the bag of words.
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                test['bow']))

        logger.info("formate data")
        # Concatenate all the features.
        train = formate_data(train, train_tfidf, train_ae)
        test = formate_data(test, test_tfidf, test_ae)
        # Build the train and test data.
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        X_train = train[cols]
        X_test = test[cols]
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        '''
        @description: use param search techniques to find the best params
        @param {type} search_method: two options, grid or bayesian optimization
        @return: None
        '''
        # Use grid search or bayesian optimization to find the best params.
        if search_method == 'grid':
            logger.info("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            logger.info("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            logger.info("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        '''
        @description: handle unbalanced data, then search for the best params
        @param {type}
            imbalance_method, three options: under_sampling for
                ClusterCentroids, SMOTE for over_sampling, ensemble for
                BalancedBaggingClassifier
            search_method: two options, grid or bayesian optimization
        @return: None
        '''
        logger.info("get all feature")
        # Build all features.
        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer()
        model_name = None
        # Which imbalance handling to use: over-sampling, under-sampling or ensemble.
        ###########################################
        #          TODO: module 4 task 1.1        #
        ###########################################
        if imbalance_method == 'over_sampling':
            logger.info("Use SMOTE deal with unbalance data ")
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            # Note: this resamples the "test" set from the training data,
            # which leaks training samples into evaluation; normally the
            # test set would be left untouched.
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_train, self.y_train)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            logger.info("Use ClusterCentroids deal with unbalance data ")
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        logger.info('search best param')
        # Use set_params to apply the best params found by the search.
        if imbalance_method != 'ensemble':
            ###########################################
            #          TODO: module 4 task 1.2        #
            ###########################################
            # param = self.param_search(search_method=search_method)
            # param['params']['num_leaves'] = int(param['params']['num_leaves'])
            # param['params']['max_depth'] = int(param['params']['max_depth'])
            param = {}
            param['params'] = {}
            param['params']['num_leaves'] = 3
            param['params']['max_depth'] = 5
            self.model = self.model.set_params(**param['params'])
        logger.info('fit model ')
        # Train and report the model's results.
        self.model.fit(self.X_train, self.y_train)
        ###########################################
        #          TODO: module 4 task 1.3        #
        ###########################################
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)
        # precision on the train set
        logger.info('Train accuracy %s' % per)
        # accuracy on the test set
        logger.info('test accuracy %s' % acc)
        # recall
        logger.info('test recall %s' % recall)
        # F1-score
        logger.info('test F1_score %s' % f1)
        self.save(model_name)

    def process(self, title, desc):
        ###########################################
        #          TODO: module 5 task 1.1        #
        ###########################################
        # Process the data and build the features the model needs for prediction.
        df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
        df['text'] = df['title'] + df['desc']
        df["queryCut"] = df["text"].apply(query_cut)
        df["queryCutRMStopWord"] = df["queryCut"].apply(
            lambda x:
            [word for word in x if word not in self.ml_data.em.stopWords])
        df_tfidf, df = get_embedding_feature(df, self.ml_data.em.tfidf,
                                             self.ml_data.em.w2v)
        print("generate basic feature ")
        df = get_basic_feature(df)
        print("generate modal feature ")
        df['cover'] = ''
        df['res_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        df['resnext_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        df['wide_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        print("generate bert feature ")
        df['bert_embedding'] = df.text.progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        print("generate lda feature ")
        df['bow'] = df['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        df['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                df.bow))
        print("generate autoencoder feature ")
        df_ae = get_autoencoder_feature(df,
                                        self.ml_data.em.ae.max_features,
                                        self.ml_data.em.ae.max_len,
                                        self.ml_data.em.ae.encoder,
                                        tokenizer=self.ml_data.em.ae.tokenizer)
        print("formate data")
        df['labelIndex'] = 1
        df = formate_data(df, df_tfidf, df_ae)
        cols = [x for x in df.columns if str(x) not in ['labelIndex']]
        X_train = df[cols]
        return X_train

    def predict(self, title, desc):
        '''
        @description: predict the category of a book from the input title and desc
        @param {type}
            title: input
            desc: input
        @return: label
        '''
        ###########################################
        #          TODO: module 5 task 1.1        #
        ###########################################
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        '''
        @description: save model
        @param {type} model_name, file name for saving
        @return: None
        '''
        ###########################################
        #          TODO: module 4 task 1.4        #
        ###########################################
        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        '''
        @description: load model
        @param {type} path: model path
        @return: None
        '''
        ###########################################
        #          TODO: module 4 task 1.4        #
        ###########################################
        self.model = joblib.load(path)
class Models(object):
    """
    Machine-learning based text classification models.
    """
    def __init__(self, model_path=None, feature_engineer=False,
                 train_mode=True):
        # Load the image models (resnet, resnext, wide resnet); move them to
        # cuda if it is available.
        self.res_model = torchvision.models.resnet152(pretrained=True).to(
            config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True).to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(
            pretrained=True).to(config.device)
        # Load the bert model; move it to cuda if it is available.
        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path +
                                              '/model/bert').to(config.device)
        # Initialize the MLdataset class; debug_mode=True uses a subset of
        # the data, train_mode indicates whether we are training.
        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)
        # If not training, load a trained model and use it for prediction.
        if not train_mode:
            self.load(model_path)
            labelNameToIndex = json.load(
                open(config.root_path + '/data/label2id.json',
                     encoding='utf-8'))
            self.ix2label = {v: k for k, v in labelNameToIndex.items()}
        else:
            # With feature_engineer, train lightgbm; otherwise compare the
            # classic machine-learning models.
            if feature_engineer:
                self.model = lgb.LGBMClassifier(objective='multiclass',
                                                n_jobs=10,
                                                num_class=33,
                                                num_leaves=30,
                                                reg_alpha=10,
                                                reg_lambda=200,
                                                max_depth=3,
                                                learning_rate=0.05,
                                                n_estimators=2000,
                                                bagging_freq=1,
                                                bagging_fraction=0.9,
                                                feature_fraction=0.8,
                                                seed=1440)
            else:
                self.models = [
                    RandomForestClassifier(n_estimators=500,
                                           max_depth=5,
                                           random_state=0),
                    LogisticRegression(solver='liblinear', random_state=0),
                    MultinomialNB(),
                    SVC(),
                    lgb.LGBMClassifier(objective='multiclass',
                                       n_jobs=10,
                                       num_class=33,
                                       num_leaves=30,
                                       reg_alpha=10,
                                       reg_lambda=200,
                                       max_depth=3,
                                       learning_rate=0.05,
                                       n_estimators=2000,
                                       bagging_freq=1,
                                       bagging_fraction=0.8,
                                       feature_fraction=0.8),
                ]

    def feature_engineer(self):
        print(" generate embedding feature ")
        # Get the tfidf and word2vec features; word2vec is not aggregated here.
        train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                                   self.ml_data.tfidf,
                                                   self.ml_data.w2v)
        # `train` is a pandas object; after get_embedding_feature it gains:
        # w2v: the words of a sentence encoded by the w2v model; each row is [seq, 300]
        # w2v_label_mean: features relating the sentence embedding [seq, 300] to the label; each row is [300]
        # w2v_label_max: features relating the sentence embedding [seq, 300] to the label; each row is [300]
        # w2v_mean: [seq, 300] -> [300]
        # w2v_max: [seq, 300] -> [300]
        # w2v_win_2_mean: sliding-window feature; each row is [300]
        # w2v_win_3_mean
        # w2v_win_4_mean
        # w2v_win_2_max
        # w2v_win_3_max
        # w2v_win_4_max
        test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                                 self.ml_data.tfidf,
                                                 self.ml_data.w2v)

        print("generate basic feature ")
        # Get the basic NLP features.
        train = get_basic_feature(train)
        test = get_basic_feature(test)

        print("generate lda feature ")
        # Build bag-of-words data.
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        # One row of test['bow']: [(10, 1), (78, 1), (162, 3), (177, 1), (192, 1)...]
        # Get the lda embedding on top of the bag of words.
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc),
                test['bow']))
        # One row of test['lda']: [0.002929521957412362, 0.0024772200267761946, ...];
        # with 30 topics, each row is the probability distribution over the 30 topics.

        print("generate modal feature ")
        # List the book-cover files.
        cover = os.listdir(config.book_cover_path)
        # Match book covers by title.
        train['cover'] = train['title'].progress_apply(
            lambda x: config.book_cover_path + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test.title.progress_apply(
            lambda x: config.book_cover_path + x + '.jpg'
            if x + '.jpg' in cover else '')
        # Get the embedding of each cover.
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        train['resnext_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        test['resnext_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        train['wide_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        test['wide_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        print("generate bert feature ")
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))

        # print("generate autoencoder feature ")
        # Get the autoencoder embedding, taken from the encoder, not the decoder.
        # TODO
        # train_ae = get_autoencoder_feature(
        #     train,
        #     self.ml_data.ae.max_features,
        #     self.ml_data.ae.max_len,
        #     self.ml_data.ae.encoder,
        #     tokenizer=self.ml_data.ae.tokenizer)
        # test_ae = get_autoencoder_feature(
        #     test,
        #     self.ml_data.ae.max_features,
        #     self.ml_data.ae.max_len,
        #     self.ml_data.ae.encoder,
        #     tokenizer=self.ml_data.ae.tokenizer)

        print("formate data")
        # Concatenate all the features.
        train = formate_data(
            train, train_tfidf)  # train = formate_data(train, train_tfidf, train_ae)
        test = formate_data(
            test, test_tfidf)  # test = formate_data(test, test_tfidf, test_ae)
        # Build the train and test data.
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        X_train = train[cols]
        X_test = test[cols]
        print(X_test)
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        # Use grid search or bayesian optimization to find the best params.
        if search_method == 'grid':
            print("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            print("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            print("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        print("get all feature")
        # Build all features.
        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer()
        model_name = None
        # Which imbalance handling to use: over-sampling, under-sampling or ensemble.
        if imbalance_method == 'over_sampling':
            print("Use SMOTE deal with unbalance data ")
            # https://www.zhihu.com/question/269698662
            # https://www.cnblogs.com/kamekin/p/9824294.html
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            # Note: this resamples the "test" set from the training data,
            # which leaks training samples into evaluation; normally the
            # test set would be left untouched.
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_train, self.y_train)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            print("Use ClusterCentroids deal with unbalance data")
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        print('search best param')
        # Use set_params to apply the best params found by the search.
        if imbalance_method != 'ensemble':
            param = self.param_search(search_method=search_method)
            param['params']['num_leaves'] = int(param['params']['num_leaves'])
            param['params']['max_depth'] = int(param['params']['max_depth'])
            self.model = self.model.set_params(**param['params'])
        print('fit model ')
        # Train and report the model's results.
        self.model.fit(self.X_train, self.y_train)
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)
        # precision on the train set
        print('Train accuracy %s' % per)
        # accuracy on the test set
        print('test accuracy %s' % acc)
        # recall
        print('test recall %s' % recall)
        # F1-score
        print('test F1_score %s' % f1)
        self.save(model_name)

    def model_select(self,
                     X_train,
                     X_test,
                     y_train,
                     y_test,
                     feature_method='tf-idf'):
        # Compare tfidf, word2vec, fasttext embeddings and common ML models.
        for model in self.models:
            model_name = model.__class__.__name__
            print(model_name)
            clf = model.fit(X_train, y_train)
            Test_predict_label = clf.predict(X_test)
            Train_predict_label = clf.predict(X_train)
            per, acc, recall, f1 = get_score(y_train, y_test,
                                             Train_predict_label,
                                             Test_predict_label)
            # accuracy on the train set
            print(model_name + '_' + 'Train accuracy %s' % per)
            # accuracy on the test set
            print(model_name + '_' + ' test accuracy %s' % acc)
            # recall
            print(model_name + '_' + 'test recall %s' % recall)
            # F1-score
            print(model_name + '_' + 'test F1_score %s' % f1)

    def process(self, title, desc):
        # Process the data and build the features the model needs for prediction.
        df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
        df['text'] = df['title'] + df['desc']
        df["queryCut"] = df["text"].apply(query_cut)
        df["queryCutRMStopWord"] = df["queryCut"].apply(
            lambda x: [word for word in x if word not in get_stop_word_list()])
        df_tfidf, df = get_embedding_feature(df, self.ml_data.tfidf,
                                             self.ml_data.w2v)
        print("generate basic feature ")
        df = get_basic_feature(df)
        print("generate modal feature ")
        df['cover'] = ''
        df['res_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        df['resnext_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        df['wide_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        print("generate bert feature ")
        df['bert_embedding'] = df.text.progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        print("generate lda feature ")
        df['bow'] = df['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        df['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc), df.bow))
        print("generate autoencoder feature ")
        # df_ae = get_autoencoder_feature(df,
        #                                 self.ml_data.ae.max_features,
        #                                 self.ml_data.ae.max_len,
        #                                 self.ml_data.ae.encoder,
        #                                 tokenizer=self.ml_data.ae.tokenizer)
        print("formate data")
        df['labelIndex'] = 1
        df = formate_data(df, df_tfidf)  # , df_ae)
        cols = [x for x in df.columns if str(x) not in ['labelIndex']]
        X_train = df[cols]
        return X_train

    def predict(self, title, desc):
        '''
        @description: predict the category of a book from the input title and desc
        @param {type}
            title: input
            desc: input
        @return: label
        '''
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        '''
        @description: save model
        @param {type} model_name, file name for saving
        @return: None
        '''
        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        '''
        @description: load model
        @param {type} path: model path
        @return: None
        '''
        self.model = joblib.load(path)
# Classification using bagging classifier with and without sampling
###############################################################################
# Instead of using a single tree, we will check if an ensemble of decision
# trees can actually alleviate the issue induced by the class imbalance.
# First, we will use a bagging classifier and its counterpart, which
# internally uses a random under-sampling to balance each bootstrap sample.

bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0,
                                             n_jobs=-1)

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

y_pred_bc = bagging.predict(X_test)
y_pred_bbc = balanced_bagging.predict(X_test)

###############################################################################
# Balancing each bootstrap sample allows us to significantly increase the
# balanced accuracy and the geometric mean.

print('Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_bc),
              geometric_mean_score(y_test, y_pred_bc)))
cm_bagging = confusion_matrix(y_test, y_pred_bc)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_bagging, classes=np.unique(satimage.target),
                      ax=ax[0], title='Bagging')

print('Balanced Bagging classifier performance:')
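# The snippet cuts off after the header print; a sketch of the symmetric
# evaluation for the balanced ensemble, mirroring the bagging block above:
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_bbc),
              geometric_mean_score(y_test, y_pred_bbc)))
cm_balanced_bagging = confusion_matrix(y_test, y_pred_bbc)
plot_confusion_matrix(cm_balanced_bagging, classes=np.unique(satimage.target),
                      ax=ax[1], title='Balanced bagging')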