def test_warm_start_smaller_n_estimators():
    # Test that a warm-started second fit with a smaller n_estimators raises an error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    assert_raises(ValueError, clf.fit, X, y)

def test_oob_score_consistency():
    # Make sure OOB scores are identical when random_state, estimator, and
    # training data are fixed and fitting is done twice
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BalancedBaggingClassifier(
        KNeighborsClassifier(),
        max_samples=0.5,
        max_features=0.5,
        oob_score=True,
        random_state=1,
    )
    assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_

def test_oob_score_removed_on_warm_start():
    X, y = make_hastie_10_2(n_samples=2000, random_state=1)

    clf = BalancedBaggingClassifier(n_estimators=50, oob_score=True)
    clf.fit(X, y)

    clf.set_params(warm_start=True, oob_score=False, n_estimators=100)
    clf.fit(X, y)

    assert_raises(AttributeError, getattr, clf, "oob_score_")

def test_max_samples_consistency():
    # Make sure validated max_samples and original max_samples are identical
    # when a valid integer max_samples is supplied by the user
    max_samples = 100
    X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1)
    bagging = BalancedBaggingClassifier(
        KNeighborsClassifier(),
        max_samples=max_samples,
        max_features=0.5,
        random_state=1,
    )
    bagging.fit(X, y)
    assert bagging._max_samples == max_samples

def buildModel(X, y):
    # X = np.reshape(X, (X.shape[0], X.shape[1] * X.shape[2]))
    print(X.shape, y.shape)
    scaler = StandardScaler()
    print(scaler.fit(X))
    scaled_train_x = scaler.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        scaled_train_x, y, random_state=19, test_size=0.3)

    bag = BalancedBaggingClassifier(n_estimators=200, random_state=19)
    svm = SVC(class_weight='balanced', random_state=19,
              decision_function_shape='ovo')
    neural = MLPClassifier(max_iter=500, random_state=19, solver='lbfgs',
                           alpha=1e-5, hidden_layer_sizes=(49, 8, 4))
    ada = AdaBoostClassifier(n_estimators=100, random_state=19)
    logistic = LogisticRegression(solver='lbfgs', max_iter=500)

    bag.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    neural.fit(X_train, y_train)
    ada.fit(X_train, y_train)
    logistic.fit(X_train, y_train)
    # joblib.dump(bag, 'bag.pkl')
    # joblib.dump(scaler, 'scaler.pkl')

    y_pred = bag.predict(X_test)
    y_pred2 = svm.predict(X_test)
    y_pred3 = neural.predict(X_test)
    y_pred4 = ada.predict(X_test)
    y_pred5 = logistic.predict(X_test)

    print(matthews_corrcoef(y_test, y_pred))
    print(matthews_corrcoef(y_test, y_pred2))
    print(matthews_corrcoef(y_test, y_pred3))
    print(matthews_corrcoef(y_test, y_pred4))
    print(matthews_corrcoef(y_test, y_pred5))
    print(confusion_matrix(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred2))
    print(confusion_matrix(y_test, y_pred3))
    print(confusion_matrix(y_test, y_pred4))
    print(confusion_matrix(y_test, y_pred5))
    print(classification_report_imbalanced(y_test, y_pred))
    print(classification_report_imbalanced(y_test, y_pred2))
    print(classification_report_imbalanced(y_test, y_pred3))
    print(classification_report_imbalanced(y_test, y_pred4))
    print(classification_report_imbalanced(y_test, y_pred5))

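# The repeated predict/print blocks above could be collapsed into a loop over
# (name, model) pairs; a minimal sketch with a hypothetical helper, using the
# same estimators and metrics as buildModel:
def report_models(models, X_test, y_test):
    """Print MCC, confusion matrix, and imbalanced report for each fitted model."""
    for name, model in models.items():
        y_pred = model.predict(X_test)
        print(name, matthews_corrcoef(y_test, y_pred))
        print(confusion_matrix(y_test, y_pred))
        print(classification_report_imbalanced(y_test, y_pred))
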
def test_bagging_with_pipeline():
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    estimator = BalancedBaggingClassifier(
        make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()),
        max_features=2,
    )
    estimator.fit(X, y).predict(X)

def cross_validation(x):
    with open('../data/conv_pred/train_data_' + x + '.pickle', 'rb') as f:
        data = pickle.load(f)
    print(data)
    v = DictVectorizer()
    X = v.fit_transform(data['X'])
    y = np.array(data['y'])

    zero = 0
    one = 0
    for i in y:
        if i == 0:
            zero += 1
        else:
            one += 1
    print(zero)
    print(one)

    cv = 5
    kf = KFold(n_splits=cv)
    fscore = 0
    ftscore = 0
    all_f_value = 0
    all_prec = 0
    for train_index, test_index in tqdm(kf.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # model = RandomForestRe(n_estimators=100, n_jobs=8)
        model = BalancedBaggingClassifier(n_estimators=100, n_jobs=8)
        # model = xgb.XGBClassifier(n_estimators=500, max_delta_step=1, scale_pos_weight=zero / one)
        model.fit(X_train, y_train)
        predict = model.predict_proba(X_test)
        # `eval` here is a project-local scoring helper, not the Python builtin.
        precision, recall, f_value, all_pre = eval(y_test, predict)
        all_prec += all_pre
        fscore += precision
        ftscore += recall
        all_f_value += f_value
        pprint(
            sorted(zip(np.mean([est.steps[1][1].feature_importances_
                                for est in model.estimators_], axis=0),
                       v.feature_names_),
                   key=lambda x: x[0],
                   reverse=True))

    print('\n')
    print('final precision : ', str(fscore / cv))
    print('final recall : ', str(ftscore / cv))
    print('final f-value : ', str(all_f_value / cv))
    print('final all_precision : ', str(all_prec / cv))

class Classifier(BaseEstimator):
    def __init__(self):
        # mimicking a balanced random forest with the BalancedBaggingClassifier
        # and DecisionTreeClassifier combination
        # (`ratio` is the pre-0.4 imbalanced-learn name for `sampling_strategy`)
        self.bbc = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(max_features='auto'),
            ratio=determine_ratio,
            random_state=0,
            n_estimators=50,
            n_jobs=1)

    def fit(self, X, y):
        self.bbc.fit(X, y)
        return self  # return self to follow the scikit-learn estimator contract

    def predict_proba(self, X):
        return self.bbc.predict_proba(X)

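# A minimal sketch of the same wrapper against the post-0.6 imbalanced-learn
# API (`ratio` became `sampling_strategy`; very recent releases also rename
# `base_estimator` to `estimator`), so treat the parameter names as
# version-dependent assumptions rather than a definitive port:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeClassifier

class BalancedForestLikeClassifier(BaseEstimator):
    """Hypothetical rewrite of the wrapper above for newer imbalanced-learn."""

    def __init__(self):
        self.bbc = BalancedBaggingClassifier(
            # 'sqrt' is the modern spelling of the old 'auto' for classifiers
            base_estimator=DecisionTreeClassifier(max_features='sqrt'),
            sampling_strategy='auto',
            random_state=0,
            n_estimators=50,
            n_jobs=1)

    def fit(self, X, y):
        self.bbc.fit(X, y)
        return self

    def predict_proba(self, X):
        return self.bbc.predict_proba(X)
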
def classifier_imblearn_SVM_training(_X, _Y, _weight):
    X_train, X_test, Y_train, Y_test, w_train, w_test = train_test_split(
        _X, _Y, _weight, test_size=0.2, random_state=0xdeadbeef)
    bbc = BalancedBaggingClassifier(
        base_estimator=SVC(kernel="rbf", gamma="auto"),
        n_estimators=10,
        sampling_strategy="auto",
        max_samples=80,
        replacement=False,
        random_state=0xdeadbeef)
    bbc.fit(X_train, Y_train)
    y_pred = bbc.predict(X_test)
    print("Result from bagging labeled SVM:")
    print("tn, fp, fn, tp =", confusion_matrix(Y_test, y_pred).ravel())

def clf_wrapper(classifier, X_train, y_train, X_test, y_test):
    clf = BalancedBaggingClassifier(base_estimator=classifier,
                                    ratio='auto',
                                    replacement=False,
                                    random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    cfm = confusion_matrix(y_test, y_pred)
    # Note: with scikit-learn's layout (rows = true, columns = predicted),
    # these row-normalized quantities are the per-class recalls rather than
    # textbook predictive values; the author's names are kept as-is.
    PPV = cfm[0, 0] / (cfm[0, 0] + cfm[0, 1])
    NPV = cfm[1, 1] / (cfm[1, 0] + cfm[1, 1])
    # Accuracy: correct predictions over all predictions
    ACR = (cfm[0, 0] + cfm[1, 1]) / (cfm[0, 0] + cfm[1, 1] + cfm[1, 0] + cfm[0, 1])
    return (PPV + NPV + ACR) / 3

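# A hypothetical call, assuming an existing train/test split; clf_wrapper
# accepts any scikit-learn classifier as the bagged base estimator:
# score = clf_wrapper(DecisionTreeClassifier(), X_train, y_train, X_test, y_test)
# print('mean of per-class recalls and accuracy:', score)
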
def test_balanced_bagging_classifier_with_function_sampler(replace):
    # check that we can provide a FunctionSampler in BalancedBaggingClassifier
    X, y = make_classification(
        n_samples=1_000,
        n_features=10,
        n_classes=2,
        weights=[0.3, 0.7],
        random_state=0,
    )

    def roughly_balanced_bagging(X, y, replace=False):
        """Implementation of Roughly Balanced Bagging for a binary problem."""
        # find the minority and majority classes
        class_counts = Counter(y)
        majority_class = max(class_counts, key=class_counts.get)
        minority_class = min(class_counts, key=class_counts.get)

        # compute the number of samples to draw from the majority class using
        # a negative binomial distribution
        n_minority_class = class_counts[minority_class]
        n_majority_resampled = np.random.negative_binomial(n=n_minority_class,
                                                           p=0.5)

        # draw randomly with or without replacement
        majority_indices = np.random.choice(
            np.flatnonzero(y == majority_class),
            size=n_majority_resampled,
            replace=replace,
        )
        minority_indices = np.random.choice(
            np.flatnonzero(y == minority_class),
            size=n_minority_class,
            replace=replace,
        )
        indices = np.hstack([majority_indices, minority_indices])

        return X[indices], y[indices]

    # Roughly Balanced Bagging
    rbb = BalancedBaggingClassifier(
        base_estimator=CountDecisionTreeClassifier(),
        n_estimators=2,
        sampler=FunctionSampler(func=roughly_balanced_bagging,
                                kw_args={"replace": replace}),
    )
    rbb.fit(X, y)

    for estimator in rbb.estimators_:
        class_counts = estimator[-1].class_counts_
        assert (class_counts[0] / class_counts[1]) > 0.8

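# CountDecisionTreeClassifier is a test helper that records the class counts
# it was fitted on (the assertion above reads its `class_counts_` attribute).
# A sketch of such a helper, assuming only that behavior:
from collections import Counter
from sklearn.tree import DecisionTreeClassifier

class CountDecisionTreeClassifier(DecisionTreeClassifier):
    """DecisionTreeClassifier that stores the class counts seen during fit."""

    def fit(self, X, y, sample_weight=None):
        self.class_counts_ = Counter(y)
        return super().fit(X, y, sample_weight=sample_weight)
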
def ranking_by_matthew_punishment_rf(self):
    std = np.zeros(len(self.X.columns),)
    rankings = np.zeros(len(self.X.columns),)
    for _ in range(self.loops):
        seed = randint(0, 10000)
        # Split the train/validation set with a seed generated randomly on each loop.
        X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y,
                                                        test_size=0.30,
                                                        random_state=seed)
        # Initialize a balanced bagging ensemble (used here in place of a random forest).
        rf = BalancedBaggingClassifier(n_estimators=50, random_state=0)
        # Fit the ensemble and compute a baseline Matthews correlation coefficient (MCC).
        rf.fit(X_train, y_train)
        mcc_original = matthews_corrcoef(y_fr, rf.predict(X_fr))
        # Two lists to collect values from the inner loop.
        mcc_drops = []
        columnsrf = []
        for x in self.X.columns:
            X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y,
                                                            test_size=0.30,
                                                            random_state=seed)
            # Drop a different column on each iteration.
            X_train = X_train.drop([x], axis=1)
            X_fr = X_fr.drop([x], axis=1)
            # Refit the ensemble, this time with the training set lacking one feature.
            rf.fit(X_train, y_train)
            mcc = matthews_corrcoef(y_fr, rf.predict(X_fr))
            # Record the dropped column ...
            columnsrf.append(x)
            # ... and the drop (or gain) in MCC observed when the feature was missing.
            mcc_drops.append(mcc_original - mcc)
        outcome = np.array(mcc_drops)
        rankings = np.add(outcome, rankings)
        std = np.vstack((outcome, std))

    rankings = np.true_divide(rankings, self.loops)
    std = np.delete(std, -1, axis=0)
    std = np.std(std, axis=0)
    std = np.dstack((columnsrf, std))
    std = pd.DataFrame(data=np.squeeze(std, axis=0),
                       columns=['Categories', 'SD_of_matt_punishment'])
    featuresranks = np.dstack((columnsrf, rankings))
    borda = pd.DataFrame(data=np.squeeze(featuresranks, axis=0),
                         columns=['Categories', 'average-mtt-punishment'])
    borda['ranking'] = borda['average-mtt-punishment'].rank(ascending=False)
    borda = borda.merge(std, on='Categories')
    borda.sort_values(by='average-mtt-punishment', inplace=True, ascending=False)
    return borda

def test_probability():
    # Predict probabilities.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        ensemble = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(),
            random_state=0).fit(X_train, y_train)

        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)),
        )
        assert_array_almost_equal(
            ensemble.predict_proba(X_test),
            np.exp(ensemble.predict_log_proba(X_test)),
        )

        # Degenerate case, where some classes are missing
        ensemble = BalancedBaggingClassifier(
            base_estimator=LogisticRegression(solver="lbfgs",
                                              multi_class="auto"),
            random_state=0,
            max_samples=5,
        )
        ensemble.fit(X_train, y_train)

        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)),
        )
        assert_array_almost_equal(
            ensemble.predict_proba(X_test),
            np.exp(ensemble.predict_log_proba(X_test)),
        )

def test_balanced_bagging_classifier_error(params):
    # Test that it gives a proper exception on deficient input.
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50})
    base = DecisionTreeClassifier()
    clf = BalancedBaggingClassifier(base_estimator=base, **params)
    with pytest.raises(ValueError):
        clf.fit(X, y)

    # Test support of decision_function
    assert not hasattr(BalancedBaggingClassifier(base).fit(X, y),
                       "decision_function")

def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True,
                                    random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    # modify X to nonsense values, this should not change anything
    X_train += 1.

    assert_warns_message(UserWarning,
                         "Warm-start fitting without increasing n_estimators"
                         " does not",
                         clf.fit, X_train, y_train)
    assert_array_equal(y_pred, clf.predict(X_test))

def impute_by_model(df, df_test, impList, classifier):
    # convert ' ?' to NaN so that those values become -1 when transformed to numerical
    df, df_test = unknown_to_NAN(df, df_test)

    # create a new df by dropping all rows having NaN values;
    # used only to build the model for imputation
    dropna_df = df.dropna(how='any').reset_index(drop=True)

    # before converting both df and df_test to numerical, replace the value below
    # with its column mode, so that native_country gets the same numerical value
    # for each country (.replace is not in place, so assign the result back)
    df['native_country'] = df['native_country'].replace(' Holand-Netherlands',
                                                        ' United-States')

    # convert to numerical
    num_dropna_df = df2num(dropna_df, headers)
    num_df_test = df2num(df_test, headers)
    num_df = df2num(df, headers)

    # learn the model on the dataset with rows containing missing values dropped
    Xtr_train = num_dropna_df[impList[0]].values
    ytr_train = num_dropna_df[impList[1]].values
    # column with missing values from the training data, used to impute the training set
    Xtr_test = num_df[impList[0]].values
    # column with missing values from the test data, used to impute the test set
    Xt_test = num_df_test[impList[0]].values

    clf = BalancedBaggingClassifier(base_estimator=classifier,
                                    ratio='auto',
                                    random_state=0)
    clf.fit(Xtr_train, ytr_train)

    # impute training data
    ytr_pred = clf.predict(Xtr_test)
    lst = df.loc[num_df[impList[1]] == -1, impList[1]].index.tolist()
    num_df.loc[lst, impList[1]] = ytr_pred[lst]

    # impute test data
    yt_pred = clf.predict(Xt_test)
    lstt = df_test.loc[num_df_test[impList[1]] == -1, impList[1]].index.tolist()
    num_df_test.loc[lstt, impList[1]] = yt_pred[lstt]

    return df, df_test

def fit(self, X, Y, sample_weight=None):
    import sklearn.tree

    if self.estimator is None:
        self.max_depth = int(self.max_depth)
        self.estimator = sklearn.tree.DecisionTreeClassifier(
            max_depth=self.max_depth)

    from imblearn.ensemble import BalancedBaggingClassifier
    estimator = BalancedBaggingClassifier(
        base_estimator=self.estimator,
        n_estimators=self.n_estimators,
        max_features=self.max_features,
        bootstrap=self.bootstrap,
        bootstrap_features=self.bootstrap_features,
        sampling_strategy=self.sampling_strategy,
        replacement=self.replacement,
        n_jobs=self.n_jobs,
        random_state=self.random_state)
    estimator.fit(X, Y)
    self.estimator = estimator
    return self

def Model_Building():
    X = pd.read_csv(r'C:\Users\Dell\Desktop\Tookitaki\Train.csv', engine='python')
    Y_train = X['Bad_label'].values
    X.drop(['customer_no', 'Bad_label'], axis=1, inplace=True)
    X_train = X.values

    X = pd.read_csv(r'C:\Users\Dell\Desktop\Tookitaki\Test.csv', engine='python')
    Y_test = X['Bad_label'].values
    X.drop(['customer_no', 'Bad_label'], axis=1, inplace=True)
    X_test = X.values

    # NOTE: imp2 is fitted on the test set, which leaks test statistics;
    # fitting imp1 on the training data and transforming both sets would avoid this.
    imp1.fit(X_train)
    X_train = imp1.transform(X_train).astype(float)
    # print(X_train)
    imp2.fit(X_test)
    X_test = imp2.transform(X_test).astype(float)

    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    # scale the test set with the training-set statistics; refitting the scaler
    # on the test set would leak information
    X_test = scaler.transform(X_test)
    print(X_train.shape)

    bbc = BalancedBaggingClassifier(
        base_estimator=RandomForestClassifier(n_estimators=100),
        ratio='auto',
        replacement=False,
        random_state=0,
        bootstrap_features=False)

    clf = SelectKBest(mutual_info_classif, k=49)
    X_train = clf.fit_transform(X_train, Y_train)
    X_test = clf.transform(X_test)

    bbc.fit(X_train, Y_train)
    y_pred = bbc.predict(X_test)
    print(confusion_matrix(Y_test, y_pred))
    print(classification_report(Y_test, y_pred))
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_pred, pos_label=1)
    auc_score = metrics.auc(fpr, tpr)
    print('auc score =', auc_score)
    print('gini score =', 2 * auc_score - 1)

def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)

    # remap the y outside of the BalancedBaggingClassifier
    # _, y = np.unique(y, return_inverse=True)
    bagging = BalancedBaggingClassifier(
        LogisticRegression(solver="lbfgs", multi_class="auto"),
        max_samples=0.5,
        max_features=0.5,
        random_state=1,
        bootstrap=False,
    )
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert len(estimators_samples) == len(estimators)
    assert len(estimators_samples[0]) == len(X) // 2
    assert estimators_samples[0].dtype.kind == "i"

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.steps[-1][1].coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.steps[-1][1].coef_

    assert_allclose(orig_coefs, new_coefs)

def cross_validation_another(x):
    with open('../data/conv_pred/super_train_data_day_' + 'A' + '.pickle', 'rb') as f:
        data = pickle.load(f)
    with open('../data/conv_pred/super_test_data_day_' + 'A' + '.pickle', 'rb') as f:
        test = pickle.load(f)

    v = DictVectorizer()
    X_train = v.fit_transform(data['X'])
    y_train = np.array(data['y'])
    X_test = v.transform(test['X'])
    y_test = np.array(test['y'])

    zero = 0
    one = 0
    for i in y_train:
        if i == 0:
            zero += 1
        else:
            one += 1
    print(zero)
    print(one)

    model = BalancedBaggingClassifier(n_estimators=100, n_jobs=8,
                                      max_samples=0.6)
    # model = xgb.XGBClassifier(n_estimators=500, max_delta_step=1, scale_pos_weight=zero / one)
    model.fit(X_train, y_train)
    predict = model.predict_proba(X_test)
    precision, recall, f_value, all_pre = eval(y_test, predict)
    all_prec = all_pre
    fscore = precision
    ftscore = recall
    all_f_value = f_value

    print('\n')
    print('final precision : ', str(fscore))
    print('final recall : ', str(ftscore))
    print('final f-value : ', str(all_f_value))
    print('final all_precision : ', str(all_prec))

def train_model(data):
    dataset = pd.get_dummies(
        data,
        columns=['Employment.Type', 'Driving_flag', 'Bureau_bin'],
        drop_first=True)
    # dataset = pd.get_dummies(data, columns=['Employment.Type', 'Driving_flag'], drop_first=True)
    X = dataset.drop('loan_default', axis=1)
    y = dataset['loan_default']
    # X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, train_size=.8, stratify=y)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=0,
                                                        stratify=y)

    rfc = RandomForestClassifier(class_weight='balanced', n_estimators=100)
    rfc.fit(X_train, y_train)
    lr = LogisticRegression(class_weight='balanced')
    lr.fit(X_train, y_train)
    xgb = XGBClassifier(scale_pos_weight=3.4)
    xgb.fit(X_train, y_train)
    brfc = BalancedRandomForestClassifier(max_depth=4, random_state=0)
    brfc.fit(X_train, y_train)
    bbc = BalancedBaggingClassifier(n_estimators=100, random_state=42)
    bbc.fit(X_train, y_train)

    models = [rfc, lr, xgb, brfc, bbc]
    model_names = [
        'RandomForestClassifier', 'LogisticRegression', 'XGBClassifier',
        'BalancedRandomForestClassifier', 'BalancedBaggingClassifier'
    ]
    for m, n in zip(models, model_names):
        print('Classifier: ' + n)
        predict_evaluate_classifier(X_test, y_test, m)
    return rfc, lr, xgb, brfc, bbc

def test_warm_start(random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = BalancedBaggingClassifier(n_estimators=n_estimators,
                                               random_state=random_state,
                                               warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert len(clf_ws) == n_estimators

    clf_no_ws = BalancedBaggingClassifier(n_estimators=10,
                                          random_state=random_state,
                                          warm_start=False)
    clf_no_ws.fit(X, y)

    assert (set([pipe.steps[-1][1].random_state for pipe in clf_ws]) ==
            set([pipe.steps[-1][1].random_state for pipe in clf_no_ws]))

def test_warm_start_equivalence():
    # warm started classifier with 5+5 estimators should be equivalent to
    # one classifier with 10 estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf_ws = BalancedBaggingClassifier(n_estimators=5, warm_start=True,
                                       random_state=3141)
    clf_ws.fit(X_train, y_train)
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X_train, y_train)
    y1 = clf_ws.predict(X_test)

    clf = BalancedBaggingClassifier(n_estimators=10, warm_start=False,
                                    random_state=3141)
    clf.fit(X_train, y_train)
    y2 = clf.predict(X_test)

    assert_array_almost_equal(y1, y2)

class Models(object):
    """
    Machine-learning based text classification models.
    """

    def __init__(self, model_path=None, feature_engineer=False, train_mode=True):
        # Load the image models (resnet, resnext, wide resnet); move them to cuda if available
        self.res_model = torchvision.models.resnet152(pretrained=True).to(
            config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True).to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(
            pretrained=True).to(config.device)

        # Load the bert model; move it to cuda if available
        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path +
                                              '/model/bert').to(config.device)

        # Initialize the MLData class; debug_mode=True uses a subset of the data,
        # train_mode indicates whether we are training
        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)

        # If not training, load the trained model for prediction
        if not train_mode:
            self.load(model_path)
            labelNameToIndex = json.load(
                open(config.root_path + '/data/label2id.json',
                     encoding='utf-8'))
            self.ix2label = {v: k for k, v in labelNameToIndex.items()}
        else:
            # If feature_engineer, train with lightgbm; otherwise compare classic ML models
            if feature_engineer:
                self.model = lgb.LGBMClassifier(objective='multiclass',
                                                n_jobs=10,
                                                num_class=33,
                                                num_leaves=30,
                                                reg_alpha=10,
                                                reg_lambda=200,
                                                max_depth=3,
                                                learning_rate=0.05,
                                                n_estimators=2000,
                                                bagging_freq=1,
                                                bagging_fraction=0.9,
                                                feature_fraction=0.8,
                                                seed=1440)
            else:
                self.models = [
                    RandomForestClassifier(n_estimators=500,
                                           max_depth=5,
                                           random_state=0),
                    LogisticRegression(solver='liblinear', random_state=0),
                    MultinomialNB(),
                    SVC(),
                    lgb.LGBMClassifier(objective='multiclass',
                                       n_jobs=10,
                                       num_class=33,
                                       num_leaves=30,
                                       reg_alpha=10,
                                       reg_lambda=200,
                                       max_depth=3,
                                       learning_rate=0.05,
                                       n_estimators=2000,
                                       bagging_freq=1,
                                       bagging_fraction=0.8,
                                       feature_fraction=0.8),
                ]

    def feature_engineer(self):
        print(" generate embedding feature ")
        # Get tfidf and word2vec features; word2vec is not aggregated in any way
        train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                                   self.ml_data.tfidf,
                                                   self.ml_data.w2v)
        # train is a pandas object; the columns produced by get_embedding_feature are:
        # w2v: each word in a sentence replaced by its w2v vector; each row is [seq, 300]
        # w2v_label_mean: features relating the sentence embedding ([seq, 300]) to the label; each row is [300]
        # w2v_label_max: features relating the sentence embedding ([seq, 300]) to the label; each row is [300]
        # w2v_mean: [seq, 300] -> [300]
        # w2v_max: [seq, 300] -> [300]
        # w2v_win_2_mean: features extracted with a sliding window; each row is [300]
        # w2v_win_3_mean
        # w2v_win_4_mean
        # w2v_win_2_max
        # w2v_win_3_max
        # w2v_win_4_max
        test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                                 self.ml_data.tfidf,
                                                 self.ml_data.w2v)

        print("generate basic feature ")
        # Get the basic NLP features
        train = get_basic_feature(train)
        test = get_basic_feature(test)

        print("generate lda feature ")
        # Generate bag-of-words format data
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        # one row of test['bow']: [(10, 1), (78, 1), (162, 3), (177, 1), (192, 1)...]
        # Get the lda embedding on top of the bag-of-words representation
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc),
                test['bow']))
        # one row of test['lda']: [0.002929521957412362, 0.0024772200267761946, ...]
        # there are 30 topics; each row is the probability distribution over the 30 topics

        print("generate modal feature ")
        # Load the book cover files
        cover = os.listdir(config.book_cover_path)
        # Match the book covers by title
        train['cover'] = train['title'].progress_apply(
            lambda x: config.book_cover_path + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test.title.progress_apply(
            lambda x: config.book_cover_path + x + '.jpg'
            if x + '.jpg' in cover else '')

        # Get the embedding of each cover
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        train['resnext_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        test['resnext_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        train['wide_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        test['wide_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        print("generate bert feature ")
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))

        # print("generate autoencoder feature ")
        # Get the autoencoder embedding (from the encoder, not the decoder)
        # TODO
        # train_ae = get_autoencoder_feature(
        #     train,
        #     self.ml_data.ae.max_features,
        #     self.ml_data.ae.max_len,
        #     self.ml_data.ae.encoder,
        #     tokenizer=self.ml_data.ae.tokenizer)
        # test_ae = get_autoencoder_feature(
        #     test,
        #     self.ml_data.ae.max_features,
        #     self.ml_data.ae.max_len,
        #     self.ml_data.ae.encoder,
        #     tokenizer=self.ml_data.ae.tokenizer)

        print("formate data")
        # Concatenate all features together
        train = formate_data(train, train_tfidf)  # train = formate_data(train, train_tfidf, train_ae)
        test = formate_data(test, test_tfidf)  # test = formate_data(test, test_tfidf, test_ae)

        # Generate the training and test data
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        X_train = train[cols]
        X_test = test[cols]
        print(X_test)
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        # Use grid search or bayesian optimization to find the best parameters
        if search_method == 'grid':
            print("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            print("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            print("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        print("get all feature")
        # Generate all features
        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer()
        model_name = None
        # Choose how to handle imbalanced data: over-sampling, under-sampling, or ensemble
        if imbalance_method == 'over_sampling':
            print("Use SMOTE deal with unbalance data ")
            # https://www.zhihu.com/question/269698662
            # https://www.cnblogs.com/kamekin/p/9824294.html
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            # NOTE: this second call resamples the training data again and
            # overwrites the test set, which looks unintended.
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_train, self.y_train)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            print("Use ClusterCentroids deal with unbalance data")
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'

        print('search best param')
        # Use set_params to apply the best parameters found by the search
        if imbalance_method != 'ensemble':
            param = self.param_search(search_method=search_method)
            param['params']['num_leaves'] = int(param['params']['num_leaves'])
            param['params']['max_depth'] = int(param['params']['max_depth'])
            self.model = self.model.set_params(**param['params'])

        print('fit model ')
        # Train the model and report its scores
        self.model.fit(self.X_train, self.y_train)
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)
        # Print the precision on the training set
        print('Train accuracy %s' % per)
        # Print the accuracy on the test set
        print('test accuracy %s' % acc)
        # Print the recall
        print('test recall %s' % recall)
        # Print the F1-score
        print('test F1_score %s' % f1)
        self.save(model_name)

    def model_select(self,
                     X_train,
                     X_test,
                     y_train,
                     y_test,
                     feature_method='tf-idf'):
        # Compare word representations (tfidf, word2vec, fasttext, ...) across
        # the common machine-learning models
        for model in self.models:
            model_name = model.__class__.__name__
            print(model_name)
            clf = model.fit(X_train, y_train)
            Test_predict_label = clf.predict(X_test)
            Train_predict_label = clf.predict(X_train)
            per, acc, recall, f1 = get_score(y_train, y_test,
                                             Train_predict_label,
                                             Test_predict_label)
            # Print the accuracy on the training set
            print(model_name + '_' + 'Train accuracy %s' % per)
            # Print the accuracy on the test set
            print(model_name + '_' + ' test accuracy %s' % acc)
            # Print the recall
            print(model_name + '_' + 'test recall %s' % recall)
            # Print the F1-score
            print(model_name + '_' + 'test F1_score %s' % f1)

    def process(self, title, desc):
        # Process the data and generate the features needed for prediction
        df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
        df['text'] = df['title'] + df['desc']
        df["queryCut"] = df["text"].apply(query_cut)
        df["queryCutRMStopWord"] = df["queryCut"].apply(
            lambda x: [word for word in x if word not in get_stop_word_list()])

        df_tfidf, df = get_embedding_feature(df, self.ml_data.tfidf,
                                             self.ml_data.w2v)

        print("generate basic feature ")
        df = get_basic_feature(df)

        print("generate modal feature ")
        df['cover'] = ''
        df['res_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        df['resnext_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        df['wide_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        print("generate bert feature ")
        df['bert_embedding'] = df.text.progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))

        print("generate lda feature ")
        df['bow'] = df['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        df['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc), df.bow))

        print("generate autoencoder feature ")
        # df_ae = get_autoencoder_feature(df,
        #                                 self.ml_data.ae.max_features,
        #                                 self.ml_data.ae.max_len,
        #                                 self.ml_data.ae.encoder,
        #                                 tokenizer=self.ml_data.ae.tokenizer)

        print("formate data")
        df['labelIndex'] = 1
        df = formate_data(df, df_tfidf)  # , df_ae)
        cols = [x for x in df.columns if str(x) not in ['labelIndex']]
        X_train = df[cols]
        return X_train

    def predict(self, title, desc):
        '''
        @description: predict the book category from the input title and desc
        @param {type}
        title: input
        desc: input
        @return: label
        '''
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        '''
        @description: save model
        @param {type}
        model_name: file name for saving
        @return: None
        '''
        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        '''
        @description: load model
        @param {type}
        path: model path
        @return: None
        '''
        self.model = joblib.load(path)

y_pred = model.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Oversampling')

# Strategy: combine resampling with SMOTE-Tomek
# Next we try a widely used technique that applies an undersampling algorithm
# and an oversampling algorithm to the dataset at the same time. Here we use
# SMOTE for oversampling: it looks for close neighboring points and adds new
# points "in a straight line" between them. And we use Tomek for
# undersampling, which removes nearest-neighbor pairs of different classes,
# exposing the decision boundary (the border zone between our classes) more clearly.
os_us = SMOTETomek(sampling_strategy=0.5)
X_train_res, y_train_res = os_us.fit_resample(X_train, y_train)

print(f'Distribution before resampling {Counter(y_train)}')
print(f'Distribution after resampling {Counter(y_train_res)}')

model = run_model(X_train_res, X_test, y_train_res, y_test)
y_pred = model.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Smote-Tomek')

# Strategy: model ensemble with balancing
# For this strategy we use an ensemble classifier based on bagging, with a
# DecisionTree as the model. Let's see how it performs:
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)

# Train the classifier
bbc.fit(X_train, y_train)
y_pred = bbc.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Ensamble BBC')

bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                       random_state=0)
bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)
print(confusion_matrix(y_test, y_pred))

'''
BalancedBaggingClassifier allows each subset to be resampled before training
each base estimator. In short, it combines the EasyEnsemble sampler with a
classifier such as BaggingClassifier.
'''
from imblearn.ensemble import BalancedBaggingClassifier

bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                ratio='auto',
                                replacement=False,
                                random_state=0)
bbc.fit(X, y)
y_pred = bbc.predict(X_test)
print(confusion_matrix(y_test, y_pred))

'''
The imblearn.datasets package complements sklearn.datasets nicely.
It mainly offers two things: (i) a collection of imbalanced datasets for
testing; (ii) a tool to turn originally balanced data into imbalanced data.
'''

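# For instance, make_imbalance (used in the test snippets above) turns the
# balanced iris dataset into one with a chosen class distribution; the class
# counts below are illustrative, not prescribed:
from collections import Counter
from sklearn.datasets import load_iris
from imblearn.datasets import make_imbalance

iris = load_iris()
X_imb, y_imb = make_imbalance(iris.data, iris.target,
                              sampling_strategy={0: 20, 1: 25, 2: 50},
                              random_state=0)
print(Counter(y_imb))  # e.g. Counter({2: 50, 1: 25, 0: 20})
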
random_state=1)

y_train = y_train.squeeze()
y_test = y_test.squeeze()

# ### Fit the model
# Fit the best model based on tuned parameters
GBM_clf = ensemble.GradientBoostingClassifier(learning_rate=0.05,
                                              max_depth=3,
                                              n_estimators=100)
best_clf = BalancedBaggingClassifier(base_estimator=GBM_clf,
                                     ratio='auto',
                                     replacement=False,
                                     random_state=0)

# Fit the model and check the confusion matrix
best_clf.fit(X_train, y_train)

# Check the R-style confusion matrix; convert to a list first, since the
# confusion matrix cannot be created from the object-dtype array
y_pred = best_clf.predict(X_test).tolist()
confusionMatrix(y_pred, y_test).show()  # show the confusion matrix

# Classification report
print('Classification Report:\n',
      classification_report(y_test, y_pred, target_names=["AS", "PsA", "RA"]))

### prepare input for ROC
# number of indications: if 2 then n_class=1, if >2 then the number of indications
n_classes = len(y_train.unique())
y_score = best_clf.fit(X_train, y_train).decision_function(X_test)

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2,
    stratify=Y)  # stratify by Y to keep class proportions (sklearn's `shuffle` expects a boolean, not an array)

cl = BalancedBaggingClassifier(
    base_estimator=QuadraticDiscriminantAnalysis(reg_param=0.11),
    n_estimators=50,
    max_samples=0.6,
    max_features=0.7,
    n_jobs=-1,
    bootstrap_features=True,
    oob_score=False)
cl.fit(X_train, Y_train)

predictions = cl.predict(X_train)
# print(X_train.shape, Y_train.shape, predictions.shape)
# print(list(zip(Y_train, predictions)))
print('\n\nModel Train: f1 = {0} '.format(
    f1_score(Y_train, predictions, average='micro')))

predictions = cl.predict(X_test)
print('\nModel Test: f1 = {0} '.format(
    f1_score(Y_test, predictions, average='micro')))

# exit()
cl = BalancedBaggingClassifier(
    base_estimator=QuadraticDiscriminantAnalysis(reg_param=0.11),

classifier_3 = RandomForestClassifier(n_estimators=5, criterion='entropy')
classifier_3.fit(X_train, Y_train)

# Fitting classifier to the training data: Model 4
from sklearn.linear_model import LogisticRegression
classifier_4 = LogisticRegression(penalty='l1', random_state=0)
classifier_4.fit(X_train, Y_train)

# Fitting Balanced Bagging Classifier to the training data: Model 5
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.ensemble import RandomForestClassifier
classifier_5 = BalancedBaggingClassifier(
    base_estimator=RandomForestClassifier(criterion='entropy'),
    n_estimators=5,
    bootstrap=True)
classifier_5.fit(X_train, Y_train)

# Fitting Decision Tree to the training data: Model 6
from sklearn.tree import DecisionTreeClassifier
classifier_6 = DecisionTreeClassifier()
classifier_6.fit(X_train, Y_train)

# In[ ]:

# Predicting the results
y_pred_1 = classifier_1.predict(X_test)
y_pred_2 = classifier_2.predict(X_test)
y_pred_3 = classifier_3.predict(X_test)
y_pred_4 = classifier_4.predict(X_test)
y_pred_5 = classifier_5.predict(X_test)
y_pred_6 = classifier_6.predict(X_test)

# use original features
X_train_o = X_train[:, 0:original_len]
X_test_o = X_test[:, 0:original_len]
X_train_n = X_train[:, original_len:]
X_test_n = X_test[:, original_len:]

for clf, clf_name in zip(clf_list, clf_name_list):
    print('processing', clf_name, 'round', i + 1)
    if clf_name != 'xgb':
        clf = BalancedBaggingClassifier(base_estimator=clf,
                                        ratio='auto',
                                        replacement=False)

    # fully supervised
    clf.fit(X_train_o, y_train.ravel())
    y_pred = clf.predict_proba(X_test_o)
    roc_score = roc_auc_score(y_test, y_pred[:, 1])
    prec_n = get_precn(y_test, y_pred[:, 1])
    result_dict[clf_name + 'ROC' + 'o'].append(roc_score)
    result_dict[clf_name + 'PRC@n' + 'o'].append(prec_n)

    # unsupervised
    clf.fit(X_train_n, y_train.ravel())
    y_pred = clf.predict_proba(X_test_n)
    roc_score = roc_auc_score(y_test, y_pred[:, 1])
    prec_n = get_precn(y_test, y_pred[:, 1])

plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')

ozone = fetch_datasets()['ozone_level']
X, y = ozone.data, ozone.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

bagging = BaggingClassifier(random_state=0)
balanced_bagging = BalancedBaggingClassifier(random_state=0)

print('Class distribution of the training set: {}'.format(Counter(y_train)))

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

print('Class distribution of the test set: {}'.format(Counter(y_test)))

print('Classification results using a bagging classifier on imbalanced data')
y_pred_bagging = bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_bagging))
cm_bagging = confusion_matrix(y_test, y_pred_bagging)
plt.figure()
plot_confusion_matrix(cm_bagging,
                      classes=np.unique(ozone.target),
                      title='Confusion matrix using BaggingClassifier')

print('Classification results using a bagging classifier on balanced data')
y_pred_balanced_bagging = balanced_bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_balanced_bagging))
cm_balanced_bagging = confusion_matrix(y_test, y_pred_balanced_bagging)

title='Decision tree')

###############################################################################
# Classification using bagging classifier with and without sampling
###############################################################################
# Instead of using a single tree, we will check if an ensemble of decision
# trees can actually alleviate the issue induced by the class imbalance.
# First, we will use a bagging classifier and its counterpart, which
# internally uses random under-sampling to balance each bootstrap sample.

bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0,
                                             n_jobs=-1)

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

y_pred_bc = bagging.predict(X_test)
y_pred_bbc = balanced_bagging.predict(X_test)

###############################################################################
# Balancing each bootstrap sample allows a significant increase in the
# balanced accuracy and the geometric mean.

print('Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    balanced_accuracy_score(y_test, y_pred_bc),
    geometric_mean_score(y_test, y_pred_bc)))
cm_bagging = confusion_matrix(y_test, y_pred_bc)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_bagging,
                      classes=np.unique(satimage.target),
                      ax=ax[0],

class Models(object):
    def __init__(self, model_path=None, feature_engineer=False, train_mode=True):
        '''
        @description: initialize the class, e.g. the model
        @param {type}:
        feature_engineer: whether to use feature engineering; if `False`, compare common ML models
        res_model: res network model
        resnext_model: resnext network model
        wide_model: wide res network model
        bert: bert model
        ml_data: new MLData instance
        @return: no return
        '''
        # Load the image models (resnet, resnext, wide resnet); move them to cuda if available
        ###########################################
        #          TODO: module 2 task 2.1        #
        ###########################################
        self.res_model = torchvision.models.resnet152(
            pretrained=True)  # res model for modal feature [1 * 1000]
        self.res_model = self.res_model.to(config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True)
        self.resnext_model = self.resnext_model.to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(pretrained=True)
        self.wide_model = self.wide_model.to(config.device)

        # Load the bert model; move it to cuda if available
        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path + '/model/bert')
        self.bert = self.bert.to(config.device)

        # Initialize the MLData class; debug_mode=True uses a subset of the data,
        # train_mode indicates whether we are training
        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)

        # If training, build the lgb model; otherwise load the trained model for prediction
        if train_mode:
            self.model = lgb.LGBMClassifier(objective='multiclass',
                                            n_jobs=10,
                                            num_class=33,
                                            num_leaves=30,
                                            reg_alpha=10,
                                            reg_lambda=200,
                                            max_depth=3,
                                            learning_rate=0.05,
                                            n_estimators=2000,
                                            bagging_freq=1,
                                            bagging_fraction=0.9,
                                            feature_fraction=0.8,
                                            seed=1440)
        else:
            self.load(model_path)
            labelNameToIndex = json.load(
                open(config.root_path + '/data/label2id.json',
                     encoding='utf-8'))
            self.ix2label = {v: k for k, v in labelNameToIndex.items()}

    def feature_engineer(self):
        '''
        @description: this function builds all kinds of features
        @param {type} None
        @return:
        X_train, features of the train set
        X_test, features of the test set
        y_train, labels of the train set
        y_test, labels of the test set
        '''
        logger.info("generate embedding feature ")
        # Get tfidf and word2vec features; word2vec is not aggregated in any way
        ###########################################
        #          TODO: module 3 task 1.1        #
        ###########################################
        train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                                   self.ml_data.em.tfidf,
                                                   self.ml_data.em.w2v)
        test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                                 self.ml_data.em.tfidf,
                                                 self.ml_data.em.w2v)

        logger.info("generate autoencoder feature ")
        # Get the autoencoder embedding (from the encoder, not the decoder)
        train_ae = get_autoencoder_feature(
            train,
            self.ml_data.em.ae.max_features,
            self.ml_data.em.ae.max_len,
            self.ml_data.em.ae.encoder,
            tokenizer=self.ml_data.em.ae.tokenizer)
        test_ae = get_autoencoder_feature(
            test,
            self.ml_data.em.ae.max_features,
            self.ml_data.em.ae.max_len,
            self.ml_data.em.ae.encoder,
            tokenizer=self.ml_data.em.ae.tokenizer)

        logger.info("generate basic feature ")
        # Get the basic NLP features
        train = get_basic_feature(train)
        test = get_basic_feature(test)

        logger.info("generate modal feature ")
        # Load the book cover files
        cover = os.listdir(config.root_path + '/data/book_cover/')
        # Match the book covers by title
        train['cover'] = train['title'].progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test['title'].progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')

        # Get the embedding of each cover
        ###########################################
        #          TODO: module 3 task 1.2        #
        ###########################################
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        train['resnext_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        test['resnext_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        train['wide_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        test['wide_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        logger.info("generate bert feature ")
        ###########################################
        #          TODO: module 3 task 1.3        #
        ###########################################
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))

        logger.info("generate lda feature ")
        ###########################################
        #          TODO: module 3 task 1.4        #
        ###########################################
        # Generate bag-of-words format data
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        # Get the lda embedding on top of the bag-of-words representation
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                test['bow']))

        logger.info("formate data")
        # Concatenate all features together
        train = formate_data(train, train_tfidf, train_ae)
        test = formate_data(test, test_tfidf, test_ae)

        # Generate the training and test data
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        X_train = train[cols]
        X_test = test[cols]
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        '''
        @description: use a parameter-search technique to find the best parameters
        @param {type}
        search_method: two options, grid or bayesian optimization
        @return: None
        '''
        # Use grid search or bayesian optimization to find the best parameters
        if search_method == 'grid':
            logger.info("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            logger.info("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            logger.info("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        '''
        @description: handle unbalanced data, then search for the best parameters
        @param {type}
        imbalance_method: three options, under_sampling for ClusterCentroids,
                          SMOTE for over_sampling, ensemble for BalancedBaggingClassifier
        search_method: two options, grid or bayesian optimization
        @return: None
        '''
        logger.info("get all feature")
        # Generate all features
        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer()
        model_name = None
        # Choose how to handle imbalanced data: over-sampling, under-sampling, or ensemble
        ###########################################
        #          TODO: module 4 task 1.1        #
        ###########################################
        if imbalance_method == 'over_sampling':
            logger.info("Use SMOTE deal with unbalance data ")
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            # NOTE: this second call resamples the training data again and
            # overwrites the test set, which looks unintended.
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_train, self.y_train)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            logger.info("Use ClusterCentroids deal with unbalance data ")
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'

        logger.info('search best param')
        # Use set_params to apply the best parameters found by the search
        if imbalance_method != 'ensemble':
            ###########################################
            #          TODO: module 4 task 1.2        #
            ###########################################
            # param = self.param_search(search_method=search_method)
            # param['params']['num_leaves'] = int(param['params']['num_leaves'])
            # param['params']['max_depth'] = int(param['params']['max_depth'])
            param = {}
            param['params'] = {}
            param['params']['num_leaves'] = 3
            param['params']['max_depth'] = 5
            self.model = self.model.set_params(**param['params'])

        logger.info('fit model ')
        # Train the model and report its scores
        self.model.fit(self.X_train, self.y_train)
        ###########################################
        #          TODO: module 4 task 1.3        #
        ###########################################
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)
        # Print the precision on the training set
        logger.info('Train accuracy %s' % per)
        # Print the accuracy on the test set
        logger.info('test accuracy %s' % acc)
        # Print the recall
        logger.info('test recall %s' % recall)
        # Print the F1-score
        logger.info('test F1_score %s' % f1)
        self.save(model_name)

    def process(self, title, desc):
        ###########################################
        #          TODO: module 5 task 1.1        #
        ###########################################
        # Process the data and generate the features needed for prediction
        df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
        df['text'] = df['title'] + df['desc']
        df["queryCut"] = df["text"].apply(query_cut)
        df["queryCutRMStopWord"] = df["queryCut"].apply(
            lambda x:
            [word for word in x if word not in self.ml_data.em.stopWords])

        df_tfidf, df = get_embedding_feature(df, self.ml_data.em.tfidf,
                                             self.ml_data.em.w2v)

        print("generate basic feature ")
        df = get_basic_feature(df)

        print("generate modal feature ")
        df['cover'] = ''
        df['res_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        df['resnext_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        df['wide_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        print("generate bert feature ")
        df['bert_embedding'] = df.text.progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))

        print("generate lda feature ")
        df['bow'] = df['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        df['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                df.bow))

        print("generate autoencoder feature ")
        df_ae = get_autoencoder_feature(df,
                                        self.ml_data.em.ae.max_features,
                                        self.ml_data.em.ae.max_len,
                                        self.ml_data.em.ae.encoder,
                                        tokenizer=self.ml_data.em.ae.tokenizer)

        print("formate data")
        df['labelIndex'] = 1
        df = formate_data(df, df_tfidf, df_ae)
        cols = [x for x in df.columns if str(x) not in ['labelIndex']]
        X_train = df[cols]
        return X_train

    def predict(self, title, desc):
        '''
        @description: predict the book category from the input title and desc
        @param {type}
        title: input
        desc: input
        @return: label
        '''
        ###########################################
        #          TODO: module 5 task 1.1        #
        ###########################################
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        '''
        @description: save model
        @param {type}
        model_name: file name for saving
        @return: None
        '''
        ###########################################
        #          TODO: module 4 task 1.4        #
        ###########################################
        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        '''
        @description: load model
        @param {type}
        path: model path
        @return: None
        '''
        ###########################################
        #          TODO: module 4 task 1.4        #
        ###########################################
        self.model = joblib.load(path)