# Module-level imports required by the methods below. Environment, self.stat()
# and self.model_prepare_data() are project-local helpers defined elsewhere.
import pickle
from timeit import default_timer as timer  # assumed source of timer()

import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split


def model_train(self):
    env = Environment()
    data = self.stat()
    t_start = timer()  # start time; the timing is not reported in this snippet
    y, X = self.model_prepare_data(data)

    seed = 241
    scoring = 'accuracy'
    n_splits = 4
    frac_test_size = 0.25

    # Cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # Alternative models tried earlier:
    # clf = DecisionTreeClassifier(criterion='gini', random_state=seed)
    # clf = GradientBoostingClassifier(n_estimators=50)
    model = xgb.XGBClassifier(n_estimators=400, max_depth=24,
                              colsample_bytree=1,  # was "colsample", not a valid XGBoost parameter
                              subsample=1,
                              random_state=seed)   # "seed=" is a deprecated alias
    cv_scores = cross_val_score(model, X, y, cv=kf, scoring=scoring)

    # Hold-out split used as the evaluation set for early stopping
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=frac_test_size, random_state=seed)
    eval_set = [(X_train, y_train), (X_test, y_test)]
    f_eval = 'merror'
    # f_eval = 'mlogloss'
    model.fit(X_train, y_train, eval_metric=f_eval, eval_set=eval_set,
              verbose=False, early_stopping_rounds=10)
    ev_scores = model.evals_result()

    cv_mean = cv_scores.mean()
    ev_mean = np.array(ev_scores['validation_0'][f_eval]).mean()

    # Look at the feature importances of the model
    xgb.plot_importance(model)
    plt.show()

    # Retrain the model on all the data
    model.fit(X, y, verbose=False)

    # Save the model to disk
    with open(env.filename_model_texts(), 'wb') as f:
        pickle.dump(model, f)

    print('Cross-validation: mean', cv_mean, 'eval_set mean', ev_mean)
    return model
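
# Note: the fit(..., eval_metric=..., early_stopping_rounds=...) call above
# follows the XGBoost 1.x scikit-learn API; in XGBoost >= 2.0 both arguments
# moved to the estimator constructor. A minimal self-contained sketch of the
# newer style, on synthetic data purely for illustration:
def _xgb2_early_stopping_sketch(seed=241):
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=500, n_classes=3, n_informative=6,
                               random_state=seed)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=seed)
    model = xgb.XGBClassifier(n_estimators=400, max_depth=24,
                              colsample_bytree=1, subsample=1,
                              random_state=seed,
                              eval_metric='merror',      # constructor arg in 2.x
                              early_stopping_rounds=10)  # constructor arg in 2.x
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_test, y_test)],
              verbose=False)
    # With early stopping configured on the estimator, a final refit on all
    # data would also need an eval_set (or a re-created estimator without it).
    return model.evals_result()
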
def model_predict(self, df, b_retrain=False):
    env = Environment()
    y, X = self.model_prepare_data(df, mode='test')
    if b_retrain:
        # Retrain the model from scratch for every test run
        model = self.model_train()
    else:
        # Load the previously trained model from disk
        with open(env.filename_model_texts(), 'rb') as f:
            model = pickle.load(f)
    # Predict
    y = model.predict(X)
    return y
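
# Hypothetical usage (the owning class name "TextModel" is an assumption;
# the source only shows the two methods above):
#
#   tm = TextModel()
#   model = tm.model_train()                  # CV, early-stopped fit, pickle to disk
#   y_pred = tm.model_predict(df_new)         # predict with the pickled model
#   y_pred = tm.model_predict(df_new, b_retrain=True)  # retrain, then predict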