示例#1
0
    def model_train(self):
        """Train the XGBoost classifier, report its scores and save it to disk.

        Steps, in order:

        1. Build ``y``/``X`` from ``self.stat()`` via ``self.model_prepare_data``.
        2. Score a fresh classifier with shuffled K-fold cross-validation.
        3. Fit on a train/test split with early stopping and collect the
           per-round evaluation metric.
        4. Plot feature importance (``plt.show()`` is called here).
        5. Refit on all of the data and pickle the model to the path returned
           by ``Environment().filename_model_texts()``.

        Returns:
            The ``xgb.XGBClassifier`` instance fitted on the full data set.
        """
        env = Environment()
        data = self.stat()
        y, X = self.model_prepare_data(data)

        seed = 241
        scoring = 'accuracy'
        n_splits = 4
        frac_test_size = 0.25

        # Cross-validation on the whole data set.
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        model = xgb.XGBClassifier(n_estimators=400,
                                  max_depth=24,
                                  # XGBoost has no ``colsample`` parameter; the
                                  # column-subsampling ratio is
                                  # ``colsample_bytree``.
                                  colsample_bytree=1,
                                  subsample=1,
                                  seed=seed)
        cv_scores = cross_val_score(model, X, y, cv=kf, scoring=scoring)

        # Hold-out evaluation with early stopping.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=frac_test_size, random_state=seed)
        eval_set = [(X_train, y_train), (X_test, y_test)]
        f_eval = 'merror'  # alternative metric: 'mlogloss'
        model.fit(X_train,
                  y_train,
                  eval_metric=f_eval,
                  eval_set=eval_set,
                  verbose=False,
                  early_stopping_rounds=10)
        ev_scores = model.evals_result()

        cv_mean = cv_scores.mean()
        # 'validation_0' is the first eval_set entry, i.e. the training split;
        # the held-out split is reported under 'validation_1'.
        # NOTE(review): confirm the training-split metric is the one intended.
        ev_mean = np.array(ev_scores['validation_0'][f_eval]).mean()

        # Show feature importance of the early-stopped model.
        xgb.plot_importance(model)
        plt.show()

        # Train the final model on all of the data.
        model.fit(X, y, verbose=False)
        # Save the model to disk; the context manager closes the file even
        # when pickling fails.
        with open(env.filename_model_texts(), 'wb') as model_file:
            pickle.dump(model, model_file)

        print('Cross-validation: mean', cv_mean, 'eval_set mean', ev_mean)
        return model
示例#2
0
 def model_predict(self, df, b_retrain=False):
     """Predict labels for ``df`` with a retrained or previously saved model.

     Args:
         df: Input data, passed to ``self.model_prepare_data`` with
             ``mode='test'`` to obtain the feature matrix.
         b_retrain: When true, call ``self.model_train()`` and use the model
             it returns; otherwise unpickle the model from the path returned
             by ``Environment().filename_model_texts()``.

     Returns:
         The result of ``model.predict`` on the prepared features.
     """
     # The target returned in test mode is not used; only features matter.
     _, X = self.model_prepare_data(df, mode='test')
     if b_retrain:
         # Retrain the model for every prediction request.
         model = self.model_train()
     else:
         # Load the previously trained model from disk.
         # NOTE: unpickling executes code from the file, so the model file
         # must come from a trusted location.
         env = Environment()
         with open(env.filename_model_texts(), 'rb') as model_file:
             model = pickle.load(model_file)
     # Predict.
     return model.predict(X)