Example #1
def lgb_model(num_boost_round=200):

    train,test,y_train,y_test = get_data_and_split_train_test()
    print('shape of train set=',train.shape)
    params_lgb['metric']=['binary_logloss','auc']
    params_lgb['max_depth']=15
    params_lgb['feature_fraction'] = .8
    import lightgbm as lgb
    s=time()
    evals_result={}
    lgb_train=lgb.Dataset(train,label=y_train)
    lgb_test=lgb.Dataset(test,label=y_test)
    model=lgb.train(params=params_lgb,train_set=lgb_train,
                    num_boost_round=num_boost_round,
                    valid_sets=[lgb_train,lgb_test],
                    evals_result=evals_result,
                    verbose_eval=num_boost_round//3)
    evaluate_model(model, train, test, y_train, y_test)
    predict_fill_sample(model, file='LGB_model')
    lgb.plot_metric(evals_result,'auc')
    plt.show()
    lgb.plot_importance(model,max_num_features=40)
    plt.show()
    print('-'*80)
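Example #1 leans on globals defined elsewhere (`params_lgb`, `get_data_and_split_train_test`, `evaluate_model`, `predict_fill_sample`). A minimal self-contained sketch of the same record-then-plot pattern, assuming scikit-learn for synthetic data; like all examples here it uses the pre-4.0 `evals_result=` argument (LightGBM 4.x records history with the `lgb.record_evaluation` callback instead):

import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# synthetic binary-classification data
X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)

params = {'objective': 'binary', 'metric': ['binary_logloss', 'auc'], 'verbose': -1}
evals_result = {}  # filled with one curve per (dataset, metric) pair
dtrain = lgb.Dataset(X_tr, label=y_tr)
dtest = lgb.Dataset(X_te, label=y_te, reference=dtrain)
lgb.train(params, dtrain,
          num_boost_round=100,
          valid_sets=[dtrain, dtest],
          evals_result=evals_result)
lgb.plot_metric(evals_result, 'auc')  # the metric can be passed positionally, as above
plt.show()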
Example #2
def train(train_x,train_y,test_x,res,show_importance=True):
    clf = lgb.LGBMClassifier(
        boosting_type="gbdt",num_leaves=31,reg_alpha=0.0,reg_lambda=1,
        max_depth=-1,n_estimators=10,objective="binary",
        subsample=0.7,colsample_bytree=0.7,subsample_freq=1,
        learning_rate=0.05,min_child_weight=50,random_state=1024,n_jobs=-1
    )
    clf.fit(train_x,train_y,eval_set=[(train_x,train_y)],eval_metric="auc",early_stopping_rounds=100)
    if show_importance:
        lgb.plot_importance(clf,max_num_features=10) 
        plt.title("Feature Importances")
        plt.savefig("feature_importance.png") 
        booster = clf.booster_
        importance = booster.feature_importance(importance_type="split")
        feature_name = booster.feature_name()
        feature_importance = pd.DataFrame({"feature_name":feature_name,"importance":importance} )
        feature_importance.to_csv("feature_importance.csv",index=False)
        plt.close()
        lgb.plot_metric(clf.evals_result_,metric="auc")
        plt.savefig("metrics.png")
    res["score"] = clf.predict_proba(test_x)[:,1]
    res["score"] = res["score"].apply(lambda x: float("%.6f" % x))
    res.to_csv("./res.csv", index=False)
    try:
        clf.booster_.save_model("lgb_classifier.txt") 
    except Exception as e:
        print(str(e))
        pass
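Example #2 reads the evaluation history from the fitted sklearn wrapper (`clf.evals_result_`); `lgb.plot_metric` also accepts the fitted model directly. A minimal sketch of that variant, with synthetic data as an assumption:

import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, random_state=0)
clf = lgb.LGBMClassifier(n_estimators=50)
clf.fit(X, y, eval_set=[(X, y)], eval_metric="auc")
# equivalent: lgb.plot_metric(clf, metric="auc")
lgb.plot_metric(clf.evals_result_, metric="auc")
plt.show()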
Example #3
    def test_plot_metrics(self):
        test_data = lgb.Dataset(self.X_test, self.y_test, reference=self.train_data)
        self.params.update({"metric": {"binary_logloss", "binary_error"}})

        evals_result0 = {}
        gbm0 = lgb.train(self.params, self.train_data,
                         valid_sets=[self.train_data, test_data],
                         valid_names=['v1', 'v2'],
                         num_boost_round=10,
                         evals_result=evals_result0,
                         verbose_eval=False)
        ax0 = lgb.plot_metric(evals_result0)
        self.assertIsInstance(ax0, matplotlib.axes.Axes)
        self.assertEqual(ax0.get_title(), 'Metric during training')
        self.assertEqual(ax0.get_xlabel(), 'Iterations')
        self.assertIn(ax0.get_ylabel(), {'binary_logloss', 'binary_error'})
        ax0 = lgb.plot_metric(evals_result0, metric='binary_error')
        ax0 = lgb.plot_metric(evals_result0, metric='binary_logloss', dataset_names=['v2'])

        evals_result1 = {}
        gbm1 = lgb.train(self.params, self.train_data,
                         num_boost_round=10,
                         evals_result=evals_result1,
                         verbose_eval=False)
        self.assertRaises(ValueError, lgb.plot_metric, evals_result1)

        gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
        gbm2.fit(self.X_train, self.y_train, eval_set=[(self.X_test, self.y_test)], verbose=False)
        ax2 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
        self.assertIsInstance(ax2, matplotlib.axes.Axes)
        self.assertEqual(ax2.get_title(), '')
        self.assertEqual(ax2.get_xlabel(), '')
        self.assertEqual(ax2.get_ylabel(), '')
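The ValueError asserted above is `plot_metric` rejecting an empty history: `gbm1` was trained without `valid_sets`, so `evals_result1` stays empty. The same check can be triggered standalone:

import lightgbm as lgb

try:
    lgb.plot_metric({})  # no recorded evaluation history
except ValueError as err:
    print(err)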
Example #4
def model_metrics_lgb(clf):
    fig3 = plt.figure(figsize=(8, 11))
    gs3 = gridspec.GridSpec(2, 1)
    ax7 = fig3.add_subplot(gs3[0])
    ax8 = fig3.add_subplot(gs3[1])
    lgb.plot_metric(clf, metric="l2", ax=ax7, title="l2 during Training")
    lgb.plot_metric(clf,
                    metric="huber",
                    ax=ax8,
                    title="Huber Loss during Training")
    gs3.tight_layout(fig3, rect=[0.05, 0.05, 0.95, 0.95], pad=0.5)
    return [fig3]
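`model_metrics_lgb` assumes both curves were recorded during training, i.e. the training params listed both metrics. A sketch of a setup that feeds it, with synthetic data as an assumption (a dict of recorded results works as the `clf` argument, since `plot_metric` accepts either a dict or a fitted model):

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 5))
y = X @ rng.normal(size=5) + rng.normal(scale=0.1, size=500)

# both metrics must be recorded for the two plots above to have data
params = {'objective': 'regression', 'metric': ['l2', 'huber'], 'verbose': -1}
evals_result = {}
dtrain = lgb.Dataset(X[:400], y[:400])
dval = lgb.Dataset(X[400:], y[400:], reference=dtrain)
lgb.train(params, dtrain, num_boost_round=50,
          valid_sets=[dtrain, dval], evals_result=evals_result)

figs = model_metrics_lgb(evals_result)
plt.show()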
Example #5
    def learning_curve(self):
        cols = 3
        if self.cfg.training.num_fold % cols != 0:
            rows = self.cfg.training.num_fold // cols + 1
        else:
            rows = self.cfg.training.num_fold // cols

        fig = plt.figure(figsize=(25, 12))
        for i in range(len(self.evals_results)):
            ax = fig.add_subplot(rows, cols, i + 1)
            lgb.plot_metric(self.evals_results[i], ax=ax)
            ax.set_title('Learning curve in Fold {}'.format(i + 1))
        plt.tight_layout()
        plt.show()
Example #6
def show_model_performance(gbm, evals_result):
    # show model importance
    # lgb.plot_importance(gbm)
    # Show Decision Tree
    if config.can_plot_tree:
        graph = lgb.create_tree_digraph(gbm, name='Decision Tree')
        graph.render(view=True)
    if config.can_show_metric:
        fig, axs = plt.subplots(2, 1, figsize=(8, 10))
        for index in range(len(config.metric)):
            lgb.plot_metric(evals_result,
                            config.metric[index],
                            title=config.metric[index],
                            ax=axs[index])
    plt.show()
Example #7
def plot_model_information(bst, validation_metrics, my_own_metrics):
    print('Number of trees:', bst.num_trees())

    print('Plot model performance')
    ax = lgb.plot_metric(validation_metrics, metric='auc')
    plt.show()

    print('Plot feature importances...')
    ax = lgb.plot_importance(bst, max_num_features=15)
    plt.show()

    def plot_my_own_metrics(my_own_metrics):
        x = list(my_own_metrics.keys())
        y = list(my_own_metrics.values())
        plt.barh(x, y)

        for index, value in enumerate(y):
            plt.text(value, index, str(value))

    print('plot_my_own_metrics')
    plot_my_own_metrics(my_own_metrics)
    plt.show()

    tree_index = 0
    print('Plot ' + str(tree_index) +
          'th tree...')  # one tree uses a categorical feature to split
    ax = lgb.plot_tree(bst,
                       tree_index=tree_index,
                       figsize=(64, 36),
                       show_info=['split_gain'])
    plt.show()
Example #8
def lgb_train(train_data,
              val_data,
              threshold,
              init_model,
              boost_round=1000,
              random_seed=6,
              for_submit=False):
    print('boost round: ', boost_round)

    def lgb_f1_score(y_hat, data, THRESHOLD=threshold):
        y_true = data.get_label()
        y_hat = np.where(y_hat >= THRESHOLD, 1, 0)
        return 'f1', f1_score(y_true, y_hat), True

    valid_sets = [train_data] if for_submit else [train_data, val_data]

    params = {
        'objective': 'binary',
        # 'early_stopping_rounds': 100,
        'learning_rate': 0.01,
        'reg_alpha': 0.5,
        'reg_lambda': 0.5,
        'max_depth': -1,
        'num_leaves': 100,
        'seed': random_seed,
        'metrics': 'None'
    }
    eval_dict = {}
    clf = lgb.train(params,
                    train_data,
                    valid_sets=valid_sets,
                    evals_result=eval_dict,
                    num_boost_round=boost_round,
                    verbose_eval=100,
                    init_model=init_model,
                    feval=lgb_f1_score)

    if for_submit:
        del eval_dict
        gc.collect()
        return clf
    else:
        lgb.plot_metric(eval_dict, metric='f1')
        res = max(eval_dict['valid_1']['f1'])
        del eval_dict
        gc.collect()
        return res
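The pattern in Example #8, `'metrics': 'None'` plus a custom `feval`, is what makes `metric='f1'` plottable: a feval returns `(name, value, is_higher_better)` each round, and the returned name becomes a key in `evals_result`. A self-contained sketch, with synthetic data as an assumption:

import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score

def lgb_f1_score(y_hat, data):
    # y_hat holds raw probabilities for a binary objective
    y_true = data.get_label()
    y_pred = (y_hat >= 0.5).astype(int)
    return 'f1', f1_score(y_true, y_pred), True  # higher is better

X, y = make_classification(n_samples=600, random_state=0)
dtrain = lgb.Dataset(X[:500], y[:500])
dval = lgb.Dataset(X[500:], y[500:], reference=dtrain)
evals_result = {}
lgb.train({'objective': 'binary', 'metric': 'None', 'verbose': -1},
          dtrain, num_boost_round=50,
          valid_sets=[dtrain, dval],
          feval=lgb_f1_score,
          evals_result=evals_result)
lgb.plot_metric(evals_result, metric='f1')  # 'f1' matches the feval's returned name
plt.show()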
Example #9
    def test_plot_metrics(self):
        X_train, X_test, y_train, y_test = train_test_split(
            *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=1)
        train_data = lgb.Dataset(X_train, y_train)
        test_data = lgb.Dataset(X_test, y_test, reference=train_data)

        params = {
            "objective": "binary",
            "metric": {"binary_logloss", "binary_error"},
            "verbose": -1,
            "num_leaves": 3
        }

        evals_result0 = {}
        gbm0 = lgb.train(params,
                         train_data,
                         valid_sets=[train_data, test_data],
                         valid_names=['v1', 'v2'],
                         num_boost_round=10,
                         evals_result=evals_result0,
                         verbose_eval=False)
        ax0 = lgb.plot_metric(evals_result0)
        self.assertIsInstance(ax0, matplotlib.axes.Axes)
        self.assertEqual(ax0.get_title(), 'Metric during training')
        self.assertEqual(ax0.get_xlabel(), 'Iterations')
        self.assertIn(ax0.get_ylabel(), {'binary_logloss', 'binary_error'})
        ax0 = lgb.plot_metric(evals_result0, metric='binary_error')
        ax0 = lgb.plot_metric(evals_result0,
                              metric='binary_logloss',
                              dataset_names=['v2'])

        evals_result1 = {}
        gbm1 = lgb.train(params,
                         train_data,
                         num_boost_round=10,
                         evals_result=evals_result1,
                         verbose_eval=False)
        self.assertRaises(ValueError, lgb.plot_metric, evals_result1)

        gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
        gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
        ax2 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
        self.assertIsInstance(ax2, matplotlib.axes.Axes)
        self.assertEqual(ax2.get_title(), '')
        self.assertEqual(ax2.get_xlabel(), '')
        self.assertEqual(ax2.get_ylabel(), '')
Example #10
def test_plot_example():
    print('Loading data...')
    # load or create your dataset
    df_train = pd.read_csv(
        r'/Users/longguangbin/Work/Codes/MLlearn/src/reg_models/LightGBM/data/regression.train',
        header=None,
        sep='\t')
    df_test = pd.read_csv(
        r'/Users/longguangbin/Work/Codes/MLlearn/src/reg_models/LightGBM/data/regression.test',
        header=None,
        sep='\t')

    y_train = df_train[0]
    y_test = df_test[0]
    X_train = df_train.drop(0, axis=1)
    X_test = df_test.drop(0, axis=1)

    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)

    # specify your configurations as a dict
    params = {'num_leaves': 5, 'metric': ('l1', 'l2'), 'verbose': 0}

    evals_result = {}  # to record eval results for plotting

    print('Starting training...')
    # train
    gbm = lgb.train(
        params,
        lgb_train,
        num_boost_round=100,
        valid_sets=[lgb_train, lgb_test],
        feature_name=['f' + str(i + 1) for i in range(X_train.shape[-1])],
        categorical_feature=[21],
        evals_result=evals_result,
        verbose_eval=10)

    print('Plotting metrics recorded during training...')
    ax = lgb.plot_metric(evals_result, metric='l1')
    plt.show()

    print('Plotting feature importances...')
    ax = lgb.plot_importance(gbm, max_num_features=10)
    plt.show()

    print('Plotting 84th tree...')  # one tree uses a categorical feature to split
    ax = lgb.plot_tree(gbm,
                       tree_index=83,
                       figsize=(20, 8),
                       show_info=['split_gain'])
    plt.show()

    print('Plotting 84th tree with graphviz...')
    graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
    graph.render(view=True)
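`graph.render(view=True)` needs the Graphviz system binaries on PATH in addition to the `graphviz` Python package. A sketch that writes the tree to PNG without opening a viewer, using a tiny synthetic model as an assumption:

import lightgbm as lgb
import numpy as np

rng = np.random.default_rng(0)
X = rng.random((200, 4))
y = rng.random(200)
gbm = lgb.train({'objective': 'regression', 'verbose': -1},
                lgb.Dataset(X, y), num_boost_round=5)
graph = lgb.create_tree_digraph(gbm, tree_index=0, name='Tree0')
graph.render(filename='tree0', format='png', view=False)  # writes tree0 and tree0.png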
Example #12
def train_predict_model(train_list_x, train_list_label, params, train_times=500):
    d_x = train_list_x
    d_y = train_list_label

    train_X, valid_X, train_Y, valid_Y = train_test_split(d_x, d_y, test_size=0.2, random_state=2)  # split the training data into train + validation sets
    lgb_train = lgb.Dataset(train_X, label=train_Y)
    lgb_eval = lgb.Dataset(valid_X, label=valid_Y, reference=lgb_train)
    # select_suit_parameter(lgb_train)
    evals_result = {}
    print("Training...")
    bst = lgb.train(
        params,
        lgb_train,
        # categorical_feature=list(range(1, 82)),  # specify which features are categorical
        valid_sets=[lgb_eval],
        num_boost_round=train_times,
        feval=lgb_f1_score,
        evals_result=evals_result,
        # early_stopping_rounds=30
    )
    lgb.plot_metric(evals_result, metric='f1')
    return bst, lgb_train
Example #13
    def train(self,
              X_train,
              y,
              X_test=None,
              y_test=None,
              parameters=None,
              plot=False):
        self.evals_result = {}  # to record eval results for plotting
        if parameters is not None:
            self.parameters = parameters
        dtrain = lgb.Dataset(X_train, label=y)
        dval = lgb.Dataset(X_test, label=y_test)
        self.model = lgb.train(self.parameters,
                               dtrain,
                               valid_sets=[dtrain, dval],
                               evals_result=self.evals_result,
                               verbose_eval=False,
                               feval=accuracy)
        if plot:
            print('Plotting metrics recorded during training...')
            ax = lgb.plot_metric(self.evals_result, metric='accuracy')
            plt.show()
            ax = lgb.plot_metric(self.evals_result, metric='auc')
            plt.show()
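`feval=accuracy` refers to a helper that is not shown. A hypothetical reconstruction consistent with the `metric='accuracy'` plot above, assuming a binary objective thresholded at 0.5:

import numpy as np

def accuracy(preds, data):
    # hypothetical sketch of the missing helper: (name, value, is_higher_better)
    y_true = data.get_label()
    y_pred = (preds >= 0.5).astype(int)
    return 'accuracy', float(np.mean(y_pred == y_true)), True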
Example #14
def train_lightgbm_model(level,
                         fold=1,
                         params={},
                         model_dir='models/uncertainty/',
                         prediction_lag=28,
                         model_name="lightgbm",
                         augment_events=False,
                         verbose=True,
                         num_boost_round=2500,
                         early_stopping_rounds=50,
                         verbose_eval=50):
    # only require lightgbm to be installed when calling this function
    import lightgbm as lgb

    # read data
    train, val, test, features = get_train_val_slit(
        level,
        fold,
        augment_events=augment_events,
        prediction_lag=prediction_lag)

    # make lgb datasets
    labels = ['demand']
    train_set = lgb.Dataset(train[features], train[labels])
    val_set = lgb.Dataset(val[features], val[labels])

    # cleanup memory
    del train
    gc.collect()

    # perform training
    evals_result = {}  # to record eval results for plotting
    model = lgb.train(
        params,
        train_set,
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        valid_sets=[val_set],
        verbose_eval=verbose_eval,  # fobj="mae",#feval = "mae",
        evals_result=evals_result)

    model.save_model(
        model_dir + model_name +
        "-level{}-lag{}-fold{}.txt".format(level, prediction_lag, fold))
    ax = lgb.plot_metric(evals_result, metric='l1')
    plt.show()

    return model, evals_result, val
Example #15
def train(train_x, train_y, kfold, best_params=None):
    params = {
        "objective": "binary",
        "boosting_type": "gbdt",
        "metric": {"binary_logloss"},
        "num_leaves": 50,
        "min_data_in_leaf": 100,
        "learning_rate": 0.1,
        "feature_fraction": 0.5,
    }
    models = []
    for i, (tr_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        tr_x = train_x.iloc[tr_idx].reset_index(drop=True)
        tr_y = train_y.iloc[tr_idx].reset_index(drop=True)
        val_x = train_x.iloc[val_idx].reset_index(drop=True)
        val_y = train_y.iloc[val_idx].reset_index(drop=True)

        tr_set = lgb.Dataset(tr_x, tr_y)
        val_set = lgb.Dataset(val_x, val_y, reference=tr_set)

        evals_result = {}
        model = lgb.train(
            params=params,
            train_set=tr_set,
            valid_sets=[val_set, tr_set],
            num_boost_round=1000,
            early_stopping_rounds=20,
            verbose_eval=1,
            evals_result=evals_result,
            feval=accuracy,
        )

        importance = pd.DataFrame(model.feature_importance(),
                                  index=train_x.columns,
                                  columns=["importance"
                                           ]).sort_values("importance",
                                                          ascending=[False])

        # print(f"######################importance#####################")
        # print(importance.head(50))

        # plot the validation results
        fig = lgb.plot_metric(evals_result)
        plt.savefig(f"{DATA_DIR}/learning_curve_{i+1}.png")

        models.append(model)

    return models
Example #16
def bo_lgb_train(opt, x_train, y_train, x_test, y_test):
    num_train, num_feature = x_train.shape
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_test, y_test)
    evals_result = {}
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_boosting_round': 300,
        'n_jobs': 2
    }
    params.update(opt.max['params'])
    params['num_leaves'] = int(round(params['num_leaves']))
    params['max_depth'] = int(round(params['max_depth']))

    feature_name = ['f' + str(i + 1) for i in range(num_feature)]

    print('Start training...')

    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=300,  # number of boosting iterations; adjust as needed
        valid_sets=[lgb_train, lgb_eval],
        feature_name=feature_name,
        evals_result=evals_result,
        #fobj=loglikelihood,
        feval=lgb_f1_score,
        verbose_eval=10)

    model.save_model('model.txt', num_iteration=model.best_iteration)
    print('Plot metrics recorded during training...')

    # LightGBM can plot either the f1 or the accuracy curve here
    ax = lgb.plot_metric(evals_result, metric='f1')
    #ax = lgb.plot_metric(evals_result, metric='accuracy')
    plt.show()
    return model
Example #17
    def train_light_gbm(self, dts):
        # create dataset for lightgbm
        lgb_train = lgb.Dataset(dts.trainX, dts.trainY)
        lgb_test = lgb.Dataset(dts.testX, dts.testY, reference=lgb_train)

        # specify your configurations as a dict
        params = {
            'num_leaves': 5,
            'metric': ('l1', 'l2'),
            'verbose': 0
        }

        evals_result = {}  # to record eval results for plotting

        print('Starting training...')
        # train
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=100,
                        valid_sets=[lgb_train, lgb_test],
                        feature_name=['close', 'open', 'high', 'low', 'volume'],
                        evals_result=evals_result,
                        verbose_eval=10)

        print('Plotting metrics recorded during training...')
        ax = lgb.plot_metric(evals_result, metric='l1')
        plt.show()

        print('Plotting feature importances...')
        ax = lgb.plot_importance(gbm, max_num_features=10)
        plt.show()

        print('Plotting 84th tree...')
        ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
        plt.show()

        print('Plotting 84th tree with graphviz...')
        graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
        graph.render(view=True)
Example #18
def get_model_train_result(evals_result,
                           model_name="default",
                           outputpath="./"):
    '''
    Plot the training result curve and save it as a PNG.
    :param evals_result:
    :param model_name:
    :param outputpath:
    :return:
    '''
    try:
        outputpath = outputpath + model_name + "_train_result.png"
        ax = lgb.plot_metric(evals_result,
                             metric='binary_logloss',
                             figsize=(20, 13))
        plt.savefig(outputpath)
    except Exception:
        logger.error("failed to create the model training result plot.")
        # raise RuntimeError("failed to create the model training result plot.")
        return False
    else:
        logger.info("created the model training result plot successfully.")
        return True
Example #19
def train_model(model_name, X_train, y_train):
    kf = KFold(config.k_folds)
    cv_scores = []
    for i, (tr_idx, vl_idx) in enumerate(kf.split(X_train, y_train)):
        print('FOLD {} \n'.format(i))
        X_tr, y_tr = X_train.loc[tr_idx], y_train[tr_idx]
        X_vl, y_vl = X_train.loc[vl_idx], y_train[vl_idx]

        if model_name == 'lgb':
            model = model_lgb()
            model.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_vl, y_vl)], \
                      eval_metric='auc', verbose=config.verbose, early_stopping_rounds=config.stop_rounds)
            with open('lgb_model_{}.pkl'.format(i), 'wb') as handle:
                pickle.dump(model, handle)


            # code to visualize feature importance
            ax = lgb.plot_importance(model,
                                     max_num_features=100,
                                     figsize=(15, 15))
            #            ax2 = lgb.plot_tree(model,figsize=(15,15))
            ax3 = lgb.plot_metric(model, figsize=(15, 15))
            plt.show()
            pred_y_val = model.predict(X_vl)
            score = mean_squared_error(pred_y_val, y_vl)
            cv_scores.append(score)
            print(np.mean(cv_scores))
            del model, X_tr, X_vl
            gc.collect()
        if model_name == 'rf':
            model = model_rf()
            model.fit(X_tr, y_tr)
            with open('rf_model_{}.pkl'.format(i), 'wb') as handle:
                pickle.dump(model, handle)
            del model, X_tr, X_vl
            gc.collect()
Example #20
def test_plot_metrics(params, breast_cancer_split, train_data):
    X_train, X_test, y_train, y_test = breast_cancer_split
    test_data = lgb.Dataset(X_test, y_test, reference=train_data)
    params.update({"metric": {"binary_logloss", "binary_error"}})

    evals_result0 = {}
    lgb.train(params,
              train_data,
              valid_sets=[train_data, test_data],
              valid_names=['v1', 'v2'],
              num_boost_round=10,
              evals_result=evals_result0,
              verbose_eval=False)
    ax0 = lgb.plot_metric(evals_result0)
    assert isinstance(ax0, matplotlib.axes.Axes)
    assert ax0.get_title() == 'Metric during training'
    assert ax0.get_xlabel() == 'Iterations'
    assert ax0.get_ylabel() in {'binary_logloss', 'binary_error'}
    ax0 = lgb.plot_metric(evals_result0, metric='binary_error')
    ax0 = lgb.plot_metric(evals_result0,
                          metric='binary_logloss',
                          dataset_names=['v2'])

    evals_result1 = {}
    lgb.train(params,
              train_data,
              num_boost_round=10,
              evals_result=evals_result1,
              verbose_eval=False)
    with pytest.raises(ValueError):
        lgb.plot_metric(evals_result1)

    gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    ax2 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
    assert isinstance(ax2, matplotlib.axes.Axes)
    assert ax2.get_title() == ''
    assert ax2.get_xlabel() == ''
    assert ax2.get_ylabel() == ''
Example #21
def Test_XGB(dump_path, dtest, dtrain, dval, evals_result):
    loaded_bst = xgb.Booster()
    loaded_bst.load_model(dump_path + "/best.model")

    y_pred_train = loaded_bst.predict(dtrain)
    y_pred_val = loaded_bst.predict(dval)
    y_pred_test = loaded_bst.predict(dtest)

    predictions_train = [round(value) for value in y_pred_train]
    predictions_val = [round(value) for value in y_pred_val]
    predictions_test = [round(value) for value in y_pred_test]

    accuracy_train = accuracy_score(dtrain.get_label(),
                                    predictions_train) * 100
    accuracy_val = accuracy_score(dval.get_label(), predictions_val) * 100
    accuracy_test = accuracy_score(dtest.get_label(), predictions_test) * 100

    rmse_train, rmse_test = mean_squared_error(
        dtrain.get_label(),
        y_pred_train)**0.5, mean_squared_error(dtest.get_label(),
                                               y_pred_test)**0.5
    rmse_val = mean_squared_error(dval.get_label(), y_pred_val)**0.5

    roc_train, roc_val = roc_auc_score(dtrain.get_label(),
                                       y_pred_train), roc_auc_score(
                                           dval.get_label(), y_pred_val)
    roc_test = roc_auc_score(dtest.get_label(), y_pred_test)

    tests = [
        accuracy_train, accuracy_val, accuracy_test, rmse_train, rmse_val,
        rmse_test, roc_train, roc_val, roc_test
    ]
    testing_labels = [
        'training accuracy', 'dev accuracy', 'test accuracy', 'train rmse',
        'val rmse', 'test rmse', 'train roc', 'dev roc', 'test roc'
    ]

    with open(dump_path + '/metrics.txt', 'w') as writer:
        writer.write('XGB metrics...\n' + '-' * 10 + '\n')
        for i in range(len(tests)):
            writer.write(testing_labels[i] + ": " + str(tests[i]) + "\n")

    ptool.my_plot_importance(loaded_bst,
                             figsize=(7, 7),
                             title='XGB Feature importance',
                             path=dump_path)
    class_labels = ('neutron', 'electron')
    ax = lgb.plot_metric(evals_result, metric='logloss', figsize=(12, 12))
    ax.legend()
    plt.ylabel('logloss classification error')
    plt.title('XGBoost Log Loss')
    plt.savefig(dump_path + '/log loss classification error',
                bbox_inches='tight')
    ptool.plot_confusion_matrix(dump_path=dump_path + "/",
                                classes=class_labels,
                                model="XGB",
                                pred=predictions_test,
                                labels=dtest.get_label())

    return
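Example #21 passes an XGBoost `evals_result` to `lgb.plot_metric`; that works because the function only needs the nested `{dataset: {metric: [values]}}` mapping, not a LightGBM object. A sketch with a hand-made history:

import lightgbm as lgb
import matplotlib.pyplot as plt

evals_result = {
    'train': {'logloss': [0.69, 0.55, 0.44, 0.37]},
    'eval': {'logloss': [0.69, 0.58, 0.50, 0.46]},
}
lgb.plot_metric(evals_result, metric='logloss')
plt.show()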
Example #22
evals_result = {}  # to record eval results for plotting

print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])],
                categorical_feature=[21],
                evals_result=evals_result,
                verbose_eval=10)

print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto')
plt.show()

print('Plotting 54th tree...')  # one tree uses a categorical feature to split
ax = lgb.plot_tree(gbm,
                   tree_index=53,
                   figsize=(15, 15),
                   show_info=['split_gain'])
Example #23
def main():
    """
    << 処理の流れ >>
    データ読み込み ⇒ 投球データと選手データの結合(train,testも結合) ⇒ nanの置換 ⇒ カテゴリ変数の変換 ⇒
    RFEによる特徴量選択(個数の最適化) ⇒ ハイパーパラメータの最適化 ⇒ 交差検証
    """

    train_pitch = pd.read_csv(TRAIN_PITCH_PATH)
    train_player = pd.read_csv(TRAIN_PLAYER_PATH)
    test_pitch = pd.read_csv(TEST_PITCH_PATH)
    test_player = pd.read_csv(TEST_PLAYER_PATH)

    pitching_type_2016 = pd.read_csv(EXTERNAL_1_PATH)
    pitching_type_2017 = pd.read_csv(EXTERNAL_2_PATH)
    pitching_type_2018 = pd.read_csv(EXTERNAL_3_PATH)

    train_pitch["use"] = "train"
    test_pitch["use"] = "test"
    test_pitch["球種"] = 0
    pitch_data = pd.concat([train_pitch, test_pitch],
                           axis=0).drop(PITCH_REMOVAL_COLUMNS, axis=1)

    player_data = pd.concat([train_player, test_player],
                            axis=0).drop(PLAYER_REMOVAL_COLUMNS,
                                         axis=1)  # .fillna(0)
    pitchers_data = train_player[train_player["位置"] == "投手"].drop(
        PLAYER_REMOVAL_COLUMNS, axis=1)
    pitching_type_ratio = pd.concat(
        [pitching_type_2016, pitching_type_2017, pitching_type_2018],
        axis=0).reset_index(drop=True)

    merged = (pd.merge(
        pitch_data,
        player_data,
        how="left",
        left_on=["年度", "投手ID"],
        right_on=["年度", "選手ID"],
    ).drop(["選手ID", "投球位置区域"], axis=1).fillna(0))
    merged = merged.rename(columns={"選手名": "投手名", "チーム名": "投手チーム名"})

    # merge the dataset with the previous season's pitch-type ratios
    merged = pd.merge(
        merged,
        pitching_type_ratio,
        how="left",
        left_on=["年度", "投手ID", "投手名"],
        right_on=["年度", "選手ID", "選手名"],
    ).drop(["選手ID", "選手名"], axis=1)

    use = merged.loc[:, "use"]
    merged = merged.drop(["use", "位置", "年度"], axis=1)

    # encode the categorical variables with category_encoders
    categorical_columns = [
        c for c in merged.columns if merged[c].dtype == "object"
    ]
    ce_oe = ce.OrdinalEncoder(cols=categorical_columns,
                              handle_unknown="impute")
    encorded_data = ce_oe.fit_transform(merged)
    encorded_data = pd.concat([encorded_data, use], axis=1)

    train = (encorded_data[encorded_data["use"] == "train"].drop(
        "use", axis=1).reset_index(drop=True))
    test = (encorded_data[encorded_data["use"] == "test"].drop(
        "use", axis=1).reset_index(drop=True))
    train_x = train.drop("球種", axis=1)
    train_y = train.loc[:, "球種"]
    test_x = test.drop("球種", axis=1)

    label_counts = train_y.value_counts()
    sm = SMOTE(
        ratio={
            0: sum(train_y == 0),
            1: sum(train_y == 1) * 3,
            2: sum(train_y == 2),
            3: sum(train_y == 3) * 2,
            4: sum(train_y == 4) * 2,
            5: sum(train_y == 5) * 4,
            6: sum(train_y == 6) * 20,
            7: sum(train_y == 7) * 4,
        })

    train_x_resampled, train_y_resampled = sm.fit_sample(train_x, train_y)
    train_x_resampled = pd.DataFrame(train_x_resampled,
                                     columns=train_x.columns)
    train_y_resampled = pd.Series(train_y_resampled, name="球種")

    # f = partial(objective, train_x, train_y)  # fix the objective function's arguments
    # study = optuna.create_study(direction='maximize')  # optimize the number of selected features with Optuna

    # study.optimize(f, n_trials=10)  # set the number of trials
    # print('params:', study.best_params)  # print the parameters that were found
    # best_feature_count = study.best_params['n_components']
    best_feature_count = 47
    # x_pca, train_y = get_important_features(train_x, train_y, best_feature_count)

    n_splits = 10
    num_class = 8
    # best_params = get_best_params(x_pca, train_y, num_class)  # search for the best hyperparameters

    best_params = {
        "lambda_l1": 5.96,
        "lambda_l2": 1.1,
        "num_leaves": 12,
        "feature_fraction": 0.75,
        "bagging_fraction": 0.89,
        "bagging_freq": 7,
        "min_data_in_leaf": 200,
    }

    submission = np.zeros((len(test_x), num_class))
    accs = {}

    tscv = TimeSeriesSplit(n_splits=n_splits)
    for i, (tr_idx, val_idx) in enumerate(tscv.split(train_x_resampled)):
        tr_x = train_x_resampled.iloc[tr_idx].reset_index(drop=True)
        tr_y = train_y_resampled.iloc[tr_idx].reset_index(drop=True)
        val_x = train_x_resampled.iloc[val_idx].reset_index(drop=True)
        val_y = train_y_resampled.iloc[val_idx].reset_index(drop=True)

        tr_dataset = lgb.Dataset(tr_x, tr_y, free_raw_data=False)
        val_dataset = lgb.Dataset(val_x,
                                  val_y,
                                  reference=tr_dataset,
                                  free_raw_data=False)
        model, evals_result = get_model(tr_dataset, val_dataset, num_class,
                                        best_params, train_x_resampled.columns)

        # plot the learning curve
        fig = lgb.plot_metric(evals_result, metric="multi_logloss")
        plt.savefig(f"{DATA_DIR}/learning_curve_{i}.png")

        y_pred = np.argmax(model.predict(val_x), axis=1)  # predicted class from the 8 class probabilities
        acc = accuracy_score(val_y, y_pred)
        accs[i] = acc
        print("#################################")
        print(f"accuracy: {acc}")
        print("#################################")
        y_preda = model.predict(test_x,
                                num_iteration=model.best_iteration)  # probabilities over the 8 classes
        submission += y_preda

    submission_df = pd.DataFrame(submission / n_splits)
    print("#################################")
    print(submission_df)
    print(best_params)
    print(accs)
    print("#################################")

    submission_df.to_csv(f"{DATA_DIR}/my_submission35.csv", header=False)
            "feature_fraction": 0.4,
            "bagging_fraction": 0.6,
            "bagging_freq": 17,
            "num_threads": 16,
        }
        f_evals_result = {}
        f_model = lgb.train(
            params,
            f_train,
            valid_sets=[f_valid],
            num_boost_round=10000,
            verbose_eval=1000,
            early_stopping_rounds=1000,
            evals_result=f_evals_result,
        )
        lgb.plot_metric(f_evals_result)

        # collect the current fold's model predictions on valid and test
        s_y_valid[f_index_valid] += f_model.predict(f_x_valid)
        s_y_test += f_model.predict(x_test) / 5
        gc.collect()

    # collect the current seed's model predictions on valid and test
    y_valid += s_y_valid / len(seeds)
    y_pred += s_y_test / len(seeds)
    print("logloss", log_loss(pd.get_dummies(y_train).values, s_y_valid))
    print("ac", accuracy_score(y_train, np.argmax(s_y_valid, axis=1)))

# collect all models' predictions on valid and test
print("logloss", log_loss(pd.get_dummies(y_train).values, y_valid))
print("ac", accuracy_score(y_train, np.argmax(y_valid, axis=1)))
Example #25
    lgb_eval = lgb.Dataset(va_x, va_y)

    # run training
    model = lgb.LGBMRegressor(objective='rmse', early_stopping_rounds=50)
    model.fit(tr_x, tr_y, eval_set=[(va_x, va_y), (tr_x, tr_y)], verbose=10)

    # check the score on the validation data
    va_pred = model.predict(va_x)
    score = np.sqrt(mse(va_y, va_pred))
    score_list.append(score)

score_ave = np.mean(score_list)
print(f'RMSE: {score_ave:.4f}')

# learning curve
lgb.plot_metric(model, metric='rmse')

# data for submission
tr_x = train_x
tr_y = train_y
ts_x = test_x
"""
# 変数をループしてtarget encoding
for c in tr_x.columns:
    if tr_x[c].dtype == 'object':
        # 学習データ全体で各カテゴリにおけるtargetの平均を計算
        data_tmp = pd.DataFrame({c: tr_x[c], 'target': tr_y})
        target_mean = data_tmp.groupby(c)['target'].mean()
        # バリデーションデータのカテゴリを置換
        ts_x.loc[:, c] = ts_x[c].map(target_mean)
        
Example #26
#
#grid4 = GridSearchCV(xgb.XGBRegressor(),hyper_params,n_jobs=-1,verbose=10,cv=fold1)
#
#grid4.fit(fine_tune_train,y)
#
#reg = xgb.XGBRegressor(max_depth = grid4.best_params_['max_depth'],
#                       min_child_weight = grid4.best_params_['min_child_weight'],
#                       learning_rate = grid4.best_params_['learning_rate'], colsample_bylevel = 0.8,
#                       subsample = 0.75, reg_lambda = 2, nthread = -1,
#                       booster = 'gbtree', silent = 1, gamma = 0)
#
#reg.fit(fine_tune_train,y)
#
#y_pred = reg.predict(fine_tune_test)

plot_metric(evals_result1, metric='rmse')

plot_metric(evals_result, metric='rmse')

xgb.plot_importance(mdl)
lgb.plot_importance(model)

train_size = np.linspace(0.1, 1.0, 20)
plt.figure()
plt.title("Learning Curve for SVM")
plt.xlabel("Traning Set Size")
plt.ylabel("Error")
train_sizes, train_scores, test_scores = learning_curve(
    svm_model,
    X_train,
    y_train, train_sizes=train_size)
Example #27
            'bagging_seed': 0,
            'feature_fraction': 0.2319,
            'feature_fraction_seed': 0,
        }

        evals_results = {}
        bst = lgb.train(lgb_params,
                        xgtrain,
                        valid_sets=[xgtrain, xgvalid],
                        evals_result=evals_results,
                        early_stopping_rounds=100,
                        verbose_eval=0,
                        feval=None)

        print('Plot metrics during training...')
        ax = lgb.plot_metric(evals_results, metric='l2')
        plt.show()

        ax = lgb.plot_importance(bst, max_num_features=100)
        plt.show()

        gain = bst.feature_importance('gain')
        ft = pd.DataFrame({
            'feature': bst.feature_name(),
            'split': bst.feature_importance('split'),
            'gain': 100 * gain / gain.sum()
        }).sort_values('split', ascending=False)
        print(ft.head(100))

        stacked_train_pred = np.expm1(bst.predict(df_train[features].values))
        print(mape(df_train['target'].values, stacked_train_pred))
Example #28
print(
    f'RMSE train: {sqrt(mean_squared_error(y_train, lgb_reg.predict(X_train)))}'
)
print(
    f'\nMAE validate: {mean_absolute_error(y_test, lgb_reg.predict(X_test))}')
print(
    f'RMSE validate: {sqrt(mean_squared_error(y_test, lgb_reg.predict(X_test)))} '
)

joblib.dump(lgb_reg, 'models/lgb_optimized.pkl')

#%%
lgb_reg.n_features_
lgb_reg.objective_
lgb_reg.get_params()
lgb_reg.feature_importances_

#%%
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_palette('pastel')
lgb.plot_importance(lgb_reg, figsize=(6, 8))
lgb.plot_metric(lgb_reg, figsize=(6, 8))

#%%

import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

#%%
Example #29
}

evals_result = {}  # to record eval results for plotting

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                feature_name=['f' + str(i + 1) for i in range(28)],
                categorical_feature=[21],
                evals_result=evals_result,
                verbose_eval=10)

print('Plot metrics during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plot feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plot 84th tree...')  # one tree uses a categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
plt.show()

print('Plot 84th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
graph.render(view=True)
Example #30
def test_register_logger(tmp_path):
    logger = logging.getLogger("LightGBM")
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(levelname)s | %(message)s')
    log_filename = str(tmp_path / "LightGBM_test_logger.log")
    file_handler = logging.FileHandler(log_filename,
                                       mode="w",
                                       encoding="utf-8")
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    def dummy_metric(_, __):
        logger.debug('In dummy_metric')
        return 'dummy_metric', 1, True

    lgb.register_logger(logger)

    X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]],
                 dtype=np.float32)
    y = np.array([0, 1, 1, 0])
    lgb_data = lgb.Dataset(X, y)

    eval_records = {}
    lgb.train({
        'objective': 'binary',
        'metric': ['auc', 'binary_error']
    },
              lgb_data,
              num_boost_round=10,
              feval=dummy_metric,
              valid_sets=[lgb_data],
              evals_result=eval_records,
              categorical_feature=[1],
              early_stopping_rounds=4,
              verbose_eval=2)

    lgb.plot_metric(eval_records)

    expected_log = r"""
WARNING | categorical_feature in Dataset is overridden.
New categorical_feature is [1]
INFO | [LightGBM] [Warning] There are no meaningful features, as all feature values are constant.
INFO | [LightGBM] [Info] Number of positive: 2, number of negative: 2
INFO | [LightGBM] [Info] Total Bins 0
INFO | [LightGBM] [Info] Number of data points in the train set: 4, number of used features: 0
INFO | [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | Training until validation scores don't improve for 4 rounds
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [2]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [4]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [6]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [8]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [10]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
WARNING | More than one metric available, picking one to plot.
""".strip()

    gpu_lines = [
        "INFO | [LightGBM] [Info] This is the GPU trainer",
        "INFO | [LightGBM] [Info] Using GPU Device:",
        "INFO | [LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...",
        "INFO | [LightGBM] [Info] GPU programs have been built",
        "INFO | [LightGBM] [Warning] GPU acceleration is disabled because no non-trivial dense features can be found",
        "INFO | [LightGBM] [Warning] Using sparse features with CUDA is currently not supported.",
        "INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.",
        "INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!"
    ]
    with open(log_filename, "rt", encoding="utf-8") as f:
        actual_log = f.read().strip()
        actual_log_wo_gpu_stuff = []
        for line in actual_log.split("\n"):
            if not any(line.startswith(gpu_line) for gpu_line in gpu_lines):
                actual_log_wo_gpu_stuff.append(line)

    assert "\n".join(actual_log_wo_gpu_stuff) == expected_log
Example #31
def lightgbm_model(x_train, x_test, y_train, y_test, group_id):
    features = x_train.columns.tolist()
    features.remove('product_id')
    lgb_train = lgb.Dataset(x_train[features], y_train.qty)
    lgb_test = lgb.Dataset(x_test[features], y_test.qty, reference=lgb_train)

    params = {
        'objective': 'regression',
        'learning_rate': 0.03,
        'lambda_l1': 0.5,
        'metric': {'mape', 'rmse'},
        'max_depth': 6,
        'num_leaves': 64,
        'min_data_in_leaf': 30,
        'colsample_bytree': 0.7,
        'subsample': 0.7,
        'subsample_freq': 50,
        'verbose': 0
    }

    gridParams = {
        'max_depth': [6, 8],
        'num_leaves': [64, 126],
        'min_child_samples': [30, 40, 50],
        'reg_alpha': [0.001, 0.01, 0.03]
    }

    print("Start GridSearch for Parameters")
    lg = lgb.LGBMRegressor(objective='regression',
                           n_jobs=3,
                           n_estimators=1000,
                           silent=True,
                           metric='rmse')
    grid = GridSearchCV(lg, gridParams, verbose=0, cv=4, n_jobs=3)
    # fit the grid search on the training features
    grid.fit(x_train[features], y_train.qty)
    print(grid.best_params_)
    print(grid.best_score_)

    params['max_depth'] = grid.best_params_['max_depth']
    params['num_leaves'] = grid.best_params_['num_leaves']
    params['min_data_in_leaf'] = grid.best_params_['min_child_samples']
    params['lambda_l1'] = grid.best_params_['reg_alpha']

    evals_result = {}
    print("Start Model Training")
    lg_model = lgb.train(params,
                         lgb_train,
                         num_boost_round=1000,
                         valid_sets=lgb_test,
                         evals_result=evals_result,
                         verbose_eval=200,
                         early_stopping_rounds=200)

    import joblib  # sklearn.externals.joblib was removed in modern scikit-learn
    joblib.dump(lg_model, 'lg_model.pkl')
    print("LightGBM Model dumped!")

    model_columns = list(x_train.columns)
    joblib.dump(model_columns, 'lg_model_columns.pkl')
    print("LightGBM Models columns dumped!")

    # plot training result
    ax = lgb.plot_metric(evals_result, metric='mape')
    plt.show()
    return 1
Example #32
def main():
    # Whether we should invalidate preprocessing and or training data. When invalidated, data will
    # not be deserialized and instead will be recomputed.
    invalidate_preprocessing, invalidate_training = parse_args()
    # Placemark paths.
    region_pmarks_path = Path("./in/placemarks/")
    label_pmarks_path = Path("./in/placemarks/training")

    ##
    # Train
    ##
    # Preprocessing for training data.
    training_out_path = Path("./out/training/")
    training_files = list(Path().glob("./in/training/*"))
    training_sets, training_files = create_training_data(
        training_files=training_files,
        training_out_path=training_out_path,
        region_pmarks_path=region_pmarks_path,
        label_pmarks_path=label_pmarks_path,
        invalidate=invalidate_preprocessing)
    # Write out the labeled kml for inspection.
    write_routes_kml(routes=training_sets,
                     files=training_files,
                     out_path=training_out_path,
                     file_label="labeled")
    # Features to use for training. Any delta or rolling_mean column.
    # Keep columns that have 'delta' or 'rolling_mean' in the name.
    features = [
        col for col in training_sets[0].columns
        if any([substr in col for substr in ["delta", "rolling_mean"]])
    ]
    # Train the model.
    fitted_model, label_encoder, evals_result = fit_model(
        training_sets=training_sets,
        features=features,
        invalidate=invalidate_training)
    # Inspect training if it was recomputed.
    if invalidate_training:
        # Save plot of the training metric.
        training_img = Path("./analysis/training.png")
        print(
            f"\nPlotting metrics during training and saving to {training_img.name}."
        )
        lgb.plot_metric(evals_result, metric="multi_logloss")
        plt.savefig(training_img)
        plt.show()
        # Save plot of feature importances.
        feature_importances_img = Path("./analysis/feature_importances.png")
        print(
            f"\nPlotting feature importances and saving to {feature_importances_img.name}.\n"
        )
        lgb.plot_importance(fitted_model, max_num_features=len(features))
        plt.savefig(feature_importances_img)
        plt.show()

    ##
    # Classify
    ##
    # Preprocessing for unseen data.
    unseen_out_path = Path("./out/unseen/")
    unseen_files = list(Path().glob("./in/unseen/*"))
    unseen_sets, unseen_files = create_unseen_data(
        unseen_files=unseen_files,
        unseen_out_path=unseen_out_path,
        region_pmarks_path=region_pmarks_path,
        invalidate=invalidate_preprocessing)
    # Classify the unseen data.
    classified_unseen_data = classify_unseen_data(
        model=fitted_model,
        unseen_sets=unseen_sets,
        features=features,
        label_encoder=label_encoder,
        invalidate=invalidate_training)
    # Write out the classified kml for inspection.
    write_routes_kml(routes=classified_unseen_data,
                     files=unseen_files,
                     out_path=unseen_out_path,
                     file_label="classified")

    ##
    # Score Routes
    ##
    # Make a recommendation for a route to take when going to A or to B.
    # Routes that do not start at either B or A are ignored.
    scored_routes = score_routes(classified_unseen_data=classified_unseen_data,
                                 files=unseen_files,
                                 invalidate=False)
    # Sort the routes by cost ascending.
    scored_routes = sorted(scored_routes, key=lambda data: data["cost"])
    # Write out the scored routes
    # Separate the routes by their destination.
    scored_to_b_routes = [
        rdata for rdata in scored_routes if "to_b" in rdata["file"].name
    ]
    scored_to_a_routes = [
        rdata for rdata in scored_routes if "to_a" in rdata["file"].name
    ]
    # Write out the scored routes.
    scored_out_path = Path("./out/scored_unseen/")
    write_routes_kml(routes=[data["route"] for data in scored_to_b_routes],
                     files=[data["file"] for data in scored_to_b_routes],
                     out_path=scored_out_path,
                     file_label="scored",
                     stops_as_path=False,
                     turns_as_path=False,
                     altitude_as_speed=False)
    write_routes_kml(routes=[data["route"] for data in scored_to_a_routes],
                     files=[data["file"] for data in scored_to_a_routes],
                     out_path=scored_out_path,
                     file_label="scored",
                     stops_as_path=False,
                     turns_as_path=False,
                     altitude_as_speed=False)

    ##
    # Summarize
    ##
    # Training routes KML summary.
    write_coordinates_summary(
        kml_filename=Path("./out/training_routes_summary.kml"),
        coords_list=training_sets,
        fnames=training_files)
    # Unseen routes KML summary.
    write_coordinates_summary(
        kml_filename=Path("./out/unseen_routes_summary.kml"),
        coords_list=unseen_sets,
        fnames=unseen_files)
    # All routes KML summary.
    write_coordinates_summary(
        kml_filename=Path("./out/all_routes_summary.kml"),
        coords_list=training_sets + unseen_sets,
        fnames=training_files + unseen_files)
    # Scored to B routes KML summary.
    write_coordinates_summary(
        kml_filename=Path("./out/scored_to_b_summary.kml"),
        coords_list=[data["route"] for data in scored_to_b_routes],
        fnames=[data["file"] for data in scored_to_b_routes])
    # Scored to A routes KML summary.
    write_coordinates_summary(
        kml_filename=Path("./out/scored_to_a_summary.kml"),
        coords_list=[data["route"] for data in scored_to_a_routes],
        fnames=[data["file"] for data in scored_to_a_routes])
    # Summarize scores.
    pd.set_option("display.max_colwidth", -1)
    pd.set_option("display.expand_frame_repr", False)
    routes_table = create_scored_routes_table(scored_routes)
    to_b_table = routes_table[routes_table.destination == "B"].drop(
        "destination", axis=1)
    to_a_table = routes_table[routes_table.destination == "A"].drop(
        "destination", axis=1)
    # Show the route scores for going to B.
    print("###### To B Scored Routes ######")
    print(to_b_table)
    # Show the route scores for going to A.
    print("###### To A Scored Routes ######")
    print(to_a_table)
    # Show all route scores.
    print("###### All Scored Routes  ######")
    print(routes_table)
    print()