    def test_plot_split_value_histogram(self):
        gbm0 = lgb.train(self.params, self.train_data, num_boost_round=10)
        ax0 = lgb.plot_split_value_histogram(gbm0, 27)
        self.assertIsInstance(ax0, matplotlib.axes.Axes)
        self.assertEqual(ax0.get_title(),
                         'Split value histogram for feature with index 27')
        self.assertEqual(ax0.get_xlabel(), 'Feature split value')
        self.assertEqual(ax0.get_ylabel(), 'Count')
        self.assertLessEqual(len(ax0.patches), 2)

        gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
        gbm1.fit(self.X_train, self.y_train)

        ax1 = lgb.plot_split_value_histogram(
            gbm1,
            gbm1.booster_.feature_name()[27],
            figsize=(10, 5),
            title='Histogram for feature @index/name@ @feature@',
            xlabel='x',
            ylabel='y',
            color='r')
        self.assertIsInstance(ax1, matplotlib.axes.Axes)
        self.assertEqual(
            ax1.get_title(), 'Histogram for feature name {}'.format(
                gbm1.booster_.feature_name()[27]))
        self.assertEqual(ax1.get_xlabel(), 'x')
        self.assertEqual(ax1.get_ylabel(), 'y')
        self.assertLessEqual(len(ax1.patches), 2)
        for patch in ax1.patches:
            self.assertTupleEqual(patch.get_facecolor(), (1., 0, 0, 1.))  # red

        ax2 = lgb.plot_split_value_histogram(gbm0,
                                             27,
                                             bins=10,
                                             color=['r', 'y', 'g', 'b'],
                                             title=None,
                                             xlabel=None,
                                             ylabel=None)
        self.assertIsInstance(ax2, matplotlib.axes.Axes)
        self.assertEqual(ax2.get_title(), '')
        self.assertEqual(ax2.get_xlabel(), '')
        self.assertEqual(ax2.get_ylabel(), '')
        self.assertEqual(len(ax2.patches), 10)
        self.assertTupleEqual(ax2.patches[0].get_facecolor(),
                              (1., 0, 0, 1.))  # r
        self.assertTupleEqual(ax2.patches[1].get_facecolor(),
                              (.75, .75, 0, 1.))  # y
        self.assertTupleEqual(ax2.patches[2].get_facecolor(),
                              (0, .5, 0, 1.))  # g
        self.assertTupleEqual(ax2.patches[3].get_facecolor(),
                              (0, 0, 1., 1.))  # b

        self.assertRaises(ValueError, lgb.plot_split_value_histogram, gbm0,
                          0)  # was not used in splitting
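The test above relies on fixtures (self.params, self.train_data, self.X_train, self.y_train) that the excerpt omits. A minimal sketch of a compatible setUp, assuming the breast-cancer dataset used by the pytest variant in Example no. 2 (class name and parameter values are assumptions):

import unittest

import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


class TestPlotting(unittest.TestCase):  # hypothetical test-case class
    def setUp(self):
        # Breast cancer has 30 features, so feature index 27 exists.
        X, y = load_breast_cancer(return_X_y=True)
        self.X_train, _, self.y_train, _ = train_test_split(
            X, y, test_size=0.1, random_state=42)
        self.params = {'objective': 'binary', 'verbose': -1, 'num_leaves': 3}
        self.train_data = lgb.Dataset(self.X_train, self.y_train)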
Example no. 2
def test_plot_split_value_histogram(params, breast_cancer_split, train_data):
    X_train, _, y_train, _ = breast_cancer_split

    gbm0 = lgb.train(params, train_data, num_boost_round=10)
    ax0 = lgb.plot_split_value_histogram(gbm0, 27)
    assert isinstance(ax0, matplotlib.axes.Axes)
    assert ax0.get_title() == 'Split value histogram for feature with index 27'
    assert ax0.get_xlabel() == 'Feature split value'
    assert ax0.get_ylabel() == 'Count'
    assert len(ax0.patches) <= 2

    gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    gbm1.fit(X_train, y_train)

    ax1 = lgb.plot_split_value_histogram(
        gbm1,
        gbm1.booster_.feature_name()[27],
        figsize=(10, 5),
        title='Histogram for feature @index/name@ @feature@',
        xlabel='x',
        ylabel='y',
        color='r')
    assert isinstance(ax1, matplotlib.axes.Axes)
    title = 'Histogram for feature name {}'.format(
        gbm1.booster_.feature_name()[27])
    assert ax1.get_title() == title
    assert ax1.get_xlabel() == 'x'
    assert ax1.get_ylabel() == 'y'
    assert len(ax1.patches) <= 2
    for patch in ax1.patches:
        assert patch.get_facecolor() == (1., 0, 0, 1.)  # red

    ax2 = lgb.plot_split_value_histogram(gbm0,
                                         27,
                                         bins=10,
                                         color=['r', 'y', 'g', 'b'],
                                         title=None,
                                         xlabel=None,
                                         ylabel=None)
    assert isinstance(ax2, matplotlib.axes.Axes)
    assert ax2.get_title() == ''
    assert ax2.get_xlabel() == ''
    assert ax2.get_ylabel() == ''
    assert len(ax2.patches) == 10
    assert ax2.patches[0].get_facecolor() == (1., 0, 0, 1.)  # r
    assert ax2.patches[1].get_facecolor() == (.75, .75, 0, 1.)  # y
    assert ax2.patches[2].get_facecolor() == (0, .5, 0, 1.)  # g
    assert ax2.patches[3].get_facecolor() == (0, 0, 1., 1.)  # b

    with pytest.raises(ValueError):
        lgb.plot_split_value_histogram(gbm0, 0)  # was not used in splitting
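Example no. 2 is the pytest rewrite of the same test; the fixtures it takes as arguments live elsewhere in the suite. A minimal sketch of compatible fixtures (bodies are assumptions):

import pytest
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


@pytest.fixture
def params():
    return {'objective': 'binary', 'verbose': -1, 'num_leaves': 3}


@pytest.fixture
def breast_cancer_split():
    # Returned in train_test_split order: X_train, X_test, y_train, y_test.
    X, y = load_breast_cancer(return_X_y=True)
    return train_test_split(X, y, test_size=0.1, random_state=42)


@pytest.fixture
def train_data(breast_cancer_split):
    X_train, _, y_train, _ = breast_cancer_split
    return lgb.Dataset(X_train, y_train)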
Example no. 3
    def train_and_predict(self, show_plots=False):
        print("\n\n\nlgb.Run()")

        # define hyperparameters
        # https://www.kaggle.com/kneroma/m5-first-public-notebook-under-0-50

        cat_feats = [
            'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
            "event_name_1", "event_name_2", "event_type_1", "event_type_2"
        ]

        # NOTE: `features` is defined elsewhere in the class
        train_set = lgb.Dataset(self.x_train[features],
                                self.y_train,
                                categorical_feature=cat_feats)
        val_set = lgb.Dataset(self.x_val[features],
                              self.y_val,
                              categorical_feature=cat_feats)

        print("Training model...")
        evals_result = {}

        # TODO: try 'metric': 'auc'
        params = {
            'boosting_type': 'gbdt',
            'metric': 'rmse',
            'objective': 'regression',
            'n_jobs': -1,
            'seed': 236,
            'learning_rate': 0.075,
            'num_leaves': 128,
            'min_data_in_leaf': 50,
            #'bagging_fraction': 0.75,
            #'bagging_freq': 10,
            #'colsample_bytree': 0.75
        }

        self.model = lgb.train(
            params,
            train_set,
            valid_sets=val_set,
            verbose_eval=10,
            num_boost_round=2500,
            early_stopping_rounds=50,
        )

        # TODO: this started erroring after being moved into the class
        if show_plots:
            ax = lgb.plot_importance(self.model, max_num_features=20)
            plt.show()

            ax = lgb.plot_split_value_histogram(self.model,
                                                feature='store_id',
                                                bins='auto')
            plt.show()
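Both plots above open separate figures. plot_importance and plot_split_value_histogram also accept an ax= keyword (used in Examples no. 6 and no. 10 below), so the two panels can share one figure; a sketch of an alternative body for the show_plots branch, under the same self.model:

import matplotlib.pyplot as plt
import lightgbm as lgb

# Hypothetical alternative: importance and split-value histogram side by side.
fig, (ax_imp, ax_hist) = plt.subplots(1, 2, figsize=(16, 6))
lgb.plot_importance(self.model, max_num_features=20, ax=ax_imp)
lgb.plot_split_value_histogram(self.model, feature='store_id',
                               bins='auto', ax=ax_hist)
fig.tight_layout()
plt.show()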
Example no. 4
gbm = lgb.train(params,  # call head restored; the snippet was truncated here
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])],
                categorical_feature=[21],
                evals_result=evals_result,
                verbose_eval=10)

print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto')
plt.show()

print('Plotting 54th tree...')  # this tree uses the categorical feature to split
ax = lgb.plot_tree(gbm,
                   tree_index=53,
                   figsize=(15, 15),
                   show_info=['split_gain'])
plt.show()

print('Plotting 54th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=53, name='Tree54')
graph.render(view=True)
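graph.render(view=True) tries to open the rendered file in a desktop viewer. On a headless machine the same digraph can be written straight to disk instead; a sketch using the graphviz API (output format and cleanup flag are choices, not from the source):

# Write the DOT source plus a PNG rendering without launching a viewer.
graph.format = 'png'
graph.render(filename='Tree54', view=False, cleanup=True)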
Example no. 5
def Lgboost(data):
    """
    LightGBM model train/test; results are logged to file.
    """
    logging.info('\n...LGBM training started...\n')
    data = data.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
    for i in categorical_labels:
        data[i] = data[i].astype(float)
    Y = data[['SubAffiliateCost']]
    X = data[data.columns.difference(['SubAffiliateCost'])]
    use_case = ['Unnamed0']  # leftover CSV index column (after the rename above)
    X.drop(use_case, axis=1, inplace=True)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=42)

    hyper_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': ['Huber'],
        'learning_rate': 0.005,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.7,
        'bagging_freq': 10,
        'verbose': -1,
        "alpha": 1.35,
        "max_depth": 8,
        "num_leaves": 128,
        "max_bin": 512,
        "num_iterations": 2500,
        "n_estimators": 250
    }
    try:
        model = lgb.LGBMRegressor(**hyper_params)
        model.fit(X_train,
                  y_train,
                  eval_set=(X_test, y_test),
                  feature_name='auto',
                  categorical_feature=categorical_labels,
                  verbose=False)
        y_train_pred = model.predict(X_train)
        y_pred = model.predict(X_test)
        #print(mean_absolute_error(np.exp(y_train),np.exp(y_train_pred)),np.exp(y_test),np.exp(y_pred))
        '''logging.info("Train "+str(math.sqrt(mean_squared_error(np.exp(y_train_pred), np.exp(y_train)))))
        logging.info("Test "+str(math.sqrt(mean_squared_error(np.exp(y_pred),np.exp(y_test)))))
        logging.info("Train MAE "+str(mean_absolute_error(np.exp(y_train_pred), np.exp(y_train))))
        logging.info("Test MAE "+str( mean_absolute_error(np.exp(y_pred), np.exp(y_test))))

        r2_train = r2_score(np.exp(y_train_pred), np.exp(y_train))
        r2_test = r2_score(np.exp(y_pred), np.exp(y_test))
        adj_r2_train = 1 - ((1 - (r2_train) ** 2) * (len(X_train) - 1) / (len(X_train) - len(list(X.columns)) - 1))
        adj_r2_test = 1 - ((1 - (r2_test) ** 2) * (len(X_test) - 1) / (len(X_test) - len(list(X.columns)) - 1))
        logging.info("R2, Adjusted R2 Train : "+str(r2_train)+" "+str(adj_r2_train))
        logging.info("R2, Adjusted R2 Test : "+str(r2_test)+" "+str(adj_r2_test))
        '''

        logging.info("Train " + str(mean_squared_error(y_train_pred, y_train)))
        logging.info("Test " + str(mean_squared_error(y_pred, y_test)))
        logging.info("Train MAE " +
                     str(mean_absolute_error(y_train_pred, y_train)))
        logging.info("Test MAE " + str(mean_absolute_error(y_pred, y_test)))

        # y_true comes first in sklearn's r2_score
        r2_train = r2_score(y_train, y_train_pred)
        r2_test = r2_score(y_test, y_pred)
        # adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
        adj_r2_train = 1 - ((1 - r2_train) * (len(X_train) - 1) /
                            (len(X_train) - len(list(X.columns)) - 1))
        adj_r2_test = 1 - ((1 - r2_test) * (len(X_test) - 1) /
                           (len(X_test) - len(list(X.columns)) - 1))
        logging.info("R2, Adjusted R2 Train : " + str(r2_train) + " " +
                     str(adj_r2_train))
        logging.info("R2, Adjusted R2 Test : " + str(r2_test) + " " +
                     str(adj_r2_test))

        feature_name = zip(X.columns, model.feature_importances_)
        feature_name = sorted(feature_name, key=lambda x: x[1], reverse=True)
        logging.info("Top 15 Features" + str(feature_name[:15]))
        logging.info("\n...LGB Model Train/Test Completed...\n")

        ax = lgb.plot_importance(model, max_num_features=15)
        plt.savefig(reports_dir + "feature_importance.png")

        ax = lgb.plot_split_value_histogram(model,
                                            feature='OnSiteTime',
                                            bins='auto')
        plt.savefig(reports_dir + "feature_histogram.png")

    except Exception as e:
        logging.error(e)
Example no. 6
bst = lgb.train(param,  # call head reconstructed; `param`/`dtrain` names are assumed
                dtrain,
                num_boost_round=1000,
                valid_sets=[dvalid],
                early_stopping_rounds=30)

valid_pred = bst.predict(val_X)
valid_score = metrics.roc_auc_score(val_y, valid_pred)
print(f"Validation AUC score: {valid_score:.4f}")

import matplotlib.pyplot as plt

from lightgbm import plot_importance
from lightgbm import plot_split_value_histogram
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(bst, ax=ax)
fig, ax = plt.subplots(figsize=(10, 8))
plot_split_value_histogram(bst, 'Forecast', ax=ax)
plt.show()
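As Examples no. 1 and no. 2 assert, plot_split_value_histogram raises ValueError for a feature that was never used in a split (Example no. 9 below catches exactly this). A defensive variant of the call above:

# Guard the histogram: features with no splits raise ValueError.
try:
    fig, ax = plt.subplots(figsize=(10, 8))
    plot_split_value_histogram(bst, 'Forecast', ax=ax)
except ValueError:
    print("'Forecast' was never used in a split; nothing to plot.")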

ax = lgb.plot_tree(bst,
                   tree_index=3,
                   figsize=(200, 200),
                   show_info=['split_gain'])
"""
--------------------------------------------------------------------------
--------------------------------------------------------------------------
--------------------------------------------------------------------------
"""
# Fitting classifier to the Training set
# Create your classifier here

from sklearn.linear_model import LogisticRegression
Example no. 7
score2 = np.mean(np.abs((np.expm1(y_test) - np.expm1(y_pred2)) / np.expm1(y_test))) * 100
print ("\nLGB Model Report")
print("train {:.2f} | valid {:.2f}".format(float(score1), float(score2)))


# plot
print('Plot metrics during training...')
ax = lgb.plot_metric(evals_result, metric='mape')
plt.show()

print('Plot feature importances...')
ax = lgb.plot_importance(mod_lgb, max_num_features=10)
plt.show()

print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(mod_lgb, feature='CRITICSCORE', bins='auto')
plt.show()


# hyperparameters tuning
def my_scorer(y_true, y_pred):
    mape = np.mean(np.abs((np.expm1(y_true) - np.expm1(y_pred)) / np.expm1(y_true))) * 100
    return mape
my_func = make_scorer(my_scorer, greater_is_better=False)


cv_lgb = lgb.LGBMRegressor(learning_rate=0.1, n_estimators=300, objective='regression', verbosity=-1, random_state=1337)

parameters = {
              'learning_rate':[0.1],
              'num_leaves':[5,10,15,20], 
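The excerpt stops inside the parameter grid. A plausible continuation that wires the my_func scorer and the grid into a search; the remaining grid entries, the cv count, and the X_train/y_train names are assumptions:

from sklearn.model_selection import GridSearchCV

# Hypothetical continuation of the tuning section above.
grid = GridSearchCV(cv_lgb, parameters, scoring=my_func, cv=3)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)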
Example no. 8
evals_result = {}  # to record eval results for plotting

# Fit LightGBM model on training data
gbm = lgb.train(params,
                lgb_train,
                valid_sets=[lgb_valid, lgb_train],
                valid_names=['valid', 'train'],
                early_stopping_rounds=5,
                evals_result=evals_result)

ax = lgb.plot_tree(gbm,
                   tree_index=53,
                   figsize=(25, 15),
                   show_info=['split_gain'])
plt.savefig(dm_nodedir + '/rpt_tree.png', dpi=500)

print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.savefig(dm_nodedir + '/rpt_importance.png', pad_inches=0.1)

print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(gbm, feature='IMP_CLNO', bins='auto')
plt.savefig(dm_nodedir + '/rpt_hist1.png')

# Generate predictions and create new columns for Model Studio
tmp = gbm.predict(dm_inputdf.loc[:, dm_input])
dm_scoreddf = pd.DataFrame()
dm_scoreddf[dm_predictionvar[1]] = tmp
dm_scoreddf[dm_predictionvar[0]] = 1 - tmp
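Example no. 8 records evals_result but the excerpt never plots it; the recorded curves can be drawn with lgb.plot_metric, as in Examples no. 4 and no. 7 (the metric name here is an assumption, since params is not shown):

print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='auc')  # metric name assumed
plt.savefig(dm_nodedir + '/rpt_metric.png')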
Example no. 9

    plt.figure()
    lgb.plot_importance(model)
    plt.savefig('lgb_training_results_feature_importance_{}.pdf'.format(suffix), bbox_inches='tight')

    plt.figure()
    lgb.plot_metric(results, metric='pauc', ylabel='PAUC')
    plt.savefig('lgb_training_results_learning_curve_{}.pdf'.format(suffix), bbox_inches='tight')

    split_value_hist = []
    plt.figure()
    for feature in features:
      safe_name = feature.replace('_', '-')  # sanitize only the file name, not the lookup key
      outputname = 'lgb_training_results_split_value_histogram_{}_{}.pdf'.format(safe_name, suffix)
      try:
        lgb.plot_split_value_histogram(model, feature)
        plt.savefig(outputname, bbox_inches='tight')
        plt.cla()
        split_value_hist.append(outputname)
      except ValueError:
        pass

    pdf_combine(split_value_hist, 'lgb_training_results_split_value_histogram_{}.pdf'.format(suffix))
    # NOTE: map() is lazy in Python 3, so the original map(os.system, ...) never ran.
    for name in split_value_hist:
      os.remove(name)


    print("Finding working points for new training:")
    working_points = get_working_points(df, "score")
    print("")

    n_pt_bins = 100
Example no. 10
    def plot_split_value_histogram(self, ax=None, height=1):
        # NOTE: `height` is accepted but not forwarded to LightGBM
        lgb.plot_split_value_histogram(self.model, ax=ax)
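A short usage sketch for this wrapper; the enclosing class and the instance name are hypothetical:

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 4))
wrapper.plot_split_value_histogram(ax=ax)  # `wrapper` is a hypothetical instance
plt.show()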