def test_plot_split_value_histogram(self):
    """Check ``lgb.plot_split_value_histogram`` with default and custom options.

    Three plots are inspected: a Booster with defaults, an sklearn-style
    classifier addressed by feature name with custom labels, and a Booster
    with explicit bins, a colour cycle and all labels disabled. Finally a
    feature that was never used for splitting must raise ``ValueError``.
    """
    red = (1., 0, 0, 1.)

    # --- Booster, every plotting option left at its default ---------------
    booster = lgb.train(self.params, self.train_data, num_boost_round=10)
    default_ax = lgb.plot_split_value_histogram(booster, 27)
    self.assertIsInstance(default_ax, matplotlib.axes.Axes)
    self.assertEqual(default_ax.get_title(),
                     'Split value histogram for feature with index 27')
    self.assertEqual(default_ax.get_xlabel(), 'Feature split value')
    self.assertEqual(default_ax.get_ylabel(), 'Count')
    self.assertLessEqual(len(default_ax.patches), 2)

    # --- sklearn wrapper, feature addressed by name, custom labels --------
    classifier = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    classifier.fit(self.X_train, self.y_train)
    named_ax = lgb.plot_split_value_histogram(
        classifier,
        classifier.booster_.feature_name()[27],
        figsize=(10, 5),
        title='Histogram for feature @index/name@ @feature@',
        xlabel='x',
        ylabel='y',
        color='r',
    )
    self.assertIsInstance(named_ax, matplotlib.axes.Axes)
    # The @index/name@ and @feature@ placeholders must have been substituted.
    expected_title = 'Histogram for feature name {}'.format(
        classifier.booster_.feature_name()[27])
    self.assertEqual(named_ax.get_title(), expected_title)
    self.assertEqual(named_ax.get_xlabel(), 'x')
    self.assertEqual(named_ax.get_ylabel(), 'y')
    self.assertLessEqual(len(named_ax.patches), 2)
    for bar in named_ax.patches:
        self.assertTupleEqual(bar.get_facecolor(), red)

    # --- Booster, explicit bins, colour cycle, labels switched off --------
    bare_ax = lgb.plot_split_value_histogram(
        booster, 27, bins=10, color=['r', 'y', 'g', 'b'],
        title=None, xlabel=None, ylabel=None)
    self.assertIsInstance(bare_ax, matplotlib.axes.Axes)
    self.assertEqual(bare_ax.get_title(), '')
    self.assertEqual(bare_ax.get_xlabel(), '')
    self.assertEqual(bare_ax.get_ylabel(), '')
    self.assertEqual(len(bare_ax.patches), 10)
    expected_colors = (
        red,                 # r
        (.75, .75, 0, 1.),   # y
        (0, .5, 0, 1.),      # g
        (0, 0, 1., 1.),      # b
    )
    # Only the first four bars are checked, one per colour in the cycle.
    for bar, rgba in zip(bare_ax.patches, expected_colors):
        self.assertTupleEqual(bar.get_facecolor(), rgba)

    # Feature 0 was not used in splitting, so there is nothing to plot.
    with self.assertRaises(ValueError):
        lgb.plot_split_value_histogram(booster, 0)
def test_plot_split_value_histogram(params, breast_cancer_split, train_data):
    """Check ``lgb.plot_split_value_histogram`` with default and custom options.

    Covers a Booster plotted with defaults, an sklearn-style classifier
    addressed by feature name with custom labels, a Booster plotted with
    explicit bins and a colour cycle, and the ``ValueError`` raised for a
    feature that was never used for splitting.
    """
    features_train, _, labels_train, _ = breast_cancer_split
    red = (1., 0, 0, 1.)

    # --- Booster, every plotting option left at its default ---------------
    booster = lgb.train(params, train_data, num_boost_round=10)
    default_ax = lgb.plot_split_value_histogram(booster, 27)
    assert isinstance(default_ax, matplotlib.axes.Axes)
    assert default_ax.get_title() == 'Split value histogram for feature with index 27'
    assert default_ax.get_xlabel() == 'Feature split value'
    assert default_ax.get_ylabel() == 'Count'
    assert len(default_ax.patches) <= 2

    # --- sklearn wrapper, feature addressed by name, custom labels --------
    classifier = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    classifier.fit(features_train, labels_train)
    named_ax = lgb.plot_split_value_histogram(
        classifier,
        classifier.booster_.feature_name()[27],
        figsize=(10, 5),
        title='Histogram for feature @index/name@ @feature@',
        xlabel='x',
        ylabel='y',
        color='r',
    )
    assert isinstance(named_ax, matplotlib.axes.Axes)
    # The @index/name@ and @feature@ placeholders must have been substituted.
    expected_title = 'Histogram for feature name {}'.format(
        classifier.booster_.feature_name()[27])
    assert named_ax.get_title() == expected_title
    assert named_ax.get_xlabel() == 'x'
    assert named_ax.get_ylabel() == 'y'
    assert len(named_ax.patches) <= 2
    for bar in named_ax.patches:
        assert bar.get_facecolor() == red

    # --- Booster, explicit bins, colour cycle, labels switched off --------
    bare_ax = lgb.plot_split_value_histogram(
        booster, 27, bins=10, color=['r', 'y', 'g', 'b'],
        title=None, xlabel=None, ylabel=None)
    assert isinstance(bare_ax, matplotlib.axes.Axes)
    assert bare_ax.get_title() == ''
    assert bare_ax.get_xlabel() == ''
    assert bare_ax.get_ylabel() == ''
    assert len(bare_ax.patches) == 10
    expected_colors = (
        red,                 # r
        (.75, .75, 0, 1.),   # y
        (0, .5, 0, 1.),      # g
        (0, 0, 1., 1.),      # b
    )
    # Only the first four bars are checked, one per colour in the cycle.
    for bar, rgba in zip(bare_ax.patches, expected_colors):
        assert bar.get_facecolor() == rgba

    # Feature 0 was not used in splitting, so there is nothing to plot.
    with pytest.raises(ValueError):
        lgb.plot_split_value_histogram(booster, 0)
def train_and_predict(self, show_plots=False):
    """Fit a LightGBM regression model and store the booster on ``self.model``.

    Builds training and validation ``lgb.Dataset`` objects from
    ``self.x_train`` / ``self.y_train`` and ``self.x_val`` / ``self.y_val``,
    trains with early stopping against the validation set, and optionally
    shows two diagnostic plots. Despite the name, nothing in this method
    calls ``predict``.

    Args:
        show_plots: When true, display the feature-importance plot and the
            split-value histogram for ``store_id``.

    Note:
        ``features`` is not defined in this method; it is read from an
        enclosing scope (presumably a module-level list of column names --
        TODO confirm).
    """
    print("\n\n\nlgb.Run()")

    # define random hyperparameters
    # https://www.kaggle.com/kneroma/m5-first-public-notebook-under-0-50
    cat_feats = [
        'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
        "event_name_1", "event_name_2", "event_type_1", "event_type_2",
    ]

    train_set = lgb.Dataset(self.x_train[features], self.y_train,
                            categorical_feature=cat_feats)
    val_set = lgb.Dataset(self.x_val[features], self.y_val,
                          categorical_feature=cat_feats)

    print("Training model...")

    # TODO:
    # 'metric':'auc'
    params = {
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'objective': 'regression',
        'n_jobs': -1,
        'seed': 236,
        'learning_rate': 0.075,
        'num_leaves': 128,
        'min_data_in_leaf': 50,
        #'bagging_fraction': 0.75,
        #'bagging_freq': 10,
        #'colsample_bytree': 0.75
    }

    # Up to 2500 rounds, stopping once the validation metric has not
    # improved for 50 rounds; progress is printed every 10 rounds.
    self.model = lgb.train(
        params,
        train_set,
        valid_sets=val_set,
        verbose_eval=10,
        num_boost_round=2500,
        early_stopping_rounds=50,
    )

    # TODO: Getting error with this once I moved it into class
    if show_plots:
        lgb.plot_importance(self.model, max_num_features=20)
        plt.show()
        lgb.plot_split_value_histogram(self.model, feature='store_id', bins='auto')
        plt.show()
# NOTE(review): this chunk begins inside a call whose opening lies above the
# visible source; the arguments below are its tail. Presumably this is the
# lgb.train(...) call that produces `gbm` -- confirm against the full file.
lgb_train,
num_boost_round=100,
valid_sets=[lgb_train, lgb_test],
# Name the features f1..fN, one per entry along the last axis of X_train.
feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])],
categorical_feature=[21],
# `evals_result` is handed to the call and plotted right below.
evals_result=evals_result,
verbose_eval=10)

# Each plot below is shown with its own plt.show() call.
print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto')
plt.show()

# tree_index=53 is the "54th tree" named in the messages (zero-based index).
print('Plotting 54th tree...')  # one tree use categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=['split_gain'])
plt.show()

# Same tree again, this time built as a graphviz digraph and rendered with
# view=True.
print('Plotting 54th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=53, name='Tree54')
graph.render(view=True)
def _adjusted_r2(r2, n_samples, n_features):
    """Return the adjusted R^2 for a model with ``n_features`` predictors.

    Uses ``1 - (1 - R^2) * (n - 1) / (n - p - 1)``. Returns NaN when there
    are not enough samples for the correction to be defined.
    """
    dof = n_samples - n_features - 1
    if dof <= 0:
        return float('nan')
    return 1 - (1 - r2) * (n_samples - 1) / dof


def Lgboost(data):
    """Train a LightGBM regressor on ``data``, log its metrics and save two plots.

    ``SubAffiliateCost`` is the regression target. Every other column except
    ``Unnamed0`` is used as a feature. The data is split 80/20 with a fixed
    seed; MSE, MAE, R^2 and adjusted R^2 are logged for both splits, followed
    by the 15 most important features. A feature-importance plot and the
    split-value histogram for ``OnSiteTime`` are written under ``reports_dir``.

    Args:
        data: pandas DataFrame holding the target, the feature columns and
            the columns listed in the module-level ``categorical_labels``.

    Returns:
        None. Results are reported through ``logging`` and the saved figures.

    Note:
        Any exception raised while fitting, scoring or plotting is logged
        with its traceback and swallowed, so the caller is never interrupted.
    """
    logging.info('\n...LGBM training started...\n')

    # Strip every character outside [A-Za-z0-9_] from the column names.
    data = data.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
    for i in categorical_labels:
        data[i] = data[i].astype(float)

    Y = data[['SubAffiliateCost']]
    X = data[data.columns.difference(['SubAffiliateCost'])]
    # 'Unnamed0' is presumably pandas' "Unnamed: 0" index column after the
    # renaming above -- confirm. Dropping without inplace avoids mutating a
    # slice of `data`.
    X = X.drop(columns=['Unnamed0'])

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=42)

    # NOTE(review): "num_iterations" and "n_estimators" both set the number of
    # boosting rounds; which one LightGBM honours may depend on its version --
    # confirm the intended value.
    hyper_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': ['Huber'],
        'learning_rate': 0.005,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.7,
        'bagging_freq': 10,
        'verbose': -1,
        "alpha": 1.35,
        "max_depth": 8,
        "num_leaves": 128,
        "max_bin": 512,
        "num_iterations": 2500,
        "n_estimators": 250
    }

    try:
        model = lgb.LGBMRegressor(**hyper_params)
        model.fit(X_train, y_train,
                  eval_set=(X_test, y_test),
                  feature_name='auto',
                  categorical_feature=categorical_labels,
                  verbose=False)

        y_train_pred = model.predict(X_train)
        y_pred = model.predict(X_test)

        # All metrics take (y_true, y_pred). The order is irrelevant for
        # MSE/MAE but changes the result of r2_score.
        logging.info("Train " + str(mean_squared_error(y_train, y_train_pred)))
        logging.info("Test " + str(mean_squared_error(y_test, y_pred)))
        logging.info("Train MAE " + str(mean_absolute_error(y_train, y_train_pred)))
        logging.info("Test MAE " + str(mean_absolute_error(y_test, y_pred)))

        r2_train = r2_score(y_train, y_train_pred)
        r2_test = r2_score(y_test, y_pred)

        n_features = len(X.columns)
        adj_r2_train = _adjusted_r2(r2_train, len(X_train), n_features)
        adj_r2_test = _adjusted_r2(r2_test, len(X_test), n_features)

        logging.info("R2, Adjusted R2 Train : " + str(r2_train) + " " + str(adj_r2_train))
        logging.info("R2, Adjusted R2 Test : " + str(r2_test) + " " + str(adj_r2_test))

        # Rank (column, importance) pairs from most to least important.
        feature_name = sorted(zip(X.columns, model.feature_importances_),
                              key=lambda x: x[1], reverse=True)
        logging.info("Top 15 Features" + str(feature_name[:15]))
        logging.info("\n...LGB Model Train/Test Completed...\n")

        lgb.plot_importance(model, max_num_features=15)
        plt.savefig(reports_dir + "feature_importance.png")

        lgb.plot_split_value_histogram(model, feature='OnSiteTime', bins='auto')
        plt.savefig(reports_dir + "feature_histogram.png")
    except Exception as e:
        # Best-effort reporting: keep the caller running, but record the
        # traceback so failures can be diagnosed.
        logging.exception(e)
# NOTE(review): this chunk begins inside a call whose opening lies above the
# visible source; the keyword arguments below are its tail. Presumably it is
# the training call that produces `bst` -- confirm against the full file.
num_boost_round=1000, valid_sets=[dvalid], early_stopping_rounds=30)

# Score the validation rows and report the ROC AUC of the predictions.
valid_pred = bst.predict(val_X)
valid_score = metrics.roc_auc_score(val_y, valid_pred)
print(f"Validation AUC score: {valid_score:.4f}")

# Plotting helpers are imported here, mid-script, rather than at the top.
import matplotlib.pyplot as plt
from lightgbm import plot_importance
from lightgbm import plot_split_value_histogram

# Feature importances on their own 10x8 figure.
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(bst, ax=ax)

# Split-value histogram for the 'Forecast' feature on a second figure.
fig, ax = plt.subplots(figsize=(10, 8))
plot_split_value_histogram(bst, 'Forecast', ax=ax)
plt.show()

# NOTE(review): this tree plot is created after the plt.show() above and no
# further show/savefig is visible in this chunk -- confirm it is displayed.
ax = lgb.plot_tree(bst, tree_index=3, figsize=(200, 200), show_info=['split_gain'])

# Bare string literal used as a visual separator; it has no runtime effect.
""" -------------------------------------------------------------------------- -------------------------------------------------------------------------- -------------------------------------------------------------------------- """

# Fitting classifier to the Training set
# Create your classifier here
from sklearn.linear_model import LogisticRegression
# Mean absolute percentage error (in %) on the test split, computed after
# applying expm1 to both targets and predictions (presumably the targets were
# log1p-transformed upstream -- confirm).
score2 = np.mean(np.abs((np.expm1(y_test) - np.expm1(y_pred2)) / np.expm1(y_test))) * 100

# `score1` is computed above the visible source.
print ("\nLGB Model Report")
print("train {:.2f} | valid {:.2f}".format(float(score1), float(score2)))

# plot
print('Plot metrics during training...')
ax = lgb.plot_metric(evals_result, metric='mape')
plt.show()

print('Plot feature importances...')
ax = lgb.plot_importance(mod_lgb, max_num_features=10)
plt.show()

print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(mod_lgb, feature='CRITICSCORE', bins='auto')
plt.show()

# hyperparameters tuning
def my_scorer(y_true, y_pred):
    """Return the MAPE (in %) between expm1(y_true) and expm1(y_pred)."""
    mape = np.mean(np.abs((np.expm1(y_true) - np.expm1(y_pred)) / np.expm1(y_true))) * 100
    return mape

# MAPE is an error, so the scorer is declared with greater_is_better=False.
my_func = make_scorer(my_scorer, greater_is_better=False)

cv_lgb = lgb.LGBMRegressor(learning_rate=0.1, n_estimators=300, objective='regression', verbosity = -1, random_state= 1337)

# Search grid; the dict literal continues past the end of the visible source.
parameters = { 'learning_rate':[0.1], 'num_leaves':[5,10,15,20],
evals_result = {}  # to record eval results for plotting

# Fit LightGBM model on training data. Training stops once the validation
# metric has not improved for 5 consecutive rounds.
gbm = lgb.train(params,
                lgb_train,
                valid_sets=[lgb_valid, lgb_train],
                valid_names=['valid', 'train'],
                early_stopping_rounds=5,
                evals_result=evals_result)

# Early stopping can end training before a 54th tree exists, and plot_tree
# raises IndexError for an out-of-range tree_index. Fall back to the last
# available tree instead of aborting the whole report.
ax = lgb.plot_tree(gbm,
                   tree_index=min(53, gbm.num_trees() - 1),
                   figsize=(25, 15),
                   show_info=['split_gain'])
plt.savefig(dm_nodedir + '/rpt_tree.png', dpi=500)

print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.savefig(dm_nodedir + '/rpt_importance.png', pad_inches=0.1)

print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(gbm, feature='IMP_CLNO', bins='auto')
plt.savefig(dm_nodedir + '/rpt_hist1.png')

# Generate predictions and create new columns for Model Studio.
# dm_predictionvar[1] receives the model output and dm_predictionvar[0] its
# complement (1 - output).
tmp = gbm.predict(dm_inputdf.loc[:, dm_input])
dm_scoreddf = pd.DataFrame()
dm_scoreddf[dm_predictionvar[1]] = tmp
dm_scoreddf[dm_predictionvar[0]] = 1 - tmp
# Feature-importance plot.
plt.figure()
lgb.plot_importance(model)
plt.savefig('lgb_training_results_feature_importance_{}.pdf'.format(suffix), bbox_inches='tight')

# Learning curve for the 'pauc' metric.
# NOTE(review): the y-label 'PACU' looks like a typo for the 'pauc' metric --
# confirm before changing the label.
plt.figure()
lgb.plot_metric(results, metric='pauc', ylabel='PACU')
plt.savefig('lgb_training_results_learning_curve_{}.pdf'.format(suffix), bbox_inches='tight')

# One split-value histogram PDF per feature; the per-feature files are merged
# into a single PDF afterwards and then deleted.
split_value_hist = []
plt.figure()
for feature in features:
    feature = feature.replace('_','-')
    outputname = 'lgb_training_results_split_value_histogram_{}_{}.pdf'.format(feature, suffix)
    try:
        lgb.plot_split_value_histogram(model, feature)
        plt.savefig(outputname, bbox_inches='tight')
        plt.cla()
        split_value_hist.append(outputname)
    except ValueError:
        # Deliberately skipped: no histogram could be produced for this
        # feature, so there is no file to merge.
        pass

pdf_combine(split_value_hist, 'lgb_training_results_split_value_histogram_{}.pdf'.format(suffix))

# Delete the per-feature PDFs. The previous `map(lambda ...)` call is lazy on
# Python 3 and was never consumed, so nothing was removed. os.remove also
# avoids passing file names through a shell. Removal stays best-effort, as
# `rm` via os.system was.
for stale_pdf in split_value_hist:
    try:
        os.remove(stale_pdf)
    except OSError:
        pass

print("Finding working points for new training:")
working_points = get_working_points(df, "score")
print("")

n_pt_bins = 100
def plot_split_value_histogram(self, ax=None, height=1, feature=None, **kwargs):
    """Plot the split-value histogram of one feature of ``self.model``.

    Thin wrapper around ``lgb.plot_split_value_histogram``. The previous
    version never forwarded a feature, so every call failed with
    ``TypeError``; the feature is now an explicit argument.

    Args:
        ax: Target matplotlib Axes, forwarded as ``ax``. ``None`` is passed
            through unchanged.
        height: Unused. Kept only so existing positional callers keep
            working.
        feature: Name or index of the feature to plot. Required.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``lgb.plot_split_value_histogram`` (for example ``bins``).

    Returns:
        Whatever ``lgb.plot_split_value_histogram`` returns (the Axes holding
        the plot, per the LightGBM documentation).

    Raises:
        TypeError: If ``feature`` is not supplied.
    """
    if feature is None:
        raise TypeError(
            "plot_split_value_histogram() requires 'feature' "
            "(a feature name or index)"
        )
    return lgb.plot_split_value_histogram(self.model, feature, ax=ax, **kwargs)