def test_create_tree_digraph(breast_cancer_split): X_train, _, y_train, _ = breast_cancer_split constraints = [-1, 1] * int(X_train.shape[1] / 2) gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1, monotone_constraints=constraints) gbm.fit(X_train, y_train) with pytest.raises(IndexError): lgb.create_tree_digraph(gbm, tree_index=83) graph = lgb.create_tree_digraph(gbm, tree_index=3, show_info=['split_gain', 'internal_value', 'internal_weight'], name='Tree4', node_attr={'color': 'red'}) graph.render(view=False) assert isinstance(graph, graphviz.Digraph) assert graph.name == 'Tree4' assert len(graph.node_attr) == 1 assert graph.node_attr['color'] == 'red' assert len(graph.graph_attr) == 0 assert len(graph.edge_attr) == 0 graph_body = ''.join(graph.body) assert 'leaf' in graph_body assert 'gain' in graph_body assert 'value' in graph_body assert 'weight' in graph_body assert '#ffdddd' in graph_body assert '#ddffdd' in graph_body assert 'data' not in graph_body assert 'count' not in graph_body
def fit(self, X, y=None): render = False zero_update=True logging.debug("Starting KiGB fit") lgb_train = lgb.Dataset(X, y, free_raw_data=False) param = self.get_params().copy() param.pop('trees') param.pop('lamda') param.pop('epsilon') param.pop('advice') # Learn first tree kigb_gbm = lgb.train(param, lgb_train, num_boost_round=1) if render: # Render tree in pdf for debugging graph = lgb.create_tree_digraph(kigb_gbm, tree_index=0, name='before_update_' + str(0)) graph.render('./render/lgbm/before_update_' + str(0)) # Update penalty values update = kigb_penalty_update(kigb_gbm, self.advice, lamda=self.lamda, epsilon=self.epsilon) if update: zero_update=False kigb_gbm.model_from_string(update, verbose=False) if render: # Render tree in pdf for debugging graph = lgb.create_tree_digraph(kigb_gbm, tree_index=0, name='after_update_' + str(0)) graph.render('./render/lgbm/after_update_' + str(0)) # iterate over trees. for h in range(1, self.trees + 1): lgb_train = lgb.Dataset(X, y, free_raw_data=False) # Bug in Lightgbm, need to initialize data # Learn next tree with initial model kigb_gbm = lgb.train(param, lgb_train, num_boost_round=1, init_model=kigb_gbm) # If trees are not learnt further, break the loop if kigb_gbm.num_trees() <= h: logging.info("Trees are not learnt further") break if render: # Render tree for debugging graph = lgb.create_tree_digraph(kigb_gbm, tree_index=h, name='before_update_'+str(h)) graph.render('./render/lgbm/before_update_'+str(h)) # Update the penalty update = kigb_penalty_update(kigb_gbm, self.advice, h, lamda=self.lamda, epsilon=self.epsilon) if update: zero_update=False kigb_gbm.model_from_string(update, verbose=False) if render: # Render tree for debugging graph = lgb.create_tree_digraph(kigb_gbm, tree_index=h, name='after_update_' + str(h)) graph.render('./render/lgbm/after_update_' + str(h)) self.kigb = kigb_gbm if zero_update: logging.info("ZERO UPDATES") logging.debug("finished KiGB fit") return self
def test_create_tree_digraph(self): constraints = [-1, 1] * int(self.X_train.shape[1] / 2) gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True, monotone_constraints=constraints) gbm.fit(self.X_train, self.y_train, verbose=False) self.assertRaises(IndexError, lgb.create_tree_digraph, gbm, tree_index=83) graph = lgb.create_tree_digraph(gbm, tree_index=3, show_info=['split_gain', 'internal_value', 'internal_weight'], name='Tree4', node_attr={'color': 'red'}) graph.render(view=False) self.assertIsInstance(graph, graphviz.Digraph) self.assertEqual(graph.name, 'Tree4') self.assertEqual(graph.filename, 'Tree4.gv') self.assertEqual(len(graph.node_attr), 1) self.assertEqual(graph.node_attr['color'], 'red') self.assertEqual(len(graph.graph_attr), 0) self.assertEqual(len(graph.edge_attr), 0) graph_body = ''.join(graph.body) self.assertIn('leaf', graph_body) self.assertIn('gain', graph_body) self.assertIn('value', graph_body) self.assertIn('weight', graph_body) self.assertIn('#ffdddd', graph_body) self.assertIn('#ddffdd', graph_body) self.assertNotIn('data', graph_body) self.assertNotIn('count', graph_body)
def __use_model__(self): # # Create a submission # submission = pd.read_csv('test.csv') ids = submission['id'].values submission.drop('id', inplace=True, axis=1) x = submission.values y = self.model.predict(x) # note: anything above .5 is rounded up binY = [round(i) for i in y] time_label = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") output = pd.DataFrame({'id': ids, 'target': binY}) output.to_csv("output{0}{1}_submission.csv".format( self.sep, time_label), index=False) auc = max(self.myoutput["valid_0"]["auc"]) params = ",".join(self.model.model_to_string().split("parameters:\n") [1].split("\n\n")[0].split("\n")) with open("submission_list.csv", "a") as csv: csv.write("{0},{1},{2},{3}\n".format(time_label, auc, params, self.test_size)) if self.save_graph: graph = lightgbm.create_tree_digraph(self.model) graph.format = "png" graph.render("output{0}{1}".format(self.sep, time_label))
def test_create_tree_digraph(self): gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True) gbm.fit(self.X_train, self.y_train, verbose=False) self.assertRaises(IndexError, lgb.create_tree_digraph, gbm, tree_index=83) graph = lgb.create_tree_digraph(gbm, tree_index=3, show_info=['split_gain', 'internal_value'], name='Tree4', node_attr={'color': 'red'}) graph.render(view=False) self.assertIsInstance(graph, graphviz.Digraph) self.assertEqual(graph.name, 'Tree4') self.assertEqual(graph.filename, 'Tree4.gv') self.assertEqual(len(graph.node_attr), 1) self.assertEqual(graph.node_attr['color'], 'red') self.assertEqual(len(graph.graph_attr), 0) self.assertEqual(len(graph.edge_attr), 0) graph_body = ''.join(graph.body) self.assertIn('threshold', graph_body) self.assertIn('split_feature_name', graph_body) self.assertNotIn('split_feature_index', graph_body) self.assertIn('leaf_index', graph_body) self.assertIn('split_gain', graph_body) self.assertIn('internal_value', graph_body) self.assertNotIn('internal_count', graph_body) self.assertNotIn('leaf_count', graph_body)
def test_plot_example(): print('Loading data...') # load or create your dataset df_train = pd.read_csv( r'/Users/longguangbin/Work/Codes/MLlearn/src/reg_models/LightGBM/data/regression.train', header=None, sep='\t') df_test = pd.read_csv( r'/Users/longguangbin/Work/Codes/MLlearn/src/reg_models/LightGBM/data/regression.test', header=None, sep='\t') y_train = df_train[0] y_test = df_test[0] X_train = df_train.drop(0, axis=1) X_test = df_test.drop(0, axis=1) # create dataset for lightgbm lgb_train = lgb.Dataset(X_train, y_train) lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train) # specify your configurations as a dict params = {'num_leaves': 5, 'metric': ('l1', 'l2'), 'verbose': 0} evals_result = {} # to record eval results for plotting print('Starting training...') # train gbm = lgb.train( params, lgb_train, num_boost_round=100, valid_sets=[lgb_train, lgb_test], feature_name=['f' + str(i + 1) for i in range(X_train.shape[-1])], categorical_feature=[21], evals_result=evals_result, verbose_eval=10) print('Plotting metrics recorded during training...') ax = lgb.plot_metric(evals_result, metric='l1') plt.show() print('Plotting feature importances...') ax = lgb.plot_importance(gbm, max_num_features=10) plt.show() print('Plotting 84th tree...') # one tree use categorical feature to split ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain']) plt.show() print('Plotting 84th tree with graphviz...') graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84') graph.render(view=True)
def plot_lgb_n_graphviz_trees(self, n: int): for idx in range(n): graph = lgb.create_tree_digraph( booster=self.model, tree_index=idx, show_info=['split_gain', 'leaf_count', 'internal_value']) graph.render(view=False, directory=self.folder_structure.dir_tree_graphviz, filename=str(idx) + "_tree", cleanup=True)
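# The method above is tied to an instance (self.model, self.folder_structure); below is a minimal standalone
# sketch of the same idea, assuming a trained lgb.Booster named `model` and an output directory `out_dir`
# (both hypothetical names), using only documented create_tree_digraph / graphviz render arguments.
import lightgbm as lgb

def render_first_n_trees(model: lgb.Booster, out_dir: str, n: int) -> None:
    # Render the first n trees of the booster as individual PDF files in out_dir.
    for idx in range(min(n, model.num_trees())):
        graph = lgb.create_tree_digraph(
            booster=model,
            tree_index=idx,
            show_info=['split_gain', 'leaf_count', 'internal_value'])
        graph.render(view=False, directory=out_dir,
                     filename=str(idx) + "_tree", cleanup=True)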
def show_model_performance(gbm, evals_result): # show model importance # lgb.plot_importance(gbm) # Show Decision Tree if config.can_plot_tree: graph = lgb.create_tree_digraph(gbm, name='Decision Tree') graph.render(view=True) if config.can_show_metric: fig, axs = plt.subplots(2, 1, figsize=(8, 10)) for index in range(len(config.metric)): lgb.plot_metric(evals_result, config.metric[index], title=config.metric[index], ax=axs[index]) plt.show()
def plot_tree(model_path, tree_index, save_plot_path): ''' Visualize one tree of a saved model. :param model_path: path to the saved LightGBM model file :param tree_index: index of the tree to plot :param save_plot_path: path the rendered graph is saved to :return: ''' if not os.path.exists(model_path): print("file does not exist! {}".format(model_path)) sys.exit(0) gbm = lgb.Booster(model_file=model_path) graph = lgb.create_tree_digraph(gbm, tree_index=tree_index, name='tree' + str(tree_index)) graph.render(filename=save_plot_path, view=True) # the rendered graph is saved to save_plot_path
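# A hedged usage sketch for the helper above; 'model.txt' and 'tree_plots/tree_0' are hypothetical placeholder paths.
# graphviz writes the source to save_plot_path, renders it to PDF, and opens the viewer because view=True.
if __name__ == '__main__':
    plot_tree(model_path='model.txt', tree_index=0, save_plot_path='tree_plots/tree_0')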
def train(self): train_x, valid_x, train_y, valid_y = train_test_split( self.train_features, self.train_labels, test_size=0.2, shuffle=False, random_state=712) if self.use_sparse_matrix: train_x, valid_x, self.test_features = csr_matrix(train_x, dtype='float32'), \ csr_matrix(valid_x, dtype='float32'), \ csr_matrix(self.test_features, dtype='float32') lgb_train = lgb.Dataset(train_x, train_y) lgb_eval = lgb.Dataset(valid_x, valid_y) param = read_json(self.config.LIGHTGBM_BEST_PARAM) param = self.config.PARAM # overrides the tuned params loaded above gbm = lgb.train(param, lgb_train, valid_sets=[lgb_eval], categorical_feature=self.config.CATEGORY_VARIABLES) print('Predicting...') test_predictions = gbm.predict(self.test_features, num_iteration=gbm.best_iteration) # save model to file gbm.save_model(self.config.MODEL_SAVING_PATH) if not self.use_sparse_matrix: self.feature_importance = pd.DataFrame({ 'feature': self.train_features.columns, 'importance': gbm.feature_importance() }) self.plot_feature_importance() print('Saving model...') print_tree = False if print_tree: print('Plotting 1st tree with graphviz...') graph = lgb.create_tree_digraph(gbm, tree_index=0, name='Tree1') graph.render(filename='assets/tree_graph') submission(self.config, test_predictions, True, '%.5f' % gbm.best_score['valid_0']['auc']) return test_predictions
def get_model_tree_digraph(model, model_name="default", outputpath="./"): ''' 画出tree 的树结构 :param model: :param model_name: :param outputpath: :param importance_type: :param num_feature: :return: ''' try: outputpath = outputpath + model_name + "_tree_digraph.gv" graph = lgb.create_tree_digraph(model, name=model_name) graph.render(filename=outputpath) except: logger.error("create model tree_digrap fail.") return False else: logger.info("create model tree_digrap sucess.") return True
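# A hedged usage sketch for get_model_tree_digraph; 'model.txt' and './plots/' are hypothetical paths.
booster = lgb.Booster(model_file='model.txt')  # load a previously saved model
if get_model_tree_digraph(booster, model_name='lgb_model', outputpath='./plots/'):
    # On success, graphviz has written './plots/lgb_model_tree_digraph.gv' plus the rendered file beside it.
    print('tree digraph rendered')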
def train_light_gbm(self, dts): # create dataset for lightgbm lgb_train = lgb.Dataset(dts.trainX, dts.trainY) lgb_test = lgb.Dataset(dts.testX, dts.testY, reference=lgb_train) # specify your configurations as a dict params = { 'num_leaves': 5, 'metric': ('l1', 'l2'), 'verbose': 0 } evals_result = {} # to record eval results for plotting print('Starting training...') # train gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=[lgb_train, lgb_test], feature_name=['close', 'open', 'high', 'low', 'volume'], categorical_feature=[21], evals_result=evals_result, verbose_eval=10) print('Plotting metrics recorded during training...') ax = lgb.plot_metric(evals_result, metric='l1') plt.show() print('Plotting feature importances...') ax = lgb.plot_importance(gbm, max_num_features=10) plt.show() print('Plotting 84th tree...') # one tree use categorical feature to split ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain']) plt.show() print('Plotting 84th tree with graphviz...') graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84') graph.render(view=True)
'verbose': 0 } evals_result = {} # to record eval results for plotting print('Start training...') # train gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=[lgb_train, lgb_test], feature_name=['f' + str(i + 1) for i in range(28)], categorical_feature=[21], evals_result=evals_result, verbose_eval=10) print('Plot metrics recorded during training...') ax = lgb.plot_metric(evals_result, metric='l1') plt.show() print('Plot feature importances...') ax = lgb.plot_importance(gbm, max_num_features=10) plt.show() print('Plot 84th tree...') # one tree use categorical feature to split ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain']) plt.show() print('Plot 84th tree with graphviz...') graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84') graph.render(view=True)
from sklearn.model_selection import GridSearchCV from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split from sklearn.datasets import make_classification from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,cohen_kappa_score # Load the data print('Load data...') X_train,X_test,y_train,y_test =train_test_split(x,l,test_size=0.2) print('Start training...') # Create and train the model lgbm= lgb.LGBMClassifier() lgbm.fit(X_train, y_train,eval_set=[(X_test, y_test)],eval_metric='l1',early_stopping_rounds=5) lgb.create_tree_digraph(lgbm, tree_index=1) import matplotlib.pyplot as plt import matplotlib fig2 = plt.figure(figsize=(20, 20)) ax = fig2.subplots() lgb.plot_tree(lgbm._Booster, tree_index=1, ax=ax) plt.show() print('Start predicting...') # Predict on the test set y_pred = lgbm.predict(X_test, num_iteration=lgbm.best_iteration_) # feature importances print('Feature importances:', list(lgbm.feature_importances_)) recall = recall_score(y_test, y_pred) # recall_score expects (y_true, y_pred)
} evals_result = {} # to record eval results for plotting print('Start training...') # train gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=[lgb_train, lgb_test], feature_name=['f' + str(i + 1) for i in range(28)], categorical_feature=[21], evals_result=evals_result, verbose_eval=10) print('Plot metrics during training...') ax = lgb.plot_metric(evals_result, metric='l1') plt.show() print('Plot feature importances...') ax = lgb.plot_importance(gbm, max_num_features=10) plt.show() print('Plot 84th tree...') # one tree use categorical feature to split ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain']) plt.show() print('Plot 84th tree with graphviz...') graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84') graph.render(view=True)
def plot_tree(est_or_grower, est_lightgbm=None, tree_index=0, view=True, **kwargs): """Plot the i'th predictor tree of a GBM or a grower tree est_or_grower can either be a GradientBoostingMachine instance or a TreeGrower. In this latter case tree_index is ignored, and more debugging info is displayed. Trees displayed from a TreeGrower have additional profiling information that is not kept in the predictor trees that result from fitting a GradientBoostingMachine. tree_index corresponds to the ith built tree. In a multiclass setting, e.g. with 3 classes, tree_index=5 will print the third tree of the second iteration. Can also plot a LightGBM estimator (on the left) for comparison. Requires matplotlib and graphviz (both python package and binary program). kwargs are passed to graphviz.Digraph() Example: plotting.plot_tree(est_pygbm, est_lightgbm, view=False, filename='output') will silently save output to output.pdf """ def make_pygbm_tree(): def add_predictor_node(node_idx, parent=None, decision=None): iteration = tree_index // est_or_grower.n_trees_per_iteration_ k = tree_index % est_or_grower.n_trees_per_iteration_ predictor_tree = est_or_grower.predictors_[iteration][k] node = predictor_tree.nodes[node_idx] name = 'split__{}'.format(node_idx) label = 'split_feature_index: {}'.format(node['feature_idx']) label += r'\nthreshold: {:.3f}'.format(node['threshold']) label += r'\ngain: {:.3E}'.format(node['gain']) label += r'\nvalue: {:.3f}'.format(node['value']) label += r'\ncount: {:,}'.format(node['count']) graph.node(name, label=label) if not node['is_leaf']: add_predictor_node(node['left'], name, decision='<=') add_predictor_node(node['right'], name, decision='>') if parent is not None: graph.edge(parent, name, decision) def add_grower_node(node, parent=None, decision=None): name = 'split__{0}'.format(id(node)) si = node.split_info if si is None: feature_idx = 0 bin_idx = 0 gain = 0. sum_gradients = 0. sum_hessians = 0. else: feature_idx = si.feature_idx gain = 0. if si.gain is None else si.gain bin_idx = si.bin_idx sum_gradients = si.gradient_left + si.gradient_right sum_hessians = si.hessian_left + si.hessian_right value = 0. if node.value is None else node.value label = 'split_feature_index: {}'.format(feature_idx) label += r'\nbin threshold: {}'.format(bin_idx) label += r'\ngain: {:.3E}'.format(gain) label += r'\nvalue: {:.3f}'.format(value) label += r'\ncount: {:,}'.format(node.sample_indices.shape[0]) label += r'\nhist subtraction: {}'.format(node.hist_subtraction) label += r'\nhist speed: {:.3E}'.format(node.construction_speed) label += r'\nfind split time: {:.4f}'.format(node.find_split_time) label += r'\napply split time: {:.4f}'.format( node.apply_split_time) label += r'\nsum gradients: {:.3E}'.format(sum_gradients) label += r'\nsum hessians: {:.3E}'.format(sum_hessians) graph.node(name, label=label) if node.value is None: # not a leaf node add_grower_node(node.left_child, name, decision='<=') add_grower_node(node.right_child, name, decision='>') if parent is not None: graph.edge(parent, name, decision) if isinstance(est_or_grower, BaseGradientBoostingMachine): add_predictor_node(0) elif isinstance(est_or_grower, pygbm.grower.TreeGrower): add_grower_node(est_or_grower.root) # make lightgbm tree if est_lightgbm is not None: import lightgbm as lb graph = lb.create_tree_digraph(est_lightgbm, tree_index=tree_index, show_info=[ 'split_gain', 'internal_value', 'internal_count', 'leaf_count' ], **kwargs) else: graph = Digraph(**kwargs) # make pygbm tree make_pygbm_tree() graph.render(view=view)
# -*- coding: utf-8 -*- __author__ = 'lijingjie' import sys sys.path.insert(0, 'src/models/') sys.path.insert(0, 'src/conf/') sys.path.insert(0, '../conf/') sys.path.insert(0, '../models') sys.path.insert(0, '../') import graphviz import warnings warnings.filterwarnings("ignore") import os os.environ['LIGHTGBM_EXEC'] = "/Users/jacklee/LightGBM/lightgbm" # os.environ["PATH"] += os.pathsep + 'E:/Program Files (x86)/Graphviz2.38/bin' import lightgbm as lgb bst = lgb.Booster(model_file='lightgbm/20190512-1047/lgb-lgb-tst1-fold-0-0.dump') image = lgb.create_tree_digraph(bst, tree_index=1,show_info=['split_gain','internal_value','internal_count','leaf_count']) image.render('lightgbm/20190512-1047/lgb-lgb-tst1-fold-0-0.gv', view=True) print ('checking Done!')
print('Plot feature importances...') ax = lgb.plot_importance(lgbm, max_num_features=10) # In[73]: print('Plot 4th tree...') # one tree use categorical feature to split ax = lgb.plot_tree(lgbm, tree_index=3, figsize=(20, 8), show_info=['split_gain']) # In[70]: import graphviz print('Plot 4th tree with graphviz...') graph = lgb.create_tree_digraph(lgbm, tree_index=3, name='Tree4') graph.render(view=True) # In[60]: from sklearn.model_selection import GridSearchCV param_grid = { 'learning_rate': [0.01, 0.1, 1], 'n_estimators': [20, 40] } estimator = lgb.LGBMRegressor(num_boost_round=20, early_stopping_rounds=5) grid_lgbm = GridSearchCV(estimator, param_grid) grid_lgbm.fit(train_all[features], train_all['kda_ratio']) print('Best parameters found by grid search are:', grid_lgbm.best_params_)
yi = dfval.loc[n]['y'] n += 1 print("top n: ", n) print('test:', dtest[dtest['y'] >= yi].shape[0]) # predictions on the training set ytrain_pred = gbm.predict(X_train) dftrain['y'] = ytrain_pred dftrain = dftrain.sort_values('y', ascending=False).reset_index(drop=True) dftrain.to_csv('../datas/{0}_ytrain_pred'.format(xstr), index=None) #y_pred = gbm.predict(X_test) #xtest['y'] = y_pred #xtest = xtest.sort_values('y',ascending=False).reset_index(drop=True) cols = df_train.columns.tolist() scores = gbm.feature_importance() df = pd.DataFrame({'cols': cols, 'scores': scores}) df = df.sort_values('scores', ascending=False).reset_index(drop=True) df.to_csv('../datas/a', index=None, header=None) #print xtest.head() #xtest.to_csv('../datas/xtest',index=None) #print evals_result #lgb.plot_metric(evals_result,metric='auc') # #lgb.plot_metric(evals_result,metric='binary_logloss') #lgb.plot_importance(gbm, max_num_features=50) # graph = lgb.create_tree_digraph(gbm, tree_index=0, name='Tree0') graph.render(view=True) #plt.show()
print("Starting training . . .") categoricals = [] for col in cat_columns: try: categoricals.append(df_train.columns.tolist().index(col)) except ValueError: continue print(categoricals) print(df_train.columns.tolist()) # train gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_valid, early_stopping_rounds=15, categorical_feature=categoricals) # feature names print('Feature names:', gbm.feature_name()) # feature importances print('Feature importances:', list(gbm.feature_importance())) graph = lgb.create_tree_digraph(gbm) graph.view(cleanup=True) # predict #y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) # eval #print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
# 2020/05/30 End of analysis of data where the predicted and true values differ greatly # Predict pred_list.append(model.predict(test_x)) va_pred_list = np.array(va_pred_list) pred_list = np.array(pred_list) print(va_weight_list) # Create the submission file submission = pd.DataFrame({ 'Id': test_id, 'SalePrice': np.average(pred_list, axis=0, weights=va_weight_list) }) submission.to_csv('/kaggle/output/submission_ensemble.csv', index=False) printTime('Finished building the models') # - # #### Plot which features were important for the analysis lgb.plot_importance(model, figsize=(10, 30), max_num_features=100) # #### Visualize a decision tree used in the analysis lgb.create_tree_digraph(model)
os.environ["PATH"] += os.pathsep + 'C:\\Program Files\\Graphviz 2.44.1\\bin' import lightgbm as lgb from src.data.sk_data import Iris from src.utils import data_split import numpy as np feature, label = Iris.features, Iris.label feature = feature[label <= 1] label = label[label <= 1] train_feature, test_feature, train_label, test_label = data_split.split(feature, label) train_data = lgb.Dataset(data=train_feature, label=train_label) test_data = lgb.Dataset(data=test_feature, label=test_label) param = {'num_leaves': 31, 'num_trees': 100, 'objective': 'binary', 'num_class': 1} param['metric'] = 'multi_logloss' num_round = 10 bst = lgb.train(param, train_data, num_round, valid_sets=[test_data]) bst.save_model('model.txt') # A saved model can be loaded: bst = lgb.Booster(model_file='model.txt') ypred = bst.predict(test_feature, num_iteration=bst.best_iteration) print(np.array([1 if score > 0.5 else 0 for score in ypred])) print(test_label) for i in range(0, num_round): img = lgb.create_tree_digraph(bst, tree_index=i) with open('trees-{}.svg'.format(i), 'w') as f: f.write(img._repr_svg_())
'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'verbose': 0 } # number of leaves, will be used in feature transformation print('Start training...') # train gbm = lgb.train(params, lgb_train, num_boost_round=100, categorical_feature=categorical_cols, valid_sets=lgb_train) lgb.create_tree_digraph(gbm) print('Save model...') # save model to file gbm.save_model('model.txt') print('Start predicting...') # predict and get data on leaves, training data y_pred = gbm.predict(X_train, pred_leaf=True) print(np.array(y_pred).shape) print(y_pred[:10]) print('Writing transformed training data') transformed_training_matrix = np.zeros( [len(y_pred), len(y_pred[0]) * num_leaf], dtype=np.int64) # N * num_trees * num_leafs
def classifier_lgbm_general(self, X_DDTpd, X_eval, features): best_params_ = { 'objective': 'binary', 'num_leaves': 50, 'min_data_in_leaf': 10, 'max_depth': 10, 'max_bin': 50, 'learning_rate': 0.01, 'dart': False, 'reg_alpha': 0.1, 'reg_lambda': 0, 'n_estimators': 1000, 'bootstrap': True } scaler = StandardScaler().fit(X_DDTpd) X_DDTpd = scaler.transform(X_DDTpd) X_eval = scaler.transform(X_eval) #poly = PolynomialFeatures(2) #X_DDTpd = poly.fit_transform(X_DDTpd) #X_eval = poly.fit_transform #pca = PCA(n_components=50).fit(X_DDTpd) #print(pca.explained_variance_ratio_) #X_DDTpd = pca.transform(X_DDTpd) #X_eval = pca.transform(X_eval) final_model = lgb.LGBMClassifier(**best_params_, random_state=self.args.seed) #cv_score_best = cross_val_score(final_model, X_DDTpd, self.Y_train_proba, cv=5, verbose=6) #print(cv_score_best.mean(), cv_score_best.std()) final_model.fit(X_DDTpd, self.Y_train_proba) self.plot_feat_importance( final_model, features, self.path_save_model + "features_importances_LGBM_nbrefeat_" + str(len(features)) + ".png") y_pred = final_model.predict(X_eval) y_pred_df = pd.DataFrame(y_pred) y_pred_df.to_csv(self.path_save_model + "Y_pred_val.csv", index=False) #print(self.nn_model_ref.outputs_pred_val[:,0].shape, y_pred.shape) #print(self.nn_model_ref.outputs_pred_val[:,0], y_pred) if self.args.eval_nn_ref: same_output = self.nn_model_ref.outputs_pred_val[:, 0] == y_pred #print(same_output) p2 = 100 * np.sum(same_output) / len(same_output) print("Proportion of identical predictions: " + str(p2)) index_interext = np.logical_and(same_output, self.Y_eval_proba == y_pred) p22 = 100 * np.sum(index_interext) / len(index_interext) print("Proportion of identical predictions that also equal the label: " + str(p22)) else: p22 = None p2 = None same_output = None cm = confusion_matrix(y_pred=y_pred, y_true=self.Y_eval_proba, normalize="true") res = np.array([ accuracy_score(self.Y_eval_proba, y_pred), cm[0][0], cm[1][1], p2, p22 ]) print(res) np.save(self.path_save_model + "res_" + str(self.cpt) + ".npy", res) self.save_logs( self.path_save_model + "logs_lgbm_" + str(len(features)) + ".txt", y_pred, self.Y_eval_proba) lgb.create_tree_digraph(final_model).save( directory=self.path_save_model, filename="tree_LGBM_nbrefeat_" + str(len(features)) + ".dot") os.system("dot -Tpng " + self.path_save_model + "tree_LGBM_nbrefeat_" + str(len(features)) + ".dot > " + self.path_save_model + "tree_LGBM_nbrefeat_" + str(len(features)) + ".png") del X_DDTpd self.importances = final_model.feature_importances_ self.indices = np.argsort(self.importances)[::-1] with open( self.path_save_model + "features_impotances_order_nbrefeat_" + str(len(features)) + ".txt", "w") as file: file.write( str(np.array(features)[self.indices]) + str(self.importances[self.indices])) file.write("\n") if self.masks_infos_score is None: self.masks_infos_score = self.importances.copy() self.masks_infos_rank = np.array([ np.where(self.indices == x)[0][0] for x in range(len(self.importances)) ]) return final_model
lgb_train = lgb.Dataset(X, y) lightgbm = lgb.train(lgb_params, lgb_train) lgb.plot_importance(lightgbm) plt.title("Feature importances by LightGBM") plt.yticks(fontsize=14) plt.xticks(fontsize=14) plt.show() # ### Create Tree digraph using # `create_tree_digraph` # # In[ ]: lgb.create_tree_digraph(lightgbm) # - Contradiction # # 1. Gender should be important, also. # # - The important features from __whole dataset__ are total different than stack method refer to [Anisotropic](https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python) # 1. From [Faron's](https://www.kaggle.com/mmueller/allstate-claims-severity/stacking-starter/run/390867) script, he did k-fold training, and I just use entire dataset. # # # # ## New content coming Soon # # Acknowledgements # 1. [Oscar Takeshita](https://www.kaggle.com/pliptor) for pointing my XGB feature importance typo.
def create_booster_summary( booster: Union[lgb.Booster, lgb.sklearn.LGBMModel], log_importances: bool = True, max_num_features: int = 10, list_trees: list = None, log_trees_as_dataframe: bool = True, log_pickled_booster: bool = True, log_trees: bool = False, tree_figsize: int = 30, log_confusion_matrix: bool = False, y_true: np.ndarray = None, y_pred: np.ndarray = None, ): """Create model summary after training that can be assigned to the run namespace. See guide with examples in the `Neptune-LightGBM docs`_. You can log multiple types of metadata: - pickled model - feature importance chart - visualized trees - trees represented as DataFrame - confusion matrix (only for classification problems) See Args section for more info how to parametrize behaviour of this function. Note: You can log summary to the new run, or to the same run that you used for logging model training. Second option can be very useful because you have all the information in the single run. Args: booster (:obj:`lgb.Booster` or :obj:`lgb.sklearn.LGBMModel`): Trained LightGBM model. log_importances (bool): Defaults to True. Log feature importance charts. max_num_features (int): Defaults to 10. Max number of top features on the importance charts. Works only if ``log_importances`` is set to ``True``. If None or <1, all features will be displayed. See `lightgbm.plot_importance`_ for details. list_trees (list): Defaults to None. Indices of the target tree to visualize. Works only if ``log_trees`` is set to ``True``. See `lightgbm.plot_tree`_ for details. log_trees_as_dataframe (bool): Defaults to True. Parse the model and log trees in the easy-to-read pandas DataFrame format. Works only for ``lgb.Booster``. See `lightgbm.Booster.trees_to_dataframe`_ for details. log_pickled_booster (bool): Defaults to True. Log model as pickled file. log_trees (bool): Defaults to False. Log visualized trees. This requires graphviz to work. Learn about setup in the `Neptune-LightGBM installation`_ docs. tree_figsize (int): Defaults to 30, Control size of the visualized tree image. Increase this in case you work with large trees. Works only if ``log_trees`` is set to ``True``. log_confusion_matrix (bool): Defaults to False. Log confusion matrix. If set to True, you need to pass ``y_true`` and ``y_pred``. y_true (:obj:`np.ndarray`): Defaults to None. True labels on the test set. Needed only if ``log_confusion_matrix`` is set to True. y_pred (:obj:`np.ndarray`): Defaults to None. Predictions on the test set. Needed only if ``log_confusion_matrix`` is set to True. Returns: dict: Python dictionary with all metadata, that can be assigned to the run namespace. ``run["booster_summary"] = create_booster_summary(...)`` Examples: For more examples visit `example scripts`_. 
Full script that does logging during model training and logs booster summary after training:: import lightgbm as lgb import neptune.new as neptune import numpy as np from neptune.new.integrations.lightgbm import NeptuneCallback, create_booster_summary from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split # Create run run = neptune.init( project="common/lightgbm-integration", api_token="ANONYMOUS", name="train-cls", tags=["lgbm-integration", "train", "cls"] ) # Create neptune callback neptune_callback = NeptuneCallback(run=run) # Prepare data X, y = load_digits(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) # Define parameters params = { "boosting_type": "gbdt", "objective": "multiclass", "num_class": 10, "metric": ["multi_logloss", "multi_error"], "num_leaves": 21, "learning_rate": 0.05, "feature_fraction": 0.9, "bagging_fraction": 0.8, "bagging_freq": 5, "max_depth": 12, } # Train the model and log metadata to the run in Neptune gbm = lgb.train( params, lgb_train, num_boost_round=200, valid_sets=[lgb_train, lgb_eval], valid_names=["training", "validation"], callbacks=[neptune_callback], ) y_pred = np.argmax(gbm.predict(X_test), axis=1) # Log summary metadata to the same run under the "lgbm_summary" namespace run["lgbm_summary"] = create_booster_summary( booster=gbm, log_trees=True, list_trees=[0, 1, 2, 3, 4], log_confusion_matrix=True, y_pred=y_pred, y_true=y_test ) .. _Neptune-LightGBM docs: https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightgbm _lightgbm.plot_importance: https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.plot_importance.html#lightgbm-plot-importance _lightgbm.plot_tree: https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.plot_tree.html#lightgbm-plot-tree _lightgbm.Booster.trees_to_dataframe: https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html#lightgbm.Booster.trees_to_dataframe _Neptune-LightGBM installation: https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightgbm#install-requirements _example scripts: https://github.com/neptune-ai/examples/tree/main/integrations-and-supported-tools/lightgbm/scripts """ results_dict = {} visuals_path = "visualizations/" if log_importances: split_plot = lgb.plot_importance(booster, importance_type="split", title="Feature importance (split)", max_num_features=max_num_features) gain_plot = lgb.plot_importance(booster, importance_type="gain", title="Feature importance (gain)", max_num_features=max_num_features) results_dict["{}feature_importances/split".format(visuals_path)] \ = neptune.types.File.as_image(split_plot.figure) results_dict["{}feature_importances/gain".format(visuals_path)] \ = neptune.types.File.as_image(gain_plot.figure) if log_trees: try: subprocess.call(["dot", "-V"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) except OSError: log_trees = False message = "Graphviz executables not found, so trees will not be logged. 
" \ "Make sure the Graphviz executables are on your systems' PATH" warnings.warn(message) if log_trees: trees_series = [] for i in list_trees: digraph = lgb.create_tree_digraph(booster, tree_index=i, show_info="data_percentage") _, ax = plt.subplots(1, 1, figsize=(tree_figsize, tree_figsize)) s = BytesIO() s.write(digraph.pipe(format="png")) s.seek(0) ax.imshow(image.imread(s)) ax.axis("off") trees_series.append(neptune.types.File.as_image(ax.figure)) results_dict["{}trees".format( visuals_path)] = neptune.types.FileSeries(trees_series) if log_trees_as_dataframe: if isinstance(booster, lgb.Booster): df = booster.trees_to_dataframe() html_df = neptune.types.File.as_html(df) results_dict["trees_as_dataframe"] = html_df if not df.empty and not html_df.content: warnings.warn( "'trees_as_dataframe' wasn't logged. Probably generated dataframe was to large." ) else: warnings.warn( "'trees_as_dataframe' won't be logged." " `booster` must be instance of `lightgbm.Booster` class.") if log_pickled_booster: results_dict["pickled_model"] = neptune.types.File.as_pickle(booster) if log_confusion_matrix: ax = plot_confusion_matrix(y_true=y_true, y_pred=y_pred) results_dict[ f"{visuals_path}confusion_matrix"] = neptune.types.File.as_image( ax.figure) return results_dict
def plot_tree(thing, est_lightgbm=None, tree_index=0, view=True, **kwargs): """Plot the i'th predictor tree of an estimator, a grower's tree, or directly a predictor tree. Trees displayed from TreeGrower have additional information like sum of gradients, etc. tree_index corresponds to the ith built tree (only used when thing is an estimator). In a multiclass setting, the ith tree isn't necessarily the tree built durint the ith iteration because there are K trees per iteration. For example with 3 classes, tree_index=5 will print the third tree of the second iteration. Can also plot a LightGBM estimator (on the left) for comparison. kwargs are passed to graphviz.Digraph() Example: plotting.plot_tree(est_sklearn, est_lightgbm, view=False, filename='output') will silently save output to output.pdf """ def make_sklearn_tree(est): def add_predictor_node(node_idx, parent=None, decision=None): node = predictor_tree.nodes[node_idx] name = 'split__{}'.format(node_idx) label = 'split_feature_index: {}'.format( node['feature_idx']) label += r'\nthreshold: {:.3f}'.format(node['threshold']) label += r'\ngain: {:.3E}'.format(node['gain']) label += r'\nvalue: {:.3f}'.format(node['value']) label += r'\ncount: {:,}'.format(node['count']) graph.node(name, label=label) if not node['is_leaf']: add_predictor_node(node['left'], name, decision='<=') add_predictor_node(node['right'], name, decision='>') if parent is not None: graph.edge(parent, name, decision) def add_grower_node(node, parent=None, decision=None): name = 'split__{0}'.format(id(node)) si = node.split_info if si is None: feature_idx = 0 bin_idx = 0 gain = 0. sum_gradients = 0. sum_hessians = 0. else: feature_idx = si.feature_idx gain = 0. if si.gain is None else si.gain bin_idx = si.bin_idx sum_gradients = si.sum_gradient_left + si.sum_gradient_right sum_hessians = si.sum_hessian_left + si.sum_hessian_right value = 0. if node.value is None else node.value label = 'split_feature_index: {}'.format(feature_idx) label += r'\nbin threshold: {}'.format(bin_idx) label += r'\ngain: {:.3E}'.format(gain) label += r'\nvalue: {:.3f}'.format(value) label += r'\ncount: {:,}'.format(node.sample_indices.shape[0]) label += r'\nsum gradients: {:.3E}'.format(sum_gradients) label += r'\nsum hessians: {:.3E}'.format(sum_hessians) graph.node(name, label=label) if node.value is None: # not a leaf node add_grower_node(node.left_child, name, decision='<=') add_grower_node(node.right_child, name, decision='>') if parent is not None: graph.edge(parent, name, decision) if isinstance(thing, BaseHistGradientBoosting): est = thing # check_is_fitted(est) iteration = tree_index // est.n_trees_per_iteration_ k = tree_index % est.n_trees_per_iteration_ predictor_tree = est._predictors[iteration][k] add_predictor_node(0) elif isinstance(thing, TreePredictor): predictor_tree = thing add_predictor_node(0) elif isinstance(thing, TreeGrower): add_grower_node(thing.root) # make lightgbm tree if est_lightgbm is not None: import lightgbm as lb graph = lb.create_tree_digraph( est_lightgbm, tree_index=tree_index, show_info=['split_gain', 'internal_value', 'internal_count', 'leaf_count'], **kwargs) else: graph = Digraph(**kwargs) # make sklearn tree make_sklearn_tree(thing) graph.render(view=view)
"bagging_seed": 0, "boost_from_average": True, "metric": "mae", "verbosity": -1, } model = lgbm.train(params=params, train_set=training_data, num_boost_round=10**5, valid_sets=[training_data, val_data], early_stopping_rounds=200, verbose_eval=10**4) a = model.predict(submission, num_iterations=model.best_iteration) prediction = prediction + a.reshape(-1, 1) print("Feature Importance") axes = lgbm.plot_importance(model) plt.show() print("Another boosting tree which has to be rendered ") graph = lgbm.create_tree_digraph(model) graph.render(view=True) model.save_model('LGBM.txt') print("Done with saving and printing everything") prediction = prediction / n_fold prediction = prediction.reshape(-1) sample_submission[:, 1] = prediction submission = pandas.DataFrame(sample_submission, columns=['seg_id', 'time_to_failure']) submission.to_csv("Submission.csv", index=None, sep=",")
lgb_train, num_boost_round=100, valid_sets=[lgb_train, lgb_test], feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])], categorical_feature=[21], evals_result=evals_result, verbose_eval=10) print('Plotting metrics recorded during training...') ax = lgb.plot_metric(evals_result, metric='l1') plt.show() print('Plotting feature importances...') ax = lgb.plot_importance(gbm, max_num_features=10) plt.show() print('Plotting split value histogram...') ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto') plt.show() print('Plotting 54th tree...') # one tree use categorical feature to split ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=['split_gain']) plt.show() print('Plotting 54th tree with graphviz...') graph = lgb.create_tree_digraph(gbm, tree_index=53, name='Tree54') graph.render(view=True)
# s.write(str(jj) + '\t' + str(count_set[jj]) + '\t' + str(count_set[jj + 1]) + '\t' + str(auc) + '\n') AUC_set.append(auc) median_tpr = np.median(tprs, axis=0) mean_tpr = np.mean(tprs, axis=0) median_tpr[-1] = 1.0 mean_tpr[-1] = 1.0 per_tpr = np.percentile(tprs, [25, 50, 75], axis=0) median_auc = np.trapz(median_tpr, mean_fpr) mean_auc = np.trapz(mean_tpr, mean_fpr) plt.plot(mean_fpr, median_tpr, 'k', lw=3, label='median ROC') plt.title(f'{str(median_auc)}({str(mean_auc)})') plt.fill_between(mean_fpr, per_tpr[0, :], per_tpr[2, :], color='g', alpha=.2, label='Quartile') plt.legend(loc='lower right') plt.show() f = pd.DataFrame({ 'feature_name': x_train.columns, 'feature_importance': clf.feature_importance() }) f.sort_values(by='feature_importance', ascending=False).to_clipboard() lgb.create_tree_digraph(clf, tree_index=1) lgb.plot_tree(clf, tree_index=0, figsize=(100, 50)) plt.savefig('test.png') ###############