Example #1
def test_plot_tree(breast_cancer_split):
    X_train, _, y_train, _ = breast_cancer_split
    gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
    gbm.fit(X_train, y_train)

    with pytest.raises(IndexError):
        lgb.plot_tree(gbm, tree_index=83)

    ax = lgb.plot_tree(gbm, tree_index=3, figsize=(15, 8), show_info=['split_gain'])
    assert isinstance(ax, matplotlib.axes.Axes)
    w, h = ax.axes.get_figure().get_size_inches()
    assert int(w) == 15
    assert int(h) == 8
Example #2
def plot_model_information(bst, validation_metrics, my_own_metrics):
    print('Number of trees:', bst.num_trees())

    print('Plot model performance')
    ax = lgb.plot_metric(validation_metrics, metric='auc')
    plt.show()

    print('Plot feature importances...')
    ax = lgb.plot_importance(bst, max_num_features=15)
    plt.show()

    def plot_my_own_metrics(my_own_metrics):
        x = list(my_own_metrics.keys())
        y = list(my_own_metrics.values())
        plt.barh(x, y)

        for index, value in enumerate(y):
            plt.text(value, index, str(value))

    print('plot_my_own_metrics')
    plot_my_own_metrics(my_own_metrics)
    plt.show()

    tree_index = 0
    print('Plot ' + str(tree_index) +
          'th tree...')  # this tree uses a categorical feature to split
    ax = lgb.plot_tree(bst,
                       tree_index=tree_index,
                       figsize=(64, 36),
                       show_info=['split_gain'])
    plt.show()
Example #3
def get_model_tree_visual(model,
                          model_name="default",
                          tree_index=1,
                          outputpath="./"):
    '''Plot one tree of the model and save it as a PNG image.

    :param model: trained LightGBM model
    :param model_name: name prefix for the output file
    :param tree_index: index of the tree to plot
    :param outputpath: directory in which the image is saved
    :return: True on success, False on failure
    '''
    try:
        outputpath = outputpath + model_name + "_tree.png"
        ax = lgb.plot_tree(
            model,
            tree_index=tree_index,
            figsize=(20, 13),
        )
        plt.savefig(outputpath)
    except Exception:
        logger.error("create model tree failed.")
        return False
    else:
        logger.info("create model tree succeeded.")
        return True
Example #4
    def show(self):
        print("Feature importances:", list(self.pst.feature_importance()))

        for i in range(0, 1):
            ax = lgb.plot_tree(self.pst, tree_index=i)
            plt.show()

        ax = lgb.plot_importance(self.pst, importance_type="gain")
        plt.show()
Example #5
def test_plot_example():
    print('Loading data...')
    # load or create your dataset
    df_train = pd.read_csv(
        r'/Users/longguangbin/Work/Codes/MLlearn/src/reg_models/LightGBM/data/regression.train',
        header=None,
        sep='\t')
    df_test = pd.read_csv(
        r'/Users/longguangbin/Work/Codes/MLlearn/src/reg_models/LightGBM/data/regression.test',
        header=None,
        sep='\t')

    y_train = df_train[0]
    y_test = df_test[0]
    X_train = df_train.drop(0, axis=1)
    X_test = df_test.drop(0, axis=1)

    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)

    # specify your configurations as a dict
    params = {'num_leaves': 5, 'metric': ('l1', 'l2'), 'verbose': 0}

    evals_result = {}  # to record eval results for plotting

    print('Starting training...')
    # train
    gbm = lgb.train(
        params,
        lgb_train,
        num_boost_round=100,
        valid_sets=[lgb_train, lgb_test],
        feature_name=['f' + str(i + 1) for i in range(X_train.shape[-1])],
        categorical_feature=[21],
        evals_result=evals_result,
        verbose_eval=10)

    print('Plotting metrics recorded during training...')
    ax = lgb.plot_metric(evals_result, metric='l1')
    plt.show()

    print('Plotting feature importances...')
    ax = lgb.plot_importance(gbm, max_num_features=10)
    plt.show()

    print('Plotting 84th tree...')  # this tree uses a categorical feature to split
    ax = lgb.plot_tree(gbm,
                       tree_index=83,
                       figsize=(20, 8),
                       show_info=['split_gain'])
    plt.show()

    print('Plotting 84th tree with graphviz...')
    graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
    graph.render(view=True)
Example #6
def lgbm(model, featnames=None, num_trees=None, figsize=(25, 25), verbose=3):
    try:
        from lightgbm import plot_tree, plot_importance
    except ImportError:
        if verbose >= 1:
            raise ImportError(
                'lightgbm must be installed. Try to: <pip install lightgbm>')
        return None

    # Check model
    _check_model(model, 'lgb')
    # Set env
    _set_graphviz_path()

    if (num_trees is None) and hasattr(model, 'best_iteration_'):
        num_trees = model.best_iteration_
        if verbose >= 3:
            print('[treeplot] >Best detected tree: %.0d' % (num_trees))
    elif num_trees is None:
        num_trees = 0

    ax1 = None
    try:
        fig, ax1 = plt.subplots(1, 1, figsize=figsize)
        plot_tree(model, tree_index=num_trees, dpi=200, ax=ax1)
    except Exception:
        if _get_platform() != "windows":
            print(
                '[treeplot] >Install graphviz first: <sudo apt install python-pydot python-pydot-ng graphviz>'
            )

    # Plot importance
    ax2 = None
    try:
        fig, ax2 = plt.subplots(1, 1, figsize=figsize)
        plot_importance(model, max_num_features=50, ax=ax2)
    except Exception:
        print(
            '[treeplot] >Error: importance cannot be plotted. Booster.get_score() returned empty results. This may be caused by having all trees as decision stumps.'
        )

    return (ax1, ax2)
Example #7
    def test_plot_tree(self):
        gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
        gbm.fit(self.X_train, self.y_train, verbose=False)

        self.assertRaises(IndexError, lgb.plot_tree, gbm, tree_index=83)

        ax = lgb.plot_tree(gbm, tree_index=3, figsize=(15, 8), show_info=['split_gain'])
        self.assertIsInstance(ax, matplotlib.axes.Axes)
        w, h = ax.axes.get_figure().get_size_inches()
        self.assertEqual(int(w), 15)
        self.assertEqual(int(h), 8)
Example #8
 def lgb_plot(lgb_model, sav_path, is_decision_tree, class_names,
              x_columns):
     """
     :param lgb_model: 模型
     :param sav_path: 图片保存地址
     :param is_decision_tree: 是否决策树,1为决策树 2为随机森林
     :param class_names: list 类别名称,按排序
     :param x_columns: 特征名称
     :return: 无返回,请到图片保存路径查看
     """
     if not os.path.exists(sav_path):
         os.makedirs(sav_path)
     b = lgb_model.booster_.dump_model()
     tree_num = len(b['tree_info'])
     if not os.path.exists(sav_path):
         os.makedirs(sav_path)
     for i in range(tree_num):
         lgb.plot_tree(lgb_model,
                       tree_index=i,
                       figsize=(20, 8),
                       show_info=['split_gain'])
         plt.savefig(sav_path + 'lgb_tree' + str(i) + '.png', dpi=1000)
         plt.savefig(sav_path + 'lgb_pdf_tree' + str(i) + '.pdf', dpi=1000)
Example #9
def lgb_binary(X_train, y_train, X_test, y_test, params, num_rounds):
    # Convert dataset to lgb dataset
    d_train = lgb.Dataset(X_train, label=y_train)
    # Train Model
    clf = lgb.train(params, d_train, num_boost_round=num_rounds)
    # Predict
    yhat = clf.predict(X_test)
    # Convert Probabilities into binary variables
    y_hat = list(map(lambda x: 1 if x >= 0.5 else 0, yhat))
    # Get Confusion Matrix
    cm = confusion_matrix(y_test, y_hat)
    # Get Accuracy Score
    score = accuracy_score(y_test, y_hat)
    print(cm, '\n')
    print(score)
    # Plotting
    ax = lgb.plot_importance(clf, max_num_features=10)
    plt.show()
    ax = lgb.plot_tree(clf)
    plt.show()
    def train_light_gbm(self, dts):
        # create dataset for lightgbm
        lgb_train = lgb.Dataset(dts.trainX, dts.trainY)
        lgb_test = lgb.Dataset(dts.testX, dts.testY, reference=lgb_train)

        # specify your configurations as a dict
        params = {
            'num_leaves': 5,
            'metric': ('l1', 'l2'),
            'verbose': 0
        }

        evals_result = {}  # to record eval results for plotting

        print('Starting training...')
        # train
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=100,
                        valid_sets=[lgb_train, lgb_test],
                        feature_name=['close', 'open', 'high', 'low', 'volume'],
                        evals_result=evals_result,
                        verbose_eval=10)

        print('Plotting metrics recorded during training...')
        ax = lgb.plot_metric(evals_result, metric='l1')
        plt.show()

        print('Plotting feature importances...')
        ax = lgb.plot_importance(gbm, max_num_features=10)
        plt.show()

        print('Plotting 84th tree...')  # this tree uses a categorical feature to split
        ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
        plt.show()

        print('Plotting 84th tree with graphviz...')
        graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
        graph.render(view=True)
lightgbm.plot_importance(gbm)
plt.show()

lightgbm.plot_metric(evals_result,
                     metric='l1',
                     title='l1 Metric during training')
plt.show()

lightgbm.plot_metric(evals_result,
                     metric='l2',
                     title='l2 Metric during training')
plt.show()

lightgbm.plot_tree(gbm,
                   tree_index=1,
                   figsize=(50, 50),
                   show_info=['split_gain'])
plt.show()

# From the graph we can see that the l1 loss decreases roughly linearly, while l2 plateaus after about 110 iterations.
# After reaching its minimum, l2 starts increasing again around iteration 140.

# ### RMSE: Actual-Predicted


from sklearn.metrics import mean_squared_error

y_pred = gbm.predict(x_test)
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred)**0.5)
#idx_sort = np.argsort(f_imp)[::-1]
f_imp = rf.feature_importances_

pd.Series(f_imp, index=list(X)).sort_values().plot.bar()

pd.Series(prob_rf).unique()

fig = plt.figure(figsize=(12,6))
ax1 = fig.add_subplot(1,2, 1)
ax2 = fig.add_subplot(1,2, 2)
(prob_rf[:df_train.shape[0]]*df_train.S12).plot.hist(ax=ax1)
df_test.S12.plot.hist(ax=ax2)

submit(pred_test, 'sub_lb_20_03_11_19.csv')

lgb.plot_tree(models[0], figsize=(14,14))

target.apply(np.log1p).plot.hist()

ax = scale_predictions(prob_oof).plot()
scale_predictions(pred_test).plot(color='orange', ax=ax)

plot_tree(models[0])

#y.value_counts()
x.describe()

def model_by_col(train, test, target, col):
  train = train.reset_index(drop=True)
  target = target.reset_index(drop=True)
  unique_col = train[col].unique()
######## FEATURE IMPORTANCE ###########

# 29
# Column-wise importance. Default criterion: "split".
# "split":  result counts the number of times a feature is used in the model.
# "gain":   result contains the total information gain of the splits
#           which use the feature (a gain-based sketch follows below).
print('Plot feature importances...')
ax = lgb.plot_importance(bst_bayes, max_num_features=10)
ax.tick_params(labelsize=20)
plt.show()
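# Hedged sketch of the "gain" criterion described above (not part of the original
# script; assumes the same trained booster bst_bayes):
ax = lgb.plot_importance(bst_bayes, importance_type='gain', max_num_features=10)
ax.tick_params(labelsize=20)
plt.show()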

# 29.1 Does not work. Needs 'graphviz'
ax= lgb.plot_tree(bst_bayes,
                  tree_index=9,
                  figsize=(40, 20),
                  show_info=['split_gain'])

plt.show()
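# Hedged alternative sketch (not in the original script): create_tree_digraph has the
# same graphviz requirement as plot_tree, but writes the tree to a file instead of
# drawing it with matplotlib.
# graph = lgb.create_tree_digraph(bst_bayes, tree_index=9)
# graph.render(filename='tree9', view=False)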




#################### Bayesian-optimization-II Normal method ###################
# Ref: https://github.com/fmfn/BayesianOptimization

# 25. Create lightgbm dataset, a binary file
#     LightGBM binary file
#     Also saving Dataset into a LightGBM binary file will make loading faster:
d_train = lgb.Dataset(X_train, label=y_train) # transformed train data
d_test = lgb.Dataset(X_test, label = y_test)  # test data
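# Hedged sketch of the binary-file step described above (the file name is illustrative,
# not from the original script): Dataset.save_binary() writes the constructed Dataset
# to disk, and loading that file later is faster than rebuilding it from raw data.
d_train.save_binary('d_train.bin')
d_train_bin = lgb.Dataset('d_train.bin')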
Example #14

def evaluate(preds, labels, prefix):
    acc = compute_acc(preds, labels)
    print(f"{prefix} Accuracy: {acc}")
    rmse = compute_rmse(preds, labels)
    print(f"{prefix} RMSE: {rmse}")


train_fname = './data/agaricus.txt.train'
test_fname = './data/agaricus.txt.test'

# read in data
dtrain = lgb.Dataset(train_fname, free_raw_data=True)
dtest = lgb.Dataset(test_fname, free_raw_data=True)

param = {
    'max_depth': 2,
    'learning_rate': 1,
    'objective': 'binary',
    'metric': ['binary_logloss', 'binary_error', 'rmse']
}
num_round = 2
bst = lgb.train(param, dtrain, num_round, valid_sets=[dtest])

num_trees = bst.num_trees()
print(f'Number of trees: {num_trees}')
for i in range(num_trees):
    lgb.plot_tree(bst, tree_index=i)
plt.show()
Example #15
 def plot_trees(self):
     fig, ax = plt.subplots(1, 1, figsize=(20, 20))
     lgb.plot_tree(self.lgbm, ax=ax)
     plt.show()
Example #16
    'verbose': 0,
    'random_state': 33,
}
print('Start training...')
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=70,
                valid_sets=[lgb_train, lgb_eval, lgb_test],
                evals_result=evals_result,
                early_stopping_rounds=10)
print('Start predicting...')
y_pred = gbm.predict(test_feature2, num_iteration=gbm.best_iteration)

lgb.plot_importance(gbm, max_num_features=10)

lgb.plot_tree(gbm, tree_index=3, figsize=(100, 40), show_info=['split_gain'])

lgb.plot_metric(evals_result, metric='binary_logloss')

lgb.plot_metric(evals_result, metric='auc')

#calculate Normalized Cross Entropy
NE = (-1) / len(y_pred) * sum(((1 + y_test_reset) / 2 * np.log(y_pred) +
                               (1 - y_test_reset) / 2 * np.log(1 - y_pred)))
print("Normalized Cross Entropy " + str(NE))

# from sklearn.linear_model import LogisticRegression
# lm = LogisticRegression(penalty='l2',C=0.05) # logestic model construction
# lm.fit(x_train,y_train)  # fitting the data
# y_pred_test = lm.predict_proba(x_validation)   # Give the probabilty on each label
# y_pred_label = lm.predict(x_validation)
valid_pred = bst.predict(val_X)
valid_score = metrics.roc_auc_score(val_y, valid_pred)
print(f"Validation AUC score: {valid_score:.4f}")

import matplotlib.pyplot as plt

from lightgbm import plot_importance
from lightgbm import plot_split_value_histogram
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(bst, ax=ax)
fig, ax = plt.subplots(figsize=(10, 8))
plot_split_value_histogram(bst, 'Forecast', ax=ax)
plt.show()

ax = lgb.plot_tree(bst,
                   tree_index=3,
                   figsize=(200, 200),
                   show_info=['split_gain'])
"""
--------------------------------------------------------------------------
--------------------------------------------------------------------------
--------------------------------------------------------------------------
"""
# Fitting classifier to the Training set
# Create your classifier here

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='sag',
                                multi_class='multinomial',
                                random_state=0,
                                max_iter=100)
classifier.fit(train_X, train_y)
Example #18
lgb_train = lgb.Dataset(X_train, y_train)

# specify your configurations as a dict
params = {
    'boosting_type': 'rf',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 2000000,
    'max_depth': 1,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'num_iterations': 10
}

print('Starting training...')

#gbm = lgb.train(params, lgb_train, num_boost_round=20,)

model = lgb.LGBMClassifier(**params)
model.fit(X_train, y_train)

print('Plotting a tree...')  # this tree uses a categorical feature to split
ax = lgb.plot_tree(model,
                   tree_index=2,
                   figsize=(15, 15),
                   show_info=['split_gain'])
plt.show()
from sklearn.datasets import load_iris
from sklearn import tree
iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

tree.plot_tree(clf)





import lightgbm as lgb
from sklearn.datasets import load_iris

%matplotlib inline

X, y = load_iris(return_X_y=True)
clf = lgb.LGBMClassifier()
clf.fit(X, y)
lgb.plot_tree(clf)
Example #20
    early_stopping_rounds=100,
    verbose_eval=20,
    evals_result=evals_result,
)

ax = lgb.plot_metric(evals_result, metric='l1')  # the metric must match one of the metrics set earlier in params
plt.show()

print('Plot feature importance ranking...')
ax = lgb.plot_importance(
    gbm, max_num_features=30)  # max_num_features limits how many top features are shown; adjust as needed
plt.show()

print('Plot the tree at index 3...')  # plot one of the decision trees (tree_index=3)
ax = lgb.plot_tree(gbm,
                   tree_index=3,
                   figsize=(20, 8),
                   show_info=['split_gain'])
plt.show()

# print('Export the decision tree as a PDF image')  # requires the graphviz application and the Python graphviz package
# graph = lgb.create_tree_digraph(gbm, tree_index=3, name='Tree3')
# graph.render(view=True)

y_valid_pred = gbm.predict(X_valid)
# y_valid_pred = np.expm1(y_valid_pred)
# y_valid = np.expm1((y_valid))
mae = mean_absolute_error(y_valid, y_valid_pred)

print('valid mae: ', mae)

y_pred = gbm.predict(X_test)
Example #21
 def plot_tree(self):
     ''' Plots a tree to a new MPL figure'''
     lightgbm.plot_tree(self.ml_model,
                        tree_index=0,
                        show_info=["split_gain"])
Example #22
early_stop_rounds = 10
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}
results = {}
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=boost_round,
                valid_sets=(lgb_eval, lgb_train),
                valid_names=('validate', 'train'),
                early_stopping_rounds=early_stop_rounds,
                evals_result=results)
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

lgb.plot_metric(results)
plt.show()

lgb.plot_importance(gbm, importance_type='split')
plt.show()

lgb.plot_tree(gbm, tree_index=0)
plt.show()
Example #23
# Load data
print('Load data...')

X_train, X_test, y_train, y_test = train_test_split(x, l, test_size=0.2)


print('Start training...')
# Create and train the model
lgbm = lgb.LGBMClassifier()
lgbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='l1', early_stopping_rounds=5)
lgb.create_tree_digraph(lgbm, tree_index=1)
import matplotlib.pyplot as plt
import matplotlib
fig2 = plt.figure(figsize=(20, 20))
ax = fig2.subplots()
lgb.plot_tree(lgbm._Booster, tree_index=1, ax=ax)
plt.show()   

print('Start predicting...')
# Predict on the test set
y_pred = lgbm.predict(X_test, num_iteration=lgbm.best_iteration_)

# feature importances
print('Feature importances:', list(lgbm.feature_importances_))

recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# Model evaluation
print('The accuracy is:', np.mean(y_pred == y_test))
print('The recall is:', recall)
Example #24
def xgboost(model,
            featnames=None,
            num_trees=None,
            plottype='horizontal',
            figsize=(25, 25),
            verbose=3):
    """Plot tree based on a xgboost.

    Parameters
    ----------
    model : model
        xgboost model.
    featnames : list, optional
        list of feature names. The default is None.
    num_trees : int, default None
        The best performing tree is chosen. Specify any other ordinal number for another target tree.
    plottype : str, optional
        Make 'horizontal' or 'vertical' plot. The default is 'horizontal'.
    figsize: tuple, default (25,25)
        Figure size, (width, height)
    verbose : int, optional
        Print progress to screen. The default is 3.
        0: NONE, 1: ERROR, 2: WARNING, 3: INFO (default), 4: DEBUG, 5: TRACE

    Returns
    -------
    ax : Figure axis
        Figure axis of the input model.

    """
    try:
        from xgboost import plot_tree, plot_importance
    except ImportError:
        if verbose >= 1:
            raise ImportError(
                'xgboost must be installed. Try to: <pip install xgboost>')

    _check_model(model, 'xgb')
    # Set env
    _set_graphviz_path()

    if plottype == 'horizontal': plottype = 'UD'
    if plottype == 'vertical': plottype = 'LR'
    if (num_trees is None) and hasattr(model, 'best_iteration'):
        num_trees = model.best_iteration
        if verbose >= 3:
            print('[treeplot] >Best detected tree: %.0d' % (num_trees))
    elif num_trees is None:
        num_trees = 0

    ax1 = None
    try:
        fig, ax1 = plt.subplots(1, 1, figsize=figsize)
        plot_tree(model, num_trees=num_trees, rankdir=plottype, ax=ax1)
    except Exception:
        if _get_platform() != "windows":
            print(
                '[treeplot] >Install graphviz first: <sudo apt install python-pydot python-pydot-ng graphviz>'
            )

    # Plot importance
    ax2 = None
    try:
        fig, ax2 = plt.subplots(1, 1, figsize=figsize)
        plot_importance(model, max_num_features=50, ax=ax2)
    except Exception:
        print(
            '[treeplot] >Error: importance cannot be plotted. Booster.get_score() returned empty results. This may be caused by having all trees as decision stumps.'
        )

    return (ax1, ax2)
Example #25
    bst = lgb.train(core_params, lgb_train, num_round, valid_sets=[lgb_valid])
    ypred = bst.predict(X_test, num_iteration=bst.best_iteration)
    mapes[airline] = mean_absolute_percentage_error(y_test, ypred)

core_params = {
    'boosting_type':
    'gbdt',  # GBM type: gradient boosted decision tree, rf (random forest), dart, goss.
    'objective':
    'regression',  # the optimization object: binary, regression, multiclass, xentropy.
    'learning_rate':
    0.01,  # the gradient descent learning or shrinkage rate, controls the step size.
    'num_leaves': 5,  # the number of leaves in one tree.
    'nthread':
    4,  # number of threads to use for LightGBM, best set to number of actual cores.
    'metric':
    'mape'  # an additional metric to calculate during validation: mean absolute percentage error (mape).
}

num_round = 1000


def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 1


print(mean_absolute_percentage_error(y_test, ypred))
import graphviz
bst.save_model('model.txt')
lgb.plot_tree(bst, figsize=(20, 20))
Example #26
}

evals_result = {}  # to record eval results for plotting

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                feature_name=['f' + str(i + 1) for i in range(28)],
                categorical_feature=[21],
                evals_result=evals_result,
                verbose_eval=10)

print('Plot metrics during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plot feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plot 84th tree...')  # this tree uses a categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
plt.show()

print('Plot 84th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
graph.render(view=True)
Example #27
        X_train, y_train = load_svmlight_file(os.path.join(rank_train_dir, "rank.train.txt"))
        X_test, y_test = load_svmlight_file(os.path.join(rank_train_dir, "rank.test.txt"))
        qgsize_train = np.loadtxt(os.path.join(rank_train_dir, "rank.train.qgsize.txt"))
        qgsize_test = np.loadtxt(os.path.join(rank_train_dir, "rank.test.qgsize.txt")).reshape(-1)
        model = lightgbm.LGBMRanker(boosting_type='gbdt', num_leaves=4,
                                    max_depth=-1, learning_rate=0.1, n_estimators=100,
                                    min_child_samples=5)
        feature_names = ['Aux lang TTR', 'Overlap word-level', 'Overlap subword-level',
                         'Aux lang dataset size', 'TTR difference ratio', 'Dataset size ratio',
                         'Task lang dataset size', 'GENETIC', 'SYNTACTIC', 'FEATURAL',
                         'PHONOLOGICAL', 'INVENTORY', 'GEOGRAPHIC']
        gbm = model.fit(X_train, y_train, group=qgsize_train,
                  eval_set=[(X_test, y_test)], eval_group=[qgsize_test], eval_at=3,
                  early_stopping_rounds=40, eval_metric="ndcg", verbose=False,feature_name = feature_names)
        print(test_lang_set[0])
        if test_lang_set[0]=='glg':
            model.booster_.save_model('./model_glg_leaves4.txt')
            ax = lightgbm.plot_tree(model.booster_, tree_index=15, figsize=(100, 40), precision = 2,show_info=['split_gain'])
            ax = lightgbm.plot_importance(gbm, max_num_features=10,figsize=(100, 40))
            
            #plt.savefig("./glg_feature_importance.png")
            plt.savefig('./glg_tree15_leaves4.png')
            plt.show()
        #ax = lightgbm.plot_importance(gbm, max_num_features=10)
        #plt.show()
        print("================================")
        print("Features:", data[0, 5:])
        print("Feature importance:", model.feature_importances_)

        #print("Best test NDCG@1 during training =", model.best_score_['valid_0']['ndcg@1'])
        #print("Best test NDCG@2 during training =", model.best_score_['valid_0']['ndcg@2'])
        print("Best test NDCG@3 during training =", model.best_score_['valid_0']['ndcg@3'])
        #print("Best test NDC@10 during training =", model.best_score_['valid_0']['ndcg@10'])
Example #28
#LightGBM model
split = int(len(X_train) * 0.8)
lgbm_train_set = lgbm.Dataset(X_train[:split], y_train[:split])
lgbm_valid_set = lgbm.Dataset(X_train[split:], y_train[split:])
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'n_estimators': 20000,
    'metric': 'mse',
    'num_leaves': 30,
    'learning_rate': 0.002,
    'early_stopping_rounds': 200
}
model = lgbm.train(lgbm_params,
                   lgbm_train_set,
                   2,
                   verbose_eval=100,
                   valid_sets=[lgbm_train_set, lgbm_valid_set])
predict = model.predict(X_test, num_iteration=model.best_iteration)

#plot the decision tree
plt.figure(figsize=(100, 50))
lgbm.plot_tree(model, tree_index=1)
plt.savefig("lgbm_tree_demonstration.png")

#save the prediction
caseid = [i for i in range(143, 1001)]
midprice = np.array(predict) + np.array(m_test)
submit = pd.DataFrame({'caseid': caseid, 'midprice': midprice})
submit.to_csv('lgbm_final.csv', index=False)
Example #29
    'num_leaves': 10,
    'verbose': 0
}
l_progress = dict()
l_model = lightgbm.train(l_params,
                         l_train,
                         valid_sets=[l_train, l_test],
                         num_boost_round=2000, 
                         evals_result=l_progress,
                         verbose_eval=10,
                         feature_name=features)
plt.rcParams['figure.figsize'] = [10, 7]
lightgbm.plot_metric(l_progress)

png = plt.gcf()
lightgbm.plot_tree(l_model,tree_index=1,figsize=(60,60),show_info=['split_gain'])
png.savefig(folder+"tree.png",dpi=100)
plt.show()

lightgbm.plot_importance(l_model)
# building trees using XGBoost
import xgboost

g_train = xgboost.DMatrix(x_train,y_train)
g_test = xgboost.DMatrix(x_test,y_test) 

g_params = {
    "objective":"binary:logistic",
    'colsample_bytree': 0.3,
    'learning_rate': 0.05,
    #'tree_method': 'hist',
Example #30
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])],
                categorical_feature=[21],
                evals_result=evals_result,
                verbose_eval=10)

print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto')
plt.show()

print('Plotting 54th tree...')  # this tree uses a categorical feature to split
ax = lgb.plot_tree(gbm,
                   tree_index=53,
                   figsize=(15, 15),
                   show_info=['split_gain'])
plt.show()

print('Plotting 54th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=53, name='Tree54')
graph.render(view=True)
Example #31
    gcv.fit(input_data, winner_data)

    print(gcv.best_params_)
    print(gcv.best_score_)
    print(gcv.cv_results_)

    with open('gcv', 'bw') as out_f:
        import pickle
        pickle.dump(gcv, out_f)

    lgb.plot_importance(gcv.best_estimator_,
                        figsize=(20, 35),
                        importance_type='gain',
                        max_num_features=100)
    plt.savefig("importance.svg")
    lgb.plot_tree(gcv.best_estimator_, figsize=(80, 80))
    plt.savefig("tree.svg")

else:

    train_data = lgb.Dataset('train.bin')
    validation_data = lgb.Dataset('val.bin', reference=train_data)

    # Load num_categories if necessary.
    with open('num_categories.jsonl') as in_f:
        num_categories = json.loads(in_f.read())

    param = {}
    param['max_bin_by_feature'] = num_categories
    param['num_leaves'] = 63
    param['objective'] = 'binary'