示例#1
0
    def test_plot_importance(self):
        """Check the Axes produced by ``lgb.plot_importance``.

        Three calls are exercised: default arguments on a trained Booster,
        custom title/labels with a single colour on a fitted
        ``LGBMClassifier``, and a colour list with title and labels disabled.
        """
        red = (1., 0, 0, 1.)

        # Case 1: Booster, default title and axis labels.
        booster = lgb.train(self.params, self.train_data, num_boost_round=10)
        default_ax = lgb.plot_importance(booster)
        self.assertIsInstance(default_ax, matplotlib.axes.Axes)
        for getter, expected in ((default_ax.get_title, 'Feature importance'),
                                 (default_ax.get_xlabel, 'Feature importance'),
                                 (default_ax.get_ylabel, 'Features')):
            self.assertEqual(getter(), expected)
        self.assertLessEqual(len(default_ax.patches), 30)

        # Case 2: sklearn wrapper, custom texts, every bar drawn in red.
        classifier = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
        classifier.fit(self.X_train, self.y_train)

        custom_ax = lgb.plot_importance(classifier, color='r', title='t',
                                        xlabel='x', ylabel='y')
        self.assertIsInstance(custom_ax, matplotlib.axes.Axes)
        for getter, expected in ((custom_ax.get_title, 't'),
                                 (custom_ax.get_xlabel, 'x'),
                                 (custom_ax.get_ylabel, 'y')):
            self.assertEqual(getter(), expected)
        self.assertLessEqual(len(custom_ax.patches), 30)
        for bar in custom_ax.patches:
            self.assertTupleEqual(bar.get_facecolor(), red)  # red

        # Case 3: colour list, with title and labels switched off via None.
        bare_ax = lgb.plot_importance(booster, color=['r', 'y', 'g', 'b'],
                                      title=None, xlabel=None, ylabel=None)
        self.assertIsInstance(bare_ax, matplotlib.axes.Axes)
        for getter in (bare_ax.get_title, bare_ax.get_xlabel, bare_ax.get_ylabel):
            self.assertEqual(getter(), '')
        self.assertLessEqual(len(bare_ax.patches), 30)
        expected_rgba = (
            red,                 # r
            (.75, .75, 0, 1.),   # y
            (0, .5, 0, 1.),      # g
            (0, 0, 1., 1.),      # b
        )
        for position, rgba in enumerate(expected_rgba):
            self.assertTupleEqual(bare_ax.patches[position].get_facecolor(), rgba)
示例#2
0
    def test_plot_importance(self):
        """Check the Axes produced by ``lgb.plot_importance``.

        Trains on a 90% split of the breast-cancer dataset, then verifies
        title, axis labels, bar count and bar colours for three calls:
        defaults on a Booster, custom texts on an ``LGBMClassifier``, and a
        colour list with title and labels disabled.
        """
        # return_X_y is passed by keyword: recent scikit-learn releases reject
        # it as a positional argument.
        X_train, _, y_train, _ = train_test_split(*load_breast_cancer(return_X_y=True),
                                                  test_size=0.1, random_state=1)
        train_data = lgb.Dataset(X_train, y_train)

        params = {
            "objective": "binary",
            "verbose": -1,
            "num_leaves": 3
        }

        # Case 1: Booster with default title and axis labels.
        gbm0 = lgb.train(params, train_data, num_boost_round=10)
        ax0 = lgb.plot_importance(gbm0)
        self.assertIsInstance(ax0, matplotlib.axes.Axes)
        self.assertEqual(ax0.get_title(), 'Feature importance')
        self.assertEqual(ax0.get_xlabel(), 'Feature importance')
        self.assertEqual(ax0.get_ylabel(), 'Features')
        self.assertLessEqual(len(ax0.patches), 30)

        # Case 2: sklearn wrapper with custom texts; every bar must be red.
        gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
        gbm1.fit(X_train, y_train)

        ax1 = lgb.plot_importance(gbm1, color='r', title='t', xlabel='x', ylabel='y')
        self.assertIsInstance(ax1, matplotlib.axes.Axes)
        self.assertEqual(ax1.get_title(), 't')
        self.assertEqual(ax1.get_xlabel(), 'x')
        self.assertEqual(ax1.get_ylabel(), 'y')
        self.assertLessEqual(len(ax1.patches), 30)
        for patch in ax1.patches:
            self.assertTupleEqual(patch.get_facecolor(), (1., 0, 0, 1.))  # red

        # Case 3: colour list; None disables the title and both axis labels.
        ax2 = lgb.plot_importance(gbm0, color=['r', 'y', 'g', 'b'],
                                  title=None, xlabel=None, ylabel=None)
        self.assertIsInstance(ax2, matplotlib.axes.Axes)
        self.assertEqual(ax2.get_title(), '')
        self.assertEqual(ax2.get_xlabel(), '')
        self.assertEqual(ax2.get_ylabel(), '')
        self.assertLessEqual(len(ax2.patches), 30)
        self.assertTupleEqual(ax2.patches[0].get_facecolor(), (1., 0, 0, 1.))  # r
        self.assertTupleEqual(ax2.patches[1].get_facecolor(), (.75, .75, 0, 1.))  # y
        self.assertTupleEqual(ax2.patches[2].get_facecolor(), (0, .5, 0, 1.))  # g
        self.assertTupleEqual(ax2.patches[3].get_facecolor(), (0, 0, 1., 1.))  # b
def GBDT_test(data,fold_n,num_rounds = 100000,bf=1,ff=1):
    """Fit a boosted-tree model on ``data`` and report a score.

    The backend is LiteMORT when the module-level flag ``isMORT`` is truthy,
    otherwise LightGBM's scikit-learn wrappers. On the LightGBM path a
    feature-importance chart is saved under ``./results/`` (the file name
    uses the module-level ``dataset``) and a per-feature importance table is
    built.

    Args:
        data: Object exposing ``X_train``/``y_train``, ``X_valid``/``y_valid``,
            ``X_test``/``y_test`` and a ``problem()`` method; the binary
            objective is used when ``problem()`` returns "classification",
            the regression objective otherwise.
        fold_n: Fold label written into the ``fold`` column of the table.
        num_rounds: Passed to the model as ``n_estimators``.
        bf: Passed to the model as ``bagging_fraction``.
        ff: Passed to the model as ``feature_fraction``.

    Returns:
        Tuple ``(acc_, fold_importance)``. ``acc_`` is the mean squared
        difference between ``data.y_test`` and the model's predictions when
        ``data.X_test`` is not None; otherwise it is the best ``valid_1``
        metric on the LightGBM path and None on the LiteMORT path.
        ``fold_importance`` is a DataFrame on the LightGBM path and None on
        the LiteMORT path.
    """
    model_type = "mort" if isMORT else "lgb"
    nFeatures = data.X_train.shape[1]
    early_stop = 100
    verbose_eval = 20
    is_classification = data.problem() == "classification"

    if is_classification:
        objective, metric = "binary", "auc"
    else:
        objective, metric = "regression", "l2"
    params = {"objective": objective, "metric": metric, 'n_estimators': num_rounds,
              "bagging_fraction": bf, "feature_fraction": ff, 'verbose_eval': verbose_eval,
              "early_stopping_rounds": early_stop, 'n_jobs': -1,
              }
    print(f"====== GBDT_test\tparams={params}")

    X_train, y_train = data.X_train, data.y_train
    X_valid, y_valid = data.X_valid, data.y_valid
    if not np.isfortran(X_train):   # Very important: LiteMORT needs COLUMN-MAJOR arrays
        X_train = np.asfortranarray(X_train)
        X_valid = np.asfortranarray(X_valid)
    print(f"GBDT_test\ttrain={X_train.shape} valid={X_valid.shape}")

    # Defaults so the return statement is well-defined on every path; the
    # LiteMORT branch produces neither a validation score nor an importance table.
    acc_ = None
    fold_importance = None

    if model_type == 'mort':
        params['verbose'] = 667
        model = LiteMORT(params).fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

    if model_type == 'lgb':
        if is_classification:
            model = lgb.LGBMClassifier(**params)
        else:
            model = lgb.LGBMRegressor(**params)
        model.fit(X_train, y_train,eval_set=[(X_train, y_train), (X_valid, y_valid)],verbose=min(num_rounds//10,1000))

        lgb.plot_importance(model, max_num_features=32)
        # NOTE(review): the title text looks like a typo for "Feature importances".
        plt.title("Featurertances")
        plt.savefig(f"./results/{dataset}_feat_importance_.jpg")
        plt.close()

        fold_importance = pd.DataFrame()
        fold_importance["importance"] = model.feature_importances_
        fold_importance["feature"] = list(range(nFeatures))
        fold_importance["fold"] = fold_n
        print('best_score', model.best_score_)
        # 'valid_1' is the second eval_set entry, i.e. (X_valid, y_valid).
        acc_ = model.best_score_['valid_1'][metric]

    if data.X_test is not None:
        # The test score replaces the validation score when a test set exists.
        pred_val = model.predict(data.X_test)
        acc_ = ((data.y_test - pred_val) ** 2).mean()
        print(f'====== Best step: test={data.X_test.shape} ACCU@Test={acc_:.5f}')
    return acc_,fold_importance
示例#4
0
# Build the submission: one row per prediction, keyed by visitor id.
sub_df = pd.DataFrame({"fullVisitorId":test_id})
# Clamp negative predictions to zero (modifies pred_test in place).
pred_test[pred_test<0] = 0
# expm1 here and log1p below: predictions are summed per visitor on the
# expm1 scale and then mapped back with log1p.
sub_df["PredictedLogRevenue"] = np.expm1(pred_test)
sub_df = sub_df.groupby("fullVisitorId")["PredictedLogRevenue"].sum().reset_index()
sub_df.columns = ["fullVisitorId", "PredictedLogRevenue"]
sub_df["PredictedLogRevenue"] = np.log1p(sub_df["PredictedLogRevenue"])
sub_df.to_csv("baseline_lgb.csv", index=False)

# In[ ]:


sub_df.head()

# **Feature Importance:**
#
# Now let us have a look at the important features of the LightGBM model.

# In[ ]:


# Draw the 50 most important features on a tall figure.
fig, ax = plt.subplots(figsize=(12,18))
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
ax.grid(False)
plt.title("LightGBM - Feature Importance", fontsize=15)
plt.show()

# "totals.pageviews" turns out to be the most important feature, followed by "totals.hits" and "visitStartTime".

# **More to come. Stay tuned!**
示例#5
0
    # NOTE(review): these indented lines are the tail of a loop whose header
    # is not visible here; tr_idx / val_idx presumably come from a CV splitter.
    train_y, test_y = y[tr_idx], y[val_idx]

    # Wrap the fold's data in lgb.Dataset objects for training
    lgb_train = lgb.Dataset(train_x, train_y)
    lgb_valid = lgb.Dataset(test_x, test_y, reference=lgb_train)

    # Training
    model = lgb.train(light_params,
                      lgb_train,
                      num_boost_round=3000,
                      early_stopping_rounds=50,
                      valid_sets=[lgb_train, lgb_valid],
                      verbose_eval=50)

    # Predictions for the real test frame; reassigned on every pass through
    # this body, so only the last assignment is used below.
    test_pred = model.predict(test_df)

    # Score the model on this fold's held-out rows.
    oof = model.predict(test_x)
    rmse = np.sqrt(mean_squared_error(test_y, oof))
    print(f"RMSE : {rmse}")

# Prints the rmse left over from the most recent assignment above.
print(rmse)

# Gain-based importance for the most recently trained model.
lgb.plot_importance(model,
                    importance_type="gain",
                    max_num_features=40,
                    figsize=(12, 12))  # max_num_features=20,

# Drop the first column of the sample submission, attach the rounded
# predictions and write the result.
sub = pd.read_csv("submission.csv").iloc[:, 1:]
sub["units_sold_month"] = test_pred.round(3)
sub.to_csv("baseline.csv", index=False)
示例#6
0
def make_predictions_gkf(train_df,
                         test_df,
                         feature_cols,
                         target,
                         param,
                         NFOLDS=2):
    """Train LightGBM with GroupKFold and return averaged test predictions.

    Folds are grouped by the ``DT_M`` column of ``train_df``. For each fold a
    booster is trained, its out-of-fold predictions are stored, and its test
    predictions are added to a running average. Log-loss and AUC are printed
    per fold and for the complete out-of-fold vector.

    Relies on the module-level ``cate_cols`` for the categorical features.

    Args:
        train_df: Training frame containing ``feature_cols``, ``isFraud`` and
            ``DT_M``.
        test_df: Test frame containing ``feature_cols`` and
            ``TransactionID``. An ``isFraud`` column is written into it.
        feature_cols: Names of the feature columns.
        target: Currently unused; the label column is hard-coded to
            ``isFraud``.
        param: LightGBM parameter dict passed to ``lgb.train``.
        NFOLDS: Number of GroupKFold splits.

    Returns:
        ``test_df[['TransactionID', 'isFraud']]`` with the averaged
        predictions in ``isFraud``.
    """
    gkf = GroupKFold(n_splits=NFOLDS)

    train_values = train_df[feature_cols]
    test_values = test_df[feature_cols]
    labels = train_df['isFraud']
    split_groups = train_df['DT_M']

    # Size the buffers from the frames actually passed in rather than from
    # module-level counters, so they always match the split indices.
    test_pred_prob = np.zeros(len(test_df))
    oof_pred_prob = np.zeros(len(train_df))

    for i, (train_idx, valid_idx) in enumerate(
            gkf.split(train_values, labels, groups=split_groups)):
        print(i, 'fold...')
        start_time = time.time()

        # The splitter yields positional indices, so labels are selected with
        # .iloc just like the features; plain [] would look the integers up as
        # index labels and break on a non-default index.
        train_x, train_y = train_values.iloc[train_idx], labels.iloc[train_idx]
        valid_x, valid_y = train_values.iloc[valid_idx], labels.iloc[valid_idx]

        # Construct the dataset
        train_data = lgb.Dataset(train_x,
                                 label=train_y,
                                 categorical_feature=cate_cols,
                                 free_raw_data=True)
        valid_data = lgb.Dataset(valid_x,
                                 label=valid_y,
                                 categorical_feature=cate_cols,
                                 reference=train_data,
                                 free_raw_data=True)

        # Training
        bst = lgb.train(param,
                        train_data,
                        valid_sets=[train_data, valid_data],
                        verbose_eval=200)

        # Prediction
        valid_pred_prob = bst.predict(valid_x,
                                      num_iteration=bst.best_iteration)
        oof_pred_prob[valid_idx] = valid_pred_prob
        print('val logloss: ', log_loss(valid_y, valid_pred_prob))
        print('val auc: ', roc_auc_score(valid_y, valid_pred_prob))

        # Each fold contributes an equal share to the averaged test prediction.
        test_pred_prob += bst.predict(
            test_values, num_iteration=bst.best_iteration) / gkf.n_splits

        print('runtime: {}\n'.format(time.time() - start_time))

        # Plotting
        lgb.plot_importance(bst, max_num_features=30)

    print('oof logloss: ', log_loss(labels, oof_pred_prob))
    print('oof auc: ', roc_auc_score(labels, oof_pred_prob))

    test_df['isFraud'] = test_pred_prob
    return test_df[['TransactionID', 'isFraud']]
示例#7
0
文件: run.py 项目: furu8/blogress
def plot_lgb_importance(lgbm):
    """Show a feature-importance bar chart for ``lgbm.model``."""
    booster = lgbm.model
    lgb.plot_importance(booster, figsize=(4, 8), height=0.5)
    plt.show()
示例#8
0
    'scale_pos_weight': 2,
    'drop_rate': 0.02
}

# 5-fold cross-validation with early stopping, tracking AUC.
cv_results = lgbm.cv(train_set=lgbm_train,
                     params=lgbm_params,
                     nfold=5,
                     num_boost_round=600,
                     early_stopping_rounds=50,
                     verbose_eval=50,
                     metrics=['auc'])

# cv_results['auc-mean'] holds one entry per boosting round, so the zero-based
# argmax index is shifted by one to obtain a round count. Without the shift the
# final model trains one round short (zero rounds when round 1 is best).
optimum_boost_rounds = int(np.argmax(cv_results['auc-mean'])) + 1
print('Optimum boost rounds = {}'.format(optimum_boost_rounds))
print('Best CV result = {}'.format(np.max(cv_results['auc-mean'])))

# Refit on the full training set with the selected number of rounds.
clf = lgbm.train(train_set=lgbm_train,
                 params=lgbm_params,
                 num_boost_round=optimum_boost_rounds)

# Predict on test set and create submission
y_pred = clf.predict(fin_test)
out_df = pd.DataFrame({'SK_ID_CURR': test['SK_ID_CURR'], 'TARGET': y_pred})
out_df.to_csv('submission_lgbm.csv', index=False)

# Side-by-side importance charts: split counts on the left, gain on the right.
fig, (ax, ax1) = plt.subplots(1, 2, figsize=[11, 7])
lgbm.plot_importance(clf, ax=ax, max_num_features=20, importance_type='split')
lgbm.plot_importance(clf, ax=ax1, max_num_features=20, importance_type='gain')
ax.set_title('Importance by splits')
ax1.set_title('Importance by gain')
plt.tight_layout()
plt.savefig('feature_importance.png')
示例#9
0
    # NOTE(review): these indented lines are the tail of a per-fold loop whose
    # header is not visible here.
    # Clamp negative predictions to zero before computing the RMSLE.
    y_pred[y_pred < 0] = 0
    y_true = _valid_df["likes"].values
    rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
    rmsles.append(rmsle)
    mlflow.log_metric(f"rmsle_{fold}", rmsle)
    print(f"------------------------ fold {fold} -----------------------")
    print(f"------------------- rmsle {rmsle} -----------------------")
    print()

# Report and log the mean of the collected per-fold scores.
print("")
print(
    f"------------------- average rmsle {np.mean(rmsles)} -----------------------"
)
mlflow.log_metric(f"rmsle_avg", np.mean(rmsles))
# The importance chart is only drawn when LightGBM is among the selected models.
if "lgbm" in models:
    lgb.plot_importance(lgb_model, figsize=(16, 16))
    plt.show()
mlflow.end_run()

# %%
# raw.train, raw.test = target_encoding(raw.train, raw.test)
# Build CatBoost and LightGBM datasets from the full training frame with the
# "likes_log" column as target.
cat_train_dataset = Pool(raw.train[features],
                         raw.train["likes_log"],
                         cat_features=cat_features)
lgb_train_dataset = lgb.Dataset(raw.train[features], raw.train["likes_log"])
cat_model = CatBoostRegressor(**Config.cat_params, iterations=2000)
# The training pool doubles as the eval set, so the logged metric is a
# training score.
cat_model.fit(
    cat_train_dataset,
    verbose_eval=100,
    eval_set=[cat_train_dataset],
)
示例#10
0
# Train/validation datasets built from the raw-plus-tf-idf feature matrices.
dataset_train = lgb.Dataset(X_train_tfidf, y_train)
dataset_valid = lgb.Dataset(X_valid_tfidf, y_valid)
# feature_name lists 93 "feat_*" names followed by 93 "tfidf_*" names.
booster = lgb.train(
    params,
    dataset_train,
    feature_name=([f"feat_{i}" for i in range(1, 94)] +
                  [f"tfidf_{i}" for i in range(1, 94)]),
    num_boost_round=500,
    valid_sets=dataset_valid,
    early_stopping_rounds=20,
)
best_iteration = booster.best_iteration
print(best_iteration)
# Chart of the 30 most important features.
lgb.plot_importance(
    booster,
    max_num_features=30,
    figsize=(12, 10),
    dpi=300,
)
df_test = pd.read_csv(
    "/kaggle/input/otto-group-product-classification-challenge/test.csv",
    dtype=dtypes).set_index("id")
tfidf = TfidfTransformer()

# Fit tf-idf on the whole training frame and stack the dense float32 result
# next to the raw feature values.
tfidf_feature_train_all = tfidf.fit_transform(
    df_train[feature_columns]).toarray().astype("float32")
X_train_all_tfidf = np.hstack(
    (df_train[feature_columns].values, tfidf_feature_train_all))
dataset_train_all = lgb.Dataset(X_train_all_tfidf, df_train[target_column])
booster = lgb.train(
    params,
    dataset_train_all,
def cboost_feature_importance(model):
    """Display the 50 most important features of ``model`` as a bar chart."""
    _, axes = plt.subplots(figsize=(12, 18))
    lgb.plot_importance(model, ax=axes, height=0.8, max_num_features=50)
    axes.grid(False)
    plt.title("LightGBM - Feature Importance", fontsize=15)
    plt.show()
示例#12
0
early_stop_rounds = 10
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # 'l2' is LightGBM's name for mean squared error (letter l, not digit 1).
    'metric': {'l2', 'auc'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}
# Filled by lgb.train with the per-iteration metric values of each valid set.
results = {}
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=boost_round,
                valid_sets=(lgb_eval, lgb_train),
                valid_names=('validate', 'train'),
                early_stopping_rounds=early_stop_rounds,
                evals_result=results)
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# Metric curves recorded during training.
lgb.plot_metric(results)
plt.show()

# Importance measured by how often each feature is used in a split.
lgb.plot_importance(gbm, importance_type='split')
plt.show()

# Structure of the first tree.
lgb.plot_tree(gbm, tree_index=0)
plt.show()
# Train with early stopping monitored on the validation set.
bst = lgb.train(param,
                dtrain,
                num_boost_round=1000,
                valid_sets=[dvalid],
                early_stopping_rounds=30)

# Validation ROC AUC of the trained booster.
valid_pred = bst.predict(val_X)
valid_score = metrics.roc_auc_score(val_y, valid_pred)
print(f"Validation AUC score: {valid_score:.4f}")

import matplotlib.pyplot as plt

from lightgbm import plot_importance
from lightgbm import plot_split_value_histogram
# One figure for the feature importances ...
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(bst, ax=ax)
# ... and a second one for the split values of the 'Forecast' feature.
fig, ax = plt.subplots(figsize=(10, 8))
plot_split_value_histogram(bst, 'Forecast', ax=ax)
plt.show()

# Render the tree at index 3, annotating each split with its gain.
ax = lgb.plot_tree(bst,
                   tree_index=3,
                   figsize=(200, 200),
                   show_info=['split_gain'])
"""
--------------------------------------------------------------------------
--------------------------------------------------------------------------
--------------------------------------------------------------------------
"""
# Fitting classifier to the Training set
# Create your classifier here
示例#14
0
            markersize=12,
            color='lightgreen',
            linewidth=2,
            label="Label")
# Overlay the trended forecast series on the current figure.
pyplot.plot('date',
            'Sunspots_trended',
            data=data_test_dates,
            marker='',
            color='olive',
            linewidth=2,
            label="Forecast")
pyplot.legend()
pyplot.xlabel('date')
pyplot.title("Full dataset + Forecast")

lgb.plot_importance(models_dict[3])

# get feature importance from all 24 models
# One single-row frame per model (row label = model key, columns = feature
# names), combined with pd.concat because DataFrame.append was removed in
# pandas 2.0.
importance_rows = []
for index, model in models_dict.items():
    importance_row = pd.DataFrame(data=model.feature_importance()).T
    importance_row.columns = model.feature_name()
    importance_row.index = [index]
    importance_rows.append(importance_row)
# pd.concat rejects an empty list, so fall back to an empty frame.
feature_importance_df = (pd.concat(importance_rows)
                         if importance_rows else pd.DataFrame())

# plot feature importance for each model (model 1 is for month 1, model 24 is for month 24 respectively)
feature_importance_df.plot()
pyplot.title("Model Feature importance")
pyplot.xlabel('model (month i)')
pyplot.ylabel('feature importance')
示例#15
0
    evals_result = {}
    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20,
                      evals_result=evals_result)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result


# Splitting the data for model training#
# The last 200,000 rows are held out for validation; everything before them
# is used for training.
dev_X = train_X.iloc[:-200000, :]
val_X = train_X.iloc[-200000:, :]
dev_y = train_y[:-200000]
val_y = train_y[-200000:]
print(dev_X.shape, val_X.shape, test_X.shape)

# Training the model #
pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, test_X)

# Making a submission file #
# Clip the predictions into the [0, 1] range.
pred_test[pred_test > 1] = 1
pred_test[pred_test < 0] = 0
sub_df = pd.DataFrame({"item_id": test_id})
sub_df["deal_probability"] = pred_test
sub_df.to_csv("baseline_lgb.csv", index=False)

# Chart of the 50 most important features.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
ax.grid(False)
plt.title("LightGBM - Feature Importance", fontsize=15)
plt.show()
示例#16
0
}

evals_result = {}  # to record eval results for plotting

print('Start training...')
# train
# Features are named f1..f28; categorical_feature=[21] marks the column at
# index 21 as categorical.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                feature_name=['f' + str(i + 1) for i in range(28)],
                categorical_feature=[21],
                evals_result=evals_result,
                verbose_eval=10)

print('Plot metrics during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plot feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

# tree_index is zero-based, so index 83 is the 84th tree.
print('Plot 84th tree...')  # one tree use categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
plt.show()

print('Plot 84th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
graph.render(view=True)
示例#17
0
def DO(frm, to, fileno):
    """Build click features, train a LightGBM model and predict the test set.

    Reads ``to - frm`` rows of ``../input/train.csv`` (skipping file rows
    1 .. frm-1) and ``../input/test.csv``, stacks them, derives count,
    unique-count, cumulative-count, next-click and variance/mean features,
    splits the stacked frame back into train / validation / test, trains via
    ``lgb_modelfit_nocv`` and predicts the test rows.

    Relies on module-level names that are not defined in this function:
    ``debug``, ``naddfeat``, ``val_size``, ``do_count``, ``do_countuniq``,
    ``do_cumcount`` and ``lgb_modelfit_nocv``.

    Args:
        frm: First train.csv data row to use (earlier rows are skipped).
        to: End row; ``to - frm`` rows are read.
        fileno: Integer used in the name of the submission file.

    Returns:
        DataFrame with the columns ``click_id`` and ``is_attributed``. When
        ``debug`` is falsy it is also written to ``sub_it<fileno>.csv.gz``.
    """
    dtypes = {
        'ip': 'uint32',
        'app': 'uint16',
        'device': 'uint16',
        'os': 'uint16',
        'channel': 'uint16',
        'is_attributed': 'uint8',
        'click_id': 'uint32',
    }

    print('loading train data...', frm, to)
    train_df = pd.read_csv("../input/train.csv",
                           parse_dates=['click_time'],
                           skiprows=range(1, frm),
                           nrows=to - frm,
                           dtype=dtypes,
                           usecols=[
                               'ip', 'app', 'device', 'os', 'channel',
                               'click_time', 'is_attributed'
                           ])

    print('loading test data...')
    # In debug mode only the first 100000 test rows are read; nrows=None
    # (the pandas default) reads the whole file.
    test_df = pd.read_csv("../input/test.csv",
                          nrows=100000 if debug else None,
                          parse_dates=['click_time'],
                          dtype=dtypes,
                          usecols=[
                              'ip', 'app', 'device', 'os', 'channel',
                              'click_time', 'click_id'
                          ])

    len_train = len(train_df)
    # Stack train and test so every feature is computed over both at once.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    train_df = pd.concat([train_df, test_df])

    del test_df
    gc.collect()

    print('Extracting new features...')
    train_df['hour'] = pd.to_datetime(
        train_df.click_time).dt.hour.astype('uint8')
    train_df['day'] = pd.to_datetime(
        train_df.click_time).dt.day.astype('uint8')

    # Generated features X0..X8; gc.collect() runs between steps to keep the
    # peak memory down.
    gc.collect()
    train_df = do_countuniq(train_df, ['ip'], 'channel', 'X0', show_max=True)
    gc.collect()
    train_df = do_cumcount(train_df, ['ip', 'device', 'os'],
                           'app',
                           'X1',
                           show_max=True)
    gc.collect()
    train_df = do_countuniq(train_df, ['ip', 'day'],
                            'hour',
                            'X2',
                            show_max=True)
    gc.collect()
    train_df = do_countuniq(train_df, ['ip'], 'app', 'X3', show_max=True)
    gc.collect()
    train_df = do_countuniq(train_df, ['ip', 'app'], 'os', 'X4', show_max=True)
    gc.collect()
    train_df = do_countuniq(train_df, ['ip'], 'device', 'X5', show_max=True)
    gc.collect()
    train_df = do_countuniq(train_df, ['app'], 'channel', 'X6', show_max=True)
    gc.collect()
    train_df = do_cumcount(train_df, ['ip'], 'os', 'X7', show_max=True)
    gc.collect()
    train_df = do_countuniq(train_df, ['ip', 'device', 'os'],
                            'app',
                            'X8',
                            show_max=True)
    gc.collect()
    train_df = do_count(train_df, ['ip', 'day', 'hour'],
                        'ip_tcount',
                        show_max=True)
    gc.collect()
    train_df = do_count(train_df, ['ip', 'app'], 'ip_app_count', show_max=True)
    gc.collect()
    train_df = do_count(train_df, ['ip', 'app', 'os'],
                        'ip_app_os_count',
                        show_max=True)
    gc.collect()

    print('doing nextClick')
    predictors = []

    new_feature = 'nextClick'
    filename = 'nextClick_%d_%d.csv' % (frm, to)

    if os.path.exists(filename):
        # Reuse the values cached by an earlier run over the same row range.
        print('loading from save file')
        QQ = pd.read_csv(filename).values
    else:
        # Hash every (ip, app, device, os) combination into one of D buckets
        # and walk the rows backwards: the buffer holds the time of the next
        # click seen for that bucket, so buffer - t is the gap to it.
        D = 2**26
        train_df['category'] = (train_df['ip'].astype(str) + "_" + train_df['app'].astype(str) + "_" + train_df['device'].astype(str) \
            + "_" + train_df['os'].astype(str)).apply(hash) % D
        # 3000000000 is the start value for buckets with no later click.
        click_buffer = np.full(D, 3000000000, dtype=np.uint32)

        train_df['epochtime'] = train_df['click_time'].astype(
            np.int64) // 10**9
        next_clicks = []
        for category, t in zip(reversed(train_df['category'].values),
                               reversed(train_df['epochtime'].values)):
            next_clicks.append(click_buffer[category] - t)
            click_buffer[category] = t
        del (click_buffer)
        QQ = list(reversed(next_clicks))

        if not debug:
            print('saving')
            pd.DataFrame(QQ).to_csv(filename, index=False)

    train_df[new_feature] = QQ
    predictors.append(new_feature)

    # Same values shifted down by one row.
    train_df[new_feature + '_shift'] = pd.DataFrame(QQ).shift(+1).values
    predictors.append(new_feature + '_shift')

    del QQ
    gc.collect()

    # Adding features with var and mean hour (inspired from nuhsikander's script)
    print('grouping by : ip_day_chl_var_hour')
    # NOTE: despite its name, 'ip_tchan_count' holds the variance of hour.
    gp = train_df[['ip', 'day',
                   'hour', 'channel']].groupby(by=['ip', 'day', 'channel'])[[
                       'hour'
                   ]].var().reset_index().rename(
                       index=str, columns={'hour': 'ip_tchan_count'})
    train_df = train_df.merge(gp, on=['ip', 'day', 'channel'], how='left')
    del gp
    gc.collect()

    print('grouping by : ip_app_os_var_hour')
    gp = train_df[['ip',
                   'app', 'os', 'hour']].groupby(by=['ip', 'app', 'os'])[[
                       'hour'
                   ]].var().reset_index().rename(
                       index=str, columns={'hour': 'ip_app_os_var'})
    train_df = train_df.merge(gp, on=['ip', 'app', 'os'], how='left')
    del gp
    gc.collect()

    print('grouping by : ip_app_channel_var_day')
    gp = train_df[['ip', 'app',
                   'channel', 'day']].groupby(by=['ip', 'app', 'channel'])[[
                       'day'
                   ]].var().reset_index().rename(
                       index=str, columns={'day': 'ip_app_channel_var_day'})
    train_df = train_df.merge(gp, on=['ip', 'app', 'channel'], how='left')
    del gp
    gc.collect()

    print('grouping by : ip_app_chl_mean_hour')
    gp = train_df[['ip', 'app',
                   'channel', 'hour']].groupby(by=['ip', 'app', 'channel'])[[
                       'hour'
                   ]].mean().reset_index().rename(
                       index=str, columns={'hour': 'ip_app_channel_mean_hour'})
    print("merging...")
    train_df = train_df.merge(gp, on=['ip', 'app', 'channel'], how='left')
    del gp
    gc.collect()

    print("vars and data type: ")
    train_df.info()
    # Downcast the count columns to uint16.
    train_df['ip_tcount'] = train_df['ip_tcount'].astype('uint16')
    train_df['ip_app_count'] = train_df['ip_app_count'].astype('uint16')
    train_df['ip_app_os_count'] = train_df['ip_app_os_count'].astype('uint16')

    target = 'is_attributed'
    predictors.extend([
        'app', 'device', 'os', 'channel', 'hour', 'day', 'ip_tcount',
        'ip_tchan_count', 'ip_app_count', 'ip_app_os_count', 'ip_app_os_var',
        'ip_app_channel_var_day', 'ip_app_channel_mean_hour'
    ])
    categorical = ['app', 'device', 'os', 'channel', 'hour', 'day']
    # Add the generated X0 .. X<naddfeat-1> columns to the predictor list.
    for i in range(0, naddfeat):
        predictors.append('X' + str(i))

    print('predictors', predictors)

    # Undo the stacking: test rows come last, the val_size rows before them
    # form the validation set, and the rest is the training set.
    test_df = train_df[len_train:]
    val_df = train_df[(len_train - val_size):len_train]
    train_df = train_df[:(len_train - val_size)]

    print("train size: ", len(train_df))
    print("valid size: ", len(val_df))
    print("test size : ", len(test_df))

    sub = pd.DataFrame()
    sub['click_id'] = test_df['click_id'].astype('int')

    gc.collect()

    print("Training...")
    start_time = time.time()

    params = {
        'learning_rate': 0.20,
        #'is_unbalance': 'true', # replaced with scale_pos_weight argument
        'num_leaves': 7,  # 2^max_depth - 1
        'max_depth': 3,  # -1 means no limit
        'min_child_samples':
        100,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 100,  # Number of bucketed bin for feature values
        'subsample': 0.7,  # Subsample ratio of the training instance.
        'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
        'colsample_bytree':
        0.9,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight':
        0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'scale_pos_weight':
        200  # because training data is extremely unbalanced
    }
    (bst, best_iteration) = lgb_modelfit_nocv(params,
                                              train_df,
                                              val_df,
                                              predictors,
                                              target,
                                              objective='binary',
                                              metrics='auc',
                                              early_stopping_rounds=30,
                                              verbose_eval=True,
                                              num_boost_round=1000,
                                              categorical_features=categorical)

    print('[{}]: model training time'.format(time.time() - start_time))
    del train_df
    del val_df
    gc.collect()

    print('Plot feature importances...')
    lgb.plot_importance(bst, max_num_features=100)
    plt.show()

    print("Predicting...")
    sub['is_attributed'] = bst.predict(test_df[predictors],
                                       num_iteration=best_iteration)
    if not debug:
        print("writing...")
        sub.to_csv('sub_it%d.csv.gz' % (fileno),
                   index=False,
                   compression='gzip')
    print("done...")
    return sub
    model = xgb.train(params, xgb.DMatrix(x1, y1), 200,  watchlist, feval=xgb_score, maximize=False, verbose_eval=100, early_stopping_rounds=10)
    model.save_model('xgb_model_v2_{}_limit_{}.model'.format(i, model.best_ntree_limit))
    xgb_pred = model.predict(xgb.DMatrix(test[cols]), ntree_limit=model.best_ntree_limit)
    #xgb_pred = model.predict(xgb.DMatrix(test[cols]), ntree_limit=ntree_limit[i])
    xgb_valid = model.predict(xgb.DMatrix(x2))
    print('xgb valid log loss = {}'.format(log_loss(y2,xgb_valid)))
    '''
    # lgbm
    #print('lgb training')
    d_train = lgb.Dataset(x1, label=y1)
    d_valid = lgb.Dataset(x2, label=y2)
    watchlist = [d_train, d_valid]

    #model = lgb.train(lgb_params, train_set=d_train, num_boost_round=240, valid_sets=watchlist, early_stopping_rounds=50, verbose_eval=100)
    model = lgb.Booster(model_file='lgb_model_v2_{}.model'.format(i))
    ax = lgb.plot_importance(model)
    plt.tight_layout()
    plt.savefig('feature_importance_{}.png'.format(i))
    break
    lgb_pred = model.predict(test[cols])
    model.save_model('lgb_model_v2_{}.model'.format(i))
    lgb_valid = model.predict(x2)
    print('lgb valid log loss = {}'.format(log_loss(y2, lgb_valid)))

    if (i == 0):
        xgb_preds = xgb_pred
        lgb_preds = lgb_pred
        cat_preds = cat_pred
    else:
        xgb_preds += xgb_pred
        lgb_preds += lgb_pred
示例#19
0
lgbm_roc_score = roc_auc_score(y_test,
                               lgbm_clf.predict_proba(x_test)[:, 1],
                               average='macro')
print('ROC AUC:{0:.4f}'.format(lgbm_roc_score))
print('lgbm_clf.predict_proba(x_test)', lgbm_clf.predict_proba(x_test))
print('lgbm_clf.predict_proba(x_test)[:, 1]',
      lgbm_clf.predict_proba(x_test)[:, 1])
print('y_test', y_test)

from sklearn.model_selection import GridSearchCV
'''
# 하이퍼 파라미터 테스트의 수행속도를 향상시키기 위해 n_estimators를 200 감소
from sklearn.model_selection import GridSearchCV
lgbm_clf = LGBMClassifier(n_estimators=200)
params = {'num_leaves':[32, 64], 'max_depth':[128, 160], 'min_child_samples':[60, 100], 'subsample':[0.8, 1]}

# cv=3
gridcv = GridSearchCV(lgbm_clf, param_grid=params, cv=3)
gridcv.fit(x_train, y_train, early_stopping_rounds=30, eval_metric="auc", eval_set=[(x_train, y_train), (x_test, y_test)])

print('GridSearchCV 최적 파라미터:', gridcv.best_params_)
lgbm_roc_score = roc_auc_score(y_test, gridcv.predict_proba(x_test)[:, 1], average='macro')
print('ROC AUC :{0:.4f}'.format(lgbm_roc_score))'''

from lightgbm import plot_importance
import matplotlib.pyplot as plt

# Draw at most 20 features of the fitted classifier on a single 10x8 axes.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
plot_importance(lgbm_clf, max_num_features=20, height=0.4, ax=ax)
plt.show()
示例#20
0
# In[103]:

# Train the booster (lgb_eval is the validation set used for early stopping)
# and write the fitted model to 'model.txt'.
gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_eval,
    feature_name=feature_name,
    num_boost_round=61,
    early_stopping_rounds=10,
)
gbm.save_model('model.txt')

# ### LightGBM feature importance plot

# In[104]:

lgb.plot_importance(
    gbm,
    figsize=(10, 6),
    importance_type='gain',
    ignore_zero=False,
)

# ### Load the LightGBM model and predict on the test set

# In[105]:

#bst = lgb.Booster(model_file='model.txt')

y_predict = gbm.predict(
    x_test,
    num_iteration=gbm.best_iteration,
)

# ### Analyse the training result (compare the predicted fifth-day wind speed
# ### y_predict with the actual fifth-day wind speed y_test)

# In[214]:

-sum(y_test * np.log(y_predict) +
def estimate(model, data):
    """Plot the feature importance of ``model`` twice: 'gain', then 'split'.

    Each plot is titled with the importance type it shows. ``data`` is
    accepted but not used; nothing is returned.
    """
    for importance_type in ('gain', 'split'):
        axes = plot_importance(model, importance_type=importance_type)
        axes.set_title(importance_type)
示例#22
0
def _sum_revenue_per_visitor(visitor_ids, log_preds):
    """Collapse per-row log1p predictions into one log1p total per visitor.

    Predictions are mapped back with expm1, summed per ``fullVisitorId`` and
    the totals are transformed with log1p again.
    """
    frame = pd.DataFrame({"fullVisitorId": visitor_ids})
    frame["PredictedLogRevenue"] = np.expm1(log_preds)
    per_visitor = frame.groupby("fullVisitorId")["PredictedLogRevenue"]
    frame = per_visitor.sum().reset_index()
    frame.columns = ["fullVisitorId", "PredictedLogRevenue"]
    frame["PredictedLogRevenue"] = np.log1p(frame["PredictedLogRevenue"])
    return frame


# Score the out-of-fold predictions at user level.
train_pred = _sum_revenue_per_visitor(train_idx, oof_preds)
train_rmse = np.sqrt(
    mean_squared_error(train_target, train_pred['PredictedLogRevenue']))
print('User-level score:', str(round(train_rmse, 4)))
print(' ')
end = time.time()
print('training time:', str(round((end - start) / 60)), 'mins')

# Aggregate the test predictions the same way and write the submission file.
test_pred = _sum_revenue_per_visitor(test_idx, sub_preds)
test_pred.to_csv("lgb_new_2.csv", index=False)

# Plot gain-based importances (zero-importance features included).
lgb.plot_importance(
    lgb_model,
    importance_type='gain',
    max_num_features=90,
    ignore_zero=False,
    height=0.5,
    figsize=(12, 9),
)
plt.tight_layout()
plt.show()
示例#23
0
# Fit the booster, using lgb_eval as the validation set for early stopping.
gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_eval,
    num_boost_round=283,
    early_stopping_rounds=50,
)

print('Start predicting...')
# Predict on the held-out features with the best iteration found in training.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# Evaluate with the project's error() helper.
error(y_test, y_pred)

# Online (submission) flow, currently disabled:
#
# predict = gbm.predict(test_X, num_iteration=gbm.best_iteration)
# data1 = pd.DataFrame(predict)
# save(data1, 'lgb')
#
# gbm_online = lgb.train(params,
#                 train_all,
#                 num_boost_round=280)
# predict = gbm_online.predict(test_X, num_iteration=gbm_online.best_iteration)
# data1 = pd.DataFrame(predict)
# save(data1, 'lgb')

plt_encoding_error()
lgb.plot_importance(gbm)
plt.show()
示例#24
0
        'min_sum_hessian_in_leaf': 0.001,
        'n_jobs': -1,
        'num_threads': 8,
    }

    print('................Start training {} fold..........................'.
          format(k + 1))
    # train
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=2000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=100,
                    verbose_eval=100,
                    feature_name=features)
    lgb.plot_importance(gbm, max_num_features=20)
    plt.show()
    print('................Start predict .........................')
    # 预测
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    # 评估
    tmp_auc = roc_auc_score(y_test, y_pred)
    auc_cv.append(tmp_auc)
    print("valid auc:", tmp_auc)
    # test
    pred = gbm.predict(test_data, num_iteration=gbm.best_iteration)
    pred_cv.append(pred)

    # K交叉验证的平均分数
print('the cv information:')
print(auc_cv)
示例#25
0
                  color="r",
                  size=6)
    plt.show()

with code():

    def plot_feature_importance(model):
        """Draw a horizontal bar chart of ``model.feature_importances_``.

        One bar per column of the module-level ``X``; the y ticks are labelled
        with ``X.columns``.
        """
        feature_count = X.shape[1]
        tick_positions = np.arange(feature_count)
        plt.barh(range(feature_count),
                 model.feature_importances_,
                 align='center')
        plt.yticks(tick_positions, X.columns)
        plt.xlabel('Feature importance')
        plt.ylabel('Feature')

    # Note: X is assumed to be the training data before train_test_split.

with code():
    # Random forest
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(
        random_state=20181101,
        n_estimators=100)  # n_estimators is the number of trees to build
    forest.fit(X_train, y_train)

    # Display
    plot_feature_importance(forest)

with code():
    import lightgbm as lgb
    # Visualise (model is a model trained with lightgbm)
    lgb.plot_importance(model, figsize=(12, 8))
    plt.show()
示例#26
0
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train', 'valid'],
        #early_stopping_rounds=500,
        verbose_eval=20)
else:
    lgb_clf = lgb.train(lgbm_params,
                        lgtrain,
                        num_boost_round=15000,
                        valid_sets=[lgtrain, lgvalid],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=60,
                        verbose_eval=20)

# Feature importance plot: draw up to 50 features on a 7x10 axes and save it.
f, ax = plt.subplots(figsize=[7, 10])
lgb.plot_importance(lgb_clf, max_num_features=50, ax=ax)
plt.title("Light GBM Feature Importance")
plt.savefig(path + '../plots/feature_import_1006A.png')

print("Model Evaluation Stage")
valid_rmse = np.sqrt(
    metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid)))
print('RMSE:', valid_rmse)

# Predict the test set and build the submission frame.
lgpred = lgb_clf.predict(testing)
lgsub = pd.DataFrame(lgpred, columns=["deal_probability"], index=testdex)
# Clamp predictions to [0, 1]. The result is assigned back explicitly: calling
# clip(..., inplace=True) on the selected column is a chained in-place update,
# which does not modify lgsub when pandas Copy-on-Write is active.
lgsub['deal_probability'] = lgsub['deal_probability'].clip(0.0, 1.0)
#lgsub.to_csv(path + "../sub/lgsub_0206A.csv.gz",index=True,header=True, compression = 'gzip')
print("Model Runtime: %0.2f Minutes" % ((time.time() - modelstart) / 60))
'''
[20]		train's rmse: 0.240546	valid's rmse: 0.23821
[40]		train's rmse: 0.230003	valid's rmse: 0.22792
[60]		train's rmse: 0.22423	valid's rmse: 0.222459
示例#27
0
# NOTE(review): LightGBM documents 'sub_feature' as an alias of
# 'feature_fraction'; the two values set below differ (0.80 vs 0.7), so confirm
# which one is intended.
lgb_params['sub_feature'] = 0.80
lgb_params['max_depth'] = 7
lgb_params['feature_fraction'] = 0.7
lgb_params['bagging_fraction'] = 0.7
lgb_params['bagging_freq'] = 10
lgb_params['learning_rate'] = 0.01

lgb_train = lgb.Dataset(X_train, y_train)
lightgbm = lgb.train(lgb_params,
                     lgb_train,
                     feature_name=list(feat_names))

# In[ ]:

# plot_importance opens its own figure when no ax is passed, so the figure
# size is handed to it directly. A separate plt.figure(figsize=...) call would
# only leave an empty extra figure behind and not size the plot.
lgb.plot_importance(lightgbm, max_num_features=30, figsize=(12, 6))
plt.title("Feature importances by LightGBM")
plt.show()

# In[ ]:

# Render tree 83 of the booster, annotated with each split's gain.
ax = lgb.plot_tree(lightgbm,
                   tree_index=83,
                   figsize=(20, 8),
                   show_info=['split_gain'])
plt.show()

# # Acknowledgement:
# 1. Pedro Schoen
#
#
示例#28
0
    valid_sets=[dt, dv],
    valid_names=["training", "valid"],
    num_boost_round=MAX_ROUNDS,
    verbose_eval=False,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
)

# Report the validation metric and the parameters carried by the fitted model.
best_params = model.params
score = model.best_score["valid"][METRIC]

print("Best params:", best_params)
print(f"  {METRIC} = {score}")
print("  Params: ")
for key, value in best_params.items():
    print(f"    {key}: {value}")

import lightgbm as lgb

# Train again with the reported parameters, passing verbose_eval=REPORT_ROUNDS.
model = lgb.train(
    best_params,
    dt,
    num_boost_round=MAX_ROUNDS,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    valid_sets=[dt, dv],
    valid_names=["training", "valid"],
    verbose_eval=REPORT_ROUNDS,
)

# Gain-based importance plot without grid lines.
lgb.plot_importance(model, grid=False, importance_type="gain")
plt.show()
示例#29
0
                            reg_lambda=0,
                            silent=False)

    print(ctime() + '...training final model...')
    bst.fit(X=X_eval,
            y=Y_eval,
            eval_set=[(X_eval, Y_eval)],
            eval_names=['eval'],
            eval_metric=['rmse'],
            early_stopping_rounds=5000,
            feature_name=feature_names,
            categorical_feature=categorical_features)
    joblib.dump(bst,
                join(fittedModelDir, 'model5_nonCV_{}{}'.format(date, '.pkl')))
    #===train the final model on all data===

    #===make prediction for test set==
    fittedMdlPath='/home/arash/MEGA/MEGAsync/Machine Learning/'+\
                    'Kaggle/Recruit/Fitted models/model5_nonCV_{}.pkl'.\
                    format(date)
    bst = joblib.load(fittedMdlPath)
    gbm.plot_importance(bst)

    y_test = bst.predict(X_test)

    df=pd.DataFrame({'id':df_test.air_store_id+'_'+\
                     df_test.visit_date.dt.strftime('%Y-%m-%d'),
                     'visitors':np.expm1(y_test)})
    df.sort_values(by='id', inplace=True)
    df.to_csv(join(submissionsDir, 'model5_{}.csv'.format(date)), index=False)
    #===make prediction for test set===
示例#30
0
def DO(frm, to, fileno):
    """Train a LightGBM model on a slice of train.csv and score the test set.

    Loads part of ``train.csv`` and ``test_supplement.csv`` from the
    module-level ``inputpath``, converts click times to Asia/Shanghai, derives
    aggregate features with the module's ``do_*`` helpers, fits a model through
    ``lgb_modelfit_nocv`` (days 7 and 8 for training, day 9 for validation),
    saves a feature-importance plot to ``<fileno>_importance.png`` and writes
    the predictions to ``sub_it<fileno>.csv``.

    Relies on the module-level names ``inputpath``, ``debug`` and
    ``predictors``; ``predictors`` is extended in place with the base
    categorical columns when they are missing.

    Args:
        frm: File lines 1..frm-1 of train.csv are skipped (the header is kept).
        to: ``to - frm`` rows are read from train.csv.
        fileno: Tag used in the plot and submission file names (``%d`` format).

    Returns:
        The submission DataFrame with ``click_id`` and ``is_attributed``.
    """
    dtypes = {
        'ip': 'uint32',
        'app': 'uint16',
        'device': 'uint8',
        'os': 'uint16',
        'channel': 'uint16',
        'is_attributed': 'uint8',
        'click_id': 'uint32',
    }

    print('loading train data...', frm, to)
    train_df = pd.read_csv(inputpath + "/train.csv",
                           parse_dates=['click_time'],
                           skiprows=range(1, frm),
                           nrows=to - frm,
                           dtype=dtypes,
                           usecols=[
                               'ip', 'app', 'device', 'os', 'channel',
                               'click_time', 'is_attributed'
                           ])

    print('loading test data...')
    # In debug mode only the first 100000 test rows are read; nrows=None reads
    # the whole file.
    test_df = pd.read_csv(inputpath + "/test_supplement.csv",
                          nrows=100000 if debug else None,
                          parse_dates=['click_time'],
                          dtype=dtypes,
                          usecols=[
                              'ip', 'app', 'device', 'os', 'channel',
                              'click_time', 'click_id'
                          ])

    local_tz = pytz.timezone('Asia/Shanghai')

    def utc_to_local(utc_dt):
        # Treat the naive timestamp as UTC, then express it in local_tz.
        local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
        return local_tz.normalize(local_dt)

    # Localise click_time and derive the hour / day-of-month features.
    for frame in (train_df, test_df):
        frame['click_time'] = frame['click_time'].apply(utc_to_local)
        frame['hour'] = pd.to_datetime(
            frame.click_time).dt.hour.astype('uint8')
        frame['day'] = pd.to_datetime(
            frame.click_time).dt.day.astype('uint8')

    gc.collect()

    def process_data(data):
        """Add the next/prev-click and group aggregate features to ``data``.

        The 'day' column is deliberately left in place: the caller still needs
        it to split the training data by day.
        """
        data = do_next_Click(data, agg_suffix='nextClick', agg_type='float32')
        gc.collect()
        data = do_prev_Click(data, agg_suffix='prevClick', agg_type='float32')
        gc.collect()

        data = do_countuniq(data, ['day', 'ip'], 'channel')
        gc.collect()
        print('data columns', data.columns)
        data = do_countuniq(data, ['day', 'ip', 'device', 'os'], 'app')
        gc.collect()
        # NOTE(review): 'day' is listed twice in this group (and in the
        # do_count group below); kept as-is because the helper may derive the
        # feature name from the list -- confirm before changing.
        data = do_countuniq(data, ['day', 'ip', 'day'], 'hour')
        gc.collect()
        data = do_countuniq(data, ['day', 'ip'], 'app')
        gc.collect()
        data = do_countuniq(data, ['day', 'ip', 'app'], 'os')
        gc.collect()
        data = do_countuniq(data, ['day', 'ip'], 'device')
        gc.collect()
        data = do_countuniq(data, ['day', 'app'], 'channel')
        gc.collect()
        data = do_cumcount(data, ['day', 'ip'], 'os')
        gc.collect()
        data = do_cumcount(data, ['day', 'ip', 'device', 'os'], 'app')
        gc.collect()
        data = do_count(data, ['day', 'ip', 'day', 'hour'])
        gc.collect()
        data = do_count(data, ['day', 'ip', 'app'])
        gc.collect()
        data = do_count(data, ['day', 'ip', 'app', 'os'])
        gc.collect()
        data = do_var(data, ['day', 'ip', 'app', 'os'], 'hour')
        gc.collect()
        return data

    train_df = process_data(train_df)
    print('train_df cols after process', train_df.columns)
    test_df = process_data(test_df)
    # The test frame no longer needs 'day' once the aggregates are built.
    del test_df['day']
    gc.collect()
    print('test_df cols after process', test_df.columns)

    print('\n\nBefore appending predictors...\n\n', sorted(predictors))
    target = 'is_attributed'
    categorical = ['app', 'device', 'os', 'channel', 'hour']
    # Make sure every base categorical column is part of the feature list.
    for feature in categorical:
        if feature not in predictors:
            predictors.append(feature)
    print('\n\nAfter appending predictors...\n\n', sorted(predictors))

    if not debug:
        # Reorder/select the supplement rows listed in mapping.csv. The path
        # uses the same "/" separator as the other reads from inputpath.
        relation = pd.read_csv(inputpath + '/mapping.csv',
                               usecols=['old_click_id'])
        test_df = test_df.iloc[relation.old_click_id]
        del relation

    # Split by day of month: 7 and 8 train, 9 validates. 'day' is dropped only
    # here, after the split, because the masks below depend on it.
    day = train_df['day']
    val_df = train_df[day == 9].drop(columns=['day'])
    train_df = train_df[(day == 7) | (day == 8)].drop(columns=['day'])
    del day

    print("\ntrain size: ", len(train_df))
    print("\nvalid size: ", len(val_df))
    print("\ntest size : ", len(test_df))

    sub = pd.DataFrame()
    sub['click_id'] = test_df['click_id'].astype('int')

    gc.collect()

    print("Training...")
    start_time = time.time()
    print('predictors', predictors)
    print('train cols', train_df.columns)
    print('test cols', test_df.columns)

    params = {
        'learning_rate': 0.02,
        #'is_unbalance': 'true', # replaced with scale_pos_weight argument
        'num_leaves': 31,  # 2^max_depth - 1
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 100,  # min data needed in a child (min_data_in_leaf)
        'max_bin': 128,  # number of bucketed bins for feature values
        'subsample': 0.7,  # subsample ratio of the training instances
        'subsample_freq': 1,  # subsample frequency, <=0 disables it
        'colsample_bytree': 0.9,  # column subsample ratio per tree
        'min_child_weight': 0,  # min sum of instance weight (hessian) in a leaf
        'scale_pos_weight': 200,  # training data is extremely unbalanced
    }
    (bst, best_iteration) = lgb_modelfit_nocv(params,
                                              train_df,
                                              val_df,
                                              predictors,
                                              target,
                                              objective='xentropy',
                                              metrics='auc',
                                              early_stopping_rounds=30,
                                              verbose_eval=True,
                                              num_boost_round=2000,
                                              categorical_features=categorical)

    print('[{}]: model training time'.format(time.time() - start_time))
    del train_df
    del val_df
    gc.collect()

    print('Plot feature importances...')
    # plot_importance opens its own figure (sized by figsize) when no ax is
    # given, and that figure is the current one that savefig writes out.
    ax = lgb.plot_importance(bst, max_num_features=100, figsize=(20, 15))
    # plt.show()

    plt.savefig(str(fileno) + '_importance.png')

    print("Predicting...")
    sub['is_attributed'] = bst.predict(test_df[predictors],
                                       num_iteration=best_iteration)
    # NOTE(review): this overwrites the click_id column assigned above with the
    # frame's index -- presumably intentional for the supplement mapping;
    # confirm.
    sub.click_id = sub.index
    sub.to_csv('sub_it%d.csv' % (fileno), index=False, float_format='%.9f')
    print("done...")
    return sub
示例#31
0
    ["booking_date", "checkin_date", "resort_id"],
]:
    if not isinstance(col, list):
        col = [col]
    col_name = "_".join(col)
    all_df = pd.concat([
        train_df[["reservation_id"] + col], test_df[["reservation_id"] + col]
    ])
    gdf = all_df.groupby(col)["reservation_id"].count().reset_index()
    gdf.columns = col + [col_name + "_count"]
    train_df = pd.merge(train_df, gdf, on=col, how="left")

from catboost import CatBoostRegressor

# Fit a shallow CatBoost regressor, evaluating on the held-out split.
model = CatBoostRegressor(iterations=10000,
                          depth=3,
                          learning_rate=0.1,
                          loss_function='RMSE')
model.fit(X_tr1, y_tr1, eval_set=(X_tst1, y_tst1), plot=True)

# Predict the held-out split once and reuse it for both metrics (R^2, RMSE).
holdout_pred = model.predict(X_tst1)
print(r2_score(y_tst1, holdout_pred))
print(np.sqrt(mean_squared_error(y_tst1, holdout_pred)))

import matplotlib.pyplot as plt
import lightgbm as lgb

# Importance plot of the separately trained LightGBM model `lgbm`.
fig, ax = plt.subplots(figsize=(12, 30))
lgb.plot_importance(lgbm, max_num_features=100, height=0.8, ax=ax)
ax.grid(False)
plt.title("LightGBM - Feature Importance", fontsize=15)
plt.show()
 def feature_importance(self):
     """Show an importance plot (at most 10 features) for ``self.model``.

     Returns the value of ``self.model.feature_importance()``.
     """
     fitted = self.model
     lgb.plot_importance(fitted, max_num_features=10)
     plt.show()
     return fitted.feature_importance()
def DO(train_frm,train_to, test_nrows, groups, rategroup, fileno, initial_cols=['ip', 'app','device','os', 'channel', 'hour']):
    """Build group features, train a LightGBM model and write a submission.

    Loads a slice of ``train.csv`` and the head of ``test.csv`` from the
    module-level ``inputpath``, adds confidence-weighted attribution rates
    (``rategroup``) and group aggregates (``groups``), trains through
    ``lgb_modelfit_nocv`` (days 7 and 8 train, day 9 validates), saves
    ``<fileno>_importance.png`` and writes ``sub_<fileno>.csv.gz``.

    Computed features are cached as ``<feature name>.csv`` in the working
    directory and reused when the file already exists. Relies on the
    module-level names ``inputpath`` and ``debug``.

    Args:
        train_frm: File lines 1..train_frm-1 of train.csv are skipped (the
            header is kept).
        train_to: ``train_to - train_frm`` rows are read from train.csv.
        test_nrows: Number of rows to read from test.csv.
        groups: Iterable of items whose first element is a list of column
            names and whose last element is the aggregation code ``QQ``:
            0 count, 1/7 mean, 2/6 var, 3 skew, 4 nunique (all of the last
            column grouped by the preceding ones), 5 cumcount, 'NC' seconds
            until the next click within the group of all listed columns.
        rategroup: Iterable of column-name lists; each yields a
            ``<cols>_confRate`` feature.
        fileno: Tag used in the plot and submission file names.
        initial_cols: Raw columns appended to the predictor list. The default
            list is only read, never mutated.

    Returns:
        The submission DataFrame with ``click_id`` and ``is_attributed``.

    Raises:
        ValueError: If an item of ``groups`` carries an unknown code.
    """
    predictors=[]
    dtypes = {
            'ip'            : 'uint32',
            'app'           : 'uint16',
            'device'        : 'uint16',
            'os'            : 'uint16',
            'channel'       : 'uint16',
            'is_attributed' : 'uint8',
            'click_id'      : 'uint32',
            }

    print('loading train data...', train_frm, train_to)
    train_df = pd.read_csv(inputpath + "train.csv", parse_dates=['click_time'], skiprows=range(1,train_frm), nrows=train_to-train_frm, dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])

    print('loading test data...')
    test_df = pd.read_csv(inputpath+"test.csv", nrows=test_nrows, parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])
    print('Extracting new features...')

    local_tz = pytz.timezone('Asia/Shanghai') # use your local timezone name here
    # NOTE: pytz.reference.LocalTimezone() would produce wrong result here

    def utc_to_local(utc_dt):
        # Treat the naive timestamp as UTC, then express it in local_tz.
        local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
        return local_tz.normalize(local_dt)

    # Localise click_time and derive the hour / day-of-month features.
    for frame in (train_df, test_df):
        frame['click_time'] = frame['click_time'].apply(utc_to_local)
        frame['hour'] = pd.to_datetime(frame.click_time).dt.hour.astype('uint8')
        frame['day'] = pd.to_datetime(frame.click_time).dt.day.astype('uint8')

    # ---- confidence-weighted attribution rate per group -------------------
    # Groups are trusted in proportion to log(size) / log(100000):
    # 1000 views -> 60% confidence, 100 views -> 40% confidence.
    log_group = np.log(100000)

    def rate_calculation(x):
        """Calculate the attributed rate. Scale by confidence"""
        rate = x.sum() / float(x.count())
        conf = np.min([1, np.log(x.count()) / log_group])
        return rate * conf

    for cols in rategroup:
        new_feature = '_'.join(cols)+'_confRate'
        predictors.append(new_feature)
        filename = new_feature + '.csv'
        if os.path.exists(filename):
            gp = pd.read_csv(filename)
        else:
            group_object = train_df.groupby(cols)
            group_sizes = group_object.size()
            print(">> Calculating confidence-weighted rate for: {}.\n   Saving to: {}. Group Max /Mean / Median / Min: {} / {} / {} / {}".format(
                cols, new_feature,
                group_sizes.max(),
                np.round(group_sizes.mean(), 2),
                np.round(group_sizes.median(), 2),
                group_sizes.min()
            ))
            gp = group_object['is_attributed'].apply(rate_calculation).reset_index().rename(index=str, columns={'is_attributed': new_feature})[cols + [new_feature]]
            gp.to_csv(filename, index=False)
        # The rate is learnt from train only and joined onto both frames.
        train_df = train_df.merge(gp, on=cols, how='left')
        test_df = test_df.merge(gp, on=cols, how='left')
        del gp

    print(train_df.shape)
    gc.collect()

    print('shape of train: ', train_df.shape)
    print('shape of test: ', test_df.shape)
    len_train = len(train_df)
    # Stack test under train so the group features see both. pd.concat with
    # its defaults matches the removed DataFrame.append.
    train_df = pd.concat([train_df, test_df])

    del test_df
    gc.collect()

    # ---- group aggregate features ----------------------------------------
    # SeriesGroupBy method to call for each merge-style aggregation code.
    aggregations = {0: 'count', 1: 'mean', 2: 'var', 3: 'skew',
                    4: 'nunique', 6: 'var', 7: 'mean'}
    # click_time is converted to epoch seconds at most once; converting again
    # would integer-divide the already converted values by 10**9.
    click_time_in_seconds = False

    for i, item in enumerate(groups):
        selcols = item[0]
        QQ = item[-1]
        print('selcols',selcols,'QQ',QQ)

        colname = '_'.join(selcols) + '_' + str(QQ)
        predictors.append(colname)
        filename = colname + '.csv'
        key_cols = selcols[0:-1]
        value_col = selcols[-1]

        if os.path.exists(filename):
            if QQ==5:
                # The cumcount cache is one headerless column in row order, so
                # it is assigned by position like the freshly computed values.
                gp = pd.read_csv(filename,header=None)
                train_df[colname] = gp.iloc[:, 0].values
            else:
                gp = pd.read_csv(filename)
                train_df = train_df.merge(gp, on=key_cols, how='left')
        else:
            if QQ in aggregations:
                grouped = train_df[selcols].groupby(by=key_cols)[value_col]
                # The aggregate column is always named `colname`, the name
                # registered in `predictors` above.
                gp = getattr(grouped, aggregations[QQ])().reset_index().\
                    rename(index=str, columns={value_col: colname})
                train_df = train_df.merge(gp, on=key_cols, how='left')
            elif QQ==5:
                gp = train_df[selcols].groupby(by=key_cols)[value_col].cumcount()
                train_df[colname]=gp.values
            elif QQ == 'NC':
                if not click_time_in_seconds:
                    train_df['click_time'] = (train_df['click_time'].astype(np.int64) // 10 ** 9).astype(np.int32)
                    click_time_in_seconds = True
                gp = (train_df.groupby(selcols).click_time.shift(-1) - train_df.click_time).astype(np.float32)
                train_df[colname] = gp
            else:
                raise ValueError('unknown aggregation code: {!r}'.format(QQ))

            # 'NC' features are never cached. In debug mode the cache goes to
            # a 'test'-prefixed file, which the lookup above does not read.
            if QQ != 'NC':
                cache_path = 'test' + filename if debug else filename
                if QQ == 5:
                    # Headerless, to match the header=None read above.
                    gp.to_csv(cache_path, index=False, header=False)
                else:
                    gp.to_csv(cache_path, index=False)

        del gp
        gc.collect()

    print("variables and data type: ")
    train_df.info()

    target = 'is_attributed'

    # TODO: flagged in the original as "there is a problem here that needs to
    # be solved" (no further detail given).
    predictors.extend(initial_cols)
    categorical = ['app', 'device', 'os', 'channel', 'hour']

    print('predictors',predictors)

    # Rows appended after the original train rows are the test set; train and
    # validation are selected by day of month.
    test_df = train_df[len_train:]
    val_df = train_df[train_df.day == 9]
    train_df = train_df[(train_df.day == 7) | (train_df.day == 8)]

    print("train size: ", len(train_df))
    print("valid size: ", len(val_df))
    print("test size : ", len(test_df))

    sub = pd.DataFrame()
    sub['click_id'] = test_df['click_id'].astype('int')

    gc.collect()

    print("Training...")
    start_time = time.time()

    params = {
        'learning_rate': 0.08, #0.2,
        #'is_unbalance': 'true', # replaced with scale_pos_weight argument
        'num_leaves': 7,  # 2^max_depth - 1
        'max_depth': 3,  # -1 means no limit
        'min_child_samples': 100,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 100,  # Number of bucketed bin for feature values
        'subsample': 0.7,  # Subsample ratio of the training instance.
        'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'scale_pos_weight':200 # because training data is extremely unbalanced
    }
    (bst,best_iteration) = lgb_modelfit_nocv(params,
                            train_df,
                            val_df,
                            predictors,
                            target,
                            objective='binary',
                            metrics='auc',
                            early_stopping_rounds=30,
                            verbose_eval=True,
                            num_boost_round=1000,
                            categorical_features=categorical)

    print('[{}]: model training time'.format(time.time() - start_time))
    del train_df
    del val_df
    gc.collect()

    print('Plot feature importances...')
    # plot_importance opens its own figure (sized by figsize) when no ax is
    # given, and that figure is the current one that savefig writes out.
    ax = lgb.plot_importance(bst, max_num_features=100, figsize=(20, 15))
    # plt.show()

    # str() so that an integer fileno works here as it does for the CSV below.
    plt.savefig(str(fileno)+'_importance.png')

    print("Predicting...")
    sub['is_attributed'] = bst.predict(test_df[predictors],num_iteration=best_iteration)
    print("writing...")
    sub.to_csv('sub_{}.csv.gz'.format(str(fileno)),index=False,compression='gzip')
    print("done...")
    return sub
示例#34
0
def Submission(valid_hour=11):
    """Train LightGBM classifiers on the merged data and write test predictions.

    The function performs these steps in order:

    1. Load the training frame via ``Merge(which_data='train')`` and split it
       into a fitting part and a validation part.
    2. Fit a first classifier with early stopping on the validation part to
       obtain the best number of boosting rounds.
    3. Fit a second classifier on the whole training frame for exactly that
       many rounds.
    4. Predict on ``Merge(which_data='test')``, align the scores with the
       ``instance_id`` column of the raw test file and save them as
       ``results.txt`` in the data directory.
    5. Plot the feature importances of the second model.

    Args:
        valid_hour: When greater than 0, rows with ``hour >= valid_hour`` are
            used for validation and all other rows for fitting. Otherwise the
            split is delegated to ``train_test_split`` with ``test_size=0.2``
            and ``random_state=0``.

    Returns:
        The fitted second ``lgb.LGBMClassifier``.
    """
    wd = ['/Users/ewenwang/Documents/practice_data/conversion_rate/', '/Users/ewenwang/Documents/GitHub/Kaggle/conversion_rate/round2/']

    test_file = ['round2_ijcai_18_test_b_20180510.txt']

    train = Merge(which_data='train')

    if valid_hour > 0:
        # Hold out by hour: everything from `valid_hour` onwards is validation.
        filter_ = (train.hour >= valid_hour)
        train_ = train[~filter_]
        valid_ = train[filter_]
    else:
        train_, valid_ = train_test_split(train, test_size=0.2, random_state=0)

    target = 'is_trade'

    # Hand-picked model inputs. The order is kept as-is because it defines the
    # column order the models are trained and scored with.
    features = [
        'user_gender_id', 'user_age_level', 'user_star_level',
        'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level', 'item_collected_level',
        'shop_star_level', 'shop_review_positive_rate', 'shop_score_service', 'shop_score_description',

        'item_id_ratio', 'item_city_id_user_age_level_prob', 'item_collected_level_ratio', 'item_price_level_ratio',
        'context_page_id_user_gender_id_prob', 'context_page_id_user_star_level_prob',
        'shop_score_service_bin_ratio', 'shop_star_level_ratio', 'shop_review_positive_rate_bin_ratio', 'shop_id_ratio', 'shop_review_num_level_ratio',
        'user_pagerank', 'hour_ratio',

        'wt_item_id', 'wt_item_category_list', 'match_prop_ct_shop_id_wt',
        'item_city_id_shop_id_wt', 'item_city_id_context_page_id_wt', 'list_wt_item_property_list', 'wt_item_city_id',
        'match_cat_ct_shop_id_wt', 'context_page_id_item_category_list_wt', 'item_brand_id_match_prop_ct_wt',
        'context_page_id_shop_star_level_wt'
    ]

    X = train_[features]
    y = train_[target].values
    X_tes = valid_[features]
    y_tes = valid_[target].values

    # Stage 1: early-stopped fit, used only to find the number of rounds.
    print('Training LGBM model...')
    t0 = time.time()
    lgb_1 = lgb.LGBMClassifier(
        objective='binary',
        metric='binary_logloss',
        num_leaves=16,
        # LightGBM's depth limit is called `max_depth`; a keyword named
        # `depth` is not a recognised parameter and would leave depth unlimited.
        max_depth=4,
        learning_rate=0.01,
        seed=2018,
        colsample_bytree=0.6,
        subsample=0.8,
        n_estimators=20000,
        silent=True)
    lgb_model_1 = lgb_1.fit(X, y, eval_set=[(X_tes, y_tes)], early_stopping_rounds=200, verbose=False, callbacks=[lgb.print_evaluation(100)])
    print('\ttime spend: ', time.time()-t0)

    best_iter = lgb_model_1.best_iteration_
    best_score = lgb_model_1.best_score_

    print('best_iter: ', best_iter, '\nbest_score: ', best_score)

    # Stage 2: refit on all training rows for the round count found above.
    X_2 = train[features]
    y_2 = train[target].values

    print('Training LGBM model...')
    t0 = time.time()
    lgb_2 = lgb.LGBMClassifier(
        objective='binary',
        metric='binary_logloss',
        # NOTE(review): a tree of depth 4 has at most 2**4 = 16 leaves, so with
        # max_depth=4 the value 32 here is above what the depth cap permits.
        # Confirm which of the two limits is intended for the final model.
        num_leaves=32,
        max_depth=4,
        learning_rate=0.01,
        seed=2018,
        colsample_bytree=0.6,
        subsample=0.9,
        n_estimators=best_iter,
        silent=True)

    lgb_model_2 = lgb_2.fit(X_2, y_2)
    print('\ttime spend: ', time.time()-t0)

    # Drop the training frame before loading the test frame.
    del train
    test = Merge(which_data='test')

    print('predicting...')
    t0 = time.time()
    # Column 1 of predict_proba is the probability of the positive class.
    pred = lgb_model_2.predict_proba(test[features])[:, 1]
    print('\ttime spend: ', time.time()-t0)

    test['predicted_score'] = pred

    result = test[['instance_id', 'predicted_score']]
    # Left-join onto the ids of the raw test file so the output has one row
    # per id in that file; ids without a prediction get a score of 0.
    result = pd.DataFrame(pd.read_csv(wd[0]+test_file[0], sep=' ')['instance_id']).merge(result, on='instance_id', how='left').fillna(0)

    print('\nsaving...')
    t0 = time.time()
    result.to_csv(wd[0]+'results.txt', sep=' ', index=False)
    print('\ttime spend: ', time.time()-t0)

    print('plotting...')
    lgb.plot_importance(lgb_model_2, figsize=(12, 25))
    plt.show()
    return lgb_model_2
示例#35
0
        early_stopping_rounds=20,  # early_stoppingの判定基準
        verbose_eval=10)

    y_pred = model.predict(x_valid, num_iteration=model.best_iteration)
    auc = roc_auc_score(y_valid, y_pred)
    print(auc)

    models.append(model)
    aucs.append(auc)

# Compute and print the mean of the AUC values collected in `aucs`.
print(mean(aucs))

# Display feature importance for every trained model
# (importance_type='gain', limited to 15 features per plot).
for model in models:
    lgb.plot_importance(model, importance_type='gain', max_num_features=15)
# The bare string below is a no-op note left by the author; it reads
# "Prediction accuracy: 0.9316555393578665".
"""
予測精度:
0.9316555393578665
"""
# The bare string below is a no-op section header; it reads
# "Prediction on the test data".
'''
テストデータの予測
'''

# Select the explanatory variables of the test data by dropping the
# 'y' and 'id' columns.
X_test = test.drop(['y', 'id'], axis=1)

# Predictions on the test data are collected in this list
# (presumably one entry per model; the filling loop is not visible here).
preds = []

for model in models:
# ---- Training ---------------------------------------------------------------
print('Starting training...')
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                # Synthetic names 'f1', 'f2', ... - one per column of X_train.
                feature_name=['f{}'.format(col) for col in range(1, X_train.shape[-1] + 1)],
                categorical_feature=[21],
                # Per-iteration metric values are recorded into this dict.
                evals_result=evals_result,
                verbose_eval=10)

# ---- Metric curve recorded in `evals_result` --------------------------------
print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

# ---- Feature importances (at most 10 features) ------------------------------
print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

# ---- Tree with index 53 drawn with matplotlib -------------------------------
# One tree uses a categorical feature to split.
print('Plotting 54th tree...')
ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=['split_gain'])
plt.show()

# ---- The same tree rendered through graphviz --------------------------------
print('Plotting 54th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, name='Tree54', tree_index=53)
graph.render(view=True)
示例#37
0
    # LGBM Dataset Formatting 
    lgtrain = lgb.Dataset(X, y,
                          feature_name=tfvocab,
                          categorical_feature=categorical)
    del X
    gc.collect()
    # Go Go Go
    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=1500,
        verbose_eval=100
    )

# Feature importance plot: draw the 50 most important features of `lgb_clf`
# onto a dedicated axes and save the figure to disk.
f, ax = plt.subplots(figsize=[7, 10])
lgb.plot_importance(lgb_clf, max_num_features=50, ax=ax)
plt.title("Light GBM Feature Importance")
plt.savefig('feature_import.png')

print("Model Evaluation Stage")
lgpred = lgb_clf.predict(testing)

# Mixing lightgbm with ridge. I haven't really tested if this improves the score or not
# blend = 0.95*lgpred + 0.05*ridge_oof_test[:,0]
lgsub = pd.DataFrame(lgpred, columns=["deal_probability"], index=testdex)
# Keep predictions between 0 and 1. The clipped column is assigned back
# explicitly: calling clip(..., inplace=True) on `lgsub[...]` operates on a
# selected Series and, under pandas Copy-on-Write, leaves `lgsub` unchanged.
lgsub['deal_probability'] = lgsub['deal_probability'].clip(0.0, 1.0)
lgsub.to_csv("lgsub.csv", index=True, header=True)
# print("Model Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))
print("Notebook Runtime: %0.2f Minutes" % ((time.time() - notebookstart) / 60))