Example #1
 def test_continue_train(self):
     X, y = load_boston(True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
         'objective': 'regression',
         'metric': 'l1',
         'verbose': -1
     }
     lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
     lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False)
     init_gbm = lgb.train(params, lgb_train, num_boost_round=20)
     model_name = 'model.txt'
     init_gbm.save_model(model_name)
     evals_result = {}
     gbm = lgb.train(params, lgb_train,
                     num_boost_round=30,
                     valid_sets=lgb_eval,
                     verbose_eval=False,
                     # test custom eval metrics
                     feval=(lambda p, d: ('mae', mean_absolute_error(p, d.get_label()), False)),
                     evals_result=evals_result,
                     init_model='model.txt')
     ret = mean_absolute_error(y_test, gbm.predict(X_test))
     self.assertLess(ret, 3.5)
     self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5)
     for l1, mae in zip(evals_result['valid_0']['l1'], evals_result['valid_0']['mae']):
         self.assertAlmostEqual(l1, mae, places=5)
     os.remove(model_name)
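
A minimal aside (reusing `params`, `lgb_train`, and `init_gbm` from above): `init_model` also accepts the in-memory Booster directly (as Example #26 below does), so the round-trip through `model.txt` is only needed when resuming training in another process.

gbm = lgb.train(params, lgb_train,
                num_boost_round=30,
                init_model=init_gbm)  # Booster object instead of a file path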
Example #2
    def test_plot_metrics(self):
        test_data = lgb.Dataset(self.X_test, self.y_test, reference=self.train_data)
        self.params.update({"metric": {"binary_logloss", "binary_error"}})

        evals_result0 = {}
        gbm0 = lgb.train(self.params, self.train_data,
                         valid_sets=[self.train_data, test_data],
                         valid_names=['v1', 'v2'],
                         num_boost_round=10,
                         evals_result=evals_result0,
                         verbose_eval=False)
        ax0 = lgb.plot_metric(evals_result0)
        self.assertIsInstance(ax0, matplotlib.axes.Axes)
        self.assertEqual(ax0.get_title(), 'Metric during training')
        self.assertEqual(ax0.get_xlabel(), 'Iterations')
        self.assertIn(ax0.get_ylabel(), {'binary_logloss', 'binary_error'})
        ax0 = lgb.plot_metric(evals_result0, metric='binary_error')
        ax0 = lgb.plot_metric(evals_result0, metric='binary_logloss', dataset_names=['v2'])

        evals_result1 = {}
        gbm1 = lgb.train(self.params, self.train_data,
                         num_boost_round=10,
                         evals_result=evals_result1,
                         verbose_eval=False)
        self.assertRaises(ValueError, lgb.plot_metric, evals_result1)

        gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
        gbm2.fit(self.X_train, self.y_train, eval_set=[(self.X_test, self.y_test)], verbose=False)
        ax2 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
        self.assertIsInstance(ax2, matplotlib.axes.Axes)
        self.assertEqual(ax2.get_title(), '')
        self.assertEqual(ax2.get_xlabel(), '')
        self.assertEqual(ax2.get_ylabel(), '')
Example #3
 def test_early_stopping(self):
     X, y = load_breast_cancer(True)
     params = {
         'objective': 'binary',
         'metric': 'binary_logloss',
         'verbose': -1
     }
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     lgb_train = lgb.Dataset(X_train, y_train)
     lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
     valid_set_name = 'valid_set'
     # early stopping does not trigger within the 10 rounds
     gbm = lgb.train(params, lgb_train,
                     num_boost_round=10,
                     valid_sets=lgb_eval,
                     valid_names=valid_set_name,
                     verbose_eval=False,
                     early_stopping_rounds=5)
     self.assertEqual(gbm.best_iteration, 10)
     self.assertIn(valid_set_name, gbm.best_score)
     self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
     # early stopping occurs
     gbm = lgb.train(params, lgb_train,
                     valid_sets=lgb_eval,
                     valid_names=valid_set_name,
                     verbose_eval=False,
                     early_stopping_rounds=5)
     self.assertLessEqual(gbm.best_iteration, 100)
     self.assertIn(valid_set_name, gbm.best_score)
     self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
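
A short follow-up sketch (reusing `gbm` and `X_test` from above): with early stopping enabled, predictions are usually restricted to the best iteration.

y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)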
Example #4
def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=3, eta=0.001):
	params = {}
	params["objective"] = "binary"
	params['metric'] = 'auc'
	params["max_depth"] = dep
	params["min_data_in_leaf"] = 100
	params["learning_rate"] = eta
	params["bagging_fraction"] = 0.7
	params["feature_fraction"] = 0.7
	params["bagging_freq"] = 5
	params["bagging_seed"] = seed_val
	params["verbosity"] = -1
	num_rounds = rounds

	plst = list(params.items())
	lgtrain = lgb.Dataset(train_X, label=train_y)

	if test_y is not None:
		lgtest = lgb.Dataset(test_X, label=test_y)
		model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=100, verbose_eval=20)
	else:
		lgtest = lgb.Dataset(test_X)  # Dataset, not the XGBoost-style DMatrix
		model = lgb.train(params, lgtrain, num_rounds)

	pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
	pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

	loss = 0
	if test_y is not None:
		loss = metrics.roc_auc_score(test_y, pred_test_y)
		print(loss)
		return pred_test_y, loss, pred_test_y2
	else:
		return pred_test_y, loss, pred_test_y2
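
A hypothetical invocation of the function above (the array names are placeholders, not from the original source):

preds_val, val_auc, preds_test = runLGB(X_tr, y_tr, X_val, test_y=y_val,
                                        test_X2=X_test, rounds=1000, dep=4, eta=0.05)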
Example #5
 def test_feature_name(self):
     X, y = load_boston(True)
     X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train)
     feature_names = ['f_' + str(i) for i in range(13)]
     gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names)
     self.assertListEqual(feature_names, gbm.feature_name())
     # test feature names with whitespace: LightGBM replaces spaces with underscores, so the original names come back
     feature_names_with_space = ['f ' + str(i) for i in range(13)]
     gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names_with_space)
     self.assertListEqual(feature_names, gbm.feature_name())
Example #6
def SubmissionSimple(data, submit=False):
  features = [x for x in data.columns if x not in drop_list]

  train = data[data.is_trade.notnull()]
  test = data[data.is_trade.isnull()]

  lgb_train = train[(train['day'] >= 18) & (train['day'] < 24)]
  lgb_valid = train[(train['day'] == 24)]

  lgb_train = lgb.Dataset(lgb_train[features], lgb_train[target], free_raw_data=False)
  lgb_valid = lgb.Dataset(lgb_valid[features], lgb_valid[target], reference=lgb_train, free_raw_data=False)

  print('\ntraining...')
  gbm = lgb.train(params=params, 
                  train_set=lgb_train,
                  valid_sets=[lgb_train, lgb_valid],
                  num_boost_round=10000, 
                  early_stopping_rounds=500,
                  verbose_eval=100)

  if submit:
    print('\npredicting...')
    test['predicted_score'] = gbm.predict(test[features], num_iteration=gbm.best_iteration)
    result = test[['instance_id', 'predicted_score']]
    result = pd.DataFrame(pd.read_csv(wd+test_file[2], sep=' ')['instance_id']).merge(result, on='instance_id', how='left').fillna(0)
    print('\nsaving...')
    result.to_csv(wd+output_file, sep=' ', index=False)
    
  return gbm
Example #7
    def test_plot_importance(self):
        gbm0 = lgb.train(self.params, self.train_data, num_boost_round=10)
        ax0 = lgb.plot_importance(gbm0)
        self.assertIsInstance(ax0, matplotlib.axes.Axes)
        self.assertEqual(ax0.get_title(), 'Feature importance')
        self.assertEqual(ax0.get_xlabel(), 'Feature importance')
        self.assertEqual(ax0.get_ylabel(), 'Features')
        self.assertLessEqual(len(ax0.patches), 30)

        gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
        gbm1.fit(self.X_train, self.y_train)

        ax1 = lgb.plot_importance(gbm1, color='r', title='t', xlabel='x', ylabel='y')
        self.assertIsInstance(ax1, matplotlib.axes.Axes)
        self.assertEqual(ax1.get_title(), 't')
        self.assertEqual(ax1.get_xlabel(), 'x')
        self.assertEqual(ax1.get_ylabel(), 'y')
        self.assertLessEqual(len(ax1.patches), 30)
        for patch in ax1.patches:
            self.assertTupleEqual(patch.get_facecolor(), (1., 0, 0, 1.))  # red

        ax2 = lgb.plot_importance(gbm0, color=['r', 'y', 'g', 'b'],
                                  title=None, xlabel=None, ylabel=None)
        self.assertIsInstance(ax2, matplotlib.axes.Axes)
        self.assertEqual(ax2.get_title(), '')
        self.assertEqual(ax2.get_xlabel(), '')
        self.assertEqual(ax2.get_ylabel(), '')
        self.assertLessEqual(len(ax2.patches), 30)
        self.assertTupleEqual(ax2.patches[0].get_facecolor(), (1., 0, 0, 1.))  # r
        self.assertTupleEqual(ax2.patches[1].get_facecolor(), (.75, .75, 0, 1.))  # y
        self.assertTupleEqual(ax2.patches[2].get_facecolor(), (0, .5, 0, 1.))  # g
        self.assertTupleEqual(ax2.patches[3].get_facecolor(), (0, 0, 1., 1.))  # b
Example #8
    def test_multiclass_prediction_early_stopping(self):
        X, y = load_digits(10, True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'num_class': 10,
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X_train, y_train, params=params)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=50,
                        valid_sets=lgb_eval,
                        verbose_eval=False,
                        evals_result=evals_result)

        pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
        ret = multi_logloss(y_test, gbm.predict(X_test, pred_parameter=pred_parameter))
        self.assertLess(ret, 0.8)
        self.assertGreater(ret, 0.5)  # loss will be higher than when evaluating the full model

        pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 5.5}
        ret = multi_logloss(y_test, gbm.predict(X_test, pred_parameter=pred_parameter))
        self.assertLess(ret, 0.2)
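
Note that `pred_parameter` is the older predict API; in more recent LightGBM releases the same options are forwarded as keyword arguments on `predict` (a sketch, assuming a version where this applies):

ret = multi_logloss(y_test, gbm.predict(X_test, pred_early_stop=True,
                                        pred_early_stop_freq=5,
                                        pred_early_stop_margin=1.5))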
Example #9
 def test_template(params={'objective': 'regression', 'metric': 'l2'},
                   X_y=load_boston(True), feval=mean_squared_error,
                   num_round=100, init_model=None, custom_eval=None,
                   early_stopping_rounds=10,
                   return_data=False, return_model=False):
     params['verbose'], params['seed'] = -1, 42
     X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
     lgb_train = lgb.Dataset(X_train, y_train, params=params)
     lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
     if return_data:
         return lgb_train, lgb_eval
     evals_result = {}
     gbm = lgb.train(params, lgb_train,
                     num_boost_round=num_round,
                     valid_sets=lgb_eval,
                     valid_names='eval',
                     verbose_eval=False,
                     feval=custom_eval,
                     evals_result=evals_result,
                     early_stopping_rounds=early_stopping_rounds,
                     init_model=init_model)
     if return_model:
         return gbm
     else:
         return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))
Example #10
 def train(self, xtra, ytra, xte, yte):
     ytra = ytra.ravel()
     yte = yte.ravel()
     dtrain = lgb.Dataset(xtra, label=ytra)
     dvalid = lgb.Dataset(xte, label=yte)
     # lgb.train takes valid_sets / valid_names rather than XGBoost-style (data, name) tuples
     watchlist = [dtrain, dvalid]
     self.gbdt = lgb.train(self.param, dtrain, self.nrounds,
                           valid_sets=watchlist, valid_names=['train', 'eval'])
Example #11
def main():
    res = []
    num_iterations = params['num_iterations']
    early_stopping_round = params['early_stopping_round']
    print(params)
    for i in range(cnt):
        train_fea = pd.read_csv(root_path + 'train_score_{}.csv'.format(i))
        train_lab = pd.read_csv(root_path + 'label_{}.csv'.format(i))
        train_lab = train_lab.loc[:, 'label'].values

        lgb_train = lgb.Dataset(train_fea, train_lab)

        solver = lgb.train(params, lgb_train, \
                           valid_sets=[lgb_train], \
                           valid_names=['train'], \
                           verbose_eval=True, \
                           num_boost_round=num_iterations, \
                           early_stopping_rounds=early_stopping_round)

        pred_fea = pd.read_csv(root_path + 'res_score.csv')
        pred_fea = pred_fea.drop([i], axis=1).values
        res.append(solver.predict(pred_fea, num_iteration=solver.best_iteration))
        pd.DataFrame(np.array(res).T).to_csv(root_path + \
                                             'res_score2.csv', index=False)

    res = np.mean(res, axis=0)
    pred_pair = pd.read_csv(root_path + 'test1.csv')
    pred_pair['score'] = res
    pred_pair['score'] = pred_pair['score'].apply(lambda x: '{:.6f}'.format(x))
    pred_pair.to_csv(root_path + 'submission-5000-layer2.csv', index=False)
Example #12
    def test_missing_value_handle_none(self):
        x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
        y = [0, 1, 1, 1, 0, 0, 0, 0, 0]

        X_train = np.array(x).reshape(len(x), 1)
        y_train = np.array(y)
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_train, y_train)

        params = {
            'objective': 'regression',
            'metric': 'auc',
            'verbose': -1,
            'boost_from_average': False,
            'min_data': 1,
            'num_leaves': 2,
            'learning_rate': 1,
            'min_data_in_bin': 1,
            'use_missing': False
        }
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=1,
                        valid_sets=lgb_eval,
                        verbose_eval=True,
                        evals_result=evals_result)
        pred = gbm.predict(X_train)
        self.assertAlmostEqual(pred[0], pred[1], places=5)
        self.assertAlmostEqual(pred[-1], pred[0], places=5)
Example #13
    def test_simple(self):
        # Load a dataset already on disk
        iris = load_iris()

        lgb_train = lgb.Dataset(iris.data[:100], iris.target[:100])
        lgb_eval = lgb.Dataset(iris.data[100:], iris.target[100:], reference=lgb_train)

        params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': {'l2', 'auc'},
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': 0
        }

        # Run only one round for faster test
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=1,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=1)

        self.assertEqual(1, gbm.best_iteration)
Example #14
    def test_categorical_handle2(self):
        x = [0, np.nan, 0, np.nan, 0, np.nan]
        y = [0, 1, 0, 1, 0, 1]

        X_train = np.array(x).reshape(len(x), 1)
        y_train = np.array(y)
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_train, y_train)

        params = {
            'objective': 'regression',
            'metric': 'auc',
            'verbose': -1,
            'boost_from_average': False,
            'min_data': 1,
            'num_leaves': 2,
            'learning_rate': 1,
            'min_data_in_bin': 1,
            'min_data_per_group': 1,
            'cat_smooth': 1,
            'cat_l2': 0,
            'max_cat_to_onehot': 1,
            'zero_as_missing': False,
            'categorical_column': 0
        }
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=1,
                        valid_sets=lgb_eval,
                        verbose_eval=True,
                        evals_result=evals_result)
        pred = gbm.predict(X_train)
        np.testing.assert_almost_equal(pred, y)
Example #15
 def test_reference_chain(self):
     X = np.random.normal(size=(100, 2))
     y = np.random.normal(size=100)
     tmp_dat = lgb.Dataset(X, y)
     # take subsets and train
     tmp_dat_train = tmp_dat.subset(np.arange(80))
     tmp_dat_val = tmp_dat.subset(np.arange(80, 100)).subset(np.arange(18))
     params = {'objective': 'regression_l2', 'metric': 'rmse'}
     gbm = lgb.train(params, tmp_dat_train, num_boost_round=20, valid_sets=[tmp_dat_train, tmp_dat_val])
Example #16
def train_gbm_model(train_days_seq, test_day, model_path, best_num, model_type="cv",):
    models_usedfeats_df = pd.read_csv( models_usedfeats_path )
    best_feats = models_usedfeats_df[ models_usedfeats_df['gbm_feats']==1 ]['feat_name'].values

    # removing the saved model here forces retraining in the else-branch below
    if os.path.exists(model_path):
        os.remove(model_path)
        print("Remove file %s." % (model_path))
    if os.path.exists(model_path):
        bst = lgb.Booster(model_file=model_path)
    else:
        ################################# training data ####################################
        data, labels = get_merge_train_data_set(train_days_seq)
        data = data[ best_feats ]
        ################################# evaluation data ##################################
        print("all samples:   %d*%d, pos/neg=%f" % (len(data.index), len(data.columns), 1.0*len(labels[labels.label==1])/len(labels[labels.label==0])))
        ################################# evaluation data ##################################
        X_train, X_test, y_train, y_test = train_test_split(data.values, labels['label'].values, test_size=0.15, random_state=0)
        all_samples_train = lgb.Dataset(data.values, labels['label'].values)
        deval = lgb.Dataset( X_test, y_test )
        columns = data.columns; data = None
        params = {
            'objective': 'regression',
            'boosting_type': 'gbdt',
            'num_leaves': 300,
            'learning_rate': 0.05,
            'verbose': 0,
            'metric': {'binary_logloss'},
            #'device':'gpu',
        }
        num_boost_round = 130
        # if model_type == "cv":
        #     print("start cv, please wait ........")
        #     cv_history = lgb.cv(params, all_samples_train, num_boost_round, nfold=5, seed=2017, metrics={"binary_logloss"}, early_stopping_rounds=10, callbacks=[lgb.callback.print_evaluation(show_stdv=True)])
        #     history_df = pd.DataFrame(cv_history)
        #     num_boost_round = len(history_df.index)
        # else: num_boost_round = 99 # 99
        bst = lgb.train(params, all_samples_train, num_boost_round, valid_sets = deval )
        bst.save_model(model_path)
        #feature_importance2file(bst, history_df, num_boost_round, feature_importance_file_path, columns, model_name='gbm')
    ############################## check accuracy #######################################
    test_data, test_labels_df = make_train_set(test_day, test_day+1000000)
    test_data = test_data[ best_feats ]
    print("all samples:   %d*%d." % (len(test_data.index), len(test_data.columns)))
    y = bst.predict( test_data.values )
    report( test_labels_df['label'], y )
    test_labels_df, test_data = None, None  # free memory
    exit()  # note: the submission-generation code below never runs
    ############################ generate submission file ###############################
    test_data, test_labels = make_train_set(31000000, 32000000, sub=True)
    instanceID = test_data['instanceID'].copy(); del test_data['instanceID']
    test_data = test_data[ best_feats ]
    print("sub samples:   %d*%d" % (len(test_data.index), len(test_data.columns)))
    y = bst.predict( test_data.values )
    pred = pd.concat([instanceID, pd.Series(y, name='prob')], axis=1)
    pred = pred.sort_values('instanceID', ascending=True)
    fun = lambda x: 0.0 if x < 0 else x   # clip negatives (why does the prediction still contain negative values?)
    pred['prob'] = pred['prob'].map(fun)
    pred.to_csv('./sub/submission.csv', index=False, index_label=False)
Example #17
def lgb_train_predict(train_x, train_y, test_x, params, rounds):
    start = time.time()  # time.clock() was removed in Python 3.8
    log(str(train_x.columns))
    dtrain = lgb.Dataset(train_x, label=train_y)
    valid_sets = [dtrain]
    model = lgb.train(params, dtrain, rounds, valid_sets, feval=eval_auc_f1, verbose_eval=5)
    pred = model.predict(test_x)
    elapsed = (time.time() - start)
    log('Time used:' + str(elapsed) + 's')
    return model, pred
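
A hypothetical call (placeholder names; the params dict must carry the objective, and `eval_auc_f1` is assumed to be defined elsewhere in the project):

model, pred = lgb_train_predict(train_x, train_y, test_x,
                                {'objective': 'binary', 'verbose': -1},
                                rounds=200)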
Example #18
 def train(self, x_train, y_train, x_val, y_val):
     print('train with lgb model')
     lgbtrain = lgb.Dataset(x_train, y_train)
     lgbval = lgb.Dataset(x_val, y_val)
     model = lgb.train(self.params,
                       lgbtrain,
                       valid_sets = lgbval,
                       verbose_eval = self.num_boost_round,
                       num_boost_round = self.num_boost_round,
                       early_stopping_rounds = self.early_stopping_rounds)
Example #19
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target', objective='binary', metrics='auc',
                      feval=None, early_stopping_rounds=20, num_boost_round=3000, verbose_eval=10,
                      categorical_features=None):
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric': metrics,
        'learning_rate': 0.2,
        'num_leaves': 31,  # should be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # minimum number of data needed in a child (min_data_in_leaf)
        'max_bin': 255,  # number of bucketed bins for feature values
        'subsample': 0.6,  # subsample ratio of the training instances
        'subsample_freq': 0,  # frequency of subsampling; <=0 disables it
        'colsample_bytree': 0.3,  # subsample ratio of columns when constructing each tree
        'min_child_weight': 5,  # minimum sum of instance weight (hessian) needed in a child (leaf)
        'subsample_for_bin': 200000,  # number of samples for constructing bins
        'min_split_gain': 0,  # minimum gain to perform a split (min_gain_to_split)
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0
    }

    lgb_params.update(params)

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )

    evals_results = {}

    bst1 = lgb.train(lgb_params,
                     xgtrain,
                     valid_sets=[xgtrain, xgvalid],
                     valid_names=['train', 'valid'],
                     evals_result=evals_results,
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=verbose_eval,
                     feval=feval)

    print("\nModel Report")
    print("bst1.best_iteration: ", bst1.best_iteration)
    print(metrics + ":", evals_results['valid'][metrics][bst1.best_iteration - 1])

    return (bst1, bst1.best_iteration)
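
A hypothetical invocation (DataFrames, column list, and target name are placeholders, not from the original source):

bst, best_iter = lgb_modelfit_nocv({'learning_rate': 0.1}, df_train, df_valid,
                                   predictors=feature_cols, target='clicked',
                                   objective='binary', metrics='auc',
                                   early_stopping_rounds=30, num_boost_round=500)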
Example #20
def main():
    num_iterations = params['num_iterations']
    early_stopping_round = params['early_stopping_round']
    print(params)
    for i in range(cnt):
        train_fea = scipy.sparse.load_npz(root_path + 'train_{}.npz'.format(i))
        train_lab = pd.read_csv(root_path + \
            'label_{}.csv'.format(i)).loc[:, 'label'].values

        valid_fea = scipy.sparse.load_npz(root_path + 'train_{}.npz'.format((i + 1) % cnt))
        valid_lab = pd.read_csv(root_path + \
            'label_{}.csv'.format((i + 1) % cnt)).loc[:, 'label'].values

        lgb_train = lgb.Dataset(train_fea, label=train_lab)
        lgb_valid = lgb.Dataset(valid_fea, label=valid_lab, reference=lgb_train)

        print('training cnt={}/{}'.format(i + 1, cnt))

        solver = lgb.train(params, lgb_train, \
            valid_sets=[lgb_train, lgb_valid], \
            valid_names=['train', 'valid'], \
            verbose_eval=True, \
            num_boost_round=num_iterations, \
            early_stopping_rounds=early_stopping_round)

        pred_fea = scipy.sparse.load_npz(root_path + 'pred_{}.npz'.format(i))
        pred_label = solver.predict(pred_fea, num_iteration=solver.best_iteration)
        if os.path.exists(root_path + 'res_score.csv'):
            res = list(pd.read_csv(root_path + 'res_score.csv').values.T)
        else:
            res = []
        res.append(pred_label)
        pd.DataFrame(np.array(res).T).to_csv(root_path + \
            'res_score.csv', index=False)

        for j in range(cnt):
            if j == i:
                continue
            pred_fea = scipy.sparse.load_npz(root_path + 'train_{}.npz'.format(j))
            pred_label = solver.predict(pred_fea, num_iteration=solver.best_iteration)
            if os.path.exists(root_path + 'train_score_{}.csv'.format(j)):
                train_res = list(pd.read_csv(root_path + \
                                'train_score_{}.csv'.format(j)).values.T)
            else:
                train_res = []
            train_res.append(pred_label)
            pd.DataFrame(np.array(train_res).T).to_csv(root_path + \
                'train_score_{}.csv'.format(j), index=False)
        gc.collect()

    res = np.mean(res, axis=0)
    pred_pair = pd.read_csv(root_path + 'test1.csv')
    pred_pair['score'] = res
    pred_pair['score'] = pred_pair['score'].apply(lambda x: '{:.6f}'.format(x))
    pred_pair.to_csv(root_path + 'submission-5000.csv', index=False)
Example #21
 def test_template(init_model=None, return_model=False):
     X, y = load_boston(True)
     params = {
         'objective': 'regression',
         'metric': 'l2',
         'verbose': -1
     }
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     lgb_train = lgb.Dataset(X_train, y_train)
     gbm_template = lgb.train(params, lgb_train, num_boost_round=10, init_model=init_model)
     return gbm_template if return_model else mean_squared_error(y_test, gbm_template.predict(X_test))
Example #22
 def test_pandas_categorical(self):
     import pandas as pd
     X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),  # str
                       "B": np.random.permutation([1, 2, 3] * 100),  # int
                       "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60),  # float
                       "D": np.random.permutation([True, False] * 150)})  # bool
     y = np.random.permutation([0, 1] * 150)
     X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
                            "B": np.random.permutation([1, 3] * 30),
                            "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
                            "D": np.random.permutation([True, False] * 30)})
     for col in ["A", "B", "C", "D"]:
         X[col] = X[col].astype('category')
         X_test[col] = X_test[col].astype('category')
     params = {
         'objective': 'binary',
         'metric': 'binary_logloss',
         'verbose': -1
     }
     lgb_train = lgb.Dataset(X, y)
     gbm0 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False)
     pred0 = list(gbm0.predict(X_test))
     lgb_train = lgb.Dataset(X, y)
     gbm1 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
                      categorical_feature=[0])
     pred1 = list(gbm1.predict(X_test))
     lgb_train = lgb.Dataset(X, y)
     gbm2 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
                      categorical_feature=['A'])
     pred2 = list(gbm2.predict(X_test))
     lgb_train = lgb.Dataset(X, y)
     gbm3 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
                      categorical_feature=['A', 'B', 'C', 'D'])
     pred3 = list(gbm3.predict(X_test))
     gbm3.save_model('categorical.model')
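     # the pandas categorical mapping is saved inside the model file, so the
     # reloaded Booster below is expected to reproduce gbm0's predictions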
     gbm4 = lgb.Booster(model_file='categorical.model')
     pred4 = list(gbm4.predict(X_test))
     np.testing.assert_almost_equal(pred0, pred1)
     np.testing.assert_almost_equal(pred0, pred2)
     np.testing.assert_almost_equal(pred0, pred3)
     np.testing.assert_almost_equal(pred0, pred4)
Example #23
    def test_plot_metrics(self):
        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=1)
        train_data = lgb.Dataset(X_train, y_train)
        test_data = lgb.Dataset(X_test, y_test, reference=train_data)

        params = {
            "objective": "binary",
            "metric": {"binary_logloss", "binary_error"},
            "verbose": -1,
            "num_leaves": 3
        }

        evals_result0 = {}
        gbm0 = lgb.train(params, train_data,
                         valid_sets=[train_data, test_data],
                         valid_names=['v1', 'v2'],
                         num_boost_round=10,
                         evals_result=evals_result0,
                         verbose_eval=False)
        ax0 = lgb.plot_metric(evals_result0)
        self.assertIsInstance(ax0, matplotlib.axes.Axes)
        self.assertEqual(ax0.get_title(), 'Metric during training')
        self.assertEqual(ax0.get_xlabel(), 'Iterations')
        self.assertIn(ax0.get_ylabel(), {'binary_logloss', 'binary_error'})
        ax0 = lgb.plot_metric(evals_result0, metric='binary_error')
        ax0 = lgb.plot_metric(evals_result0, metric='binary_logloss', dataset_names=['v2'])

        evals_result1 = {}
        gbm1 = lgb.train(params, train_data,
                         num_boost_round=10,
                         evals_result=evals_result1,
                         verbose_eval=False)
        self.assertRaises(ValueError, lgb.plot_metric, evals_result1)

        gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
        gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
        ax2 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
        self.assertIsInstance(ax2, matplotlib.axes.Axes)
        self.assertEqual(ax2.get_title(), '')
        self.assertEqual(ax2.get_xlabel(), '')
        self.assertEqual(ax2.get_ylabel(), '')
Example #24
def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, dep=10, seed=0, rounds=20000): 
	params = {}
	params["objective"] = "regression"
	params['metric'] = 'rmse'
	params["max_depth"] = dep
	params["min_data_in_leaf"] = 100
	params["learning_rate"] = 0.04
	params["bagging_fraction"] = 0.7
	params["feature_fraction"] = 0.5
	params["bagging_freq"] = 5
	params["bagging_seed"] = seed
	#params["lambda_l2"] = 0.01
	params["verbosity"] = -1
	num_rounds = rounds

	plst = list(params.items())
	lgtrain = lgb.Dataset(train_X, label=train_y)

	if test_y is not None:
		lgtest = lgb.Dataset(test_X, label=test_y)
		model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=200, verbose_eval=100)
	else:
		lgtest = lgb.Dataset(test_X)
		model = lgb.train(params, lgtrain, num_rounds)

	pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
	pred_test_y2 = None  # avoid a NameError below when test_y is given but test_X2 is not
	if test_X2 is not None:
		pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)
	imps = model.feature_importance()
	names = model.feature_name()
	for fi, fn in enumerate(names):
		print(fn, imps[fi])

	loss = 0
	if test_y is not None:
		loss = np.sqrt(metrics.mean_squared_error(test_y, pred_test_y))
		print(loss)
		return pred_test_y, loss, pred_test_y2, model.best_iteration
	else:
		return pred_test_y
Example #25
    def create_model(self, kfold_X_train, y_train, kfold_X_valid, y_test, test):

        dtrain = lgbm.Dataset(kfold_X_train, label=y_train)
        dwatch = lgbm.Dataset(kfold_X_valid, label=y_test)

        best = lgbm.train(self.params, dtrain, num_boost_round=4000, verbose_eval=100, valid_sets=dwatch,
                          early_stopping_rounds=100)
        # predict on the validation fold

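        # depending on the LightGBM version, predict() may or may not default to the
        # best iteration after early stopping; passing num_iteration=best.best_iteration
        # would make that explicit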
        pred = best.predict(kfold_X_valid)
        results = best.predict(test)

        return pred, results, best
Example #26
 def test_continue_train_multiclass(self):
     X, y = load_iris(True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
         'objective': 'multiclass',
         'metric': 'multi_logloss',
         'num_class': 3,
         'verbose': -1
     }
     lgb_train = lgb.Dataset(X_train, y_train, params=params, free_raw_data=False)
     lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params, free_raw_data=False)
     init_gbm = lgb.train(params, lgb_train, num_boost_round=20)
     evals_result = {}
     gbm = lgb.train(params, lgb_train,
                     num_boost_round=30,
                     valid_sets=lgb_eval,
                     verbose_eval=False,
                     evals_result=evals_result,
                     init_model=init_gbm)
     ret = multi_logloss(y_test, gbm.predict(X_test))
     self.assertLess(ret, 1.5)
     self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5)
Example #27
File: w_lgb.py Project: elaeon/ML
    def prepare_model(self, obj_fn=None, num_steps: int = 0, model_params=None, batch_size: int = None):
        data_train = self.ds[self.data_groups["data_train_group"]].to_ndarray()
        target_train = self.ds[self.data_groups["target_train_group"]].to_ndarray()
        data_val = self.ds[self.data_groups["data_validation_group"]].to_ndarray()
        target_val = self.ds[self.data_groups["target_validation_group"]].to_ndarray()
        columns = None
        data_train_ds = lgb.Dataset(data_train, label=target_train, feature_name=columns)
        data_valid_ds = lgb.Dataset(data_val, label=target_val, feature_name=columns)

        num_round = num_steps
        bst = lgb.train(model_params, data_train_ds, num_round, valid_sets=[data_valid_ds],
                        early_stopping_rounds=num_round // 2, feval=obj_fn, verbose_eval=True)
        return self.ml_model(lgb, bst=bst)
Example #28
def scores_cv_params(params):
    train_scores = []
    valid_scores = []
    for train_ind, valid_ind in zip(train_index, valid_index):
        train_x = train_data[train_ind, :]
        train_y = train_label[list(train_ind)]
        valid_x = train_data[valid_ind, :]
        valid_y = train_label[list(valid_ind)]
        df_train = lgb.Dataset(train_x, label=train_y)
        lmodel = lgb.train(params, df_train)
        valid_pred = lmodel.predict(valid_x)
        valid_scores.append(metrics.mean_squared_error(valid_y, valid_pred))
    return np.mean(valid_scores)
Example #29
 def train_and_get_predictions(features, labels):
     dataset = lgb.Dataset(features, label=labels)
     lgb_params = {
         'application': 'binary',
         'verbose': -1,
         'min_data': 5,
     }
     lgbm_model = lgb.train(
         params=lgb_params,
         train_set=dataset,
         num_boost_round=10,
     )
     predictions = lgbm_model.predict(features)
     return predictions
Example #30
def train_and_validate_lightgbm(params, train_features, train_labels, validation_features, num_boost_round):
    n_classes = train_labels.shape[1]
    y_val_pred = np.zeros((validation_features.shape[0], n_classes))
    time_results = defaultdict(list)
    for class_i in tqdm(range(n_classes)):
        lgb_train = lgb.Dataset(train_features, train_labels[:, class_i], free_raw_data=False)
        with Timer() as t:
            model = lgb.train(params, lgb_train, num_boost_round=num_boost_round)
        time_results['train_time'].append(t.interval)
        
        with Timer() as t:
            y_val_pred[:, class_i] = model.predict(validation_features)
        time_results['test_time'].append(t.interval)
        
    return y_val_pred, time_results
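
A hypothetical call (placeholder array names; `train_labels` is assumed to be a binary indicator matrix of shape (n_samples, n_classes)):

params = {'objective': 'binary', 'verbose': -1}
y_val_pred, times = train_and_validate_lightgbm(params, X_train, Y_train,
                                                X_valid, num_boost_round=50)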
Example #31
def main_lgbm(fold_offset):

    for fold_id, (train_index, test_index) in enumerate(
            KFold(n_splits=10).split(all_races_train)):
        all_races_train_train = all_races_train[train_index]
        all_races_train_valid = all_races_train[test_index]
        all_races_rank_train_train = []
        all_races_query_train_train = []
        all_races_target_train_train = []
        all_races_rank_train_valid = []
        all_races_query_train_valid = []
        all_races_target_train_valid = []
        get_race_gets(all_races_train_train, all_races_rank_train_train,
                      all_races_query_train_train,
                      all_races_target_train_train)
        get_race_gets(all_races_train_valid, all_races_rank_train_valid,
                      all_races_query_train_valid,
                      all_races_target_train_valid)
        all_races_rank_train_train = np.array(all_races_rank_train_train)
        all_races_query_train_train = np.array(all_races_query_train_train)
        all_races_target_train_train = np.array(all_races_target_train_train)
        all_races_rank_train_valid = np.array(all_races_rank_train_valid)
        all_races_query_train_valid = np.array(all_races_query_train_valid)
        all_races_target_train_valid = np.array(all_races_target_train_valid)

        lgbm_params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'lambdarank',
            'metric': 'ndcg',  # for lambdarank
            'ndcg_eval_at': [1, 2, 3],  # for lambdarank
            'max_position': max_position,  # for lambdarank
            'learning_rate': 1e-8,
            'min_data': 1,
            'min_data_in_bin': 1,
        }
        lgtrain = lgb.Dataset(all_races_rank_train_train,
                              all_races_target_train_train,
                              categorical_feature=[0, 1, 2, 3, 4, 7] +
                              list(range(8, 23)),
                              group=all_races_query_train_train)
        lgvalid = lgb.Dataset(all_races_rank_train_valid,
                              all_races_target_train_valid,
                              categorical_feature=[0, 1, 2, 3, 4, 7] +
                              list(range(8, 23)),
                              group=all_races_query_train_valid)
        lgb_clf = lgb.train(lgbm_params,
                            lgtrain,
                            categorical_feature=[0, 1, 2, 3, 4] +
                            list(range(6, 21)),
                            num_boost_round=10,
                            valid_sets=[lgtrain, lgvalid],
                            valid_names=['train', 'valid'],
                            early_stopping_rounds=2,
                            verbose_eval=1)

        if len(test_src) > 0:
            dst = norm_racedata(lgb_clf.predict(all_races_rank_test),
                                all_races_query_test)
            for dst_ind in range(len(dst)):
                test_validation_regression[dst_ind][fold_offset +
                                                    fold_id] = dst[dst_ind]
            cur_pos = 0
        if len(in_data) != 0 and len(in_meta) != 0:
            dst = norm_racedata(lgb_clf.predict(predict_races_target),
                                [len(predict_races_target)])
            for dst_ind in range(len(dst)):
                predict_validation_regression[dst_ind][fold_offset +
                                                       fold_id] = dst[dst_ind]
Example #32
    def epoch_train(self,
                    dataloader,
                    run_num,
                    is_multi_label=False,
                    info=None,
                    time_remain=None):
        self.is_multi_label = is_multi_label
        X, y, train_idxs, cat = dataloader['X'], dataloader['y'], dataloader[
            'train_idxs'], dataloader['cat_cols']
        train_x, train_y = X.loc[train_idxs], y[train_idxs]

        if info['mode'] == 'bagging':
            self.hyperparams = info['lgb'].copy()
            self.hyperparams['seed'] = np.random.randint(0, 2020)
            num_leaves = self.hyperparams['num_leaves']
            self.hyperparams['num_leaves'] += np.random.randint(
                -int(num_leaves / 10), int(num_leaves / 10))
            run_num = 0

        if run_num == self.explore_params_round:
            print('lgb explore_params_round')
            train_x, train_y, val_x, val_y = self.split_data(train_x, train_y)

            self.log_feat_importances()

            if train_x.shape[1] > 300 and train_x.shape[0] > 20000:
                train_x = train_x[self.import_cols[:300]]
                val_x = val_x[self.import_cols[:300]]
                log('explore_params_round sample 300 cols')
                train_x.reset_index(drop=True, inplace=True)
                train_x = train_x.sample(n=20000)
                train_y = train_y[list(train_x.index)]
                log('explore_params_round sample 20000 samples')

            elif train_x.shape[0] > 20000:
                train_x.reset_index(drop=True, inplace=True)
                train_x = train_x.sample(n=20000)
                train_y = train_y[list(train_x.index)]
                log('explore_params_round sample 20000 samples')

            elif train_x.shape[1] > 300:
                train_x = train_x[self.import_cols[:300]]
                val_x = val_x[self.import_cols[:300]]
                log('explore_params_round sample 300 cols')

            print('shape: ', train_x.shape)

            self.bayes_opt(train_x, val_x, train_y, val_y, cat, phase=1)
            self.early_stop_opt(train_x, val_x, train_y, val_y, cat)
            info['lgb'] = self.hyperparams.copy()
            info['imp_cols'] = self.import_cols

        if run_num == self.ensemble_num:
            print('lgb ensemble_num')
            splits = dataloader['splits']
            for i in range(len(splits)):
                train_idxs, val_idxs = splits[i]
                train_x, train_y = X.loc[train_idxs], y[train_idxs]
                hyperparams = self.hyperparams.copy()
                # num_leaves = hyperparams['num_leaves']
                # num_leaves += np.random.randint(-int(num_leaves/10), int(num_leaves/10))
                # hyperparams['num_leaves'] = num_leaves
                # log('model {} leaves {}'.format(i, num_leaves))
                if self.is_multi_label:
                    self.en_models = defaultdict(list)
                    for cls in range(self.num_class):
                        cls_y = train_y[:, cls]
                        lgb_train = lgb.Dataset(train_x, cls_y)
                        if not self.learning_rates:
                            self.en_models[i].append(
                                lgb.train({**self.params, **hyperparams},
                                          train_set=lgb_train))
                        else:
                            self.en_models[i].append(
                                lgb.train({**self.params, **hyperparams},
                                          train_set=lgb_train,
                                          learning_rates=self.learning_rates))
                else:
                    lgb_train = lgb.Dataset(train_x, ohe2cat(train_y))
                    if not self.learning_rates:
                        self.en_models[i] = lgb.train(
                            {**self.params, **hyperparams},
                            train_set=lgb_train)
                    else:
                        self.en_models[i] = lgb.train(
                            {**self.params, **hyperparams},
                            train_set=lgb_train,
                            learning_rates=self.learning_rates)
                self.ensemble_pred = True

        else:
            print('lgb norm train')
            train_x, train_y = X.loc[train_idxs], y[train_idxs]
            hyperparams = self.hyperparams.copy()
            log('hyperparams {}'.format(hyperparams))
            if run_num == self.all_data_round_pre or run_num == self.all_data_round:
                print('lgb all data round')
                all_train_idxs = dataloader['all_train_idxs']
                train_x = X.loc[all_train_idxs]
                train_y = y[all_train_idxs]
            print('shape: ', train_x.shape)
            if not is_multi_label:
                lgb_train = lgb.Dataset(train_x, ohe2cat(train_y))
                if not self.learning_rates:
                    self._model = lgb.train({**self.params, **hyperparams},
                                            train_set=lgb_train)
                else:
                    self._model = lgb.train({**self.params, **hyperparams},
                                            train_set=lgb_train,
                                            learning_rates=self.learning_rates)
            else:
                self.params['num_class'] = 2
                for cls in range(self.num_class):
                    cls_y = train_y[:, cls]
                    lgb_train = lgb.Dataset(train_x, cls_y)
                    if not self.learning_rates:
                        self.models[cls] = lgb.train(
                            {**self.params, **self.hyperparams},
                            train_set=lgb_train)
                    else:
                        self.models[cls] = lgb.train(
                            {**self.params, **self.hyperparams},
                            train_set=lgb_train,
                            learning_rates=self.learning_rates)
            self.log_feat_importances()
            if self.imp_nums is not None:
                info['imp_nums'] = self.imp_nums
Example #33
    'data_random_seed': 1,
    'bagging_fraction': 0.5,
    'nthread': 4
}
params2 = {
    'learning_rate': 0.85,
    'application': 'regression',
    'max_depth': 3,
    'num_leaves': 130,
    'verbosity': -1,
    'metric': 'RMSE',
    'data_random_seed': 2,
    'bagging_fraction': 1,
    'nthread': 4
}
model = lgb.train(params, train_set=d_train, num_boost_round=7500, valid_sets=watchlist, \
early_stopping_rounds=1000, verbose_eval=1000)
X_test = get_feature(test_x)
predsL = model.predict(X_test)
print('[{}] Predict lgb 1 completed.'.format(time.time() - start_time))
train_X2, valid_X2, train_y2, valid_y2 = train_test_split(X,
                                                          y,
                                                          test_size=0.1,
                                                          random_state=101)
d_train2 = lgb.Dataset(train_X2, label=train_y2)
d_valid2 = lgb.Dataset(valid_X2, label=valid_y2)
watchlist2 = [d_train2, d_valid2]
model = lgb.train(params2, train_set=d_train2, num_boost_round=6000, valid_sets=watchlist2, \
early_stopping_rounds=500, verbose_eval=500)
predsL2 = model.predict(X_test)
print('[{}] Predict lgb 2 completed.'.format(time.time() - start_time))
preds = (predsL * 0.5 + predsL2 * 0.5)
Example #34
def main():
    start_time = time.time()
    # stop words; any wording to strip can be added here
    stopwords = set([
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
        'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
        'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
        'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
        'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
        'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
        'with', 'about', 'against', 'between', 'into', 'through', 'during',
        'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
        'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
        'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
        'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
        'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
        't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm',
        'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn',
        'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan',
        'shouldn', 'wasn', 'weren', 'won', 'wouldn', '&', 'brand new', 'new',
        '[rm]', 'free ship.*?', 'rm', 'price firm', 'no description yet'
    ])

    pattern = re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*')
    train = pd.read_csv('../input/train.tsv',
                        sep="\t",
                        encoding='utf-8',
                        converters={
                            'item_description':
                            lambda x: pattern.sub('', x.lower()),
                            'name':
                            lambda x: pattern.sub('', x.lower())
                        })
    print("finished to load train file : {}".format(time.time() - start_time))
    test = pd.read_csv('../input/test.tsv',
                       sep="\t",
                       encoding='utf-8',
                       converters={
                           'item_description':
                           lambda x: pattern.sub('', x.lower()),
                           'name': lambda x: pattern.sub('', x.lower())
                       })
    print("finished to load test file : {}".format(time.time() - start_time))
    train_label = np.log1p(train['price'])
    print("finished to log price : {}".format(time.time() - start_time))
    train_texts = train['name'].tolist()
    test_texts = test['name'].tolist()
    handle_missing(train)
    handle_missing(test)
    print("finished to handle missing : {}".format(time.time() - start_time))
    handle_nm_word_len(train)
    handle_nm_word_len(test)
    handle_desc_word_len(train)
    handle_desc_word_len(test)
    handle_nm_len(train)
    handle_nm_len(test)
    handle_desc_len(train)
    handle_desc_len(test)
    print("finished to handle len : {}".format(time.time() - start_time))
    #    print(train.describe())
    nrow_train = train.shape[0]
    handle_category(train)
    handle_category(test)
    print("finished to handle category : {}".format(time.time() - start_time))
    count = CountVectorizer(min_df=NAME_MIN_DF)
    X_name_mix = count.fit_transform(train['name'].append(test['name']))
    X_name = X_name_mix[:nrow_train]
    X_t_name = X_name_mix[nrow_train:]
    tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                         ngram_range=(1, 3),
                         stop_words='english')
    gc.collect()
    X_description_mix = tv.fit_transform(train['item_description'].append(
        test['item_description']))
    print("finished to handle tfidf : {}".format(time.time() - start_time))
    X_description = X_description_mix[:nrow_train]
    X_t_description = X_description_mix[nrow_train:]
    print("finished to handle description : {}".format(time.time() -
                                                       start_time))
    #handle label encoder
    cat_features = [
        'subcat_2', 'subcat_1', 'subcat_0', 'brand_name', 'category_name',
        'item_condition_id', 'shipping'
    ]
    handle_laberencoder(train, test, cat_features)
    X_cat, X_test_cat = handle_onehot(train, test, cat_features)
    #   print(train.describe())
    print("finished to label encoder : {}".format(time.time() - start_time))

    train_feature = ['desc_word_len', 'nm_word_len', 'desc_len', 'nm_len']
    train_list = [train[train_feature].values, X_description, X_name, X_cat]
    test_list = [
        test[train_feature].values, X_t_description, X_t_name, X_test_cat
    ]
    X = ssp.hstack(train_list).tocsr()
    X_test = ssp.hstack(test_list).tocsr()
    print("finished to handle features : {}".format(time.time() - start_time))

    kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=128)

    learning_rate = 0.8
    num_leaves = 128
    min_data_in_leaf = 1000
    feature_fraction = 0.5
    bagging_fraction = 0.9
    bagging_freq = 1000
    num_boost_round = 1000
    params = {
        "objective": "regression",
        "boosting_type": "gbdt",
        "learning_rate": learning_rate,
        "num_leaves": num_leaves,
        "feature_fraction": feature_fraction,
        "bagging_freq": bagging_freq,
        "bagging_fraction": bagging_fraction,
        "verbosity": 0,
        "metric": "l2_root",
        "nthread": 4,
        "subsample": 0.9
    }

    test_id = test['test_id']
    cv_pred = np.zeros(len(test_id))

    kf = kfold.split(X)
    for i, (train_fold, test_fold) in enumerate(kf):
        X_train, X_validate, label_train, label_validate = \
                X[train_fold, :], X[test_fold, :], train_label[train_fold], train_label[test_fold]
        dtrain = lgbm.Dataset(X_train, label_train)
        print('dtrain time: {}'.format(time.time() - start_time))
        dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
        print('dvalid time: {}'.format(time.time() - start_time))
        bst = lgbm.train(params,
                         dtrain,
                         num_boost_round,
                         valid_sets=dvalid,
                         verbose_eval=100,
                         early_stopping_rounds=100)
        print('train time: {}'.format(time.time() - start_time))
        cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
        print('predict time', time.time() - start_time)
        gc.collect()
    cv_pred /= NFOLDS
    cv_pred = np.expm1(cv_pred)
    submission = test[["test_id"]]
    submission["price"] = cv_pred
    submission.to_csv("./submission.csv", index=False)
    print('done', time.time() - start_time)
Example #35
def done(istrain=True):

    #    op=['num_trees','max_depth','max_bin','bagging_fraction','lambda']
    #    cv_params['num_trees'] = 315
    cv_params['num_trees'] = 300
    #    cv_params['num_leaves'] = 50
    #    cv_params['max_depth'] = 6
    #    op=['max_bin','bagging_fraction','lambda']
    op = ['x']
    ### start training
    logging.debug('setting parameters')
    if istrain:
        for i in [100, 799]:
            train_save, val_save, val_x, val_y = tiny_lightgbm_data_get_train(
                i)
            for oper in op:
                logging.debug("CV:" + oper)
                modelfit_cv(train_save, cv_type=oper)
                ret = dump(
                    cv_params, FLAGS.out_data_path + 'cv_params_' + oper +
                    'lgbm.joblib_dat')
            logging.debug("开始训练")
            try:
                init_model = load(FLAGS.out_data_path + '1-' + str(i) +
                                  '-lgbm.model.joblib_dat')
            except:
                init_model = None
            gbm = lgb.train(
                cv_params,  # parameter dict
                train_save,  # training set
                num_boost_round=1000,  # number of boosting iterations
                valid_sets=val_save,  # validation set
                init_model=init_model,
                #                        init_model=None,
                #                        learning_rates=0.01,
                verbose_eval=True,
                early_stopping_rounds=60)  # early stopping rounds

            logging.debug("to save validation predictions ...")
            ret = dump(
                gbm,
                FLAGS.out_data_path + '1-' + str(i) + '-lgbm.model.joblib_dat')
            logging.debug(ret)

            ### validation
            logging.debug("validating")
            preds_offline = gbm.predict(
                val_x, num_iteration=gbm.best_iteration)  # output probabilities

            logging.debug('log_loss:')
            logging.debug(log_loss(val_y, preds_offline))

            ### feature selection
            df = pd.DataFrame(val_x.columns.tolist(), columns=['feature'])
            df['importance'] = list(gbm.feature_importance())  # feature scores
            df = df.sort_values(by='importance', ascending=False)  # sort by importance
            df.to_csv(FLAGS.out_data_path + 'feature_score.csv',
                      index=None,
                      encoding='utf-8')  # save the scores

            del train_save, val_save, val_x, val_y

    else:
        for i in [100, 799]:
            gbm = load(FLAGS.out_data_path + '1-' + str(i) +
                       '-lgbm.model.joblib_dat')
            #        logging.debug(gbm.get_params())
            ### 线下预测
            test_save = tiny_lightgbm_data_get_test()
            logging.debug("预测")
            dtrain_predprob = gbm.predict(
                test_save, num_iteration=gbm.best_iteration)  # 输出概率

            logging.debug(dtrain_predprob)
            y_pred = [round(value, 4) for value in dtrain_predprob]
            logging.debug('-' * 30)
            y_pred = np.array(y_pred).reshape(-1, 1)
            logging.debug(y_pred.shape)
            test_id = pd.read_csv(FLAGS.test_id_path + 'test_id.csv')
            logging.debug(test_id['id'].shape)
            test_id['id'] = test_id['id'].map(int)
            test_id['click'] = y_pred
            test_id.to_csv(FLAGS.out_data_path + '1-' + str(i) +
                           '-lgbm.test.csv',
                           index=False)

            del test_save
Exemplo n.º 36
0
    'metric': {'binary_logloss'},
    'num_leaves': 63,
    'num_trees': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
}

# number of leaves, will be used in feature transformation
num_leaf = 63

print('Start training...')
# train
gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_train)

print('Save model...')
# save model to file
gbm.save_model('model.txt')

print('Start predicting...')
# predict and get data on leaves, training data
y_pred = gbm.predict(X_train, pred_leaf=True)

# feature transformation and write result
print('Writing transformed training data')
transformed_training_matrix = np.zeros(
    [len(y_pred), len(y_pred[0]) * num_leaf], dtype=np.int64)
for i in range(0, len(y_pred)):
    temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])  # leaf indices from pred_leaf are 0-based, so no -1 offset
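
The loop above is cut off before the computed indices are used. Judging from LightGBM's own feature-transformation example, the missing write-back is a single line, sketched here:

    transformed_training_matrix[i][temp] += 1  # one-hot: mark the leaf each tree routed sample i to

Roughly the same matrix can be built with scikit-learn's OneHotEncoder(categories=[np.arange(num_leaf)] * len(y_pred[0])) applied to y_pred directly, at the cost of a sparse rather than dense result.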
Exemplo n.º 37
0
def train(x_train, y_train, x_valid, y_valid):

    usecols = x_train.columns.values
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=871)

    all_params = {
        'min_child_weight': [25],
        'subsample': [0.7],
        'subsample_freq': [1],
        'seed': [114],
        'colsample_bytree': [0.6],
        'learning_rate': [0.1],
        'max_depth': [-1],
        'min_split_gain': [0.001],
        'reg_alpha': [0.0001],
        'max_bin': [2047],
        'num_leaves': [127],
        'objective': ['binary'],
        'metric': [['binary_logloss', 'auc']],
        'scale_pos_weight': [1],
        'verbose': [-1],
    }

    use_score = 0
    min_score = (100, 100, 100)

    for params in tqdm(list(ParameterGrid(all_params))):
        cnt = -1
        list_score = []
        list_score2 = []
        list_best_iter = []
        all_pred = np.zeros(y_train.shape[0])
        if 1:  # CV loop intentionally disabled; runs once on the fixed train/valid split
            cnt += 1
            trn_x = x_train
            val_x = x_valid
            trn_y = y_train
            val_y = y_valid
            train_data = lgb.Dataset(
                trn_x.values.astype(np.float32),
                label=trn_y,
                categorical_feature=CAT_FEAT,
                feature_name=x_train.columns.values.tolist())
            test_data = lgb.Dataset(
                val_x.values.astype(np.float32),
                label=val_y,
                categorical_feature=CAT_FEAT,
                feature_name=x_train.columns.values.tolist())
            del trn_x
            gc.collect()
            clf = lgb.train(
                params,
                train_data,
                10000,  # params['n_estimators'],
                early_stopping_rounds=30,
                valid_sets=[test_data],
                # feval=cst_metric_xgb,
                # callbacks=[callback],
                verbose_eval=10)
            pred = clf.predict(val_x)

            #all_pred[test] = pred

            _score2 = log_loss(val_y, pred)
            _score = -roc_auc_score(val_y, pred)

            logger.info('   _score: %s' % _score)
            logger.info('   _score2: %s' % _score2)

            list_score.append(_score)
            list_score2.append(_score2)

            if clf.best_iteration != 0:
                list_best_iter.append(clf.best_iteration)
            else:
                list_best_iter.append(10000)  # the fixed num_boost_round above; params has no 'n_estimators' key
            gc.collect()

        score = (np.mean(list_score), np.min(list_score), np.max(list_score))
        score2 = (np.mean(list_score2), np.min(list_score2),
                  np.max(list_score2))

        if min_score[use_score] > score[use_score]:
            min_score = score
            min_params = params

    imp = pd.DataFrame(clf.feature_importance(), columns=['imp'])
    imp['col'] = usecols
    n_features = imp.shape[0]
    imp = imp.sort_values('imp', ascending=False)
    imp.to_csv(DIR + 'feature_importances_0.csv')

    del val_x
    del trn_y
    del val_y
    del train_data
    del test_data
    gc.collect()

    trees = np.mean(list_best_iter)

    x_train = pd.concat([x_train, x_valid], axis=0, ignore_index=True)
    y_train = np.r_[y_train, y_valid]
    del x_valid
    del y_valid
    gc.collect()

    train_data = lgb.Dataset(x_train.values.astype(np.float32),
                             label=y_train,
                             categorical_feature=CAT_FEAT,
                             feature_name=x_train.columns.values.tolist())
    del x_train
    gc.collect()

    clf = lgb.train(min_params,
                    train_data,
                    int(trees * 1.1),
                    valid_sets=[train_data],
                    verbose_eval=10)

    #del x_train
    gc.collect()
    return min_params
t0 = t - t1
print('val size:', t, 'number of 1:', t1, 'number of 0:', t0)
print('val: 1 in all:', t1/t, '0 in all:', t0/t, '1/0:', t1/t0)
print()
print()

train_set = lgb.Dataset(X_tr, Y_tr)
val_set = lgb.Dataset(X_val, Y_val)
del X_tr, Y_tr, X_val, Y_val


print('Training...')

model = lgb.train(params,
                  train_set,
                  num_boost_round=num_boost_round,
                  early_stopping_rounds=early_stopping_rounds,
                  valid_sets=val_set,
                  verbose_eval=verbose_eval,
                  )

print('best score:', model.best_score['valid_0']['auc'])
print('best iteration:', model.best_iteration)

print()
time_elapsed = time.time() - since
print('[timer]: complete in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))
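
Since training stopped early, any later prediction should pin the round count explicitly; a one-line follow-up sketch, assuming a held-out X_test exists alongside the splits above:

preds = model.predict(X_test, num_iteration=model.best_iteration)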


Exemplo n.º 39
0
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train, free_raw_data=False)
evals_result = {}
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': -1
}

md = XGBClassifier()  # note: instantiated but never used; the LightGBM booster below is what gets trained
mdl = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=150,
                early_stopping_rounds=25,
                evals_result=evals_result)


plot_model_performance(mdl, X_test, y_test)

fair = get_fair_metrics_and_plot(data_orig_test, mdl)


### Reweighing
RW = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)

data_transf_train = RW.fit_transform(data_orig_train)

# Train and save the model
Exemplo n.º 40
0
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': len(tags2ids),
    'min_data_in_bin': 1,
    'min_data': 1,
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 1,
    'verbose': 0
}

gbm = lgb.train(params, lgb_train, num_boost_round=500, valid_sets=lgb_eval, early_stopping_rounds=20)

gbm.save_model('../model/LGBPC_model.txt')


# yprob = gbm.predict(X_test).reshape(y_test.shape[0], len(tags2ids))
yprob = gbm.predict(X_test)

ylabel = np.argmax(yprob, axis=1)
error = sum(int(ylabel[i]) != y_test[i] for i in range(len(y_test))) / float(len(y_test))
acc = 1. - error
print('predicting, classification acc=%f' % acc)
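
The manual error loop is equivalent to scikit-learn's accuracy_score over the argmax'd class probabilities, which avoids the Python-level loop:

from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test, np.argmax(yprob, axis=1))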



Exemplo n.º 41
0
    def fit_and_predict(self, X_train, X_test, y_train, groups):
        if self.cv == "mcs":
            folds = MCSKFold(n_splits=5, shuffle_mc=True, max_iter=100)
        elif self.cv == "group":
            folds = GroupKFold(n_splits=10)
        elif self.cv == "stratified":
            folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
            y_to_stratify = pd.cut(y_train["Global_Sales_log1p"],
                                   bins=7,
                                   labels=False)

        oof = np.zeros(len(X_train))
        predictions = np.zeros(len(X_test))
        feature_importance_df = pd.DataFrame()
        fold_scores = []

        # for fold, (train_idx, val_idx) in enumerate(folds.split(X_train, groups=groups)):
        for fold, (train_idx,
                   val_idx) in enumerate(folds.split(X_train, y_to_stratify)):
            self.logger.debug("-" * 100)
            self.logger.debug(f"Fold {fold+1}")
            train_data = lgb.Dataset(X_train.iloc[train_idx],
                                     label=y_train.iloc[train_idx])
            val_data = lgb.Dataset(X_train.iloc[val_idx],
                                   label=y_train.iloc[val_idx])
            callbacks = [log_evaluation(self.logger, period=100)]
            clf = lgb.train(self.params,
                            train_data,
                            valid_sets=[train_data, val_data],
                            verbose_eval=100,
                            early_stopping_rounds=100,
                            callbacks=callbacks)  #, feval=eval_func)
            oof[val_idx] = clf.predict(X_train.iloc[val_idx].values,
                                       num_iteration=clf.best_iteration)
            fold_score = mean_squared_log_error(
                np.expm1(y_train.iloc[val_idx].values),
                np.expm1(oof[val_idx]))**.5
            fold_scores.append(fold_score)

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = X_train.columns.values
            fold_importance_df["importance"] = clf.feature_importance(
                importance_type="gain")
            fold_importance_df["fold"] = fold + 1
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)

            predictions += np.expm1(
                clf.predict(X_test,
                            num_iteration=clf.best_iteration)) / folds.n_splits

        _feature_importance_df = feature_importance_df[[
            "feature", "importance"
        ]].groupby("feature").mean().sort_values(by="importance",
                                                 ascending=False)  # .head(50)
        self.logger.debug("##### feature importance #####")
        self.logger.debug(_feature_importance_df.head(50))
        cv_score_fold_mean = sum(fold_scores) / len(fold_scores)
        self.logger.debug(f"cv_score_fold_mean: {cv_score_fold_mean}")

        # # RETRAIN
        # # exp057
        # # RETRAIN

        # k = 500
        # topk_features = _feature_importance_df.index[:k]
        # self.logger.debug(f"selected {len(topk_features)} features: {topk_features}")

        # oof = np.zeros(len(X_train))
        # predictions = np.zeros(len(X_test))
        # feature_importance_df = pd.DataFrame()
        # fold_scores = []

        # # for fold, (train_idx, val_idx) in enumerate(folds.split(X_train, groups=groups)):
        # for fold, (train_idx, val_idx) in enumerate(folds.split(X_train, y_to_stratify)):
        #     self.logger.debug("-" * 100)
        #     self.logger.debug(f"Fold {fold+1}")
        #     train_data = lgb.Dataset(X_train.loc[train_idx, topk_features], label=y_train.iloc[train_idx])
        #     val_data = lgb.Dataset(X_train.loc[val_idx, topk_features], label=y_train.iloc[val_idx])
        #     callbacks = [log_evaluation(self.logger, period=100)]
        #     clf = lgb.train(self.params, train_data, valid_sets=[train_data, val_data], verbose_eval=100, early_stopping_rounds=100, callbacks=callbacks)  #, feval=eval_func)
        #     oof[val_idx] = clf.predict(X_train.loc[val_idx, topk_features].values, num_iteration=clf.best_iteration)
        #     fold_score = mean_squared_log_error(np.expm1(y_train.iloc[val_idx].values), np.expm1(oof[val_idx])) ** .5
        #     fold_scores.append(fold_score)

        #     fold_importance_df = pd.DataFrame()
        #     fold_importance_df["feature"] = topk_features
        #     fold_importance_df["importance"] = clf.feature_importance(importance_type="gain")
        #     fold_importance_df["fold"] = fold + 1
        #     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        #     predictions += np.expm1(clf.predict(X_test[topk_features], num_iteration=clf.best_iteration)) / folds.n_splits

        # feature_importance_df = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).head(50)
        # self.logger.debug("##### feature importance #####")
        # self.logger.debug(feature_importance_df)
        # cv_score_fold_mean = sum(fold_scores) / len(fold_scores)
        # self.logger.debug(f"cv_score_fold_mean: {cv_score_fold_mean}")

        return predictions, cv_score_fold_mean
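
One identity worth making explicit in fit_and_predict: because the targets are log1p-transformed, the fold score above (root MSLE of the expm1'd values) equals plain RMSE computed directly in log space. A quick numpy check with hypothetical values:

import numpy as np
from sklearn.metrics import mean_squared_error, mean_squared_log_error

y_log = np.array([0.5, 1.2, 2.0])  # hypothetical log1p targets
p_log = np.array([0.4, 1.5, 1.8])  # hypothetical log1p predictions
rmsle = mean_squared_log_error(np.expm1(y_log), np.expm1(p_log)) ** 0.5
rmse_log = mean_squared_error(y_log, p_log) ** 0.5
assert np.isclose(rmsle, rmse_log)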
Exemplo n.º 42
0
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'min_split_gain': 0.0970905919552776,
        'min_child_weight': 9.42012323936088,
    }

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)):
        train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx]
        valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx]

        lgb_train = lgb.Dataset(train_x, label=train_y)
        lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)

        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=lgb_round[sex_age],
                        valid_sets=[lgb_train, lgb_eval],
                        verbose_eval=50)

        oof_preds[valid_idx] = gbm.predict(valid_x[X.columns.values])

    train['sex_age_bin_prob_oof_' + str(sex_age)] = oof_preds

    # use the full train set to predict test
    lgb_train = lgb.Dataset(X, label=Y)

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=lgb_round[sex_age],
                    valid_sets=lgb_train,
                    verbose_eval=50)
Exemplo n.º 43
0
for loop in range(loop_num):
    kf = StratifiedKFold(train.label, n_folds=every_loop_num, shuffle=True, random_state=520)  # pre-0.18 scikit-learn API
    for i, (train_index, test_index) in enumerate(kf):
        print('Training round {}-{}...'.format(loop, i))
        train_feat1 = train.iloc[train_index].copy()
        train_feat2 = train.iloc[test_index].copy()
        
    
        
        lgb_train1 = lgb.Dataset(train_feat1[predictor], train_feat1['label'])
        lgb_train2 = lgb.Dataset(train_feat2[predictor], train_feat2['label'] )
        gbm = lgb.train(params,
                        lgb_train1,
                        num_boost_round=5000,
                        valid_sets=lgb_train2,
                        verbose_eval=100,
                        feval=f1_error,
                        early_stopping_rounds=300)
        feat_imp = pd.Series(gbm.feature_importance(), index=predictor).sort_values(ascending=False)
       
        lgb_pre = gbm.predict(train_feat2[predictor])
        train_preds[test_index,loop] = lgb_pre
        
        lgb_pre_test = gbm.predict(test[predictor])
        test_preds[:,i+loop*every_loop_num] = lgb_pre_test

    
    print('offline train score: {}'.format(check_f1(train_preds[:, loop], train['label'])))
    scores[loop] = check_f1(train_preds[:, loop], train['label'])
#%%
Exemplo n.º 44
0
###############################################################################
#                          Data Location
###############################################################################

Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xfinal,
                                                YtrainData,
                                                train_size=0.90,
                                                test_size=0.1)
localTrainData = lgb.Dataset(
    Xtrain,
    Ytrain,
)

params = {'max_depth': 14, 'num_leaves': 2048}

localModel = lgb.train(params, localTrainData, num_boost_round=500)

joblib.dump(localModel, GlobalDirectory + 'lgbModel.pkl')

localPrediction = localModel.predict(Xtest)
currentPerformance = CurrentModelMetric(Ytest, localPrediction)

###############################################################################
#                         Test
###############################################################################

WeatherTestDataDir = GlobalDirectory + 'weather_test.csv'
TestDataDir = GlobalDirectory + 'test.csv'

WeatherTest = pd.read_csv(WeatherTestDataDir)
TestData = pd.read_csv(TestDataDir)
Exemplo n.º 45
0
dfTest = df_train['target_2015'].values

#grupos = np.quantile(df_train["target_2015"], [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
fold = KFold(n_splits=10, shuffle = True, random_state = 1992)

for train_index, test_index in fold.split(dfTrain):

  X_train, X_test = dfTrain.loc[train_index], dfTrain.loc[test_index]
  y_train, y_test = dfTest[train_index], dfTest[test_index]

  train_data = lgb.Dataset(X_train, label=y_train)
  test_data = lgb.Dataset(X_test, label=y_test)

  clf = lgb.train(params=params,
                  early_stopping_rounds=500,
                  verbose_eval=200,
                  train_set=train_data,
                  valid_sets=test_data)

  y_pred = clf.predict(X_test)

  print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))

  errlgb.append(np.sqrt(mean_squared_error(y_test, y_pred)))

  p = clf.predict(Xtest)

  y_pred_totlgb.append(p)

# average of the per-fold predictions
predichos = np.mean(y_pred_totlgb,0)
Exemplo n.º 46
0
def objective(trial, X_train, y_train, params, class_weight_map):
    # x_train, y_train: ndarray
    start_time = timer()    
    global exp_counter
    exp_counter += 1
    param_update = { # api doc - https://lightgbm.readthedocs.io/en/latest/Parameters.html#max_depth
        'learning_rate': 0.06, # trial.suggest_float('learning_rate', 1e-4, 1e-2),
        'max_depth': trial.suggest_int('max_depth', 1, 127), # default: -1 (no limit)
        'num_leaves': trial.suggest_int('num_leaves', 15, 255), # default: 31. Total num of leaves in one tree.
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-7, 1.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-7, 1.0, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 1000)        
        # 'num_leaves': trial.suggest_categorical('num_leaves', [31, 63, 127, 255]), # default: 31
        # 'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0), # default: 0. lambda_l1.
        # 'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0), # default: 0. lambda_l2.
        # 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9), # feature fraction.
        # 'min_child_samples': trial.suggest_int('min_child_samples', 1, 300), # min_data_in_leaf.
        # 'subsample_freq': trial.suggest_int('subsample_freq', 1, 10), # NOTE definition - With subsample (or bagging_fraction)  you can specify the percentage of rows used per tree building iteration. 
        # 'subsample': trial.suggest_float('subsample', 0.3, 0.9), # https://lightgbm.readthedocs.io/en/latest/Parameters.html
        # # 'max_bin': trial.suggest_int('max_bin', 128, 1024), # default: 255. smaller more power to deal with overfitting
        # 'max_bin': trial.suggest_categorical('max_bin', [15, 31, 63, 127, 255]), # default: 255. smaller more power to deal with overfitting
        # 'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 200), # default: 100
        # 'cat_smooth': trial.suggest_int('cat_smooth', 10, 100),
        # 'cat_l2': trial.suggest_int('cat_l2', 1, 20) # L2 regularization in categorical split
    }
    params.update(param_update)

    losses = []    
    # pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc") # depends on the choice of eval_metric; "validation_0-logloss"
    rskf = RepeatedStratifiedKFold(n_splits=N_SPLITS, n_repeats=N_REPEATS, random_state=SEED)
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "multi_logloss")
    for i, (train_index, valid_index) in enumerate(rskf.split(X_train, y_train)):
        print(f"{exp_counter} - {i}")
        X_A, X_B = X_train.iloc[train_index, :], X_train.iloc[valid_index, :]
        y_A, y_B = y_train.iloc[train_index], y_train.iloc[valid_index]
        
        # # It doesn't work.
        # smo_tek = SMOTETomek(random_state=0) 
        # X_smotek, y_smotek = smo_tek.fit_resample(X_A, y_A)
        
        lgb_train = lgb.Dataset(X_A, y_A, weight=y_A.map(class_weight_map)) # https://tinyurl.com/yzdao9nr
        # lgb_train = lgb.Dataset(X_smotek, y_smotek) # it doesn't work
        lgb_valid = lgb.Dataset(X_B, y_B, reference=lgb_train)   
        lgbm_model = lgb.train(
            params, 
            lgb_train, 
            valid_sets=[lgb_train, lgb_valid],
            valid_names=['train', 'valid_0'],
            num_boost_round=10000,
            verbose_eval = False, # https://tinyurl.com/yhdmtdm8    
            early_stopping_rounds=20,
            callbacks=[pruning_callback]
        )             
        # lgbmClassifier = lgb.LGBMClassifier(**params)
        # lgbmClassifier.fit(
        #     X_A, y_A, eval_set=[(X_B, y_B)], 
        #     early_stopping_rounds=EARLY_STOPPING_ROUNDS, 
        #     verbose=VERBOSE,
        #     callbacks=[pruning_callback])
        y_oof = lgbm_model.predict(X_B) # not needed, num_iteration=lgbm_model.best_iteration
        losses.append(log_loss(y_B, y_oof))

    trial.set_user_attr(key="best_booster", value=lgbm_model) 
    res = np.mean(losses) 

    timer(start_time)
    return res 
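
For context, an objective of this shape is typically handed to an Optuna study along these lines (X_train, y_train and class_weight_map are assumed from the surrounding script; base_params is a hypothetical name for the fixed parameter dict):

import optuna
from functools import partial

study = optuna.create_study(direction="minimize",
                            pruner=optuna.pruners.MedianPruner())
study.optimize(partial(objective, X_train=X_train, y_train=y_train,
                       params=base_params, class_weight_map=class_weight_map),
               n_trials=50)
print(study.best_value, study.best_params)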
Exemplo n.º 47
0
        grid_df = grid_df[preds_mask].reset_index(drop=True)
        keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
        grid_df = grid_df[keep_cols]

        d_sales = grid_df[['d', 'sales']]
        substitute = d_sales['sales'].values
        substitute[(d_sales['d'] > END_TRAIN)] = np.nan
        grid_df['sales'] = substitute

        grid_df.to_pickle(processed_data_dir + 'test_' + store_id + '_' +
                          state_id + '.pkl')
        del grid_df, d_sales, substitute

        seed_everything(SEED)
        estimator = lgb.train(lgb_params,
                              train_data,
                              valid_sets=[valid_data],
                              verbose_eval=100)

        # display(pd.DataFrame({'name':estimator.feature_name(),
        #                       'imp':estimator.feature_importance()}).sort_values('imp',ascending=False).head(25))

        model_name = model_dir + 'lgb_model_' + store_id + '_' + state_id + '_v' + str(
            VER) + '.bin'
        pickle.dump(estimator, open(model_name, 'wb'))

        del train_data, valid_data, estimator
        gc.collect()

        MODEL_FEATURES = features_columns
Exemplo n.º 48
0
    'num_class': 4,
    'boosting': 'gbdt',
    'metric': 'multi_logloss'
}
losses = []    
for i, (train_index, valid_index) in enumerate(rskf.split(Xtrn, y)):

    X_A, X_B = Xtrn.iloc[train_index, :], Xtrn.iloc[valid_index, :]
    y_A, y_B = y.iloc[train_index], y.iloc[valid_index]
    lgb_train = lgb.Dataset(X_A, y_A)
    lgb_valid = lgb.Dataset(X_B, y_B, reference=lgb_train)   
    lgbm_model = lgb.train(
        params, 
        lgb_train, 
        valid_sets=[lgb_train, lgb_valid],
        valid_names=['train', 'valid_0'],
        num_boost_round=10000,
        verbose_eval = 50, # https://tinyurl.com/yhdmtdm8    
        early_stopping_rounds=20,
        # callbacks=[pruning_callback]
    )             
    y_oof = lgbm_model.predict(X_B)  # not needed: num_iteration=lgbm_model.best_iteration
    losses.append(log_loss(y_B, y_oof))
print(np.mean(losses))
# %%
score = np.mean(losses)
model_pickle = save_trained_classifier(lgbm_model, 'lgbm_8f5r_integration_vallina', score, save_directory)  # was 'best_model', which is undefined in this excerpt

model_pickle = '/kaggle/working/may_model/202105271625_lgbm_1min'
lgbm_pickle = pickle.load(open(model_pickle, 'rb'))
lgbm_pickle.predict(Xtst)
Exemplo n.º 49
0
        params = {
            'objective': 'multiclass',
            'num_class': 4,
            'metric': 'None',
            'max_bin': 50,
            'num_leaves': 20,
            'lambda_l2': 0.1,
            'verbose': -1,
            'seed': seed
        }

        # train
        model = lgb.train(params,
                          train_data,
                          valid_sets=[train_data, valid_data],
                          num_boost_round=num_boost_round,
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=10,
                          feval=metric_f1)

        # evaluate
        y_val_pred = np.argmax(model.predict(X_valid), axis=1)
        score = f1_score(y_valid, y_val_pred, average='macro')
        scores.append(score)
        print(f"\nFold-{i+1}: Score: {score:.4f}\n")

        # predict test
        y_test_pred += model.predict(
            X_test, num_iteration=model.best_iteration) / n_folds

    # evaluate
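
metric_f1 is not shown in this excerpt, and with 'metric' set to 'None' it is the only signal early stopping sees. A plausible sketch of such a feval for this 4-class setup, assuming the older LightGBM convention that multiclass preds arrive flattened class-major (preds[j * num_data + i] is row i's score for class j):

import numpy as np
from sklearn.metrics import f1_score

def metric_f1(preds, data):
    y_true = data.get_label()
    y_pred = preds.reshape(4, -1).argmax(axis=0)  # 4 = num_class
    return 'macro_f1', f1_score(y_true, y_pred, average='macro'), True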
Exemplo n.º 50
0
    def __call__(self, trial):
        # type: (Trial) -> float

        pbar_fmt = "{}, val_score: {:.6f}"

        if self.pbar is not None:
            self.pbar.set_description(pbar_fmt.format(self.step_name, self.best_score))

        if "lambda_l1" in self.target_param_names:
            self.lgbm_params["lambda_l1"] = trial.suggest_loguniform("lambda_l1", 1e-8, 10.0)
        if "lambda_l2" in self.target_param_names:
            self.lgbm_params["lambda_l2"] = trial.suggest_loguniform("lambda_l2", 1e-8, 10.0)
        if "num_leaves" in self.target_param_names:
            tree_depth = self.lgbm_params.get("max_depth", DEFAULT_TUNER_TREE_DEPTH)
            max_num_leaves = 2 ** tree_depth if tree_depth > 0 else 2 ** DEFAULT_TUNER_TREE_DEPTH
            self.lgbm_params["num_leaves"] = trial.suggest_int("num_leaves", 2, max_num_leaves)
        if "feature_fraction" in self.target_param_names:
            # `GridSampler` is used for sampling feature_fraction value.
            # The value 1.0 for the hyperparameter is always sampled.
            param_value = min(trial.suggest_uniform("feature_fraction", 0.4, 1.0 + EPS), 1.0)
            self.lgbm_params["feature_fraction"] = param_value
        if "bagging_fraction" in self.target_param_names:
            # `TPESampler` is used for sampling bagging_fraction value.
            # The value 1.0 for the hyperparameter might be sampled.
            param_value = min(trial.suggest_uniform("bagging_fraction", 0.4, 1.0 + EPS), 1.0)
            self.lgbm_params["bagging_fraction"] = param_value
        if "bagging_freq" in self.target_param_names:
            self.lgbm_params["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 7)
        if "min_child_samples" in self.target_param_names:
            # `GridSampler` is used for sampling min_child_samples value.
            # The value 1.0 for the hyperparameter is always sampled.
            param_value = int(trial.suggest_uniform("min_child_samples", 5, 100 + EPS))
            self.lgbm_params["min_child_samples"] = param_value

        start_time = time.time()
        booster = lgb.train(self.lgbm_params, self.train_set, **self.lgbm_kwargs)

        val_score = self._get_booster_best_score(booster)
        elapsed_secs = time.time() - start_time
        average_iteration_time = elapsed_secs / booster.current_iteration()

        if self.model_dir is not None:
            path = os.path.join(self.model_dir, "{}.pkl".format(trial.number))
            with open(path, "wb") as fout:
                pickle.dump(booster, fout)
            _logger.info("The booster of trial#{} was saved as {}.".format(trial.number, path))

        if self.compare_validation_metrics(val_score, self.best_score):
            self.best_score = val_score
            self.best_booster_with_trial_number = (booster, trial.number)

        if self.pbar is not None:
            self.pbar.set_description(pbar_fmt.format(self.step_name, self.best_score))
            self.pbar.update(1)

        self.report.append(
            dict(
                # Since v1.2.0, action was concatenation of parameter names. Currently, it is
                # explicitly given to distinguish steps which tune the same parameters.
                action=self.step_name,
                trial=self.trial_count,
                value=str(trial.params),
                val_score=val_score,
                elapsed_secs=elapsed_secs,
                average_iteration_time=average_iteration_time,
            )
        )

        trial.set_system_attr(_ELAPSED_SECS_KEY, elapsed_secs)
        trial.set_system_attr(_AVERAGE_ITERATION_TIME_KEY, average_iteration_time)
        trial.set_system_attr(_STEP_NAME_KEY, self.step_name)
        trial.set_system_attr(_LGBM_PARAMS_KEY, json.dumps(self.lgbm_params))

        self.trial_count += 1

        return val_score
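
This __call__ is the per-step objective inside Optuna's stepwise LightGBM tuner; end users normally reach it through the integration wrapper rather than instantiating it directly. A minimal usage sketch, assuming dtrain and dvalid Datasets already exist:

import optuna.integration.lightgbm as opt_lgb

tuner = opt_lgb.LightGBMTuner({'objective': 'binary', 'metric': 'binary_logloss'},
                              dtrain,
                              valid_sets=[dvalid],
                              num_boost_round=1000,
                              early_stopping_rounds=100)
tuner.run()
print(tuner.best_params, tuner.best_score)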
Exemplo n.º 51
0
 scores = []
 t0 = time.time()
 train_preds = np.zeros(train.shape[0])
 test_preds = np.zeros((test.shape[0], 10))
 # feat_imp = pd.DataFrame()
 kf = KFold(len(train), n_folds=10, shuffle=True, random_state=1024)
 for i, (train_index, test_index) in enumerate(kf):
     # print('Training round {}...'.format(i))
     train_feat1 = train.iloc[train_index]
     train_feat2 = train.iloc[test_index]
     lgb_train1 = lgb.Dataset(train_feat1[predictors], train_feat1['血糖'])
     lgb_train2 = lgb.Dataset(train_feat2[predictors], train_feat2['血糖'])
     gbm = lgb.train(params,
                     lgb_train1,
                     num_boost_round=3000,
                     valid_sets=lgb_train2,
                     verbose_eval=False,
                     feval=evalerror,
                     early_stopping_rounds=50)
     feat_i = pd.DataFrame(
         pd.Series(gbm.feature_importance(),
                   index=predictors).sort_values(ascending=False))
     # feat_imp = pd.concat([feat_imp, feat_i],axis=1)
     train_preds[test_index] += gbm.predict(
         train_feat2[predictors], num_iteration=gbm.best_iteration)
     test_preds[:, i] = gbm.predict(test[predictors],
                                    num_iteration=gbm.best_iteration)
 # print(feat_imp)
 # feat_imp.to_csv("./feature_imp.csv",header=False)
 print('offline score: {0}, min_data: {1}'.format(
     (mean_squared_error(train['血糖'], train_preds) * 0.5),
Exemplo n.º 52
0
    y_valid = Y_TEST
    early_stopping_rounds = 20
    num_boost_round = 3500
    metric = 'auc'
    params['metric'] = metric

    #========================================================================
    # Fitting
    #========================================================================
    lgb_train = lgb.Dataset(data=x_train, label=y_train)
    lgb_valid = lgb.Dataset(data=x_valid, label=y_valid)

    with timer("  * Train & Validation"):
        estimator = lgb.train(params=params,
                              train_set=lgb_train,
                              valid_sets=lgb_valid,
                              early_stopping_rounds=early_stopping_rounds,
                              num_boost_round=num_boost_round,
                              verbose_eval=200)
        best_iter = estimator.best_iteration

        oof_pred = estimator.predict(x_valid)
        score = roc_auc_score(y_valid, oof_pred)
        cvs = str(score).replace('.', '-')
        feim = get_tree_importance(estimator=estimator,
                                   use_cols=x_train.columns)
        feim.sort_values(by='importance', ascending=False, inplace=True)
        feim['is_valid'] = feim['feature'].map(valid_map)

    #========================================================================
    # PostProcess
    #========================================================================
Exemplo n.º 53
0
    "lambda_l1": 0.1,
    "verbosity": -1
}
folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    num_round = 10000
    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=200,
                    early_stopping_rounds=100)
    oof_lgb[val_idx] = clf.predict(X_train[val_idx],
                                   num_iteration=clf.best_iteration)

    predictions_lgb += clf.predict(
        X_test, num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_lgb, target)))

##### xgb
xgb_params = {
    'eta': 0.005,
    'max_depth': 11,
    'subsample': 0.85,
Exemplo n.º 54
0
    def fit(self,
            X_train=None,
            Y_train=None,
            X_test=None,
            Y_test=None,
            dataset_train=None,
            dataset_val=None,
            time_limit=None,
            **kwargs):
        start_time = time.time()
        params = self.params.copy()

        # TODO: kwargs can have num_cpu, num_gpu. Currently these are ignored.
        verbosity = kwargs.get('verbosity', 2)
        params = fixedvals_from_searchspaces(params)

        if verbosity <= 1:
            verbose_eval = False
        elif verbosity == 2:
            verbose_eval = 1000
        elif verbosity == 3:
            verbose_eval = 50
        else:
            verbose_eval = 1

        eval_metric = self.get_eval_metric()
        dataset_train, dataset_val = self.generate_datasets(
            X_train=X_train,
            Y_train=Y_train,
            params=params,
            X_test=X_test,
            Y_test=Y_test,
            dataset_train=dataset_train,
            dataset_val=dataset_val)
        gc.collect()

        num_boost_round = params.pop('num_boost_round', 1000)
        logger.log(
            15, 'Training Gradient Boosting Model for %s rounds...' %
            num_boost_round)
        logger.log(15, "with the following hyperparameter settings:")
        logger.log(15, params)

        num_rows_train = len(dataset_train.data)
        if 'min_data_in_leaf' in params:
            if params['min_data_in_leaf'] > num_rows_train:  # TODO: may not be necessary
                params['min_data_in_leaf'] = max(1, int(num_rows_train / 5.0))

        # TODO: Better solution: Track trend to early stop when score is far worse than best score, or score is trending worse over time
        if (dataset_val is not None) and (dataset_train is not None):
            if num_rows_train <= 10000:
                modifier = 1
            else:
                modifier = 10000 / num_rows_train
            early_stopping_rounds = max(round(modifier * 150), 10)
        else:
            early_stopping_rounds = 150

        callbacks = []
        valid_names = ['train_set']
        valid_sets = [dataset_train]
        if dataset_val is not None:
            reporter = kwargs.get('reporter', None)
            if reporter is not None:
                train_loss_name = self._get_train_loss_name()
            else:
                train_loss_name = None
            callbacks += [
                early_stopping_custom(early_stopping_rounds,
                                      metrics_to_use=[('valid_set',
                                                       self.eval_metric_name)],
                                      max_diff=None,
                                      start_time=start_time,
                                      time_limit=time_limit,
                                      ignore_dart_warning=True,
                                      verbose=False,
                                      manual_stop_file=False,
                                      reporter=reporter,
                                      train_loss_name=train_loss_name),
            ]
            valid_names = ['valid_set'] + valid_names
            valid_sets = [dataset_val] + valid_sets

        seed_val = params.pop('seed_value', 0)
        train_params = {
            'params': params,
            'train_set': dataset_train,
            'num_boost_round': num_boost_round,
            'valid_sets': valid_sets,
            'valid_names': valid_names,
            'callbacks': callbacks,
            'verbose_eval': verbose_eval,
        }
        if not isinstance(eval_metric, str):
            train_params['feval'] = eval_metric
        if seed_val is not None:
            train_params['params']['seed'] = seed_val
            random.seed(seed_val)
            np.random.seed(seed_val)

        # Train LightGBM model:
        try_import_lightgbm()
        import lightgbm as lgb
        self.model = lgb.train(**train_params)
        self.params_trained['num_boost_round'] = self.model.best_iteration
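
A worked instance of the adaptive early-stopping rule above: with num_rows_train = 100000, modifier = 10000 / 100000 = 0.1, so early_stopping_rounds = max(round(0.1 * 150), 10) = 15; at or below 10000 rows the modifier stays 1 and the full 150 rounds apply.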
Exemplo n.º 55
0
        params = {
            'learning_rate':
            p['lgb_lr'],  # caution: params dict is modified by lgb
            'application': 'regression',
            'max_depth': p['lgb_max_depth'],
            'num_leaves': p['lgb_num_leaves'],
            'verbosity': -1,
            'metric': 'RMSE',
        }

        print("Fitting boosted trees")
        model_gb = lgb.train(params,
                             train_set=d_train,
                             num_boost_round=p['lgb_num_trees'],
                             valid_sets=watchlist,
                             early_stopping_rounds=50,
                             verbose_eval=0,
                             callbacks=[])

        print("Evaluating model")
        preds_gb = np.array(model_gb.predict(X_valid))
        preds = preds_gb
        score = mean_squared_log_error(y_valid, preds)**0.5

        if score < best_scores[0]:
            best_scores = (score, )
            best_params = p
            best_model_gb = model_gb

    print('Best score: {}'.format(best_scores[0]))
Exemplo n.º 56
0
    #"num_leaves": 10
}

# In[ ]:

# model = lgb.train(
#     params, lgb_train,
#     #valid_sets=[lgb_train],
#     verbose_eval=1,
#     num_boost_round=1,
#     early_stopping_rounds=8,
# )

# In[ ]:

lgb.train(params,
          lgb.Dataset(pd.DataFrame({'x': [1]}), [1], categorical_feature=None))

# In[ ]:

pd.set_option('display.max_columns', 100)
print(train[train.user_id == 4421282].sort_values('timestamp').iloc[0:100])
Exemplo n.º 57
0
feature_importance_df = pd.DataFrame()

for fold, (train_idx, val_idx) in enumerate(folds.split(train)):
    print(f"Fold {fold+1}")
    train_data = lgb.Dataset(train.iloc[train_idx][use_cols],
                             label=log_target[train_idx],
                             categorical_feature=categorical_cols)
    val_data = lgb.Dataset(train.iloc[val_idx][use_cols],
                           label=log_target[val_idx],
                           categorical_feature=categorical_cols)
    num_round = N_ROUNDS
    callbacks = [log_evaluation(logger, period=100)]
    clf = lgb.train(params,
                    train_data,
                    num_round,
                    valid_sets=[train_data, val_data],
                    verbose_eval=False,
                    early_stopping_rounds=100,
                    callbacks=callbacks)
    oof[val_idx] = clf.predict(train[use_cols].values[val_idx],
                               num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = use_cols
    fold_importance_df["importance"] = clf.feature_importance(
        importance_type="gain")
    fold_importance_df["fold"] = fold + 1
    feature_importance_df = pd.concat(
        [feature_importance_df, fold_importance_df], axis=0)
    feature_importance_df = feature_importance_df[[
        "feature", "importance"
Exemplo n.º 58
0
    print('Fold:', fold_n + 1)
    pd.options.mode.chained_assignment = None
    result_table['fold'].loc[fold_n] = fold_n + 1

    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[
        valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    y_valid_coll[valid_index] = y_valid

    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params,
                    dtrain,
                    10000,
                    valid_sets=[dtrain, dvalid],
                    verbose_eval=4)

    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()

    y_pred_valid = clf.predict(X_valid)

    print(round(log_loss(y_valid, y_pred_valid), 2))
    result_table['log_loss'].loc[fold_n] = log_loss(y_valid, y_pred_valid)

    y_oof[valid_index] = y_pred_valid

    y_preds += clf.predict(test_df) / NFOLDS

    del X_train, X_valid, y_train, y_valid
Exemplo n.º 59
0


sub = pd.DataFrame()
sub['id'] = test_id
sub['target'] = np.zeros_like(test_id)

# lgb
params = {'metric': 'auc', 'learning_rate' : 0.01, 'max_depth':10, 'max_bin':10,  'objective': 'binary', 
          'feature_fraction': 0.8,'bagging_fraction':0.9,'bagging_freq':10,  'min_data': 500}
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_eval = X[train_index], X[test_index]
    y_train, y_eval = y[train_index], y[test_index]
    lgb_model = lgb.train(params, lgb.Dataset(X_train, label=y_train), 2000, 
                  lgb.Dataset(X_eval, label=y_eval), verbose_eval=100, 
                  feval=gini_lgb, early_stopping_rounds=100)
    print('[Fold %d/%d Prediction:]' % (i + 1, kfold))
    sub['target'] += lgb_model.predict(test.values, num_iteration=lgb_model.best_iteration) / kfold
gc.collect()
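
gini_lgb above is not defined in this excerpt; for a binary target the normalized Gini coefficient equals 2 * AUC - 1, so a plausible sketch of the feval it refers to is:

from sklearn.metrics import roc_auc_score

def gini_lgb(preds, train_data):
    y_true = train_data.get_label()
    return 'gini', 2 * roc_auc_score(y_true, preds) - 1, True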

#for i, (train_index, test_index) in enumerate(skf.split(X, y)):
#    print('[Fold %d/%d]' % (i + 1, kfold))
#    X_train, X_valid = X[train_index], X[test_index]
#    y_train, y_valid = y[train_index], y[test_index]
#    # Convert our data into XGBoost format
#    d_train = xgb.DMatrix(X_train, y_train)
#    d_valid = xgb.DMatrix(X_valid, y_valid)
#    d_test = xgb.DMatrix(test.values)
#    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
Exemplo n.º 60
0
params['learning_rate'] = 0.0021 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1'          # or 'mae'
params['sub_feature'] = 0.345    # alias of feature_fraction
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 512        # num_leaf
params['min_data'] = 500         # min_data_in_leaf
params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
params['verbose'] = 0
params['feature_fraction_seed'] = 2
params['bagging_seed'] = 3

print("\nFitting LightGBM model ...")
clf = lgb.train(params, d_train, 430)

del d_train; gc.collect()
del x_train; gc.collect()

print("\nPrepare for LightGBM prediction ...")
print("   Read sample file ...")
sample = pd.read_csv('../data/sample_submission.csv')
print("   ...")
sample['parcelid'] = sample['ParcelId']
print("   Merge with property data ...")
df_test = sample.merge(prop, on='parcelid', how='left')
print("   ...")
del sample, prop; gc.collect()
print("   ...")
#df_test['Ratio_1'] = df_test['taxvaluedollarcnt']/df_test['taxamount']