def main_crossvalid(frm, to):
    """Train LightGBM regressors with stratified K-fold CV on rows [frm, to).

    For each fold:
      * stacks the dense features with the sparse text (tf-idf) features,
      * trains with early stopping and reports hold-out RMSE,
      * saves the fold model and a clipped per-fold test submission.
    After all folds, the per-fold submissions are averaged with equal
    weight into one ensemble file (prefix 'lgb_avg').

    Relies on module-level names: get_crossvalid_data, lgb, plt, metrics,
    kaggle_util and the `debug` flag.
    """
    nfold = 5
    # get_crossvalid_data yields tfvocab twice; the duplicate is discarded
    # instead of silently rebinding the same name.
    (df, y, testing, ready_df, tfvocab, predictors, len_train,
     categorical, _, testdex) = get_crossvalid_data(frm, to)

    lgbm_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 270,  # 37,
        'feature_fraction': 0.4,
        'bagging_fraction': 0.65,
        'bagging_freq': 2,
        'learning_rate': 0.016,
        #'max_depth' : 8,
        #'min_split_gain' : 0.0222415,
        #'min_child_weight' : 20,
        'nthread': 5,
        'verbose': 0,
        #'reg_alpha' : 0.041545473,
        #'reg_lambda' : 0.0735294,
        'drop_rate': 0.08
    }

    # NOTE(review): old sklearn.cross_validation-style API — StratifiedKFold
    # built from `y` directly with `n_folds`; confirm the installed sklearn
    # version still supports this signature.
    skf = StratifiedKFold(y, n_folds=nfold)
    for i, (train_split, val_split) in enumerate(skf):
        print(train_split)
        # Sparse matrices: dense frame columns stacked with text features.
        X_train = hstack(
            [csr_matrix(df.iloc[train_split].values), ready_df[train_split]])
        X_valid = hstack(
            [csr_matrix(df.iloc[val_split].values), ready_df[val_split]])
        y_train = y[train_split]
        y_valid = y[val_split]

        lgtrain = lgb.Dataset(X_train, y_train,
                              feature_name=tfvocab,
                              categorical_feature=categorical)
        lgvalid = lgb.Dataset(X_valid, y_valid,
                              feature_name=tfvocab,
                              categorical_feature=categorical)

        lgb_clf = lgb.train(lgbm_params,
                            lgtrain,
                            num_boost_round=26000,
                            valid_sets=[lgtrain, lgvalid],
                            valid_names=['train', 'valid'],
                            early_stopping_rounds=100,
                            verbose_eval=100)

        print("Model Evaluation Stage")
        rmse = np.sqrt(
            metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid)))
        print('RMSE:', rmse)

        _, ax = plt.subplots(figsize=[7, 10])
        lgb.plot_importance(lgb_clf, max_num_features=100, ax=ax)
        plt.title("Light GBM Feature Importance")
        plt.savefig('feature_import.png', bbox_inches='tight')

        # Persist the model at its best (early-stopped) iteration.
        if not debug:
            lgb_clf.save_model('../model/model_{}.txt'.format(i),
                               lgb_clf.best_iteration)
        else:
            lgb_clf.save_model('../model/model_debug_{}.txt'.format(i),
                               lgb_clf.best_iteration)

        lgpred = lgb_clf.predict(testing,
                                 num_iteration=lgb_clf.best_iteration)
        lgsub = pd.DataFrame(lgpred,
                             columns=["deal_probability"],
                             index=testdex)
        # FIX: assign the clipped column back instead of `inplace=True` on a
        # chained selection — the inplace form can operate on a copy and
        # leave lgsub unchanged. Probabilities must stay in [0, 1].
        lgsub['deal_probability'] = lgsub['deal_probability'].clip(0.0, 1.0)

        subfile = '../result/dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/dense_feature_debug{}.csv'.format(i)
        kaggle_util.save_result(lgsub,
                                subfile,
                                competition='avito-demand-prediction',
                                send=False,
                                index=True)

    # Equal-weight average of the per-fold submissions.
    result_list = []
    for i in range(nfold):
        subfile = '../result/dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/dense_feature_debug{}.csv'.format(i)
        result_list.append((subfile, 1 / nfold))
    kaggle_util.ensemble(result_list,
                         False,
                         competition='avito-demand-prediction',
                         score_col='deal_probability',
                         prefix='lgb_avg')
def main_crossvalid_xgboost(frm, to):
    """Train XGBoost regressors with stratified K-fold CV on rows [frm, to).

    Mirrors main_crossvalid but uses xgb.XGBRegressor: per fold it fits on
    the stacked dense+text sparse features with early stopping, reports
    hold-out RMSE, saves a clipped per-fold test submission, then averages
    the fold submissions into one ensemble file (prefix 'xgb_avg').

    Relies on module-level names: get_crossvalid_data, metrics, kaggle_util
    and the `debug` flag.
    """
    import xgboost as xgb

    nfold = 5
    # get_crossvalid_data yields tfvocab twice; the duplicate is discarded.
    (df, y, testing, ready_df, tfvocab, predictors, len_train,
     categorical, _, testdex) = get_crossvalid_data(frm, to)

    # Column indices of the categorical features. NOTE(review): currently
    # unused by the XGBRegressor call below — kept for reference.
    cols = list(df.columns)
    cat_features = [cols.index(col) for col in categorical]

    # NOTE(review): old sklearn.cross_validation-style API — confirm the
    # installed sklearn version still supports this signature.
    skf = StratifiedKFold(y, n_folds=nfold)
    for i, (train_split, val_split) in enumerate(skf):
        print(train_split)
        # Sparse matrices: dense frame columns stacked with text features.
        X_train = hstack(
            [csr_matrix(df.iloc[train_split].values), ready_df[train_split]])
        X_valid = hstack(
            [csr_matrix(df.iloc[val_split].values), ready_df[val_split]])
        y_train = y[train_split]
        y_valid = y[val_split]

        bst = xgb.XGBRegressor(n_estimators=400,
                               booster='gbtree',
                               learning_rate=0.016,
                               gamma=0,
                               subsample=0.75,
                               colsample_bylevel=0.5,
                               max_depth=16,
                               nthread=6)
        bst.fit(X_train,
                y_train,
                eval_set=[(X_train, y_train), (X_valid, y_valid)],
                verbose=False,
                early_stopping_rounds=100)

        print("Model Evaluation Stage")
        ypre = bst.predict(X_valid)
        rmse = np.sqrt(metrics.mean_squared_error(y_valid, ypre))
        print('RMSE:', rmse)

        lgpred = bst.predict(testing)
        lgsub = pd.DataFrame(lgpred,
                             columns=["deal_probability"],
                             index=testdex)
        # FIX: assign the clipped column back instead of `inplace=True` on a
        # chained selection — the inplace form can operate on a copy and
        # leave lgsub unchanged. Probabilities must stay in [0, 1].
        lgsub['deal_probability'] = lgsub['deal_probability'].clip(0.0, 1.0)

        subfile = '../result/xgb_dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/xgb_dense_feature_debug{}.csv'.format(i)
        kaggle_util.save_result(lgsub,
                                subfile,
                                competition='avito-demand-prediction',
                                send=False,
                                index=True)

    # Equal-weight average of the per-fold submissions; submit unless in
    # debug mode.
    result_list = []
    for i in range(nfold):
        subfile = '../result/xgb_dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/xgb_dense_feature_debug{}.csv'.format(i)
        result_list.append((subfile, 1 / nfold))
    kaggle_util.ensemble(result_list,
                         not debug,
                         competition='avito-demand-prediction',
                         score_col='deal_probability',
                         prefix='xgb_avg')
columns=["deal_probability"], index=testdex) lgsub['deal_probability'].clip(0.0, 1.0, inplace=True) # Between 0 and 1 del modelRNN gc.collect() print("Number of folds completed...." + str(k)) #print(Kfold_preds_final[k][0:10]) k += 1 K.clear_session() kaggle_util.save_result(lgsub, '../result/rnn_{}.csv'.format(k), competition='avito-demand-prediction', send=False, index=True) print("All Folds completed" + str(k + 1)) print("RNN FOLD MODEL Done") result_list = [] for i in range(nfold): subfile = 'rnn_{}.csv'.format(i) result_list.append((subfile, 1 / nfold)) kaggle_util.ensemble(result_list, not debug, competition='avito-demand-prediction', score_col='deal_probability', prefix='rnn_avg')