from sklearn import model_selection
from xgboost import XGBRegressor
from pylightgbm.models import GBMRegressor

# X, Y, feature_names, test_size, seed and path_to_exec are assumed to be
# defined earlier in the script
clf_xgb = XGBRegressor(max_depth=3, n_estimators=1000)
clf_gbm = GBMRegressor(exec_path=path_to_exec,
                       num_iterations=1000,
                       learning_rate=0.01,
                       num_leaves=255,
                       min_data_in_leaf=1,
                       early_stopping_round=20,
                       verbose=False)

x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=seed)

# Training the two models on the same split, with early stopping on the test fold
clf_gbm.fit(x_train, y_train, test_data=[(x_test, y_test)])
clf_xgb.fit(x_train, y_train, eval_set=[(x_test, y_test)],
            eval_metric='rmse', early_stopping_rounds=20, verbose=False)

print("xgboost: feature importance")
# get_fscore() keys are 'f0', 'f1', ...; map them back to the column names
dic_fi = clf_xgb.booster().get_fscore()  # on newer xgboost: clf_xgb.get_booster()
xgb_fi = [(feature_names[int(k[1:])], dic_fi[k]) for k in dic_fi]
xgb_fi = sorted(xgb_fi, key=lambda x: x[1], reverse=True)
print(xgb_fi)

print("lightgbm: feature importance")
gbm_fi = clf_gbm.feature_importance(feature_names)
print(gbm_fi)
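# The snippet above trains both regressors but never scores them against each
# other. A minimal sketch of that comparison (an addition, not part of the
# original code), assuming clf_xgb, clf_gbm and the x_test/y_test split above
# are still in scope:
import numpy as np
from sklearn.metrics import mean_squared_error

for name, model in [('xgboost', clf_xgb), ('lightgbm', clf_gbm)]:
    pred = model.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    print('{0} test RMSE: {1:.5f}'.format(name, rmse))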
import json
import operator

import pandas as pd
from pylightgbm.models import GBMRegressor
from sklearn.metrics import confusion_matrix

# NOTE: the constructor call was truncated in the original snippet; the
# `gbmr = GBMRegressor(` head and the exec_path argument are assumptions
# based on the pylightgbm API — the keyword arguments below are original.
gbmr = GBMRegressor(
    exec_path=path_to_exec,  # assumed: path to the LightGBM executable
    learning_rate=0.1,
    tree_learner='serial',
    min_data_in_leaf=10,
    metric='auc',
    feature_fraction=0.7,
    feature_fraction_seed=seed,
    bagging_fraction=1,
    bagging_freq=10,
    bagging_seed=seed,
    metric_freq=1,
    early_stopping_round=50)

# Persist the model parameters (json.dump writes text, so open in 'w' mode)
json.dump(gbmr.param,
          open('{0}_lgbm_{1}{2}'.format(model_path, exec_time, model_params), 'w'))

# As in the original: fit on the validation split, with the training split
# serving as the held-out set for early stopping
gbmr.fit(validate_features.values, validate_labels.values[:, 0],
         test_data=[(train_features.values, train_labels.values[:, 0])])

# Rank features by importance and save the normalized scores
importance = dict(gbmr.feature_importance(train_features.columns.tolist()))
importance = sorted(importance.items(), key=operator.itemgetter(1))
df = pd.DataFrame(gbmr.feature_importance(train_features.columns.tolist()),
                  columns=['feature', 'importance'])
df['importance'] = df['importance'] / df['importance'].sum()
df.to_csv('{0}_lgbm_{1}{2}'.format(model_path, exec_time,
                                   model_feature_importance_csv), index=False)

# Score the validation coupons, then join the predicted probabilities, the
# 0.5-thresholded labels, and the ground truth into one frame
val_label = gbmr.predict(validate_features)
val_frame = pd.Series(val_label, index=validate_features.index)
val_frame.name = probability_consumed_label
val_coupons = pd.read_csv(validate_path + 'dataset.csv')
val_coupons = val_coupons.join(val_frame).join(
    val_frame.map(lambda x: 0. if x < 0.5 else 1.).rename('map')).join(
    pd.read_csv(validate_path + 'labels.csv')['Label'])
val_coupons.to_csv('{0}_lgbm_{1}{2}'.format(model_path, exec_time, val_diff_file),
                   index=False)

print(confusion_matrix(val_coupons['Label'], val_coupons['map']))
print(gbmr.best_round)
print('generate submission')
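# metric='auc' drives early stopping above, yet only a confusion matrix is
# printed. A minimal sketch (an addition, assuming val_coupons and
# probability_consumed_label are as defined above) that also reports the
# validation ROC AUC from the predicted probabilities:
from sklearn.metrics import roc_auc_score

print('validation AUC: {0:.5f}'.format(
    roc_auc_score(val_coupons['Label'], val_coupons[probability_consumed_label])))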