# Imports assumed by the snippets below. In the repository these trainers live
# in separate modules (e.g. model_stacking imports sibling modules LightGBM,
# CatBoost and DeepFM); feature_engineering and cal_roc_curve are project code.
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from deepctr import inputs
from deepctr.models import PNN, DeepFM
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import feature_engineering


def train_xgb(plot=False):
    X, y = feature_engineering.get_train_data(use_over_sampler=True)
    data = pd.DataFrame(y)

    # 5-fold stratified CV; StratifiedKFold requires shuffle=True when a
    # random_state is given
    models = []
    scores = []
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_val, y_train, y_val = X.loc[tdx], X.loc[vdx], y.loc[tdx], y.loc[vdx]
        y_true = y_val
        params = {
            'learning_rate': .05,
            'n_estimators': 2000,
            'max_depth': 8,
            'min_child_weight': 4,
            'gamma': .2,
            'subsample': .8,
            'colsample_bytree': .8,
            'n_jobs': -1,
            'random_state': 0
        }
        model = XGBClassifier().set_params(**params)
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  early_stopping_rounds=50, verbose=False)

        # plot feature importance
        if plot:
            fscores = pd.Series(model.feature_importances_,
                                index=X_train.columns).sort_values(ascending=False)
            fscores.plot(kind='bar', title='Feature Importance %d' % i,
                         figsize=(20, 10))
            plt.ylabel('Feature Importance Score')
            plt.show()

        # score the fold with the best iteration found by early stopping
        # (ntree_limit is deprecated in xgboost >= 1.6; use
        # iteration_range=(0, model.best_iteration + 1) there)
        y_pred = model.predict_proba(X_val, ntree_limit=model.best_ntree_limit)[:, 1]
        auc = roc_auc_score(y_true, y_pred)
        print("AUC score at fold %d: %f" % (i, auc))
        scores.append(auc)
        models.append(model)
        data.loc[vdx, 'y_pred'] = y_pred

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-fold mean AUC:", mean_score)
    print("5-fold OOF AUC:", oof)
    print("----train %s finish!----" % model.__class__.__name__)
    cal_roc_curve(data['y'], data['y_pred'], model.__class__.__name__)
    return data['y_pred']
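# cal_roc_curve is defined elsewhere in the repository. A minimal sketch of
# what such a helper could look like -- the name and signature come from the
# calls above, the body is an assumption (plot the OOF ROC curve):
from sklearn.metrics import roc_curve


def cal_roc_curve(y_true, y_pred, model_name):
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    plt.plot(fpr, tpr, label='%s (AUC = %.4f)' % (model_name, auc))
    plt.plot([0, 1], [0, 1], linestyle='--', color='grey')  # chance diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.show()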
def train_tree(model):
    # generic trainer for any sklearn-style classifier,
    # e.g. train_tree(RandomForestClassifier())
    X, y = feature_engineering.get_train_data(use_over_sampler=True)
    data = pd.DataFrame(y)

    # 5-fold stratified CV
    models = []
    scores = []
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_val, y_train, y_val = X.loc[tdx], X.loc[vdx], y.loc[tdx], y.loc[vdx]
        y_true = y_val
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_true, y_pred)
        print("AUC score at fold %d: %f" % (i, auc))
        scores.append(auc)
        models.append(model)
        data.loc[vdx, 'y_pred'] = y_pred

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-fold mean AUC:", mean_score)
    print("5-fold OOF AUC:", oof)
    print("----train %s finish!----" % model.__class__.__name__)
    cal_roc_curve(data['y'], data['y_pred'], model.__class__.__name__)
    return data['y_pred']
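# Every trainer here assumes feature_engineering.get_train_data returns a
# feature DataFrame plus a label Series named 'y' (that name is what makes
# pd.DataFrame(y) and data['y'] work), both with a clean RangeIndex so that
# X.loc[tdx] lines up with the KFold indices. A sketch of that contract,
# assuming imbalanced-learn's RandomOverSampler behind the use_over_sampler
# flag -- the real implementation lives in feature_engineering:
from imblearn.over_sampling import RandomOverSampler


def get_train_data(use_over_sampler=False):
    df = pd.read_csv('train.csv')  # hypothetical source file
    X, y = df.drop(columns=['y']), df['y']
    if use_over_sampler:
        # duplicate minority-class rows so each fold sees a balanced target
        X, y = RandomOverSampler(random_state=42).fit_resample(X, y)
    return X.reset_index(drop=True), y.reset_index(drop=True)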
def train_cat(plot=False):
    X, y = feature_engineering.get_train_data(use_over_sampler=True)
    data = pd.DataFrame(y)

    # 5-fold stratified CV
    models = []
    scores = []
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_val, y_train, y_val = X.loc[tdx], X.loc[vdx], y.loc[tdx], y.loc[vdx]
        y_true = y_val
        params = {
            'learning_rate': .05,
            'n_estimators': 2000,
            'max_depth': 8,
            'max_bin': 127,
            'reg_lambda': 2,
            'subsample': .7,  # requires bootstrap_type='Bernoulli'
            'one_hot_max_size': 2,
            'bootstrap_type': 'Bernoulli',
            'leaf_estimation_method': 'Newton',
            'random_state': 0
        }
        model = CatBoostClassifier().set_params(**params)
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  early_stopping_rounds=50, verbose=False)

        # plot feature importance
        if plot:
            fscores = pd.Series(model.feature_importances_,
                                index=X_train.columns).sort_values(ascending=False)
            fscores.plot(kind='bar', title='Feature Importance %d' % i,
                         figsize=(20, 10))
            plt.ylabel('Feature Importance Score')
            plt.show()

        y_pred = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_true, y_pred)
        print("AUC score at fold %d: %f" % (i, auc))
        scores.append(auc)
        models.append(model)
        data.loc[vdx, 'y_pred'] = y_pred

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-fold mean AUC:", mean_score)
    print("5-fold OOF AUC:", oof)
    print("----train %s finish!----" % model.__class__.__name__)
    cal_roc_curve(data['y'], data['y_pred'], model.__class__.__name__)
    return data['y_pred']
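# The params above assume categoricals were numerically encoded upstream
# (hence one_hot_max_size). CatBoost can also consume raw string columns
# directly; a hypothetical variant of the fit call inside the fold loop:
#     cat_cols = [c for c in X_train.columns if X_train[c].dtype == 'object']
#     model.fit(X_train, y_train,
#               eval_set=[(X_train, y_train), (X_val, y_val)],
#               cat_features=cat_cols,
#               early_stopping_rounds=50, verbose=False)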
def model_stacking():
    X, y = feature_engineering.get_train_data(use_over_sampler=True)
    data = pd.DataFrame(y)

    # out-of-fold predictions from the base models become the meta-features
    lgbm = LightGBM.train_lgbm(plot=False)
    print(lgbm.shape)
    cat = CatBoost.train_cat(plot=False)
    print(cat.shape)
    deepFM = DeepFM.train_DeepFM()
    print(deepFM.shape)
    X = pd.concat([lgbm, cat, deepFM], axis=1)
    X.columns = ['lgbm', 'cat', 'deepfm']  # the series all come back named 'y_pred'
    print(X.shape)

    scores = []
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_val, y_train, y_val = X.loc[tdx], X.loc[vdx], y.loc[tdx], y.loc[vdx]
        y_true = y_val
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_true, y_pred)
        print("AUC score at fold %d: %f" % (i, auc))
        scores.append(auc)
        data.loc[vdx, 'y_pred'] = y_pred

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-fold mean AUC:", mean_score)
    print("5-fold OOF AUC:", oof)
    print("----train %s finish!----" % 'Stacking')
    cal_roc_curve(data['y'], data['y_pred'], 'Stacking')
    return data['y_pred']
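# The meta-features are strictly out-of-fold: each row of X was predicted by
# base models that never saw that row's label, which is what keeps the
# level-2 LogisticRegression from fitting leaked information. Widening the
# stack is one more column; a hypothetical tweak inside model_stacking,
# reusing train_xgb from this section:
#     xgb = train_xgb(plot=False)
#     X = pd.concat([lgbm, cat, deepFM, xgb], axis=1)
#     X.columns = ['lgbm', 'cat', 'deepfm', 'xgb']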
def train_PNN():
    X, y, sparse_list, dense_list = feature_engineering.get_NN_data(
        use_over_sampler=True)
    data = pd.DataFrame(y)
    dnn_feature_columns = linear_feature_columns = sparse_list + dense_list
    feature_names = inputs.get_feature_names(linear_feature_columns +
                                             dnn_feature_columns)

    # 5-fold stratified CV
    models = []
    scores = []
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_val, y_train, y_val = X.loc[tdx], X.loc[vdx], y.loc[tdx], y.loc[vdx]
        y_true = y_val
        # DeepCTR models take {feature name: column} dicts as input
        X_train = {name: X_train[name] for name in feature_names}
        X_val = {name: X_val[name] for name in feature_names}

        model = PNN(dnn_feature_columns, dnn_hidden_units=(128, 64),
                    task='binary', dnn_dropout=0.5)
        best_param_path = './workspace/PNN/best_param_PNN.py_%d.h5' % i
        if os.path.exists(best_param_path):
            # reuse the checkpoint from a previous run
            model.load_weights(best_param_path)
        else:
            model.compile("adam", "binary_crossentropy",
                          metrics=['binary_crossentropy'])
            es = EarlyStopping(monitor='val_binary_crossentropy', mode='min',
                               patience=15)
            mc = ModelCheckpoint(best_param_path,
                                 monitor='val_binary_crossentropy', mode='min',
                                 save_best_only=True, verbose=False,
                                 save_weights_only=True)
            model.fit(X_train, y_train, validation_data=(X_val, y_val),
                      batch_size=1024, epochs=1000, verbose=2,
                      callbacks=[es, mc])
            model.load_weights(best_param_path)

        y_pred = model.predict(X_val, batch_size=64).flatten()
        auc = roc_auc_score(y_true, y_pred)
        print("AUC score at fold %d: %f" % (i, auc))
        scores.append(auc)
        models.append(model)
        data.loc[vdx, 'y_pred'] = y_pred

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-fold mean AUC:", mean_score)
    print("5-fold OOF AUC:", oof)
    print("----train %s finish!----" % 'PNN')
    cal_roc_curve(data['y'], data['y_pred'], 'PNN')
    return data['y_pred']
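# get_NN_data must return, besides X and y, the DeepCTR feature-column lists
# that PNN (and DeepFM below) consume. A sketch of building those lists --
# the column names and embedding_dim are illustrative assumptions, and on
# DeepCTR releases newer than the deepctr.inputs API used here the same
# classes live in deepctr.feature_column:
from deepctr.inputs import SparseFeat, DenseFeat


def build_feature_columns(df, sparse_features, dense_features):
    # e.g. sparse_features=['job', 'marital'], dense_features=['age', 'balance']
    sparse_list = [
        SparseFeat(name, vocabulary_size=df[name].nunique(), embedding_dim=4)
        for name in sparse_features
    ]
    dense_list = [DenseFeat(name, 1) for name in dense_features]
    return sparse_list, dense_list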
def train_DeepFM():
    X, y, sparse_list, dense_list = feature_engineering.get_NN_data(
        use_over_sampler=True)
    data = pd.DataFrame(y)
    dnn_feature_columns = linear_feature_columns = sparse_list + dense_list
    feature_names = inputs.get_feature_names(linear_feature_columns +
                                             dnn_feature_columns)

    # 5-fold stratified CV
    models = []
    scores = []
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_val, y_train, y_val = X.loc[tdx], X.loc[vdx], y.loc[tdx], y.loc[vdx]
        y_true = y_val
        X_train = {name: X_train[name] for name in feature_names}
        X_val = {name: X_val[name] for name in feature_names}

        model = DeepFM(linear_feature_columns, dnn_feature_columns,
                       dnn_hidden_units=(128, 64), dnn_use_bn=True,
                       task='binary', dnn_dropout=0.5)
        # machine-specific absolute path kept from the original; the commented
        # form below is the portable alternative
        best_param_path = '/Users/a_piao/PycharmProjects/BankMarketing/workspace/DeepFM/best_param_DeepFM.py_%d.h5' % i
        # best_param_path = 'best_param_%s_%d.h5' % (os.path.basename(__file__), i)
        if os.path.exists(best_param_path):
            model.load_weights(best_param_path)
        else:
            model.compile("adam", "binary_crossentropy",
                          metrics=['binary_crossentropy'])
            es = EarlyStopping(monitor='val_binary_crossentropy', mode='min',
                               patience=20)
            # save_best_only=True so the checkpoint reloaded below really is
            # the best epoch, not just the last one written
            mc = ModelCheckpoint(best_param_path,
                                 monitor='val_binary_crossentropy', mode='min',
                                 save_best_only=True, verbose=False,
                                 save_weights_only=True)
            model.fit(X_train, y_train, validation_data=(X_val, y_val),
                      batch_size=1024, epochs=1000, verbose=2,
                      callbacks=[es, mc])
            model.load_weights(best_param_path)

        y_pred = model.predict(X_val, batch_size=64).flatten()
        auc = roc_auc_score(y_true, y_pred)
        print("AUC score at fold %d: %f" % (i, auc))
        scores.append(auc)
        models.append(model)
        data.loc[vdx, 'y_pred'] = y_pred

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-fold mean AUC:", mean_score)
    print("5-fold OOF AUC:", oof)
    print("----train %s finish!----" % 'DeepFM')
    cal_roc_curve(data['y'], data['y_pred'], 'DeepFM')
    return data['y_pred']
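# Hypothetical entry point, assuming the trainers above share one module (in
# the repository they are split across files, so adjust imports accordingly):
from sklearn.ensemble import RandomForestClassifier

if __name__ == '__main__':
    train_xgb(plot=False)
    train_tree(RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0))
    train_cat(plot=False)
    train_PNN()
    train_DeepFM()
    model_stacking()  # trains its own base models internally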