示例#1
0
def train_xgb(plot=False):
    """Train an XGBClassifier with 5-fold stratified cross-validation.

    Args:
        plot: if True, display a per-fold feature-importance bar chart.

    Returns:
        Out-of-fold predicted probabilities for the positive class,
        as the 'y_pred' column (pd.Series) aligned to the training rows.
    """
    X, y = feature_engineering.get_train_data(use_over_sampler=True)
    data = pd.DataFrame(y)
    # k-fold CV bookkeeping
    models = []
    scores = []

    # shuffle=True is required when random_state is set (sklearn raises
    # otherwise) and matches the setup already used by train_PNN.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_val = X.loc[tdx], X.loc[vdx]
        y_train, y_val = y.loc[tdx], y.loc[vdx]
        y_true = y_val

        params = {
            'learning_rate': .05,
            'n_estimators': 2000,
            'max_depth': 8,
            'min_child_weight': 4,
            'gamma': .2,
            'subsample': .8,
            'colsample_bytree': .8,
            'n_jobs': -1,
            'random_state': 0
        }
        model = XGBClassifier().set_params(**params)
        # early stopping monitors the last eval_set entry (the val fold)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  early_stopping_rounds=50,
                  verbose=False)

        # plot feature importance
        if plot:
            fscores = pd.Series(model.feature_importances_,
                                X_train.columns).sort_values(ascending=False)
            fscores.plot(kind='bar',
                         title='Feature Importance %d' % i,
                         figsize=(20, 10))
            plt.ylabel('Feature Importance Score')
            plt.show()

        # predict with the best iteration found by early stopping
        y_pred = model.predict_proba(X_val,
                                     ntree_limit=model.best_ntree_limit)[:, 1]
        auc = roc_auc_score(y_true, y_pred)
        print("AUC score at %d fold: %f" % (i, auc))
        scores.append(auc)
        models.append(model)
        data.loc[vdx, 'y_pred'] = y_pred

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-fold total mean_score:", mean_score)
    print("5-fold oof auc score:", oof)
    print("----train %s finish!----" % model.__class__.__name__)
    cal_roc_curve(data['y'], data['y_pred'], model.__class__.__name__)

    return data['y_pred']
示例#2
0
def train_tree(model):
    """Train an arbitrary sklearn-style classifier with 5-fold stratified CV.

    Args:
        model: an estimator exposing fit / predict_proba (e.g.
            RandomForestClassifier). It is re-fit on each fold.

    Returns:
        Out-of-fold predicted probabilities for the positive class,
        as the 'y_pred' column (pd.Series) aligned to the training rows.
    """
    X, y = feature_engineering.get_train_data(use_over_sampler=True)
    data = pd.DataFrame(y)
    # k-fold CV bookkeeping
    models = []
    scores = []

    # shuffle=True is required when random_state is set (sklearn raises
    # otherwise) and matches the setup already used by train_PNN.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_val = X.loc[tdx], X.loc[vdx]
        y_train, y_val = y.loc[tdx], y.loc[vdx]
        y_true = y_val

        model.fit(X_train, y_train)

        y_pred = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_true, y_pred)
        print("AUC score at %d fold: %f" % (i, auc))
        scores.append(auc)
        models.append(model)
        data.loc[vdx, 'y_pred'] = y_pred

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-fold total mean_score:", mean_score)
    print("5-fold oof auc score:", oof)
    print("----train %s finish!----" % model.__class__.__name__)
    cal_roc_curve(data['y'], data['y_pred'], model.__class__.__name__)

    return data['y_pred']
示例#3
0
def train_cat(plot=False):
    """Train a CatBoostClassifier with 5-fold stratified cross-validation.

    Args:
        plot: if True, display a per-fold feature-importance bar chart.

    Returns:
        Out-of-fold predicted probabilities for the positive class,
        as the 'y_pred' column (pd.Series) aligned to the training rows.
    """
    X, y = feature_engineering.get_train_data(use_over_sampler=True)
    data = pd.DataFrame(y)

    # k-fold CV bookkeeping
    models = []
    scores = []

    # shuffle=True is required when random_state is set (sklearn raises
    # otherwise) and matches the setup already used by train_PNN.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_val = X.loc[tdx], X.loc[vdx]
        y_train, y_val = y.loc[tdx], y.loc[vdx]
        y_true = y_val

        params = {
            'learning_rate': .05,
            'n_estimators': 2000,
            'max_depth': 8,
            'max_bin': 127,
            'reg_lambda': 2,
            'subsample': .7,
            'one_hot_max_size': 2,
            'bootstrap_type': 'Bernoulli',
            'leaf_estimation_method': 'Newton',
            'random_state': 0
        }
        model = CatBoostClassifier().set_params(**params)
        # early stopping monitors the last eval_set entry (the val fold)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  early_stopping_rounds=50,
                  verbose=False)

        # plot feature importance
        if plot:
            fscores = pd.Series(model.feature_importances_,
                                X_train.columns).sort_values(ascending=False)
            fscores.plot(kind='bar',
                         title='Feature Importance %d' % i,
                         figsize=(20, 10))
            plt.ylabel('Feature Importance Score')
            plt.show()

        y_pred = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_true, y_pred)
        print("AUC score at %d fold: %f" % (i, auc))
        scores.append(auc)
        models.append(model)
        data.loc[vdx, 'y_pred'] = y_pred

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-fold total mean_score:", mean_score)
    print("5-fold oof auc score:", oof)
    print("----train %s finish!----" % model.__class__.__name__)
    cal_roc_curve(data['y'], data['y_pred'], model.__class__.__name__)

    return data['y_pred']
示例#4
0
def model_stacking():
    """Stack base-model OOF predictions with a logistic-regression meta-model.

    Collects out-of-fold predictions from LightGBM, CatBoost and DeepFM,
    uses them as meta-features, and fits a LogisticRegression per fold of
    a 5-fold stratified CV.

    Returns:
        Out-of-fold stacked probabilities for the positive class,
        as the 'y_pred' column (pd.Series) aligned to the training rows.
    """
    X, y = feature_engineering.get_train_data(use_over_sampler=True)
    data = pd.DataFrame(y)

    # base-model OOF predictions become the meta-features
    lgbm = LightGBM.train_lgbm(plot=False)
    print(lgbm.shape)
    cat = CatBoost.train_cat(plot=False)
    print(cat.shape)
    deepFM = DeepFM.train_DeepFM()
    print(deepFM.shape)

    X = pd.concat([lgbm, cat, deepFM], axis=1)
    print(X.shape)

    scores = []

    # shuffle=True is required when random_state is set (sklearn raises
    # otherwise) and matches the setup already used by train_PNN.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_val = X.loc[tdx], X.loc[vdx]
        y_train, y_val = y.loc[tdx], y.loc[vdx]
        y_true = y_val

        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_true, y_pred)
        print("AUC score at %d fold: %f" % (i, auc))
        scores.append(auc)
        data.loc[vdx, 'y_pred'] = y_pred

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-fold total mean_score:", mean_score)
    print("5-fold oof auc score:", oof)
    print("----train %s finish!----" % 'Stacking')
    cal_roc_curve(data['y'], data['y_pred'], 'Stacking')

    return data['y_pred']
示例#5
0
def train_PNN():
    """Train a PNN (Product-based Neural Network) with 5-fold stratified CV.

    Per fold: reuse cached best weights from disk if present, otherwise
    train with early stopping + checkpointing and reload the best weights.

    Returns:
        Out-of-fold predicted probabilities for the positive class,
        as the 'y_pred' column (pd.Series) aligned to the training rows.
    """
    X, y, sparse_list, dense_list = feature_engineering.get_NN_data(
        use_over_sampler=True)

    data = pd.DataFrame(y)
    dnn_feature_columns = linear_feature_columns = sparse_list + dense_list
    feature_names = inputs.get_feature_names(linear_feature_columns +
                                             dnn_feature_columns)

    # k-fold CV bookkeeping
    models = []
    scores = []

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_val = X.loc[tdx], X.loc[vdx]
        y_train, y_val = y.loc[tdx], y.loc[vdx]
        y_true = y_val

        # deepctr models consume dict-of-Series keyed by feature name
        X_train = {name: X_train[name] for name in feature_names}
        X_val = {name: X_val[name] for name in feature_names}

        model = PNN(dnn_feature_columns,
                    dnn_hidden_units=(128, 64),
                    task='binary',
                    dnn_dropout=0.5)

        best_param_path = './workspace/PNN/best_param_PNN.py_%d.h5' % i

        if os.path.exists(best_param_path):
            # cached weights from a previous run — skip training
            model.load_weights(best_param_path)
        else:
            model.compile("adam",
                          "binary_crossentropy",
                          metrics=['binary_crossentropy'])
            es = EarlyStopping(monitor='val_binary_crossentropy',
                               mode='min',
                               patience=15)
            mc = ModelCheckpoint(best_param_path,
                                 monitor='val_binary_crossentropy',
                                 mode='min',
                                 save_best_only=True,
                                 verbose=False,
                                 save_weights_only=True)
            model.fit(X_train,
                      y_train,
                      validation_data=(X_val, y_val),
                      batch_size=1024,
                      epochs=1000,
                      verbose=2,
                      callbacks=[es, mc])
            # restore the best checkpoint saved by ModelCheckpoint
            model.load_weights(best_param_path)

        y_pred = model.predict(X_val, batch_size=64).flatten()
        auc = roc_auc_score(y_true, y_pred)
        print("AUC score at %d fold: %f" % (i, auc))
        scores.append(auc)
        models.append(model)
        data.loc[vdx, 'y_pred'] = y_pred

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-fold total mean_score:", mean_score)
    print("5-fold oof auc score:", oof)
    print("----train %s finish!----" % 'PNN')
    cal_roc_curve(data['y'], data['y_pred'], 'PNN')

    return data['y_pred']
示例#6
0
def train_DeepFM():
    """Train a DeepFM model with 5-fold stratified cross-validation.

    Per fold: reuse cached best weights from disk if present, otherwise
    train with early stopping + checkpointing and reload the best weights.

    Returns:
        Out-of-fold predicted probabilities for the positive class,
        as the 'y_pred' column (pd.Series) aligned to the training rows.
    """
    X, y, sparse_list, dense_list = feature_engineering.get_NN_data(
        use_over_sampler=True)

    data = pd.DataFrame(y)
    dnn_feature_columns = linear_feature_columns = sparse_list + dense_list
    feature_names = inputs.get_feature_names(linear_feature_columns +
                                             dnn_feature_columns)

    # k-fold CV bookkeeping
    models = []
    scores = []

    # shuffle=True is required when random_state is set (sklearn raises
    # otherwise) and matches the setup already used by train_PNN.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_val = X.loc[tdx], X.loc[vdx]
        y_train, y_val = y.loc[tdx], y.loc[vdx]
        y_true = y_val

        # deepctr models consume dict-of-Series keyed by feature name
        X_train = {name: X_train[name] for name in feature_names}
        X_val = {name: X_val[name] for name in feature_names}

        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       dnn_hidden_units=(128, 64),
                       dnn_use_bn=True,
                       task='binary',
                       dnn_dropout=0.5)

        best_param_path = '/Users/a_piao/PycharmProjects/BankMarketing/workspace/DeepFM/best_param_DeepFM.py_%d.h5' % i

        if os.path.exists(best_param_path):
            # cached weights from a previous run — skip training
            model.load_weights(best_param_path)
        else:
            model.compile("adam",
                          "binary_crossentropy",
                          metrics=['binary_crossentropy'])
            es = EarlyStopping(monitor='val_binary_crossentropy',
                               mode='min',
                               patience=20)
            # save_best_only=True so the reload below restores the best
            # epoch's weights (matches train_PNN; previously the *last*
            # epoch was saved, defeating the checkpoint's purpose).
            mc = ModelCheckpoint(best_param_path,
                                 monitor='val_binary_crossentropy',
                                 mode='min',
                                 save_best_only=True,
                                 verbose=False,
                                 save_weights_only=True)
            model.fit(X_train,
                      y_train,
                      validation_data=(X_val, y_val),
                      batch_size=1024,
                      epochs=1000,
                      verbose=2,
                      callbacks=[es, mc])
            # restore the best checkpoint saved by ModelCheckpoint
            model.load_weights(best_param_path)

        y_pred = model.predict(X_val, batch_size=64).flatten()
        auc = roc_auc_score(y_true, y_pred)
        print("AUC score at %d fold: %f" % (i, auc))
        scores.append(auc)
        models.append(model)
        data.loc[vdx, 'y_pred'] = y_pred

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-fold total mean_score:", mean_score)
    print("5-fold oof auc score:", oof)
    print("----train %s finish!----" % 'DeepFM')
    cal_roc_curve(data['y'], data['y_pred'], 'DeepFM')

    return data['y_pred']