Example #1
def main_crossvalid(frm, to):
    nfold = 5
    df, y, testing, ready_df, tfvocab, predictors, len_train, categorical, tfvocab, testdex = get_crossvalid_data(
        frm, to)

    lgbm_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 270,  # 37,
        'feature_fraction': 0.4,
        'bagging_fraction': 0.65,
        'bagging_freq': 2,
        'learning_rate': 0.016,
        #'max_depth' : 8,
        #'min_split_gain' : 0.0222415,
        #'min_child_weight' : 20,
        'nthread': 5,
        'verbose': 0,
        #'reg_alpha' : 0.041545473,
        #'reg_lambda' : 0.0735294,
        'drop_rate': 0.08
    }

    skf = StratifiedKFold(y, n_folds=nfold)

    for i, (train_split, val_split) in enumerate(skf):
        #X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05, random_state=5)
        print(train_split)
        X_train = hstack(
            [csr_matrix(df.iloc[train_split].values), ready_df[train_split]])
        X_valid = hstack(
            [csr_matrix(df.iloc[val_split].values),
             ready_df[val_split]])  # Sparse Matrix
        y_train = y[train_split]
        y_valid = y[val_split]

        lgtrain = lgb.Dataset(X_train,
                              y_train,
                              feature_name=tfvocab,
                              categorical_feature=categorical)
        lgvalid = lgb.Dataset(X_valid,
                              y_valid,
                              feature_name=tfvocab,
                              categorical_feature=categorical)

        modelstart = time.time()
        lgb_clf = lgb.train(lgbm_params,
                            lgtrain,
                            num_boost_round=26000,
                            valid_sets=[lgtrain, lgvalid],
                            valid_names=['train', 'valid'],
                            early_stopping_rounds=100,
                            verbose_eval=100)

        print("Model Evaluation Stage")
        rmse = np.sqrt(
            metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid)))
        print('RMSE:', rmse)

        f, ax = plt.subplots(figsize=[7, 10])
        lgb.plot_importance(lgb_clf, max_num_features=100, ax=ax)
        plt.title("Light GBM Feature Importance")
        plt.savefig('feature_import.png', bbox_inches='tight')

        str_now = datetime.now().strftime("%m-%d-%H-%M")
        if not debug:
            lgb_clf.save_model('../model/model_{}.txt'.format(i),
                               lgb_clf.best_iteration)
        else:
            lgb_clf.save_model('../model/model_debug_{}.txt'.format(i),
                               lgb_clf.best_iteration)

        lgpred = lgb_clf.predict(testing, num_iteration=lgb_clf.best_iteration)
        lgsub = pd.DataFrame(lgpred,
                             columns=["deal_probability"],
                             index=testdex)
        lgsub['deal_probability'].clip(0.0, 1.0,
                                       inplace=True)  # Between 0 and 1

        subfile = '../result/dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/dense_feature_debug{}.csv'.format(i)
        kaggle_util.save_result(lgsub,
                                subfile,
                                competition='avito-demand-prediction',
                                send=False,
                                index=True)

    result_list = []
    for i in range(nfold):
        subfile = '../result/dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/dense_feature_debug{}.csv'.format(i)
        result_list.append((subfile, 1 / nfold))

    kaggle_util.ensemble(result_list,
                         False,
                         competition='avito-demand-prediction',
                         score_col='deal_probability',
                         prefix='lgb_avg')
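The fold loop above uses the pre-0.18 `sklearn.cross_validation` signature `StratifiedKFold(y, n_folds=nfold)`, which iterates directly over the splitter and has since been removed from scikit-learn. A minimal sketch of the same loop under the current `sklearn.model_selection` API, assuming the same `df`, `ready_df`, and `y` returned by `get_crossvalid_data` (plain `KFold` is used here because the target is a continuous probability):

# Sketch only: modern scikit-learn equivalent of the legacy fold loop above.
from sklearn.model_selection import KFold

kf = KFold(n_splits=nfold, shuffle=True, random_state=42)
for i, (train_split, val_split) in enumerate(kf.split(df)):
    X_train = hstack([csr_matrix(df.iloc[train_split].values), ready_df[train_split]])
    X_valid = hstack([csr_matrix(df.iloc[val_split].values), ready_df[val_split]])
    y_train, y_valid = y[train_split], y[val_split]
    # ...train and evaluate exactly as in the body above...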
Example #2
def main_crossvalid_xgboost(frm, to):
    import xgboost as xgb

    nfold = 5
    df, y, testing, ready_df, tfvocab, predictors, len_train, categorical, tfvocab, testdex = get_crossvalid_data(
        frm, to)

    cat_features = []
    cols = list(df.columns)
    for col in categorical:
        cat_features.append(cols.index(col))

    #lgtest = xgb.DMatrix(testing.toarray())
    #del testing
    #gc.collect()

    skf = StratifiedKFold(y, n_folds=nfold)

    for i, (train_split, val_split) in enumerate(skf):
        #X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05, random_state=5)
        print(train_split)
        X_train = hstack(
            [csr_matrix(df.iloc[train_split].values), ready_df[train_split]])
        X_valid = hstack(
            [csr_matrix(df.iloc[val_split].values),
             ready_df[val_split]])  # Sparse Matrix
        y_train = y[train_split]
        y_valid = y[val_split]

        #lgtrain = xgb.DMatrix(X_train.toarray(), label = y_train)
        #lgvalid = xgb.DMatrix(X_valid.toarray(), label = y_valid)

        #del X_train, X_valid, y_train
        #gc.collect()

        modelstart = time.time()

        bst = xgb.XGBRegressor(n_estimators=400,
                               booster='gbtree',
                               learning_rate=0.016,
                               gamma=0,
                               subsample=0.75,
                               colsample_bylevel=0.5,
                               max_depth=16,
                               nthread=6)

        bst.fit(X_train,
                y_train,
                eval_set=[(X_train, y_train), (X_valid, y_valid)],
                verbose=False,
                early_stopping_rounds=100)

        print("Model Evaluation Stage")
        ypre = bst.predict(X_valid)
        rmse = np.sqrt(metrics.mean_squared_error(y_valid, ypre))
        print('RMSE:', rmse)
        """
        f, ax = plt.subplots(figsize=[7,10])
        xgb.plot_importance(bst, ax=ax, max_num_features = 50)
        plt.title("Light GBM Feature Importance")
        plt.savefig('xgb_feature_import.png', bbox_inches='tight')
        """

        lgpred = bst.predict(testing)
        lgsub = pd.DataFrame(lgpred,
                             columns=["deal_probability"],
                             index=testdex)
        lgsub['deal_probability'].clip(0.0, 1.0,
                                       inplace=True)  # Between 0 and 1

        subfile = '../result/xgb_dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/xgb_dense_feature_debug{}.csv'.format(i)
        kaggle_util.save_result(lgsub,
                                subfile,
                                competition='avito-demand-prediction',
                                send=False,
                                index=True)

    result_list = []
    for i in range(nfold):
        subfile = '../result/xgb_dense_feature_{}.csv'.format(i)
        if debug:
            subfile = '../result/xgb_dense_feature_debug{}.csv'.format(i)
        result_list.append((subfile, 1 / nfold))

    kaggle_util.ensemble(result_list,
                         not debug,
                         competition='avito-demand-prediction',
                         score_col='deal_probability',
                         prefix='xgb_avg')
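The fit above passes `early_stopping_rounds` to `XGBRegressor.fit()`, which newer xgboost releases no longer accept; since xgboost 1.6 early stopping and the evaluation metric are configured on the estimator itself. A minimal sketch of the equivalent setup under the newer API, reusing the fold data built inside the loop:

# Sketch only: same regressor configured for current xgboost releases,
# where early stopping is set on the estimator rather than in fit().
bst = xgb.XGBRegressor(n_estimators=400,
                       booster='gbtree',
                       learning_rate=0.016,
                       gamma=0,
                       subsample=0.75,
                       colsample_bylevel=0.5,
                       max_depth=16,
                       n_jobs=6,                    # replaces the older nthread alias
                       eval_metric='rmse',
                       early_stopping_rounds=100)
bst.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=False)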
Example #3
File: rnn_total.py  Project: kownse/Avito
                             columns=["deal_probability"],
                             index=testdex)
        lgsub['deal_probability'].clip(0.0, 1.0,
                                       inplace=True)  # Between 0 and 1
        del modelRNN
        gc.collect()

        print("Number of folds completed...." + str(k))
        #print(Kfold_preds_final[k][0:10])
        k += 1
        K.clear_session()

        kaggle_util.save_result(lgsub,
                                '../result/rnn_{}.csv'.format(k),
                                competition='avito-demand-prediction',
                                send=False,
                                index=True)

    print("All Folds completed" + str(k + 1))
    print("RNN FOLD MODEL Done")

    result_list = []
    for i in range(nfold):
        subfile = 'rnn_{}.csv'.format(i)
        result_list.append((subfile, 1 / nfold))

    kaggle_util.ensemble(result_list,
                         not debug,
                         competition='avito-demand-prediction',
                         score_col='deal_probability',
                         prefix='rnn_avg')
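`kaggle_util` is a project-specific helper from the kownse/Avito repo, so its implementation is not part of these excerpts. Judging only from how `ensemble` is called (a list of `(csv_path, weight)` pairs plus a `score_col`), a minimal pandas stand-in that reproduces the implied weighted averaging might look like the sketch below; the function name and behaviour are assumptions, not the repo's actual code:

# Sketch only: hypothetical stand-in for kaggle_util.ensemble, assuming it
# takes a weighted average of score_col across the listed submission CSVs.
import pandas as pd

def ensemble_average(result_list, out_path, score_col='deal_probability'):
    blended = None
    for path, weight in result_list:
        sub = pd.read_csv(path, index_col=0)
        part = sub[score_col] * weight
        blended = part if blended is None else blended + part
    out = blended.clip(0.0, 1.0).to_frame(score_col)   # keep predictions in [0, 1]
    out.to_csv(out_path)
    return out

Under that assumption, the fold average above would correspond to something like `ensemble_average(result_list, '../result/rnn_avg.csv')`.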