예제 #1
0
def main():
    train, test, macro = data_utils.load_data()
    train.fillna(0, inplace=True)
    test.fillna(0)
    mult = .969

    train['price_doc'] = train["price_doc"] * mult + 10
    # train['price_doc'] = np.log1p(train['price_doc'])
    y_train = train['price_doc']
    id_train = train['id']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    id_test = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # 合并训练集和测试集
    conbined_data = pd.concat([train[test.columns.values], test])
    conbined_data.drop(['timestamp'], axis=1, inplace=True)
    print "conbined_data:", conbined_data.shape

    # Deal with categorical values
    for c in conbined_data.columns:
        if conbined_data[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(conbined_data[c].values))
            conbined_data[c] = lbl.transform(list(conbined_data[c].values))

    del conbined_data['school_education_centers_raion_ratio_dis']
    del conbined_data['preschool_education_centers_raion_ratio_dis']
    del conbined_data['sport_objects_raion_ratio_dis']
    del conbined_data['additional_education_raion_ratio_dis']
    del conbined_data['0_6_all_vs_preschool_quota_dis']

    scaler = StandardScaler()
    conbined_data = scaler.fit_transform(conbined_data)

    train = conbined_data[:train.shape[0], :]
    test = conbined_data[train.shape[0]:, :]

    test_size = (1.0 * test.shape[0]) / train.shape[0]
    print "submit test size:", test_size

    ntrain = train.shape[0]
    ntest = test.shape[0]
    n_folds = 5
    random_seed = 0

    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)

    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((n_folds, ntest))

    for i, (train_index, test_index) in enumerate(kfold.split(train)):
        print 'fold-{}: train: {}, test: {}'.format(i, train_index, test_index)
        x_tr = train[train_index]
        y_tr = y_train[train_index]
        x_te = train[test_index]

        alphas = np.array([1,0.5, 0.1, 0.05,0.01, 0.005])
        solverOptions = (['svd', 'cholesky', 'sparse_cg', 'sag'])
        # create and fit a ridge regression model, testing each alpha
        model = Ridge(normalize=True, fit_intercept=True) #We have chosen to just normalize the data by default, you could GridsearchCV this is you wanted
        grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas, solver=solverOptions))
        grid.fit(x_tr, y_tr)

        # summarize the results of the grid search
        print 'best_score', grid.best_score_
        print 'alphas:', grid.best_estimator_.alpha
        print 'solverOptions:', grid.best_estimator_.solver

        model = Ridge(normalize=True, alpha=grid.best_estimator_.alpha, fit_intercept=True,
                      solver=grid.best_estimator_.solver)  # paramters tuned using GridSearchCV
        model.fit(x_tr, y_tr)

        train_rmse = mean_squared_error(y_tr, model.predict(x_tr))
        print 'train_rmse =', np.sqrt(train_rmse)

        oof_train[test_index] = model.predict(x_te)
        oof_test_skf[i, :] = model.predict(test)

    oof_test[:] = oof_test_skf.mean(axis=0)

    # 保存 oof 结果
    train_predict = pd.DataFrame({'id': id_train,
                                  'ridge_oof_predict': oof_train})
    test_predict = pd.DataFrame({'id': id_test,
                                 'ridge_oof_predict': oof_test})

    train_predict.to_csv(Configure.train_cv_result_for_model_stacking.format('ridge', time.strftime('%Y-%m-%d_%H:%M:%S', time.localtime(time.time()))),
                         index=False)
    test_predict.to_csv(Configure.test_cv_result_for_model_stacking.format('ridge', time.strftime('%Y-%m-%d_%H:%M:%S', time.localtime(time.time()))),
                         index=False)
sys.path.append(module_path)

import numpy as np
import pandas as pd
# remove warnings
import warnings

warnings.filterwarnings('ignore')
import xgboost as xgb
from sklearn.metrics import mean_squared_error
# my own module
from features import data_utils
from sklearn import preprocessing
from conf.configure import Configure

train, test, macro = data_utils.load_data()
print 'train:', train.shape
print 'test:', test.shape

# Deal with categorical values
for c in train.columns:
    if train[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[c].values))
        train[c] = lbl.transform(list(train[c].values))

for c in test.columns:
    if test[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(test[c].values))
        test[c] = lbl.transform(list(test[c].values))
def main():
    train, test, macro = data_utils.load_data()

    ylog_train_all = train['price_doc']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    test.drop(['id'], axis=1, inplace=True)

    # 合并训练集和测试集
    conbined_data = pd.concat([train[test.columns.values], test])

    conbined_data.columns = test.columns.values

    str_columns = conbined_data.select_dtypes(include=['object']).columns.values.tolist()

    # dummy code
    dummies_data = pd.get_dummies(conbined_data[str_columns])
    conbined_data[dummies_data.columns] = dummies_data[dummies_data.columns]
    conbined_data.drop(str_columns, axis=1, inplace=True)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    test_size = (1.0 * test.shape[0]) / (train.shape[0] + test.shape[0])
    print "submit test size:", test_size

    # Convert to numpy values
    X_all = train.values
    print(X_all.shape)

    # Create a validation set, with last 20% of data
    num_train = train.shape[0]
    num_val = int(num_train * 0.2)

    X_train_all = X_all[:num_train]
    X_train = X_all[:num_train - num_val]
    X_val = X_all[num_train - num_val:num_train]
    ylog_train = ylog_train_all[:-num_val]
    ylog_val = ylog_train_all[-num_val:]

    X_test = test

    df_columns = train.columns

    print('X_train_all shape is', X_train_all.shape)
    print('X_train shape is', X_train.shape)
    print('y_train shape is', ylog_train.shape)
    print('X_val shape is', X_val.shape)
    print('y_val shape is', ylog_val.shape)
    print('X_test shape is', X_test.shape)

    dtrain = xgb.DMatrix(X_train, ylog_train, feature_names=df_columns)
    dval = xgb.DMatrix(X_val, ylog_val, feature_names=df_columns)

    xgb_params = {
        'eta': 0.01,
        'max_depth': 5,
        'subsample': 0.7,
        'booster': 'dart',
        'colsample_bytree': 0.7,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1,
        'seed': 100
    }
    num_round = 500
    xgb_params['nthread'] = 24
    # param['eval_metric'] = "auc"
    plst = xgb_params.items()
    plst += [('eval_metric', 'rmse')]
    evallist = [(dval, 'eval')]

    bst = xgb.train(plst, dtrain, num_round, evallist, early_stopping_rounds=20, verbose_eval=10)

    num_boost_round = bst.best_iteration + 1
    print 'best_iteration: ', num_boost_round
def main():
    train, test, macro = data_utils.load_data()

    mult = .969

    train['price_doc'] = train["price_doc"] * mult + 10
    # train['price_doc'] = np.log1p(train['price_doc'])
    ylog_train_all = train['price_doc']
    id_train = train['id']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    submit_ids = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # 合并训练集和测试集
    conbined_data = pd.concat([train[test.columns.values], test])
    # macro_cols = ["balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
    #               "micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
    #               "income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build", "timestamp"]
    # conbined_data = pd.merge_ordered(conbined_data, macro[macro_cols], on='timestamp', how='left')

    conbined_data.drop(['timestamp'], axis=1, inplace=True)
    print "conbined_data:", conbined_data.shape

    # Deal with categorical values
    for c in conbined_data.columns:
        if conbined_data[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(conbined_data[c].values))
            conbined_data[c] = lbl.transform(list(conbined_data[c].values))

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    test_size = (1.0 * test.shape[0]) / train.shape[0]
    print "submit test size:", test_size

    # Convert to numpy values
    X_all = train.values

    # Create a validation set, with last 20% of data
    num_train = int(train.shape[0] / (1 + test_size))

    X_train_all = X_all
    X_train = X_all[:num_train]
    X_val = X_all[num_train:]
    ylog_train = ylog_train_all[:num_train]
    ylog_val = ylog_train_all[num_train:]
    X_test = test
    print "validate size:", 1.0 * X_val.shape[0] / X_train.shape[0]

    df_columns = train.columns

    print('X_train_all shape is', X_train_all.shape)
    print('X_train shape is', X_train.shape)
    print('y_train shape is', ylog_train.shape)
    print('X_val shape is', X_val.shape)
    print('y_val shape is', ylog_val.shape)
    print('X_test shape is', X_test.shape)

    dtrain_all = xgb.DMatrix(X_train_all,
                             ylog_train_all,
                             feature_names=df_columns)
    dtrain = xgb.DMatrix(X_train, ylog_train, feature_names=df_columns)
    dval = xgb.DMatrix(X_val, ylog_val, feature_names=df_columns)
    dtest = xgb.DMatrix(X_test, feature_names=df_columns)

    xgb_params = {
        'eta': 0.05,
        'max_depth': 5,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1
    }

    num_round = 1000
    xgb_params['nthread'] = 24
    evallist = [(dval, 'eval')]

    bst = xgb.train(xgb_params,
                    dtrain,
                    num_round,
                    evallist,
                    early_stopping_rounds=40,
                    verbose_eval=10)

    train_rmse = mean_squared_error(ylog_train, bst.predict(dtrain))
    val_rmse = mean_squared_error(ylog_val, bst.predict(dval))
    print 'train_rmse =', np.sqrt(train_rmse), ', val_rmse =', np.sqrt(
        val_rmse)

    num_boost_round = bst.best_iteration
    print 'best_iteration: ', num_boost_round
    model = xgb.train(dict(xgb_params, silent=1),
                      dtrain_all,
                      num_boost_round=num_boost_round)

    print 'predict submit...'
    y_pred = model.predict(dtest)
    # y_pred = np.exp(ylog_pred) - 1
    df_sub = pd.DataFrame({'id': submit_ids, 'price_doc': y_pred})
    df_sub.to_csv('xgboost_model_4.csv', index=False)  # 0.31499

    # save model
    model.save_model('xgboost_model4.model')
def main():
    train, test, macro = data_utils.load_data()

    mult = .969

    train['price_doc'] = train["price_doc"] * mult + 10
    y_train = train['price_doc']
    id_train = train['id']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    id_test = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # 合并训练集和测试集
    conbined_data = pd.concat([train[test.columns.values], test])
    conbined_data.drop(['timestamp'], axis=1, inplace=True)
    print "conbined_data:", conbined_data.shape

    # Deal with categorical values
    for c in conbined_data.columns:
        if conbined_data[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(conbined_data[c].values))
            conbined_data[c] = lbl.transform(list(conbined_data[c].values))

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    test_size = (1.0 * test.shape[0]) / train.shape[0]
    print "submit test size:", test_size

    # Convert to numpy values
    train = train.values
    test = test.values

    ntrain = train.shape[0]
    ntest = test.shape[0]
    n_folds = 5
    random_seed = 0

    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)

    oof_train = np.zeros((ntrain, ))
    oof_test = np.zeros((ntest, ))
    oof_test_skf = np.empty((n_folds, ntest))

    xgb_params = {
        'eta': 0.05,
        'max_depth': 5,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1,
        'nthread': 24
    }
    num_round = 1000

    dtest = xgb.DMatrix(test)

    for i, (train_index, test_index) in enumerate(kfold.split(train)):
        print 'fold-{}: train: {}, test: {}'.format(i, train_index, test_index)
        x_tr = train[train_index]
        y_tr = y_train[train_index]
        x_te = train[test_index]

        dtrain = xgb.DMatrix(x_tr, y_tr)
        dx_te = xgb.DMatrix(x_te)
        cv_output = xgb.cv(dict(xgb_params, silent=1),
                           dtrain,
                           num_boost_round=num_round,
                           early_stopping_rounds=40)
        num_boost_round = len(cv_output)
        print 'best_iteration: ', num_boost_round
        model = xgb.train(dict(xgb_params, silent=1),
                          dtrain,
                          num_boost_round=num_boost_round)
        train_rmse = mean_squared_error(y_tr, model.predict(dtrain))
        print 'train_rmse =', np.sqrt(train_rmse)

        oof_train[test_index] = model.predict(dx_te)
        oof_test_skf[i, :] = model.predict(dtest)

    oof_test[:] = oof_test_skf.mean(axis=0)

    # 保存 oof 结果
    train_predict = pd.DataFrame({
        'id': id_train,
        'xgboost_oof_predict': oof_train
    })
    test_predict = pd.DataFrame({
        'id': id_test,
        'xgboost_oof_predict': oof_test
    })

    train_predict.to_csv(Configure.train_cv_result_for_model_stacking.format(
        'xgboost',
        time.strftime('%Y-%m-%d_%H:%M:%S', time.localtime(time.time()))),
                         index=False)
    test_predict.to_csv(Configure.test_cv_result_for_model_stacking.format(
        'xgboost',
        time.strftime('%Y-%m-%d_%H:%M:%S', time.localtime(time.time()))),
                        index=False)