def main():
    train, test, macro = data_utils.load_data()
    train.fillna(0, inplace=True)
    test.fillna(0, inplace=True)

    # Slightly rescale the target (leaderboard calibration)
    mult = .969
    train['price_doc'] = train['price_doc'] * mult + 10
    # train['price_doc'] = np.log1p(train['price_doc'])
    y_train = train['price_doc']
    id_train = train['id']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    id_test = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the train and test sets
    conbined_data = pd.concat([train[test.columns.values], test])
    conbined_data.drop(['timestamp'], axis=1, inplace=True)
    print 'conbined_data:', conbined_data.shape

    # Deal with categorical values
    for c in conbined_data.columns:
        if conbined_data[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(conbined_data[c].values))
            conbined_data[c] = lbl.transform(list(conbined_data[c].values))

    del conbined_data['school_education_centers_raion_ratio_dis']
    del conbined_data['preschool_education_centers_raion_ratio_dis']
    del conbined_data['sport_objects_raion_ratio_dis']
    del conbined_data['additional_education_raion_ratio_dis']
    del conbined_data['0_6_all_vs_preschool_quota_dis']

    # Standardize features so that Ridge regularization treats them equally
    scaler = StandardScaler()
    conbined_data = scaler.fit_transform(conbined_data)

    train = conbined_data[:train.shape[0], :]
    test = conbined_data[train.shape[0]:, :]

    test_size = (1.0 * test.shape[0]) / train.shape[0]
    print 'submit test size:', test_size

    ntrain = train.shape[0]
    ntest = test.shape[0]
    n_folds = 5
    random_seed = 0
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)

    # Out-of-fold containers: one prediction per training row, plus
    # per-fold test predictions that are averaged after the loop
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((n_folds, ntest))

    for i, (train_index, test_index) in enumerate(kfold.split(train)):
        print 'fold-{}: train: {}, test: {}'.format(i, train_index, test_index)
        x_tr = train[train_index]
        y_tr = y_train[train_index]
        x_te = train[test_index]

        # Create and fit a ridge regression model, testing each alpha and solver.
        # The data is normalized by default; alpha and solver are tuned with GridSearchCV.
        alphas = np.array([1, 0.5, 0.1, 0.05, 0.01, 0.005])
        solver_options = ['svd', 'cholesky', 'sparse_cg', 'sag']
        model = Ridge(normalize=True, fit_intercept=True)
        grid = GridSearchCV(estimator=model,
                            param_grid=dict(alpha=alphas, solver=solver_options))
        grid.fit(x_tr, y_tr)

        # Summarize the results of the grid search
        print 'best_score:', grid.best_score_
        print 'alpha:', grid.best_estimator_.alpha
        print 'solver:', grid.best_estimator_.solver

        # Refit with the parameters tuned by GridSearchCV
        model = Ridge(normalize=True,
                      alpha=grid.best_estimator_.alpha,
                      fit_intercept=True,
                      solver=grid.best_estimator_.solver)
        model.fit(x_tr, y_tr)

        train_rmse = np.sqrt(mean_squared_error(y_tr, model.predict(x_tr)))
        print 'train_rmse =', train_rmse

        oof_train[test_index] = model.predict(x_te)
        oof_test_skf[i, :] = model.predict(test)

    oof_test[:] = oof_test_skf.mean(axis=0)

    # Save the OOF results for model stacking
    train_predict = pd.DataFrame({'id': id_train, 'ridge_oof_predict': oof_train})
    test_predict = pd.DataFrame({'id': id_test, 'ridge_oof_predict': oof_test})
    train_predict.to_csv(
        Configure.train_cv_result_for_model_stacking.format(
            'ridge', time.strftime('%Y-%m-%d_%H:%M:%S', time.localtime(time.time()))),
        index=False)
    test_predict.to_csv(
        Configure.test_cv_result_for_model_stacking.format(
            'ridge', time.strftime('%Y-%m-%d_%H:%M:%S', time.localtime(time.time()))),
        index=False)
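# Compatibility note (not part of the original script): Ridge's normalize
# parameter was deprecated in scikit-learn 1.0 and removed in 1.2. On newer
# versions, a rough equivalent is to standardize inside a Pipeline (the exact
# alpha scaling differs slightly from normalize=True; see the scikit-learn
# deprecation notes). A minimal sketch:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

ridge = make_pipeline(StandardScaler(), Ridge(alpha=0.1, fit_intercept=True))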
import sys
sys.path.append(module_path)  # module_path is assumed to be defined earlier

import numpy as np
import pandas as pd

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing

# my own modules
from features import data_utils
from conf.configure import Configure

train, test, macro = data_utils.load_data()
print 'train:', train.shape
print 'test:', test.shape

# Deal with categorical values.
# Note: fitting a separate LabelEncoder on train and on test can map the same
# category to different integer codes; the later scripts avoid this by
# encoding the combined data instead.
for c in train.columns:
    if train[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[c].values))
        train[c] = lbl.transform(list(train[c].values))

for c in test.columns:
    if test[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(test[c].values))
        test[c] = lbl.transform(list(test[c].values))
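# A tiny demonstration (not from the original repo) of why per-set encoders
# are risky: the same category gets different integer codes when the train
# and test sets contain different category values.
from sklearn import preprocessing

enc_train = preprocessing.LabelEncoder().fit(['brick', 'panel', 'wood'])
enc_test = preprocessing.LabelEncoder().fit(['panel', 'wood'])
print enc_train.transform(['panel'])  # [1]
print enc_test.transform(['panel'])   # [0] -- same category, different code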
def main():
    train, test, macro = data_utils.load_data()

    ylog_train_all = train['price_doc']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    test.drop(['id'], axis=1, inplace=True)

    # Merge the train and test sets
    conbined_data = pd.concat([train[test.columns.values], test])
    conbined_data.columns = test.columns.values

    # One-hot encode the categorical (object) columns
    str_columns = conbined_data.select_dtypes(include=['object']).columns.values.tolist()
    dummies_data = pd.get_dummies(conbined_data[str_columns])
    conbined_data[dummies_data.columns] = dummies_data[dummies_data.columns]
    conbined_data.drop(str_columns, axis=1, inplace=True)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    test_size = (1.0 * test.shape[0]) / (train.shape[0] + test.shape[0])
    print 'submit test size:', test_size

    # Convert to numpy values
    X_all = train.values
    print X_all.shape

    # Create a validation set from the last 20% of the training data
    num_train = train.shape[0]
    num_val = int(num_train * 0.2)

    X_train_all = X_all[:num_train]
    X_train = X_all[:num_train - num_val]
    X_val = X_all[num_train - num_val:num_train]
    ylog_train = ylog_train_all[:-num_val]
    ylog_val = ylog_train_all[-num_val:]
    X_test = test.values

    df_columns = train.columns

    print 'X_train_all shape is', X_train_all.shape
    print 'X_train shape is', X_train.shape
    print 'y_train shape is', ylog_train.shape
    print 'X_val shape is', X_val.shape
    print 'y_val shape is', ylog_val.shape
    print 'X_test shape is', X_test.shape

    dtrain = xgb.DMatrix(X_train, ylog_train, feature_names=df_columns)
    dval = xgb.DMatrix(X_val, ylog_val, feature_names=df_columns)

    xgb_params = {
        'eta': 0.01,
        'max_depth': 5,
        'subsample': 0.7,
        'booster': 'dart',
        'colsample_bytree': 0.7,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1,
        'seed': 100
    }
    num_round = 500
    xgb_params['nthread'] = 24

    plst = list(xgb_params.items())
    evallist = [(dval, 'eval')]
    bst = xgb.train(plst, dtrain, num_round, evallist,
                    early_stopping_rounds=20, verbose_eval=10)

    # best_iteration is zero-based, so the optimal round count is one more
    num_boost_round = bst.best_iteration + 1
    print 'best_iteration:', num_boost_round
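# Dart-booster caveat (per the XGBoost docs for the older API used here):
# predict() on a 'dart' model performs dropout unless ntree_limit is nonzero,
# so held-out predictions should pass it explicitly. A self-contained sketch
# on synthetic data; the parameter values are illustrative only:
import numpy as np
import xgboost as xgb

rng = np.random.RandomState(0)
X = rng.rand(100, 5)
y = X.sum(axis=1)
dmat = xgb.DMatrix(X, y)
params = {'booster': 'dart', 'objective': 'reg:linear', 'eta': 0.1, 'silent': 1}
bst = xgb.train(params, dmat, num_boost_round=50)
preds = bst.predict(dmat, ntree_limit=50)  # evaluate all trees, no dropout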
def main():
    train, test, macro = data_utils.load_data()

    # Slightly rescale the target (leaderboard calibration)
    mult = .969
    train['price_doc'] = train['price_doc'] * mult + 10
    # train['price_doc'] = np.log1p(train['price_doc'])
    ylog_train_all = train['price_doc']
    id_train = train['id']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    submit_ids = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the train and test sets
    conbined_data = pd.concat([train[test.columns.values], test])

    # macro_cols = ["balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
    #               "micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
    #               "income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build", "timestamp"]
    # conbined_data = pd.merge_ordered(conbined_data, macro[macro_cols], on='timestamp', how='left')

    conbined_data.drop(['timestamp'], axis=1, inplace=True)
    print 'conbined_data:', conbined_data.shape

    # Deal with categorical values
    for c in conbined_data.columns:
        if conbined_data[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(conbined_data[c].values))
            conbined_data[c] = lbl.transform(list(conbined_data[c].values))

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    test_size = (1.0 * test.shape[0]) / train.shape[0]
    print 'submit test size:', test_size

    # Convert to numpy values
    X_all = train.values

    # Hold out the last part of the training data as a validation set
    num_train = int(train.shape[0] / (1 + test_size))
    X_train_all = X_all
    X_train = X_all[:num_train]
    X_val = X_all[num_train:]
    ylog_train = ylog_train_all[:num_train]
    ylog_val = ylog_train_all[num_train:]
    X_test = test.values

    print 'validate size:', 1.0 * X_val.shape[0] / X_train.shape[0]

    df_columns = train.columns
    print 'X_train_all shape is', X_train_all.shape
    print 'X_train shape is', X_train.shape
    print 'y_train shape is', ylog_train.shape
    print 'X_val shape is', X_val.shape
    print 'y_val shape is', ylog_val.shape
    print 'X_test shape is', X_test.shape

    dtrain_all = xgb.DMatrix(X_train_all, ylog_train_all, feature_names=df_columns)
    dtrain = xgb.DMatrix(X_train, ylog_train, feature_names=df_columns)
    dval = xgb.DMatrix(X_val, ylog_val, feature_names=df_columns)
    dtest = xgb.DMatrix(X_test, feature_names=df_columns)

    xgb_params = {
        'eta': 0.05,
        'max_depth': 5,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1,
        'nthread': 24
    }
    num_round = 1000

    evallist = [(dval, 'eval')]
    bst = xgb.train(xgb_params, dtrain, num_round, evallist,
                    early_stopping_rounds=40, verbose_eval=10)

    train_rmse = np.sqrt(mean_squared_error(ylog_train, bst.predict(dtrain)))
    val_rmse = np.sqrt(mean_squared_error(ylog_val, bst.predict(dval)))
    print 'train_rmse =', train_rmse, ', val_rmse =', val_rmse

    # best_iteration is zero-based, so the optimal round count is one more
    num_boost_round = bst.best_iteration + 1
    print 'best_iteration:', num_boost_round

    # Retrain on all training data with the round count found above
    model = xgb.train(dict(xgb_params, silent=1), dtrain_all,
                      num_boost_round=num_boost_round)

    print 'predict submit...'
    y_pred = model.predict(dtest)
    # y_pred = np.exp(y_pred) - 1
    df_sub = pd.DataFrame({'id': submit_ids, 'price_doc': y_pred})
    df_sub.to_csv('xgboost_model_4.csv', index=False)  # 0.31499

    # Save the model
    model.save_model('xgboost_model4.model')
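# Reloading the saved model later (a minimal sketch, not in the original
# script; the file name matches the save_model call above, everything else
# is a placeholder):
import xgboost as xgb

model = xgb.Booster()
model.load_model('xgboost_model4.model')
# y_pred = model.predict(xgb.DMatrix(X_test, feature_names=df_columns))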
def main():
    train, test, macro = data_utils.load_data()

    # Slightly rescale the target (leaderboard calibration)
    mult = .969
    train['price_doc'] = train['price_doc'] * mult + 10
    y_train = train['price_doc']
    id_train = train['id']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    id_test = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the train and test sets
    conbined_data = pd.concat([train[test.columns.values], test])
    conbined_data.drop(['timestamp'], axis=1, inplace=True)
    print 'conbined_data:', conbined_data.shape

    # Deal with categorical values
    for c in conbined_data.columns:
        if conbined_data[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(conbined_data[c].values))
            conbined_data[c] = lbl.transform(list(conbined_data[c].values))

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    test_size = (1.0 * test.shape[0]) / train.shape[0]
    print 'submit test size:', test_size

    # Convert to numpy values
    train = train.values
    test = test.values

    ntrain = train.shape[0]
    ntest = test.shape[0]
    n_folds = 5
    random_seed = 0
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)

    # Out-of-fold containers: one prediction per training row, plus
    # per-fold test predictions that are averaged after the loop
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((n_folds, ntest))

    xgb_params = {
        'eta': 0.05,
        'max_depth': 5,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1,
        'nthread': 24
    }
    num_round = 1000
    dtest = xgb.DMatrix(test)

    for i, (train_index, test_index) in enumerate(kfold.split(train)):
        print 'fold-{}: train: {}, test: {}'.format(i, train_index, test_index)
        x_tr = train[train_index]
        y_tr = y_train[train_index]
        x_te = train[test_index]

        dtrain = xgb.DMatrix(x_tr, y_tr)
        dx_te = xgb.DMatrix(x_te)

        # Use xgb.cv to pick the number of boosting rounds for this fold
        cv_output = xgb.cv(dict(xgb_params, silent=1), dtrain,
                           num_boost_round=num_round, early_stopping_rounds=40)
        num_boost_round = len(cv_output)
        print 'best_iteration:', num_boost_round

        model = xgb.train(dict(xgb_params, silent=1), dtrain,
                          num_boost_round=num_boost_round)

        train_rmse = np.sqrt(mean_squared_error(y_tr, model.predict(dtrain)))
        print 'train_rmse =', train_rmse

        oof_train[test_index] = model.predict(dx_te)
        oof_test_skf[i, :] = model.predict(dtest)

    # Average the per-fold test predictions
    oof_test[:] = oof_test_skf.mean(axis=0)

    # Save the OOF results for model stacking
    train_predict = pd.DataFrame({'id': id_train, 'xgboost_oof_predict': oof_train})
    test_predict = pd.DataFrame({'id': id_test, 'xgboost_oof_predict': oof_test})
    train_predict.to_csv(
        Configure.train_cv_result_for_model_stacking.format(
            'xgboost', time.strftime('%Y-%m-%d_%H:%M:%S', time.localtime(time.time()))),
        index=False)
    test_predict.to_csv(
        Configure.test_cv_result_for_model_stacking.format(
            'xgboost', time.strftime('%Y-%m-%d_%H:%M:%S', time.localtime(time.time()))),
        index=False)
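# How these OOF files are meant to be used (a minimal sketch, not from the
# original repo): merge the ridge and xgboost OOF predictions on 'id' and fit
# a second-level model on them. All file names here are hypothetical
# placeholders; the real paths come from the Configure templates above.
import pandas as pd
from sklearn.linear_model import LinearRegression

ridge_oof = pd.read_csv('ridge_oof_train.csv')      # hypothetical path
xgb_oof = pd.read_csv('xgboost_oof_train.csv')      # hypothetical path
stack_train = ridge_oof.merge(xgb_oof, on='id')

X_meta = stack_train[['ridge_oof_predict', 'xgboost_oof_predict']].values
y_meta = pd.read_csv('train_target.csv')['price_doc'].values  # hypothetical target file

meta_model = LinearRegression()
meta_model.fit(X_meta, y_meta)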