def bad_model(df, train_ixs, feats):
    """Fit a baseline LinearRegression on the training rows and predict all rows.

    Args:
        df: DataFrame holding the feature columns and a 'logerror' target column.
        train_ixs: positional (iloc) indices selecting the training rows.
        feats: list of feature column names to train on.

    Returns:
        Model predictions for every row of `df` (numpy array).
    """
    train = df.iloc[train_ixs, :]
    # NOTE(review): `normalize=` was deprecated and then removed from
    # scikit-learn's LinearRegression; on modern sklearn this call raises.
    # Behaviour kept as-is here -- migrate to a StandardScaler pipeline.
    model = LinearRegression(fit_intercept=False, normalize=True, copy_X=True,
                             n_jobs=-1)
    model.fit(train[feats], train['logerror'])
    # Train-set MAE as a quick sanity check (was a py2 `print` statement;
    # parenthesised form works on both py2 and py3 for a single argument).
    # NOTE(review): this is the only use of `nptools` -- everywhere else the
    # file calls `tools.get_mae_loss`; confirm it is not a typo.
    print(nptools.get_mae_loss(train['logerror'].values,
                               model.predict(train[feats])))
    return model.predict(df[feats])
def optim_func(weights, preds, targets):
    """Objective for ensemble-weight optimisation.

    Blends the per-model predictions with `weights` and returns the MAE of
    the blend against `targets`, scaled by 1e6 so the optimiser operates on
    a numerically comfortable magnitude.
    """
    blended = ktools.ensemble_preds(preds, weights)
    return 1000000 * tools.get_mae_loss(blended, targets)
# Fragment (collapsed onto one line): level-2 stacking step. Trains a 5-bag
# XGBoost ensemble on the out-of-fold prediction matrix (new_train/new_test),
# averages the bags, prints the train MAE, then fits a Keras net on the same
# matrix. NOTE(review): the line opens with the tail of a read-pickle call
# whose beginning is outside this view; `nb_epoch=` is the legacy Keras 1.x
# kwarg (renamed `epochs=` in Keras 2) -- confirm the pinned Keras version.
cache_dir + 'ps2_test_2ndx{}_f{}.pkl'.format(n_models, n_folds)) print('XGBoost... ') params = model_params.get_lvl2() dtrain = xgb.DMatrix(new_train.values, train_targets) dtest = xgb.DMatrix(new_test.values) preds_train_xgb = np.zeros(len(new_train)) preds_test_xgb = np.zeros(len(new_test)) n_bags = 5 for i in range(n_bags): model = xgb.train(params, dtrain, num_boost_round=300, verbose_eval=2) preds_train_xgb += model.predict(dtrain) preds_test_xgb += model.predict(dtest) preds_train_xgb /= n_bags preds_test_xgb /= n_bags score = tools.get_mae_loss(train_targets, preds_train_xgb) print('train score:{}'.format(score)) # ############Keras print('nnet... ') x_train = new_train.values x_test = new_test.values model = model_params.get_lvl2nn(x_train.shape[1]) batch_size = 256 epochs = 10 history = model.fit(x_train, train_targets, nb_epoch=epochs, batch_size=batch_size) model.history = history preds_train_nn = model.predict(x_train).squeeze()
# Fragment (collapsed onto one line): Keras model evaluation under CV.
# Drops 'assessmentyear', splits train/val, trains `n_bags` bagged Keras
# models (normalising features each bag), averages the bag predictions and
# logs MAE/MSE. NOTE(review): py2 `print` statements and legacy Keras
# `nb_epoch=` kwarg; the `else` branch is a stub ('hola') -- full-training
# path appears commented out.
df = data.select_features(df) df = df.drop(['assessmentyear'], axis=1) print df.columns if cv_flag: df_full_train, targets, df_test = data.split_data(df, logerror) df_train, df_val, train_targets, val_targets = data.split_cv(df_full_train, targets, cv_split_ratio) cv_preds = np.repeat(0., len(df_val)) for i in range(n_bags): x_train, x_val = tools.normalise_data(df_train.values, df_val.values) model = model_params.get_keras(x_train.shape[1]) history = model.fit( x_train, train_targets, nb_epoch=epochs, batch_size=batch_size, validation_data=(x_val, val_targets), verbose=2) model.history = history cv_preds += model.predict(x_val).squeeze() cv_preds /= float(n_bags) mae = tools.get_mae_loss(val_targets, cv_preds) mse = mean_squared_error(val_targets, cv_preds) msg = 'mae: {}, mse: {}, keras! train_data ratio: {}, bags:{}, epochs:{}'.format(mae, mse, cv_split_ratio, n_bags, epochs) print(msg), logger.debug(msg) else: print 'hola' ###training full: #data.generate_simple_kaggle_file(final_preds, 'bagged_{}'.format(n_bags))
# Fragment (collapsed onto one line): tail of an XGBoost CV-bagging loop.
# Each iteration trains with early stopping, accumulates test-fold
# predictions, and rebuilds the next bag's DMatrix after dropping some
# outliers; afterwards the bag average is scored (MAE/MSE) and logged.
# The `else` branch begins the full-training path (DMatrix on all data,
# get_xtune11k params). NOTE(review): starts mid-loop -- the `for` header
# and earlier setup are outside this view; `num_boost_rounds = 155` is set
# inside the loop for subsequent bags while the initial value is unseen.
model = xgb.train(params, dtrain, num_boost_round=num_boost_rounds, evals=watchlist, early_stopping_rounds=50) cv_preds = model.predict(dtest) + cv_preds #prepare for the next iteration df_bag, bag_targets = delete_some_outliers(df_train, train_targets) dtrain = xgb.DMatrix(df_bag.values, bag_targets) print(i, df_bag.shape) #params['seed'] = i num_boost_rounds = 155 cv_preds = cv_preds / n_bags mae = tools.get_mae_loss(test_targets, cv_preds) mse = mean_squared_error(test_targets, cv_preds) msg = 'mae: {}, mse: {}, train_data ratio: {}, bags:{}, r:{}'.format( mae, mse, cv_split_ratio, n_bags, num_boost_rounds) print(msg), logger.debug(msg) else: ###training full: df_train, targets, df_test = data.split_data(df, logerror) dtest = xgb.DMatrix(df_test.values) dtrain = xgb.DMatrix(df_train.values, targets) params = model_params.get_xtune11k() sub_preds = np.repeat(0, len(df_test)) num_boost_rounds = 110
# Fragment (collapsed onto one line): loads cached out-of-fold ('ps_train')
# and test ('ps_test') prediction frames for two stacking configurations,
# builds a 'cat_weird' feature as cat + keras predictions, and prints the
# per-model train MAE for each base model. NOTE(review): py2 `print`
# statements throughout; the trailing commented-out weight vectors record
# previous manual ensemble experiments.
df_train, targets, df_test = data.split_data(df, logerror) new_train = tools.read_pickle( cache_dir + 'ps_train_2ndx{}_f{}.pkl'.format(n_models, n_folds)) new_test = tools.read_pickle( cache_dir + 'ps_test_2ndx{}_f{}.pkl'.format(n_models, n_folds)) new_train0 = tools.read_pickle( cache_dir + 'ps_train_2ndx{}_f{}.pkl'.format(5, n_folds)) new_test0 = tools.read_pickle(cache_dir + 'ps_test_2ndx{}_f{}.pkl'.format(5, n_folds)) new_train['cat_weird'] = new_train['cat_preds'] + new_train['ker_preds'] new_test['cat_weird'] = new_test['cat_preds'] + new_test['ker_preds'] print 'score cat', tools.get_mae_loss(targets, new_train['cat_preds']) print 'score xgb', tools.get_mae_loss(targets, new_train['xgb_preds']) print 'score lgb', tools.get_mae_loss(targets, new_train['lgb_preds']) print 'score keras', tools.get_mae_loss(targets, new_train['ker_preds']) print 'score cat2', tools.get_mae_loss(targets, new_train['cat2_preds']) print 'score cat3', tools.get_mae_loss(targets, new_train['cat3_preds']) print 'score cat4', tools.get_mae_loss(targets, new_train['cat4_preds']) print 'score cat weird', tools.get_mae_loss(targets, new_train['cat_weird']) train0 = np.zeros(len(new_train)) test0 = np.zeros(len(new_test)) #weirdness didnt work too well with 0.4, try 0.55 #weights = [0.34, 0.02, 0.06, 0.08, 0.02, 0.08, 0.42, .02] (best cv legal) #weights = [.55, .02, .07, 0.09, 0.02] (weird submission)
# Fragment (collapsed onto one line): single-LightGBM pipeline. Chooses
# CV or leaderboard index splits, engineers/selects features, fits an
# LGBMRegressor with early stopping; under CV it prints the test MAE,
# otherwise it writes a Kaggle submission file. NOTE(review): when not
# evaluating CV the eval_set is the training data itself (early stopping
# on train) and `predict` is called twice -- possibly intentional, worth
# confirming; also a py2 `print df.columns` statement.
if evaluate_cv: df, targets, train_ixs, test_ixs = data.get_cv_ixs(df, targets) else: train_ixs, test_ixs = data.get_lb_ixs(targets) df = features.add_features(df, train_ixs) df = data.select_features(df) print df.columns df_train, train_targets = df.iloc[train_ixs], targets[train_ixs] if evaluate_cv: df_test, test_targets = df.iloc[test_ixs], targets[test_ixs] eval_set = [(df_test.values, test_targets)] else: df_test = df.iloc[test_ixs] eval_set = [(df_train.values, train_targets)] params = model_params.get_ltune7k(num_rounds=nrounds) model = LGBMRegressor(**params) model.fit(df_train.values, train_targets, eval_set=eval_set, early_stopping_rounds=80) predictions = model.predict(df_test) if evaluate_cv: print(tools.get_mae_loss(test_targets, predictions)) if not evaluate_cv: predictions = model.predict(df_test) data.generate_simple_kaggle_file(predictions, 'sub_singlelgb_quasies')
# Fragment (collapsed onto one line): loads the level-2 out-of-fold and
# test prediction frames, prints the train MAE per base-model column, then
# runs scipy-style weight optimisation (`optimise_weights`, defined
# elsewhere) from uniform 0.1 starting weights and reports the objective
# value and the optimised weight vector. NOTE(review): py2 `print`
# statements; the commented-out 'cat_weird'/'zero' columns are disabled
# experiments kept for reference.
df_test = df.iloc[test_ixs] new_train = ss.io.read_pickle( cache_dir + 'ps2_train_2ndx{}_f{}.pkl'.format(n_models, n_folds)) new_test = ss.io.read_pickle( cache_dir + 'ps2_test_2ndx{}_f{}.pkl'.format(n_models, n_folds)) cols = new_train.columns # new_train['cat_weird'] = new_train['cat_preds'] + new_train['ker_preds'] # new_test['cat_weird'] = new_test['cat_preds'] + new_test['ker_preds'] # new_train['zero'] = np.repeat(0, len(new_train)) # new_test['zero'] = np.repeat(0, len(new_test)) # cols = ['cat_weird', 'zero'] + list(cols) for col in cols: print 'score {}'.format(col), tools.get_mae_loss( train_targets, new_train[col].values) new_train = new_train[cols] new_test = new_test[cols] init_weights = np.repeat(0.1, n_models) # init_weights = [0.1 0.1, 0.1, 0.1, 0.06, 0.08, 0.02, 0.08, 0.42] # (best cv legal) all_train_preds = convert_preds_to_list(new_train) optim = optimise_weights(all_train_preds, train_targets, init_weights, minimise=True) print "-", optim.fun optimised_weights = optim.x
# Fragment (collapsed onto one line): CatBoost evaluation under CV with
# bagging. Splits train/val, fits a CatBoostRegressor per bag with an eval
# set, accumulates validation predictions, then scores the bag average
# (MAE/MSE) and logs it. NOTE(review): `cv_preds = np.repeat(0, ...)` is an
# int array -- the later `+` still yields floats here, but `np.repeat(0., ...)`
# (as used on L4) would be safer; the outlier-deletion rebagging is
# commented out, so every bag trains on identical data and differs only by
# model randomness.
params = model_params.get_ctune163b() print df.columns if cv_flag: df_full_train, targets, df_test = data.split_data(df, logerror) df_train, df_test, train_targets, test_targets = data.split_cv( df_full_train, targets, cv_split_ratio) cv_preds = np.repeat(0, len(df_test)) for i in range(n_bags): model = CatBoostRegressor(**params) eval_set = [df_test.values, test_targets] model.fit(df_train.values, train_targets, eval_set=eval_set) predictions = model.predict(df_test) mae = tools.get_mae_loss(test_targets, predictions) mse = mean_squared_error(test_targets, predictions) cv_preds = model.predict(df_test.values) + cv_preds #prepare for the next iteration #df_bag, bag_targets = delete_some_outliers(df_train, train_targets) #print(i, df_bag.shape) #params['seed'] = i cv_preds = cv_preds / n_bags mae = tools.get_mae_loss(test_targets, cv_preds) mse = mean_squared_error(test_targets, cv_preds) msg = 'mae: {}, mse: {}, train_data ratio: {}, bags:{}, r:{}'.format( mae, mse, cv_split_ratio, n_bags, params['iterations']) print(msg), logger.debug(msg)
# Fragment (collapsed onto one line): OLS-style baseline + blending. Drops
# object-dtype columns plus target/id, regularises features, pins the test
# 'transactiondate' to a fixed value, fits an ElasticNetCV, prints its
# train MAE, then combines the XGBoost, LightGBM and baseline predictions
# with renormalised weights (the OLS share is factored out of the
# remaining weights). NOTE(review): `normalize=` was removed from modern
# scikit-learn estimators -- this requires an older sklearn; XGB_WEIGHT /
# BASELINE_WEIGHT / OLS_WEIGHT and xgb_pred / p_test are defined outside
# this view.
exc = [ train.columns[c] for c in range(len(train.columns)) if train.dtypes[c] == 'O' ] + ['logerror', 'parcelid'] col = [c for c in train.columns if c not in exc] train = reg_features(train[col]) test[ 'transactiondate'] = '2016-01-01' #should use the most common training date test = reg_features(test[col]) reg = ElasticNetCV(normalize=True, l1_ratio=0.8, max_iter=5000) reg.fit(train, y) print('fit...') print(tools.get_mae_loss(targets, reg.predict(train))) ######################## ######################## ## Combine and Save ## ######################## ######################## ##### COMBINE PREDICTIONS print("\nCombining XGBoost, LightGBM, and baseline predicitons ...") lgb_weight = (1 - XGB_WEIGHT - BASELINE_WEIGHT) / float((1 - OLS_WEIGHT)) xgb_weight0 = XGB_WEIGHT / (1 - OLS_WEIGHT) baseline_weight0 = BASELINE_WEIGHT / (1 - OLS_WEIGHT) pred0 = xgb_weight0 * xgb_pred + baseline_weight0 * BASELINE_PRED + lgb_weight * p_test
# Fragment (collapsed onto one line): interior of a stacking CV fold.
# Trains the Keras base model (column index 2 of the out-of-fold matrix)
# and an SVR (column index 0, fitted on every 10th row of the reduced
# feature set for speed), writing validation predictions into
# preds_train[val_ix, col] and accumulating test predictions in preds_test.
# NOTE(review): starts mid-function -- the fold loop, preds_train/preds_test
# allocation and the Catboost section that follows are outside this view;
# `nb_epoch=` is the legacy Keras 1.x kwarg.
assert len(x_train_small) == len(x_train) # keras keras_ix = 2 batch_size, epochs = 256, 15 model = model_params.get_keras(x_train.shape[1]) history = model.fit(x_train, y_train, nb_epoch=epochs, batch_size=batch_size, validation_data=(x_val, y_val), verbose=2) model.history = history preds_train[val_ix, keras_ix] = model.predict(x_val).squeeze() preds_test[:, keras_ix] += model.predict(df_test.values).squeeze() score = tools.get_mae_loss(y_val, preds_train[val_ix, keras_ix]) print('train rows:{}, val rows:{}, fold:{}, score:{}'.format( len(x_train), len(x_val), i, score)) # svr ! svr_ix = 0 model = SVR(cache_size=600, C=0.1) print x_train_small.shape model.fit(x_train_small[::10, :], y_train[::10]) preds_train[val_ix, svr_ix] = model.predict(x_val_small) preds_test[:, svr_ix] += model.predict(df_test_small.values) score = tools.get_mae_loss(y_val, preds_train[val_ix, svr_ix]) print('train rows:{}, val rows:{}, fold:{}, score:{}'.format( len(x_train), len(x_val), i, score)) # Catboost