from gini_normalized import normalized_gini import numpy as np from pylab import * hold = pd.read_csv('../data/hold_new.csv') preds = pd.read_csv('preds_on_hold/xgbt.csv') def binar(x, a): if 53 < x < a: return 53 elif a <= x < 62: return 62 else: return x x_list = range(54, 62) y_list = [] for a in x_list: y_list += [ normalized_gini(hold['Hazard'], map(lambda x: binar(x, a), preds['Hazard'])) ] print x_list print y_list plot(x_list, y_list) savefig('cuts.png')
labels = y_train[::-1] xgtrain = xgb.DMatrix(X_train[offset:, :], label=labels[offset:]) xgval = xgb.DMatrix(X_train[:offset, :], label=labels[:offset]) watchlist = [(xgtrain, "train"), (xgval, "val")] model = xgb.train( params_new, xgtrain, num_rounds, watchlist, early_stopping_rounds=120 ) preds2 = model.predict(xgtest, ntree_limit=model.best_iteration) preds = 0.5 * preds1 + 0.5 * preds2 tp = normalized_gini(y_test, preds) score += [tp] print tp sc = math.ceil(100000 * np.mean(score)) / 100000 sc_std = math.ceil(100000 * np.std(score)) / 100000 result += [ ( sc, sc_std, min_child_weight, eta, colsample_bytree, max_depth, subsample, gamma,
For example 53-62 is not in the train => looks like an idea to put values in this region to 53 or to 62, depending on the threashold ''' import pandas as pd from gini_normalized import normalized_gini import numpy as np from pylab import * hold = pd.read_csv('../data/hold_new.csv') preds = pd.read_csv('preds_on_hold/xgbt.csv') def binar(x, a): if 53 < x < a: return 53 elif a <= x < 62: return 62 else: return x x_list = range(54, 62) y_list = [] for a in x_list: y_list += [normalized_gini(hold['Hazard'], map(lambda x: binar(x, a), preds['Hazard']))] print x_list print y_list plot(x_list, y_list) savefig('cuts.png')
# Second model of a two-model average: fit xgboost with the labels taken in
# reversed order, predict `xgtest`, average 50/50 with `preds1` and record the
# normalized Gini of the raw and of the clipped blend.
# NOTE(review): `offset`, `preds1`, `xgtest`, `params_new`, the score lists and
# any enclosing loop are defined outside this fragment.
# NOTE(review): only the labels are reversed here; presumably X_train was
# reversed just before this fragment - confirm.
labels = y_train[::-1]

# Rows from `offset` onwards are used for fitting, the first `offset` rows
# form the validation set watched during training.
xgtrain = xgb.DMatrix(X_train[offset:, :], label=labels[offset:])
xgval = xgb.DMatrix(X_train[:offset, :], label=labels[:offset])

watchlist = [(xgtrain, "train"), (xgval, "val")]
model = xgb.train(params_new, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)
preds2 = model.predict(xgtest, ntree_limit=model.best_iteration)
# preds = model.predict(xgval, ntree_limit=model.best_iteration)

# Equal-weight blend of the two models' predictions.
preds = 0.5 * preds1 + 0.5 * preds2

tp = normalized_gini(y_test, preds)
# tp_up = normalized_gini(y_test, map(lambda x: min(69, x), preds))
# tp_down = normalized_gini(y_test, map(lambda x: max(1, x), preds))
# Same score after clipping every prediction into [1, 69].
tp_both = normalized_gini(y_test, map(lambda x: min(69, max(1, x)), preds))
# tp_both_round = normalized_gini(y_test, map(lambda x: round(min(69, max(1, x))), preds))
# tp_both_int = normalized_gini(y_test, map(lambda x: int(min(69, max(1, x))), preds))
# tp = normalized_gini(y_train[:offset], preds)

score += [tp]
# score_truncated_up += [tp_up]
# score_truncated_down += [tp_down]
score_truncated_both += [tp_both]
# score_truncated_both_int += [tp_both_int]
# score_truncated_both_round += [tp_both_round]
print tp
# Second model of a two-model average: fit xgboost with the labels taken in
# reversed order, predict `xgtest`, average 50/50 with `preds1` and record the
# normalized Gini of the raw and of the clipped blend.
# NOTE(review): `offset`, `preds1`, `xgtest`, `params_new`, the score lists and
# any enclosing loop are defined outside this fragment.
# NOTE(review): only the labels are reversed here; presumably X_train was
# reversed just before this fragment - confirm.
labels = y_train[::-1]

# Rows from `offset` onwards are used for fitting, the first `offset` rows
# form the validation set watched during training.
xgtrain = xgb.DMatrix(X_train[offset:, :], label=labels[offset:])
xgval = xgb.DMatrix(X_train[:offset, :], label=labels[:offset])

watchlist = [(xgtrain, 'train'), (xgval, 'val')]
model = xgb.train(params_new, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)
preds2 = model.predict(xgtest, ntree_limit=model.best_iteration)
# preds = model.predict(xgval, ntree_limit=model.best_iteration)

# Equal-weight blend of the two models' predictions.
preds = 0.5 * preds1 + 0.5 * preds2

tp = normalized_gini(y_test, preds)
# tp_up = normalized_gini(y_test, map(lambda x: min(69, x), preds))
# tp_down = normalized_gini(y_test, map(lambda x: max(1, x), preds))
# Same score after clipping every prediction into [1, 69].
tp_both = normalized_gini(y_test, map(lambda x: min(69, max(1, x)), preds))
# tp_both_round = normalized_gini(y_test, map(lambda x: round(min(69, max(1, x))), preds))
# tp_both_int = normalized_gini(y_test, map(lambda x: int(min(69, max(1, x))), preds))
# tp = normalized_gini(y_train[:offset], preds)

score += [tp]
# score_truncated_up += [tp_up]
# score_truncated_down += [tp_down]
score_truncated_both += [tp_both]
# score_truncated_both_int += [tp_both_int]
# score_truncated_both_round += [tp_both_round]
print tp
# Fit a random forest with the current hyper-parameters, score it on the test
# split and record the running mean/std of the scores with those parameters.
# NOTE(review): the original indentation is not visible; these statements
# presumably sit at different loop depths - confirm before relying on this
# flat layout.
clf = RandomForestRegressor(
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    n_jobs=-1,
    random_state=random_state,
)
clf.fit(a_train, b_train)
preds = clf.predict(a_test)
score += [normalized_gini(b_test, preds)]

# One summary row per parameter combination: score statistics first, then the
# parameters that produced them.
result += [
    (
        np.mean(score),
        np.std(score),
        n_estimators,
        min_samples_split,
        min_samples_leaf,
        max_depth,
        max_features,
    )
]

# Tuples sort by their first element, i.e. ascending mean score.
result.sort()
print result
# Tail of a random-forest evaluation (score one split, record the parameters)
# followed by the start of the `ind == 3` branch.
# NOTE(review): the matching `if`, the enclosing loops and the original
# indentation are outside this fragment; the layout below is relative only.
b_train = y.values[train_index]
b_test = y.values[test_index]

clf = RandomForestRegressor(n_estimators=n_estimators,
                            min_samples_split=min_samples_split,
                            max_features=max_features,
                            max_depth=max_depth,
                            min_samples_leaf=min_samples_leaf,
                            n_jobs=-1,
                            random_state=random_state)
clf.fit(a_train, b_train)
preds = clf.predict(a_test)
score += [normalized_gini(b_test, preds)]

# Summary row: score statistics first, then the parameters that produced them.
result += [(np.mean(score), np.std(score), n_estimators, min_samples_split,
            min_samples_leaf, max_depth, max_features)]

# Tuples sort by their first element, i.e. ascending mean score.
result.sort()
print result
elif ind == 3:
    # Fixed hyper-parameters, fitted on the full X / y.
    clf = RandomForestRegressor(n_estimators=100,
                                min_samples_split=2,
                                max_features=0.4,
                                max_depth=7,
                                min_samples_leaf=1,
                                n_jobs=-1,
                                random_state=random_state)
    clf.fit(X, y)
# Fit the second xgboost model, average it with `preds1`, then score blends of
# that average with the random-forest predictions for several weights `alpha`.
# NOTE(review): `preds1`, `preds_RF`, `labels`, `offset`, the `score_0x` lists
# and any enclosing loop are defined outside this fragment.
xgtrain = xgb.DMatrix(X_train[offset:, :], label=labels[offset:])
xgval = xgb.DMatrix(X_train[:offset, :], label=labels[:offset])

watchlist = [(xgtrain, 'train'), (xgval, 'val')]
model = xgb.train(params_new, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)
preds2 = model.predict(xgtest, ntree_limit=model.best_iteration)

# `preds2` is exponentiated before averaging - presumably this model was
# trained on a log-transformed target; confirm.
preds_xgbt = 0.5 * preds1 + 0.5 * np.exp(preds2)

# alpha = 0: pure xgboost blend (the variable itself is not used here).
alpha = 0
prediction = preds_xgbt
tp = normalized_gini(y_test, prediction)
score_00 += [normalized_gini(y_test, prediction)]

# For each following weight: alpha on the random forest, (1 - alpha) on xgboost.
alpha = 0.1
prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
score_01 += [normalized_gini(y_test, prediction)]

alpha = 0.2
prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
score_02 += [normalized_gini(y_test, prediction)]

alpha = 0.3
prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
score_03 += [normalized_gini(y_test, prediction)]

alpha = 0.4
prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
b_train = y.values[train_index] b_test = y.values[test_index] clf = RandomForestRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split, max_features=max_features, max_depth=max_depth, min_samples_leaf=min_samples_leaf, n_jobs=-1, random_state=random_state) clf.fit(a_train, b_train) preds = clf.predict(a_test) score += [normalized_gini(map(lambda x: math.exp(x) - 1, b_test), map(lambda x: math.exp(x) - 1), preds)] result += [(np.mean(score), np.std(score), n_estimators, min_samples_split, min_samples_leaf, max_depth, max_features)] result.sort() print result elif ind == 2: clf = RandomForestRegressor(n_estimators=100, min_samples_split=2, max_features=0.4, max_depth=7, min_samples_leaf=1, n_jobs=-1, random_state=random_state)
X_train = X_train[::-1, :] labels = y_train[::-1] xgtrain = xgb.DMatrix(X_train[offset:, :], label=labels[offset:]) xgval = xgb.DMatrix(X_train[:offset, :], label=labels[:offset]) watchlist = [(xgtrain, 'train'), (xgval, 'val')] model = xgb.train(params_new, xgtrain, num_rounds, watchlist, early_stopping_rounds=120) preds2 = model.predict(xgtest, ntree_limit=model.best_iteration) preds = 0.5 * preds1 + 0.5 * preds2 tp = normalized_gini(y_test, preds) score += [tp] print tp sc = math.ceil(100000 * np.mean(score)) / 100000 sc_std = math.ceil(100000 * np.std(score)) / 100000 result += [(sc, sc_std, min_child_weight, eta, colsample_bytree, max_depth, subsample, gamma, n_iter, params['objective'],
# Evaluate `net1` on one train/test split: scale the features, reshape them to
# (-1, 1, 10, 10), standardise the target, fit, predict and score.
# NOTE(review): the loop header providing train_index/test_index and the
# initialisation of `score` are outside this fragment; the final print
# presumably runs after that loop - confirm.
a_train = X_train.values[train_index]
a_test = X_train.values[test_index]
b_train = y_train.values[train_index]
b_test = y_train.values[test_index]

# The scaler is fitted on the training split only and reused on the test split.
X = scaler.fit_transform(a_train).astype(np.float32)
# Each row becomes a 1 x 10 x 10 block, so exactly 100 features per row are required.
X_reshaped = X.reshape(-1, 1, 10, 10)
test = scaler.transform(a_test).astype(np.float32)
test_reshaped = test.reshape(-1, 1, 10, 10)

# Column vector of targets, standardised to zero mean / unit std for fitting.
y = b_train[:]
y.shape = (y.shape[0], 1)
y_mean = y.mean()
y_std = y.std()
target = (y - y_mean) / y_std

net1.fit(X_reshaped, target.astype(np.float32))


def helper(x):
    # Undo the target standardisation for a single prediction.
    return (x * y_std) + y_mean


result = net1.predict(test_reshaped)
result = np.reshape(result, len(b_test))  # flatten to one value per test row
result = map(helper, result)
score += [normalized_gini(b_test, result)]
print np.mean(score), np.std(score)
score = [] for train_index, test_index in rs: a_train = X_train.values[train_index] a_test = X_train.values[test_index] b_train = y_train.values[train_index] b_test = y_train.values[test_index] X = scaler.fit_transform(a_train).astype(np.float32) test = scaler.transform(a_test).astype(np.float32) y = b_train[:] y.shape = (y.shape[0], 1) y_mean = y.mean() y_std = y.std() target = (y - y_mean) / y_std net1.fit(X, target.astype(np.float32)) def helper(x): return (x * y_std) + y_mean result = net1.predict(test) result = np.reshape(result, len(b_test)) result = map(helper, result) score += [normalized_gini(b_test, result)] print np.mean(score), np.std(score)
# Train xgboost with early stopping, score the test predictions after mapping
# targets and predictions through exp(x) - 1, and record the running score
# statistics together with the hyper-parameters.
# NOTE(review): the original indentation is not visible; these statements
# presumably sit at different loop depths - confirm before relying on this
# flat layout.
watchlist = [(xgtrain, 'train'), (xgval, 'val')]
model = xgb.train(params_new, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)
preds = model.predict(
    xgtest, ntree_limit=model.best_iteration)
# preds = model.predict(xgval, ntree_limit=model.best_iteration)

# Both sides are transformed with exp(x) - 1 before scoring - presumably the
# model is trained on log(1 + y); confirm.
tp = normalized_gini(
    map(lambda x: math.exp(x) - 1, y_test),
    map(lambda x: math.exp(x) - 1, preds))
# tp = normalized_gini(y_train[:offset], preds)
score += [tp]
print tp

# Summary row: score statistics first, then the parameters that produced them.
result += [
    (np.mean(score), np.std(score), min_child_weight, eta, colsample_bytree,
     max_depth, subsample, gamma, n_iter)
]

# Tuples sort by their first element, i.e. ascending mean score.
result.sort()
print result
# 'max_depth': 9 } score = [] for train_index, test_index in rs: Xc_train = X_cat.values[train_index] Xc_test = X_cat.values[test_index] y_train = y.values[train_index] y_test = y.values[test_index] clf_cat = Ridge(normalize=True, alpha=0.1) clf_cat.fit(Xc_train, y_train) prediction_cat_test = clf_cat.predict(Xc_test) prediction_cat_train = clf_cat.predict(Xc_train) Xn_train = X_num.values[train_index] Xn_test = X_num.values[test_index] Xn_train = pd.DataFrame(Xn_train) Xn_test = pd.DataFrame(Xn_test) Xn_train['cat'] = prediction_cat_train Xn_test['cat'] = prediction_cat_test xgtrain = xgb.DMatrix(Xn_train, label=y_train) xgval = xgb.DMatrix(Xn_test, label=y_test) watchlist = [(xgtrain, 'train'), (xgval, 'val')] model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=200) preds = model.predict(xgval, ntree_limit=model.best_iteration) score += [normalized_gini(y_test, preds)] print np.mean(score), np.std(score)
b_test = y.values[test_index] clf = RandomForestRegressor( n_estimators=n_estimators, min_samples_split=min_samples_split, max_features=max_features, max_depth=max_depth, min_samples_leaf=min_samples_leaf, n_jobs=-1, random_state=random_state) clf.fit(a_train, b_train) preds = clf.predict(a_test) score += [normalized_gini(b_test, preds)] result += [(np.mean(score), np.std(score), n_estimators, min_samples_split, min_samples_leaf, max_depth, max_features)] result.sort() print result elif ind == 3: clf = RandomForestRegressor(n_estimators=100, min_samples_split=2, max_features=0.4, max_depth=7, min_samples_leaf=1, n_jobs=-1,
prediction_cat_3 = clf_cat.predict(X3_cat) X2_num = X2[features_num] X2_num['cat'] = prediction_cat_2 X3_num = X3[features_num] X3_num['cat'] = prediction_cat_3 xgtrain = xgb.DMatrix(X2_num, label=y2) xgval = xgb.DMatrix(X3_num, label=y3) watchlist = [(xgtrain, 'train'), (xgval, 'val')] model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=200) preds = model.predict(xgval, ntree_limit=model.best_iteration) print normalized_gini(y3, preds) # rs = cross_validation.StratifiedKFold(y, n_folds=n_iter, shuffle=True, random_state=random_state) # # num_rounds = 10000 # params = { # 'objective': 'reg:linear', # # 'eta': 0.005, # # 'min_child_weight': 6, # # 'subsample': 0.7, # # 'colsabsample_bytree': 0.7, # # 'scal_pos_weight': 1, # 'silent': 1, # # 'max_depth': 9 # } # # score = []
# Fit the second xgboost model, average it with `preds1`, then score blends of
# that average with the random-forest predictions for several weights `alpha`.
# NOTE(review): `preds1`, `preds_RF`, `xgtrain`, `xgval`, the `score_0x` lists
# and any enclosing loop are defined outside this fragment.
watchlist = [(xgtrain, 'train'), (xgval, 'val')]
model = xgb.train(params_new, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)
preds2 = model.predict(xgtest, ntree_limit=model.best_iteration)

# `preds2` is exponentiated before averaging - presumably this model was
# trained on a log-transformed target; confirm.
preds_xgbt = 0.5 * preds1 + 0.5 * np.exp(preds2)

# alpha = 0: pure xgboost blend (the variable itself is not used here).
alpha = 0
prediction = preds_xgbt
tp = normalized_gini(y_test, prediction)
score_00 += [normalized_gini(y_test, prediction)]

# For each following weight: alpha on the random forest, (1 - alpha) on xgboost.
alpha = 0.1
prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
score_01 += [normalized_gini(y_test, prediction)]

alpha = 0.2
prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
score_02 += [normalized_gini(y_test, prediction)]

alpha = 0.3
prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
score_03 += [normalized_gini(y_test, prediction)]

alpha = 0.4
prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
# 'subsample': 0.7, # 'colsabsample_bytree': 0.7, # 'scal_pos_weight': 1, 'silent': 1, # 'max_depth': 9 } score = [] for train_index, test_index in rs: Xc_train = X_cat.values[train_index] Xc_test = X_cat.values[test_index] y_train = y.values[train_index] y_test = y.values[test_index] clf_cat = Ridge(normalize=True, alpha=0.1) clf_cat.fit(Xc_train, y_train) prediction_cat_test = clf_cat.predict(Xc_test) prediction_cat_train = clf_cat.predict(Xc_train) Xn_train = X_num.values[train_index] Xn_test = X_num.values[test_index] Xn_train = pd.DataFrame(Xn_train) Xn_test = pd.DataFrame(Xn_test) Xn_train['cat'] = prediction_cat_train Xn_test['cat'] = prediction_cat_test xgtrain = xgb.DMatrix(Xn_train, label=y_train) xgval = xgb.DMatrix(Xn_test, label=y_test) watchlist = [(xgtrain, 'train'), (xgval, 'val')] model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=200) preds = model.predict(xgval, ntree_limit=model.best_iteration) score += [normalized_gini(y_test, preds)] print np.mean(score), np.std(score)
X2_num['cat'] = prediction_cat_2 X3_num = X3[features_num] X3_num['cat'] = prediction_cat_3 xgtrain = xgb.DMatrix(X2_num, label=y2) xgval = xgb.DMatrix(X3_num, label=y3) watchlist = [(xgtrain, 'train'), (xgval, 'val')] model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=200) preds = model.predict(xgval, ntree_limit=model.best_iteration) print normalized_gini(y3, preds) # rs = cross_validation.StratifiedKFold(y, n_folds=n_iter, shuffle=True, random_state=random_state) # # num_rounds = 10000 # params = { # 'objective': 'reg:linear', # # 'eta': 0.005, # # 'min_child_weight': 6, # # 'subsample': 0.7, # # 'colsabsample_bytree': 0.7, # # 'scal_pos_weight': 1, # 'silent': 1, # # 'max_depth': 9 # } # # score = []