def ens_xgb(trn_X, trn_y, trn_rows, test_X, test_rows):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))
    xg_params = {
        "max_depth": 8,           # [4, 6, 8, 12]
        "min_child_weight": 6,    # [4, 6, 8]
        "learning_rate": 0.1,     # [0.05, 0.075, 0.1, 0.2]
        "colsample_bytree": 0.8,
        "colsample_bylevel": 0.8,
        "reg_alpha": 0,
        "n_estimators": 100       # XGBRegressor expects n_estimators, not num_estimators
    }
    model = xgb.XGBRegressor(**xg_params)
    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X.iloc[trn_i]
        y_trn = trn_y.iloc[trn_i]
        x_test = trn_X.iloc[test_i]
        model.fit(x_trn, y_trn)
        # Out-of-fold predictions for train; per-fold predictions for test
        pred_trn_X[test_i] = model.predict(x_test)
        pred_test_skf[i, :] = model.predict(test_X)
    pred_test_X[:] = pred_test_skf.mean(axis=0)
    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('RMSE - XGBoost Train: {}, lr: {}, dp: {}'.format(
        rmse_trn, xg_params["learning_rate"], xg_params["max_depth"]))
    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
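
# The rmse() helper used throughout this file is not defined in this section.
# A minimal sketch consistent with how it is called, assuming the usual
# root-mean-squared-error definition (the project's own implementation may
# live in a shared utils module):
from sklearn.metrics import mean_squared_error  # assumed import for the sketch


def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))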
def ens_ridge(trn_X, trn_y, trn_rows, test_X, test_rows):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows,))
    pred_test_X = np.zeros((test_rows,))
    pred_test_skf = np.empty((NFOLDS, test_rows))
    ridge_params = {
        'alpha': 20.0,
        'fit_intercept': True,
        'normalize': False,
        'copy_X': True,
        'max_iter': None,
        'tol': 0.001,
        'solver': 'auto',
        'random_state': 42
    }
    model = Ridge(**ridge_params)
    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        x_test = trn_X[test_i]
        model.fit(x_trn, y_trn)
        pred_trn_X[test_i] = model.predict(x_test)
        pred_test_skf[i, :] = model.predict(test_X)
    pred_test_X[:] = pred_test_skf.mean(axis=0)
    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('rmse - Ridge Train: {}'.format(rmse_trn))
    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
def run_xgb(train_X, train_y):
    xg_params = {
        "max_depth": [8],           # [4, 6, 8]
        "min_child_weight": [6],    # [4, 6, 8]
        "learning_rate": [0.1],     # [0.05, 0.075, 0.1, 0.2]
        "colsample_bytree": [0.8],
        "colsample_bylevel": [0.8],
        "reg_alpha": [0],
    }
    trn_X, val_X, trn_y, val_y = train_test_split(train_X, train_y,
                                                  test_size=0.20,
                                                  random_state=0)
    xg_trn = xgb.DMatrix(trn_X, label=trn_y)
    xg_val = xgb.DMatrix(val_X, label=val_y)
    watchlist = [(xg_trn, 'train'), (xg_val, 'eval')]
    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))
    min_params = xgb_gs(xg_params, xg_trn, trn_y, xg_val, val_y, wl=watchlist)
    model = xgb.train(min_params, xg_trn, num_boost_round=5000,
                      evals=watchlist, early_stopping_rounds=100,
                      verbose_eval=50)
    return model
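
# xgb_gs() is called above but not defined in this section. A minimal sketch of
# a grid search analogous to lgb_gs() further down, assuming the same
# ParameterGrid pattern and the call signature used in run_xgb(); the exact
# original may differ:
def xgb_gs(set_params, dtrn_X, trn_y, dval_X, val_y, wl=None):
    min_score = float('inf')
    min_params = None
    for params in tqdm(list(ParameterGrid(set_params))):
        model = xgb.train(params, dtrn_X, num_boost_round=1000,
                          evals=wl, early_stopping_rounds=100,
                          verbose_eval=50)
        pred = model.predict(dval_X, ntree_limit=model.best_ntree_limit)
        sc_rmse = rmse(val_y, pred)
        if sc_rmse < min_score:
            min_score = sc_rmse
            min_params = params
    logger.info('Top min params:\n {}'.format(min_params))
    logger.info('Top min rmse: {}'.format(min_score))
    return min_params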
def ens_cat(trn_X, trn_y, trn_rows, test_X, test_rows):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))
    cat_params = {
        "iterations": 1000,
        "learning_rate": 0.08,
        "depth": 10,
        "eval_metric": 'RMSE',
        "metric_period": 50,
        "calc_feature_importance": True
    }
    model = CatBoostRegressor(**cat_params)
    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X.iloc[trn_i]
        y_trn = trn_y.iloc[trn_i]
        x_test = trn_X.iloc[test_i]
        # use_best_model=True requires an eval_set in CatBoost, so it is
        # dropped here; the model trains for the full iteration budget
        model.fit(x_trn, y_trn)
        pred_trn_X[test_i] = model.predict(x_test)
        pred_test_skf[i, :] = model.predict(test_X)
    pred_test_X[:] = pred_test_skf.mean(axis=0)
    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('RMSE - CatBoost Train: {}, lr: {}, dp: {}'.format(
        rmse_trn, cat_params["learning_rate"], cat_params["depth"]))
    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
def run_lgb(train_X, train_y):
    lg_params = {
        "objective": ["regression"],
        "boosting": ["gbdt"],
        "metric": ["rmse"],
        "num_leaves": [128],        # [32, 48, 64, 128]
        "learning_rate": [0.07],    # [0.05, 0.07, 0.1, 0.2]
        "feature_fraction": [0.7],
        "bagging_freq": [5],
        "bagging_fraction": [0.7],
        "bagging_seed": [2018],
        "verbosity": [-1]
    }
    trn_X, val_X, trn_y, val_y = train_test_split(train_X, train_y,
                                                  test_size=0.20,
                                                  shuffle=True,
                                                  random_state=0)
    lg_trn = lgb.Dataset(trn_X, label=trn_y)
    lg_val = lgb.Dataset(val_X, label=val_y)
    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))
    min_params = lgb_gs(lg_params, lg_trn, trn_y, lg_val, val_X, val_y)
    model = lgb.train(min_params, lg_trn, num_boost_round=5000,
                      valid_sets=[lg_val], early_stopping_rounds=100,
                      verbose_eval=50)
    return model
def ens_en(trn_X, trn_y, trn_rows, test_X, test_rows):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))
    en_params = {'alpha': 1.0}
    model = ElasticNet(**en_params)
    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        # Array input; for DataFrame input use .iloc[trn_i] / .iloc[test_i]
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        x_test = trn_X[test_i]
        model.fit(x_trn, y_trn)
        pred_trn_X[test_i] = model.predict(x_test)
        pred_test_skf[i, :] = model.predict(test_X)
    pred_test_X[:] = pred_test_skf.mean(axis=0)
    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('RMSE - ElasticNet Train: {}'.format(rmse_trn))
    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
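
# The ens_*() functions all return (out-of-fold train predictions,
# fold-averaged test predictions) as column vectors, ready for stacking.
# A minimal sketch of a second level on top of them; stack_ens() and the
# Ridge meta-learner are illustrative assumptions, not part of the original
# pipeline:
def stack_ens(train_X, train_y, test_X, ntrain, ntest):
    xgb_trn, xgb_test = ens_xgb(train_X, train_y, ntrain, test_X, ntest)
    en_trn, en_test = ens_en(train_X, train_y, ntrain, test_X, ntest)
    # First-level predictions become second-level features
    stack_trn = np.hstack([xgb_trn, en_trn])
    stack_test = np.hstack([xgb_test, en_test])
    meta = Ridge(alpha=1.0)  # assumed meta-learner
    meta.fit(stack_trn, train_y)
    return meta.predict(stack_test)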
def run_nn(trn_X, trn_y, val_X, val_y):
    lr = 0.1
    bz = int(trn_X.shape[0] / 10)
    ep = 50
    op = ks.optimizers.Adam(lr=lr)
    # op = ks.optimizers.SGD(lr=0.001, momentum=0.9)
    # with tf.Session(graph=tf.Graph(), config=config) as sess:
    early = ks.callbacks.EarlyStopping(monitor='loss', patience=0, mode='min')
    model_in = ks.Input(shape=(trn_X.shape[1], ), dtype='float32', sparse=True)
    out = ks.layers.Dense(192, activation='relu')(model_in)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(1)(out)
    model = ks.Model(model_in, out)
    model.compile(loss='mean_squared_error', optimizer=op)
    model.fit(x=trn_X, y=trn_y, validation_data=(val_X, val_y),
              batch_size=bz, epochs=ep, callbacks=[early], verbose=1)
    pred_trn = model.predict(trn_X, batch_size=bz)[:, 0]
    pred_val = model.predict(val_X, batch_size=bz)[:, 0]
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train:{0:.4f} Valid:{1:.4f}'.format(
        rmse_trn, rmse_val))
    return model
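
# Usage note for run_nn(): trn_X/val_X are expected as scipy sparse matrices
# to match the sparse=True Input above; Keras accepts CSR matrices directly in
# fit() and predict() for such inputs, e.g. (illustrative conversion):
#   from scipy.sparse import csr_matrix
#   trn_X = csr_matrix(trn_X)
# Dense numpy arrays also work if sparse=True is dropped from ks.Input().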
def ens_cat(trn_X, trn_y, trn_rows, test_X, test_rows):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))
    model = CatBoostRegressor(iterations=1000,
                              learning_rate=0.08,
                              depth=10,
                              eval_metric='RMSE',
                              metric_period=50,
                              calc_feature_importance=True)
    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        x_test = trn_X[test_i]
        # use_best_model=True requires an eval_set in CatBoost, so it is
        # dropped here; the model trains for the full iteration budget
        model.fit(x_trn, y_trn)
        pred_trn_X[test_i] = model.predict(x_test)
        pred_test_skf[i, :] = model.predict(test_X)
    pred_test_X[:] = pred_test_skf.mean(axis=0)
    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('rmse - CatBoost Feature: {}'.format(rmse_trn))
    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
def run_cat(trn_X, trn_y, val_X, val_y):
    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))
    model = CatBoostRegressor(iterations=1000,
                              learning_rate=0.08,
                              depth=10,
                              eval_metric='RMSE',
                              metric_period=50,
                              calc_feature_importance=True)
    # Train Start
    model.fit(trn_X, trn_y, eval_set=(val_X, val_y), use_best_model=True)
    pred_trn = model.predict(trn_X)
    pred_val = model.predict(val_X)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))
    return model
def lgb_gs(set_params, dtrn_X, trn_y, dval_X, val_X, val_y):
    # float('inf') and a None default guard against an unset best score/params
    min_score = float('inf')
    min_params = None
    for params in tqdm(list(ParameterGrid(set_params))):
        logger.debug('params:\n {}'.format(params))
        model = lgb.train(params, dtrn_X, num_boost_round=1000,
                          valid_sets=[dval_X], early_stopping_rounds=100,
                          verbose_eval=50)
        pred = model.predict(val_X, num_iteration=model.best_iteration)
        sc_rmse = rmse(val_y, pred)
        if sc_rmse < min_score:
            min_score = sc_rmse
            min_params = params
        logger.debug('rmse: {}'.format(sc_rmse))
        logger.info('current min rmse: {}'.format(min_score))
    logger.info('')
    logger.info('Top min params:\n {}'.format(min_params))
    logger.info('Top min rmse: {}'.format(min_score))
    return min_params
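
# Usage note for lgb_gs(): sklearn's ParameterGrid expands a dict of lists into
# every combination, which is why the grid-search variants of run_lgb()/run_xgb()
# wrap each value in a list. For example:
#   list(ParameterGrid({"num_leaves": [64, 128], "learning_rate": [0.05, 0.1]}))
#   # -> four dicts, one per (num_leaves, learning_rate) combination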
def ens_nn(trn_X, trn_y, trn_rows, test_X, test_rows):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))
    lr = 0.1
    bz = int(trn_X.shape[0] / 10)
    ep = 50
    op = ks.optimizers.Adam(lr=lr)
    # patience=50 with ep=50 means this callback can never fire; lower the
    # patience (or raise ep) for early stopping to have any effect
    early = ks.callbacks.EarlyStopping(monitor='val_loss', patience=50,
                                       mode='min')
    model_in = ks.Input(shape=(trn_X.shape[1], ), dtype='float32', sparse=True)
    out = ks.layers.Dense(192, activation='relu')(model_in)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(1)(out)
    model = ks.Model(model_in, out)
    model.compile(loss='mean_squared_error', optimizer=op)
    # NOTE: the model is built once, so weights carry over between folds and
    # the out-of-fold predictions are not strictly independent
    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        x_trn, x_val, y_trn, y_val = train_test_split(x_trn, y_trn,
                                                      test_size=0.10,
                                                      shuffle=False,
                                                      random_state=23)
        x_test = trn_X[test_i]
        model.fit(x=x_trn, y=y_trn, validation_data=(x_val, y_val),
                  batch_size=bz, epochs=ep, callbacks=[early], verbose=1)
        pred_trn_X[test_i] = model.predict(x_test, batch_size=bz)[:, 0]
        pred_test_skf[i, :] = model.predict(test_X, batch_size=bz)[:, 0]
    pred_test_X[:] = pred_test_skf.mean(axis=0)
    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('rmse - NN Feature: {}'.format(rmse_trn))
    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
def run_lgb(train_X, train_y):
    lg_params = {
        "objective": "regression",
        "boosting": "gbdt",
        "metric": "rmse",
        "num_leaves": 128,       # [32, 48, 64, 128]
        "learning_rate": 0.07,   # [0.05, 0.07, 0.1, 0.2]
        "feature_fraction": 0.7,
        "bagging_freq": 5,
        "bagging_fraction": 0.7,
        "bagging_seed": 2018,
        "verbosity": -1
    }
    trn_X, val_X, trn_y, val_y = train_test_split(train_X, train_y,
                                                  test_size=0.20,
                                                  shuffle=True,
                                                  random_state=0)
    lg_trn = lgb.Dataset(trn_X, label=trn_y)
    lg_val = lgb.Dataset(val_X, label=val_y)
    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))
    # GridSearch
    # min_params = lgb_gs(lg_params, lg_trn, trn_y, lg_val, val_X, val_y)
    model = lgb.train(lg_params, lg_trn, num_boost_round=5000,
                      valid_sets=[lg_val], early_stopping_rounds=100,
                      verbose_eval=50)
    pred_trn = model.predict(trn_X, num_iteration=model.best_iteration)
    pred_val = model.predict(val_X, num_iteration=model.best_iteration)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))
    # Feature Importance
    logger.debug('Feature Importances')
    feat_n = model.feature_name()
    feat_i = list(model.feature_importance())
    df_tmp1 = pd.DataFrame(feat_n, columns=['feat_n'])
    df_tmp2 = pd.DataFrame(feat_i, columns=['feat_i'])
    df_tmp = df_tmp1.join(df_tmp2, how='inner')
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()
    for i in range(len(df_tmp.index)):
        # .iloc replaces the removed DataFrame.ix indexer
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(df_tmp.iloc[i, 0],
                                                     df_tmp.iloc[i, 1]))
    return model
def run_lgb(trn_X, trn_y, val_X, val_y, tfvocab, cat_vars):
    lg_params = {
        "objective": "regression",
        "boosting": "gbdt",
        "metric": "rmse",
        # "max_depth": 15,        # [15]
        "num_leaves": 128,        # [256]
        "learning_rate": 0.07,    # [0.018]
        "feature_fraction": 0.7,  # [0.5]
        "bagging_freq": 5,
        "bagging_fraction": 0.7,  # [0.75]
        "bagging_seed": 2018,
        "verbosity": -1,
        # "verbose": 0
    }
    lg_trn = lgb.Dataset(trn_X, label=trn_y, feature_name=tfvocab,
                         categorical_feature=cat_vars)
    lg_val = lgb.Dataset(val_X, label=val_y, feature_name=tfvocab,
                         categorical_feature=cat_vars)
    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))
    # GridSearch
    # min_params = lgb_gs(lg_params, lg_trn, trn_y, lg_val, val_X, val_y)
    # Train Start
    model = lgb.train(lg_params, lg_trn, num_boost_round=16000,
                      valid_sets=[lg_val], early_stopping_rounds=200,
                      verbose_eval=100)
    pred_trn = model.predict(trn_X, num_iteration=model.best_iteration)
    pred_val = model.predict(val_X, num_iteration=model.best_iteration)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))
    # Feature Importance
    logger.debug('Feature Importances')
    feat_n = model.feature_name()
    feat_i = list(model.feature_importance())
    df_tmp1 = pd.DataFrame(feat_n, columns=['feat_n'])
    df_tmp2 = pd.DataFrame(feat_i, columns=['feat_i'])
    df_tmp = df_tmp1.join(df_tmp2, how='inner')
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()
    # Log only the top features; the min() guards against fewer than 50 columns
    # for i in range(len(df_tmp.index)):
    for i in range(min(50, len(df_tmp.index))):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(
            df_tmp.iloc[i, 0], df_tmp.iloc[i, 1]))
    return model
def run_xgb(train_X, train_y):
    xg_params = {
        "max_depth": 8,           # [4, 6, 8]
        "min_child_weight": 6,    # [4, 6, 8]
        "learning_rate": 0.1,     # [0.05, 0.075, 0.1, 0.2]
        "colsample_bytree": 0.8,
        "colsample_bylevel": 0.8,
        "reg_alpha": 0,
    }
    trn_X, val_X, trn_y, val_y = train_test_split(train_X, train_y,
                                                  test_size=0.20,
                                                  random_state=0)
    xg_trn = xgb.DMatrix(trn_X, label=trn_y)
    xg_val = xgb.DMatrix(val_X, label=val_y)
    watchlist = [(xg_trn, 'train'), (xg_val, 'eval')]
    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))
    # GridSearch
    # min_params = xgb_gs(xg_params, xg_trn, trn_y, xg_val, val_y, wl=watchlist)
    model = xgb.train(xg_params, xg_trn, num_boost_round=5000,
                      evals=watchlist, early_stopping_rounds=100,
                      verbose_eval=50)
    pred_trn = model.predict(xg_trn, ntree_limit=model.best_ntree_limit)
    pred_val = model.predict(xg_val, ntree_limit=model.best_ntree_limit)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))
    # Feature Importance
    create_feats_map(list(trn_X.columns[2:]))
    feat_i = model.get_fscore(fmap=XGBFMAP)
    df_tmp = pd.DataFrame(list(feat_i.items()), columns=['feat_n', 'feat_i'])
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()
    for i in range(len(df_tmp.index)):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(df_tmp.iloc[i, 0],
                                                     df_tmp.iloc[i, 1]))
    return model
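
# create_feats_map() is called above but not defined in this section. A minimal
# sketch, assuming it writes the feature-map file at XGBFMAP in the
# "<index>\t<name>\t<type>" layout that get_fscore(fmap=...) expects:
def create_feats_map(features):
    with open(XGBFMAP, 'w') as f:
        for i, feat in enumerate(features):
            f.write('{0}\t{1}\tq\n'.format(i, feat))  # 'q' marks a quantitative feature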
def ens_nn(trn_X, trn_y, trn_rows, test_X, test_rows):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))
    lr = 0.1
    bz = int(trn_X.shape[0] / 10)
    ep = 50
    op = ks.optimizers.Adam(lr=lr)
    model_in = ks.Input(shape=(trn_X.shape[1], ), dtype='float32', sparse=True)
    out = ks.layers.Dense(192, activation='relu')(model_in)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(1)(out)
    model = ks.Model(model_in, out)
    model.compile(loss='mean_squared_error', optimizer=op)
    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        x_test = trn_X[test_i]
        # Train one epoch at a time so the train RMSE can be logged per epoch
        for j in range(ep):
            model.fit(x=x_trn, y=y_trn, batch_size=bz, epochs=1, verbose=0)
            pred_trn = model.predict(x_trn, batch_size=bz)[:, 0]
            rmse_trn = rmse(y_trn, pred_trn)
            logger.debug('epochs {0}: rmse - Train:{1:.6f}'.format(
                j + 1, rmse_trn))
        pred_trn_X[test_i] = model.predict(x_test, batch_size=bz)[:, 0]
        pred_test_skf[i, :] = model.predict(test_X, batch_size=bz)[:, 0]
    pred_test_X[:] = pred_test_skf.mean(axis=0)
    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('rmse - NN Train: {}'.format(rmse_trn))
    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
def run_nn(trn_X, trn_y, val_X, val_y):
    lr = 0.1
    bz = int(trn_X.shape[0] / 10)
    ep = 50
    op = ks.optimizers.Adam(lr=lr)
    # op = ks.optimizers.SGD(lr=0.001, momentum=0.9)
    model_in = ks.Input(shape=(trn_X.shape[1],), dtype='float32', sparse=True)
    out = ks.layers.Dense(192, activation='relu')(model_in)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(1)(out)
    model = ks.Model(model_in, out)
    model.compile(loss='mean_squared_error', optimizer=op)
    # Train one epoch at a time so train/valid RMSE can be logged per epoch
    for i in range(ep):
        model.fit(x=trn_X, y=trn_y, batch_size=bz, epochs=1, verbose=1)
        pred_trn = model.predict(trn_X, batch_size=bz)[:, 0]
        pred_val = model.predict(val_X, batch_size=bz)[:, 0]
        rmse_trn = rmse(trn_y, pred_trn)
        rmse_val = rmse(val_y, pred_val)
        logger.info('epochs {0}: rmse - Train:{1:.4f} Valid:{2:.4f}'.format(
            i + 1, rmse_trn, rmse_val))
    return model
def run_ridge(trn_X, trn_y, val_X, val_y):
    ridge_params = {
        'alpha': 20.0,
        'fit_intercept': True,
        'normalize': False,
        'copy_X': True,
        'max_iter': None,
        'tol': 0.001,
        'solver': 'auto',
        'random_state': 42
    }
    model = Ridge(**ridge_params)
    model.fit(trn_X, trn_y)
    pred_trn = model.predict(trn_X)
    pred_val = model.predict(val_X)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))
    return model
def ens_nn(trn_X, trn_y, trn_rows, test_X, test_rows):
    # trn_X['description'], test_X['description'], tknzr_pf = tknzr_desc_fit('description', trn_X, test_X)
    # Select the feature columns used by the NN; only the first six positions
    # of the sliced frame are fed to the model's inputs below
    trn_X = trn_X.iloc[:, [1, 2, 3, 4, 8, 11, 5]]
    test_X = test_X.iloc[:, [1, 2, 3, 4, 8, 11, 5]]
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))
    lr = 0.005
    bz = 100000
    ep = 500
    op = Adam(lr=lr)
    # early = EarlyStopping(monitor='val_loss', patience=500, mode='min')
    logger.info('NN Train Shape: {}'.format(trn_X.shape))
    logger.info('NN Test Shape : {}'.format(test_X.shape))
    model = make_model_nn(trn_X, test_X)
    model.compile(loss='mean_squared_error', optimizer=op)
    # Re-encode param_feat with a fitted tokenizer before converting to arrays
    trn_X['param_feat'], test_X['param_feat'], tknzr_pf = tknzr_fit(
        'param_feat', trn_X, test_X)
    trn_X = np.array(trn_X)
    test_X = np.array(test_X)
    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        x_trn, x_val, y_trn, y_val = train_test_split(x_trn, y_trn,
                                                      test_size=0.10,
                                                      shuffle=False,
                                                      random_state=23)
        x_test = trn_X[test_i]
        # The model takes each categorical column as a separate input
        model.fit(x=[x_trn[:, c] for c in range(6)], y=y_trn,
                  validation_data=([x_val[:, c] for c in range(6)], y_val),
                  batch_size=bz, epochs=ep, verbose=1)
        # batch_size=bz, epochs=ep, callbacks=[early], verbose=1)
        pred_trn_X[test_i] = model.predict(
            [x_test[:, c] for c in range(6)], batch_size=bz)[:, 0]
        pred_test_skf[i, :] = model.predict(
            [test_X[:, c] for c in range(6)], batch_size=bz)[:, 0]
    pred_test_X[:] = pred_test_skf.mean(axis=0)
    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('RMSE - NN Train: {}, lr: {}, bz: {}, ep: {}'.format(
        rmse_trn, lr, bz, ep))
    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
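
# make_model_nn() and tknzr_fit() are referenced above but not defined in this
# section. A minimal sketch of a model matching the six single-column inputs
# fed in ens_nn(); embedding sizes, vocabulary handling, and the extra imports
# are assumptions, not the original architecture:
from keras.layers import Embedding, Flatten, concatenate


def make_model_nn(trn_X, test_X):
    ins, flats = [], []
    for col in range(6):
        # Per-column vocabulary size, covering both train and test codes;
        # assumes the columns are already integer-encoded at this point
        vocab = int(max(trn_X.iloc[:, col].max(),
                        test_X.iloc[:, col].max())) + 1
        inp = Input(shape=(1, ), dtype='int32')
        ins.append(inp)
        flats.append(Flatten()(Embedding(vocab, 8)(inp)))
    out = concatenate(flats)
    out = Dense(16, activation='relu')(out)
    out = Dense(8, activation='relu')(out)
    out = Dense(1, activation='sigmoid')(out)  # deal_probability lies in [0, 1]
    return Model(ins, out)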
    # create_feats_map(list(trn_X.columns[2:]))
    feat_i = model.get_fscore(fmap=XGBFMAP)
    df_tmp = pd.DataFrame(list(feat_i.items()), columns=['feat_n', 'feat_i'])
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()
    # for i in range(len(df_tmp.index)):
    for i in range(min(15, len(df_tmp.index))):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(
            df_tmp.iloc[i, 0], df_tmp.iloc[i, 1]))
    return model


if __name__ == '__main__':
    logger.info('Start')
    # temp1_df = load_train_data(nrows=ROW)
    # temp2_df = pd.read_csv('../input/city_population_wiki_v3.csv')
    # train_df = pd.merge(temp1_df, temp2_df, on='city', how='left')
    # del temp1_df, temp2_df
    train_df = load_train_data(nrows=ROW)
    logger.info('Train Data load end {}'.format(train_df.shape))
    test_df = load_test_data(nrows=ROW)
    logger.info('test load end {}'.format(test_df.shape))
    # test_df = load_period_train_data(nrows=ROW)
    # logger.info('period train load end {}'.format(test_df.shape))
    # pr_test_df = load_period_test_data(nrows=ROW)
    feat_i = model.get_fscore(fmap=XGBFMAP)
    df_tmp = pd.DataFrame(list(feat_i.items()), columns=['feat_n', 'feat_i'])
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()
    # for i in range(len(df_tmp.index)):
    for i in range(min(15, len(df_tmp.index))):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(df_tmp.iloc[i, 0],
                                                     df_tmp.iloc[i, 1]))
    return model


if __name__ == '__main__':
    logger.info('Start')
    # temp1_df = load_train_data(nrows=ROW)
    # temp2_df = pd.read_csv('../input/city_population_wiki_v3.csv')
    # train_df = pd.merge(temp1_df, temp2_df, on='city', how='left')
    # del temp1_df, temp2_df
    train_df = load_train_data(nrows=ROW)
    logger.info('Train Data load end {}'.format(train_df.shape))
    test_df = load_test_data(nrows=ROW)
    logger.info('Test load end {}'.format(test_df.shape))
    # test_df = load_period_train_data(nrows=ROW)
    # logger.info('period train load end {}'.format(test_df.shape))
    # pr_test_df = load_period_test_data(nrows=ROW)
    create_feats_map(list(trn_X.columns[2:]))
    feat_i = model.get_fscore(fmap=XGBFMAP)
    df_tmp = pd.DataFrame(list(feat_i.items()), columns=['feat_n', 'feat_i'])
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()
    for i in range(len(df_tmp.index)):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(df_tmp.iloc[i, 0],
                                                     df_tmp.iloc[i, 1]))
    return model


if __name__ == '__main__':
    logger.info('Start')
    train_df = load_train_data(nrows=ROW)
    logger.info('train load end {}'.format(train_df.shape))
    test_df = load_test_data(nrows=ROW)
    logger.info('test load end {}'.format(test_df.shape))
    # Labels
    train_y = train_df["deal_probability"].values
    test_id = test_df["item_id"].values
    # Feature: activation weekday
    train_df["activation_weekday"] = train_df["activation_date"].dt.weekday
    test_df["activation_weekday"] = test_df["activation_date"].dt.weekday
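    # Note: .dt.weekday above requires activation_date to be datetime64, so the
    # load_*_data() helpers are assumed to parse it on read, e.g. with
    # something like:
    #   pd.read_csv(path, nrows=nrows, parse_dates=['activation_date'])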