Example #1
def ens_xgb(trn_X, trn_y, trn_rows, test_X, test_rows):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))

    xg_params = {
        "max_depth": 8,  # [4, 6, 8, 12]
        "min_child_weight": 6,  # [4, 6, 8]
        "learning_rate": 0.1,  # [0.05, 0.075, 0.1, 0.2]
        "colsample_bytree": 0.8,
        "colsample_bylevel": 0.8,
        "reg_alpha": 0,
        "num_estimators": 100
    }

    model = xgb.XGBRegressor(**xg_params)

    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X.iloc[trn_i]
        y_trn = trn_y.iloc[trn_i]
        x_test = trn_X.iloc[test_i]

        model.fit(x_trn, y_trn)

        pred_trn_X[test_i] = model.predict(x_test)
        pred_test_skf[i, :] = model.predict(test_X)

    pred_test_X[:] = pred_test_skf.mean(axis=0)

    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('RMSE - XGBoost Train: {}, lr: {}, dp: {}'.format(
        rmse_trn, xg_params["learning_rate"], xg_params["max_depth"]))

    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
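These snippets assume shared module-level setup that the source does not show: NFOLDS, SEED, a logger, and an rmse helper. A minimal sketch consistent with how they are used above (the values of NFOLDS and SEED are assumptions):

import logging

import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from tqdm import tqdm

NFOLDS = 5   # fold count is an assumption; any small integer works
SEED = 42    # shared random seed, also an assumption

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def rmse(y_true, y_pred):
    # root mean squared error, matching the metric logged above
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))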
Example #2
def ens_ridge(trn_X, trn_y, trn_rows, test_X, test_rows):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows,))
    pred_test_X = np.zeros((test_rows,))
    pred_test_skf = np.empty((NFOLDS, test_rows))
    ridge_params = {
        'alpha': 20.0,
        'fit_intercept': True,
        'normalize': False,
        'copy_X': True,
        'max_iter': None,
        'tol': 0.001,
        'solver': 'auto',
        'random_state': 42
        }
    model = Ridge(**ridge_params)

    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        x_test = trn_X[test_i]

        model.fit(x_trn, y_trn)

        pred_trn_X[test_i] = model.predict(x_test)
        pred_test_skf[i, :] = model.predict(test_X)

    pred_test_X[:] = pred_test_skf.mean(axis=0)

    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('rmse - Ridge Train: {}'.format(rmse_trn))

    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
Example #3
def run_xgb(train_X, train_y):
    xg_params = {
        "max_depth": [8],  # [4, 6, 8]
        "min_child_weight": [6],  # [4, 6, 8]
        "learning_rate": [0.1],  # [0.05, 0.075, 0.1, 0.2]
        "colsample_bytree": [0.8],
        "colsample_bylevel": [0.8],
        "reg_alpha": [0],
    }

    trn_X, val_X, trn_y, val_y = train_test_split(train_X,
                                                  train_y,
                                                  test_size=0.20,
                                                  random_state=0)
    xg_trn = xgb.DMatrix(trn_X, label=trn_y)
    xg_val = xgb.DMatrix(val_X, label=val_y)
    watchlist = [(xg_trn, 'train'), (xg_val, 'eval')]

    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))

    min_params = xgb_gs(xg_params, xg_trn, trn_y, xg_val, val_y, wl=watchlist)

    model = xgb.train(min_params,
                      xg_trn,
                      num_boost_round=5000,
                      evals=watchlist,
                      early_stopping_rounds=100,
                      verbose_eval=50)

    return model
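xgb_gs is not defined in these examples. A plausible sketch that mirrors lgb_gs from Example #10 and matches the call above (grid values are lists expanded with ParameterGrid; this is a reconstruction, not the original):

from sklearn.model_selection import ParameterGrid

def xgb_gs(set_params, dtrn, trn_y, dval, val_y, wl):
    min_score = float('inf')
    min_params = None
    for params in tqdm(list(ParameterGrid(set_params))):
        model = xgb.train(params, dtrn,
                          num_boost_round=1000,
                          evals=wl,
                          early_stopping_rounds=100,
                          verbose_eval=50)
        # score each candidate on the validation DMatrix
        pred = model.predict(dval, ntree_limit=model.best_ntree_limit)
        sc_rmse = rmse(val_y, pred)
        if sc_rmse < min_score:
            min_score, min_params = sc_rmse, params
        logger.info('current min rmse: {}'.format(min_score))
    logger.info('Top min params:\n {}'.format(min_params))
    return min_params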
Example #4
def ens_cat(trn_X, trn_y, trn_rows, test_X, test_rows):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))

    cat_params = {
        "iterations": 1000,
        "learning_rate": 0.08,
        "depth": 10,
        "eval_metric": 'RMSE',
        "metric_period": 50,
        "calc_feature_importance": True
    }

    model = CatBoostRegressor(**cat_params)

    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X.iloc[trn_i]
        y_trn = trn_y.iloc[trn_i]
        x_test = trn_X.iloc[test_i]

        # use_best_model requires an eval_set, so the plain fit is used here
        model.fit(x_trn, y_trn)

        pred_trn_X[test_i] = model.predict(x_test)
        pred_test_skf[i, :] = model.predict(test_X)

    pred_test_X[:] = pred_test_skf.mean(axis=0)

    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('RMSE - CatBoost Train: {}, lr: {}, dp: {}'.format(
        rmse_trn, cat_params["learning_rate"], cat_params["depth"]))

    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
Example #5
def run_lgb(train_X, train_y):
    lg_params = {
        "objective": ["regression"],
        "boosting": ["gbdt"],
        "metric": ["rmse"],
        "num_leaves": [128],  # [32, 48, 64, 128]
        "learning_rate": [0.07],  # [0.05, 0.07, 0.1, 0.2]
        "feature_fraction": [0.7],
        "bagging_freq": [5],
        "bagging_fraction": [0.7],
        "bagging_seed": [2018],
        "verbosity": [-1]
    }

    trn_X, val_X, trn_y, val_y = train_test_split(train_X,
                                                  train_y,
                                                  test_size=0.20,
                                                  shuffle=True,
                                                  random_state=0)
    lg_trn = lgb.Dataset(trn_X, label=trn_y)
    lg_val = lgb.Dataset(val_X, label=val_y)

    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))

    min_params = lgb_gs(lg_params, lg_trn, trn_y, lg_val, val_X, val_y)

    model = lgb.train(min_params,
                      lg_trn,
                      num_boost_round=5000,
                      valid_sets=[lg_val],
                      early_stopping_rounds=100,
                      verbose_eval=50)

    return model
Example #6
def ens_en(trn_X, trn_y, trn_rows, test_X, test_rows):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))

    en_params = {'alpha': 1.0}
    model = ElasticNet(**en_params)

    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        x_test = trn_X[test_i]
        # x_trn = trn_X.iloc[trn_i]
        # y_trn = trn_y.iloc[trn_i]
        # x_test = trn_X.iloc[test_i]

        model.fit(x_trn, y_trn)

        pred_trn_X[test_i] = model.predict(x_test)
        pred_test_skf[i, :] = model.predict(test_X)

    pred_test_X[:] = pred_test_skf.mean(axis=0)

    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('RMSE - ElasticNet Train: {}'.format(rmse_trn))

    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
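Each ens_* function returns its out-of-fold train predictions and fold-averaged test predictions as (n_rows, 1) columns. The stacking step itself is not shown in the source; a typical level-2 combination might look like this (names are placeholders; note that ens_xgb indexes with .iloc and so wants DataFrames, while ens_ridge/ens_en index plain arrays):

import numpy as np
from sklearn.linear_model import Ridge

# hypothetical level-2 stacking over the (n_rows, 1) base-model outputs
xgb_trn, xgb_test = ens_xgb(trn_df, trn_y, trn_rows, test_df, test_rows)
rdg_trn, rdg_test = ens_ridge(trn_arr, trn_y, trn_rows, test_arr, test_rows)
en_trn, en_test = ens_en(trn_arr, trn_y, trn_rows, test_arr, test_rows)

stack_trn = np.hstack([xgb_trn, rdg_trn, en_trn])
stack_test = np.hstack([xgb_test, rdg_test, en_test])

# a simple meta-model fitted on the out-of-fold predictions
meta = Ridge(alpha=1.0)
meta.fit(stack_trn, trn_y)
final_pred = meta.predict(stack_test)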
Example #7
def run_nn(trn_X, trn_y, val_X, val_y):
    lr = 0.1
    bz = int(trn_X.shape[0] / 10)
    ep = 50
    op = ks.optimizers.Adam(lr=lr)
    # op = ks.optimizers.SGD(lr=0.001, momentum=0.9)
    # with tf.Session(graph=tf.Graph(), config=config) as sess:
    early = ks.callbacks.EarlyStopping(monitor='loss', patience=0, mode='min')

    model_in = ks.Input(shape=(trn_X.shape[1], ), dtype='float32', sparse=True)
    out = ks.layers.Dense(192, activation='relu')(model_in)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(1)(out)
    model = ks.Model(model_in, out)
    model.compile(loss='mean_squared_error', optimizer=op)
    model.fit(x=trn_X,
              y=trn_y,
              validation_data=(val_X, val_y),
              batch_size=bz,
              epochs=ep,
              callbacks=[early],
              verbose=1)

    pred_trn = model.predict(trn_X, batch_size=bz)[:, 0]
    pred_val = model.predict(val_X, batch_size=bz)[:, 0]
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)

    logger.info('rmse - Train:{0:.4f} Valid:{1:.4f}'.format(
        rmse_trn, rmse_val))
    return model
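ks is presumably standalone Keras (import keras as ks). Because the input layer is declared with sparse=True, the network can be fed SciPy CSR matrices directly with a TensorFlow backend; a small usage sketch with synthetic data (shapes and density are arbitrary):

import numpy as np
import scipy.sparse as sp

rng = np.random.RandomState(0)
X = sp.random(1000, 300, density=0.05, format='csr',
              dtype='float32', random_state=rng)
y = rng.rand(1000).astype('float32')

# first 800 rows for training, the rest for validation
model = run_nn(X[:800], y[:800], X[800:], y[800:])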
Example #8
def ens_cat(trn_X, trn_y, trn_rows, test_X, test_rows):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))

    model = CatBoostRegressor(iterations=1000,
                              learning_rate=0.08,
                              depth=10,
                              eval_metric='RMSE',
                              metric_period=50,
                              calc_feature_importance=True)

    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        x_test = trn_X[test_i]

        model.fit(x_trn, y_trn)  # use_best_model needs an eval_set, so it is dropped

        pred_trn_X[test_i] = model.predict(x_test)
        pred_test_skf[i, :] = model.predict(test_X)

    pred_test_X[:] = pred_test_skf.mean(axis=0)

    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('rmse - CatBoost Feature: {}'.format(rmse_trn))

    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
Example #9
def run_cat(trn_X, trn_y, val_X, val_y):

    # cat_params = {
    #    }

    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))

    model = CatBoostRegressor(iterations=1000,
                              learning_rate=0.08,
                              depth=10,
                              eval_metric='RMSE',
                              metric_period=50,
                              calc_feature_importance=True)
    # Train Start
    model.fit(trn_X, trn_y,
              eval_set=(val_X, val_y),
              use_best_model=True)

    pred_trn = model.predict(trn_X)
    pred_val = model.predict(val_X)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))
    
    return model
Example #10
def lgb_gs(set_params, dtrn_X, trn_y, dval_X, val_X, val_y):
    min_score = float('inf')
    min_params = None
    for params in tqdm(list(ParameterGrid(set_params))):
        logger.debug('params:\n {}'.format(params))
        model = lgb.train(params, dtrn_X,
                          num_boost_round=1000,
                          valid_sets=[dval_X],
                          early_stopping_rounds=100,
                          verbose_eval=50)

        pred = model.predict(val_X,
                             num_iteration=model.best_iteration)
        sc_rmse = rmse(val_y, pred)

        if min_score > sc_rmse:
            min_score = sc_rmse
            min_params = params

        logger.debug('rmse: {}'.format(sc_rmse))
        logger.info('current min rmse: {}'.format(min_score))

    logger.info('')
    logger.info('Top min params:\n {}'.format(min_params))
    logger.info('Top min rmse: {}'.format(min_score))

    return min_params
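lgb_gs expects every hyperparameter as a list (as in Example #5) because ParameterGrid expands the cross-product of all listed values:

from sklearn.model_selection import ParameterGrid

grid = {'num_leaves': [32, 64], 'learning_rate': [0.05, 0.1]}
for params in ParameterGrid(grid):
    print(params)
# {'learning_rate': 0.05, 'num_leaves': 32}
# {'learning_rate': 0.05, 'num_leaves': 64}
# {'learning_rate': 0.1, 'num_leaves': 32}
# {'learning_rate': 0.1, 'num_leaves': 64}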
Example #11
def ens_nn(trn_X, trn_y, trn_rows, test_X, test_rows):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))

    lr = 0.1
    bz = int(trn_X.shape[0] / 10)
    ep = 50
    op = ks.optimizers.Adam(lr=lr)
    early = ks.callbacks.EarlyStopping(monitor='val_loss',
                                       patience=50,
                                       mode='min')

    model_in = ks.Input(shape=(trn_X.shape[1], ), dtype='float32', sparse=True)
    out = ks.layers.Dense(192, activation='relu')(model_in)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(1)(out)
    model = ks.Model(model_in, out)
    model.compile(loss='mean_squared_error', optimizer=op)
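    # note: the same compiled network is reused across folds, so weights carry over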

    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        x_trn, x_val, y_trn, y_val = train_test_split(x_trn,
                                                      y_trn,
                                                      test_size=0.10,
                                                      shuffle=False,
                                                      random_state=23)
        x_test = trn_X[test_i]

        model.fit(x=x_trn,
                  y=y_trn,
                  validation_data=(x_val, y_val),
                  batch_size=bz,
                  epochs=ep,
                  callbacks=[early],
                  verbose=1)
        pred_trn_X[test_i] = model.predict(x_test, batch_size=bz)[:, 0]
        pred_test_skf[i, :] = model.predict(test_X, batch_size=bz)[:, 0]

    pred_test_X[:] = pred_test_skf.mean(axis=0)

    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('rmse - NN Feature: {}'.format(rmse_trn))

    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
Example #12
def run_lgb(train_X, train_y):
    lg_params = {
        "objective": "regression",
        "boosting": "gbdt",
        "metric": "rmse",
        "num_leaves": 128,  # [32, 48, 64, 128]
        "learning_rate": 0.07,  # [0.05, 0.07, 0.1, 0.2]
        "feature_fraction": 0.7,
        "bagging_freq": 5,
        "bagging_fraction": 0.7,
        "bagging_seed": 2018,
        "verbosity": -1
    }

    trn_X, val_X, trn_y, val_y = train_test_split(train_X,
                                                  train_y,
                                                  test_size=0.20,
                                                  shuffle=True,
                                                  random_state=0)
    lg_trn = lgb.Dataset(trn_X, label=trn_y)
    lg_val = lgb.Dataset(val_X, label=val_y)

    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))

    # GridSearch
    # min_params = lgb_gs(lg_params, lg_trn, trn_y, lg_val, val_X, val_y)

    model = lgb.train(lg_params,
                      lg_trn,
                      num_boost_round=5000,
                      valid_sets=[lg_val],
                      early_stopping_rounds=100,
                      verbose_eval=50)

    pred_trn = model.predict(trn_X, num_iteration=model.best_iteration)
    pred_val = model.predict(val_X, num_iteration=model.best_iteration)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))

    # Feature Importance
    logger.debug('Feature Importances')
    feat_n = model.feature_name()
    feat_i = list(model.feature_importance())

    df_tmp1 = pd.DataFrame(feat_n, columns=['feat_n'])
    df_tmp2 = pd.DataFrame(feat_i, columns=['feat_i'])
    df_tmp = df_tmp1.join(df_tmp2, how='inner')
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()

    for i in range(len(df_tmp.index)):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(df_tmp.iloc[i, 0],
                                                     df_tmp.iloc[i, 1]))
    return model
Example #13
def run_lgb(trn_X, trn_y, val_X, val_y, tfvocab, cat_vars):
    lg_params = {
        "objective": "regression",
        "boosting": "gbdt",
        "metric": "rmse",
        # "max_depth": 15,         # [15]
        "num_leaves": 128,      # [256]
        "learning_rate": 0.07,   # [0.018]
        "feature_fraction": 0.7, # [0.5]
        "bagging_freq": 5,
        "bagging_fraction": 0.7, # [0.75]
        "bagging_seed": 2018,
        "verbosity": -1,
        # "verbose": 0
        }
    
    lg_trn = lgb.Dataset(trn_X, label=trn_y, feature_name=tfvocab, categorical_feature=cat_vars)
    lg_val = lgb.Dataset(val_X, label=val_y, feature_name=tfvocab, categorical_feature=cat_vars)
    
    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))

    # GridSearch
    # min_params = lgb_gs(lg_params, lg_trn, trn_y, lg_val, val_X, val_y)
   
    # Train Start
    model = lgb.train(lg_params, lg_trn,
                      num_boost_round=16000,
                      valid_sets=[lg_val],
                      early_stopping_rounds=200,
                      verbose_eval=100)
    
    pred_trn = model.predict(trn_X, num_iteration=model.best_iteration)
    pred_val = model.predict(val_X, num_iteration=model.best_iteration)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))

    # Feature Importance
    logger.debug('Feature Importances')
    feat_n = model.feature_name()
    feat_i = list(model.feature_importance())

    df_tmp1 = pd.DataFrame(feat_n, columns=['feat_n'])
    df_tmp2 = pd.DataFrame(feat_i, columns=['feat_i'])
    df_tmp = df_tmp1.join(df_tmp2, how='inner')
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()

    # for i in range(len(df_tmp.index)):
    for i in range(50):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(
                         df_tmp.iloc[i, 0], df_tmp.iloc[i, 1]))
    return model
Example #14
def run_xgb(train_X, train_y):
    xg_params = {
        "max_depth": 8,  # [4, 6, 8]
        "min_child_weight": 6,  # [4, 6, 8]
        "learning_rate": 0.1,  # [0.05, 0.075, 0.1, 0.2]
        "colsample_bytree": 0.8,
        "colsample_bylevel": 0.8,
        "reg_alpha": 0,
    }

    trn_X, val_X, trn_y, val_y = train_test_split(train_X,
                                                  train_y,
                                                  test_size=0.20,
                                                  random_state=0)
    xg_trn = xgb.DMatrix(trn_X, label=trn_y)
    xg_val = xgb.DMatrix(val_X, label=val_y)
    watchlist = [(xg_trn, 'train'), (xg_val, 'eval')]

    logger.info('split.train: {}'.format(trn_X.shape))
    logger.info('split.valid: {}'.format(val_X.shape))

    # GridSearch
    # min_params = xgb_gs(xg_params, xg_trn, trn_y, xg_val, val_y, wl=watchlist)

    model = xgb.train(xg_params,
                      xg_trn,
                      num_boost_round=5000,
                      evals=watchlist,
                      early_stopping_rounds=100,
                      verbose_eval=50)

    pred_trn = model.predict(xg_trn, ntree_limit=model.best_ntree_limit)
    pred_val = model.predict(xg_val, ntree_limit=model.best_ntree_limit)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))

    # Feature Importance
    create_feats_map(list(trn_X.columns[2:]))
    feat_i = model.get_fscore(fmap=XGBFMAP)

    df_tmp = pd.DataFrame(list(feat_i.items()), columns=['feat_n', 'feat_i'])
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()

    for i in range(len(df_tmp.index)):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(df_tmp.iloc[i, 0],
                                                     df_tmp.iloc[i, 1]))
    return model
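create_feats_map and XGBFMAP are not shown in the source. get_fscore(fmap=...) reads XGBoost's feature-map format, one index<TAB>name<TAB>type line per feature, so a plausible reconstruction is (the file path is an assumption):

XGBFMAP = 'xgb.fmap'  # assumed path

def create_feats_map(feats):
    # write an XGBoost feature map so get_fscore reports real column
    # names instead of f0, f1, ...; 'q' marks a quantitative feature
    with open(XGBFMAP, 'w') as f:
        for i, feat in enumerate(feats):
            f.write('{0}\t{1}\tq\n'.format(i, feat))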
Example #15
def ens_nn(trn_X, trn_y, trn_rows, test_X, test_rows):
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))

    lr = 0.1
    bz = int(trn_X.shape[0] / 10)
    ep = 50
    op = ks.optimizers.Adam(lr=lr)

    model_in = ks.Input(shape=(trn_X.shape[1], ), dtype='float32', sparse=True)
    out = ks.layers.Dense(192, activation='relu')(model_in)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(1)(out)
    model = ks.Model(model_in, out)
    model.compile(loss='mean_squared_error', optimizer=op)

    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        x_test = trn_X[test_i]

        for j in range(ep):
            model.fit(x=x_trn, y=y_trn, batch_size=bz, epochs=1, verbose=0)
            pred_trn = model.predict(x_trn, batch_size=bz)[:, 0]
            rmse_trn = rmse(y_trn, pred_trn)
            logger.debug('epochs {0}: rmse - Train:{1:.6f}'.format(
                j + 1, rmse_trn))

        pred_trn_X[test_i] = model.predict(x_test, batch_size=bz)[:, 0]
        pred_test_skf[i, :] = model.predict(test_X, batch_size=bz)[:, 0]

    pred_test_X[:] = pred_test_skf.mean(axis=0)

    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('rmse - NN Train: {}'.format(rmse_trn))

    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
Example #16
def run_nn(trn_X, trn_y, val_X, val_y):
    lr = 0.1
    bz = int(trn_X.shape[0] / 10)
    ep = 50
    op = ks.optimizers.Adam(lr=lr)
    # op = ks.optimizers.SGD(lr=0.001, momentum=0.9)
    # with tf.Session(graph=tf.Graph(), config=config) as sess:
    model_in = ks.Input(shape=(trn_X.shape[1],), dtype='float32', sparse=True)
    out = ks.layers.Dense(192, activation='relu')(model_in)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.Dense(1)(out)
    model = ks.Model(model_in, out)
    model.compile(loss='mean_squared_error', optimizer=op)
    for i in range(ep):
        model.fit(x=trn_X, y=trn_y, batch_size=bz, epochs=1, verbose=1)
        pred_trn = model.predict(trn_X, batch_size=bz)[:, 0]
        pred_val = model.predict(val_X, batch_size=bz)[:, 0]
        rmse_trn = rmse(trn_y, pred_trn)
        rmse_val = rmse(val_y, pred_val)
        logger.info('epochs {0}: rmse - Train:{1:.4f} Valid:{2:.4f}'.format(i+1, rmse_trn, rmse_val))

    return model
Example #17
def run_ridge(trn_X, trn_y, val_X, val_y):
    ridge_params = {
        'alpha': 20.0,
        'fit_intercept': True,
        'normalize': False,
        'copy_X': True,
        'max_iter': None,
        'tol': 0.001,
        'solver': 'auto',
        'random_state': 42
        }

    model = Ridge(**ridge_params)
    model.fit(trn_X, trn_y)
    
    pred_trn = model.predict(trn_X)
    pred_val = model.predict(val_X)
    rmse_trn = rmse(trn_y, pred_trn)
    rmse_val = rmse(val_y, pred_val)
    logger.info('rmse - Train: {}'.format(rmse_trn))
    logger.info('rmse - valid: {}'.format(rmse_val))
    
    return model
Example #18
def ens_nn(trn_X, trn_y, trn_rows, test_X, test_rows):

    # trn_X['param_feat'], test_X['param_feat'], tknzr_pf = tknzr_fit('param_feat', trn_X, test_X)
    # trn_X['description'], test_X['description'], tknzr_pf = tknzr_desc_fit('description', trn_X, test_X)

    trn_X = trn_X.iloc[:, [1, 2, 3, 4, 8, 11, 5]]
    test_X = test_X.iloc[:, [1, 2, 3, 4, 8, 11, 5]]

    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    pred_trn_X = np.zeros((trn_rows, ))
    pred_test_X = np.zeros((test_rows, ))
    pred_test_skf = np.empty((NFOLDS, test_rows))

    lr = 0.005
    bz = 100000
    ep = 500
    op = Adam(lr=lr)
    # early = EarlyStopping(monitor='val_loss', patience=500, mode='min')

    logger.info('NN Train Shape: {}'.format(trn_X.shape))
    logger.info('NN Test Shape : {}'.format(test_X.shape))
    '''# model_in = ks.Input(shape=(trn_X.shape[1],), dtype='float32', sparse=True)
    # model_in = Input(shape=(trn_X.shape[1],), dtype='float32', sparse=False)
    # out = ks.layers.Dense(192, activation='relu')(model_in)
    # out = ks.layers.Dense(64, activation='relu')(out)
    # out = ks.layers.Dense(64, activation='relu')(out)
    out = Dense(16, activation='relu')(model_in)
    out = Dense(8, activation='relu')(out)
    out = Dense(8, activation='relu')(out)
    out = Dense(1, activation='sigmoid')(out)
    model = Model(model_in, out)'''

    model = make_model_nn(trn_X, test_X)
    model.compile(loss='mean_squared_error', optimizer=op)
    '''tr_reg, ts_reg = ((trn_X['region']), (test_X['region']))
    tr_city, ts_city = (np.array(trn_X['city']), np.array(test_X['city']))
    tr_pcn, ts_pcn = (np.array(trn_X['parent_category_name']), np.array(test_X['parent_category_name']))
    tr_cn, ts_cn = (np.array(trn_X['category_name']), np.array(test_X['category_name']))
    tr_ut, ts_ut = (np.array(trn_X['user_type']), np.array(test_X['user_type']))
    tr_pf, ts_pf, tknzr_pf = tknzr_fit('param_feat', trn_X, test_X)'''
    trn_X['param_feat'], test_X['param_feat'], tknzr_pf = tknzr_fit(
        'param_feat', trn_X, test_X)

    # trn_X = np.array([tr_reg, tr_city, tr_pcn, tr_cn, tr_ut, tr_pf[:,0]])
    # test_X = np.array([ts_reg, ts_city, ts_pcn, tr_cn, tr_ut, tr_pf[:,0]])

    trn_X = np.array(trn_X)
    test_X = np.array(test_X)

    for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X)))):
        # for i, (trn_i, test_i) in tqdm(list(enumerate(kf.split(trn_X.index)))):
        x_trn = trn_X[trn_i]
        y_trn = trn_y[trn_i]
        # x_trn = trn_X.iloc[trn_i]
        # y_trn = trn_y.iloc[trn_i]
        x_trn, x_val, y_trn, y_val = train_test_split(x_trn,
                                                      y_trn,
                                                      test_size=0.10,
                                                      shuffle=False,
                                                      random_state=23)
        x_test = trn_X[test_i]
        # x_test = trn_X.iloc[test_i]
        '''x_trn = np.array(x_trn)
        y_trn = np.array(y_trn)
        x_val = np.array(x_val)
        y_val = np.array(y_val)
        x_test = np.array(x_test)
        test_X = np.array(test_X)'''

        model.fit(
            x=[
                x_trn[:, 0], x_trn[:, 1], x_trn[:, 2], x_trn[:, 3],
                x_trn[:, 4], x_trn[:, 5]
            ],
            y=y_trn,
            # model.fit(x=[x_trn[:,[0,1,2,3,4,5]]], y=y_trn,
            validation_data=([
                x_val[:, 0], x_val[:, 1], x_val[:, 2], x_val[:, 3],
                x_val[:, 4], x_val[:, 5]
            ], y_val),
            batch_size=bz,
            epochs=ep,
            verbose=1)
        # batch_size=bz, epochs=ep, callbacks=[early], verbose=1)
        pred_trn_X[test_i] = model.predict([
            x_test[:, 0], x_test[:, 1], x_test[:, 2], x_test[:, 3],
            x_test[:, 4], x_test[:, 5]
        ],
                                           batch_size=bz)[:, 0]
        pred_test_skf[i, :] = model.predict([
            test_X[:, 0], test_X[:, 1], test_X[:, 2], test_X[:, 3],
            test_X[:, 4], test_X[:, 5]
        ],
                                            batch_size=bz)[:, 0]

    pred_test_X[:] = pred_test_skf.mean(axis=0)

    rmse_trn = rmse(trn_y, pred_trn_X)
    logger.info('RMSE - NN Train: {}, lr: {}, bz: {}, ep: {}'.format(
        rmse_trn, lr, bz, ep))

    return pred_trn_X.reshape(-1, 1), pred_test_X.reshape(-1, 1)
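make_model_nn and tknzr_fit are not defined here. Judging by the six-array fit call and the commented-out layer widths above, the model takes one label-encoded categorical column per input; a hedged sketch (embedding sizes and vocabularies are guesses, not the original):

from keras.layers import Concatenate, Dense, Embedding, Flatten, Input
from keras.models import Model

def make_model_nn(trn_X, test_X):
    inputs, flats = [], []
    for col in trn_X.columns[:6]:  # the six columns fed to fit() above
        # vocabulary size from the label-encoded train/test columns
        n_cat = int(max(trn_X[col].max(), test_X[col].max())) + 1
        inp = Input(shape=(1,), dtype='int32', name='in_{}'.format(col))
        emb = Embedding(n_cat, min(16, n_cat))(inp)
        inputs.append(inp)
        flats.append(Flatten()(emb))
    out = Concatenate()(flats)
    out = Dense(16, activation='relu')(out)  # widths taken from the
    out = Dense(8, activation='relu')(out)   # commented-out block above
    out = Dense(8, activation='relu')(out)
    out = Dense(1, activation='sigmoid')(out)
    return Model(inputs, out)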
Example #19
    # create_feats_map(list(trn_X.columns[2:]))
    feat_i = model.get_fscore(fmap=XGBFMAP)
    
    df_tmp = pd.DataFrame(list(feat_i.items()), columns=['feat_n', 'feat_i'])
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()
    
    # for i in range(len(df_tmp.index)):
    for i in range(15):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(
                            df_tmp.iloc[i, 0], df_tmp.iloc[i, 1]))
    return model

if __name__ == '__main__':
    logger.info('Start')

    # temp1_df = load_train_data(nrows=ROW)
    # temp2_df = pd.read_csv('../input/city_population_wiki_v3.csv')
    # train_df = pd.merge(temp1_df, temp2_df, on='city', how='left')
    # del temp1_df, temp2_df
    train_df = load_train_data(nrows=ROW)
    logger.info('Train Data load end {}'.format(train_df.shape))

    test_df = load_test_data(nrows=ROW)
    logger.info('test load end {}'.format(test_df.shape))

    # test_df = load_period_train_data(nrows=ROW)
    # logger.info('period train load end {}'.format(test_df.shape))

    # pr_test_df = load_period_test_data(nrows=ROW)
Example #20
    feat_i = model.get_fscore(fmap=XGBFMAP)

    df_tmp = pd.DataFrame(list(feat_i.items()), columns=['feat_n', 'feat_i'])
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()

    # for i in range(len(df_tmp.index)):
    for i in range(15):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(df_tmp.iloc[i, 0],
                                                     df_tmp.iloc[i, 1]))
    return model


if __name__ == '__main__':
    logger.info('Start')

    # temp1_df = load_train_data(nrows=ROW)
    # temp2_df = pd.read_csv('../input/city_population_wiki_v3.csv')
    # train_df = pd.merge(temp1_df, temp2_df, on='city', how='left')
    # del temp1_df, temp2_df
    train_df = load_train_data(nrows=ROW)
    logger.info('Train Data load end {}'.format(train_df.shape))

    test_df = load_test_data(nrows=ROW)
    logger.info('Test load end {}'.format(test_df.shape))

    # test_df = load_period_train_data(nrows=ROW)
    # logger.info('period train load end {}'.format(test_df.shape))

    # pr_test_df = load_period_test_data(nrows=ROW)
Example #21
    create_feats_map(list(trn_X.columns[2:]))
    feat_i = model.get_fscore(fmap=XGBFMAP)

    df_tmp = pd.DataFrame(list(feat_i.items()), columns=['feat_n', 'feat_i'])
    df_tmp = df_tmp.sort_values(by=['feat_i'], ascending=False)
    df_tmp = df_tmp.reset_index(drop=True)
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()

    for i in range(len(df_tmp.index)):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(df_tmp.iloc[i, 0],
                                                     df_tmp.iloc[i, 1]))
    return model


if __name__ == '__main__':
    logger.info('Start')

    train_df = load_train_data(nrows=ROW)
    logger.info('train load end {}'.format(train_df.shape))

    test_df = load_test_data(nrows=ROW)
    logger.info('test load end {}'.format(test_df.shape))

    # Labels
    train_y = train_df["deal_probability"].values
    test_id = test_df["item_id"].values

    # Feature Weekday
    train_df["activation_weekday"] = train_df["activation_date"].dt.weekday
    test_df["activation_weekday"] = test_df["activation_date"].dt.weekday