def add_group_feature(x_train, x_valid, x_test, x_sub):
    # Each fe helper takes the training frame plus the list of other frames
    # (valid, test, submission) and returns both with the new feature added.
    others = [x_valid, x_test, x_sub]
    x_train, others = fe.goal_min_group(x_train, others)
    x_train, others = fe.goal_max_group(x_train, others)
    x_train, others = fe.duration_group(x_train, others)
    x_train, others = fe.text_to_word_count(x_train, others)
    return x_train, others[0], others[1], others[2]
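# The fe grouping helpers above are not defined in this snippet. A minimal
# sketch of one such helper, assuming a numeric 'goal' column exists in every
# frame: quantile bin edges are fit on the training frame only and the same
# edges are reused for the other frames. Names here are illustrative, not the
# actual feature_engineering implementation.
import pandas as pd

def goal_group_sketch(x_train, others, col='goal', n_bins=10):
    # Fit bin edges on the training data only to avoid leakage.
    binned, edges = pd.qcut(x_train[col], q=n_bins, retbins=True,
                            labels=False, duplicates='drop')
    x_train = x_train.copy()
    x_train[col + '_group'] = binned
    new_others = []
    for df in others:
        df = df.copy()
        # Apply the train-derived edges to the valid/test/submission frames.
        df[col + '_group'] = pd.cut(df[col], bins=edges, labels=False,
                                    include_lowest=True)
        new_others.append(df)
    return x_train, new_others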
def add_group_feature(x_train, x_valid, x_test, x_sub, y_train):
    # Variant: the grouping features are disabled below and replaced with
    # single- and multi-column target encodings of the categorical columns.
    others = [x_valid, x_test, x_sub]
    # x_train, others = fe.goal_min_group(x_train, others)
    # x_train, others = fe.goal_max_group(x_train, others)
    # x_train, others = fe.duration_group(x_train, others)

    x_train, others = fe.target_encoding(x_train, y_train, others,
                                         'country_encoding')
    x_train, others = fe.target_encoding(x_train, y_train, others,
                                         'category1_encoding')
    x_train, others = fe.target_encoding(x_train, y_train, others,
                                         'category2_encoding')

    x_train, others = fe.multi_target_encoding(
        x_train, y_train, others,
        ['country_encoding', 'category1_encoding', 'category2_encoding'])

    x_train, others = fe.text_to_word_count(x_train, others)

    return x_train, others[0], others[1], others[2]
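# fe.target_encoding is not shown here; a minimal sketch of smoothed mean
# target encoding with the same (x_train, y_train, others, col) signature
# follows. This is an assumption about its behavior, and a production version
# would usually encode the training frame out-of-fold to limit target leakage.
def target_encoding_sketch(x_train, y_train, others, col, smoothing=10):
    global_mean = y_train.mean()
    stats = y_train.groupby(x_train[col]).agg(['mean', 'count'])
    # Shrink category means toward the global mean for rare categories.
    encoded = ((stats['mean'] * stats['count'] + global_mean * smoothing)
               / (stats['count'] + smoothing))
    x_train = x_train.copy()
    x_train[col] = x_train[col].map(encoded)
    new_others = []
    for df in others:
        df = df.copy()
        # Unseen categories fall back to the global mean.
        df[col] = df[col].map(encoded).fillna(global_mean)
        new_others.append(df)
    return x_train, new_others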
Example #3
    target = 'state'


    cv = 10
    train_dfs, valid_dfs, test_dfs = training.cv(train_df, cv)
    scores = list()
    params = None
    tune = True
    name = 'lgb_baseline'
    for cv_idx in range(cv):
        # prepare
        x_train, y_train = train_dfs[cv_idx][features], train_dfs[cv_idx][target]
        x_valid, y_valid = valid_dfs[cv_idx][features], valid_dfs[cv_idx][target]
        x_test, y_test = test_dfs[cv_idx][features], test_dfs[cv_idx][target]
        
        x_train, others = fe.text_to_word_count(x_train, [x_valid, x_test, test_df[features]])
        x_valid, x_test, x_sub = others[0], others[1], others[2]
        
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=[3, 4, 5], free_raw_data=False)
        lgb_valid = lgb.Dataset(x_valid, y_valid, categorical_feature=[3, 4, 5], free_raw_data=False)

        # train
        if tune:
            params = training.tuning(lgb_train, lgb_valid, 100)
            pd.to_pickle(params, 'params/{0}_cv{1}.pkl'.format(name, cv_idx))
        model = training.train(lgb_train, lgb_valid, params)
        score = training.evaluation(model, x_test, y_test)
        scores.append(score)
        model.save_model('model/{0}_cv{1}.txt'.format(name, cv_idx), num_iteration=model.best_iteration)

        # predict
        pred = model.predict(x_sub)
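# Example #3 cuts off inside the CV loop, so how the per-fold submission
# predictions are combined is not shown. One common pattern (an assumption,
# not necessarily the author's code) is to collect each fold's pred, average
# them, and report the spread of the per-fold scores alongside:
import numpy as np

def aggregate_cv(fold_preds, scores):
    # Mean ensemble of the per-fold submission predictions.
    sub_pred = np.mean(fold_preds, axis=0)
    print('cv score: {0:.4f} +/- {1:.4f}'.format(np.mean(scores), np.std(scores)))
    return sub_pred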
Example #4
    target = 'state'


    cv = 10
    train_dfs, valid_dfs, test_dfs = training.cv(train_df, cv)
    scores = list()
    params = None
    tune = True
    name = 'word_count_plus_h1_word_count'
    for cv_idx in range(cv):
        # prepare
        x_train, y_train = train_dfs[cv_idx][features], train_dfs[cv_idx][target]
        x_valid, y_valid = valid_dfs[cv_idx][features], valid_dfs[cv_idx][target]
        x_test, y_test = test_dfs[cv_idx][features], test_dfs[cv_idx][target]
        
        x_train, others = fe.text_to_word_count(x_train, [x_valid, x_test, test_df[features]], del_html_content=False)
        x_valid, x_test, x_sub = others[0], others[1], others[2]
        x_train, others = fe.text_to_h1_word_count(x_train, [x_valid, x_test, x_sub])
        x_valid, x_test, x_sub = others[0], others[1], others[2]
        
        
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=[3, 4, 5], free_raw_data=False)
        lgb_valid = lgb.Dataset(x_valid, y_valid, categorical_feature=[3, 4, 5], free_raw_data=False)

        # train
        if tune:
            params = training.tuning(lgb_train, lgb_valid, 100)
            pd.to_pickle(params, 'params/{0}_cv{1}.pkl'.format(name, cv_idx))
        model = training.train(lgb_train, lgb_valid, params)
        score = training.evaluation(model, x_test, y_test)
        scores.append(score)
        model.save_model('model/{0}_cv{1}.txt'.format(name, cv_idx), num_iteration=model.best_iteration)
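# fe.text_to_h1_word_count is not defined in this snippet. A minimal sketch,
# assuming each frame carries an 'html_content' column, would count the words
# inside <h1> tags with BeautifulSoup; the column and function names here are
# illustrative only, not the author's implementation.
from bs4 import BeautifulSoup

def h1_word_count_sketch(x_train, others, col='html_content'):
    def count_h1_words(html):
        soup = BeautifulSoup(str(html), 'html.parser')
        return sum(len(h1.get_text().split()) for h1 in soup.find_all('h1'))

    frames = []
    for df in [x_train] + list(others):
        df = df.copy()
        df['h1_word_count'] = df[col].apply(count_h1_words)
        frames.append(df)
    return frames[0], frames[1:]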