예제 #1
0
def get_train_test_features0():
    """Load the cached 'f0' features and join the extra per-``sid`` feature files.

    The cached training/test feature CSVs named by ``config`` are read, then
    three auxiliary CSVs are left-joined onto both frames on the ``sid``
    column.

    Returns:
        tuple: ``(trn, y, tst, sub)`` where ``trn``/``tst`` are the feature
        frames with ``sid`` and ``click_mode`` removed, ``y`` is the
        ``click_mode`` column of the training frame as an array, and ``sub``
        is a copy of the test frame's ``sid`` column.

    Raises:
        FileNotFoundError: if either cached feature file does not exist.
            (Previously this surfaced as an UnboundLocalError on ``trn``.)
    """
    config.set_feature_name('f0')

    # This variant has no feature generator to fall back on, so a missing
    # cache is a hard error; report it clearly instead of failing later on an
    # undefined local.
    if not (os.path.exists(config.train_feature_file)
            and os.path.exists(config.test_feature_file)):
        raise FileNotFoundError(
            'cached feature files not found: {} / {}'.format(
                config.train_feature_file, config.test_feature_file))

    logger.info('loading the training and test features from files.')
    trn = pd.read_csv(config.train_feature_file)
    tst = pd.read_csv(config.test_feature_file)

    y = trn['click_mode'].values
    sub = tst[['sid']].copy()

    # Auxiliary feature files, merged in this order onto both frames.
    extra_feature_files = (
        '/home/ubuntu/projects/kddcup2019track1/build/feature/od_coord_feature.csv',
        '/home/ubuntu/projects/kddcup2019track1/input/data_set_phase1/var_dist_time.csv',
        '/home/ubuntu/projects/kddcup2019track1/input/data_set_phase1/var_dist_min.csv',
    )
    for path in extra_feature_files:
        feat = pd.read_csv(path)
        trn = trn.merge(feat, how='left', on='sid')
        tst = tst.merge(feat, how='left', on='sid')

    # The identifier and the target must not be fed to the model as features.
    trn.drop(['sid', 'click_mode'], axis=1, inplace=True)
    tst.drop(['sid', 'click_mode'], axis=1, inplace=True)

    return trn, y, tst, sub
예제 #2
0
def get_train_test_features4():
    """Load the cached 'f4' training and test features.

    Returns:
        tuple: ``(trn, y, tst, sub)`` where ``trn``/``tst`` are the feature
        frames with ``sid``, ``pid`` and ``click_mode`` removed, ``y`` is the
        ``click_mode`` column of the training frame as an array, and ``sub``
        is a copy of the test frame's ``sid`` column.

    Raises:
        FileNotFoundError: if either cached feature file does not exist.
            (Previously this surfaced as an UnboundLocalError on ``trn``.)
    """
    config.set_feature_name('f4')

    # No generator exists for this feature set in this function, so a missing
    # cache cannot be recovered from; fail with an explicit message.
    if not (os.path.exists(config.train_feature_file)
            and os.path.exists(config.test_feature_file)):
        raise FileNotFoundError(
            'cached feature files not found: {} / {}'.format(
                config.train_feature_file, config.test_feature_file))

    logger.info('loading the training and test features from files.')
    trn = pd.read_csv(config.train_feature_file)
    tst = pd.read_csv(config.test_feature_file)

    y = trn['click_mode'].values
    sub = tst[['sid']].copy()

    # Identifiers and the target must not be fed to the model as features.
    trn.drop(['sid', 'pid', 'click_mode'], axis=1, inplace=True)
    tst.drop(['sid', 'pid', 'click_mode'], axis=1, inplace=True)

    return trn, y, tst, sub
예제 #3
0
def get_train_test_features3():
    """Return the 'f3' features, building and caching them when needed.

    If both cached feature CSVs named by ``config`` exist they are read
    directly; otherwise the features are generated from the merged raw data
    and written to those paths.

    Returns:
        tuple: ``(trn, y, tst, sub)`` where ``trn``/``tst`` are the feature
        frames with ``sid``, ``pid`` and ``click_mode`` removed, ``y`` is the
        ``click_mode`` column of the training frame as an array, and ``sub``
        is a copy of the test frame's ``sid`` column.
    """
    config.set_feature_name('f3')

    cache_available = (os.path.exists(config.train_feature_file)
                       and os.path.exists(config.test_feature_file))

    if not cache_available:
        raw = merge_raw_data()
        logger.info('generating feature f3.')
        trn, tst = generate_f3(raw)

        # Persist so later runs can take the fast path.
        logger.info('saving the training and test f3 features.')
        trn.to_csv(config.train_feature_file, index=False)
        tst.to_csv(config.test_feature_file, index=False)
    else:
        logger.info('loading the training and test features from files.')
        trn = pd.read_csv(config.train_feature_file)
        tst = pd.read_csv(config.test_feature_file)

    # Pull out the target and the submission ids before they are dropped.
    y = trn['click_mode'].values
    sub = tst[['sid']].copy()

    non_feature_columns = ['sid', 'pid', 'click_mode']
    for frame in (trn, tst):
        frame.drop(non_feature_columns, axis=1, inplace=True)

    return trn, y, tst, sub
예제 #4
0
    prob_trn_tst = 0
    for seed in [0, 17, 23, 29]:
        params['seed'] = 2019 + seed
        print(params)
        clf = lgb.train(params,
                        lgb_trn,
                        valid_sets=[lgb_trn],
                        num_boost_round=best_iteration,
                        verbose_eval=50,
                        feval=eval_f)

        prob_trn_tst += clf.predict(tst)

    prob_trn_tst /= 4.0

    np.savetxt(config.predict_trn_tst_bag_file, prob_trn_tst, delimiter=',')

    trn_tst = np.argmax(prob_trn_tst, axis=1)

    return trn_tst


if __name__ == '__main__':
    # Script entry point: load the 'f2a' feature set, train the LightGBM
    # model, and write a submission from the test predictions.

    trn, y, tst, sub = get_train_test_features2a()

    # Algorithm/feature names are set on config after loading the features and
    # before training.
    # NOTE(review): presumably these names select the output paths used by
    # train_lgb / submit_result -- confirm in config.
    config.set_algo_name('lgb3')
    config.set_feature_name('f2a')
    p_tst = train_lgb(trn, y, tst)

    # sub holds the test ids returned by the feature loader; p_tst is the
    # value returned by train_lgb for the test set.
    submit_result(sub, p_tst)
    prob_trn_tst = 0
    for seed in [0, 17, 23, 29]:
        params['seed'] = 2019 + seed
        print(params)
        clf = lgb.train(params,
                        lgb_trn,
                        valid_sets=[lgb_trn],
                        num_boost_round=best_iteration,
                        verbose_eval=50,
                        feval=eval_f)

        prob_trn_tst += clf.predict(tst)

    prob_trn_tst /= 4.0

    np.savetxt(config.predict_trn_tst_bag_file, prob_trn_tst, delimiter=',')

    trn_tst = np.argmax(prob_trn_tst, axis=1)

    return trn_tst


if __name__ == '__main__':
    # Script entry point: load the 'f0' feature set, train the LightGBM
    # model, and write a submission from the test predictions.

    trn, y, tst, sub = get_train_test_features0()

    # Algorithm/feature names are set on config after loading the features and
    # before training.
    # NOTE(review): presumably these names select the output paths used by
    # train_lgb / submit_result -- confirm in config.
    config.set_algo_name('lgb5')
    config.set_feature_name('f0')
    p_tst = train_lgb(trn, y, tst)

    # sub holds the test ids returned by the feature loader; p_tst is the
    # value returned by train_lgb for the test set.
    submit_result(sub, p_tst)
                
    X_trn, y_trn, X_val, y_val = trn.iloc[:-63388,:], y[:-63388], trn.iloc[-63388:,], y[-63388:]
    
    eval_set = [(X_trn, y_trn), (X_val, y_val)]
    clf.fit(X_trn, y_trn, eval_set=eval_set, eval_metric=f1_weighted, categorical_feature=cat_cols, verbose=10, early_stopping_rounds=100)
    #clf.fit(X_trn, y_trn, eval_set=eval_set, eval_metric=f1_adj_weighted, categorical_feature=cat_cols, verbose=10, early_stopping_rounds=100)    

    feature_importances = list(clf.feature_importances_)
    feature_names = trn.columns.values.tolist()
    imp = pd.DataFrame({'feature_importances': feature_importances, 'feature_names':feature_names})
    imp = imp.sort_values('feature_importances', ascending=False).drop_duplicates()
    print("[+] All feature importances", list(imp.values))

    pred = clf.predict(X_val, num_iteration=clf.best_iteration_)
    print('Val F1: %f',  f1_score(y_val, pred, average='weighted'))
    print(classification_report(y_val, pred))

if __name__ == '__main__':
    # Script entry point for a validation-only run: train_lgb is called with
    # the training data only, so tst and sub are loaded but not used here.

    trn, y, tst, sub = get_train_test_features2()
    # Earlier approach, kept for reference: build trn/y directly from the
    # cached training feature file instead of using the feature loader.
    #df = pd.read_csv(config.train_feature_file)
    #df = df[~pd.isnull(df['click_mode'])]

    #trn = df.drop(['sid','req_time', 'click_mode'], axis=1)
    #y = df['click_mode'].values

    # Algorithm/feature names are set on config after loading the features and
    # before training.
    config.set_algo_name('lgb4')
    config.set_feature_name('f2') # f2 = (NOTE(review): original note left unfinished)
    train_lgb(trn, y)