示例#1
0
def testF1(iter):
    """Sweep a LightGBM setting and report the subject F1 score for each step.

    Loads the content vectors and the training CSV, maps every subject name
    to its index in the fixed vocabulary, re-weights the vectors with
    ``Bdc.cal_bdc_with_vec``, then treats the first 9447 rows as training
    data and the remaining rows as a held-out set (one entry per distinct
    ``content_id``, first occurrence wins).  For each of 100 steps the value
    ``(step + 1) * iter`` is printed, passed to ``Lgb.cal_subject`` and the
    resulting predictions are handed to ``GetResult.cal_F1``.

    Args:
        iter: Step size of the sweep.  The name shadows the builtin ``iter``
            but is kept because it is part of the public signature.
            Presumably the swept value is a boosting-round count -- confirm
            against ``Lgb.cal_subject``.

    Returns:
        None.  Results are reported by ``GetResult.cal_F1``.
    """
    train_size = 9447   # rows [0, train_size) train the model, the rest are held out
    sweep_steps = 100

    train_vec = np.array(pd.read_csv('../content_vec_withoutD.csv', header=None))
    data = pd.read_csv('../data/train.csv')

    subject_vocab = ['价格', '配置', '操控', '舒适性', '油耗', '动力', '内饰', '安全性', '空间', '外观']
    subject_index = {name: idx for idx, name in enumerate(subject_vocab)}

    # Subjects that are not in the vocabulary are skipped, exactly as before.
    # NOTE(review): a skipped row would shift subject_list relative to
    # train_vec -- confirm the CSV only contains the ten known subjects.
    subject_list = [
        subject_index[subject]
        for subject in data['subject']
        if subject in subject_index
    ]

    train_vec = Bdc.cal_bdc_with_vec(train_vec, subject_list, len(subject_vocab))

    test_id = list(data['content_id'])[train_size:]
    test_vec = train_vec[train_size:]

    # Keep only the first row for every content_id.  A set makes the
    # membership test O(1) instead of rescanning the list of kept ids.
    seen_ids = set()
    test_id_single = []
    test_vec_single = []
    for pos, content_id in enumerate(test_id):
        if content_id in seen_ids:
            continue
        seen_ids.add(content_id)
        test_id_single.append(content_id)
        test_vec_single.append(test_vec[pos])

    for step in range(sweep_steps):
        setting = (step + 1) * iter
        print(setting)
        res_id, res_subject = Lgb.cal_subject(
            train_vec[:train_size], subject_list[:train_size],
            test_id_single, test_vec_single, setting)
        GetResult.cal_F1(res_id, res_subject, train_size)
示例#2
0
def cvtest():
    """Cross-validate LightGBM on the subject labels and log the results.

    Reads the content vectors and the training CSV, maps each subject name
    to its index in the fixed vocabulary, re-weights the vectors through
    ``Bdc.cal_bdc_with_vec`` and then runs 5-fold ``lgb.cv``:

    * once per subject index (0..9) on a one-vs-rest 0/1 label vector,
      writing "<number of rounds kept> <final rmse-mean>" as one line to
      ``../res.txt1``;
    * once more on the raw subject indices, writing the number of rounds.

    NOTE(review): the visible part of this function never closes ``res`` and
    the last write has no trailing newline -- the function may continue
    beyond this point; confirm before restructuring.
    """
    # Output file for the cross-validation summary (opened for writing, so
    # any previous content is replaced).
    res = open('../res.txt1', 'w')
    # LightGBM parameters shared by every lgb.cv call below.
    params = { 'boosting_type':'gbdt', 'num_leaves':55, 'reg_alpha':0.1, 'reg_lambda':0,
              'max_depth':15, 'objective':'binary',
              'subsample':0.8, 'colsample_bytree':0.8, 'subsample_freq':1,
              'learning_rate':0.06, 'min_child_weight':1, 'random_state':20, 'n_jobs':4}

    # The vector file has no header row; every row is read as data.
    train_vec = pd.read_csv('../content_vec_withoutD.csv', header=None)
    # train_vec_sentiment = pd.read_csv('../content_vec_sentiment.csv', header=None)
    train_vec = np.array(train_vec)
    # train_vec_sentiment = np.array(train_vec_sentiment)

    data = pd.read_csv('../data/train.csv')
    # Fixed vocabulary of the ten subject names; a subject's label is its
    # position in this list.
    subject_vocab = list(['价格', '配置', '操控', '舒适性', '油耗', '动力', '内饰', '安全性', '空间', '外观'])

    # Translate each subject name into its vocabulary index.  A name that is
    # not in the vocabulary appends nothing.
    # NOTE(review): that would leave subject_list shorter than train_vec --
    # confirm the CSV only contains the ten known subjects.
    subject_list = list()
    for i in data['subject']:
        for k in range(10):
            if subject_vocab[k] == i:
                subject_list.append(k)
                break
    # Debug output: the vectors before and after the Bdc transformation.
    print(train_vec)
    train_vec = Bdc.cal_bdc_with_vec(train_vec, subject_list, 10)
    print(train_vec)

    # One empty list per labelled row; not used in the visible part of the
    # function.
    test_res = list()
    for l in range(len(subject_list)):
        test_res.append(list())
    # One-vs-rest pass: one binary cross-validation per subject index.
    for i in range(10):
        # Binary target: 1 where the row's subject is i, otherwise 0.
        train_label_onehot = subject_list.copy()
        for l in range(len(subject_list)):
            if subject_list[l] != i:
                train_label_onehot[l] = 0
            else:
                train_label_onehot[l] = 1
        # print(train_label_onehot)
        # print(train_subject)
        data_train = lgb.Dataset(train_vec, train_label_onehot)
        # NOTE(review): early_stopping_rounds / verbose_eval / show_stdv and
        # the 'rmse-mean' result key depend on the installed LightGBM
        # version -- confirm against the version this project pins.
        clf = lgb.cv(
            params, data_train, num_boost_round=10000, nfold=5, stratified=False, shuffle=True, metrics='rmse',
            early_stopping_rounds=50, verbose_eval=50, show_stdv=True, seed=0)
        # Log "<length of the rmse-mean history> <its last value>".
        res.write(str(len(clf['rmse-mean'])))
        res.write(' ')
        res.write(str(clf['rmse-mean'][-1]))
        res.write('\n')
    # Final pass on the raw subject indices (0..9) with the same parameters.
    # NOTE(review): params still sets objective='binary' while these labels
    # take ten values -- confirm this is intended.
    data_train = lgb.Dataset(train_vec, subject_list)
    clf = lgb.cv(
        params, data_train, num_boost_round=10000, nfold=5, stratified=False, shuffle=True, metrics='rmse',
        early_stopping_rounds=50, verbose_eval=50, show_stdv=True, seed=0)
    res.write(str(len(clf['rmse-mean'])))