Exemplo n.º 1
0
def testF1(iter):
    """Sweep a LightGBM setting and report subject F1 on a held-out tail.

    Loads the document vectors and the training labels, re-weights the
    vectors through ``Bdc.cal_bdc_with_vec``, trains on the first 9447 rows
    and evaluates on the remaining rows, keeping only the first row seen for
    each distinct ``content_id``.  For every ``k`` in 1..100 the value
    ``k * iter`` is printed, passed as the last argument of
    ``Lgb.cal_subject`` and the predictions are handed to
    ``GetResult.cal_F1``.

    Args:
        iter: Step size; ``k * iter`` is forwarded to ``Lgb.cal_subject``
            (presumably the number of boosting rounds -- confirm against
            ``Lgb.cal_subject``).  The name shadows the builtin but is kept
            so existing keyword callers keep working.
    """
    split_at = 9447  # rows before this index train the model, the rest test it

    train_vec = np.array(
        pd.read_csv('../content_vec_withoutD.csv', header=None))
    data = pd.read_csv('../data/train.csv')

    subject_vocab = ['价格', '配置', '操控', '舒适性', '油耗', '动力', '内饰', '安全性', '空间', '外观']
    subject_index = {name: idx for idx, name in enumerate(subject_vocab)}
    # NOTE(review): rows whose subject is not in the vocabulary are skipped,
    # exactly as before; if that ever happens the labels no longer line up
    # with the rows of train_vec.
    subject_list = [subject_index[s] for s in data['subject']
                    if s in subject_index]

    train_vec = Bdc.cal_bdc_with_vec(train_vec, subject_list, 10)

    test_id = list(data['content_id'])[split_at:]
    test_vec = train_vec[split_at:]

    # Keep the first occurrence of every content id.  A set makes the
    # membership test O(1) instead of scanning the result list each time.
    seen_ids = set()
    test_id_single = []
    test_vec_single = []
    for content_id, vec in zip(test_id, test_vec):
        if content_id in seen_ids:
            continue
        seen_ids.add(content_id)
        test_id_single.append(content_id)
        test_vec_single.append(vec)

    for step in range(1, 101):
        rounds = step * iter
        print(rounds)
        res_id, res_subject = Lgb.cal_subject(
            train_vec[:split_at], subject_list[:split_at],
            test_id_single, test_vec_single, rounds)
        GetResult.cal_F1(res_id, res_subject, split_at)
Exemplo n.º 2
0
def cal_subject(train_vec, test_vec, subject_list):
    """Predict subject labels for ``test_vec`` with a BDC-weighted model.

    Per-column weights are obtained from ``Bdc.cal_bdc`` on the training
    data; every non-zero cell of both matrices is multiplied by the weight
    of its column, a LightGBM classifier is fitted on the weighted training
    matrix and used to predict the weighted test matrix.

    Neither input array is modified: the weighting is applied to copies.
    (Previously ``test_vec`` was re-weighted in place, so a caller that
    reused it afterwards silently worked on already-weighted values.)

    Args:
        train_vec: 2-D training matrix (must provide ``.shape`` and
            ``.copy()``).
        test_vec: 2-D test matrix with the same column layout.
        subject_list: Training labels passed to ``Bdc.cal_bdc`` and to
            ``fit``.

    Returns:
        The array returned by ``clf.predict`` for the weighted test matrix.
    """
    bdc = Bdc.cal_bdc(train_vec, subject_list, 10)

    def _bdc_weighted(vec):
        # Work on a copy so the caller's array is left untouched.
        weighted = vec.copy()
        for row in range(weighted.shape[0]):
            for col in range(weighted.shape[1]):
                if weighted[row][col] != 0:
                    weighted[row][col] = bdc[col] * weighted[row][col]
        return weighted

    train_weighted = _bdc_weighted(train_vec)
    test_weighted = _bdc_weighted(test_vec)

    # clf = svm.LinearSVC()
    clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=55, reg_alpha=0.0, reg_lambda=1,
        max_depth=15, n_estimators=6000, objective='binary',
        subsample=0.8, colsample_bytree=0.8, subsample_freq=1,
        learning_rate=0.06, min_child_weight=1, random_state=20, n_jobs=4)

    clf.fit(train_weighted, subject_list)
    predict_subject = clf.predict(test_weighted)
    print(predict_subject)
    return predict_subject
Exemplo n.º 3
0
def cvtest():
    """Cross-validate one-vs-rest subject models and log the round counts.

    Loads the document vectors and training labels, re-weights the vectors
    through ``Bdc.cal_bdc_with_vec`` and, for each of the 10 subjects, runs
    ``lgb.cv`` on a 0/1 target ("is this subject").  For every subject one
    line ``"<number of rounds kept> <last rmse-mean>"`` is written to
    ``../res.txt1``.  A final ``lgb.cv`` run on the raw subject indices
    appends only its round count (without a trailing newline).

    The result file is opened with a context manager so it is flushed and
    closed even if a cross-validation run raises.
    """
    params = { 'boosting_type':'gbdt', 'num_leaves':55, 'reg_alpha':0.1, 'reg_lambda':0,
              'max_depth':15, 'objective':'binary',
              'subsample':0.8, 'colsample_bytree':0.8, 'subsample_freq':1,
              'learning_rate':0.06, 'min_child_weight':1, 'random_state':20, 'n_jobs':4}
    # Keyword arguments shared by every lgb.cv call below.
    cv_options = dict(
        num_boost_round=10000, nfold=5, stratified=False, shuffle=True,
        metrics='rmse', early_stopping_rounds=50, verbose_eval=50,
        show_stdv=True, seed=0)

    train_vec = np.array(
        pd.read_csv('../content_vec_withoutD.csv', header=None))
    data = pd.read_csv('../data/train.csv')

    subject_vocab = ['价格', '配置', '操控', '舒适性', '油耗', '动力', '内饰', '安全性', '空间', '外观']
    subject_index = {name: idx for idx, name in enumerate(subject_vocab)}
    # Rows whose subject is not in the vocabulary are skipped, as before.
    subject_list = [subject_index[s] for s in data['subject']
                    if s in subject_index]

    print(train_vec)
    train_vec = Bdc.cal_bdc_with_vec(train_vec, subject_list, 10)
    print(train_vec)

    with open('../res.txt1', 'w') as res:
        for subject in range(10):
            # One-vs-rest target: 1 for the current subject, 0 otherwise.
            one_vs_rest = [1 if s == subject else 0 for s in subject_list]
            data_train = lgb.Dataset(train_vec, one_vs_rest)
            clf = lgb.cv(params, data_train, **cv_options)
            rmse_curve = clf['rmse-mean']
            res.write(str(len(rmse_curve)) + ' ' + str(rmse_curve[-1]) + '\n')

        data_train = lgb.Dataset(train_vec, subject_list)
        clf = lgb.cv(params, data_train, **cv_options)
        res.write(str(len(clf['rmse-mean'])))
Exemplo n.º 4
0
def get_res(iter, baseline):
    """Train on the full training set and write predictions for the test set.

    Every strictly positive cell of the training matrix and of the matrix
    returned by ``Doc2Vec.test2vec`` is replaced by the BDC weight of its
    column (weights come from ``Bdc.cal_bdc`` on the training data).  The
    weighted matrices, the subject labels and the test ids are passed to
    ``Lgb.cal_subject_mul`` together with ``iter`` and ``baseline``; its
    three results are forwarded to ``GetResult.res2doc_mul``.

    Args:
        iter: Forwarded unchanged to ``Lgb.cal_subject_mul``.
        baseline: Forwarded unchanged to ``Lgb.cal_subject_mul``.
    """
    train_vec = pd.read_csv('../content_vec_withoutD.csv', header=None)
    test_file = pd.read_csv('../data/test_public.csv')
    train_vec = np.array(train_vec)
    data = pd.read_csv('../data/train.csv')

    subject_vocab = ['价格', '配置', '操控', '舒适性', '油耗', '动力', '内饰', '安全性', '空间', '外观']
    vocab_position = {word: pos for pos, word in enumerate(subject_vocab)}
    # Subjects missing from the vocabulary contribute no label.
    subject_list = [vocab_position[subject] for subject in data['subject']
                    if subject in vocab_position]

    # Read here, then replaced by the third result of cal_subject_mul below.
    value_list = list(data['sentiment_value'])

    bdc = Bdc.cal_bdc(train_vec, subject_list, 10)
    # Positive cells take the weight of their column; others stay as they are.
    for row, col in np.argwhere(train_vec > 0):
        train_vec[row][col] = bdc[col]
    print(train_vec)

    test_vec = Doc2Vec.test2vec()
    for row, col in np.argwhere(test_vec > 0):
        test_vec[row][col] = bdc[col]
    print(test_vec)

    test_id = list(test_file['content_id'])

    res_id, res_subject, value_list = Lgb.cal_subject_mul(
        train_vec, subject_list, test_id, test_vec, iter, baseline)
    GetResult.res2doc_mul(res_id, res_subject, value_list)
Exemplo n.º 5
0
def cv_test_mul():
    """Cross-validate one-vs-rest subject models on BDC-valued features.

    Every strictly positive cell of the training matrix is replaced by the
    BDC weight of its column (``Bdc.cal_bdc`` on the training data).  For
    each of the 10 subjects ``lgb.cv`` is run on a 0/1 target and one line
    ``"<number of rounds kept> <last rmse-mean>"`` is written to
    ``res2.txt``.

    The result file is opened with a context manager so it is closed even
    when a cross-validation run raises.
    """
    # Identical for every subject, so built once outside the loop.
    params = {
        'boosting_type': 'gbdt',
        'num_leaves': 55,
        'reg_alpha': 0.1,
        'reg_lambda': 1,
        'max_depth': 15,
        'objective': 'binary',
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'subsample_freq': 1,
        'learning_rate': 0.06,
        'min_child_weight': 1,
        'random_state': 20,
        'n_jobs': 4
    }

    train_vec = np.array(pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/content_vec_withoutD.csv',
        header=None))
    data = pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/data/train.csv')

    subject_vocab = ['价格', '配置', '操控', '舒适性', '油耗', '动力', '内饰', '安全性', '空间', '外观']
    subject_index = {name: idx for idx, name in enumerate(subject_vocab)}
    # Rows whose subject is not in the vocabulary are skipped, as before.
    subject_list = np.array([subject_index[s] for s in data['subject']
                             if s in subject_index])

    bdc = Bdc.cal_bdc(train_vec, subject_list, 10)
    for row, col in np.argwhere(train_vec > 0):
        train_vec[row][col] = bdc[col]
    print(train_vec)

    # NOTE(review): the test matrix is built and weighted but not consumed
    # by the cross-validation below; kept because Doc2Vec.test2vec is opaque
    # from here and may have side effects -- confirm and drop if not needed.
    test_vec = Doc2Vec.test2vec()
    for row, col in np.argwhere(test_vec > 0):
        test_vec[row][col] = bdc[col]

    with open('res2.txt', 'w') as res:
        for subject in range(10):
            # One-vs-rest target with the same dtype as the label array.
            subject_oh = (subject_list == subject).astype(subject_list.dtype)

            data_train = lgb.Dataset(train_vec, subject_oh)
            # A fresh dict per run, as the original built one per iteration.
            clf = lgb.cv(dict(params),
                         data_train,
                         num_boost_round=10000,
                         nfold=5,
                         stratified=False,
                         shuffle=True,
                         metrics='rmse',
                         early_stopping_rounds=50,
                         verbose_eval=50,
                         show_stdv=True,
                         seed=0)
            rmse_curve = clf['rmse-mean']
            res.write(str(len(rmse_curve)) + ' ' + str(rmse_curve[-1]) + '\n')
Exemplo n.º 6
0
def run_base_bdc():
    """Train 10-fold sentiment and subject models and write a submission.

    Pipeline:
      1. Load the training vectors and labels; replace every strictly
         positive cell of the train/test matrices by the BDC weight of its
         column (``Bdc.cal_bdc`` on the training data).
      2. For each stratified fold, fit one classifier on the sentiment
         values and one on the subjects.  A fold's test-set predictions are
         kept only if its validation ``micro_avg_f1`` exceeds 0.7
         (sentiment) or 0.6 (subject).
      3. Majority-vote the kept predictions per test row, decode subjects
         with the pickled label encoder and write ``../submit_bdc.csv``.

    Fixes relative to the previous version:
      * the subject out-of-fold array stored the sentiment predictions;
      * folds that failed the threshold still voted (as class 0) because
        their columns were left at zero;
      * the mean F1 printout divided by zero when no fold was kept;
      * ``StratifiedKFold`` was given ``random_state`` without
        ``shuffle=True``, which current scikit-learn rejects; it had no
        effect on the (unshuffled) folds, so it is simply dropped;
      * the pickle file handle was never closed.
    """
    train_vec = np.array(pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/content_vec_withoutD.csv',
        header=None))
    test_file = pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/data/test_public.csv')
    data = pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/data/train.csv')

    subject_vocab = ['价格', '配置', '操控', '舒适性', '油耗', '动力', '内饰', '安全性', '空间', '外观']
    subject_index = {name: idx for idx, name in enumerate(subject_vocab)}
    # Rows whose subject is not in the vocabulary are skipped, as before.
    subject_list = np.array([subject_index[s] for s in data['subject']
                             if s in subject_index])
    value_list = np.array(list(data['sentiment_value']))

    bdc = Bdc.cal_bdc(train_vec, subject_list, 10)
    for row, col in np.argwhere(train_vec > 0):
        train_vec[row][col] = bdc[col]
    print(train_vec)

    test_vec = Doc2Vec.test2vec()
    for row, col in np.argwhere(test_vec > 0):
        test_vec[row][col] = bdc[col]
    print(test_vec)

    test_id = list(test_file['content_id'])

    N = 10
    # Unshuffled stratified folds; random_state is meaningless without
    # shuffling and raises on recent scikit-learn versions.
    kf = StratifiedKFold(n_splits=N).split(train_vec, subject_list)

    shared_kwargs = dict(boosting_type='gbdt',
                         num_leaves=80,
                         reg_alpha=0.1,
                         reg_lambda=1,
                         max_depth=8,
                         objective='binary',
                         subsample=0.8,
                         colsample_bytree=0.8,
                         subsample_freq=1,
                         learning_rate=0.06,
                         min_child_weight=1,
                         random_state=20,
                         n_jobs=4)
    clf = lgb.LGBMClassifier(n_estimators=500, **shared_kwargs)   # sentiment
    clf_1 = lgb.LGBMClassifier(n_estimators=10, **shared_kwargs)  # subject

    # Out-of-fold predictions on the training rows (sentiment / subject).
    # They are filled in but not consumed further down in this function.
    y_train_oofp = np.zeros_like(subject_list, dtype='float64')
    y_train_oofp1 = np.zeros_like(subject_list, dtype='float64')

    # One column of test-set predictions per fold.
    y_test_oofp = np.zeros((test_vec.shape[0], N))
    y_test_oofp_1 = np.zeros((test_vec.shape[0], N))

    acc = 0
    vcc = 0
    sentiment_folds = []  # fold indices whose sentiment model was kept
    subject_folds = []    # fold indices whose subject model was kept

    for i, (train_fold, test_fold) in enumerate(kf):
        X_train, X_validate = train_vec[train_fold, :], train_vec[test_fold, :]
        label_train, label_validate = value_list[train_fold], value_list[test_fold]
        label_1_train, label_1_validate = subject_list[train_fold], subject_list[test_fold]

        clf.fit(X_train, label_train)
        val_ = clf.predict(X_validate)
        y_train_oofp[test_fold] = val_
        sentiment_f1 = micro_avg_f1(label_validate, val_)
        if sentiment_f1 > 0.7:
            sentiment_folds.append(i)
            print('sentiment_value_f1:%f' % sentiment_f1)
            acc += sentiment_f1
            y_test_oofp[:, i] = clf.predict(test_vec)

        # clf = svm.LinearSVC(loss='hinge', tol=1e-4, C=0.6)

        clf_1.fit(X_train, label_1_train)
        val_1 = clf_1.predict(X_validate)
        y_train_oofp1[test_fold] = val_1
        subject_f1 = micro_avg_f1(label_1_validate, val_1)
        if subject_f1 > 0.6:
            subject_folds.append(i)
            vcc += subject_f1
            y_test_oofp_1[:, i] = clf_1.predict(test_vec)

    # Mean F1 over the kept folds; NaN when no fold passed the threshold.
    print(acc / len(sentiment_folds) if sentiment_folds else float('nan'))
    print(vcc / len(subject_folds) if subject_folds else float('nan'))

    # Only kept folds may vote.  If none was kept, fall back to all columns
    # (all zeros), which reproduces the previous output for that case.
    subject_cols = subject_folds if subject_folds else list(range(N))
    sentiment_cols = sentiment_folds if sentiment_folds else list(range(N))

    # NOTE: pickle executes arbitrary code on load; this file must come from
    # a trusted source.
    with open('../tmp/label_encoder.sav', 'rb') as encoder_file:
        lbl = pk.load(encoder_file)

    res_2 = []
    for row in y_test_oofp_1[:, subject_cols]:
        votes = [int(v) for v in row]
        winner = Counter(votes).most_common(1)[0][0]
        res_2.append(lbl.inverse_transform([winner])[0])

    res = []
    for row in y_test_oofp[:, sentiment_cols]:
        votes = list(row)
        res.append(max(set(votes), key=votes.count))

    result = pd.DataFrame()
    result['content_id'] = list(test_id)
    result['subject'] = list(res_2)
    result['sentiment_value'] = list(res)
    result['sentiment_value'] = result['sentiment_value'].astype(int)
    result['sentiment_word'] = ''
    result.to_csv('../submit_bdc.csv', index=False)
Exemplo n.º 7
0
def cal_value(train_vec, test_vec, subject_list, predict_subject, value_list):
    """Predict a sentiment value per test row with one model per subject.

    For each subject index 0..9 the training rows labelled with that subject
    and the test rows predicted as that subject are selected.  Both
    selections are copied to float arrays, every non-zero cell is multiplied
    by the BDC weight of its column (``Bdc.cal_bdc`` on the selected
    training rows and their values, 3 classes), a LightGBM classifier is
    fitted and its predictions are written back to the positions of the
    selected test rows.

    Subjects with no predicted test row are skipped: there is nothing to
    predict, and fitting/predicting on an empty selection is avoided.

    Args:
        train_vec: 2-D training matrix; indexed with a boolean row mask, so
            it is expected to be a NumPy array with one row per entry of
            ``subject_list``.
        test_vec: 2-D test matrix with one row per entry of
            ``predict_subject``.
        subject_list: Subject index of every training row.
        predict_subject: Predicted subject index of every test row.
        value_list: Sentiment value of every training row.

    Returns:
        1-D float array with one predicted value per test row; rows whose
        predicted subject is outside 0..9 keep the initial 0.0.
    """
    train_subjects = np.asarray(subject_list)
    test_subjects = np.asarray(predict_subject)
    train_values = np.asarray(value_list)

    predict_value = np.zeros(len(predict_subject))

    for i in range(10):
        train_rows = train_subjects == i
        test_rows = test_subjects == i
        if not test_rows.any():
            # No test row was assigned this subject: nothing to predict.
            continue

        # Boolean indexing plus astype() yields independent float copies,
        # so the caller's matrices are never modified.
        train_vec_with_subject = train_vec[train_rows].astype(float)
        value_list_with_subject = train_values[train_rows].astype(float)
        test_vec_with_subject = test_vec[test_rows].astype(float)

        bdc = Bdc.cal_bdc(train_vec_with_subject, value_list_with_subject, 3)

        # Scale every non-zero cell by the weight of its column.
        for features in (train_vec_with_subject, test_vec_with_subject):
            for row, col in np.argwhere(features != 0):
                features[row][col] = bdc[col] * features[row][col]

        # clf = svm.LinearSVC()
        clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=30, reg_alpha=0.0, reg_lambda=1,
                                 max_depth=6, n_estimators=6000, objective='binary',
                                 subsample=0.8, colsample_bytree=0.8, subsample_freq=1,
                                 learning_rate=0.06, min_child_weight=1, random_state=20, n_jobs=4,
                                 min_child_samples=5)
        clf.fit(train_vec_with_subject, value_list_with_subject)
        print(test_vec_with_subject.shape)
        predict_value_i = clf.predict(test_vec_with_subject)

        # The mask preserves row order, so predictions map back one-to-one.
        predict_value[test_rows] = predict_value_i

    return predict_value
Exemplo n.º 8
0
        test_vec_with_subject = np.zeros([test_i, test_vec.shape[1]])

        k = 0
        for d in range(train_vec.shape[0]):
            if subject_list[d] == i:
                train_vec_with_subject[k] = train_vec[d]
                value_list_with_subject[k] = value_list[d]
                k += 1

        k = 0
        for d in range(test_vec.shape[0]):
            if predict_subject[d] == i:
                test_vec_with_subject[k] = test_vec[d]
                k += 1

        bdc = Bdc.cal_bdc(train_vec_with_subject, value_list_with_subject, 3)

        for k in range(train_vec_with_subject.shape[0]):
            for j in range(train_vec_with_subject.shape[1]):
                if train_vec_with_subject[k][j] != 0:
                    train_vec_with_subject[k][
                        j] = bdc[j] * train_vec_with_subject[k][j]
                # if j in high_weight_list:
                #     train_vec_with_subject[k][j] = 3 * train_vec_with_subject[k][j]

        for k in range(test_vec_with_subject.shape[0]):
            for j in range(test_vec_with_subject.shape[1]):
                if test_vec_with_subject[k][j] != 0:
                    test_vec_with_subject[k][
                        j] = bdc[j] * test_vec_with_subject[k][j]
                # if j in high_weight_list: