Example #1
import argparse

import numpy as np
import scipy.sparse as sp

from data_util import DataUtil

if __name__ == '__main__':
    # The original listing is truncated above this point; the parser setup
    # below is reconstructed from the args.* attributes used later, with
    # illustrative defaults.
    parser = argparse.ArgumentParser()
    parser.add_argument('--cv', help='number of CV folds', type=int, default=3)
    parser.add_argument('--iter', help='number of search iterations', type=int, default=10)
    parser.add_argument('--sample', help='sample rate; negative disables sub-sampling', type=float, default=-1)
    parser.add_argument(
        '--varthresh', help='variance threshold (default 0 means take all)', type=float, default=0)

    args = parser.parse_args()

    rand_state = 1

    n_cv = args.cv
    n_iter_search = args.iter
    sample_rate = args.sample
    sub_sample = sample_rate >= 0  # a negative sample rate means use the full data set
    var_thresh = args.varthresh
    scoring = 'neg_log_loss'  # sklearn's log-loss scorer (named 'log_loss' in older versions)
    verbose = 10

    du = DataUtil()
    du.load_data(sub_sample=sub_sample, sample_rate=sample_rate)

    x_train, x_test = du.vectorize_x(
        ['brand_code', 'model_code', 'label_id_bag'], variance_thresh=var_thresh)
    print('train set shape: ', x_train.shape)
    print('test set shape: ', x_test.shape)

    # xgboost seems to have trouble detecting the number of columns in a
    # sparse matrix, so append a dense all-ones column as a workaround
    x_train_xgb = sp.hstack(
        (x_train, sp.csr_matrix(np.ones((x_train.shape[0], 1)))))
    print('patching train data with a non-zero column to work around the xgb sparse issue')

    y_train = du.get_y_train()
    print('y_train shape: ', y_train.shape)
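
The listing ends before n_cv, n_iter_search, scoring, and verbose are consumed. A minimal sketch of the randomized hyperparameter search they appear to set up, continuing inside the __main__ block and assuming xgboost's scikit-learn wrapper; the parameter grid is illustrative, not taken from the original script:

    from scipy.stats import randint, uniform
    from sklearn.model_selection import RandomizedSearchCV
    from xgboost import XGBClassifier

    # illustrative search space; the original distributions are not shown
    param_dist = {
        'max_depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.3),
        'n_estimators': randint(100, 500),
    }

    search = RandomizedSearchCV(
        XGBClassifier(random_state=rand_state),
        param_distributions=param_dist,
        n_iter=n_iter_search,
        cv=n_cv,
        scoring=scoring,
        verbose=verbose,
        random_state=rand_state,
    )
    search.fit(x_train_xgb, y_train)
    print('best CV log loss:', -search.best_score_)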
Example #2
__email__ = '*****@*****.**'

from data_util import DataUtil
# the first path is immediately overridden below; only the CSV is actually loaded
final_test_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/train_data/NLPCC2016_Stance_Detection_Task_A_Testdata.txt'
final_test_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/data_processing/result/TaskA_all_testdata_15000.csv'

# sentences to be classified
final_test_classify_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/train_data/TaskA_all_testdata_14966.csv'
# labels produced by the classifier
classify_result_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/data_processing/result/cp_L_rf_1000tree_classify_label.csv'

data_util = DataUtil()

final_test_data = data_util.load_data(final_test_file_path)
print(final_test_data.head())
print(final_test_data.shape)
# inspect rows whose WORDS field is null, then drop them
print(final_test_data[final_test_data['WORDS'].isnull()].shape)
print(final_test_data[final_test_data['WORDS'].isnull()])
final_test_data = final_test_data[final_test_data['WORDS'].notnull()]
data_util.save_data(final_test_data, 'result/TaskA_all_testdata_15000_A.csv')
# early exit: the merge step below never runs in this script
quit()

final_test_classify_data = data_util.load_data(final_test_classify_file_path)
classify_result_data = data_util.load_data(classify_result_file_path)
# attach the predicted labels as the STANCE column
final_test_classify_data['STANCE'] = classify_result_data['PREDICT']
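
DataUtil itself is not shown on this page. Below is a minimal sketch of the interface this example relies on, assuming tab-separated files and pandas underneath; it is an inferred stand-in, not the original module:

import pandas as pd

class DataUtil(object):
    """Thin pandas wrapper matching the load_data/save_data calls above."""

    def load_data(self, file_path, sep='\t'):
        # assumption: the .txt/.csv inputs are tab-separated tables
        return pd.read_csv(file_path, sep=sep, encoding='utf8')

    def save_data(self, data, file_path, sep='\t'):
        data.to_csv(file_path, sep=sep, index=False, encoding='utf8')

Note that the DataUtil imported in Example #3 below exposes a different load_data(path, labeled) signature returning a (feature, label) pair, so the examples likely ship their own variants of the module.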
Example #3
from data_util import DataUtil
from lstm import SemiLSTM

if __name__ == '__main__':
    # per Weibo's convention, text is truncated to a maximum length of 140
    data_util = DataUtil()
    # 1. Build the LSTM network
    lstm = SemiLSTM(lr=1e-4, epochs=20, batch_size=50)
    feature, label = data_util.load_data('data/train.txt', True)
    unlabeled_data, _ = data_util.load_data('data/unlabeled.txt', False)
    test_data, test_label = data_util.load_data('data/test.txt', True)
    lstm.build_lstm([32])
    lstm.train_semi(feature,
                    label,
                    test_data,
                    test_label,
                    unlabeled_data,
                    round=5,
                    saved_model='my-lstm')
    lstm.test(test_data, test_label)
    # 2. Use the trained model to predict whether a text is harmful speech
    saved_lstm = SemiLSTM(lr=1e-4, epochs=20, batch_size=50)
    # sample text (zh): on voicing one's own interests without being used by outside forces
    text = '如何真正为自己的利益发声,而不被境外势力利用?那些势力并不关心你想要的民主,它们只想要中国弱下去'
    feature = data_util.extract_feature(text)
    result = saved_lstm.test_text(feature, saved_model='my-lstm')
    print(result)
    # sample text (zh): a news snippet about PM Suga bowing and apologizing at a press conference
    text = '菅义伟在开记者会,两次鞠躬、向国民道歉,“没能解除紧急事态,我非常抱歉”。记者问,“没能解除紧急事态的原因是什么?您自己觉得充分向国民说明了吗?”v光计划 。'
    feature = data_util.extract_feature(text)
    result = saved_lstm.test_text(feature, saved_model='my-lstm')
    print(result)
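
SemiLSTM is likewise external to this listing, but the train_semi(..., round=5) call suggests self-training: fit on the labeled data, pseudo-label the unlabeled pool, fold confident predictions back into the training set, and repeat. A minimal sketch of that loop in Keras, assuming binary labels and already-vectorized 3-D inputs; the architecture, confidence threshold, and function name are illustrative, not the original implementation:

import numpy as np
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

def train_semi(x, y, x_unlabeled, rounds=5, epochs=20, batch_size=50, conf=0.95):
    """Self-training loop: repeatedly absorb confidently pseudo-labeled samples."""
    model = Sequential([
        LSTM(32, input_shape=x.shape[1:]),  # mirrors build_lstm([32]) above
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy')
    for _ in range(rounds):
        model.fit(x, y, epochs=epochs, batch_size=batch_size, verbose=0)
        if len(x_unlabeled) == 0:
            break
        probs = model.predict(x_unlabeled, verbose=0).ravel()
        confident = (probs > conf) | (probs < 1 - conf)
        if not confident.any():
            break
        # move confidently pseudo-labeled samples into the labeled set
        x = np.concatenate([x, x_unlabeled[confident]])
        y = np.concatenate([y, (probs[confident] > 0.5).astype(y.dtype)])
        x_unlabeled = x_unlabeled[~confident]
    return model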