"""Vectorize features and prepare the sparse train/test matrices for XGBoost."""
import argparse

import numpy as np
import scipy.sparse as sp

from data_util import DataUtil

# The --cv, --iter and --sample arguments are reconstructed from how they are
# read below; their help strings and defaults are placeholders.
parser = argparse.ArgumentParser()
parser.add_argument('--cv', help='number of cross-validation folds', type=int, default=3)
parser.add_argument('--iter', help='number of parameter settings sampled', type=int, default=10)
parser.add_argument('--sample', help='sub-sample rate (negative means use the full set)', type=float, default=-1)
parser.add_argument('--varthresh', help='variance thresh (default 0 means take all)', type=float, default=0)
args = parser.parse_args()

rand_state = 1
n_cv = args.cv
n_iter_search = args.iter
sample_rate = args.sample
sub_sample = sample_rate >= 0
var_thresh = args.varthresh
scoring = 'log_loss'
verbose = 10

du = DataUtil()
du.load_data(sub_sample=sub_sample, sample_rate=sample_rate)
x_train, x_test = du.vectorize_x(
    ['brand_code', 'model_code', 'label_id_bag'],
    variance_thresh=var_thresh)
print('train set shape: ', x_train.shape)
print('test set shape: ', x_test.shape)

# xgb seems to have an issue detecting the number of columns of a sparse
# matrix, so pad the train set with an all-ones column.
x_train_xgb = sp.hstack(
    (x_train, sp.csr_matrix(np.ones((x_train.shape[0], 1)))))
print('patching train data with non-zero column to get around xgb sparse issue')

y_train = du.get_y_train()
print('y_train shape: ', y_train.shape)
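# n_cv, n_iter_search, scoring and verbose are defined above, but the search
# itself is cut off in this fragment. Below is a minimal sketch of how they
# could feed sklearn's RandomizedSearchCV over an XGBoost classifier; the
# parameter distributions are assumptions, not the original search space.
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

param_dist = {
    'max_depth': randint(3, 10),          # assumed ranges
    'learning_rate': uniform(0.01, 0.3),
    'n_estimators': randint(100, 500),
}
search = RandomizedSearchCV(
    XGBClassifier(random_state=rand_state),
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=n_cv,
    scoring=scoring,  # note: modern sklearn names this scorer 'neg_log_loss'
    verbose=verbose,
    random_state=rand_state,
)
search.fit(x_train_xgb.tocsr(), y_train)  # csr: the sparse format xgboost handles directly
print('best params: ', search.best_params_)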
from data_util import DataUtil
from lstm import SemiLSTM

if __name__ == '__main__':
    # Per Weibo's limit, text is truncated to a maximum length of 140 characters.
    data_util = DataUtil()

    # 1. Build the LSTM network and train it semi-supervised on the
    #    labeled and unlabeled data.
    lstm = SemiLSTM(lr=1e-4, epochs=20, batch_size=50)
    feature, label = data_util.load_data('data/train.txt', True)
    unlabeled_data, _ = data_util.load_data('data/unlabeled.txt', False)
    test_data, test_label = data_util.load_data('data/test.txt', True)
    lstm.build_lstm([32])
    lstm.train_semi(feature, label, test_data, test_label, unlabeled_data,
                    round=5, saved_model='my-lstm')
    lstm.test(test_data, test_label)

    # 2. Use the trained model to predict whether a piece of text is harmful speech.
    #    Example Weibo posts to classify (kept in Chinese, as the model expects).
    saved_lstm = SemiLSTM(lr=1e-4, epochs=20, batch_size=50)
    text = '如何真正为自己的利益发声,而不被境外势力利用?那些势力并不关心你想要的民主,它们只想要中国弱下去'
    feature = data_util.extract_feature(text)
    result = saved_lstm.test_text(feature, saved_model='my-lstm')
    print(result)

    text = '菅义伟在开记者会,两次鞠躬、向国民道歉,“没能解除紧急事态,我非常抱歉”。记者问,“没能解除紧急事态的原因是什么?您自己觉得充分向国民说明了吗?”v光计划 。'
    feature = data_util.extract_feature(text)
    result = saved_lstm.test_text(feature, saved_model='my-lstm')
    print(result)
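# data_util.extract_feature is not shown in this snippet. A minimal sketch of
# what such a method might do, assuming a character-level vocabulary dict and
# the 140-character Weibo cap noted above; `vocab` and `max_len` are
# hypothetical names, not the repository's actual API.
MAX_LEN = 140

def extract_feature_sketch(text, vocab, max_len=MAX_LEN):
    # Map each character to its vocabulary id (0 for out-of-vocabulary),
    # truncate to max_len, then right-pad with zeros to a fixed length.
    ids = [vocab.get(ch, 0) for ch in text[:max_len]]
    return ids + [0] * (max_len - len(ids))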
__date__ = 'create date: 2016-07-05'
__email__ = '*****@*****.**'

from data_util import DataUtil

final_test_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/train_data/NLPCC2016_Stance_Detection_Task_A_Testdata.txt'
final_test_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/data_processing/result/TaskA_all_testdata_15000.csv'
# Sentences to be classified
final_test_classify_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/train_data/TaskA_all_testdata_14966.csv'
# Labels produced by the classifier
classify_result_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/data_processing/result/cp_L_rf_1000tree_classify_label.csv'

data_util = DataUtil()
final_test_data = data_util.load_data(final_test_file_path)
print(final_test_data.head())
print(final_test_data.shape)

# Inspect, then drop, rows whose 'WORDS' field is missing, and save the
# cleaned test set.
print(final_test_data[final_test_data['WORDS'].isnull()].shape)
print(final_test_data[final_test_data['WORDS'].isnull()])
final_test_data = final_test_data[final_test_data['WORDS'].notnull()]
data_util.save_data(final_test_data, 'result/TaskA_all_testdata_15000_A.csv')
quit()

# NOTE: everything below is skipped by the quit() above.
# Attach the predicted stance labels to the classified sentences.
final_test_classify_data = data_util.load_data(final_test_classify_file_path)
classify_result_data = data_util.load_data(classify_result_file_path)
final_test_classify_data[u'STANCE'] = classify_result_data[u'PREDICT']
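# The assignment above pairs rows purely by position, which silently
# misaligns labels if the two files are ordered differently. Assuming both
# frames carry an 'ID' column (an assumption, not confirmed by this script),
# a key-based merge is a safer alternative:
merged = final_test_classify_data.merge(
    classify_result_data[['ID', 'PREDICT']], on='ID', how='left')
merged['STANCE'] = merged['PREDICT']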