# Training setup: preprocess the data bundle, prepare the output directory and
# log file, persist the vocabularies, then build the embedding, model and loss.

# Run the bundle through the data pipe, then rename the CHAR_INPUT field (and,
# with rename_vocab=True, its vocabulary) to INPUT.
data_bundle = data_pipe.process(data_bundle)
data_bundle.rename_field(
    field_name=Const.CHAR_INPUT,
    new_field_name=Const.INPUT,
    ignore_miss_dataset=True,
    rename_vocab=True,
)
print_data_bundle(data_bundle)

# Directory that receives the log file and the pickled vocabularies.
# NOTE(review): init_file_path presumably creates/prepares this directory --
# confirm against its definition.
model_path = './data/UCAS_NLP_TC/model_textcnn_topk'
init_file_path(model_path)

# Write the log to a file. time.strftime() called without a time tuple formats
# the current local time, so the file name carries the start timestamp.
log_file_path = os.path.join(
    model_path,
    'log_{}.txt'.format(time.strftime("%Y-%m-%d-%H-%M-%S")),
)
logger.add_file_handler(log_file_path)

# Fetch the input and target vocabularies from the processed bundle.
# NOTE(review): 'words' is presumably the value of Const.INPUT after the
# rename above -- confirm.
logger.warn('获取词典')
char_vocab = data_bundle.get_vocab('words')
logger.info('char_vocab:{}'.format(char_vocab))
target_vocab = data_bundle.get_vocab('target')
logger.info('target_vocab:{}'.format(target_vocab))

# Persist both vocabularies next to the model.
char_vocab_pkl_file = os.path.join(model_path, 'vocab_char.pkl')
target_vocab_pkl_file = os.path.join(model_path, 'target_char.pkl')
save_serialize_obj(char_vocab, char_vocab_pkl_file)
save_serialize_obj(target_vocab, target_vocab_pkl_file)
# Only the character-vocabulary path is reported in the log.
logger.info('词典序列化:{}'.format(char_vocab_pkl_file))

# Static embedding over the character vocabulary.
logger.warn('选择预训练词向量')
word2vec_embed = StaticEmbedding(
    char_vocab,
    model_dir_or_name='cn-char-fastnlp-100d',
)

# CNN text classifier with one output class per target-vocabulary entry.
logger.warn('神经网络模型')
model = CNNText(word2vec_embed, num_classes=len(target_vocab))
logger.info(model)

# Training hyper-parameters start here: the loss function.
logger.warn('训练超参数设定')
loss = CrossEntropyLoss()
if __name__ == "__main__":
    # Prediction entry point. Example of one prediction output record:
    #   {"id": 0, "label": "102", "label_desc": "news_entertainment"}

    # Data files.
    test_data_json_file_name = './data/tnews_public/test.json'
    label_json_file_name = './data/tnews_public/labels.json'

    # Artifacts stored under the model directory: pickled vocabularies, the
    # saved model, and the prediction output files.
    model_path = './data/tnews_public/model_textcnn'
    char_vocab_pkl_file = os.path.join(model_path, 'vocab_char.pkl')
    target_vocab_pkl_file = os.path.join(model_path, 'target_char.pkl')
    model_name = os.path.join(
        model_path, 'best_CNNText_f_2020-05-14-23-33-55')
    predict_output_json_file_name = os.path.join(
        model_path, 'pred_2020-05-14-23-33-55.json')
    predict_output_file_name = os.path.join(
        model_path, 'pred_2020-05-14-23-33-55.txt')

    # Build the label_desc -> label mapping from the labels file; a repeated
    # label_desc keeps the value of its last occurrence.
    logger.warn('加载标签映射关系')
    json_file_iter = read_json_file_iter(label_json_file_name)
    label_link_dict = {
        label_row['label_desc']: label_row['label']
        for label_row in json_file_iter
    }
    logger.info(label_link_dict)

    # Load the saved model and call eval() on it before prediction.
    # NOTE(review): torch.load unpickles the file -- only load model files
    # from a trusted source.
    logger.warn('开始加载模型')
    model = torch.load(model_name)
    model.eval()
    logger.info('模型加载完毕:\n{}'.format(model))

    # Restore the serialized vocabularies.
    # NOTE(review): load_serialize_obj presumably unpickles the .pkl files --
    # confirm against its definition.
    logger.warn('获取词典')
    char_vocab = load_serialize_obj(char_vocab_pkl_file)
    logger.info('char_vocab:{}'.format(char_vocab))
    target_vocab = load_serialize_obj(target_vocab_pkl_file)
    logger.info('target_vocab:{}'.format(target_vocab))

    # Next stage: loading the test data.
    logger.warn('加载测试数据')