] # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( use_data_parallel=False, use_pyreader=args.use_pyreader, use_cuda=args.use_gpu, batch_size=args.batch_size, enable_memory_optim=False, checkpoint_dir=args.checkpoint_dir, strategy=hub.finetune.strategy.DefaultFinetuneStrategy()) # Define a sequence labeling finetune task by PaddleHub's API seq_label_task = hub.SequenceLabelTask(data_reader=reader, feature=sequence_output, feed_list=feed_list, max_seq_len=args.max_seq_len, num_classes=dataset.num_labels, config=config) # test data data = [ ["我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。"], ["为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。"], ["其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。"], ["有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。"], ["不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。"], ] run_states = seq_label_task.predict(data=data) results = [run_state.run_results for run_state in run_states]
# Use "sequence_output" for token-level output. sequence_output = outputs["sequence_output"] # Setup RunConfig for PaddleHub Fine-tune API config = hub.RunConfig( use_data_parallel=False, use_cuda=args.use_gpu, batch_size=args.batch_size, checkpoint_dir=args.checkpoint_dir, strategy=hub.finetune.strategy.DefaultFinetuneStrategy()) # Define a sequence labeling fine-tune task by PaddleHub's API # if add crf, the network use crf as decoder seq_label_task = hub.SequenceLabelTask( feature=sequence_output, max_seq_len=args.max_seq_len, num_classes=num_classes, config=config, add_crf=False) # Data to be predicted text_a = [ "我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。", "为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。", "其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。", "有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。", "不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。", ] # Add 0x02 between characters to match the format of training data, # otherwise the length of prediction results will not match the input string # if the input string contains non-Chinese characters.
def main():
    """Fine-tune and/or run prediction with a PaddleHub sequence-labeling task.

    The function is driven entirely by module-level state: ``args`` (run
    options), ``schema_labels`` (handed to ``EEDataset``) and, for
    prediction, ``predict_data`` / ``predict_sents``.

    * ``args.do_train``: the task is fine-tuned and evaluated.
    * ``args.do_predict``: every sentence in ``predict_sents`` receives a
      ``"labels"`` entry with its predicted tags, is serialized to JSON, and
      the lines are passed to ``write_by_lines`` together with the name
      ``"<args.predict_data>.<args.do_model>.pred"``.
    """
    # Load the PaddleHub pretrained model.
    # More pretrained models:
    # https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    # Alternative: model_name = "ernie_tiny"
    model_name = "chinese-roberta-wwm-ext-large"
    pretrained = hub.Module(name=model_name)
    module_inputs, module_outputs, _program = pretrained.context(
        trainable=True, max_seq_len=args.max_seq_len)

    # Wrap the dataset in a SequenceLabelReader that uses the module's vocab.
    dataset = EEDataset(args.data_dir, schema_labels, model=args.do_model)
    reader = hub.reader.SequenceLabelReader(
        dataset=dataset,
        vocab_path=pretrained.get_vocab_path(),
        max_seq_len=args.max_seq_len,
        sp_model_path=pretrained.get_spm_path(),
        word_dict_path=pretrained.get_word_dict_path())

    # Transfer-learning network: the token-level output feeds the tagger.
    token_features = module_outputs["sequence_output"]

    # Every tensor the module needs must be fed, in exactly this order.
    feed_keys = ("input_ids", "position_ids", "segment_ids", "input_mask")
    feed_names = [module_inputs[key].name for key in feed_keys]

    # Fine-tune strategy.
    finetune_strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)

    # Running configuration for the PaddleHub fine-tune API.
    run_config = hub.RunConfig(
        eval_interval=args.eval_step,
        save_ckpt_interval=args.model_save_step,
        use_data_parallel=args.use_data_parallel,
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=finetune_strategy)

    # Sequence-labeling task; with add_crf the network uses a CRF as decoder.
    seq_label_task = hub.SequenceLabelTask(
        data_reader=reader,
        feature=token_features,
        feed_list=feed_names,
        max_seq_len=args.max_seq_len,
        num_classes=dataset.num_labels,
        config=run_config,
        add_crf=args.add_crf)

    if args.do_train:
        # Training, evaluation, testing and model saving are all handled by
        # this single PaddleHub call.
        print("start finetune and eval process")
        seq_label_task.finetune_and_eval()

    if args.do_predict:
        print("start predict process")
        # Invert the reader's label -> id mapping so ids decode back to tags.
        id2label = {idx: tag for tag, idx in reader.label_map.items()}
        input_data = [[item] for item in predict_data]
        # The first entry is skipped - presumably a header row; confirm
        # against the code that builds predict_data.
        run_states = seq_label_task.predict(data=input_data[1:])

        results = []
        for state in run_states:
            outputs_of_batch = state.run_results
            flat_ids = outputs_of_batch[0].reshape([-1]).astype(
                np.int32).tolist()
            lengths = outputs_of_batch[1].reshape([-1]).astype(
                np.int32).tolist()
            offset = 0
            for seq_len in lengths:
                window = flat_ids[offset:offset + seq_len]
                # The first and last positions are dropped - presumably the
                # special start/end tokens; TODO confirm.
                results.append([id2label.get(i) for i in window[1:-1]])
                # With a CRF decoder the flat output advances by the real
                # length, otherwise by the fixed max_seq_len stride.
                offset += seq_len if args.add_crf else args.max_seq_len

        # Attach the labels to each sentence and serialize one JSON per line.
        lines = []
        for sentence, tags in zip(predict_sents, results):
            sentence["labels"] = tags
            lines.append(json.dumps(sentence, ensure_ascii=False))
        write_by_lines(
            "{}.{}.pred".format(args.predict_data, args.do_model), lines)
def get_task(args, schema_labels, id):
    """Build the PaddleHub sequence-labeling task and its data reader.

    Args:
        args: Options object; this function reads ``max_seq_len``,
            ``data_dir``, ``do_model``, ``warmup_proportion``,
            ``weight_decay``, ``learning_rate``, ``eval_step``,
            ``model_save_step``, ``use_data_parallel``, ``use_gpu``,
            ``num_epoch``, ``batch_size``, ``checkpoint_dir``, ``add_crf``
            and ``random_seed`` from it.
        schema_labels: Label schema handed to ``EEDataset``.
        id: Identifier forwarded unchanged to ``add_hook``.

    Returns:
        A ``(seq_label_task, reader)`` tuple.
    """
    # Load the PaddleHub pretrained model (ERNIE Tiny / RoBERTa large).
    # More pretrained models:
    # https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    # Alternative: model_name = "ernie_tiny"
    model_name = "chinese-roberta-wwm-ext-large"
    pretrained = hub.Module(name=model_name)
    module_inputs, module_outputs, _program = pretrained.context(
        trainable=True, max_seq_len=args.max_seq_len)

    # Load the data and read it through a SequenceLabelReader.
    dataset = EEDataset(args.data_dir, schema_labels, model=args.do_model)
    reader = SequenceLabelReader(
        dataset=dataset,
        vocab_path=pretrained.get_vocab_path(),
        max_seq_len=args.max_seq_len,
        sp_model_path=pretrained.get_spm_path(),
        word_dict_path=pretrained.get_word_dict_path())

    # Build the transfer network for sequence labeling: the model's
    # token-level output "sequence_output" is the input of that network.
    token_features = module_outputs["sequence_output"]
    # Optional dropout on the token-level output, currently disabled:
    # token_features = fluid.layers.dropout(
    #     x=token_features,
    #     dropout_prob=args.dropout,
    #     dropout_implementation="upscale_in_train")

    # Variables the model program needs as input; they must be listed in
    # exactly this order.
    feed_keys = ("input_ids", "position_ids", "segment_ids", "input_mask")
    feed_names = [module_inputs[key].name for key in feed_keys]

    # Optimization strategy.
    finetune_strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)

    # Run settings.
    run_config = hub.RunConfig(
        log_interval=100,
        eval_interval=args.eval_step,
        save_ckpt_interval=args.model_save_step,
        use_data_parallel=args.use_data_parallel,
        use_cuda=args.use_gpu,
        # enable_memory_optim=True,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=finetune_strategy)

    # Build the sequence-labeling transfer task.
    seq_label_task = hub.SequenceLabelTask(
        data_reader=reader,
        feature=token_features,
        feed_list=feed_names,
        max_seq_len=args.max_seq_len,
        num_classes=dataset.num_labels,
        config=run_config,
        add_crf=args.add_crf)

    # Seed the task's main program with the configured value, then register
    # the hooks for this task.
    seq_label_task.main_program.random_seed = args.random_seed
    add_hook(args, seq_label_task, id)

    return seq_label_task, reader