def predict_by_model_path(args, model_path, schema_labels, predict_data,
                          predict_sents, id):
    """Load trained sequence-labeling parameters and run prediction.

    Loads parameters from *model_path* into the task built by ``get_task`` and,
    when ``args.do_predict`` is set, decodes a label sequence per input and
    writes one JSON line per sentence to ``<output_predict_data_path>.<model>.<id>.pred``.

    Args:
        args: parsed hyperparameters / flags (uses do_predict, add_crf,
            max_seq_len, do_model).
        model_path: directory/file of the saved parameters to load.
        schema_labels: label set for the task.
        predict_data: raw prediction inputs, one element per example.
        predict_sents: sentence dicts to receive the decoded ``"labels"``.
        id: run identifier used in the output file name.
    """
    seq_label_task, reader = get_task(args, schema_labels, predict_data,
                                      predict_sents, id)
    seq_label_task.init_if_necessary()
    seq_label_task.load_parameters(model_path)
    logger.info("PaddleHub has loaded model from %s" % model_path)
    if args.do_predict:
        print("start predict process")
        # Invert the reader's label map so predicted ids decode to strings.
        id2label = {val: key for key, val in reader.label_map.items()}
        input_data = [[d] for d in predict_data]
        # NOTE(review): the first row is skipped (input_data[1:]) — presumably a
        # header line — yet results are zipped against predict_sents from index
        # 0 below; confirm the two stay aligned.
        run_states = seq_label_task.predict(data=input_data[1:])
        results = []
        for batch_states in run_states:
            batch_results = batch_states.run_results
            batch_infers = batch_results[0].reshape([-1]).astype(
                np.int32).tolist()
            seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
            current_id = 0
            for length in seq_lens:
                seq_infers = batch_infers[current_id:current_id + length]
                # Drop the first and last positions (special tokens) before
                # decoding the ids to label strings.
                seq_result = list(map(id2label.get, seq_infers[1:-1]))
                # Without CRF each sequence occupies max_seq_len slots in the
                # flattened prediction buffer.
                current_id += length if args.add_crf else args.max_seq_len
                results.append(seq_result)
        # (fix) the original initialized `ret = []` twice; one is enough.
        ret = []
        for sent, r_label in zip(predict_sents, results):
            sent["labels"] = r_label
            ret.append(json.dumps(sent, ensure_ascii=False))
        write_by_lines(
            "{}.{}.{}.pred".format(output_predict_data_path, args.do_model,
                                   id), ret)
def get_submit_postprocess(args, id):
    """Convert a ``.pred`` file into submission rows.

    Reads the JSON-lines prediction file for this run, extracts BIO-tagged
    entity spans, and writes tab-separated rows ``id<TAB>label<TAB>entity``
    to ``<output_path>/<id>ucas_valid_result.csv``.

    Args:
        args: parsed flags (uses do_model for the input file name).
        id: run identifier used in both the input and output file names.
    """
    results = read_by_lines("{}.{}.{}.pred".format(output_predict_data_path,
                                                   args.do_model, id))
    submit = []
    for line in results:
        json_result = json.loads(line)
        text = json_result['text']
        label = json_result["labels"]
        now_label = ''
        now_entity = ''
        for i, l in enumerate(label):
            if l == 'O':
                # An 'O' tag closes any span currently being accumulated.
                if now_label != '':
                    submit.append('\t'.join(
                        [str(json_result['id']), now_label, now_entity]))
                    now_label = ''
                    now_entity = ''
            elif l.startswith('B-'):
                # 'B-x' opens a new span of type x at this character.
                now_label = l[2:]
                now_entity = text[i]
            else:
                # Any other tag (e.g. 'I-x') extends the current span.
                now_entity += text[i]
        # NOTE(review): a span still open when the label sequence ends is
        # silently dropped; confirm sequences always terminate with 'O'.
        # (fix) removed the dead `count` accumulator — it was only read by
        # commented-out code.
    write_by_lines("{}/{}ucas_valid_result.csv".format(output_path, id),
                   submit)
def one_autofinetune(args, schema_labels, predict_data, predict_sents, id):
    """Run one AutoDL-Finetune trial: optionally fine-tune and evaluate,
    optionally predict, archive the best checkpoint, and report the trial's
    best score back to PaddleHub's AutoDL controller.

    Args:
        args: parsed hyperparameters / flags (do_train, do_predict, do_model,
            add_crf, max_seq_len, predict_data, checkpoint_dir,
            saved_params_dir).
        schema_labels: label set for the sequence-labeling task.
        predict_data: raw prediction inputs, one element per example.
        predict_sents: sentence dicts to receive the decoded ``"labels"``.
        id: trial identifier used in log and output file names.
    """
    seq_label_task, reader = get_task(args, schema_labels, id)
    # Load a PaddleHub pretrained model, e.g. ERNIE Tiny / RoBERTa large.
    # More pretrained models:
    # https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    # model_name = "ernie_tiny"
    # PaddleHub Finetune API: trains, evaluates and saves the model automatically.
    if args.do_train:
        print("start finetune and eval process")
        seq_label_task.finetune_and_eval()
        write_log('./work/log/' + args.do_model + '.txt', args,
                  str(seq_label_task.best_score))
    if args.do_predict:
        print("start predict process")
        ret = []
        # Invert the reader's label map so predicted ids decode to strings.
        id2label = {val: key for key, val in reader.label_map.items()}
        input_data = [[d] for d in predict_data]
        # NOTE(review): the first row is skipped (input_data[1:]) — presumably
        # a header line — yet results are zipped against predict_sents from
        # index 0 below; confirm the two stay aligned.
        run_states = seq_label_task.predict(data=input_data[1:])
        results = []
        for batch_states in run_states:
            batch_results = batch_states.run_results
            batch_infers = batch_results[0].reshape([-1]).astype(
                np.int32).tolist()
            seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
            current_id = 0
            for length in seq_lens:
                seq_infers = batch_infers[current_id:current_id + length]
                # Drop the first and last positions (special tokens) before
                # decoding.
                seq_result = list(map(id2label.get, seq_infers[1:-1]))
                # Without CRF each sequence occupies max_seq_len slots in the
                # flattened prediction buffer.
                current_id += length if args.add_crf else args.max_seq_len
                results.append(seq_result)
        ret = []
        for sent, r_label in zip(predict_sents, results):
            sent["labels"] = r_label
            ret.append(json.dumps(sent, ensure_ascii=False))
        write_by_lines(
            "{}.{}.{}.pred".format(args.predict_data, args.do_model, id), ret)
    # Load model from the defined model path or not
    # # seq_label_task.finetune_and_eval()
    # run_states = seq_label_task.eval()
    # eval_avg_score, eval_avg_loss, eval_run_speed =seq_label_task._calculate_metrics(
    #     run_states)
    # Move ckpt/best_model to the defined saved parameters directory.
    best_model_dir = os.path.join(args.checkpoint_dir, "best_model")
    if is_path_valid(args.saved_params_dir) and os.path.exists(best_model_dir):
        shutil.copytree(best_model_dir, args.saved_params_dir)
        # Remove the whole checkpoint dir once the best model is archived.
        shutil.rmtree(args.checkpoint_dir)
    write_log('./work/log/' + args.do_model + '.txt', args,
              id + ',' + str(seq_label_task.best_score))
    print(seq_label_task.best_score)
    # Report this trial's score to the AutoDL-Finetune hyperparameter search.
    hub.report_final_result(seq_label_task.best_score)
def process_data(args):
    """Prepare prediction inputs and the label schema.

    Dumps the raw prediction lines to ``<data_dir>/predict.txt`` and loads the
    entity label set from ``<data_dir>/entity2id.txt``.

    Returns:
        Tuple of (schema_labels, predict_data, predict_sents).
    """
    predict_data, predict_sents = get_predict()
    labels = read_label('{}/entity2id.txt'.format(args.data_dir))
    write_by_lines("{}/predict.txt".format(args.data_dir), predict_data)
    return labels, predict_data, predict_sents
def one(args, schema_labels, predict_data, predict_sents, id): seq_label_task, reader = get_task(args, schema_labels, id) # 加载PaddleHub 预训练模型ERNIE Tiny/RoBERTa large # 更多预训练模型 https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel # model_name = "ernie_tiny" # PaddleHub Finetune API # 将自动训练、评测并保存模型 if args.do_train: print("start finetune and eval process") seq_label_task.finetune_and_eval() write_log('./work/log/' + args.do_model + '.txt', args, id + ',' + str(seq_label_task.best_score)) if args.do_predict: print("start predict process") ret = [] id2label = {val: key for key, val in reader.label_map.items()} input_data = [[d] for d in predict_data] # print(input_data[:10]) run_states = seq_label_task.predict(data=input_data) results = [] for batch_states in run_states: batch_results = batch_states.run_results # print('batch_infers',batch_results ) batch_infers = batch_results[0].reshape([-1]).astype( np.int32).tolist() seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist() current_id = 0 for length in seq_lens: seq_infers = batch_infers[current_id:current_id + length] seq_result = list(map(id2label.get, seq_infers[1:-1])) current_id += length if args.add_crf else args.max_seq_len results.append(seq_result) ret = [] for sent, input, r_label in zip(predict_sents, input_data, results): sent["input"] = input sent["labels"] = r_label ret.append(json.dumps(sent, ensure_ascii=False)) write_by_lines( "{}.{}.{}.pred".format(output_predict_data_path, args.do_model, id), ret) get_submit_postprocess(args, id) get_submit_postprocess(args, id, check=True)
def get_check_postprocess(args, id):
    """Convert a ``.pred`` file into debug-oriented submission rows.

    Extracts BIO spans from each prediction record and writes tab-separated
    rows (id, label, entity text, input slice, full input, text, labels) to
    ``<output_path>/<id>ucas_valid_result.csv`` for manual inspection.
    """
    import json
    pred_path = "{}.{}.{}.pred".format(output_predict_data_path,
                                       args.do_model, id)
    results = read_by_lines(pred_path)
    print(results[0])
    submit = []
    count = 0
    for line in results:
        record = json.loads(line)
        text = record['text']
        labels = record["labels"]
        cur_label = ''
        ent_start = -1
        count = 0
        for idx, tag in enumerate(labels):
            if tag == 'O':
                # 'O' closes any span currently open; emit a debug row.
                if cur_label != '':
                    count += 1
                    row = [
                        str(record['id']),
                        cur_label,
                        text[ent_start:idx],
                        str(record['input'][0][ent_start * 2:idx * 2]),
                        str(record['input'][0]),
                        text,
                        str(labels),
                    ]
                    submit.append('\t'.join(row))
                    cur_label = ''
            elif tag.startswith('B-'):
                # 'B-x' starts a new span of type x here.
                cur_label = tag[2:]
                ent_start = idx
            elif cur_label != '':
                # Inside a span: keep the label in sync with the current tag.
                cur_label = labels[idx][2:]
    write_by_lines("{}/{}ucas_valid_result.csv".format(output_path, id),
                   submit)
def process_data(args, i=4):
    """Build train/dev/predict data plus the label schema for the chosen model.

    Dispatches on ``args.do_model`` ('mcls', 'role', 'mrc_relation', or the
    sentence-only multi-label fallback) to load the fold-*i* train/dev split,
    the prediction inputs, and the matching label set.

    Returns:
        Tuple of (schema_labels, predict_data, predict_sents).
    """
    model = args.do_model
    if model == 'mcls':
        train1, dev1 = get_mcls_train_dev(args, i)
        predict_data, predict_sents = get_mcls_predict(args)
        # The first entry of event2id.txt is skipped.
        schema_labels = read_label('{}/event2id.txt'.format(args.data_dir))[1:]
    elif model == 'role':
        train1, dev1 = get_train_dev(args, i)
        predict_data, predict_sents = get_predict(args)
        write_by_lines("{}/train.txt".format(args.data_dir), train1)
        write_by_lines("{}/dev.txt".format(args.data_dir), dev1)
        write_by_lines("{}/predict.txt".format(args.data_dir), predict_data)
        if args.change_event == 'BIO_event':
            schema_labels = read_label('{}/entity2id.txt'.format(
                args.data_dir))
        elif args.change_event == 'no':
            schema_labels = read_label('{}/event2id.txt'.format(
                args.data_dir))
        else:
            schema_labels = ['O', 'B', 'I']
    elif model == 'mrc_relation':
        train1, dev1 = get_mrc_relation_train_dev(args, i)
        predict_data, predict_sents = get_mrc_relation_predict(args)
        schema_labels = ['0', '1']
    else:
        # Fallback branch: 'mcls_onlysentence'.
        train1, dev1 = get_mcls_onlysentence_train_dev(args, i)
        predict_data, predict_sents = get_mcls_onlysentence_predict(args)
        schema_labels = read_label('{}/event2id.txt'.format(args.data_dir))[1:]
    return schema_labels, predict_data, predict_sents
def get_submit_postprocess(args, id, check=False, mcls=False):
    """Convert a ``.pred`` file into submission rows in one of three formats.

    Reads the JSON-lines prediction file for this run, extracts labeled spans,
    and writes tab-separated rows to a CSV under ``output_path``:

    * default: ``id, label, entity-text``
    * ``check=True``: adds the raw input slice, full input, text and labels
      for manual inspection, written to ``..._check.csv``
    * ``mcls=True``: ``id, text, entity-text, label``

    Args:
        args: parsed flags (uses do_model, change_event, add_rule).
        id: run identifier used in input/output file names.
        check: emit the extended debug row format to the check CSV.
        mcls: emit the multi-label classification row format.
    """
    results = read_by_lines("{}.{}.{}.pred".format(output_predict_data_path,
                                                   args.do_model, id))
    submit = []
    count = 0
    # print(results)
    for j in range(len(results)):
        json_result = json.loads(results[j])
        text = json_result['text']
        label = json_result["labels"]
        now_label = ''
        now_entity = -1  # start index of the currently open span
        count = 0
        # print(len(text),len(label))
        for i, l in enumerate(label):
            # print(l,text[i])
            if (l == 'O' or l == '<NA>'):
                # An 'O'/'<NA>' tag closes any open span [now_entity, i).
                if (now_label != ''):
                    count += 1
                    if (check):
                        submit.append('\t'.join([
                            str(json_result['id']), now_label,
                            text[now_entity:i],
                            # NOTE(review): the *2 stride presumably maps char
                            # positions into the tokenized input string —
                            # confirm against how 'input' is built upstream.
                            str(json_result['input'][0][now_entity * 2:i * 2]),
                            str(json_result['input'][0]), text,
                            str(label)
                        ]))
                    elif (mcls):
                        submit.append('\t'.join([
                            str(json_result['id']), text, text[now_entity:i],
                            now_label
                        ]))
                    else:
                        submit.append('\t'.join([
                            str(json_result['id']), now_label,
                            text[now_entity:i]
                        ]))
                    now_label = ''
            else:
                # print(l,text[i])
                if (now_label == '' and l != '<NA>'):
                    # Open a new span with the raw tag as its label.
                    now_label = l
                    now_entity = i
                elif (l.startswith('B')):
                    # A 'B…' tag restarts the span; under 'BIO_event' the
                    # label is the tag with its 'B-' prefix stripped.
                    if (args.change_event == 'BIO_event'):
                        now_label = l[2:]
                    else:
                        now_label = l
                    now_entity = i
                    # print()
                elif (args.add_rule and now_label == ''):
                    # NOTE(review): this branch appears unreachable — the
                    # first branch already handles now_label == '' for every
                    # non-'<NA>' tag, and '<NA>' tags take the 'O' path above.
                    # print(args.change_event)
                    if (args.change_event == 'BIO_event'
                            and label[i][2:] == label[now_entity][2:]):
                        # Merge with the previous span: reopen its label and
                        # drop the row that was already emitted for it.
                        now_label = label[i][2:]
                        submit.pop(-1)
        # if(count==0):
        #     submit.append('\t'.join([str(json_result['id']),'','',text,str(label)]))
    # print(submit)
    if (check):
        write_by_lines(
            "{}/{}ucas_valid_result_check.csv".format(output_path, id),
            submit)
    else:
        write_by_lines("{}/{}ucas_valid_result.csv".format(output_path, id),
                       submit)
def one(args, schema_labels, predict_data, predict_sents, id):
    """Run one train/predict pass, dispatching on the model type.

    When ``args.do_train`` is set, fine-tunes/evaluates and logs the best
    score. For ``do_model == 'role'`` prediction, decodes per-token label
    sequences and writes the ``.pred`` file plus submission CSVs. For the
    multi-label classification models ('mcls', 'mcls_onlysentence'),
    collects all labels scored 1 per sentence and writes the
    ``.40-55``-suffixed prediction and submission files.

    Args:
        args: parsed hyperparameters / flags (do_train, do_predict, do_model,
            add_crf, max_seq_len).
        schema_labels: label set for the task.
        predict_data: raw prediction inputs, one element per example.
        predict_sents: sentence dicts to receive predicted labels.
        id: fold/run identifier used in log and output file names.
    """
    seq_label_task, reader = get_task(args, schema_labels, id)
    # Load a PaddleHub pretrained model, e.g. ERNIE Tiny / RoBERTa large.
    # More pretrained models:
    # https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    # model_name = "ernie_tiny"
    # PaddleHub Finetune API: trains, evaluates and saves the model automatically.
    if args.do_train:
        print("start finetune and eval process")
        seq_label_task.finetune_and_eval()
        write_log('./work/log/' + args.do_model + '.txt', args,
                  id + ',' + str(seq_label_task.best_score))
    if args.do_model == 'role' and args.do_predict:
        print("start predict process")
        ret = []
        # Invert the reader's label map so predicted ids decode to strings.
        id2label = {val: key for key, val in reader.label_map.items()}
        input_data = [[d] for d in predict_data]
        # print(input_data[:10])
        run_states = seq_label_task.predict(data=input_data)
        results = []
        for batch_states in run_states:
            batch_results = batch_states.run_results
            batch_infers = batch_results[0].reshape([-1]).astype(
                np.int32).tolist()
            # print(batch_results)
            seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
            current_id = 0
            for length in seq_lens:
                seq_infers = batch_infers[current_id:current_id + length]
                # Drop the first and last positions (special tokens) before
                # decoding.
                seq_result = list(map(id2label.get, seq_infers[1:-1]))
                # Without CRF each sequence occupies max_seq_len slots in the
                # flattened prediction buffer.
                current_id += length if args.add_crf else args.max_seq_len
                results.append(seq_result)
        ret = []
        for sent, input, r_label in zip(predict_sents, input_data, results):
            sent["input"] = input
            sent["labels"] = r_label
            ret.append(json.dumps(sent, ensure_ascii=False))
        write_by_lines(
            "{}.{}.{}.pred".format(output_predict_data_path, args.do_model,
                                   id), ret)
        # Emit both the plain submission CSV and the check/debug variant.
        get_submit_postprocess(args, id)
        get_submit_postprocess(args, id, check=True)
    if args.do_model in ['mcls', "mcls_onlysentence"] and args.do_predict:
        input_data = predict_data
        result = seq_label_task.predict(data=input_data, return_result=True)
        ret = []
        submit = []
        for s, r in zip(predict_sents, result):
            s['labels'] = []
            # print(r)
            # r is a list of {label: score} dicts; keep every label scored 1.
            for r0 in r:
                # print(r0)
                for k, v in r0.items():
                    # print(k,v)
                    if (v == 1):
                        s['labels'].append(k)
                        if (args.do_model == 'mcls_onlysentence'):
                            submit.append('\t'.join([str(s["id"]), k]))
                        else:
                            # The plain 'mcls' row also carries the entity.
                            submit.append('\t'.join(
                                [str(s["id"]), k, s["entity"]]))
            ret.append(json.dumps(s, ensure_ascii=False))
        write_by_lines(
            "{}.{}.{}.40-55.pred".format(output_predict_data_path,
                                         args.do_model, id), ret)
        write_by_lines(
            "{}{}.{}.40-55.ucas_valid_result.csv".format(
                output_path, args.do_model, id), submit)