def do_predict():
    """Predict BIO tag sequences for ``args.predict_data`` and save them.

    Loads a fine-tuned ErnieForTokenClassification checkpoint from
    ``args.init_ckpt``, runs batched inference over the input sentences,
    and writes each sentence back as one JSON line with a ``"pred"`` field
    holding the per-token labels and their probabilities (the [CLS]/[SEP]
    positions are stripped).

    Raises:
        Exception: if ``args.init_ckpt`` is unset or not an existing file.
    """
    paddle.set_device(args.device)

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}
    model = ErnieForTokenClassification.from_pretrained(
        "ernie-1.0", num_classes=len(label_map))

    print("============start predict==========")
    if not args.init_ckpt or not os.path.isfile(args.init_ckpt):
        raise Exception("init checkpoints {} not exist".format(args.init_ckpt))
    else:
        state_dict = paddle.load(args.init_ckpt)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.init_ckpt)

    # load data from predict file
    sentences = read_by_lines(args.predict_data)  # origin data format
    sentences = [json.loads(sent) for sent in sentences]

    encoded_inputs_list = []
    for sent in sentences:
        # "\002" replaces spaces — presumably so they survive the char-level
        # split below as distinct tokens; TODO confirm against the tokenizer.
        sent = sent["text"].replace(" ", "\002")
        input_ids, token_type_ids, seq_len = convert_example_to_feature(
            [list(sent), []],
            tokenizer,
            max_seq_len=args.max_seq_len,
            is_test=True)
        encoded_inputs_list.append((input_ids, token_type_ids, seq_len))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'),  # token_type_ids
        Stack(dtype='int64')  # sequence lens
    ): fn(samples)

    # Separates data into some batches.
    batch_encoded_inputs = [
        encoded_inputs_list[i:i + args.batch_size]
        for i in range(0, len(encoded_inputs_list), args.batch_size)
    ]
    results = []
    model.eval()
    for batch in batch_encoded_inputs:
        input_ids, token_type_ids, seq_lens = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        logits = model(input_ids, token_type_ids)
        probs = F.softmax(logits, axis=-1)
        probs_ids = paddle.argmax(probs, -1).numpy()
        probs = probs.numpy()
        for p_list, p_ids, seq_len in zip(probs.tolist(), probs_ids.tolist(),
                                          seq_lens.tolist()):
            # p_ids[1:seq_len - 1] skips [CLS]/[SEP], so the label at loop
            # index `index` belongs to sequence position `index + 1`.
            # BUGFIX: probability row must be p_list[index + 1], not
            # p_list[index], or every reported prob is shifted by one token.
            prob_one = [
                p_list[index + 1][pid]
                for index, pid in enumerate(p_ids[1:seq_len - 1])
            ]
            label_one = [id2label[pid] for pid in p_ids[1:seq_len - 1]]
            results.append({"probs": prob_one, "labels": label_one})

    assert len(results) == len(sentences)
    for sent, ret in zip(sentences, results):
        sent["pred"] = ret
    sentences = [json.dumps(sent, ensure_ascii=False) for sent in sentences]
    write_by_lines(args.predict_save_path, sentences)
    print("save data {} to {}".format(len(sentences), args.predict_save_path))
def predict_data_process(trigger_file, role_file, schema_file, save_path):
    """Merge trigger and role predictions into submission-format events.

    Reads per-sentence trigger predictions, per-sentence role predictions
    and the event schema, keeps only arguments whose role is allowed by
    each predicted event type, and writes one JSON line per input
    sentence to ``save_path``.
    """
    trigger_datas = read_by_lines(trigger_file)
    role_data = read_by_lines(role_file)
    schema_datas = read_by_lines(schema_file)
    print("trigger predict {} load from {}".format(
        len(trigger_datas), trigger_file))
    print("role predict {} load from {}".format(len(role_data), role_file))
    print("schema {} load from {}".format(len(schema_datas), schema_file))

    # event_type -> list of role names permitted by the schema
    schema = {
        record["event_type"]: [r["role"] for r in record["role_list"]]
        for record in map(json.loads, schema_datas)
    }

    # sentence id -> {role_type: [argument text, ...]} from the role tagger
    sent_role_mapping = {}
    for line in role_data:
        record = json.loads(line)
        spans = extract_result(record["text"], record["pred"]["labels"])
        grouped = {}
        for span in spans:
            grouped.setdefault(span["type"], []).append("".join(span["text"]))
        sent_role_mapping[record["id"]] = grouped

    pred_ret = []
    for line in trigger_datas:
        record = json.loads(line)
        trigger_spans = extract_result(record["text"], record["pred"]["labels"])
        event_list = []
        # each distinct trigger type is one predicted event
        for event_type in list(set(span["type"] for span in trigger_spans)):
            allowed_roles = schema[event_type]
            arguments = [
                {"role": role_type, "argument": argument}
                for role_type, candidates in sent_role_mapping[record["id"]].items()
                if role_type in allowed_roles
                for argument in candidates
                if len(argument) != 1  # single-character arguments are dropped
            ]
            event_list.append({"event_type": event_type, "arguments": arguments})
        pred_ret.append({
            "id": record["id"],
            "text": record["text"],
            "event_list": event_list
        })

    pred_ret = [json.dumps(r, ensure_ascii=False) for r in pred_ret]
    print("submit data {} save to {}".format(len(pred_ret), save_path))
    write_by_lines(save_path, pred_ret)
tags = [] for index, label in enumerate(labels): tags.append("{}\t{}".format(index, label)) return tags if __name__ == "__main__": print("\n=================DUEE 1.0 DATASET==============") conf_dir = "./conf/DuEE1.0" schema_path = "{}/event_schema.json".format(conf_dir) tags_trigger_path = "{}/trigger_tag.dict".format(conf_dir) tags_role_path = "{}/role_tag.dict".format(conf_dir) print("\n=================start schema process==============") print('input path {}'.format(schema_path)) tags_trigger = schema_process(schema_path, "trigger") write_by_lines(tags_trigger_path, tags_trigger) print("save trigger tag {} at {}".format(len(tags_trigger), tags_trigger_path)) tags_role = schema_process(schema_path, "role") write_by_lines(tags_role_path, tags_role) print("save trigger tag {} at {}".format(len(tags_role), tags_role_path)) print("=================end schema process===============") # data process data_dir = "./data/DuEE1.0" trigger_save_dir = "{}/trigger".format(data_dir) role_save_dir = "{}/role".format(data_dir) print("\n=================start schema process==============") if not os.path.exists(trigger_save_dir): os.makedirs(trigger_save_dir) if not os.path.exists(role_save_dir):
def predict_data_process(trigger_file, role_file, schema_file, save_path):
    """Merge trigger and role predictions into submission-format events.

    Combines per-sentence trigger predictions (event types) with the role
    tagger's argument spans, keeps only arguments allowed by the schema
    for each predicted event type, and writes one JSON line per sentence
    to ``save_path``.
    """
    pred_ret = []
    trigger_datas = read_by_lines(trigger_file)
    role_data = read_by_lines(role_file)
    schema_datas = read_by_lines(schema_file)
    print("trigger predict {} load from {}".format(
        len(trigger_datas), trigger_file))
    print("role predict {} load from {}".format(len(role_data), role_file))
    print("schema {} load from {}".format(len(schema_datas), schema_file))
    schema = {}
    for s in schema_datas:
        d_json = json.loads(s)
        schema[d_json["event_type"]] = [
            r["role"] for r in d_json["role_list"]
        ]  # schema mapping: {event_type: [role1, role2, ...]}
    # process the role data
    sent_role_mapping = {}
    for d in role_data:
        d_json = json.loads(d)
        r_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        role_ret = {}
        for r in r_ret:  # each predicted argument span: {'start': , 'text': , 'type': }
            role_type = r["type"]  # the predicted role name
            if role_type not in role_ret:
                role_ret[role_type] = []
            role_ret[role_type].append(
                "".join(r["text"])
            )  # collect the realized words per role: {'role_type': [arg1, arg2, ...]}
        sent_role_mapping[d_json[
            "id"]] = role_ret  # this sentence's {id: {'role_type': [arg1, arg2, ...]}}
    for d in trigger_datas:
        d_json = json.loads(d)
        t_ret = extract_result(
            d_json["text"],
            d_json["pred"]["labels"])  # [{'start': , 'text': , 'type': }, ...]
        pred_event_types = list(set([
            t["type"] for t in t_ret
        ]))  # predicted trigger types are the event types; one sentence may
        # carry several events, which is why sequence labeling is used here
        event_list = []
        for event_type in pred_event_types:
            role_list = schema[event_type]
            arguments = []
            for role_type, ags in sent_role_mapping[d_json["id"]].items():
                if role_type not in role_list:
                    # skip arguments whose role should not appear in this
                    # event type; we only care about roles the schema allows
                    continue
                for arg in ags:
                    if len(arg) == 1:
                        continue
                    arguments.append({"role": role_type, "argument": arg})
            event = {"event_type": event_type, "arguments": arguments}
            event_list.append(event)
        pred_ret.append({
            "id": d_json["id"],
            "text": d_json["text"],
            "event_list": event_list
        })
    pred_ret = [
        json.dumps(r, ensure_ascii=False) for r in pred_ret
    ]  # the final event_list has no span index, only event_type and {role, argument}
    print("submit data {} save to {}".format(len(pred_ret), save_path))
    write_by_lines(save_path, pred_ret)
def do_predict():
    """Predict a class label for every sentence in ``args.predict_data``.

    Loads a fine-tuned ErnieForSequenceClassification checkpoint from
    ``args.init_ckpt``, runs batched inference, and writes each input
    record back as one JSON line with a ``"pred"`` field holding the
    predicted label and the per-label probabilities.

    Raises:
        Exception: if ``args.init_ckpt`` is unset or not an existing file.
    """
    set_seed(args.seed)
    device = args.device

    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}
    model = ErnieForSequenceClassification(num_classes=len(label_map))
    model.to(torch.device(device))
    tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0")

    print("============start predict==========")
    if not args.init_ckpt or not os.path.isfile(args.init_ckpt):
        raise Exception("init checkpoints {} not exist".format(args.init_ckpt))
    else:
        state_dict = torch.load(args.init_ckpt)
        model.load_state_dict(state_dict)
        print("Loaded parameters from %s" % args.init_ckpt)

    # load data from predict file
    sentences = read_by_lines(args.predict_data)  # origin data format
    sentences = [json.loads(sent) for sent in sentences]

    encoded_inputs_list = []
    for sent in sentences:
        text = sent["text"]
        input_sent = [text]  # only text_a
        # BUGFIX: test the JSON record for the optional "text_b" key. The old
        # code first rebound `sent` to the text string, so `"text_b" in sent`
        # was a substring test and `sent["text_b"]` would raise TypeError.
        if "text_b" in sent:
            input_sent = [[text, sent["text_b"]]]  # add text_b
        example = data_2_examples(input_sent)[0]
        input_ids, token_type_ids = convert_example(
            example, tokenizer, max_seq_len=args.max_seq_len, is_test=True)
        encoded_inputs_list.append((input_ids, token_type_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input_ids
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # token_type_ids
    ): fn(samples)

    # Separates data into some batches.
    batch_encoded_inputs = [
        encoded_inputs_list[i:i + args.batch_size]
        for i in range(0, len(encoded_inputs_list), args.batch_size)
    ]
    results = []
    model.eval()
    for batch in batch_encoded_inputs:
        input_ids, token_type_ids = batchify_fn(batch)
        input_ids = to_var(input_ids, device=device).long()
        token_type_ids = to_var(token_type_ids, device=device).long()
        logits = model(input_ids, token_type_ids)
        probs = torch.softmax(logits, dim=1)
        # move results to host before converting to numpy
        if device == "cuda":
            probs_ids = torch.argmax(probs, -1).detach().cpu().numpy()
            probs = probs.detach().cpu().numpy()
        else:
            probs_ids = torch.argmax(probs, -1).detach().numpy()
            probs = probs.detach().numpy()
        for prob_one, p_id in zip(probs.tolist(), probs_ids.tolist()):
            label_probs = {}
            for idx, p in enumerate(prob_one):
                label_probs[id2label[idx]] = p
            results.append({"probs": label_probs, "label": id2label[p_id]})

    assert len(results) == len(sentences)
    for sent, ret in zip(sentences, results):
        sent["pred"] = ret
    sentences = [json.dumps(sent, ensure_ascii=False) for sent in sentences]
    write_by_lines(args.predict_save_path, sentences)
    print("save data {} to {}".format(len(sentences), args.predict_save_path))
sentences = [json.dumps(s, ensure_ascii=False) for s in sentences] return sentences if __name__ == "__main__": # schema process print("\n=================DUEE FINANCE DATASET==============") conf_dir = "./conf/DuEE-Fin" schema_path = "{}/event_schema.json".format(conf_dir) tags_trigger_path = "{}/trigger_tag.dict".format(conf_dir) tags_role_path = "{}/role_tag.dict".format(conf_dir) tags_enum_path = "{}/enum_tag.dict".format(conf_dir) print("\n=================start schema process==============") print('input path {}'.format(schema_path)) tags_trigger = schema_process(schema_path, "trigger") write_by_lines(tags_trigger_path, tags_trigger) print("save trigger tag {} at {}".format(len(tags_trigger), tags_trigger_path)) tags_role = schema_process(schema_path, "role") write_by_lines(tags_role_path, tags_role) print("save trigger tag {} at {}".format(len(tags_role), tags_role_path)) tags_enum = schema_process(schema_path, "enum") write_by_lines(tags_enum_path, tags_enum) print("save enum enum tag {} at {}".format(len(tags_enum), tags_enum_path)) print("=================end schema process===============") # data process data_dir = "./data/DuEE-Fin" sentence_dir = "{}/sentence".format(data_dir) trigger_save_dir = "{}/trigger".format(data_dir) role_save_dir = "{}/role".format(data_dir)
def predict_data_process(trigger_file, role_file, enum_file, schema_file,
                         save_path):
    """Merge trigger, role and enum predictions into document-level events.

    Sentence-level predictions are keyed by ``"<id>\\t<sent_id>"``; events
    from every sentence of a document are concatenated, normalized via
    ``event_normalization`` and written to ``save_path`` as one JSON line
    per document.
    """
    trigger_data = read_by_lines(trigger_file)
    role_data = read_by_lines(role_file)
    enum_data = read_by_lines(enum_file)
    schema_data = read_by_lines(schema_file)
    print("trigger predict {} load from {}".format(
        len(trigger_data), trigger_file))
    print("role predict {} load from {}".format(len(role_data), role_file))
    print("enum predict {} load from {}".format(len(enum_data), enum_file))
    print("schema {} load from {}".format(len(schema_data), schema_file))

    # event_type -> list of role names permitted by the schema
    schema = {
        record["event_type"]: [r["role"] for r in record["role_list"]]
        for record in map(json.loads, schema_data)
    }

    # role predictions keyed by "<id>\t<sent_id>"
    sent_role_mapping = {}
    for line in role_data:
        record = json.loads(line)
        spans = extract_result(record["text"], record["pred"]["labels"])
        grouped = {}
        for span in spans:
            grouped.setdefault(span["type"], []).append("".join(span["text"]))
        key = "{}\t{}".format(record["id"], record["sent_id"])
        sent_role_mapping[key] = grouped

    # enum classifier output, same key scheme
    sent_enum_mapping = {
        "{}\t{}".format(record["id"], record["sent_id"]): record["pred"]["label"]
        for record in map(json.loads, enum_data)
    }

    # assemble one event per predicted trigger type, per sentence
    pred_ret = []
    for line in trigger_data:
        record = json.loads(line)
        trigger_spans = extract_result(record["text"], record["pred"]["labels"])
        key = "{}\t{}".format(record["id"], record["sent_id"])
        event_list = []
        for event_type in list(set(span["type"] for span in trigger_spans)):
            allowed_roles = schema[event_type]
            arguments = []
            for role_type, candidates in sent_role_mapping[key].items():
                if role_type not in allowed_roles:
                    continue
                arguments.extend(
                    {"role": role_type, "argument": candidate}
                    for candidate in candidates)
            # special case: the enum role comes from the classifier output,
            # not from the sequence tagger
            if event_type == enum_event_type:
                arguments.append({
                    "role": enum_role,
                    "argument": sent_enum_mapping[key]
                })
            event_list.append({
                "event_type": event_type,
                "arguments": arguments,
                "text": record["text"]
            })
        pred_ret.append({
            "id": record["id"],
            "sent_id": record["sent_id"],
            "text": record["text"],
            "event_list": event_list
        })

    # group sentence-level events under their document id
    doc_pred = {}
    for item in pred_ret:
        doc = doc_pred.setdefault(item["id"],
                                  {"id": item["id"], "event_list": []})
        doc["event_list"].extend(item["event_list"])

    # unify all prediction results and save them
    doc_pred = [
        json.dumps(event_normalization(r), ensure_ascii=False)
        for r in doc_pred.values()
    ]
    print("submit data {} save to {}".format(len(doc_pred), save_path))
    write_by_lines(save_path, doc_pred)