def predict_data_process(trigger_file, role_file, schema_file, save_path):
    """predict_data_process"""
    pred_ret = []
    trigger_datas = read_by_lines(trigger_file)
    role_data = read_by_lines(role_file)
    schema_datas = read_by_lines(schema_file)
    print("trigger predict {} load from {}".format(
        len(trigger_datas), trigger_file))
    print("role predict {} load from {}".format(len(role_data), role_file))
    print("schema {} load from {}".format(len(schema_datas), schema_file))

    schema = {}
    for s in schema_datas:
        d_json = json.loads(s)
        schema[d_json["event_type"]] = [r["role"] for r in d_json["role_list"]]

    # process the role data
    sent_role_mapping = {}
    for d in role_data:
        d_json = json.loads(d)
        r_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        role_ret = {}
        for r in r_ret:
            role_type = r["type"]
            if role_type not in role_ret:
                role_ret[role_type] = []
            role_ret[role_type].append("".join(r["text"]))
        sent_role_mapping[d_json["id"]] = role_ret

    for d in trigger_datas:
        d_json = json.loads(d)
        t_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        pred_event_types = list(set([t["type"] for t in t_ret]))
        event_list = []
        for event_type in pred_event_types:
            role_list = schema[event_type]
            arguments = []
            for role_type, ags in sent_role_mapping[d_json["id"]].items():
                if role_type not in role_list:
                    continue
                for arg in ags:
                    if len(arg) == 1:
                        continue
                    arguments.append({"role": role_type, "argument": arg})
            event = {"event_type": event_type, "arguments": arguments}
            event_list.append(event)
        pred_ret.append({
            "id": d_json["id"],
            "text": d_json["text"],
            "event_list": event_list
        })
    pred_ret = [json.dumps(r, ensure_ascii=False) for r in pred_ret]
    print("submit data {} save to {}".format(len(pred_ret), save_path))
    write_by_lines(save_path, pred_ret)
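# The post-processing functions in this file consume the output of a helper
# named extract_result, which is defined elsewhere in the project. A minimal
# sketch of the assumed behavior is given below: decode a character-level BIO
# label sequence into spans of the form {"start": ..., "text": [chars], "type": ...}.
# This is only an illustration of the expected interface, not the project's
# actual implementation.
def extract_result_sketch(text, labels):
    """Hypothetical BIO decoder matching the span format used above."""
    spans, current = [], None
    for idx, (char, label) in enumerate(zip(text, labels)):
        if label.startswith("B-"):
            current = {"start": idx, "text": [char], "type": label[2:]}
            spans.append(current)
        elif label.startswith("I-") and current is not None \
                and label[2:] == current["type"]:
            current["text"].append(char)
        else:
            current = None
    return spans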
def schema_process(path, model="trigger"):
    """schema_process"""

    def label_add(labels, _type):
        """label_add"""
        if "B-{}".format(_type) not in labels:
            labels.extend(["B-{}".format(_type), "I-{}".format(_type)])
        return labels

    labels = []
    for line in read_by_lines(path):
        d_json = json.loads(line.strip())
        if model == "trigger":
            labels = label_add(labels, d_json["event_type"])
        elif model == "role":
            for role in d_json["role_list"]:
                if role["role"] == enum_role:
                    continue
                labels = label_add(labels, role["role"])
        elif model == "enum":
            for role in d_json["role_list"]:
                if role["role"] == enum_role:
                    labels = role["enum_items"]

    labels.append("O")
    tags = []
    for index, label in enumerate(labels):
        tags.append("{}\t{}".format(index, label))
    if model == "enum":
        tags = tags[:-1]
    return tags
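# Hedged usage sketch for schema_process: build the tag dictionaries for the
# trigger, role and enum taggers from a single schema file. The file paths used
# here are assumptions for illustration and are not fixed by the code above.
def build_tag_dicts_example(schema_path="./conf/event_schema.json",
                            out_dir="./conf"):
    """Hypothetical driver showing how schema_process might be invoked."""
    for model_name in ("trigger", "role", "enum"):
        tags = schema_process(schema_path, model=model_name)
        write_by_lines("{}/{}_tag.dict".format(out_dir, model_name), tags)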
def do_predict():
    paddle.set_device(args.device)

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}
    model = ErnieForTokenClassification.from_pretrained(
        "ernie-1.0", num_classes=len(label_map))

    no_entity_label = "O"
    ignore_label = len(label_map)

    print("============start predict==========")
    if not args.init_ckpt or not os.path.isfile(args.init_ckpt):
        raise Exception("init checkpoints {} not exist".format(args.init_ckpt))
    else:
        state_dict = paddle.load(args.init_ckpt)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.init_ckpt)

    # load data from predict file
    sentences = read_by_lines(args.predict_data)  # origin data format
    sentences = [json.loads(sent) for sent in sentences]

    encoded_inputs_list = []
    for sent in sentences:
        sent = sent["text"].replace(" ", "\002")
        input_ids, token_type_ids, seq_len = convert_example_to_feature(
            [list(sent), []], tokenizer, max_seq_len=args.max_seq_len,
            is_test=True)
        encoded_inputs_list.append((input_ids, token_type_ids, seq_len))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'),  # token_type_ids
        Stack(dtype='int64')  # sequence lens
    ): fn(samples)
    # Separates data into some batches.
    batch_encoded_inputs = [
        encoded_inputs_list[i:i + args.batch_size]
        for i in range(0, len(encoded_inputs_list), args.batch_size)
    ]

    results = []
    model.eval()
    for batch in batch_encoded_inputs:
        input_ids, token_type_ids, seq_lens = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        logits = model(input_ids, token_type_ids)
        probs = F.softmax(logits, axis=-1)
        probs_ids = paddle.argmax(probs, -1).numpy()
        probs = probs.numpy()
        for p_list, p_ids, seq_len in zip(probs.tolist(), probs_ids.tolist(),
                                          seq_lens.tolist()):
            # strip [CLS]/[SEP]; take each probability from the same position
            # as the label it belongs to
            prob_one = [
                p_list[index + 1][pid]
                for index, pid in enumerate(p_ids[1:seq_len - 1])
            ]
            label_one = [id2label[pid] for pid in p_ids[1:seq_len - 1]]
            results.append({"probs": prob_one, "labels": label_one})

    assert len(results) == len(sentences)
    for sent, ret in zip(sentences, results):
        sent["pred"] = ret
    sentences = [json.dumps(sent, ensure_ascii=False) for sent in sentences]
    write_by_lines(args.predict_save_path, sentences)
    print("save data {} to {}".format(len(sentences), args.predict_save_path))
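# Hedged note on the output written by do_predict above: each saved line is the
# original example with an added "pred" field holding per-token probabilities
# and BIO labels, which predict_data_process later decodes. The values below
# are made up solely to show the shape of the record.
EXAMPLE_SEQUENCE_PRED_LINE = {
    "id": "example-id",
    "text": "某公司宣布上市",
    "pred": {
        "probs": [0.99, 0.98, 0.99, 0.97, 0.98, 0.99, 0.99],
        "labels": ["O", "O", "O", "O", "O", "B-公司上市", "I-公司上市"]
    }
}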
def docs_data_process(path):
    """docs_data_process"""
    lines = read_by_lines(path)
    sentences = []
    for line in lines:
        d_json = json.loads(line)
        sentences.extend(marked_doc_2_sentence(d_json))
    sentences = [json.dumps(s, ensure_ascii=False) for s in sentences]
    return sentences
def data_process(path, model="trigger", is_predict=False):
    """data_process"""

    def label_data(data, start, l, _type):
        """label_data"""
        for i in range(start, start + l):
            suffix = "B-" if i == start else "I-"
            data[i] = "{}{}".format(suffix, _type)
        return data

    sentences = []
    output = ["text_a"] if is_predict else ["text_a\tlabel"]
    for line in read_by_lines(path):
        d_json = json.loads(line)
        _id = d_json["id"]
        text_a = [
            "," if t == " " or t == "\n" or t == "\t" else t
            for t in list(d_json["text"].lower())
        ]
        if is_predict:
            sentences.append({"text": d_json["text"], "id": _id})
            output.append('\002'.join(text_a))
        else:
            if model == u"trigger":
                labels = ["O"] * len(text_a)
                if len(d_json.get("event_list", [])) == 0:
                    continue
                for event in d_json.get("event_list", []):
                    event_type = event["event_type"]
                    start = event["trigger_start_index"]
                    trigger = event["trigger"]
                    labels = label_data(labels, start, len(trigger),
                                        event_type)
                output.append("{}\t{}".format('\002'.join(text_a),
                                              '\002'.join(labels)))
            elif model == u"role":
                for event in d_json.get("event_list", []):
                    labels = ["O"] * len(text_a)
                    for arg in event["arguments"]:
                        role_type = arg["role"]
                        if role_type == enum_role:
                            continue
                        argument = arg["argument"]
                        start = arg["argument_start_index"]
                        labels = label_data(labels, start, len(argument),
                                            role_type)
                    output.append("{}\t{}".format('\002'.join(text_a),
                                                  '\002'.join(labels)))
    return output
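# Hedged illustration of the row format produced by data_process in trigger
# mode: the characters of the sentence and their BIO labels are joined with
# "\002" and separated by a tab. The sample sentence and event type below are
# invented for illustration only.
def _trigger_row_example():
    text_a = list("某公司宣布上市")
    labels = ["O"] * len(text_a)
    # Mark the trigger "上市" (characters 5-6) with a hypothetical event type.
    for i in range(5, 7):
        labels[i] = ("B-" if i == 5 else "I-") + "公司上市"
    return "{}\t{}".format("\002".join(text_a), "\002".join(labels))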
def __init__(self,
             label_map_config=None,
             max_seq_len=512,
             do_lower_case=True,
             in_tokens=False,
             is_inference=False,
             random_seed=None,
             tokenizer="FullTokenizer",
             is_classify=True,
             is_regression=False,
             for_cn=True,
             task_id=0):
    self.tokenizer = BertTokenizer.from_pretrained(config.model_name)
    self.max_seq_len = max_seq_len
    labels_map = {}  # label
    for line in utils.read_by_lines(config.trigger_label_map):
        arr = line.split("\t")
        labels_map[arr[0]] = int(arr[1])
    # self.tokenizer = tokenization.FullTokenizer(
    #     vocab_file=vocab_path, do_lower_case=do_lower_case)
    # self.vocab = self.tokenizer.vocab
    # self.pad_id = self.vocab["[PAD]"]
    # self.cls_id = self.vocab["[CLS]"]
    # self.sep_id = self.vocab["[SEP]"]
    # self.in_tokens = in_tokens
    # self.is_inference = is_inference
    # self.for_cn = for_cn
    # self.task_id = task_id
    # np.random.seed(random_seed)
    # self.is_classify = is_classify
    # self.is_regression = is_regression
    # self.current_example = 0
    # self.current_epoch = 0
    # self.num_examples = 0
    self.label_map = labels_map
def enum_data_process(path, is_predict=False):
    """enum_data_process"""
    output = ["text_a"] if is_predict else ["label\ttext_a"]
    for line in read_by_lines(path):
        d_json = json.loads(line)
        text = d_json["text"].lower().replace("\t", " ")
        if is_predict:
            output.append(text)
            continue
        if len(d_json.get("event_list", [])) == 0:
            continue
        label = None
        for event in d_json["event_list"]:
            if event["event_type"] != "公司上市":
                continue
            for argument in event["arguments"]:
                role_type = argument["role"]
                if role_type == enum_role:
                    label = argument["argument"]
        if label:
            output.append("{}\t{}".format(label, text))
    return output
def predict_data_process(trigger_file, role_file, schema_file, save_path):
    """predict_data_process"""
    pred_ret = []
    trigger_datas = read_by_lines(trigger_file)
    role_data = read_by_lines(role_file)
    schema_datas = read_by_lines(schema_file)
    print("trigger predict {} load from {}".format(
        len(trigger_datas), trigger_file))
    print("role predict {} load from {}".format(len(role_data), role_file))
    print("schema {} load from {}".format(len(schema_datas), schema_file))

    schema = {}
    for s in schema_datas:
        d_json = json.loads(s)
        # dict keyed by event_type: {event_type: [role1, role2, ...]}
        schema[d_json["event_type"]] = [r["role"] for r in d_json["role_list"]]

    # process the role data
    sent_role_mapping = {}
    for d in role_data:
        d_json = json.loads(d)
        r_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        role_ret = {}
        # each predicted argument span is a dict {'start': , 'text': , 'type': }
        for r in r_ret:
            role_type = r["type"]  # the predicted role
            if role_type not in role_ret:
                role_ret[role_type] = []
            # collect the argument strings per role:
            # {'role_type': [role_arg1, role_arg2, ...]}
            role_ret[role_type].append("".join(r["text"]))
        # mapping for the current sentence:
        # {id: {'role_type': [role_arg1, role_arg2, ...]}}
        sent_role_mapping[d_json["id"]] = role_ret

    for d in trigger_datas:
        d_json = json.loads(d)
        # [{'start': , 'text': , 'type': }, ...]
        t_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        # the predicted trigger types are the event types; one sentence may
        # contain several events, which is why sequence labeling is used
        pred_event_types = list(set([t["type"] for t in t_ret]))
        event_list = []
        for event_type in pred_event_types:
            role_list = schema[event_type]
            arguments = []
            for role_type, ags in sent_role_mapping[d_json["id"]].items():
                # drop roles that should not appear for this event type; only
                # arguments whose role belongs to the event's schema are kept
                if role_type not in role_list:
                    continue
                for arg in ags:
                    if len(arg) == 1:
                        continue
                    arguments.append({"role": role_type, "argument": arg})
            event = {"event_type": event_type, "arguments": arguments}
            event_list.append(event)
        pred_ret.append({
            "id": d_json["id"],
            "text": d_json["text"],
            "event_list": event_list
        })
    # the final event_list carries no index, only event_type and {role, argument}
    pred_ret = [json.dumps(r, ensure_ascii=False) for r in pred_ret]
    print("submit data {} save to {}".format(len(pred_ret), save_path))
    write_by_lines(save_path, pred_ret)
def do_predict():
    set_seed(args.seed)
    device = args.device

    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}
    model = ErnieForSequenceClassification(num_classes=len(label_map))
    model.to(torch.device(device))
    tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0")

    print("============start predict==========")
    if not args.init_ckpt or not os.path.isfile(args.init_ckpt):
        raise Exception("init checkpoints {} not exist".format(args.init_ckpt))
    else:
        state_dict = torch.load(args.init_ckpt)
        model.load_state_dict(state_dict)
        print("Loaded parameters from %s" % args.init_ckpt)

    # load data from predict file
    sentences = read_by_lines(args.predict_data)  # origin data format
    sentences = [json.loads(sent) for sent in sentences]

    encoded_inputs_list = []
    for sent in sentences:
        text = sent["text"]
        input_sent = [text]  # only text_a
        if "text_b" in sent:
            input_sent = [[text, sent["text_b"]]]  # add text_b
        example = data_2_examples(input_sent)[0]
        input_ids, token_type_ids = convert_example(
            example, tokenizer, max_seq_len=args.max_seq_len, is_test=True)
        encoded_inputs_list.append((input_ids, token_type_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),
    ): fn(samples)
    # Separates data into some batches.
    batch_encoded_inputs = [
        encoded_inputs_list[i:i + args.batch_size]
        for i in range(0, len(encoded_inputs_list), args.batch_size)
    ]

    results = []
    model.eval()
    for batch in batch_encoded_inputs:
        input_ids, token_type_ids = batchify_fn(batch)
        input_ids = to_var(input_ids, device=device).long()
        token_type_ids = to_var(token_type_ids, device=device).long()
        logits = model(input_ids, token_type_ids)
        probs = torch.softmax(logits, dim=1)
        if device == "cuda":
            probs_ids = torch.argmax(probs, -1).detach().cpu().numpy()
            probs = probs.detach().cpu().numpy()
        else:
            probs_ids = torch.argmax(probs, -1).detach().numpy()
            probs = probs.detach().numpy()
        for prob_one, p_id in zip(probs.tolist(), probs_ids.tolist()):
            label_probs = {}
            for idx, p in enumerate(prob_one):
                label_probs[id2label[idx]] = p
            results.append({"probs": label_probs, "label": id2label[p_id]})

    assert len(results) == len(sentences)
    for sent, ret in zip(sentences, results):
        sent["pred"] = ret
    sentences = [json.dumps(sent, ensure_ascii=False) for sent in sentences]
    write_by_lines(args.predict_save_path, sentences)
    print("save data {} to {}".format(len(sentences), args.predict_save_path))
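# Hedged note on the output written by the classifier above: each saved line
# gains a "pred" field with a probability per enum label and the argmax label.
# The label names and probabilities below are placeholders, not real schema values.
EXAMPLE_ENUM_PRED = {"probs": {"label_a": 0.12, "label_b": 0.88},
                     "label": "label_b"}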
def predict_data_process(trigger_file, role_file, enum_file, schema_file,
                         save_path):
    """predict_data_process"""
    pred_ret = []
    trigger_data = read_by_lines(trigger_file)
    role_data = read_by_lines(role_file)
    enum_data = read_by_lines(enum_file)
    schema_data = read_by_lines(schema_file)
    print("trigger predict {} load from {}".format(
        len(trigger_data), trigger_file))
    print("role predict {} load from {}".format(len(role_data), role_file))
    print("enum predict {} load from {}".format(len(enum_data), enum_file))
    print("schema {} load from {}".format(len(schema_data), schema_file))

    schema, sent_role_mapping, sent_enum_mapping = {}, {}, {}
    for s in schema_data:
        d_json = json.loads(s)
        schema[d_json["event_type"]] = [r["role"] for r in d_json["role_list"]]

    # role depends on id and sent_id
    for d in role_data:
        d_json = json.loads(d)
        r_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        role_ret = {}
        for r in r_ret:
            role_type = r["type"]
            if role_type not in role_ret:
                role_ret[role_type] = []
            role_ret[role_type].append("".join(r["text"]))
        _id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
        sent_role_mapping[_id] = role_ret

    # process the enum_role data
    for d in enum_data:
        d_json = json.loads(d)
        _id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
        label = d_json["pred"]["label"]
        sent_enum_mapping[_id] = label

    # process trigger data
    for d in trigger_data:
        d_json = json.loads(d)
        t_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        pred_event_types = list(set([t["type"] for t in t_ret]))
        event_list = []
        _id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
        for event_type in pred_event_types:
            role_list = schema[event_type]
            arguments = []
            for role_type, ags in sent_role_mapping[_id].items():
                if role_type not in role_list:
                    continue
                for arg in ags:
                    arguments.append({"role": role_type, "argument": arg})
            # special handling: attach the classified enum role to this event type
            if event_type == enum_event_type:
                arguments.append({
                    "role": enum_role,
                    "argument": sent_enum_mapping[_id]
                })
            event = {
                "event_type": event_type,
                "arguments": arguments,
                "text": d_json["text"]
            }
            event_list.append(event)
        pred_ret.append({
            "id": d_json["id"],
            "sent_id": d_json["sent_id"],
            "text": d_json["text"],
            "event_list": event_list
        })

    # merge sentence-level predictions back into documents by id
    doc_pred = {}
    for d in pred_ret:
        if d["id"] not in doc_pred:
            doc_pred[d["id"]] = {"id": d["id"], "event_list": []}
        doc_pred[d["id"]]["event_list"].extend(d["event_list"])

    # unify all prediction results and save them
    doc_pred = [
        json.dumps(event_normalization(r), ensure_ascii=False)
        for r in doc_pred.values()
    ]
    print("submit data {} save to {}".format(len(doc_pred), save_path))
    write_by_lines(save_path, doc_pred)
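# Hedged usage sketch for the document-level post-processing above: merge the
# three per-task prediction files into one submission file. All file paths are
# illustrative assumptions, not paths defined by the code in this file.
def postprocess_example():
    """Hypothetical driver for predict_data_process (document-level variant)."""
    predict_data_process(
        trigger_file="./output/trigger/test_pred.json",
        role_file="./output/role/test_pred.json",
        enum_file="./output/enum/test_pred.json",
        schema_file="./conf/event_schema.json",
        save_path="./output/submit.json")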