def _prep_sent(self, sent: Sent):
    """Turn the frames of one sentence into dense label arrays.

    For every frame returned by ``sent.get_frames(conf.evt_ftag)`` the
    predicate label is written into ``evt_arr`` and its argument labels into
    the matching row of ``arg_arr``.

    Returns a ``ZObject`` carrying:
      * ``sent`` / ``slen``: the sentence and its length,
      * ``loss_weight_non``: ``sent._loss_weight_non`` if present, else 1.0,
      * ``evt_items``: object array ``[slen]`` holding the frame at each
        predicate position (``None`` elsewhere),
      * ``evt_arr``: int array ``[slen]`` of predicate label indices (0 = none),
      * ``arg_arr``: int array ``[slen, slen]`` indexed ``[evt, arg]``.

    Raises ``AssertionError`` when a predicate span is not exactly one word or
    its label index is not positive.
    """
    conf: MySRLConf = self.conf
    slen = len(sent)
    _loss_weight_non = getattr(sent, "_loss_weight_non", 1.)  # todo(+N): special name; loss_weight_non
    # note: for simplicity, assume no loss_weight_non for args
    # first for events
    # NOTE: builtin `int` replaces the alias `np.int`, which was removed from
    # NumPy 1.24; the resulting dtype is the same.
    evt_arr = np.full([slen], 0, dtype=int)  # [evt]
    arg_arr = np.full([slen, slen], 0, dtype=int)  # [evt, arg]
    evt_items = np.full([slen], None, dtype=object)  # [evt]
    for f in sent.get_frames(conf.evt_ftag):  # note: assume no overlapping
        # predicate
        evt_widx, evt_wlen = self.evt_span_getter(f.mention)
        evt_label = f.label_idx
        assert evt_wlen == 1 and evt_label > 0, "For simplicity!!"
        evt_items[evt_widx] = f
        evt_arr[evt_widx] = evt_label
        # arguments: optionally keep only those whose info "rank" is 1
        # (a missing "rank" counts as 1)
        if conf.arg_only_rank1:
            cur_args = [a for a in f.args if a.info.get("rank", 1) == 1]
        else:
            cur_args = f.args
        # bio or not
        if conf.arg_use_bio:  # special
            arg_spans = [self.arg_span_getter(a.mention) + (a.label_idx,) for a in cur_args]
            tag_layers = self.vocab_arg.spans2tags_idx(arg_spans, slen)
            if len(tag_layers) > 1:
                # only the first layer is used below; further layers are dropped
                zwarn(f"Warning: 'Full args require multiple layers with {arg_spans}")
            arg_arr[evt_widx, :] = tag_layers[0][0]  # directly assign it!
        else:  # plain ones: paint the role index over the whole argument span
            for a in cur_args:
                arg_role = a.label_idx
                arg_widx, arg_wlen = self.arg_span_getter(a.mention)
                arg_arr[evt_widx, arg_widx:arg_widx + arg_wlen] = arg_role
    return ZObject(sent=sent, slen=slen, loss_weight_non=_loss_weight_non,
                   evt_items=evt_items, evt_arr=evt_arr, arg_arr=arg_arr)
def convert(self, src_sent: Sent, trg_sent: Sent, cc: Counter):
    """Project the events/arguments of ``src_sent`` onto a copy of ``trg_sent``.

    A fresh sentence is created from the target's words, UPOS tags (when
    available) and dependency tree. Every source event is re-created at the
    same word index, and each argument is re-attached at the index returned by
    ``self.ff``. ``cc`` is updated in place with "sent"/"evt"/"arg" counts.

    Both sentences must have the same length, and every event and argument
    mention must span exactly one word (enforced with asserts).
    """
    cc["sent"] += 1
    assert len(src_sent) == len(trg_sent)
    src_tree, trg_tree = src_sent.tree_dep, trg_sent.tree_dep
    # -- rebuild the target side as a new sentence
    new_sent = Sent.create(trg_sent.seq_word.vals.copy())
    if trg_sent.seq_upos is not None:
        new_sent.build_uposes(trg_sent.seq_upos.vals)
    new_sent.build_dep_tree(trg_tree.seq_head.vals, trg_tree.seq_label.vals)
    # -- descendant sets for both trees, consumed by self.ff
    src_desc = self.get_desc(src_tree)
    trg_desc = self.get_desc(trg_tree)
    # -- carry over events and their arguments
    for old_evt in src_sent.events:
        cc["evt"] += 1
        evt_widx, evt_wlen = old_evt.mention.get_span()
        assert evt_wlen == 1
        new_evt = new_sent.make_event(evt_widx, evt_wlen, type=old_evt.type)
        for old_link in old_evt.args:
            cc["arg"] += 1
            arg_widx, arg_wlen = old_link.mention.get_span()
            assert arg_wlen == 1
            mapped_widx = self.ff(evt_widx, arg_widx, src_tree, trg_tree, src_desc, trg_desc, cc)
            new_ef = new_sent.make_entity_filler(mapped_widx, 1, type=old_link.arg.type)
            new_evt.add_arg(new_ef, old_link.role)
    return new_sent
def semafor2sent(d: Dict):
    """Build a ``Sent`` from one SEMAFOR-style result dictionary.

    Reads ``d["tokens"]`` for the words and ``d["frames"]`` for the frames.
    Each frame contributes one event (from ``frame["target"]``) plus one
    freshly created entity filler per frame element of its single
    annotation set.

    Asserts that every mention has exactly one span, that the span text equals
    the covered tokens once spaces are removed, and that each frame has exactly
    one annotation set with ``rank == 0``.
    """
    tokens = d["tokens"]
    sent = Sent.create(words=tokens)

    def _span_of(span_list):
        # -> (widx, wlen) of the only span in `span_list`
        assert len(span_list) == 1, "Assume single span!"
        only = span_list[0]
        begin, end, surface = only["start"], only["end"], only["text"]
        # check without spaces
        covered = ''.join(tokens[begin:end])
        assert StrHelper.delete_spaces(surface) == StrHelper.delete_spaces(covered)
        return begin, end - begin

    for frame in d["frames"]:
        target, anno_sets = frame["target"], frame["annotationSets"]
        # target -> event
        t_widx, t_wlen = _span_of(target["spans"])
        event = sent.make_event(t_widx, t_wlen, type=target["name"])
        # roles -> arguments
        assert len(anno_sets) == 1 and anno_sets[0]["rank"] == 0, "Assume only one rank=0 annotationSets!"
        for element in anno_sets[0]["frameElements"]:
            a_widx, a_wlen = _span_of(element["spans"])
            filler = sent.make_entity_filler(a_widx, a_wlen)  # a new filler per argument
            event.add_arg(filler, role=element["name"])
    return sent
def _eval_one(self, gold_inst: Sent, pred_inst: Sent):
    """Pair up gold and predicted tokens and wrap them in a ``DparEvalResult``.

    The two sentences must yield the same number of tokens (asserted). When
    ``conf.exclude_punct`` is set, pairs whose gold token has UPOS "PUNCT" are
    left out. Sentence-id and text equality are not checked here.
    """
    conf: DparEvalConf = self.conf
    gold_tokens = gold_inst.get_tokens()
    pred_tokens = pred_inst.get_tokens()
    assert len(gold_tokens) == len(pred_tokens)
    if conf.exclude_punct:
        token_pairs = [
            (gold_tok, pred_tok)
            for gold_tok, pred_tok in zip(gold_tokens, pred_tokens)
            if gold_tok.upos != "PUNCT"
        ]
    else:
        token_pairs = list(zip(gold_tokens, pred_tokens))
    return DparEvalResult(conf, token_pairs)
def _new_frame(self, s: Sent, one_widx: int, one_wlen: int, one_lab: int, one_score: float, vocab=None): if vocab is None: vocab = self.vocab # -- f_type = vocab.idx2word(one_lab) f = s.make_frame(one_widx, one_wlen, self.conf.ftag, type=f_type, score=one_score) f.set_label_idx(one_lab) self.core_span_setter(f.mention, one_widx, one_wlen) # core_span return f
def put_doc(self, orig_doc: Doc, nlp_doc):
    """Replace the sentences of ``orig_doc`` with those found in ``nlp_doc``.

    The document text must equal ``nlp_doc.text`` (asserted). Existing
    sentences are cleared; each non-empty sentence of ``nlp_doc`` becomes a new
    ``Sent`` (annotated via ``self.put_sent``). Finally the character
    ``(start, length)`` of every added sentence is registered on the document.
    """
    assert orig_doc.get_text() == nlp_doc.text, "Error: Input & Output text not match!"
    orig_doc.clear_sents()  # drop whatever sentences were there before
    positions = []
    for parsed in nlp_doc.sentences:
        if len(parsed.tokens) > 0:  # empty sentences are ignored
            char_start = parsed.tokens[0].start_char
            char_end = parsed.tokens[-1].end_char
            new_sent = Sent.create(text=nlp_doc.text[char_start:char_end])
            self.put_sent(new_sent, parsed)  # annotate
            orig_doc.add_sent(new_sent)
            positions.append((char_start, char_end - char_start))
    orig_doc.build_sent_positions(positions)
def main(args):
    """Decode whitespace-tokenized sentences from stdin and write JSON to stdout.

    Sets up configuration, vocabularies, datasets and the model, optionally
    loads model parameters (``conf.rconf.model_load_name``), then reads stdin
    in batches of lines. Every line is split on whitespace into one ``Sent``,
    run through ``model.predict_on_batch`` and dumped as one JSON line.

    The number of lines per batch is taken from the environment variable
    ``ZMSP_BATCH_LINE`` (default 1000).
    """
    # conf
    conf: ZOverallConf = init_everything(ZOverallConf(), args)
    # task
    t_center = TaskCenter(conf.tconf)
    # data
    d_center = DataCenter(conf.dconf, specified_wset=[])  # nothing to load here!
    # load vocab
    t_center.load_vocabs(t_center.conf.vocab_load_dir)
    # prepare datasets
    t_center.prepare_datasets(d_center.get_datasets())
    # build model
    model = ZModel(conf.mconf)
    t_center.build_mods(model)
    model.finish_sr()  # note: build sr before possible loading in testing!!
    # run
    r_center = RunCenter(conf.rconf, model, t_center, d_center)
    if conf.rconf.model_load_name != "":
        r_center.load(conf.rconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    cc = Counter()
    # Environment values are always strings, so coerce explicitly; otherwise
    # the batch size would be an int by default but a str when the variable is set.
    batch_line = int(os.environ.get('ZMSP_BATCH_LINE', 1000))  # 1000 sents at a time
    test_dataset = ZDataset(d_center.conf.testM, 'testM', 'decode', _no_load=True)  # use testM for other options!
    for lines in yield_lines(sys.stdin, batch_line):
        insts = [Sent.create(one.split()) for one in lines]  # note: simply split as sentence!!
        test_dataset.set_insts(insts)  # directly set it!
        cc["sent"] += len(insts)
        if cc["sent"] % 50000 == 0:
            zlog(f"Decode for {cc}")
        # --
        t_center.prepare_datasets([test_dataset])  # re-prepare!!
        for ibatch in test_dataset.yield_batches(loop=False):
            # return value is not needed; `insts` are what gets serialized below
            model.predict_on_batch(ibatch)
        # --
        for inst in insts:
            sys.stdout.write(json.dumps(inst.to_json(), ensure_ascii=False) + "\n")
    # =====
    zlog(f"The end of Decoding: {cc}")
def span2feat(self, sent: Sent, widx: int, wlen: int):
    """Map the span ``[widx, widx+wlen)`` of ``sent`` to a lexical feature.

    With ``conf.use_fn_style`` the feature comes from ``self.lu2feat`` applied
    to ``"<lower-cased lemmas joined by space>.<pos>"``, where ``<pos>`` is the
    head word's UPOS mapped through ``UD2FN_POS_MAP`` (falling back to the
    lower-cased UPOS). Otherwise it is the space-joined ``self.lex_feat_f``
    value of every token in the span.
    """
    conf: LexConstrainerConf = self.conf
    head_widx = self.hf.find_shead(sent, widx, wlen)  # try to find head word
    if not conf.use_fn_style:
        # my own feat: one piece per token
        span_tokens = sent.get_tokens(widx, widx + wlen)
        return " ".join(self.lex_feat_f(tok) for tok in span_tokens)
    head_pos = sent.seq_upos.vals[head_widx]
    lemma_part = " ".join(sent.seq_lemma.vals[widx:widx + wlen]).lower()
    pos_part = UD2FN_POS_MAP.get(head_pos, head_pos.lower())
    # note: there is no retry on the head word alone when the feature is unknown
    return self.lu2feat(lemma_part + "." + pos_part)
def from_obj(self, s: str) -> DataInstance: conf: ConllFormatorConf = self.conf # -- lines = s.rstrip().split("\n") all_fields = [line.split(conf.sep_in) for line in lines] num_col = 0 if len(all_fields) > 0: num_col = len(all_fields[0]) # assert all(len(z)<=num_col for z in all_fields) for z in all_fields: if len(z) != num_col: zwarn(f"Line length not match ({len(z)} vs {num_col})") # -- sent = Sent.create() # make an empty one!! # -> read in conll fields # doc id if conf.f_doc is not None: f_doc = int(conf.f_doc) doc_id = ConllHelper.get_f_doc([z[f_doc] for z in all_fields]) sent.info["doc_id"] = doc_id # temporaly put it here! # part id if conf.f_part is not None: f_part = int(conf.f_part) part_id = ConllHelper.get_f_doc([z[f_part] for z in all_fields]) sent.info["part_id"] = part_id # word idx if conf.f_widx is not None: f_widx = int(conf.f_widx) valids = ConllHelper.get_f_widx([z[f_widx] for z in all_fields], conf.widx_start) # note: filtering lines!! all_fields = [z for z, v in zip(all_fields, valids) if v] # words if conf.f_word is not None: f_word = int(conf.f_word) words = [z[f_word] for z in all_fields] sent.build_words(words) # pred + predid + args if conf.f_pred is not None: # frames f_pred = int(conf.f_pred) f_pred_id = int(conf.f_pred_id) preds = ConllHelper.get_preds([z[f_pred] for z in all_fields], [z[f_pred_id] for z in all_fields], conf.combine_lemma_id, nil_vals=conf.pred_nil_vals) new_frames = [ sent.make_event(p_widx, 1, type=p_lab) for p_widx, p_lab in preds ] # note: wlen==1 # args? 
if conf.f_arg_start is not None: f_arg_start = int(conf.f_arg_start) # -- if num_col - conf.num_extra_field - f_arg_start != len( new_frames): zwarn( f"Unequal num of args: {num_col - conf.num_extra_field - f_arg_start} vs {len(new_frames)}" ) # -- for one_new_frame in new_frames: # read args _get_f = ConllHelper.get_f_args_dep if conf.arg_is_dep else ConllHelper.get_f_args args = _get_f(one_new_frame.mention.widx, [z[f_arg_start] for z in all_fields]) for a_widx, a_wlen, a_lab in args: new_ef = sent.make_entity_filler(a_widx, a_wlen, type="UNK") one_new_frame.add_arg(new_ef, a_lab) # add one field further f_arg_start += 1 # todo(+W): currently putting others at info for info_name, f_field in zip( ["xpos", "parse", "sense", "speaker", "ne", "coref"], [ conf.f_xpos, conf.f_parse, conf.f_sense, conf.f_speaker, conf.f_ne, conf.f_coref ]): if f_field is not None: _tmp_idx = int(f_field) _tmp_items = [z[_tmp_idx] for z in all_fields] sent.info[info_name] = _tmp_items # -- # finally UD related fields # upos if conf.f_upos is not None: f_upos = int(conf.f_upos) upos = [z[f_upos] for z in all_fields] sent.build_uposes(upos) # dep if conf.f_dep_head is not None: f_dep_head = int(conf.f_dep_head) dep_head = [int(z[f_dep_head]) for z in all_fields] if conf.f_dep_label is not None: f_dep_label = int(conf.f_dep_label) dep_label = [z[f_dep_label] for z in all_fields] else: dep_label = None sent.build_dep_tree(dep_head, dep_label) # -- # other info for f_idx in conf.f_others: f_idx = int(f_idx) sent.info[f_idx] = [z[f_idx] for z in all_fields] # simply put it at info! # -- return sent
def _approx_prev_next(insts: List): if len(insts) > 0 and isinstance(insts[0], Sent): for ii in range(len(insts) - 1): Sent.assign_prev_next(insts[ii], insts[ii + 1]) return insts
def from_obj(self, s: str) -> Doc:
    """Parse one JSON document string into a ``Doc``.

    Keys read from the JSON object:
      * ``doc_id`` and ``sents`` (required); per sentence ``text`` plus the
        optional ``id``, ``positions``, ``lemma``, ``upos`` and
        ``governor``/``dependency_relation`` (both needed for a tree),
      * ``entity_mentions`` / ``fillers``: entity-filler items,
      * ``event_mentions``: events with a ``trigger`` and optional ``em_arg``.

    A missing key and an explicit JSON ``null`` are treated alike. When
    neither entity list is given, sentences are marked as having no entity
    fillers; likewise for events. Items whose mention cannot be parsed, and
    arguments pointing to unknown ids, are collected and reported with a
    single warning instead of failing.

    Raises ``AssertionError`` on duplicated item ids.
    """
    d = json.loads(s)
    doc = Doc.create(id=d["doc_id"])
    doc.info.update({k: d.get(k) for k in ZDocDataFormator._OTHER_DOC_FIELDS})
    # add sents
    for one_sent in d["sents"]:
        sent = Sent.create(one_sent["text"], id=one_sent.get("id"))
        if "positions" in one_sent:
            sent.build_word_positions(one_sent["positions"])
        if "lemma" in one_sent:
            sent.build_lemmas(one_sent["lemma"])
        if "upos" in one_sent:
            sent.build_uposes(one_sent["upos"])
        if "governor" in one_sent and "dependency_relation" in one_sent:
            sent.build_dep_tree(one_sent["governor"], one_sent["dependency_relation"])
        doc.add_sent(sent)
    # --
    failed_items = {"ef": [], "evt": [], "arg": []}
    args_maps = {}  # id -> Frame (entity fillers and events share this id space)
    # entities and fillers
    entity_items, filler_items = d.get("entity_mentions"), d.get("fillers")
    if entity_items is None and filler_items is None:  # no entities info
        for sent in doc.sents:
            sent.mark_no_entity_fillers()
    else:
        # `or []`: only one of the two may be absent/null; `d.get(key, [])`
        # would hand back None for an explicit null and break the concatenation
        ef_items = (entity_items or []) + (filler_items or [])
        for one_ef_item in ef_items:
            mention = self._parse_mention(one_ef_item, doc)
            if mention is None:
                failed_items["ef"].append(one_ef_item)
            else:
                ef = Frame.create(mention, type=one_ef_item["type"],
                                  score=one_ef_item.get("score", 0.), id=one_ef_item["id"])
                ef.info.update({k: one_ef_item[k] for k in ["extra_info", "gid"] if k in one_ef_item})
                # todo(note): no checking for possibly repeat efs
                assert ef.id not in args_maps
                args_maps[ef.id] = ef
                mention.sent.add_entity_filler(ef)
    # events
    event_items = d.get("event_mentions")
    if event_items is None:  # no events info
        for sent in doc.sents:
            sent.mark_no_events()
    else:
        for one_evt_item in event_items:
            mention = self._parse_mention(one_evt_item["trigger"], doc)
            if mention is None:
                failed_items["evt"].append(one_evt_item)
            else:
                evt = Frame.create(mention, type=one_evt_item["type"],
                                   score=one_evt_item.get("score", 0.), id=one_evt_item["id"])
                evt.info.update({k: one_evt_item[k]
                                 for k in ["extra_info", "gid", "realis", "realis_score"] if k in one_evt_item})
                assert evt.id not in args_maps
                args_maps[evt.id] = evt
                mention.sent.add_event(evt)
    # args (second pass, so that arguments may refer to any entity/event above)
    for one_evt_item in (event_items or []):  # a null event list means: nothing to link
        if one_evt_item["id"] not in args_maps:
            # the event itself was skipped above; only expected without a position
            assert one_evt_item["trigger"]["posi"] is None
            continue
        evt = args_maps[one_evt_item["id"]]  # must be there
        em_args = one_evt_item.get("em_arg", None)
        if em_args is None:
            evt.mark_no_args()
        else:
            for one_arg in em_args:
                aid, role = one_arg["aid"], one_arg["role"]
                if aid not in args_maps:
                    failed_items["arg"].append(one_arg)
                else:
                    arg_arg = args_maps[aid]
                    arglink = evt.add_arg(arg_arg, role, score=one_arg.get("score", 0.))
                    arglink.info.update({k: one_arg[k] for k in ["is_aug", "extra_info"] if k in one_arg})
    # --
    if any(failed_items.values()):
        zwarn(f"Failed when reading Doc({doc.id}): {[(k,len(v)) for k,v in failed_items.items()]}")
    return doc
def _get_frames(self, s: Sent): return s.get_frames(self.conf.ftag)
def from_obj(self, s: str):
    """Wrap the raw string ``s`` in a ``Sent``.

    The sentence is always created from the full text. When
    ``self.do_tok_sep`` is set, the text is additionally split on
    ``self.tok_sep`` and the pieces are installed as its words.
    """
    sent = Sent.create(text=s)
    if not self.do_tok_sep:
        return sent
    # simple split
    sent.build_words(s.split(self.tok_sep))
    return sent
def _align_sents(self, sent, cand, align_res):
    """Project UPOS tags and dependencies from ``cand`` onto the words of ``sent``.

    Builds word-index maps between the two sentences from ``align_res`` and
    then creates a new sentence that has the words of ``sent`` with, wherever
    a mapping exists, the UPOS/head/label taken from ``cand``. Words without a
    usable mapping keep their original head and get a backed-off label.

    NOTE(review): ``align_res`` is indexed as ``[0]``/``[1]`` (paired aligned
    indices, ``None`` where unaligned) and ``[2]``/``[3]`` (whose lengths serve
    as the end positions of both sides) -- confirm against the aligner.

    Returns the newly created sentence.
    """
    _dels = self.delete_char_set
    # keep only positions aligned on both sides
    matched_pairs0 = [(a, b) for a, b in zip(align_res[0], align_res[1]) if (a is not None and b is not None)]
    matched_pairs = self._delete_single_match(matched_pairs0)
    # --
    map1to2, map2to1 = {}, {}  # word idx maps
    words1, words2 = sent.seq_word.vals, cand.seq_word.vals
    tree1, tree2 = sent.tree_dep, cand.tree_dep
    lp1, lp2 = -1, -1  # last matched positions on each side
    # the extra final pair makes the trailing unmatched stretch get processed too
    for p1, p2 in matched_pairs + [(len(align_res[2]), len(align_res[3])) ]:  # aligned at end
        # check the mismatched one: the words strictly between two matches
        idxes1, idxes2 = list(range(lp1 + 1, p1)), list(range(lp2 + 1, p2))
        if len(idxes1) > 0 or len(idxes2) > 0:
            # compare the pieces with the ignorable characters removed
            _toks1 = ["".join([c for c in words1[z] if c not in _dels]) for z in idxes1]
            _toks2 = ["".join([c for c in words2[z] if c not in _dels]) for z in idxes2]
            if ''.join(_toks1) != ''.join(_toks2):
                zwarn(f"Piece mismatched: {_toks1} vs {_toks2}")
                # breakpoint()
            # sub align
            _subaligns = self._sub_align_toks(_toks1, _toks2)
            for _iis1, _iis2 in _subaligns:
                # translate piece-local indices back to sentence indices
                _cur_idxes1, _cur_idxes2 = [idxes1[z] for z in _iis1 ], [idxes2[z] for z in _iis2]
                if len(_cur_idxes1) > 0 and len(_cur_idxes2) > 0:  # only possible to align if both have words
                    # each group is represented by the word picked by _get_head
                    h1, h2 = self._get_head(_cur_idxes1, tree1), self._get_head(_cur_idxes2, tree2)
                    assert h1 not in map1to2 and h2 not in map2to1
                    map1to2[h1] = h2
                    # note: specifically map more from 2 to 1
                    # (every word of the group whose spine contains h2 maps back to h1)
                    for _hh in _cur_idxes2:
                        if h2 in tree2.get_spine(_hh):
                            map2to1[_hh] = h1
        # add the matched one
        assert p1 not in map1to2 and p2 not in map2to1
        map1to2[p1] = p2
        map2to1[p2] = p1
        # next
        lp1, lp2 = p1, p2
    # --
    # assign deps
    _backoff_labmap = MyIndexer2._BACKOFF_LABMAP
    _res_heads, _res_labs, _res_poses = [], [], []
    for i1 in range(len(words1)):
        mapped_i2 = map1to2.get(i1)
        # upos
        if mapped_i2 is None:
            _res_poses.append("X")  # todo(+N): simply put an "X" here!
        else:
            _res_poses.append(cand.seq_upos.vals[mapped_i2])
        # dep: stored heads are shifted by one, so `- 1` gives a word index and
        # -1 stands for a stored head of 0
        mapped_i2_hidx = tree2.seq_head.vals[mapped_i2] - 1 if mapped_i2 is not None else None
        # map the head back to side 1; stays None when there is no mapping at all
        back_i1_hidx = -1 if mapped_i2_hidx == -1 else map2to1.get(mapped_i2_hidx)
        # --
        if back_i1_hidx is None:  # no map: directly put original ones!
            _res_heads.append(tree1.seq_head.vals[i1])
            _old_lab = tree1.seq_label.vals[i1]
            if _old_lab not in _backoff_labmap:
                zwarn(f"Unknown old label: {_old_lab}")
                # breakpoint()
            _res_labs.append(_backoff_labmap.get(_old_lab, "dep"))  # by default "dep"
        else:
            _res_heads.append(back_i1_hidx + 1)  # note: remember +1
            _res_labs.append(tree2.seq_label.vals[mapped_i2])
    # --
    # get a new sent!
    res = Sent.create(words1)
    res.build_uposes(_res_poses)
    res.build_dep_tree(_res_heads, _res_labs)
    # mixing projected and original heads could form a loop, hence the check
    self._check_no_cycle(_res_heads)
    return res
def feat_toks(self, s: Sent):
    """Return ``(tokens, features)`` for sentence ``s``.

    ``features`` holds ``self.feat_tok_f(token)`` for every token, in order.
    """
    tokens = s.get_tokens()
    feats = []
    for tok in tokens:
        feats.append(self.feat_tok_f(tok))
    return tokens, feats
def put_sent(self, orig_sent: Sent, nlp_sent):
    """Copy the word-level annotations of ``nlp_sent`` onto ``orig_sent``.

    Words are always installed. UPOS tags, lemmas and the dependency tree are
    installed only when ``self.pred_upos`` / ``self.pred_lemma`` /
    ``self.pred_dep`` are set. Character positions ``(start, length)`` inside
    the sentence text are recovered by searching each word's parent token text
    from left to right. This is best-effort: as soon as one word cannot be
    located, no positions are stored for the sentence at all.
    """
    text = orig_sent.get_text()
    # here we process the words!
    list_words = []
    list_uposes = []
    list_lemmas = []
    list_dep_heads = []
    list_dep_labels = []
    list_word_positions = []  # becomes None once locating a word has failed
    cur_word_start = 0
    # find them!!
    for w in nlp_sent.words:
        list_words.append(w.text)
        list_uposes.append(w.upos)
        list_lemmas.append(w.lemma)
        list_dep_heads.append(w.head)
        list_dep_labels.append(w.deprel)
        if list_word_positions is None:
            continue  # positions already given up for this sentence
        try:
            # todo(+N): some words can map to the same token if using MWT!
            t = w.parent
            tok_start = text.index(t.text, cur_word_start)  # idx inside the sentence
            tok_len = t.end_char - t.start_char
        except Exception:
            # Deliberately broad (best-effort), but unlike a bare `except:` it
            # lets KeyboardInterrupt / SystemExit through.
            list_word_positions = None
        else:
            list_word_positions.append((tok_start, tok_len))  # [widx, wlen]
            cur_word_start = tok_start + tok_len  # start with next one
    # add them
    orig_sent.build_words(list_words)
    if self.pred_upos:
        orig_sent.build_uposes(list_uposes)
    if self.pred_lemma:
        orig_sent.build_lemmas(list_lemmas)
    if self.pred_dep:
        orig_sent.build_dep_tree(list_dep_heads, list_dep_labels)
    if list_word_positions is not None:
        orig_sent.build_word_positions(list_word_positions)
def to_obj(self, inst: Sent) -> str:
    """Serialize ``inst`` to a single string.

    Without ``self.do_tok_sep`` the sentence's own text is returned. Otherwise
    its words are joined with ``self.tok_sep``, or with one space when no
    separator is configured.
    """
    if not self.do_tok_sep:
        return inst.get_text()
    joiner = self.tok_sep
    if joiner is None:
        joiner = " "
    return joiner.join(inst.seq_word.vals)