def put_sent(self, orig_sent: Sent, nlp_sent):
    """Copy the analysis of one processed sentence onto ``orig_sent``.

    Words are always written.  UPOS tags, lemmas and the dependency tree are
    written only when the matching ``self.pred_upos`` / ``self.pred_lemma`` /
    ``self.pred_dep`` flag is truthy.  Character positions are written only
    if every word's token could be located in the sentence text; otherwise
    they are skipped entirely (best effort).

    Args:
        orig_sent: Target sentence; must provide ``get_text()`` and the
            ``build_*`` methods called below.
        nlp_sent: Processed sentence whose ``words`` expose ``text``,
            ``upos``, ``lemma``, ``head``, ``deprel`` and ``parent`` (the
            token a word belongs to, with ``text``, ``start_char`` and
            ``end_char``).
    """
    text = orig_sent.get_text()
    words = list(nlp_sent.words)

    # Align every word with its token inside the sentence text, scanning
    # left to right so that repeated token strings map to successive hits.
    # TODO(+N): some words can map to the same token if using MWT!
    word_positions = []
    search_from = 0
    for word in words:
        try:
            token = word.parent
            tok_start = text.index(token.text, search_from)  # idx inside the sentence
            tok_len = token.end_char - token.start_char
        except (AttributeError, TypeError, ValueError):
            # Token not found (ValueError) or token info missing/None:
            # give up on positions for the whole sentence, but do not
            # swallow unrelated errors such as KeyboardInterrupt.
            word_positions = None
            break
        word_positions.append((tok_start, tok_len))  # (widx, wlen)
        search_from = tok_start + tok_len  # next search starts after this token

    # Write the collected annotations onto the target sentence.
    orig_sent.build_words([word.text for word in words])
    if self.pred_upos:
        orig_sent.build_uposes([word.upos for word in words])
    if self.pred_lemma:
        orig_sent.build_lemmas([word.lemma for word in words])
    if self.pred_dep:
        orig_sent.build_dep_tree(
            [word.head for word in words],
            [word.deprel for word in words],
        )
    if word_positions is not None:
        orig_sent.build_word_positions(word_positions)
def to_obj(self, inst: Sent) -> str:
    """Render ``inst`` as a single string.

    When ``self.do_tok_sep`` is falsy the sentence's own text
    (``inst.get_text()``) is returned.  Otherwise the values in
    ``inst.seq_word.vals`` are joined with ``self.tok_sep``, falling back to
    a single space when ``self.tok_sep`` is ``None``.
    """
    if not self.do_tok_sep:
        return inst.get_text()
    delimiter = self.tok_sep
    if delimiter is None:
        delimiter = " "
    return delimiter.join(inst.seq_word.vals)