def normalize_toks(self):
    """
    If the token is not a word piece, find its lemma.
    If it is, combine the pieces into a word and then find that word's lemma.
    E.g., "a ##b ##c" is combined into "abc" and the "##" pieces are dropped.
    NOTE: this is only used for schema linking.
    """
    # startidx2pieces maps the index of a word's first piece to one past its
    # last piece; pieces2startidx maps every piece of a multi-piece word back
    # to that first index.
    self.startidx2pieces = dict()
    self.pieces2startidx = dict()
    cache_start = None
    for i, piece in enumerate(self.pieces + [""]):
        if piece.startswith("##"):
            if cache_start is None:
                cache_start = i - 1
            self.pieces2startidx[i] = cache_start
            self.pieces2startidx[i - 1] = cache_start
        else:
            if cache_start is not None:
                self.startidx2pieces[cache_start] = i
            cache_start = None
    assert cache_start is None

    # Combine pieces: e.g. "status", "##es" -> "statuses".
    combined_word = {}
    for start, end in self.startidx2pieces.items():
        assert end - start + 1 < 10  # sanity check: no word has 10+ pieces
        pieces = [self.pieces[start]] + [
            self.pieces[_id].strip("##") for _id in range(start + 1, end)
        ]
        word = "".join(pieces)
        combined_word[start] = word

    # Drop the "##" continuation pieces, keeping only the combined words.
    # idx_map maps each new token index to the index of its first piece in
    # the BERT tokenization (self.pieces).
    idx_map = {}
    new_toks = []
    for i, piece in enumerate(self.pieces):
        if i in combined_word:
            idx_map[len(new_toks)] = i
            new_toks.append(combined_word[i])
        elif i in self.pieces2startidx:
            # continuation piece of an already-combined word: drop it
            pass
        else:
            idx_map[len(new_toks)] = i
            new_toks.append(piece)
    self.idx_map = idx_map

    # Example (question "How many acting statuses are there?"):
    #   self.pieces            ['how', 'many', 'acting', 'status', '##es', 'are', 'there', '?']
    #   idx_map                {0: 0, 1: 1, 2: 2, 3: 3, 4: 5, 5: 6, 6: 7}
    #   new_toks               ['how', 'many', 'acting', 'statuses', 'are', 'there', '?']
    #   self.normalized_pieces ['how', 'many', 'act', 'status', 'be', 'there', '?']
    # Example (column "points won"):
    #   self.pieces            ['points', 'won']
    #   idx_map                {0: 0, 1: 1}
    #   new_toks               ['points', 'won']
    #   self.normalized_pieces ['point', 'win']

    # Lemmatize the recovered words with CoreNLP.
    normalized_toks = []
    for tok in new_toks:
        ann = corenlp.annotate(tok, annotators=['tokenize', 'ssplit', 'lemma'])
        lemmas = [tok.lemma.lower() for sent in ann.sentence for tok in sent.token]
        lemma_word = " ".join(lemmas)
        normalized_toks.append(lemma_word)

    self.normalized_pieces = normalized_toks  # tokens after lemmatization
    self.recovered_pieces = new_toks
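# --- Not part of the original class: a minimal, standalone sketch of the
# "##" piece-merging step above, so the merging logic can be tried without a
# CoreNLP server. `merge_wordpieces` is an illustrative name; it uses
# piece[2:] instead of piece.strip("##") and reproduces only the
# combine/drop step (no lemmatization).
def merge_wordpieces(pieces):
    """Combine BERT-style '##' pieces into whole words.

    Returns (new_toks, idx_map), where idx_map maps each merged-token index
    to the index of its first piece in `pieces`.
    """
    new_toks, idx_map = [], {}
    i = 0
    while i < len(pieces):
        start = i
        word = pieces[i]
        i += 1
        # Absorb the continuation pieces that follow this piece.
        while i < len(pieces) and pieces[i].startswith("##"):
            word += pieces[i][2:]
            i += 1
        idx_map[len(new_toks)] = start
        new_toks.append(word)
    return new_toks, idx_map

# Example:
#   merge_wordpieces(['how', 'many', 'acting', 'status', '##es', 'are', 'there', '?'])
#   -> (['how', 'many', 'acting', 'statuses', 'are', 'there', '?'],
#       {0: 0, 1: 1, 2: 2, 3: 3, 4: 5, 5: 6, 6: 7})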
def tokenize_field_value(cls, field_value):
    assert isinstance(field_value, str)

    # TODO: Tokenization should be customizable
    ann = corenlp.annotate(field_value, annotators=['tokenize'])
    result = []
    for token in ann.sentencelessToken:
        # token.before is the string between this token and the previous one
        # (typically whitespace); keep it character by character.
        result += list(token.before)
        # token.originalText so that (e.g.) "\u2014" doesn't get turned into "--".
        # .lower() because references in the question don't match otherwise.
        result.append(token.originalText.lower())
    return result
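# --- Not part of the original code: a usage sketch for tokenize_field_value.
# It assumes a running CoreNLP server behind the same `corenlp` wrapper used
# above; `FieldTokenizer` is a stand-in name for whatever class defines the
# method, and the output shown in the comment is illustrative.
toks = FieldTokenizer.tokenize_field_value("United States")
# Inter-token whitespace is kept as individual characters and tokens are
# lowercased, e.g. ['united', ' ', 'states'].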