Example 1
    def normalize_toks(self):
        """
        If the token is not a word piece, then find its lemma
        If it is, combine pieces into a word, and then find its lemma
        E.g., a ##b ##c will be normalized as "abc", "", ""
        NOTE: this is only used for schema linking
        """
        self.startidx2pieces = dict()
        self.pieces2startidx = dict()
        cache_start = None
        for i, piece in enumerate(self.pieces + [""]):  # trailing sentinel flushes the final "##" run
            if piece.startswith("##"):
                if cache_start is None:
                    cache_start = i - 1

                self.pieces2startidx[i] = cache_start
                self.pieces2startidx[i - 1] = cache_start
            else:
                if cache_start is not None:
                    self.startidx2pieces[cache_start] = i
                cache_start = None
        assert cache_start is None

        # combine the pieces of each split word into a single string, keyed by its start index
        combined_word = {}
        for start, end in self.startidx2pieces.items():
            assert end - start + 1 < 10
            # drop the "##" continuation prefix from each following piece
            pieces = [self.pieces[start]] + [self.pieces[_id][2:] for _id in range(start + 1, end)]
            word = "".join(pieces)
            combined_word[start] = word

        # remove "", only keep "abc"
        idx_map = {}
        new_toks = []
        for i, piece in enumerate(self.pieces):
            if i in combined_word:
                idx_map[len(new_toks)] = i
                new_toks.append(combined_word[i])
            elif i in self.pieces2startidx:
                # continuation piece: already merged into its word, so skip it
                pass
            else:
                idx_map[len(new_toks)] = i
                new_toks.append(piece)
        self.idx_map = idx_map

        # lemmatize each merged token with CoreNLP
        normalized_toks = []
        for tok in new_toks:
            ann = corenlp.annotate(tok, annotators=['tokenize', 'ssplit', 'lemma'])
            lemmas = [t.lemma.lower() for sent in ann.sentence for t in sent.token]
            normalized_toks.append(" ".join(lemmas))

        self.normalized_pieces = normalized_toks
        self.recovered_pieces = new_toks
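
Below is a minimal, standalone sketch of the word-piece merging step above, so the merging logic can be checked without the surrounding class or a running CoreNLP server. The function merge_wordpieces is a hypothetical helper written for this illustration, not part of the original code; its output mirrors the new_toks and idx_map built by normalize_toks.

def merge_wordpieces(pieces):
    """Merge BERT word pieces ("##"-prefixed continuations) back into full words.

    Returns (new_toks, idx_map), where idx_map maps each merged-token index to
    the index of the first piece that produced it.
    """
    new_toks, idx_map = [], {}
    i = 0
    while i < len(pieces):
        start, word = i, pieces[i]
        i += 1
        # absorb any following "##" continuation pieces into the current word
        while i < len(pieces) and pieces[i].startswith("##"):
            word += pieces[i][2:]
            i += 1
        idx_map[len(new_toks)] = start
        new_toks.append(word)
    return new_toks, idx_map


toks, idx_map = merge_wordpieces(['how', 'many', 'acting', 'status', '##es', 'are', 'there', '?'])
print(toks)     # ['how', 'many', 'acting', 'statuses', 'are', 'there', '?']
print(idx_map)  # {0: 0, 1: 1, 2: 2, 3: 3, 4: 5, 5: 6, 6: 7}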
Example 2
    def tokenize_field_value(cls, field_value):
        assert isinstance(field_value, str)

        # TODO: Tokenization should be customizable
        ann = corenlp.annotate(field_value, annotators=['tokenize'])
        result = []
        for token in ann.sentencelessToken:
            # .before is the string between this token and the previous one (typically whitespace)
            result += list(token.before)
            # .originalText so that (e.g.) \u2014 doesn't get turned into --
            # .lower() so that values match references in the question regardless of case
            result.append(token.originalText.lower())
        return result
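
For illustration only, here is a rough, dependency-free approximation of the list shape this method returns: it splits on whitespace instead of calling CoreNLP, so real tokenization (punctuation splitting, Unicode handling, etc.) will differ. approx_tokenize_field_value is a hypothetical stand-in, not the project's API.

import re


def approx_tokenize_field_value(field_value):
    assert isinstance(field_value, str)
    result = []
    prev_end = 0
    for m in re.finditer(r"\S+", field_value):
        # the characters between the previous token and this one (the ".before" text)
        result += list(field_value[prev_end:m.start()])
        result.append(m.group(0).lower())
        prev_end = m.end()
    return result


print(approx_tokenize_field_value("New York City"))
# ['new', ' ', 'york', ' ', 'city']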
Example 3
    def normalize_toks(self):
        """
        If the token is not a word piece, then find its lemma
        If it is, combine pieces into a word, and then find its lemma
        E.g., a ##b ##c will be normalized as "abc", "", ""
        NOTE: this is only used for schema linking
        """
        self.startidx2pieces = dict()
        self.pieces2startidx = dict()
        cache_start = None
        for i, piece in enumerate(self.pieces + [""]):  # trailing sentinel flushes the final "##" run
            if piece.startswith("##"):
                if cache_start is None:
                    cache_start = i - 1

                self.pieces2startidx[i] = cache_start
                self.pieces2startidx[i - 1] = cache_start
            else:
                if cache_start is not None:
                    self.startidx2pieces[cache_start] = i
                cache_start = None
        assert cache_start is None

        # combine the pieces of each split word into a single string, keyed by its start index
        combined_word = {}
        for start, end in self.startidx2pieces.items():
            assert end - start + 1 < 10
            # drop the "##" continuation prefix from each following piece
            pieces = [self.pieces[start]] + [
                self.pieces[_id][2:] for _id in range(start + 1, end)
            ]
            word = "".join(pieces)
            combined_word[start] = word

        # remove "", only keep "abc"
        idx_map = {}
        new_toks = []
        for i, piece in enumerate(self.pieces):
            if i in combined_word:
                idx_map[len(new_toks)] = i
                new_toks.append(combined_word[i])
            elif i in self.pieces2startidx:
                # continuation piece: already merged into its word, so skip it
                pass
            else:
                idx_map[len(new_toks)] = i
                new_toks.append(piece)
        self.idx_map = idx_map  # maps each merged-token index to the index of its first BERT word piece (a dict)

        #question     "How many acting statuses are there?"
        #self.pieces  ['how', 'many', 'acting', 'status', '##es', 'are', 'there', '?']
        #idx_map      {0: 0, 1: 1, 2: 2, 3: 3, 4: 5, 5: 6, 6: 7}
        #new_toks     ['how', 'many', 'acting', 'statuses', 'are', 'there', '?']
        #self.normalized_pieces ['how', 'many', 'act', 'status', 'be', 'there', '?']

        #column       "points won"
        #self.pieces  ['points', 'won']
        #idx_map      {0: 0, 1: 1}
        #new_toks     ['points', 'won']
        #self.normalized_pieces ['point', 'win']

        # lemmatize each merged token with CoreNLP
        normalized_toks = []
        for tok in new_toks:
            ann = corenlp.annotate(tok,
                                   annotators=['tokenize', 'ssplit', 'lemma'])
            lemmas = [
                t.lemma.lower() for sent in ann.sentence
                for t in sent.token
            ]
            normalized_toks.append(" ".join(lemmas))

        self.normalized_pieces = normalized_toks  # the lemmatized tokens
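
The traces above suggest how idx_map is meant to be used for schema linking: a match found on the merged or lemmatized tokens is mapped back to the position of its first BERT word piece. A small hypothetical illustration, reusing the values from the comments above:

pieces     = ['how', 'many', 'acting', 'status', '##es', 'are', 'there', '?']
new_toks   = ['how', 'many', 'acting', 'statuses', 'are', 'there', '?']
idx_map    = {0: 0, 1: 1, 2: 2, 3: 3, 4: 5, 5: 6, 6: 7}
normalized = ['how', 'many', 'act', 'status', 'be', 'there', '?']

# a schema-linking hit on the lemmatized token "status" ...
match_idx = normalized.index('status')         # 3
# ... maps back to the first word piece of "statuses" in the BERT input
piece_idx = idx_map[match_idx]                 # 3
print(new_toks[match_idx], pieces[piece_idx])  # statuses status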