Example #1
    def test_truncate_window_fit_two_wordpieces(self):
        """
        Tests that both `use_starting_offsets` options work properly when the last
        word in the truncated sentence consists of two wordpieces.
        """

        tokenizer = BertPreTokenizer()

        sentence = "the quickest quick brown fox jumped over the quickest dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(
            str(vocab_path), truncate_long_sequences=True, use_starting_offsets=True, max_pieces=13
        )

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["input_ids"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17]
        assert indexed_tokens["offsets"] == [1, 2, 4, 5, 6, 7, 8, 9, 10]
        assert indexed_tokens["token_type_ids"] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

        token_indexer = PretrainedBertIndexer(
            str(vocab_path), truncate_long_sequences=True, use_starting_offsets=False, max_pieces=13
        )

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["input_ids"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17]
        assert indexed_tokens["offsets"] == [1, 3, 4, 5, 6, 7, 8, 9, 11]
Example #2
    def test_truncate_window(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=True,
                                              max_pieces=10)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 17]
        assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8]
        assert indexed_tokens["bert-type-ids"] == [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ]

        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=False,
                                              max_pieces=10)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 17]
        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8]
Example #3
    def test_do_lowercase(self):
        # Our default tokenizer doesn't handle lowercasing.
        tokenizer = WordTokenizer()

        # Quick is UNK because of capitalization
        #           2   1     5     6   8      9    2  15 10 11 14   1
        sentence = "the Quick brown fox jumped over the laziest lazy elmo"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              do_lowercase=False)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # Quick should get 1 == OOV
        assert indexed_tokens["bert"] == [
            16, 2, 1, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17
        ]

        # Does lowercasing by default
        token_indexer = PretrainedBertIndexer(str(vocab_path))
        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # Now Quick should get indexed correctly as 3 ( == "quick")
        assert indexed_tokens["bert"] == [
            16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17
        ]
Example #4
    def test_never_lowercase(self):
        # Our default tokenizer doesn't handle lowercasing.
        tokenizer = WordTokenizer()

        #            2 15 10 11  6
        sentence = "the laziest fox"

        tokens = tokenizer.tokenize(sentence)
        tokens.append(
            Token("[PAD]"))  # have to do this b/c tokenizer splits it in three

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              do_lowercase=True)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # PAD should get recognized and not lowercased      # [PAD]
        assert indexed_tokens["bert"] == [16, 2, 15, 10, 11, 6, 0, 17]

        # Unless we manually override the never-lowercase tokens
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              do_lowercase=True,
                                              never_lowercase=())
        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # now PAD should get lowercased and be UNK          # [UNK]
        assert indexed_tokens["bert"] == [16, 2, 15, 10, 11, 6, 1, 17]
Example #5
    def test_never_lowercase(self):
        # Our default tokenizer doesn't handle lowercasing.
        tokenizer = WordTokenizer()

        #            2 15 10 11  6
        sentence = "the laziest fox"

        tokens = tokenizer.tokenize(sentence)
        tokens.append(Token("[PAD]"))  # have to do this b/c tokenizer splits it in three

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path), do_lowercase=True)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # PAD should get recognized and not lowercased      # [PAD]
        assert indexed_tokens["bert"] == [16, 2, 15, 10, 11, 6, 0, 17]

        # Unless we manually override the never-lowercase tokens
        token_indexer = PretrainedBertIndexer(str(vocab_path), do_lowercase=True, never_lowercase=())
        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # now PAD should get lowercased and be UNK          # [UNK]
        assert indexed_tokens["bert"] == [16, 2, 15, 10, 11, 6, 1, 17]
Example #6
    def test_starting_ending_offsets(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #           2   3     5     6   8      9    2  15 10 11 14   1
        sentence = "the quick brown fox jumped over the laziest lazy elmo"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path))

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        assert indexed_tokens["bert"] == [
            16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17
        ]
        assert indexed_tokens["bert-offsets"] == [
            1, 2, 3, 4, 5, 6, 7, 10, 11, 12
        ]

        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              use_starting_offsets=True)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        assert indexed_tokens["bert"] == [
            16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17
        ]
        assert indexed_tokens["bert-offsets"] == [
            1, 2, 3, 4, 5, 6, 7, 8, 11, 12
        ]
Example #7
    def batch_to_ids(stncs, tgt_flag=False):
        """
        Convert a list of tokenized sentences into ids that elmo accepts.
        :param stncs: [['I', 'Like', 'you'], ['Yes']]
        :param tgt_flag: indicates whether the input is a target sentence; if it is,
                         use only the previous words as context and neglect the last word
        :return ids: indices to feed into elmo
        """
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
        tokens = tokenizer.tokenize(stncs)
        vocab = Vocabulary()
        vocab_path = ""
        token_indexer = PretrainedBertIndexer(str(vocab_path))
        ids = token_indexer.tokens_to_indices(tokens, vocab, "bert")
        if tgt_flag:
            ids = ids[:, :-1, :]  # neglect the last word
            b_size, _len, dim = ids.shape
            expand_ids = torch.zeros(b_size * _len, _len, dim, dtype=torch.long)
            for i in range(1, _len + 1):
                expand_ids[b_size * (i - 1):b_size * i, :i, :] = ids[:, :i, :]
            return expand_ids
        return ids
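The `tgt_flag` branch above expands the batch so that, for each prefix length i, there is a copy of the batch in which only the first i timesteps are filled. A toy illustration of what that loop produces, with made-up sizes and a plain index tensor standing in for the real ids:

    import torch

    b_size, _len, dim = 2, 3, 4
    ids = torch.arange(b_size * _len * dim, dtype=torch.long).view(b_size, _len, dim)
    expand_ids = torch.zeros(b_size * _len, _len, dim, dtype=torch.long)
    for i in range(1, _len + 1):
        # rows [b_size * (i - 1), b_size * i) keep only the first i timesteps
        expand_ids[b_size * (i - 1):b_size * i, :i, :] = ids[:, :i, :]
    # the last block has seen the full context
    assert torch.equal(expand_ids[-b_size:], ids)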
Example #8
    def test_sliding_window(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown [SEP] jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=False,
                                              use_starting_offsets=False,
                                              max_pieces=10)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
                                        # [CLS] the quick est quick brown [SEP] jumped over [SEP]
        assert indexed_tokens["bert"] == [16,   2,  3,    4,  3,    5,    17,   8,     9,   17,
                                        # [CLS] brown [SEP] jumped over the lazy dog [SEP]
                                          16,   5,    17,   8,     9,   2,  14,  12, 17]
        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9, 10, 11]

        # The extra [SEP]s shouldn't pollute the token-type-ids
                                                 # [CLS] the quick est quick brown [SEP] jumped over [SEP]
        assert indexed_tokens["bert-type-ids"] == [0,    0,  0,    0,  0,    0,    0,    1,     1,   1,
                                                 # [CLS] brown [SEP] jumped over the lazy dog [SEP]
                                                   0,    0,    0,    1,     1,   1,  1,   1,  1]
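For reference, the window arithmetic behind the expected values: with `max_pieces=10` each window holds 8 wordpieces wrapped in its own [CLS]/[SEP], and consecutive windows overlap by half a window. A rough sketch that reproduces this particular case (not the indexer's actual implementation, which also takes word boundaries into account):

    def sliding_windows(pieces, max_pieces=10, cls_id=16, sep_id=17):
        window = max_pieces - 2   # room left after [CLS] and [SEP]
        stride = window // 2      # consecutive windows overlap by half
        out, start = [], 0
        while True:
            out.extend([cls_id] + pieces[start:start + window] + [sep_id])
            if start + window >= len(pieces):
                return out
            start += stride

    pieces = [2, 3, 4, 3, 5, 17, 8, 9, 2, 14, 12]   # wordpieces of the sentence above
    assert sliding_windows(pieces) == [16, 2, 3, 4, 3, 5, 17, 8, 9, 17,
                                       16, 5, 17, 8, 9, 2, 14, 12, 17]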
Example #9
    def test_truncate_window_dont_split_wordpieces(self):
        """
        Tests that the sentence is not truncated inside a word with two or
        more wordpieces.
        """

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the quickest dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=True,
                                              max_pieces=12)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
        # We could fit one more piece here, but we don't, to avoid cutting
        # in the middle of a word
        assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9]
        assert indexed_tokens["bert-type-ids"] == [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ]

        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=False,
                                              max_pieces=12)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
        # We could fit one more piece here, but we don't, to avoid cutting
        # in the middle of a word
        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9]
Example #10
    def test_indexes_empty_sequence(self):
        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path))

        indexed_tokens = token_indexer.tokens_to_indices([], vocab)
        assert indexed_tokens == {
            "input_ids": [16, 17],  # [CLS], [SEP]
            "offsets": [],  # no tokens => no offsets
            "token_type_ids": [0, 0],  # just 0s for start and end
            "mask": [],  # no tokens => no mask
        }
Example #11
    def test_indexes_empty_sequence(self):
        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path))

        indexed_tokens = token_indexer.tokens_to_indices([], vocab, "bert")
        assert indexed_tokens == {
                'bert': [16, 17],           # [CLS], [SEP]
                'bert-offsets': [],         # no tokens => no offsets
                'bert-type-ids': [0, 0],    # just 0s for start and end
                'mask': []                  # no tokens => no mask
        }
Example #12
    def test_starting_ending_offsets(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #           2   3     5     6   8      9    2  15 10 11 14   1
        sentence = "the quick brown fox jumped over the laziest lazy elmo"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path))

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        assert indexed_tokens["bert"] == [2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1]
        assert indexed_tokens["bert-offsets"] == [0, 1, 2, 3, 4, 5, 6, 9, 10, 11]

        token_indexer = PretrainedBertIndexer(str(vocab_path), use_starting_offsets=True)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        assert indexed_tokens["bert"] == [2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1]
        assert indexed_tokens["bert-offsets"] == [0, 1, 2, 3, 4, 5, 6, 7, 10, 11]
Example #13
    def test_starting_ending_offsets(self):
        tokenizer = BertPreTokenizer()

        #           2   3     5     6   8      9    2  15 10 11 14   1
        sentence = "the quick brown fox jumped over the laziest lazy elmo"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path))

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        # 16 = [CLS], 17 = [SEP]
        assert indexed_tokens["input_ids"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
        assert indexed_tokens["offsets"] == [1, 2, 3, 4, 5, 6, 7, 10, 11, 12]

        token_indexer = PretrainedBertIndexer(str(vocab_path), use_starting_offsets=True)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        assert indexed_tokens["input_ids"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
        assert indexed_tokens["offsets"] == [1, 2, 3, 4, 5, 6, 7, 8, 11, 12]
Example #14
    def test_do_lowercase(self):
        # Our default tokenizer doesn't handle lowercasing.
        tokenizer = WordTokenizer()

        # Quick is UNK because of capitalization
        #           2   1     5     6   8      9    2  15 10 11 14   1
        sentence = "the Quick brown fox jumped over the laziest lazy elmo"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path), do_lowercase=False)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # Quick should get 1 == OOV
        assert indexed_tokens["bert"] == [16, 2, 1, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]

        # Does lowercasing by default
        token_indexer = PretrainedBertIndexer(str(vocab_path))
        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # Now Quick should get indexed correctly as 3 ( == "quick")
        assert indexed_tokens["bert"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
Example #15
    def test_token_type_ids(self):
        tokenizer = WordTokenizer()

        sentence = "the laziest  fox"

        tokens = tokenizer.tokenize(sentence)
        #           2   15 10 11  6   17    2   15 10 11  6
        #           the laziest   fox [SEP] the laziest   fox
        tokens = tokens + [Token("[SEP]")] + tokens  # have to do this b/c tokenizer splits `[SEP]` in three

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path))

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        #                                          [CLS] 2, 15, 10, 11, 6, 17, 2  15, 10, 11, 6, [SEP]
        assert indexed_tokens["bert-type-ids"] == [0,    0, 0,  0,  0,  0, 0,  1, 1,  1,  1,  1, 1]  #pylint: disable=bad-whitespace
Example #16
    def test_token_type_ids(self):
        tokenizer = SpacyTokenizer()

        sentence = "the laziest  fox"

        tokens = tokenizer.tokenize(sentence)
        #           2   15 10 11  6   17    2   15 10 11  6
        #           the laziest   fox [SEP] the laziest   fox
        tokens = (
            tokens + [Token("[SEP]")] + tokens
        )  # have to do this b/c tokenizer splits `[SEP]` in three

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path))

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        #                                          [CLS] 2, 15, 10, 11, 6, 17, 2  15, 10, 11, 6, [SEP]
        assert indexed_tokens["token_type_ids"] == [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
Example #17
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        doc_encoder: Seq2VecEncoder,
        const_path: str,
        tokens_namespace: str,
        use_sim: bool = True,
        use_classifier: bool = True,
    ) -> None:
        super().__init__(vocab)
        self.vocab = vocab
        self.num_tags = vocab.get_vocab_size("labels")

        self._token_embedder = text_field_embedder
        self._doc_encoder = doc_encoder

        if not use_sim:
            raise Exception(
                "use_sim option is false, but it must be true for this to work"
            )

        if use_classifier:
            print("Warning: use_classifier option does nothing now...")

        self.use_sim = use_sim
        self.use_classifier = use_classifier

        # I actually want to use the one from the config, but not sure how to do that.
        _spacy_word_splitter = SpacyWordSplitter()
        token_indexer = PretrainedBertIndexer("bert-base-cased",
                                              do_lowercase=False,
                                              use_starting_offsets=True)

        jc = JsonConverter()
        const, links = jc._read_const(const_path)

        # the extra 1 is for the "unmatched" label.
        print(vocab.get_token_to_index_vocabulary("labels"))
        print(const.keys())
        assert self.num_tags == len(
            const
        ) + 1, "Num tags ({}) doesn't match the size of the constitution+1 ({})".format(
            self.num_tags,
            len(const) + 1)

        if self.use_sim:
            # create the constitution matrix. Every element is one of the groups.
            tagmap = self.vocab.get_index_to_token_vocabulary("labels")
            self.const_dict = {}
            indices = []
            for i in range(self.num_tags):
                tagname = tagmap[i]
                if tagname != "unmatched":
                    const_text = const[tagname]
                else:
                    const_text = "@@pad@@"

                const_toks = _spacy_word_splitter.split_words(const_text)
                # truncate so BERT is happy.
                const_toks = const_toks[:250]
                const_indices = token_indexer.tokens_to_indices(
                    const_toks, vocab, tokens_namespace)
                indices.append(const_indices)

            max_len = max(map(lambda j: len(j[tokens_namespace]), indices))
            max_offset_len = max(
                map(lambda j: len(j["tokens-offsets"]), indices))

            const_tensor = torch.zeros(self.num_tags, max_len).long()
            const_tensor_offsets = torch.zeros(self.num_tags,
                                               max_offset_len).long()
            const_tensor_mask = torch.zeros(self.num_tags,
                                            max_offset_len).long()
            for i, ind in enumerate(indices):
                toks = ind[tokens_namespace]
                mask = ind["mask"]
                const_tensor[i, :len(toks)] = torch.LongTensor(toks)
                const_tensor_offsets[
                    i, :len(ind["tokens-offsets"])] = torch.LongTensor(
                        ind["tokens-offsets"])
                const_tensor_mask[i, :len(mask)] = torch.LongTensor(mask)

            const_tokens = {
                tokens_namespace: const_tensor,
                "tokens-offsets": const_tensor_offsets,
                "mask": const_tensor_mask
            }

            print("Embedding the constitution... this could take a minute...")
            self.const_mask = util.get_text_field_mask(const_tokens)
            self.const_emb = self._token_embedder(const_tokens).detach()
            print("Done embedding the constitution.")

            if torch.cuda.is_available():
                self.const_emb = self.const_emb.cuda()
                self.const_mask = self.const_mask.cuda()

        self.vectorf1 = VectorF1(unmatched_index=self.vocab.get_token_index(
            "unmatched", namespace="labels"))
        # self.metric = F1Measure(positive_label=1)

        # self.ff = FeedForward(doc_encoder.get_output_dim(), num_layers=4,
        #                       hidden_dims=100,
        #                       activations=Activation.by_name("relu")())

        #self.tag_projection_layer = Linear(self.ff.get_output_dim(), self.num_tags)
        #self.choice_projection_layer = Linear(self.ff.get_output_dim(), 2)

        self.sim_ff = TimeDistributed(
            FeedForward(doc_encoder.get_output_dim(),
                        num_layers=1,
                        hidden_dims=2,
                        activations=Activation.by_name("relu")()))