Example #1
    def __init__(self,
                 word_indexer: Optional[TokenIndexer] = None,
                 is_bert: bool = False,
                 conceptnet_path: Optional[Path] = None):
        super().__init__(lazy=False)
        self.pos_indexers = {"pos_tokens": PosTagIndexer()}
        self.ner_indexers = {"ner_tokens": NerTagIndexer()}
        self.rel_indexers = {
            "rel_tokens": SingleIdTokenIndexer(namespace='rel_tokens')
        }

        if is_bert:
            splitter = BertBasicWordSplitter()
        else:
            splitter = SpacyWordSplitter()
        self.tokeniser = WordTokenizer(word_splitter=splitter)

        self.word_indexers = {'tokens': word_indexer}
        word_splitter = SpacyWordSplitter(pos_tags=True, ner=True, parse=True)
        self.word_tokeniser = WordTokenizer(word_splitter=word_splitter)
        bert_splitter = BertBasicWordSplitter()
        self.bert_tokeniser = WordTokenizer(word_splitter=bert_splitter)

        if word_indexer is None:
            if is_bert:
                word_indexer = PretrainedBertIndexer(
                    pretrained_model='bert-base-uncased',
                    truncate_long_sequences=False)
            else:
                word_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
        self.word_indexers = {'tokens': word_indexer}

        self.conceptnet = ConceptNet(conceptnet_path=conceptnet_path)
 def test_do_lower_case(self):
     # BertBasicWordSplitter lowercases every token not in `never_split` by default
     word_splitter = BertBasicWordSplitter(never_split=["[UNUSED0]"])
     sentence = "[UNUSED0] [UNK] [unused0]"
     expected_tokens = ["[UNUSED0]", "[", "unk", "]", "[", "unused0", "]"]
     tokens = [token.text for token in word_splitter.split_words(sentence)]
     assert tokens == expected_tokens
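
For comparison, a minimal sketch (not one of the collected examples) of keeping the original case by passing do_lower_case=False; the splitter still separates punctuation, it just skips the lower-casing step:

from allennlp.data.tokenizers.word_splitter import BertBasicWordSplitter

cased_splitter = BertBasicWordSplitter(do_lower_case=False)
tokens = [token.text for token in cased_splitter.split_words("Hello, World!")]
assert tokens == ["Hello", ",", "World", "!"]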
Example #3
    def test_sliding_window_with_batch(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path), truncate_long_sequences=False, max_pieces=8)

        config_path = self.FIXTURES_ROOT / 'bert' / 'config.json'
        config = BertConfig(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens + tokens + tokens, {"bert": token_indexer})})

        batch = Batch([instance, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        bert_vectors = token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert bert_vectors is not None
    def test_padding_for_equal_length_indices(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3     5     6   8      9    2   14   12
        sentence = "the quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"].tolist() == [
                [16, 2, 3, 5, 6, 8, 9, 2, 14, 12, 17]
        ]

        assert tokens["bert-offsets"].tolist() == [
                [1, 2, 3, 4, 5, 6, 7, 8, 9]
        ]
Example #5
    def test_sliding_window(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown [SEP] jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=False,
                                              use_starting_offsets=False,
                                              max_pieces=10)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
                                        # [CLS] the quick est quick brown [SEP] jumped over [SEP]
        assert indexed_tokens["bert"] == [16,   2,  3,    4,  3,    5,    17,   8,     9,   17,
                                        # [CLS] brown [SEP] jumped over the lazy dog [SEP]
                                          16,   5,    17,   8,     9,   2,  14,  12, 17]
        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9, 10, 11]

        # The extra [SEP]s shouldn't pollute the token-type-ids
                                                 # [CLS] the quick est quick brown [SEP] jumped over [SEP]
        assert indexed_tokens["bert-type-ids"] == [0,    0,  0,    0,  0,    0,    0,    1,     1,   1,
                                                 # [CLS] brown [SEP] jumped over the lazy dog [SEP]
                                                   0,    0,    0,    1,     1,   1,  1,   1,  1]
 def batch_to_ids(stncs, tgt_flag=False):
     """
     Convert a list of tokenized sentences into ids that BERT accepts.
     :param stncs: [['I', 'Like', 'you'], ['Yes']]
     :param tgt_flag: indicates whether the input is a target sentence; if so,
                      use only the previous words as context and drop the last word
     :return ids: indices to feed into BERT
     """
     tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
     tokens = tokenizer.tokenize(stncs)
     vocab = Vocabulary()
     vocab_path = ""
     token_indexer = PretrainedBertIndexer(str(vocab_path))
     ids = token_indexer.tokens_to_indices(tokens, vocab, "bert")
     if tgt_flag:
         ids = ids[:, :-1, :]  # neglect the last word
         b_size, _len, dim = ids.shape
         expand_ids = torch.zeros(b_size * _len,
                                  _len,
                                  dim,
                                  dtype=torch.long)
         for i in range(1, _len + 1):
             expand_ids[b_size * (i - 1):b_size * i, :i, :] = ids[:, :i, :]
         return expand_ids
     return ids
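
The prefix expansion described in the docstring can be illustrated in isolation with a dummy id tensor (this sketch is not part of the original function; the real `ids` come from the BERT indexer):

import torch

b_size, _len, dim = 2, 3, 4
ids = torch.ones(b_size, _len, dim, dtype=torch.long)
expand_ids = torch.zeros(b_size * _len, _len, dim, dtype=torch.long)
for i in range(1, _len + 1):
    # the rows for prefix length i contain only the first i words as context
    expand_ids[b_size * (i - 1):b_size * i, :i, :] = ids[:, :i, :]
print(expand_ids.shape)  # torch.Size([6, 3, 4])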
Example #7
    def test_truncate_window(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=True,
                                              max_pieces=10)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 17]
        assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8]
        assert indexed_tokens["bert-type-ids"] == [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ]

        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=False,
                                              max_pieces=10)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 17]
        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8]
Example #8
    def test_starting_ending_offsets(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #           2   3     5     6   8      9    2  15 10 11 14   1
        sentence = "the quick brown fox jumped over the laziest lazy elmo"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path))

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        assert indexed_tokens["bert"] == [
            16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17
        ]
        assert indexed_tokens["bert-offsets"] == [
            1, 2, 3, 4, 5, 6, 7, 10, 11, 12
        ]

        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              use_starting_offsets=True)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        assert indexed_tokens["bert"] == [
            16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17
        ]
        assert indexed_tokens["bert-offsets"] == [
            1, 2, 3, 4, 5, 6, 7, 8, 11, 12
        ]
Example #9
    def test_read(self, lazy):
        reader = GLUESST2DatasetReader(
            tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
            token_indexers={'bert': PretrainedBertIndexer(
                pretrained_model=self.BERT_VOCAB_PATH)},
            skip_label_indexing=False
        )
        instances = reader.read(
            str(self.FIXTURES_ROOT / 'dev.tsv'))
        instances = ensure_list(instances)
        example = instances[0]
        tokens = [t.text for t in example.fields['tokens']]
        label = example.fields['label'].label
        print(label)
        print(tokens)
        batch = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        batch.index_instances(vocab)
        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        print(tokens['mask'].tolist()[0])
        print(tokens["bert"].tolist()[0])
        print([vocab.get_token_from_index(i, "bert")
               for i in tokens["bert"].tolist()[0]])
        print(len(tokens['bert'][0]))
        print(tokens["bert-offsets"].tolist()[0])
        print(tokens['bert-type-ids'].tolist()[0])
Example #10
class Conll04BERTBinaryReader(Conll04SpaCyBinaryReader):
    splitter = BertBasicWordSplitter()

    @classmethod
    def match(cls, index, tokens, split_tokens):
        tk_idx = 0
        tk_tkn = tokens[tk_idx].lower()
        st_idx = 0
        st_tkn = split_tokens[st_idx].text.lower()

        matched_tokens = []
        while True:
            if index[0] <= tk_idx and tk_idx < index[
                    1] and st_idx not in matched_tokens:
                matched_tokens.append(st_idx)

            if len(tk_tkn) < len(st_tkn):
                assert st_tkn.startswith(tk_tkn)
                st_tkn = st_tkn[len(tk_tkn):]
                tk_idx += 1
                tk_tkn = tokens[tk_idx].lower()
            elif len(tk_tkn) > len(st_tkn):
                assert tk_tkn.startswith(st_tkn)
                tk_tkn = tk_tkn[len(st_tkn):]
                st_idx += 1
                st_tkn = split_tokens[st_idx].text.lower()
            else:
                assert st_tkn == tk_tkn
                tk_idx += 1
                st_idx += 1
                if tk_idx == len(tokens):
                    assert st_idx == len(split_tokens)
                    break
                tk_tkn = tokens[tk_idx].lower()
                st_tkn = split_tokens[st_idx].text.lower()

        return matched_tokens

    @cls.textfield('word')
    def update_sentence_raw(self, fields, tokens) -> Field:
        indexers = {
            'word': PretrainedBertIndexer(pretrained_model='bert-base-uncased')
        }
        textfield = TextField(tokens, indexers)
        return textfield

    @cls.field('cancidate')
    def update_relation_cancidate(self, fields: Dict, raw_sample) -> Field:
        tokens, labels, relations = raw_sample
        if relations is None:
            return None
        relation_indices = []
        for relation_type, src_token, dst_token in relations:
            relation_indices.append(
                (src_token[self._entity_index], dst_token[self._entity_index]))
        return AdjacencyField(relation_indices,
                              fields[self.get_fieldname('word')],
                              padding_value=0)
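
For reference, a small worked illustration (not part of the original reader) of the alignment that `match` computes between the dataset tokens and the BERT basic-split tokens, assuming the class above is importable as written:

from allennlp.data.tokenizers.word_splitter import BertBasicWordSplitter

splitter = BertBasicWordSplitter()
tokens = ["Washington", "D.C.", "hosts", "conferences"]
split_tokens = splitter.split_words(" ".join(tokens))
# basic-split pieces: washington | d | . | c | . | hosts | conferences
# The span (1, 2) covers the single dataset token "D.C.", which corresponds to
# the four pieces at indices 1..4.
assert Conll04BERTBinaryReader.match((1, 2), tokens, split_tokens) == [1, 2, 3, 4]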
Example #11
    def test_end_to_end(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "The quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "The quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)

        assert len(tokens1) == 10
        assert len(tokens2) == 10

        tokens = [Token('[CLS]')] + tokens1 + [Token('[SEP]')] + tokens2

        assert len(tokens) == 22

        vocab = Vocabulary()

        instance = Instance(
            {"sentence_pair": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()

        tensor_dict = batch.as_tensor_dict(padding_lengths)

        tokens = tensor_dict["sentence_pair"]
        assert tokens['mask'].tolist()[0] == [1] * 22
        assert tokens["bert"].tolist()[0] == [
            101, 1996, 4248, 4355, 4248, 2829, 4419, 5598, 2058, 1996, 13971,
            3899, 102, 1996, 4248, 2829, 4419, 5598, 2058, 1996, 2474, 14272,
            3367, 13971, 17709, 2080
        ]
        assert [
            vocab.get_token_from_index(i, "bert")
            for i in tokens["bert"].tolist()[0]
        ] == [
            '[CLS]', 'the', 'quick', '##est', 'quick', 'brown', 'fox',
            'jumped', 'over', 'the', 'lazy', 'dog', '[SEP]', 'the', 'quick',
            'brown', 'fox', 'jumped', 'over', 'the', 'la', '##zie', '##st',
            'lazy', 'elm', '##o'
        ]
        assert len(tokens['bert'][0]) == 26
        assert tokens["bert-offsets"].tolist()[0] == [
            0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
            22, 23, 25
        ]
        assert tokens['bert-type-ids'].tolist()[0] == [0] * 13 + [1] * 13

        bert_vectors = self.token_embedder(
            tokens["bert"],
            offsets=tokens["bert-offsets"],
            token_type_ids=tokens['bert-type-ids'])
        assert list(bert_vectors.shape) == [1, 22, 768]
Example #12
def get_tokenizer(embedding_type: str, xlnet_vocab_file: Path) -> WordTokenizer:
    if embedding_type == 'bert':
        splitter = BertBasicWordSplitter()
    elif embedding_type == 'glove':
        splitter = SpacyWordSplitter()
    elif embedding_type == 'xlnet':
        splitter = XLNetWordSplitter(vocab_file=str(xlnet_vocab_file))
    else:
        raise ValueError(f'Embedding type {embedding_type} not available.')
    return WordTokenizer(word_splitter=splitter)
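
A possible call site (illustrative only; the vocab path is a placeholder and is only consulted for the 'xlnet' branch):

from pathlib import Path

tokenizer = get_tokenizer('bert', Path('/path/to/xlnet-vocab.model'))
tokens = tokenizer.tokenize("The quick brown fox jumped over the lazy dog")
print([token.text for token in tokens])
# ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']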
Example #13
    def __init__(self, word_indexer: Optional[TokenIndexer] = None):
        super().__init__(lazy=False)

        splitter = BertBasicWordSplitter()
        self.tokeniser = WordTokenizer(word_splitter=splitter)

        if word_indexer is None:
            word_indexer = PretrainedBertIndexer(
                pretrained_model='bert-base-uncased',
                truncate_long_sequences=False)
        self.word_indexers = {'tokens': word_indexer}
    def test_end_to_end(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "the quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)

        vocab = Vocabulary()

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": self.token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": self.token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        # 16 = [CLS], 17 = [SEP]
        assert tokens["bert"].tolist() == [
            [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 17, 0],
            [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17],
        ]

        assert tokens["bert-offsets"].tolist() == [
            [1, 3, 4, 5, 6, 7, 8, 9, 10, 11],
            [1, 2, 3, 4, 5, 6, 7, 10, 11, 12],
        ]

        # No offsets, should get 14 vectors back ([CLS] + 12 token wordpieces + [SEP])
        bert_vectors = self.token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 14, 12]

        # Offsets, should get 10 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"],
                                           offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]

        # Now try top_layer_only = True
        tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
        bert_vectors = tlo_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 14, 12]

        bert_vectors = tlo_embedder(tokens["bert"],
                                    offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]
 def __init__(self,
              max_instances: int = None,
              min_abstract_len: int = 10,
              lazy: bool = False,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              ) -> None:
     super().__init__(lazy)
     self.max_instances = max_instances
     self.min_abstract_len = min_abstract_len
     self._tokenizer = tokenizer or BertBasicWordSplitter()
     self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    def test_sliding_window(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=False,
                                              max_pieces=8)

        config_path = self.FIXTURES_ROOT / "bert" / "config.json"
        config = BertConfig(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance(
            {"tokens": TextField(tokens, {"bert": token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert tokens["bert"].tolist() == [[
            16, 2, 3, 4, 3, 5, 6, 17, 16, 3, 5, 6, 8, 9, 2, 17, 16, 8, 9, 2,
            14, 12, 17
        ]]
        assert tokens["bert-offsets"].tolist() == [[
            1, 3, 4, 5, 6, 7, 8, 9, 10, 11
        ]]

        bert_vectors = token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [1, 13, 12]

        # Testing without token_type_ids
        bert_vectors = token_embedder(tokens["bert"],
                                      offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [1, 10, 12]

        # Testing with token_type_ids
        bert_vectors = token_embedder(tokens["bert"],
                                      offsets=tokens["bert-offsets"],
                                      token_type_ids=tokens["bert-type-ids"])
        assert list(bert_vectors.shape) == [1, 10, 12]
Example #17
    def __init__(self,
                 pretrained_model: str,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_pieces: int = 512,
                 num_choices: int = 5,
                 answer_only: bool = False,
                 restrict_num_choices: int = None,
                 ignore_context: bool = False,
                 sample: int = -1,
                 random_seed: int = 0) -> None:
        super().__init__()

        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }
        lower_case = '-cased' not in pretrained_model
        self._word_splitter = BertBasicWordSplitter(do_lower_case=lower_case)
        self._max_pieces = max_pieces
        self._sample = sample
        self._num_choices = num_choices
        self._answer_only = answer_only
        self._restrict_num_choices = restrict_num_choices
        self._ignore_context = ignore_context
        self._random_seed = random_seed
Example #18
    def test_end_to_end_with_higher_order_inputs(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "the quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)
        text_field1 = TextField(tokens1, {"bert": self.token_indexer})

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)
        text_field2 = TextField(tokens2, {"bert": self.token_indexer})

        #            2   5    15 10 11 6
        sentence3 = "the brown laziest fox"
        tokens3 = tokenizer.tokenize(sentence3)
        text_field3 = TextField(tokens3, {"bert": self.token_indexer})

        vocab = Vocabulary()

        instance1 = Instance({"tokens": ListField([text_field1])})
        instance2 = Instance({"tokens": ListField([text_field2, text_field3])})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths, verbose=True)
        tokens = tensor_dict["tokens"]

        # No offsets, should get 12 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 2, 12, 12]

        # Offsets, should get 10 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"],
                                           offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 2, 10, 12]

        ## Now try top_layer_only = True
        tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
        bert_vectors = tlo_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 2, 12, 12]

        bert_vectors = tlo_embedder(tokens["bert"],
                                    offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 2, 10, 12]
Example #19
    def __init__(self,
                 label_field: str,
                 text_field: str,
                 paper_lookup_path: str,
                 sent_max_len: int = 256,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or BertBasicWordSplitter()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._label_field = label_field
        self._text_field = text_field
        self._sent_max_len = sent_max_len

        self._paper_lookup = self.load_paper_lookup(paper_lookup_path)
class TestBertBasicWordSplitter(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.word_splitter = BertBasicWordSplitter()

    def test_never_split(self):
        sentence = "[unused0] [UNK] [SEP] [PAD] [CLS] [MASK]"
        expected_tokens = ["[", "unused0", "]", "[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]
        tokens = [token.text for token in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_do_lower_case(self):
        # BertBasicWordSplitter lowercases every token not in `never_split` by default
        word_splitter = BertBasicWordSplitter(never_split=["[UNUSED0]"])
        sentence = "[UNUSED0] [UNK] [unused0]"
        expected_tokens = ["[UNUSED0]", "[", "unk", "]", "[", "unused0", "]"]
        tokens = [token.text for token in word_splitter.split_words(sentence)]
        assert tokens == expected_tokens
Example #21
    def test_truncate_window_dont_split_wordpieces(self):
        """
        Tests that the sentence is not truncated in the middle of a word that
        has two or more wordpieces.
        """

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the quickest dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=True,
                                              max_pieces=12)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
        # We could fit one more piece here, but we don't, to avoid cutting
        # a word in the middle
        assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9]
        assert indexed_tokens["bert-type-ids"] == [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ]

        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=False,
                                              max_pieces=12)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
        # We could fit one more piece here, but we don't, to avoid cutting
        # a word in the middle
        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9]
Example #22
    def test_truncate_window_fit_two_wordpieces(self):
        """
        Tests that both `use_starting_offsets` options work properly when the
        last word in the truncated sentence consists of two wordpieces.
        """

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the quickest dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=True,
                                              max_pieces=13)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [
            16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17
        ]
        assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9, 10]
        assert indexed_tokens["bert-type-ids"] == [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ]

        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=False,
                                              max_pieces=13)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [
            16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17
        ]
        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9, 11]
Example #23
    def test_read(self, lazy):
        reader = SnliReader(
            tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
            token_indexers={
                'bert':
                PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH)
            },
        )

        instances = reader.read(
            str(self.FIXTURES_ROOT / 'snli_1.0_sample.jsonl'))
        instances = ensure_list(instances)
        example = instances[0]
        tokens = [t.text for t in example.fields['tokens'].tokens]
        label = example.fields['label'].label
        weight = example.fields['weight'].weight
        assert label == 'neutral'
        assert weight == 1
        assert instances[1].fields['weight'].weight == 0.5
        assert instances[2].fields['weight'].weight == 1
        assert tokens == [
            'a', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken',
            'down', 'airplane', '.', '[SEP]', 'a', 'person', 'is', 'training',
            'his', 'horse', 'for', 'a', 'competition', '.'
        ]
        batch = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        print(tokens['mask'].tolist()[0])
        print(tokens["bert"].tolist()[0])
        print([
            vocab.get_token_from_index(i, "bert")
            for i in tokens["bert"].tolist()[0]
        ])
        print(len(tokens['bert'][0]))
        print(tokens["bert-offsets"].tolist()[0])
        print(tokens['bert-type-ids'].tolist()[0])
    def test_max_length(self):
        config = BertConfig(len(self.token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
        sentence = "the " * 1000
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        embedder(tokens["bert"], tokens["bert-offsets"])
Example #25
def set_values(max_sequence_length: Optional[int] = -1,
               concat_title_abstract: Optional[bool] = None,
               data_source: Optional[str] = None,
               included_text_fields: Optional[str] = None) -> None:
    # set global values
    # note: a class with __init__ would have been a better design; we use this
    # structure for efficiency, to support multiprocessing, since
    # multiprocessing with class methods is slower
    global _tokenizer
    global _token_indexers
    global _token_indexer_author_id
    global _token_indexer_author_position
    global _token_indexer_venue
    global _token_indexer_id
    global _max_sequence_length
    global _concat_title_abstract
    global _data_source
    global _included_text_fields

    if _tokenizer is None:  # if not initialized, initialize the tokenizers and token indexers
        _tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter(
            do_lower_case=bert_params["do_lowercase"]))
        _token_indexers = {
            "bert": PretrainedBertIndexer.from_params(Params(bert_params))
        }
        _token_indexer_author_id = {
            "tokens": SingleIdTokenIndexer(namespace='author')
        }
        _token_indexer_author_position = {
            "tokens": SingleIdTokenIndexer(namespace='author_positions')
        }
        _token_indexer_venue = {
            "tokens": SingleIdTokenIndexer(namespace='venue')
        }
        _token_indexer_id = {"tokens": SingleIdTokenIndexer(namespace='id')}
    _max_sequence_length = max_sequence_length
    _concat_title_abstract = concat_title_abstract
    _data_source = data_source
    _included_text_fields = included_text_fields
Example #26
 def word_embeddings(self):
     words = re.split(r'\W+', self.text)
     text = ' '.join(words)

     tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

     tokens = tokenizer.tokenize(text)
     vocab = Vocabulary()
     token_indexer = PretrainedBertIndexer('bert-base-uncased')

     instance = Instance({"tokens": TextField(tokens, {'bert': token_indexer})})
     batch = Batch([instance])
     batch.index_instances(vocab)

     padding_lengths = batch.get_padding_lengths()
     tensor_dict = batch.as_tensor_dict(padding_lengths)

     token_tensors = tensor_dict["tokens"]

     model = PretrainedBertEmbedder('bert-base-uncased')
     bert_vectors = model(token_tensors["bert"])
     return bert_vectors
Example #27
    def __init__(self,
                 text_lookup_path: str,
                 embedded_text: str = 'title',
                 use_bos_eos: bool = True,
                 lazy: bool = False,
                 sent_len_limit: int = None,
                 abstract_tokenizer: Tokenizer = None,
                 abstract_indexers: Dict[str, TokenIndexer] = None,
                 sequence_tokenizer: Tokenizer = None,
                 sequence_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        if embedded_text == 'title':
            with open(text_lookup_path) as f:
                self.data_lookup = {
                    line[0]: {
                        'abstract': line[2]
                    }
                    for line in map(lambda x: x.strip().split('\t'),
                                    f.readlines()) if len(line) > 2
                }
        elif embedded_text == 'abstract':
            with jsonlines.open(text_lookup_path) as reader:
                self.data_lookup = {item['paper_id']: item for item in reader}
        # Add these now so we can find them in the lookup, then replace with
        # [unused0] and [unused1] in text_to_instance method
        self.data_lookup['<s>'] = {'abstract': '<s>'}
        self._sent_len_limit = sent_len_limit
        self._abstract_tokenizer = abstract_tokenizer or BertBasicWordSplitter()
        self._abstract_indexers = abstract_indexers

        self._sequence_tokenizer = sequence_tokenizer or JustSpacesWordSplitter()
        self._sequence_indexers = sequence_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
Example #28
    def __init__(self,
                 is_bert: bool,
                 conceptnet_path: Path,
                 word_indexer: Optional[TokenIndexer] = None):
        super().__init__(lazy=False)

        if is_bert:
            splitter = BertBasicWordSplitter()
        else:
            splitter = SpacyWordSplitter()
        self.tokeniser = WordTokenizer(word_splitter=splitter)

        if word_indexer is None:
            if is_bert:
                word_indexer = PretrainedBertIndexer(
                    pretrained_model='bert-base-uncased',
                    truncate_long_sequences=True)
            else:
                word_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
        self.word_indexers = {'tokens': word_indexer}

        # self.rel_indexers = {
        #     "rel_tokens": SingleIdTokenIndexer(namespace='rel_tokens')}
        self.conceptnet = ConceptNet(conceptnet_path=conceptnet_path)
Example #29
        if label == pred_label:
            correct += 1
    print(
        f'Accuracy: {correct}/{len(labels)} = {correct/len(labels)*100:.2f}%')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='CSQA using BERT NSP model')
    parser.add_argument('--input', help='input dataset')
    parser.add_argument('--bert-vocab', help='bert vocab file')
    parser.add_argument('--bert-model', help='pretrained bert model')
    parser.add_argument('--batch-size',
                        type=int,
                        default=8,
                        help='batch size for BERT')
    parser.add_argument('--gpu-id', '-g', type=int, default=0, help='GPU ID')

    args = parser.parse_args()

    print('Initialize BERT model...')

    TOKENIZER = WordTokenizer(word_splitter=BertBasicWordSplitter())
    WORD_INDEXER = PretrainedBertIndexer(pretrained_model=args.bert_vocab)
    VOCAB = Vocabulary()
    GPU_ID = args.gpu_id
    BERT_NEXT_SENTENCE = BertForNextSentencePrediction.from_pretrained(
        args.bert_model).to(torch.device(f"cuda:{GPU_ID}"))
    BERT_NEXT_SENTENCE.eval()

    main()
Example #30
class BertMCQAReader(DatasetReader):
    """
    Reads a file from the AllenAI-V1-Feb2018 dataset in JSON format.  This data is
    formatted as jsonl, one json-formatted instance per line.  An example of the json in the data is:
        {"id":"MCAS_2000_4_6",
        "question":{"stem":"Which technology was developed most recently?",
            "choices":[
                {"text":"cellular telephone","label":"A"},
                {"text":"television","label":"B"},
                {"text":"refrigerator","label":"C"},
                {"text":"airplane","label":"D"}
            ]},
        "answerKey":"A"
        }
    Parameters
    ----------
    tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
        We use this ``Tokenizer`` for both the premise and the hypothesis.  See :class:`Tokenizer`.
    token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
        We similarly use this for both the premise and the hypothesis.  See :class:`TokenIndexer`.
    """
    def __init__(self,
                 pretrained_model: str,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_pieces: int = 512,
                 num_choices: int = 5,
                 answer_only: bool = False,
                 restrict_num_choices: int = None,
                 ignore_context: bool = False,
                 sample: int = -1,
                 random_seed: int = 0) -> None:
        super().__init__()

        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }
        lower_case = '-cased' not in pretrained_model
        self._word_splitter = BertBasicWordSplitter(do_lower_case=lower_case)
        self._max_pieces = max_pieces
        self._sample = sample
        self._num_choices = num_choices
        self._answer_only = answer_only
        self._restrict_num_choices = restrict_num_choices
        self._ignore_context = ignore_context
        self._random_seed = random_seed

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        counter = self._sample + 1
        debug = 5

        with open(file_path, 'r') as data_file:
            logger.info("Reading QA instances from jsonl dataset at: %s",
                        file_path)
            instances = []
            for line in data_file:
                counter -= 1
                debug -= 1
                if counter == 0:
                    break
                item_json = json.loads(line.strip())

                if debug > 0:
                    logger.info(item_json)

                item_id = item_json["id"]
                context = item_json.get("para")
                if self._ignore_context:
                    context = None
                question_text = item_json["question"]["stem"]

                if self._answer_only:
                    question_text = ""

                choice_label_to_id = {}
                choice_text_list = []
                choice_context_list = []

                any_correct = False
                choice_id_correction = 0

                for choice_id, choice_item in enumerate(
                        item_json["question"]["choices"]):
                    if self._restrict_num_choices and len(
                            choice_text_list) == self._restrict_num_choices:
                        if not any_correct:
                            choice_text_list.pop(-1)
                            choice_context_list.pop(-1)
                            choice_id_correction += 1
                        else:
                            break

                    choice_label = choice_item["label"]
                    choice_label_to_id[
                        choice_label] = choice_id - choice_id_correction

                    choice_text = choice_item["text"]
                    choice_context = choice_item.get("para")
                    if self._ignore_context:
                        choice_context = None

                    choice_text_list.append(choice_text)
                    choice_context_list.append(choice_context)

                    if item_json.get('answerKey') == choice_label:
                        if any_correct:
                            raise ValueError(
                                f"More than one correct answer found for {item_json}!"
                            )
                        any_correct = True

                    if self._restrict_num_choices \
                            and len(choice_text_list) == self._restrict_num_choices \
                            and not any_correct:
                        continue

                if not any_correct and 'answerKey' in item_json:
                    raise ValueError(
                        f"No correct answer found for {item_json}!")

                answer_id = choice_label_to_id[item_json["answerKey"]]
                # Pad choices with empty strings if not right number
                if len(choice_text_list) != self._num_choices:
                    choice_text_list = (
                        choice_text_list +
                        self._num_choices * [''])[:self._num_choices]
                    choice_context_list = (
                        choice_context_list +
                        self._num_choices * [None])[:self._num_choices]
                    if answer_id >= self._num_choices:
                        logging.warning(
                            f"Skipping question with more than {self._num_choices} answers: {item_json}"
                        )
                        continue

                instances.append(
                    self.text_to_instance(item_id, question_text,
                                          choice_text_list, answer_id, context,
                                          choice_context_list, debug))

            random.seed(self._random_seed)
            random.shuffle(instances)
            for instance in instances:
                yield instance

    @overrides
    def text_to_instance(
            self,  # type: ignore
            item_id: str,
            question: str,
            choice_list: List[str],
            answer_id: int = None,
            context: str = None,
            choice_context_list: List[str] = None,
            debug: int = -1) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        pair_fields = []
        pair_tokens_list = []
        choice1_index_fields = []
        choice2_index_fields = []

        for index1, index2 in itertools.permutations(range(len(choice_list)),
                                                     2):
            choice1, choice2 = (choice_list[index1], choice_list[index2])
            # TODO: What to do if contexts are not none?
            assert context is None
            if choice_context_list is not None:
                assert all(map(lambda x: x is None, choice_context_list))
            pair_tokens = self.bert_features_from_q_2a(question, choice1,
                                                       choice2)
            pair_field = TextField(pair_tokens, self._token_indexers)
            choice1_index_field = LabelField(index1, skip_indexing=True)
            choice2_index_field = LabelField(index2, skip_indexing=True)
            pair_fields.append(pair_field)
            pair_tokens_list.append(pair_tokens)
            choice1_index_fields.append(choice1_index_field)
            choice2_index_fields.append(choice2_index_field)
            if debug > 0:
                logger.info(f"qa_tokens = {pair_tokens}")

        fields['question'] = ListField(pair_fields)
        fields['choice1_indexes'] = ListField(choice1_index_fields)
        fields['choice2_indexes'] = ListField(choice2_index_fields)

        if answer_id is not None:
            fields['label'] = LabelField(answer_id, skip_indexing=True)

        metadata = {
            "id": item_id,
            "question_text": question,
            "choice_text_list": choice_list,
            "correct_answer_index": answer_id,
            "question_tokens_list": pair_tokens_list
        }

        if debug > 0:
            logger.info(f"answer_id = {answer_id}")

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)

    @staticmethod
    def _truncate_tokens(tokens_a, tokens_b, tokens_c, max_length):
        """
        Truncate 'a' from the start, 'b' from the start, and 'c' from the end
        until the total length no longer exceeds max_length.
        At each step, truncate the longest one.
        """
        while len(tokens_a) + len(tokens_b) + len(tokens_c) > max_length:
            reduction_candidate = numpy.argmax(
                [len(tokens_a), len(tokens_b), len(tokens_c)])
            if reduction_candidate == 0:
                # 'a' is the longest
                tokens_a.pop(0)
            elif reduction_candidate == 1:
                # 'b' is the longest
                tokens_b.pop(0)
            else:
                # 'c' is the longest
                tokens_c.pop()
        return tokens_a, tokens_b, tokens_c

    def bert_features_from_q_2a(self,
                                question: str,
                                answer1: str,
                                answer2: str,
                                context: str = None):
        #TODO: What should we do if context is not None (where to append it?)
        assert context is None

        sep_token = Token("[SEP]")
        question_tokens = self._word_splitter.split_words(question)

        choice1_tokens = self._word_splitter.split_words(answer1)
        choice2_tokens = self._word_splitter.split_words(answer2)
        question_tokens, choice1_tokens, choice2_tokens = self._truncate_tokens(
            question_tokens, choice1_tokens, choice2_tokens,
            self._max_pieces - 2)

        tokens = choice1_tokens + [sep_token] + question_tokens + [
            sep_token
        ] + choice2_tokens
        return tokens
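
As a quick sanity check of the truncation helper above (illustrative, not part of the original reader), at each step the longest of the three lists is trimmed by one token until the total fits:

tokens_a, tokens_b, tokens_c = BertMCQAReader._truncate_tokens(
    list("aaaaa"), list("bbb"), list("cc"), max_length=8)
assert len(tokens_a) + len(tokens_b) + len(tokens_c) <= 8
assert tokens_b == list("bbb") and tokens_c == list("cc")  # only 'a' was trimmed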