Пример #1
0
    def test_elmo_empty_token_list(self):
        """
        Check that the extra target slot created when batching `ListField`s
        of different lengths is indexed to all zeros, while every real
        target is not.
        """
        token_indexers = {'elmo': CustomELMoTokenCharactersIndexer()}

        def text_field(*words):
            # All fields share the one indexer mapping built above.
            return TextField([Token(word) for word in words], token_indexers)

        first = Instance({
            'tokens': text_field('Apple'),
            'targets': ListField([text_field('Apple')]),
        })
        second = Instance({
            'tokens': text_field('Screen', 'device'),
            'targets': ListField([text_field('Screen'),
                                  text_field('Device')]),
        })
        batch = Batch([first, second])
        batch.index_instances(Vocabulary())
        target_ids = batch.as_tensor_dict()['targets']['elmo']['tokens']

        all_zeros = np.zeros((1, 50))
        # The first instance has one target and the second has two, so
        # position [0][1] is the slot added to even out the batch.
        np.testing.assert_array_equal(all_zeros, target_ids[0][1].numpy())

        # Each real target must differ from the all-zero array, i.e. the
        # equality assertion itself has to fail.
        real_targets = [target_ids[row][col]
                        for row, col in ((0, 0), (1, 0), (1, 1))]
        for real_target in real_targets:
            with pytest.raises(AssertionError):
                np.testing.assert_array_equal(all_zeros, real_target)
Пример #2
0
    def forward_on_instances(self, instances: List[Instance],
                             **kwargs) -> List[Dict[str, np.ndarray]]:
        """
        Run the model on `instances` as one batch and split the result into
        one output dict per instance.

        Any extra keyword arguments are passed through to the model call
        together with the batched tensors. Output entries whose leading
        dimension (or length) does not equal the number of instances are
        reported via `_maybe_warn_for_unseparable_batches` and left out.

        # Parameters

        instances : `List[Instance]`
            The instances to batch, index and feed to the model.
        **kwargs
            Additional keyword arguments forwarded to the model call.

        # Returns

        A list with one dict per instance, keyed by output name.
        """
        expected_size = len(instances)
        with torch.no_grad():
            device = self._get_prediction_device()
            batch = Batch(instances)
            batch.index_instances(self.vocab)
            tensors = util.move_to_device(batch.as_tensor_dict(), device)
            raw_outputs = self(**tensors, **kwargs)
            outputs = self.make_output_human_readable(raw_outputs)

            per_instance: List[Dict[str, np.ndarray]] = [
                {} for _ in batch.instances
            ]
            for key, value in list(outputs.items()):
                is_tensor = isinstance(value, torch.Tensor)
                # 0-dim tensors are not iterable, so give them a batch axis.
                if is_tensor and value.dim() == 0:
                    value = value.unsqueeze(0)

                leading = value.size(0) if is_tensor else len(value)
                if leading != expected_size:
                    self._maybe_warn_for_unseparable_batches(key)
                    continue

                if is_tensor:
                    value = value.detach().cpu().numpy()
                for slot, element in zip(per_instance, value):
                    slot[key] = element
            return per_instance
Пример #3
0
 def test_elmo_empty_token_list(self):
     """
     Check `get_empty_token_list` directly, then check that the extra
     target slot created when batching is indexed to all zeros.
     """
     # Basic test
     elmo_indexer = ELMoTokenCharactersIndexer()
     assert {"elmo_tokens": []} == elmo_indexer.get_empty_token_list()

     # Real world test
     token_indexers = {"elmo": elmo_indexer}

     def text_field(*words):
         # All fields share the one indexer mapping built above.
         return TextField([Token(word) for word in words], token_indexers)

     first = Instance({
         "tokens": text_field("Apple"),
         "targets": ListField([text_field("Apple")]),
     })
     second = Instance({
         "tokens": text_field("Screen", "device"),
         "targets": ListField([text_field("Screen"),
                               text_field("Device")]),
     })
     batch = Batch([first, second])
     batch.index_instances(Vocabulary())
     elmo_tensors = batch.as_tensor_dict()["targets"]["elmo"]
     target_ids = elmo_tensors["elmo_tokens"]

     all_zeros = np.zeros((1, 50))
     # The empty TextField should have been built from
     # `get_empty_token_list` and then padded with zeros.
     np.testing.assert_array_equal(all_zeros, target_ids[0][1].numpy())

     # Each real target must differ from the all-zero array, i.e. the
     # equality assertion itself has to fail.
     real_targets = [target_ids[row][col]
                     for row, col in ((0, 0), (1, 0), (1, 1))]
     for real_target in real_targets:
         with pytest.raises(AssertionError):
             np.testing.assert_array_equal(all_zeros, real_target)
Пример #4
0
    def test_end_to_end(self, train_parameters: bool, last_layer_only: bool):
        """
        Tokenize two sentences, index and batch them, and run them through a
        `pretrained_transformer` embedder built from params, checking the
        wordpieces, tensor shapes, mask and `requires_grad` of the result.
        """
        model_name = "bert-base-uncased"
        tokenizer = PretrainedTransformerTokenizer(model_name=model_name)
        token_indexer = PretrainedTransformerIndexer(model_name=model_name)

        tokens1 = tokenizer.tokenize("A, AllenNLP sentence.")
        assert [t.text for t in tokens1] == [
            "[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"
        ]

        tokens2 = tokenizer.tokenize("AllenNLP is great")
        assert [t.text for t in tokens2] == [
            "[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"
        ]

        vocab = Vocabulary()

        embedder_config = {
            "type": "pretrained_transformer",
            "model_name": "bert-base-uncased",
            "train_parameters": train_parameters,
            "last_layer_only": last_layer_only,
        }
        params = Params({"token_embedders": {"bert": embedder_config}})
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        batch = Batch([
            Instance({"tokens": TextField(toks, {"bert": token_indexer})})
            for toks in (tokens1, tokens2)
        ])
        batch.index_instances(vocab)

        tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
        tokens = tensor_dict["tokens"]
        longest = max(len(tokens1), len(tokens2))

        assert tokens["bert"]["token_ids"].shape == (2, longest)

        # The second sentence is two wordpieces shorter than the first.
        assert tokens["bert"]["mask"].tolist() == [
            [True] * 9,
            [True] * 7 + [False] * 2,
        ]

        # Run the embedder on the batched tensors.
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 9, 768)
        expects_grad = train_parameters or not last_layer_only
        assert bert_vectors.requires_grad == expects_grad
Пример #5
0
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Turn a batch of tokenized sentences into a tensor of encoded characters
    of shape (len(batch), max sentence length, max word length).

    # Parameters

    batch : `List[List[str]]`, required
        A list of tokenized sentences.

    # Returns

        A tensor of padded character ids.
    """
    indexer = ELMoTokenCharactersIndexer()
    # One single-field instance per sentence, all sharing the same indexer.
    instances = [
        Instance({
            "elmo": TextField([Token(word) for word in sentence],
                              {"character_ids": indexer})
        })
        for sentence in batch
    ]

    padded = Batch(instances)
    padded.index_instances(Vocabulary())
    elmo_tensors = padded.as_tensor_dict()["elmo"]
    return elmo_tensors["character_ids"]["tokens"]
Пример #6
0
    def test_end_to_end_t5(
        self,
        train_parameters: bool,
        last_layer_only: bool,
        gradient_checkpointing: bool,
    ):
        """
        Tokenize two sentences with a tiny random T5 model, index and batch
        them, and run them through the `encoder` sub-module of a
        `pretrained_transformer` embedder, checking wordpieces, shapes, mask
        and `requires_grad` of the result.
        """
        model_name = "patrickvonplaten/t5-tiny-random"
        tokenizer = PretrainedTransformerTokenizer(model_name=model_name)
        token_indexer = PretrainedTransformerIndexer(model_name=model_name)

        tokens1 = tokenizer.tokenize("A, AllenNLP sentence.")
        assert [t.text for t in tokens1] == [
            "▁A", ",", "▁Allen", "N", "LP", "▁sentence", ".", "</s>"
        ]

        tokens2 = tokenizer.tokenize("AllenNLP is great")
        assert [t.text for t in tokens2] == [
            "▁Allen", "N", "LP", "▁is", "▁great", "</s>"
        ]

        vocab = Vocabulary()

        embedder_config = {
            "type": "pretrained_transformer",
            "model_name": "patrickvonplaten/t5-tiny-random",
            "train_parameters": train_parameters,
            "last_layer_only": last_layer_only,
            "gradient_checkpointing": gradient_checkpointing,
            "sub_module": "encoder",
        }
        params = Params({"token_embedders": {"bert": embedder_config}})
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

        batch = Batch(
            [
                Instance({"tokens": TextField(toks, {"bert": token_indexer})})
                for toks in (tokens1, tokens2)
            ]
        )
        batch.index_instances(vocab)

        tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
        tokens = tensor_dict["tokens"]
        longest = max(len(tokens1), len(tokens2))

        assert tokens["bert"]["token_ids"].shape == (2, longest)

        # The second sentence is two wordpieces shorter than the first.
        assert tokens["bert"]["mask"].tolist() == [
            [True] * 8,
            [True] * 6 + [False] * 2,
        ]

        # Run the embedder on the batched tensors.
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 8, 64)
        expects_grad = train_parameters or not last_layer_only
        assert bert_vectors.requires_grad == expects_grad
Пример #7
0
 def test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly(self):
     """
     Run one forward pass over the fixture instances and check the shape
     of the returned class probabilities.
     """
     batch = Batch(self.instances)
     batch.index_instances(self.vocab)
     outputs = self.model(**batch.as_tensor_dict())
     class_probabilities = outputs["class_probabilities"]
     num_labels = self.model.vocab.get_vocab_size("labels")
     assert class_probabilities.size() == (2, 7, num_labels)
Пример #8
0
 def convert_documents_to_batch(self, documents: List[Tuple[List[Token],
                                                            List[Token]]],
                                vocabulary) -> Dict[str, Any]:
     """
     Convert each document to an instance via `convert_tokens_to_instance`,
     index the resulting batch with `vocabulary`, and return the
     "document" entry of the batch's tensor dict.
     """
     instances = [
         self.convert_tokens_to_instance(document) for document in documents
     ]
     indexed = Batch(instances)
     indexed.index_instances(vocabulary)
     tensor_dict = indexed.as_tensor_dict()
     return tensor_dict["document"]
    def test_long_sequence_splitting_end_to_end(self):
        """
        Same flow as the plain end-to-end test but with `max_length=4` on
        both indexer and embedder: splitting into segments must not change
        the input/output format seen by the caller.
        """
        model_name = "bert-base-uncased"
        segment_size = 4

        tokenizer = PretrainedTransformerTokenizer(model_name=model_name)
        token_indexer = PretrainedTransformerIndexer(model_name=model_name,
                                                     max_length=segment_size)

        tokens1 = tokenizer.tokenize("A, AllenNLP sentence.")
        tokens2 = tokenizer.tokenize("AllenNLP is great")

        vocab = Vocabulary()

        embedder_config = {
            "type": "pretrained_transformer",
            "model_name": "bert-base-uncased",
            "max_length": segment_size,
        }
        params = Params({"token_embedders": {"bert": embedder_config}})
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        batch = Batch([
            Instance({"tokens": TextField(toks, {"bert": token_indexer})})
            for toks in (tokens1, tokens2)
        ])
        batch.index_instances(vocab)

        tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
        tokens = tensor_dict["tokens"]
        longest = max(len(tokens1), len(tokens2))

        # Adds n_segments * 2 special tokens
        n_segments = int(math.ceil(longest / segment_size))
        segment_concat_length = n_segments * 2 + longest
        assert tokens["bert"]["token_ids"].shape == (2, segment_concat_length)

        assert tokens["bert"]["mask"].tolist() == [
            [1] * 9,
            [1] * 7 + [0] * 2,
        ]
        # 4 is the hard-coded length difference between the two sentences
        # after segment splitting.
        shorter_row = [1] * (segment_concat_length - 4) + [0] * 4
        assert tokens["bert"]["segment_concat_mask"].tolist() == [
            [1] * segment_concat_length,
            shorter_row,
        ]

        # Run the embedder on the batched tensors.
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 9, 768)
Пример #10
0
def transform_collate(
        vocab,  # Use vocab to index the transformed instances
        reader,  # call reader's function to transform instances
        transform: Callable,
        instances: List[Instance]) -> TensorDict:
    """
    Transform `instances` through `reader.transform_instances`, index the
    result with `vocab`, and return it as a tensor dict padded to the
    batch's own padding lengths.
    """
    transformed = Batch(reader.transform_instances(transform, instances))
    transformed.index_instances(vocab)
    padding_lengths = transformed.get_padding_lengths()
    return transformed.as_tensor_dict(padding_lengths)
Пример #11
0
    def test_as_tensor_dict(self):
        """
        Index and pad the fixture instances and compare the token id arrays
        of both text fields against the expected values.
        """
        batch = Batch(self.instances)
        batch.index_instances(self.vocab)
        tensors = batch.as_tensor_dict(batch.get_padding_lengths())

        def token_ids(field_name):
            # Pull the single-id token tensor of a field out as a numpy array.
            field_tensor = tensors[field_name]["tokens"]["tokens"]
            return field_tensor.detach().cpu().numpy()

        text1 = token_ids("text1")
        text2 = token_ids("text2")

        expected_text1 = numpy.array([[2, 3, 4, 5, 6], [1, 3, 4, 5, 6]])
        numpy.testing.assert_array_almost_equal(text1, expected_text1)
        expected_text2 = numpy.array([[2, 3, 4, 1, 5, 6], [2, 3, 1, 0, 0, 0]])
        numpy.testing.assert_array_almost_equal(text2, expected_text2)
    def test_end_to_end_for_first_sub_token_embedding(self,
                                                      sub_token_mode: str):
        """
        Index two word-level sentences with the mismatched indexer, embed
        them with a `pretrained_transformer_mismatched` embedder using the
        given `sub_token_mode`, and check mask, offsets, output shape and
        the absence of NaNs.
        """
        token_indexer = PretrainedTransformerMismatchedIndexer(
            "bert-base-uncased")

        sentence1 = ["A", ",", "AllenNLP", "sentence", "."]
        sentence2 = ["AllenNLP", "is", "open", "source", "NLP", "library"]

        vocab = Vocabulary()

        embedder_config = {
            "type": "pretrained_transformer_mismatched",
            "model_name": "bert-base-uncased",
            "sub_token_mode": sub_token_mode,
        }
        params = Params({"token_embedders": {"bert": embedder_config}})
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        batch = Batch([
            Instance({
                "tokens": TextField([Token(word) for word in sentence],
                                    {"bert": token_indexer})
            })
            for sentence in (sentence1, sentence2)
        ])
        batch.index_instances(vocab)

        tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
        tokens = tensor_dict["tokens"]

        # The first sentence is one word shorter than the second.
        assert tokens["bert"]["mask"].tolist() == [
            [True] * 5 + [False],
            [True] * 6,
        ]

        assert tokens["bert"]["offsets"].tolist() == [
            [[1, 1], [2, 2], [3, 5], [6, 6], [7, 7], [0, 0]],
            [[1, 3], [4, 4], [5, 5], [6, 6], [7, 8], [9, 9]],
        ]

        # Run the embedder on the batched tensors.
        bert_vectors = token_embedder(tokens)

        longest = max(len(sentence1), len(sentence2))
        assert bert_vectors.size() == (2, longest, 768)
        assert not torch.isnan(bert_vectors).any()
Пример #13
0
 def test_padding_lengths_uses_max_instance_lengths(self):
     """
     Check the padding lengths reported for the indexed fixture batch.
     """
     batch = Batch(self.instances)
     batch.index_instances(self.vocab)
     expected = {
         "text1": {"tokens___tokens": 5},
         "text2": {"tokens___tokens": 6},
     }
     assert batch.get_padding_lengths() == expected
Пример #14
0
    def test_end_to_end(self):
        """
        Index two sentences with the fixture BERT indexer and check the
        wordpiece ids, the offsets, and the output shapes of the embedder
        with and without offsets, for both the fixture embedder and one
        built with `top_layer_only=True`.
        """
        pre_tokenizer = BertPreTokenizer()

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "the quickest quick brown fox jumped over the lazy dog"
        tokens1 = pre_tokenizer.tokenize(sentence1)

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = pre_tokenizer.tokenize(sentence2)

        vocab = Vocabulary()

        batch = Batch([
            Instance({"tokens": TextField(toks, {"bert": self.token_indexer})})
            for toks in (tokens1, tokens2)
        ])
        batch.index_instances(vocab)

        tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
        bert_tensors = tensor_dict["tokens"]["bert"]
        input_ids = bert_tensors["input_ids"]
        offsets = bert_tensors["offsets"]

        # 16 = [CLS], 17 = [SEP]
        assert input_ids.tolist() == [
            [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 17, 0],
            [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17],
        ]

        assert offsets.tolist() == [
            [1, 3, 4, 5, 6, 7, 8, 9, 10, 11],
            [1, 2, 3, 4, 5, 6, 7, 10, 11, 12],
        ]

        def check_output_shapes(embedder):
            # No offsets, should get 14 vectors back
            # ([CLS] + 12 token wordpieces + [SEP])
            vectors = embedder(input_ids)
            assert list(vectors.shape) == [2, 14, 12]

            # Offsets, should get 10 vectors back.
            vectors = embedder(input_ids, offsets=offsets)
            assert list(vectors.shape) == [2, 10, 12]

        check_output_shapes(self.token_embedder)

        # Now try top_layer_only = True
        check_output_shapes(BertEmbedder(self.bert_model, top_layer_only=True))
Пример #15
0
    def get_gradients(
            self, instances: List[Instance]
    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """
        Gets the gradients of the loss with respect to the model inputs.

        # Parameters

        instances : `List[Instance]`
            The instances to batch together and run through the model.

        # Returns

        `Tuple[Dict[str, Any], Dict[str, Any]]`
            The first item is a Dict of gradient entries for each input.
            The keys have the form  `{grad_input_1: ..., grad_input_2: ... }`
            up to the number of inputs given. The second item is the model's output.

        # Notes

        Registers gradient hooks via `_register_embedding_gradient_hooks`,
        batches and indexes the instances with the model's vocabulary, sends
        them through the model's `forward` function, calls `backward` on the
        loss and then removes the hooks. The hooks are removed even when the
        forward or backward pass raises, so a failed call does not leave
        stale hooks registered on the model.
        """
        embedding_gradients: List[Tensor] = []
        hooks: List[RemovableHandle] = self._register_embedding_gradient_hooks(
            embedding_gradients)

        try:
            dataset = Batch(instances)
            dataset.index_instances(self._model.vocab)
            dataset_tensor_dict = util.move_to_device(dataset.as_tensor_dict(),
                                                      self.cuda_device)
            # To bypass "RuntimeError: cudnn RNN backward can only be called in training mode"
            with backends.cudnn.flags(enabled=False):
                outputs = self._model.make_output_human_readable(
                    self._model.forward(**dataset_tensor_dict)  # type: ignore
                )

                loss = outputs["loss"]
                self._model.zero_grad()
                loss.backward()
        finally:
            # Always detach the hooks, including on failure.
            for hook in hooks:
                hook.remove()

        # Keys are 1-based: grad_input_1, grad_input_2, ...
        grad_dict = {
            f"grad_input_{idx}": grad.detach().cpu().numpy()
            for idx, grad in enumerate(embedding_gradients, start=1)
        }

        return grad_dict, outputs
Пример #16
0
    def forward_on_instances(
            self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
        """
        Run the model on `instances` as one batch and split the result into
        one output dict per instance.

        The instances are batched and indexed with this model's vocabulary,
        moved to the prediction device, passed through the model and then
        through `self.decode`. Tensors in the output are converted to numpy
        arrays. Output entries whose leading dimension (or length) does not
        equal the number of instances are reported via
        `_maybe_warn_for_unseparable_batches` and left out.

        # Parameters

        instances : List[Instance], required
            The instances to run the model on.

        # Returns

        A list of the models output for each instance.
        """
        expected_size = len(instances)
        with torch.no_grad():
            device = self._get_prediction_device()
            batch = Batch(instances)
            batch.index_instances(self.vocab)
            tensors = util.move_to_device(batch.as_tensor_dict(), device)
            raw_outputs = self(**tensors)
            outputs = self.decode(raw_outputs)

            per_instance: List[Dict[str, numpy.ndarray]] = [
                {} for _ in batch.instances
            ]
            for key, value in list(outputs.items()):
                is_tensor = isinstance(value, torch.Tensor)
                # 0-dim tensors are not iterable; this happens e.g. for the
                # loss, which we still want to include, so give it a batch axis.
                if is_tensor and value.dim() == 0:
                    value = value.unsqueeze(0)

                leading = value.size(0) if is_tensor else len(value)
                if leading != expected_size:
                    self._maybe_warn_for_unseparable_batches(key)
                    continue

                if is_tensor:
                    value = value.detach().cpu().numpy()
                for slot, element in zip(per_instance, value):
                    slot[key] = element
            return per_instance
Пример #17
0
 def test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly(self):
     """
     Run one forward pass over the fixture instances and check that two
     tag sequences of length 7 come back, each tag mapping to a known
     label in the "labels" namespace.
     """
     batch = Batch(self.instances)
     batch.index_instances(self.vocab)
     outputs = self.model(**batch.as_tensor_dict())
     predicted = outputs["tags"]

     assert len(predicted) == 2
     for position in (0, 1):
         assert len(predicted[position]) == 7

     allowed = {"O", "I-ORG", "I-PER", "I-LOC"}
     label_vocab = self.model.vocab
     for sequence in predicted:
         for tag_id in sequence:
             label = label_vocab.get_token_from_index(tag_id,
                                                      namespace="labels")
             assert label in allowed
Пример #18
0
    def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
        """
        Index each tokenized sentence with both an ELMo character indexer
        and a single-id indexer, and return the vocabulary built from the
        instances together with the "elmo" entry of the batch's tensor dict.
        """
        char_indexer = ELMoTokenCharactersIndexer()
        id_indexer = SingleIdTokenIndexer()
        instances = [
            Instance({
                "elmo": TextField(
                    [Token(word) for word in sentence],
                    {"character_ids": char_indexer, "tokens": id_indexer},
                )
            })
            for sentence in batch
        ]

        indexed = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        indexed.index_instances(vocab)
        return vocab, indexed.as_tensor_dict()["elmo"]
Пример #19
0
    def test_sliding_window(self):
        """
        Index one sentence with `max_pieces=8` and no truncation, then check
        the windowed wordpiece ids, the offsets, and the embedder's output
        shapes with and without offsets and token type ids.
        """
        pre_tokenizer = BertPreTokenizer()
        words = pre_tokenizer.tokenize(
            "the quickest quick brown fox jumped over the lazy dog")

        vocab = Vocabulary()

        bert_fixtures = self.FIXTURES_ROOT / "bert"
        token_indexer = PretrainedBertIndexer(str(bert_fixtures / "vocab.txt"),
                                              truncate_long_sequences=False,
                                              max_pieces=8)

        config = BertConfig.from_json_file(str(bert_fixtures / "config.json"))
        token_embedder = BertEmbedder(BertModel(config), max_pieces=8)

        batch = Batch(
            [Instance({"tokens": TextField(words, {"bert": token_indexer})})])
        batch.index_instances(vocab)

        tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
        bert_tensors = tensor_dict["tokens"]["bert"]
        input_ids = bert_tensors["input_ids"]
        offsets = bert_tensors["offsets"]

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        expected_ids = [
            16, 2, 3, 4, 3, 5, 6, 17, 16, 3, 5, 6, 8, 9, 2, 17, 16, 8, 9, 2,
            14, 12, 17
        ]
        assert input_ids.tolist() == [expected_ids]
        assert offsets.tolist() == [[1, 3, 4, 5, 6, 7, 8, 9, 10, 11]]

        # Input ids only.
        bert_vectors = token_embedder(input_ids)
        assert list(bert_vectors.shape) == [1, 13, 12]

        # Testing without token_type_ids
        bert_vectors = token_embedder(input_ids, offsets=offsets)
        assert list(bert_vectors.shape) == [1, 10, 12]

        # Testing with token_type_ids
        bert_vectors = token_embedder(
            input_ids,
            offsets=offsets,
            token_type_ids=bert_tensors["token_type_ids"])
        assert list(bert_vectors.shape) == [1, 10, 12]
Пример #20
0
    def _sentences_to_ids(self, sentences):
        """
        Index tokenized `sentences` with an ELMo character indexer and
        return the resulting "elmo_tokens" tensor of the batch.
        """
        indexer = ELMoTokenCharactersIndexer()

        # One TextField per sentence, each wrapped in its own instance.
        instances = [
            Instance({
                "elmo": TextField([Token(word) for word in sentence],
                                  {"character_ids": indexer})
            })
            for sentence in sentences
        ]

        indexed = Batch(instances)
        indexed.index_instances(Vocabulary())
        elmo_tensors = indexed.as_tensor_dict()["elmo"]
        return elmo_tensors["character_ids"]["elmo_tokens"]
Пример #21
0
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    """
    Build and save the vocabulary, print dataset and vocabulary statistics,
    and instantiate the model, without training anything.

    # Parameters

    params : `Params`
        The experiment configuration. The "vocabulary",
        "datasets_for_vocab_creation", "model" and "trainer" keys are popped
        from it here.
    serialization_dir : `str`
        Directory (created if missing) under which the "vocabulary"
        sub-directory is written.

    # Raises

    ConfigurationError
        If the "vocabulary" sub-directory already exists and contains
        entries, or if `datasets_for_vocab_creation` names an unknown dataset.
    """
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # `os.listdir` returns a list, never None, so test its truthiness to
    # reject only a directory that actually contains entries.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError(
            "The 'vocabulary' directory in the provided serialization directory is non-empty"
        )

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )

    # Filter on the dataset key before iterating, so datasets that are not
    # selected are never read.
    instances = [
        instance
        for key, dataset in all_datasets.items()
        if key in datasets_for_vocab_creation
        for instance in dataset
    ]

    vocab = Vocabulary.from_params(vocab_params, instances=instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info("writing the vocabulary to %s.", vocab_dir)
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    trainer_params = params.pop("trainer")
    # Freeze every parameter whose name matches one of the "no_grad" regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    log_frozen_and_tunable_parameter_names(model)
Пример #22
0
 def forward_on_instances(
         self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
     """
     Run the model on `instances` as one batch and return the decoded
     output unchanged.

     The complicated and tedious per-instance checks are deliberately
     omitted, because they could leave the model with no output at all.

     NOTE(review): the value returned is whatever `self.decode` produces
     for the whole batch; it is not split per instance, so the declared
     return type may not describe it accurately - confirm against callers.

     :param instances: the instances to batch, index and feed to the model.
     :return: the result of `self.decode` applied to the model's output.
     """
     with torch.no_grad():
         cuda_device = self._get_prediction_device()
         dataset = Batch(instances)
         dataset.index_instances(self.vocab)
         model_input = util.move_to_device(dataset.as_tensor_dict(),
                                           cuda_device)
         return self.decode(self(**model_input))
Пример #23
0
 def instances_to_captum_inputs(self, labeled_instances):
     """
     Batch and index `labeled_instances`, embed their token ids, and
     return the pieces as a 3-tuple.

     :param labeled_instances: instances providing a "tokens" text field
         and a "label" field.
     :return: `((embedded_tokens,), None, (attention_mask, label,
         output_dict))`, where `output_dict` holds the embedded tokens
         under the key "embedding".
         NOTE(review): presumably these map to Captum's inputs, baselines
         and additional forward args - confirm against the caller.
     """
     with torch.no_grad():
         cuda_device = self._get_prediction_device()
         batch = Batch(labeled_instances)
         batch.index_instances(self.vocab)
         model_input = util.move_to_device(batch.as_tensor_dict(),
                                           cuda_device)
         token_tensors = model_input["tokens"]["tokens"]
         input_ids = token_tensors["token_ids"]
         label = model_input["label"]
         attention_mask = token_tensors["mask"]
         embedded_tokens = self.embeddings(input_ids)
         output_dict = {"embedding": embedded_tokens}
         return (embedded_tokens, ), None, (attention_mask, label,
                                            output_dict)
Пример #24
0
def make_vocab_from_params(
    params: Params, serialization_dir: str, print_statistics: bool = False
) -> Vocabulary:
    """Build a vocabulary from the datasets named in ``params`` and save it.

    The vocabulary is written to ``<serialization_dir>/vocabulary``.

    Args:
        params: Configuration. The ``"vocabulary"`` and
            ``"datasets_for_vocab_creation"`` keys are popped here; the
            remaining params are handed to ``datasets_from_params``.
        serialization_dir: Output directory; created if it does not exist.
        print_statistics: When true, the instances are materialised into a
            list, indexed with the new vocabulary, and dataset and
            vocabulary statistics are printed.

    Returns:
        The newly created vocabulary.

    Raises:
        ConfigurationError: If the ``vocabulary`` directory already exists
            and is non-empty, or if ``datasets_for_vocab_creation`` names a
            dataset that was not loaded.
    """
    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # os.listdir() always returns a list (never None), so test it for
    # emptiness; an existing but empty directory is fine to reuse.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError(
            "The 'vocabulary' directory in the provided serialization directory is non-empty"
        )

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )

    # Lazily chain the instances of every selected dataset.
    instances: Iterable[Instance] = (
        instance
        for key, dataset in all_datasets.items()
        if key in datasets_for_vocab_creation
        for instance in dataset
    )

    if print_statistics:
        # The instances are consumed twice below (vocabulary creation and
        # statistics), so the one-shot generator must be materialised.
        instances = list(instances)

    vocab = Vocabulary.from_params(vocab_params, instances=instances)

    logger.info("writing the vocabulary to %s.", vocab_dir)
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")

    if print_statistics:
        dataset = Batch(instances)
        dataset.index_instances(vocab)
        dataset.print_statistics()
        vocab.print_statistics()

    return vocab
Пример #25
0
    def test_end_to_end_with_higher_order_inputs(self):
        """Embed ``ListField``-wrapped text fields and check output shapes.

        Two instances are batched: one holding a single sentence, the other
        holding two. The embedder output is expected to be 4-dimensional,
        with and without offsets, for both the default embedder and one
        built with ``top_layer_only=True``.
        """
        pre_tokenizer = BertPreTokenizer()

        def build_field(sentence):
            # Tokenize the sentence and wrap it with the shared BERT indexer.
            return TextField(pre_tokenizer.tokenize(sentence),
                             {"bert": self.token_indexer})

        # wordpiece ids: 2 3 4 3 5 6 8 9 2 14 12
        first_field = build_field(
            "the quickest quick brown fox jumped over the lazy dog")
        # wordpiece ids: 2 3 5 6 8 9 2 15 10 11 14 1
        second_field = build_field(
            "the quick brown fox jumped over the laziest lazy elmo")
        # wordpiece ids: 2 5 15 10 11 6
        third_field = build_field("the brown laziest fox")

        vocab = Vocabulary()

        batch = Batch([
            Instance({"tokens": ListField([first_field])}),
            Instance({"tokens": ListField([second_field, third_field])}),
        ])
        batch.index_instances(vocab)

        lengths = batch.get_padding_lengths()
        bert_inputs = batch.as_tensor_dict(lengths,
                                           verbose=True)["tokens"]["bert"]
        input_ids = bert_inputs["input_ids"]
        offsets = bert_inputs["offsets"]

        # Without offsets: 14 vectors ([CLS] + 12 wordpieces + [SEP]).
        shape_without_offsets = [2, 2, 14, 12]
        # With offsets: 10 vectors.
        shape_with_offsets = [2, 2, 10, 12]

        vectors = self.token_embedder(input_ids)
        assert list(vectors.shape) == shape_without_offsets

        vectors = self.token_embedder(input_ids, offsets=offsets)
        assert list(vectors.shape) == shape_with_offsets

        # Same checks with an embedder that only uses the top layer.
        top_layer_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
        vectors = top_layer_embedder(input_ids)
        assert list(vectors.shape) == shape_without_offsets

        vectors = top_layer_embedder(input_ids, offsets=offsets)
        assert list(vectors.shape) == shape_with_offsets
Пример #26
0
    def preprocess(self, token_batch):
        """Turn token sequences into one indexed ``Batch`` per indexer.

        Each sequence is cut to the longest non-empty sequence length,
        capped at ``self.max_len``, and prefixed with a ``'$START'`` token.

        :param token_batch: sequences (lists) of token strings.
        :return: a list with one ``Batch`` per entry of ``self.indexers``,
            each indexed with ``self.vocab``; an empty list when every
            sequence is empty.
        """
        nonempty_lengths = [len(seq) for seq in token_batch if seq]
        if not nonempty_lengths:
            return []
        limit = min(max(nonempty_lengths), self.max_len)

        def to_instance(sequence, indexer):
            # Truncate first, then prepend the start marker.
            words = ['$START'] + sequence[:limit]
            return Instance(
                {'tokens': TextField([Token(word) for word in words], indexer)})

        batches = []
        for indexer in self.indexers:
            indexed = Batch([to_instance(seq, indexer) for seq in token_batch])
            indexed.index_instances(self.vocab)
            batches.append(indexed)

        return batches
Пример #27
0
class TestLSTMPointerForRewrite(TestCase):
    """Train/save/load round-trip test for the LSTM pointer rewrite model."""

    def setUp(self) -> None:
        """Load the config, read the fixture data and build vocab and model."""
        super().setUp()
        param_file = FIXTURES_ROOT / "pointer_rewrite" / "lstm_lstm_pointer_rewrite.jsonnet"
        dataset_file = FIXTURES_ROOT / "test_pointer_rewrite.txt"
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        # Build the dataset reader and read the fixture data.
        reader = DatasetReader.from_params(params["dataset_reader"])
        instances = reader.read(str(dataset_file))
        # If the config has vocabulary params, build the vocabulary from
        # them; otherwise derive it directly from the instances.
        if "vocabulary" in params:
            vocab_params = params["vocabulary"]
            vocab = Vocabulary.from_params(
                params=vocab_params, instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)

        self.vocab = vocab
        self.instances = instances
        self.instances.index_with(vocab)
        # Build the model.
        self.model = Model.from_params(params=params["model"], vocab=self.vocab)

        self.dataset = Batch(list(self.instances))
        self.dataset.index_instances(self.vocab)
        self.TEST_DIR = Path(tempfile.mkdtemp(prefix="allennlp_tests"))

    def tearDown(self) -> None:
        """Remove the temporary directory created in ``setUp``."""
        # Imported locally so this class does not rely on a module-level
        # ``shutil`` import.
        import shutil

        super().tearDown()
        # mkdtemp() never deletes its directory; ignore_errors keeps this a
        # no-op when the directory is already gone.
        shutil.rmtree(self.TEST_DIR, ignore_errors=True)

    def test_model_can_train_save_and_load(self):
        """Train from the config file, reload the archive, compare weights."""
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        # Train and save.
        model = train_model_from_file(self.param_file, save_dir)
        # Load the archived model back on CPU.
        loaded_model = load_archive(archive_file, cuda_device=-1).model

        trained_state = model.state_dict()
        loaded_state = loaded_model.state_dict()
        assert trained_state.keys() == loaded_state.keys()
        # Make sure that the state dict (the parameters) are the same
        # for both models.
        for key in trained_state:
            assert_allclose(trained_state[key].cpu().numpy(),
                            loaded_state[key].cpu().numpy(),
                            err_msg=key)
Пример #28
0
def setup_model(params_file, dataset_file):
    """Read a dataset, build and save its vocabulary, and print its tensors.

    The data is read with ``ToxicReader``. The vocabulary is built from the
    ``"vocabulary"`` section of the params file when present, otherwise
    directly from the instances. It is saved to the ``new_vocab2``
    directory, and the indexed batch is printed as a tensor dict.

    :param params_file: path to the configuration file.
    :param dataset_file: path to the dataset to read.
    """
    params = Params.from_file(params_file)

    reader = ToxicReader()
    instances = reader.read(str(dataset_file))

    # Build the vocabulary exactly once (the earlier extra
    # ``Vocabulary.from_instances`` call discarded its result).
    if 'vocabulary' in params:
        vocab_params = params['vocabulary']
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)

    vocab.save_to_files("new_vocab2")
    dataset = Batch(instances)
    dataset.index_instances(vocab)

    print(dataset.as_tensor_dict())
Пример #29
0
    def test_token_without_wordpieces(self):
        """Check offsets and embeddings when a token is the empty string.

        Each sentence has an empty-string token at position 1. The test
        expects its offsets to be ``[-1, -1]`` and its embedding to be all
        zeros, with no NaNs anywhere in the output.
        """
        model_name = "bert-base-uncased"
        token_indexer = PretrainedTransformerMismatchedIndexer(model_name)

        sentence1 = ["A", "", "AllenNLP", "sentence", "."]
        sentence2 = ["AllenNLP", "", "great"]
        tokens1 = [Token(word) for word in sentence1]
        tokens2 = [Token(word) for word in sentence2]

        vocab = Vocabulary()
        embedder_params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer_mismatched",
                    "model_name": "bert-base-uncased",
                }
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(
            vocab=vocab, params=embedder_params)

        instances = [
            Instance({"tokens": TextField(token_list, {"bert": token_indexer})})
            for token_list in (tokens1, tokens2)
        ]
        batch = Batch(instances)
        batch.index_instances(vocab)

        tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())
        tokens = tensor_dict["tokens"]

        expected_offsets = [
            [[1, 1], [-1, -1], [2, 4], [5, 5], [6, 6]],
            [[1, 3], [-1, -1], [4, 4], [0, 0], [0, 0]],
        ]
        assert tokens["bert"]["offsets"].tolist() == expected_offsets

        bert_vectors = token_embedder(tokens)
        longest = max(len(sentence1), len(sentence2))
        assert bert_vectors.size() == (2, longest, 768)
        assert not torch.isnan(bert_vectors).any()
        # The empty-string token in each sentence must embed to zeros.
        assert all(bert_vectors[0, 1] == 0)
        assert all(bert_vectors[1, 1] == 0)
Пример #30
0
    def test_sliding_window_with_batch(self):
        """Run the BERT embedder on a batch with ``max_pieces=8``.

        The batch holds one sentence and the same sentence repeated three
        times. The indexer is built with ``truncate_long_sequences=False``.
        The embedder must return a result both with and without
        ``token_type_ids``.
        """
        fixtures = self.FIXTURES_ROOT / "bert"
        word_tokens = BertPreTokenizer().tokenize(
            "the quickest quick brown fox jumped over the lazy dog")

        vocab = Vocabulary()

        token_indexer = PretrainedBertIndexer(str(fixtures / "vocab.txt"),
                                              truncate_long_sequences=False,
                                              max_pieces=8)

        config = BertConfig.from_json_file(str(fixtures / "config.json"))
        token_embedder = BertEmbedder(BertModel(config), max_pieces=8)

        short_instance = Instance(
            {"tokens": TextField(word_tokens, {"bert": token_indexer})})
        long_instance = Instance(
            {"tokens": TextField(word_tokens * 3, {"bert": token_indexer})})

        batch = Batch([short_instance, long_instance])
        batch.index_instances(vocab)

        lengths = batch.get_padding_lengths()
        bert_inputs = batch.as_tensor_dict(lengths)["tokens"]["bert"]

        # Without token_type_ids.
        vectors = token_embedder(bert_inputs["input_ids"],
                                 offsets=bert_inputs["offsets"])
        assert vectors is not None

        # With token_type_ids.
        vectors = token_embedder(bert_inputs["input_ids"],
                                 offsets=bert_inputs["offsets"],
                                 token_type_ids=bert_inputs["token_type_ids"])
        assert vectors is not None