Exemplo n.º 1
0
    def forward_on_instances(
            self, instances: List[Instance],
            cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
        """
        Run the model on a batch of instances and return one output dict per instance.

        The instances are wrapped in a ``Dataset``, indexed with ``self.vocab`` and turned
        into a tensor dict (``for_training=False``) for ``cuda_device``.  That dict is fed to
        ``self(...)`` and the result is passed through ``self.decode()``.

        Every ``torch.autograd.Variable`` in the decoded output is moved to the CPU and
        converted to a numpy array (the converted value is also written back into the
        decoded dict); values of any other type are used as they are.  Each value is then
        iterated in lockstep with the instances, and its i-th element is stored under the
        same key in the i-th returned dict.  Because ``zip`` stops at the shorter input, a
        value with fewer elements than there are instances leaves that key out of the
        remaining dicts.

        This is the batched counterpart of :func:`forward_on_instance`.
        """
        batch = Dataset(instances)
        batch.index_instances(self.vocab)
        tensors = batch.as_tensor_dict(cuda_device=cuda_device, for_training=False)
        decoded = self.decode(self(**tensors))

        per_instance: List[Dict[str, numpy.ndarray]] = [{} for _ in batch.instances]
        # Snapshot the items first: the dict is updated while we walk over it.
        for key, value in list(decoded.items()):
            if isinstance(value, torch.autograd.Variable):
                value = value.data.cpu().numpy()
            decoded[key] = value
            # Iterating the value splits it along its first dimension, one slice per instance.
            for result, element in zip(per_instance, value):
                result[key] = element
        return per_instance
Exemplo n.º 2
0
def token_to_elmo_id(token):
    """
    Return the ``'character_ids'`` tensor for a single token.

    The token is wrapped in a one-token ``TextField`` (keyed ``'character_ids'`` and using
    the module-level ``indexer``), placed in an ``Instance`` under the ``"elmo"`` field, and
    that lone instance is put in a ``Dataset``.  After indexing against a fresh, empty
    ``Vocabulary``, the ``['elmo']['character_ids']`` entry of the dataset's tensor dict is
    returned.
    """
    text_field = TextField([Token(token)], {'character_ids': indexer})
    dataset = Dataset([Instance({"elmo": text_field})])
    vocab = Vocabulary()
    # Index each instance by hand rather than calling dataset.index_instances(vocab),
    # so that there's no progress bar.
    for item in dataset.instances:
        item.index_fields(vocab)
    tensor_dict = dataset.as_tensor_dict()
    return tensor_dict['elmo']['character_ids']
Exemplo n.º 3
0
 def _yield_one_epoch(self, dataset: Dataset, shuffle: bool,
                      cuda_device: int, for_training: bool):
     """
     Generate the tensor dicts for one pass over ``dataset``.

     ``self._create_batches(dataset, shuffle)`` supplies the groups of instances.  Each
     group is wrapped in its own ``Dataset``, its padding lengths and size are logged at
     debug level, and the group is yielded as a tensor dict built with those padding
     lengths and the given ``cuda_device`` / ``for_training`` flags.
     """
     for instance_group in self._create_batches(dataset, shuffle):
         current = Dataset(instance_group)
         lengths = current.get_padding_lengths()
         logger.debug("Batch padding lengths: %s", str(lengths))
         logger.debug("Batch size: %d", len(current.instances))
         tensor_dict = current.as_tensor_dict(lengths,
                                              cuda_device=cuda_device,
                                              for_training=for_training)
         yield tensor_dict
Exemplo n.º 4
0
 def ensure_batch_predictions_are_consistent(self):
     """
     Assert that running instances one at a time matches running them as a single batch.

     The model is switched to eval mode.  Each instance of ``self.dataset`` is run through
     the model alone (in a one-element ``Dataset``), then all instances are run together.
     For every output key except ``'loss'``, element 0 of the single-instance output is
     compared with element ``i`` of the batched output: ``torch.autograd.Variable`` values
     via ``assert_allclose`` with an absolute tolerance of 1e-6, everything else via ``==``.

     Raises ``NotImplementedError`` when the sizes differ and the batched value has more
     than two dimensions.
     """
     self.model.eval()
     tolerance = 1e-6

     # Forward pass for each instance on its own.
     per_instance_outputs = []
     for instance in self.dataset.instances:
         singleton = Dataset([instance])
         singleton_tensors = singleton.as_tensor_dict(singleton.get_padding_lengths(),
                                                      for_training=False)
         per_instance_outputs.append(self.model(**singleton_tensors))

     # Forward pass for all instances at once.
     whole_batch = Dataset(list(self.dataset))
     whole_tensors = whole_batch.as_tensor_dict(self.dataset.get_padding_lengths(),
                                                for_training=False)
     batched_outputs = self.model(**whole_tensors)

     for index, outputs in enumerate(per_instance_outputs):
         for key, value in outputs.items():
             if key == 'loss':
                 # Loss is particularly unstable; we'll just be satisfied if everything
                 # else is close.
                 continue
             alone = value[0]
             in_batch = batched_outputs[key][index]
             if not isinstance(alone, torch.autograd.Variable):
                 assert alone == in_batch, key
                 continue
             if alone.size() != in_batch.size():
                 # This is probably a sequence model, and the batched output carries some
                 # padded elements.  Handling this in general is complicated, so only the
                 # easy 1-D and 2-D cases are trimmed here.
                 num_tokens = alone.size(0)
                 rank = in_batch.dim()
                 if rank == 1:
                     in_batch = in_batch[:num_tokens]
                 elif rank == 2:
                     in_batch = in_batch[:num_tokens, :]
                 else:
                     raise NotImplementedError
             assert_allclose(alone.data.numpy(),
                             in_batch.data.numpy(),
                             atol=tolerance,
                             err_msg=key)
Exemplo n.º 5
0
    def test_lazy_as_tensor_dict(self):
        """
        Check that a lazy dataset yields the same tensors on every pass.

        The dataset returned by ``self.get_lazy_dataset()`` is indexed with ``self.vocab``
        and then materialised into a ``Dataset`` ten times; each time the ``"tokens"``
        arrays for ``text1`` and ``text2`` must match the expected id matrices.
        """
        expected_text1 = numpy.array([[2, 3, 4, 5, 6], [1, 3, 4, 5, 6]])
        expected_text2 = numpy.array([[2, 3, 4, 1, 5, 6], [2, 3, 1, 0, 0, 0]])

        lazy_dataset = self.get_lazy_dataset()
        lazy_dataset.index_instances(self.vocab)

        for _ in range(10):
            materialised = Dataset(list(lazy_dataset))
            tensor_dict = materialised.as_tensor_dict(materialised.get_padding_lengths())

            actual_text1 = tensor_dict["text1"]["tokens"].data.cpu().numpy()
            actual_text2 = tensor_dict["text2"]["tokens"].data.cpu().numpy()

            numpy.testing.assert_array_almost_equal(actual_text1, expected_text1)
            numpy.testing.assert_array_almost_equal(actual_text2, expected_text2)
Exemplo n.º 6
0
    def _sentences_to_ids(self, sentences):
        """
        Convert tokenized sentences into the ``'character_ids'`` tensor.

        Each sentence (an iterable of token strings) becomes an ``Instance`` with a single
        ``'elmo'`` ``TextField`` that uses a freshly created ``ELMoTokenCharactersIndexer``.
        The instances are collected in a ``Dataset``, indexed against a new, empty
        ``Vocabulary``, and the ``['elmo']['character_ids']`` entry of the resulting tensor
        dict is returned.
        """
        char_indexer = ELMoTokenCharactersIndexer()

        # One Instance per sentence, each holding a single TextField.
        dataset = Dataset([
            Instance({'elmo': TextField([Token(word) for word in sentence],
                                        {'character_ids': char_indexer})})
            for sentence in sentences
        ])
        dataset.index_instances(Vocabulary())
        tensor_dict = dataset.as_tensor_dict()
        return tensor_dict['elmo']['character_ids']
def batch_to_ids(batch):
    """
    Given a batch (as a list of tokenized sentences), return a batch
    of padded character ids.

    Every sentence is turned into an ``Instance`` whose ``"elmo"`` field is a ``TextField``
    over its tokens, using the module-level ``indexer`` under the ``'character_ids'`` key.
    The instances are indexed against a new, empty ``Vocabulary`` and the
    ``['elmo']['character_ids']`` entry of the dataset's tensor dict is returned.
    """
    def to_instance(sentence):
        # Wrap one tokenized sentence in a single-field Instance.
        words = [Token(word) for word in sentence]
        return Instance({"elmo": TextField(words, {'character_ids': indexer})})

    dataset = Dataset([to_instance(sentence) for sentence in batch])
    dataset.index_instances(Vocabulary())
    return dataset.as_tensor_dict()['elmo']['character_ids']