Example #1
    def test_forward_pass_runs_correctly(self):
        """
        Check to make sure a forward pass on an ensemble of two identical copies of a model yields the same
        results as the model itself.
        """
        bidaf_ensemble = BidafEnsemble([self.model, self.model])

        batch = Batch(self.instances)
        batch.index_instances(self.vocab)
        training_tensors = batch.as_tensor_dict()

        bidaf_output_dict = self.model(**training_tensors)
        ensemble_output_dict = bidaf_ensemble(**training_tensors)

        metrics = self.model.get_metrics(reset=True)

        # We've set up the data such that there's a fake answer that consists of the whole
        # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
        # zero, while if we somehow haven't been able to load the evaluation data, or there was an
        # error with using the evaluation script, this will fail.  This makes sure that we've
        # loaded the evaluation data correctly and have hooked things up to the official evaluation
        # script.
        assert metrics['f1'] > 0
        assert torch.equal(ensemble_output_dict['best_span'], bidaf_output_dict['best_span'])
        assert ensemble_output_dict['best_span_str'] == bidaf_output_dict['best_span_str']
Example #2
    def test_padding_for_equal_length_indices(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3     5     6   8      9    2   14   12
        sentence = "the quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"].tolist() == [
                [2, 3, 5, 6, 8, 9, 2, 14, 12]
        ]

        assert tokens["bert-offsets"].tolist() == [
                [0, 1, 2, 3, 4, 5, 6, 7, 8]
        ]
Example #3
    def forward_on_instances(self,
                             instances: List[Instance],
                             cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
        """
        Takes a list of  :class:`~allennlp.data.instance.Instance`s, converts that text into
        arrays using this model's :class:`Vocabulary`, passes those arrays through
        :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any
        ``torch.autograd.Variables`` or ``torch.Tensors`` into numpy arrays and separate the
        batched output into a list of individual dicts per instance. Note that typically
        this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
        :func:`forward_on_instance`.
        """
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        model_input = dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)
        outputs = self.decode(self(**model_input))

        instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
        for name, output in list(outputs.items()):
            if isinstance(output, torch.autograd.Variable):
                output = output.data.cpu().numpy()
            outputs[name] = output
            for instance_output, batch_element in zip(instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
Example #4
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
    (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
        A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens,
                          {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
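
A minimal usage sketch for the helper above (a sketch only: it assumes the batch_to_ids defined in this example is importable, and that ELMo's character indexer pads every word to 50 character ids):

    # Two tokenized sentences of different lengths.
    sentences = [["First", "sentence", "."], ["Another", "one"]]
    character_ids = batch_to_ids(sentences)
    # Expected shape: (2, 3, 50) -- (batch size, longest sentence, chars per word).
    print(character_ids.shape)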
Example #5
    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
        text2 = tensors["text2"]["tokens"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                    [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                    [2, 3, 1, 0, 0, 0]]))
Example #6
 def test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly(self):
     dataset = Batch(self.instances)
     dataset.index_instances(self.vocab)
     training_tensors = dataset.as_tensor_dict()
     output_dict = self.model(**training_tensors)
     tags = output_dict['tags']
     assert len(tags) == 2
     assert len(tags[0]) == 7
     assert len(tags[1]) == 7
     for example_tags in tags:
         for tag_id in example_tags:
             tag = self.model.vocab.get_token_from_index(tag_id, namespace="labels")
             assert tag in {'O', 'I-ORG', 'I-PER', 'I-LOC'}
Example #7
    def test_squad_with_unwordpieceable_passage(self):
        # pylint: disable=line-too-long
        tokenizer = WordTokenizer()

        token_indexer = PretrainedBertIndexer("bert-base-uncased")

        passage1 = ("There were four major HDTV systems tested by SMPTE in the late 1970s, "
                    "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:")
        question1 = "Who released A Study of High Definition Television Systems?"

        passage2 = ("Broca, being what today would be called a neurosurgeon, "
                    "had taken an interest in the pathology of speech. He wanted "
                    "to localize the difference between man and the other animals, "
                    "which appeared to reside in speech. He discovered the speech "
                    "center of the human brain, today called Broca's area after him. "
                    "His interest was mainly in Biological anthropology, but a German "
                    "philosopher specializing in psychology, Theodor Waitz, took up the "
                    "theme of general and social anthropology in his six-volume work, "
                    "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
                    """soon translated as "The Anthropology of Primitive Peoples". """
                    "The last two volumes were published posthumously.")
        question2 = "What did Broca discover in the human brain?"

        from allennlp.data.dataset_readers.reading_comprehension.util import make_reading_comprehension_instance

        instance1 = make_reading_comprehension_instance(tokenizer.tokenize(question1),
                                                        tokenizer.tokenize(passage1),
                                                        {"bert": token_indexer},
                                                        passage1)

        instance2 = make_reading_comprehension_instance(tokenizer.tokenize(question2),
                                                        tokenizer.tokenize(passage2),
                                                        {"bert": token_indexer},
                                                        passage2)

        vocab = Vocabulary()

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        qtokens = tensor_dict["question"]
        ptokens = tensor_dict["passage"]

        config = BertConfig(len(token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
        _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
Example #8
    def forward_on_instances(self,
                             instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
        """
        Takes a list of  :class:`~allennlp.data.instance.Instance`s, converts that text into
        arrays using this model's :class:`Vocabulary`, passes those arrays through
        :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any
        ``torch.Tensors`` into numpy arrays and separate the
        batched output into a list of individual dicts per instance. Note that typically
        this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
        :func:`forward_on_instance`.

        Parameters
        ----------
        instances : List[Instance], required
            The instances to run the model on.

        Returns
        -------
        A list of the model's output for each instance.
        """
        batch_size = len(instances)
        with torch.no_grad():
            cuda_device = self._get_prediction_device()
            dataset = Batch(instances)
            dataset.index_instances(self.vocab)
            model_input = dataset.as_tensor_dict(cuda_device=cuda_device)
            outputs = self.decode(self(**model_input))

            instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
            for name, output in list(outputs.items()):
                if isinstance(output, torch.Tensor):
                    # NOTE(markn): This is a hack because 0-dim pytorch tensors are not iterable.
                    # This occurs with batch size 1, because we still want to include the loss in that case.
                    if output.dim() == 0:
                        output = output.unsqueeze(0)

                    if output.size(0) != batch_size:
                        self._maybe_warn_for_unseparable_batches(name)
                        continue
                    output = output.detach().cpu().numpy()
                elif len(output) != batch_size:
                    self._maybe_warn_for_unseparable_batches(name)
                    continue
                outputs[name] = output
                for instance_output, batch_element in zip(instance_separated_output, output):
                    instance_output[name] = batch_element
            return instance_separated_output
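
A hedged usage sketch for the method above (assumes `model` is a trained AllenNLP Model subclass and `instances` is a List[Instance] built by a matching DatasetReader; the output keys depend on the model):

    outputs = model.forward_on_instances(instances)
    for instance_output in outputs:
        # Each entry is a dict with one numpy value per model output (e.g. logits, loss).
        print({name: getattr(value, "shape", None) for name, value in instance_output.items()})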
Example #9
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
Example #10
    def _sentences_to_ids(self, sentences):
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for sentence in sentences:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({'elmo': field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)
        return dataset.as_tensor_dict()['elmo']['character_ids']
Example #11
    def test_end_to_end(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "the quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)

        vocab = Vocabulary()

        instance1 = Instance({"tokens": TextField(tokens1, {"bert": self.token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens2, {"bert": self.token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"].tolist() == [
                [2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 0],
                [2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1]
        ]

        assert tokens["bert-offsets"].tolist() == [
                [0, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                [0, 1, 2, 3, 4, 5, 6, 9, 10, 11]
        ]

        # No offsets, should get 12 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 12, 12]

        # Offsets, should get 10 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]

        ## Now try top_layer_only = True
        tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
        bert_vectors = tlo_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 12, 12]

        bert_vectors = tlo_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]
Example #12
    def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        indexer2 = SingleIdTokenIndexer()
        for sentence in batch:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens,
                              {'character_ids': indexer,
                               'tokens': indexer2})
            instance = Instance({"elmo": field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        dataset.index_instances(vocab)
        return vocab, dataset.as_tensor_dict()["elmo"]
Example #13
    def _create_batches(self, instances: Iterable[Instance],
                        shuffle: bool) -> Iterable[Batch]:
        for instance_list in self._memory_sized_lists(instances):

            # organizing instances per question
            instances_question_id = [
                instance.fields['metadata'].metadata['question_id']
                for instance in instance_list
            ]
            split_inds = [0]
            for ind in range(len(instances_question_id) - 1):
                if instances_question_id[ind] != instances_question_id[ind + 1]:
                    split_inds.append(ind + 1)
            split_inds += [len(instances_question_id)]
            per_question_instances = [
                instance_list[split_inds[ind]:split_inds[ind + 1]]
                for ind in range(len(split_inds) - 1)
            ]

            # added by Mingzhu, batch shuffle, each batch only contains examples from one dataset.
            batch_dict = {}
            for question_instances in per_question_instances:
                set_name = question_instances[0].fields["metadata"]["dataset"]
                batch_dict.setdefault(set_name, [])
                instances_to_add = question_instances
                batch_dict[set_name] += instances_to_add
                for name, batch in batch_dict.items():
                    if len(batch) + len(
                            instances_to_add) > self._batch_size and len(
                                batch) > 0:
                        batch = sorted(batch,
                                       key=lambda x: x.fields['metadata'].
                                       metadata['question_id'])
                        yield Batch(batch)
                        batch_dict[name] = []

            # yielding remainder batch
            for name, batch in batch_dict.items():
                if len(batch) > 0:
                    batch = sorted(batch,
                                   key=lambda x: x.fields['metadata'].metadata[
                                       'question_id'])
                    yield Batch(batch)
Example #14
    def forward(self, inputs, elmo_lstm_output):
        texts = self.inputs_to_texts(inputs)
        instances = self.texts_to_instances(texts)
        dataset = Batch(instances)
        dataset.index_instances(self.model.vocab)
        dp_inputs = util.move_to_device(dataset.as_tensor_dict(),
                                        self.cuda_device)
        words, pos_tags = dp_inputs['words'], dp_inputs['pos_tags']

        mask = get_text_field_mask(words)

        layer_activations = elmo_lstm_output['activations']
        mask_with_bos_eos = elmo_lstm_output['mask']

        # compute the elmo representations
        representations = []
        for i in range(len(self._scalar_mixes)):
            scalar_mix = getattr(self, 'scalar_mix_{}'.format(i))
            representation_with_bos_eos = scalar_mix(layer_activations,
                                                     mask_with_bos_eos)
            if self._keep_sentence_boundaries:
                processed_representation = representation_with_bos_eos
                processed_mask = mask_with_bos_eos
            else:
                representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos)
                processed_representation = representation_without_bos_eos
                processed_mask = mask_without_bos_eos
            representations.append(self._dropout(processed_representation))

        # reshape if necessary
        mask = processed_mask
        elmo_representations = representations

        embedded_text_input = elmo_representations[0]

        embedded_pos_tags = self.model._pos_tag_embedding(pos_tags)
        embedded_text_input = torch.cat(
            [embedded_text_input, embedded_pos_tags], -1)

        encoded_text = self.model.encoder(embedded_text_input, mask)
        return encoded_text.detach()
Example #15
 def setUp(self):
     self.token_lookup = {
         'Entity1': [['Robert', 'Logan'], ['Robby']],
         'Entity2': [['Jimmy']]
     }
     self.id_map_lookup = {
         'Entity1': {
             'Robert': 1,
             'Logan': 2,
             'Robby': 3
         },
         'Entity2': {
             'Jimmy': 1
         }
     }
     self.id_array_lookup = {
         'Entity1': np.array([[1, 2], [3, 0]], dtype=int),
         'Entity2': np.array([[1]], dtype=int)
     }
     self.token_to_entity_lookup = {
         'Robert': {'Entity1'},
         'Logan': {'Entity1'},
         'Robby': {'Entity1'},
         'Jimmy': {'Entity2'}
     }
     token_indexer = SingleIdTokenIndexer()
     entity_indexer = SingleIdTokenIndexer(namespace='entity_ids')
     text_field = TextField([
         Token(t)
         for t in ['Robby', 'is', 'a', 'nickname', 'for', 'Robert']
     ], {'tokens': token_indexer})
     entity_field = TextField(
         [Token(t) for t in ['Entity1', '', '', '', '', 'Entity1']],
         {'entity_ids': entity_indexer})
     self.instance = Instance({
         'tokens': text_field,
         'entity_identifiers': entity_field
     })
     self.dataset = Batch([self.instance])
     self.vocab = Vocabulary.from_instances(self.dataset)
     self.dataset.index_instances(self.vocab)
     super(AliasDatabaseTest, self).setUp()
Example #16
    def test_invalid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
        original_vocab.add_token_to_namespace("a", namespace="tokens1")
        original_vocab.add_token_to_namespace("b", namespace="tokens1")
        original_vocab.add_token_to_namespace("p", namespace="tokens2")
        original_vocab.save_to_files(vocab_dir)
        text_field1 = TextField([Token(t) for t in ["a"
                                                    "c"]],
                                {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                                {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch(
            [Instance({
                "text1": text_field1,
                "text2": text_field2
            })])

        # Following 2 should give error: tokens1 is non-padded in original_vocab but not in instances
        params = Params({
            "directory_path": vocab_dir,
            "extend": True,
            "non_padded_namespaces": []
        })
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            params = Params({"non_padded_namespaces": []})
            extended_vocab.extend_from_instances(params, instances)

        # Following 2 should not give error: overlapping namespaces have same padding setting
        params = Params({
            "directory_path": vocab_dir,
            "extend": True,
            "non_padded_namespaces": ["tokens1"]
        })
        Vocabulary.from_params(params, instances)
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1"]})
        extended_vocab.extend_from_instances(params, instances)

        # Following 2 should give error: tokens1 is padded in instances but not in original_vocab
        params = Params({
            "directory_path": vocab_dir,
            "extend": True,
            "non_padded_namespaces": ["tokens1", "tokens2"]
        })
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
            extended_vocab.extend_from_instances(params, instances)
Example #17
class DialogQATest(ModelTestCase):
    def setUp(self):
        super().setUp()
        self.set_up_model(self.FIXTURES_ROOT / 'dialog_qa' / 'experiment.json',
                          self.FIXTURES_ROOT / 'data' / 'quac_sample.json')
        self.batch = Batch(self.instances)
        self.batch.index_instances(self.vocab)

    def test_forward_pass_runs_correctly(self):
        training_tensors = self.batch.as_tensor_dict()
        output_dict = self.model(**training_tensors)
        assert "best_span_str" in output_dict and "loss" in output_dict
        assert "followup" in output_dict and "yesno" in output_dict

    def test_model_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(self.param_file,
                                                  tolerance=1e-4)

    def test_batch_predictions_are_consistent(self):
        self.ensure_batch_predictions_are_consistent()
Example #18
 def _create_batches(self, instances: Iterable[Instance],
                     shuffle: bool) -> Iterable[Batch]:
     # First break the dataset into memory-sized lists:
     for instance_list in self._memory_sized_lists(instances):
         if shuffle:
             random.shuffle(instance_list)
         iterator = iter(instance_list)
         # Then break each memory-sized list into batches.
         for batch_instances in lazy_groups_of(iterator,
                                               self._batch_size * 30):
             yield Batch(batch_instances)
Example #19
    def _create_batches(self, instances: Iterable[Instance],
                        shuffle: bool) -> Iterable[Batch]:
        if self.counter is None:
            self.build_counter(instances)

        # First break the dataset into memory-sized lists:
        for instance_list in self._memory_sized_lists(instances):
            if shuffle:
                random.shuffle(instance_list)
            iterator = iter(instance_list)
            excess: Deque[Instance] = deque()
            # Then break each memory-sized list into batches.
            for batch_instances in lazy_groups_of(iterator, self._batch_size):
                batch_instances = self.modify_batch_instances(batch_instances)
                for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(
                        batch_instances, excess):
                    batch = Batch(possibly_smaller_batches)
                    yield batch
            if excess:
                yield Batch(excess)
Example #20
 def _create_batches(self, instances: Iterable[Instance],
                     shuffle: bool) -> Iterable[Batch]:
     """
     As you can see, we don't shuffle our objects here.
     """
     # First break the dataset into memory-sized lists:
     for instance_list in self._memory_sized_lists(instances):
         iterator = iter(instance_list)
         # Then break each memory-sized list into batches.
         for batch_instances in lazy_groups_of(iterator, self._batch_size):
             yield Batch(batch_instances)
Example #21
    def test_max_length(self):
        config = BertConfig(len(self.token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
        sentence = "the " * 1000
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        embedder(tokens["bert"], tokens["bert-offsets"])
Example #22
    def predict_instance(self,
                         instances: Tuple[Instance, Instance],
                         num_samples: int = 100) -> JsonDict:

        conditioning_instance, generative_instance = instances

        self._model.eval()

        with torch.no_grad():
            # TODO: Make this a parameter somewhere
            num_samples = num_samples

            # Duplicate instances (to sample in parallel)
            cuda_device = self._model._get_prediction_device()
            conditioning_batch = Batch([conditioning_instance] * num_samples)
            conditioning_batch.index_instances(self._model.vocab)
            conditioning_batch = util.move_to_device(
                conditioning_batch.as_tensor_dict(), cuda_device)

            generative_batch = Batch([generative_instance] * num_samples)
            generative_batch.index_instances(self._model.vocab)
            generative_batch = util.move_to_device(
                generative_batch.as_tensor_dict(), cuda_device)

            # Sample annotations and generate next token
            self._model._use_shortlist = True
            conditioning_output = self._model.sample(**conditioning_batch,
                                                     emit_tokens=False)
            logger.debug('clears condition generation')
            # self._model(**conditioning_output)  # Shouldn't need to do this, but just in case
            # logger.debug('clears reconditioning')
            generative_output = self._model.sample(**generative_batch,
                                                   emit_tokens=True)
            logger.debug('clears generation')
            del conditioning_batch, generative_batch

            aggregate_word_probs = self._aggregate_word_probs(
                generative_output)
            logger.debug('clears word probs')

            return aggregate_word_probs
Example #23
def predict(model, dataset_reader, test_file, output_file, cuda_device):
    gold_test_data = load_json(test_file)
    instances = dataset_reader.read(test_file)
    batch = Batch(instances)
    batch.index_instances(model.vocab)
    iterator = BatchIterator()
    iterator._batch_size = 5
    # For long documents, loop over batches of sentences. Keep track of the
    # total length and append onto the end of the predictions for each sentence
    # batch.
    assert len(gold_test_data) == 1
    gold_data = gold_test_data[0]
    predictions = {}
    total_length = 0
    for sents in iterator(batch.instances, num_epochs=1, shuffle=False):
        sents = nn_util.move_to_device(sents, cuda_device)  # Put on GPU.
        sentence_lengths = [
            len(entry["sentence"]) for entry in sents["metadata"]
        ]
        sentence_starts = np.cumsum(sentence_lengths) + total_length
        sentence_starts = np.roll(sentence_starts, 1)
        sentence_starts[0] = total_length
        pred = model(**sents)
        decoded = model.decode(pred)
        if total_length == 0:
            for k, v in decoded.items():
                predictions[decode_names[k]] = cleanup(k, v[decode_fields[k]],
                                                       sentence_starts)
        else:
            for k, v in decoded.items():
                predictions[decode_names[k]] += cleanup(
                    k, v[decode_fields[k]], sentence_starts)
        total_length += sum(sentence_lengths)

    res = {}
    res.update(gold_data)
    res.update(predictions)
    check_lengths(res)
    encoded = json.dumps(res, default=int)
    with open(output_file, "w") as f:
        f.write(encoded + "\n")
Example #24
    def forward_on_instances(
            self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
        """
        Takes a list of  :class:`~allennlp.data.instance.Instance`s, converts that text into
        arrays using this model's :class:`Vocabulary`, passes those arrays through
        :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any
        ``torch.Tensors`` into numpy arrays and separate the
        batched output into a list of individual dicts per instance. Note that typically
        this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
        :func:`forward_on_instance`.

        Parameters
        ----------
        instances : List[Instance], required
            The instances to run the model on.

        Returns
        -------
        A list of the model's output for each instance.
        """
        with torch.no_grad():
            cuda_device = self._get_prediction_device()
            dataset = Batch(instances)
            dataset.index_instances(self.vocab)
            model_input = dataset.as_tensor_dict(cuda_device=cuda_device)
            outputs = self.decode(self(**model_input))

            instance_separated_output: List[Dict[str, numpy.ndarray]] = [
                {} for _ in dataset.instances
            ]
            for name, output in list(outputs.items()):
                if isinstance(output, torch.Tensor):
                    output = output.detach().cpu().numpy()
                outputs[name] = output
                for instance_output, batch_element in zip(
                        instance_separated_output, output):
                    instance_output[name] = batch_element
            return instance_separated_output
Example #25
 def _create_batches(self, instances, shuffle):
     # First break the dataset into memory-sized lists:
     for instance_list in self._memory_sized_lists(instances):
         if shuffle:
             random.shuffle(instance_list)
         iterator = iter(instance_list)
         # Then break each memory-sized list into batches.
         for batch_instances in lazy_groups_of(iterator, self._batch_size):
             for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(
                     batch_instances):
                 batch = Batch(possibly_smaller_batches)
                 yield batch
Example #26
def prepare_batch(tokens_batch, vocab, indexers, args):
    """ Do preprocessing for batch """
    instance_ls = []
    token_ls = []
    for tokens in tokens_batch:
        field = sentence_to_text_field(tokens, indexers)
        field.index(vocab)
        instance_ls.append(Instance({"input1": field}))
        token_ls.append(tokens)
    batch = Batch(instance_ls).as_tensor_dict()
    batch = move_to_device(batch, args.cuda)
    return batch, token_ls
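
A hypothetical call to the helper above (names are illustrative: it assumes a jiant-style setup where `vocab`, `indexers`, and an `args` namespace with a `cuda` attribute already exist):

    tokens_batch = [["the", "quick", "brown", "fox"], ["a", "lazy", "dog"]]
    tensor_batch, token_ls = prepare_batch(tokens_batch, vocab, indexers, args)
    # tensor_batch is a tensor dict keyed by "input1", already moved to args.cuda.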
Example #27
def read_squad_word_char(file_path):
    token_indexers = {
        "tokens": SingleIdTokenIndexer(namespace="token_ids"),
        "chars": TokenCharactersIndexer(namespace="token_chars")
    }
    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    vocab = Vocabulary.from_instances(instances)
    word2idx = vocab.get_index_to_token_vocabulary("token_ids")
    char2idx = vocab.get_index_to_token_vocabulary("token_chars")
    #print (word2idx)
    print(len(word2idx))
    print(len(char2idx))
    print(char2idx)
    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print(padding_lengths)
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    print(tensor_dict['passage']['tokens'].shape)
    print(tensor_dict['passage']['chars'].shape)
    print(tensor_dict['question']['tokens'].shape)
    print(tensor_dict['question']['chars'].shape)
    print(tensor_dict['span_start'].shape)
    print(tensor_dict['span_end'].shape)
Example #28
    def remove_tokens(self, attentions, metadata, threshold, labels):
        attentions_cpu = attentions.cpu().data.numpy()
        sentences = [x["tokens"] for x in metadata]
        instances = []
        for b in range(attentions_cpu.shape[0]):
            sentence = [x for x in sentences[b]]
            always_keep_mask = metadata[b]['always_keep_mask']
            attn = attentions_cpu[b][:len(sentence)] + always_keep_mask * -10000
            max_length = math.ceil((1 - always_keep_mask).sum() * threshold)

            top_ind = np.argsort(attn)[:-max_length]
            new_tokens = [
                x for i, x in enumerate(sentence)
                if i in top_ind or always_keep_mask[i] == 1
            ]
            instances += metadata[0]["convert_tokens_to_instance"](new_tokens,
                                                                   None)

        batch = Batch(instances)
        batch.index_instances(self._vocabulary)
        padding_lengths = batch.get_padding_lengths()

        batch = batch.as_tensor_dict(padding_lengths)
        return {
            k: v.to(attentions.device)
            for k, v in batch["document"].items()
        }
Example #29
    def test_embeddings(self, transformer_name, gold_offsets: torch.LongTensor, use_starting_offsets):
        self.token_indexer = TransformerIndexer(model_name=transformer_name, do_lowercase=False,
                                                use_starting_offsets=use_starting_offsets)
        self.transformer_embedder = TransformerEmbedder(model_name=transformer_name, trainable=False)

        sent0 = "the quickest quick brown fox jumped over the lazy dog"
        sent1 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens0 = sent0.split()
        tokens1 = sent1.split()
        tokens0 = [Token(token) for token in tokens0]
        tokens1 = [Token(token) for token in tokens1]
        vocab = Vocabulary()

        instance0 = Instance({"tokens": TextField(tokens0, {"transformer": self.token_indexer})})
        instance1 = Instance({"tokens": TextField(tokens1, {"transformer": self.token_indexer})})

        batch = Batch([instance0, instance1])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        input_ids = tokens['transformer']
        offsets = tokens['transformer-offsets']
        transformer_mask = tokens['transformer-mask']

        test_select_embeddings = self.transformer_embedder(input_ids, offsets, transformer_mask)
        transformer_vectors = self.transformer_embedder(token_ids=input_ids, mask=transformer_mask)
        gold_select_embeddings = get_select_embedding(transformer_vectors, gold_offsets)
        assert gold_select_embeddings.equal(test_select_embeddings)
Example #30
    def test_encode_decode_with_raw_text_base(self, transformer_name):
        token_indexer = TransformerIndexer(model_name=transformer_name,
                                           do_lowercase=False)
        sent0 = "the quickest quick brown fox jumped over the lazy dog"
        sent1 = "the quick brown fox jumped over the laziest lazy elmo"

        vocab = Vocabulary()

        instance1 = Instance({
            "tokens":
            TextField([Token(sent0)], {"transformer": token_indexer})
        })
        instance2 = Instance({
            "tokens":
            TextField([Token(sent1)], {"transformer": token_indexer})
        })

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        input_ids = tokens['transformer']
        input_ids_0 = [id.item() for id in input_ids[0]]
        input_ids_1 = [id.item() for id in input_ids[1]]
        # The original sentence should be recoverable by decoding the indexed ids
        assert sent0 == token_indexer.tokenizer.decode(
            input_ids_0, skip_special_tokens=True)
        assert sent1 == token_indexer.tokenizer.decode(
            input_ids_1, skip_special_tokens=True)
Example #31
def instances_to_batch(instances, model, for_training, cuda_device=0):
    batch = Batch(instances)
    batch.index_instances(model.vocab)
    padding_lengths = batch.get_padding_lengths()
    return batch.as_tensor_dict(padding_lengths,
                                cuda_device=cuda_device,
                                for_training=for_training)
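
A minimal sketch of calling the helper above (assumes an older AllenNLP release whose as_tensor_dict still accepts cuda_device and for_training, and a `model` whose vocabulary matches `instances`):

    tensor_dict = instances_to_batch(instances, model, for_training=False, cuda_device=-1)
    output_dict = model(**tensor_dict)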
Example #32
    def forward_on_instance(self, instance: SyncedFieldsInstance) -> str:
        """
        Takes an :class:`~allennlp.data.instance.Instance`, which typically has raw text in it,
        converts that text into arrays using this model's :class:`Vocabulary`, runs beam search
        decoding via :func:`self.beam_search_decode()`, and returns the decoded output string
        (with the START symbol stripped and any trailing END symbol removed).
        """
        cuda_device = self._get_prediction_device()
        dataset = Batch([instance])
        dataset.index_instances(self.vocab)

        gt_has_oov = False
        dataset_tensor_dict = dataset.as_tensor_dict()
        if self.OOV_ID in dataset_tensor_dict["target_tokens"]["ids_with_unks"]:
            gt_has_oov = True

        model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
        output_ids = self.beam_search_decode(**model_input)

        output_words = []
        for _id in output_ids:
            if _id<self.vocab_size:
                output_words.append(self.vocab.get_token_from_index(_id))
            else:
                output_words.append(instance.oov_list[_id-self.vocab_size])

        assert output_words[0]==START_SYMBOL, "somehow the first symbol is not the START symbol. might be a bug"
        output_words=output_words[1:]
        if output_words[-1]==END_SYMBOL:
            output_words = output_words[:-1]
        return " ".join(output_words)
Example #33
    def test_offsets_with_tokenized_text_base(self, transformer_name):
        token_indexer = TransformerIndexer(model_name=transformer_name,
                                           do_lowercase=False)
        sent0 = "the quickest quick brown fox jumped over the lazy dog"
        sent1 = "the quick brown fox jumped over the laziest lazy elmo"

        sent0 = sent0.split()
        sent1 = sent1.split()

        tokens0 = [Token(token) for token in sent0]
        tokens1 = [Token(token) for token in sent1]

        vocab = Vocabulary()

        instance1 = Instance(
            {"tokens": TextField(tokens0, {"transformer": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens1, {"transformer": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        # Each token should contribute exactly one sub-word position as its representative feature
        assert len(tokens['transformer-offsets'][0]) == len(tokens0)
        assert len(tokens['transformer-offsets'][1]) == len(tokens1)
Example #34
    def test_padding_for_equal_length_indices(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3     5     6   8      9    2   14   12
        sentence = "the quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"].tolist() == [
                [16, 2, 3, 5, 6, 8, 9, 2, 14, 12, 17]
        ]

        assert tokens["bert-offsets"].tolist() == [
                [1, 2, 3, 4, 5, 6, 7, 8, 9]
        ]
Example #35
    def forward_on_instances(
            self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
        """
        Takes a list of  :class:`~allennlp.data.instance.Instance`s, converts that text into
        arrays using this model's :class:`Vocabulary`, passes those arrays through
        :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any
        ``torch.Tensors`` into numpy arrays and separate the
        batched output into a list of individual dicts per instance. Note that typically
        this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
        :func:`forward_on_instance`.

        Parameters
        ----------
        instances : List[Instance], required
            The instances to run the model on.

        Returns
        -------
        A list of the model's output for each instance.
        """
        with torch.no_grad():
            cuda_device = self._get_prediction_device()
            dataset = Batch(instances)
            dataset.index_instances(self.vocab)
            model_input = dataset.as_tensor_dict(cuda_device=cuda_device)
            outputs = self.decode(self(**model_input))

            instance_separated_output = []

            metadata = [
                x.fields["metadata"].metadata for x in dataset.instances
            ]
            for res in export_output_data_arc_multi_choice_json(
                    metadata, outputs):
                instance_separated_output.append(res)

            return instance_separated_output
Example #36
def read_squad_allennlp(file_path):
    '''read data, build vocab, batch, padding, to idx
    Args:
        file_path -- raw squad json file
    Returns:
        None
    '''
    token_indexers = {
            "tokens": SingleIdTokenIndexer(namespace="token_ids"),
            "chars": TokenCharactersIndexer(namespace="token_chars")}
    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    for instance in instances:
        question = instance.fields['question']
        print(question)
        print(type(question))
        break
    vocab = Vocabulary.from_instances(instances)
    word2idx = vocab.get_index_to_token_vocabulary("token_ids")
    char2idx = vocab.get_index_to_token_vocabulary("token_chars")
    #print (word2idx)
    print(len(word2idx))
    print(len(char2idx))
    print(char2idx)
    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print(padding_lengths)
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    print(tensor_dict['passage']['tokens'].shape)
    print(tensor_dict['passage']['chars'].shape)
    print(tensor_dict['question']['tokens'].shape)
    print(tensor_dict['question']['chars'].shape)
    print(tensor_dict['span_start'].shape)
    print(tensor_dict['span_end'].shape)
Example #37
def predict(instances: List[Instance]) -> List[float]:
    """Output BERT NSP next sentence probability for a list of instances.

    Parameters
    ----------
    instances : List[Instance]

    Returns
    -------
    List[float]
        BERT NSP scores in range [0, 1].
    """
    scores = []
    for batch_instance in tqdm(batch(instances, batch_size=args.batch_size),
                               total=math.ceil(
                                   len(instances) / args.batch_size),
                               desc='Predicting'):
        batch_ins = Batch(batch_instance)
        batch_ins.index_instances(VOCAB)
        tensor_dict = batch_ins.as_tensor_dict(batch_ins.get_padding_lengths())
        tokens = tensor_dict["tokens"]
        input_ids = tokens['bert'].to(torch.device(f'cuda:{GPU_ID}'))
        token_type_ids = tokens['bert-type-ids'].to(
            torch.device(f'cuda:{GPU_ID}'))
        input_mask = (input_ids != 0).long()
        cls_out = BERT_NEXT_SENTENCE.forward(input_ids=input_ids,
                                             token_type_ids=token_type_ids,
                                             attention_mask=input_mask)
        probs = F.softmax(cls_out, dim=-1)
        next_sentence_score = probs[:, 0].detach().cpu().numpy().tolist()
        scores += next_sentence_score

    return scores
Example #38
    def test_read(self, lazy):
        reader = GLUESST2DatasetReader(
            tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
            token_indexers={'bert': PretrainedBertIndexer(
                pretrained_model=self.BERT_VOCAB_PATH)},
            skip_label_indexing=False
        )
        instances = reader.read(
            str(self.FIXTURES_ROOT / 'dev.tsv'))
        instances = ensure_list(instances)
        example = instances[0]
        tokens = [t.text for t in example.fields['tokens']]
        label = example.fields['label'].label
        print(label)
        print(tokens)
        batch = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        batch.index_instances(vocab)
        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        print(tokens['mask'].tolist()[0])
        print(tokens["bert"].tolist()[0])
        print([vocab.get_token_from_index(i, "bert")
               for i in tokens["bert"].tolist()[0]])
        print(len(tokens['bert'][0]))
        print(tokens["bert-offsets"].tolist()[0])
        print(tokens['bert-type-ids'].tolist()[0])
Example #39
def predict(archive_file, test_file, output_file, cuda_device, score_dir):
    import_submodules("dygie")
    gold_test_data = load_json(test_file)
    archive = load_archive(archive_file, cuda_device)
    model = archive.model
    model.eval()
    config = archive.config.duplicate()
    dataset_reader_params = config["dataset_reader"]
    dataset_reader = DatasetReader.from_params(dataset_reader_params)
    instances = dataset_reader.read(test_file)
    batch = Batch(instances)
    batch.index_instances(model.vocab)
    iterator = DocumentIterator()
    with open(output_file, "w") as f:
        for doc, gold_data in zip(
                iterator(batch.instances, num_epochs=1, shuffle=False),
                gold_test_data):
            doc = nn_util.move_to_device(doc, cuda_device)  # Put on GPU.
            sentence_lengths = [
                len(entry["sentence"]) for entry in doc["metadata"]
            ]
            sentence_starts = np.cumsum(sentence_lengths)
            sentence_starts = np.roll(sentence_starts, 1)
            sentence_starts[0] = 0
            pred = model(**doc)
            if score_dir is not None:
                dump_scores(doc, pred, score_dir)
            decoded = model.decode(pred)
            predictions = {}
            for k, v in decoded.items():
                predictions[decode_names[k]] = cleanup(k, v[decode_fields[k]],
                                                       sentence_starts)
            res = {}
            res.update(gold_data)
            res.update(predictions)
            if "dataset" in res:
                del res["dataset"]
            check_lengths(res)
            encoded = json.dumps(res, default=int)
            f.write(encoded + "\n")
Example #40
    def _create_batches(self, instances: Iterable[Instance],
                        shuffle: bool) -> Iterable[Batch]:
        for instance_list in self._memory_sized_lists(instances):
            instance_list = [
                ins for ins in instance_list
                if random.random() < ins['metadata'].metadata['keep_prob']
            ]

            if len(self._sorting_keys) == 0:
                instance_list = sort_by_padding(instance_list,
                                                self._sorting_keys, self.vocab,
                                                self._padding_noise)

            batches = []
            excess: Deque[Instance] = deque()
            for batch_instances in lazy_groups_of(iter(instance_list),
                                                  self._batch_size):
                for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(
                        batch_instances, excess):
                    batches.append(Batch(possibly_smaller_batches))
            if excess:
                batches.append(Batch(excess))

            # TODO(brendanr): Add multi-GPU friendly grouping, i.e. group
            # num_gpu batches together, shuffle and then expand the groups.
            # This guards against imbalanced batches across GPUs.
            move_to_front = self._biggest_batch_first and len(batches) > 1
            if move_to_front:
                # We'll actually pop the last _two_ batches, because the last one might not be full.
                last_batch = batches.pop()
                penultimate_batch = batches.pop()
            if shuffle:
                # NOTE: if shuffle is false, the data will still be in a different order
                # because of the bucket sorting.
                random.shuffle(batches)
            if move_to_front:
                batches.insert(0, penultimate_batch)
                batches.insert(0, last_batch)

            yield from batches
Example #41
 def ensure_batch_predictions_are_consistent(self):
     self.model.eval()
     single_predictions = []
     for i, instance in enumerate(self.instances):
         dataset = Batch([instance])
         tensors = dataset.as_tensor_dict(dataset.get_padding_lengths(), for_training=False)
         result = self.model(**tensors)
         single_predictions.append(result)
     full_dataset = Batch(self.instances)
     batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths(), for_training=False)
     batch_predictions = self.model(**batch_tensors)
     for i, instance_predictions in enumerate(single_predictions):
         for key, single_predicted in instance_predictions.items():
             tolerance = 1e-6
             if key == 'loss':
                 # Loss is particularly unstable; we'll just be satisfied if everything else is
                 # close.
                 continue
             single_predicted = single_predicted[0]
             batch_predicted = batch_predictions[key][i]
             if isinstance(single_predicted, torch.autograd.Variable):
                 if single_predicted.size() != batch_predicted.size():
                     slices = tuple(slice(0, size) for size in single_predicted.size())
                     batch_predicted = batch_predicted[slices]
                 assert_allclose(single_predicted.data.numpy(),
                                 batch_predicted.data.numpy(),
                                 atol=tolerance,
                                 err_msg=key)
             else:
                 assert single_predicted == batch_predicted, key
Example #42
    def test_sliding_window_with_batch(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path), truncate_long_sequences=False, max_pieces=8)

        config_path = self.FIXTURES_ROOT / 'bert' / 'config.json'
        config = BertConfig(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens + tokens + tokens, {"bert": token_indexer})})

        batch = Batch([instance, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        bert_vectors = token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert bert_vectors is not None
Exemplo n.º 43
0
    def _regenerate_tokens(self, metadata, sample_z):
        sample_z_cpu = sample_z.cpu().data.numpy()
        tokens = [m["tokens"] for m in metadata]

        assert len(tokens) == len(sample_z_cpu)
        assert max([len(x) for x in tokens]) == sample_z_cpu.shape[1]

        instances = []
        new_tokens = []
        for words, mask, meta in zip(tokens, sample_z_cpu, metadata):
            mask = mask[:len(words)]
            new_words = [
                w for i, (w, m) in enumerate(zip(words, mask))
                if i == 0 or m == 1
            ]

            new_tokens.append(new_words)
            meta["new_tokens"] = new_tokens
            instance = metadata[0]["convert_tokens_to_instance"](new_words,
                                                                 None)
            instances += instance

        batch = Batch(instances)
        batch.index_instances(self._vocabulary)
        padding_lengths = batch.get_padding_lengths()

        batch = batch.as_tensor_dict(padding_lengths)
        return {k: v.to(sample_z.device) for k, v in batch["document"].items()}
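The mask-based filtering in _regenerate_tokens above keeps a word only when its sampled z value is 1 (the first token is always kept). A stand-alone sketch of that single step, with made-up tokens and mask values, is below.

words = ["[CLS]", "the", "movie", "was", "wonderful", "."]
mask = [0, 0, 1, 0, 1, 1]  # hypothetical sample_z values, already trimmed to len(words)

# Keep the first token unconditionally and every token whose mask value is 1.
new_words = [w for i, (w, m) in enumerate(zip(words, mask)) if i == 0 or m == 1]
print(new_words)  # ['[CLS]', 'movie', 'wonderful', '.']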
Exemplo n.º 44
0
    def _create_batches(self, instances: Iterable[Instance],
                        shuffle: bool) -> Iterable[Batch]:

        instances = ensure_list(instances)
        instances_len = len(instances)

        num_batches = math.floor(instances_len / self._batch_size)

        # want all batches to be the same size
        stop = instances_len - instances_len % self._batch_size

        for batch_ind in range(num_batches):
            yield Batch(instances[batch_ind:stop:num_batches])
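The slicing in _create_batches is a little unusual: it strides through the truncated instance list rather than taking contiguous chunks, which guarantees every yielded batch has exactly batch_size elements. A toy sketch with plain integers standing in for instances:

import math

instances = list(range(10))   # pretend these are Instance objects
batch_size = 3

num_batches = math.floor(len(instances) / batch_size)   # 3
stop = len(instances) - len(instances) % batch_size      # 9, drops the leftover instance

for batch_ind in range(num_batches):
    print(instances[batch_ind:stop:num_batches])
# [0, 3, 6]
# [1, 4, 7]
# [2, 5, 8]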
Exemplo n.º 45
0
    def test_forward_pass_runs_correctly(self):
        batch = Batch(self.instances)
        batch.index_instances(self.vocab)
        training_tensors = batch.as_tensor_dict()
        output_dict = self.model(**training_tensors)

        metrics = self.model.get_metrics(reset=True)
        # We've set up the data such that there's a fake answer that consists of the whole
        # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
        # zero, while if we somehow haven't been able to load the evaluation data, or there was an
        # error with using the evaluation script, this will fail.  This makes sure that we've
        # loaded the evaluation data correctly and have hooked things up to the official evaluation
        # script.
        assert metrics['f1'] > 0

        span_start_probs = output_dict['span_start_probs'][0].data.numpy()
        span_end_probs = output_dict['span_end_probs'][0].data.numpy()
        assert_almost_equal(numpy.sum(span_start_probs, -1), 1, decimal=6)
        assert_almost_equal(numpy.sum(span_end_probs, -1), 1, decimal=6)
        span_start, span_end = tuple(output_dict['best_span'][0].data.numpy())
        assert span_start >= 0
        assert span_start <= span_end
        assert span_end < self.instances[0].fields['passage'].sequence_length()
        assert isinstance(output_dict['best_span_str'][0], str)
Exemplo n.º 46
0
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        instances = reader.read(dataset_file)
        vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(self.vocab, params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)
Exemplo n.º 47
0
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        instances = reader.read(dataset_file)
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if 'vocabulary' in params:
            vocab_params = params['vocabulary']
            vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(vocab=self.vocab, params=params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)
Exemplo n.º 48
0
    def ensure_batch_predictions_are_consistent(
            self,
            keys_to_ignore: Iterable[str] = ()):
        """
        Ensures that the model performs the same on a batch of instances as on individual instances.
        Ignores metrics matching the regexp .*loss.* and those specified explicitly.

        Parameters
        ----------
        keys_to_ignore : ``Iterable[str]``, optional (default=())
            Names of metrics that should not be taken into account, e.g. "batch_weight".
        """
        self.model.eval()
        single_predictions = []
        for i, instance in enumerate(self.instances):
            dataset = Batch([instance])
            tensors = dataset.as_tensor_dict(dataset.get_padding_lengths())
            result = self.model(**tensors)
            single_predictions.append(result)
        full_dataset = Batch(self.instances)
        batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths())
        batch_predictions = self.model(**batch_tensors)
        for i, instance_predictions in enumerate(single_predictions):
            for key, single_predicted in instance_predictions.items():
                tolerance = 1e-6
                if 'loss' in key:
                    # Loss is particularly unstable; we'll just be satisfied if everything else is
                    # close.
                    continue
                if key in keys_to_ignore:
                    continue
                single_predicted = single_predicted[0]
                batch_predicted = batch_predictions[key][i]
                if isinstance(single_predicted, torch.Tensor):
                    if single_predicted.size() != batch_predicted.size():
                        slices = tuple(slice(0, size) for size in single_predicted.size())
                        batch_predicted = batch_predicted[slices]
                    assert_allclose(single_predicted.data.numpy(),
                                    batch_predicted.data.numpy(),
                                    atol=tolerance,
                                    err_msg=key)
                else:
                    assert single_predicted == batch_predicted, key
Exemplo n.º 49
0
class ModelTestCase(AllenNlpTestCase):
    """
    A subclass of :class:`~allennlp.common.testing.test_case.AllenNlpTestCase`
    with added methods for testing :class:`~allennlp.models.model.Model` subclasses.
    """
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
        instances = list(reader.read(dataset_file))
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if 'vocabulary' in params:
            vocab_params = params['vocabulary']
            vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(vocab=self.vocab, params=params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)

    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1,
                                             gradients_to_ignore: Set[str] = None,
                                             overrides: str = ""):
        """
        Parameters
        ----------
        param_file : ``str``
            Path to a training configuration file that we will use to train the model for this
            test.
        tolerance : ``float``, optional (default=1e-4)
            When comparing model predictions between the originally-trained model and the model
            after saving and loading, we will use this tolerance value (passed as ``rtol`` to
            ``numpy.testing.assert_allclose``).
        cuda_device : ``int``, optional (default=-1)
            The device to run the test on.
        gradients_to_ignore : ``Set[str]``, optional (default=None)
            This test runs a gradient check to make sure that we're actually computing gradients
            for all of the parameters in the model.  If you really want to ignore certain
            parameters when doing that check, you can pass their names here.  This is not
            recommended unless you're `really` sure you don't need to have non-zero gradients for
            those parameters (e.g., some of the beam search / state machine models have
            infrequently-used parameters that are hard to force the model to use in a small test).
        overrides : ``str``, optional (default = "")
            A JSON string that we will use to override values in the input parameter file.
        """
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        model = train_model_from_file(param_file, save_dir, overrides=overrides)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False))

        # Check that gradients are None for non-trainable parameters and that
        # trainable parameters receive some non-zero gradient.
        self.check_model_computes_gradients_correctly(model, model_batch, gradients_to_ignore)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model

    def assert_fields_equal(self, field1, field2, name: str, tolerance: float = 1e-6) -> None:
        if isinstance(field1, torch.Tensor):
            assert_allclose(field1.detach().cpu().numpy(),
                            field2.detach().cpu().numpy(),
                            rtol=tolerance,
                            err_msg=name)
        elif isinstance(field1, dict):
            assert field1.keys() == field2.keys()
            for key in field1:
                self.assert_fields_equal(field1[key],
                                         field2[key],
                                         tolerance=tolerance,
                                         name=name + '.' + str(key))
        elif isinstance(field1, (list, tuple)):
            assert len(field1) == len(field2)
            for i, (subfield1, subfield2) in enumerate(zip(field1, field2)):
                self.assert_fields_equal(subfield1,
                                         subfield2,
                                         tolerance=tolerance,
                                         name=name + f"[{i}]")
        elif isinstance(field1, (float, int)):
            assert_allclose([field1], [field2], rtol=tolerance, err_msg=name)
        else:
            if field1 != field2:
                for key in field1.__dict__:
                    print(key, getattr(field1, key) == getattr(field2, key))
            assert field1 == field2, f"{name}, {type(field1)}, {type(field2)}"

    @staticmethod
    def check_model_computes_gradients_correctly(model: Model,
                                                 model_batch: Dict[str, Union[Any, Dict[str, Any]]],
                                                 params_to_ignore: Set[str] = None):
        print("Checking gradients")
        model.zero_grad()
        result = model(**model_batch)
        result["loss"].backward()
        has_zero_or_none_grads = {}
        for name, parameter in model.named_parameters():
            zeros = torch.zeros(parameter.size())
            if params_to_ignore and name in params_to_ignore:
                continue
            if parameter.requires_grad:

                if parameter.grad is None:
                    has_zero_or_none_grads[name] = "No gradient computed (i.e parameter.grad is None)"

                elif parameter.grad.is_sparse or parameter.grad.data.is_sparse:
                    pass

                # Some parameters will only be partially updated,
                # like embeddings, so we just check that any gradient is non-zero.
                elif (parameter.grad.cpu() == zeros).all():
                    has_zero_or_none_grads[name] = f"zeros with shape ({tuple(parameter.grad.size())})"
            else:
                assert parameter.grad is None

        if has_zero_or_none_grads:
            for name, grad in has_zero_or_none_grads.items():
                print(f"Parameter: {name} had incorrect gradient: {grad}")
            raise Exception("Incorrect gradients found. See stdout for more info.")

    def ensure_batch_predictions_are_consistent(self):
        self.model.eval()
        single_predictions = []
        for i, instance in enumerate(self.instances):
            dataset = Batch([instance])
            tensors = dataset.as_tensor_dict(dataset.get_padding_lengths())
            result = self.model(**tensors)
            single_predictions.append(result)
        full_dataset = Batch(self.instances)
        batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths())
        batch_predictions = self.model(**batch_tensors)
        for i, instance_predictions in enumerate(single_predictions):
            for key, single_predicted in instance_predictions.items():
                tolerance = 1e-6
                if 'loss' in key:
                    # Loss is particularly unstable; we'll just be satisfied if everything else is
                    # close.
                    continue
                single_predicted = single_predicted[0]
                batch_predicted = batch_predictions[key][i]
                if isinstance(single_predicted, torch.Tensor):
                    if single_predicted.size() != batch_predicted.size():
                        slices = tuple(slice(0, size) for size in single_predicted.size())
                        batch_predicted = batch_predicted[slices]
                    assert_allclose(single_predicted.data.numpy(),
                                    batch_predicted.data.numpy(),
                                    atol=tolerance,
                                    err_msg=key)
                else:
                    assert single_predicted == batch_predicted, key
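A concrete test would typically subclass ModelTestCase, point set_up_model at small fixture files, and then call the helpers above. The sketch below uses hypothetical fixture paths and class names purely for illustration; they are not part of the original code.

class MySimpleTaggerTest(ModelTestCase):
    def setUp(self):
        super().setUp()
        # Hypothetical fixture paths; any small config + dataset pair works here.
        self.set_up_model('fixtures/simple_tagger/experiment.json',
                          'fixtures/simple_tagger/sentences.conll')

    def test_model_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(self.param_file)

    def test_batch_predictions_are_consistent(self):
        self.ensure_batch_predictions_are_consistent()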
Exemplo n.º 50
0
    if create_video_training:
        pf.create_image_weights_epoch(model, video_fotograms_folder2, i)
        pf.create_Bayesian_analysis_charts_simplified(model, train_dataset, validation_dataset,
                                                      tr_data_loss, val_data_loss, KL_loss,
                                                      video_fotograms_folder4, i+1)

#            output = model(tensor_dict["text_field"], tensor_dict["tags_field"])
#            loss = output["loss"]  # We can get the loss because we gave the labels as input,
#                                   # so gradients and everything come for free.

"""
############## Use the trained model ######################
We use an already implemented predictor that takes the model and how to preprocess the data
"""

name_exmaple = "Eat my motherfucking jeans"
name_exmaple = "Carlos Sanchez"
tokens_list = [name_exmaple[i] for i in range(len(name_exmaple))]
Instance_test = reader.generate_instance(tokens_list,None)
batch = Batch([Instance_test])
batch.index_instances(vocab)

padding_lengths = batch.get_padding_lengths()
tensor_dict = batch.as_tensor_dict(padding_lengths)

model.eval()
tag_logits = model(tensor_dict["text_field"])['tag_logits'].detach().cpu().numpy()
tag_ids = np.argmax(tag_logits, axis=-1)
print([model.vocab.get_token_from_index(i, 'tags_country') for i in tag_ids])
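Since the block above repeats the same tokenize / batch / index / tensorize steps for every input, it can be wrapped in a small helper. This is only a sketch that reuses the reader, vocab and model objects created earlier in this script; it is not an official API.

def predict_tags(text):
    # Split the text into characters, exactly as above.
    tokens_list = [text[i] for i in range(len(text))]
    instance = reader.generate_instance(tokens_list, None)

    batch = Batch([instance])
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())

    model.eval()
    tag_logits = model(tensor_dict["text_field"])['tag_logits'].detach().cpu().numpy()
    tag_ids = np.argmax(tag_logits, axis=-1)
    return [model.vocab.get_token_from_index(i, 'tags_country') for i in tag_ids]

print(predict_tags("Carlos Sanchez"))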
Exemplo n.º 51
0
    
    return field


# Create one instance per sentence, each carrying an ELMo text field.
instances = []
for sentence in sentences:
    ## Tokenize the sentence and wrap the tokens in an ELMo-indexed TextField.
    field = get_ELMO_text_field(sentence, indexer, tokenizer)
    instance = Instance({"elmo": field})
    print("Fields in instance: ", instance.fields)
    instances.append(instance)


### Create a batch from the instances
dataset = Batch(instances)

## Create an empty vocabulary. We do not need to build one from the dataset:
# the ELMo character indexer handles the indexing on its own.
vocab = Vocabulary()

## Index the instances in the batch; ELMo will use these indices later.
dataset.index_instances(vocab)

"""
IMPORTANT: The ELMO uses just a character vocab in the interface.
It will compute the rest internally!

The ELMO words are padded to length 50 !
"""
Exemplo n.º 52
0
class ModelTestCase(AllenNlpTestCase):
    """
    A subclass of :class:`~allennlp.common.testing.test_case.AllenNlpTestCase`
    with added methods for testing :class:`~allennlp.models.model.Model` subclasses.
    """
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        instances = reader.read(dataset_file)
        vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(self.vocab, params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)

    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1):
        save_dir = os.path.join(self.TEST_DIR, "save_and_load_test")
        archive_file = os.path.join(save_dir, "model.tar.gz")
        model = train_model_from_file(param_file, save_dir)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False, cuda_device=cuda_device))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False, cuda_device=cuda_device))

        # Check that gradients are None for non-trainable parameters and that
        # trainable parameters receive some non-zero gradient.
        self.check_model_computes_gradients_correctly(model, model_batch)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model

    def assert_fields_equal(self, field1, field2, name: str, tolerance: float = 1e-6) -> None:
        if isinstance(field1, torch.autograd.Variable):
            assert_allclose(field1.data.cpu().numpy(),
                            field2.data.cpu().numpy(),
                            rtol=tolerance,
                            err_msg=name)
        elif isinstance(field1, dict):
            assert field1.keys() == field2.keys()
            for key in field1:
                self.assert_fields_equal(field1[key],
                                         field2[key],
                                         tolerance=tolerance,
                                         name=name + '.' + key)
        elif isinstance(field1, (list, tuple)):
            assert len(field1) == len(field2)
            for i, (subfield1, subfield2) in enumerate(zip(field1, field2)):
                self.assert_fields_equal(subfield1,
                                         subfield2,
                                         tolerance=tolerance,
                                         name=name + f"[{i}]")
        else:
            assert field1 == field2

    @staticmethod
    def check_model_computes_gradients_correctly(model, model_batch):
        model.zero_grad()
        result = model(**model_batch)
        result["loss"].backward()
        has_zero_or_none_grads = {}
        for name, parameter in model.named_parameters():
            zeros = torch.zeros(parameter.size())
            if parameter.requires_grad:

                if parameter.grad is None:
                    has_zero_or_none_grads[name] = "No gradient computed (i.e. parameter.grad is None)"
                # Some parameters will only be partially updated,
                # like embeddings, so we just check that any gradient is non-zero.
                elif (parameter.grad.data.cpu() == zeros).all():
                    has_zero_or_none_grads[name] = f"zeros with shape ({tuple(parameter.grad.size())})"
            else:
                assert parameter.grad is None

        if has_zero_or_none_grads:
            for name, grad in has_zero_or_none_grads.items():
                print(f"Parameter: {name} had incorrect gradient: {grad}")
            raise Exception("Incorrect gradients found. See stdout for more info.")

    def ensure_batch_predictions_are_consistent(self):
        self.model.eval()
        single_predictions = []
        for i, instance in enumerate(self.instances):
            dataset = Batch([instance])
            tensors = dataset.as_tensor_dict(dataset.get_padding_lengths(), for_training=False)
            result = self.model(**tensors)
            single_predictions.append(result)
        full_dataset = Batch(self.instances)
        batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths(), for_training=False)
        batch_predictions = self.model(**batch_tensors)
        for i, instance_predictions in enumerate(single_predictions):
            for key, single_predicted in instance_predictions.items():
                tolerance = 1e-6
                if key == 'loss':
                    # Loss is particularly unstable; we'll just be satisfied if everything else is
                    # close.
                    continue
                single_predicted = single_predicted[0]
                batch_predicted = batch_predictions[key][i]
                if isinstance(single_predicted, torch.autograd.Variable):
                    if single_predicted.size() != batch_predicted.size():
                        slices = tuple(slice(0, size) for size in single_predicted.size())
                        batch_predicted = batch_predicted[slices]
                    assert_allclose(single_predicted.data.numpy(),
                                    batch_predicted.data.numpy(),
                                    atol=tolerance,
                                    err_msg=key)
                else:
                    assert single_predicted == batch_predicted, key
Exemplo n.º 53
0
    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {"text1": {"num_tokens": 5, "tokens_length": 5},
                                   "text2": {"num_tokens": 6, "tokens_length": 6}}
Exemplo n.º 54
0
    question_text = "What kind of test succeeded on its first attempt?"
    char_spans = [(6, 10)]
    instance = squad_reader.text_to_instance(question_text,
                                             passage_text,
                                             char_spans=char_spans)

    print ("Keys instance: ", instance.fields.keys())

    # Batch instances and convert to indices using the vocabulary.
    instances = [instance]
else:

    instances = [train_dataset[0],train_dataset[1]]

## Create the batch ready to be used
dataset = Batch(instances)
dataset.index_instances(vocab)

print ("-------------- DATASET EXAMPLE ---------------")
character_ids_passage = dataset.as_tensor_dict()['passage']['character_ids']
character_ids_question = dataset.as_tensor_dict()['question']['character_ids']

question =  dataset.as_tensor_dict()['question']
passage =  dataset.as_tensor_dict()['passage']
span_start =  dataset.as_tensor_dict()['span_start']
span_end =  dataset.as_tensor_dict()['span_end']
metadata =  dataset.as_tensor_dict()['metadata']

print ("Shape of characters ids passage: ", character_ids_passage.shape)
print ("Shape of characters ids question: ", character_ids_question.shape)
Exemplo n.º 55
0
    def _get_training_tensors(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        return dataset.as_tensor_dict()
Exemplo n.º 56
0
    dataset_reader = DatasetReader.from_params(dataset_reader_params)
    ## Vocabulary ##
    vocab = model.vocab 

    """
    ############  Propagate a text instance through the model  #############
    """
    instance = dataset_reader.text_to_instance("What kind of test succeeded on its first attempt?", 
                                               "One time I was writing a unit test, and it succeeded on the first attempt.", 
                                               char_spans=[(6, 10)])
    
    print ("Keys instance: ", instance.fields.keys())
    
    # Batch instances and convert to indices using the vocabulary.
    instances = [instance]
    dataset = Batch(instances)
    dataset.index_instances(model.vocab)
    
    # Create the index tensor from the vocabulary.
    cuda_device = model._get_prediction_device()
    model_input = dataset.as_tensor_dict(cuda_device=cuda_device)
    
    # Propagate the sample and obtain the loss (since we passed labels)
    outputs = model(**model_input)
    outputs["loss"].requires_grad