Example #1
 def test_as_tensor_handles_words(self):
     field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                       token_indexers={"words": SingleIdTokenIndexer("words")})
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     tensor_dict = field.as_tensor(padding_lengths)
     numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(),
                                             numpy.array([1, 1, 1, 2, 1]))
Example #2
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

        field1 = TextField([Token(t) for t in ["A", "sentence"]],
                           {"characters": TokenCharactersIndexer(namespace="characters")})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        field2 = TextField([Token(t) for t in ["A", "sentence"]],
                           token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                           "characters": TokenCharactersIndexer(namespace="characters")})
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
Example #3
 def test_as_tensor_handles_characters(self):
     field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                       token_indexers={"characters": TokenCharactersIndexer("characters")})
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     tensor_dict = field.as_tensor(padding_lengths)
     expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                             [1, 3, 0, 0, 0, 0, 0, 0],
                                             [1, 0, 0, 0, 0, 0, 0, 0],
                                             [3, 4, 5, 6, 4, 5, 7, 4],
                                             [1, 0, 0, 0, 0, 0, 0, 0]])
     numpy.testing.assert_array_almost_equal(tensor_dict["characters"].detach().cpu().numpy(),
                                             expected_character_array)
Example #4
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", 'words')
        self.vocab.add_token_to_namespace("s", 'characters')
        self.vocab.add_token_to_namespace("e", 'characters')
        self.vocab.add_token_to_namespace("n", 'characters')
        self.vocab.add_token_to_namespace("t", 'characters')
        self.vocab.add_token_to_namespace("c", 'characters')
        for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']:
            self.vocab.add_token_to_namespace(label, 'labels')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {"words": SingleIdTokenIndexer("words"),
                                              "characters": TokenCharactersIndexer("characters")}
        self.field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence"]],
                                self.word_indexer)
        self.field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence"]],
                                self.word_indexer)
        self.field3 = TextField([Token(t) for t in ["this", "is", "another", "sentence"]],
                                self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        super(TestListField, self).setUp()
Example #5
    def test_field_counts_vocab_items_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}
Example #6
    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField([Token(t) for t in ["a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 5
        padding_lengths["num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)

        numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(),
                                                numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["characters"].detach().cpu().numpy(),
                                                numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                                                             [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
Example #7
 def test_token_embedder_returns_dict(self):
     field = TextField([Token(t) for t in ["A", "sentence"]],
                       token_indexers={"field_with_dict": DictReturningTokenIndexer(),
                                       "words": SingleIdTokenIndexer("words"),
                                       "characters": TokenCharactersIndexer("characters")})
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     assert padding_lengths == {
             'token_ids': 5,
             'additional_key': 2,
             'words': 2,
             'characters': 2,
             'num_token_characters': 8
     }
     padding_lengths['token_ids'] = 7
     padding_lengths['additional_key'] = 3
     padding_lengths['words'] = 4
     padding_lengths['characters'] = 4
     tensors = field.as_tensor(padding_lengths)
     assert list(tensors['token_ids'].shape) == [7]
     assert list(tensors['additional_key'].shape) == [3]
     assert list(tensors['words'].shape) == [4]
     assert list(tensors['characters'].shape) == [4, 8]
Example #8
    def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding='utf-8')
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
        tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
        text_field = TextField(tokens, {"characters": token_indexer})
        dataset = Batch([Instance({"sentence": text_field})])
        vocab = Vocabulary.from_instances(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

        vocab_dir = self.TEST_DIR / 'vocab_save'
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
        assert indexed_tokens == indexed_tokens2
Example #9
    def test_get_padding_lengths_raises_if_no_indexed_tokens(self):

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths()
Example #10
    def _prepare_next_inputs(self, predictions: torch.Tensor,
                             target_attention_map: torch.Tensor,
                             target_dynamic_vocabs: List[Dict[int, str]],
                             meta_data: List[Dict], batch_size: int,
                             last_decoding_step: int,
                             source_dynamic_vocab_size: int) -> Dict:
        """
        Read out a group of hybrid predictions. Depending on how each node was predicted,
        find the corresponding token, node index and POS tag. Prepare the tensorized inputs
        for the next decoding step. Update the target attention map, target dynamic vocab, etc.
        :param predictions: [group_size,]
        :param target_attention_map: [group_size, target_length, target_dynamic_vocab_size].
        :param target_dynamic_vocabs: a group_size list of target dynamic vocabs.
        :param meta_data: meta data for each instance.
        :param batch_size: int.
        :param last_decoding_step: decoding steps are numbered from 0, so the last decoding
            step starts at -1.
        :param source_dynamic_vocab_size: int.
        """
        # By default, if a new node is created via either generation or source-side copy,
        # its node index will be last_decoding_step + 1. The one-step shift between the last
        # decoding step and the default node index is because node index 0 is reserved for
        # "no target copy". See `_prepare_inputs` for details.
        default_node_index = last_decoding_step + 1

        def batch_index(instance_i: int) -> int:
            if predictions.size(0) == batch_size * self._beam_size:
                return instance_i // self._beam_size
            else:
                return instance_i

        token_instances = []

        node_indices = torch.zeros_like(predictions)
        pos_tags = torch.zeros_like(predictions)

        for i, index in enumerate(predictions.tolist()):

            instance_meta = meta_data[batch_index(i)]
            pos_tag_lut = instance_meta["pos_tag_lut"]
            target_dynamic_vocab = target_dynamic_vocabs[i]
            # Generation.
            if index < self._vocab_size:
                token = self.vocab.get_token_from_index(
                    index, self._target_output_namespace)
                node_index = default_node_index
                pos_tag = pos_tag_lut.get(token, DEFAULT_OOV_TOKEN)
            # Source-side copy.
            elif self._vocab_size <= index < self._vocab_size + source_dynamic_vocab_size:
                index -= self._vocab_size
                source_dynamic_vocab = instance_meta["source_dynamic_vocab"]
                token = source_dynamic_vocab.get_token_from_idx(index)
                node_index = default_node_index
                pos_tag = pos_tag_lut.get(token, DEFAULT_OOV_TOKEN)
            # Target-side copy.
            else:
                index -= (self._vocab_size + source_dynamic_vocab_size)
                token = target_dynamic_vocab[index]
                node_index = index
                pos_tag = pos_tag_lut.get(token, DEFAULT_OOV_TOKEN)

            target_token = TextField([Token(token)],
                                     instance_meta["target_token_indexers"])

            token_instances.append(Instance({"target_tokens": target_token}))
            node_indices[i] = node_index
            pos_tags[i] = self.vocab.get_token_index(pos_tag,
                                                     self._pos_tag_namespace)
            if last_decoding_step != -1:  # For <BOS>, we set the last decoding step to -1.
                target_attention_map[i, last_decoding_step, node_index] = 1
                target_dynamic_vocab[node_index] = token

        # Convert tokens to tensors.
        batch = Batch(token_instances)
        batch.index_instances(self.vocab)
        padding_lengths = batch.get_padding_lengths()
        tokens = {}
        for key, tensor in batch.as_tensor_dict(
                padding_lengths)["target_tokens"].items():
            tokens[key] = tensor.type_as(predictions)

        return dict(
            tokens=tokens,
            # [group_size, 1]
            node_indices=node_indices.unsqueeze(1),
            pos_tags=pos_tags.unsqueeze(1),
        )
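The generation / source-copy / target-copy branch above hinges on how a single prediction
index is partitioned into three ranges. A minimal standalone sketch of that arithmetic (the
helper name is hypothetical; vocab_size and source_dynamic_vocab_size mirror the values
used above):

    def split_prediction_index(index: int, vocab_size: int,
                               source_dynamic_vocab_size: int):
        if index < vocab_size:
            # Generation: index points into the target output vocabulary.
            return "generate", index
        elif index < vocab_size + source_dynamic_vocab_size:
            # Source-side copy: offset into the per-instance source dynamic vocab.
            return "source_copy", index - vocab_size
        else:
            # Target-side copy: offset into the target dynamic vocab; this offset
            # also serves as the node index of the copied node.
            return "target_copy", index - (vocab_size + source_dynamic_vocab_size)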
Example #11
File: drop.py  Project: MyPaperCode/RAIN
    def make_marginal_bert_drop_instance(passage_question_tokens: List[Token],
                                    #passage_tokens: List[Token],
                                    implicit_tokens: List[Token],
                                    number_tokens: List[Token],
                                    number_indices: List[int],
                                    token_indexers: Dict[str, TokenIndexer],
                                    passage_text: str,
                                    answer_info: Dict[str, Any] = None,
                                    additional_metadata: Dict[str, Any] = None) -> Instance:
        
        additional_metadata = additional_metadata or {}
        fields: Dict[str, Field] = {}

        passage_question_field = TextField(passage_question_tokens, token_indexers)
        fields["passage_question"] = passage_question_field
        
        number_index_fields: List[Field] = [IndexField(index, passage_question_field) for index in number_indices]
        fields["number_indices"] = ListField(number_index_fields)

        numbers_in_passage_question_field = TextField(number_tokens, token_indexers)
        
        implicit_token_field = TextField(implicit_tokens, token_indexers)

        metadata = {"original_passage": passage_text,
                    "passage_question_tokens": [token.text for token in passage_question_tokens],
                    "number_tokens": [token.text for token in number_tokens],
                    "number_indices": number_indices}

        if answer_info:
            metadata["answer_texts"] = answer_info["answer_texts"]

            """
            spans
            """
            span_fields: List[Field] = \
                [SpanField(span[0], span[1], passage_question_field) for span in answer_info["answer_spans"]]
            if not span_fields:
                span_fields.append(SpanField(-1, -1, passage_question_field))
            fields["answer_as_spans"] = ListField(span_fields)


            """
            number and date  
            """
            add_sub_signs_field: List[Field] = []
            for signs_for_one_add_sub_expression in answer_info["signs_for_add_sub_expressions"]:
                add_sub_signs_field.append(SequenceLabelField(signs_for_one_add_sub_expression,
                                                              numbers_in_passage_question_field))
            if not add_sub_signs_field:
                add_sub_signs_field.append(SequenceLabelField([0] * len(number_tokens),
                                                              numbers_in_passage_question_field))
            fields["answer_as_add_sub_expressions"] = ListField(add_sub_signs_field)

            """
            count
            """
            count_fields: List[Field] = [LabelField(count_label, skip_indexing=True)
                                         for count_label in answer_info["counts"]]
            if not count_fields:
                count_fields.append(LabelField(-1, skip_indexing=True))
            fields["answer_as_counts"] = ListField(count_fields)



            answer_label = np.zeros(3)
            if answer_info["answer_spans"]:
                answer_label[0] = 1.0
            if answer_info["signs_for_add_sub_expressions"]:
                answer_label[1] = 1.0
            if answer_info["counts"]:
                answer_label[2] = 1.0
            if sum(answer_label) != 0:
                answer_label = answer_label / float(sum(answer_label))
            fields["answer_type"] = ArrayField(answer_label, -1)

        metadata.update(additional_metadata)
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields)
Example #12
class TestListField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", 'words')
        self.vocab.add_token_to_namespace("s", 'characters')
        self.vocab.add_token_to_namespace("e", 'characters')
        self.vocab.add_token_to_namespace("n", 'characters')
        self.vocab.add_token_to_namespace("t", 'characters')
        self.vocab.add_token_to_namespace("c", 'characters')
        for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']:
            self.vocab.add_token_to_namespace(label, 'labels')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {"words": SingleIdTokenIndexer("words"),
                                              "characters": TokenCharactersIndexer("characters")}
        self.field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence"]],
                                self.word_indexer)
        self.field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence"]],
                                self.word_indexer)
        self.field3 = TextField([Token(t) for t in ["this", "is", "another", "sentence"]],
                                self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        super(TestListField, self).setUp()

    def test_get_padding_lengths(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        lengths = list_field.get_padding_lengths()
        assert lengths == {"num_fields": 3, "list_words_length": 5, "list_num_tokens": 5}

    def test_list_field_can_handle_empty_text_fields(self):
        list_field = ListField([self.field1, self.field2, self.empty_text_field])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor_dict["words"].detach().cpu().numpy(),
                                         numpy.array([[2, 3, 4, 5, 0],
                                                      [2, 3, 4, 1, 5],
                                                      [0, 0, 0, 0, 0]]))

    def test_list_field_can_handle_empty_index_fields(self):
        list_field = ListField([self.index_field, self.index_field, self.empty_index_field])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(), numpy.array([[1], [1], [-1]]))

    def test_list_field_can_handle_empty_sequence_label_fields(self):
        list_field = ListField([self.sequence_label_field,
                                self.sequence_label_field,
                                self.empty_sequence_label_field])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(),
                                         numpy.array([[1, 1, 0, 1],
                                                      [1, 1, 0, 1],
                                                      [0, 0, 0, 0]]))

    def test_all_fields_padded_to_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][0].detach().cpu().numpy(),
                                                numpy.array([2, 3, 4, 5, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][1].detach().cpu().numpy(),
                                                numpy.array([2, 3, 4, 1, 5]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][2].detach().cpu().numpy(),
                                                numpy.array([2, 3, 1, 5, 0]))

    def test_nested_list_fields_are_padded_correctly(self):
        nested_field1 = ListField([LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']])
        nested_field2 = ListField([LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']])
        list_field = ListField([nested_field1.empty_field(), nested_field1, nested_field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6}
        tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy()
        numpy.testing.assert_almost_equal(tensor, [[-1, -1, -1, -1, -1, -1],
                                                   [0, 1, 2, 3, 4, -1],
                                                   [5, 6, 7, 8, 9, 10]])

    def test_fields_can_pad_to_greater_than_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        padding_lengths["list_words_length"] = 7
        padding_lengths["num_fields"] = 5
        tensor_dict = list_field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][0].detach().cpu().numpy(),
                                                numpy.array([2, 3, 4, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][1].detach().cpu().numpy(),
                                                numpy.array([2, 3, 4, 1, 5, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][2].detach().cpu().numpy(),
                                                numpy.array([2, 3, 1, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][3].detach().cpu().numpy(),
                                                numpy.array([0, 0, 0, 0, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][4].detach().cpu().numpy(),
                                                numpy.array([0, 0, 0, 0, 0, 0, 0]))

    def test_as_tensor_can_handle_multiple_token_indexers(self):
        # pylint: disable=protected-access
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"].detach().cpu().numpy()
        characters = tensor_dict["characters"].detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(words, numpy.array([[2, 3, 4, 5, 0],
                                                                    [2, 3, 4, 1, 5],
                                                                    [2, 3, 1, 5, 0]]))

        numpy.testing.assert_array_almost_equal(characters[0], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0],
                                                                            [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

        numpy.testing.assert_array_almost_equal(characters[1], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 1, 1, 1, 3, 1, 3, 4, 5],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0]]))

        numpy.testing.assert_array_almost_equal(characters[2], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 4, 1, 5, 1, 3, 1, 0, 0],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0],
                                                                            [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields(self):
        # pylint: disable=protected-access
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField([self.field1.empty_field(), self.field1, self.field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"].detach().cpu().numpy()
        characters = tensor_dict["characters"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(words, numpy.array([[0, 0, 0, 0, 0],
                                                                    [2, 3, 4, 5, 0],
                                                                    [2, 3, 4, 1, 5]]))

        numpy.testing.assert_array_almost_equal(characters[0], numpy.zeros([5, 9]))

        numpy.testing.assert_array_almost_equal(characters[1], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0],
                                                                            [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

        numpy.testing.assert_array_almost_equal(characters[2], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 1, 1, 1, 3, 1, 3, 4, 5],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0]]))

    def test_printing_doesnt_crash(self):
        list_field = ListField([self.field1, self.field2])
        print(list_field)

    def test_sequence_methods(self):
        list_field = ListField([self.field1, self.field2, self.field3])

        assert len(list_field) == 3
        assert list_field[1] == self.field2
        assert [f for f in list_field] == [self.field1, self.field2, self.field3]
Example #13
    def test_valid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / "vocab_save"
        # Test: padded/non-padded common namespaces are extended appropriately
        non_padded_namespaces_list = [[], ["tokens"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(
                non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_tokens_to_namespace(["d", "a", "b"],
                                                   namespace="tokens")
            text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                                   {"tokens": SingleIdTokenIndexer("tokens")})
            vocab_dir = self.TEST_DIR / "vocab_save"
            shutil.rmtree(vocab_dir, ignore_errors=True)
            original_vocab.save_to_files(vocab_dir)
            instances = Batch([Instance({"text": text_field})])
            params = Params({
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": non_padded_namespaces,
            })
            extended_vocab = Vocabulary.from_params(params,
                                                    instances=instances)

            extra_count = 2 if extended_vocab.is_padded("tokens") else 0
            assert extended_vocab.get_token_index("d",
                                                  "tokens") == 0 + extra_count
            assert extended_vocab.get_token_index("a",
                                                  "tokens") == 1 + extra_count
            assert extended_vocab.get_token_index("b",
                                                  "tokens") == 2 + extra_count

            assert extended_vocab.get_token_index(
                "c", "tokens")  # should be present
            assert extended_vocab.get_token_index(
                "e", "tokens")  # should be present

            assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

        # Test: padded/non-padded non-common namespaces are extended appropriately
        non_padded_namespaces_list = [[], ["tokens1"], ["tokens1", "tokens2"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(
                non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_token_to_namespace(
                "a", namespace="tokens1")  # index2
            text_field = TextField(
                [Token(t) for t in ["b"]],
                {"tokens2": SingleIdTokenIndexer("tokens2")})
            instances = Batch([Instance({"text": text_field})])
            vocab_dir = self.TEST_DIR / "vocab_save"
            shutil.rmtree(vocab_dir, ignore_errors=True)
            original_vocab.save_to_files(vocab_dir)

            params = Params({
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": non_padded_namespaces,
            })
            extended_vocab = Vocabulary.from_params(params,
                                                    instances=instances)

            # Should have two namespaces
            assert len(extended_vocab._token_to_index) == 2

            extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
            assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count

            extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
            assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count
Example #14
    def text_to_instance(
            self,  # type: ignore
            sentences: List[List[str]],
            document_id: str,
            sentence_id: int,
            gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
            user_threshold: Optional[float] = 0.0) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        sentences : ``List[List[str]]``, required.
            A list of lists representing the tokenised words and sentences in the document.
        document_id : ``str``, required.
            A string representing the document ID.
        sentence_id : ``int``, required.
            An int representing the sentence ID.
        gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
            A list of all clusters in the document, represented as word spans. Each cluster
            contains some number of spans, which can be nested and overlap, but will never
            exactly match between clusters.
        user_threshold : ``Optional[float]``, optional (default = 0.0)
            Approximate fraction of gold labels to hold out as simulated user input,
            e.g. 0.5, 0.33, 0.25, 0.125.

        Returns
        -------
        An ``Instance`` containing the following ``Fields``:
            text : ``TextField``
                The text of the full document.
            spans : ``ListField[SpanField]``
                A ListField containing the spans represented as ``SpanFields``
                with respect to the document text.
            span_labels : ``SequenceLabelField``, optional
                The id of the cluster which each possible span belongs to, or -1 if it does
                not belong to a cluster. As these labels have variable length (it depends on
                how many spans we are considering), we represent this as a ``SequenceLabelField``
                with respect to the ``spans`` ``ListField``.
        """
        flattened_sentences = [
            self._normalize_word(word) for sentence in sentences
            for word in sentence
        ]

        metadata: Dict[str, Any] = {
            "original_text": flattened_sentences,
            "ID": document_id + ";" + str(sentence_id)
        }
        if gold_clusters is not None:
            metadata["clusters"] = gold_clusters
            metadata["num_gold_clusters"] = len(gold_clusters)

        text_field = TextField([Token(word) for word in flattened_sentences],
                               self._token_indexers)

        user_threshold_mod = int(
            1 / user_threshold
        ) if self._simulate_user_inputs and user_threshold > 0 else 0
        cluster_dict = {}
        simulated_user_cluster_dict = {}

        if gold_clusters is not None:
            for cluster_id, cluster in enumerate(gold_clusters):
                for i in range(len(cluster)):
                    # Use modulo so that simulated user labels are distributed relatively
                    # evenly across the document (clusters are sorted).
                    if user_threshold_mod == 0 or i % user_threshold_mod != user_threshold_mod - 1:
                        cluster_dict[tuple(cluster[i])] = cluster_id
                    simulated_user_cluster_dict[tuple(cluster[i])] = cluster_id

        # Note simulated_user_cluster_dict encompasses ALL gold labels, including those in cluster_dict
        # Consequently user_labels encompasses all gold labels
        spans: List[Field] = []
        if gold_clusters is not None:
            span_labels: Optional[List[int]] = []
            user_labels: Optional[List[
                int]] = [] if self._simulate_user_inputs and user_threshold > 0 else None
        else:
            span_labels = user_labels = None

        # our must-link and cannot-link constraints, derived from user labels
        # using gold_clusters being None as an indicator of whether we're running training or not
        must_link: Optional[
            List[int]] = [] if gold_clusters is not None else None
        cannot_link: Optional[
            List[int]] = [] if gold_clusters is not None else None

        sentence_offset = 0
        doc_info = None
        if self._saved_labels is not None and metadata[
                'ID'] in self._saved_labels:
            doc_info = self._saved_labels[metadata['ID']]
            span_labels = doc_info['span_labels'].tolist()
            if 'must_link' in doc_info:
                must_link = doc_info['must_link'].squeeze(-1).tolist()
                cannot_link = doc_info['cannot_link'].squeeze(-1).tolist()
        for sentence in sentences:
            for start, end in enumerate_spans(
                    sentence,
                    offset=sentence_offset,
                    max_span_width=self._max_span_width):
                if span_labels is not None:
                    if doc_info is None:
                        # only do if we haven't already loaded span labels
                        if (start, end) in cluster_dict:
                            span_labels.append(cluster_dict[(start, end)])
                        else:
                            span_labels.append(-1)
                    if self._simulate_user_inputs and user_threshold > 0:
                        if (start, end) in simulated_user_cluster_dict:
                            user_labels.append(
                                simulated_user_cluster_dict[(start, end)])
                        else:
                            user_labels.append(-1)

                spans.append(SpanField(start, end, text_field))
            sentence_offset += len(sentence)

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {
            "text": text_field,
            "spans": span_field,
            "metadata": metadata_field
        }

        if must_link is not None and len(must_link) > 0:
            must_link_field = []
            cannot_link_field = []
            for link in must_link:
                must_link_field.append(
                    PairField(
                        IndexField(link[0], span_field),
                        IndexField(link[1], span_field),
                    ))
            for link in cannot_link:
                cannot_link_field.append(
                    PairField(
                        IndexField(link[0], span_field),
                        IndexField(link[1], span_field),
                    ))
            must_link_field = ListField(must_link_field)
            cannot_link_field = ListField(cannot_link_field)
            fields["must_link"] = must_link_field
            fields["cannot_link"] = cannot_link_field

        if span_labels is not None:
            fields["span_labels"] = SequenceLabelField(span_labels, span_field)
            if user_labels is not None:
                fields["user_labels"] = SequenceLabelField(
                    user_labels, span_field)

        # sanity checks
        if doc_info is not None:
            assert (fields["span_labels"].as_tensor(
                fields["span_labels"].get_padding_lengths()) !=
                    doc_info['span_labels']).nonzero().size(0) == 0
            if 'must_link' in doc_info:
                assert 'must_link' in fields
                assert (fields["must_link"].as_tensor(
                    fields["must_link"].get_padding_lengths()) !=
                        doc_info['must_link']).nonzero().size(0) == 0
                assert (fields["cannot_link"].as_tensor(
                    fields["cannot_link"].get_padding_lengths()) !=
                        doc_info['cannot_link']).nonzero().size(0) == 0

        return Instance(fields)
Example #15
 def text_to_instance(self, line: str) -> Instance:  # type: ignore
     tokens = self._tokenizer.tokenize(line)
     return Instance({"line": TextField(tokens)})
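The examples on this page all follow the same TextField life cycle: build the field with
token indexers, index it against a Vocabulary, ask for padding lengths, and convert to
tensors. A minimal end-to-end sketch, assuming the pre-1.0 AllenNLP import paths these
tests are written against (exact padding-length keys vary by version and indexer):

    from allennlp.data.fields import TextField
    from allennlp.data.token_indexers import SingleIdTokenIndexer
    from allennlp.data.tokenizers import Token
    from allennlp.data.vocabulary import Vocabulary

    vocab = Vocabulary()
    vocab.add_token_to_namespace("sentence", namespace="words")

    field = TextField([Token(t) for t in ["A", "sentence"]],
                      token_indexers={"words": SingleIdTokenIndexer("words")})
    field.index(vocab)                             # tokens -> vocabulary ids
    padding_lengths = field.get_padding_lengths()  # number of tokens (keys vary by version)
    tensors = field.as_tensor(padding_lengths)     # e.g. {"words": LongTensor([oov_id, sentence_id])}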
Example #16
 def create_instance(self, str_tokens: List[str]):
     tokens = [Token(t) for t in str_tokens]
     instance = Instance({'text': TextField(tokens, self.token_indexers)})
     return instance
Example #17
    def text_to_instance(
            self,  # type: ignore
            premise: List[Tuple[str, float]],  # Important type information
            hypothesis: str,
            pid: str = None,
            label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        premise_tokens_list = []
        premise_prob_values = []

        premise_span_list: List[Tuple[int, int]] = []
        premise_span_prob: List[float] = []

        # sentence_count = len(premise)

        if self.shuffle_sentences:
            # Potential improvement: shuffle the input sentences. Maybe disable this for the last several epochs.
            random.shuffle(premise)

        span_start = 0
        for premise_sent, prob in premise:
            cur_premise_tokens = [Token(t) for t in premise_sent.split(' ')]  # Removing code for parentheses in NLI
            span_end = span_start + len(cur_premise_tokens)
            premise_span_list.append((span_start, span_end))  # Calculate the span.
            span_start = span_end
            premise_span_prob.append(prob)
            prob_value = np.ones(
                (len(cur_premise_tokens), 1), dtype=np.float32) * prob
            premise_tokens_list.extend(cur_premise_tokens)
            premise_prob_values.append(prob_value)

        premise_prob = np.concatenate(premise_prob_values, axis=0)

        hypothesis_tokens = [Token(t) for t in hypothesis.split(' ')]
        hypothesis_prob = np.ones((len(hypothesis_tokens), 1),
                                  dtype=np.float32)

        if self.max_l is not None:
            premise_tokens_list = premise_tokens_list[:self.max_l]
            hypothesis_tokens = hypothesis_tokens[:self.max_l]
            premise_prob = premise_prob[:self.max_l, :]
            hypothesis_prob = hypothesis_prob[:self.max_l, :]
            # for span, prob in zip(premise_span_list, premise_span_prob):

        fields['premise_spans'] = MetadataField(
            ParagraphSpan(premise_span_list))
        fields['premise_probs'] = MetadataField(premise_span_prob)
        fields['premise'] = TextField(premise_tokens_list,
                                      self._token_indexers)  # (t_len, 1)
        fields['hypothesis'] = TextField(hypothesis_tokens,
                                         self._token_indexers)

        # WN feature dict:
        premise_s = ' '.join([t for t, p in premise]).split(' ')
        hypothesis_s = hypothesis.split(' ')

        if self.max_l is not None:
            premise_s = premise_s[:self.max_l]
            hypothesis_s = hypothesis_s[:self.max_l]

        # if self.ablation is not None and self.ablation['rm_wn'] and self.ablation['rm_simi']:
        #     p_feature_array = np.concatenate([premise_prob], axis=1)
        #     h_feature_array = np.concatenate([hypothesis_prob], axis=1)
        if self.wn_p_dict is None:
            p_feature_array = np.zeros(1)
            h_feature_array = np.zeros(1)

        elif self.ablation is not None and self.wn_p_dict is not None and self.ablation[
                'rm_wn']:
            p_feature_array = np.concatenate([premise_prob], axis=1)
            h_feature_array = np.concatenate([hypothesis_prob], axis=1)

        elif self.ablation is not None and self.wn_p_dict is not None and self.ablation[
                'rm_simi']:
            example_feature = wn_persistent_api.compute_wn_features_p_accerate(
                premise_s, hypothesis_s, self.wn_p_dict)

            p_wn_nparray, h_wn_nparray = wn_persistent_api.wn_raw_feature_to_nparray(
                example_feature, self.wn_feature_list)
            # Appending more features
            p_num_feature = encode_num_in_ltokens(premise_s)  # (t_len, 5)
            h_num_feature = encode_num_in_ltokens(hypothesis_s)  # (t_len, 5)
            p_feature_array = np.concatenate([p_wn_nparray, p_num_feature],
                                             axis=1)
            h_feature_array = np.concatenate([h_wn_nparray, h_num_feature],
                                             axis=1)

        elif self.wn_p_dict is not None:
            # Whole Model no ablation.
            example_feature = wn_persistent_api.compute_wn_features_p_accerate(
                premise_s, hypothesis_s, self.wn_p_dict)

            p_wn_nparray, h_wn_nparray = wn_persistent_api.wn_raw_feature_to_nparray(
                example_feature, self.wn_feature_list)

            # Appending more features
            p_num_feature = encode_num_in_ltokens(premise_s)  # (t_len, 5)
            h_num_feature = encode_num_in_ltokens(hypothesis_s)  # (t_len, 5)
            p_feature_array = np.concatenate(
                [p_wn_nparray, p_num_feature, premise_prob], axis=1)
            h_feature_array = np.concatenate(
                [h_wn_nparray, h_num_feature, hypothesis_prob], axis=1)

            assert len(premise_tokens_list) == p_feature_array.shape[0]
            assert len(hypothesis_tokens) == h_feature_array.shape[0]

        fields['p_wn_feature'] = ArrayField(p_feature_array)
        fields['h_wn_feature'] = ArrayField(h_feature_array)

        if label:
            fields['label'] = LabelField(label, label_namespace='labels')

        if pid:
            fields['pid'] = IdField(pid)

        return Instance(fields)
Example #18
    def text_to_instance(
            self,
            para_id: str,
            sentence_texts: List[str],
            participants: List[str],
            states: List[
                List[str]] = None,  # states[i][j] is ith participant at time j
            filename: str = '',
            score: float = None) -> Instance:

        tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(
            pos_tags=True))

        paragraph = " ".join(sentence_texts)

        # Tokenize the sentences
        sentences = [
            tokenizer.tokenize(sentence_text)
            for sentence_text in sentence_texts
        ]

        # Find the verbs
        verb_indexes = [[
            1 if token.pos_ == "VERB" else 0 for token in sentence
        ] for sentence in sentences]

        if states is not None:
            # Actions is (num_participants, num_events)
            actions = [_infer_actions(states_i) for states_i in states]

            tokenized_states = [[
                tokenizer.tokenize(state_ij) for state_ij in states_i
            ] for states_i in states]

            location_spans = [
                _compute_location_spans(states_i, sentences)
                for states_i in tokenized_states
            ]

        # Create indicators for the participants.
        participant_tokens = [
            tokenizer.tokenize(participant) for participant in participants
        ]
        participant_indicators: List[List[List[int]]] = []

        for participant_i_tokens in participant_tokens:
            targets = [
                list(token_group)
                for is_semicolon, token_group in itertools.groupby(
                    participant_i_tokens, lambda t: t.text == ";")
                if not is_semicolon
            ]

            participant_i_indicators: List[List[int]] = []

            for sentence in sentences:
                sentence_indicator = [0 for _ in sentence]

                for target in targets:
                    start = 0
                    while True:
                        span_start, span_end = _find_span(target,
                                                          sentence,
                                                          start,
                                                          target_is_noun=True)
                        if span_start >= 0:
                            for j in range(span_start, span_end + 1):
                                sentence_indicator[j] = 1
                            start = span_start + 1
                        else:
                            break

                participant_i_indicators.append(sentence_indicator)

            participant_indicators.append(participant_i_indicators)

        fields: Dict[str, Field] = {}
        fields["paragraph"] = TextField(tokenizer.tokenize(paragraph),
                                        self._token_indexers)
        fields["participants"] = ListField([
            TextField(tokenizer.tokenize(participant), self._token_indexers)
            for participant in participants
        ])

        # One per sentence
        fields["sentences"] = ListField([
            TextField(sentence, self._token_indexers) for sentence in sentences
        ])

        # One per sentence
        fields["verbs"] = ListField([
            SequenceLabelField(verb_indexes[i],
                               fields["sentences"].field_list[i])
            for i in range(len(sentences))
        ])
        # And also at the paragraph level
        fields["paragraph_verbs"] = SequenceLabelField([
            verb_indicator for verb_indexes_i in verb_indexes
            for verb_indicator in verb_indexes_i
        ], fields["paragraph"])

        if states is not None:
            # Outer ListField is one per participant
            fields["actions"] = ListField([
                # Inner ListField is one per sentence
                ListField([
                    # action is an Enum, so call .value to get an int
                    LabelField(action.value, skip_indexing=True)
                    for action in participant_actions
                ]) for participant_actions in actions
            ])

            # Outer ListField is one per participant
            fields["before_locations"] = ListField([
                # Inner ListField is one per sentence
                ListField([
                    SpanField(start, end, fields["sentences"].field_list[i])
                    for i, ((start, end),
                            _) in enumerate(participant_location_spans)
                ]) for participant_location_spans in location_spans
            ])
            # Outer ListField is one per participant
            fields["after_locations"] = ListField([
                # Inner ListField is one per sentence
                ListField([
                    SpanField(start, end, fields["sentences"].field_list[i])
                    for i, (_, (start,
                                end)) in enumerate(participant_location_spans)
                ]) for participant_location_spans in location_spans
            ])

        # one per participant
        fields["participant_indicators"] = ListField([
            # one per sentence
            ListField([
                SequenceLabelField(sentence_indicator,
                                   fields["sentences"].field_list[i]) for i,
                sentence_indicator in enumerate(participant_i_indicators)
            ]) for participant_i_indicators in participant_indicators
        ])

        # and also at the paragraph level
        # one per participant
        fields["paragraph_participant_indicators"] = ListField([
            SequenceLabelField([
                indicator for sentence_indicator in participant_i_indicators
                for indicator in sentence_indicator
            ], fields["paragraph"])
            for participant_i_indicators in participant_indicators
        ])

        # Finally, we want to indicate before / inside / after for each sentence.
        paragraph_sentence_indicators: List[SequenceLabelField] = []
        for i in range(len(sentences)):
            before_length = sum(len(sentence) for sentence in sentences[:i])
            sentence_length = len(sentences[i])
            after_length = sum(
                len(sentence) for sentence in sentences[(i + 1):])
            paragraph_sentence_indicators.append(
                SequenceLabelField([0] * before_length +
                                   [1] * sentence_length + [2] * after_length,
                                   fields["paragraph"]))

        fields["paragraph_sentence_indicators"] = ListField(
            paragraph_sentence_indicators)

        # These fields are passed on to the decoder trainer that internally uses it
        # to compute commonsense scores for predicted actions
        fields["para_id"] = MetadataField(para_id)
        fields["participant_strings"] = MetadataField(participants)

        fields["filename"] = MetadataField(filename)

        if score is not None:
            fields["score"] = MetadataField(score)

        return Instance(fields)
Example #19
    def text_to_instance(  # type: ignore
        self,
        tokens: List[Token],
        ners: List[str] = None,
        tag_ids: List[int] = None,
        root_ids: List[int] = None,
        rels: List[str] = None,
        isdef: str = None,
    ) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """

        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {"tokens": sequence}
        instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})

        # Add "feature labels" to instance
        if "ner" in self.feature_labels:
            if ners is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use definition terms as "
                    "features. Pass them to text_to_instance."
                )
            instance_fields["ner"] = SequenceLabelField(ners, sequence, "ner")
        if "tag_id" in self.feature_labels:
            if tag_ids is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use tag ids as "
                    "features. Pass them to text_to_instance."
                )
            instance_fields["tag_id"] = SequenceLabelField(tag_ids, sequence, "tag_id")
        if "root_id" in self.feature_labels:
            if root_ids is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use roots as "
                    " features. Pass them to text_to_instance."
                )
            instance_fields["root_id"] = SequenceLabelField(root_ids, sequence, "root_id")
        if "rel" in self.feature_labels:
            if rels is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use relations as "
                    " features. Pass them to text_to_instance."
                )
            instance_fields["rel"] = SequenceLabelField(rels, sequence, "rel")
        if "isdef" in self.feature_labels:
            if isdef is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use the is-definition flag "
                    "as a feature. Pass it to text_to_instance."
                )
            instance_fields["isdef"] = LabelField(isdef, "isdef")

        # Add "tag label" to instance
        if self.tag_label == "ner" and ners is not None:
            instance_fields["tags"] = SequenceLabelField(ners, sequence, self.label_namespace)
        elif self.tag_label == "tag_id" and tag_ids is not None:
            instance_fields["tags"] = SequenceLabelField(tag_ids, sequence, self.label_namespace)
        elif self.tag_label == "root_id" and root_ids is not None:
            instance_fields["tags"] = SequenceLabelField(root_ids, sequence, self.label_namespace)
        elif self.tag_label == "rel" and rels is not None:
            instance_fields["tags"] = SequenceLabelField(rels, sequence, self.label_namespace)
        elif self.tag_label == "isdef" and isdef is not None:
            instance_fields["labels"] = LabelField(isdef, self.label_namespace)        
        return Instance(instance_fields)
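Every SequenceLabelField built above must be exactly as long as the TextField it labels, or indexing will fail. A small self-contained sketch of that contract (the tokens and tags are invented, not taken from this reader):

from allennlp.data import Instance
from allennlp.data.fields import SequenceLabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

tokens = [Token(t) for t in ["A", "group", "is", "a", "set", "with", "an", "operation"]]
sequence = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
ner_tags = ["O", "B-TERM", "O", "O", "B-TERM", "O", "O", "B-TERM"]  # one label per token
instance = Instance({"tokens": sequence,
                     "ner": SequenceLabelField(ner_tags, sequence, "ner")})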
Example #20
    def test_field_counts_vocab_items_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters",
                                                                               min_padding_length=1)})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters",
                                                                               min_padding_length=1)})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}
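The manual counting in this test mirrors what Vocabulary.from_instances does before assigning ids; a short sketch connecting the two (the expected size in the comment assumes the default padding and OOV tokens):

from collections import defaultdict

from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                  {"words": SingleIdTokenIndexer("words")})
counts = defaultdict(lambda: defaultdict(int))
field.count_vocab_items(counts)                       # the step the test exercises
vocab = Vocabulary.from_instances([Instance({"tokens": field})])
print(vocab.get_vocab_size("words"))                  # 7: padding + OOV + 5 tokens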
Example #21
    def test_sequence_methods(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]], {})

        assert len(field) == 5
        assert field[1].text == "is"
        assert [token.text for token in field] == ["This", "is", "a", "sentence", "."]
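Because TextField supports these sequence operations, other fields can point into it by position; a brief sketch with illustrative indices:

from allennlp.data.fields import IndexField, SpanField, TextField
from allennlp.data.tokenizers import Token

field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]], {})
index = IndexField(1, field)    # points at "is"
span = SpanField(2, 3, field)   # covers "a sentence" (inclusive end)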
Example #22
    def text_to_instance(
        self,
        text: str,
        sentiment: str,
        selected_text: Optional[str] = None,
    ) -> Instance:
        fields = {}
        text_tokens = self._tokenizer.tokenize(text)
        sentiment_tokens = self._tokenizer.tokenize(sentiment)
        # add special tokens

        text_with_sentiment_tokens = self._tokenizer.add_special_tokens(
            text_tokens, sentiment_tokens)
        tokens_field = TextField(text_with_sentiment_tokens,
                                 {"tokens": self._tokenindexer})
        fields["tokens"] = tokens_field

        additional_metadata = {}
        if selected_text is not None:
            context = text
            answer = selected_text
            additional_metadata["selected_text"] = selected_text
            first_answer_offset = context.find(answer)

            def tokenize_slice(start: int, end: int) -> Iterable[Token]:
                text_to_tokenize = context[start:end]
                if start - 1 >= 0 and context[start - 1].isspace():
                    prefix = (
                        "a "
                    )  # must end in a space, and be short so we can be sure it becomes only one token
                    wordpieces = self._tokenizer.tokenize(prefix +
                                                          text_to_tokenize)
                    for wordpiece in wordpieces:
                        if wordpiece.idx is not None:
                            wordpiece.idx -= len(prefix)
                    return wordpieces[1:]
                else:
                    return self._tokenizer.tokenize(text_to_tokenize)

            tokenized_context = []
            token_start = 0
            for i, c in enumerate(context):
                if c.isspace():
                    for wordpiece in tokenize_slice(token_start, i):
                        if wordpiece.idx is not None:
                            wordpiece.idx += token_start
                        tokenized_context.append(wordpiece)
                    token_start = i + 1
            for wordpiece in tokenize_slice(token_start, len(context)):
                if wordpiece.idx is not None:
                    wordpiece.idx += token_start
                tokenized_context.append(wordpiece)

            if first_answer_offset == -1:
                (token_answer_span_start, token_answer_span_end) = (-1, -1)
            else:
                (
                    token_answer_span_start,
                    token_answer_span_end,
                ), _ = char_span_to_token_span(
                    [(t.idx, t.idx + len(sanitize_wordpiece(t.text)))
                     if t.idx is not None else None
                     for t in tokenized_context],
                    (first_answer_offset, first_answer_offset + len(answer)),
                )
            tags = ["O"] * len(tokens_field)
            for i in range(token_answer_span_start, token_answer_span_end + 1):
                tags[i] = "I"
            fields["tags"] = SequenceLabelField(tags, tokens_field)

        # make the metadata
        metadata = {
            "text": text,
            "sentiment": sentiment,
            "words": text,
            "text_with_sentiment_tokens": text_with_sentiment_tokens,
        }
        if additional_metadata:
            metadata.update(additional_metadata)
        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)
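The span arithmetic above relies on char_span_to_token_span, which maps a character span onto token indices given per-token character offsets. A hedged sketch of that call in isolation; the import path shown is the one used by allennlp-models and may differ between versions, and the offsets are made up:

from allennlp_models.rc.dataset_readers.utils import char_span_to_token_span

# Character (start, end) offsets for the tokens "This is a sentence ."
token_offsets = [(0, 4), (5, 7), (8, 9), (10, 18), (19, 20)]
char_span = (8, 18)  # the characters of "a sentence"
(token_start, token_end), error = char_span_to_token_span(token_offsets, char_span)
print(token_start, token_end, error)  # expected: 2 3 False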
Example #23
 def text_to_instance(self, data: dict) -> Instance:  # type: ignore
     field_of_tokens = TextField(data['sdp'], self._token_indexers)
     return Instance({
         'sen_dep': field_of_tokens,
         'label': LabelField(data['relation'])
     })
Example #24
 def test_printing_doesnt_crash(self):
     field = TextField([Token(t) for t in ["A", "sentence"]],
                       {"words": SingleIdTokenIndexer(namespace="words")})
     print(field)
Example #25
 def tokenize(self, d):
     tokenized_tokens = self._tokenizer.tokenize(' '.join(d['token']))
     return TextField(tokenized_tokens, self._token_indexers)
Example #26
File: nlvr.py  Project: Jaynil1611/syn-qg
    def text_to_instance(
            self,  # type: ignore
            sentence: str,
            structured_representations: List[List[List[JsonDict]]],
            labels: List[str] = None,
            target_sequences: List[List[str]] = None,
            identifier: str = None) -> Instance:
        """
        Parameters
        ----------
        sentence : ``str``
            The query sentence.
        structured_representations : ``List[List[List[JsonDict]]]``
            A list of Json representations of all the worlds. See expected format in this class' docstring.
        labels : ``List[str]`` (optional)
            List of string representations of the labels (true or false) corresponding to the
            ``structured_representations``. Not required while testing.
        target_sequences : ``List[List[str]]`` (optional)
            List of target action sequences for each element which lead to the correct denotation in
            worlds corresponding to the structured representations.
        identifier : ``str`` (optional)
            The identifier from the dataset if available.
        """
        # pylint: disable=arguments-differ
        worlds = []
        for structured_representation in structured_representations:
            boxes = set([
                Box(object_list, box_id)
                for box_id, object_list in enumerate(structured_representation)
            ])
            worlds.append(NlvrLanguage(boxes))
        tokenized_sentence = self._tokenizer.tokenize(sentence)
        sentence_field = TextField(tokenized_sentence,
                                   self._sentence_token_indexers)
        production_rule_fields: List[Field] = []
        instance_action_ids: Dict[str, int] = {}
        # TODO(pradeep): Assuming that possible actions are the same in all worlds. This may change
        # later.
        for production_rule in worlds[0].all_possible_productions():
            instance_action_ids[production_rule] = len(instance_action_ids)
            field = ProductionRuleField(production_rule, is_global_rule=True)
            production_rule_fields.append(field)
        action_field = ListField(production_rule_fields)
        worlds_field = ListField([MetadataField(world) for world in worlds])
        metadata: Dict[str, Any] = {
            "sentence_tokens": [x.text for x in tokenized_sentence]
        }
        fields: Dict[str, Field] = {
            "sentence": sentence_field,
            "worlds": worlds_field,
            "actions": action_field,
            "metadata": MetadataField(metadata)
        }
        if identifier is not None:
            fields["identifier"] = MetadataField(identifier)
        # Depending on the type of supervision used for training the parser, we may want either
        # target action sequences or an agenda in our instance. We check if target sequences are
        # provided, and include them if they are. If not, we'll get an agenda for the sentence, and
        # include that in the instance.
        if target_sequences:
            action_sequence_fields: List[Field] = []
            for target_sequence in target_sequences:
                index_fields = ListField([
                    IndexField(instance_action_ids[action], action_field)
                    for action in target_sequence
                ])
                action_sequence_fields.append(index_fields)
                # TODO(pradeep): Define a max length for this field.
            fields["target_action_sequences"] = ListField(
                action_sequence_fields)
        elif self._output_agendas:
            # TODO(pradeep): Assuming every world gives the same agenda for a sentence. This is true
            # now, but may change later too.
            agenda = worlds[0].get_agenda_for_sentence(sentence)
            assert agenda, "No agenda found for sentence: %s" % sentence
            # agenda_field contains indices into actions.
            agenda_field = ListField([
                IndexField(instance_action_ids[action], action_field)
                for action in agenda
            ])
            fields["agenda"] = agenda_field
        if labels:
            labels_field = ListField([
                LabelField(label, label_namespace='denotations')
                for label in labels
            ])
            fields["labels"] = labels_field

        return Instance(fields)
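Both the target action sequences and the agenda above are IndexFields that point into the ListField of actions. A toy version of that pattern, using plain LabelFields and made-up rules in place of ProductionRuleFields:

from allennlp.data.fields import IndexField, LabelField, ListField

actions = ListField([LabelField(rule, label_namespace="rules")
                     for rule in ["S -> NP VP", "NP -> DT NN", "VP -> VBZ"]])
agenda = ListField([IndexField(0, actions), IndexField(2, actions)])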
Example #27
# Assumed preamble (the original example starts mid-dict): the tokenizer and the
# opening of the token_indexers dict below are reconstructed from how they are
# used further down.
tokenizer = SpacyTokenizer(pos_tags=True)
token_indexers = {
    'tokens': SingleIdTokenIndexer(namespace='token_vocab'),
    'token_characters': TokenCharactersIndexer(namespace='character_vocab'),
    'pos_tags': PosTagIndexer(namespace='pos_tag_vocab')
}

vocab = Vocabulary()
vocab.add_tokens_to_namespace(['This', 'is', 'some', 'text', '.'],
                              namespace='token_vocab')
vocab.add_tokens_to_namespace(
    ['T', 'h', 'i', 's', ' ', 'o', 'm', 'e', 't', 'x', '.'],
    namespace='character_vocab')
vocab.add_tokens_to_namespace(['DT', 'VBZ', 'NN', '.'],
                              namespace='pos_tag_vocab')

text = "This is some text."
tokens = tokenizer.tokenize(text)
print(tokens)
print([token.tag_ for token in tokens])

text_field = TextField(tokens, token_indexers)

# In order to convert the token strings into integer ids, we need to tell the
# TextField what Vocabulary to use.
text_field.index(vocab)

# We typically batch things together when making tensors, which requires some
# padding computation.  Don't worry too much about the padding for now.
padding_lengths = text_field.get_padding_lengths()

tensor_dict = text_field.as_tensor(padding_lengths)
print(tensor_dict)
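A possible follow-up to the script above, reusing its text_field and vocab: in practice several instances are grouped into a Batch, which combines padding lengths across instances before producing tensors.

from allennlp.data import Batch, Instance

instances = [Instance({"text": text_field})]
batch = Batch(instances)
batch.index_instances(vocab)
print(batch.as_tensor_dict(batch.get_padding_lengths()))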
Example #28
    def test_from_params_valid_vocab_extension_thoroughly(self):
        """
        Tests for Valid Vocab Extension thoroughly: Vocab extension is valid
        when overlapping namespaces have same padding behaviour (padded/non-padded)
        Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0     padded
            tokens1     non-padded
            tokens2     padded
            tokens3     non-padded
        instances namespaces
            tokens0     padded
            tokens1     non-padded
            tokens4     padded
            tokens5     non-padded
        Typical extension example (for the tokens1 namespace):
        -> original_vocab index2token
           apple          #0->apple
           bat            #1->bat
           cat            #2->cat
        -> Tokens to extend with: cat, an, apple, banana, atom, bat
        -> extended_vocab: index2token
           apple           #0->apple
           bat             #1->bat
           cat             #2->cat
           an              #3->an
           atom            #4->atom
           banana          #5->banana
        """

        vocab_dir = self.TEST_DIR / "vocab_save"
        original_vocab = Vocabulary(
            non_padded_namespaces=["tokens1", "tokens3"])
        original_vocab.add_token_to_namespace("apple",
                                              namespace="tokens0")  # index:2
        original_vocab.add_token_to_namespace("bat",
                                              namespace="tokens0")  # index:3
        original_vocab.add_token_to_namespace("cat",
                                              namespace="tokens0")  # index:4

        original_vocab.add_token_to_namespace("apple",
                                              namespace="tokens1")  # index:0
        original_vocab.add_token_to_namespace("bat",
                                              namespace="tokens1")  # index:1
        original_vocab.add_token_to_namespace("cat",
                                              namespace="tokens1")  # index:2

        original_vocab.add_token_to_namespace("a",
                                              namespace="tokens2")  # index:0
        original_vocab.add_token_to_namespace("b",
                                              namespace="tokens2")  # index:1
        original_vocab.add_token_to_namespace("c",
                                              namespace="tokens2")  # index:2

        original_vocab.add_token_to_namespace("p",
                                              namespace="tokens3")  # index:0
        original_vocab.add_token_to_namespace("q",
                                              namespace="tokens3")  # index:1

        original_vocab.save_to_files(vocab_dir)

        text_field0 = TextField(
            [
                Token(t)
                for t in ["cat", "an", "apple", "banana", "atom", "bat"]
            ],
            {"tokens0": SingleIdTokenIndexer("tokens0")},
        )
        text_field1 = TextField(
            [
                Token(t)
                for t in ["cat", "an", "apple", "banana", "atom", "bat"]
            ],
            {"tokens1": SingleIdTokenIndexer("tokens1")},
        )
        text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                                {"tokens4": SingleIdTokenIndexer("tokens4")})
        text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                                {"tokens5": SingleIdTokenIndexer("tokens5")})
        instances = Batch([
            Instance({
                "text0": text_field0,
                "text1": text_field1,
                "text4": text_field4,
                "text5": text_field5,
            })
        ])

        params = Params({
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": ["tokens1", "tokens5"],
        })
        extended_vocab = Vocabulary.from_params(params, instances=instances)

        # namespaces tokens0 and tokens1 are common; tokens2 and tokens3 appear
        # only in the original vocab, tokens4 and tokens5 only in the instances.
        extended_namespaces = {*extended_vocab._token_to_index}
        assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

        # Check that the _non_padded_namespaces set is consistent after extension
        assert extended_vocab._non_padded_namespaces == {
            "tokens1", "tokens3", "tokens5"
        }

        # original_vocab["tokens1"] has 3 tokens; the instances add 6 tokens, 3 of
        # which overlap, so the extended (non-padded) namespace has 6 tokens.
        assert extended_vocab.get_vocab_size("tokens1") == 6
        assert extended_vocab.get_vocab_size(
            "tokens0") == 8  # padding + OOV + 3 original + 3 new tokens

        # namespaces tokens2 and tokens3 were only in original_vocab,
        # so their token counts should be unchanged in extended_vocab
        assert extended_vocab.get_vocab_size(
            "tokens2") == original_vocab.get_vocab_size("tokens2")
        assert extended_vocab.get_vocab_size(
            "tokens3") == original_vocab.get_vocab_size("tokens3")

        # namespaces tokens4 and tokens5 appear only in the instances,
        # so their counts in extended_vocab come entirely from the instances
        assert extended_vocab.get_vocab_size(
            "tokens4") == 6  # l,m,n,o + oov + padding
        assert extended_vocab.get_vocab_size("tokens5") == 3  # x,y,z

        # Word2index mapping of all words in all namespaces of original_vocab
        # should be maintained in extended_vocab
        for namespace, token2index in original_vocab._token_to_index.items():
            for token, _ in token2index.items():
                vocab_index = original_vocab.get_token_index(token, namespace)
                extended_vocab_index = extended_vocab.get_token_index(
                    token, namespace)
                assert vocab_index == extended_vocab_index
        # And same for Index2Word mapping
        for namespace, index2token in original_vocab._index_to_token.items():
            for index, _ in index2token.items():
                vocab_token = original_vocab.get_token_from_index(
                    index, namespace)
                extended_vocab_token = extended_vocab.get_token_from_index(
                    index, namespace)
                assert vocab_token == extended_vocab_token
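The size assertions in this test hinge on the padded versus non-padded distinction: padded namespaces reserve two extra entries for padding and OOV. A minimal sketch of just that behaviour:

from allennlp.data import Vocabulary

vocab = Vocabulary(non_padded_namespaces=["labels"])
vocab.add_token_to_namespace("apple", namespace="tokens")
vocab.add_token_to_namespace("apple", namespace="labels")
print(vocab.get_vocab_size("tokens"))  # 3: @@PADDING@@, @@UNKNOWN@@, "apple"
print(vocab.get_vocab_size("labels"))  # 1: just "apple"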
Example #29
 def setUp(self):
     super(TestSpanField, self).setUp()
     self.indexers = {"words": SingleIdTokenIndexer("words")}
     self.text = TextField([Token(t) for t in ["here", "is", "a", "sentence", "for", "spans", "."]],
                           self.indexers)
Example #30
File: atis.py  Project: yucoian/allennlp
    def text_to_instance(self,  # type: ignore
                         utterances: List[str],
                         sql_query_labels: List[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        utterances: ``List[str]``, required.
            List of utterances in the interaction, the last element is the current utterance.
        sql_query_labels: ``List[str]``, optional
            The SQL queries that are given as labels during training or validation.
        """
        utterance = utterances[-1]
        action_sequence: List[str] = []

        if not utterance:
            return None

        world = AtisWorld(utterances=utterances)

        if sql_query_labels:
            # If there are multiple sql queries given as labels, we use the shortest
            # one for training.
            sql_query = min(sql_query_labels, key=len)
            try:
                action_sequence = world.get_action_sequence(sql_query)
            except ParseError:
                logger.debug('Parsing error')

        tokenized_utterance = self._tokenizer.tokenize(utterance.lower())
        utterance_field = TextField(tokenized_utterance, self._token_indexers)

        production_rule_fields: List[Field] = []

        for production_rule in world.all_possible_actions():
            nonterminal, _ = production_rule.split(' ->')
            # The whitespaces are not semantically meaningful, so we filter them out.
            production_rule = ' '.join([token for token in production_rule.split(' ') if token != 'ws'])
            field = ProductionRuleField(production_rule, self._is_global_rule(nonterminal))
            production_rule_fields.append(field)

        action_field = ListField(production_rule_fields)
        action_map = {action.rule: i # type: ignore
                      for i, action in enumerate(action_field.field_list)}
        index_fields: List[Field] = []
        world_field = MetadataField(world)
        fields = {'utterance' : utterance_field,
                  'actions' : action_field,
                  'world' : world_field,
                  'linking_scores' : ArrayField(world.linking_scores)}

        if sql_query_labels is not None:
            fields['sql_queries'] = MetadataField(sql_query_labels)
            if action_sequence:
                for production_rule in action_sequence:
                    index_fields.append(IndexField(action_map[production_rule], action_field))

                action_sequence_field = ListField(index_fields)
                fields['target_action_sequence'] = action_sequence_field
            else:
                # If we are given a SQL query, but we are unable to parse it, then we will skip it.
                return None

        return Instance(fields)
Example #31
    def test_get_padding_lengths_raises_if_no_indexed_tokens(self):

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths()
Example #32
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

        field1 = TextField([Token(t) for t in ["A", "sentence"]],
                           {"characters": TokenCharactersIndexer(namespace="characters",
                                                                 min_padding_length=1)})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        field2 = TextField([Token(t) for t in ["A", "sentence"]],
                           token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                           "characters": TokenCharactersIndexer(namespace="characters",
                                                                                min_padding_length=1)})
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
Example #33
 def text_to_instance(self, subject: str, predicate: str, obj: str) -> Instance:
     concatenated_tuple = " ".join((subject, predicate, obj))
     tokens = self._tokenizer.tokenize(concatenated_tuple)
     return Instance({
         "tokens": TextField(tokens, self._token_indexers)
     })
Example #34
    def text_to_instance(self,
                         question_text: str,
                         passage_text: str,
                         passage_tokens: List[Token],
                         numbers_in_passage: List[Any],
                         number_words: List[str],
                         number_indices: List[int],
                         number_len: List[int],
                         question_id: str = None,
                         answer_annotations: List[List[str]] = None,
                         dataset: str = None) -> Union[Instance, None]:
        # Tokenize question and passage
        question_tokens = self.tokenizer.tokenize(question_text)
        qlen = len(question_tokens)
        plen = len(passage_tokens)

        question_passage_tokens = [Token('[CLS]')] + question_tokens + [
            Token('[SEP]')
        ] + passage_tokens
        if len(question_passage_tokens) > self.max_pieces - 1:
            question_passage_tokens = question_passage_tokens[:self.max_pieces - 1]
            passage_tokens = passage_tokens[:self.max_pieces - qlen - 3]
            plen = len(passage_tokens)
            if len(number_indices) > 0:
                number_indices, number_len, numbers_in_passage = \
                    clipped_passage_num(number_indices, number_len, numbers_in_passage, plen)

        question_passage_tokens += [Token('[SEP]')]
        number_indices = [index + qlen + 2 for index in number_indices] + [-1]
        # Not done in-place so they won't change the numbers saved for the passage
        number_len = number_len + [1]
        numbers_in_passage = numbers_in_passage + [0]
        number_tokens = [Token(str(number)) for number in numbers_in_passage]
        extra_number_tokens = [Token(str(num)) for num in self.extra_numbers]

        mask_indices = [0, qlen + 1, len(question_passage_tokens) - 1]

        fields: Dict[str, Field] = {}

        # Add feature fields
        question_passage_field = TextField(question_passage_tokens,
                                           self.token_indexers)
        fields["question_passage"] = question_passage_field

        number_token_indices = \
            [ArrayField(np.arange(start_ind, start_ind + number_len[i]), padding_value=-1)
             for i, start_ind in enumerate(number_indices)]
        fields["number_indices"] = ListField(number_token_indices)
        numbers_in_passage_field = TextField(number_tokens,
                                             self.token_indexers)
        extra_numbers_field = TextField(extra_number_tokens,
                                        self.token_indexers)
        all_numbers_field = TextField(extra_number_tokens + number_tokens,
                                      self.token_indexers)
        mask_index_fields: List[Field] = [
            IndexField(index, question_passage_field) for index in mask_indices
        ]
        fields["mask_indices"] = ListField(mask_index_fields)

        # Compile question, passage, answer metadata
        metadata = {
            "original_passage": passage_text,
            "original_question": question_text,
            "original_numbers": numbers_in_passage,
            "original_number_words": number_words,
            "extra_numbers": self.extra_numbers,
            "passage_tokens": passage_tokens,
            "question_tokens": question_tokens,
            "question_passage_tokens": question_passage_tokens,
            "question_id": question_id,
            "dataset": dataset
        }

        if answer_annotations:
            answer_texts = answer_annotations[0]
            answer_type = "span"
            tokenized_answer_texts = []
            num_spans = min(len(answer_texts), self.max_spans)
            for answer_text in answer_texts:
                answer_tokens = self.tokenizer.tokenize(answer_text)
                tokenized_answer_texts.append(' '.join(
                    token.text for token in answer_tokens))

            metadata["answer_annotations"] = answer_annotations
            metadata["answer_texts"] = answer_texts
            metadata["answer_tokens"] = tokenized_answer_texts

            # Find answer text in question and passage
            valid_question_spans = DropReader.find_valid_spans(
                question_tokens, tokenized_answer_texts)
            for span_ind, span in enumerate(valid_question_spans):
                valid_question_spans[span_ind] = (span[0] + 1, span[1] + 1)
            valid_passage_spans = DropReader.find_valid_spans(
                passage_tokens, tokenized_answer_texts)
            for span_ind, span in enumerate(valid_passage_spans):
                valid_passage_spans[span_ind] = (span[0] + qlen + 2,
                                                 span[1] + qlen + 2)

            # Get target numbers
            target_numbers = []
            for answer_text in answer_texts:
                if answer_text.strip().count(" ") == 0:
                    number = self.word_to_num(answer_text, True)
                    if number is not None:
                        target_numbers.append(number)

            # Get possible ways to arrive at target numbers with add/sub

            valid_expressions: List[List[int]] = []
            exp_strings = None
            if answer_type in ["number", "date"]:
                if self.exp_search == 'full':
                    expressions = get_full_exp(
                        list(enumerate(self.extra_numbers +
                                       numbers_in_passage)), target_numbers,
                        self.operations, self.op_dict, self.max_depth)
                    zipped = list(zip(*expressions))
                    if zipped:
                        valid_expressions = list(zipped[0])
                        exp_strings = list(zipped[1])
                elif self.exp_search == 'add_sub':
                    valid_expressions = \
                        DropReader.find_valid_add_sub_expressions(self.extra_numbers + numbers_in_passage,
                                                                  target_numbers,
                                                                  self.max_numbers_expression)
                elif self.exp_search == 'template':
                    valid_expressions, exp_strings = \
                        get_template_exp(self.extra_numbers + numbers_in_passage,
                                         target_numbers,
                                         self.templates,
                                         self.template_strings)
                    exp_strings = sum(exp_strings, [])

            # Get possible ways to arrive at target numbers with counting
            valid_counts: List[int] = []
            if answer_type in ["number"]:
                numbers_for_count = list(range(self.max_count + 1))
                valid_counts = DropReader.find_valid_counts(
                    numbers_for_count, target_numbers)

            # Update metadata with answer info
            answer_info = {
                "answer_passage_spans": valid_passage_spans,
                "answer_question_spans": valid_question_spans,
                "num_spans": num_spans,
                "expressions": valid_expressions,
                "counts": valid_counts
            }
            if self.exp_search in ['template', 'full']:
                answer_info['expr_text'] = exp_strings
            metadata["answer_info"] = answer_info

            # Add answer fields
            passage_span_fields: List[Field] = [
                SpanField(span[0], span[1], question_passage_field)
                for span in valid_passage_spans
            ]
            if not passage_span_fields:
                passage_span_fields.append(
                    SpanField(-1, -1, question_passage_field))
            fields["answer_as_passage_spans"] = ListField(passage_span_fields)

            question_span_fields: List[Field] = [
                SpanField(span[0], span[1], question_passage_field)
                for span in valid_question_spans
            ]
            if not question_span_fields:
                question_span_fields.append(
                    SpanField(-1, -1, question_passage_field))
            fields["answer_as_question_spans"] = ListField(
                question_span_fields)

            if self.exp_search == 'add_sub':
                add_sub_signs_field: List[Field] = []
                extra_signs_field: List[Field] = []
                for signs_for_one_add_sub_expressions in valid_expressions:
                    extra_signs = signs_for_one_add_sub_expressions[:len(
                        self.extra_numbers)]
                    normal_signs = signs_for_one_add_sub_expressions[
                        len(self.extra_numbers):]
                    add_sub_signs_field.append(
                        SequenceLabelField(normal_signs,
                                           numbers_in_passage_field))
                    extra_signs_field.append(
                        SequenceLabelField(extra_signs, extra_numbers_field))
                if not add_sub_signs_field:
                    add_sub_signs_field.append(
                        SequenceLabelField([0] * len(number_tokens),
                                           numbers_in_passage_field))
                if not extra_signs_field:
                    extra_signs_field.append(
                        SequenceLabelField([0] * len(self.extra_numbers),
                                           extra_numbers_field))
                fields["answer_as_expressions"] = ListField(
                    add_sub_signs_field)
                if self.extra_numbers:
                    fields["answer_as_expressions_extra"] = ListField(
                        extra_signs_field)
            elif self.exp_search in ['template', 'full']:
                expression_indices = []
                for expression in valid_expressions:
                    if not expression:
                        expression.append(3 * [-1])
                    expression_indices.append(
                        ArrayField(np.array(expression), padding_value=-1))
                if not expression_indices:
                    expression_indices = \
                        [ArrayField(np.array([3 * [-1]]), padding_value=-1) for _ in range(len(self.templates))]
                fields["answer_as_expressions"] = ListField(expression_indices)

            count_fields: List[Field] = [
                LabelField(count_label, skip_indexing=True)
                for count_label in valid_counts
            ]
            if not count_fields:
                count_fields.append(LabelField(-1, skip_indexing=True))
            fields["answer_as_counts"] = ListField(count_fields)
            fields["impossible_answer"] = LabelField(0, skip_indexing=True)

            #fields["num_spans"] = LabelField(num_spans, skip_indexing=True)

        else:
            fields["answer_as_passage_spans"] = ListField(
                [SpanField(-1, -1, question_passage_field)])
            fields["answer_as_counts"] = ListField(
                [LabelField(-1, skip_indexing=True)])
            fields["answer_as_expressions"] = ListField([
                SequenceLabelField([0] * len(numbers_in_passage_field),
                                   numbers_in_passage_field)
            ])
            fields["impossible_answer"] = LabelField(1, skip_indexing=True)
            metadata["answer_annotations"] = [{'spans': [""]}]
            fields["answer_as_question_spans"] = ListField(
                [SpanField(-1, -1, question_passage_field)])

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)
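A small sketch of the ArrayField(padding_value=-1) idiom used for number_indices above; the arrays are arbitrary, the point is that padded positions become -1 so downstream code can mask them out.

import numpy as np

from allennlp.data.fields import ArrayField, ListField

number_indices = ListField([
    ArrayField(np.arange(2, 5), padding_value=-1),   # a number spanning three wordpieces
    ArrayField(np.arange(7, 8), padding_value=-1),   # a single-wordpiece number
])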
Example #35
    def text_to_instance(
            self,  # type: ignore
            premises: Union[List[str], List[List[str]]],
            choices: List[str],
            coverage: List[List[float]],
            label: int = None,
            question: str = None) -> Instance:
        number_of_choices = len(choices)
        if isinstance(premises[0], str):
            premises = [premises] * number_of_choices

        # create an empty dictionary to store the input
        fields: Dict[str, Field] = {}
        tokens = []
        token_type_ids = []
        all_links = []
        all_link_token_ids = []

        if len(coverage) != len(choices):

            logger.error("the dimension of coverage and choices did not match")
            exit(0)

        max_len = 0
        max_premises = 0
        for arr, p in zip(coverage, premises):

            if len(arr) != len(p):
                logger.error(
                    "the dimension of coverage and premises did not match")
                exit(0)
            max_premises = max([max_premises, len(p)])
            max_len = max([max_len, max([len(a) for a in arr])])

        # padding
        np_coverage = np.zeros([len(coverage), max_premises, max_len])
        for c_idx in range(len(coverage)):
            for p_idx in range(len(coverage[c_idx])):
                np_coverage[
                    c_idx, p_idx,
                    0:len(coverage[c_idx][p_idx])] = coverage[c_idx][p_idx]

        fields['coverage'] = ArrayField(np_coverage)

        for premise, hypothesis in zip(premises, choices):

            # two major keys
            # ph: [cls]all_premise[sep]hypothesis[sep]
            # two different segment_ids
            # join all premise sentences
            all_premise = " ".join(premise)
            if question is None:
                ph_tokens, ph_token_type_ids = self.bert_features_from_qa(
                    question=all_premise, answer=hypothesis)
            else:
                ph_tokens, ph_token_type_ids = self.bert_features_from_qa(
                    question=question, context=all_premise, answer=hypothesis)

            # create a simple textfield for hypothesis
            tokens_field = TextField(ph_tokens, self._token_indexers)
            tokens.append(tokens_field)
            token_type_ids.append(
                SequenceLabelField(ph_token_type_ids, tokens_field))

            links_segment_2d = []
            links_2d = []

            for i in range(0, len(premise)):
                tokenized_links_field = []
                type_ids_of_links = []
                for j in range(0, len(premise)):
                    if i == j:
                        continue
                    else:
                        if question is None:
                            pp_tokens, pp_token_type_ids = self.bert_features_from_qa(
                                question=premise[i],
                                answer=hypothesis,
                                context=premise[j])
                        else:
                            pp_tokens, pp_token_type_ids = self.bert_features_from_qa(
                                question=question,
                                context2=premise[j],
                                answer=hypothesis,
                                context=premise[i])
                        pp_tokens_field = TextField(pp_tokens,
                                                    self._token_indexers)
                        tokenized_links_field.append(pp_tokens_field)
                        type_ids_of_links.append(
                            SequenceLabelField(pp_token_type_ids,
                                               pp_tokens_field))
                links_2d.append(ListField(tokenized_links_field))
                links_segment_2d.append(ListField(type_ids_of_links))

            if len(premise) >= 2:
                all_links.append(ListField(links_2d))
                all_link_token_ids.append(ListField(links_segment_2d))
            else:
                # add an empty list field
                empty_tokens_field = [TextField([], self._token_indexers)]
                empty_type_ids_of_links = [
                    SequenceLabelField([], empty_tokens_field[0])
                ]
                all_links.append(ListField([ListField(empty_tokens_field)]))
                all_link_token_ids.append(
                    ListField([ListField(empty_type_ids_of_links)]))

        if label is not None:
            fields['label'] = LabelField(label, skip_indexing=True)

        fields['tokens'] = ListField(tokens)
        fields['token_type_ids'] = ListField(token_type_ids)

        fields['links_tokens'] = ListField(all_links)
        fields['links_token_type_ids'] = ListField(all_link_token_ids)

        return Instance(fields)
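Every entry of the outer links ListField above needs the same two-level structure, which is why the empty case is wrapped the same way; a toy illustration with placeholder tokens:

from allennlp.data.fields import ListField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

indexers = {"tokens": SingleIdTokenIndexer()}
links_for_one_choice = ListField([                   # one entry per premise
    ListField([TextField([Token("a")], indexers)]),  # its links to the other premises
])
links_for_empty_choice = ListField([                 # same two-level structure,
    ListField([TextField([], indexers)]),            # even when there are no links
])
all_links = ListField([links_for_one_choice, links_for_empty_choice])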
Example #36
    def test_padding_lengths_are_computed_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"words_length": 5, "num_tokens": 5}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "characters_length": 5, "num_token_characters": 8}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters"),
                                          "words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5,
                                   "characters_length": 5,
                                   "words_length": 5,
                                   "num_token_characters": 8}
Example #37
def make_reading_comprehension_instance(
    question_tokens: List[Token],
    passage_tokens: List[Token],
    token_indexers: Dict[str, TokenIndexer],
    passage_text: str,
    token_spans: List[Tuple[int, int]] = None,
    answer_texts: List[str] = None,
    additional_metadata: Dict[str, Any] = None,
) -> Instance:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for use
    in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
    ``TextFields``; and ``metadata``, a ``MetadataField``.  Additionally, if both ``answer_texts``
    and ``token_spans`` are given, the ``Instance`` has ``span_start`` and ``span_end``
    fields, which are both ``IndexFields``.

    Parameters
    ----------
    question_tokens : ``List[Token]``
        An already-tokenized question.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_spans : ``List[Tuple[int, int]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training.  This is
        a list because there might be several possible correct answer spans in the passage.
        Currently, we just select the most frequent span in this list (i.e., SQuAD has multiple
        annotations on the dev set; this will select the span that the most annotators gave as
        correct).
    answer_texts : ``List[str]``, optional
        All valid answer strings for the given question.  In SQuAD, e.g., the training set has
        exactly one answer per question, but the dev and test sets have several.  TriviaQA has many
        possible answers, which are the aliases for the known correct entity.  This is put into the
        metadata for use with official evaluation scripts, but not used anywhere else.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.  If
        you want any other metadata to be associated with each instance, you can pass that in here.
        This dictionary will get added to the ``metadata`` dictionary we already construct.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text))
                       for token in passage_tokens]

    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields["passage"] = passage_field
    fields["question"] = TextField(question_tokens, token_indexers)
    metadata = {
        "original_passage": passage_text,
        "token_offsets": passage_offsets,
        "question_tokens": [token.text for token in question_tokens],
        "passage_tokens": [token.text for token in passage_tokens],
    }
    if answer_texts:
        metadata["answer_texts"] = answer_texts

    if token_spans:
        # There may be multiple answer annotations, so we pick the one that occurs the most.  This
        # only matters on the SQuAD dev set, and it means our computed metrics ("start_acc",
        # "end_acc", and "span_acc") aren't quite the same as the official metrics, which look at
        # all of the annotations.  This is why we have a separate official SQuAD metric calculation
        # (the "em" and "f1" metrics use the official script).
        candidate_answers: Counter = Counter()
        for span_start, span_end in token_spans:
            candidate_answers[(span_start, span_end)] += 1
        span_start, span_end = candidate_answers.most_common(1)[0][0]

        fields["span_start"] = IndexField(span_start, passage_field)
        fields["span_end"] = IndexField(span_end, passage_field)

    metadata.update(additional_metadata)
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
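A hedged usage sketch for the helper above; the question, passage, token offsets, and answer span are invented purely for illustration.

from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

passage = "AllenNLP is a library built on PyTorch ."
passage_tokens = []
offset = 0
for text in passage.split():
    passage_tokens.append(Token(text, idx=offset))
    offset += len(text) + 1

instance = make_reading_comprehension_instance(
    question_tokens=[Token("What"), Token("is"), Token("AllenNLP"), Token("?")],
    passage_tokens=passage_tokens,
    token_indexers={"tokens": SingleIdTokenIndexer()},
    passage_text=passage,
    token_spans=[(2, 6)],                    # "a library built on PyTorch"
    answer_texts=["a library built on PyTorch"],
)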
Example #38
    def text_to_instance(
        self,  # type: ignore
        question: str,
        table_lines: List[List[str]],
        target_values: List[str] = None,
        offline_search_output: List[str] = None,
    ) -> Instance:
        """
        Reads text inputs and makes an instance. We pass the ``table_lines`` to
        ``TableQuestionContext.read_from_lines``, which accepts them either as lines from the CoreNLP-processed
        tagged files that come with the dataset, or simply as tsv lines where each line corresponds to a row
        and the cells are tab-separated.

        Parameters
        ----------
        question : ``str``
            Input question
        table_lines : ``List[List[str]]``
            The table content optionally preprocessed by CoreNLP. See ``TableQuestionContext.read_from_lines``
            for the expected format.
        target_values : ``List[str]``, optional
            Target values for the denotations the logical forms should execute to. Not required for testing.
        offline_search_output : ``List[str]``, optional
            List of logical forms, produced by offline search. Not required during test.
        """
        tokenized_question = self._tokenizer.tokenize(question.lower())
        question_field = TextField(tokenized_question,
                                   self._question_token_indexers)
        metadata: Dict[str, Any] = {
            "question_tokens": [x.text for x in tokenized_question]
        }
        table_context = TableQuestionContext.read_from_lines(
            table_lines, tokenized_question)
        world = WikiTablesLanguage(table_context)
        world_field = MetadataField(world)
        # Note: we are not passing any feature extractors when instantiating the field below.
        # This will make it use all the available extractors.
        table_field = KnowledgeGraphField(
            table_context.get_table_knowledge_graph(),
            tokenized_question,
            self._table_token_indexers,
            tokenizer=self._tokenizer,
            include_in_vocab=self._use_table_for_vocab,
            max_table_tokens=self._max_table_tokens,
        )
        production_rule_fields: List[Field] = []
        for production_rule in world.all_possible_productions():
            _, rule_right_side = production_rule.split(" -> ")
            is_global_rule = not world.is_instance_specific_entity(
                rule_right_side)
            field = ProductionRuleField(production_rule,
                                        is_global_rule=is_global_rule)
            production_rule_fields.append(field)
        action_field = ListField(production_rule_fields)

        fields = {
            "question": question_field,
            "metadata": MetadataField(metadata),
            "table": table_field,
            "world": world_field,
            "actions": action_field,
        }

        if target_values is not None:
            target_values_field = MetadataField(target_values)
            fields["target_values"] = target_values_field

        # We'll make each target action sequence a List[IndexField], where the index is into
        # the action list we made above.  We need to ignore the type here because mypy doesn't
        # like `action.rule` - it's hard to tell mypy that the ListField is made up of
        # ProductionRuleFields.
        action_map = {
            action.rule: i
            for i, action in enumerate(action_field.field_list)
        }  # type: ignore
        if offline_search_output:
            action_sequence_fields: List[Field] = []
            for logical_form in offline_search_output:
                try:
                    action_sequence = world.logical_form_to_action_sequence(
                        logical_form)
                    index_fields: List[Field] = []
                    for production_rule in action_sequence:
                        index_fields.append(
                            IndexField(action_map[production_rule],
                                       action_field))
                    action_sequence_fields.append(ListField(index_fields))
                except ParsingError as error:
                    logger.debug(
                        f"Parsing error: {error.message}, skipping logical form"
                    )
                    logger.debug(f"Question was: {question}")
                    logger.debug(f"Logical form was: {logical_form}")
                    logger.debug(f"Table info was: {table_lines}")
                    continue
                except KeyError as error:
                    logger.debug(
                        f"Missing production rule: {error.args}, skipping logical form"
                    )
                    logger.debug(f"Question was: {question}")
                    logger.debug(f"Table info was: {table_lines}")
                    logger.debug(f"Logical form was: {logical_form}")
                    continue
                except:  # noqa
                    logger.error(logical_form)
                    raise
                if len(action_sequence_fields
                       ) >= self._max_offline_logical_forms:
                    break

            if not action_sequence_fields:
                # This is not great, but we're only doing it when we're passed logical form
                # supervision, so we're expecting labeled logical forms, but we can't actually
                # produce the logical forms.  We should skip this instance.  Note that this affects
                # _dev_ and _test_ instances, too, so your metrics could be over-estimates on the
                # full test data.
                return None
            fields["target_action_sequences"] = ListField(
                action_sequence_fields)
        if self._output_agendas:
            agenda_index_fields: List[Field] = []
            for agenda_string in world.get_agenda(conservative=True):
                agenda_index_fields.append(
                    IndexField(action_map[agenda_string], action_field))
            if not agenda_index_fields:
                agenda_index_fields = [IndexField(-1, action_field)]
            fields["agenda"] = ListField(agenda_index_fields)
        return Instance(fields)
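
The reader above encodes each gold action sequence as a ``ListField`` of ``IndexFields`` whose indices point back into ``action_field`` via the ``action_map`` lookup. A minimal, self-contained sketch of that pattern follows; it stands in plain ``LabelFields`` for the ``ProductionRuleFields`` used by the real reader, purely for illustration.

from allennlp.data.fields import IndexField, LabelField, ListField

# Toy stand-in for the action list; the real reader uses ProductionRuleFields here.
actions = ["S -> A", "A -> B C", "C -> 'x'"]
action_field = ListField([LabelField(rule, label_namespace="rule_labels") for rule in actions])

# Map each rule string to its position in the action list.
action_map = {rule: i for i, rule in enumerate(actions)}

# One gold action sequence becomes a ListField of IndexFields into action_field.
gold_sequence = ["S -> A", "C -> 'x'"]
sequence_field = ListField([IndexField(action_map[rule], action_field) for rule in gold_sequence])
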
def make_reading_comprehension_instance_quac(
    question_list_tokens: List[List[Token]],
    passage_tokens: List[Token],
    token_indexers: Dict[str, TokenIndexer],
    passage_text: str,
    token_span_lists: List[List[Tuple[int, int]]] = None,
    yesno_list: List[int] = None,
    followup_list: List[int] = None,
    additional_metadata: Dict[str, Any] = None,
    num_context_answers: int = 0,
) -> Instance:
    """
    Converts a list of questions, a passage, and optional answer spans to an ``Instance`` for use
    in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question``, a ``ListField`` of
    ``TextFields``; ``passage``, a ``TextField``; and ``metadata``, a ``MetadataField``.
    Additionally, if ``token_span_lists`` is given, the ``Instance`` has ``span_start`` and
    ``span_end`` fields (each a ``ListField`` of ``IndexFields``), along with ``yesno_list`` and
    ``followup_list`` label fields.

    Parameters
    ----------
    question_list_tokens : ``List[List[Token]]``
        An already-tokenized list of questions. Each dialog has multiple questions.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_span_lists : ``List[List[Tuple[int, int]]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training.  This is
        a list of lists: the outer list has one entry per question in the dialog, and each inner
        list holds the possible correct answer spans in the passage for that question.  Currently
        we just select the last span in each inner list (QuAC has multiple annotations on the dev
        set; the last span is the one given by the original annotator).
    yesno_list : ``List[int]``
        List of the affirmation bit for each question-answer pair.
    followup_list : ``List[int]``
        List of the continuation bit for each question-answer pair.
    num_context_answers : ``int``, optional
        How many answers to encode into the passage.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.  If
        you want any other metadata to be associated with each instance, you can pass that in here.
        This dictionary will get added to the ``metadata`` dictionary we already construct.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text))
                       for token in passage_tokens]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields["passage"] = passage_field
    fields["question"] = ListField([
        TextField(q_tokens, token_indexers)
        for q_tokens in question_list_tokens
    ])
    metadata = {
        "original_passage": passage_text,
        "token_offsets": passage_offsets,
        "question_tokens": [[token.text for token in question_tokens]
                            for question_tokens in question_list_tokens],
        "passage_tokens": [token.text for token in passage_tokens],
    }
    p1_answer_marker_list: List[Field] = []
    p2_answer_marker_list: List[Field] = []
    p3_answer_marker_list: List[Field] = []

    def get_tag(i, i_name):
        # Generate a tag to mark previous answer span in the passage.
        return "<{0:d}_{1:s}>".format(i, i_name)

    def mark_tag(span_start, span_end, passage_tags, prev_answer_distance):
        try:
            assert span_start >= 0
            assert span_end >= 0
        except:  # noqa
            raise ValueError(
                "Previous {0:d}th answer span should have been updated!".
                format(prev_answer_distance))
        # Modify "tags" to mark previous answer span.
        if span_start == span_end:
            passage_tags[prev_answer_distance][span_start] = get_tag(
                prev_answer_distance, "")
        else:
            passage_tags[prev_answer_distance][span_start] = get_tag(
                prev_answer_distance, "start")
            passage_tags[prev_answer_distance][span_end] = get_tag(
                prev_answer_distance, "end")
            for passage_index in range(span_start + 1, span_end):
                passage_tags[prev_answer_distance][passage_index] = get_tag(
                    prev_answer_distance, "in")

    if token_span_lists:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        p1_span_start, p1_span_end, p2_span_start = -1, -1, -1
        p2_span_end, p3_span_start, p3_span_end = -1, -1, -1
        # Loop over the answer spans for each question.
        for question_index, answer_span_lists in enumerate(token_span_lists):
            span_start, span_end = answer_span_lists[-1]  # Last one is the original answer
            span_start_list.append(IndexField(span_start, passage_field))
            span_end_list.append(IndexField(span_end, passage_field))
            prev_answer_marker_lists = [
                ["O"] * len(passage_tokens),
                ["O"] * len(passage_tokens),
                ["O"] * len(passage_tokens),
                ["O"] * len(passage_tokens),
            ]
            if question_index > 0 and num_context_answers > 0:
                mark_tag(p1_span_start, p1_span_end, prev_answer_marker_lists,
                         1)
                if question_index > 1 and num_context_answers > 1:
                    mark_tag(p2_span_start, p2_span_end,
                             prev_answer_marker_lists, 2)
                    if question_index > 2 and num_context_answers > 2:
                        mark_tag(p3_span_start, p3_span_end,
                                 prev_answer_marker_lists, 3)
                    p3_span_start = p2_span_start
                    p3_span_end = p2_span_end
                p2_span_start = p1_span_start
                p2_span_end = p1_span_end
            p1_span_start = span_start
            p1_span_end = span_end
            if num_context_answers > 2:
                p3_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[3],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 1:
                p2_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[2],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 0:
                p1_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[1],
                                       passage_field,
                                       label_namespace="answer_tags"))
        fields["span_start"] = ListField(span_start_list)
        fields["span_end"] = ListField(span_end_list)
        if num_context_answers > 0:
            fields["p1_answer_marker"] = ListField(p1_answer_marker_list)
            if num_context_answers > 1:
                fields["p2_answer_marker"] = ListField(p2_answer_marker_list)
                if num_context_answers > 2:
                    fields["p3_answer_marker"] = ListField(
                        p3_answer_marker_list)
        fields["yesno_list"] = ListField([
            LabelField(yesno, label_namespace="yesno_labels")
            for yesno in yesno_list
        ])
        fields["followup_list"] = ListField([
            LabelField(followup, label_namespace="followup_labels")
            for followup in followup_list
        ])
    metadata.update(additional_metadata)
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
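
As a rough usage sketch, assuming an AllenNLP 0.x environment where ``WordTokenizer`` and ``SingleIdTokenIndexer`` are available, the function can be called with only its required arguments; the span, yes/no, and follow-up fields are added only when ``token_span_lists`` is supplied. The passage and question strings below are made up for illustration.

from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer

tokenizer = WordTokenizer()
passage_text = "QuAC is a dataset for question answering in context."
passage_tokens = tokenizer.tokenize(passage_text)  # Tokens carry character offsets (token.idx).
question_list_tokens = [tokenizer.tokenize("What is QuAC?"),
                        tokenizer.tokenize("What is it used for?")]

instance = make_reading_comprehension_instance_quac(
    question_list_tokens=question_list_tokens,
    passage_tokens=passage_tokens,
    token_indexers={"tokens": SingleIdTokenIndexer()},
    passage_text=passage_text,
)
# instance.fields now contains "question", "passage", and "metadata" only.
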
Example #40
0
 def text_to_instance(
     self,
     source_context: str,
     source: str,
     target_context: str = None,
     target: str = None,
     doc_id: int = None,
     sent_id: int = None,
     context_sent_id: int = None,
     label: int = None,
 ) -> Instance:
     fields = {}
     target_tokens = ([] if target is None else
                      self._target_tokenizer.tokenize(target))
     if self._translation_data_mode == "2-to-2":
         target_context_tokens = self._target_tokenizer.tokenize(
             target_context)
         target_tokens = (target_context_tokens + [Token(CONCAT_SYMBOL)] +
                          target_tokens)
     if target_tokens:
         target_tokens.insert(0, Token(START_SYMBOL))
         target_tokens.append(Token(END_SYMBOL))
     if not self._source_only:
         fields["target_tokens"] = TextField(target_tokens,
                                             self._target_token_indexers)
     if self._classification_data_mode != "none":
         # PretrainedTransformerTokenizer can now add the special tokens by itself.
         # What we want here is: [CLS] source_context [SEP] source [SEP]
         for key, value in zip(
             ("doc_id", "sent_id", "context_sent_id"),
             (doc_id, sent_id, context_sent_id),
         ):
             if value is not None:
                 fields[key] = MetadataField(value)
         if label is not None:
             fields["label"] = LabelField(str(label))
         source_tokens = self._source_tokenizer.tokenize_sentence_pair(
             source_context, source)
         fields["source_tokens"] = TextField(source_tokens,
                                             self._source_token_indexers)
     else:
         source_context_tokens = (
             [] if source_context is None else
             self._source_tokenizer.tokenize(source_context))
         source_tokens = ([] if source is None else
                          self._source_tokenizer.tokenize(source))
         if self._translation_data_mode != "1-to-1":
             if self._concat_source_context:
                 context_factor, source_factor = Token("C"), Token("S")
                 source_factors = ([context_factor] * (len(source_context_tokens) + 1) +
                                   [source_factor] * len(source_tokens))
                 source_tokens = (source_context_tokens +
                                  [Token(CONCAT_SYMBOL)] + source_tokens)
                 if self._source_add_factors:
                     fields["source_factors"] = TextField(
                         source_factors, self._source_factor_indexers)
             else:
                 fields["source_context_tokens"] = TextField(
                     source_context_tokens, self._source_token_indexers)
         if self._source_add_start_token:
             source_tokens.insert(0, Token(START_SYMBOL))
         if self._source_add_end_token:
             source_tokens.append(Token(END_SYMBOL))
         fields["source_tokens"] = TextField(source_tokens,
                                             self._source_token_indexers)
     return Instance(fields)
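
In the ``_concat_source_context`` branch above, one factor token is aligned with every source token: ``"C"`` for each context token plus the joining symbol, and ``"S"`` for each token of the current sentence. A standalone sketch of that alignment follows; ``CONCAT_SYMBOL`` is defined elsewhere in the reader, so the ``"@concat@"`` value below is only a placeholder assumption.

from allennlp.data.tokenizers import Token

CONCAT_SYMBOL = "@concat@"  # placeholder; the real symbol is defined in the reader's module
source_context_tokens = [Token("She"), Token("arrived"), Token("yesterday"), Token(".")]
source_tokens = [Token("She"), Token("was"), Token("tired"), Token(".")]

# One factor per context token, plus one for the concatenation symbol, then one per source token.
source_factors = [Token("C")] * (len(source_context_tokens) + 1) + [Token("S")] * len(source_tokens)
source_tokens = source_context_tokens + [Token(CONCAT_SYMBOL)] + source_tokens

# The two sequences line up one-to-one, so the factors can go into a parallel TextField.
assert len(source_factors) == len(source_tokens)
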
Example #41
0
    def test_padding_lengths_are_computed_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"words_length": 5, "num_tokens": 5}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters",
                                                                               min_padding_length=1)})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "characters_length": 5, "num_token_characters": 8}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters",
                                                                               min_padding_length=1),
                                          "words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5,
                                   "characters_length": 5,
                                   "words_length": 5,
                                   "num_token_characters": 8}