def test_as_tensor_handles_words(self):
    """Index a word-only ``TextField`` and compare its tensor with the expected ids."""
    words = ["This", "is", "a", "sentence", "."]
    indexers = {"words": SingleIdTokenIndexer("words")}
    field = TextField([Token(word) for word in words], token_indexers=indexers)
    field.index(self.vocab)

    tensors = field.as_tensor(field.get_padding_lengths())
    actual_ids = tensors["words"].detach().cpu().numpy()
    expected_ids = numpy.array([1, 1, 1, 2, 1])
    numpy.testing.assert_array_almost_equal(actual_ids, expected_ids)
def test_index_converts_field_correctly(self):
    """Check that ``index`` stores the vocabulary ids for word and character indexers."""
    vocab = Vocabulary()
    # Register the word tokens first, then the characters, keeping every returned id.
    sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
    capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
    char_ids = {char: vocab.add_token_to_namespace(char, namespace='characters')
                for char in "Asentc"}

    expected_words = [capital_a_index, sentence_index]
    expected_characters = [[char_ids["A"]],
                           [char_ids[char] for char in "sentence"]]

    def make_tokens():
        # Every field under test gets its own freshly built token list.
        return [Token(text) for text in ["A", "sentence"]]

    # pylint: disable=protected-access
    words_only = TextField(make_tokens(),
                           {"words": SingleIdTokenIndexer(namespace="words")})
    words_only.index(vocab)
    assert words_only._indexed_tokens["words"] == expected_words

    characters_only = TextField(make_tokens(),
                                {"characters": TokenCharactersIndexer(namespace="characters")})
    characters_only.index(vocab)
    assert characters_only._indexed_tokens["characters"] == expected_characters

    combined_indexers = {"words": SingleIdTokenIndexer(namespace="words"),
                         "characters": TokenCharactersIndexer(namespace="characters")}
    combined = TextField(make_tokens(), token_indexers=combined_indexers)
    combined.index(vocab)
    assert combined._indexed_tokens["words"] == expected_words
    assert combined._indexed_tokens["characters"] == expected_characters
def test_as_tensor_handles_characters(self):
    """Index a character-only ``TextField`` and compare its tensor with the expected ids."""
    words = ["This", "is", "a", "sentence", "."]
    indexers = {"characters": TokenCharactersIndexer("characters")}
    field = TextField([Token(word) for word in words], token_indexers=indexers)
    field.index(self.vocab)

    tensors = field.as_tensor(field.get_padding_lengths())
    # One row per token, one column per character position.
    expected_character_array = numpy.array([
        [1, 1, 1, 3, 0, 0, 0, 0],
        [1, 3, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0],
        [3, 4, 5, 6, 4, 5, 7, 4],
        [1, 0, 0, 0, 0, 0, 0, 0],
    ])
    actual_character_array = tensors["characters"].detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(actual_character_array,
                                            expected_character_array)
def setUp(self):
    """Build the vocabulary, token indexers and fields shared by the tests."""
    self.vocab = Vocabulary()
    for word in ("this", "is", "a", "sentence"):
        self.vocab.add_token_to_namespace(word, "words")
    for character in "sentc":
        self.vocab.add_token_to_namespace(character, 'characters')
    for label in "abcdefghijk":
        self.vocab.add_token_to_namespace(label, 'labels')

    self.word_indexer = {"words": SingleIdTokenIndexer("words")}
    self.words_and_characters_indexers = {
        "words": SingleIdTokenIndexer("words"),
        "characters": TokenCharactersIndexer("characters"),
    }

    sentences = (
        ["this", "is", "a", "sentence"],
        ["this", "is", "a", "different", "sentence"],
        ["this", "is", "another", "sentence"],
    )
    self.field1, self.field2, self.field3 = (
        TextField([Token(word) for word in sentence], self.word_indexer)
        for sentence in sentences
    )

    # Empty counterparts are derived from the populated fields.
    self.empty_text_field = self.field1.empty_field()
    self.index_field = IndexField(1, self.field1)
    self.empty_index_field = self.index_field.empty_field()
    self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
    self.empty_sequence_label_field = self.sequence_label_field.empty_field()
    super(TestListField, self).setUp()
def test_field_counts_vocab_items_correctly(self):
    """``count_vocab_items`` should tally tokens under each indexer's namespace."""
    words = ["This", "is", "a", "sentence", "."]
    expected_word_counts = {"This": 1, "is": 1, "a": 1, "sentence": 1, ".": 1}
    expected_character_counts = {"T": 1, "h": 1, "i": 2, "s": 3, "a": 1,
                                 "e": 3, "n": 2, "t": 1, "c": 1, ".": 1}

    def count_items(token_indexers):
        # Build a fresh field and a fresh counter for each scenario.
        field = TextField([Token(word) for word in words],
                          token_indexers=token_indexers)
        counter = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(counter)
        return counter

    def check_counts(counter, namespace, expected):
        for token, count in expected.items():
            assert counter[namespace][token] == count

    # Words only.
    namespace_token_counts = count_items({"words": SingleIdTokenIndexer("words")})
    check_counts(namespace_token_counts, "words", expected_word_counts)
    assert list(namespace_token_counts.keys()) == ["words"]

    # Characters only.
    namespace_token_counts = count_items(
        {"characters": TokenCharactersIndexer("characters")})
    check_counts(namespace_token_counts, "characters", expected_character_counts)
    assert list(namespace_token_counts.keys()) == ["characters"]

    # Both indexers at once.
    namespace_token_counts = count_items(
        {"words": SingleIdTokenIndexer("words"),
         "characters": TokenCharactersIndexer("characters")})
    check_counts(namespace_token_counts, "characters", expected_character_counts)
    check_counts(namespace_token_counts, "words", expected_word_counts)
    assert set(namespace_token_counts.keys()) == {"words", "characters"}
def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
    """Padding lengths larger than the data should yield zero-padded tensors."""
    indexers = {"words": SingleIdTokenIndexer("words"),
                "characters": TokenCharactersIndexer("characters")}
    field = TextField([Token(word) for word in ["a", "sentence", "."]],
                      token_indexers=indexers)
    field.index(self.vocab)

    # Ask for more tokens and more characters per token than the field holds.
    padding_lengths = field.get_padding_lengths()
    padding_lengths["num_tokens"] = 5
    padding_lengths["num_token_characters"] = 10
    tensors = field.as_tensor(padding_lengths)

    actual_words = tensors["words"].detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(actual_words,
                                            numpy.array([1, 2, 1, 0, 0]))

    actual_characters = tensors["characters"].detach().cpu().numpy()
    expected_characters = numpy.array([
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    ])
    numpy.testing.assert_array_almost_equal(actual_characters, expected_characters)
def test_token_embedder_returns_dict(self):
    """An indexer that returns several keys should get a padding length and a tensor per key."""
    indexers = {"field_with_dict": DictReturningTokenIndexer(),
                "words": SingleIdTokenIndexer("words"),
                "characters": TokenCharactersIndexer("characters")}
    field = TextField([Token(text) for text in ["A", "sentence"]],
                      token_indexers=indexers)
    field.index(self.vocab)

    padding_lengths = field.get_padding_lengths()
    expected_lengths = {
        'token_ids': 5,
        'additional_key': 2,
        'words': 2,
        'characters': 2,
        'num_token_characters': 8
    }
    assert padding_lengths == expected_lengths

    # Enlarge the per-key lengths and check the tensor shapes follow them.
    overrides = {'token_ids': 7, 'additional_key': 3, 'words': 4, 'characters': 4}
    for key, length in overrides.items():
        padding_lengths[key] = length
    tensors = field.as_tensor(padding_lengths)

    expected_shapes = {'token_ids': [7], 'additional_key': [3],
                       'words': [4], 'characters': [4, 8]}
    for key, shape in expected_shapes.items():
        assert list(tensors[key].shape) == shape
def test_saving_and_loading_works_with_byte_encoding(self):
    """Indexing with a saved-and-reloaded vocabulary must match the original indexing."""
    # Build a vocabulary from a byte-encoded TextField, index the field, save the
    # vocabulary, load it back, index an identical field with the loaded copy and
    # make sure both indexings agree.
    character_tokenizer = CharacterTokenizer(byte_encoding='utf-8')
    token_indexer = TokenCharactersIndexer(character_tokenizer=character_tokenizer)
    tokens = [Token(text) for text in ["Øyvind", "für", "汉字"]]

    # pylint: disable=protected-access
    first_field = TextField(tokens, {"characters": token_indexer})
    vocab = Vocabulary.from_instances(Batch([Instance({"sentence": first_field})]))
    first_field.index(vocab)
    indexed_before = deepcopy(first_field._indexed_tokens)

    save_dir = self.TEST_DIR / 'vocab_save'
    vocab.save_to_files(save_dir)
    reloaded_vocab = Vocabulary.from_files(save_dir)

    second_field = TextField(tokens, {"characters": token_indexer})
    second_field.index(reloaded_vocab)
    indexed_after = deepcopy(second_field._indexed_tokens)

    assert indexed_before == indexed_after
def test_get_padding_lengths_raises_if_no_indexed_tokens(self):
    """``get_padding_lengths`` must raise ``ConfigurationError`` on a field that was never indexed."""
    words = ["This", "is", "a", "sentence", "."]
    unindexed_field = TextField([Token(word) for word in words],
                                token_indexers={"words": SingleIdTokenIndexer("words")})
    with pytest.raises(ConfigurationError):
        unindexed_field.get_padding_lengths()
def _prepare_next_inputs(self,
                         predictions: torch.Tensor,
                         target_attention_map: torch.Tensor,
                         target_dynamic_vocabs: List[Dict[int, str]],
                         meta_data: List[Dict],
                         batch_size: int,
                         last_decoding_step: int,
                         source_dynamic_vocab_size: int) -> Dict:
    """
    Read out a group of hybrid predictions and build the next decoder inputs.

    Depending on how each node was predicted (generation, source-side copy or
    target-side copy), look up the corresponding token, node index and POS tag,
    tensorize them for the next decoding step, and update the target attention
    map and the target dynamic vocabularies in place.

    :param predictions: [group_size,]
    :param target_attention_map: [group_size, target_length, target_dynamic_vocab_size].
    :param target_dynamic_vocabs: a group_size list of target dynamic vocabs.
    :param meta_data: meta data for each instance.
    :param batch_size: int.
    :param last_decoding_step: the decoding step starts from 0, so the last decoding step
        starts from -1.
    :param source_dynamic_vocab_size: int.
    :return: a dict with ``tokens`` (the tensorized target tokens), ``node_indices`` and
        ``pos_tags`` (both unsqueezed on dimension 1).
    """
    # By default, a node created via generation or source-side copy gets node index
    # last_decoding_step + 1. The shift by one exists because node index 0 is reserved
    # for "no target copy". See `_prepare_inputs` for detail.
    default_node_index = last_decoding_step + 1

    # When the group holds `beam_size` predictions per batch instance, a group
    # position has to be mapped back to the instance that owns its meta data.
    group_is_beam_expanded = predictions.size(0) == batch_size * self._beam_size

    token_instances = []
    node_indices = torch.zeros_like(predictions)
    pos_tags = torch.zeros_like(predictions)

    for position, prediction in enumerate(predictions.tolist()):
        if group_is_beam_expanded:
            meta_position = position // self._beam_size
        else:
            meta_position = position
        instance_meta = meta_data[meta_position]
        pos_tag_lut = instance_meta["pos_tag_lut"]
        target_dynamic_vocab = target_dynamic_vocabs[position]

        if prediction < self._vocab_size:
            # Generation.
            token = self.vocab.get_token_from_index(
                prediction, self._target_output_namespace)
            node_index = default_node_index
        elif self._vocab_size <= prediction < self._vocab_size + source_dynamic_vocab_size:
            # Source-side copy.
            source_index = prediction - self._vocab_size
            source_dynamic_vocab = instance_meta["source_dynamic_vocab"]
            token = source_dynamic_vocab.get_token_from_idx(source_index)
            node_index = default_node_index
        else:
            # Target-side copy: the remaining offset is itself the node index.
            node_index = prediction - (self._vocab_size + source_dynamic_vocab_size)
            token = target_dynamic_vocab[node_index]

        # Every branch resolves the POS tag the same way.
        pos_tag = pos_tag_lut.get(token, DEFAULT_OOV_TOKEN)

        target_token = TextField([Token(token)], instance_meta["target_token_indexers"])
        token_instances.append(Instance({"target_tokens": target_token}))
        node_indices[position] = node_index
        pos_tags[position] = self.vocab.get_token_index(pos_tag, self._pos_tag_namespace)

        # For <BOS>, the last decoding step is set to -1 and nothing is recorded.
        if last_decoding_step != -1:
            target_attention_map[position, last_decoding_step, node_index] = 1
            target_dynamic_vocab[node_index] = token

    # Convert tokens to tensors.
    batch = Batch(token_instances)
    batch.index_instances(self.vocab)
    token_tensors = batch.as_tensor_dict(batch.get_padding_lengths())["target_tokens"]
    tokens = {key: tensor.type_as(predictions) for key, tensor in token_tensors.items()}

    return {
        "tokens": tokens,
        # [group_size, 1]
        "node_indices": node_indices.unsqueeze(1),
        "pos_tags": pos_tags.unsqueeze(1),
    }
def make_marginal_bert_drop_instance(passage_question_tokens: List[Token],
                                     implicit_tokens: List[Token],
                                     number_tokens: List[Token],
                                     number_indices: List[int],
                                     token_indexers: Dict[str, TokenIndexer],
                                     passage_text: str,
                                     answer_info: Dict[str, Any] = None,
                                     additional_metadata: Dict[str, Any] = None) -> Instance:
    """
    Assemble an ``Instance`` from tokenized passage+question text and optional answers.

    The instance always carries ``passage_question``, ``number_indices`` and
    ``metadata``. When ``answer_info`` is truthy it additionally carries
    ``answer_as_spans``, ``answer_as_add_sub_expressions``, ``answer_as_counts`` and
    ``answer_type``; an answer kind with no entries gets a single placeholder field.
    ``additional_metadata`` entries are merged into the metadata last.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}

    passage_question_field = TextField(passage_question_tokens, token_indexers)
    fields["passage_question"] = passage_question_field
    fields["number_indices"] = ListField(
        [IndexField(index, passage_question_field) for index in number_indices])

    number_field = TextField(number_tokens, token_indexers)
    # Constructed but not attached to the instance.
    implicit_token_field = TextField(implicit_tokens, token_indexers)

    metadata = {
        "original_passage": passage_text,
        "passage_question_tokens": [token.text for token in passage_question_tokens],
        "number_tokens": [token.text for token in number_tokens],
        "number_indices": number_indices,
    }

    if answer_info:
        metadata["answer_texts"] = answer_info["answer_texts"]

        # Spans: fall back to a single (-1, -1) span when there are none.
        span_fields: List[Field] = [
            SpanField(span[0], span[1], passage_question_field)
            for span in answer_info["answer_spans"]
        ] or [SpanField(-1, -1, passage_question_field)]
        fields["answer_as_spans"] = ListField(span_fields)

        # Number and date: fall back to an all-zero sign sequence.
        add_sub_signs_field: List[Field] = [
            SequenceLabelField(signs, number_field)
            for signs in answer_info["signs_for_add_sub_expressions"]
        ] or [SequenceLabelField([0] * len(number_tokens), number_field)]
        fields["answer_as_add_sub_expressions"] = ListField(add_sub_signs_field)

        # Count: fall back to a single -1 label.
        count_fields: List[Field] = [
            LabelField(count_label, skip_indexing=True)
            for count_label in answer_info["counts"]
        ] or [LabelField(-1, skip_indexing=True)]
        fields["answer_as_counts"] = ListField(count_fields)

        # Mark the answer kinds that are present, then normalise to sum to one.
        answer_kinds = ("answer_spans", "signs_for_add_sub_expressions", "counts")
        answer_label = np.zeros(3)
        for slot, kind in enumerate(answer_kinds):
            if answer_info[kind]:
                answer_label[slot] = 1.0
        label_total = sum(answer_label)
        if label_total != 0:
            answer_label = answer_label / float(label_total)
        fields["answer_type"] = ArrayField(answer_label, -1)

    metadata.update(additional_metadata)
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
class TestListField(AllenNlpTestCase):
    """Tests for ``ListField`` padding and tensor conversion over several field types."""

    def setUp(self):
        """Build the vocabulary, token indexers and fields shared by the tests."""
        self.vocab = Vocabulary()
        for word in ("this", "is", "a", "sentence"):
            self.vocab.add_token_to_namespace(word, "words")
        for character in "sentc":
            self.vocab.add_token_to_namespace(character, 'characters')
        for label in "abcdefghijk":
            self.vocab.add_token_to_namespace(label, 'labels')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters"),
        }

        sentences = (
            ["this", "is", "a", "sentence"],
            ["this", "is", "a", "different", "sentence"],
            ["this", "is", "another", "sentence"],
        )
        self.field1, self.field2, self.field3 = (
            TextField([Token(word) for word in sentence], self.word_indexer)
            for sentence in sentences
        )

        # Empty counterparts are derived from the populated fields.
        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()
        super().setUp()

    def _indexed_list_field(self, fields):
        """Wrap ``fields`` in a ``ListField`` and index it with the shared vocabulary."""
        list_field = ListField(fields)
        list_field.index(self.vocab)
        return list_field

    def _use_words_and_characters_indexers(self):
        """Switch the three text fields to the combined word/character indexers."""
        # pylint: disable=protected-access
        for field in (self.field1, self.field2, self.field3):
            field._token_indexers = self.words_and_characters_indexers

    def test_get_padding_lengths(self):
        list_field = self._indexed_list_field([self.field1, self.field2, self.field3])
        expected = {"num_fields": 3, "list_words_length": 5, "list_num_tokens": 5}
        assert list_field.get_padding_lengths() == expected

    def test_list_field_can_handle_empty_text_fields(self):
        list_field = self._indexed_list_field(
            [self.field1, self.field2, self.empty_text_field])
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        expected = numpy.array([[2, 3, 4, 5, 0],
                                [2, 3, 4, 1, 5],
                                [0, 0, 0, 0, 0]])
        numpy.testing.assert_array_equal(
            tensor_dict["words"].detach().cpu().numpy(), expected)

    def test_list_field_can_handle_empty_index_fields(self):
        list_field = self._indexed_list_field(
            [self.index_field, self.index_field, self.empty_index_field])
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(),
                                         numpy.array([[1], [1], [-1]]))

    def test_list_field_can_handle_empty_sequence_label_fields(self):
        list_field = self._indexed_list_field([self.sequence_label_field,
                                               self.sequence_label_field,
                                               self.empty_sequence_label_field])
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        expected = numpy.array([[1, 1, 0, 1],
                                [1, 1, 0, 1],
                                [0, 0, 0, 0]])
        numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(), expected)

    def test_all_fields_padded_to_max_length(self):
        list_field = self._indexed_list_field([self.field1, self.field2, self.field3])
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        expected_rows = ([2, 3, 4, 5, 0],
                         [2, 3, 4, 1, 5],
                         [2, 3, 1, 5, 0])
        for row, expected in enumerate(expected_rows):
            numpy.testing.assert_array_almost_equal(
                tensor_dict["words"][row].detach().cpu().numpy(),
                numpy.array(expected))

    def test_nested_list_fields_are_padded_correctly(self):
        nested_field1 = ListField([LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']])
        nested_field2 = ListField([LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']])
        list_field = self._indexed_list_field(
            [nested_field1.empty_field(), nested_field1, nested_field2])
        padding_lengths = list_field.get_padding_lengths()
        assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6}
        tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy()
        expected = [[-1, -1, -1, -1, -1, -1],
                    [0, 1, 2, 3, 4, -1],
                    [5, 6, 7, 8, 9, 10]]
        numpy.testing.assert_almost_equal(tensor, expected)

    def test_fields_can_pad_to_greater_than_max_length(self):
        list_field = self._indexed_list_field([self.field1, self.field2, self.field3])
        padding_lengths = list_field.get_padding_lengths()
        padding_lengths["list_words_length"] = 7
        padding_lengths["num_fields"] = 5
        tensor_dict = list_field.as_tensor(padding_lengths)
        expected_rows = ([2, 3, 4, 5, 0, 0, 0],
                         [2, 3, 4, 1, 5, 0, 0],
                         [2, 3, 1, 5, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0])
        for row, expected in enumerate(expected_rows):
            numpy.testing.assert_array_almost_equal(
                tensor_dict["words"][row].detach().cpu().numpy(),
                numpy.array(expected))

    def test_as_tensor_can_handle_multiple_token_indexers(self):
        self._use_words_and_characters_indexers()
        list_field = self._indexed_list_field([self.field1, self.field2, self.field3])
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"].detach().cpu().numpy()
        characters = tensor_dict["characters"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(words, numpy.array([[2, 3, 4, 5, 0],
                                                                    [2, 3, 4, 1, 5],
                                                                    [2, 3, 1, 5, 0]]))

        # Character rows that recur across the expected arrays below.
        row_this = [5, 1, 1, 2, 0, 0, 0, 0, 0]
        row_is = [1, 2, 0, 0, 0, 0, 0, 0, 0]
        row_a = [1, 0, 0, 0, 0, 0, 0, 0, 0]
        row_sentence = [2, 3, 4, 5, 3, 4, 6, 3, 0]
        row_different = [1, 1, 1, 1, 3, 1, 3, 4, 5]
        row_another = [1, 4, 1, 5, 1, 3, 1, 0, 0]
        row_padding = [0, 0, 0, 0, 0, 0, 0, 0, 0]

        numpy.testing.assert_array_almost_equal(
            characters[0],
            numpy.array([row_this, row_is, row_a, row_sentence, row_padding]))
        numpy.testing.assert_array_almost_equal(
            characters[1],
            numpy.array([row_this, row_is, row_a, row_different, row_sentence]))
        numpy.testing.assert_array_almost_equal(
            characters[2],
            numpy.array([row_this, row_is, row_another, row_sentence, row_padding]))

    def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields(self):
        self._use_words_and_characters_indexers()
        list_field = self._indexed_list_field(
            [self.field1.empty_field(), self.field1, self.field2])
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"].detach().cpu().numpy()
        characters = tensor_dict["characters"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(words, numpy.array([[0, 0, 0, 0, 0],
                                                                    [2, 3, 4, 5, 0],
                                                                    [2, 3, 4, 1, 5]]))

        row_this = [5, 1, 1, 2, 0, 0, 0, 0, 0]
        row_is = [1, 2, 0, 0, 0, 0, 0, 0, 0]
        row_a = [1, 0, 0, 0, 0, 0, 0, 0, 0]
        row_sentence = [2, 3, 4, 5, 3, 4, 6, 3, 0]
        row_different = [1, 1, 1, 1, 3, 1, 3, 4, 5]
        row_padding = [0, 0, 0, 0, 0, 0, 0, 0, 0]

        numpy.testing.assert_array_almost_equal(characters[0], numpy.zeros([5, 9]))
        numpy.testing.assert_array_almost_equal(
            characters[1],
            numpy.array([row_this, row_is, row_a, row_sentence, row_padding]))
        numpy.testing.assert_array_almost_equal(
            characters[2],
            numpy.array([row_this, row_is, row_a, row_different, row_sentence]))

    def test_printing_doesnt_crash(self):
        print(ListField([self.field1, self.field2]))

    def test_sequence_methods(self):
        members = [self.field1, self.field2, self.field3]
        list_field = ListField([self.field1, self.field2, self.field3])
        assert len(list_field) == 3
        assert list_field[1] == self.field2
        assert [f for f in list_field] == members
def test_valid_vocab_extension(self):
    """Extending a saved vocabulary should keep existing ids and add the new tokens."""
    vocab_dir = self.TEST_DIR / "vocab_save"

    def save_and_extend(original_vocab, text_field, non_padded_namespaces):
        # Save `original_vocab` into a clean directory, then extend it with the
        # tokens found in `text_field`.
        shutil.rmtree(vocab_dir, ignore_errors=True)
        original_vocab.save_to_files(vocab_dir)
        instances = Batch([Instance({"text": text_field})])
        params = Params({
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": non_padded_namespaces,
        })
        return Vocabulary.from_params(params, instances=instances)

    # Test: padded/non-padded common namespaces are extending appropriately
    for non_padded_namespaces in [[], ["tokens"]]:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_tokens_to_namespace(["d", "a", "b"], namespace="tokens")
        text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                               {"tokens": SingleIdTokenIndexer("tokens")})
        extended_vocab = save_and_extend(original_vocab, text_field,
                                         non_padded_namespaces)

        extra_count = 2 if extended_vocab.is_padded("tokens") else 0
        # Pre-existing tokens keep their positions.
        for position, token in enumerate(["d", "a", "b"]):
            assert extended_vocab.get_token_index(token, "tokens") == position + extra_count
        # Newly seen tokens should be present.
        for token in ("c", "e"):
            assert extended_vocab.get_token_index(token, "tokens")
        assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

    # Test: padded/non-padded non-common namespaces are extending appropriately
    for non_padded_namespaces in [[], ["tokens1"], ["tokens1", "tokens2"]]:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("a", namespace="tokens1")
        text_field = TextField([Token(t) for t in ["b"]],
                               {"tokens2": SingleIdTokenIndexer("tokens2")})
        extended_vocab = save_and_extend(original_vocab, text_field,
                                         non_padded_namespaces)

        # Should have two namespaces
        assert len(extended_vocab._token_to_index) == 2
        for namespace in ("tokens1", "tokens2"):
            extra_count = 2 if extended_vocab.is_padded(namespace) else 0
            assert extended_vocab.get_vocab_size(namespace) == 1 + extra_count
def text_to_instance(
        self,  # type: ignore
        sentences: List[List[str]],
        document_id: str,
        sentence_id: int,
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
        user_threshold: Optional[float] = 0.0) -> Instance:
    # pylint: disable=arguments-differ
    """
    Build an ``Instance`` for one document: its text, every candidate span and,
    when available, cluster labels and pairwise link constraints.

    Parameters
    ----------
    sentences : ``List[List[str]]``, required.
        A list of lists representing the tokenised words and sentences in the document.
    document_id : ``str``, required.
        A string representing the document ID.
    sentence_id : ``int``, required.
        An int representing the sentence ID.
    gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
        A list of all clusters in the document, represented as word spans. Each cluster
        contains some number of spans, which can be nested and overlap, but will never
        exactly match between clusters.
    user_threshold : ``Optional[float]``, optional (default = 0.0)
        Approximate fraction of gold labels to hold out as simulated user input,
        e.g. 0.5, 0.33, 0.25, 0.125. Only used when ``self._simulate_user_inputs`` is
        set and the value is greater than 0.

    Returns
    -------
    An ``Instance`` containing the following ``Fields``:
        text : ``TextField``
            The text of the full document.
        spans : ``ListField[SpanField]``
            A ListField containing the spans represented as ``SpanFields``
            with respect to the document text.
        metadata : ``MetadataField``
            The normalised words (``original_text``), an ``ID`` built as
            ``document_id;sentence_id`` and, when ``gold_clusters`` is given,
            ``clusters`` and ``num_gold_clusters``.
        span_labels : ``SequenceLabelField``, optional
            The id of the cluster which each possible span belongs to, or -1 if it does
            not belong to a cluster. As these labels have variable length (it depends on
            how many spans we are considering), we represent this as a
            ``SequenceLabelField`` with respect to the ``spans`` ``ListField``.
        user_labels : ``SequenceLabelField``, optional
            Like ``span_labels`` but built from all gold labels; only present when user
            inputs are being simulated.
        must_link / cannot_link : ``ListField[PairField]``, optional
            Pairs of ``IndexField`` s into ``spans``; only present when a non-empty
            ``must_link`` list is available (loaded from ``self._saved_labels``).
    """
    flattened_sentences = [
        self._normalize_word(word) for sentence in sentences
        for word in sentence
    ]

    metadata: Dict[str, Any] = {
        "original_text": flattened_sentences,
        "ID": document_id + ";" + str(sentence_id)
    }
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters
        metadata["num_gold_clusters"] = len(gold_clusters)

    text_field = TextField([Token(word) for word in flattened_sentences],
                           self._token_indexers)

    # Within each cluster, every `user_threshold_mod`-th mention is held out of
    # `cluster_dict`; 0 disables holding out.
    user_threshold_mod = int(
        1 / user_threshold
    ) if self._simulate_user_inputs and user_threshold > 0 else 0
    cluster_dict = {}
    simulated_user_cluster_dict = {}
    if gold_clusters is not None:
        for cluster_id, cluster in enumerate(gold_clusters):
            for i in range(len(cluster)):
                # use modulo to have a relatively even distribution of user labels across length of document,
                # (since clusters are sorted)--so user simulated clusters are spread evenly across document
                if user_threshold_mod == 0 or i % user_threshold_mod != user_threshold_mod - 1:
                    cluster_dict[tuple(cluster[i])] = cluster_id
                simulated_user_cluster_dict[tuple(cluster[i])] = cluster_id

    # Note simulated_user_cluster_dict encompasses ALL gold labels, including those in cluster_dict
    # Consequently user_labels encompasses all gold labels
    spans: List[Field] = []
    if gold_clusters is not None:
        span_labels: Optional[List[int]] = []
        user_labels: Optional[List[
            int]] = [] if self._simulate_user_inputs and user_threshold > 0 else None
    else:
        span_labels = user_labels = None

    # our must-link and cannot-link constraints, derived from user labels
    # using gold_clusters being None as an indicator of whether we're running training or not
    must_link: Optional[
        List[int]] = [] if gold_clusters is not None else None
    cannot_link: Optional[
        List[int]] = [] if gold_clusters is not None else None

    sentence_offset = 0
    doc_info = None
    # Previously saved labels for this document ID replace the freshly derived ones.
    if self._saved_labels is not None and metadata[
            'ID'] in self._saved_labels:
        doc_info = self._saved_labels[metadata['ID']]
        span_labels = doc_info['span_labels'].tolist()
        if 'must_link' in doc_info:
            must_link = doc_info['must_link'].squeeze(-1).tolist()
            cannot_link = doc_info['cannot_link'].squeeze(-1).tolist()

    for sentence in sentences:
        for start, end in enumerate_spans(
                sentence,
                offset=sentence_offset,
                max_span_width=self._max_span_width):
            if span_labels is not None:
                if doc_info is None:
                    # only do if we haven't already loaded span labels
                    if (start, end) in cluster_dict:
                        span_labels.append(cluster_dict[(start, end)])
                    else:
                        span_labels.append(-1)
                # NOTE(review): the nesting of this user-label branch was reconstructed
                # from whitespace-collapsed text; it is placed beside (not inside) the
                # `doc_info is None` check -- confirm against the original file.
                if self._simulate_user_inputs and user_threshold > 0:
                    if (start, end) in simulated_user_cluster_dict:
                        user_labels.append(
                            simulated_user_cluster_dict[(start, end)])
                    else:
                        user_labels.append(-1)
            spans.append(SpanField(start, end, text_field))
        # Span offsets are document-level, so advance by the sentence length.
        sentence_offset += len(sentence)

    span_field = ListField(spans)
    metadata_field = MetadataField(metadata)

    fields: Dict[str, Field] = {
        "text": text_field,
        "spans": span_field,
        "metadata": metadata_field
    }
    if must_link is not None and len(must_link) > 0:
        must_link_field = []
        cannot_link_field = []
        # Each link is a pair of positions in `span_field`.
        for link in must_link:
            must_link_field.append(
                PairField(
                    IndexField(link[0], span_field),
                    IndexField(link[1], span_field),
                ))
        for link in cannot_link:
            cannot_link_field.append(
                PairField(
                    IndexField(link[0], span_field),
                    IndexField(link[1], span_field),
                ))
        must_link_field = ListField(must_link_field)
        cannot_link_field = ListField(cannot_link_field)
        fields["must_link"] = must_link_field
        fields["cannot_link"] = cannot_link_field

    if span_labels is not None:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)
        if user_labels is not None:
            fields["user_labels"] = SequenceLabelField(
                user_labels, span_field)

    # sanity checks: fields rebuilt from saved labels must equal the saved tensors
    if doc_info is not None:
        assert (fields["span_labels"].as_tensor(
            fields["span_labels"].get_padding_lengths()) !=
                doc_info['span_labels']).nonzero().size(0) == 0
        if 'must_link' in doc_info:
            assert 'must_link' in fields
            assert (fields["must_link"].as_tensor(
                fields["must_link"].get_padding_lengths()) !=
                    doc_info['must_link']).nonzero().size(0) == 0
            assert (fields["cannot_link"].as_tensor(
                fields["cannot_link"].get_padding_lengths()) !=
                    doc_info['cannot_link']).nonzero().size(0) == 0

    return Instance(fields)
def text_to_instance(self, line: str) -> Instance:  # type: ignore
    """Tokenize ``line`` and wrap the tokens in an ``Instance`` with a single ``line`` field."""
    line_field = TextField(self._tokenizer.tokenize(line))
    return Instance({"line": line_field})
def create_instance(self, str_tokens: List[str]):
    """Return an ``Instance`` whose ``text`` field holds one ``Token`` per input string."""
    text_field = TextField([Token(text) for text in str_tokens], self.token_indexers)
    return Instance({'text': text_field})
def text_to_instance(
        self,  # type: ignore
        premise: List[Tuple[str, float]],  # Important type information
        hypothesis: str,
        pid: str = None,
        label: str = None) -> Instance:
    """
    Build an ``Instance`` from scored premise sentences and a hypothesis.

    Parameters
    ----------
    premise : ``List[Tuple[str, float]]``
        ``(sentence, prob)`` pairs. Each sentence is split on single spaces and ``prob`` is
        repeated once per resulting token. NOTE: when ``self.shuffle_sentences`` is set, this
        list is shuffled in place, so the caller's list is reordered as well.
    hypothesis : ``str``
        Hypothesis text, split on single spaces; every hypothesis token gets the value 1.0.
    pid : ``str``, optional
        When truthy, stored in an ``IdField`` under ``'pid'``.
    label : ``str``, optional
        When truthy, stored in a ``LabelField`` (namespace ``'labels'``) under ``'label'``.

    Returns
    -------
    An ``Instance`` with the fields ``premise_spans``, ``premise_probs``, ``premise``,
    ``hypothesis``, ``p_wn_feature`` and ``h_wn_feature``, plus ``label`` / ``pid`` when given.
    """
    fields: Dict[str, Field] = {}

    premise_tokens_list = []
    premise_prob_values = []
    premise_span_list: List[Tuple[int, int]] = []
    premise_span_prob: List[float] = []

    # sentence_count = len(premise)
    if self.shuffle_sentences:
        # Potential improvement. Shuffle the input sentences. Maybe close this at last several epoch.
        # NOTE(review): random.shuffle mutates the caller's ``premise`` list.
        random.shuffle(premise)

    span_start = 0
    for premise_sent, prob in premise:
        cur_premise_tokens = [Token(t) for t in premise_sent.split(' ')
                              ]  # Removing code for parentheses in NLI
        span_end = span_start + len(cur_premise_tokens)
        premise_span_list.append(
            (span_start, span_end))  # Calculate the span (end is exclusive).
        span_start = span_end
        premise_span_prob.append(prob)

        # One row per token, each carrying the sentence-level probability.
        prob_value = np.ones(
            (len(cur_premise_tokens), 1), dtype=np.float32) * prob
        premise_tokens_list.extend(cur_premise_tokens)
        premise_prob_values.append(prob_value)

    premise_prob = np.concatenate(premise_prob_values, axis=0)

    hypothesis_tokens = [Token(t) for t in hypothesis.split(' ')]
    hypothesis_prob = np.ones((len(hypothesis_tokens), 1), dtype=np.float32)

    # Truncate tokens and their per-token probabilities to at most ``max_l`` entries.
    # The sentence spans collected above are not truncated here.
    if self.max_l is not None:
        premise_tokens_list = premise_tokens_list[:self.max_l]
        hypothesis_tokens = hypothesis_tokens[:self.max_l]
        premise_prob = premise_prob[:self.max_l, :]
        hypothesis_prob = hypothesis_prob[:self.max_l, :]

    # for span, prob in zip(premise_span_list, premise_span_prob):
    fields['premise_spans'] = MetadataField(
        ParagraphSpan(premise_span_list))
    fields['premise_probs'] = MetadataField(premise_span_prob)

    fields['premise'] = TextField(premise_tokens_list,
                                  self._token_indexers)  # (t_len, 1)
    fields['hypothesis'] = TextField(hypothesis_tokens,
                                     self._token_indexers)

    # WN feature dict:
    # Plain-string token lists, re-derived from the raw text, for the feature helpers.
    premise_s = ' '.join([t for t, p in premise]).split(' ')
    hypothesis_s = hypothesis.split(' ')

    if self.max_l is not None:
        premise_s = premise_s[:self.max_l]
        hypothesis_s = hypothesis_s[:self.max_l]

    # NOTE(review): this layout was reconstructed from whitespace-collapsed source; the
    # first ablation check below appears to be commented out -- confirm against the original.
    # if self.ablation is not None and self.ablation['rm_wn'] and self.ablation['rm_simi']:
    #     p_feature_array = np.concatenate([premise_prob], axis=1)
    #     h_feature_array = np.concatenate([hypothesis_prob], axis=1)
    if self.wn_p_dict is None:
        # No WordNet dictionary available: placeholder arrays of shape (1,).
        p_feature_array = np.zeros(1)
        h_feature_array = np.zeros(1)
    elif self.ablation is not None and self.wn_p_dict is not None and self.ablation[
            'rm_wn']:
        # WordNet features removed: only the per-token probabilities remain.
        p_feature_array = np.concatenate([premise_prob], axis=1)
        h_feature_array = np.concatenate([hypothesis_prob], axis=1)
    elif self.ablation is not None and self.wn_p_dict is not None and self.ablation[
            'rm_simi']:
        # Probability column removed: WordNet features plus number features only.
        example_feature = wn_persistent_api.compute_wn_features_p_accerate(
            premise_s, hypothesis_s, self.wn_p_dict)
        p_wn_nparray, h_wn_nparray = wn_persistent_api.wn_raw_feature_to_nparray(
            example_feature, self.wn_feature_list)
        # Appending more features
        p_num_feature = encode_num_in_ltokens(premise_s)  # (t_len, 5)
        h_num_feature = encode_num_in_ltokens(hypothesis_s)  # (t_len, 5)
        p_feature_array = np.concatenate([p_wn_nparray, p_num_feature],
                                         axis=1)
        h_feature_array = np.concatenate([h_wn_nparray, h_num_feature],
                                         axis=1)
    elif self.wn_p_dict is not None:
        # Whole Model no ablation.
        example_feature = wn_persistent_api.compute_wn_features_p_accerate(
            premise_s, hypothesis_s, self.wn_p_dict)
        p_wn_nparray, h_wn_nparray = wn_persistent_api.wn_raw_feature_to_nparray(
            example_feature, self.wn_feature_list)
        # Appending more features
        p_num_feature = encode_num_in_ltokens(premise_s)  # (t_len, 5)
        h_num_feature = encode_num_in_ltokens(hypothesis_s)  # (t_len, 5)
        p_feature_array = np.concatenate(
            [p_wn_nparray, p_num_feature, premise_prob], axis=1)
        h_feature_array = np.concatenate(
            [h_wn_nparray, h_num_feature, hypothesis_prob], axis=1)

    # Each feature array must have exactly one row per token.
    # NOTE(review): presumably these checks run for every branch; with the ``np.zeros(1)``
    # placeholders they only hold for single-token inputs -- confirm the intended nesting.
    assert len(premise_tokens_list) == p_feature_array.shape[0]
    assert len(hypothesis_tokens) == h_feature_array.shape[0]

    fields['p_wn_feature'] = ArrayField(p_feature_array)
    fields['h_wn_feature'] = ArrayField(h_feature_array)

    if label:
        fields['label'] = LabelField(label, label_namespace='labels')

    if pid:
        fields['pid'] = IdField(pid)

    return Instance(fields)
def text_to_instance(
        self,
        para_id: str,
        sentence_texts: List[str],
        participants: List[str],
        states: List[
            List[str]] = None,  # states[i][j] is ith participant at time j
        filename: str = '',
        score: float = None) -> Instance:
    """
    Build an ``Instance`` for one paragraph, its participants and (optionally) their states.

    Parameters
    ----------
    para_id : ``str``
        Stored unchanged as metadata under ``para_id``.
    sentence_texts : ``List[str]``
        The paragraph's sentences; they are tokenized one by one and also joined with single
        spaces to form the ``paragraph`` field.
    participants : ``List[str]``
        Participant strings. After tokenization, ``;`` tokens separate alternative mentions of
        the same participant.
    states : ``List[List[str]]``, optional
        ``states[i][j]`` is the ith participant at time j. When given, the ``actions``,
        ``before_locations`` and ``after_locations`` fields are added.
    filename : ``str``
        Stored unchanged as metadata under ``filename``.
    score : ``float``, optional
        Stored as metadata under ``score`` when not ``None``.

    Returns
    -------
    An ``Instance`` containing the text fields, verb / participant / sentence indicator label
    fields, optional state-derived fields and the metadata fields described above.
    """
    tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(
        pos_tags=True))

    paragraph = " ".join(sentence_texts)

    # Tokenize the sentences
    sentences = [
        tokenizer.tokenize(sentence_text)
        for sentence_text in sentence_texts
    ]

    # Find the verbs
    verb_indexes = [[
        1 if token.pos_ == "VERB" else 0 for token in sentence
    ] for sentence in sentences]

    if states is not None:
        # Actions is (num_participants, num_events)
        actions = [_infer_actions(states_i) for states_i in states]

        tokenized_states = [[
            tokenizer.tokenize(state_ij) for state_ij in states_i
        ] for states_i in states]

        # One list per participant; each element is later unpacked as a
        # ((before_start, before_end), (after_start, after_end)) pair.
        location_spans = [
            _compute_location_spans(states_i, sentences)
            for states_i in tokenized_states
        ]

    # Create indicators for the participants.
    participant_tokens = [
        tokenizer.tokenize(participant) for participant in participants
    ]
    participant_indicators: List[List[List[int]]] = []

    for participant_i_tokens in participant_tokens:
        # Split the participant's tokens on ";" tokens; every remaining group is one
        # target token sequence to look for in the sentences.
        targets = [
            list(token_group) for is_semicolon, token_group in
            itertools.groupby(participant_i_tokens, lambda t: t.text == ";")
            if not is_semicolon
        ]

        participant_i_indicators: List[List[int]] = []

        for sentence in sentences:
            sentence_indicator = [0 for _ in sentence]

            for target in targets:
                start = 0
                # Mark every occurrence of the target, resuming the search one token
                # after the start of the previous match.
                while True:
                    span_start, span_end = _find_span(target,
                                                      sentence,
                                                      start,
                                                      target_is_noun=True)

                    if span_start >= 0:
                        # span_end is treated as inclusive here.
                        for j in range(span_start, span_end + 1):
                            sentence_indicator[j] = 1
                        start = span_start + 1
                    else:
                        # A negative start means no further match in this sentence.
                        break

            participant_i_indicators.append(sentence_indicator)

        participant_indicators.append(participant_i_indicators)

    fields: Dict[str, Field] = {}
    fields["paragraph"] = TextField(tokenizer.tokenize(paragraph),
                                    self._token_indexers)
    fields["participants"] = ListField([
        TextField(tokenizer.tokenize(participant), self._token_indexers)
        for participant in participants
    ])

    # One per sentence
    fields["sentences"] = ListField([
        TextField(sentence, self._token_indexers) for sentence in sentences
    ])

    # One per sentence
    fields["verbs"] = ListField([
        SequenceLabelField(verb_indexes[i],
                           fields["sentences"].field_list[i])
        for i in range(len(sentences))
    ])
    # And also at the paragraph level
    # NOTE(review): assumes the paragraph tokenizes to the same tokens as the
    # concatenation of the individually tokenized sentences -- confirm.
    fields["paragraph_verbs"] = SequenceLabelField([
        verb_indicator for verb_indexes_i in verb_indexes
        for verb_indicator in verb_indexes_i
    ], fields["paragraph"])

    if states is not None:
        # Outer ListField is one per participant
        fields["actions"] = ListField([
            # Inner ListField is one per sentence
            ListField([
                # action is an Enum, so call .value to get an int
                LabelField(action.value, skip_indexing=True)
                for action in participant_actions
            ]) for participant_actions in actions
        ])

        # Outer ListField is one per participant
        fields["before_locations"] = ListField([
            # Inner ListField is one per sentence
            ListField([
                SpanField(start, end, fields["sentences"].field_list[i])
                for i, ((start, end),
                        _) in enumerate(participant_location_spans)
            ]) for participant_location_spans in location_spans
        ])
        # Outer ListField is one per participant
        fields["after_locations"] = ListField([
            # Inner ListField is one per sentence
            ListField([
                SpanField(start, end, fields["sentences"].field_list[i])
                for i, (_, (start,
                            end)) in enumerate(participant_location_spans)
            ]) for participant_location_spans in location_spans
        ])

    # one per participant
    fields["participant_indicators"] = ListField([
        # one per sentence
        ListField([
            SequenceLabelField(sentence_indicator,
                               fields["sentences"].field_list[i])
            for i, sentence_indicator in enumerate(participant_i_indicators)
        ]) for participant_i_indicators in participant_indicators
    ])

    # and also at the paragraph level
    # one per participant
    fields["paragraph_participant_indicators"] = ListField([
        SequenceLabelField([
            indicator for sentence_indicator in participant_i_indicators
            for indicator in sentence_indicator
        ], fields["paragraph"])
        for participant_i_indicators in participant_indicators
    ])

    # Finally, we want to indicate before / inside / after for each sentence.
    # Label 0 marks tokens before sentence i, 1 tokens inside it, 2 tokens after it.
    paragraph_sentence_indicators: List[SequenceLabelField] = []
    for i in range(len(sentences)):
        before_length = sum(len(sentence) for sentence in sentences[:i])
        sentence_length = len(sentences[i])
        after_length = sum(
            len(sentence) for sentence in sentences[(i + 1):])
        paragraph_sentence_indicators.append(
            SequenceLabelField([0] * before_length +
                               [1] * sentence_length + [2] * after_length,
                               fields["paragraph"]))

    fields["paragraph_sentence_indicators"] = ListField(
        paragraph_sentence_indicators)

    # These fields are passed on to the decoder trainer that internally uses it
    # to compute commonsense scores for predicted actions
    fields["para_id"] = MetadataField(para_id)
    fields["participant_strings"] = MetadataField(participants)
    fields["filename"] = MetadataField(filename)

    if score is not None:
        fields["score"] = MetadataField(score)

    return Instance(fields)
def text_to_instance(  # type: ignore
    self,
    tokens: List[Token],
    ners: List[str] = None,
    tag_ids: List[int] = None,
    root_ids: List[int] = None,
    rels: List[str] = None,
    isdef: str = None,
) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    Parameters
    ----------
    tokens : ``List[Token]``
        The pre-tokenized sequence; stored under ``tokens`` and, as plain text, in the
        ``metadata`` field under ``words``.
    ners, tag_ids, root_ids, rels : optional
        Per-token label sequences. Each one is required when its name (``ner``, ``tag_id``,
        ``root_id``, ``rel``) is listed in ``self.feature_labels``.
    isdef : ``str``, optional
        A single label for the whole sequence; required when ``isdef`` is listed in
        ``self.feature_labels``.

    Returns
    -------
    An ``Instance`` with ``tokens``, ``metadata``, one field per requested feature label and,
    when the data for ``self.tag_label`` is available, a ``tags`` field (``labels`` for
    ``isdef``).

    Raises
    ------
    ConfigurationError
        If a feature listed in ``self.feature_labels`` was not passed in.
    """
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {"tokens": sequence}
    instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})

    # Add "feature labels" to instance
    if "ner" in self.feature_labels:
        if ners is None:
            raise ConfigurationError(
                "Dataset reader was specified to use definition terms as "
                "features. Pass them to text_to_instance."
            )
        instance_fields["ner"] = SequenceLabelField(ners, sequence, "ner")
    if "tag_id" in self.feature_labels:
        if tag_ids is None:
            raise ConfigurationError(
                "Dataset reader was specified to use tag ids as "
                "features. Pass them to text_to_instance."
            )
        instance_fields["tag_id"] = SequenceLabelField(tag_ids, sequence, "tag_id")
    if "root_id" in self.feature_labels:
        if root_ids is None:
            raise ConfigurationError(
                "Dataset reader was specified to use roots as "
                " features. Pass them to text_to_instance."
            )
        instance_fields["root_id"] = SequenceLabelField(root_ids, sequence, "root_id")
    if "rel" in self.feature_labels:
        if rels is None:
            raise ConfigurationError(
                "Dataset reader was specified to use relations as "
                " features. Pass them to text_to_instance."
            )
        instance_fields["rel"] = SequenceLabelField(rels, sequence, "rel")
    if "isdef" in self.feature_labels:
        if isdef is None:
            # This message used to say "relations" (copied from the ``rel`` check above),
            # which pointed users at the wrong missing argument.
            raise ConfigurationError(
                "Dataset reader was specified to use isdef labels as "
                "features. Pass them to text_to_instance."
            )
        # ``isdef`` is one label for the whole sequence, hence a LabelField.
        instance_fields["isdef"] = LabelField(isdef, "isdef")

    # Add "tag label" to instance
    if self.tag_label == "ner" and ners is not None:
        instance_fields["tags"] = SequenceLabelField(ners, sequence, self.label_namespace)
    elif self.tag_label == "tag_id" and tag_ids is not None:
        instance_fields["tags"] = SequenceLabelField(tag_ids, sequence, self.label_namespace)
    elif self.tag_label == "root_id" and root_ids is not None:
        instance_fields["tags"] = SequenceLabelField(root_ids, sequence, self.label_namespace)
    elif self.tag_label == "rel" and rels is not None:
        instance_fields["tags"] = SequenceLabelField(rels, sequence, self.label_namespace)
    elif self.tag_label == "isdef" and isdef is not None:
        # The sequence-level target is stored under "labels" rather than "tags".
        instance_fields["labels"] = LabelField(isdef, self.label_namespace)

    return Instance(instance_fields)
def test_field_counts_vocab_items_correctly(self):
    """Check ``count_vocab_items`` for word, character and combined indexers."""
    sentence = ["This", "is", "a", "sentence", "."]
    expected_word_counts = {"This": 1, "is": 1, "a": 1, "sentence": 1, ".": 1}
    expected_char_counts = {"T": 1, "h": 1, "i": 2, "s": 3, "a": 1,
                            "e": 3, "n": 2, "t": 1, "c": 1, ".": 1}

    # Word-level indexer only.
    word_field = TextField([Token(word) for word in sentence],
                           token_indexers={"words": SingleIdTokenIndexer("words")})
    counts = defaultdict(lambda: defaultdict(int))
    word_field.count_vocab_items(counts)
    for word, expected in expected_word_counts.items():
        assert counts["words"][word] == expected
    assert list(counts.keys()) == ["words"]

    # Character-level indexer only.
    char_field = TextField(
        [Token(word) for word in sentence],
        token_indexers={"characters": TokenCharactersIndexer("characters", min_padding_length=1)})
    counts = defaultdict(lambda: defaultdict(int))
    char_field.count_vocab_items(counts)
    for character, expected in expected_char_counts.items():
        assert counts["characters"][character] == expected
    assert list(counts.keys()) == ["characters"]

    # Both indexers on the same field.
    combined_field = TextField(
        [Token(word) for word in sentence],
        token_indexers={"words": SingleIdTokenIndexer("words"),
                        "characters": TokenCharactersIndexer("characters", min_padding_length=1)})
    counts = defaultdict(lambda: defaultdict(int))
    combined_field.count_vocab_items(counts)
    for character, expected in expected_char_counts.items():
        assert counts["characters"][character] == expected
    for word, expected in expected_word_counts.items():
        assert counts["words"][word] == expected
    assert set(counts.keys()) == {"words", "characters"}
def test_sequence_methods(self):
    """A ``TextField`` supports ``len``, integer indexing and iteration over its tokens."""
    words = ["This", "is", "a", "sentence", "."]
    field = TextField([Token(word) for word in words], {})

    assert len(field) == 5
    assert field[1].text == "is"
    iterated_texts = [token.text for token in field]
    assert iterated_texts == ["This", "is", "a", "sentence", "."]
def text_to_instance(
    self,
    text: str,
    sentiment: str,
    selected_text: Optional[str] = None,
) -> Instance:
    """
    Build an ``Instance`` from a text, its sentiment and an optional selected span.

    Parameters
    ----------
    text : ``str``
        The full text.
    sentiment : ``str``
        The sentiment string; tokenized and combined with the text tokens through
        ``self._tokenizer.add_special_tokens``.
    selected_text : ``str``, optional
        The span of ``text`` to tag. When given, a ``tags`` field is added with ``"I"`` on
        the token span returned for the first occurrence of ``selected_text`` and ``"O"``
        elsewhere. If ``selected_text`` does not occur in ``text``, every tag is ``"O"``.

    Returns
    -------
    An ``Instance`` with ``tokens``, ``metadata`` and, when ``selected_text`` is given,
    ``tags``.
    """
    fields = {}
    text_tokens = self._tokenizer.tokenize(text)
    sentiment_tokens = self._tokenizer.tokenize(sentiment)
    # add special tokens
    text_with_sentiment_tokens = self._tokenizer.add_special_tokens(
        text_tokens, sentiment_tokens)
    tokens_field = TextField(text_with_sentiment_tokens,
                             {"tokens": self._tokenindexer})
    fields["tokens"] = tokens_field

    additional_metadata = {}
    if selected_text is not None:
        context = text
        answer = selected_text
        additional_metadata["selected_text"] = selected_text
        # ``str.find`` signals "not found" with -1, never with ``None``.
        first_answer_offset = context.find(answer)

        def tokenize_slice(start: int, end: int) -> Iterable[Token]:
            """Tokenize ``context[start:end]``, prefixing a dummy word after whitespace."""
            text_to_tokenize = context[start:end]
            if start - 1 >= 0 and context[start - 1].isspace():
                prefix = (
                    "a "
                )  # must end in a space, and be short so we can be sure it becomes only one token
                wordpieces = self._tokenizer.tokenize(prefix + text_to_tokenize)
                for wordpiece in wordpieces:
                    if wordpiece.idx is not None:
                        wordpiece.idx -= len(prefix)
                # Drop the wordpiece produced by the dummy prefix.
                return wordpieces[1:]
            else:
                return self._tokenizer.tokenize(text_to_tokenize)

        # Tokenize the context one whitespace-delimited chunk at a time, shifting
        # each wordpiece offset so it is relative to the whole context.
        tokenized_context = []
        token_start = 0
        for i, c in enumerate(context):
            if c.isspace():
                for wordpiece in tokenize_slice(token_start, i):
                    if wordpiece.idx is not None:
                        wordpiece.idx += token_start
                    tokenized_context.append(wordpiece)
                token_start = i + 1
        for wordpiece in tokenize_slice(token_start, len(context)):
            if wordpiece.idx is not None:
                wordpiece.idx += token_start
            tokenized_context.append(wordpiece)

        tags = ["O"] * len(tokens_field)
        # The old ``is None`` test could never be true, so a missing answer was passed
        # on as character offset -1; and the intended (-1, -1) sentinel would have
        # tagged the last token through ``tags[-1]``. Tag only when the answer exists.
        if first_answer_offset >= 0:
            (
                token_answer_span_start,
                token_answer_span_end,
            ), _ = char_span_to_token_span(
                [(t.idx, t.idx + len(sanitize_wordpiece(t.text)))
                 if t.idx is not None else None
                 for t in tokenized_context],
                (first_answer_offset, first_answer_offset + len(answer)),
            )
            for i in range(token_answer_span_start, token_answer_span_end + 1):
                tags[i] = "I"
        fields["tags"] = SequenceLabelField(tags, tokens_field)

    # make the metadata
    metadata = {
        "text": text,
        "sentiment": sentiment,
        "words": text,
        "text_with_sentiment_tokens": text_with_sentiment_tokens,
    }
    if additional_metadata:
        metadata.update(additional_metadata)
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(self, data: dict) -> Instance:  # type: ignore
    """Build an ``Instance`` from a record with ``'sdp'`` and ``'relation'`` entries.

    ``data['sdp']`` goes into a ``TextField`` under ``'sen_dep'`` and
    ``data['relation']`` into a ``LabelField`` under ``'label'``.
    """
    sen_dep_field = TextField(data['sdp'], self._token_indexers)
    label_field = LabelField(data['relation'])
    return Instance({'sen_dep': sen_dep_field, 'label': label_field})
def test_printing_doesnt_crash(self):
    """Printing a ``TextField`` must not raise."""
    word_indexers = {"words": SingleIdTokenIndexer(namespace="words")}
    tokens = [Token(word) for word in ["A", "sentence"]]
    print(TextField(tokens, word_indexers))
def tokenize(self, d):
    """Join ``d['token']`` with spaces, tokenize the result and wrap it in a ``TextField``."""
    joined_text = ' '.join(d['token'])
    return TextField(self._tokenizer.tokenize(joined_text), self._token_indexers)
def text_to_instance(
        self,  # type: ignore
        sentence: str,
        structured_representations: List[List[List[JsonDict]]],
        labels: List[str] = None,
        target_sequences: List[List[str]] = None,
        identifier: str = None) -> Instance:
    """
    Build an ``Instance`` for one query sentence and the worlds it is evaluated in.

    Parameters
    ----------
    sentence : ``str``
        The query sentence.
    structured_representations : ``List[List[List[JsonDict]]]``
        A list of Json representations of all the worlds. See expected format in this class'
        docstring.
    labels : ``List[str]`` (optional)
        String labels (true or false), one per structured representation. Not required
        while testing.
    target_sequences : ``List[List[str]]`` (optional)
        Target action sequences that lead to the correct denotation in the given worlds.
        When absent and ``self._output_agendas`` is set, an agenda is added instead.
    identifier : ``str`` (optional)
        The identifier from the dataset if available.
    """
    # pylint: disable=arguments-differ
    # One world per structured representation; each world is built from its set of boxes.
    worlds = [
        NlvrLanguage({
            Box(object_list, box_id)
            for box_id, object_list in enumerate(representation)
        }) for representation in structured_representations
    ]

    tokenized_sentence = self._tokenizer.tokenize(sentence)
    sentence_field = TextField(tokenized_sentence,
                               self._sentence_token_indexers)

    # TODO(pradeep): Assuming that possible actions are the same in all worlds. This may change
    # later.
    instance_action_ids: Dict[str, int] = {}
    production_rule_fields: List[Field] = []
    for rule in worlds[0].all_possible_productions():
        # Ids are handed out in order of appearance.
        instance_action_ids[rule] = len(instance_action_ids)
        production_rule_fields.append(
            ProductionRuleField(rule, is_global_rule=True))
    action_field = ListField(production_rule_fields)

    worlds_field = ListField([MetadataField(world) for world in worlds])
    metadata: Dict[str, Any] = {
        "sentence_tokens": [token.text for token in tokenized_sentence]
    }
    fields: Dict[str, Field] = {
        "sentence": sentence_field,
        "worlds": worlds_field,
        "actions": action_field,
        "metadata": MetadataField(metadata)
    }
    if identifier is not None:
        fields["identifier"] = MetadataField(identifier)

    # The kind of supervision decides what else goes in: explicit target action sequences
    # when they are provided, otherwise (if enabled) an agenda derived from the sentence.
    if target_sequences:
        # TODO(pradeep): Define a max length for this field.
        action_sequence_fields: List[Field] = [
            ListField([
                IndexField(instance_action_ids[action], action_field)
                for action in target_sequence
            ]) for target_sequence in target_sequences
        ]
        fields["target_action_sequences"] = ListField(
            action_sequence_fields)
    elif self._output_agendas:
        # TODO(pradeep): Assuming every world gives the same agenda for a sentence. This is true
        # now, but may change later too.
        agenda = worlds[0].get_agenda_for_sentence(sentence)
        assert agenda, "No agenda found for sentence: %s" % sentence
        # The agenda entries are indices into the action field.
        fields["agenda"] = ListField([
            IndexField(instance_action_ids[action], action_field)
            for action in agenda
        ])

    if labels:
        fields["labels"] = ListField([
            LabelField(label, label_namespace='denotations')
            for label in labels
        ])

    return Instance(fields)
'token_characters': TokenCharactersIndexer(namespace='character_vocab'), 'pos_tags': PosTagIndexer(namespace='pos_tag_vocab') } vocab = Vocabulary() vocab.add_tokens_to_namespace(['This', 'is', 'some', 'text', '.'], namespace='token_vocab') vocab.add_tokens_to_namespace( ['T', 'h', 'i', 's', ' ', 'o', 'm', 'e', 't', 'x', '.'], namespace='character_vocab') vocab.add_tokens_to_namespace(['DT', 'VBZ', 'NN', '.'], namespace='pos_tag_vocab') text = "This is some text." tokens = tokenizer.tokenize(text) print(tokens) print([token.tag_ for token in tokens]) text_field = TextField(tokens, token_indexers) # In order to convert the token strings into integer ids, we need to tell the # TextField what Vocabulary to use. text_field.index(vocab) # We typically batch things together when making tensors, which requires some # padding computation. Don't worry too much about the padding for now. padding_lengths = text_field.get_padding_lengths() tensor_dict = text_field.as_tensor(padding_lengths) print(tensor_dict)
def test_from_params_valid_vocab_extension_thoroughly(self):
    """
    Tests for Valid Vocab Extension thoroughly: Vocab extension is valid when overlapping
    namespaces have same padding behaviour (padded/non-padded).

    Summary of namespace paddings in this test:

    original_vocab namespaces
        tokens0     padded
        tokens1     non-padded
        tokens2     padded
        tokens3     non-padded

    instances namespaces
        tokens0     padded
        tokens1     non-padded
        tokens4     padded
        tokens5     non-padded

    Typical extension example (tokens1 namespace):
        original index2token:   0->apple 1->bat 2->cat
        tokens to extend with:  cat, an, apple, banana, atom, bat
        extended index2token:   0->apple 1->bat 2->cat 3->an 4->atom 5->banana
    """
    vocab_dir = self.TEST_DIR / "vocab_save"

    # Build and save the original vocabulary, namespace by namespace.
    original_vocab = Vocabulary(
        non_padded_namespaces=["tokens1", "tokens3"])
    original_contents = [
        ("tokens0", ["apple", "bat", "cat"]),
        ("tokens1", ["apple", "bat", "cat"]),
        ("tokens2", ["a", "b", "c"]),
        ("tokens3", ["p", "q"]),
    ]
    for namespace, namespace_tokens in original_contents:
        for token in namespace_tokens:
            original_vocab.add_token_to_namespace(token, namespace=namespace)
    original_vocab.save_to_files(vocab_dir)

    # Instances: tokens0/tokens1 overlap with the vocabulary, tokens4/tokens5 are new.
    extension_words = ["cat", "an", "apple", "banana", "atom", "bat"]
    text_field0 = TextField(
        [Token(word) for word in extension_words],
        {"tokens0": SingleIdTokenIndexer("tokens0")},
    )
    text_field1 = TextField(
        [Token(word) for word in extension_words],
        {"tokens1": SingleIdTokenIndexer("tokens1")},
    )
    text_field4 = TextField([Token(word) for word in ["l", "m", "n", "o"]],
                            {"tokens4": SingleIdTokenIndexer("tokens4")})
    text_field5 = TextField([Token(word) for word in ["x", "y", "z"]],
                            {"tokens5": SingleIdTokenIndexer("tokens5")})
    instance = Instance({
        "text0": text_field0,
        "text1": text_field1,
        "text4": text_field4,
        "text5": text_field5,
    })
    instances = Batch([instance])

    params = Params({
        "type": "extend",
        "directory": vocab_dir,
        "non_padded_namespaces": ["tokens1", "tokens5"],
    })
    extended_vocab = Vocabulary.from_params(params, instances=instances)

    # tokens0 and tokens1 are common, tokens2 and tokens3 exist only in the vocabulary,
    # tokens4 and tokens5 only in the instances.
    extended_namespaces = set(extended_vocab._token_to_index)
    assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

    # The set of non-padded namespaces must stay consistent after the extension.
    assert extended_vocab._non_padded_namespaces == {
        "tokens1", "tokens3", "tokens5"
    }

    # Overlapping namespaces: the expected sizes after merging in the new tokens.
    assert extended_vocab.get_vocab_size("tokens1") == 6
    assert extended_vocab.get_vocab_size(
        "tokens0") == 8  # 2 extra overlapping because padded

    # tokens2 and tokens3 were only in original_vocab, so their sizes must not change.
    for untouched_namespace in ("tokens2", "tokens3"):
        assert extended_vocab.get_vocab_size(
            untouched_namespace) == original_vocab.get_vocab_size(untouched_namespace)

    # tokens4 and tokens5 were only in the instances.
    assert extended_vocab.get_vocab_size(
        "tokens4") == 6  # l,m,n,o + oov + padding
    assert extended_vocab.get_vocab_size("tokens5") == 3  # x,y,z

    # Every token -> index mapping of original_vocab must survive in extended_vocab.
    for namespace, token2index in original_vocab._token_to_index.items():
        for token in token2index:
            original_index = original_vocab.get_token_index(token, namespace)
            extended_index = extended_vocab.get_token_index(token, namespace)
            assert original_index == extended_index

    # And the same holds for every index -> token mapping.
    for namespace, index2token in original_vocab._index_to_token.items():
        for index in index2token:
            original_token = original_vocab.get_token_from_index(
                index, namespace)
            extended_token = extended_vocab.get_token_from_index(
                index, namespace)
            assert original_token == extended_token
def setUp(self):
    """Create the word indexers and the seven-token text field stored on the test case."""
    super(TestSpanField, self).setUp()
    self.indexers = {"words": SingleIdTokenIndexer("words")}
    words = ["here", "is", "a", "sentence", "for", "spans", "."]
    self.text = TextField([Token(word) for word in words], self.indexers)
def text_to_instance(self,  # type: ignore
                     utterances: List[str],
                     sql_query_labels: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Build an ``Instance`` for the last utterance of an interaction.

    Parameters
    ----------
    utterances: ``List[str]``, required.
        List of utterances in the interaction, the last element is the current utterance.
    sql_query_labels: ``List[str]``, optional
        The SQL queries that are given as labels during training or validation.

    Returns
    -------
    An ``Instance`` with ``utterance``, ``actions``, ``world`` and ``linking_scores`` fields,
    plus ``sql_queries`` and ``target_action_sequence`` when labels were given and parsed.
    Returns ``None`` (despite the annotation) when the current utterance is empty, or when
    labels were given but no action sequence could be derived from them.
    """
    utterance = utterances[-1]
    action_sequence: List[str] = []

    if not utterance:
        return None

    world = AtisWorld(utterances=utterances)

    if sql_query_labels:
        # If there are multiple sql queries given as labels, we use the shortest
        # one for training.
        sql_query = min(sql_query_labels, key=len)
        try:
            action_sequence = world.get_action_sequence(sql_query)
        except ParseError:
            # Deliberate best effort: an unparseable query leaves ``action_sequence``
            # empty, which makes the instance get skipped below.
            logger.debug('Parsing error')

    tokenized_utterance = self._tokenizer.tokenize(utterance.lower())
    utterance_field = TextField(tokenized_utterance, self._token_indexers)

    production_rule_fields: List[Field] = []

    for production_rule in world.all_possible_actions():
        nonterminal, _ = production_rule.split(' ->')
        # The whitespaces are not semantically meaningful, so we filter them out.
        production_rule = ' '.join([token for token in production_rule.split(' ') if token != 'ws'])
        field = ProductionRuleField(production_rule, self._is_global_rule(nonterminal))
        production_rule_fields.append(field)

    action_field = ListField(production_rule_fields)
    # Map each (whitespace-filtered) rule string to its position in ``action_field``.
    action_map = {action.rule: i  # type: ignore
                  for i, action in enumerate(action_field.field_list)}
    index_fields: List[Field] = []
    world_field = MetadataField(world)
    fields = {'utterance': utterance_field,
              'actions': action_field,
              'world': world_field,
              'linking_scores': ArrayField(world.linking_scores)}

    if sql_query_labels is not None:
        fields['sql_queries'] = MetadataField(sql_query_labels)
        if action_sequence:
            for production_rule in action_sequence:
                index_fields.append(IndexField(action_map[production_rule], action_field))

            action_sequence_field = ListField(index_fields)
            fields['target_action_sequence'] = action_sequence_field
        else:
            # If we are given a SQL query, but we are unable to parse it, then we will skip it.
            return None

    return Instance(fields)
def test_index_converts_field_correctly(self):
    """Indexing a ``TextField`` must yield the vocabulary ids for words and characters."""
    vocab = Vocabulary()
    sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
    capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
    capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
    # Register the distinct characters of "sentence" in first-appearance order: s, e, n, t, c.
    char_index = {
        character: vocab.add_token_to_namespace(character, namespace='characters')
        for character in "sentc"
    }

    words = ["A", "sentence"]
    expected_word_ids = [capital_a_index, sentence_index]
    expected_char_ids = [[capital_a_char_index],
                         [char_index[character] for character in "sentence"]]

    # Word ids only.
    word_field = TextField([Token(word) for word in words],
                           {"words": SingleIdTokenIndexer(namespace="words")})
    word_field.index(vocab)
    # pylint: disable=protected-access
    assert word_field._indexed_tokens["words"] == expected_word_ids

    # Character ids only.
    char_field = TextField(
        [Token(word) for word in words],
        {"characters": TokenCharactersIndexer(namespace="characters", min_padding_length=1)})
    char_field.index(vocab)
    assert char_field._indexed_tokens["characters"] == expected_char_ids

    # Both indexers at once.
    combined_field = TextField(
        [Token(word) for word in words],
        token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                        "characters": TokenCharactersIndexer(namespace="characters",
                                                             min_padding_length=1)})
    combined_field.index(vocab)
    assert combined_field._indexed_tokens["words"] == expected_word_ids
    assert combined_field._indexed_tokens["characters"] == expected_char_ids
def text_to_instance(self, subject: str, predicate: str, obj: str) -> Instance:
    """Build an ``Instance`` from a (subject, predicate, object) triple.

    The three strings are joined with single spaces, tokenized with
    ``self._tokenizer`` and stored in a ``TextField`` under ``"tokens"``.
    """
    # Join the ``obj`` parameter. The previous code passed the builtin ``object``
    # type here, which made ``str.join`` raise a TypeError on every call.
    concatenated_tuple = " ".join((subject, predicate, obj))
    tokens = self._tokenizer.tokenize(concatenated_tuple)
    return Instance({"tokens": TextField(tokens, self._token_indexers)})
def text_to_instance(self,
                     question_text: str,
                     passage_text: str,
                     passage_tokens: List[Token],
                     numbers_in_passage: List[Any],
                     number_words: List[str],
                     number_indices: List[int],
                     number_len: List[int],
                     question_id: str = None,
                     answer_annotations: List[List[str]] = None,
                     dataset: str = None) -> Union[Instance, None]:
    """
    Build an ``Instance`` for one question over an already-tokenized passage.

    The question is tokenized here and combined with the passage into a single
    ``[CLS] question [SEP] passage [SEP]`` token sequence, truncated so that it
    fits in ``self.max_pieces`` tokens. Number positions are shifted so that they
    index into that combined sequence.

    Parameters
    ----------
    question_text : ``str``
        Raw question; tokenized with ``self.tokenizer``.
    passage_text : ``str``
        Raw passage; only stored in the metadata.
    passage_tokens : ``List[Token]``
        Already-tokenized passage.
    numbers_in_passage : ``List[Any]``
        Numbers found in the passage. A trailing ``0`` is appended to a copy.
    number_words : ``List[str]``
        Only stored in the metadata (``original_number_words``).
    number_indices : ``List[int]``
        Start positions of the numbers, relative to ``passage_tokens``.
    number_len : ``List[int]``
        Length (in tokens) of each number in ``number_indices``.
    question_id : ``str``, optional
        Only stored in the metadata.
    answer_annotations : ``List[List[str]]``, optional
        When truthy, the first element is used as the list of answer texts and
        answer fields are built from it. Otherwise placeholder answer fields are
        emitted and ``impossible_answer`` is set to 1.
    dataset : ``str``, optional
        Only stored in the metadata.

    Returns
    -------
    ``Instance``
        Every path visible in this method returns an ``Instance``; the ``None``
        in the annotation is not produced here.
    """
    # Tokenize question and passage
    question_tokens = self.tokenizer.tokenize(question_text)
    qlen = len(question_tokens)
    plen = len(passage_tokens)

    question_passage_tokens = [Token('[CLS]')] + question_tokens + [
        Token('[SEP]')
    ] + passage_tokens
    # Leave room for the final [SEP] that is appended below.
    if len(question_passage_tokens) > self.max_pieces - 1:
        question_passage_tokens = question_passage_tokens[:self.max_pieces - 1]
        # 3 = [CLS] + the two [SEP] tokens.
        passage_tokens = passage_tokens[:self.max_pieces - qlen - 3]
        plen = len(passage_tokens)
        if len(number_indices) > 0:
            # NOTE(review): presumably drops numbers that fall beyond the clipped
            # passage length; confirm against ``clipped_passage_num``.
            number_indices, number_len, numbers_in_passage = \
                clipped_passage_num(number_indices, number_len, numbers_in_passage, plen)

    question_passage_tokens += [Token('[SEP]')]
    # Shift past [CLS] + question + [SEP]; the trailing -1 is an extra entry that
    # pairs with the length 1 and the number 0 appended below.
    number_indices = [index + qlen + 2 for index in number_indices] + [-1]
    # Not done in-place so they won't change the numbers saved for the passage
    number_len = number_len + [1]
    numbers_in_passage = numbers_in_passage + [0]
    number_tokens = [Token(str(number)) for number in numbers_in_passage]
    extra_number_tokens = [Token(str(num)) for num in self.extra_numbers]

    # Positions of [CLS], the first [SEP] and the final [SEP].
    mask_indices = [0, qlen + 1, len(question_passage_tokens) - 1]

    fields: Dict[str, Field] = {}

    # Add feature fields
    question_passage_field = TextField(question_passage_tokens,
                                       self.token_indexers)
    fields["question_passage"] = question_passage_field

    # One array of consecutive token positions per number.
    number_token_indices = \
        [ArrayField(np.arange(start_ind, start_ind + number_len[i]), padding_value=-1)
         for i, start_ind in enumerate(number_indices)]
    fields["number_indices"] = ListField(number_token_indices)
    numbers_in_passage_field = TextField(number_tokens, self.token_indexers)
    extra_numbers_field = TextField(extra_number_tokens, self.token_indexers)
    # NOTE(review): ``all_numbers_field`` is not read anywhere in this method.
    all_numbers_field = TextField(extra_number_tokens + number_tokens,
                                  self.token_indexers)
    mask_index_fields: List[Field] = [
        IndexField(index, question_passage_field) for index in mask_indices
    ]
    fields["mask_indices"] = ListField(mask_index_fields)

    # Compile question, passage, answer metadata
    metadata = {
        "original_passage": passage_text,
        "original_question": question_text,
        "original_numbers": numbers_in_passage,
        "original_number_words": number_words,
        "extra_numbers": self.extra_numbers,
        "passage_tokens": passage_tokens,
        "question_tokens": question_tokens,
        "question_passage_tokens": question_passage_tokens,
        "question_id": question_id,
        "dataset": dataset
    }

    if answer_annotations:
        answer_texts = answer_annotations[0]
        # The answer type is fixed to "span" here, so the "number"/"date"
        # branches below are never taken and ``valid_expressions`` and
        # ``valid_counts`` stay empty.
        answer_type = "span"
        tokenized_answer_texts = []
        num_spans = min(len(answer_texts), self.max_spans)
        for answer_text in answer_texts:
            answer_tokens = self.tokenizer.tokenize(answer_text)
            tokenized_answer_texts.append(' '.join(
                token.text for token in answer_tokens))

        metadata["answer_annotations"] = answer_annotations
        metadata["answer_texts"] = answer_texts
        metadata["answer_tokens"] = tokenized_answer_texts

        # Find answer text in question and passage
        valid_question_spans = DropReader.find_valid_spans(
            question_tokens, tokenized_answer_texts)
        # Shift question spans past the leading [CLS].
        for span_ind, span in enumerate(valid_question_spans):
            valid_question_spans[span_ind] = (span[0] + 1, span[1] + 1)
        valid_passage_spans = DropReader.find_valid_spans(
            passage_tokens, tokenized_answer_texts)
        # Shift passage spans past [CLS] + question + [SEP].
        for span_ind, span in enumerate(valid_passage_spans):
            valid_passage_spans[span_ind] = (span[0] + qlen + 2,
                                             span[1] + qlen + 2)

        # Get target numbers
        target_numbers = []
        for answer_text in answer_texts:
            # Only answers without an inner space are tried as numbers.
            if answer_text.strip().count(" ") == 0:
                number = self.word_to_num(answer_text, True)
                if number is not None:
                    target_numbers.append(number)

        # Get possible ways to arrive at target numbers with add/sub
        valid_expressions: List[List[int]] = []
        exp_strings = None
        if answer_type in ["number", "date"]:
            if self.exp_search == 'full':
                expressions = get_full_exp(
                    list(enumerate(self.extra_numbers + numbers_in_passage)),
                    target_numbers, self.operations, self.op_dict,
                    self.max_depth)
                zipped = list(zip(*expressions))
                if zipped:
                    valid_expressions = list(zipped[0])
                    exp_strings = list(zipped[1])
            elif self.exp_search == 'add_sub':
                valid_expressions = \
                    DropReader.find_valid_add_sub_expressions(self.extra_numbers + numbers_in_passage,
                                                              target_numbers,
                                                              self.max_numbers_expression)
            elif self.exp_search == 'template':
                valid_expressions, exp_strings = \
                    get_template_exp(self.extra_numbers + numbers_in_passage,
                                     target_numbers,
                                     self.templates,
                                     self.template_strings)
                # Flatten the list of lists into one list.
                exp_strings = sum(exp_strings, [])

        # Get possible ways to arrive at target numbers with counting
        valid_counts: List[int] = []
        if answer_type in ["number"]:
            numbers_for_count = list(range(self.max_count + 1))
            valid_counts = DropReader.find_valid_counts(
                numbers_for_count, target_numbers)

        # Update metadata with answer info
        answer_info = {
            "answer_passage_spans": valid_passage_spans,
            "answer_question_spans": valid_question_spans,
            "num_spans": num_spans,
            "expressions": valid_expressions,
            "counts": valid_counts
        }
        if self.exp_search in ['template', 'full']:
            answer_info['expr_text'] = exp_strings
        metadata["answer_info"] = answer_info

        # Add answer fields
        passage_span_fields: List[Field] = [
            SpanField(span[0], span[1], question_passage_field)
            for span in valid_passage_spans
        ]
        # (-1, -1) is the placeholder span when no match was found.
        if not passage_span_fields:
            passage_span_fields.append(
                SpanField(-1, -1, question_passage_field))
        fields["answer_as_passage_spans"] = ListField(passage_span_fields)

        question_span_fields: List[Field] = [
            SpanField(span[0], span[1], question_passage_field)
            for span in valid_question_spans
        ]
        if not question_span_fields:
            question_span_fields.append(
                SpanField(-1, -1, question_passage_field))
        fields["answer_as_question_spans"] = ListField(
            question_span_fields)

        if self.exp_search == 'add_sub':
            add_sub_signs_field: List[Field] = []
            extra_signs_field: List[Field] = []
            for signs_for_one_add_sub_expressions in valid_expressions:
                # Signs are laid out as [extra numbers..., passage numbers...].
                extra_signs = signs_for_one_add_sub_expressions[:len(
                    self.extra_numbers)]
                normal_signs = signs_for_one_add_sub_expressions[
                    len(self.extra_numbers):]
                add_sub_signs_field.append(
                    SequenceLabelField(normal_signs,
                                       numbers_in_passage_field))
                extra_signs_field.append(
                    SequenceLabelField(extra_signs, extra_numbers_field))
            # All-zero sign sequences are the placeholders when no expression exists.
            if not add_sub_signs_field:
                add_sub_signs_field.append(
                    SequenceLabelField([0] * len(number_tokens),
                                       numbers_in_passage_field))
            if not extra_signs_field:
                extra_signs_field.append(
                    SequenceLabelField([0] * len(self.extra_numbers),
                                       extra_numbers_field))
            fields["answer_as_expressions"] = ListField(
                add_sub_signs_field)
            if self.extra_numbers:
                fields["answer_as_expressions_extra"] = ListField(
                    extra_signs_field)
        elif self.exp_search in ['template', 'full']:
            expression_indices = []
            for expression in valid_expressions:
                # Empty expressions are filled in place with a [-1, -1, -1] row.
                if not expression:
                    expression.append(3 * [-1])
                expression_indices.append(
                    ArrayField(np.array(expression), padding_value=-1))
            if not expression_indices:
                expression_indices = \
                    [ArrayField(np.array([3 * [-1]]), padding_value=-1) for _ in range(len(self.templates))]
            fields["answer_as_expressions"] = ListField(expression_indices)

        count_fields: List[Field] = [
            LabelField(count_label, skip_indexing=True)
            for count_label in valid_counts
        ]
        if not count_fields:
            count_fields.append(LabelField(-1, skip_indexing=True))
        fields["answer_as_counts"] = ListField(count_fields)

        fields["impossible_answer"] = LabelField(0, skip_indexing=True)
        #fields["num_spans"] = LabelField(num_spans, skip_indexing=True)
    else:
        # No annotations: emit placeholder answer fields and flag the instance.
        fields["answer_as_passage_spans"] = ListField(
            [SpanField(-1, -1, question_passage_field)])
        fields["answer_as_counts"] = ListField(
            [LabelField(-1, skip_indexing=True)])
        fields["answer_as_expressions"] = ListField([
            SequenceLabelField([0] * len(numbers_in_passage_field),
                               numbers_in_passage_field)
        ])
        fields["impossible_answer"] = LabelField(1, skip_indexing=True)
        metadata["answer_annotations"] = [{'spans': [""]}]
        fields["answer_as_question_spans"] = ListField(
            [SpanField(-1, -1, question_passage_field)])

    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(
        self,  # type: ignore
        premises: Union[List[str], List[List[str]]],
        choices: List[str],
        coverage: List[List[float]],
        label: int = None,
        question: str = None) -> Instance:
    """
    Build a multiple-choice ``Instance`` from premises, answer choices and coverage scores.

    Parameters
    ----------
    premises : ``Union[List[str], List[List[str]]]``
        Either one flat list of premise sentences (then shared by every choice)
        or one list of premise sentences per choice.
    choices : ``List[str]``
        The answer choices (hypotheses).
    coverage : ``List[List[float]]``
        One entry per choice, each holding one sequence per premise. The data is
        zero-padded into an array of shape
        ``[len(coverage), max number of premises, max sequence length]``.
    label : ``int``, optional
        Index of the correct choice; stored without indexing when given.
    question : ``str``, optional
        When given, it is passed as the ``question`` to
        ``self.bert_features_from_qa`` and the premises are passed as context;
        otherwise the premises take the ``question`` slot.

    Returns
    -------
    ``Instance``
        Fields: ``coverage``, ``tokens``, ``token_type_ids``, ``links_tokens``,
        ``links_token_type_ids`` and, if ``label`` is given, ``label``.

    Raises
    ------
    SystemExit
        With exit status 1 when ``coverage`` does not line up with ``choices`` or
        with the premises. The previous code called ``exit(0)``, which reported
        success to the calling process and depended on the ``site``-provided
        ``exit`` helper.
    """
    number_of_choices = len(choices)
    if isinstance(premises[0], str):
        # A flat list of sentences is shared by every choice.
        premises = [premises] * number_of_choices

    # create an empty dictionary to store the input
    fields: Dict[str, Field] = {}
    tokens = []
    token_type_ids = []
    all_links = []
    all_link_token_ids = []

    if len(coverage) != len(choices):
        logger.error("the dimension of coverage and choices did not match")
        raise SystemExit(1)

    max_len = 0
    max_premises = 0
    for arr, p in zip(coverage, premises):
        if len(arr) != len(p):
            logger.error(
                "the dimension of coverage and premises did not match")
            raise SystemExit(1)
        max_premises = max(max_premises, len(p))
        max_len = max(max_len, max([len(a) for a in arr]))

    # padding
    np_coverage = np.zeros([len(coverage), max_premises, max_len])
    for c_idx, choice_coverage in enumerate(coverage):
        for p_idx, premise_coverage in enumerate(choice_coverage):
            np_coverage[c_idx, p_idx, 0:len(premise_coverage)] = premise_coverage
    fields['coverage'] = ArrayField(np_coverage)

    for premise, hypothesis in zip(premises, choices):
        # two major keys
        # ph: [cls]all_premise[sep]hypothesis[sep]
        # two different segment_ids
        # join all premise sentences
        all_premise = " ".join(premise)
        if question is None:
            ph_tokens, ph_token_type_ids = self.bert_features_from_qa(
                question=all_premise, answer=hypothesis)
        else:
            ph_tokens, ph_token_type_ids = self.bert_features_from_qa(
                question=question, context=all_premise, answer=hypothesis)

        # create a simple textfield for hypothesis
        tokens_field = TextField(ph_tokens, self._token_indexers)
        tokens.append(tokens_field)
        token_type_ids.append(
            SequenceLabelField(ph_token_type_ids, tokens_field))

        # For every ordered pair of distinct premises, build one "link" input.
        links_segment_2d = []
        links_2d = []
        for i, first_premise in enumerate(premise):
            tokenized_links_field = []
            type_ids_of_links = []
            for j, second_premise in enumerate(premise):
                if i == j:
                    continue
                if question is None:
                    pp_tokens, pp_token_type_ids = self.bert_features_from_qa(
                        question=first_premise,
                        answer=hypothesis,
                        context=second_premise)
                else:
                    pp_tokens, pp_token_type_ids = self.bert_features_from_qa(
                        question=question,
                        context2=second_premise,
                        answer=hypothesis,
                        context=first_premise)
                pp_tokens_field = TextField(pp_tokens, self._token_indexers)
                tokenized_links_field.append(pp_tokens_field)
                type_ids_of_links.append(
                    SequenceLabelField(pp_token_type_ids, pp_tokens_field))
            links_2d.append(ListField(tokenized_links_field))
            links_segment_2d.append(ListField(type_ids_of_links))

        if len(premise) >= 2:
            all_links.append(ListField(links_2d))
            all_link_token_ids.append(ListField(links_segment_2d))
        else:
            # add an empty list field
            empty_tokens_field = [TextField([], self._token_indexers)]
            empty_type_ids_of_links = [
                SequenceLabelField([], empty_tokens_field[0])
            ]
            all_links.append(ListField(ListField(empty_tokens_field)))
            all_link_token_ids.append(
                ListField(ListField(empty_type_ids_of_links)))

    if label is not None:
        fields['label'] = LabelField(label, skip_indexing=True)
    fields['tokens'] = ListField(tokens)
    fields['token_type_ids'] = ListField(token_type_ids)
    fields['links_tokens'] = ListField(all_links)
    fields['links_token_type_ids'] = ListField(all_link_token_ids)
    return Instance(fields)
def test_padding_lengths_are_computed_correctly(self):
    """Padding lengths should reflect the token count and, for characters, the longest token."""
    words = ["This", "is", "a", "sentence", "."]

    def padding_lengths_for(token_indexers):
        # Build and index a fresh field, then report the padding lengths it asks for.
        field = TextField([Token(word) for word in words], token_indexers=token_indexers)
        field.index(self.vocab)
        return field.get_padding_lengths()

    # Word ids only.
    word_lengths = padding_lengths_for({"words": SingleIdTokenIndexer("words")})
    assert word_lengths == {"words_length": 5, "num_tokens": 5}

    # Characters only.
    char_lengths = padding_lengths_for({"characters": TokenCharactersIndexer("characters")})
    assert char_lengths == {"num_tokens": 5,
                            "characters_length": 5,
                            "num_token_characters": 8}

    # Characters and word ids together.
    combined_lengths = padding_lengths_for({"characters": TokenCharactersIndexer("characters"),
                                            "words": SingleIdTokenIndexer("words")})
    assert combined_lengths == {"num_tokens": 5,
                                "characters_length": 5,
                                "words_length": 5,
                                "num_token_characters": 8}
def make_reading_comprehension_instance(
        question_tokens: List[Token],
        passage_tokens: List[Token],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
        token_spans: List[Tuple[int, int]] = None,
        answer_texts: List[str] = None,
        additional_metadata: Dict[str, Any] = None,
) -> Instance:
    """
    Assemble a reading comprehension ``Instance`` from a tokenized question and passage.

    The result always has ``question`` and ``passage`` ``TextFields`` and a
    ``metadata`` ``MetadataField``. When ``token_spans`` is given, it also has
    ``span_start`` and ``span_end`` ``IndexFields`` pointing into the passage.

    Parameters
    ----------
    question_tokens : ``List[Token]``
        The tokenized question.
    passage_tokens : ``List[Token]``
        The tokenized passage. Each token's ``idx`` and ``text`` are used to
        compute character offsets stored in the metadata.
    token_indexers : ``Dict[str, TokenIndexer]``
        Indexers shared by the question and passage ``TextFields``.
    passage_text : ``str``
        The original passage text, stored in the metadata as ``original_passage``.
    token_spans : ``List[Tuple[int, int]]``, optional
        Candidate answer spans as indices into ``passage_tokens``. The span that
        occurs most often in this list is used as the target.
    answer_texts : ``List[str]``, optional
        Valid answer strings; stored in the metadata when non-empty and not used
        anywhere else in this function.
    additional_metadata : ``Dict[str, Any]``, optional
        Extra entries merged into the metadata last, so they take precedence over
        the default ``original_passage``, ``token_offsets``, ``question_tokens``,
        ``passage_tokens`` and ``answer_texts`` keys.
    """
    extra_metadata = additional_metadata or {}

    # (start, end) character offsets of every passage token.
    offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]

    # Kept in its own variable because the index fields below point into it.
    passage_field = TextField(passage_tokens, token_indexers)
    fields: Dict[str, Field] = {"passage": passage_field}
    fields["question"] = TextField(question_tokens, token_indexers)

    metadata = {
        "original_passage": passage_text,
        "token_offsets": offsets,
        "question_tokens": [token.text for token in question_tokens],
        "passage_tokens": [token.text for token in passage_tokens],
    }
    if answer_texts:
        metadata["answer_texts"] = answer_texts

    if token_spans:
        # Several annotations may exist for one question (e.g. the SQuAD dev set),
        # so train against the span that the most annotators agreed on. Metrics
        # computed from this single span therefore differ from official metrics
        # that consider every annotation.
        span_counts: Counter = Counter()
        span_counts.update((start, end) for start, end in token_spans)
        (best_start, best_end), _ = span_counts.most_common(1)[0]
        fields["span_start"] = IndexField(best_start, passage_field)
        fields["span_end"] = IndexField(best_end, passage_field)

    metadata.update(extra_metadata)
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(
        self,  # type: ignore
        question: str,
        table_lines: List[List[str]],
        target_values: List[str] = None,
        offline_search_output: List[str] = None,
) -> Instance:
    """
    Reads text inputs and makes an instance. We pass the ``table_lines`` to ``TableQuestionContext``, and that
    method accepts this field either as lines from CoreNLP processed tagged files that come with the dataset,
    or simply in a tsv format where each line corresponds to a row and the cells are tab-separated.

    Parameters
    ----------
    question : ``str``
        Input question
    table_lines : ``List[List[str]]``
        The table content optionally preprocessed by CoreNLP. See ``TableQuestionContext.read_from_lines``
        for the expected format.
    target_values : ``List[str]``, optional
        Target values for the denotations the logical forms should execute to. Not required for testing.
    offline_search_output : ``List[str]``, optional
        List of logical forms, produced by offline search. Not required during test.

    Returns
    -------
    ``Instance`` or ``None``
        ``None`` is returned when ``offline_search_output`` is given but none of its logical forms could be
        converted into an action sequence; otherwise the constructed ``Instance``.
    """
    # The question is lower-cased before tokenization.
    tokenized_question = self._tokenizer.tokenize(question.lower())
    question_field = TextField(tokenized_question,
                               self._question_token_indexers)
    metadata: Dict[str, Any] = {
        "question_tokens": [x.text for x in tokenized_question]
    }
    table_context = TableQuestionContext.read_from_lines(
        table_lines, tokenized_question)
    world = WikiTablesLanguage(table_context)
    world_field = MetadataField(world)
    # Note: Not passing any feature extractors when instantiating the field below. This will make
    # it use all the available extractors.
    table_field = KnowledgeGraphField(
        table_context.get_table_knowledge_graph(),
        tokenized_question,
        self._table_token_indexers,
        tokenizer=self._tokenizer,
        include_in_vocab=self._use_table_for_vocab,
        max_table_tokens=self._max_table_tokens,
    )
    production_rule_fields: List[Field] = []
    for production_rule in world.all_possible_productions():
        _, rule_right_side = production_rule.split(" -> ")
        # A rule is global unless its right-hand side is an instance-specific entity.
        is_global_rule = not world.is_instance_specific_entity(
            rule_right_side)
        field = ProductionRuleField(production_rule,
                                    is_global_rule=is_global_rule)
        production_rule_fields.append(field)
    action_field = ListField(production_rule_fields)

    fields = {
        "question": question_field,
        "metadata": MetadataField(metadata),
        "table": table_field,
        "world": world_field,
        "actions": action_field,
    }

    if target_values is not None:
        target_values_field = MetadataField(target_values)
        fields["target_values"] = target_values_field

    # We'll make each target action sequence a List[IndexField], where the index is into
    # the action list we made above.  We need to ignore the type here because mypy doesn't
    # like `action.rule` - it's hard to tell mypy that the ListField is made up of
    # ProductionRuleFields.
    action_map = {
        action.rule: i
        for i, action in enumerate(action_field.field_list)
    }  # type: ignore
    if offline_search_output:
        action_sequence_fields: List[Field] = []
        for logical_form in offline_search_output:
            try:
                action_sequence = world.logical_form_to_action_sequence(
                    logical_form)
                index_fields: List[Field] = []
                for production_rule in action_sequence:
                    index_fields.append(
                        IndexField(action_map[production_rule],
                                   action_field))
                action_sequence_fields.append(ListField(index_fields))
            except ParsingError as error:
                # Unparseable logical forms are skipped, not fatal.
                logger.debug(
                    f"Parsing error: {error.message}, skipping logical form"
                )
                logger.debug(f"Question was: {question}")
                logger.debug(f"Logical form was: {logical_form}")
                logger.debug(f"Table info was: {table_lines}")
                continue
            except KeyError as error:
                # The action sequence used a rule that is not in ``action_map``.
                logger.debug(
                    f"Missing production rule: {error.args}, skipping logical form"
                )
                logger.debug(f"Question was: {question}")
                logger.debug(f"Table info was: {table_lines}")
                logger.debug(f"Logical form was: {logical_form}")
                continue
            except:  # noqa
                # Anything else is unexpected: log the offending form and re-raise.
                logger.error(logical_form)
                raise
            # Stop once enough logical forms have been converted.
            if len(action_sequence_fields
                   ) >= self._max_offline_logical_forms:
                break

        if not action_sequence_fields:
            # This is not great, but we're only doing it when we're passed logical form
            # supervision, so we're expecting labeled logical forms, but we can't actually
            # produce the logical forms.  We should skip this instance.  Note that this affects
            # _dev_ and _test_ instances, too, so your metrics could be over-estimates on the
            # full test data.
            return None
        fields["target_action_sequences"] = ListField(
            action_sequence_fields)
    if self._output_agendas:
        agenda_index_fields: List[Field] = []
        for agenda_string in world.get_agenda(conservative=True):
            agenda_index_fields.append(
                IndexField(action_map[agenda_string], action_field))
        # An empty agenda is represented by a single -1 index.
        if not agenda_index_fields:
            agenda_index_fields = [IndexField(-1, action_field)]
        fields["agenda"] = ListField(agenda_index_fields)
    return Instance(fields)
def make_reading_comprehension_instance_quac(
        question_list_tokens: List[List[Token]],
        passage_tokens: List[Token],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
        token_span_lists: List[List[Tuple[int, int]]] = None,
        yesno_list: List[int] = None,
        followup_list: List[int] = None,
        additional_metadata: Dict[str, Any] = None,
        num_context_answers: int = 0,
) -> Instance:
    """
    Converts a list of questions about one passage, and optional answers, to an ``Instance`` for use
    in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question`` (a ``ListField`` of
    ``TextFields``, one per question), ``passage`` (a ``TextField``) and ``metadata`` (a
    ``MetadataField``).  Additionally, if ``token_span_lists`` is given, the ``Instance`` has
    ``span_start`` and ``span_end`` fields (``ListFields`` of ``IndexFields``), ``yesno_list`` and
    ``followup_list`` fields, and, depending on ``num_context_answers``, up to three
    ``p{1,2,3}_answer_marker`` fields that tag the previous answers inside the passage.

    Parameters
    ----------
    question_list_tokens : ``List[List[Token]]``
        An already-tokenized list of questions. Each dialog has multiple questions.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_span_lists : ``List[List[Tuple[int, int]]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training.  This is
        a list of lists, first because there are multiple questions per dialog, and second because
        there might be several possible correct answer spans in the passage.  Currently, we just
        select the last span in each list (i.e., QuAC has multiple annotations on the dev set; this
        will select the last span, which was given by the original annotator).
    yesno_list : ``List[int]``
        List of the affirmation bit for each question answer pair.  It is iterated whenever
        ``token_span_lists`` is given, so it must be provided in that case.
    followup_list : ``List[int]``
        List of the continuation bit for each question answer pair.  It is iterated whenever
        ``token_span_lists`` is given, so it must be provided in that case.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens`` and ``passage_tokens`` keys.  If you want any other
        metadata to be associated with each instance, you can pass that in here.  This dictionary
        will get added to the ``metadata`` dictionary we already construct.
    num_context_answers : ``int``, optional
        How many previous answers to encode into the passage (only values up to 3 have an effect).
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    # (start, end) character offsets of every passage token.
    passage_offsets = [(token.idx, token.idx + len(token.text))
                       for token in passage_tokens]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields["passage"] = passage_field
    fields["question"] = ListField([
        TextField(q_tokens, token_indexers)
        for q_tokens in question_list_tokens
    ])
    metadata = {
        "original_passage": passage_text,
        "token_offsets": passage_offsets,
        "question_tokens": [[token.text for token in question_tokens]
                            for question_tokens in question_list_tokens],
        "passage_tokens": [token.text for token in passage_tokens],
    }
    # One tag sequence per question, for the answer 1, 2 and 3 turns back.
    p1_answer_marker_list: List[Field] = []
    p2_answer_marker_list: List[Field] = []
    p3_answer_marker_list: List[Field] = []

    def get_tag(i, i_name):
        # Generate a tag to mark previous answer span in the passage.
        return "<{0:d}_{1:s}>".format(i, i_name)

    def mark_tag(span_start, span_end, passage_tags, prev_answer_distance):
        # Negative indices mean the rolling span state was never filled in.
        try:
            assert span_start >= 0
            assert span_end >= 0
        except:  # noqa
            raise ValueError(
                "Previous {0:d}th answer span should have been updated!".
                format(prev_answer_distance))
        # Modify "tags" to mark previous answer span.
        if span_start == span_end:
            passage_tags[prev_answer_distance][span_start] = get_tag(
                prev_answer_distance, "")
        else:
            passage_tags[prev_answer_distance][span_start] = get_tag(
                prev_answer_distance, "start")
            passage_tags[prev_answer_distance][span_end] = get_tag(
                prev_answer_distance, "end")
            for passage_index in range(span_start + 1, span_end):
                passage_tags[prev_answer_distance][passage_index] = get_tag(
                    prev_answer_distance, "in")

    if token_span_lists:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        # Rolling window over the three most recent answer spans; -1 means "not set yet".
        p1_span_start, p1_span_end, p2_span_start = -1, -1, -1
        p2_span_end, p3_span_start, p3_span_end = -1, -1, -1
        # Looping each <<answers>>.
        for question_index, answer_span_lists in enumerate(token_span_lists):
            span_start, span_end = answer_span_lists[
                -1]  # Last one is the original answer
            span_start_list.append(IndexField(span_start, passage_field))
            span_end_list.append(IndexField(span_end, passage_field))
            # Index 0 is never written; indices 1-3 hold the tags for the answer
            # 1, 2 and 3 turns back.
            prev_answer_marker_lists = [
                ["O"] * len(passage_tokens),
                ["O"] * len(passage_tokens),
                ["O"] * len(passage_tokens),
                ["O"] * len(passage_tokens),
            ]
            if question_index > 0 and num_context_answers > 0:
                mark_tag(p1_span_start, p1_span_end,
                         prev_answer_marker_lists, 1)
                if question_index > 1 and num_context_answers > 1:
                    mark_tag(p2_span_start, p2_span_end,
                             prev_answer_marker_lists, 2)
                    if question_index > 2 and num_context_answers > 2:
                        mark_tag(p3_span_start, p3_span_end,
                                 prev_answer_marker_lists, 3)
            # Shift the window: the current answer becomes "previous 1", and so on.
            p3_span_start = p2_span_start
            p3_span_end = p2_span_end
            p2_span_start = p1_span_start
            p2_span_end = p1_span_end
            p1_span_start = span_start
            p1_span_end = span_end
            if num_context_answers > 2:
                p3_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[3],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 1:
                p2_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[2],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 0:
                p1_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[1],
                                       passage_field,
                                       label_namespace="answer_tags"))
        fields["span_start"] = ListField(span_start_list)
        fields["span_end"] = ListField(span_end_list)
        if num_context_answers > 0:
            fields["p1_answer_marker"] = ListField(p1_answer_marker_list)
            if num_context_answers > 1:
                fields["p2_answer_marker"] = ListField(p2_answer_marker_list)
                if num_context_answers > 2:
                    fields["p3_answer_marker"] = ListField(
                        p3_answer_marker_list)
        fields["yesno_list"] = ListField([
            LabelField(yesno, label_namespace="yesno_labels")
            for yesno in yesno_list
        ])
        fields["followup_list"] = ListField([
            LabelField(followup, label_namespace="followup_labels")
            for followup in followup_list
        ])
    metadata.update(additional_metadata)
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(
        self,
        source_context: str,
        source: str,
        target_context: str = None,
        target: str = None,
        doc_id: int = None,
        sent_id: int = None,
        context_sent_id: int = None,
        label: int = None,
) -> Instance:
    """
    Build an ``Instance`` from a source sentence, its context and, optionally, a target side.

    The reader works in one of two ways, selected by ``self._classification_data_mode``:

    * anything other than ``"none"``: the source context and source sentence are tokenized
      together with ``tokenize_sentence_pair``, and any given ids and ``label`` are attached;
    * ``"none"``: the source context and source sentence are tokenized separately and then
      combined according to ``self._translation_data_mode`` and the ``_concat_source_context``,
      ``_source_add_factors``, ``_source_add_start_token`` and ``_source_add_end_token`` flags.

    Parameters
    ----------
    source_context : ``str``
        Context for the source sentence.
    source : ``str``
        The source sentence.
    target_context : ``str``, optional
        Context for the target sentence; tokenized only when the translation data mode is
        ``"2-to-2"``.
    target : ``str``, optional
        The target sentence.
    doc_id, sent_id, context_sent_id : ``int``, optional
        Stored as ``MetadataFields`` (classification mode only) when not ``None``.
    label : ``int``, optional
        Stored as a ``LabelField`` of ``str(label)`` (classification mode only).

    Returns
    -------
    ``Instance``
        Always contains ``source_tokens``; the other fields depend on the modes above.
    """
    fields = {}

    target_tokens = ([] if target is None else
                     self._target_tokenizer.tokenize(target))
    if self._translation_data_mode == "2-to-2":
        # Prepend the target context, separated by the concatenation symbol.
        target_context_tokens = self._target_tokenizer.tokenize(
            target_context)
        target_tokens = (target_context_tokens + [Token(CONCAT_SYMBOL)] +
                         target_tokens)
    if target_tokens:
        target_tokens.insert(0, Token(START_SYMBOL))
        target_tokens.append(Token(END_SYMBOL))
        if not self._source_only:
            fields["target_tokens"] = TextField(target_tokens,
                                                self._target_token_indexers)

    if self._classification_data_mode != "none":
        # PretrainedTransformerTokenizer can add special tokens by self now
        # What we want here is: [CLS] source_context [SEP] source [SEP]
        for key, value in zip(
                ("doc_id", "sent_id", "context_sent_id"),
                (doc_id, sent_id, context_sent_id),
        ):
            if value is not None:
                fields[key] = MetadataField(value)
        if label is not None:
            fields["label"] = LabelField(str(label))
        source_tokens = self._source_tokenizer.tokenize_sentence_pair(
            source_context, source)
        fields["source_tokens"] = TextField(source_tokens,
                                            self._source_token_indexers)
    else:
        source_context_tokens = (
            [] if source_context is None else
            self._source_tokenizer.tokenize(source_context))
        source_tokens = ([] if source is None else
                         self._source_tokenizer.tokenize(source))
        if self._translation_data_mode != "1-to-1":
            if self._concat_source_context:
                # One factor per token: "C" for the context and the concatenation
                # symbol (hence the + 1), "S" for the source sentence. Computed
                # before ``source_tokens`` is extended with the context.
                context_factor, source_factor = Token("C"), Token("S")
                source_factors = [context_factor] * (
                    len(source_context_tokens) +
                    1) + [source_factor] * len(source_tokens)
                source_tokens = (source_context_tokens +
                                 [Token(CONCAT_SYMBOL)] + source_tokens)
                if self._source_add_factors:
                    fields["source_factors"] = TextField(
                        source_factors, self._source_factor_indexers)
            else:
                # The context is exposed as its own field instead of being concatenated.
                fields["source_context_tokens"] = TextField(
                    source_context_tokens, self._source_token_indexers)
        if self._source_add_start_token:
            source_tokens.insert(0, Token(START_SYMBOL))
        if self._source_add_end_token:
            source_tokens.append(Token(END_SYMBOL))
        fields["source_tokens"] = TextField(source_tokens,
                                            self._source_token_indexers)
    return Instance(fields)
def test_padding_lengths_are_computed_correctly(self):
    """Padding lengths should reflect the token count and, for characters, the longest token."""
    words = ["This", "is", "a", "sentence", "."]

    def padding_lengths_for(token_indexers):
        # Build and index a fresh field, then report the padding lengths it asks for.
        field = TextField([Token(word) for word in words], token_indexers=token_indexers)
        field.index(self.vocab)
        return field.get_padding_lengths()

    # Word ids only.
    word_lengths = padding_lengths_for({"words": SingleIdTokenIndexer("words")})
    assert word_lengths == {"words_length": 5, "num_tokens": 5}

    # Characters only.
    char_lengths = padding_lengths_for(
        {"characters": TokenCharactersIndexer("characters", min_padding_length=1)})
    assert char_lengths == {"num_tokens": 5,
                            "characters_length": 5,
                            "num_token_characters": 8}

    # Characters and word ids together.
    combined_lengths = padding_lengths_for(
        {"characters": TokenCharactersIndexer("characters", min_padding_length=1),
         "words": SingleIdTokenIndexer("words")})
    assert combined_lengths == {"num_tokens": 5,
                                "characters_length": 5,
                                "words_length": 5,
                                "num_token_characters": 8}