Example no. 1
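    # Indexes the same two tokens with a word-level indexer, a character-level indexer,
    # and both together, each writing ids into its own vocabulary namespace.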
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

        field1 = TextField([Token(t) for t in ["A", "sentence"]],
                           {"characters": TokenCharactersIndexer(namespace="characters")})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        field2 = TextField([Token(t) for t in ["A", "sentence"]],
                           token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                           "characters": TokenCharactersIndexer(namespace="characters")})
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
Example no. 2
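    # Counts vocabulary items per namespace: word counts with SingleIdTokenIndexer,
    # character counts with TokenCharactersIndexer, and both when the indexers are combined.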
    def test_field_counts_vocab_items_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}
Example no. 3
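    # Extending a saved vocabulary should fail whenever an overlapping namespace changes
    # its padded/non-padded setting; matching settings should extend without error.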
    def test_invalid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
        original_vocab.add_token_to_namespace("a", namespace="tokens1")
        original_vocab.add_token_to_namespace("b", namespace="tokens1")
        original_vocab.add_token_to_namespace("p", namespace="tokens2")
        original_vocab.save_to_files(vocab_dir)
        text_field1 = TextField([Token(t) for t in ["a", "c"]],
                                {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                                {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

        # Following 3 should give an error: tokens1 is non-padded in original_vocab but padded in the extension
        params = Params({"directory_path": vocab_dir, "extend": True,
                         "non_padded_namespaces": []})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            params = Params({"non_padded_namespaces": []})
            extended_vocab.extend_from_instances(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            extended_vocab._extend(non_padded_namespaces=[],
                                   tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

        # Following 3 should not give an error: overlapping namespaces have the same padding setting
        params = Params({"directory_path": vocab_dir, "extend": True,
                         "non_padded_namespaces": ["tokens1"]})
        Vocabulary.from_params(params, instances)
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1"]})
        extended_vocab.extend_from_instances(params, instances)
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=["tokens1"],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

        # Following 3 should give an error: tokens2 is padded in original_vocab but non-padded in the extension
        params = Params({"directory_path": vocab_dir, "extend": True,
                         "non_padded_namespaces": ["tokens1", "tokens2"]})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
            extended_vocab.extend_from_instances(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            extended_vocab._extend(non_padded_namespaces=["tokens1", "tokens2"],
                                   tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})
Example no. 4
    def setUp(self):
        super(IteratorTest, self).setUp()
        self.token_indexers = {"tokens": SingleIdTokenIndexer()}
        self.vocab = Vocabulary()
        self.this_index = self.vocab.add_token_to_namespace('this')
        self.is_index = self.vocab.add_token_to_namespace('is')
        self.a_index = self.vocab.add_token_to_namespace('a')
        self.sentence_index = self.vocab.add_token_to_namespace('sentence')
        self.another_index = self.vocab.add_token_to_namespace('another')
        self.yet_index = self.vocab.add_token_to_namespace('yet')
        self.very_index = self.vocab.add_token_to_namespace('very')
        self.long_index = self.vocab.add_token_to_namespace('long')
        instances = [
            self.create_instance(["this", "is", "a", "sentence"]),
            self.create_instance(["this", "is", "another", "sentence"]),
            self.create_instance(["yet", "another", "sentence"]),
            self.create_instance([
                "this", "is", "a", "very", "very", "very", "very", "long",
                "sentence"
            ]),
            self.create_instance(["sentence"]),
        ]

        class LazyIterable:
            def __iter__(self):
                return (instance for instance in instances)

        self.instances = instances
        self.lazy_instances = LazyIterable()
Example no. 5
    def setUp(self):
        super().setUp()
        token_indexer = {"tokens": SingleIdTokenIndexer()}

        field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence", "."]],
            token_indexer)
        field2 = TextField([
            Token(t)
            for t in ["this", "is", "a", "different", "sentence", "."]
        ], token_indexer)
        field3 = TextField(
            [Token(t) for t in ["here", "is", "a", "sentence", "."]],
            token_indexer)
        field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                           token_indexer)
        self.instances = [
            Instance({
                "text1": field1,
                "text2": field2
            }),
            Instance({
                "text1": field3,
                "text2": field4
            })
        ]
Example no. 6
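 # A dataset reader constructor that falls back to a default word-level
 # SingleIdTokenIndexer when no token_indexers are supplied.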
 def __init__(self,
              max_span_width: int,
              token_indexers: Dict[str, TokenIndexer] = None,
              lazy: bool = False) -> None:
     super().__init__(lazy)
     self._max_span_width = max_span_width
     self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
Example no. 7
 def setUp(self):
     token_indexer = SingleIdTokenIndexer("tokens")
     text_field = TextField([Token(t) for t in ["a", "a", "a", "a", "b", "b", "c", "c", "c"]],
                            {"tokens": token_indexer})
     self.instance = Instance({"text": text_field})
     self.dataset = Batch([self.instance])
     super(TestVocabulary, self).setUp()
Example no. 8
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              lazy: bool = False,
              tokenizer: Tokenizer = None) -> None:
     super().__init__(lazy)
     self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
     self._tokenizer = tokenizer or WordTokenizer(SpacyWordSplitter(pos_tags=True))
Example no. 9
 def __init__(self,
              lazy: bool = False,
              tables_directory: str = None,
              dpd_output_directory: str = None,
              max_dpd_logical_forms: int = 10,
              sort_dpd_logical_forms: bool = True,
              max_dpd_tries: int = 20,
              keep_if_no_dpd: bool = False,
              tokenizer: Tokenizer = None,
              question_token_indexers: Dict[str, TokenIndexer] = None,
              table_token_indexers: Dict[str, TokenIndexer] = None,
              use_table_for_vocab: bool = False,
              linking_feature_extractors: List[str] = None,
              include_table_metadata: bool = False,
              max_table_tokens: int = None,
              output_agendas: bool = False) -> None:
     super().__init__(lazy=lazy)
     self._tables_directory = tables_directory
     self._dpd_output_directory = dpd_output_directory
     self._max_dpd_logical_forms = max_dpd_logical_forms
     self._sort_dpd_logical_forms = sort_dpd_logical_forms
     self._max_dpd_tries = max_dpd_tries
     self._keep_if_no_dpd = keep_if_no_dpd
     self._tokenizer = tokenizer or WordTokenizer(
         SpacyWordSplitter(pos_tags=True))
     self._question_token_indexers = question_token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._table_token_indexers = table_token_indexers or self._question_token_indexers
     self._use_table_for_vocab = use_table_for_vocab
     self._linking_feature_extractors = linking_feature_extractors
     self._include_table_metadata = include_table_metadata
     self._basic_types = set(str(type_) for type_ in wt_types.BASIC_TYPES)
     self._max_table_tokens = max_table_tokens
     self._output_agendas = output_agendas
Example no. 10
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 tag_label: str = "ner",
                 feature_labels: Sequence[str] = (),
                 lazy: bool = False,
                 coding_scheme: str = "IOB1",
                 label_namespace: str = "labels",
                 ignore_ner_tags: bool = False) -> None:
        super().__init__(lazy)
        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }
        if tag_label is not None and tag_label not in _VALID_LABELS:
            raise ConfigurationError(
                "unknown tag label type: {}".format(tag_label))
        for label in feature_labels:
            if label not in _VALID_LABELS:
                raise ConfigurationError(
                    "unknown feature label type: {}".format(label))
        if coding_scheme not in ("IOB1", "BIOUL"):
            raise ConfigurationError(
                "unknown coding_scheme: {}".format(coding_scheme))

        self.tag_label = tag_label
        self.feature_labels = set(feature_labels)
        self.coding_scheme = coding_scheme
        self.label_namespace = label_namespace
        self.ignore_ner_tags = ignore_ner_tags
Example no. 11
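 # Converts an indexed TextField into a tensor of single word ids.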
 def test_as_tensor_handles_words(self):
     field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                       token_indexers={"words": SingleIdTokenIndexer("words")})
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     tensor_dict = field.as_tensor(padding_lengths)
     numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(),
                                             numpy.array([1, 1, 1, 2, 1]))
Example no. 12
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              lazy: bool = False,
              label_namespace_prefix: str = "") -> None:
     super().__init__(lazy=lazy)
     self._token_indexers = token_indexers or {
         'tokens': SingleIdTokenIndexer()
     }
     self._label_namespace_prefix = label_namespace_prefix
Example no. 13
 def __init__(self,
              lazy: bool = False,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None) -> None:
     super().__init__(lazy)
     self._tokenizer = tokenizer or WordTokenizer(JustSpacesWordSplitter())
     self._token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
Example no. 14
 def __init__(self,
              word_tag_delimiter: str = DEFAULT_WORD_TAG_DELIMITER,
              token_delimiter: str = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              lazy: bool = False) -> None:
     super().__init__(lazy)
     self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
     self._word_tag_delimiter = word_tag_delimiter
     self._token_delimiter = token_delimiter
Example no. 15
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              use_language_specific_pos: bool = False,
              lazy: bool = False) -> None:
     super().__init__(lazy)
     self._token_indexers = token_indexers or {
         'tokens': SingleIdTokenIndexer()
     }
     self.use_language_specific_pos = use_language_specific_pos
Example no. 16
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              domain_identifier: str = None,
              lazy: bool = False) -> None:
     super().__init__(lazy)
     self._token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._domain_identifier = domain_identifier
Example no. 17
 def __init__(self,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              lazy: bool = False) -> None:
     super().__init__(lazy)
     self._tokenizer = tokenizer or WordTokenizer()
     self._token_indexers = token_indexers or {
         'tokens': SingleIdTokenIndexer()
     }
Example no. 18
 def __init__(self,
              lazy: bool = False,
              tokenizer: Tokenizer = None,
              sentence_token_indexers: Dict[str, TokenIndexer] = None,
              nonterminal_indexers: Dict[str, TokenIndexer] = None,
              terminal_indexers: Dict[str, TokenIndexer] = None,
              output_agendas: bool = True) -> None:
     super().__init__(lazy)
     self._tokenizer = tokenizer or WordTokenizer()
     self._sentence_token_indexers = sentence_token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._nonterminal_indexers = nonterminal_indexers or {
         "tokens": SingleIdTokenIndexer("rule_labels")
     }
     self._terminal_indexers = terminal_indexers or {
         "tokens": SingleIdTokenIndexer("rule_labels")
     }
     self._output_agendas = output_agendas
Example no. 19
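    # Padding lengths report the number of tokens, plus the longest token's character
    # count whenever a character-level indexer is present.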
    def test_padding_lengths_are_computed_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters"),
                                          "words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}
Example no. 20
 def setUp(self):
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("this")
     self.vocab.add_token_to_namespace("is")
     self.vocab.add_token_to_namespace("a")
     self.vocab.add_token_to_namespace("sentence")
     self.vocab.add_token_to_namespace(".")
     self.token_indexer = {"tokens": SingleIdTokenIndexer()}
     self.instances = self.get_instances()
     super(TestDataset, self).setUp()
Example no. 21
    def test_count_vocab_items_respects_casing(self):
        indexer = SingleIdTokenIndexer("words")
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token("Hello"), counter)
        indexer.count_vocab_items(Token("hello"), counter)
        assert counter["words"] == {"hello": 1, "Hello": 1}

        indexer = SingleIdTokenIndexer("words", lowercase_tokens=True)
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token("Hello"), counter)
        indexer.count_vocab_items(Token("hello"), counter)
        assert counter["words"] == {"hello": 2}
Example no. 22
 def __init__(self,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              lazy: bool = False,
              num_context_answers: int = 0) -> None:
     super().__init__(lazy)
     self._tokenizer = tokenizer or WordTokenizer()
     self._token_indexers = token_indexers or {
         'tokens': SingleIdTokenIndexer()
     }
     self._num_context_answers = num_context_answers
Example no. 23
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              use_pos_tags: bool = True,
              lazy: bool = False,
              label_namespace_prefix: str = "",
              pos_label_namespace: str = "pos") -> None:
     super().__init__(lazy=lazy)
     self._token_indexers = token_indexers or {
         'tokens': SingleIdTokenIndexer()
     }
     self._use_pos_tags = use_pos_tags
     self._label_namespace_prefix = label_namespace_prefix
     self._pos_label_namespace = pos_label_namespace
Example no. 24
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", 'words')
        self.vocab.add_token_to_namespace("s", 'characters')
        self.vocab.add_token_to_namespace("e", 'characters')
        self.vocab.add_token_to_namespace("n", 'characters')
        self.vocab.add_token_to_namespace("t", 'characters')
        self.vocab.add_token_to_namespace("c", 'characters')
        for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']:
            self.vocab.add_token_to_namespace(label, 'labels')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters")
        }
        self.field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence"]],
            self.word_indexer)
        self.field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence"]],
            self.word_indexer)
        self.field3 = TextField(
            [Token(t) for t in ["this", "is", "another", "sentence"]],
            self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1],
                                                       self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        super(TestListField, self).setUp()
Example no. 25
    def __init__(self,
                 tokens_per_instance: int = None,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._tokens_per_instance = tokens_per_instance

        # No matter how you want to represent the input, we'll always represent the output as a
        # single token id.  This code lets you learn a language model that concatenates word
        # embeddings with character-level encoders, in order to predict the word token that comes
        # next.
        self._output_indexer: Dict[str, TokenIndexer] = None
        for name, indexer in self._token_indexers.items():
            if isinstance(indexer, SingleIdTokenIndexer):
                self._output_indexer = {name: indexer}
                break
        else:
            self._output_indexer = {"tokens": SingleIdTokenIndexer()}
Example no. 26
 def __init__(self,
              source_tokenizer: Tokenizer = None,
              target_tokenizer: Tokenizer = None,
              source_token_indexers: Dict[str, TokenIndexer] = None,
              target_token_indexers: Dict[str, TokenIndexer] = None,
              source_add_start_token: bool = True,
              lazy: bool = False) -> None:
     super().__init__(lazy)
     self._source_tokenizer = source_tokenizer or WordTokenizer()
     self._target_tokenizer = target_tokenizer or self._source_tokenizer
     self._source_token_indexers = source_token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._target_token_indexers = target_token_indexers or self._source_token_indexers
     self._source_add_start_token = source_add_start_token
Example no. 27
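    # A partial max_vocab_size dict caps only the listed namespace; the expected sizes
    # include the padding and OOV tokens (the "+ 2" in the comments below).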
    def test_max_vocab_size_partial_dict(self):
        indexers = {"tokens": SingleIdTokenIndexer(), "token_characters": TokenCharactersIndexer()}
        instance = Instance({
                'text': TextField([Token(w) for w in 'Abc def ghi jkl mno pqr stu vwx yz'.split(' ')], indexers)
        })
        dataset = Batch([instance])
        params = Params({
                "max_vocab_size": {
                        "tokens": 1
                }
        })

        vocab = Vocabulary.from_params(params=params, instances=dataset)
        assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3 # 1 + 2
        assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28 # 26 + 2
Example no. 28
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              use_subtrees: bool = False,
              granularity: str = "5-class",
              lazy: bool = False) -> None:
     super().__init__(lazy=lazy)
     self._token_indexers = token_indexers or {
         'tokens': SingleIdTokenIndexer()
     }
     self._use_subtrees = use_subtrees
     allowed_granularities = ["5-class", "3-class", "2-class"]
     if granularity not in allowed_granularities:
         raise ConfigurationError(
             "granularity is {}, but expected one of: {}".format(
                 granularity, allowed_granularities))
     self._granularity = granularity
Example no. 29
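    # Requesting padding lengths larger than the field itself zero-pads both the token
    # dimension and the per-token character dimension.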
    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField([Token(t) for t in ["a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 5
        padding_lengths["num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)

        numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(),
                                                numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["characters"].detach().cpu().numpy(),
                                                numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                                                             [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
Example no. 30
    def setUp(self):
        self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
        self.utterance = self.tokenizer.tokenize("where is mersin?")
        self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

        json = {
            'question': self.utterance,
            'columns': ['Name in English', 'Location in English'],
            'cells': [['Paradeniz', 'Mersin'], ['Lake Gala', 'Edirne']]
        }
        self.graph = TableQuestionKnowledgeGraph.read_from_json(json)
        self.vocab = Vocabulary()
        self.name_index = self.vocab.add_token_to_namespace("name",
                                                            namespace='tokens')
        self.in_index = self.vocab.add_token_to_namespace("in",
                                                          namespace='tokens')
        self.english_index = self.vocab.add_token_to_namespace(
            "english", namespace='tokens')
        self.location_index = self.vocab.add_token_to_namespace(
            "location", namespace='tokens')
        self.paradeniz_index = self.vocab.add_token_to_namespace(
            "paradeniz", namespace='tokens')
        self.mersin_index = self.vocab.add_token_to_namespace(
            "mersin", namespace='tokens')
        self.lake_index = self.vocab.add_token_to_namespace("lake",
                                                            namespace='tokens')
        self.gala_index = self.vocab.add_token_to_namespace("gala",
                                                            namespace='tokens')
        self.negative_one_index = self.vocab.add_token_to_namespace(
            "-1", namespace='tokens')
        self.zero_index = self.vocab.add_token_to_namespace("0",
                                                            namespace='tokens')
        self.one_index = self.vocab.add_token_to_namespace("1",
                                                           namespace='tokens')

        self.oov_index = self.vocab.get_token_index('random OOV string',
                                                    namespace='tokens')
        self.edirne_index = self.oov_index
        self.field = KnowledgeGraphField(self.graph, self.utterance,
                                         self.token_indexers, self.tokenizer)

        super(KnowledgeGraphFieldTest, self).setUp()