def testSpaceTokenizer(self):
    """Verify that SpaceTokenizer splits on spaces and joins with spaces."""
    # Tokenization: each sentence is split into its space-separated tokens.
    sentences = ["Hello world !", "How are you ?", "Good !"]
    expected_tokens = [
        ["Hello", "world", "!"],
        ["How", "are", "you", "?"],
        ["Good", "!"],
    ]
    self._testTokenizer(tokenizers.SpaceTokenizer(), sentences, expected_tokens)
    # Detokenization: token lists are joined back with single spaces.
    token_lists = [["Hello", "world", "!"], ["Test"], ["My", "name"]]
    expected_sentences = ["Hello world !", "Test", "My name"]
    self._testDetokenizer(tokenizers.SpaceTokenizer(), token_lists, expected_sentences)
def initialize(self, metadata, asset_dir=None, asset_prefix=""):
    """Initializes the inputter from the run metadata.

    Reads the vocabulary file path from ``metadata``, computes the vocabulary
    size (including OOV buckets), and builds the tokenizer if one was not
    already set: an ``OpenNMTTokenizer`` when a "tokenization" configuration
    is found in the metadata (either an inline config or a path to a YAML
    file), otherwise a plain ``SpaceTokenizer``.

    Args:
      metadata: A dictionary containing the run resources; must provide the
        key ``self.vocabulary_file_key``.
      asset_dir: Optional directory forwarded to the parent implementation.
      asset_prefix: Prefix used to look up the "tokenization" field and
        forwarded to the parent implementation.

    Returns:
      Whatever the parent class ``initialize`` returns.
    """
    self.vocabulary_file = metadata[self.vocabulary_file_key]
    # Vocabulary size includes the extra OOV buckets.
    self.vocabulary_size = count_lines(self.vocabulary_file) + self.num_oov_buckets
    if self.tokenizer is None:
        tokenizer_config = _get_field(metadata, "tokenization", prefix=asset_prefix)
        if tokenizer_config:
            if (isinstance(tokenizer_config, six.string_types)
                    and compat.gfile_exists(tokenizer_config)):
                # The config is a path to a YAML file: load it.
                with compat.gfile_open(tokenizer_config, mode="rb") as config_file:
                    # safe_load: yaml.load without an explicit Loader is
                    # deprecated (PyYAML >= 5.1) and can execute arbitrary
                    # Python object constructors from untrusted files.
                    tokenizer_config = yaml.safe_load(config_file)
            self.tokenizer = tokenizers.OpenNMTTokenizer(params=tokenizer_config)
        else:
            # No tokenization configured: default to splitting on spaces.
            self.tokenizer = tokenizers.SpaceTokenizer()
    self.tokenizer.initialize(metadata)
    return super(TextInputter, self).initialize(
        metadata, asset_dir=asset_dir, asset_prefix=asset_prefix)