def testSpaceTokenizer(self):
    """Checks that SpaceTokenizer splits on and joins with single spaces."""
    texts = ["Hello world !", "How are you ?", "Good !"]
    expected_tokens = [
        ["Hello", "world", "!"],
        ["How", "are", "you", "?"],
        ["Good", "!"],
    ]
    self._testTokenizer(tokenizers.SpaceTokenizer(), texts, expected_tokens)
    detok_tokens = [["Hello", "world", "!"], ["Test"], ["My", "name"]]
    detok_texts = ["Hello world !", "Test", "My name"]
    self._testDetokenizer(tokenizers.SpaceTokenizer(), detok_tokens, detok_texts)
Example #2
0
 def initialize(self, metadata, asset_dir=None, asset_prefix=""):
   """Initializes the inputter's vocabulary and tokenizer from run metadata.

   Args:
     metadata: A dictionary of run metadata; must contain the key stored in
       ``self.vocabulary_file_key`` and may contain a "tokenization" entry
       (looked up with ``asset_prefix``).
     asset_dir: Forwarded to the parent ``initialize``.
     asset_prefix: Prefix used when resolving prefixed metadata fields;
       also forwarded to the parent ``initialize``.

   Returns:
     The value returned by the parent class ``initialize``.
   """
   self.vocabulary_file = metadata[self.vocabulary_file_key]
   # Reserve extra ids at the end of the vocabulary for OOV buckets.
   self.vocabulary_size = count_lines(self.vocabulary_file) + self.num_oov_buckets
   if self.tokenizer is None:
     tokenizer_config = _get_field(metadata, "tokenization", prefix=asset_prefix)
     if tokenizer_config:
       # The configuration can either be inline or a path to a YAML file.
       if isinstance(tokenizer_config, six.string_types) and compat.gfile_exists(tokenizer_config):
         with compat.gfile_open(tokenizer_config, mode="rb") as config_file:
           # safe_load: the config is plain data; yaml.load without an
           # explicit Loader can construct arbitrary Python objects and is
           # deprecated since PyYAML 5.1.
           tokenizer_config = yaml.safe_load(config_file)
       self.tokenizer = tokenizers.OpenNMTTokenizer(params=tokenizer_config)
     else:
       # No tokenization configured: fall back to whitespace tokenization.
       self.tokenizer = tokenizers.SpaceTokenizer()
   self.tokenizer.initialize(metadata)
   return super(TextInputter, self).initialize(
       metadata, asset_dir=asset_dir, asset_prefix=asset_prefix)