def testOpenNMTTokenizerAssets(self):
    """Checks asset export: generated files get the asset prefix while
    existing resources keep their original name, and the exported
    configuration references resources by relative path only."""
    asset_dir = self.get_temp_dir()
    # Write a dummy BPE model.
    bpe_model_path = os.path.join(asset_dir, "model.bpe")
    with open(bpe_model_path, "wb") as bpe_model_file:
        bpe_model_file.write(b"#version: 0.2\ne s</w>\n")

    tokenizer = OpenNMTTokenizer(mode="conservative", bpe_model_path=bpe_model_path)

    # Generated assets are prefixed but not existing resources.
    assets = tokenizer.export_assets(asset_dir, asset_prefix="source_")
    self.assertIn("source_tokenizer_config.yml", assets)
    self.assertTrue(os.path.exists(assets["source_tokenizer_config.yml"]))
    self.assertIn("model.bpe", assets)
    self.assertTrue(os.path.exists(assets["model.bpe"]))

    # The tokenization configuration should not contain absolute paths to resources.
    with open(assets["source_tokenizer_config.yml"], "rb") as config_file:
        # Fix: safe_load replaces UnsafeLoader — the exported config holds
        # only plain scalars, and UnsafeLoader permits arbitrary object
        # construction from the YAML stream.
        asset_config = yaml.safe_load(config_file.read())
    self.assertDictEqual(asset_config, {
        "mode": "conservative",
        "bpe_model_path": "model.bpe"
    })
def testOpenNMTTokenizer(self):
    """Basic round-trip with default tokenization options."""
    tokenizer = OpenNMTTokenizer()
    self._testTokenizer(tokenizer, "Hello world!", ["Hello", "world", "!"])
    token_batches = [["Hello", "world", "■!"], ["Test"], ["My", "name"]]
    expected_sentences = ["Hello world!", "Test", "My name"]
    self._testDetokenizer(OpenNMTTokenizer(), token_batches, expected_sentences)
def testOpenNMTTokenizerAssets(self):
    """Checks asset export via initialize(): no assets are produced without
    an asset directory, generated files get the asset prefix while existing
    resources keep their name, and the exported configuration references
    resources by relative path only."""
    asset_dir = self.get_temp_dir()
    # Write a dummy SentencePiece model.
    sp_model_path = os.path.join(asset_dir, "model.sp")
    with open(sp_model_path, "wb") as sp_model_file:
        sp_model_file.write(b"some model data\n")

    tokenizer = OpenNMTTokenizer(params={
        "mode": "none",
        "sp_model_path": sp_model_path
    })

    # By default, no assets are returned.
    assets = tokenizer.initialize({})
    self.assertDictEqual(assets, {})

    # Generated assets are prefixed but not existing resources.
    assets = tokenizer.initialize({}, asset_dir=asset_dir, asset_prefix="source_")
    self.assertIn("source_tokenizer_config.yml", assets)
    self.assertTrue(os.path.exists(assets["source_tokenizer_config.yml"]))
    self.assertIn("model.sp", assets)
    self.assertTrue(os.path.exists(assets["model.sp"]))

    # The tokenization configuration should not contain absolute paths to resources.
    with open(assets["source_tokenizer_config.yml"], "rb") as config_file:
        # Fix: yaml.load() without an explicit Loader is deprecated since
        # PyYAML 5.1 and a TypeError in PyYAML 6; safe_load is sufficient
        # because the exported config holds only plain scalars.
        asset_config = yaml.safe_load(config_file.read())
    self.assertDictEqual(asset_config, {
        "mode": "none",
        "sp_model_path": "model.sp"
    })
def testOpenNMTTokenizer(self):
    """Tokenization with default options and with options loaded from a
    YAML configuration file, plus a detokenization round-trip."""
    self._testTokenizer(OpenNMTTokenizer(), "Hello world!", ["Hello", "world", "!"])

    # Dump an aggressive tokenization configuration to a temporary file.
    config_path = os.path.join(self.get_temp_dir(), "tok_config.yml")
    config_content = (
        b"mode: aggressive\n"
        b"spacer_annotate: true\n"
        b"spacer_new: true\n")
    with open(config_path, "wb") as config_file:
        config_file.write(config_content)
    self._testTokenizer(
        OpenNMTTokenizer(configuration_file_or_key=config_path),
        "Hello World-s",
        ["Hello", "▁", "World", "-", "s"])

    self._testDetokenizer(
        OpenNMTTokenizer(),
        [["Hello", "world", "■!"], ["Test"], ["My", "name"]],
        ["Hello world!", "Test", "My name"])
def testOpenNMTTokenizerArguments(self):
    """Tokenization options passed directly as keyword arguments."""
    tok = OpenNMTTokenizer(mode="aggressive", spacer_annotate=True, spacer_new=True)
    expected_tokens = ["Hello", "▁", "World", "-", "s"]
    self._testTokenizer(tok, "Hello World-s", expected_tokens)
def testOpenNMTTokenizerFromConfiguration(self):
    """The same tokenization options can be supplied in several ways: as a
    YAML file path, as an asset key resolving to a file path, as an asset
    key resolving to a dict, or directly via the params argument."""
    options = {
        "mode": "aggressive",
        "spacer_annotate": True,
        "spacer_new": True
    }
    config_path = os.path.join(self.get_temp_dir(), "tok_config.yml")
    with open(config_path, "w") as config_file:
        yaml.dump(options, config_file)

    def check(tokenizer):
        # All variants must produce the same aggressive tokenization.
        self._testTokenizer(
            tokenizer, "Hello World-s", ["Hello", "▁", "World", "-", "s"])

    # Configuration given directly as a file path.
    check(OpenNMTTokenizer(configuration_file_or_key=config_path))

    # Configuration key resolved to a file path at initialization.
    tokenizer = OpenNMTTokenizer(configuration_file_or_key="source_tokenization")
    tokenizer.initialize({"source_tokenization": config_path})
    check(tokenizer)

    # Configuration key resolved to a dict of options.
    tokenizer = OpenNMTTokenizer(configuration_file_or_key="source_tokenization")
    tokenizer.initialize({"source_tokenization": options})
    check(tokenizer)

    # Options passed via the params argument.
    check(OpenNMTTokenizer(params=options))