def testOpenNMTTokenizer(self):
    self._testTokenizer(
        tokenizers.OpenNMTTokenizer(),
        ["Hello world!", "How are you?"],
        [["Hello", "world", "!"], ["How", "are", "you", "?"]])
    self._testDetokenizer(
        tokenizers.OpenNMTTokenizer(),
        [["Hello", "world", "■!"], ["Test"], ["My", "name"]],
        ["Hello world!", "Test", "My name"])
def testOpenNMTTokenizerArguments(self):
    with self.assertRaises(ValueError):
        tokenizers.OpenNMTTokenizer(case_feature=True)
    tokenizer = tokenizers.OpenNMTTokenizer(
        mode="aggressive", spacer_annotate=True, spacer_new=True, case_feature=False)
    self._testTokenizer(tokenizer, ["Hello World-s"], [["Hello", "▁", "World", "-", "s"]])
def testOpenNMTTokenizerAssets(self):
    asset_dir = self.get_temp_dir()

    # Write a dummy BPE model.
    bpe_model_path = os.path.join(asset_dir, "model.bpe")
    with open(bpe_model_path, "w") as bpe_model_file:
        bpe_model_file.write("#version: 0.2\ne s</w>\n")

    tokenizer = tokenizers.OpenNMTTokenizer(
        mode="conservative", bpe_model_path=bpe_model_path)

    # Generated assets are prefixed but not existing resources.
    assets = tokenizer.export_assets(asset_dir, asset_prefix="source_")
    self.assertIn("source_tokenizer_config.yml", assets)
    self.assertTrue(os.path.exists(assets["source_tokenizer_config.yml"]))
    self.assertIn("model.bpe", assets)
    self.assertTrue(os.path.exists(assets["model.bpe"]))

    # The tokenization configuration should not contain absolute paths to resources.
    with open(assets["source_tokenizer_config.yml"], "rb") as config_file:
        asset_config = yaml.load(config_file.read(), Loader=yaml.UnsafeLoader)
    self.assertDictEqual(
        asset_config, {"mode": "conservative", "bpe_model_path": "model.bpe"})
def testOpenNMTTokenizerEmptyTensor(self):
    tokenizer = tokenizers.OpenNMTTokenizer()
    tokens = tokenizer.tokenize(tf.constant(""))
    self.assertIs(tokens.dtype, tf.string)
    self.assertListEqual(tokens.shape.as_list(), [0])
    text = tokenizer.detokenize(tokens)
    self.assertIs(text.dtype, tf.string)
    self.assertListEqual(text.shape.as_list(), [])
def testOpenNMTTokenizerInferenceMode(self):
    # "sp_model" is a SentencePiece model path defined elsewhere in the test module.
    tokenizer = tokenizers.OpenNMTTokenizer(
        mode="none",
        sp_model_path=sp_model,
        sp_nbest_size=64,
        sp_alpha=0.1,
    )
    self._testTokenizer(tokenizer, ["appealing"], [["▁appealing"]], training=False)
def testOpenNMTTokenizerInFunction(self):
    tokenizer = tokenizers.OpenNMTTokenizer()

    @tf.function
    def _tokenize(text):
        return tokenizer.tokenize(text)

    tokens = _tokenize(tf.constant("Hello world!"))
    self.assertAllEqual(self.evaluate(tokens), [b"Hello", b"world", b"!"])
def initialize(self, metadata, asset_dir=None, asset_prefix=""):
    self.vocabulary_file = metadata[self.vocabulary_file_key]
    self.vocabulary_size = count_lines(self.vocabulary_file) + self.num_oov_buckets
    if self.tokenizer is None:
        tokenizer_config = _get_field(metadata, "tokenization", prefix=asset_prefix)
        if tokenizer_config:
            # The configuration is either an inline parameter dict or a path to a YAML file.
            if (isinstance(tokenizer_config, six.string_types)
                    and compat.gfile_exists(tokenizer_config)):
                with compat.gfile_open(tokenizer_config, mode="rb") as config_file:
                    tokenizer_config = yaml.load(config_file, Loader=yaml.UnsafeLoader)
            self.tokenizer = tokenizers.OpenNMTTokenizer(params=tokenizer_config)
        else:
            self.tokenizer = tokenizers.SpaceTokenizer()
    self.tokenizer.initialize(metadata)
    return super(TextInputter, self).initialize(
        metadata, asset_dir=asset_dir, asset_prefix=asset_prefix)
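# A hypothetical illustration (not from the original file) of a metadata entry that
# initialize() above can consume, assuming an asset prefix of "source_": the
# "tokenization" field is either an inline OpenNMTTokenizer parameter dict, as shown
# here, or a path to a YAML file with the same keys as the exported
# tokenizer_config.yml. The key and file names below are placeholders.
_example_source_metadata = {
    "source_tokenization": {
        "mode": "conservative",
        "bpe_model_path": "model.bpe",
    },
}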
def main():
    tf.get_logger().setLevel("INFO")
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("data", nargs="*", help="List of data files.")
    parser.add_argument(
        "--from_vocab",
        default=None,
        help="Build from a saved vocabulary (see also --from_format).",
    )
    parser.add_argument(
        "--from_format",
        default="default",
        choices=["default", "sentencepiece"],
        help="The format of the saved vocabulary (see also --from_vocab).",
    )
    parser.add_argument(
        "--save_vocab", required=True, help="Output vocabulary file.")
    parser.add_argument(
        "--min_frequency", type=int, default=1, help="Minimum word frequency.")
    parser.add_argument(
        "--size",
        type=int,
        default=0,
        help="Maximum vocabulary size. If = 0, do not limit vocabulary.",
    )
    parser.add_argument(
        "--size_multiple",
        type=int,
        default=1,
        help=(
            "Ensure that the vocabulary size + 1 is a multiple of this value "
            "(+ 1 represents the <unk> token that will be added during training)."
        ),
    )
    parser.add_argument(
        "--without_sequence_tokens",
        default=False,
        action="store_true",
        help="If set, do not add special sequence tokens (start, end) in the vocabulary.",
    )
    parser.add_argument(
        "--tokenizer_config",
        default=None,
        help=(
            "Tokenization configuration as a JSON string or a path to a YAML configuration "
            "file. When building a SentencePiece model and vocabulary, this is used as a "
            "pre-tokenization. SentencePiece will receive tokens instead of sentences as "
            "inputs."
        ),
    )
    parser.add_argument(
        "--sentencepiece",
        nargs="*",
        default=None,
        help=(
            "Build a SentencePiece model and vocabulary. This option accepts additional "
            "training parameters (e.g. --sentencepiece character_coverage=0.98)."
        ),
    )
    args = parser.parse_args()

    special_tokens = [constants.PADDING_TOKEN]
    if not args.without_sequence_tokens:
        special_tokens.append(constants.START_OF_SENTENCE_TOKEN)
        special_tokens.append(constants.END_OF_SENTENCE_TOKEN)

    vocab = data.Vocab(special_tokens=special_tokens)
    num_oov_buckets = 1

    if args.sentencepiece is not None:
        if args.min_frequency > 1:
            raise ValueError(
                "--min_frequency option is not supported when training a SentencePiece "
                "model and vocabulary")
        import pyonmttok

        if args.size_multiple == 1:
            vocab_size = args.size
        else:
            # Round the vocabulary size to the next multiple of args.size_multiple.
            vocab_size = (
                args.size
                - (args.size + num_oov_buckets) % args.size_multiple
                + args.size_multiple)
        if args.tokenizer_config:
            tokenizer = tokenizers.make_tokenizer(args.tokenizer_config)
            if not isinstance(tokenizer, tokenizers.OpenNMTTokenizer):
                tokenizer_type = tokenizer.__class__.__name__
                raise ValueError(
                    "Only tokenizer type 'OpenNMTTokenizer' can be used as a SentencePiece "
                    "pre-tokenization, got tokenizer type '%s' instead." % tokenizer_type)
        else:
            tokenizer = None
        sp_params = dict(map(lambda arg: tuple(arg.split("=")), args.sentencepiece))
        sp_trainer = pyonmttok.SentencePieceLearner(
            tokenizer=tokenizer.opennmt_tokenizer if tokenizer is not None else None,
            keep_vocab=True,
            vocab_size=vocab_size,
            **sp_params,
        )
        for data_file in args.data:
            sp_trainer.ingest_file(data_file)
        sp_trainer.learn(args.save_vocab, verbose=True)
        model_path = args.save_vocab + ".model"
        vocab_path = args.save_vocab + ".vocab"
        if tokenizer is None:
            tf.get_logger().info(
                "Converting SentencePiece vocabulary to OpenNMT-tf format...")
            vocab.load(vocab_path, file_format="sentencepiece")
        else:
            tf.get_logger().info(
                "Applying SentencePiece model on data and extracting the %d most "
                "frequent tokens...",
                vocab_size,
            )
            tokenizer = tokenizers.OpenNMTTokenizer(
                sp_model_path=model_path, **tokenizer.config)
            for data_file in args.data:
                vocab.add_from_text(data_file, tokenizer=tokenizer)
            vocab = vocab.prune(max_size=vocab_size)
        vocab.serialize(vocab_path)
    else:
        if args.from_vocab is not None:
            vocab.load(args.from_vocab, file_format=args.from_format)
        tokenizer = tokenizers.make_tokenizer(args.tokenizer_config)
        for data_file in args.data:
            vocab.add_from_text(data_file, tokenizer=tokenizer)
        vocab = vocab.prune(max_size=args.size, min_frequency=args.min_frequency)
        vocab.pad_to_multiple(args.size_multiple, num_oov_buckets=num_oov_buckets)
        vocab.serialize(args.save_vocab)
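# A minimal sketch (not part of the original script) of the plain, non-SentencePiece
# path implemented in main() above, assuming the same module-level imports and using
# placeholder file names and sizes; it mirrors the add_from_text/prune/pad_to_multiple/
# serialize sequence without the argparse layer.
def _example_build_vocab():
    vocab = data.Vocab(special_tokens=[
        constants.PADDING_TOKEN,
        constants.START_OF_SENTENCE_TOKEN,
        constants.END_OF_SENTENCE_TOKEN,
    ])
    # "train.txt" is a placeholder corpus; tokenize with simple whitespace splitting.
    vocab.add_from_text("train.txt", tokenizer=tokenizers.SpaceTokenizer())
    # Keep the 50k most frequent tokens seen at least twice.
    vocab = vocab.prune(max_size=50000, min_frequency=2)
    # Reserve one OOV bucket, as main() does, and pad the size to a multiple of 8.
    vocab.pad_to_multiple(8, num_oov_buckets=1)
    vocab.serialize("vocab.txt")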