def build_tokenizer_by_config(self, tok_config, lang):
    """Build a tokenizer for *lang* from *tok_config*, filling in defaults.

    When no configuration is given, aggressive tokenization is used, with
    Han-script segmentation enabled for Chinese.
    """
    if tok_config is None:
        tok_config = {"mode": "aggressive"}
        if lang == "zh":
            tok_config.update(
                segment_alphabet=["Han"],
                segment_alphabet_change=True,
            )
    # to avoid SentencePiece sampling
    if "sp_nbest_size" in tok_config:
        tok_config["sp_nbest_size"] = 0
    return tokenizer.build_tokenizer(tok_config)
def build_tokenizer_by_config(self, tok_config, lang):
    """Return a tokenizer built from *tok_config*, with defaults when None."""
    if tok_config is None:
        # Fall back to aggressive tokenization; for Chinese also segment
        # on Han characters.
        tok_config = {'mode': 'aggressive'}
        if lang == 'zh':
            tok_config['segment_alphabet'] = ['Han']
            tok_config['segment_alphabet_change'] = True
    # to avoid SentencePiece sampling
    has_nbest = 'sp_nbest_size' in tok_config
    if has_nbest:
        tok_config['sp_nbest_size'] = 0
    return tokenizer.build_tokenizer(tok_config)
def _build_subword_learner(tok_config, result_dir, ref_tok_config=None):
    """Create a subword learner from tok_config["build_subword"].

    Returns an empty dict when the configuration requests no subword model.
    The learner tokenizes its input with *ref_tok_config* (defaults to
    *tok_config* itself) and writes its artifacts under *result_dir*.
    """
    learner_config = tok_config.get("build_subword")
    if learner_config is None:
        return {}
    base_config = ref_tok_config if ref_tok_config is not None else tok_config
    return tokenizer.make_subword_learner(
        learner_config,
        result_dir,
        tokenizer=tokenizer.build_tokenizer(base_config),
    )
def _build_process(self, config, side, build_state):
    """Build the tokenizer for one side ("source" or "target").

    Also records the new tokenizer in *build_state* and, during
    postprocessing, returns the tokenizer that was previously registered
    for this side (the one the tokens were produced with) instead of the
    newly built one.
    """
    # Disable subword regularization in inference.
    if self.process_type != prepoperator.ProcessType.TRAINING:
        config["bpe_dropout"] = 0
        config["sp_nbest_size"] = 0
        config["sp_alpha"] = 0
    if config.get("restrict_subword_vocabulary", False):
        vocabulary_path = build_state.get(
            "src_vocabulary" if side == "source" else "tgt_vocabulary")
        if vocabulary_path is None:
            raise ValueError(
                "restrict_subword_vocabulary is set but no vocabulary is set"
            )
        # The open source Tokenizer does not accept the custom vocabulary format
        # produced by build_vocab so we create a temporary vocabulary with a simpler
        # format.
        with tempfile.NamedTemporaryFile(mode="w") as vocab_file:
            for token in tokenizer.load_vocabulary(vocabulary_path):
                vocab_file.write("%s\n" % token)
            vocab_file.flush()
            config["vocabulary_path"] = vocab_file.name
            # Build inside the `with` block: the temporary file is deleted
            # on exit, so the tokenizer must read it before then.
            current_tokenizer = tokenizer.build_tokenizer(config)
    else:
        current_tokenizer = tokenizer.build_tokenizer(config)
    # Swap the per-side tokenizer in build_state, remembering the one it
    # replaces so postprocessing can detokenize with the original.
    previous_tokenizer = None
    if build_state:
        if side == "source":
            previous_tokenizer = build_state["src_tokenizer"]
            build_state["src_tokenizer"] = current_tokenizer
        else:
            previous_tokenizer = build_state["tgt_tokenizer"]
            build_state["tgt_tokenizer"] = current_tokenizer
    # NOTE(review): when _postprocess_only is set there is presumably no
    # earlier tokenizer to restore, hence the extra condition — confirm
    # against the operator's construction sites.
    if (self.process_type == prepoperator.ProcessType.POSTPROCESS
            and not self._postprocess_only):
        return previous_tokenizer
    return current_tokenizer