def build_tokenizer_by_config(self, tok_config, lang):
    """Build a tokenizer for *lang* from *tok_config*, filling in defaults.

    When no configuration is given, aggressive tokenization is used, with
    Han-script segmentation enabled for Chinese.
    """
    if tok_config is None:
        tok_config = {"mode": "aggressive"}
        if lang == "zh":
            tok_config.update(
                segment_alphabet=["Han"],
                segment_alphabet_change=True,
            )
    # to avoid SentencePiece sampling
    if "sp_nbest_size" in tok_config:
        tok_config["sp_nbest_size"] = 0
    return tokenizer.build_tokenizer(tok_config)
def build_tokenizer_by_config(self, tok_config, lang):
    """Return a tokenizer built from *tok_config*, with defaults when None."""
    if tok_config is None:
        # Fall back to aggressive tokenization; for Chinese also segment
        # on Han characters.
        tok_config = {'mode': 'aggressive'}
        if lang == 'zh':
            tok_config['segment_alphabet'] = ['Han']
            tok_config['segment_alphabet_change'] = True
    # to avoid SentencePiece sampling
    has_nbest = 'sp_nbest_size' in tok_config
    if has_nbest:
        tok_config['sp_nbest_size'] = 0
    return tokenizer.build_tokenizer(tok_config)
def _build_subword_learner(tok_config, result_dir, ref_tok_config=None):
    """Create a subword learner from tok_config["build_subword"].

    Returns an empty dict when the configuration requests no subword model.
    The learner tokenizes its input with *ref_tok_config* (defaults to
    *tok_config* itself) and writes its artifacts under *result_dir*.
    """
    learner_config = tok_config.get("build_subword")
    if learner_config is None:
        return {}
    base_config = ref_tok_config if ref_tok_config is not None else tok_config
    return tokenizer.make_subword_learner(
        learner_config,
        result_dir,
        tokenizer=tokenizer.build_tokenizer(base_config),
    )
def _build_process(self, config, side, build_state):
    """Build the tokenizer for one side ("source" or "target").

    Also records the new tokenizer in *build_state* and, during
    postprocessing, returns the tokenizer that was previously registered
    for this side (the one the tokens were produced with) instead of the
    newly built one.
    """
    # Disable subword regularization in inference.
    if self.process_type != prepoperator.ProcessType.TRAINING:
        config["bpe_dropout"] = 0
        config["sp_nbest_size"] = 0
        config["sp_alpha"] = 0
    if config.get("restrict_subword_vocabulary", False):
        vocabulary_path = build_state.get(
            "src_vocabulary" if side == "source" else "tgt_vocabulary")
        if vocabulary_path is None:
            raise ValueError(
                "restrict_subword_vocabulary is set but no vocabulary is set"
            )
        # The open source Tokenizer does not accept the custom vocabulary format
        # produced by build_vocab so we create a temporary vocabulary with a simpler
        # format.
        with tempfile.NamedTemporaryFile(mode="w") as vocab_file:
            for token in tokenizer.load_vocabulary(vocabulary_path):
                vocab_file.write("%s\n" % token)
            vocab_file.flush()
            config["vocabulary_path"] = vocab_file.name
            # Build inside the `with` block: the temporary file is deleted
            # on exit, so the tokenizer must read it before then.
            current_tokenizer = tokenizer.build_tokenizer(config)
    else:
        current_tokenizer = tokenizer.build_tokenizer(config)
    # Swap the per-side tokenizer in build_state, remembering the one it
    # replaces so postprocessing can detokenize with the original.
    previous_tokenizer = None
    if build_state:
        if side == "source":
            previous_tokenizer = build_state["src_tokenizer"]
            build_state["src_tokenizer"] = current_tokenizer
        else:
            previous_tokenizer = build_state["tgt_tokenizer"]
            build_state["tgt_tokenizer"] = current_tokenizer
    # NOTE(review): when _postprocess_only is set there is presumably no
    # earlier tokenizer to restore, hence the extra condition — confirm
    # against the operator's construction sites.
    if (self.process_type == prepoperator.ProcessType.POSTPROCESS
            and not self._postprocess_only):
        return previous_tokenizer
    return current_tokenizer