Example #1
import logging
from pathlib import Path
from typing import List

import sentencepiece as spm
from sentencepiece import SentencePieceProcessor

# `const`, `cached_path`, `Token`, and `Tokenizer` are assumed to be provided by the
# surrounding project: `const` holds the TRAINING_DATA filename constant, `cached_path`
# resolves/caches model files, and `Token`/`Tokenizer` are the tokenizer base classes.


def _log_sample_data(model_dir: str, sp: spm.SentencePieceProcessor):
    """Log how the trained model segments the first line of the training data."""
    training_data_path = Path(model_dir) / const.TRAINING_DATA
    if not training_data_path.is_file():
        logging.info("Training data not found for SP sampling")
        return

    with open(training_data_path) as fin:
        sample = fin.readline().strip()

    logging.info(f"Tokenizer model vocabulary size: {len(sp)} tokens")
    logging.info(
        "Mapping first line of training data\n\n{}\n ---- sample tokens mapped to pieces ---- > \n{}\n"
        .format(repr(sample),
                ", ".join(sp.SampleEncodeAsPieces(sample, -1, 0.1))))
    logging.info(
        "Mapping first line of training data\n\n{}\n ---- sample tokens mapped to int ---- > \n{}\n"
        .format(repr(sample),
                ", ".join([str(idx) for idx in sp.EncodeAsIds(sample)])))


class SubwordTokenizer(Tokenizer):
    """Tokenizer that splits text into SentencePiece subword pieces."""

    def __init__(self,
                 model_path: str = None,
                 nbest_size: int = None,
                 alpha: float = None):
        # Resolve the SentencePiece model file via the project's cached_path helper,
        # then load it into a processor.
        self._model_path = cached_path(model_path)
        self._processor = SentencePieceProcessor()
        self._processor.Load(self._model_path)
        # When both are set, tokenize() samples segmentations (subword regularization)
        # instead of always returning the single best split.
        self._nbest_size = nbest_size
        self._alpha = alpha

    def tokenize(self, text: str) -> List[Token]:
        # With sampling parameters set, draw a segmentation from the n-best lattice;
        # otherwise use the deterministic best segmentation.
        if self._nbest_size and self._alpha:
            subwords = self._processor.SampleEncodeAsPieces(text, self._nbest_size, self._alpha)
        else:
            subwords = self._processor.EncodeAsPieces(text)
        return [Token(s) for s in subwords]

    def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
        return [self.tokenize(text) for text in texts]
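
A usage sketch for the class above, again assuming a trained model at the hypothetical path m.model: constructed without nbest_size and alpha, the tokenizer segments deterministically via EncodeAsPieces; passing both makes tokenize() sample a new segmentation on each call, which is the usual setup for subword regularization during training.

# Deterministic segmentation: no sampling parameters supplied.
tokenizer = SubwordTokenizer(model_path="m.model")
print(tokenizer.tokenize("the quick brown fox"))

# Sampled segmentation: nbest_size=-1 uses the full lattice, alpha=0.1 smooths it.
sampling_tokenizer = SubwordTokenizer(model_path="m.model", nbest_size=-1, alpha=0.1)
print(sampling_tokenizer.batch_tokenize(["the quick brown fox"])[0])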