Example #1
def get_sentencepiece_tokenized_text(
    text: str, tokenizer: tokenization.FullTokenizer) -> TokenizedText:
  """Gets SentencePiece TokenizedText for a text with indices mapping."""
  tokens = [six.ensure_text(tk, "utf-8") for tk in tokenizer.tokenize(text)]
  token_ids = tokenizer.convert_tokens_to_ids(tokens)
  chars_to_tokens = []
  # Map each character of the detokenized text back to the index of the
  # token it came from. The first token's leading "▁" boundary marker does
  # not produce a character in the detokenized text, so one character is
  # subtracted for it.
  for i, token in enumerate(tokens):
    num_chars = len(token)
    if i == 0:
      num_chars -= 1
    chars_to_tokens.extend([i] * num_chars)
  tokenized_text = TokenizedText()
  tokenized_text.text = sentencepiece_detokenize(tokens)
  tokenized_text.tokens = tokens
  tokenized_text.token_ids = token_ids
  tokenized_text.chars_to_tokens = chars_to_tokens
  return tokenized_text
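
A minimal usage sketch for the function above, not part of the original example. The vocabulary path is a placeholder, and the `spm_model_file` keyword assumes an ALBERT-style `tokenization.FullTokenizer` that wraps a SentencePiece model; the exact constructor arguments depend on which tokenization module this code actually imports.

# Hypothetical usage sketch; constructor arguments and path are assumptions.
tokenizer = tokenization.FullTokenizer(
    vocab_file=None,
    do_lower_case=True,
    spm_model_file="/path/to/sentencepiece.model")  # placeholder path

tokenized = get_sentencepiece_tokenized_text("hello world", tokenizer)
print(tokenized.tokens)      # SentencePiece pieces, e.g. ['▁hello', '▁world']
print(tokenized.token_ids)   # vocabulary ids for those pieces
# chars_to_tokens[j] is the index of the token that produced character j of
# tokenized.text (the detokenized form of the pieces).
print(tokenized.chars_to_tokens)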
Example #2
def get_wordpiece_tokenized_text(
    text: str, tokenizer: tokenization.FullTokenizer) -> TokenizedText:
  """Gets WordPiece TokenizedText for a text with indices mapping."""
  unigrams, _, chars_to_unigrams = whitespace_split_with_indices(text)
  tokens, unigrams_to_tokens, tokens_to_unigrams = (
      wordpiece_tokenize_with_indices(unigrams, tokenizer))
  token_ids = tokenizer.convert_tokens_to_ids(tokens)
  tokenized_text = TokenizedText()
  tokenized_text.text = text
  tokenized_text.tokens = tokens
  tokenized_text.token_ids = token_ids
  tokenized_text.unigrams = unigrams
  tokenized_text.chars_to_unigrams = chars_to_unigrams
  tokenized_text.unigrams_to_tokens = unigrams_to_tokens
  tokenized_text.tokens_to_unigrams = tokens_to_unigrams
  return tokenized_text
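
A corresponding usage sketch for the WordPiece variant, again not part of the original example. It assumes a BERT-style WordPiece vocabulary file (the path is a placeholder); the comments on the mapping fields follow the field names, since `whitespace_split_with_indices` and `wordpiece_tokenize_with_indices` are defined elsewhere in the module and their exact return structure is not shown here.

# Hypothetical usage sketch; the vocab path is a placeholder and
# FullTokenizer is assumed to be the BERT-style WordPiece tokenizer.
tokenizer = tokenization.FullTokenizer(
    vocab_file="/path/to/vocab.txt", do_lower_case=True)

tokenized = get_wordpiece_tokenized_text("New York City", tokenizer)
print(tokenized.unigrams)   # whitespace-separated words of the input text
print(tokenized.tokens)     # WordPiece sub-tokens of those words
# The *_to_* fields align the three granularities (characters, unigrams,
# WordPiece tokens); their exact structure comes from the helper functions
# used above, which are defined outside this example.
print(tokenized.tokens_to_unigrams)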