Example #1
0
 def get_ptune_query(
     self,
     content: Dict,
     prompt_token_id: int,
     max_seq_len: int,
     templates: List[int],
     tokenizer: TokenizerSpec,
 ):
     """Build a p-tuning query: runs of virtual prompt tokens interleaved
     with the tokenized paragraph, question, and answer-prefix text.

     Args:
         content: dict providing the 'sentence' (paragraph) and 'question' text.
         prompt_token_id: token id used for every virtual prompt position.
         max_seq_len: maximum allowed length of the returned sequence.
         templates: virtual-token counts placed before each of the three
             text segments; templates[0..2] are used.
         tokenizer: tokenizer providing text_to_ids().

     Returns:
         A flat list of token ids no longer than max_seq_len; only the
         paragraph segment is truncated (from the front) to fit.

     Raises:
         ValueError: if truncating the whole paragraph is still not enough
             to fit under max_seq_len.
     """
     sentence_a = f" Paragraph: {content['sentence']}"
     sentence_b = f" Question: {content['question']}?"
     a_input_token_ids = tokenizer.text_to_ids(sentence_a)
     b_input_token_ids = tokenizer.text_to_ids(sentence_b)
     c_input_token_ids = tokenizer.text_to_ids(" Answer:")
     cut = 0
     total_num_ids = (
         len(a_input_token_ids)
         + len(b_input_token_ids)
         + len(c_input_token_ids)
         + sum(templates)
     )
     if total_num_ids > max_seq_len:
         logging.warning(
             "Input sequence is longer than the LM model max seq, will cut it off to fit"
         )
         cut = total_num_ids - max_seq_len
         # Only the paragraph is ever truncated; fail loudly when that is
         # insufficient instead of silently returning an over-long sequence.
         if len(a_input_token_ids) < cut:
             raise ValueError(
                 "Some other field length is too long, cutting the paragraph is not enough"
             )
     return ([prompt_token_id] * templates[0] + a_input_token_ids[cut:] +
             [prompt_token_id] * templates[1] + b_input_token_ids +
             [prompt_token_id] * templates[2] + c_input_token_ids)
Example #2
0
 def get_ptune_query(
     self,
     content: Dict,
     prompt_token_id: int,
     max_seq_len: int,
     templates: List[int],
     tokenizer: TokenizerSpec,
 ):
     """Build a p-tuning query from the configured template pieces.

     Each entry of ``self.pieces`` is either a text template (a str whose
     ``{field}`` placeholders are filled from ``content``) or an int index
     into ``templates`` giving a count of virtual prompt tokens.

     Args:
         content: dict mapping placeholder field names to their text.
         prompt_token_id: token id used for every virtual prompt position.
         max_seq_len: maximum allowed length of the returned sequence.
         templates: virtual-token counts, indexed by the int pieces.
         tokenizer: tokenizer providing text_to_ids().

     Returns:
         A flat list of token ids; pieces that reference
         ``self.limit_length_field`` are truncated from the front so the
         result fits within ``max_seq_len``. Empty ``self.pieces`` yields [].

     Raises:
         ValueError: if truncating the limit-length piece(s) is not enough
             to fit under ``max_seq_len``.
     """
     all_ids = []
     limits = []  # per-piece flag: may this piece be truncated to fit?
     for piece in self.pieces:
         if isinstance(piece, str):
             # Substitute any {field} placeholders with text from `content`.
             variables = re.findall(r'{\w*}', piece)
             variable_text = {}
             limit_length = False
             for var in variables:
                 varname = var[1:-1]
                 variable_text[varname] = content[varname]
                 if varname == self.limit_length_field:
                     limit_length = True
             text = piece.format(**variable_text)
             all_ids.append(tokenizer.text_to_ids(text))
             limits.append(limit_length)
         else:
             # An int piece stands for a run of virtual prompt tokens.
             all_ids.append([prompt_token_id] * templates[piece])
             limits.append(False)
     total_num_of_ids = sum(len(ids) for ids in all_ids)
     if total_num_of_ids > max_seq_len:
         logging.warning(
             "Input sequence is longer than the LM model max seq, will cut it off to fit"
         )
         cut = total_num_of_ids - max_seq_len
         new_ids = []
         for ids, can_cut in zip(all_ids, limits):
             if can_cut:
                 if len(ids) < cut:
                     raise ValueError(
                         f"Some other field length is too long, cutting {self.limit_length_field} is not enough"
                     )
                 # Drop tokens from the front of the truncatable piece.
                 new_ids.append(ids[cut:])
             else:
                 new_ids.append(ids)
         all_ids = new_ids
     # Flatten in linear time; the previous reduce(lambda x, y: x + y, ...)
     # was quadratic and raised TypeError on an empty pieces list.
     query = []
     for ids in all_ids:
         query.extend(ids)
     return query