def main():
    """Train a word-level vocabulary and sanity-check the resulting tokenizer.

    See all possible arguments in src/transformers/training_args.py or by
    passing the --help flag to this script.  Data args and custom args are
    parsed as distinct dataclasses for a cleaner separation of concerns.
    """
    arg_parser = HfArgumentParser((DataTrainingArguments, CustomOthersArguments))
    data_args, custom_args = arg_parser.parse_args_into_dataclasses()

    # Collect input corpora; sorted() already returns a list.
    train_files = sorted(glob.glob(f'{data_args.train_dir}/*.{data_args.ext}'))
    validation_files = sorted(glob.glob(f'{data_args.eval_dir}/*.{data_args.ext}'))
    additional_special_tokens = ADDITIONAL_SPECIAL_TOKENS

    pre_tokenizer_func = PRE_TOKENIZERS_MAP.get(custom_args.pre_tokenizer_type, None)
    if pre_tokenizer_func is None:
        raise NotImplementedError
    elif custom_args.pre_tokenizer_type == 'sefr_cut':
        raise ValueError('sefr_cut is slow use fake_sefr_cu with sefr_cut_pre_tokenizer instead')

    # (Re)build the vocabulary file unless it exists and overwriting is disabled.
    if not os.path.exists(custom_args.output_file) or custom_args.overwrite_output_file:
        vocab_builder = WordLevelTrainer(
            pre_tokenize_func=pre_tokenizer_func,
            vocab_size=custom_args.vocab_size,
            vocab_min_freq=custom_args.vocab_min_freq,
            input_files=train_files,
            additional_special_tokens=additional_special_tokens)
        vocab_builder.count_parallel()
        vocab_builder.save_vocab(custom_args.output_file)

    # Wrap the chosen pre-tokenization callable for the `tokenizers` library.
    if custom_args.pre_tokenizer_type == 'fake_sefr_cut':
        wrapped_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
            FakeSefrCustomTokenizer(PRE_TOKENIZERS_MAP['fake_sefr_cut_keep_split_token']))
    else:
        wrapped_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
            CustomPreTokenizer(pre_tokenizer_func))

    tokenizer = Tokenizer(models.WordLevel.from_file(custom_args.output_file, unk_token='<unk>'))
    tokenizer.pre_tokenizer = wrapped_pre_tokenizer

    if custom_args.debug:
        # Round-trip a few fixed samples, then every non-blank validation line.
        print('Tokenize following text.')
        texts = ['<s>โรนัลโดเขาได้เล่นกับทีม</s>',
                 'โปรตุเกสมีโรนัลโด',
                 'โรนัลโดเขาได้เล่นกับทีม\nโปรตุเกสมีโรนัลโด']
        ids = [encoding.ids for encoding in tokenizer.encode_batch(texts)]
        decoded_texts = [decoded.replace(' ', '')
                         for decoded in tokenizer.decode_batch(ids)]
        for text, token_ids, decoded_text in zip(texts, ids, decoded_texts):
            print('Text: ', text, '>>', 'Tokenized: ', token_ids, '>>', 'Decoded: ', decoded_text)
        with open(validation_files[0], 'r') as f:
            # Iterating the file object stops at EOF, exactly like the
            # readline()-until-empty loop it replaces.
            for raw_line in f:
                line = raw_line.strip()
                if len(line) > 0 and not line.isspace():
                    encoded = tokenizer.encode(line)
                    decoded = tokenizer.decode(encoded.ids).replace(' ', '')
                    print('Text: ', line, '>>', encoded.ids, '>>', decoded)
def test_decode(self):
    """Decoding works for both a single id sequence and a batch of them."""
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # A single sequence decodes to one space-joined string.
    single = tokenizer.decode([0, 1, 2, 3])
    assert single == "my name is john"

    # A batch decodes to one string per id sequence.
    batch = tokenizer.decode_batch([[0, 1, 2, 3], [4]])
    assert batch == ["my name is john", "pair"]
class HuggingFaceWordLevelTokenizer(TokenizerBase):
    """Word-level tokenizer backed by the HuggingFace ``tokenizers`` library.

    Splits on whitespace (``pre_tokenizers.Whitespace``), optionally
    lowercases, and maps unknown tokens to ``self.unknown_token``.
    Configuration (``unknown_token``, ``lower``, ``max_vocab_size``,
    ``special_tokens``, pad/sos/eos tokens) comes from ``TokenizerBase``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Imported lazily so the module loads even when `tokenizers`
        # is not installed and this class is unused.
        from tokenizers import Tokenizer, models, normalizers, pre_tokenizers

        self.tokenizer = Tokenizer(
            models.WordLevel(unk_token=self.unknown_token))
        self.tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        if self.lower:
            self.tokenizer.normalizer = normalizers.Lowercase()

    def fit(self, *, texts=None, text_batch_iter=None, max_tokens=None):
        """Train the vocabulary from ``text_batch_iter``.

        ``texts`` and ``max_tokens`` are accepted for interface compatibility
        but are not used by this implementation.  Populates ``token_to_id``
        and its inverse ``id_to_token``.
        """
        from tokenizers import trainers

        trainer = trainers.WordLevelTrainer(
            vocab_size=self.max_vocab_size,
            special_tokens=list(self.special_tokens))
        self.tokenizer.train_from_iterator(text_batch_iter, trainer=trainer)
        self.token_to_id = self.tokenizer.get_vocab()
        self.id_to_token = {
            token_id: token
            for token, token_id in self.token_to_id.items()
        }

    def encode(self, texts):
        """Encode a batch of strings to id sequences, then pad/frame them
        via ``self._post_process`` with the configured pad/sos/eos ids."""
        id_seqs = self.tokenizer.encode_batch(texts)
        id_seqs = [id_seq.ids for id_seq in id_seqs]
        return self._post_process(
            id_seqs,
            pad_id=self.token_to_id[self.pad_token] if self.pad_token else None,
            sos_id=self.token_to_id[self.sos_token] if self.sos_token else None,
            eos_id=self.token_to_id[self.eos_token] if self.eos_token else None,
        )

    def decode(self, id_seqs):
        """Decode a batch of id sequences back to strings."""
        # BUG FIX: the original called decode_batch but dropped the result,
        # so decode() always returned None.
        return self.tokenizer.decode_batch(id_seqs)
# Report the timing comparison between the Rust (`tok_r`) and Python (`tok_p`)
# tokenizers over the shared `text` corpus.
print(f"Transformer tokenizer took: {time_p} sec")
print(f"SpeedUp Ratio: {time_p / time_r}")

# Compare token ids.  FIX: the original computed `ids_r` and then never used
# it, rebuilding the identical comprehension for decode_batch below; compute
# it once and reuse it.
ids_r = [sentence.ids for sentence in encoded_r]
diff_ids = 0
for i, (rust_ids, py_ids) in enumerate(zip(ids_r, encoded_p)):
    if rust_ids != py_ids:
        diff_ids += 1
        if args.debug:
            print(rust_ids)
            print(py_ids)
            print(encoded_r[i].tokens)
            print(tok_p.tokenize(text[i]))
            print(text[i])
            print("")
print(f"Ids differences: {diff_ids}")

# Compare decoded strings produced by both tokenizers.
decoded_r = tok_r.decode_batch(ids_r, False)
decoded_p = [tok_p.decode(en) for en in encoded_p]
diff_decoded = 0
for i in range(0, len(text)):
    if decoded_r[i] != decoded_p[i]:
        diff_decoded += 1
        if args.debug:
            print(f"Original: {text[i]}")
            print(f"Rust: {decoded_r[i]}")
            print(f"Python: {decoded_p[i]}")
            print("")
print(f"Decoding differences: {diff_decoded}")
def main():
    """Interactive GPT-style text generation demo on OpenVINO Inference Engine.

    Reads a BPE vocab/merges pair, loads an IR model via IECore, then for each
    prompt (from --input or stdin) samples tokens autoregressively with
    top-k/top-p filtering until a stop criterion fires, and logs the result.
    """
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, get {} tokens".format(
        args.vocab, len(vocab)))

    # create tokenizer (byte-level BPE, no prefix space — GPT-2 style setup)
    tokenizer = Tokenizer(BPE(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    ie = IECore()

    # read IR (weights file is expected next to the .xml, same stem)
    model_xml = args.model
    model_bin = model_xml.with_suffix(".bin")
    log.info('Reading model {}'.format(args.model))
    ie_net = ie.read_network(model=model_xml, weights=model_bin)

    # check input and output names — the demo only supports single-in/single-out
    if len(ie_net.input_info) != 1:
        raise RuntimeError(
            'The demo expects model with single input, while provided {}'.
            format(len(ie_net.input_info)))
    if len(ie_net.outputs) != 1:
        raise RuntimeError(
            'The demo expects model with single output, while provided {}'.
            format(len(ie_net.outputs)))
    input_names = next(iter(ie_net.input_info))
    output_names = next(iter(ie_net.outputs))

    # load model to the device
    ie_net_exec = ie.load_network(network=ie_net, device_name=args.device)
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    # Prompt source: either the prepared --input list, or interactive stdin.
    if args.input:
        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:
        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by network at once
        max_length = ie_net.input_info[input_names].input_data.shape[1]
        # NOTE(review): assumes EOS is the last vocab entry — confirm against
        # the vocab file used with this demo.
        eos_token_id = len(vocab) - 1
        cur_input_len = input_ids.shape[-1]
        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0
        # Autoregressive sampling loop: one full-length inference per token.
        while True:
            # pad the rest of the request (static shape: always max_length)
            pad_len = max_length - cur_input_len
            model_input = np.concatenate(
                (input_ids, [[eos_token_id] * pad_len]), axis=-1)
            # create numpy inputs for IE
            inputs = {
                input_names: model_input,
            }
            # infer by IE
            t_start = time.perf_counter()
            res = ie_net_exec.infer(inputs=inputs)
            t_end = time.perf_counter()
            t_count += 1
            log.info(
                "Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)"
                .format(max_length, 1 / (t_end - t_start), t_end - t_start))

            # Logits at the position of the last real (non-padding) token.
            outputs = res[output_names]
            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits,
                                               eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores,
                                                     args.top_k)
            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores,
                                                     args.top_p)

            # get next token id by sampling from the filtered distribution
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1], 1,
                                           p=probs[0], replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)
            cur_input_len = input_ids.shape[-1]
            if stop_criteria(input_ids, min(max_length, max_sample_token_num),
                             eos_token_id):
                break
        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]
        log.info(
            "{} requests of {} length were processed in {:0.2f}sec ({:0.2}sec per request)"
            .format(t_count, max_length, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
def main():
    """Interactive GPT-style text generation demo on OpenVINO Runtime (2.0 API).

    Same flow as the legacy IECore demo: load BPE tokenizer and IR model,
    optionally reshape to a static or bounded-dynamic sequence length, then
    sample tokens autoregressively per prompt with top-k/top-p filtering.
    """
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, get {} tokens".format(
        args.vocab, len(vocab)))

    # create tokenizer (byte-level BPE, no prefix space — GPT-2 style setup)
    tokenizer = Tokenizer(BPE.from_file(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Runtime')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()

    # read IR
    log.info('Reading model {}'.format(args.model))
    model = core.read_model(args.model)

    # check number inputs and outputs — the demo only supports single-in/single-out
    if len(model.inputs) != 1:
        raise RuntimeError(
            'The demo expects model with single input, while provided {}'.
            format(len(model.inputs)))
    if len(model.outputs) != 1:
        raise RuntimeError(
            'The demo expects model with single output, while provided {}'.
            format(len(model.outputs)))
    input_tensor = model.inputs[0].any_name

    # Static mode: force shape [1, max_seq_len] if the model is dynamic or
    # its sequence dimension differs from the requested length.
    if not args.dynamic_shape and (
            model.inputs[0].partial_shape.is_dynamic
            or model.inputs[0].shape[1] != args.max_seq_len):
        model.reshape({
            input_tensor:
            PartialShape([Dimension(1), Dimension(args.max_seq_len)])
        })
    # Dynamic mode: bound the sequence dimension to [0, max_seq_len] so no
    # padding is needed per request.
    if args.dynamic_shape:
        model.reshape({
            input_tensor:
            PartialShape([Dimension(1), Dimension(0, args.max_seq_len)])
        })

    # load model to the device
    compiled_model = core.compile_model(model, args.device)
    output_tensor = compiled_model.outputs[0]
    infer_request = compiled_model.create_infer_request()
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    # Prompt source: either the prepared --input list, or interactive stdin.
    if args.input:
        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:
        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by network at once
        max_length = args.max_seq_len
        # NOTE(review): assumes EOS is the last vocab entry — confirm against
        # the vocab file used with this demo.
        eos_token_id = len(vocab) - 1
        cur_input_len = input_ids.shape[-1]
        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0
        # Autoregressive sampling loop: one inference per generated token.
        while True:
            model_input = input_ids
            if not args.dynamic_shape:
                # pad the rest of the request up to the static max_length
                pad_len = max_length - cur_input_len
                model_input = np.concatenate(
                    (input_ids, [[eos_token_id] * pad_len]), axis=-1)
            # create numpy inputs for OpenVINO runtime
            inputs = {
                input_tensor: model_input,
            }
            # infer by OpenVINO runtime
            t_start = time.perf_counter()
            outputs = infer_request.infer(inputs)[output_tensor]
            t_end = time.perf_counter()
            t_count += 1
            log.info(
                "Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)"
                .format(model_input.shape[1], 1 / (t_end - t_start),
                        t_end - t_start))

            # Logits at the position of the last real (non-padding) token.
            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits,
                                               eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores,
                                                     args.top_k)
            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores,
                                                     args.top_p)

            # get next token id by sampling from the filtered distribution
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1], 1,
                                           p=probs[0], replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)
            cur_input_len = input_ids.shape[-1]
            if stop_criteria(input_ids, min(max_length, max_sample_token_num),
                             eos_token_id):
                break
        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]
        log.info(
            "{} requests were processed in {:0.2f}sec ({:0.2}sec per request)".
            format(t_count, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))