def test_get_set_components(self):
    """Check that every pipeline component can be read and replaced through BaseTokenizer."""
    backend = Tokenizer(models.BPE())
    backend.normalizer = normalizers.NFC()
    backend.pre_tokenizer = pre_tokenizers.ByteLevel()
    backend.post_processor = processors.BertProcessing(("A", 0), ("B", 1))
    backend.decoder = decoders.ByteLevel()

    tokenizer = BaseTokenizer(backend)

    # Getters must reflect the components configured on the wrapped tokenizer.
    initial_components = (
        ("model", models.BPE),
        ("normalizer", normalizers.NFC),
        ("pre_tokenizer", pre_tokenizers.ByteLevel),
        ("post_processor", processors.BertProcessing),
        ("decoder", decoders.ByteLevel),
    )
    for attribute, expected_type in initial_components:
        assert isinstance(getattr(tokenizer, attribute), expected_type)

    # Setters must swap each component for a freshly built one of another type.
    replacement_components = (
        ("model", models.Unigram),
        ("normalizer", normalizers.NFD),
        ("pre_tokenizer", pre_tokenizers.Whitespace),
        ("post_processor", processors.ByteLevel),
        ("decoder", decoders.WordPiece),
    )
    for attribute, component_type in replacement_components:
        setattr(tokenizer, attribute, component_type())
        assert isinstance(getattr(tokenizer, attribute), component_type)
def train_tokenizer(input_dir: str,
                    save_path: str,
                    tokenizer_type: str = "BPE",
                    vocab_size: int = 52000):
    """
    Train a tokenizer on the data yielded by `json_iterator(input_dir)` and
    save it to `save_path`.

    :param input_dir: input directory containing jsonl files
    :param save_path: path to save tokenizer to
    :param tokenizer_type: type of tokenizer to train; only "BPE" is accepted
    :param vocab_size: int, size of tokenizer's vocab
    :raises NotImplementedError: if `tokenizer_type` is anything but "BPE"
    :return: None
    """
    # Reject unsupported types up front so nothing is built for them.
    if tokenizer_type != "BPE":
        raise NotImplementedError(
            f'Tokenizer type {tokenizer_type} not implemented')

    tokenizer = Tokenizer(models.BPE())

    # Byte-level pipeline: normalize, split, post-process and decode.
    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.decoder = decoders.ByteLevel()

    # Train from the iterator over the input directory.
    bpe_trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["<|endoftext|>", "<|padding|>"])
    tokenizer.train_from_iterator(json_iterator(input_dir), bpe_trainer)

    # Persist the trained tokenizer and report where it went.
    tokenizer.save(save_path, pretty=True)
    print(f'Tokenizer saved at {save_path}')
def get_tokenizer(args):
    """Return a byte-level BPE tokenizer, loaded from disk or freshly trained.

    If ``args.tokenizer_dir`` is an existing directory, the BPE model is read
    from its ``vocab.json`` / ``merges.txt``. Otherwise the directory is
    created, a model is trained on ``train.json``, ``val.json`` and
    ``test.json`` under ``args.data_dir`` with ``args.vocab_size``, and the
    model is saved into the new directory.
    """
    tokenizer = Tokenizer(models.BPE())
    # Strip carriage returns and flatten newlines to spaces after NFKC.
    tokenizer.normalizer = Sequence(
        [NFKC(), Replace('\r', ''), Replace('\n', ' ')])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    if not os.path.isdir(args.tokenizer_dir):
        # Nothing saved yet: train a new model and store it.
        os.makedirs(args.tokenizer_dir)
        bpe_trainer = trainers.BpeTrainer(
            vocab_size=args.vocab_size,
            special_tokens=["[UNK]", "[PAD]", "[BOS]", "[EOS]"])
        split_names = ['train.json', 'val.json', 'test.json']
        files = [os.path.join(args.data_dir, split) for split in split_names]
        tokenizer.train(files=files, trainer=bpe_trainer)
        tokenizer.model.save(args.tokenizer_dir)
        return tokenizer

    # A saved model exists: load it in place of the empty one.
    vocab_path = os.path.join(args.tokenizer_dir, 'vocab.json')
    merges_path = os.path.join(args.tokenizer_dir, 'merges.txt')
    tokenizer.model = models.BPE.from_file(vocab_path, merges_path)
    return tokenizer
def converted(self) -> Tokenizer:
    """Build a byte-level BPE `Tokenizer` from `self.original_tokenizer`.

    The vocabulary and merge list come from the original tokenizer's
    `encoder` and `bpe_ranks`; a template post-processor wraps sequences
    with `[CLS]` / `[SEP]`, all with type id 0.
    """
    source = self.original_tokenizer

    bpe_model = BPE(
        vocab=source.encoder,
        merges=list(source.bpe_ranks.keys()),
        dropout=None,
        continuing_subword_prefix="",
        end_of_word_suffix="",
        fuse_unk=False,
    )
    tokenizer = Tokenizer(bpe_model)

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=source.add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()

    # Special-token ids are looked up on the original tokenizer.
    cls_id = self.original_tokenizer.convert_tokens_to_ids("[CLS]")
    sep_id = self.original_tokenizer.convert_tokens_to_ids("[SEP]")
    tokenizer.post_processor = processors.TemplateProcessing(
        single="[CLS]:0 $A:0 [SEP]:0",
        pair="[CLS]:0 $A:0 [SEP]:0 $B:0 [SEP]:0",
        special_tokens=[
            ("[CLS]", cls_id),
            ("[SEP]", sep_id),
        ],
    )

    return tokenizer
def converted(self) -> Tokenizer:
    """Build a byte-level BPE `Tokenizer` with RoBERTa-style post-processing.

    Vocabulary, merges, prefix-space setting and the sep/cls tokens are all
    read from `self.original_tokenizer`.
    """
    source = self.original_tokenizer

    bpe_model = BPE(
        vocab=source.encoder,
        merges=list(source.bpe_ranks.keys()),
        dropout=None,
        continuing_subword_prefix="",
        end_of_word_suffix="",
        fuse_unk=False,
    )
    tokenizer = Tokenizer(bpe_model)

    prefix_space = source.add_prefix_space
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=prefix_space)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.RobertaProcessing(
        sep=(source.sep_token, source.sep_token_id),
        cls=(source.cls_token, source.cls_token_id),
        add_prefix_space=prefix_space,
        trim_offsets=True,  # True by default on Roberta (historical)
    )

    return tokenizer
def train_tokenizer(langs, dataset, vocab_size):
    """Train a byte-level BPE tokenizer over the given languages.

    Each language gets a reserved special token `[LANG]`, where LANG is the
    language tag. They are listed right after the five base special tokens,
    i.e. they take ids 5, 6, ..., len(langs) + 4.

    Returns the trained tokenizer with a `[CLS] ... [SEP]` template
    post-processor attached.
    """
    base_tokens = ['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]']
    language_tokens = ['[' + lang + ']' for lang in langs]

    # Byte-pair encoding model with an explicit unknown token.
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

    # Normalisation, pre-tokenization and decoding.
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    # Train from the multilingual iterator.
    bpe_trainer = BpeTrainer(
        special_tokens=base_tokens + language_tokens,
        vocab_size=vocab_size)
    tokenizer.train_from_iterator(
        _MultilingualIterator(dataset, langs), bpe_trainer)

    # The template needs the trained ids, so it is attached after training.
    cls_id = tokenizer.token_to_id("[CLS]")
    sep_id = tokenizer.token_to_id("[SEP]")
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", cls_id),
            ("[SEP]", sep_id),
        ],
    )

    return tokenizer
def __init__(
    self,
    vocab: Optional[Union[str, Dict[str, int]]] = None,
    merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
    add_prefix_space: bool = False,
    lowercase: bool = False,
    dropout: Optional[float] = None,
    unicode_normalizer: Optional[str] = None,
    continuing_subword_prefix: Optional[str] = None,
    end_of_word_suffix: Optional[str] = None,
    trim_offsets: bool = False,
):
    """Assemble a byte-level BPE tokenizer and hand it to the base class.

    A populated BPE model is built only when both `vocab` and `merges` are
    given; otherwise an empty model is used. An optional Unicode normalizer
    and lowercasing are chained in that order.
    """
    if vocab is None or merges is None:
        bpe_model = BPE()
    else:
        bpe_model = BPE(
            vocab,
            merges,
            dropout=dropout,
            # None prefixes/suffixes are passed on as empty strings.
            continuing_subword_prefix=continuing_subword_prefix or "",
            end_of_word_suffix=end_of_word_suffix or "",
        )
    tokenizer = Tokenizer(bpe_model)

    # Unicode normalization comes first, before everything else.
    normalizer_steps = []
    if unicode_normalizer:
        normalizer_steps.append(unicode_normalizer_from_str(unicode_normalizer))
    if lowercase:
        normalizer_steps.append(Lowercase())

    # A single step is used directly; several are wrapped in a Sequence.
    if len(normalizer_steps) == 1:
        tokenizer.normalizer = normalizer_steps[0]
    elif normalizer_steps:
        tokenizer.normalizer = Sequence(normalizer_steps)

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(
        trim_offsets=trim_offsets)

    # Record the construction arguments for the base class.
    init_parameters = {
        "model": "ByteLevelBPE",
        "add_prefix_space": add_prefix_space,
        "lowercase": lowercase,
        "dropout": dropout,
        "unicode_normalizer": unicode_normalizer,
        "continuing_subword_prefix": continuing_subword_prefix,
        "end_of_word_suffix": end_of_word_suffix,
        "trim_offsets": trim_offsets,
    }

    super().__init__(tokenizer, init_parameters)
def configure(self):
    """Read converter settings from the config and build the BPE tokenizer."""
    read_option = self.get_value_from_config

    self.testing_file = read_option('testing_file')
    self.vocab_file = read_option('vocab_file')
    self.merges_file = read_option('merges_file')
    self.max_seq_length = int(read_option('max_seq_length'))

    # The config values are stringified before being handed to BPE.
    bpe_model = BPE(str(self.vocab_file), str(self.merges_file))
    self.tokenizer = Tokenizer(bpe_model)
    self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    self.tokenizer.decoder = decoders.ByteLevel()
def generate_tokenizer(equations, output, vocab_size):
    """Train a byte-level BPE tokenizer on `equations` and save it to `output`.

    `equations` is passed straight to `Tokenizer.train`, and the result is
    saved non-pretty-printed.
    """
    from tokenizers import Tokenizer, pre_tokenizers
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer

    bpe_tokenizer = Tokenizer(BPE())
    bpe_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

    reserved_tokens = ["[PAD]", "[BOS]", "[EOS]"]
    bpe_trainer = BpeTrainer(
        special_tokens=reserved_tokens,
        vocab_size=vocab_size,
        show_progress=True,
    )

    # NOTE(review): positional order (trainer, data) matches older
    # `tokenizers` releases — confirm against the installed version.
    bpe_tokenizer.train(bpe_trainer, equations)
    bpe_tokenizer.save(path=output, pretty=False)
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    add_prefix_space: bool = False,
    lowercase: bool = False,
    dropout: Optional[float] = None,
    unicode_normalizer: Optional[str] = None,
    continuing_subword_prefix: Optional[str] = None,
    end_of_word_suffix: Optional[str] = None,
):
    """Assemble a byte-level BPE tokenizer from vocab/merges files.

    The BPE model is loaded from files only when both `vocab_file` and
    `merges_file` are given; otherwise an empty model is used. An optional
    Unicode normalizer and lowercasing are chained in that order.
    """
    if vocab_file is None or merges_file is None:
        bpe_model = BPE.empty()
    else:
        bpe_model = BPE.from_files(
            vocab_file,
            merges_file,
            dropout=dropout,
            # None prefixes/suffixes are passed on as empty strings.
            continuing_subword_prefix=continuing_subword_prefix or "",
            end_of_word_suffix=end_of_word_suffix or "",
        )
    tokenizer = Tokenizer(bpe_model)

    # Unicode normalization comes first, before everything else.
    normalizer_steps = []
    if unicode_normalizer:
        normalizer_steps.append(unicode_normalizer_from_str(unicode_normalizer))
    if lowercase:
        normalizer_steps.append(Lowercase())

    # A single step is used directly; several are wrapped in a Sequence.
    if len(normalizer_steps) == 1:
        tokenizer.normalizer = normalizer_steps[0]
    elif normalizer_steps:
        tokenizer.normalizer = Sequence(normalizer_steps)

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()

    # Record the construction arguments for the base class.
    init_parameters = {
        "model": "ByteLevelBPE",
        "add_prefix_space": add_prefix_space,
        "lowercase": lowercase,
        "dropout": dropout,
        "unicode_normalizer": unicode_normalizer,
        "continuing_subword_prefix": continuing_subword_prefix,
        "end_of_word_suffix": end_of_word_suffix,
    }

    super().__init__(tokenizer, init_parameters)
def setup_tokenizer(_):
    """Return an untrained byte-level BPE tokenizer; the argument is ignored."""
    bpe_tokenizer = Tokenizer(models.BPE())

    # NFKC is wrapped in a Sequence even though it is the only step.
    bpe_tokenizer.normalizer = Sequence([NFKC()])

    # Byte-level pre-tokenization, post-processing and decoding.
    bpe_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    bpe_tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    bpe_tokenizer.decoder = decoders.ByteLevel()

    return bpe_tokenizer
def configure(self):
    """Read converter settings from the config and build the BPE tokenizer.

    If `Tokenizer` is an `UnsupportedPackage` placeholder, its `raise_error`
    is called with this converter's provider name before anything else.
    """
    if isinstance(Tokenizer, UnsupportedPackage):
        Tokenizer.raise_error(self.__provider__)

    read_option = self.get_value_from_config
    self.testing_file = read_option('testing_file')
    self.vocab_file = read_option('vocab_file')
    self.merges_file = read_option('merges_file')
    self.max_seq_length = int(read_option('max_seq_length'))

    # The config values are stringified before being handed to BPE.
    vocab_path = str(self.vocab_file)
    merges_path = str(self.merges_file)
    self.tokenizer = Tokenizer(BPE(vocab_path, merges_path))
    self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=False)
    self.tokenizer.decoder = decoders.ByteLevel()
def configure(self):
    """Read converter settings from the config and build the BPE tokenizer.

    Raises `ConfigError` when `Tokenizer` is `None`.
    """
    tokenizers_missing = Tokenizer is None
    if tokenizers_missing:
        raise ConfigError(
            "Annotation converter: wikitext2raw required tokenizers package installation. "
            "Please install it before usage.")

    for option in ('testing_file', 'vocab_file', 'merges_file'):
        setattr(self, option, self.get_value_from_config(option))
    self.max_seq_length = int(self.get_value_from_config('max_seq_length'))

    # The config values are stringified before being handed to BPE.
    bpe_model = BPE(str(self.vocab_file), str(self.merges_file))
    self.tokenizer = Tokenizer(bpe_model)
    self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=False)
    self.tokenizer.decoder = decoders.ByteLevel()
def get_tokenizer(self, tokenizer_dir):
    """Load a byte-level BPE tokenizer from `tokenizer_dir`.

    The model is read from `vocab.json` and `merges.txt` in that directory,
    and the four bracketed special tokens are registered afterwards.
    """
    vocab_path = os.path.join(tokenizer_dir, 'vocab.json')
    merges_path = os.path.join(tokenizer_dir, 'merges.txt')

    bpe_tokenizer = Tokenizer(models.BPE())
    # Strip carriage returns and flatten newlines to spaces after NFKC.
    bpe_tokenizer.normalizer = Sequence(
        [NFKC(), Replace('\r', ''), Replace('\n', ' ')])
    bpe_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    bpe_tokenizer.decoder = decoders.ByteLevel()

    # Swap the empty model for the one stored on disk.
    bpe_tokenizer.model = models.BPE.from_file(vocab_path, merges_path)
    bpe_tokenizer.add_special_tokens(['[UNK]', '[PAD]', '[BOS]', '[EOS]'])
    return bpe_tokenizer
def get_tokenizer_trainer():
    """Return an untrained byte-level Unigram tokenizer and its trainer.

    Returns:
        A `(tokenizer, trainer)` tuple. The tokenizer uses NFKC
        normalization with ByteLevel pre-tokenization and decoding. The
        trainer is a `UnigramTrainer` with a 20000-token vocabulary, seeded
        with the ByteLevel alphabet, with progress output switched off.
    """
    # START init_tokenizer_trainer
    from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

    tokenizer = Tokenizer(models.Unigram())
    tokenizer.normalizer = normalizers.NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    # Fixed: the attribute is `decoder` (singular). The previous
    # `tokenizer.decoders = ...` never installed the ByteLevel decoder.
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.UnigramTrainer(
        vocab_size=20000,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        special_tokens=["<PAD>", "<BOS>", "<EOS>"],
    )
    # END init_tokenizer_trainer
    trainer.show_progress = False

    return tokenizer, trainer
def converted(self) -> Tokenizer:
    """Build a BPE `Tokenizer` with a `</w>` suffix from the original tokenizer.

    Text is NFC-normalized, whitespace runs are collapsed to a single space
    and the result is lowercased. Pre-tokenization is a regex split followed
    by ByteLevel.
    """
    source = self.original_tokenizer

    bpe_model = BPE(
        vocab=source.encoder,
        merges=list(source.bpe_ranks.keys()),
        dropout=None,
        continuing_subword_prefix="",
        end_of_word_suffix="</w>",
        fuse_unk=False,
        unk_token=str(source.unk_token),
    )
    tokenizer = Tokenizer(bpe_model)

    normalization_steps = [
        normalizers.NFC(),
        normalizers.Replace(Regex(r"\s+"), " "),
        normalizers.Lowercase(),
    ]
    tokenizer.normalizer = normalizers.Sequence(normalization_steps)

    # With invert=True and behavior="removed", the regex matches are kept
    # and the text between them is dropped.
    word_pattern = Regex(
        r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"""
    )
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.Split(word_pattern, behavior="removed", invert=True),
        pre_tokenizers.ByteLevel(add_prefix_space=False),
    ])
    tokenizer.decoder = decoders.ByteLevel()

    # Hack: RobertaProcessing supplies both the ByteLevel handling and the
    # bos/eos template in a single post-processor.
    tokenizer.post_processor = processors.RobertaProcessing(
        sep=(source.eos_token, source.eos_token_id),
        cls=(source.bos_token, source.bos_token_id),
        add_prefix_space=False,
        trim_offsets=False,
    )
    return tokenizer
def converted(self) -> Tokenizer:
    """Build a plain byte-level BPE `Tokenizer` from the original tokenizer.

    Vocabulary, merges and the prefix-space setting are read from
    `self.original_tokenizer`; offsets are not trimmed.
    """
    source = self.original_tokenizer

    bpe_model = BPE(
        vocab=source.encoder,
        merges=list(source.bpe_ranks.keys()),
        dropout=None,
        continuing_subword_prefix="",
        end_of_word_suffix="",
        fuse_unk=False,
    )
    tokenizer = Tokenizer(bpe_model)

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=source.add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

    return tokenizer
def main(args):
    """Train and/or test a byte-level BPE tokenizer, driven by `args` flags.

    With `args.do_train`, a tokenizer is trained on the files returned by
    `get_smi_files(args.training_files)` and saved under `tokenizers/`.
    With `args.do_test`, the saved tokenizer is reloaded and used to encode
    and decode `args.test_string`, printing each step.
    """
    if args.do_train:
        training_files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files:{}".format(
            training_files))

        # Build the tokenizer with fixed-length padding and truncation.
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

        # Train, then add the start/end markers on top of the learned vocab.
        bpe_trainer = trainers.BpeTrainer(show_progress=True,
                                          vocab_size=args.vocab_size,
                                          min_frequency=args.min_frequency)
        tokenizer.train(training_files, trainer=bpe_trainer)
        tokenizer.add_tokens(["<start>", "<end>"])

        save_path = os.path.join('tokenizers', args.tokenizer_name)
        tokenizer.save(save_path, pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Reload from disk so the saved artifact itself is exercised.
        load_path = os.path.join('tokenizers', args.tokenizer_name)
        tokenizer = Tokenizer.from_file(load_path)

        print("Testing with SMILES String: {}".format(args.test_string))
        encoded = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoded.tokens))
        print(encoded.ids)

        round_trip = tokenizer.decode(encoded.ids)
        print("Decoded string: {}".format(round_trip))
Although that way may not be obvious at first unless you're Dutch. Now is better than never. Although never is often better than *right* now. If the implementation is hard to explain, it's a bad idea. If the implementation is easy to explain, it may be a good idea. Namespaces are one honking great idea -- let's do more of those! """.split("\n") if args.type == "gpt2": print("Running GPT-2 tokenizer") tok_p = GPT2Tokenizer.from_pretrained('gpt2') # Create a Tokenizer using BPE tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges)) # Use ByteLevel PreTokenizer tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False) # Use ByteLevel Decoder tok_r.decoder = decoders.ByteLevel() elif args.type == "bert": print("Running Bert tokenizer") tok_p = BertTokenizer.from_pretrained(args.vocab) tok_r = Tokenizer( WordPiece.from_files(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100)) tok_r.normalizer = BertNormalizer( clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
def main():
    """Run the text-generation demo on the OpenVINO Runtime API.

    Parses the command line, builds a byte-level BPE tokenizer from the
    vocab/merges files, reads and compiles the model, then for every prompt
    repeatedly runs inference and samples one next token until
    `stop_criteria` says to stop. The decoded sequence is logged.

    Raises:
        RuntimeError: if the model does not have exactly one input and
            exactly one output.
    """
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, get {} tokens".format(
        args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE.from_file(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Runtime')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()

    # read IR
    log.info('Reading model {}'.format(args.model))
    model = core.read_model(args.model)

    # check number inputs and outputs
    if len(model.inputs) != 1:
        raise RuntimeError(
            'The demo expects model with single input, while provided {}'.
            format(len(model.inputs)))
    if len(model.outputs) != 1:
        raise RuntimeError(
            'The demo expects model with single output, while provided {}'.
            format(len(model.outputs)))
    input_tensor = model.inputs[0].any_name

    # static mode: fix the input to [1, max_seq_len] unless it already is
    if not args.dynamic_shape and (
            model.inputs[0].partial_shape.is_dynamic
            or model.inputs[0].shape[1] != args.max_seq_len):
        model.reshape({
            input_tensor:
            PartialShape([Dimension(1),
                          Dimension(args.max_seq_len)])
        })

    # dynamic mode: let the sequence dimension range over 0..max_seq_len
    if args.dynamic_shape:
        model.reshape({
            input_tensor:
            PartialShape([Dimension(1),
                          Dimension(0, args.max_seq_len)])
        })

    # load model to the device
    compiled_model = core.compile_model(model, args.device)
    output_tensor = compiled_model.outputs[0]
    infer_request = compiled_model.create_infer_request()
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    # prompts come from the command line if given, otherwise from stdin
    if args.input:

        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:

        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        # an empty (or whitespace-only) prompt ends the session
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by network at once
        max_length = args.max_seq_len

        # NOTE(review): assumes EOS is the last vocabulary entry — confirm
        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        while True:
            model_input = input_ids
            if not args.dynamic_shape:
                # pad the rest of the request
                # (the EOS id is used as the padding value)
                pad_len = max_length - cur_input_len
                model_input = np.concatenate(
                    (input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for OpenVINO runtime
            inputs = {
                input_tensor: model_input,
            }

            # infer by OpenVINO runtime
            t_start = time.perf_counter()
            outputs = infer_request.infer(inputs)[output_tensor]
            t_end = time.perf_counter()
            t_count += 1
            log.info(
                "Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)"
                .format(model_input.shape[1], 1 / (t_end - t_start),
                        t_end - t_start))

            # take the scores at the last non-padded position
            # (assumes outputs is (batch, seq, vocab) — TODO confirm)
            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits,
                                               eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores,
                                                     args.top_k)

            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores,
                                                     args.top_p)

            # get next token id
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1],
                                           1,
                                           p=probs[0],
                                           replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num),
                             eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]

        log.info(
            "{} requests were processed in {:0.2f}sec ({:0.2}sec per request)".
            format(t_count, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
def main():
    """Run the text-generation demo on the OpenVINO Inference Engine API.

    Parses the command line, builds a byte-level BPE tokenizer from the
    vocab/merges files, reads and loads the network, then for every prompt
    repeatedly runs inference on an input padded to the network's sequence
    length and samples one next token until `stop_criteria` says to stop.
    The decoded sequence is logged.

    Raises:
        RuntimeError: if the network does not have exactly one input and
            exactly one output.
    """
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, get {} tokens".format(
        args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    ie = IECore()

    # read IR
    # (presumably args.model is a pathlib.Path, given with_suffix — verify)
    model_xml = args.model
    model_bin = model_xml.with_suffix(".bin")
    log.info('Reading model {}'.format(args.model))
    ie_net = ie.read_network(model=model_xml, weights=model_bin)

    # check input and output names
    if len(ie_net.input_info) != 1:
        raise RuntimeError(
            'The demo expects model with single input, while provided {}'.
            format(len(ie_net.input_info)))
    if len(ie_net.outputs) != 1:
        raise RuntimeError(
            'The demo expects model with single output, while provided {}'.
            format(len(ie_net.outputs)))
    # despite the plural names, each holds the single input/output name
    input_names = next(iter(ie_net.input_info))
    output_names = next(iter(ie_net.outputs))

    # load model to the device
    ie_net_exec = ie.load_network(network=ie_net, device_name=args.device)
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    # prompts come from the command line if given, otherwise from stdin
    if args.input:

        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:

        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        # an empty (or whitespace-only) prompt ends the session
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by network at once
        max_length = ie_net.input_info[input_names].input_data.shape[1]

        # NOTE(review): assumes EOS is the last vocabulary entry — confirm
        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        while True:
            # pad the rest of the request
            # (the EOS id is used as the padding value)
            pad_len = max_length - cur_input_len
            model_input = np.concatenate(
                (input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for IE
            inputs = {
                input_names: model_input,
            }

            # infer by IE
            t_start = time.perf_counter()
            res = ie_net_exec.infer(inputs=inputs)
            t_end = time.perf_counter()
            t_count += 1
            log.info(
                "Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)"
                .format(max_length, 1 / (t_end - t_start), t_end - t_start))

            outputs = res[output_names]
            # take the scores at the last non-padded position
            # (assumes outputs is (batch, seq, vocab) — TODO confirm)
            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits,
                                               eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores,
                                                     args.top_k)

            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores,
                                                     args.top_p)

            # get next token id
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1],
                                           1,
                                           p=probs[0],
                                           replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num),
                             eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]

        log.info(
            "{} requests of {} length were processed in {:0.2f}sec ({:0.2}sec per request)"
            .format(t_count, max_length, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
for s in g: f.write(s) f.write("\n\n") elif args.file_type == 'txt': shutil.copyfile(str(arch), str(fp)) data_files = glob(str(out_path / "*.txt")) data_files = random.sample(data_files, int(0.2 * len(data_files))) assert len(data_files) > 0, 'No data files found' # Initialize a tokenizer tokenizer = Tokenizer(models.BPE()) # Customize pre-tokenization and decoding tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True) tokenizer.decoder = decoders.ByteLevel() tokenizer.post_processor = processors.ByteLevel(trim_offsets=True) tokenizer.normalizer = NFKC() # And then train trainer = trainers.BpeTrainer(vocab_size=args.vocab_size, min_frequency=2, special_tokens=["<|endoftext|>", "<|padding|>"]) tokenizer.train(trainer, data_files) # And Save it tokenizer_path = out_path / "byte-level-bpe.tokenizer.json" tokenizer.save(str(tokenizer_path), pretty=True) print(f'tokenizer saved at {str(tokenizer_path)}') return tokenizer_path
bert_tokenizer.train_from_iterator(sentences, trainer=trainer) if serialize_path: bert_tokenizer.save(serialize_path) return bert_tokenizer ids = bert_tokenizer.encode(sentences[10]).ids bert_tokenizer.decode(ids) from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers tokenizer = Tokenizer(models.Unigram()) tokenizer.normalizer = normalizers.NFKC() tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel() tokenizer.decoders = decoders.ByteLevel() trainer = trainers.UnigramTrainer( vocab_size=20000, initial_alphabet=pre_tokenizers.ByteLevel.alphabet(), special_tokens=["<PAD>", "<BOS>", "<EOS>"], ) tokenizer.train_from_iterator(sentences, trainer=trainer) tokenizer.encode(sentences[4]).ids tokenizer.decode(tokenizer.encode(sentences[4]).ids) tokenizer.save('bert_out/test2') tokenizer.save_pretrained('bert_out/test')