def converted(self) -> Tokenizer:
    ot = self.original_tokenizer
    vocab = ot.encoder
    merges = list(ot.bpe_ranks.keys())
    tokenizer = Tokenizer(
        BPE(
            vocab=vocab,
            merges=merges,
            dropout=None,
            continuing_subword_prefix="",
            end_of_word_suffix="",
            fuse_unk=False,
        ))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=ot.add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.TemplateProcessing(
        single="[CLS]:0 $A:0 [SEP]:0",
        pair="[CLS]:0 $A:0 [SEP]:0 $B:0 [SEP]:0",
        special_tokens=[
            ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")),
            ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")),
        ],
    )
    return tokenizer
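For context, a minimal standalone sketch of the same pipeline, using a toy vocab/merges pair instead of a real slow tokenizer's encoder and bpe_ranks (all values here are illustrative):

from tokenizers import Tokenizer, pre_tokenizers, decoders, processors
from tokenizers.models import BPE

# Toy vocab/merges; the converter above pulls these from the slow tokenizer.
vocab = {"[CLS]": 0, "[SEP]": 1, "h": 2, "i": 3, "hi": 4}
merges = [("h", "i")]

tok = Tokenizer(BPE(vocab=vocab, merges=merges, dropout=None,
                    continuing_subword_prefix="", end_of_word_suffix="",
                    fuse_unk=False))
tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tok.decoder = decoders.ByteLevel()
tok.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:0 [SEP]:0",
    special_tokens=[("[CLS]", 0), ("[SEP]", 1)],
)
print(tok.encode("hi").tokens)  # ['[CLS]', 'hi', '[SEP]']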
def get_tokenizer(args):
    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = Sequence(
        [NFKC(), Replace('\r', ''), Replace('\n', ' ')])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()
    if os.path.isdir(args.tokenizer_dir):
        vocab_fn = os.path.join(args.tokenizer_dir, 'vocab.json')
        merge_fn = os.path.join(args.tokenizer_dir, 'merges.txt')
        tokenizer.model = models.BPE.from_file(vocab_fn, merge_fn)
    else:
        os.makedirs(args.tokenizer_dir)
        trainer = trainers.BpeTrainer(
            vocab_size=args.vocab_size,
            special_tokens=["[UNK]", "[PAD]", "[BOS]", "[EOS]"])
        files = [
            os.path.join(args.data_dir, split)
            for split in ['train.json', 'val.json', 'test.json']
        ]
        tokenizer.train(files=files, trainer=trainer)
        tokenizer.model.save(args.tokenizer_dir)
    return tokenizer
def train_tokenizer(input_dir: str,
                    save_path: str,
                    tokenizer_type: str = "BPE",
                    vocab_size: int = 52000):
    """
    Trains a tokenizer on all the jsonl files in `input_dir` and saves it to `save_path`

    :param input_dir: input directory containing jsonl files
    :param save_path: path to save tokenizer to
    :param tokenizer_type: type of tokenizer to train
    :param vocab_size: int, size of tokenizer's vocab
    :return:
    """
    if tokenizer_type == "BPE":
        model = models.BPE()
    else:
        raise NotImplementedError(
            f'Tokenizer type {tokenizer_type} not implemented')
    tokenizer = Tokenizer(model)

    # Customize pre-tokenization and decoding
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.normalizer = NFKC()

    # And then train
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["<|endoftext|>", "<|padding|>"])
    tokenizer.train_from_iterator(json_iterator(input_dir), trainer)

    # And save it
    tokenizer.save(save_path, pretty=True)
    print(f'Tokenizer saved at {save_path}')
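A hypothetical call, assuming `json_iterator` is available and `./data` holds the jsonl files:

train_tokenizer("./data", "./tokenizer.json", tokenizer_type="BPE", vocab_size=52000)

from tokenizers import Tokenizer
tok = Tokenizer.from_file("./tokenizer.json")
print(tok.encode("hello world").tokens)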
def test_get_set_components(self):
    toki = Tokenizer(models.BPE())
    toki.normalizer = normalizers.NFC()
    toki.pre_tokenizer = pre_tokenizers.ByteLevel()
    toki.post_processor = processors.BertProcessing(("A", 0), ("B", 1))
    toki.decoder = decoders.ByteLevel()

    tokenizer = BaseTokenizer(toki)
    assert isinstance(tokenizer.model, models.BPE)
    assert isinstance(tokenizer.normalizer, normalizers.NFC)
    assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.ByteLevel)
    assert isinstance(tokenizer.post_processor, processors.BertProcessing)
    assert isinstance(tokenizer.decoder, decoders.ByteLevel)

    tokenizer.model = models.Unigram()
    assert isinstance(tokenizer.model, models.Unigram)
    tokenizer.normalizer = normalizers.NFD()
    assert isinstance(tokenizer.normalizer, normalizers.NFD)
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.Whitespace)
    tokenizer.post_processor = processors.ByteLevel()
    assert isinstance(tokenizer.post_processor, processors.ByteLevel)
    tokenizer.decoder = decoders.WordPiece()
    assert isinstance(tokenizer.decoder, decoders.WordPiece)
def train_tokenizer(langs, dataset, vocab_size):
    """Train a tokenizer on a given list of languages.
    Reserves a special token for each language, of the form [LANG] where LANG
    is the language tag. These are assigned to token ids 5, 6, ...,
    len(langs) + 4.
    """
    # Byte-pair encoding
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

    # trainer
    lang_tokens = ['[' + lang + ']' for lang in langs]
    special_tokens = ['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]'] + lang_tokens
    trainer = BpeTrainer(
        special_tokens=special_tokens, vocab_size=vocab_size)

    # normalise and pre-tokenize
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    # create iterator and train
    iterator = _MultilingualIterator(dataset, langs)
    tokenizer.train_from_iterator(iterator, trainer)

    # post-process start/end tokens
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )
    return tokenizer
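Given the reserved-token layout in the docstring, usage might look like the sketch below; `dataset` stands in for whatever _MultilingualIterator consumes, and the language tags are placeholders:

tok = train_tokenizer(['en', 'fr'], dataset, vocab_size=30000)

enc = tok.encode("hello world")
print(enc.tokens)               # wrapped as [CLS] ... [SEP] by the post-processor
print(tok.token_to_id('[en]'))  # 5: first id after the five reserved special tokens
print(tok.token_to_id('[fr]'))  # 6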
def converted(self) -> Tokenizer:
    ot = self.original_tokenizer
    vocab = ot.encoder
    merges = list(ot.bpe_ranks.keys())

    tokenizer = Tokenizer(
        BPE(
            vocab=vocab,
            merges=merges,
            dropout=None,
            continuing_subword_prefix="",
            end_of_word_suffix="",
            fuse_unk=False,
        ))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=ot.add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.RobertaProcessing(
        sep=(ot.sep_token, ot.sep_token_id),
        cls=(ot.cls_token, ot.cls_token_id),
        add_prefix_space=ot.add_prefix_space,
        trim_offsets=True,  # True by default on Roberta (historical)
    )
    return tokenizer
def __init__(
    self,
    vocab: Optional[Union[str, Dict[str, int]]] = None,
    merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
    add_prefix_space: bool = False,
    lowercase: bool = False,
    dropout: Optional[float] = None,
    unicode_normalizer: Optional[str] = None,
    continuing_subword_prefix: Optional[str] = None,
    end_of_word_suffix: Optional[str] = None,
    trim_offsets: bool = False,
):
    if vocab is not None and merges is not None:
        tokenizer = Tokenizer(
            BPE(
                vocab,
                merges,
                dropout=dropout,
                continuing_subword_prefix=continuing_subword_prefix or "",
                end_of_word_suffix=end_of_word_suffix or "",
            ))
    else:
        tokenizer = Tokenizer(BPE())

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    if lowercase:
        normalizers += [Lowercase()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)

    parameters = {
        "model": "ByteLevelBPE",
        "add_prefix_space": add_prefix_space,
        "lowercase": lowercase,
        "dropout": dropout,
        "unicode_normalizer": unicode_normalizer,
        "continuing_subword_prefix": continuing_subword_prefix,
        "end_of_word_suffix": end_of_word_suffix,
        "trim_offsets": trim_offsets,
    }

    super().__init__(tokenizer, parameters)
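This appears to be the ByteLevelBPETokenizer implementation shipped with the tokenizers library; if so, typical usage looks like this (the file paths are placeholders):

from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer("vocab.json", "merges.txt", lowercase=True)
enc = tokenizer.encode("Hello world")
print(enc.tokens)
print(tokenizer.decode(enc.ids))  # round-trips through the ByteLevel decoder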
def configure(self):
    self.testing_file = self.get_value_from_config('testing_file')
    self.vocab_file = self.get_value_from_config('vocab_file')
    self.merges_file = self.get_value_from_config('merges_file')
    self.max_seq_length = int(self.get_value_from_config('max_seq_length'))
    self.tokenizer = Tokenizer(BPE(str(self.vocab_file), str(self.merges_file)))
    self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    self.tokenizer.decoder = decoders.ByteLevel()
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    add_prefix_space: bool = False,
    lowercase: bool = False,
    dropout: Optional[float] = None,
    unicode_normalizer: Optional[str] = None,
    continuing_subword_prefix: Optional[str] = None,
    end_of_word_suffix: Optional[str] = None,
):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(
                vocab_file,
                merges_file,
                dropout=dropout,
                continuing_subword_prefix=continuing_subword_prefix or "",
                end_of_word_suffix=end_of_word_suffix or "",
            ))
    else:
        tokenizer = Tokenizer(BPE.empty())

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    if lowercase:
        normalizers += [Lowercase()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()

    parameters = {
        "model": "ByteLevelBPE",
        "add_prefix_space": add_prefix_space,
        "lowercase": lowercase,
        "dropout": dropout,
        "unicode_normalizer": unicode_normalizer,
        "continuing_subword_prefix": continuing_subword_prefix,
        "end_of_word_suffix": end_of_word_suffix,
    }

    super().__init__(tokenizer, parameters)
def setup_tokenizer(_):
    # Initialize a tokenizer
    tokenizer = Tokenizer(models.BPE())

    # Customize pre-tokenization and decoding
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    normalizers = [NFKC()]
    tokenizer.normalizer = Sequence(normalizers)
    return tokenizer
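A quick round-trip sketch with the tokenizer above; the BPE model is empty until trained, so this fits it on a toy corpus first (corpus and sizes are made up):

from tokenizers import trainers

tokenizer = setup_tokenizer(None)
trainer = trainers.BpeTrainer(vocab_size=500, special_tokens=["<|endoftext|>"])
tokenizer.train_from_iterator(["hello world", "hello there"], trainer)

enc = tokenizer.encode("hello world")
print(enc.tokens)                 # byte-level tokens, e.g. carrying 'Ġ' space markers
print(tokenizer.decode(enc.ids))  # ' hello world' (note add_prefix_space=True)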
def configure(self):
    if isinstance(Tokenizer, UnsupportedPackage):
        Tokenizer.raise_error(self.__provider__)
    self.testing_file = self.get_value_from_config('testing_file')
    self.vocab_file = self.get_value_from_config('vocab_file')
    self.merges_file = self.get_value_from_config('merges_file')
    self.max_seq_length = int(self.get_value_from_config('max_seq_length'))
    self.tokenizer = Tokenizer(
        BPE(str(self.vocab_file), str(self.merges_file)))
    self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=False)
    self.tokenizer.decoder = decoders.ByteLevel()
def get_tokenizer(self, tokenizer_dir):
    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = Sequence(
        [NFKC(), Replace('\r', ''), Replace('\n', ' ')])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()
    vocab_fn = os.path.join(tokenizer_dir, 'vocab.json')
    merge_fn = os.path.join(tokenizer_dir, 'merges.txt')
    tokenizer.model = models.BPE.from_file(vocab_fn, merge_fn)
    tokenizer.add_special_tokens(['[UNK]', '[PAD]', '[BOS]', '[EOS]'])
    return tokenizer
def configure(self):
    if Tokenizer is None:
        raise ConfigError(
            "Annotation converter wikitext2raw requires the tokenizers package. "
            "Please install it before use.")
    self.testing_file = self.get_value_from_config('testing_file')
    self.vocab_file = self.get_value_from_config('vocab_file')
    self.merges_file = self.get_value_from_config('merges_file')
    self.max_seq_length = int(self.get_value_from_config('max_seq_length'))
    self.tokenizer = Tokenizer(
        BPE(str(self.vocab_file), str(self.merges_file)))
    self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=False)
    self.tokenizer.decoder = decoders.ByteLevel()
def get_tokenizer_trainer():
    # START init_tokenizer_trainer
    from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

    tokenizer = Tokenizer(models.Unigram())
    tokenizer.normalizer = normalizers.NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.UnigramTrainer(
        vocab_size=20000,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        special_tokens=["<PAD>", "<BOS>", "<EOS>"],
    )
    # END init_tokenizer_trainer
    trainer.show_progress = False
    return tokenizer, trainer
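The returned pair is presumably consumed along these lines (the corpus is a stand-in):

tokenizer, trainer = get_tokenizer_trainer()

corpus = ["an example sentence", "another example sentence"]
tokenizer.train_from_iterator(corpus, trainer=trainer)
print(tokenizer.encode("an example").tokens)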
def converted(self) -> Tokenizer:
    vocab = self.original_tokenizer.encoder
    merges = list(self.original_tokenizer.bpe_ranks.keys())
    unk_token = self.original_tokenizer.unk_token

    tokenizer = Tokenizer(
        BPE(
            vocab=vocab,
            merges=merges,
            dropout=None,
            continuing_subword_prefix="",
            end_of_word_suffix="</w>",
            fuse_unk=False,
            unk_token=str(unk_token),
        ))

    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFC(),
        normalizers.Replace(Regex(r"\s+"), " "),
        normalizers.Lowercase()
    ])
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.Split(
            Regex(
                r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"""
            ),
            behavior="removed",
            invert=True,
        ),
        pre_tokenizers.ByteLevel(add_prefix_space=False),
    ])
    tokenizer.decoder = decoders.ByteLevel()

    # Hack to get both a ByteLevel post-processor and TemplateProcessing-style
    # special tokens
    tokenizer.post_processor = processors.RobertaProcessing(
        sep=(self.original_tokenizer.eos_token,
             self.original_tokenizer.eos_token_id),
        cls=(self.original_tokenizer.bos_token,
             self.original_tokenizer.bos_token_id),
        add_prefix_space=False,
        trim_offsets=False,
    )
    return tokenizer
def converted(self) -> Tokenizer:
    vocab = self.original_tokenizer.encoder
    merges = list(self.original_tokenizer.bpe_ranks.keys())

    tokenizer = Tokenizer(
        BPE(
            vocab=vocab,
            merges=merges,
            dropout=None,
            continuing_subword_prefix="",
            end_of_word_suffix="",
            fuse_unk=False,
        ))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=self.original_tokenizer.add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
    return tokenizer
def main(args):
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files: {}".format(files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name),
                       pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(
            os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES string: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))
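A quick smoke test of a tokenizer trained this way; the file name and SMILES string below are illustrative:

import os
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file(os.path.join('tokenizers', 'smiles_bpe.json'))  # hypothetical file
enc = tokenizer.encode('CC(=O)Oc1ccccc1C(=O)O')  # aspirin as a sample SMILES
print(enc.tokens, enc.ids)
print(tokenizer.decode(enc.ids))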
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
""".split("\n")

if args.type == "gpt2":
    print("Running GPT-2 tokenizer")
    tok_p = GPT2Tokenizer.from_pretrained('gpt2')

    # Create a Tokenizer using BPE
    tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
    # Use ByteLevel PreTokenizer
    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    # Use ByteLevel Decoder
    tok_r.decoder = decoders.ByteLevel()
elif args.type == "bert":
    print("Running Bert tokenizer")
    tok_p = BertTokenizer.from_pretrained(args.vocab)

    tok_r = Tokenizer(
        WordPiece.from_files(args.vocab,
                             unk_token="[UNK]",
                             max_input_chars_per_word=100))
    tok_r.normalizer = BertNormalizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
    )
    # tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
def main():
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, got {} tokens".format(
        args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE.from_file(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Runtime')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()

    # read IR
    log.info('Reading model {}'.format(args.model))
    model = core.read_model(args.model)

    # check number of inputs and outputs
    if len(model.inputs) != 1:
        raise RuntimeError(
            'The demo expects model with single input, while provided {}'.format(
                len(model.inputs)))
    if len(model.outputs) != 1:
        raise RuntimeError(
            'The demo expects model with single output, while provided {}'.format(
                len(model.outputs)))
    input_tensor = model.inputs[0].any_name

    if not args.dynamic_shape and (
            model.inputs[0].partial_shape.is_dynamic
            or model.inputs[0].shape[1] != args.max_seq_len):
        model.reshape({
            input_tensor:
            PartialShape([Dimension(1), Dimension(args.max_seq_len)])
        })

    if args.dynamic_shape:
        model.reshape({
            input_tensor:
            PartialShape([Dimension(1), Dimension(0, args.max_seq_len)])
        })

    # load model to the device
    compiled_model = core.compile_model(model, args.device)
    output_tensor = compiled_model.outputs[0]
    infer_request = compiled_model.create_infer_request()
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.input:
        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:
        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by the network at once
        max_length = args.max_seq_len

        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        while True:
            model_input = input_ids
            if not args.dynamic_shape:
                # pad the rest of the request
                pad_len = max_length - cur_input_len
                model_input = np.concatenate(
                    (input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for OpenVINO runtime
            inputs = {
                input_tensor: model_input,
            }

            # infer by OpenVINO runtime
            t_start = time.perf_counter()
            outputs = infer_request.infer(inputs)[output_tensor]
            t_end = time.perf_counter()
            t_count += 1
            log.info(
                "Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)"
                .format(model_input.shape[1], 1 / (t_end - t_start), t_end - t_start))

            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits,
                                               eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores, args.top_k)
            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores, args.top_p)

            # get next token id
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1], 1,
                                           p=probs[0], replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num),
                             eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]
        log.info(
            "{} requests were processed in {:0.2f}sec ({:0.2}sec per request)".format(
                t_count, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
def train_custom_tokenizer(dataset,
                           token_model,
                           tknzr_file,
                           vocab_size,
                           vocab=None,
                           pretrain_fast=False,
                           max_input_chars_per_word=None,
                           eos_token=None,
                           bos_token=None,
                           pad_token=None,
                           mask_token=None,
                           unk_token=None):
    """
    Building a Tokenizer using the HuggingFace library. The pipeline is:

    - Model : the algorithm that tokenizes; it is the one mandatory component.
      Only 4 models are implemented (BPE, Unigram, WordLevel, WordPiece)
    - Normalizer : optional preprocessing that happens before tokenization
    - Pre-Tokenizer : splits the input according to some rules
    - Post-Processing : adds tokens/input afterwards (mostly eos, bos tokens)
    - Decoder : reverses certain pipeline steps for proper decoding
    - Trainer : the training algorithm corresponding to the model

    Note : Some pre-processing might need to happen beforehand in previous
    functions (might be easier using pandas before)

    Input
        token_model (str) : algorithm to use for tokenization
        dataset (class) : a python iterator that goes through the data to be
            used for training
        token_dir (str) : directory with tokenizers
        vocab_size (int) : size of the vocabulary to use
        tokenFilename (str) : filename of the particular tokenizer we want to
            train. Will overwrite previously saved files.
        vocab (list of str) : models other than BPE can take an optional vocab
            as input
        max_input_chars_per_word : used for WordPiece

    Output
        tokenizer : a HuggingFace Tokenizer object, our fully trained tokenizer
    """
    special_token_lst = [
        pad_token, bos_token, eos_token, mask_token, unk_token
    ]

    # NFKC
    normalizer_lst = []
    pre_tokenizer_lst = [Whitespace, ByteLevel]
    decoder_lst = []

    bos_idx = special_token_lst.index(bos_token)
    eos_idx = special_token_lst.index(eos_token)

    if token_model == 'BPE':
        model = BPE(unk_token=unk_token)
        Trainer = BpeTrainer
    elif token_model == 'Unigram':
        model = Unigram(vocab=vocab)
        Trainer = UnigramTrainer
    elif token_model == 'WordLevel':
        model = WordLevel(unk_token=unk_token, vocab=vocab)
        Trainer = WordLevelTrainer
    elif token_model == 'WordPiece':
        model = WordPiece(unk_token=unk_token,
                          vocab=vocab,
                          max_input_chars_per_word=max_input_chars_per_word)
        Trainer = WordPieceTrainer
    else:
        error_msg = f'Error: token_model ({token_model}) not an algorithm in %s' \
            % VALID_TOKENIZATIONS
        raise SystemExit(error_msg)

    # instantiation
    tokenizer = Tokenizer(model)

    # Select a tokenization trainer
    if vocab_size is None:
        trainer = Trainer(show_progress=True, special_tokens=special_token_lst)
    else:
        trainer = Trainer(vocab_size=vocab_size,
                          show_progress=True,
                          special_tokens=special_token_lst)

    # Set the normalizer
    tokenizer.normalizer = normalizers.Sequence(
        [fcn() for fcn in normalizer_lst])

    # Set the pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [fcn() for fcn in pre_tokenizer_lst])

    # Set the post-processing
    tokenizer.post_processor = processors.TemplateProcessing(
        single=bos_token + " $A " + eos_token,
        special_tokens=[(bos_token, bos_idx), (eos_token, eos_idx)],
        # pair=bos_token + " $A " + eos_token + " $B:1 " + eos_token + ":1",
    )

    # Set the decoder
    if ByteLevel in pre_tokenizer_lst:
        tokenizer.decoder = decoders.ByteLevel()
    if Metaspace in pre_tokenizer_lst:
        tokenizer.decoder = decoders.Metaspace()
    if token_model == 'WordPiece':
        tokenizer.decoder = decoders.WordPiece()

    # creating iterator
    def batch_iterator():
        for i in np.arange(0, len(dataset)):
            yield dataset[i]

    # train call
    tokenizer.train_from_iterator(trainer=trainer,
                                  iterator=batch_iterator(),
                                  length=len(dataset))

    if Path(tknzr_file).exists():
        print(f"Warning: overwriting previously saved tokenizer with "
              f"the same filename ({tknzr_file}).")
    tokenizer.save(tknzr_file)

    if pretrain_fast:
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        tokenizer = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tokenizer.pad_token = pad_token
    tokenizer.mask_token = mask_token

    return tokenizer
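A hedged example call; the token names and toy dataset are placeholders, and pretrain_fast=True is used since PreTrainedTokenizerFast is the wrapper that accepts tokenizer_file:

tok = train_custom_tokenizer(
    dataset=["some text", "more text", "even more text"],
    token_model='BPE',
    tknzr_file='custom_tokenizer.json',
    vocab_size=1000,
    pretrain_fast=True,
    bos_token='<bos>', eos_token='<eos>',
    pad_token='<pad>', mask_token='<mask>', unk_token='<unk>',
)
print(tok("some text")["input_ids"])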
            for s in g:
                f.write(s)
                f.write("\n\n")
    elif args.file_type == 'txt':
        shutil.copyfile(str(arch), str(fp))

    data_files = glob(str(out_path / "*.txt"))
    data_files = random.sample(data_files, int(0.2 * len(data_files)))
    assert len(data_files) > 0, 'No data files found'

    # Initialize a tokenizer
    tokenizer = Tokenizer(models.BPE())

    # Customize pre-tokenization and decoding
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.normalizer = NFKC()

    # And then train
    trainer = trainers.BpeTrainer(
        vocab_size=args.vocab_size,
        min_frequency=2,
        special_tokens=["<|endoftext|>", "<|padding|>"])
    tokenizer.train(data_files, trainer=trainer)

    # And save it
    tokenizer_path = out_path / "byte-level-bpe.tokenizer.json"
    tokenizer.save(str(tokenizer_path), pretty=True)
    print(f'tokenizer saved at {str(tokenizer_path)}')
    return tokenizer_path
def main():
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, got {} tokens".format(
        args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    ie = IECore()

    # read IR
    model_xml = args.model
    model_bin = model_xml.with_suffix(".bin")
    log.info('Reading model {}'.format(args.model))
    ie_net = ie.read_network(model=model_xml, weights=model_bin)

    # check input and output names
    if len(ie_net.input_info) != 1:
        raise RuntimeError(
            'The demo expects model with single input, while provided {}'.format(
                len(ie_net.input_info)))
    if len(ie_net.outputs) != 1:
        raise RuntimeError(
            'The demo expects model with single output, while provided {}'.format(
                len(ie_net.outputs)))
    input_names = next(iter(ie_net.input_info))
    output_names = next(iter(ie_net.outputs))

    # load model to the device
    ie_net_exec = ie.load_network(network=ie_net, device_name=args.device)
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.input:
        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:
        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by the network at once
        max_length = ie_net.input_info[input_names].input_data.shape[1]

        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        while True:
            # pad the rest of the request
            pad_len = max_length - cur_input_len
            model_input = np.concatenate(
                (input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for IE
            inputs = {
                input_names: model_input,
            }

            # infer by IE
            t_start = time.perf_counter()
            res = ie_net_exec.infer(inputs=inputs)
            t_end = time.perf_counter()
            t_count += 1
            log.info(
                "Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)"
                .format(max_length, 1 / (t_end - t_start), t_end - t_start))

            outputs = res[output_names]
            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits,
                                               eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores, args.top_k)
            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores, args.top_p)

            # get next token id
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1], 1,
                                           p=probs[0], replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num),
                             eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]
        log.info(
            "{} requests of {} length were processed in {:0.2f}sec ({:0.2}sec per request)"
            .format(t_count, max_length, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
def train_tokenizer_vocab(dataset, style='BPE', force_retrain=True):
    """
    if force_retrain: overwrite the stored tokenizer from tokenizers dir (by retraining)
    else: load the tokenizer if it exists
    """
    assert dataset in VALID_DATASETS
    assert style in VALID_TOKENIZATIONS

    tpath_expected = default_tpath(dataset, style)

    train = True
    if not force_retrain and os.path.isfile(tpath_expected):
        tokenizer = Tokenizer.from_file(tpath_expected)
        train = False
    else:
        print('%s tokenizer file does not exist; training new tokenizer' %
              tpath_expected)

    if train:
        # load data associated with one of the valid datasets (from /data/ directory)
        datafiles = load_dataset(dataset)

        # Steps for each algo (e.g. BPE):
        # - init Tokenizer using algo
        # - specify algo-specific trainer
        # - specify any pre-processing of text (will affect decoding)
        #   see: https://huggingface.co/docs/tokenizers/python/latest/components.html#decoders
        # - different training calls if it's the arxiv dataset or wikitext
        #   see https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/

        if style == 'BPE':
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            trainer = BpeTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = ByteLevel()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.ByteLevel()
        else:
            assert style == 'WordLevel'
            tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
            trainer = WordLevelTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = Whitespace()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.WordPiece()  # WordPiece seems to work (adds back spaces)

        # Save to tokenizers directory
        tokenizer.save(tpath_expected)

    # Generate vocab object based on tokenizer.decoder() method
    # ... TODO implement the same vocabulary functionality, or ensure it is
    # present in Tokenizer and then code it elsewhere...
    # Features we need to match:
    #   from torchtext.legacy.vocab import Vocab as RetiredVocab
    #   ntokens = len(vocab.stoi)  --->  ntokens = tokenizer.(...)
    #   data = [torch.tensor([vocab[token] for token in tokenizer(item)],
    #                        dtype=torch.long) for item in raw_text_iter]
    #   tokenized_text_ints = torch.tensor(
    #       [vocab[token] for token in tokenized_text], dtype=torch.long)
    #   running_context_string = ' '.join([vocab.itos[src[k]] for k in range(src.shape[0])])
    #   unk_index = vocab.unk_index
    vocab = None

    return tokenizer, vocab
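Presumed usage, per the docstring (valid dataset names depend on VALID_DATASETS; 'wikitext' is assumed here):

tokenizer, _ = train_tokenizer_vocab('wikitext', style='BPE', force_retrain=False)

enc = tokenizer.encode("A sample sentence for the trained tokenizer.")
print(enc.tokens)
print(tokenizer.decode(enc.ids))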