def __init__(self, opt: Opt, shared: TShared = None):
    super().__init__(opt, shared)
    # Default true for HF
    self.special_tok_map = {}  # map from HF
    self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
    if self.add_prefix_space is None:
        self.add_prefix_space = True
    if opt.get('dict_loaded'):
        dfname = opt['dict_file']
        if PathManager.exists(f'{dfname}-merges.txt'):
            opt['bpe_merge'] = f'{dfname}-merges.txt'
        if PathManager.exists(f'{dfname}-vocab.json'):
            opt['bpe_vocab'] = f'{dfname}-vocab.json'
    try:
        from tokenizers import ByteLevelBPETokenizer
    except ImportError:
        raise ImportError(
            'Please install HuggingFace tokenizer with: pip install tokenizers'
        )

    if self.bpe_dropout:
        raise NotImplementedError(
            '--bpe-dropout is not supported with ByteLevelBPE because tokenizers '
            'library does not allow dynamically turning BPE on/off. You can use '
            '--dict-tokenizer slow_bytelevel_bpe to gain this feature.'
        )

    if self.lower:
        warn_once('Are you sure you want to lower case your BPE dictionary?')
    if self.maxtokens > 0 or self.minfreq > 0:
        raise ValueError(
            'You should not filter the vocabulary when using --dict-tokenizer bytelevelbpe'
            ' (no --dict-minfreq or --dict-maxtokens).'
        )
    if 'bpe_vocab' not in opt:
        raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
    if 'bpe_merge' not in opt:
        raise ValueError('--bpe-merge is required for loading pretrained tokenizer')

    self.vocab_path = opt['bpe_vocab']
    self.merge_path = opt['bpe_merge']
    if not self.vocab_path or not self.merge_path:
        raise IOError(
            '--bpe-vocab and --bpe-merge are mandatory with '
            '--dict-tokenizer bytelevelbpe'
        )
    if not PathManager.exists(self.vocab_path):
        raise IOError(
            f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
        )
    if not PathManager.exists(self.merge_path):
        raise IOError(
            f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
        )

    self.tokenizer = ByteLevelBPETokenizer(
        self.vocab_path, self.merge_path, self.add_prefix_space
    )
class HuggingFaceBpeHelper(object):
    @staticmethod
    def add_cmdline_args(argparser):
        parser = argparser.add_argument_group('ByteLevelBPE Arguments')
        parser.add_argument(
            '--bpe-vocab', type=str, help='path to pre-trained tokenizer vocab'
        )
        parser.add_argument(
            '--bpe-merge', type=str, help='path to pre-trained tokenizer merge'
        )
        parser.add_argument(
            '--bpe-add-prefix-space',
            type='bool',
            hidden=True,
            default=True,
            help='add prefix space before encoding',
        )
        return parser

    def __init__(self, opt: Opt, shared=None):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )
        if 'bpe_vocab' not in opt:
            raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError('--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']
        if not self.vocab_path or not self.merge_path:
            raise IOError(
                '--bpe-vocab and --bpe-merge are mandatory with '
                '--dict-tokenizer bytelevelbpe'
            )
        if not os.path.isfile(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not os.path.isfile(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        self.tokenizer = ByteLevelBPETokenizer(
            self.vocab_path, self.merge_path, self.add_prefix_space
        )

    def encode(self, text: str) -> List[str]:
        return self.tokenizer.encode(text).tokens

    def decode(self, x: List[str]) -> str:
        # decode expects a list of token ids, not a generator
        return self.tokenizer.decode([self.tokenizer.token_to_id(c) for c in x])
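For orientation, a minimal usage sketch of the helper above (not from the original project): the opt keys mirror the flags registered in add_cmdline_args, and the gpt2-vocab.json / gpt2-merges.txt paths are placeholders for any pretrained byte-level BPE vocab/merges pair that exists on disk.

# Hypothetical usage sketch; file paths are placeholders.
opt = {
    'bpe_vocab': 'gpt2-vocab.json',
    'bpe_merge': 'gpt2-merges.txt',
    'bpe_add_prefix_space': True,
}
bpe = HuggingFaceBpeHelper(opt)
tokens = bpe.encode("Hello world")   # subword strings, e.g. ['ĠHello', 'Ġworld']
text = bpe.decode(tokens)            # maps tokens back through token_to_id and decodes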
class HuggingFaceByteLevelBPE(object):
    def __init__(self, cfg):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                "Please install huggingface/tokenizers with: pip install tokenizers"
            )

        bpe_vocab = file_utils.cached_path(cfg.bpe_vocab)
        bpe_merges = file_utils.cached_path(cfg.bpe_merges)

        self.bpe = ByteLevelBPETokenizer(
            bpe_vocab,
            bpe_merges,
            add_prefix_space=cfg.bpe_add_prefix_space,
        )

    def encode(self, x: str) -> str:
        return " ".join(map(str, self.bpe.encode(x).ids))

    def decode(self, x: str) -> str:
        return self.bpe.decode(
            [int(tok) if tok not in {"<unk>", "<mask>"} else tok for tok in x.split()]
        )

    def is_beginning_of_word(self, x: str) -> bool:
        return self.decode(x).startswith(" ")
def test_basic_encode(self, roberta_files):
    tokenizer = ByteLevelBPETokenizer(roberta_files["vocab"], roberta_files["merges"])
    output = tokenizer.encode("The quick brown fox jumps over the lazy dog")

    assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
    assert output.tokens == [
        "The",
        "Ġquick",
        "Ġbrown",
        "Ġfox",
        "Ġjumps",
        "Ġover",
        "Ġthe",
        "Ġlazy",
        "Ġdog",
    ]
    assert output.offsets == [
        (0, 3),
        (3, 9),
        (9, 15),
        (15, 19),
        (19, 25),
        (25, 30),
        (30, 34),
        (34, 39),
        (39, 43),
    ]
def __init__(self, version):
    # download vocab files
    cache = os.path.join(os.environ.get("CACHE_DIR", os.getcwd()), ".vector_cache")
    vocab_dir = os.path.join(cache, f"{version}")
    if not os.path.exists(vocab_dir):
        pretrained_tokenizer = AutoTokenizer.from_pretrained(version)
        pretrained_tokenizer.save_pretrained(vocab_dir)

    if "uncased" in version or "cased" not in version:
        lowercase = True   # roberta, electra, bert-base-uncased
    else:
        lowercase = False  # bert-cased

    if version.startswith("bert") or "electra" in version:
        vocab_path = os.path.join(vocab_dir, "vocab.txt")
        self.tokenizer = BertWordPieceTokenizer(vocab_path, lowercase=lowercase)
    elif version.startswith("roberta"):
        vocab_path = os.path.join(vocab_dir, "vocab.json")
        merge_path = os.path.join(vocab_dir, "merges.txt")
        self.tokenizer = ByteLevelBPETokenizer(vocab_path, merge_path, lowercase=lowercase)
    else:
        raise NotImplementedError

    self.cls_token = self.tokenizer._parameters["cls_token"]
    self.cls_token_id = self.tokenizer.token_to_id(self.cls_token)
    self.sep_token = self.tokenizer._parameters["sep_token"]
    self.sep_token_id = self.tokenizer.token_to_id(self.sep_token)
    self.pad_token = self.tokenizer._parameters["pad_token"]
    self.pad_token_id = self.tokenizer.token_to_id(self.pad_token)
class LineByLineTextDataset(Dataset):
    def __init__(self, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)
        self.block_size = block_size
        self.tokenizer = ByteLevelBPETokenizer(
            os.path.join(args.tokenizer_name, "vocab.json"),
            os.path.join(args.tokenizer_name, "merges.txt"),
        )
        self.tokenizer._tokenizer.post_processor = RobertaProcessing(
            ("</s>", self.tokenizer.token_to_id("</s>")),
            ("<s>", self.tokenizer.token_to_id("<s>")),
        )
        self.tokenizer.enable_truncation(max_length=block_size)

        logger.info("Creating features from dataset file at %s", file_path)

        self.examples = []
        with open(file_path, encoding="utf-8") as f:
            for line in f:
                if len(line) > 0 and not line.isspace():
                    self.examples.append(line)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(
            self.tokenizer.encode(self.examples[i]).ids[: self.block_size - 2],
            dtype=torch.long,
        )
def prepare_data(self, *args, **kwargs):
    dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")
    column_names = dataset.column_names

    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    if (not os.path.exists("data/wiki-vocab.json")) or (
            not os.path.exists("data/wiki-merges.txt")):
        print('TRAIN TOKENIZER')
        self.tokenizer.train_from_iterator(batch_iterator(), vocab_size=self.vocab_size)
        self.tokenizer.save_model("data/", "wiki")
    else:
        self.tokenizer = ByteLevelBPETokenizer(
            "data/wiki-vocab.json", "data/wiki-merges.txt", add_prefix_space=True)

    dataset = load_dataset("wikitext", "wikitext-103-raw-v1")

    def tokenize_function(examples):
        return {
            'input_ids': list(
                map(lambda x: x.ids, self.tokenizer.encode_batch(examples['text'])))
        }

    self.tokenized_dataset = dataset.map(
        tokenize_function, batched=True, remove_columns=column_names, num_proc=4)
def main(vocab, merges, data_path, lower, save_path):
    tokenizer = ByteLevelBPETokenizer(vocab, merges, lowercase=lower, add_prefix_space=True)
    sentiment_hash = dict(
        (v[1:], tokenizer.token_to_id(v)) for v in ('Ġpositive', 'Ġnegative', 'Ġneutral'))
    print(sentiment_hash)

    train = pd.read_csv(os.path.join(data_path, 'train.csv'))
    dataset = []
    n = nm = 0
    score = 0
    for line, row in train.iterrows():
        if pd.isna(row.text) and pd.isna(row.selected_text):
            continue
        try:
            ann = annotate(tokenizer, row.text, row.selected_text.strip(' '))
        except AssertionError:
            print(row.text, row.selected_text.strip(' '))
            continue
        ann['sentiment'] = sentiment_hash[row.sentiment]
        ann['id'] = row.textID
        dataset.append(ann)
        decode = ann['text'][ann['offsets'][ann['start']][0]:ann['offsets'][ann['end']][1]]
        if set(decode.split()) != set(ann['gt'].split()):
            nm += 1
        score += jaccard(decode, ann['gt'])
        n += 1

    print(f'not match {nm/n}\nBest score {score/n}')
    if not lower:
        save_path = 'cased_' + save_path
    joblib.dump(dataset, save_path, compress='zlib')
def test_train_from_iterator(self):
    text = ["A first sentence", "Another sentence", "And a last one"]
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train_from_iterator(text, show_progress=False)

    output = tokenizer.encode("A sentence")
    assert output.tokens == ["A", "Ġsentence"]
class HuggingFaceByteLevelBPE(object):
    @staticmethod
    def add_args(parser):
        # fmt: off
        parser.add_argument('--bpe-merges', help='path to merges.txt')
        parser.add_argument('--bpe-vocab', help='path to vocab.json')
        parser.add_argument('--bpe-add-prefix-space', action='store_true',
                            help='add prefix space before encoding')
        # fmt: on

    def __init__(self, args):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                "Please install huggingface/tokenizers with: pip install tokenizers"
            )

        self.bpe = ByteLevelBPETokenizer(
            args.bpe_vocab,
            args.bpe_merges,
            add_prefix_space=getattr(args, "bpe_add_prefix_space", False),
        )

    def encode(self, x: str) -> str:
        return " ".join(map(str, self.bpe.encode(x).ids))

    def decode(self, x: str) -> str:
        return self.bpe.decode(
            [int(tok) if tok not in {"<unk>", "<mask>"} else tok for tok in x.split()]
        )

    def is_beginning_of_word(self, x: str) -> bool:
        return self.decode(x).startswith(" ")
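A sketch of the round trip this wrapper implements, where token ids travel as a space-joined string; the Namespace fields and file paths below are placeholders and must point at an existing pretrained vocab/merges pair.

import argparse

# Hypothetical usage; vocab.json and merges.txt are placeholder paths.
args = argparse.Namespace(bpe_vocab="vocab.json", bpe_merges="merges.txt",
                          bpe_add_prefix_space=False)
bpe = HuggingFaceByteLevelBPE(args)
ids_str = bpe.encode("Hello world")   # token ids joined into one string
original = bpe.decode(ids_str)        # back to "Hello world"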
def get_tokenizer(path):
    tokenizer = ByteLevelBPETokenizer(path + 'vocab.json', path + 'merges.txt')
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    return tokenizer
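A quick check of what the BertProcessing post-processor adds (a hypothetical call; the directory is a placeholder and the vocab is assumed to contain the <s> and </s> special tokens):

tokenizer = get_tokenizer("./my_tokenizer/")
enc = tokenizer.encode("hello world")
print(enc.tokens)   # every encoding is now wrapped as ['<s>', ..., '</s>']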
def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = False,
) -> None:
    """
    Trains a byte-level BPE tokenizer on the text(s), wrapping the tokenizers package.
    See: https://huggingface.co/blog/how-to-train

    For consistency, this function makes opinionated assumptions.

    :param files: path to file(s) to train tokenizer on
    :param dropout: Training dropout
    :param vocab_size: Final vocabulary size
    :param min_frequency: Minimum number of occurrences to add to vocab
    :param save_path: Where to save the final tokenizer
    :param added_tokens: List of tokens to add to the tokenizer (currently not working)
    :param bos_token: Beginning-of-string special token
    :param eos_token: End-of-string special token
    :param unk_token: Unknown special token
    :param serialize: Whether to save a single serialized tokenizer.json file
    """

    assert isinstance(files, str) or isinstance(files, list), \
        "files must be a string or a list."
    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    tokenizer = ByteLevelBPETokenizer(dropout=dropout)

    tokenizer.train(
        files=files,
        vocab_size=vocab_size - len(added_tokens),
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token],
    )

    tokenizer.add_tokens(added_tokens)

    PREFIX = "aitextgen"
    save_path_str = "the current directory" if save_path == "" else save_path
    if serialize:
        logger.info(
            f"Saving {PREFIX}.tokenizer.json to {save_path_str}. "
            + "You will need this file to build the GPT2Tokenizer."
        )
        tokenizer.save(f"{PREFIX}.tokenizer.json")
    else:
        logger.info(
            f"Saving {PREFIX}-vocab.json and {PREFIX}-merges.txt to {save_path_str}. "
            + "You will need both files to build the GPT2Tokenizer."
        )
        tokenizer.save_model(save_path, PREFIX)
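A hypothetical call to the function above, assuming a local plain-text corpus file named corpus.txt; with serialize=False and save_path="." the output lands in the current directory as aitextgen-vocab.json and aitextgen-merges.txt.

# corpus.txt is a placeholder for any plain-text training file.
train_tokenizer(
    "corpus.txt",
    vocab_size=5000,
    min_frequency=2,
    save_path=".",
    serialize=False,
)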
def __init__(self, cfg):
    super().__init__(cfg)
    self.scales = [str((cfg.load_size // (2**i))) for i in range(3)]
    self.scales.reverse()
    self.device_map = {
        'style': self.devices[0],
        'content': self.devices[0],
        'img': self.devices[0]
    }
    self.network_names = [
        'style_model', 'content_model', 'generator', 'discriminators'
    ]
    self.device_name_map = {
        'style_model': 'style',
        'content_model': 'content',
        'generators': 'img',
        'discriminators': 'img'
    }

    tokenizer = ByteLevelBPETokenizer(
        "vocab.json",
        "merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )

    self.cold = True
    self.language_model = LanguageModel(cfg, tokenizer, self.device_map['style']).to(
        self.device_map['style'])
    self.content_model = VAE(cfg.rnn_hidden_dim, self.device_map['style'], cfg).to(
        self.device_map['style'])
    self.style_model = VAE(cfg.rnn_hidden_dim, self.device_map['style'], cfg).to(
        self.device_map['style'])
    self.generator = StyleGenerator(cfg).to(self.device_map['img'])
    self.discriminator = FeatureConvolutionalDiscriminator(cfg).to(
        self.device_map['img'])

    self.visual_names = ['visual_dict']
    self.visual_dict = {'real': None, 'fake': None}
    self.loss_names = ['loss']
    self.visualizer = Visualizer(cfg)

    self.generator_criterion = BinaryCrossEntropyLoss(cfg).to(self.device_map['img'])
    self.consistency_criterion = ColorConsistencyLoss(cfg).to(self.device_map['img'])
    self.distribution_criterion = KLDLoss().to(self.device_map['img'])

    self.latent_scale = int(cfg.load_size // (2**6))
    self.latent_channels = int(cfg.latent_dim) // (self.latent_scale**2)
    self.channels_z = 8 * self.cfg.ngf - self.latent_channels
def inference(checkpoint_path, hyperparameters_path, tokenizer_path, merges_path,
              input='In 1691 Moscow established ', generated_length=64,
              random_selection=True):
    # Initialize tokenizer and model from files
    tokenizer = ByteLevelBPETokenizer(
        tokenizer_path,
        merges_path,
        add_prefix_space=True,
    )
    #tokenizer2 = Tokenizer(BPE(unk_token="[UNK]"))
    #tokenizer2.pre_tokenizer2 = Whitespace()
    #tokenizer2 = Tokenizer.from_file("example/tokenizer.json")

    # Initialize model
    model = LMModel.load_from_checkpoint(checkpoint_path=checkpoint_path,
                                         hparams_file=hyperparameters_path)

    # Tokenize input sample
    encoded_sample = tokenizer.encode(input).ids

    for i in range(generated_length):
        input_ids = torch.unsqueeze(torch.tensor(encoded_sample, dtype=torch.long), axis=0)
        # Inference
        output, attn = model(input_ids)
        last_word = output[0][-1]
        if not random_selection:
            # Pick the highest-probability token from the output distribution
            prediction = torch.argmax(output, axis=2).squeeze(axis=0).tolist()[-1]
        else:
            # Sample tokens according to their probabilities
            prediction = torch.multinomial(torch.softmax(last_word, 0)**10, 1)[0]
        # Add prediction to sequence
        encoded_sample.append(prediction)

    # Detokenize output sample
    decoded_output = tokenizer.decode(encoded_sample)
    #decoded_output2 = tokenizer2.decode(encoded_sample)
    output_tokens = [tokenizer.id_to_token(int(id)) for id in encoded_sample]
    #output_tokens2 = [tokenizer2.id_to_token(int(id)) for id in encoded_sample]
    #print('\n========================\n ORIGINAL BPE \n========================')
    #print(output_tokens2, decoded_output2, sep='\n')
    #print('\n========================\n MODIFIED BPE \n========================')

    return decoded_output, output_tokens, attn
def test_tokenizer(test_sentence, vocab_path, merge_path):
    r"""
    Illustrates how the individual Tokenizer works

    Args:
        test_sentence (:obj:`str`):
            Sentence for demonstration purposes
        vocab_path (:obj:`str`):
            Path where the vocabulary (most frequent tokens ranked by frequency) is saved
        merge_path (:obj:`str`):
            Path where the merges file is saved
    """
    tokenizer = ByteLevelBPETokenizer(vocab_path, merge_path)
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")))
    tokenizer.enable_truncation(max_length=512)

    print("Original sentence " + test_sentence)
    print("Encoded string: {}".format(tokenizer.encode(test_sentence).tokens))

    encoding = tokenizer.encode(test_sentence)
    decoded = tokenizer.decode(encoding.ids)
    print("Decoded string: {}".format(decoded))
def train_tokenizer(data_path, wiki_text_file_path):
    # ToDo := Load if weights exist, else set up
    tokenizer_en = GPT2TokenizerFast.from_pretrained("gpt2")
    tokenizer_en.pad_token = tokenizer_en.eos_token
    vocab_size = tokenizer_en.vocab_size
    max_length = 1024

    tokenizer_es = ByteLevelBPETokenizer()
    tokenizer_es.train(
        files=[str(wiki_text_file_path)],
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[EOF_TOKEN]
    )
    tokenizer_es.enable_truncation(max_length=max_length)

    tokenizer_es_path = data_path/"BLBPE_tokenizer_es"
    tokenizer_es_path.mkdir(exist_ok=True, parents=True)
    tokenizer_es.save_model(str(tokenizer_es_path))

    tokenizer_es = GPT2TokenizerFast.from_pretrained(
        str(tokenizer_es_path), pad_token=EOF_TOKEN
    )
    tokenizer_es.model_max_length = max_length

    # tokenizer_es = ByteLevelBPETokenizer(
    #     vocab_file=str(tokenizer_es_path/"vocab.json"),
    #     merges_file=str(tokenizer_es_path/"merges.txt"),
    # )
    # tokenizer_es.enable_truncation(max_length=1024)

    # ToDo := is this necessary
    # tokenizer_en.pad_token = tokenizer_en.eos_token
    return tokenizer_en, tokenizer_es
def __init__(self, opt: Opt, shared: TShared = None):
    super().__init__(opt, shared)
    # Default true for HF
    self.special_tok_map = {}  # map from HF
    self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
    self.skip_special_tokens = opt.get('hf_skip_special_tokens', True)
    if self.add_prefix_space is None:
        self.add_prefix_space = True
    if opt.get('dict_loaded'):
        dfname = opt['dict_file']
        if os.path.isfile(f'{dfname}-merges.txt'):
            opt['bpe_merge'] = f'{dfname}-merges.txt'
        if os.path.isfile(f'{dfname}-vocab.json'):
            opt['bpe_vocab'] = f'{dfname}-vocab.json'
    try:
        from tokenizers import ByteLevelBPETokenizer
    except ImportError:
        raise ImportError(
            'Please install HuggingFace tokenizer with: pip install tokenizers'
        )

    if self.lower:
        warn_once('Are you sure you want to lower case your BPE dictionary?')
    if self.maxtokens > 0 or self.minfreq > 0:
        raise ValueError(
            'You should not filter the vocabulary when using --dict-tokenizer bytelevelbpe'
            ' (no --dict-minfreq or --dict-maxtokens).'
        )
    if 'bpe_vocab' not in opt:
        raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
    if 'bpe_merge' not in opt:
        raise ValueError('--bpe-merge is required for loading pretrained tokenizer')

    self.vocab_path = opt['bpe_vocab']
    self.merge_path = opt['bpe_merge']
    if not self.vocab_path or not self.merge_path:
        raise IOError(
            '--bpe-vocab and --bpe-merge are mandatory with '
            '--dict-tokenizer bytelevelbpe'
        )
    if not os.path.isfile(self.vocab_path):
        raise IOError(
            f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
        )
    if not os.path.isfile(self.merge_path):
        raise IOError(
            f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
        )

    self.tokenizer = ByteLevelBPETokenizer(
        self.vocab_path, self.merge_path, self.add_prefix_space
    )
def save_sentense_piece_model():
    ko_paths = ['./data/korean-english-park.dev.ko', './data/korean-english-park.train.ko']
    en_paths = ['./data/korean-english-park.dev.en', './data/korean-english-park.train.en']
    special_token = ["<pad>", "<bos>", "<eos>", "<unk>", "<mask>"]

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=ko_paths, vocab_size=32000, min_frequency=2,
                    special_tokens=special_token)
    tokenizer.save("./create_spm", "ko")

    tokenizer.train(files=en_paths, vocab_size=32000, min_frequency=2,
                    special_tokens=special_token)
    tokenizer.save("./create_spm", "en")
def __init__(self, model_name, vocab_file, *, merges_file=None, lowercase=True,
             handle_chinese_chars=False, dropout=None):
    self.model_name = model_name
    if model_name == 'bert':
        self._pad_token = '[PAD]'
        self._sep_token = '[SEP]'
        self._cls_token = '[CLS]'
        self._unk_token = '[UNK]'
        if dropout is not None:
            logger.warning('BPE dropout is not supported by BertWordPieceTokenizer.')
        self.tokenizer = BertWordPieceTokenizer(
            vocab_file,
            lowercase=lowercase,
            handle_chinese_chars=handle_chinese_chars,
            unk_token=self.unk_token,
            cls_token=self.cls_token,
            sep_token=self.sep_token)
    elif model_name == 'roberta':
        if merges_file is None:
            raise AttributeError(
                'To use ByteLevelTokenizer, specify path to merges file.')
        self._pad_token = '<pad>'
        self._sep_token = '</s>'
        self._cls_token = '<s>'
        self._unk_token = '<unk>'
        try:
            self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                                   merges_file=merges_file,
                                                   dropout=dropout)
        except TypeError as e:
            logger.warning('BPE dropout is not supported by ByteLevelBPETokenizer.')
            logger.error(e)
            self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                                   merges_file=merges_file)
    else:
        raise NotImplementedError(
            f'Tokenizer initialization for model {model_name} is not implemented.')
def __init__(self, text_files, dataset_info_path='', config_data=None):
    super().__init__()

    # The default vocab size in the BERT model is 30522. If we want a number larger than
    # that, we will also have to change the BERT configuration.
    vocab_size = 30000
    self.info = f'hug{vocab_size}'

    with open(f'config/data/{config_data}.json') as json_file:
        tokenizer_from = json.load(json_file)['tokenizer_from']
    config_name = config_data if tokenizer_from == "" else tokenizer_from

    print(os.path.join(dataset_info_path,
                       f'tokenizer_{config_name}_{vocab_size}-vocab.json'))

    # The loading is only properly implemented starting from version 0.8. However, it makes
    # the system use a lot of CPU for no reason (it is much slower). Maybe it will be fixed
    # in the future.
    if not os.path.isfile(
            os.path.join(dataset_info_path,
                         f'tokenizer_{config_name}_{vocab_size}-vocab.json')):
        text_files = text_files()
        self.tokenizer = ByteLevelBPETokenizer()
        # Join into a single file. This should NOT be necessary but it does not work
        # properly with a lot of files
        with open('/tmp/text_files.txt', 'wb') as outfile:
            for filename in tqdm(text_files,
                                 desc='Joining all files into one for tokenization'):
                with open(filename, 'rb') as readfile:
                    shutil.copyfileobj(readfile, outfile)
        text_files = '/tmp/text_files.txt'

        self.tokenizer.train(text_files, vocab_size=vocab_size,
                             special_tokens=special_tokens)
        self.tokenizer.save(dataset_info_path, f'tokenizer_{config_name}_{vocab_size}')

    # No "else", always load for consistency
    vocab_file = os.path.join(dataset_info_path,
                              f'tokenizer_{config_name}_{vocab_size}-vocab.json')
    merges_file = os.path.join(dataset_info_path,
                               f'tokenizer_{config_name}_{vocab_size}-merges.txt')
    self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file, merges_file=merges_file)
    self.tokenizer.add_special_tokens(special_tokens)

    self.index_special_tokens = {
        tok: self.tokenizer.encode(tok).ids[0]
        for tok in special_tokens
    }
def __init__(self, tokenizer_dir: str, max_line_length: Union[int, None] = 50,
             padding_id: int = 0):
    super().__init__()
    assert exists(join(tokenizer_dir, "vocab.json")), \
        f"vocab.json file missing in '{tokenizer_dir}'"
    assert exists(join(tokenizer_dir, "merges.txt")), \
        f"merges.txt file missing in '{tokenizer_dir}'"
    self.tokenizer = ByteLevelBPETokenizer(
        vocab_file=join(tokenizer_dir, "vocab.json"),
        merges_file=join(tokenizer_dir, "merges.txt"))
    self.max_line_length = max_line_length
    self.padding_id = padding_id
    self.char_re = re.compile(rf"[^{string.printable}]")
def __init__(self, model_dir, device="cpu"): super().__init__() self.model_dir = abspath(model_dir) assert exists(self.model_dir ), f"model directory '{self.model_dir}' does not exist" assert exists(join(self.model_dir, "classes.json") ), f"classes file does not exist in {self.model_dir}" assert exists( join(self.model_dir, "config.json" )), f"configuration file does not exist in {self.model_dir}" assert exists(join( self.model_dir, "merges.txt")), f"merges file does not exist in {self.model_dir}" assert exists(join( self.model_dir, "weights.pt")), f"weights file does not exist in {self.model_dir}" assert exists(join( self.model_dir, "vocab.json")), f"vocab file does not exist in {self.model_dir}" with open(join(self.model_dir, "classes.json"), "r") as classes_file: self.class_to_index = json.load(classes_file) self.index_to_class = { v: k for k, v in self.class_to_index.items() } with open(join(self.model_dir, "config.json"), "r") as config_file: self.model_config = json.load(config_file) if not torch.cuda.is_available(): device = "cpu" self.device = torch.device(device) self.model = LSTMTagger( vocab_size=self.model_config["vocab_size"], embedding_dim=self.model_config["embedding_dim"], lstm_dim=self.model_config["lstm_dim"], n_classes=len(self.class_to_index)).to(self.device) weights = torch.load(join(self.model_dir, "weights.pt"), map_location=device) self.model.load_state_dict(weights) self.model = self.model.eval() self.tokenizer = ByteLevelBPETokenizer( vocab_file=join(self.model_dir, "vocab.json"), merges_file=join(self.model_dir, "merges.txt"), lowercase=self.model_config["lowercase"]) self.noise_re = re.compile(r"[^A-Za-z ]") self.department_re = re.compile(r"(?:,\s*)?[^,]*Department[^,]*(?:,)", re.IGNORECASE)
def tokenize_hf(df, text_col='text', outfile=None):
    tokenizer = ByteLevelBPETokenizer(
        merges_file="/home/ubuntu/data/mimic/bbpe_tokenizer/mimic-merges.txt",
        vocab_file="/home/ubuntu/data/mimic/bbpe_tokenizer/mimic-vocab.json")
    tok_snts = []
    if outfile is not None:
        f = open(outfile, 'w', encoding='utf8')
    data = df if text_col is None else df[text_col]
    for snt in data:
        tokenized_snt = tokenizer.encode(snt)
        if outfile is not None:
            f.write("{}\n".format("\t".join(tokenized_snt.tokens)))
        else:
            tok_snts.append(tokenized_snt.tokens)
    return tok_snts
def Tok_Train(input_file_path, vocab_size, output_path):
    """Train a simple BPE tokenizer"""
    GPTToken = ByteLevelBPETokenizer(lowercase=True)
    GPTToken.enable_padding()
    GPTToken.train([input_file_path], vocab_size=vocab_size, min_frequency=2,
                   special_tokens=["PAD"])
    GPTToken.save_model(output_path)
    return None
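A hypothetical follow-up showing how the two files written by save_model() are loaded back; corpus.txt is a placeholder training file and output_path must be an existing directory.

output_path = "tokenizer_out"   # must exist before Tok_Train runs
Tok_Train("corpus.txt", vocab_size=8000, output_path=output_path)
loaded = ByteLevelBPETokenizer(
    output_path + "/vocab.json",
    output_path + "/merges.txt",
    lowercase=True,
)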
def load_custom_tokenizer(self, path):
    tokenizer = ByteLevelBPETokenizer(path + "-vocab.json", path + "-merges.txt")
    # Add preprocessing tokens like Roberta
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    return PreTrainedTokenizerFast(tokenizer,
                                   pad_token="<pad>",
                                   mask_token="<mask>",
                                   unk_token="<unk>",
                                   bos_token="<s>",
                                   eos_token="</s>")
def __init__(self, data_dir: str = 'data/wikitext-2', train_batch_size: int = 64,
             val_batch_size: int = 64, dataloader_num_workers: int = 4,
             seq_length: int = 64, vocab_size=30000):
    super().__init__()
    self.train_batch_size = train_batch_size
    self.val_batch_size = val_batch_size
    self.dataloader_num_workers = dataloader_num_workers
    self.seq_length = seq_length
    self.vocab_size = vocab_size
    self.tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)