def test_tokenizer(test_sentence, vocab_path, merge_path):
    r"""
    Illustrates how the individual Tokenizer works

    Args:
        test_sentence (:obj:`str`):
            Sentence for demonstration purposes
        vocab_path (:obj:`str`):
            Path where the vocabulary (most frequent tokens ranked by frequency) is saved
        merge_path (:obj:`str`):
            Path where the merges file is saved
    """
    tokenizer = ByteLevelBPETokenizer(vocab_path, merge_path)
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)

    print("Original sentence: " + test_sentence)
    print("Encoded string: {}".format(tokenizer.encode(test_sentence).tokens))

    encoding = tokenizer.encode(test_sentence)
    decoded = tokenizer.decode(encoding.ids)
    print("Decoded string: {}".format(decoded))
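# Usage sketch (added for illustration, not part of the original snippet):
# assumes ByteLevelBPETokenizer and BertProcessing are imported as above, and
# that the vocab/merges paths (hypothetical placeholders here) point to files
# produced by a previously trained ByteLevelBPETokenizer.
test_tokenizer(
    "The quick brown fox jumps over the lazy dog",
    vocab_path="./tokenizer/vocab.json",
    merge_path="./tokenizer/merges.txt",
)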
class HuggingFaceByteLevelBPE(object):

    def __init__(self, cfg):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                "Please install huggingface/tokenizers with: pip install tokenizers"
            )

        bpe_vocab = file_utils.cached_path(cfg.bpe_vocab)
        bpe_merges = file_utils.cached_path(cfg.bpe_merges)

        self.bpe = ByteLevelBPETokenizer(
            bpe_vocab,
            bpe_merges,
            add_prefix_space=cfg.bpe_add_prefix_space,
        )

    def encode(self, x: str) -> str:
        return " ".join(map(str, self.bpe.encode(x).ids))

    def decode(self, x: str) -> str:
        return self.bpe.decode(
            [int(tok) if tok not in {"<unk>", "<mask>"} else tok for tok in x.split()]
        )

    def is_beginning_of_word(self, x: str) -> bool:
        return self.decode(x).startswith(" ")
def test_basic_encode(self, roberta_files):
    tokenizer = ByteLevelBPETokenizer(roberta_files["vocab"], roberta_files["merges"])
    output = tokenizer.encode("The quick brown fox jumps over the lazy dog")

    assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
    assert output.tokens == [
        "The", "Ġquick", "Ġbrown", "Ġfox", "Ġjumps",
        "Ġover", "Ġthe", "Ġlazy", "Ġdog",
    ]
    assert output.offsets == [
        (0, 3), (3, 9), (9, 15), (15, 19), (19, 25),
        (25, 30), (30, 34), (34, 39), (39, 43),
    ]
def test_train_from_iterator(self):
    text = ["A first sentence", "Another sentence", "And a last one"]
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train_from_iterator(text, show_progress=False)

    output = tokenizer.encode("A sentence")
    assert output.tokens == ["A", "Ġsentence"]
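# Follow-on sketch (illustrative, not part of the original test): after training
# from an in-memory iterator, the learned vocab/merges can be persisted with
# save_model() and reloaded later; the output directory name is hypothetical.
import os

from tokenizers import ByteLevelBPETokenizer

corpus = ["A first sentence", "Another sentence", "And a last one"]
tok = ByteLevelBPETokenizer()
tok.train_from_iterator(corpus, show_progress=False)

os.makedirs("./bpe_out", exist_ok=True)
tok.save_model("./bpe_out")  # writes ./bpe_out/vocab.json and ./bpe_out/merges.txt

reloaded = ByteLevelBPETokenizer("./bpe_out/vocab.json", "./bpe_out/merges.txt")
assert reloaded.encode("A sentence").tokens == ["A", "Ġsentence"]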
class HuggingFaceByteLevelBPE(object):

    @staticmethod
    def add_args(parser):
        # fmt: off
        parser.add_argument('--bpe-merges', help='path to merges.txt')
        parser.add_argument('--bpe-vocab', help='path to vocab.json')
        parser.add_argument('--bpe-add-prefix-space', action='store_true',
                            help='add prefix space before encoding')
        # fmt: on

    def __init__(self, args):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                "Please install huggingface/tokenizers with: pip install tokenizers"
            )

        self.bpe = ByteLevelBPETokenizer(
            args.bpe_vocab,
            args.bpe_merges,
            add_prefix_space=getattr(args, "bpe_add_prefix_space", False),
        )

    def encode(self, x: str) -> str:
        return " ".join(map(str, self.bpe.encode(x).ids))

    def decode(self, x: str) -> str:
        return self.bpe.decode(
            [int(tok) if tok not in {"<unk>", "<mask>"} else tok for tok in x.split()]
        )

    def is_beginning_of_word(self, x: str) -> bool:
        return self.decode(x).startswith(" ")
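# Round-trip sketch (illustrative, not from the original source): the Namespace
# fields mirror the add_args() options above, and the vocab/merges paths are
# hypothetical placeholders for a pretrained byte-level BPE model.
from argparse import Namespace

args = Namespace(
    bpe_vocab="gpt2-vocab.json",
    bpe_merges="gpt2-merges.txt",
    bpe_add_prefix_space=False,
)
bpe = HuggingFaceByteLevelBPE(args)
id_string = bpe.encode("Hello world")   # space-joined token ids as a single string
print(bpe.decode(id_string))            # "Hello world"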
class LineByLineTextDataset(Dataset):

    def __init__(self, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)

        self.block_size = block_size
        self.tokenizer = ByteLevelBPETokenizer(
            os.path.join(args.tokenizer_name, "vocab.json"),
            os.path.join(args.tokenizer_name, "merges.txt"),
        )
        self.tokenizer._tokenizer.post_processor = RobertaProcessing(
            ("</s>", self.tokenizer.token_to_id("</s>")),
            ("<s>", self.tokenizer.token_to_id("<s>")),
        )
        self.tokenizer.enable_truncation(max_length=block_size)

        logger.info("Creating features from dataset file at %s", file_path)

        self.examples = []
        with open(file_path, encoding="utf-8") as f:
            for line in f:
                if len(line) > 0 and not line.isspace():
                    self.examples.append(line)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(
            self.tokenizer.encode(self.examples[i]).ids[: self.block_size - 2],
            dtype=torch.long,
        )
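# Usage sketch (illustrative, not from the original source): wraps the dataset in
# a DataLoader with a simple padding collate function; the tokenizer directory and
# text file paths are hypothetical placeholders.
from argparse import Namespace
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

args = Namespace(tokenizer_name="./EsperBERTo")  # dir containing vocab.json / merges.txt
dataset = LineByLineTextDataset(args, file_path="./data/train.txt", block_size=512)

def collate(batch):
    # Pad the variable-length id tensors to the longest example in the batch.
    return pad_sequence(batch, batch_first=True, padding_value=0)

loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate)
for batch in loader:
    print(batch.shape)
    break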
class HuggingFaceBpeHelper(object):

    @staticmethod
    def add_cmdline_args(argparser):
        parser = argparser.add_argument_group('ByteLevelBPE Arguments')
        parser.add_argument('--bpe-vocab', type=str,
                            help='path to pre-trained tokenizer vocab')
        parser.add_argument('--bpe-merge', type=str,
                            help='path to pre-trained tokenizer merge')
        parser.add_argument(
            '--bpe-add-prefix-space',
            type='bool',
            hidden=True,
            default=True,
            help='add prefix space before encoding',
        )
        return parser

    def __init__(self, opt: Opt, shared=None):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if 'bpe_vocab' not in opt:
            raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError('--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']
        if not self.vocab_path or not self.merge_path:
            raise IOError(
                '--bpe-vocab and --bpe-merge are mandatory with '
                '--dict-tokenizer bytelevelbpe'
            )
        if not os.path.isfile(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not os.path.isfile(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        self.tokenizer = ByteLevelBPETokenizer(
            self.vocab_path, self.merge_path, self.add_prefix_space
        )

    def encode(self, text: str) -> List[str]:
        return self.tokenizer.encode(text).tokens

    def decode(self, x: List[str]) -> str:
        return self.tokenizer.decode(self.tokenizer.token_to_id(c) for c in x)
def inference(checkpoint_path, hyperparameters_path, tokenizer_path, merges_path,
              input='In 1691 Moscow established ', generated_length=64,
              random_selection=True):
    # Initialize tokenizer and model from files
    tokenizer = ByteLevelBPETokenizer(
        tokenizer_path,
        merges_path,
        add_prefix_space=True,
    )
    # tokenizer2 = Tokenizer(BPE(unk_token="[UNK]"))
    # tokenizer2.pre_tokenizer2 = Whitespace()
    # tokenizer2 = Tokenizer.from_file("example/tokenizer.json")

    # Initialize model
    model = LMModel.load_from_checkpoint(checkpoint_path=checkpoint_path,
                                         hparams_file=hyperparameters_path)

    # Tokenize input sample
    encoded_sample = tokenizer.encode(input).ids

    for i in range(generated_length):
        input_ids = torch.unsqueeze(torch.tensor(encoded_sample, dtype=torch.long), axis=0)

        # Inference
        output, attn = model(input_ids)
        last_word = output[0][-1]

        if not random_selection:
            # Pick the highest-probability token from the probability distribution
            prediction = torch.argmax(output, axis=2).squeeze(axis=0).tolist()[-1]
        else:
            # Sample a token according to its (sharpened) probability
            prediction = torch.multinomial(torch.softmax(last_word, 0) ** 10, 1)[0]

        # Add prediction to sequence
        encoded_sample.append(prediction)

    # Detokenize output sample
    decoded_output = tokenizer.decode(encoded_sample)
    # decoded_output2 = tokenizer2.decode(encoded_sample)
    output_tokens = [tokenizer.id_to_token(int(id)) for id in encoded_sample]
    # output_tokens2 = [tokenizer2.id_to_token(int(id)) for id in encoded_sample]

    # print('\n========================\n ORIGINAL BPE \n========================')
    # print(output_tokens2, decoded_output2, sep='\n')
    # print('\n========================\n MODIFIED BPE \n========================')

    return decoded_output, output_tokens, attn
def load_sentence_piece_model():
    tokenizer = ByteLevelBPETokenizer(path_vocab, path_model)
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)

    encoding = tokenizer.encode("배고파요")  # Korean: "I'm hungry"
    print(encoding.tokens)
    print(encoding.special_tokens_mask)
    print(encoding.ids)
    print(encoding.normalized_str)
def inference():
    from tokenizers import ByteLevelBPETokenizer
    from tokenizers.processors import BertProcessing

    '''
    initialize tokenizer with saved model files
    '''
    tokenizer = ByteLevelBPETokenizer(
        "./tok_checkpoints/tokenizer_model-vocab.json",
        "./tok_checkpoints/tokenizer_model-merges.txt",
    )

    '''
    optional step: preprocess the strings
    Ex: add <s> and </s> as BOS and EOS tokens to the string,
        pad the string to some max length and truncate it to some max length
    '''
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='<pad>',
                             pad_id=tokenizer.get_vocab()['<pad>'],
                             length=20)
    tokenizer.enable_truncation(max_length=20)

    '''
    tokenize/encode strings
    '''
    input_ids = tokenizer.encode("Hello World, Whats up!!!").ids
    print("input ids", input_ids)
    tokens = tokenizer.encode("Hello World, Whats up!!!").tokens
    print("tokens", tokens)

    '''
    tokenize/encode a batch of strings
    '''
    batch_tokenized = tokenizer.encode_batch(
        ["Hello World, Whats up!!!", "Whata whata wa wada wada"])
    input_ids = [i.ids for i in batch_tokenized]
    print("input ids", input_ids)
    tokens = [i.tokens for i in batch_tokenized]
    print("tokens", tokens)
def tokenize_hf(df, text_col='text', outfile=None):
    tokenizer = ByteLevelBPETokenizer(
        merges_file="/home/ubuntu/data/mimic/bbpe_tokenizer/mimic-merges.txt",
        vocab_file="/home/ubuntu/data/mimic/bbpe_tokenizer/mimic-vocab.json")

    tok_snts = []
    if outfile is not None:
        f = open(outfile, 'w', encoding='utf8')

    data = df if text_col is None else df[text_col]
    for snt in data:
        tokenized_snt = tokenizer.encode(snt)
        if outfile is not None:
            f.write("{}\n".format("\t".join(tokenized_snt.tokens)))
        else:
            tok_snts.append(tokenized_snt.tokens)

    if outfile is not None:
        f.close()  # make sure the tokenized output is flushed to disk
    return tok_snts
class FullTokenizer(object):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        # vocab_file is treated as a directory containing the byte-level BPE files
        self.tokenizer = ByteLevelBPETokenizer(vocab_file + '/vocab.json',
                                               vocab_file + '/merges.txt')

    def tokenize(self, text):
        # Note: returns token ids, not token strings
        return self.tokenizer.encode(text).ids

    def convert_tokens_to_ids(self, tokens):
        return [self.tokenizer.token_to_id(tok) for tok in tokens]

    def convert_ids_to_tokens(self, ids):
        # Note: decode() returns the detokenized string rather than a token list
        return self.tokenizer.decode(ids)
def test_lowerspace(self, roberta_files):
    tokenizer = ByteLevelBPETokenizer(
        roberta_files["vocab"], roberta_files["merges"],
        add_prefix_space=True,
        lowercase=True,
    )
    output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")

    assert output.ids == [5, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
    assert output.tokens == [
        "Ġthe", "Ġquick", "Ġbrown", "Ġfox", "Ġjumps",
        "Ġover", "Ġthe", "Ġlazy", "Ġdog",
    ]
def get_french_vocab(model_name):
    root = Path(os.getcwd()).parent.parent.parent
    french_corpus = "Datasets/corpora/fr/text"
    fr_corpus_path = os.path.join(root, french_corpus)

    files = []
    for dir_ in os.listdir(fr_corpus_path):
        fr_corpus_dir = os.path.join(fr_corpus_path, dir_)
        for text_file in os.listdir(fr_corpus_dir):
            text_file = os.path.join(fr_corpus_dir, text_file)
            files.append(text_file)

    tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train(files,
                    vocab_size=20000,
                    min_frequency=2,
                    show_progress=True,
                    special_tokens=["<sos>", "<pad>", "<eos>", "<unk>"])

    print(tokenizer.encode("c'est la meilleure des phrases françaises").tokens)
    tokenizer.save(model_name)
class ByteBPETokenizer:

    def __init__(self, vocab_json, merge_txt, max_length=750):
        self.tokenizer = ByteLevelBPETokenizer(vocab_json, merge_txt)
        self.tokenizer.enable_truncation(max_length=max_length)
        self.tokenizer.enable_padding(max_length=max_length)
        self.tokenizer.add_special_tokens(["[PAD]", "[CLS]"])
        # self.tokenizer.post_processor = RobertaProcessing(("</s>", 2), ("<s>", 1))
        # self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    def encode(self, review):
        review = clean_sentence(review)
        encoded = self.tokenizer.encode(review.lower())
        # pp_encoded = self.tokenizer.post_process(encoded)
        return encoded

    def tokenize2Index(self, review, should_stem=False):
        encoded = self.encode(review)
        return encoded.ids

    def trainBPE(self, paths, vocab_size=30000, min_frequency=10,
                 special_tokens=["[PAD]", "[CLS]"]):
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(files=paths,
                        vocab_size=vocab_size,
                        min_frequency=min_frequency,
                        special_tokens=special_tokens)
        tokenizer.save("yelp_bpe/", "yelp-bpe")
)
i = 0
j = 0
with open(input_file) as ifile:
    with open(output_file, "w") as ofile:
        line_write = ""
        token_count = 0
        for i, line in enumerate(ifile):
            # print('>>>>>>>', line, '<<<<<<')
            i = i + 1
            if i % 50000 == 0:
                print('total:', i, line)
            if line == "\n":
                continue
            line = line.replace("\n", "")
            line = line.replace("\r", "")
            encoded = tokenizer.encode(line)
            token_count = token_count + len(encoded.tokens)
            if token_count >= 256:
                # print(token_count, '>>>>>>>', line_write, '<<<<<<')
                j = j + 1
                if j % 10000 == 0:
                    print(j, token_count, line_write)
                ofile.write(line_write + "\n")
                line_write = line
                token_count = len(encoded.tokens)
                continue
            line_write = line_write + " " + line
        ofile.write(line_write + "\n")
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "./EsperBERTo/vocab.json",
    "./EsperBERTo/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

tokenizer.encode("Mi estas Julien.")
tokenizer.encode("Mi estas Julien.").tokens

# Check that PyTorch sees it
import torch

torch.cuda.is_available()

from transformers import RobertaConfig
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
"<unk>", "<mask>", ]) tokenizer.save_model("BERT/sumerianBERTo") tokenizer = ByteLevelBPETokenizer( "BERT/sumerianBERTo/vocab.json", "BERT/sumerianBERTo/merges.txt", ) tokenizer._tokenizer.post_processor = BertProcessing( ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")), ) tokenizer.enable_truncation(max_length=512) tokenizer.encode("dumu a-li2-wa-aq-rum") print(tokenizer.encode("dumu a-li2-wa-aq-rum").tokens) # Configuration config = RobertaConfig( vocab_size=52_000, max_position_embeddings=512, num_attention_heads=12, num_hidden_layers=6, type_vocab_size=1, ) print(config) tokenizer = RobertaTokenizerFast.from_pretrained("BERT/sumerianBERTo", max_len=512) model = RobertaForMaskedLM(config=config)
print("Files processed: %d, Total files: %d" % (count, len(files))) # load raw html data fileData = io.open(file, "r", errors="ignore").readlines() fileData = ''.join(str(line) for line in fileData) fileData = fileData.replace("\n", " ") # ignore the website if language is other than english if isIgnoreOtherLanguages == 1: inputLanguage = detectLanguage(fileData) if inputLanguage != "en": ignoredFiles[file] = True continue # tokenize html code output = tokenizer.encode(fileData) outputDict = collections.Counter(output.ids) # add counts to a dictionary for tfidf scores. for token in outputDict: docDict[token].append(file) print("\nAssigning tfidf weights to tokens...\n") features = [] htmlLabels = [] totalFilesUnderConsideration = len(files) - len(ignoredFiles) count = 0 for i in range(0, len(files)): file = files[i] label = labels[i] count = count + 1
                  allow_pickle=True).item()
print("Document frequency dictionary loaded...")

# Testing
print("Loading webpage...")
try:
    request = requests.get(websiteToTest)
    webpageHtml = str(request.text)
    webpageHtml = webpageHtml.replace("\n", " ")
except Exception as e:
    print('\n', e)
    print("\nAn error occurred, exiting now... ")
    exit()

# Convert text into feature vector
output = tokenizer.encode(webpageHtml)
outputDict = collections.Counter(output.ids)

# Apply tfidf weighting: term count times log10(N / document frequency)
totalFilesUnderConsideration = docDict["totalFilesUnderConsideration"]
array = [0] * tokenizerVocabSize
for item in outputDict:
    if len(docDict[item]) > 0:
        array[item] = (outputDict[item]) * (math.log10(
            totalFilesUnderConsideration / len(docDict[item])))

# Getting predictions
predictionProbability = model.predict_proba([array])[0][1]
print(
    "\n****************************\n--> Probability that the website is phishing: %.2f"
    % (predictionProbability * 100))
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(["sp_data/mono/all.en-fr"], vocab_size=60000)

# with open("sp_data/mono/all.en-fr") as r, open("sp_data/mono/all.en-fr.wordpiece", "w") as w:
#     lines = r.readlines()
#     for line in lines:
#         encoded = tokenizer.encode(line[:-1])
#         w.write(" ".join(encoded.tokens))
#         w.write("\n")

with open("sp_data/para/dev/newstest2013-ref.en") as r, open(
        "sp_data/para/dev/newstest2013-ref.en.bytebpe", "w") as w:
    lines = r.readlines()
    for line in lines:
        encoded = tokenizer.encode(line[:-1])
        w.write(" ".join(encoded.tokens))
        w.write("\n")

with open("sp_data/para/dev/newstest2013-ref.fr") as r, open(
        "sp_data/para/dev/newstest2013-ref.fr.bytebpe", "w") as w:
    lines = r.readlines()
    for line in lines:
        encoded = tokenizer.encode(line[:-1])
        w.write(" ".join(encoded.tokens))
        w.write("\n")

with open("sp_data/para/dev/newstest2014-fren-src.en") as r, open(
        "sp_data/para/dev/newstest2014-fren-src.en.bytebpe", "w") as w:
    lines = r.readlines()
    for line in lines:
class MetaCAT(object):
    r''' TODO: Add documentation
    '''

    def __init__(self, tokenizer=None, embeddings=None, cntx_left=20, cntx_right=20,
                 save_dir='./meta_cat/', pad_id=30000, device='cpu'):
        self.tokenizer = tokenizer
        if embeddings is not None:
            self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        else:
            self.embeddings = None
        self.cntx_left = cntx_left
        self.cntx_right = cntx_right
        self.save_dir = save_dir
        self.pad_id = pad_id
        self.device = torch.device(device)
        self.category_name = None
        self.category_values = {}
        self.i_category_values = {}
        self.model = None

        # TODO: A shitty solution, make right at some point
        if not self.save_dir.endswith("/"):
            self.save_dir = self.save_dir + "/"

    def train(self, json_path, category_name=None, model_name='BERT_GRU',
              Bio_BERT_PATH=None, lr=0.01, test_size=0.1, batch_size=100, nepochs=20,
              lowercase=True, class_weights=None, cv=0, ignore_cpos=False,
              model_config={}, tui_filter=None, fine_tune=False, auto_save_model=True,
              score_average='weighted', replace_center=None, seed=11):
        r''' TODO: Docs
        '''
        set_all_seeds(seed)
        data = json.load(open(json_path, 'r'))

        # Create directories if they don't exist
        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)

        # Prepare the data
        data = prepare_from_json(data, self.cntx_left, self.cntx_right, self.tokenizer,
                                 lowercase=lowercase, tui_filter=tui_filter,
                                 replace_center=replace_center)

        if category_name is not None:
            self.category_name = category_name

        # Check if the category name is there
        if self.category_name not in data:
            raise Exception(
                "The category name does not exist in this json file. "
                "You've provided '{}', while the possible options are: {}".format(
                    self.category_name, " | ".join(list(data.keys()))))

        data = data[self.category_name]

        if not fine_tune:
            # Encode the category values
            data, self.category_values = encode_category_values(data)
            self.i_category_values = {v: k for k, v in self.category_values.items()}
        else:
            # We already have everything, just get the data
            data, _ = encode_category_values(data, vals=self.category_values)

        # Convert data tkns to ids
        data = tkns_to_ids(data, self.tokenizer)

        if not fine_tune:
            if model_name == 'lstm':
                from medcat.utils.models import LSTM
                nclasses = len(self.category_values)
                bid = model_config.get("bid", True)
                num_layers = model_config.get("num_layers", 2)
                input_size = model_config.get("input_size", 300)
                hidden_size = model_config.get("hidden_size", 300)
                dropout = model_config.get("dropout", 0.5)
                self.model = LSTM(self.embeddings, self.pad_id, nclasses=nclasses,
                                  bid=bid, num_layers=num_layers, input_size=input_size,
                                  hidden_size=hidden_size, dropout=dropout)
            if model_name == 'bert_gru':
                from medcat.utils.models import BERT_GRU
                nclasses = len(self.category_values)
                bid = model_config.get("bid", True)
                num_layers = model_config.get("num_layers", 5)
                input_size = model_config.get("input_size", 768)
                hidden_size = model_config.get("hidden_size", 768)
                dropout = model_config.get("dropout", 0.5)
                self.model = BERT_GRU(Bio_BERT_PATH, nclasses=nclasses, bid=bid,
                                      num_layers=num_layers, input_size=input_size,
                                      hidden_size=hidden_size, dropout=dropout)

        if cv == 0:
            (f1, p, r, cls_report) = train_network(
                self.model, data,
                max_seq_len=(self.cntx_left + self.cntx_right + 1),
                lr=lr, test_size=test_size, pad_id=self.pad_id,
                batch_size=batch_size, nepochs=nepochs, device=self.device,
                class_weights=class_weights, ignore_cpos=ignore_cpos,
                save_dir=self.save_dir, auto_save_model=auto_save_model,
                score_average=score_average)
        elif cv > 0:
            # Mainly for testing, not really used in a normal workflow
            f1s = []
            ps = []
            rs = []
            cls_reports = []
            for i in range(cv):
                # Reset the model
                if fine_tune:
                    self.load_model(model=model_name)
                else:
                    if model_name == 'lstm':
                        from medcat.utils.models import LSTM
                        nclasses = len(self.category_values)
                        self.model = LSTM(self.embeddings, self.pad_id,
                                          nclasses=nclasses)

                (_f1, _p, _r, _cls_report) = train_network(
                    self.model, data,
                    max_seq_len=(self.cntx_left + self.cntx_right + 1),
                    lr=lr, test_size=test_size, pad_id=self.pad_id,
                    batch_size=batch_size, nepochs=nepochs, device=self.device,
                    class_weights=class_weights, ignore_cpos=ignore_cpos,
                    save_dir=self.save_dir, score_average=score_average)
                f1s.append(_f1)
                ps.append(_p)
                rs.append(_r)
                cls_reports.append(_cls_report)

            f1 = np.average(f1s)
            p = np.average(ps)
            r = np.average(rs)

            # Average cls reports
            cls_report = {}
            _cls_report = cls_reports[0]
            for label in _cls_report.keys():
                cls_report[label] = {}
                if type(_cls_report[label]) == dict:
                    for score in _cls_report[label].keys():
                        cls_report[label][score] = sum(
                            [r[label][score] for r in cls_reports]) / len(cls_reports)

        print("Best/Average scores: F1: {}, P: {}, R: {}".format(f1, p, r))
        return {'f1': f1, 'p': p, 'r': r, 'cls_report': cls_report}

    def eval(self, json_path, batch_size=100, lowercase=True, ignore_cpos=False,
             tui_filter=None, score_average='weighted', replace_center=None):
        data = json.load(open(json_path, 'r'))

        # Prepare the data
        data = prepare_from_json(data, self.cntx_left, self.cntx_right, self.tokenizer,
                                 lowercase=lowercase, tui_filter=tui_filter,
                                 replace_center=replace_center)

        # Check if the category name is there
        if self.category_name not in data:
            raise Exception("The category name does not exist in this json file.")
        data = data[self.category_name]

        # We already have everything, just get the data
        data, _ = encode_category_values(data, vals=self.category_values)

        # Convert data tkns to ids
        data = tkns_to_ids(data, self.tokenizer)

        # Run evaluation
        result = eval_network(self.model, data,
                              max_seq_len=(self.cntx_left + self.cntx_right + 1),
                              pad_id=self.pad_id, batch_size=batch_size,
                              device=self.device, ignore_cpos=ignore_cpos,
                              score_average=score_average)
        return result

    def predicit_one(self, text, start, end):
        """ A test function, not useful in any other case
        """
        text = text.lower()
        doc_text = self.tokenizer.encode(text)

        ind = 0
        for ind, pair in enumerate(doc_text.offsets):
            if start >= pair[0] and start <= pair[1]:
                break

        _start = max(0, ind - self.cntx_left)
        _end = min(len(doc_text.tokens), ind + 1 + self.cntx_right)
        tkns = doc_text.ids[_start:_end]
        cpos = self.cntx_left + min(0, ind - self.cntx_left)

        x = torch.tensor([tkns], dtype=torch.long).to(self.device)
        cpos = torch.tensor([cpos], dtype=torch.long).to(self.device)

        self.model.eval()
        outputs_test = self.model(x, cpos)

        inv_map = {v: k for k, v in self.category_values.items()}
        return inv_map[int(np.argmax(outputs_test.detach().numpy()[0]))]

    def save(self, full_save=False):
        if full_save:
            # Save tokenizer and embeddings, slightly redundant
            if hasattr(self.tokenizer, 'save_model'):
                # Support the new save in tokenizers 0.8.2+
                self.tokenizer.save_model(self.save_dir, name='bbpe')
            else:
                # Old way of saving models
                self.tokenizer.save(self.save_dir, name='bbpe')

            # Save embeddings
            np.save(open(self.save_dir + "embeddings.npy", 'wb'),
                    np.array(self.embeddings))

        # The lstm model is saved during training, don't do it here

        # Save the config
        self.save_config()

    def save_config(self):
        # TODO: Add other parameters, e.g. replace_center, ignore_cpos etc.
        path = self.save_dir + "vars.dat"
        to_save = {
            'category_name': self.category_name,
            'category_values': self.category_values,
            'i_category_values': self.i_category_values,
            'pad_id': self.pad_id,
            'cntx_left': self.cntx_left,
            'cntx_right': self.cntx_right,
        }
        with open(path, 'wb') as f:
            pickle.dump(to_save, f)

    def load_config(self):
        """ Loads variables of this object
        """
        path = self.save_dir + "vars.dat"
        with open(path, 'rb') as f:
            to_load = pickle.load(f)

        self.category_name = to_load['category_name']
        self.category_values = to_load['category_values']
        self.i_category_values = to_load['i_category_values']
        self.cntx_left = to_load['cntx_left']
        self.cntx_right = to_load['cntx_right']
        self.pad_id = to_load.get('pad_id', 0)

    def load_model(self, model='lstm'):
        # Load MODEL
        if model == 'lstm':
            from medcat.utils.models import LSTM
            nclasses = len(self.category_values)
            self.model = LSTM(self.embeddings, self.pad_id, nclasses=nclasses)
            path = self.save_dir + "lstm.dat"
            self.model.load_state_dict(torch.load(path, map_location=self.device))

    def load(self, model='lstm', tokenizer_name='bbpe'):
        """ Loads model and config for this meta annotation
        """
        # Load tokenizer if it is None
        if self.tokenizer is None:
            vocab_file = self.save_dir + "{}-vocab.json".format(tokenizer_name)
            merges_file = self.save_dir + "{}-merges.txt".format(tokenizer_name)
            self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                                   merges_file=merges_file,
                                                   lowercase=True)

        # Load embeddings if None
        if self.embeddings is None:
            embeddings = np.load(open(self.save_dir + "embeddings.npy", 'rb'))
            self.embeddings = torch.tensor(embeddings, dtype=torch.float32)

        # Load configuration
        self.load_config()

        # Load MODEL
        self.load_model(model=model)
        self.model.to(self.device)

    def __call__(self, doc, lowercase=True):
        """ Spacy pipe method """
        data = []
        id2row = {}
        text = doc.text
        if lowercase:
            text = text.lower()
        doc_text = self.tokenizer.encode(text)

        x = []
        cpos = []
        # Only loop through non overlapping entities
        for ent in doc.ents:
            start = ent.start_char
            end = ent.end_char

            ind = 0
            for ind, pair in enumerate(doc_text.offsets):
                if start >= pair[0] and start <= pair[1]:
                    break

            _start = max(0, ind - self.cntx_left)
            _end = min(len(doc_text.tokens), ind + 1 + self.cntx_right)
            _ids = doc_text.ids[_start:_end]
            _cpos = self.cntx_left + min(0, ind - self.cntx_left)

            id2row[ent._.id] = len(x)
            x.append(_ids)
            cpos.append(_cpos)

        max_seq_len = (self.cntx_left + self.cntx_right + 1)
        x = np.array([
            (sample + [self.pad_id] * max(0, max_seq_len - len(sample)))[0:max_seq_len]
            for sample in x
        ])

        x = torch.tensor(x, dtype=torch.long).to(self.device)
        cpos = torch.tensor(cpos, dtype=torch.long).to(self.device)

        # Nearly impossible that we need batches, so I'll ignore it
        if len(x) > 0:
            self.model.eval()
            outputs = self.model(x, cpos).detach().to('cpu').numpy()
            outputs = np.argmax(outputs, axis=1)

            for ent in doc.ents:
                val = self.i_category_values[outputs[id2row[ent._.id]]]
                if ent._.meta_anns is None:
                    ent._.meta_anns = {self.category_name: val}
                else:
                    ent._.meta_anns[self.category_name] = val

        return doc
class HuggingFaceBpeHelper(BPEHelper):
    """
    HuggingFace's ByteLevelBPE Tokenizer.

    Fast because Rust.
    """

    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        # Default true for HF
        self.special_tok_map = {}  # map from HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if PathManager.exists(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if PathManager.exists(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if self.bpe_dropout:
            raise NotImplementedError(
                '--bpe-dropout is not supported with ByteLevelBPE because tokenizers '
                'library does not allow dynamically turning BPE on/off. You can use '
                '--dict-tokenizer slow_bytelevel_bpe to gain this feature.'
            )

        if self.lower:
            warn_once('Are you sure you want to lower case your BPE dictionary?')
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter vocabulary with using --dict-tokenizer bytelevelbpe'
                ' (no --dict-minfreq or --dict-maxtokens).'
            )
        if 'bpe_vocab' not in opt:
            raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError('--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']
        if not self.vocab_path or not self.merge_path:
            raise IOError(
                '--bpe-vocab and --bpe-merge are mandatory with '
                '--dict-tokenizer bytelevelbpe'
            )
        if not PathManager.exists(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not PathManager.exists(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(
            self.vocab_path, self.merge_path, self.add_prefix_space
        )

    def helper_encode(self, text: str) -> List[str]:
        """
        Encode a text string into a list of BPE tokens.

        :param text: text to encode

        :return tokens: list of tokens
        """
        return self.tokenizer.encode(text).tokens

    def helper_decode(
        self, tokens: List[str], token_ids: List[int], delimiter: str
    ) -> str:
        """
        Decode list of tokens into text string.

        :param tokens: list of tokens
        :param token_ids: list of token ids
        :param delimiter: string delimiter for tokens

        :return text: decoded text
        """
        text = self.tokenizer.decode(token_ids, skip_special_tokens=False)
        return text

    def add_special_tokens(self, dict_agent, special_tokens: List[str]):
        """
        Add special tokens to the tokenizer and dict_agent.
        """
        logging.debug(f'adding the following special tokens: {special_tokens}')
        self.tokenizer.add_special_tokens(special_tokens)  # add to HF

        for tok in special_tokens:
            parlai_key = dict_agent[tok]
            hf_key = self.tokenizer.token_to_id(tok)
            self.special_tok_map[parlai_key] = hf_key

    def sync_with_dict(self, dict_agent):
        """
        Sync the dictionary agent with Hugging Face tokenizer's BPE dict.

        Called only once on initialization.
        """
        special_tokens = [
            dict_agent.null_token,
            dict_agent.start_token,
            dict_agent.end_token,
            dict_agent.unk_token,
        ]
        self.add_special_tokens(dict_agent, special_tokens)

        for i in range(self.tokenizer.get_vocab_size() - len(special_tokens)):
            token = self.tokenizer.id_to_token(i)
            dict_agent.add_token(token)
            # We don't have access to the hugging face word frequency table,
            # just set it to 1 instead
            dict_agent.freq[token] = 1

    def save(self, dir_name: str, file_name: str):
        """
        Save appropriate files.

        :param dir_name: directory to save.
        :param file_name: file to save.
        """
        self.tokenizer.save_model(dir_name, file_name)
if TRAIN_BASE:
    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])

    # Save files to disk
    tokenizer.save_model("tokenizer")

inp = 'print("Hello World!")'

tokenizer = GPT2Tokenizer.from_pretrained('tokenizer')
tokenizer.add_special_tokens({
    "bos_token": "<s>",
    "pad_token": "<pad>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "mask_token": "<mask>",
})

t = tokenizer.encode(inp)
print(t)
    'Initial alphabet for ByteLevel BPE as defined in pre_tokenizers.ByteLevel.alphabet(): ',
    alphabet)

# And then train
tokenizer.train(
    files,
    vocab_size=args.vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'],
)

# Save the files
tokenizer.save(args.out, args.name)

# Restoring model from learned vocab/merges
tokenizer = ByteLevelBPETokenizer(
    join(args.out, '{}-vocab.json'.format(args.name)),
    join(args.out, '{}-merges.txt'.format(args.name)),
    add_prefix_space=True,
)

# Test encoding
logger.info(
    'Tokens and their ids from ByteLevelBPETokenizer with GFP protein sequence: \n MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT'
)
encoded = tokenizer.encode('MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT',
                           pad_to_max_length=True)
logger.info(encoded.tokens)
logger.info(encoded.ids)

logger.info('done!')
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])
end = datetime.now()
print("train ByteLevelBPETokenizer : %s" % str(end - start))

tokenizer.save_model("vocab")

# Check the result
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "vocab/vocab.json",
    "vocab/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

# Korean placeholder: "test sentence (input in morpheme-analyzed form)"
tokenizer.encode("확인 문장(형태소 분석된 형태로 입력)").tokens
from tokenizers.processors import BertProcessing

paths = [
    str(x)
    for x in Path('/Users/uri/Documents/Uri/Projects/Bertnik/data/for_training').glob("*.txt")
]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=70000, min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

# Yiddish: "I am a student from Tel Aviv"
print(tokenizer.encode("איך בין א סטודענט פון תל אביב").tokens)

# Save files to disk
tokenizer.save_model(".", "bertnik")
])
tokenizer.save_model(SAVE_MODEL)

tokenizer = ByteLevelBPETokenizer(
    SAVE_MODEL + "/vocab.json",
    SAVE_MODEL + "/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

print(tokenizer.encode("For it is in reality vain to profess"))

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = RobertaTokenizerFast.from_pretrained(SAVE_MODEL, max_len=512)
model = RobertaForMaskedLM(config=config)
print(model.num_parameters())

dataset = LineByLineTextDataset(
# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

# Save files to disk
tokenizer.save(".", "rubinberto")

tokenizer = ByteLevelBPETokenizer(
    "rubinberto-vocab.json",
    "rubinberto-merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

# Russian: "Or one can recall not events but, for example, the feelings we
# experienced during the 'reporting period'."
print(
    tokenizer.encode(
        "А можно вспоминать не о событиях, а, например, о чувствах, испытываемых нами за «отчетный период»."
    ).tokens)
def load_french_vocab(model_name):
    # tokenizer = PreTrainedTokenizerFast(tokenizer_object=model_name)
    # print(tokenizer.encode("c'est la meilleure des phrases françaises").tokens)
    tokenizer = ByteLevelBPETokenizer("wiki_fr_tokenizer.json", add_prefix_space=True)
    print(tokenizer.encode("c'est la meilleure des phrases françaises").tokens)
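# Alternative loading sketch (added for illustration, not from the original source):
# a tokenizer saved as a single JSON file via tokenizer.save(...) (as in
# get_french_vocab above) is normally reloaded with Tokenizer.from_file rather than
# by passing the JSON file to ByteLevelBPETokenizer's vocab argument; the file name
# below is the same hypothetical path used above.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("wiki_fr_tokenizer.json")
print(tok.encode("c'est la meilleure des phrases françaises").tokens)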