def main():
    args = cmd_args()
    outdir = args.o if args.o else os.path.dirname(args.i)
    target_special_tokens, subtoken_special_tokens = get_special_tokens(args.preset)
    with tempfile.TemporaryDirectory() as tmp_dir:
        targets_file = os.path.join(tmp_dir, "labels.txt")
        subtokens_file = os.path.join(tmp_dir, "subtokens.txt")
        print("Creating training files for BPE")
        create_bpe_training_file(args.i, targets_file, subtokens_file)

        if args.preset == Preset.variable:
            print("Variable preset")

        subtoken_tokenizer = SentencePieceBPETokenizer()
        target_tokenizer = SentencePieceBPETokenizer()

        print("Training subtoken tokenizer")
        subtoken_tokenizer.add_special_tokens(subtoken_special_tokens)
        print("Training target tokenizer")
        target_tokenizer.add_special_tokens(target_special_tokens)

        target_tokenizer.train(files=[targets_file], vocab_size=args.target_vocab)
        subtoken_tokenizer.train(files=[subtokens_file], vocab_size=args.subtoken_vocab)

        target_tokenizer.save(outdir, "target.bpe")
        subtoken_tokenizer.save(outdir, "subtoken.bpe")
from tokenizers import SentencePieceBPETokenizer


def train(corpus_list, vocab_size, output, output_name=None):
    print("create tokenizer...")
    tokenizer = SentencePieceBPETokenizer()

    print("load corpus list...")
    # corpus_list is a text file containing one training-file path per line
    corpus_list = open(corpus_list).read().split('\n')[:-1]

    print("train tokenizer...")
    tokenizer.train(
        corpus_list,
        vocab_size=vocab_size,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

    print("save model...")
    tokenizer.save_model(output, output_name)
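# A minimal usage sketch for the train() helper above. The file listing the
# corpus paths, the output directory, and the model name are hypothetical
# placeholders; the output directory is assumed to exist.
train("corpus_files.txt", vocab_size=32000, output="spm_model", output_name="spm_bpe")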
def main():
    args = cmd_args()
    outdir = args.o if args.o else os.path.dirname(args.i)
    print(f"Training SentencePiece to create a vocabulary of size {args.vocab_size}")
    with tempfile.TemporaryDirectory() as tmp_dir:
        train_file = os.path.join(tmp_dir, "train.txt")
        create_bpe_training_file(args.i, train_file)
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train(files=[train_file], vocab_size=args.vocab_size)
        tokenizer.save(outdir, args.n)
class TokenizerWrapper:

    def __init__(self, tok_type, unk_token, sep_token, cls_token, pad_token,
                 mask_token):
        self.tok_type = tok_type
        if self.tok_type == 'bpe':
            self.tokenizer = ByteLevelBPETokenizer()
        elif self.tok_type == 'wordpiece':
            self.tokenizer = BertWordPieceTokenizer(unk_token=unk_token,
                                                    sep_token=sep_token,
                                                    cls_token=cls_token,
                                                    pad_token=pad_token,
                                                    mask_token=mask_token)
        elif self.tok_type == 'sentencepiece':
            self.tokenizer = SentencePieceBPETokenizer(unk_token=unk_token)

    def train(self, data_file, vocab_size, special_tokens):
        if self.tok_type in ['bpe', 'wordpiece', 'sentencepiece']:
            self.tokenizer.train([data_file],
                                 vocab_size=vocab_size,
                                 special_tokens=special_tokens)

    def tokenize(self, text):
        if self.tok_type in ['bpe', 'wordpiece', 'sentencepiece']:
            return self.tokenizer.encode(text).tokens
        elif self.tok_type == 'word':
            return nltk.tokenize.word_tokenize(text)
        elif self.tok_type == 'char':
            return [c for c in text]
        else:
            raise Exception('Unknown tokenizer: ' + self.tok_type)

    def decode(self, tokens, blank_token):
        if self.tok_type in ['bpe', 'wordpiece', 'sentencepiece']:
            ids = [self.tokenizer.token_to_id(t) for t in tokens]
            # Map unknown tokens to the id of the blank token
            ids = [
                i if i is not None else self.tokenizer.token_to_id(blank_token)
                for i in ids
            ]
            return self.tokenizer.decode(ids, skip_special_tokens=False)
        elif self.tok_type == 'word':
            return ' '.join(tokens)
        elif self.tok_type == 'char':
            return ''.join(tokens)
        else:
            raise Exception('Unknown tokenizer: ' + self.tok_type)
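# A hedged usage sketch for TokenizerWrapper above; the corpus file, vocabulary
# size, special tokens, and sample text are hypothetical placeholders.
wrapper = TokenizerWrapper('sentencepiece',
                           unk_token='<unk>', sep_token='</s>', cls_token='<s>',
                           pad_token='<pad>', mask_token='<mask>')
wrapper.train('corpus.txt', vocab_size=16000,
              special_tokens=['<pad>', '<unk>', '<s>', '</s>', '<mask>'])
print(wrapper.tokenize("hello world"))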
from os import listdir
from os.path import isfile, join

from tokenizers import SentencePieceBPETokenizer


def build_bpe(vocab_size=10000):
    # Initialize a tokenizer
    tokenizer = SentencePieceBPETokenizer()

    # mypath = "../../Downloads/riksdagens_protokoll_1920-2020/annual"
    mypath = "../../Desktop/cood/python/machine-learning/old-school/markov-lstm-killer/data/fi"
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    print("ONL", onlyfiles)
    paths = [mypath + "/" + f for f in onlyfiles]
    # paths = paths[:5]

    # Copy the .txt files into a local data/ directory, cleaning each line
    txts = []
    for path, fname in zip(paths, onlyfiles):
        if path[-4:] == ".txt":
            localpath = "data/" + fname
            txts.append(localpath)
            infile = open(path)
            outfile = open(localpath, "w")
            for line in infile:
                clean_line = cleanup(line) + "\n"
                outfile.write(clean_line)
            outfile.close()

    # Then train it!
    # tokenizer.train(["../../Downloads/riksdagens_protokoll_1920-2020/annual/prot_2019.txt"], vocab_size=15000)
    tokenizer.train(txts, vocab_size=vocab_size)

    # Now, let's use it:
    s = ("Det politiska arbetet har redan börjat på olika sätt, med resor, besök, "
         "möten, politikutveckling, motionsskrivande och mycket annat. Jag har sett "
         "att ni redan har varit aktiva under ett antal veckor, och jag kan försäkra "
         "er att det även gäller talmanspresidiet. Nu är det dags att med tillförsikt "
         "påbörja ett nytt riksdagsår. Jag hoppas att ni alla ser fram emot det lika "
         "mycket som jag gör.")
    # s = "Ite en oo viel mitää hyvää kyl sielt syöny."
    # s = "ja kieltämät siihe tommoste kokonaisii sanoi merkitsevät tavumerkit on huomattavasti näppärämpii ku ääniä tarkottavat aakkoset joist pitää rakentaa jokane sana"
    encoded = tokenizer.encode(s)
    print(encoded.ids)
    print(encoded.tokens)

    # And finally save it somewhere
    tokenizer.save("./bpe-fi.tokenizer.json")
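# A hedged sketch of reloading the tokenizer saved above. Tokenizer.from_file
# reads the single-file JSON written by tokenizer.save(); the sample string is
# a placeholder.
from tokenizers import Tokenizer

loaded = Tokenizer.from_file("./bpe-fi.tokenizer.json")
print(loaded.encode("Nu är det dags att påbörja ett nytt riksdagsår.").tokens)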
def train_kenlm_language_model(input_data_paths, output_model_dir):
    output_model_dir = Path(output_model_dir)
    output_model_dir.mkdir(exist_ok=True, parents=True)
    output_model_path = output_model_dir / 'kenlm_model.arpa'
    with log_action('Training tokenizer'):
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train([str(path) for path in input_data_paths], vocab_size=20000)
        tokenizer.save(str(output_model_dir), 'spm_tokenizer')
    with log_action('Tokenizing'):
        tokenized_data_paths = get_temp_filepaths(len(input_data_paths))
        for tokenized_data_path, input_data_path in zip(tokenized_data_paths,
                                                        input_data_paths):
            encodings = tokenizer.encode_batch(read_lines(input_data_path))
            write_lines([' '.join(encoding.tokens) for encoding in encodings],
                        tokenized_data_path)
    with log_action('Training language model'):
        kenlm_path = input(
            'Please provide the path to the lmplz script (install at https://github.com/kpu/kenlm): '
        )
        command = (
            f'cat {" ".join([str(path) for path in tokenized_data_paths])} '
            f'| {kenlm_path} -o 3 > {output_model_path}')
        run_command(command, mute=False)
    for path in tokenized_data_paths:
        path.unlink()
    return output_model_dir
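# A hedged follow-up sketch, not part of the function above: scoring text with the
# trained ARPA model via the `kenlm` Python bindings, after reloading the saved
# tokenizer. The paths and sample sentence are hypothetical placeholders.
import kenlm
from tokenizers import SentencePieceBPETokenizer

lm = kenlm.Model('output_model_dir/kenlm_model.arpa')
spm = SentencePieceBPETokenizer('output_model_dir/spm_tokenizer-vocab.json',
                                'output_model_dir/spm_tokenizer-merges.txt')
tokens = ' '.join(spm.encode('An example sentence to score.').tokens)
print(lm.score(tokens, bos=True, eos=True))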
# for prefix in prefixes:
#     input_dir_gs = os.path.join(
#         STORAGE_BUCKET,
#         "data/corpus/%s_lower/zhwiki-latest-pages-articles_%s_lower.txt" % (prefix, prefix)
#     )
#     input_dir_local = "./zhwiki-latest-pages-articles_%s_lower.txt" % prefix
#     tf.gfile.Copy(input_dir_gs, input_dir_local, overwrite=True)

for vocab_size in vocab_sizes:
    for prefix in prefixes:
        try:
            tokenizer_name = prefix + "_" + str(vocab_size)
            tokenizer = SentencePieceBPETokenizer()
            tokenizer.train(
                [
                    "./zhwiki-latest-pages-articles_%s_lower.txt" % prefix
                    # "./zhwiki-latest-pages-articles_lower.txt"
                ],
                vocab_size=vocab_size,
                show_progress=True,
                min_frequency=1,
                special_tokens=["<unk>", "[SEP]", "[CLS]", "[PAD]", "[MASK]"])
            tokenizer.save("data_proc/tokenizers/sentencepiece", tokenizer_name)
        except Exception as e:
            print(e)
import argparse

from tokenizers import SentencePieceBPETokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--corpus_file", type=str)
parser.add_argument("--vocab_size", type=int, default=32000)
parser.add_argument("--limit_alphabet", type=int, default=6000)
args = parser.parse_args()

# SentencePieceBPETokenizer takes no BERT-style normalization options
# (clean_text, handle_chinese_chars, strip_accents, lowercase, wordpieces_prefix),
# so the tokenizer is created with its defaults.
tokenizer = SentencePieceBPETokenizer()

tokenizer.train(files=[args.corpus_file],
                limit_alphabet=args.limit_alphabet,
                vocab_size=args.vocab_size)

tokenizer.save("./", "ch-{}-wpm-{}".format(args.limit_alphabet, args.vocab_size))
class BPE:
    """
    An implementation of Byte-Pair Encoding (BPE) which supports
        - Character BPE
        - Byte BPE
        - WordPiece BPE
        - SentencePiece BPE
    """

    def __init__(self, args):
        self.args = args
        if self.args.type == "byte":
            self.tokenizer = ByteLevelBPETokenizer(
                add_prefix_space=True,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=None,
                merges_file=None,
                dropout=None,
                continuing_subword_prefix=None,
                end_of_word_suffix=None)
        elif self.args.type == "char":
            self.tokenizer = CharBPETokenizer(
                unk_token=unk_token,  # required
                suffix=suffix_token,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=None,
                merges_file=None,
                dropout=None)
        elif self.args.type == "bert":
            self.tokenizer = BertWordPieceTokenizer(
                clean_text=True,  # required
                handle_chinese_chars=True,  # required
                strip_accents=True,  # required
                lowercase=True,  # required
                vocab_file=None,
                # add_special_tokens=True,
                unk_token=BUNK,
                sep_token=BSEP,
                cls_token=BCLS,
                wordpieces_prefix=BPRE)
        elif self.args.type == "sent":
            self.tokenizer = SentencePieceBPETokenizer(
                add_prefix_space=True,  # required
                unk_token=unk_token,
                replacement=rep_token,
                vocab_file=None,
                merges_file=None,
                dropout=None)
        else:
            raise Exception("Not implemented yet")

    @staticmethod
    def load(vocab_file=None):
        if not os.path.exists(vocab_file):
            raise Exception("{} does not exist".format(vocab_file))
        path, filename = os.path.split(vocab_file)
        ttype = filename.split("_")[0]
        merges_file = os.path.join(path, filename.replace("vocab.json", "merges.txt"))
        if ttype == "byte":
            if not os.path.exists(merges_file):
                raise Exception("{} does not exist".format(merges_file))
            tokenizer = ByteLevelBPETokenizer(
                add_prefix_space=True,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=vocab_file,
                merges_file=merges_file,
                dropout=None,
                continuing_subword_prefix=None,
                end_of_word_suffix=None)
        elif ttype == "char":
            if not os.path.exists(merges_file):
                raise Exception("{} does not exist".format(merges_file))
            tokenizer = CharBPETokenizer(
                unk_token=unk_token,  # required
                suffix=suffix_token,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=vocab_file,
                merges_file=merges_file,
                dropout=None)
        elif ttype == "bert":
            tokenizer = BertWordPieceTokenizer(
                clean_text=True,  # required
                handle_chinese_chars=True,  # required
                strip_accents=True,  # required
                lowercase=True,  # required
                vocab_file=vocab_file,
                # add_special_tokens=True,
                unk_token=BUNK,
                sep_token=BSEP,
                cls_token=BCLS,
                wordpieces_prefix=BPRE)
        elif ttype == "sent":
            if not os.path.exists(merges_file):
                raise Exception("{} does not exist".format(merges_file))
            tokenizer = SentencePieceBPETokenizer(
                add_prefix_space=True,  # required
                unk_token=unk_token,
                replacement=rep_token,
                vocab_file=vocab_file,
                merges_file=merges_file,
                dropout=None)
        else:
            raise Exception("Not implemented yet")
        return tokenizer

    def train(self):
        files, vocab_size, min_frequency = (self.args.files, self.args.vocab_size,
                                            self.args.min_frequency)
        limit_alphabet = self.args.limit_alphabet
        files = glob.glob(files)
        if not files:
            print(f"File does not exist: {self.args.files}")
            exit(1)
        if self.args.type == "bert":
            # special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
            special_tokens = [BPAD, BUNK, BCLS, BSEP, BMASK]
        else:
            # special_tokens = ["<unk>"]
            special_tokens = [pad_token, unk_token]
        if self.args.type == "byte":
            self.tokenizer.train(files=files,
                                 vocab_size=vocab_size,
                                 min_frequency=min_frequency,
                                 special_tokens=special_tokens,
                                 show_progress=True)
        elif self.args.type == "char":
            self.tokenizer.train(files=files,
                                 vocab_size=vocab_size,
                                 min_frequency=min_frequency,
                                 special_tokens=special_tokens,
                                 limit_alphabet=limit_alphabet,
                                 initial_alphabet=[],
                                 suffix=suffix_token,
                                 show_progress=True)
        elif self.args.type == "bert":
            self.tokenizer.train(files=files,
                                 vocab_size=vocab_size,
                                 min_frequency=min_frequency,
                                 special_tokens=special_tokens,
                                 limit_alphabet=limit_alphabet,
                                 initial_alphabet=[],
                                 wordpieces_prefix=BPRE,
                                 show_progress=True)
        elif self.args.type == "sent":
            self.tokenizer.train(files=files,
                                 vocab_size=vocab_size,
                                 min_frequency=min_frequency,
                                 special_tokens=special_tokens,
                                 limit_alphabet=limit_alphabet,
                                 initial_alphabet=[],
                                 show_progress=True)
        else:
            raise Exception("Not implemented yet")
        if not os.path.exists(self.args.out):
            os.mkdir(self.args.out)
        self.tokenizer.save(self.args.out, self.args.type + "_level-bpe")

    @staticmethod
    def tokens2ids(pretrained_tokenizer, sos=False, eos=False,
                   add_special_tokens=False):
        """
        :param pretrained_tokenizer: pretrained tokenizer
        :return: a token2index function
        """

        def f(sent):
            if sos:
                sent = SOT + " " + sent
            if eos:
                sent = sent + " " + EOT
            tokenized_ids = pretrained_tokenizer.encode(
                sent, add_special_tokens=add_special_tokens).ids
            return tokenized_ids

        return f

    @staticmethod
    def collate_fn(padding_value=0, batch_first=True):

        def collate(examples):
            source = pad_sequence([torch.tensor(d[0]) for d in examples],
                                  batch_first=batch_first,
                                  padding_value=padding_value)
            target = pad_sequence([
                torch.tensor(d[1]) if d[1] is not None else torch.empty(0)
                for d in examples
            ],
                                  batch_first=batch_first,
                                  padding_value=padding_value)
            return source, target

        return collate
from pathlib import Path

from omegaconf import OmegaConf
from tokenizers import SentencePieceBPETokenizer

root_dir = Path("../..")
config_dir = root_dir / "configs"
dataset_config = OmegaConf.load(config_dir / "data" / "wmt14.en-de.yaml")
tokenizer_config = OmegaConf.load(config_dir / "tokenizer" /
                                  "sentencepiece_bpe_wmt14_en-de.yaml")

tokenizer = SentencePieceBPETokenizer()
tokenizer.train(
    [
        str(root_dir / dataset_config.path.source_train),
        str(root_dir / dataset_config.path.target_train),
    ],
    vocab_size=tokenizer_config.vocab_size,
    min_frequency=tokenizer_config.min_frequency,
    special_tokens=list(tokenizer_config.special_tokens),
    limit_alphabet=tokenizer_config.limit_alphabet,
)
tokenizer.save_model(directory=".", name=tokenizer_config.tokenizer_name)
import os
from os import listdir, path

from tokenizers import SentencePieceBPETokenizer

special_tokens = ['[PAD]', '[UNK]', '[SEP]', '[P0]', '[P1]', '[DOC_SEP]']

tokenizer = SentencePieceBPETokenizer(unk_token='[UNK]')

texts = [
    path.join(DATA_PATH, item) for item in listdir(DATA_PATH)
    if item.endswith('.txt')
]

tokenizer.train(texts,
                vocab_size=VOCAB_SIZE,
                min_frequency=10,
                special_tokens=special_tokens)

SAVE_PATH = path.join(DATA_PATH, 'vocab')
if not path.isdir(SAVE_PATH):
    os.makedirs(SAVE_PATH)

tokenizer.save(SAVE_PATH, 'en')
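# A hedged sketch of reloading the vocabulary saved above; tokenizer.save(SAVE_PATH, 'en')
# writes en-vocab.json and en-merges.txt in older tokenizers releases. The sample
# text is a hypothetical placeholder.
tokenizer = SentencePieceBPETokenizer(path.join(SAVE_PATH, 'en-vocab.json'),
                                      path.join(SAVE_PATH, 'en-merges.txt'),
                                      unk_token='[UNK]')
print(tokenizer.encode("A sample document. [DOC_SEP] Another document.").tokens)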
args = parser.parse_args()

files = glob.glob(args.files)
if not files:
    logger.info(f'File does not exist: {args.files}')
    exit(1)

# Initialize an empty tokenizer
tokenizer = SentencePieceBPETokenizer(add_prefix_space=True)

# And then train
tokenizer.train(
    files,
    vocab_size=args.vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=['<unk>'],
    limit_alphabet=1000
)

# Save the files
tokenizer.save(args.out, args.name)

# Restoring model from learned vocab/merges
tokenizer = SentencePieceBPETokenizer(
    join(args.out, '{}-vocab.json'.format(args.name)),
    join(args.out, '{}-merges.txt'.format(args.name)),
    add_prefix_space=True
)

# Test encoding
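# A hedged sketch of the "Test encoding" step above; the sample sentence is a
# hypothetical placeholder.
encoded = tokenizer.encode("the quick brown fox jumps over the lazy dog")
logger.info(f'Tokens: {encoded.tokens}')
logger.info(f'Ids: {encoded.ids}')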
from google.cloud import storage
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

# client = storage.Client()
storage_client = storage.Client()
bucket = storage_client.get_bucket('assignment1bdia')

from tokenizers import SentencePieceBPETokenizer

# Initialize a tokenizer
tokenizer = SentencePieceBPETokenizer()

# Then train it!
tokenizer.train('Trained_words.txt')

# And finally save it somewhere
tokenizer.save("tokenizer", "my-bpe")

# Initialize a tokenizer from the saved vocab/merges
vocab = "tokenizer/my-bpe-vocab.json"
merges = "tokenizer/my-bpe-merges.txt"
tokenizer = SentencePieceBPETokenizer(vocab, merges)

global_list = []


# scrape from input to output
class Scrape(beam.DoFn):

    def process(self, element):
        inputs_pattern = 'gs://assignment1bdia/data/export_dataframe.csv'
import os
import pickle
from typing import Dict, List, Optional

import numpy as np
from tokenizers import Encoding, SentencePieceBPETokenizer


class TextProcessor:

    def __init__(self, tok_model_path: Optional[str] = None):
        self.languages = {}
        if tok_model_path is not None:
            self.tokenizer = SentencePieceBPETokenizer(
                tok_model_path + "/vocab.json",
                tok_model_path + "/merges.txt",
            )
            with open(os.path.join(tok_model_path, "langs"), "rb") as fp:
                self.languages: Dict[str, int] = pickle.load(fp)
            self.init_properties(self.languages)

    def init_properties(self, languages: Dict[str, int] = None):
        self.max_len = 512
        self.pad_token = "<pad>"
        self.mask_token = "<mask>"
        self.unk_token = "<unk>"
        self.sep_token = "</s>"
        self.bos = "<s>"
        self.special_tokens = [
            self.pad_token, self.bos, self.unk_token, self.mask_token,
            self.sep_token
        ] + list(languages.keys())
        self.languages = languages

    def train_tokenizer(self, paths: List[str], vocab_size: int, to_save_dir: str,
                        languages: Dict[str, int]):
        self.tokenizer = SentencePieceBPETokenizer()
        self.init_properties(languages)
        self.tokenizer.train(files=paths,
                             vocab_size=vocab_size,
                             min_frequency=5,
                             special_tokens=self.special_tokens)
        self.save(directory=to_save_dir)

    def _tokenize(self, line) -> Encoding:
        return self.tokenizer.encode(line)

    def save(self, directory):
        self.tokenizer.save(directory)
        with open(os.path.join(directory, "langs"), "wb") as fp:
            pickle.dump(self.languages, fp)

    def tokenize_one_line(self, line, ignore_middle_eos: bool = False) -> List[int]:
        tokenized = []
        spl = [sen for sen in line.split("</s>") if len(sen.strip()) > 0]
        if spl[0].startswith("<"):
            words = spl[0].strip().split(" ")
            spl[0] = " ".join(words[1:])
            tokenized += [self.token_id(words[0])]
        for sen in spl:
            tokenized += self._tokenize(sen).ids
            if not ignore_middle_eos:
                tokenized += [self.sep_token_id()]
        if ignore_middle_eos:
            tokenized += [self.sep_token_id()]
        return tokenized

    def tokenize_one_sentence(self, line) -> List[int]:
        """
        Assume that the sentence has the language id as the first token and the
        end-of-sentence marker at the end!
        :param line:
        :return:
        """
        spl = line.strip().split(" ")
        lang_id, sen, eos = spl[0], " ".join(spl[1:-1]), spl[-1]
        tokenized = [self.token_id(lang_id)] + self._tokenize(sen).ids + [
            self.token_id(eos)
        ]
        return tokenized

    def tokenize_lines(self, line, blind_split: bool = False,
                       split_len: int = 512) -> List[List[int]]:
        """
        :param line:
        :param blind_split: If True, just splits the tokenized data into chunks
            without considering that every vector should start with a first word
            in sentence.
        :return:
        """
        tokenized = []
        if len(self.languages) > 0:
            spl = [sen for sen in line.split("</s>") if len(sen.strip()) > 0]
            lang_id = []
            if spl[0].startswith("<"):
                words = spl[0].strip().split(" ")
                lang_id = [self.token_id(words[0])]
                spl[0] = " ".join(words[1:])
            max_len = 0
            for sen in spl:
                toks = self._tokenize(sen).ids
                tokenized += lang_id + toks + [self.sep_token_id()]
                max_len = max(max_len, len(toks) + 1)
        else:
            tokenized = self._tokenize(line.strip()).ids

        if blind_split:
            num_pads = (split_len - (len(tokenized) % split_len))
            pad_arr = [self.pad_token_id()] * num_pads
            tokenized = np.array(tokenized + pad_arr)
            reshaped = tokenized.reshape((-1, split_len))
            return reshaped
        else:
            return self.split_tokenized(tokenized, min(max_len, self.max_len))

    def tokenize(self, lines) -> List[List[int]]:
        lines = [
            line.strip() for line in lines.strip().split("\n")
            if len(line.strip()) > 0
        ]
        tokenized = self.tokenizer.encode_batch(lines)
        return [tok.ids for tok in tokenized]

    def pad_token_id(self) -> int:
        return self.tokenizer.token_to_id(self.pad_token)

    def mask_token_id(self) -> int:
        return self.tokenizer.token_to_id(self.mask_token)

    def unk_token_id(self) -> int:
        return self.tokenizer.token_to_id(self.unk_token)

    def bos_token_id(self) -> int:
        return self.tokenizer.token_to_id(self.bos)

    def sep_token_id(self) -> int:
        return self.tokenizer.token_to_id(self.sep_token)

    def token_id(self, token: str) -> int:
        tok_id = self.tokenizer.token_to_id(token)
        if tok_id is None:
            return 0
        return tok_id

    def id2token(self, id: int) -> str:
        return self.tokenizer.id_to_token(id)

    def vocab_size(self) -> int:
        return self.tokenizer.get_vocab_size()

    def is_lang(self, id) -> bool:
        return self.tokenizer.id_to_token(id) in self.languages

    def lang_id(self, tok):
        if tok in self.languages:
            return self.languages[tok]
        return 0

    def split_tokenized(self, tokenized: List[int],
                        max_length: int = 512) -> List[List[int]]:
        """
        Based on self.max_len, splits very long sequences into smaller ones.
        Here we assume not to have any overlapping sequences.
        If the first token is a language, we add it to every new sequence.
        :return:
        """
        if len(tokenized) <= max_length:
            sequences = [tokenized]
            sequences[-1] = sequences[-1] + (max_length - len(sequences[-1])) * [
                self.pad_token_id()
            ]
            return sequences

        has_lang = self.is_lang(tokenized[0])
        sequence = tokenized[0:] if has_lang else tokenized
        seq_len = len(sequence)
        sep_id = self.sep_token_id()
        max_len = max_length - 1 if has_lang else max_length
        cur_start = 0
        sequences = []
        built_seq = []
        truncated = False  # Shows if previous sequence is truncated due to its length.
        used_ends = set()
        while cur_start < seq_len:
            if not truncated or not has_lang:
                cur_end = min(seq_len, cur_start + max_len)
            else:
                cur_end = min(seq_len, cur_start + max_len + 1)
            subseq = sequence[cur_start:cur_end]
            built_seq += subseq
            sep_positions = [i for i, id in enumerate(built_seq) if id == sep_id]
            if len(sep_positions) > 0:
                if sep_positions[-1] in used_ends:
                    truncated = True
                else:
                    built_seq = built_seq[:sep_positions[-1] + 1]
                    truncated = False
            else:
                truncated = True
            assert built_seq[-1] == sequence[len(built_seq) - 1]
            if has_lang and len(subseq) < max_len + 1:
                subseq = [tokenized[0]] + subseq
            sequences.append(subseq)
            cur_start = len(built_seq)
            used_ends.add(cur_start - 1)

        if len(sequences[-1]) < max_length:
            sequences[-1] = sequences[-1] + (max_length - len(sequences[-1])) * [
                self.pad_token_id()
            ]
        assert built_seq[-1] == sequence[len(built_seq) - 1]
        return sequences
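# A hedged usage sketch for TextProcessor above; the corpus paths, vocabulary
# size, output directory, language map, and sample line are hypothetical
# placeholders, and the save format follows whatever tokenizers version the
# class was written for.
processor = TextProcessor()
processor.train_tokenizer(paths=["train.en.txt", "train.fr.txt"],
                          vocab_size=32000,
                          to_save_dir="tok_out",
                          languages={"<en>": 0, "<fr>": 1})
ids = processor.tokenize_one_line("<en> hello world </s>")
print(ids)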