def train(self, raw_text_path):
    """Train a SentencePiece model on the text file at `raw_text_path`.

    The vocabulary size is `self.vocab_sz`, or the result of
    `self._get_vocab_sz(raw_text_path)` when that attribute is None.
    The training file is deleted once training has finished.

    Returns `self.cache_dir/'spm.model'`.
    """
    from sentencepiece import SentencePieceTrainer

    if self.vocab_sz is None:
        vocab_sz = self._get_vocab_sz(raw_text_path)
    else:
        vocab_sz = self.vocab_sz

    # Every special token is registered with a leading U+2581 marker.
    marked_toks = ['\u2581' + tok for tok in self.special_toks]
    model_prefix = self.cache_dir / 'spm'

    options = [
        f"--input={raw_text_path}",
        f"--vocab_size={vocab_sz}",
        f"--model_prefix={model_prefix}",
        f"--character_coverage={self.char_coverage}",
        f"--model_type={self.model_type}",
        # <unk> takes the first id after the user-defined symbols;
        # pad/bos/eos are switched off.
        f"--unk_id={len(marked_toks)}",
        "--pad_id=-1",
        "--bos_id=-1",
        "--eos_id=-1",
        f"--user_defined_symbols={','.join(marked_toks)}",
    ]
    SentencePieceTrainer.Train(" ".join(options))

    raw_text_path.unlink()
    return self.cache_dir / 'spm.model'
def _make_sp_model_file(self, vocab, prefix="spm", add_mask_token=False):
    """Trains a word-level Sentencepiece model holding `vocab` plus specials.

    The flags passed to the trainer request this layout: <pad> at id 0, <unk>
    at id 1, the control symbols [CLS], [SEP] (and [MASK] if requested), and
    <s>, </s> at the last two ids.

    The words in `vocab` are plain text without the whitespace marker.

    Args:
      vocab: a list of strings with the words to put into the model's
        vocabulary. Do not include special tokens here.
      prefix: an optional string used as filename prefix inside the fresh
        temporary subdirectory created by this function.
      add_mask_token: an optional bool, whether to include a [MASK] token.

    Returns:
      The filename of the created Sentencepiece model file.
    """
    # A new subdirectory per call keeps the outputs of separate calls apart.
    workdir = tempfile.mkdtemp(dir=self.get_temp_dir())
    model_prefix = os.path.join(workdir, prefix)
    input_file = model_prefix + "_train_input.txt"

    # The token at position i is written (len(vocab) - i) times, so earlier
    # tokens are more frequent in the training text.
    num_words = len(vocab)
    rows = [" ".join([word] * (num_words - rank))
            for rank, word in enumerate(vocab)]
    with tf.io.gfile.GFile(input_file, "w") as f:
        f.write("".join(row + "\n" for row in rows))

    controls = ["[CLS]", "[SEP]"]
    if add_mask_token:
        controls.append("[MASK]")
    # <pad>, <unk>, <s>, </s> plus the control symbols.
    full_vocab_size = num_words + 4 + len(controls)

    flags = {
        "model_prefix": model_prefix,
        "model_type": "word",
        "input": input_file,
        "pad_id": 0,
        "unk_id": 1,
        "control_symbols": ",".join(controls),
        "vocab_size": full_vocab_size,
        "bos_id": full_vocab_size - 2,
        "eos_id": full_vocab_size - 1,
    }
    argstr = " ".join(f"--{key}={value}" for key, value in flags.items())
    SentencePieceTrainer.Train(argstr)
    return model_prefix + ".model"
def train_sentencepiece(dataset,
                        vocab_size,
                        maxchars=1e7,
                        character_coverage=1.0,
                        model_path='wmt_model.model',
                        model_type='unigram',
                        data_keys=('inputs', 'targets')):
  """Trains a SentencePiece model on text dumped from a dataset.

  Host 0 copies the trained model to `model_path`; every other host polls
  until that file exists.

  Args:
    dataset: passed through to `dump_chars_to_textfile`.
    vocab_size: value of the trainer's --vocab_size flag.
    maxchars: passed through to `dump_chars_to_textfile`.
    character_coverage: value of the trainer's --character_coverage flag.
    model_path: destination of the trained model file.
    model_type: value of the trainer's --model_type flag.
    data_keys: passed through to `dump_chars_to_textfile`.

  Returns:
    The absolute, user-expanded form of `model_path`.
  """
  abs_model_path = os.path.abspath(os.path.expanduser(model_path))
  fname, _ = dump_chars_to_textfile(
      dataset, maxchars=maxchars, data_keys=data_keys)

  # Only the unique name is needed; the empty file itself is left behind.
  tmp_handle = tempfile.NamedTemporaryFile(delete=False, prefix='/tmp/sp_tmp')
  tmp_handle.close()
  tmp_prefix = tmp_handle.name
  trained_model = tmp_prefix + '.model'

  flags = {
      'input': fname,
      'vocab_size': vocab_size,
      'character_coverage': character_coverage,
      'model_prefix': tmp_prefix,
      'model_type': model_type,
  }
  SentencePieceTrainer.Train(
      ' '.join('--{}={}'.format(key, value) for key, value in flags.items()))

  if jax.host_id() != 0:
    # Wait for host 0 to publish the model, then one extra second.
    while not tf.io.gfile.exists(abs_model_path):
      time.sleep(1)
    time.sleep(1)
    return abs_model_path

  # Copy to an intermediate name and rename, so the final path only appears
  # once the file is complete.
  staging_path = abs_model_path + '.rntmp'
  tf.io.gfile.copy(trained_model, staging_path, overwrite=True)
  tf.io.gfile.rename(staging_path, abs_model_path, overwrite=True)
  logging.info('copied %s to %s', trained_model, abs_model_path)
  return abs_model_path
def train_subwords(train_path, model_path, model_type, vocab_size):
    """Train a SentencePiece model on the texts and titles of a RIA JSON dump.

    Every (text, title) pair yielded by ``parse_ria_json(train_path)`` is
    written, one item per line, to a temporary corpus file that is always
    removed afterwards. The model is written with the prefix
    ``<model_path>/<model_type>``.

    Args:
        train_path: passed to ``parse_ria_json``.
        model_path: output directory; created if missing.
        model_type: value of ``--model_type``; also the output file prefix.
        vocab_size: value of ``--vocab_size``.
    """
    # The corpus is written as UTF-8 explicitly instead of relying on the
    # locale's default encoding.
    temp = tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", delete=False)
    try:
        with temp:
            for text, title in parse_ria_json(train_path):
                temp.write(text + "\n")
                temp.write(title + "\n")
        os.makedirs(model_path, exist_ok=True)
        cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
            temp.name, os.path.join(model_path, model_type), vocab_size,
            model_type)
        sp_trainer.Train(cmd)
    finally:
        # Clean up even when parsing or training raised.
        os.unlink(temp.name)
def train_subwords(train_path, model_path, model_type, vocab_size, config_path):
    """Train a SentencePiece model on texts and summaries from a dataset reader.

    The reader is built from the ``reader`` section of the config at
    ``config_path``. Every (text, summary) pair from
    ``reader.parse_set(train_path)`` is written, one item per line, to a
    temporary corpus file that is always removed afterwards. The model is
    written with the prefix ``<model_path>/<model_type>``.

    Args:
        train_path: passed to ``reader.parse_set``.
        model_path: output directory; created if missing.
        model_type: value of ``--model_type``; also the output file prefix.
        vocab_size: value of ``--vocab_size``.
        config_path: config file the reader parameters are taken from.
    """
    # Build the reader first so a bad config cannot leave a temp file behind.
    params = Params.from_file(config_path)
    reader_params = params.pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)

    # The corpus is written as UTF-8 explicitly instead of relying on the
    # locale's default encoding.
    temp = tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", delete=False)
    try:
        with temp:
            for text, summary in reader.parse_set(train_path):
                temp.write(text + "\n")
                temp.write(summary + "\n")
        os.makedirs(model_path, exist_ok=True)
        cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
            temp.name, os.path.join(model_path, model_type), vocab_size,
            model_type)
        sp_trainer.Train(cmd)
    finally:
        # Clean up even when reading or training raised.
        os.unlink(temp.name)
def train(input_file, opencorpora_file):
    """Build fastText-format train/validation files for the "нн" data set.

    Rows of the CSV `input_file` whose lower-cased text contains "нн" exactly
    once get label 0; lines of `opencorpora_file` containing "нн" get label 1.
    The shuffled samples are split 80/20, re-tokenized with the existing
    `subword_model.model` and written to `nn_train.txt` / `nn_val.txt`.
    """
    samples = []
    with open(input_file, "r") as src:
        next(src)  # the first line of the file is skipped
        for row in csv.reader(src):
            _, _, text, _ = row
            text = text.replace("\n", " ").lower()
            if text.count("нн") == 1:
                samples.append((text, 0))

    with open(opencorpora_file, "r") as src:
        for raw_line in src:
            text = raw_line.strip().lower()
            if "нн" in text:
                samples.append((text, 1))

    random.shuffle(samples)
    split_at = int(0.8 * len(samples))
    train_part, val_part = samples[:split_at], samples[split_at:]

    model_path = "subword_model"
    if False:  # subword training is switched off; the model file is reused
        corpus = tempfile.NamedTemporaryFile(mode="w", delete=False)
        for text, _ in train_part:
            corpus.write(text + "\n")
        corpus.close()
        cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
            corpus.name, model_path, 30000, "bpe")
        sp_trainer.Train(cmd)
        os.unlink(corpus.name)

    processor = sp_processor()
    processor.load(model_path + ".model")

    def _retokenize(pairs):
        # Replace each text by its space-joined subword pieces.
        return [(" ".join(tokenize(processor, text)), label)
                for text, label in pairs]

    fixed_train = _retokenize(train_part)
    fixed_val = _retokenize(val_part)
    to_ft_format(fixed_train, "nn_train.txt")
    to_ft_format(fixed_val, "nn_val.txt")
def __spm_create(self):
    """Train the `data/love` SentencePiece model unless it already exists.

    Returns 0 when `data/love.model` is already present, otherwise None after
    training has been started with the flags assembled below.
    """
    if os.path.isfile("data/love.model"):
        return 0

    # 0.9995 for english, 1.0 for Korean
    pieces = [
        '--input=', c.data_text_path,
        ' --model_type=', c.model_type[0],
        ' --model_prefix=data/love ',
        ' --vocab_size=2507',
        ' --max_sentence_length=999999',
        ' --character_coverage=1.0',
        ' --pad_id=0 --pad_piece=[PAD]',
        ' --unk_id=1 --unk_piece=[UNK]',
        ' --bos_id=2 --bos_piece=[BOS]',
        ' --eos_id=3 --eos_piece=[EOS]',
        ' --user_defined_symbols=[SEP],[CLS],[MASK]',
    ]
    params = ''.join(pieces)
    SentencePieceTrainer.Train(params)
def train_sentencepiece(texts:Collection[str], path:PathOrStr, pre_rules: ListRules=None, post_rules:ListRules=None,
                        vocab_sz:int=None, max_vocab_sz:int=30000, model_type:str='unigram', max_sentence_len:int=20480,
                        lang='en', char_coverage=None, tmp_dir='tmp'):
    """Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`.

    `texts` are written newline-separated to `path/tmp_dir/all_text.txt`, which
    is the training input. When `vocab_sz` is None it is taken from
    `get_default_size(texts, max_vocab_sz)`. When `char_coverage` is None the
    coverage is 1 for languages in `full_char_coverage_langs` and 0.99
    otherwise. `pre_rules` and `post_rules` are accepted but not used here.

    Returns the cache directory `path/tmp_dir`; the model is written there
    with the prefix `spm`.
    """
    from sentencepiece import SentencePieceTrainer
    cache_dir = Path(path)/tmp_dir
    os.makedirs(cache_dir, exist_ok=True)
    if vocab_sz is None: vocab_sz = get_default_size(texts, max_vocab_sz)
    raw_text_path = cache_dir / 'all_text.txt'
    # Write the corpus as UTF-8 explicitly instead of relying on the locale's
    # default encoding, which can mangle or reject non-ASCII text.
    with open(raw_text_path, 'w', encoding='utf8') as f: f.write("\n".join(texts))
    spec_tokens = ['\u2581'+s for s in defaults.text_spec_tok]
    coverage = ifnone(char_coverage, 1 if lang in full_char_coverage_langs else 0.99)
    SentencePieceTrainer.Train(" ".join([
        f"--input={raw_text_path} --max_sentence_length={max_sentence_len}",
        f"--character_coverage={coverage}",
        f"--unk_id={len(defaults.text_spec_tok)} --pad_id=-1 --bos_id=-1 --eos_id=-1",
        f"--user_defined_symbols={','.join(spec_tokens)}",
        f"--model_prefix={cache_dir/'spm'} --vocab_size={vocab_sz} --model_type={model_type}"]))
    return cache_dir
def train(input_file):
    """Build fastText-format train/validation files for -тся/-ться endings.

    Rows of the CSV `input_file` are kept when their lower-cased text contains
    exactly one of the two spellings ("ться" or "тся"). The shuffled records
    are split 80/20, a BPE model with a 30000-piece vocabulary is trained on
    the training texts, and both splits are re-tokenized with it and written
    to `grammar_endings_train.txt` / `grammar_endings_val.txt`.
    """
    records = []
    with open(input_file, "r") as r:
        next(r)  # the first line of the file is skipped
        reader = csv.reader(r)
        for row in reader:
            _, text, label = row
            text = text.replace("\n", " ").lower()
            has_tjsya = "ться" in text
            has_tsya = "тся" in text
            # Keep the row only when exactly one spelling occurs.
            if has_tjsya != has_tsya:
                records.append((text, label))

    random.shuffle(records)
    border = int(0.8 * len(records))
    train = records[:border]
    val = records[border:]

    model_path = "subword_model"
    # The corpus is written as UTF-8 explicitly instead of relying on the
    # locale's default encoding.
    temp = tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", delete=False)
    try:
        with temp:
            for text, _ in train:
                temp.write(text + "\n")
        cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
            temp.name, model_path, 30000, "bpe")
        sp_trainer.Train(cmd)
    finally:
        # Remove the temporary corpus even if training failed.
        os.unlink(temp.name)

    processor = sp_processor()
    processor.load(model_path + ".model")

    fixed_train = [(" ".join(tokenize(processor, text)), label)
                   for text, label in train]
    fixed_val = [(" ".join(tokenize(processor, text)), label)
                 for text, label in val]
    to_ft_format(fixed_train, "grammar_endings_train.txt")
    to_ft_format(fixed_val, "grammar_endings_val.txt")
def train(cls, model_type: str, vocab_size: int, model_path: str, files: Iterator[str],
          no_split_toks: Optional[List[str]] = None, char_coverage: float = 0):
    """
    Train Sentence Piece Model
    :param model_type: sentence piece model type: {unigram, BPE, word, char}
    :param vocab_size: target vocabulary size
    :param model_path: where to store model; a trailing '.model' is optional
    :param files: input files
    :param no_split_toks: Don't split these tokens
    :param char_coverage: character coverage (0, 1]. A value <= 0 leaves the
        flag unset so the trainer's own default applies
    :return: the loaded model, after checking the ids of the reserved pieces
    """
    # Strip only a trailing '.model'. str.replace() would also remove the text
    # from the middle of the path (e.g. a directory called 'my.models'), so the
    # prefix that is trained and the path that is loaded could disagree.
    suffix = '.model'
    if model_path.endswith(suffix):
        model_prefix = model_path[:-len(suffix)]
    else:
        model_prefix = model_path
        model_path += suffix

    files = ','.join(files)  # comma-separated list for --input (no de-duplication)
    arg = f"--input={files} --vocab_size={vocab_size} --model_prefix={model_prefix}" \
          f" --model_type={model_type} --pad_id={cls.pad_idx} --bos_id={cls.bos_idx}" \
          f" --eos_id={cls.eos_idx} --unk_id={cls.unk_idx} --hard_vocab_limit=false"
    if char_coverage > 0:
        assert 0 < char_coverage <= 1
        arg += f" --character_coverage={char_coverage}"
    # CLS token goes in the beginning because we need it get index 4
    extra = [cls.cls_tok] + (no_split_toks or [])
    no_split_toks_str = ','.join(extra)
    arg += f" --user_defined_symbols={no_split_toks_str}"
    if model_type == 'bpe':  # BPE can have longer sentences, default is 2048
        arg += " --max_sentence_length=8192"
    if model_type == 'word':
        arg += ' --use_all_vocab'
    log.info(f"SPM: {arg}")
    SentencePieceTrainer.Train(arg)
    log.info("Training complete")

    model = SPField(model_path)
    # The trained model must place every reserved piece at its expected id.
    for piece, idx in cls.reserved():
        assert model.piece_to_id(piece) == idx
    return model
def _create_fake_sentencepiece_model(output_dir):
  """Trains a tiny word-level SentencePiece model inside `output_dir`.

  Returns the path of the model file, `<output_dir>/spm_model.model`.
  """
  words = ['a', 'b', 'c', 'd', 'e', 'abc', 'def', 'ABC', 'DEF']
  model_prefix = os.path.join(output_dir, 'spm_model')
  corpus_path = os.path.join(output_dir, 'train_input.txt')

  # One line holding all words, space separated.
  with tf.io.gfile.GFile(corpus_path, 'w') as f:
    f.write(' '.join(words) + ' \n')

  # Add 7 more tokens: <pad>, <unk>, [CLS], [SEP], [MASK], <s>, </s>.
  total_size = len(words) + 7
  flags = {
      'model_prefix': model_prefix,
      'model_type': 'word',
      'input': corpus_path,
      'pad_id': 0,
      'unk_id': 1,
      'control_symbols': '[CLS],[SEP],[MASK]',
      'vocab_size': total_size,
      'bos_id': total_size - 2,
      'eos_id': total_size - 1,
  }
  argstr = ' '.join(f'--{key}={value}' for key, value in flags.items())
  SentencePieceTrainer.Train(argstr)
  return model_prefix + '.model'
def train(model_type: str, vocab_size: int, model_path: str, files: Iterator[str],
          no_split_toks: Optional[List[str]] = None, cover_all_chars: bool = False):
    """
    Train Sentence Piece Model
    :param model_type: sentence piece model type: {unigram, BPE, word, char}
    :param vocab_size: target vocabulary size
    :param model_path: where to store model; a trailing '.model' is optional
    :param files: input files
    :param no_split_toks: Don't split these tokens
    :param cover_all_chars: when true, pass --character_coverage=1.0
    :return: the loaded model, after checking the ids of the reserved pieces
    """
    # Strip only a trailing '.model'. str.replace() would also remove the text
    # from the middle of the path (e.g. a directory called 'my.models'), so the
    # prefix that is trained and the path that is loaded could disagree.
    suffix = '.model'
    if model_path.endswith(suffix):
        model_prefix = model_path[:-len(suffix)]
    else:
        model_prefix = model_path
        model_path += suffix

    files = ','.join(files)  # comma-separated list for --input (no de-duplication)
    arg = f"--input={files} --vocab_size={vocab_size} --model_prefix={model_prefix}" \
          f" --model_type={model_type} --pad_id={PAD_TOK[1]} --bos_id={BOS_TOK[1]}" \
          f" --eos_id={EOS_TOK[1]} --unk_id={UNK_TOK[1]} --hard_vocab_limit=false"
    if cover_all_chars:
        arg += " --character_coverage=1.0"
    # CLS token goes in the beginning because we need it get index 4
    no_split_toks_str = ','.join([CLS_TOK[0]] + (no_split_toks or []))
    arg += f" --user_defined_symbols={no_split_toks_str}"
    if model_type == 'bpe':  # BPE can have longer sentences, default is 2048
        arg += " --max_sentence_length=8192"
    log.info(f"SPM: {arg}")
    SentencePieceTrainer.Train(arg)
    log.info("Training complete")

    model = Field(model_path)
    # The trained model must place every reserved piece at its expected id.
    for piece, idx in RESERVED_TOKS:
        assert model.piece_to_id(piece) == idx
    return model
def setUp(self):
  """Trains a tiny word-level SentencePiece model and stores its path.

  The model path is kept in `self._spm_path`.
  """
  super().setUp()

  # Make a sentencepiece model.
  base_dir = self.get_temp_dir()
  # NOTE(review): the subdirectory created here is never used below.
  tempfile.mkdtemp(dir=base_dir)

  words = ["a", "b", "c", "d", "e", "abc", "def", "ABC", "DEF"]
  model_prefix = os.path.join(base_dir, "spm_model")
  corpus_path = os.path.join(base_dir, "train_input.txt")
  with tf.io.gfile.GFile(corpus_path, "w") as f:
    f.write(" ".join(words) + " \n")

  # Add 7 more tokens: <pad>, <unk>, [CLS], [SEP], [MASK], <s>, </s>.
  total_size = len(words) + 7
  flags = {
      "model_prefix": model_prefix,
      "model_type": "word",
      "input": corpus_path,
      "pad_id": 0,
      "unk_id": 1,
      "control_symbols": "[CLS],[SEP],[MASK]",
      "vocab_size": total_size,
      "bos_id": total_size - 2,
      "eos_id": total_size - 1,
  }
  argstr = " ".join(f"--{key}={value}" for key, value in flags.items())
  SentencePieceTrainer.Train(argstr)
  self._spm_path = model_prefix + ".model"
def train_sentencepiece(file_path: str, model_path: str, vocab_size: int,
                        character_coverage: float, model_type: str):
  """Train a SentencePiece tokenizer on a text file.

  The special ids are fixed: pad=0, eos=1, unk=2, and bos is disabled.

  Args:
    file_path: path of data to train sentencepiece.
    model_path: value of the trainer's --model_prefix flag; the trainer adds
      the `.model` extension itself.
    vocab_size: size of vocab tokens to train.
    character_coverage: value of the trainer's --character_coverage flag.
    model_type: type of sentencepiece vocab to train.

  Returns:
    `model_path` with `.model` appended. The function previously returned
    None although its docstring promised a path.
  """
  argstr = " ".join([
      f"--input={file_path}",
      f"--vocab_size={vocab_size}",
      f"--character_coverage={character_coverage}",
      f"--model_prefix={model_path}",
      f"--model_type={model_type}",
      "--bos_id=-1",
      "--pad_id=0",
      "--eos_id=1",
      "--unk_id=2",
  ])
  SentencePieceTrainer.Train(argstr)
  return f"{model_path}.model"
def train_model(self, train_config=None):
    '''
    Train a sentence piece model and load it into ``self.sp``.

    https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb
    see from this tutorial for sentence piece training

    ``train_config`` (or ``self.config`` when it is empty/None) must provide
    "corpus", "model_prefix" and "vocab_size". Optional keys and their
    fallbacks: "model_type" ("unigram"), "character_coverage" (0.995),
    "mining_sentence_size" (5000000), "input_sentence_size" (5000000) and
    "max_sentencepiece_length" (5).

    Raises ValueError when training or loading fails; the original error is
    attached as the cause. A missing required key raises KeyError.
    '''
    config = train_config if train_config else self.config
    param = ""
    param += "--input={} ".format(config["corpus"])
    param += "--model_prefix={} ".format(config["model_prefix"])
    param += "--vocab_size={} ".format(config["vocab_size"])
    param += "--model_type={} ".format(config.get("model_type", "unigram"))
    param += "--character_coverage={} ".format(
        config.get("character_coverage", 0.995))
    param += "--mining_sentence_size={} ".format(
        config.get("mining_sentence_size", 5000000))
    param += "--input_sentence_size={} ".format(
        config.get("input_sentence_size", 5000000))
    param += "--max_sentencepiece_length={} ".format(
        config.get("max_sentencepiece_length", 5))
    try:
        SentencePieceTrainer.Train(param)
        self.sp.Load(config["model_prefix"] + ".model")
    except Exception as err:
        # A bare `except:` also swallowed KeyboardInterrupt/SystemExit and hid
        # the real failure; keep the cause attached for debugging.
        raise ValueError(" training word piece model failed ") from err
def train_sentencepiece(dataset,
                        vocab_size,
                        maxchars=1e7,
                        character_coverage=1.0,
                        model_path='wmt_model.model',
                        model_type='unigram',
                        data_keys=('inputs', 'targets')):
  """Trains a SentencePiece model on text dumped from a dataset.

  Args:
    dataset: passed through to `dump_chars_to_textfile`.
    vocab_size: value of the trainer's --vocab_size flag.
    maxchars: passed through to `dump_chars_to_textfile`.
    character_coverage: value of the trainer's --character_coverage flag.
    model_path: destination the trained model file is copied to.
    model_type: value of the trainer's --model_type flag.
    data_keys: passed through to `dump_chars_to_textfile`.

  Returns:
    The absolute, user-expanded form of `model_path`.
  """
  abs_model_path = os.path.abspath(os.path.expanduser(model_path))
  fname, _ = dump_chars_to_textfile(
      dataset, maxchars=maxchars, data_keys=data_keys)

  # Only the unique name is needed; the empty file itself is left behind.
  tmp_handle = tempfile.NamedTemporaryFile(delete=False, prefix='/tmp/sp_tmp')
  tmp_handle.close()
  tmp_prefix = tmp_handle.name
  trained_model = tmp_prefix + '.model'

  flags = {
      'input': fname,
      'vocab_size': vocab_size,
      'character_coverage': character_coverage,
      'model_prefix': tmp_prefix,
      'model_type': model_type,
  }
  SentencePieceTrainer.Train(
      ' '.join('--{}={}'.format(key, value) for key, value in flags.items()))

  tf.io.gfile.copy(trained_model, abs_model_path, overwrite=True)
  logging.info('copied %s to %s', trained_model, abs_model_path)
  return abs_model_path
#encoding: utf-8

# Ported from fairseq: https://github.com/pytorch/fairseq/blob/master/scripts/spm_train.py
# Forwards all command-line arguments, joined by spaces, to the
# SentencePiece trainer.

import sys

# The original line read `from sentencepiece SentencePieceTrainer`, which is
# a syntax error (the `import` keyword was missing).
from sentencepiece import SentencePieceTrainer

if __name__ == "__main__":
    SentencePieceTrainer.Train(" ".join(sys.argv[1:]))
def preprocess(self, directory: str, prefix: str, part: str,
               spm_model: SentencePieceProcessor = None, pretrain_emb=True,
               vocab_size=3000, embedding_size=600, max_sentence_length=16384,
               workers=3, skip_gramm=False):
    """Preprocess one part of a dataset into `<directory>/<prefix>`.

    For the parts "train" and "develop" a sentencepiece model is trained on
    `<directory>/<part>.tsv` and, if `pretrain_emb` is set, Word2Vec
    embeddings are trained and exported. For any other part an existing
    `spm_model` must be supplied. In every case the part is exported to
    `<directory>/<prefix>/<part>.npy`.

    NOTE(review): the source this block was recovered from had lost its
    indentation; the embedding step is placed inside the training branch
    here. Confirm against the original layout.

    Raises:
        FileNotFoundError: if `<directory>/<part>.tsv` does not exist.
        AssertionError: if `spm_model` is missing for a non-training part.
    """
    # Check data files existing
    workdir = os.path.join(directory, prefix)
    os.makedirs(workdir, exist_ok=True)
    data_part_file = os.path.join(directory, part + ".tsv")
    if not os.path.exists(data_part_file):
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                data_part_file)

    if part not in ['train', 'develop']:
        assert spm_model is not None, "For non train part, `spm_model` must be specified."
    else:
        # Train sentencepiece; "<workdir>/spm" is the model file prefix.
        logging.info("Start training sentecepiece")
        spm_directory = os.path.join(workdir, "spm")
        spm_params = (
            "--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
            "--input={} --model_prefix={} --vocab_size={} --max_sentence_length={}"
            .format(data_part_file, spm_directory, vocab_size,
                    max_sentence_length))
        SentencePieceTrainer.Train(spm_params)
        spm_model = SentencePieceProcessor()
        spm_model.load(spm_directory + ".model")

        if pretrain_emb:
            # Train word2vec
            logging.info("Start training word2vec")
            train_senteces = SentenceIterator(data_part_file, spm_model)
            logging.info("Loaded train sentences")
            w2v_model = Word2Vec(train_senteces, min_count=0, workers=workers,
                                 vector_size=embedding_size,
                                 sg=int(skip_gramm))
            w2v_model_filename = os.path.join(workdir, "word2vec.model")
            w2v_model.save(w2v_model_filename)

            # Export embeddings
            logging.info("Export embeddings")
            embeddings_filename = os.path.join(workdir, "embedding.npy")
            export_embeddings(embeddings_filename, spm_model, w2v_model)
            logging.info("Embeddings have been saved into {}".format(
                embeddings_filename))

    logging.info("Start exporting data file")
    # The export reads the same file that was checked above; the path is no
    # longer computed a second time.
    exported_file_name = os.path.join(workdir, part + ".npy")
    sentence_iterator = SentenceIterator(data_part_file, spm_model)
    sentence_iterator.export(exported_file_name)
    # The message literal was broken across two lines; restored to one line.
    logging.info("{} exported".format(exported_file_name))
    logging.info("Data preprocessing completed")
def preprocess(directory: str, prefix: str, part: str, spm: SentencePieceProcessor = None, pretrain_emb=True,
               vocab_size=30000, embedding_size=300, max_sentence_length=16384, workers=3, skip_gramm=False):
    """Preprocess one part of a dataset.

    NOTE(review): the source this block was recovered from had lost its
    indentation; the embedding step is placed inside the training branch
    here. Confirm against the original layout.

    Args:
        directory (str): Dataset directory; must contain `<part>.tsv`.
        prefix (str): Name of the output sub-directory of `directory`.
        part (str): Dataset part. For "train" and "dev" a sentencepiece
            model is trained on the part itself.
        spm (SentencePieceProcessor, optional): Defaults to None. Existing
            model; required for every part other than "train" and "dev".
        pretrain_emb (bool, optional): Defaults to True. Whether to train and
            export Word2Vec embeddings after training sentencepiece.
        vocab_size (int, optional): Defaults to 30000. Sentencepiece
            vocabulary size.
        embedding_size (int, optional): Defaults to 300. Word2Vec vector size.
        max_sentence_length (int, optional): Defaults to 16384. Passed to the
            sentencepiece trainer.
        workers (int, optional): Defaults to 3. Word2Vec worker count.
        skip_gramm (bool, optional): Defaults to False. Passed to Word2Vec as
            `sg=int(skip_gramm)`.

    Raises:
        FileNotFoundError: Raises if source data file doesn't exist.
    """
    out_dir = os.path.join(directory, prefix)
    source_path = os.path.join(directory, part + ".tsv")
    export_path = os.path.join(out_dir, part + ".npy")
    spm_prefix = os.path.join(out_dir, "spm")
    spm_model_path = os.path.join(out_dir, "spm.model")
    w2v_path = os.path.join(out_dir, "word2vec.model")
    embeddings_path = os.path.join(out_dir, "embedding.npy")

    logger.info("Preprocess {}/{} dataset.".format(out_dir, part))
    os.makedirs(out_dir, exist_ok=True)

    if not os.path.exists(source_path):
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), source_path)

    is_training_part = part in ["train", "dev"]
    if is_training_part:
        logger.info("Start training sentencepiece")
        trainer_flags = [
            "--pad_id=0",
            "--unk_id=1",
            "--bos_id=2",
            "--eos_id=3",
            "--input={}".format(source_path),
            "--model_prefix={}".format(spm_prefix),
            "--vocab_size={}".format(vocab_size),
            "--max_sentence_length={}".format(max_sentence_length),
        ]
        SentencePieceTrainer.Train(" ".join(trainer_flags))
        spm = SentencePieceProcessor()
        spm.load(spm_model_path)

        if pretrain_emb:
            logger.info("Start training Word2Vec embeddings")
            sentences = SentenceIterator(source_path, spm)
            logger.info("Loaded train sentences")
            w2v_model = Word2Vec(sentences, min_count=0, workers=workers,
                                 size=embedding_size, sg=int(skip_gramm))
            w2v_model.save(w2v_path)

            # Export embeddings
            logger.info("Export embeddings")
            export_embeddings(embeddings_path, spm, w2v_model)
            logger.info("Embeddings have been saved into {}".format(embeddings_path))
    else:
        assert spm is not None, "For non train part, `spm` must be specified."

    logger.info("Start exporting data file")
    SentenceIterator(source_path, spm).export(export_path)
    logger.info("{} exported".format(export_path))
text = re.sub('─', '─', text) text = re.sub('•', '•', text) text = re.sub('☆', '☆', text) text = re.sub('’', '’', text) text = re.sub('‎', '', text) text = re.sub('Ñ€', 'p', text) out.write(item[1:-1] + '\n') tq.update() def fetch_text(text_file, loop=None): if loop is None: loop = asyncio.get_event_loop() torrent = database.Torrent('207.148.124.42', loop=loop) queue = asyncio.Queue(10000) loop.run_until_complete( asyncio.gather(torrent.fetch_text(queue), write(queue, text_file))) if __name__ == '__main__': # fetch_text(sys.argv[1]) Trainer.Train( f'--input={sys.argv[1]} --model_prefix={sys.argv[2]} --vocab_size={sys.argv[3]}' ) tokenizer = Tokenizer() tokenizer.Load('spm.model') with open(sys.argv[1]) as input: for line in input: print(line) print(tokenizer.encode_as_pieces(line))
def main(argv):
  """Passes every argument after the program name to the SentencePiece trainer."""
  trainer_args = argv[1:]
  argstr = ' '.join(trainer_args)
  SentencePieceTrainer.Train(argstr)
with open(raw_text_path, 'r') as f: for line in f.readlines(): cnt.update(line.split()) if len(cnt)//4 > self.max_vocab_sz: return self.max_vocab_sz res = len(cnt)//4 while res%8 != 0: res+=1 return res def train(self, raw_text_path): "Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`" from sentencepiece import SentencePieceTrainer vocab_sz = self._get_vocab_sz(raw_text_path) if self.vocab_sz is None else self.vocab_sz spec_tokens = ['\u2581'+s for s in self.special_toks] SentencePieceTrainer.Train(" ".join([ f"--input={raw_text_path} --vocab_size={vocab_sz} --model_prefix={self.cache_dir/'spm'}", f"--character_coverage={self.char_coverage} --model_type={self.model_type}", f"--unk_id={len(spec_tokens)} --pad_id=-1 --bos_id=-1 --eos_id=-1", f"--user_defined_symbols={','.join(spec_tokens)}"])) raw_text_path.unlink() return self.cache_dir/'spm.model' def setup(self, items, rules): if self.tok is not None: return {'sp_model': self.sp_model} raw_text_path = self.cache_dir/'texts.out' with open(raw_text_path, 'w') as f: for t in progress_bar(apply_rules(items, rules), total=len(items), leave=False): f.write(f'{t}\n') return {'sp_model': self.train(raw_text_path)} def pipe(self, items): for t in items: yield self.tok.EncodeAsPieces(t)
from glob import glob

from sentencepiece import SentencePieceTrainer

# Training settings.
NUM_THREADS = 24
VOCABSIZE = 30_001
NUM_SENTS = 100_000_000

SOURCE_PATH = '/home/s2971992/Bertje/clean-data-v2/*/*.txt'
# SOURCE_PATH = '/Volumes/Data/Corpora/DutchWebNews/clean/*.txt'

# Every file matching the pattern is passed to the trainer, comma separated.
input_paths = list(glob(SOURCE_PATH))
input_path = ','.join(input_paths)
print('Total number of files: {}'.format(len(input_paths)))

# The four placeholders are filled, in order, with the input list, the
# vocabulary size, the thread count and the sentence sample size.
cmd = ' '.join([
    '--input={}',
    '--vocab_size={}',
    '--num_threads={}',
    '--input_sentence_size={}',
    '--shuffle_input_sentence=true',
    '--model_type=unigram',
    '--split_by_number=false',
    '--split_by_unicode_script=false',
    '--model_prefix=dutch',
    '--bos_piece=[CLS]',
    '--eos_piece=[SEP]',
    '--unk_piece=[UNK]',
    '--control_symbols=[PAD],[MASK]',
]).format(input_path, VOCABSIZE, NUM_THREADS, NUM_SENTS)

trainer = SentencePieceTrainer.Train(cmd)
def main():
    """Train a SentencePiece model and convert its vocabulary to BERT format.

    Exactly one of --dataset_dir (all `*.txt` files in it are merged into
    MERGED_FILE first) or --train_path must be given. After training, the
    pieces in `<model_prefix>.vocab` are rewritten in WordPiece notation (a
    leading "▁" is dropped, every other piece gets a "##" prefix) and saved to
    --vocab_filename in this order: [PAD], the [unusedN] placeholders, [UNK],
    [CLS], [SEP], [MASK], then the converted pieces.
    """
    parser = argparse.ArgumentParser(description="Create vocabulary")
    parser.add_argument("--dataset_dir", type=str)
    parser.add_argument("--model_prefix", default="tokenizer", type=str)
    parser.add_argument("--num_placeholders", default=100, type=int)
    # argparse applies `type` only to string defaults, so the former default
    # 1e7 stayed a float and was rendered as "10000000.0" in the flag below.
    parser.add_argument("--sample_size", default=10000000, type=int)
    parser.add_argument("--train_path", type=str)
    parser.add_argument("--vocab_filename", default="vocab.txt", type=str)
    parser.add_argument("--vocab_size", default=32000, type=int)
    args = parser.parse_args()

    if args.dataset_dir is not None and args.train_path is not None:
        print("Only one of 'dataset_dir' and 'train_path' can be specified")
        return
    elif args.dataset_dir is not None:
        # If the dataset is distributed across multiple files, merge into one
        # file before proceeding
        # filepaths = glob.glob(os.path.join(args.dataset_dir, "**", "*.txt"))
        filepaths = glob.glob(os.path.join(args.dataset_dir, "*.txt"))
        print(
            "Found {} files, concatenenating dataset into one file...".format(
                len(filepaths)))
        with open(MERGED_FILE, "w", encoding="utf-8") as merged:
            for filepath in tqdm(filepaths):
                # Close every input file instead of leaking the handle.
                with open(filepath, "r", encoding="utf-8",
                          errors="ignore") as part:
                    merged.write(part.read())
        train_path = MERGED_FILE
    elif args.train_path is not None:
        train_path = args.train_path
    else:
        print("One of 'dataset_dir' and 'train_path' must be specified")
        return

    SPT.Train("--input={} ".format(train_path) +
              "--model_prefix={} ".format(args.model_prefix) +
              "--vocab_size={} ".format(args.vocab_size -
                                        args.num_placeholders) +
              "--input_sentence_size={} ".format(args.sample_size) +
              "--shuffle_input_sentence=true " +
              "--hard_vocab_limit=false " +
              "--bos_id=-1 " +
              "--eos_id=-1")

    # Add BERT control symbols
    vocab = ["[PAD]"]
    tokens = []
    # The pieces contain the non-ASCII marker "▁", so decode explicitly.
    with open("{}.vocab".format(args.model_prefix), "r",
              encoding="utf-8") as f:
        # Skip the first line (the <unk> token) by line rather than by a
        # fixed byte offset.
        next(f, None)
        # Read tokens from each line and parse for vocab
        for line in f:
            piece = line.split("\t")[0]
            if piece.startswith("▁"):
                token = piece[1:]
            else:
                token = "##{}".format(piece)
            tokens.append(token)

    vocab.extend(
        ["[unused{}]".format(i) for i in range(args.vocab_size - len(tokens))])
    vocab.extend(["[UNK]", "[CLS]", "[SEP]", "[MASK]"])
    vocab.extend(tokens)

    # Save vocabulary to output file
    with open(args.vocab_filename, "w", encoding="utf-8") as f:
        for token in vocab:
            f.write("{}\n".format(token))
def main():
    """Train SentencePiece on a TSV file and derive BERT and GPT-2 vocabularies.

    The first column of `options.input` is copied to `<out_dir>/<basename>.tsv`
    and used as the training corpus. Outputs written to `out_dir`:
    `spiece.model` / `spiece.vocab`, `vocab.txt` (BERT style), `vocab.json`
    (GPT-2 style) and `merges.txt` (BPE merges learned on the SentencePiece
    encoding of the corpus).
    """
    options = parse_args()
    torch.manual_seed(options.seed)
    basename = os.path.splitext(os.path.basename(options.input))[0]
    out_dir = options.out_dir or "data/{}/".format(basename)
    spinner = Halo(spinner="dots", placement="right")

    # Shared dialect for all single-column TSV files. `quotechar=None` is the
    # supported way to disable quoting together with QUOTE_NONE; an empty
    # string is rejected by the csv module on Python 3.11 and later.
    tsv = dict(delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=None)

    with open(options.input, "r", encoding="utf8") as fd:
        reader = csv.reader(fd, **tsv)
        lines = [[line[0]] for line in reader]

    os.makedirs(out_dir, exist_ok=True)
    output_full = os.path.join(out_dir, "{}.tsv".format(basename))
    with open(output_full, "w", encoding="utf8") as fd:
        writer = csv.writer(fd, **tsv)
        writer.writerows(lines)

    vocab_size = 32000
    spiece_out = os.path.join(out_dir, "spiece")
    spiece_args = (
        "--input={} " "--model_prefix={} " "--vocab_size={} " "--character_coverage=1.0"
    ).format(output_full, spiece_out, vocab_size)
    SentencePieceTrainer.Train(spiece_args)

    # Load the generated vocabulary
    with open("{}.vocab".format(spiece_out), "r", encoding="utf8") as fd:
        reader = csv.reader(fd, **tsv)
        vocab = [line[0] for line in reader]
    # Remove the special tokens <unk>, <s>, </s>
    vocab = vocab[3:]

    # Convert to BERT style
    bert_vocab = [
        v[1:] if v.startswith("▁") else "##{}".format(v) for v in vocab if v != "▁"
    ]
    # Add BERT's special tokens to the beginning
    bert_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + bert_vocab
    # Fill up with unused tokens
    pad_size = vocab_size - len(bert_vocab)
    bert_vocab += ["unused{}".format(i) for i in range(pad_size)]
    with open(os.path.join(out_dir, "vocab.txt"), "w", encoding="utf8") as fd:
        writer = csv.writer(fd, **tsv)
        writer.writerows([[b] for b in bert_vocab])

    # Convert to GPT-2 style
    # Unfortunately it's slow and tedious.
    spinner.start(text="Generating BPE vocabulary")
    try:
        gpt2_vocab = ["Ġ{}".format(v[1:]) if v.startswith("▁") else v for v in vocab]
        # Add the GPT-2 special token to the end
        gpt2_vocab.append("<|endoftext|>")
        with open(os.path.join(out_dir, "vocab.json"), "w", encoding="utf8") as fd:
            json.dump({v: i for i, v in enumerate(gpt2_vocab)}, fd, ensure_ascii=False)

        spiece_processor = SentencePieceProcessor()
        spiece_processor.Load("{}.model".format(spiece_out))
        # Encode the whole text
        encoded = [
            [" ".join(spiece_processor.EncodeAsPieces(line[0])).replace("▁", "Ġ")]
            for line in lines
        ]

        tmp_encoded_fd, tmp_encoded_path = tempfile.mkstemp()
        tmp_bpe_fd, tmp_bpe_path = tempfile.mkstemp()
        # The second file is reopened by path below; close its descriptor now
        # so it is not leaked.
        os.close(tmp_bpe_fd)
        try:
            # Write the encoded text to a temporary file.
            with os.fdopen(tmp_encoded_fd, "w", encoding="utf8") as fd:
                writer = csv.writer(fd, **tsv)
                writer.writerows(encoded)

            # Both handles are closed (and the output flushed) before the
            # merges are read back.
            with open(tmp_encoded_path, "r", encoding="utf8") as bpe_in, \
                    open(tmp_bpe_path, "w", encoding="utf8") as bpe_out:
                learn_bpe(bpe_in, bpe_out, num_symbols=vocab_size)

            with open(tmp_bpe_path, "r", encoding="utf8") as fd:
                reader = csv.reader(fd, **tsv)
                seen = set()
                merges = []
                for line in reader:
                    # Get rid of the </w> tokens
                    line = line[0].replace("</w>", "")
                    # Remove duplicates (due to </w> tokens)
                    if line not in seen:
                        seen.add(line)
                        merges.append([line])

            with open(os.path.join(out_dir, "merges.txt"), "w", encoding="utf8") as fd:
                writer = csv.writer(fd, **tsv)
                writer.writerows(merges)
        finally:
            os.remove(tmp_encoded_path)
            os.remove(tmp_bpe_path)
    finally:
        # Stop the spinner on failure as well as on success.
        spinner.stop()