def __init__(self,
             pretrained_model: str,
             vocab_file: str,
             pad_token: str = '[PAD]',
             unk_token: str = '[UNK]',
             bos_token: str = '[BOS]',
             eos_token: str = '[EOS]'):
    # Load a trained SentencePiece model and hand it to the base tokenizer
    # along with the vocabulary file and the special tokens.
    tokenizer = SentencePiece.load(pretrained_model)
    super(PretrainedTokenizer, self).__init__(tokenizer, vocab_file, pad_token,
                                              unk_token, bos_token, eos_token)
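# A minimal usage sketch (not part of the original source): the file names
# 'sentencepiece.model' and 'vocab.txt' are hypothetical paths assumed to be
# produced by an earlier SentencePiece training run.
tokenizer = PretrainedTokenizer(pretrained_model='sentencepiece.model',
                                vocab_file='vocab.txt')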
def build(args):
    # Train a SentencePiece model directly on the raw corpus; the trained
    # model and its vocabulary are written out under args.prefix.
    tokenizer = SentencePiece.train(input=args.corpus,
                                    model_prefix=args.prefix,
                                    vocab_size=args.vocab_size,
                                    model_type=args.model_type,
                                    character_coverage=args.character_coverage,
                                    max_sentence_length=args.max_sentence_length,
                                    pad_token=args.pad_token,
                                    unk_token=args.unk_token,
                                    bos_token=args.bos_token,
                                    eos_token=args.eos_token)
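# A sketch of calling SentencePiece.train without an argparse namespace,
# using the same signature as build() above. The concrete values (vocab size,
# model type, coverage, max length) are illustrative assumptions, not the
# project's defaults.
tokenizer = SentencePiece.train(input='corpus.txt',
                                model_prefix='sentencepiece',
                                vocab_size=10000,
                                model_type='bpe',
                                character_coverage=1.0,
                                max_sentence_length=100000,
                                pad_token='[PAD]',
                                unk_token='[UNK]',
                                bos_token='[BOS]',
                                eos_token='[EOS]')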
def build(args):
    if args.tokenizer == 'sentencepiece':
        # SentencePiece learns its own subword vocabulary from the corpus.
        tokenizer = SentencePiece.train(input=args.corpus,
                                        model_prefix=args.prefix,
                                        vocab_size=args.vocab_size,
                                        model_type=args.model_type,
                                        character_coverage=args.character_coverage,
                                        max_sentence_length=args.max_sentence_length,
                                        pad_token=args.pad_token,
                                        unk_token=args.unk_token,
                                        bos_token=args.bos_token,
                                        eos_token=args.eos_token)
    else:
        # For the other tokenizers, build and save the vocabulary explicitly.
        tokenizer = TOKENIZER[args.tokenizer]
        vocab = Vocab(vocab_size=args.vocab_size,
                      pad_token=args.pad_token,
                      unk_token=args.unk_token,
                      bos_token=args.bos_token,
                      eos_token=args.eos_token)
        vocab.build(args.corpus, tokenizer, args.max_sentence_length)
        vocab.save(args.prefix)
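# A hedged sketch of the argument parser that would feed build(). The flag
# names mirror the attributes accessed above, but the defaults are
# assumptions for illustration, not the project's actual CLI.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--corpus', required=True)
    parser.add_argument('--prefix', required=True)
    parser.add_argument('--tokenizer', default='sentencepiece')
    parser.add_argument('--vocab_size', type=int, default=10000)
    parser.add_argument('--model_type', default='bpe')
    parser.add_argument('--character_coverage', type=float, default=1.0)
    parser.add_argument('--max_sentence_length', type=int, default=100000)
    parser.add_argument('--pad_token', default='[PAD]')
    parser.add_argument('--unk_token', default='[UNK]')
    parser.add_argument('--bos_token', default='[BOS]')
    parser.add_argument('--eos_token', default='[EOS]')
    build(parser.parse_args())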
import fasttext
import prenlp
from prenlp.data import Normalizer
from prenlp.tokenizer import SentencePiece

# Data preparation
imdb_train, imdb_test = prenlp.data.IMDB()

# Corpus preparation for training SentencePiece
corpus_path = 'corpus.txt'
with open(corpus_path, 'w', encoding='utf-8') as writer:
    for text, label in imdb_train:
        writer.write(text.strip() + '\n')

# Preprocessing
tokenizer = SentencePiece()
tokenizer.train(input=corpus_path, model_prefix='sentencepiece', vocab_size=10000)
tokenizer.load('sentencepiece.model')
normalizer = Normalizer(url_repl=' ', tag_repl=' ', emoji_repl=' ',
                        email_repl=' ', tel_repl=' ')

for dataset in [imdb_train, imdb_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip())))

# Convert the training split to fastText's supervised input format
prenlp.data.fasttext_transform(imdb_train, 'imdb.train')
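# A possible continuation (a sketch, not part of the original snippet):
# transform the test split the same way, then train and evaluate a fastText
# classifier. fasttext.train_supervised and model.test are the standard
# fastText Python API; epoch=25 is an arbitrary choice here.
prenlp.data.fasttext_transform(imdb_test, 'imdb.test')

model = fasttext.train_supervised(input='imdb.train', epoch=25)

# model.test returns (number of samples, precision@1, recall@1)
n, precision, recall = model.test('imdb.test')
print(f'Test samples: {n}, P@1: {precision:.3f}, R@1: {recall:.3f}')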