Example #1
    def __init__(self, pretrained_model: str, vocab_file: str,
                 pad_token: str = '[PAD]',
                 unk_token: str = '[UNK]',
                 bos_token: str = '[BOS]',
                 eos_token: str = '[EOS]'):
        # Load the pretrained SentencePiece model, then hand it off together
        # with the vocabulary file and special tokens to the parent class.
        tokenizer = SentencePiece.load(pretrained_model)

        super(PretrainedTokenizer, self).__init__(
            tokenizer, vocab_file, pad_token, unk_token, bos_token, eos_token)
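A minimal usage sketch for the constructor above (assumption: the surrounding class is named PretrainedTokenizer, as the super() call suggests, and a SentencePiece model/vocab pair has already been trained; the file names are placeholders):

# Hypothetical usage; 'sentencepiece.model' and 'sentencepiece.vocab' are placeholder paths.
tokenizer = PretrainedTokenizer(pretrained_model='sentencepiece.model',
                                vocab_file='sentencepiece.vocab')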
Example #2
def build(args):
    # Train a SentencePiece subword model on the raw corpus; the trained model
    # and vocabulary are written to files named after args.prefix.
    tokenizer = SentencePiece.train(
        input=args.corpus,
        model_prefix=args.prefix,
        vocab_size=args.vocab_size,
        model_type=args.model_type,
        character_coverage=args.character_coverage,
        max_sentence_length=args.max_sentence_length,
        pad_token=args.pad_token,
        unk_token=args.unk_token,
        bos_token=args.bos_token,
        eos_token=args.eos_token)
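build() expects an args namespace carrying the fields referenced above. A minimal sketch of how such a namespace might be assembled with argparse (the flag names mirror the attribute names used in build(); the defaults are assumptions, not values from the original script):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--corpus', required=True)
parser.add_argument('--prefix', default='sentencepiece')
parser.add_argument('--vocab_size', type=int, default=16000)
parser.add_argument('--model_type', default='bpe')
parser.add_argument('--character_coverage', type=float, default=1.0)
parser.add_argument('--max_sentence_length', type=int, default=100000)
parser.add_argument('--pad_token', default='[PAD]')
parser.add_argument('--unk_token', default='[UNK]')
parser.add_argument('--bos_token', default='[BOS]')
parser.add_argument('--eos_token', default='[EOS]')

build(parser.parse_args())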
Example #3
def build(args):
    if args.tokenizer == 'sentencepiece':
        # SentencePiece learns its own subword vocabulary during training,
        # so no separate Vocab needs to be built.
        tokenizer = SentencePiece.train(
            input=args.corpus,
            model_prefix=args.prefix,
            vocab_size=args.vocab_size,
            model_type=args.model_type,
            character_coverage=args.character_coverage,
            max_sentence_length=args.max_sentence_length,
            pad_token=args.pad_token,
            unk_token=args.unk_token,
            bos_token=args.bos_token,
            eos_token=args.eos_token)
    else:
        # For word-level tokenizers, look the tokenizer up in the TOKENIZER
        # registry and build a Vocab over the corpus explicitly.
        tokenizer = TOKENIZER[args.tokenizer]
        vocab = Vocab(vocab_size=args.vocab_size,
                      pad_token=args.pad_token,
                      unk_token=args.unk_token,
                      bos_token=args.bos_token,
                      eos_token=args.eos_token)
        vocab.build(args.corpus, tokenizer, args.max_sentence_length)
        vocab.save(args.prefix)
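Example #3 also assumes a module-level TOKENIZER registry mapping command-line names to word-level tokenizer instances. A possible sketch (the registry keys are assumptions; NLTKMosesTokenizer and Mecab are tokenizers shipped with prenlp):

from prenlp.tokenizer import NLTKMosesTokenizer, Mecab

# Assumed registry; keys are illustrative and must match the --tokenizer flag values.
TOKENIZER = {
    'nltk_moses': NLTKMosesTokenizer(),
    'mecab': Mecab(),
}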
Example #4
import fasttext
import prenlp
from prenlp.data import Normalizer
from prenlp.tokenizer import SentencePiece

# Data preparation
imdb_train, imdb_test = prenlp.data.IMDB()

# Corpus preparation for training SentencePiece
corpus_path = 'corpus.txt'
with open(corpus_path, 'w', encoding='utf-8') as writer:
    for text, label in imdb_train:
        writer.write(text.strip() + '\n')

# Preprocessing: train a SentencePiece model on the corpus, then load the
# resulting 'sentencepiece.model' file for tokenization
tokenizer = SentencePiece()
tokenizer.train(input=corpus_path,
                model_prefix='sentencepiece',
                vocab_size=10000)
tokenizer.load('sentencepiece.model')
normalizer = Normalizer(url_repl=' ',
                        tag_repl=' ',
                        emoji_repl=' ',
                        email_repl=' ',
                        tel_repl=' ')

# Normalize and tokenize every review in place, re-joining tokens with spaces
for dataset in [imdb_train, imdb_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip())))

prenlp.data.fasttext_transform(imdb_train, 'imdb.train')
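The fasttext import at the top of this example is only needed from here on. A sketch of the natural continuation, assuming the test split is transformed the same way (the epoch value is illustrative):

prenlp.data.fasttext_transform(imdb_test, 'imdb.test')

# Train a supervised fastText classifier on the transformed training file.
model = fasttext.train_supervised(input='imdb.train', epoch=25)

# Evaluate: model.test returns (number of samples, precision@1, recall@1).
print(model.test('imdb.test'))

# Predict the label of a single preprocessed review.
print(model.predict(imdb_test[0][0]))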