import pickle

# Tokenizer and Vocab (and, in the variant further below, TOKENIZER and load_pretrained)
# are assumed to be provided by the project's own tokenization utilities.


def main(config):
    print(config)

    list_of_tokens = []
    if config.is_tokenized:
        # Corpus is already tokenized: read whitespace-separated tokens as-is.
        with open(config.corpus, 'r', encoding='utf8') as reader:
            for li, line in enumerate(reader):
                list_of_tokens += line.strip().split()
    else:
        # Select tokenizer.
        if config.tokenizer == 'mecab':
            from konlpy.tag import Mecab
            tokenizer = Tokenizer(tokenization_fn=Mecab().morphs)
        # Tokenize the raw corpus line by line.
        with open(config.corpus, 'r', encoding='utf8') as reader:
            for li, line in enumerate(reader):
                list_of_tokens += tokenizer.tokenize(line.strip())

    # Build vocabulary.
    vocab = Vocab(list_of_tokens=list_of_tokens,
                  unk_token=config.unk_token,
                  pad_token=config.pad_token,
                  bos_token=config.bos_token,
                  eos_token=config.eos_token,
                  min_freq=config.min_freq,
                  lower=config.lower)
    vocab.build()
    print('Vocabulary size: ', len(vocab))

    # Save vocabulary.
    with open(config.vocab, 'wb') as writer:
        pickle.dump(vocab, writer)
    print('Vocabulary saved to', config.vocab)
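# A minimal command-line driver sketch (not shown in the fragment above): it illustrates how a
# `config` namespace with the fields used by main() could be assembled with argparse. The flag
# names mirror the attributes accessed above; the defaults are assumptions, not the project's.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Build and pickle a Vocab from a corpus')
    parser.add_argument('--corpus', required=True, help='path to the corpus file')
    parser.add_argument('--vocab', required=True, help='output path for the pickled vocabulary')
    parser.add_argument('--is_tokenized', action='store_true',
                        help='set if the corpus is already whitespace-tokenized')
    parser.add_argument('--tokenizer', default='mecab', help='tokenizer to use for raw text')
    parser.add_argument('--unk_token', default='<unk>')
    parser.add_argument('--pad_token', default='<pad>')
    parser.add_argument('--bos_token', default='<bos>')
    parser.add_argument('--eos_token', default='<eos>')
    parser.add_argument('--min_freq', type=int, default=1,
                        help='drop tokens occurring fewer than this many times')
    parser.add_argument('--lower', action='store_true', help='lowercase tokens before counting')
    main(parser.parse_args())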
# A second build_vocab variant for labeled corpora: each corpus line is "<label>\t<text>",
# only the text columns are tokenized, and pretrained word vectors can be attached to the
# vocabulary. TOKENIZER is assumed to be the module-level sequence of supported tokenizer
# names (NLTK's word_tokenize first, Mecab second).
def main(config):
    # Select tokenizer.
    if config.tokenizer == TOKENIZER[0]:
        from nltk.tokenize import word_tokenize
        tokenization_fn = word_tokenize
    elif config.tokenizer == TOKENIZER[1]:
        from konlpy.tag import Mecab
        tokenization_fn = Mecab().morphs

    tokenizer = Tokenizer(tokenization_fn=tokenization_fn,
                          is_sentence=config.is_sentence,
                          max_seq_length=config.max_seq_length)

    # Tokenization & read tokens: drop the label column, keep only the text.
    list_of_tokens = []
    with open(config.corpus, 'r', encoding='utf-8', errors='ignore') as reader:
        for li, line in enumerate(reader):
            text = ' '.join(line.split('\t')[1:]).strip()
            list_of_tokens += tokenizer.tokenize(text)

    # Build vocabulary.
    vocab = Vocab(list_of_tokens=list_of_tokens,
                  unk_token=config.unk_token,
                  pad_token=config.pad_token,
                  bos_token=config.bos_token,
                  eos_token=config.eos_token,
                  min_freq=config.min_freq,
                  lower=config.lower)
    vocab.build()

    # Optionally attach pretrained word vectors to the vocabulary.
    if config.pretrained_vectors:
        pretrained_vectors = load_pretrained(fname=config.pretrained_vectors)
        vocab.from_pretrained(pretrained_vectors=pretrained_vectors)
    print('Vocabulary size: ', len(vocab))
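# Sketch of what `load_pretrained` might look like, under the assumption that the
# pretrained-vector file is in the plain-text word2vec/fastText format: an optional
# "<count> <dim>" header followed by one "<token> <v1> ... <vd>" line per word.
# The project's real helper may differ; the function below is illustrative only.
import numpy as np

def load_pretrained(fname):
    pretrained_vectors = {}
    with open(fname, 'r', encoding='utf-8', errors='ignore') as reader:
        for li, line in enumerate(reader):
            fields = line.rstrip().split(' ')
            if li == 0 and len(fields) == 2:
                continue  # skip the "<count> <dim>" header if present
            token, vector = fields[0], np.asarray(fields[1:], dtype=np.float32)
            pretrained_vectors[token] = vector
    return pretrained_vectors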