# builds stem-level korean and english vocabularies from the 'ko' and 'en' columns of the
# training set and attaches pretrained fastText vectors to each.
import itertools
import pickle
from collections import Counter

import gluonnlp as nlp

# Stemmer, Vocab, data_dir, and the tr_dataset DataFrame (with 'ko' and 'en' columns)
# are assumed to be imported/defined earlier in the script from the project's own modules.

# korean vocab
split_ko = Stemmer(language='ko')
count_ko = Counter(
    itertools.chain.from_iterable(tr_dataset['ko'].apply(split_ko.extract_stem).tolist()))
list_of_token_ko = sorted(
    [token[0] for token in count_ko.items() if token[1] >= 15])
tmp_vocab = nlp.Vocab(Counter(list_of_token_ko), bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab_ko = Vocab(list_of_token_ko, bos_token=None, eos_token=None)
vocab_ko.embedding = array

with open(data_dir / 'vocab_ko.pkl', mode='wb') as io:
    pickle.dump(vocab_ko, io)

# english vocab
split_en = Stemmer(language='en')
count_en = Counter(
    itertools.chain.from_iterable(tr_dataset['en'].apply(split_en.extract_stem).tolist()))
list_of_token_en = [token[0] for token in count_en.items() if token[1] >= 15]
tmp_vocab = nlp.Vocab(Counter(list_of_token_en))
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.simple')
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()
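# The snippet above stops right after computing the english embedding matrix; presumably
# it is wrapped up and saved the same way as the korean one. A sketch of that missing
# tail under that assumption (the 'vocab_en.pkl' file name is a guess):
vocab_en = Vocab(list_of_token_en)
vocab_en.embedding = array
with open(data_dir / 'vocab_en.pkl', mode='wb') as io:
    pickle.dump(vocab_en, io)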
# builds the vocab for the nsmc dataset and attaches the pretrained SISG (fastText)
# embedding. imports mirror the qpair script below; `tr` (the nsmc training DataFrame),
# `nsmc_dir`, and `config` are assumed to be set up earlier in the script.
import itertools
import pickle
from collections import Counter

import gluonnlp as nlp

from model.split import split_morphs
from model.utils import Vocab

# extracting morphs from sentences
list_of_tokens = tr["document"].apply(split_morphs).tolist()

# generating the vocab
token_counter = Counter(itertools.chain.from_iterable(list_of_tokens))
tmp_vocab = nlp.Vocab(counter=token_counter, min_freq=10, bos_token=None, eos_token=None)

# connecting SISG embedding with vocab
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko")
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab = Vocab(
    tmp_vocab.idx_to_token,
    padding_token="<pad>",
    unknown_token="<unk>",
    bos_token=None,
    eos_token=None,
)
vocab.embedding = array

# saving vocab and recording its path in the dataset config
with open(nsmc_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)

config.update({"vocab": str(nsmc_dir / "vocab.pkl")})
config.save("conf/dataset/nsmc.json")
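# A minimal sketch (not part of the original script) of how a downstream training step
# could restore what was just saved: the "vocab" key written into conf/dataset/nsmc.json
# above points at the pickle, assuming Config exposes its keys as attributes the way
# config.train is used in the qpair script below.
import pickle

from utils import Config

nsmc_config = Config("conf/dataset/nsmc.json")
with open(nsmc_config.vocab, mode="rb") as io:
    nsmc_vocab = pickle.load(io)

print(nsmc_vocab.embedding.shape)  # rows = vocab size, columns = 300 (wiki.ko fastText)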
import itertools
import pickle
from collections import Counter
from pathlib import Path

import gluonnlp as nlp
import pandas as pd

from model.split import split_morphs
from model.utils import Vocab
from utils import Config

qpair_dir = Path("qpair")
config = Config("conf/dataset/qpair.json")
train = pd.read_csv(config.train, sep="\t")

# tokenize both questions of every pair into morphemes
list_of_tokens_qa = train["question1"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens_qb = train["question2"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens = list_of_tokens_qa + list_of_tokens_qb

# build the vocab and attach pretrained fastText (wiki.ko) vectors with subword n-grams
count_tokens = Counter(itertools.chain.from_iterable(list_of_tokens))
tmp_vocab = nlp.Vocab(counter=count_tokens, bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko", load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)

vocab = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
vocab.embedding = tmp_vocab.embedding.idx_to_vec.asnumpy()

# save the vocab and record its path in the dataset config
with open(qpair_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)

config.update({"vocab": str(qpair_dir / "vocab.pkl")})
config.save("conf/dataset/qpair.json")
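# Why load_ngrams=True above (a small illustration, not part of the original script):
# with the subword n-grams loaded, the fastText embedding can compose a vector for a
# token that never appears in wiki.ko, so rare morphemes from the question pairs still
# get informative vectors instead of the unknown-token vector. The token below is an
# arbitrary made-up example.
oov_vector = ptr_embedding["맛없었어욤"]
print(oov_vector.shape)  # a 300-dimensional vector composed from character n-grams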