def __init__(self, prior_config, rl_config, beam_size=5):
    self.prior_config = prior_config
    self.rl_config = rl_config
    self.rl_config.beam_size = beam_size

    print('Loading Vocabulary...')
    self.vocab = Vocab()
    self.vocab.load(prior_config.word2id_path, prior_config.id2word_path)
    self.prior_config.vocab_size = self.vocab.vocab_size
    self.rl_config.vocab_size = self.vocab.vocab_size
    print(f'Vocabulary size: {self.vocab.vocab_size}')

    self.eval_data = self.get_data_loader()
    self.build_models()
def __init__(self, config, val_config):
    self.config = config
    self.val_config = val_config

    vocab = Vocab()
    vocab.load(config.word2id_path, config.id2word_path)
    self.vocab = vocab
    self.config.vocab_size = vocab.vocab_size

    # To initialize simulated conversations
    self.start_sentences = self.load_sentences(self.config.dataset_dir)

    self.eval_data = self.get_data_loader(train=False)
    self.build_models()
    if self.config.load_rl_ckpt:
        self.load_models()

    self.set_up_optimizers()
    self.set_up_summary()
    self.set_up_logging()

    if self.config.rl_batch_size == self.config.beam_size:
        raise ValueError('Decoding breaks if batch_size == beam_size')
# extracting morphs from each sentence
list_of_tokens = tr["document"].apply(split_morphs).tolist()

# generating the vocab
token_counter = Counter(itertools.chain.from_iterable(list_of_tokens))
tmp_vocab = nlp.Vocab(counter=token_counter, min_freq=10, bos_token=None, eos_token=None)

# connecting the SISG (fastText) embedding with the vocab
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko")
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab = Vocab(
    tmp_vocab.idx_to_token,
    padding_token="<pad>",
    unknown_token="<unk>",
    bos_token=None,
    eos_token=None,
)
vocab.embedding = array

# saving vocab
with open(nsmc_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)

config.update({"vocab": str(nsmc_dir / "vocab.pkl")})
config.save("conf/dataset/nsmc.json")
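A minimal follow-on sketch, assuming the script above has already written nsmc/vocab.pkl and that the pickled Vocab keeps the fastText matrix on its embedding attribute as set here; it simply reloads the file and checks the matrix shape.

import pickle

# reload the vocab pickled above (path assumed from the config update)
with open("nsmc/vocab.pkl", mode="rb") as io:
    restored = pickle.load(io)

# one fastText vector per vocabulary entry: (vocab size, embedding dim)
print(restored.embedding.shape)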
# korean vocab
split_ko = Stemmer(language='ko')
count_ko = Counter(
    itertools.chain.from_iterable(tr_dataset['ko'].apply(split_ko.extract_stem).tolist()))
list_of_token_ko = sorted(
    [token[0] for token in count_ko.items() if token[1] >= 15])
tmp_vocab = nlp.Vocab(Counter(list_of_token_ko), bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()
vocab_ko = Vocab(list_of_token_ko, bos_token=None, eos_token=None)
vocab_ko.embedding = array

with open(data_dir / 'vocab_ko.pkl', mode='wb') as io:
    pickle.dump(vocab_ko, io)

# english vocab
split_en = Stemmer(language='en')
count_en = Counter(
    itertools.chain.from_iterable(tr_dataset['en'].apply(split_en.extract_stem).tolist()))
list_of_token_en = [token[0] for token in count_en.items() if token[1] >= 15]
tmp_vocab = nlp.Vocab(Counter(list_of_token_en))
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.simple')
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()
if not ptr_bert_path.exists():
    urlretrieve('https://kobert.blob.core.windows.net/models/kobert/pytorch/pytorch_kobert_2439f391a6.params',
                filename=ptr_bert_path)
    ptr_bert = torch.load(ptr_bert_path)
    ptr_bert = OrderedDict([('bert.' + k, ptr_bert.get(k)) for k in ptr_bert.keys()])
    torch.save(ptr_bert, ptr_bert_path)
else:
    print('You already have pytorch_model_skt.bin!')

if not ptr_vocab_path.exists():
    urlretrieve('https://kobert.blob.core.windows.net/models/kobert/vocab/kobertvocab_f38b8a4d6d.json',
                filename=ptr_vocab_path)
    ptr_bert_vocab = BERTVocab.from_json(ptr_vocab_path.open(mode='rt').read())
    vocab = Vocab(ptr_bert_vocab.idx_to_token,
                  padding_token="[PAD]",
                  unknown_token="[UNK]",
                  bos_token=None,
                  eos_token=None,
                  reserved_tokens=["[CLS]", "[SEP]", "[MASK]"],
                  token_to_idx=ptr_bert_vocab.token_to_idx)

    # save vocab
    with open(ptr_vocab_path.with_suffix('.pkl'), mode="wb") as io:
        pickle.dump(vocab, io)
else:
    print('You already have pytorch_model_skt_vocab.json!')

if not ptr_tokenizer_path.exists():
    urlretrieve('https://kobert.blob.core.windows.net/models/kobert/tokenizer/tokenizer_78b3253a26.model',
                filename=ptr_tokenizer_path)
else:
    print('You already have pytorch_model_skt_tokenizer.model')
import json
import pickle

from model.utils import Vocab
from bert.tokenization import BertTokenizer

with open('experiment/config.json') as f:
    params = json.loads(f.read())

# loading BertTokenizer
ptr_tokenizer = BertTokenizer.from_pretrained('bert/vocab.korean.rawtext.list',
                                              do_lower_case=False)
idx_to_token = list(ptr_tokenizer.vocab.keys())

# generate vocab
token_vocab = Vocab(idx_to_token,
                    padding_token='[PAD]',
                    unknown_token='[UNK]',
                    bos_token=None,
                    eos_token=None,
                    reserved_tokens=['[CLS]', '[SEP]', '[MASK]'],
                    unknown_token_idx=1)

# save vocab
token_vocab_path = params['filepath'].get('token_vocab')
with open(token_vocab_path, 'wb') as f:
    pickle.dump(token_vocab, f)
import itertools
import pickle
from pathlib import Path
from collections import Counter

import pandas as pd
import gluonnlp as nlp

from model.split import split_morphs
from model.utils import Vocab
from utils import Config

qpair_dir = Path("qpair")
config = Config("conf/dataset/qpair.json")
train = pd.read_csv(config.train, sep="\t")

list_of_tokens_qa = train["question1"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens_qb = train["question2"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens = list_of_tokens_qa + list_of_tokens_qb

count_tokens = Counter(itertools.chain.from_iterable(list_of_tokens))
tmp_vocab = nlp.Vocab(counter=count_tokens, bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko", load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)

vocab = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
vocab.embedding = tmp_vocab.embedding.idx_to_vec.asnumpy()

with open(qpair_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)

config.update({"vocab": str(qpair_dir / "vocab.pkl")})
config.save("conf/dataset/qpair.json")
def main():
    """
    Here is the plan: for each dialogue, create a history sequence of sentences
    separated by <s>. The sentences in the history must occur in a short time
    span from one another so they are relevant. The last sentence becomes the
    response, and the response must also be in the span.
    :return:
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-dataset_dir",
                        default="./datasets/personachat/raw",
                        type=str,
                        required=False,
                        help="The input data dir. Should contain the xml for the task.")
    parser.add_argument("-output_dir",
                        default="./datasets/personachat/",
                        type=str,
                        required=False,
                        help="The output data dir.")
    parser.add_argument("-type",
                        default="none_original",
                        type=str,
                        required=False,
                        help="The genres you would like to use.")
    parser.add_argument("-max_sentence_tokens",
                        default=30,
                        type=int,
                        help="the maximum amount of sentence tokens")
    parser.add_argument("-a_nice_note",
                        default="only dialogues 1-10",
                        type=str,
                        required=False,
                        help="leave a nice lil note for yourself in the future")
    parser.add_argument('-train_split',
                        default=0.9,
                        type=float,
                        help='fraction of dataset to use for training, remainder is halved for val & test')
    parser.add_argument('-vocab_size',
                        default=20000,
                        type=int,
                        help='maximum size of the vocabulary for training')
    args = parser.parse_args()

    filename = os.path.join(args.dataset_dir, "train_{}.txt".format(args.type))
    conversations = create_dialogues(filename, args.max_sentence_tokens)
    for conversation in conversations:
        for utterance in conversation:
            if len(utterance) != args.max_sentence_tokens:
                print('Length of utterance not equal max: %s' % len(utterance))
                exit()
    print(conversations[0])

    # shuffle dataset
    random.seed('seed')
    random.shuffle(conversations)
    print('Number of conversations: %s' % len(conversations))
    mean_n_convos = sum([len(conv) for conv in conversations]) / len(conversations)
    print('Average utterances per conversation: %s' % mean_n_convos)

    # this is the format needed to train dialogue models on this domain
    def format_for_dialogue(conversations):
        conversation_length = [len(conv) for conv in conversations]
        sentence_length = [[sum([1 for token in sent if token != '<pad>']) for sent in conv]
                           for conv in conversations]
        sentences = conversations
        return conversation_length, sentence_length, sentences

    val_idx = int(len(conversations) * args.train_split)
    test_idx = (len(conversations) + val_idx) // 2
    print(val_idx)
    train_convos = conversations[:val_idx]
    val_convos = conversations[val_idx:test_idx]
    test_convos = conversations[test_idx:]

    # construct vocab
    vocab = Vocab()
    vocab.add_dataframe(train_convos, tokenized=True)
    vocab.update(args.vocab_size)
    print('Vocab size: %s' % len(vocab))
    word2id_path = os.path.join(args.output_dir, 'word2id.pkl')
    id2word_path = os.path.join(args.output_dir, 'id2word.pkl')
    vocab.pickle(word2id_path, id2word_path)

    print('Split: train %s, val %s, test %s' %
          (len(train_convos), len(val_convos), len(test_convos)))

    os.makedirs(args.output_dir, exist_ok=True)

    train_convo_len, train_sent_len, train_sent = format_for_dialogue(train_convos)
    print('Example data')
    print(train_convo_len[0])
    print(train_sent_len[0])
    print(train_sent[0])
    print()

    os.makedirs(os.path.join(args.output_dir, 'train'), exist_ok=True)
    pickle.dump(
        train_convo_len,
        open(os.path.join(args.output_dir, 'train', 'conversation_length.pkl'), 'wb'))
    pickle.dump(
        train_sent_len,
        open(os.path.join(args.output_dir, 'train', 'sentence_length.pkl'), 'wb'))
    pickle.dump(
        train_sent,
        open(os.path.join(args.output_dir, 'train', 'sentences.pkl'), 'wb'))
    val_convo_len, val_sent_len, val_sent = format_for_dialogue(val_convos)
    os.makedirs(os.path.join(args.output_dir, 'valid'), exist_ok=True)
    pickle.dump(
        val_convo_len,
        open(os.path.join(args.output_dir, 'valid', 'conversation_length.pkl'), 'wb'))
    pickle.dump(
        val_sent_len,
        open(os.path.join(args.output_dir, 'valid', 'sentence_length.pkl'), 'wb'))
    pickle.dump(
        val_sent,
        open(os.path.join(args.output_dir, 'valid', 'sentences.pkl'), 'wb'))

    test_convo_len, test_sent_len, test_sent = format_for_dialogue(test_convos)
    os.makedirs(os.path.join(args.output_dir, 'test'), exist_ok=True)
    pickle.dump(
        test_convo_len,
        open(os.path.join(args.output_dir, 'test', 'conversation_length.pkl'), 'wb'))
    pickle.dump(
        test_sent_len,
        open(os.path.join(args.output_dir, 'test', 'sentence_length.pkl'), 'wb'))
    pickle.dump(
        test_sent,
        open(os.path.join(args.output_dir, 'test', 'sentences.pkl'), 'wb'))
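A hedged sanity-check sketch, assuming the default ./datasets/personachat/ output directory from the argument parser above: it reloads one split's pickles and confirms the per-conversation structures line up.

import os
import pickle

# assumed default output_dir and the 'train' split written above
split_dir = os.path.join('./datasets/personachat/', 'train')

with open(os.path.join(split_dir, 'conversation_length.pkl'), 'rb') as f:
    convo_len = pickle.load(f)
with open(os.path.join(split_dir, 'sentence_length.pkl'), 'rb') as f:
    sent_len = pickle.load(f)
with open(os.path.join(split_dir, 'sentences.pkl'), 'rb') as f:
    sents = pickle.load(f)

# each conversation should contribute one length entry and one sentence list
assert len(convo_len) == len(sent_len) == len(sents)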
import json
import pickle

from model.utils import Vocab
from bert.tokenization import BertTokenizer

with open('experiment/config.json') as f:
    params = json.loads(f.read())

# loading BertTokenizer
ptr_tokenizer = BertTokenizer.from_pretrained('bert/vocab.korean.rawtext.list',
                                              do_lower_case=False)
idx_to_token = list(ptr_tokenizer.vocab.keys())

# generate vocab
token_vocab = Vocab(idx_to_token,
                    padding_token='[PAD]',
                    unknown_token='[UNK]',
                    bos_token=None,
                    eos_token=None,
                    reserved_tokens=['[CLS]', '[SEP]', '[MASK]'],
                    unknown_token_idx=1)
label_vocab = Vocab(['<split>', '<non_split>'],
                    unknown_token=None,
                    bos_token=None,
                    eos_token=None)

# save vocab
token_vocab_path = params['filepath'].get('token_vocab')
label_vocab_path = params['filepath'].get('label_vocab')

with open(token_vocab_path, 'wb') as f:
    pickle.dump(token_vocab, f)
with open(label_vocab_path, 'wb') as f:
    pickle.dump(label_vocab, f)
def __init__(self,
             id,
             name,
             checkpoint_path,
             max_conversation_length=5,
             max_sentence_length=30,
             is_test_bot=False,
             rl=False,
             safe_mode=True):
    """
    All chatbots should extend this class and be registered with the @registerbot decorator.

    :param id: An id string, must be unique!
    :param name: A user-friendly string shown to the end user to identify the chatbot.
        Should be unique.
    :param checkpoint_path: Directory where the trained model checkpoint is saved.
    :param max_conversation_length: Maximum number of conversation turns to condition on.
    :param max_sentence_length: Maximum number of tokens per sentence.
    :param is_test_bot: If True, this bot can be chosen from the list of bots you see at
        the /dialogadmins screen, but will never be randomly assigned to users landing on
        the home page.
    """
    self.id = id
    self.name = name
    self.checkpoint_path = checkpoint_path
    self.max_conversation_length = max_conversation_length
    self.max_sentence_length = max_sentence_length
    self.is_test_bot = is_test_bot
    self.safe_mode = safe_mode

    print("\n\nCreating chatbot", name)

    self.config = get_config_from_dir(checkpoint_path, mode='test', load_rl_ckpt=rl)
    self.config.beam_size = 5

    print('Loading Vocabulary...')
    self.vocab = Vocab()
    self.vocab.load(self.config.word2id_path, self.config.id2word_path)
    print(f'Vocabulary size: {self.vocab.vocab_size}')
    self.config.vocab_size = self.vocab.vocab_size

    # If checkpoint is for an emotion model, load that pickle file
    emotion_sentences = None
    if self.config.emotion:
        emotion_sentences = load_pickle(self.config.emojis_path)

    # Load infersent embeddings if necessary
    infersent_sentences = None
    if self.config.infersent:
        print('Loading infersent sentence embeddings...')
        infersent_sentences = load_pickle(self.config.infersent_path)
        embedding_size = infersent_sentences[0][0].shape[0]
        self.config.infersent_output_size = embedding_size

    self.data_loader = get_loader(
        sentences=load_pickle(self.config.sentences_path),
        conversation_length=load_pickle(self.config.conversation_length_path),
        sentence_length=load_pickle(self.config.sentence_length_path),
        vocab=self.vocab,
        batch_size=self.config.batch_size,
        emojis=emotion_sentences)

    if self.config.model in VariationalModels:
        self.solver = VariationalSolver(self.config, None, self.data_loader,
                                        vocab=self.vocab, is_train=False)
    elif self.config.model == 'Transformer':
        self.solver = ParlAISolver(self.config)
    else:
        self.solver = Solver(self.config, None, self.data_loader,
                             vocab=self.vocab, is_train=False)

    self.solver.build()
list_of_tokens = [
    token_count[0] for token_count in token_counter.items()
    if token_count[1] >= min_freq
]
list_of_tokens = sorted(list_of_tokens)
list_of_tokens.insert(0, '<pad>')
list_of_tokens.insert(0, '<unk>')

tmp_vocab = nlp.Vocab(counter=Counter(list_of_tokens), min_freq=1,
                      bos_token=None, eos_token=None)

# connecting SISG embedding with vocab
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab = Vocab(list_of_tokens,
              padding_token='<pad>',
              unknown_token='<unk>',
              bos_token=None,
              eos_token=None)
vocab.embedding = array

# saving vocab
with open('data/vocab.pkl', mode='wb') as io:
    pickle.dump(vocab, io)

data_config.vocab = 'data/vocab.pkl'
data_config.save('data/config.json')
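A hedged follow-on sketch, assuming PyTorch is available and that vocab.embedding is the numpy matrix pickled above: the pretrained fastText vectors can seed a trainable embedding layer.

import pickle

import torch
import torch.nn as nn

# reload the vocab saved above; unpickling needs the Vocab class on the path
with open('data/vocab.pkl', mode='rb') as io:
    vocab = pickle.load(io)

# copy the fastText matrix into an embedding layer that can be fine-tuned
weights = torch.from_numpy(vocab.embedding).float()
embedding_layer = nn.Embedding.from_pretrained(weights, freeze=False)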
        line = line.strip()
        if line:
            data.append(line.split('\t')[1:])
        else:
            dataset.append([list(elm) for elm in zip(*data)])
            data = []
            continue
except StopIteration:
    print('parsing is done')

label_counter = nlp.data.count_tokens(
    itertools.chain.from_iterable(map(lambda elm: elm[1], dataset)))
tmp_label_vocab = nlp.Vocab(label_counter, unknown_token=None)
label_vocab = Vocab(tmp_label_vocab.idx_to_token, unknown_token=None)

with open('./data/label_vocab.pkl', mode='wb') as io:
    pickle.dump(label_vocab, io)

tr, val = train_test_split(dataset, test_size=.1, random_state=777)

token_counter = nlp.data.count_tokens(
    itertools.chain.from_iterable(map(lambda elm: elm[0], tr)))
tmp_token_vocab = nlp.Vocab(token_counter, min_freq=10)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_token_vocab.set_embedding(ptr_embedding)
token_vocab = Vocab(tmp_token_vocab.idx_to_token)
token_vocab.embedding = tmp_token_vocab.embedding.idx_to_vec.asnumpy()

with open('./data/token_vocab.pkl', mode='wb') as io:
    pickle.dump(token_vocab, io)
import pickle
from pathlib import Path

from model.utils import Vocab
from utils import Config

LIST_OF_CHOSUNG = [
    "ㄱ", "ㄲ", "ㄴ", "ㄷ", "ㄸ", "ㄹ", "ㅁ", "ㅂ", "ㅃ", "ㅅ", "ㅆ", "ㅇ", "ㅈ", "ㅉ",
    "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ"
]
LIST_OF_JUNGSUNG = [
    "ㅏ", "ㅐ", "ㅑ", "ㅒ", "ㅓ", "ㅔ", "ㅕ", "ㅖ", "ㅗ", "ㅘ", "ㅙ", "ㅚ", "ㅛ", "ㅜ", "ㅝ",
    "ㅞ", "ㅟ", "ㅠ", "ㅡ", "ㅢ", "ㅣ"
]
LIST_OF_JONGSUNG = [
    " ", "ㄱ", "ㄲ", "ㄳ", "ㄴ", "ㄵ", "ㄶ", "ㄷ", "ㄹ", "ㄺ", "ㄻ", "ㄼ", "ㄽ", "ㄾ", "ㄿ",
    "ㅀ", "ㅁ", "ㅂ", "ㅄ", "ㅅ", "ㅆ", "ㅇ", "ㅈ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ"
]
LIST_OF_JAMOS = sorted(set(LIST_OF_CHOSUNG + LIST_OF_JUNGSUNG + LIST_OF_JONGSUNG))

vocab = Vocab(list_of_tokens=LIST_OF_JAMOS, bos_token=None, eos_token=None)

nsmc_dir = Path("nsmc")
with open(nsmc_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)

config = Config("conf/dataset/nsmc.json")
config.update({"vocab": str(nsmc_dir / "vocab.pkl")})
config.save("conf/dataset/nsmc.json")
import pickle

from model.utils import Vocab
from pretrained.tokenization import BertTokenizer

# loading BertTokenizer
ptr_tokenizer = BertTokenizer.from_pretrained(
    'pretrained/vocab.korean.rawtext.list', do_lower_case=False)
list_of_tokens = list(ptr_tokenizer.vocab.keys())

# generate vocab
vocab = Vocab(list_of_tokens,
              padding_token='[PAD]',
              unknown_token='[UNK]',
              bos_token=None,
              eos_token=None,
              reserved_tokens=['[CLS]', '[SEP]', '[MASK]'],
              token_to_idx={'[UNK]': 1})

# save vocab
with open('pretrained/vocab.pkl', mode='wb') as io:
    pickle.dump(vocab, io)
# [n_conversations, conversation_length (various)]
conversation_length = [
    min(len(conversation), max_conv_len) for conversation in conversations
]

sentences, sentence_length = preprocess_utils.pad_sentences(
    conversations,
    max_sentence_length=max_sent_len,
    max_conversation_length=max_conv_len)

print('Saving preprocessed data at', split_data_dir)
to_pickle(conversation_length, split_data_dir.joinpath('conversation_length.pkl'))
to_pickle(conversations, split_data_dir.joinpath('sentences.pkl'))
to_pickle(sentence_length, split_data_dir.joinpath('sentence_length.pkl'))

if split_type == 'train':
    print('Save Vocabulary...')
    vocab = Vocab(tokenizer)
    vocab.add_dataframe(conversations)
    vocab.update(max_size=max_vocab_size, min_freq=min_freq)
    print('Vocabulary size: ', len(vocab))
    vocab.pickle(ubuntu_dir.joinpath('word2id.pkl'),
                 ubuntu_dir.joinpath('id2word.pkl'))

print('Done!')
train = pd.read_csv(config.train, sep="\t")
list_of_tokens_qa = train["question1"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens_qb = train["question2"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens = list_of_tokens_qa + list_of_tokens_qb

count_tokens = Counter(itertools.chain.from_iterable(list_of_tokens))
tmp_vocab = nlp.Vocab(counter=count_tokens, bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko", load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)

morph_vocab = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
morph_vocab.embedding = tmp_vocab.embedding.idx_to_vec.asnumpy()

with open(qpair_dir / "morph_vocab.pkl", mode="wb") as io:
    pickle.dump(morph_vocab, io)
config.update({"coarse_vocab": str(qpair_dir / "morph_vocab.pkl")})

# jamo
chosung_list = [
    'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ',
    'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
]
jungsung_list = [
    'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ',
    'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ'
]
import pickle

from model.utils import Vocab

chosung_list = [
    'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ',
    'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
]
jungsung_list = [
    'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ',
    'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ'
]
jongsung_list = [
    ' ', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ',
    'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
]
list_of_jamos = sorted(set(chosung_list + jungsung_list + jongsung_list))

vocab = Vocab(list_of_jamos, bos_token=None, eos_token=None)

with open('data/vocab.pkl', mode='wb') as io:
    pickle.dump(vocab, io)
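An illustrative sketch, not part of the original script: decomposing a composed Hangul syllable into the chosung/jungsung/jongsung jamos that the vocab above indexes, using the standard Unicode syllable formula (composed syllables start at U+AC00 and are ordered by initial, then medial, then final jamo).

def split_jamos(syllable):
    # offset of the syllable inside the composed Hangul block
    code = ord(syllable) - 0xAC00
    if not 0 <= code < 11172:
        return [syllable]  # not a composed Hangul syllable
    cho, rest = divmod(code, 21 * 28)
    jung, jong = divmod(rest, 28)
    return [chosung_list[cho], jungsung_list[jung], jongsung_list[jong]]


print(split_jamos('한'))  # ['ㅎ', 'ㅏ', 'ㄴ']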
config = Config("conf/dataset/sample.json")
tr = pd.read_csv(config.train, sep='\t')

# korean vocab
split_ko = Stemmer(language='ko')
count_ko = Counter(
    itertools.chain.from_iterable(tr['ko'].apply(split_ko.extract_stem).tolist()))
tmp_vocab = nlp.Vocab(count_ko, bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko', load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab_ko = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
vocab_ko.embedding = array
vocab_ko_filepath = sample_dir / "vocab_ko.pkl"
config.update({"source_vocab": str(vocab_ko_filepath)})
with open(vocab_ko_filepath, mode='wb') as io:
    pickle.dump(vocab_ko, io)

# english vocab
split_en = Stemmer(language='en')
count_en = Counter(
    itertools.chain.from_iterable(tr['en'].apply(split_en.extract_stem).tolist()))
tmp_vocab = nlp.Vocab(count_en)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.simple',
def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoint', type=str, default=None)
    parser.add_argument('--mode', type=str, default='test')  # or valid
    kwargs = parser.parse_args()

    config = get_config_from_dir(kwargs.checkpoint, mode=kwargs.mode)
    print(config)

    print('Loading Vocabulary...')
    vocab = Vocab()
    vocab.load(config.word2id_path, config.id2word_path)
    print(f'Vocabulary size: {vocab.vocab_size}')
    config.vocab_size = vocab.vocab_size

    emotion_sentences = None
    if config.emotion:
        emotion_sentences = load_pickle(config.emojis_path)

    # Load infersent embeddings if necessary
    infersent_sentences = None
    if config.infersent:
        print('Loading infersent sentence embeddings...')
        infersent_sentences = load_pickle(config.infersent_path)
        embedding_size = infersent_sentences[0][0].shape[0]
else: print("Already you have {}".format(config_filename)) print("Saving the config of {} is done.".format(args.type)) # saving vocab of pretraining model ptr_tokenizer = BertTokenizer.from_pretrained( args.type, do_lower_case="uncased" in args.type ) idx_to_token = list(ptr_tokenizer.vocab.keys()) token_to_idx = {token: idx for idx, token in enumerate(idx_to_token)} vocab = Vocab( list_of_tokens=idx_to_token, unknown_token="[UNK]", padding_token="[PAD]", bos_token=None, eos_token=None, reserved_tokens=["[CLS]", "[SEP]", "[MASK]"], token_to_idx=token_to_idx ) vocab_filename = "{}-vocab.pkl".format(args.type) vocab_filepath = ptr_dir / vocab_filename if not vocab_filepath.exists(): with open(vocab_filepath, mode="wb") as io: pickle.dump(vocab, io) else: print("Already you have {}".format(vocab_filename)) print("Saving the vocab of {} is done".format(args.type))