def __init__(self, vocab: Vocab) -> None:
    """Instantiating MultiChannelEmbedding class

    Args:
        vocab (model.utils.Vocab): the instance of model.utils.Vocab
    """
    super(MultiChannelEmbedding, self).__init__()
    self._static = nn.Embedding.from_pretrained(
        torch.from_numpy(vocab.embedding),
        freeze=True,
        padding_idx=vocab.to_indices(vocab.padding_token))
    self._non_static = nn.Embedding.from_pretrained(
        torch.from_numpy(vocab.embedding),
        freeze=False,
        padding_idx=vocab.to_indices(vocab.padding_token))
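# A minimal standalone sketch of the two-channel idea above, assuming only torch and
# numpy: the "static" channel keeps the pretrained vectors frozen while the "non-static"
# channel starts from the same weights but is fine-tuned. The embedding matrix and
# padding index here are dummies, not the real vocab.
import numpy as np
import torch
import torch.nn as nn

pretrained = np.random.rand(100, 32).astype(np.float32)  # dummy (vocab_size, embedding_dim)
static = nn.Embedding.from_pretrained(torch.from_numpy(pretrained), freeze=True, padding_idx=0)
non_static = nn.Embedding.from_pretrained(torch.from_numpy(pretrained), freeze=False, padding_idx=0)

tokens = torch.tensor([[1, 2, 3, 0]])           # one padded sequence of token ids
assert not static.weight.requires_grad          # frozen channel
assert non_static.weight.requires_grad          # trainable channel
stacked = torch.stack([static(tokens), non_static(tokens)], dim=1)  # (batch, 2, seq_len, dim)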
def __init__(self, vocab: Vocab, method: str, encoder_output_dim: int, decoder_hidden_dim: int,
             drop_ratio: float = .2) -> None:
    """Instantiating AttnDecoder class

    Args:
        vocab (model.utils.Vocab): the instance of model.utils.Vocab
        method (str): the method of attention, 'dot', 'general', 'concat'
        encoder_output_dim (int): the dimension of the encoder outputs
        decoder_hidden_dim (int): the dimension of hidden state and cell state of decoder
        drop_ratio (float): ratio of drop out, default 0.2
    """
    super(AttnDecoder, self).__init__()
    self._emb = Embedding(vocab=vocab, padding_idx=vocab.to_indices(vocab.padding_token),
                          freeze=False, permuting=False, tracking=False)
    self._ops = nn.LSTM(self._emb._ops.embedding_dim, decoder_hidden_dim, batch_first=True,
                        num_layers=2, dropout=drop_ratio)
    self._attn = GlobalAttn(method=method, encoder_output_dim=encoder_output_dim,
                            decoder_hidden_dim=decoder_hidden_dim)
    self._concat = nn.Linear(encoder_output_dim + decoder_hidden_dim,
                             self._emb._ops.embedding_dim, bias=False)
    # self._classify = nn.Linear(self._emb._ops.embedding_dim, len(vocab))
    self._dropout = nn.Dropout(p=drop_ratio)
def __init__(self, num_classes: int, lstm_hidden_dim: int, da: int, r: int, hidden_dim: int,
             vocab: Vocab) -> None:
    """Instantiating SAN class

    Args:
        num_classes (int): the number of classes
        lstm_hidden_dim (int): the number of features in the hidden states of the bi-directional lstm
        da (int): the number of features in the hidden layer of the self-attention
        r (int): the number of aspects of the self-attention
        hidden_dim (int): the number of features in the hidden layer of the mlp
        vocab (model.utils.Vocab): the instance of model.utils.Vocab
    """
    super(SAN, self).__init__()
    self._embedding = Embedding(vocab, padding_idx=vocab.to_indices(vocab.padding_token),
                                freeze=False, permuting=False, tracking=True)
    self._pipe = Linker(permuting=False)
    self._bilstm = BiLSTM(self._embedding._ops.embedding_dim, lstm_hidden_dim, using_sequence=True)
    self._attention = SelfAttention(2 * lstm_hidden_dim, da, r)
    self._fc1 = nn.Linear(2 * lstm_hidden_dim * r, hidden_dim)
    self._fc2 = nn.Linear(hidden_dim, num_classes)
def __init__(self, num_classes: int, embedding_dim: int, vocab: Vocab) -> None:
    """Instantiating CharCNN class

    Args:
        num_classes (int): the number of classes
        embedding_dim (int): the dimension of embedding vector for token
        vocab (model.utils.Vocab): the instance of model.utils.Vocab
    """
    super(CharCNN, self).__init__()
    self._extractor = nn.Sequential(
        nn.Embedding(len(vocab), embedding_dim, vocab.to_indices(vocab.padding_token)),
        Permute(),
        nn.Conv1d(in_channels=embedding_dim, out_channels=256, kernel_size=7),
        nn.ReLU(),
        nn.MaxPool1d(3, 3),
        nn.Conv1d(in_channels=256, out_channels=256, kernel_size=7),
        nn.ReLU(),
        nn.MaxPool1d(3, 3),
        nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3),
        nn.ReLU(),
        nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3),
        nn.ReLU(),
        nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3),
        nn.ReLU(),
        nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3),
        nn.ReLU(),
        nn.MaxPool1d(3, 3),
        Flatten())

    self._classifier = nn.Sequential(
        nn.Linear(in_features=1792, out_features=512),
        nn.ReLU(),
        nn.Dropout(),
        nn.Linear(in_features=512, out_features=512),
        nn.ReLU(),
        nn.Dropout(),
        nn.Linear(in_features=512, out_features=num_classes))

    self.apply(self._initailze)
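# A minimal sketch checking where the classifier's 1792 input features come from: with
# the conv/pool stack above and a character sequence of length 300 (the usual CharCNN
# setting; the length and the embedding_dim of 32 are assumptions, not stated in the
# snippet), the extractor ends with 256 channels of length 7, i.e. 256 * 7 = 1792.
import torch
import torch.nn as nn

probe = nn.Sequential(
    nn.Conv1d(32, 256, 7), nn.ReLU(), nn.MaxPool1d(3, 3),
    nn.Conv1d(256, 256, 7), nn.ReLU(), nn.MaxPool1d(3, 3),
    nn.Conv1d(256, 256, 3), nn.ReLU(),
    nn.Conv1d(256, 256, 3), nn.ReLU(),
    nn.Conv1d(256, 256, 3), nn.ReLU(),
    nn.Conv1d(256, 256, 3), nn.ReLU(),
    nn.MaxPool1d(3, 3))

dummy = torch.randn(1, 32, 300)            # (batch, embedding_dim, seq_len) after Permute()
out = probe(dummy)                          # -> torch.Size([1, 256, 7])
print(out.flatten(start_dim=1).shape)       # -> torch.Size([1, 1792])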
def __init__(self, vocab: Vocab, encoder_hidden_dim: int, drop_ratio: float = .2) -> None:
    """Instantiating BidiEncoder class

    Args:
        vocab (model.utils.Vocab): the instance of model.utils.Vocab
        encoder_hidden_dim (int): the dimension of hidden state and cell state
        drop_ratio (float): ratio of drop out, default 0.2
    """
    super(BidiEncoder, self).__init__()
    self._emb = Embedding(vocab=vocab, padding_idx=vocab.to_indices(vocab.padding_token),
                          freeze=False, permuting=False, tracking=True)
    self._linker = Linker(permuting=False)
    self._ops = nn.LSTM(self._emb._ops.embedding_dim, encoder_hidden_dim, batch_first=True,
                        num_layers=2, dropout=drop_ratio, bidirectional=True)
def __init__(self, config, val_config):
    self.config = config
    self.val_config = val_config

    vocab = Vocab()
    vocab.load(config.word2id_path, config.id2word_path)
    self.vocab = vocab
    self.config.vocab_size = vocab.vocab_size

    # To initialize simulated conversations
    self.start_sentences = self.load_sentences(self.config.dataset_dir)

    self.eval_data = self.get_data_loader(train=False)
    self.build_models()

    if self.config.load_rl_ckpt:
        self.load_models()

    self.set_up_optimizers()
    self.set_up_summary()
    self.set_up_logging()

    if self.config.rl_batch_size == self.config.beam_size:
        raise ValueError('Decoding breaks if batch_size == beam_size')
import itertools
import pickle
from collections import Counter
from pathlib import Path

import gluonnlp as nlp
import pandas as pd

from model.split import split_morphs
from model.utils import Vocab
from utils import Config

qpair_dir = Path("qpair")
config = Config("conf/dataset/qpair.json")
train = pd.read_csv(config.train, sep="\t")

list_of_tokens_qa = train["question1"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens_qb = train["question2"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens = list_of_tokens_qa + list_of_tokens_qb
count_tokens = Counter(itertools.chain.from_iterable(list_of_tokens))

tmp_vocab = nlp.Vocab(counter=count_tokens, bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko", load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)

vocab = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
vocab.embedding = tmp_vocab.embedding.idx_to_vec.asnumpy()

with open(qpair_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)

config.update({"vocab": str(qpair_dir / "vocab.pkl")})
config.save("conf/dataset/qpair.json")
class DBCQ:
    def __init__(self, prior_config, rl_config, beam_size=5):
        self.prior_config = prior_config
        self.rl_config = rl_config
        self.rl_config.beam_size = beam_size

        print('Loading Vocabulary...')
        self.vocab = Vocab()
        self.vocab.load(prior_config.word2id_path, prior_config.id2word_path)
        self.prior_config.vocab_size = self.vocab.vocab_size
        self.rl_config.vocab_size = self.vocab.vocab_size
        print(f'Vocabulary size: {self.vocab.vocab_size}')

        self.eval_data = self.get_data_loader()
        self.build_models()

    def build_models(self):
        rl_config = copy.deepcopy(self.rl_config)
        rl_config.checkpoint = None

        print('Building Q network')
        if rl_config.model in VariationalModels:
            self.q_net = VariationalSolver(
                rl_config, None, self.eval_data, vocab=self.vocab, is_train=False)
        else:
            self.q_net = Solver(
                rl_config, None, self.eval_data, vocab=self.vocab, is_train=False)
        self.q_net.build()
        self.load_q_network()

        print('Building prior network')
        if self.prior_config.model in VariationalModels:
            self.pretrained_prior = VariationalSolver(
                self.prior_config, None, self.eval_data, vocab=self.vocab, is_train=False)
        else:
            self.pretrained_prior = Solver(
                self.prior_config, None, self.eval_data, vocab=self.vocab, is_train=False)
        self.pretrained_prior.build()

        # Freeze the weights so they stay constant
        self.pretrained_prior.model.eval()
        for params in self.pretrained_prior.model.parameters():
            params.requires_grad = False
        self.q_net.model.eval()
        for params in self.q_net.model.parameters():
            params.requires_grad = False

    def load_q_network(self):
        """Load parameters from RL checkpoint"""
        print(f'Loading parameters for Q net from {self.rl_config.checkpoint}')
        q_ckpt = torch.load(self.rl_config.checkpoint)
        q_ckpt = convert_old_checkpoint_format(q_ckpt)
        self.q_net.model.load_state_dict(q_ckpt)

        # Ensure weights are initialized to be on the GPU when necessary
        if torch.cuda.is_available():
            print('Converting checkpointed model to cuda tensors')
            self.q_net.model.cuda()

    def get_data_loader(self):
        # If checkpoint is for an emotion model, load that pickle file
        emotion_sentences = None
        if self.prior_config.emotion:
            emotion_sentences = load_pickle(self.prior_config.emojis_path)

        # Load infersent embeddings if necessary
        infersent_sentences = None
        if self.prior_config.infersent:
            print('Loading infersent sentence embeddings...')
            infersent_sentences = load_pickle(self.prior_config.infersent_path)
            embedding_size = infersent_sentences[0][0].shape[0]
            self.prior_config.infersent_output_size = embedding_size

        return get_loader(
            sentences=load_pickle(self.prior_config.sentences_path),
            conversation_length=load_pickle(self.prior_config.conversation_length_path),
            sentence_length=load_pickle(self.prior_config.sentence_length_path),
            vocab=self.vocab,
            batch_size=self.prior_config.batch_size,
            emojis=emotion_sentences,
            infersent=infersent_sentences)

    def interact(self, max_conversation_length=5, sample_by='priority', debug=True):
        model_name = self.prior_config.model
        context_sentences = []

        print("Time to start a conversation with the chatbot! Its name is", model_name)
        username = input("What is your name? ")

        print("Let's start chatting. You can type 'quit' at any time to quit.")
        utterance = input("Input: ")
        print("\033[1A\033[K")  # Erases last line of output

        while utterance.lower() != 'quit' and utterance.lower() != 'exit':
            # Process utterance
            sentences = utterance.split('/')

            # Code and decode user input to show how it is transformed for the model
            coded, lens = self.pretrained_prior.process_user_input(sentences)
            decoded = [self.vocab.decode(sent) for i, sent in enumerate(coded) if i < lens[i]]
            print(username + ':', '. '.join(decoded))

            # Append to conversation
            context_sentences.extend(sentences)

            gen_response = self.generate_response_to_input(
                context_sentences, max_conversation_length, sample_by=sample_by, debug=debug)

            # Append generated sentences to conversation
            context_sentences.append(gen_response)

            # Print and get next user input
            print("\n" + model_name + ": " + gen_response)
            utterance = input("Input: ")
            print("\033[1A\033[K")

    def process_raw_text_into_input(self, raw_text_sentences,
                                    max_conversation_length=5, debug=False):
        sentences, lens = self.pretrained_prior.process_user_input(
            raw_text_sentences, self.rl_config.max_sentence_length)

        # Remove any sentences of length 0
        sentences = [sent for i, sent in enumerate(sentences) if lens[i] > 0]
        good_raw_sentences = [sent for i, sent in enumerate(raw_text_sentences) if lens[i] > 0]
        lens = [l for l in lens if l > 0]

        # Trim conversation to max length
        sentences = sentences[-max_conversation_length:]
        lens = lens[-max_conversation_length:]
        good_raw_sentences = good_raw_sentences[-max_conversation_length:]
        convo_length = len(sentences)

        # Convert to torch variables
        input_sentences = to_var(torch.LongTensor(sentences))
        input_sentence_length = to_var(torch.LongTensor(lens))
        input_conversation_length = to_var(torch.LongTensor([convo_length]))

        if debug:
            print('\n**Conversation history:**')
            for sent in sentences:
                print(self.vocab.decode(list(sent)))

        return (input_sentences, input_sentence_length, input_conversation_length)

    def duplicate_context_for_beams(self, sentences, sent_lens, conv_lens, beams):
        conv_lens = conv_lens.repeat(len(beams))

        # [beam_size * sentences, sentence_len]
        if len(sentences) > 1:
            targets = torch.cat(
                [torch.cat([sentences[1:, :], beams[i, :].unsqueeze(0)], 0)
                 for i in range(len(beams))], 0)
        else:
            targets = beams

        # HRED
        if self.rl_config.model not in VariationalModels:
            sent_lens = sent_lens.repeat(len(beams))
            return sentences, sent_lens, conv_lens, targets

        # VHRED, VHCR
        new_sentences = torch.cat(
            [torch.cat([sentences, beams[i, :].unsqueeze(0)], 0)
             for i in range(len(beams))], 0)
        new_len = to_var(torch.LongTensor([self.rl_config.max_sentence_length]))
        sent_lens = torch.cat(
            [torch.cat([sent_lens, new_len], 0) for i in range(len(beams))])

        return new_sentences, sent_lens, conv_lens, targets

    def generate_response_to_input(self, raw_text_sentences, max_conversation_length=5,
                                   sample_by='priority', emojize=True, debug=True):
        with torch.no_grad():
            (input_sentences, input_sent_lens,
             input_conv_lens) = self.process_raw_text_into_input(
                raw_text_sentences, debug=debug,
                max_conversation_length=max_conversation_length)

            # Initialize a tensor for beams
            beams = to_var(torch.LongTensor(
                np.ones((self.rl_config.beam_size, self.rl_config.max_sentence_length))))

            # Create a batch with the context duplicated for each beam
            (sentences, sent_lens, conv_lens, targets) = self.duplicate_context_for_beams(
                input_sentences, input_sent_lens, input_conv_lens, beams)

            # Continuously feed beam sentences into networks to sample the next
            # best word, add that to the beam, and continue
            for i in range(self.rl_config.max_sentence_length):
                # Run both models to obtain logits
                prior_output = self.pretrained_prior.model(
                    sentences, sent_lens, conv_lens, targets, rl_mode=True)
                all_prior_logits = prior_output[0]
                q_output = self.q_net.model(
                    sentences, sent_lens, conv_lens, targets, rl_mode=True)
                all_q_logits = q_output[0]

                # Select only those logits for the next word
                q_logits = all_q_logits[:, i, :].squeeze()
                prior_logits = all_prior_logits[:, i, :].squeeze()

                # Get prior distribution for the next word in each beam
                prior_dists = torch.nn.functional.softmax(prior_logits, 1)

                for b in range(self.rl_config.beam_size):
                    # Sample from the prior bcq_n times for each beam
                    dist = torch.distributions.Categorical(prior_dists[b, :])
                    sampled_idxs = dist.sample_n(self.rl_config.bcq_n)

                    # Select the sample with the highest q value
                    q_vals = torch.stack([q_logits[b, idx] for idx in sampled_idxs])
                    _, best_word_i = torch.max(q_vals, 0)
                    best_word = sampled_idxs[best_word_i]

                    # Update beams
                    beams[b, i] = best_word

                (sentences, sent_lens, conv_lens, targets) = self.duplicate_context_for_beams(
                    input_sentences, input_sent_lens, input_conv_lens, beams)

            generated_sentences = beams.cpu().numpy()

            if debug:
                print('\n**All generated responses:**')
                for gen in generated_sentences:
                    print(detokenize(self.vocab.decode(list(gen))))

            gen_response = self.pretrained_prior.select_best_generated_response(
                generated_sentences, sample_by, beam_size=self.rl_config.beam_size)

            decoded_response = self.vocab.decode(list(gen_response))
            decoded_response = detokenize(decoded_response)

            if emojize:
                inferred_emojis = self.pretrained_prior.botmoji.emojize_text(
                    raw_text_sentences[-1], 5, 0.07)
                decoded_response = inferred_emojis + " " + decoded_response
            return decoded_response
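# A minimal self-contained sketch of the per-beam BCQ word selection used in
# generate_response_to_input above, with dummy logits standing in for model outputs:
# sample candidate next words from the prior's softmax distribution, then keep the
# candidate whose Q-value is highest. vocab_size and bcq_n here are placeholders.
import torch

vocab_size, bcq_n = 50, 10
prior_logits = torch.randn(vocab_size)      # stand-in for one beam's prior logits
q_logits = torch.randn(vocab_size)          # stand-in for one beam's Q-values

prior_dist = torch.distributions.Categorical(logits=prior_logits)
sampled_idxs = prior_dist.sample((bcq_n,))  # candidate words proposed by the prior
best_word = sampled_idxs[q_logits[sampled_idxs].argmax()]  # candidate with highest Q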
import pickle

from model.utils import Vocab

chosung_list = [
    'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ',
    'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
]
jungsung_list = [
    'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ',
    'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ'
]
jongsung_list = [
    ' ', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ',
    'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
]

list_of_jamos = sorted(set(chosung_list + jungsung_list + jongsung_list))
vocab = Vocab(list_of_jamos, bos_token=None, eos_token=None)

with open('data/vocab.pkl', mode='wb') as io:
    pickle.dump(vocab, io)
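# A minimal sketch (not part of the script above) of how a Hangul syllable maps onto the
# jamo lists this vocab is built from, using the standard Unicode decomposition: syllables
# start at U+AC00 and factor into (chosung, jungsung, jongsung) indices as
# code // 588, (code % 588) // 28, code % 28.
def decompose(syllable: str):
    code = ord(syllable) - 0xAC00
    cho = chosung_list[code // 588]
    jung = jungsung_list[(code % 588) // 28]
    jong = jongsung_list[code % 28]
    return [cho, jung, jong]

print(decompose('한'))  # ['ㅎ', 'ㅏ', 'ㄴ'] -> look these up with vocab.to_indices(...)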
# extracting morph in sentences
list_of_tokens = tr["document"].apply(split_morphs).tolist()

# generating the vocab
token_counter = Counter(itertools.chain.from_iterable(list_of_tokens))
tmp_vocab = nlp.Vocab(counter=token_counter, min_freq=10, bos_token=None, eos_token=None)

# connecting SISG embedding with vocab
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko")
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab = Vocab(
    tmp_vocab.idx_to_token,
    padding_token="<pad>",
    unknown_token="<unk>",
    bos_token=None,
    eos_token=None,
)
vocab.embedding = array

# saving vocab
with open(nsmc_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)

config.update({"vocab": str(nsmc_dir / "vocab.pkl")})
config.save("conf/dataset/nsmc.json")
class Chatbot(ABC):
    def __init__(self, id, name, checkpoint_path, max_conversation_length=5,
                 max_sentence_length=30, is_test_bot=False, rl=False, safe_mode=True):
        """
        All chatbots should extend this class and be registered with the @registerbot decorator

        :param id: An id string, must be unique!
        :param name: A user-friendly string shown to the end user to identify the chatbot.
            Should be unique.
        :param checkpoint_path: Directory where the trained model checkpoint is saved.
        :param max_conversation_length: Maximum number of conversation turns to condition on.
        :param max_sentence_length: Maximum number of tokens per sentence.
        :param is_test_bot: If True, this bot can be chosen from the list of bots you see at
            the /dialogadmins screen, but will never be randomly assigned to users landing on
            the home page.
        """
        self.id = id
        self.name = name
        self.checkpoint_path = checkpoint_path
        self.max_conversation_length = max_conversation_length
        self.max_sentence_length = max_sentence_length
        self.is_test_bot = is_test_bot
        self.safe_mode = safe_mode

        print("\n\nCreating chatbot", name)

        self.config = get_config_from_dir(checkpoint_path, mode='test', load_rl_ckpt=rl)
        self.config.beam_size = 5

        print('Loading Vocabulary...')
        self.vocab = Vocab()
        self.vocab.load(self.config.word2id_path, self.config.id2word_path)
        print(f'Vocabulary size: {self.vocab.vocab_size}')

        self.config.vocab_size = self.vocab.vocab_size

        # If checkpoint is for an emotion model, load that pickle file
        emotion_sentences = None
        if self.config.emotion:
            emotion_sentences = load_pickle(self.config.emojis_path)

        # Load infersent embeddings if necessary
        infersent_sentences = None
        if self.config.infersent:
            print('Loading infersent sentence embeddings...')
            infersent_sentences = load_pickle(self.config.infersent_path)
            embedding_size = infersent_sentences[0][0].shape[0]
            self.config.infersent_output_size = embedding_size

        self.data_loader = get_loader(
            sentences=load_pickle(self.config.sentences_path),
            conversation_length=load_pickle(self.config.conversation_length_path),
            sentence_length=load_pickle(self.config.sentence_length_path),
            vocab=self.vocab,
            batch_size=self.config.batch_size,
            emojis=emotion_sentences)

        if self.config.model in VariationalModels:
            self.solver = VariationalSolver(self.config, None, self.data_loader,
                                            vocab=self.vocab, is_train=False)
        elif self.config.model == 'Transformer':
            self.solver = ParlAISolver(self.config)
        else:
            self.solver = Solver(self.config, None, self.data_loader,
                                 vocab=self.vocab, is_train=False)

        self.solver.build()

    def handle_messages(self, messages):
        """
        Takes a list of messages, and combines those with magic to return a response string

        :param messages: list of strings
        :return: string
        """
        greetings = [
            "hey , how are you ?",
            "hi , how 's it going ?",
            "hey , what 's up ?",
            "hi . how are you ?",
            "hello , how are you doing today ? ",
            "hello . how are things with you ?",
            "hey ! so, tell me about yourself .",
            "hi . nice to meet you ."
        ]

        # Check for no response
        if len(messages) == 0:
            # Respond with canned greeting response
            return np.random.choice(greetings)

        # Check for overly short intro messages
        if len(messages) < 2 and len(messages[0]) <= 6:  # 6 for "hello."
            first_m = messages[0].lower()
            if 'hi' in first_m or 'hey' in first_m or 'hello' in first_m:
                # Respond with canned greeting response
                return np.random.choice(greetings)

        response = self.solver.generate_response_to_input(
            messages, max_conversation_length=self.max_conversation_length,
            emojize=True, debug=False)

        # Manually remove inappropriate language from the response.
        # WARNING: the following code contains inappropriate language
        if self.safe_mode:
            response = response.replace("f*g", "<unknown>")
            response = response.replace("gays", "<unknown>")
            response = response.replace("c**t", "%@#$")
            response = response.replace("f**k", "%@#$")
            response = response.replace("shit", "%@#$")
            response = response.replace("dyke", "%@#$")
            response = response.replace("hell", "heck")
            response = response.replace("dick", "d***")
            response = response.replace("bitch", "%@#$")

        return response
import itertools
import pickle
from collections import Counter
from pathlib import Path

import pandas as pd

from model.split import split_morphs, split_space
from model.utils import Vocab

data_dir = Path('data')
tr_filepath = (data_dir / 'train').with_suffix('.txt')
tr_dataset = pd.read_csv(tr_filepath, sep='\t')

# korean vocab
count_ko = Counter(
    itertools.chain.from_iterable(tr_dataset['ko'].apply(split_morphs).tolist()))
list_of_token_ko = [token[0] for token in count_ko.items() if token[1] >= 15]
vocab_ko = Vocab(list_of_token_ko, bos_token=None, eos_token=None)

with open(data_dir / 'vocab_ko.pkl', mode='wb') as io:
    pickle.dump(vocab_ko, io)

# english vocab
count_en = Counter(
    itertools.chain.from_iterable(tr_dataset['en'].apply(split_space).tolist()))
list_of_token_en = [token[0] for token in count_en.items() if token[1] >= 15]
vocab_en = Vocab(list_of_token_en)

with open(data_dir / 'vocab_en.pkl', mode='wb') as io:
    pickle.dump(vocab_en, io)
list_of_tokens = [
    token_count[0] for token_count in token_counter.items() if token_count[1] >= min_freq
]
list_of_tokens = sorted(list_of_tokens)
list_of_tokens.insert(0, '<pad>')
list_of_tokens.insert(0, '<unk>')

tmp_vocab = nlp.Vocab(counter=Counter(list_of_tokens), min_freq=1, bos_token=None, eos_token=None)

# connecting SISG embedding with vocab
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab = Vocab(list_of_tokens, padding_token='<pad>', unknown_token='<unk>',
              bos_token=None, eos_token=None)
vocab.embedding = array

# saving vocab
with open('data/vocab.pkl', mode='wb') as io:
    pickle.dump(vocab, io)

data_config.vocab = 'data/vocab.pkl'
data_config.save('data/config.json')
        line = line.strip()
        if line:
            data.append(line.split('\t')[1:])
        else:
            dataset.append([list(elm) for elm in zip(*data)])
            data = []
            continue
except StopIteration:
    print('parsing is done')

label_counter = nlp.data.count_tokens(
    itertools.chain.from_iterable(map(lambda elm: elm[1], dataset)))
tmp_label_vocab = nlp.Vocab(label_counter, unknown_token=None)
label_vocab = Vocab(tmp_label_vocab.idx_to_token, unknown_token=None)

with open('./data/label_vocab.pkl', mode='wb') as io:
    pickle.dump(label_vocab, io)

tr, val = train_test_split(dataset, test_size=.1, random_state=777)

token_counter = nlp.data.count_tokens(
    itertools.chain.from_iterable(map(lambda elm: elm[0], tr)))
tmp_token_vocab = nlp.Vocab(token_counter, min_freq=10)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_token_vocab.set_embedding(ptr_embedding)
token_vocab = Vocab(tmp_token_vocab.idx_to_token)
token_vocab.embedding = tmp_token_vocab.embedding.idx_to_vec.asnumpy()

with open('./data/token_vocab.pkl', mode='wb') as io:
    pickle.dump(token_vocab, io)
import pickle

from model.utils import Vocab
from pretrained.tokenization import BertTokenizer

# loading BertTokenizer
ptr_tokenizer = BertTokenizer.from_pretrained(
    'pretrained/vocab.korean.rawtext.list', do_lower_case=False)
list_of_tokens = list(ptr_tokenizer.vocab.keys())

# generate vocab
vocab = Vocab(list_of_tokens, padding_token='[PAD]', unknown_token='[UNK]',
              bos_token=None, eos_token=None,
              reserved_tokens=['[CLS]', '[SEP]', '[MASK]'],
              token_to_idx={'[UNK]': 1})

# save vocab
with open('pretrained/vocab.pkl', mode='wb') as io:
    pickle.dump(vocab, io)
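# A small sanity check one might add after building the vocab above (an assumption, not
# part of the original script): the custom Vocab should assign the same indices that the
# BertTokenizer uses, otherwise pretrained BERT embedding rows will not line up with the
# token ids produced at runtime.
for token in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']:
    assert vocab.to_indices(token) == ptr_tokenizer.vocab[token], token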
if not ptr_bert_path.exists():
    urlretrieve('https://kobert.blob.core.windows.net/models/kobert/pytorch/pytorch_kobert_2439f391a6.params',
                filename=ptr_bert_path)
    ptr_bert = torch.load(ptr_bert_path)
    ptr_bert = OrderedDict([(('bert.' + k), ptr_bert.get(k)) for k in ptr_bert.keys()])
    torch.save(ptr_bert, ptr_bert_path)
else:
    print('You already have pytorch_model_skt.bin!')

if not ptr_vocab_path.exists():
    urlretrieve('https://kobert.blob.core.windows.net/models/kobert/vocab/kobertvocab_f38b8a4d6d.json',
                filename=ptr_vocab_path)
    ptr_bert_vocab = BERTVocab.from_json(ptr_vocab_path.open(mode='rt').read())
    vocab = Vocab(ptr_bert_vocab.idx_to_token,
                  padding_token="[PAD]",
                  unknown_token="[UNK]",
                  bos_token=None,
                  eos_token=None,
                  reserved_tokens=["[CLS]", "[SEP]", "[MASK]"],
                  token_to_idx=ptr_bert_vocab.token_to_idx)

    # save vocab
    with open(ptr_vocab_path.with_suffix('.pkl'), mode="wb") as io:
        pickle.dump(vocab, io)
else:
    print('You already have pytorch_model_skt_vocab.json!')

if not ptr_tokenizer_path.exists():
    urlretrieve('https://kobert.blob.core.windows.net/models/kobert/tokenizer/tokenizer_78b3253a26.model',
                filename=ptr_tokenizer_path)
else:
    print('You already have pytorch_model_skt_tokenizer.model')
# korean vocab
split_ko = Stemmer(language='ko')
count_ko = Counter(
    itertools.chain.from_iterable(tr_dataset['ko'].apply(split_ko.extract_stem).tolist()))
list_of_token_ko = sorted([token[0] for token in count_ko.items() if token[1] >= 15])

tmp_vocab = nlp.Vocab(Counter(list_of_token_ko), bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab_ko = Vocab(list_of_token_ko, bos_token=None, eos_token=None)
vocab_ko.embedding = array

with open(data_dir / 'vocab_ko.pkl', mode='wb') as io:
    pickle.dump(vocab_ko, io)

# english vocab
split_en = Stemmer(language='en')
count_en = Counter(
    itertools.chain.from_iterable(tr_dataset['en'].apply(split_en.extract_stem).tolist()))
list_of_token_en = [token[0] for token in count_en.items() if token[1] >= 15]

tmp_vocab = nlp.Vocab(Counter(list_of_token_en))
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.simple')
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()
import json
import pickle

from model.utils import Vocab
from bert.tokenization import BertTokenizer

with open('experiment/config.json') as f:
    params = json.loads(f.read())

# loading BertTokenizer
ptr_tokenizer = BertTokenizer.from_pretrained('bert/vocab.korean.rawtext.list', do_lower_case=False)
idx_to_token = list(ptr_tokenizer.vocab.keys())

# generate vocab
token_vocab = Vocab(idx_to_token, padding_token='[PAD]', unknown_token='[UNK]',
                    bos_token=None, eos_token=None,
                    reserved_tokens=['[CLS]', '[SEP]', '[MASK]'],
                    unknown_token_idx=1)
label_vocab = Vocab(['<split>', '<non_split>'], unknown_token=None, bos_token=None, eos_token=None)

# save vocab
token_vocab_path = params['filepath'].get('token_vocab')
label_vocab_path = params['filepath'].get('label_vocab')

with open(token_vocab_path, 'wb') as f:
    pickle.dump(token_vocab, f)
with open(label_vocab_path, 'wb') as f:
    pickle.dump(label_vocab, f)
# [n_conversations, conversation_length (various)]
conversation_length = [
    min(len(conversation), max_conv_len) for conversation in conversations
]

sentences, sentence_length = preprocess_utils.pad_sentences(
    conversations,
    max_sentence_length=max_sent_len,
    max_conversation_length=max_conv_len)

print('Saving preprocessed data at', split_data_dir)
to_pickle(conversation_length, split_data_dir.joinpath('conversation_length.pkl'))
to_pickle(conversations, split_data_dir.joinpath('sentences.pkl'))
to_pickle(sentence_length, split_data_dir.joinpath('sentence_length.pkl'))

if split_type == 'train':
    print('Save Vocabulary...')
    vocab = Vocab(tokenizer)
    vocab.add_dataframe(conversations)
    vocab.update(max_size=max_vocab_size, min_freq=min_freq)

    print('Vocabulary size: ', len(vocab))
    vocab.pickle(ubuntu_dir.joinpath('word2id.pkl'), ubuntu_dir.joinpath('id2word.pkl'))

print('Done!')
else: print("Already you have {}".format(config_filename)) print("Saving the config of {} is done.".format(args.type)) # saving vocab of pretraining model ptr_tokenizer = BertTokenizer.from_pretrained( args.type, do_lower_case="uncased" in args.type ) idx_to_token = list(ptr_tokenizer.vocab.keys()) token_to_idx = {token: idx for idx, token in enumerate(idx_to_token)} vocab = Vocab( list_of_tokens=idx_to_token, unknown_token="[UNK]", padding_token="[PAD]", bos_token=None, eos_token=None, reserved_tokens=["[CLS]", "[SEP]", "[MASK]"], token_to_idx=token_to_idx ) vocab_filename = "{}-vocab.pkl".format(args.type) vocab_filepath = ptr_dir / vocab_filename if not vocab_filepath.exists(): with open(vocab_filepath, mode="wb") as io: pickle.dump(vocab, io) else: print("Already you have {}".format(vocab_filename)) print("Saving the vocab of {} is done".format(args.type))
def main():
    """
    Here is the plan: for each dialogue, create a history sequence of sentences separated
    by <s>. The sentences in the history must occur in a short time span from one another
    so they are relevant. The last sentence becomes the response, where the response must
    also be in the span.
    :return:
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-dataset_dir", default="./datasets/personachat/raw", type=str,
                        required=False,
                        help="The input data dir. Should contain the xml for the task.")
    parser.add_argument("-output_dir", default="./datasets/personachat/", type=str,
                        required=False, help="The output data dir.")
    parser.add_argument("-type", default="none_original", type=str, required=False,
                        help="The genres you would like to use.")
    parser.add_argument("-max_sentence_tokens", default=30, type=int,
                        help="the maximum amount of sentence tokens")
    parser.add_argument("-a_nice_note", default="only dialogues 1-10", type=str,
                        required=False,
                        help="leave a nice lil note for yourself in the future")
    parser.add_argument('-train_split', default=0.9, type=float,
                        help='fraction of dataset to use for training, remainder is halved for val & test')
    parser.add_argument('-vocab_size', default=20000, type=int,
                        help='maximum size of the vocabulary for training')
    args = parser.parse_args()

    filename = os.path.join(args.dataset_dir, "train_{}.txt".format(args.type))
    conversations = create_dialogues(filename, args.max_sentence_tokens)

    for conversation in conversations:
        for utterance in conversation:
            if len(utterance) != args.max_sentence_tokens:
                print('Length of utterance not equal max: %s' % len(utterance))
                exit()
    print(conversations[0])

    # shuffle dataset
    random.seed('seed')
    random.shuffle(conversations)

    print('Number of conversations: %s' % len(conversations))
    mean_n_convos = sum([len(conv) for conv in conversations]) / len(conversations)
    print('Average utterances per conversation: %s' % mean_n_convos)

    # this is the format needed to train dialogue models on this domain
    def format_for_dialogue(conversations):
        conversation_length = [len(conv) for conv in conversations]
        sentence_length = [[sum([1 for token in sent if token != '<pad>']) for sent in conv]
                           for conv in conversations]
        sentences = conversations
        return conversation_length, sentence_length, sentences

    val_idx = int(len(conversations) * args.train_split)
    test_idx = (len(conversations) + val_idx) // 2
    print(val_idx)

    train_convos = conversations[:val_idx]
    val_convos = conversations[val_idx:test_idx]
    test_convos = conversations[test_idx:]

    # construct vocab
    vocab = Vocab()
    vocab.add_dataframe(train_convos, tokenized=True)
    vocab.update(args.vocab_size)
    print('Vocab size: %s' % len(vocab))

    word2id_path = os.path.join(args.output_dir, 'word2id.pkl')
    id2word_path = os.path.join(args.output_dir, 'id2word.pkl')
    vocab.pickle(word2id_path, id2word_path)

    print('Split: train %s, val %s, test %s' %
          (len(train_convos), len(val_convos), len(test_convos)))

    os.makedirs(args.output_dir, exist_ok=True)

    train_convo_len, train_sent_len, train_sent = format_for_dialogue(train_convos)
    print('Example data')
    print(train_convo_len[0])
    print(train_sent_len[0])
    print(train_sent[0])
    print()

    os.makedirs(os.path.join(args.output_dir, 'train'), exist_ok=True)
    pickle.dump(train_convo_len,
                open(os.path.join(args.output_dir, 'train', 'conversation_length.pkl'), 'wb'))
    pickle.dump(train_sent_len,
                open(os.path.join(args.output_dir, 'train', 'sentence_length.pkl'), 'wb'))
    pickle.dump(train_sent,
                open(os.path.join(args.output_dir, 'train', 'sentences.pkl'), 'wb'))

    val_convo_len, val_sent_len, val_sent = format_for_dialogue(val_convos)
    os.makedirs(os.path.join(args.output_dir, 'valid'), exist_ok=True)
    pickle.dump(val_convo_len,
                open(os.path.join(args.output_dir, 'valid', 'conversation_length.pkl'), 'wb'))
    pickle.dump(val_sent_len,
                open(os.path.join(args.output_dir, 'valid', 'sentence_length.pkl'), 'wb'))
    pickle.dump(val_sent,
                open(os.path.join(args.output_dir, 'valid', 'sentences.pkl'), 'wb'))

    test_convo_len, test_sent_len, test_sent = format_for_dialogue(test_convos)
    os.makedirs(os.path.join(args.output_dir, 'test'), exist_ok=True)
    pickle.dump(test_convo_len,
                open(os.path.join(args.output_dir, 'test', 'conversation_length.pkl'), 'wb'))
    pickle.dump(test_sent_len,
                open(os.path.join(args.output_dir, 'test', 'sentence_length.pkl'), 'wb'))
    pickle.dump(test_sent,
                open(os.path.join(args.output_dir, 'test', 'sentences.pkl'), 'wb'))
config = Config("conf/dataset/sample.json") tr = pd.read_csv(config.train, sep='\t') # korean vocab split_ko = Stemmer(language='ko') count_ko = Counter( itertools.chain.from_iterable(tr['ko'].apply( split_ko.extract_stem).tolist())) tmp_vocab = nlp.Vocab(count_ko, bos_token=None, eos_token=None) ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko', load_ngrams=True) tmp_vocab.set_embedding(ptr_embedding) array = tmp_vocab.embedding.idx_to_vec.asnumpy() vocab_ko = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None) vocab_ko.embedding = array vocab_ko_filepath = sample_dir / "vocab_ko.pkl" config.update({"source_vocab": str(vocab_ko_filepath)}) with open(vocab_ko_filepath, mode='wb') as io: pickle.dump(vocab_ko, io) # english vocab split_en = Stemmer(language='en') count_en = Counter( itertools.chain.from_iterable(tr['en'].apply( split_en.extract_stem).tolist())) tmp_vocab = nlp.Vocab(count_en) ptr_embedding = nlp.embedding.create('fasttext', source='wiki.simple',
train = pd.read_csv(config.train, sep="\t")
list_of_tokens_qa = train["question1"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens_qb = train["question2"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens = list_of_tokens_qa + list_of_tokens_qb
count_tokens = Counter(itertools.chain.from_iterable(list_of_tokens))

tmp_vocab = nlp.Vocab(counter=count_tokens, bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko", load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)

morph_vocab = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
morph_vocab.embedding = tmp_vocab.embedding.idx_to_vec.asnumpy()

with open(qpair_dir / "morph_vocab.pkl", mode="wb") as io:
    pickle.dump(morph_vocab, io)
config.update({"coarse_vocab": str(qpair_dir / "morph_vocab.pkl")})

# jamo
chosung_list = [
    'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ',
    'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
]
jungsung_list = [
    'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ',
import pickle
from pathlib import Path

from model.utils import Vocab
from utils import Config

LIST_OF_CHOSUNG = [
    "ㄱ", "ㄲ", "ㄴ", "ㄷ", "ㄸ", "ㄹ", "ㅁ", "ㅂ", "ㅃ", "ㅅ", "ㅆ", "ㅇ", "ㅈ", "ㅉ",
    "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ"
]
LIST_OF_JUNGSUNG = [
    "ㅏ", "ㅐ", "ㅑ", "ㅒ", "ㅓ", "ㅔ", "ㅕ", "ㅖ", "ㅗ", "ㅘ", "ㅙ", "ㅚ", "ㅛ", "ㅜ",
    "ㅝ", "ㅞ", "ㅟ", "ㅠ", "ㅡ", "ㅢ", "ㅣ"
]
LIST_OF_JONGSUNG = [
    " ", "ㄱ", "ㄲ", "ㄳ", "ㄴ", "ㄵ", "ㄶ", "ㄷ", "ㄹ", "ㄺ", "ㄻ", "ㄼ", "ㄽ", "ㄾ",
    "ㄿ", "ㅀ", "ㅁ", "ㅂ", "ㅄ", "ㅅ", "ㅆ", "ㅇ", "ㅈ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ"
]
LIST_OF_JAMOS = sorted(set(LIST_OF_CHOSUNG + LIST_OF_JUNGSUNG + LIST_OF_JONGSUNG))

vocab = Vocab(list_of_tokens=LIST_OF_JAMOS, bos_token=None, eos_token=None)

nsmc_dir = Path("nsmc")
with open(nsmc_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)

config = Config("conf/dataset/nsmc.json")
config.update({"vocab": str(nsmc_dir / "vocab.pkl")})
config.save("conf/dataset/nsmc.json")
def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoint', type=str, default=None)
    parser.add_argument('--mode', type=str, default='test')  # or valid
    kwargs = parser.parse_args()

    config = get_config_from_dir(kwargs.checkpoint, mode=kwargs.mode)
    print(config)

    print('Loading Vocabulary...')
    vocab = Vocab()
    vocab.load(config.word2id_path, config.id2word_path)
    print(f'Vocabulary size: {vocab.vocab_size}')
    config.vocab_size = vocab.vocab_size

    emotion_sentences = None
    if config.emotion:
        emotion_sentences = load_pickle(config.emojis_path)

    # Load infersent embeddings if necessary
    infersent_sentences = None
    if config.infersent:
        print('Loading infersent sentence embeddings...')
        infersent_sentences = load_pickle(config.infersent_path)
        embedding_size = infersent_sentences[0][0].shape[0]
                    default=0.9, help='momentum for sgd')
parser.add_argument('--clip_grad', type=float, default=5.0, help='clip grad at')
parser.add_argument('--lambda0', type=float, default=1, help='lambda0')
parser.add_argument('--patience', type=int, default=15, help='patience for early stop')
args = parser.parse_args()

# actions the parser can take
acts = ['SHIFT', 'REDUCE_L', 'REDUCE_R']
vocab_acts = Vocab.from_list(acts)

SHIFT = vocab_acts.w2i['SHIFT']
REDUCE_L = vocab_acts.w2i['REDUCE_L']
REDUCE_R = vocab_acts.w2i['REDUCE_R']
NUM_ACTIONS = vocab_acts.size()

vocab_words = Vocab.from_file(args.vocab_file_path)

training_set = list(read_oracle(args.train_file_path, vocab_words, vocab_acts))
validation_set = list(read_oracle(args.dev_file_path, vocab_words, vocab_acts))

# # CUDA for PyTorch
# use_cuda = T.cuda.is_available()
# device = T.device("cuda:0" if use_cuda else "cpu")