import torch
from torch.utils.data import Dataset


class RickAndMortyData(Dataset):
    """Sliding-window dataset: each sample is seq_length token ids plus the id of the token that follows."""

    def __init__(self, text, seq_length, vocab=None):
        self.text = text
        self.seq_length = seq_length
        if vocab is None:
            # Build a vocabulary from the raw text if none was supplied.
            self.vocab = Vocabulary()
            self.vocab.add_text(self.text)
        else:
            self.vocab = vocab
        self.text = self.vocab.clean_text(text)
        self.tokens = self.vocab.tokenize(self.text)

    def __len__(self):
        return len(self.tokens) - self.seq_length

    def __getitem__(self, idx):
        # x: a window of seq_length token ids; y: the next token as the prediction target.
        x = [self.vocab[word] for word in self.tokens[idx:idx + self.seq_length]]
        y = [self.vocab[self.tokens[idx + self.seq_length]]]
        x = torch.LongTensor(x)
        y = torch.LongTensor(y)
        return x, y
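# Usage sketch (illustrative, not from the source): feeding RickAndMortyData to a
# DataLoader for next-token training. The corpus filename and the hyperparameter
# values below are assumptions; Vocabulary is the class referenced above.
from torch.utils.data import DataLoader

with open("rick_and_morty.txt", "r") as f:
    corpus = f.read()

dataset = RickAndMortyData(corpus, seq_length=32)
loader = DataLoader(dataset, batch_size=64, shuffle=True)
for x, y in loader:
    # x: (batch, seq_length) LongTensor of token ids; y: (batch, 1) next-token targets
    break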
def chat(self, question, chat_settings):
    """Chat with the chatbot model by predicting an answer to a question.

    'question' and 'answer' in this context are generic terms for the turns of a dialog exchange.
    For example:
        Question: "How are you?"     Answer: "Fine."
        Question: "That's great."    Answer: "Yeah."

    Args:
        question: The input question for which the model should predict an answer.

        chat_settings: The ChatSettings instance containing the chat settings and inference hyperparameters.

    Returns:
        q_with_hist: question with history if chat_settings.show_question_context = True, otherwise None.

        answers: array of answer beams if chat_settings.show_all_beams = True, otherwise the single selected answer.
    """
    #Process the question by cleaning it and converting it to an integer encoded vector
    question = Vocabulary.clean_text(question)
    question = self.input_vocabulary.words2ints(question)

    #Prepend the currently tracked steps of the conversation history separated by EOS tokens.
    #This allows for deeper dialog context to influence the answer prediction.
    question_with_history = []
    for i in range(len(self.conversation_history)):
        question_with_history += self.conversation_history[i] + [self.input_vocabulary.eos_int()]
    question_with_history += question

    #Get the answer prediction
    batch = np.zeros((1, len(question_with_history)))
    batch[0] = question_with_history
    max_output_sequence_length = chat_settings.inference_hparams.max_answer_words + 1 #+1 since the EOS token is counted as a timestep
    predicted_answer_info = self.predict_batch(
        inputs=batch,
        input_sequence_length=np.array([len(question_with_history)]),
        max_output_sequence_length=max_output_sequence_length,
        beam_length_penalty_weight=chat_settings.inference_hparams.beam_length_penalty_weight,
        sampling_temperature=chat_settings.inference_hparams.sampling_temperature,
        log_summary=chat_settings.inference_hparams.log_summary)

    #Read the answer prediction
    answer_beams = []
    if self.beam_width > 0:
        #For beam search decoding: if show_all_beams is enabled then output all beams (sequences), otherwise take the first beam.
        #The beams (in the "predictions" matrix) are ordered with the highest ranked beams first.
        beam_count = 1 if not chat_settings.show_all_beams else len(predicted_answer_info["predictions_seq_lengths"][0])
        for i in range(beam_count):
            predicted_answer_seq_length = predicted_answer_info["predictions_seq_lengths"][0][i] - 1 #-1 to exclude the EOS token
            predicted_answer = predicted_answer_info["predictions"][0][:predicted_answer_seq_length, i].tolist()
            answer_beams.append(predicted_answer)
    else:
        #For greedy / sampling decoding: only one beam (sequence) is returned, based on the argmax for greedy decoding
        #or the sampling distribution for sampling decoding. Return this beam.
        beam_count = 1
        predicted_answer_seq_length = predicted_answer_info["predictions_seq_lengths"][0] - 1 #-1 to exclude the EOS token
        predicted_answer = predicted_answer_info["predictions"][0][:predicted_answer_seq_length].tolist()
        answer_beams.append(predicted_answer)

    #Add new conversation steps to the end of the history and trim from the beginning if it is longer than conv_history_length
    self.conversation_history.append(question)
    self.conversation_history.append(answer_beams[0])
    self.trim_conversation_history(chat_settings.inference_hparams.conv_history_length)

    #Convert the answer(s) to text and return
    answers = []
    for i in range(beam_count):
        answer = self.output_vocabulary.ints2words(answer_beams[i])
        answers.append(answer)

    q_with_hist = None if not chat_settings.show_question_context else self.output_vocabulary.ints2words(question_with_history)
    if chat_settings.show_all_beams:
        return q_with_hist, answers
    else:
        return q_with_hist, answers[0]
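# Hypothetical REPL loop around chat() above; `model` stands in for a restored
# chatbot instance and `chat_settings` for a configured ChatSettings object
# (both assumed, not defined in the source).
while True:
    question = input("You: ")
    if question.strip().lower() == "exit":
        break
    q_with_hist, answer = model.chat(question, chat_settings)
    if q_with_hist is not None:
        print("Context: {0}".format(q_with_hist))
    print("Bot: {0}".format(answer))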
def read_dataset(self, dataset_dir, model_dir, training_hparams, share_vocab=True):
    """Read and return a chatbot dataset based on the specified dataset directory.

    Args:
        dataset_dir: directory to load the raw dataset file(s) from

        model_dir: directory to save the vocabulary to

        training_hparams: training parameters which determine how the dataset will be read.
            See hparams.py for in-depth comments.

        share_vocab: True to generate a single vocabulary file from the question and answer words.
            False to generate separate input and output vocabulary files, from the question and answer
            words respectively. (If training_hparams.conv_history_length > 0, share_vocab must be True
            since previous answers will be appended to the questions.)
    """
    if not share_vocab and training_hparams.conv_history_length > 0:
        raise ValueError("If training_hparams.conv_history_length > 0, share_vocab must be True since previous answers will be appended to the questions.")

    if share_vocab and training_hparams.input_vocab_threshold != training_hparams.output_vocab_threshold:
        raise ValueError("Cannot share vocabulary when the input and output vocab thresholds are different.")

    #Get dialog line and conversation collections
    id2line, conversations_ids = self._get_dialog_lines_and_conversations(dataset_dir)

    #Clean dialog lines
    for line_id in id2line:
        id2line[line_id] = Vocabulary.clean_text(id2line[line_id], training_hparams.max_question_answer_words)

    #Get the questions and the answers separately, pairing each line with the next line in its conversation
    questions_for_count = []
    questions = []
    answers = []
    for conversation in conversations_ids[:training_hparams.max_conversations]:
        for i in range(len(conversation) - 1):
            #Prefix the question with up to conv_history_length previous lines, separated by EOS tokens
            conv_up_to_question = ''
            for j in range(max(0, i - training_hparams.conv_history_length), i):
                conv_up_to_question += id2line[conversation[j]] + " {0} ".format(Vocabulary.EOS)
            question = id2line[conversation[i]]
            question_with_history = conv_up_to_question + question
            answer = id2line[conversation[i + 1]]
            if training_hparams.min_question_words <= len(question_with_history.split()):
                questions.append(conv_up_to_question + question)
                questions_for_count.append(question)
                answers.append(answer)

    #Create the vocabulary object & add the question & answer words
    if share_vocab:
        questions_and_answers = []
        for i in range(len(questions_for_count)):
            question = questions_for_count[i]
            answer = answers[i]
            #Skip a question that duplicates the previous answer so its words are not counted twice
            if i == 0 or question != answers[i - 1]:
                questions_and_answers.append(question)
            questions_and_answers.append(answer)
        input_vocabulary = self._create_and_save_vocab(questions_and_answers, training_hparams.input_vocab_threshold, model_dir, Vocabulary.SHARED_VOCAB_FILENAME)
        output_vocabulary = input_vocabulary
    else:
        input_vocabulary = self._create_and_save_vocab(questions_for_count, training_hparams.input_vocab_threshold, model_dir, Vocabulary.INPUT_VOCAB_FILENAME)
        output_vocabulary = self._create_and_save_vocab(answers, training_hparams.output_vocab_threshold, model_dir, Vocabulary.OUTPUT_VOCAB_FILENAME)

    #Add the End Of String token to the end of every answer
    for i in range(len(answers)):
        answers[i] += " {0}".format(Vocabulary.EOS)

    #Create the Dataset object from the questions / answers lists and the vocab objects.
    dataset = Dataset(questions, answers, input_vocabulary, output_vocabulary)
    return dataset
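# Illustrative sketch (not from the source) of the history-pairing loop above: each
# question is prefixed with up to conv_history_length earlier lines, joined by EOS.
conversation = ["hi", "hello", "how are you", "fine"]
conv_history_length = 2
EOS = "<EOS>"  # stand-in for Vocabulary.EOS
pairs = []
for i in range(len(conversation) - 1):
    history = ""
    for j in range(max(0, i - conv_history_length), i):
        history += conversation[j] + " {0} ".format(EOS)
    pairs.append((history + conversation[i], conversation[i + 1]))
assert pairs[2] == ("hi <EOS> hello <EOS> how are you", "fine")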
def chat(self, question, chat_settings):
    """Chat with the seq2seq model.

    :param question: input question for which the model should predict an answer
    :param chat_settings: chat settings
    :return: answer
    """
    # Process the question by cleaning it and converting it to an integer encoded vector
    if chat_settings.enable_auto_punctuation:
        question = Vocabulary.auto_punctuate(question)
    question = Vocabulary.clean_text(question, normalize_words=chat_settings.inference_hparams.normalize_words)
    question = self.vocabulary.words2ints(question)

    # Get the answer prediction
    batch = np.expand_dims(question, 0)
    max_output_sequence_length = chat_settings.inference_hparams.max_answer_words + 1
    predicted_answer_info = self.predict_batch(
        inputs=batch,
        input_batch_lengths=1,
        max_output_sequence_length=max_output_sequence_length,
        beam_length_penalty_weight=chat_settings.inference_hparams.beam_length_penalty_weight)

    # Read the answer prediction
    answer_beams = []
    if self.beam_width > 0:
        # For beam search decoding: if show_all_beams is enabled then output all beams (sequences),
        # otherwise take the first beam.
        # The beams (in the "predictions" matrix) are ordered with the highest ranked beams first.
        beam_count = 1 if not chat_settings.show_all_beams else len(predicted_answer_info["predictions_seq_lengths"][0])
        for i in range(beam_count):
            predicted_answer_seq_length = predicted_answer_info["predictions_seq_lengths"][0][i] - 1  # -1 to exclude the EOS token
            predicted_answer = predicted_answer_info["predictions"][0][:predicted_answer_seq_length, i].tolist()
            answer_beams.append(predicted_answer)
    else:
        # For greedy / sampling decoding: only one beam (sequence) is returned,
        # based on the argmax for greedy decoding or the sampling distribution
        # for sampling decoding. Return this beam.
        beam_count = 1
        predicted_answer_seq_length = predicted_answer_info["predictions_seq_lengths"][0] - 1  # -1 to exclude the EOS token
        predicted_answer = predicted_answer_info["predictions"][0][:predicted_answer_seq_length].tolist()
        answer_beams.append(predicted_answer)

    # Convert the answer(s) to text and return
    answers = []
    for i in range(beam_count):
        answer = self.vocabulary.ints2words(answer_beams[i])
        answers.append(answer)

    if chat_settings.show_all_beams:
        return answers
    else:
        return answers[0]
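# Hedged usage sketch: how the return shape of chat() above depends on
# show_all_beams. `model` and `chat_settings` are assumed to exist.
chat_settings.show_all_beams = False
answer = model.chat("How are you?", chat_settings)  # a single answer string
chat_settings.show_all_beams = True
beams = model.chat("How are you?", chat_settings)   # list of answers, best beam first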
def read_dataset(self, dataset_dir, model_dir, training_hparams, share_vocab=True,
                 encoder_embeddings_dir=None, decoder_embeddings_dir=None):
    """Read and return a chatbot dataset based on the specified dataset directory.

    Args:
        dataset_dir: directory to load the raw dataset file(s) from

        model_dir: directory to save the vocabulary to

        training_hparams: training parameters which determine how the dataset will be read.
            See hparams.py for in-depth comments.

        share_vocab: True to generate a single vocabulary file from the question and answer words.
            False to generate separate input and output vocabulary files, from the question and answer
            words respectively. (If training_hparams.conv_history_length > 0, share_vocab should be set
            to True since previous answers will be appended to the questions. Otherwise many of these
            previous answer words could map to <OUT> when looked up against the input vocabulary,
            unless the output vocabulary is a subset of the input vocabulary.)

        encoder_embeddings_dir: Path to directory containing external embeddings to import for the encoder.
            If this is specified, the input vocabulary will be loaded from this source and optionally joined
            with the generated dataset vocabulary (see training_hparams.input_vocab_import_mode).
            If share_vocab is True, the imported vocabulary is used for both input and output.

        decoder_embeddings_dir: Path to directory containing external embeddings to import for the decoder.
            If this is specified, the output vocabulary will be loaded from this source and optionally joined
            with the generated dataset vocabulary (see training_hparams.output_vocab_import_mode).
            If share_vocab is True, this argument must be None or the same as encoder_embeddings_dir
            (both are equivalent).
    """
    if share_vocab:
        if training_hparams.input_vocab_threshold != training_hparams.output_vocab_threshold and (encoder_embeddings_dir is None or training_hparams.input_vocab_import_mode != VocabularyImportMode.External):
            raise ValueError("Cannot share generated or joined imported vocabulary when the input and output vocab thresholds are different.")
        if encoder_embeddings_dir is not None:
            if training_hparams.input_vocab_import_mode != training_hparams.output_vocab_import_mode:
                raise ValueError("Cannot share imported vocabulary when input and output vocab import modes are different.")
            if training_hparams.input_vocab_import_normalized != training_hparams.output_vocab_import_normalized:
                raise ValueError("Cannot share imported vocabulary when input and output normalization modes are different.")
            if decoder_embeddings_dir is not None and decoder_embeddings_dir != encoder_embeddings_dir:
                raise ValueError("Cannot share imported vocabulary from two different sources or share imported and generated vocabulary.")

    read_stats = DatasetReadStats()

    #Get dialog line and conversation collections
    id2line, conversations_ids = self._get_dialog_lines_and_conversations(dataset_dir)

    #Clean dialog lines
    for line_id in id2line:
        id2line[line_id] = Vocabulary.clean_text(id2line[line_id], training_hparams.max_question_answer_words, training_hparams.normalize_words)

    #Output cleaned lines for debugging purposes
    if training_hparams.log_cleaned_dataset:
        self._log_cleaned_dataset(model_dir, id2line.values())

    #Get the questions and the answers separately, pairing each line with the next line in its conversation
    questions_for_count = []
    questions = []
    answers = []
    for conversation in conversations_ids[:training_hparams.max_conversations]:
        for i in range(len(conversation) - 1):
            conv_up_to_question = ''
            for j in range(max(0, i - training_hparams.conv_history_length), i):
                conv_up_to_question += id2line[conversation[j]] + " {0} ".format(Vocabulary.EOS)
            question = id2line[conversation[i]]
            question_with_history = conv_up_to_question + question
            answer = id2line[conversation[i + 1]]
            if training_hparams.min_question_words <= len(question_with_history.split()):
                questions.append(conv_up_to_question + question)
                questions_for_count.append(question)
                answers.append(answer)

    #Create the vocabulary object & add the question & answer words
    if share_vocab:
        questions_and_answers = []
        for i in range(len(questions_for_count)):
            question = questions_for_count[i]
            answer = answers[i]
            if i == 0 or question != answers[i - 1]:
                questions_and_answers.append(question)
            questions_and_answers.append(answer)
        input_vocabulary, read_stats.input_vocabulary_import_stats = self._create_and_save_vocab(
            questions_and_answers, training_hparams.input_vocab_threshold, model_dir,
            Vocabulary.SHARED_VOCAB_FILENAME, encoder_embeddings_dir,
            training_hparams.input_vocab_import_normalized, training_hparams.input_vocab_import_mode)
        output_vocabulary = input_vocabulary
        read_stats.output_vocabulary_import_stats = read_stats.input_vocabulary_import_stats
    else:
        input_vocabulary, read_stats.input_vocabulary_import_stats = self._create_and_save_vocab(
            questions_for_count, training_hparams.input_vocab_threshold, model_dir,
            Vocabulary.INPUT_VOCAB_FILENAME, encoder_embeddings_dir,
            training_hparams.input_vocab_import_normalized, training_hparams.input_vocab_import_mode)
        output_vocabulary, read_stats.output_vocabulary_import_stats = self._create_and_save_vocab(
            answers, training_hparams.output_vocab_threshold, model_dir,
            Vocabulary.OUTPUT_VOCAB_FILENAME, decoder_embeddings_dir,
            training_hparams.output_vocab_import_normalized, training_hparams.output_vocab_import_mode)

    #Add the End Of String token to the end of every answer
    for i in range(len(answers)):
        answers[i] += " {0}".format(Vocabulary.EOS)

    #Create the Dataset object from the questions / answers lists and the vocab objects.
    dataset = Dataset(questions, answers, input_vocabulary, output_vocabulary)
    return dataset, read_stats
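# Hypothetical call to read_dataset() above with imported encoder embeddings; the
# reader class name, directory paths, and the hparams object are all assumptions,
# not taken from the source.
reader = CornellDatasetReader()
dataset, read_stats = reader.read_dataset(
    dataset_dir="datasets/cornell_movie_dialog",
    model_dir="models/cornell_movie_dialog/trained_model",
    training_hparams=hparams.training_hparams,
    share_vocab=True,
    encoder_embeddings_dir="embeddings/glove",  # shared vocab: decoder dir may be None
    decoder_embeddings_dir=None)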
def read_dataset(self, dataset_dir, model_dir, training_hparams, share_vocab):
    if share_vocab and training_hparams.input_vocab_threshold != training_hparams.output_vocab_threshold:
        raise ValueError("Cannot share vocabulary when the input and output vocab thresholds are different.")

    id2line, conversations_ids = self.get_movie_dialog_conversations(dataset_dir)

    #Clean dialog lines
    for line_id in id2line:
        id2line[line_id] = Vocabulary.clean_text(id2line[line_id], training_hparams.max_question_answer_words)

    #Get the questions and the answers separately, pairing each line with the next line in its conversation
    questions_for_count = []
    questions = []
    answers = []
    for conversation in conversations_ids[:training_hparams.max_conversations]:
        for i in range(len(conversation) - 1):
            conv_up_to_question = ''
            for j in range(max(0, i - training_hparams.conv_history_length), i):
                conv_up_to_question += id2line[conversation[j]] + " {0} ".format(Vocabulary.EOS)
            question = id2line[conversation[i]]
            question_with_history = conv_up_to_question + question
            answer = id2line[conversation[i + 1]]
            if training_hparams.min_question_words <= len(question_with_history.split()):
                questions.append(conv_up_to_question + question)
                questions_for_count.append(question)
                answers.append(answer)

    #Create the vocabulary object & add the question & answer words
    if share_vocab:
        questions_and_answers = []
        for i in range(len(questions_for_count)):
            question = questions_for_count[i]
            answer = answers[i]
            if i == 0 or question != answers[i - 1]:
                questions_and_answers.append(question)
            questions_and_answers.append(answer)
        #(A leftover debug truncation "questions_and_answers = questions_and_answers[:10]" was removed here;
        # it would have limited the shared vocabulary to the first 10 lines.)
        input_vocabulary = self.create_and_save_vocab(questions_and_answers, training_hparams.input_vocab_threshold, model_dir, Vocabulary.SHARED_VOCAB_FILENAME)
        output_vocabulary = input_vocabulary
    else:
        #Different vocabularies for input and output
        input_vocabulary = self.create_and_save_vocab(questions_for_count, training_hparams.input_vocab_threshold, model_dir, Vocabulary.INPUT_VOCAB_FILENAME)
        output_vocabulary = self.create_and_save_vocab(answers, training_hparams.output_vocab_threshold, model_dir, Vocabulary.OUTPUT_VOCAB_FILENAME)

    #Add the End Of String token to the end of every answer
    for i in range(len(answers)):
        answers[i] += " {0}".format(Vocabulary.EOS)

    return questions, answers, input_vocabulary, output_vocabulary
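# Why the "i == 0 or question != answers[i - 1]" check above: within one conversation,
# each answer becomes the next question, so appending both unconditionally would count
# most lines twice when building the shared vocabulary. Toy sketch (not from the source):
questions_for_count = ["a", "b"]  # from the 3-line conversation ["a", "b", "c"]
answers = ["b", "c"]
merged = []
for i in range(len(questions_for_count)):
    if i == 0 or questions_for_count[i] != answers[i - 1]:
        merged.append(questions_for_count[i])
    merged.append(answers[i])
assert merged == ["a", "b", "c"]  # "b" contributes its word counts once, not twice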
def read_dataset(self, dataset_dir, model_dir, training_hparams, share_vocab=True,
                 encoder_embeddings_dir=None, decoder_embeddings_dir=None):
    if share_vocab:
        if training_hparams.input_vocab_threshold != training_hparams.output_vocab_threshold and (encoder_embeddings_dir is None or training_hparams.input_vocab_import_mode != VocabularyImportMode.External):
            raise ValueError("Cannot share generated or joined imported vocabulary when the input and output vocab thresholds are different.")
        if encoder_embeddings_dir is not None:
            if training_hparams.input_vocab_import_mode != training_hparams.output_vocab_import_mode:
                raise ValueError("Cannot share imported vocabulary when input and output vocab import modes are different.")
            if training_hparams.input_vocab_import_normalized != training_hparams.output_vocab_import_normalized:
                raise ValueError("Cannot share imported vocabulary when input and output normalization modes are different.")
            if decoder_embeddings_dir is not None and decoder_embeddings_dir != encoder_embeddings_dir:
                raise ValueError("Cannot share imported vocabulary from two different sources or share import and generated vocabulary.")

    read_stats = DatasetReadStats()

    #Get dialog line and conversation collections
    id2line, conversations_ids = self._get_dialog_lines_and_conversations(dataset_dir)

    #Clean dialog lines
    for line_id in id2line:
        id2line[line_id] = Vocabulary.clean_text(id2line[line_id], training_hparams.max_question_answer_words, training_hparams.normalize_words)

    #Output cleaned lines for debugging purposes
    if training_hparams.log_cleaned_dataset:
        self._log_cleaned_dataset(model_dir, id2line.values())

    #Get the questions and the answers separately, pairing each line with the next line in its conversation
    questions_for_count = []
    questions = []
    answers = []
    for conversation in conversations_ids[:training_hparams.max_conversations]:
        for i in range(len(conversation) - 1):
            conv_up_to_question = ''
            for j in range(max(0, i - training_hparams.conv_history_length), i):
                conv_up_to_question += id2line[conversation[j]] + " {0} ".format(Vocabulary.EOS)
            question = id2line[conversation[i]]
            question_with_history = conv_up_to_question + question
            answer = id2line[conversation[i + 1]]
            if training_hparams.min_question_words <= len(question_with_history.split()):
                questions.append(conv_up_to_question + question)
                questions_for_count.append(question)
                answers.append(answer)

    #Create the vocabulary object & add the question & answer words
    if share_vocab:
        questions_and_answers = []
        for i in range(len(questions_for_count)):
            question = questions_for_count[i]
            answer = answers[i]
            if i == 0 or question != answers[i - 1]:
                questions_and_answers.append(question)
            questions_and_answers.append(answer)
        input_vocabulary, read_stats.input_vocabulary_import_stats = self._create_and_save_vocab(
            questions_and_answers, training_hparams.input_vocab_threshold, model_dir,
            Vocabulary.SHARED_VOCAB_FILENAME, encoder_embeddings_dir,
            training_hparams.input_vocab_import_normalized, training_hparams.input_vocab_import_mode)
        output_vocabulary = input_vocabulary
        read_stats.output_vocabulary_import_stats = read_stats.input_vocabulary_import_stats
    else:
        input_vocabulary, read_stats.input_vocabulary_import_stats = self._create_and_save_vocab(
            questions_for_count, training_hparams.input_vocab_threshold, model_dir,
            Vocabulary.INPUT_VOCAB_FILENAME, encoder_embeddings_dir,
            training_hparams.input_vocab_import_normalized, training_hparams.input_vocab_import_mode)
        output_vocabulary, read_stats.output_vocabulary_import_stats = self._create_and_save_vocab(
            answers, training_hparams.output_vocab_threshold, model_dir,
            Vocabulary.OUTPUT_VOCAB_FILENAME, decoder_embeddings_dir,
            training_hparams.output_vocab_import_normalized, training_hparams.output_vocab_import_mode)

    #Add the End Of String token to the end of every answer
    for i in range(len(answers)):
        answers[i] += " {0}".format(Vocabulary.EOS)

    #Create the Dataset object from the questions / answers lists and the vocab objects.
    dataset = Dataset(questions, answers, input_vocabulary, output_vocabulary)
    return dataset, read_stats
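# The share_vocab guards at the top of read_dataset() reduce to this decision, shown
# as a condensed sketch (function name hypothetical; VocabularyImportMode.External is
# taken from the code above):
def share_vocab_is_valid(hp, encoder_embeddings_dir, decoder_embeddings_dir):
    # Thresholds may differ only when the vocabulary is imported wholesale (External mode).
    purely_external = (encoder_embeddings_dir is not None
                       and hp.input_vocab_import_mode == VocabularyImportMode.External)
    if hp.input_vocab_threshold != hp.output_vocab_threshold and not purely_external:
        return False
    if encoder_embeddings_dir is not None:
        # Imported input/output vocabularies must be built the same way, from the same source.
        if hp.input_vocab_import_mode != hp.output_vocab_import_mode:
            return False
        if hp.input_vocab_import_normalized != hp.output_vocab_import_normalized:
            return False
        if decoder_embeddings_dir not in (None, encoder_embeddings_dir):
            return False
    return True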
def chat(self, question, chat_settings):
    #Process the question by cleaning it and converting it to an integer encoded vector
    if chat_settings.enable_auto_punctuation:
        question = Vocabulary.auto_punctuate(question)
    question = Vocabulary.clean_text(question, normalize_words=chat_settings.inference_hparams.normalize_words)
    question = self.input_vocabulary.words2ints(question)

    #Prepend the currently tracked steps of the conversation history separated by EOS tokens
    question_with_history = []
    for i in range(len(self.conversation_history)):
        question_with_history += self.conversation_history[i] + [self.input_vocabulary.eos_int()]
    question_with_history += question

    #Get the answer prediction
    batch = np.zeros((1, len(question_with_history)))
    batch[0] = question_with_history
    max_output_sequence_length = chat_settings.inference_hparams.max_answer_words + 1 #+1 since the EOS token is counted as a timestep
    predicted_answer_info = self.predict_batch(
        inputs=batch,
        input_sequence_length=np.array([len(question_with_history)]),
        max_output_sequence_length=max_output_sequence_length,
        beam_length_penalty_weight=chat_settings.inference_hparams.beam_length_penalty_weight,
        sampling_temperature=chat_settings.inference_hparams.sampling_temperature,
        log_summary=chat_settings.inference_hparams.log_summary)

    #Read the answer prediction
    answer_beams = []
    if self.beam_width > 0:
        #For beam search decoding: if show_all_beams is enabled then output all beams (sequences), otherwise take the first beam.
        #The beams (in the "predictions" matrix) are ordered with the highest ranked beams first.
        beam_count = 1 if not chat_settings.show_all_beams else len(predicted_answer_info["predictions_seq_lengths"][0])
        for i in range(beam_count):
            predicted_answer_seq_length = predicted_answer_info["predictions_seq_lengths"][0][i] - 1 #-1 to exclude the EOS token
            predicted_answer = predicted_answer_info["predictions"][0][:predicted_answer_seq_length, i].tolist()
            answer_beams.append(predicted_answer)
    else:
        #For greedy / sampling decoding: only one beam (sequence) is returned, based on the argmax for greedy decoding
        #or the sampling distribution for sampling decoding. Return this beam.
        beam_count = 1
        predicted_answer_seq_length = predicted_answer_info["predictions_seq_lengths"][0] - 1 #-1 to exclude the EOS token
        predicted_answer = predicted_answer_info["predictions"][0][:predicted_answer_seq_length].tolist()
        answer_beams.append(predicted_answer)

    #Add new conversation steps to the end of the history and trim from the beginning if it is longer than conv_history_length.
    #Answers need to be converted from output_vocabulary ints to input_vocabulary ints (since they will be fed back in to the encoder).
    self.conversation_history.append(question)
    answer_for_history = self.output_vocabulary.ints2words(answer_beams[0], is_punct_discrete_word=True, capitalize_i=False)
    answer_for_history = self.input_vocabulary.words2ints(answer_for_history)
    self.conversation_history.append(answer_for_history)
    self.trim_conversation_history(chat_settings.inference_hparams.conv_history_length)

    #Convert the answer(s) to text and return
    answers = []
    for i in range(beam_count):
        answer = self.output_vocabulary.ints2words(answer_beams[i])
        answers.append(answer)

    q_with_hist = None if not chat_settings.show_question_context else self.input_vocabulary.ints2words(question_with_history)
    if chat_settings.show_all_beams:
        return q_with_hist, answers
    else:
        return q_with_hist, answers[0]
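# trim_conversation_history() is called above but not shown here. A minimal sketch of
# the behavior the call sites imply (drop the oldest steps until at most `length`
# remain); the actual implementation may differ:
def trim_conversation_history(self, length):
    while len(self.conversation_history) > length:
        self.conversation_history.pop(0)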