    @classmethod
    def load_from_params(cls):
        """
        Creates a chatbot from params.py. Good for development, not for production.

        :return: Chatbot initialized from params.py.
        """
        import params

        # load data
        questions, answers = load_data(params.data_file_directory,
                                       params.files, params.encoding)
        bigramer = Bigramer(params.bigramer)

        # prepare data manipulators
        VOCAB_SIZE = params.vocab_size
        tokenizer = create_tokenizer(questions + answers, VOCAB_SIZE,
                                     params.unknown_token)
        tokenized_questions, tokenized_answers = tokenize_q_a(
            tokenizer, questions, answers)

        # prepare data
        prepared_data = prepare_data(tokenized_questions, tokenized_answers)
        max_len_questions, max_len_answers, *_ = prepared_data

        # mle_model
        reversed_tokenizer_word_dict = {
            index: word
            for (word, index) in tokenizer.word_index.items()
        }
        mle_model = utils.fit_mle_model(tokenized_answers,
                                        reversed_tokenizer_word_dict)

        # load model
        model_data = utils.load_keras_model(params.model)
        (_, encoder_inputs, encoder_states, decoder_inputs,
         decoder_embedding, decoder_lstm, decoder_dense) = model_data

        return cls(params.model, tokenizer, mle_model, bigramer,
                   max_len_questions, max_len_answers, params.strategy)
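    # Illustrative usage (a sketch, not from the original file; "Chatbot" as
    # the enclosing class name is an assumption):
    #
    #     bot = Chatbot.load_from_params()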
            if tokens[i] == 'UNK':
                if starting_unknown is not None and i == 0:
                    tokens[i] = starting_unknown
                elif tokens[i - 1] != 'UNK':
                    tokens[i] = self.give_word(tokens[i - 1])
        return tokens

    def strip_unks(self, tokens):
        return [token for token in tokens if token != 'UNK']


if __name__ == '__main__':
    questions, answers = load_data(params.data_file_directory, params.files,
                                   params.encoding)
    tokenizer = create_tokenizer(questions + answers, None, None)
    tokenized_questions, tokenized_answers = tokenize_q_a(
        tokenizer, questions, answers)
    reversed_tokenizer_word_dict = {
        index: text
        for text, index in tokenizer.word_index.items()
    }

    bigrams_frequency = dict()
    flatten = lambda l: [item for sublist in l for item in sublist]
    tokenized_text = [[
        reversed_tokenizer_word_dict[index] for index in sentence
    ] for sentence in tokenized_questions + tokenized_answers]
    bigrams = flatten([list(nltk.bigrams(t)) for t in tokenized_text])
    for (first, second) in bigrams:
        if first in bigrams_frequency:
            if second in bigrams_frequency[first]:
                bigrams_frequency[first][second] += 1
            else:
                bigrams_frequency[first][second] = 1
        else:
            bigrams_frequency[first] = {second: 1}
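    # Illustrative note (a sketch, not from the original file): nltk.bigrams
    # yields consecutive token pairs, e.g.
    #     list(nltk.bigrams(['how', 'are', 'you']))
    #     -> [('how', 'are'), ('are', 'you')]
    # so bigrams_frequency ends up shaped like {'how': {'are': 1}, ...},
    # presumably the table give_word() samples from when 'UNK' tokens are
    # replaced.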
def analyze_checkpoints():
    questions, answers = load_data(params.data_file_directory, params.files,
                                   None)
    VOCAB_SIZE = 15001
    tokenizer = create_tokenizer(questions + answers, VOCAB_SIZE, 'UNK')
    # tokenizer = utils.load_and_unpickle("test_models/tokenizer")
    tokenized_questions, tokenized_answers = tokenize_q_a(
        tokenizer, questions, answers)
    reversed_tokenizer_word_dict = {
        index: text
        for text, index in tokenizer.word_index.items()
    }
    mle_model = utils.fit_mle_model(tokenized_answers,
                                    reversed_tokenizer_word_dict)
    max_len_questions, max_len_answers, encoder_input_data, \
        decoder_input_data, decoder_output_data = \
        prepare_data(tokenized_questions, tokenized_answers)

    checkpoints = [
        params.dir_name + file for file in os.listdir(params.dir_name)
        if file.endswith("hdf5")
    ]
    print(f"{len(checkpoints)} checkpoints")
    results = defaultdict(list)
    model_score = []

    # model evaluation section
    questions, answers = load_data(params.data_file_directory,
                                   params.test_files)
    enc_in_data, dec_in_data, dec_out_data = generate_test_values(
        questions[:1000], answers[:1000], tokenizer)

    # answer generation and perplexity section
    texts = questions[:5]

    for checkpoint in checkpoints:
        net_model, encoder_inputs, encoder_states, decoder_inputs, \
            decoder_embedding, decoder_lstm, decoder_dense = \
            utils.load_keras_model(checkpoint)
        enc_model, dec_model = conversation.make_inference_models(
            encoder_inputs, encoder_states, decoder_inputs,
            decoder_embedding, decoder_lstm, decoder_dense)

        score = net_model.evaluate([enc_in_data, dec_in_data], dec_out_data)
        model_score.append(score)
        print(score)

        for text in texts:
            print(text)
            states_values = enc_model.predict(
                conversation.str_to_tokens(tokenizer, text,
                                           max_len_questions))
            empty_target_seq = np.zeros((1, 1))
            empty_target_seq[0, 0] = tokenizer.word_index['start']
            end_index = tokenizer.word_index['end']

            predictions, _ = beam_search(states_values, empty_target_seq,
                                         dec_model, end_index)
            decoded_texts = []
            for prediction in predictions:
                decoded_text = ['start']
                for word_index in prediction[1:]:
                    decoded_text.append(
                        reversed_tokenizer_word_dict.get(word_index, 'UNK'))
                decoded_texts.append(decoded_text)

            result = choose_best_fit(decoded_texts, mle_model)
            results[text].append(result)

    utils.pickle_and_save(results, params.perplexity_file)
    utils.pickle_and_save(model_score, params.model_summary_file)
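# Scoring sketch (an assumption, not code from this repo: it presumes
# utils.fit_mle_model returns an nltk.lm language model and that
# choose_best_fit ranks beam candidates by perplexity). Under those
# assumptions, picking the best candidate could look like:
#
#     from nltk.util import bigrams
#     scores = [mle_model.perplexity(bigrams(t)) for t in decoded_texts]
#     best = decoded_texts[scores.index(min(scores))]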
def generate_test_values(questions, answers, tokenizer):
    tokenized_questions, tokenized_answers = tokenize_q_a(
        tokenizer, questions, answers)
    prepared_data = prepare_data(tokenized_questions, tokenized_answers)
    (max_len_questions, max_len_answers, encoder_input_data,
     decoder_input_data, decoder_output_data) = prepared_data
    return encoder_input_data, decoder_input_data, decoder_output_data
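# Example call, mirroring the usage in analyze_checkpoints above (the
# [:1000] slice there just caps evaluation cost):
#
#     enc_in_data, dec_in_data, dec_out_data = generate_test_values(
#         questions[:1000], answers[:1000], tokenizer)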
def test():
    questions, answers = load_data("prepare_data/output_files",
                                   "preprocessed_cornell", None)
    VOCAB_SIZE = 15001
    tokenizer = create_tokenizer(questions + answers, VOCAB_SIZE, 'UNK')
    tokenized_questions, tokenized_answers = tokenize_q_a(
        tokenizer, questions, answers)
    reversed_tokenizer_word_dict = {
        index: text
        for text, index in tokenizer.word_index.items()
    }
    mle_model = utils.fit_mle_model(tokenized_answers,
                                    reversed_tokenizer_word_dict)
    max_len_questions, max_len_answers, encoder_input_data, \
        decoder_input_data, decoder_output_data = \
        prepare_data(tokenized_questions, tokenized_answers)

    _, encoder_inputs, encoder_states, decoder_inputs, \
        decoder_embedding, decoder_lstm, decoder_dense = \
        utils.load_keras_model('cornell.hdf5')
    enc_model, dec_model = conversation.make_inference_models(
        encoder_inputs, encoder_states, decoder_inputs, decoder_embedding,
        decoder_lstm, decoder_dense)

    texts = [
        'stop talking shit', 'it is peanut butter jelly time',
        'Are we going to pass this lecture', 'Where are you from',
        'do you like me', 'carrot', 'tell me your biggest secret',
        'How are you', 'do you know me', 'what does fox say', 'i am happy',
        'this is america', 'kill me', 'do not forget to brush your teeth'
    ]

    for text in texts:
        print(text)
        states_values = enc_model.predict(
            conversation.str_to_tokens(tokenizer, text, max_len_questions))
        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = tokenizer.word_index['start']
        end_index = tokenizer.word_index['end']

        predictions, _ = beam_search(states_values, empty_target_seq,
                                     dec_model, end_index)
        decoded_texts = []
        for prediction in predictions:
            decoded_text = ['start']
            for word_index in prediction[1:]:
                decoded_text.append(
                    reversed_tokenizer_word_dict.get(word_index, 'UNK'))
            decoded_texts.append(decoded_text)

        print(utils.choose_best(decoded_texts, mle_model))
        # for prediction in predictions:
        #     decoded_translation = ''
        #     for sampled_word_index in prediction[1:]:
        #         decoded_translation += ' {}'.format(
        #             reversed_tokenizer_word_dict[sampled_word_index])
        #     print(decoded_translation)
        # print(predictions)
        print()
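if __name__ == '__main__':
    # Entry-point sketch (an assumption; the original module may instead run
    # analyze_checkpoints() or expose a CLI).
    test()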