Example #1
    @classmethod
    def load_from_params(cls):
        """
        Creates a chatbot from params.py.
        Good for development, not for production.

        :return: Chatbot initialized from params.py.
        """

        import params
        # load data
        questions, answers = load_data(params.data_file_directory,
                                       params.files, params.encoding)
        bigramer = Bigramer(params.bigramer)

        # prepare data manipulators
        VOCAB_SIZE = params.vocab_size
        tokenizer = create_tokenizer(questions + answers, VOCAB_SIZE,
                                     params.unknown_token)
        tokenized_questions, tokenized_answers = tokenize_q_a(
            tokenizer, questions, answers)

        # prepare data
        prepared_data = prepare_data(tokenized_questions, tokenized_answers)
        max_len_questions, max_len_answers, *_ = prepared_data

        # mle_model
        reversed_tokenizer_word_dict = {
            index: word
            for (word, index) in tokenizer.word_index.items()
        }
        mle_model = utils.fit_mle_model(tokenized_answers,
                                        reversed_tokenizer_word_dict)

        # load model (the unpacked layers are not used below; only the
        # load itself matters to this constructor)
        model_data = utils.load_keras_model(params.model)
        (_, encoder_inputs, encoder_states, decoder_inputs,
         decoder_embedding, decoder_lstm, decoder_dense) = model_data

        return cls(params.model, tokenizer, mle_model, bigramer,
                   max_len_questions, max_len_answers, params.strategy)
Example #2
    def fill_unks(self, tokens, starting_unknown=None):
        # The source excerpt begins mid-loop; this def/for scaffolding is
        # reconstructed, and the method name is an assumption.
        for i in range(len(tokens)):
            if tokens[i] == 'UNK':
                if starting_unknown is not None and i == 0:
                    tokens[i] = starting_unknown
                elif tokens[i - 1] != 'UNK':
                    tokens[i] = self.give_word(tokens[i - 1])
        return tokens

    def strip_unks(self, tokens):
        return [token for token in tokens if token != 'UNK']


if __name__ == '__main__':
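    # nltk, params and the load_data / create_tokenizer / tokenize_q_a
    # helpers come from module-level imports outside this excerpt.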
    questions, answers = load_data(params.data_file_directory, params.files,
                                   params.encoding)
    tokenizer = create_tokenizer(questions + answers, None, None)
    tokenized_questions, tokenized_answers = tokenize_q_a(
        tokenizer, questions, answers)
    reversed_tokenizer_word_dict = {
        index: text
        for text, index in tokenizer.word_index.items()
    }
    bigrams_frequency = {}

    def flatten(nested):
        return [item for sublist in nested for item in sublist]
    tokenized_text = [[
        reversed_tokenizer_word_dict[index] for index in sentence
    ] for sentence in tokenized_questions + tokenized_answers]
    bigrams = flatten([list(nltk.bigrams(t)) for t in tokenized_text])
    for (first, second) in bigrams:
        if first in bigrams_frequency:
            if second in bigrams_frequency[first]:
                bigrams_frequency[first][second] += 1
            else:
                bigrams_frequency[first][second] = 1
        else:
            # (completing the count; the source excerpt cuts off above)
            bigrams_frequency[first] = {second: 1}
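
The loop above builds a nested dict of bigram counts. The same bookkeeping can be reproduced compactly with collections.Counter; a self-contained sketch on a made-up two-sentence corpus:

from collections import Counter, defaultdict

import nltk

sentences = [['i', 'like', 'tea'], ['i', 'like', 'coffee']]

# defaultdict(Counter) replaces the manual "is the key present?" branching.
bigrams_frequency = defaultdict(Counter)
for sentence in sentences:
    for first, second in nltk.bigrams(sentence):
        bigrams_frequency[first][second] += 1

print(dict(bigrams_frequency['like']))  # {'tea': 1, 'coffee': 1}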
Example #3
def analyze_checkpoints():
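    # Relies on module-level imports not shown here: os, numpy as np,
    # collections.defaultdict, params, utils, conversation, and the
    # beam_search / choose_best_fit / data-preparation helpers.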
    questions, answers = load_data(params.data_file_directory, params.files,
                                   None)
    VOCAB_SIZE = 15001

    tokenizer = create_tokenizer(questions + answers, VOCAB_SIZE, 'UNK')
    # tokenizer = utils.load_and_unpickle("test_models/tokenizer")

    tokenized_questions, tokenized_answers = tokenize_q_a(
        tokenizer, questions, answers)

    reversed_tokenizer_word_dict = {
        index: text
        for text, index in tokenizer.word_index.items()
    }
    mle_model = utils.fit_mle_model(tokenized_answers,
                                    reversed_tokenizer_word_dict)

    (max_len_questions, max_len_answers, encoder_input_data,
     decoder_input_data, decoder_output_data) = prepare_data(
         tokenized_questions, tokenized_answers)

    checkpoints = [
        os.path.join(params.dir_name, file)
        for file in os.listdir(params.dir_name) if file.endswith("hdf5")
    ]
    print(f"{len(checkpoints)} checkpoints")

    results = defaultdict(list)
    model_score = []

    # model evaluations section
    questions, answers = load_data(params.data_file_directory,
                                   params.test_files)
    enc_in_data, dec_in_data, dec_out_data = generate_test_values(
        questions[:1000], answers[:1000], tokenizer)

    # generating answer and perplexity section
    texts = questions[:5]

    for checkpoint in checkpoints:
        (net_model, encoder_inputs, encoder_states, decoder_inputs,
         decoder_embedding, decoder_lstm,
         decoder_dense) = utils.load_keras_model(checkpoint)

        enc_model, dec_model = conversation.make_inference_models(
            encoder_inputs, encoder_states, decoder_inputs, decoder_embedding,
            decoder_lstm, decoder_dense)

        score = net_model.evaluate([enc_in_data, dec_in_data], dec_out_data)
        model_score.append(score)
        print(score)
        for text in texts:
            print(text)
            states_values = enc_model.predict(
                conversation.str_to_tokens(tokenizer, text, max_len_questions))
            empty_target_seq = np.zeros((1, 1))
            empty_target_seq[0, 0] = tokenizer.word_index['start']
            end_index = tokenizer.word_index['end']

            predictions, _ = beam_search(states_values, empty_target_seq,
                                         dec_model, end_index)

            decoded_texts = []
            for prediction in predictions:
                decoded_text = ['start']
                for word_index in prediction[1:]:
                    decoded_text.append(
                        reversed_tokenizer_word_dict.get(word_index, 'UNK'))
                decoded_texts.append(decoded_text)
            result = choose_best_fit(decoded_texts, mle_model)
            results[text].append(result)

    utils.pickle_and_save(results, params.perplexity_file)
    utils.pickle_and_save(model_score, params.model_summary_file)
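
utils.fit_mle_model and choose_best_fit are defined elsewhere in the repo; a plausible reading, given the token lists they receive, is that they wrap nltk.lm's MLE model and rank candidate decodings by perplexity. A minimal sketch under that assumption (the corpus is made up):

import nltk
from nltk.lm import MLE
from nltk.lm.preprocessing import pad_both_ends, padded_everygram_pipeline

sentences = [['start', 'hello', 'there', 'end'],
             ['start', 'how', 'are', 'you', 'end']]

# Fit a bigram maximum-likelihood model on the tokenized answers.
n = 2
train, vocab = padded_everygram_pipeline(n, sentences)
model = MLE(n)
model.fit(train, vocab)

# Score a candidate decoding; lower perplexity means a better fit.
candidate = ['start', 'hello', 'there', 'end']
padded = list(pad_both_ends(candidate, n=n))
print(model.perplexity(list(nltk.bigrams(padded))))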
Example #4
def generate_test_values(questions, answers, tokenizer):
    tokenized_questions, tokenized_answers = tokenize_q_a(
        tokenizer, questions, answers)
    prepared_data = prepare_data(tokenized_questions, tokenized_answers)
    # Only the padded arrays are needed here; the max lengths are unused.
    (_, _, encoder_input_data, decoder_input_data,
     decoder_output_data) = prepared_data
    return encoder_input_data, decoder_input_data, decoder_output_data


def test():
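    # Same module-level imports assumed as in the previous examples.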
    questions, answers = load_data("prepare_data/output_files",
                                   "preprocessed_cornell", None)
    VOCAB_SIZE = 15001

    tokenizer = create_tokenizer(questions + answers, VOCAB_SIZE, 'UNK')
    tokenized_questions, tokenized_answers = tokenize_q_a(
        tokenizer, questions, answers)

    reversed_tokenizer_word_dict = {
        index: text
        for text, index in tokenizer.word_index.items()
    }
    mle_model = utils.fit_mle_model(tokenized_answers,
                                    reversed_tokenizer_word_dict)

    (max_len_questions, max_len_answers, encoder_input_data,
     decoder_input_data, decoder_output_data) = prepare_data(
         tokenized_questions, tokenized_answers)

    (_, encoder_inputs, encoder_states, decoder_inputs,
     decoder_embedding, decoder_lstm,
     decoder_dense) = utils.load_keras_model('cornell.hdf5')

    enc_model, dec_model = conversation.make_inference_models(
        encoder_inputs, encoder_states, decoder_inputs, decoder_embedding,
        decoder_lstm, decoder_dense)

    texts = [
        'stop talking shit', 'it is peanut butter jelly time',
        'Are we going to pass this lecture', 'Where are you from',
        'do you like me', 'carrot', 'tell me your biggest secret',
        'How are you', 'do you know me', 'what does fox say', 'i am happy',
        'this is america', 'kill me', 'do not forget to brush your teeth'
    ]
    for text in texts:
        print(text)
        states_values = enc_model.predict(
            conversation.str_to_tokens(tokenizer, text, max_len_questions))
        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = tokenizer.word_index['start']
        end_index = tokenizer.word_index['end']

        predictions, _ = beam_search(states_values, empty_target_seq,
                                     dec_model, end_index)

        decoded_texts = []
        for prediction in predictions:
            decoded_text = ['start']
            for word_index in prediction[1:]:
                decoded_text.append(
                    reversed_tokenizer_word_dict.get(word_index, 'UNK'))
            decoded_texts.append(decoded_text)
        print(utils.choose_best(decoded_texts, mle_model))

        # for prediction in predictions:
        #     decoded_translation = ''
        #     for sampled_word_index in prediction[1:]:
        #         decoded_translation += ' {}'.format(reversed_tokenizer_word_dict[sampled_word_index])
        #     print(decoded_translation)

        # print(predictions)
        print()
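
conversation.str_to_tokens is not shown in these examples; given how it feeds enc_model.predict together with max_len_questions, it presumably maps one string to token ids and pads it to the encoder's input width. A hedged sketch of that assumption (the padding side is a guess):

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


def str_to_tokens(tokenizer, text, max_len):
    # Map one sentence to token ids and pad to the encoder length.
    seq = tokenizer.texts_to_sequences([text.lower()])
    return pad_sequences(seq, maxlen=max_len, padding='post')


tokenizer = Tokenizer(oov_token='UNK')
tokenizer.fit_on_texts(['start how are you end'])
print(str_to_tokens(tokenizer, 'How are you', 10))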