示例#1
0
    def __init__(self, session, corpus_dir, knbase_dir, result_dir, result_file):
        """Build the inference graph and restore the trained weights into `session`.

        Args:
            session: The TensorFlow session.
            corpus_dir: Name of the folder storing corpus files and vocab information.
            knbase_dir: Name of the folder storing data files for the knowledge base.
            result_dir: The folder containing the trained result files.
            result_file: The file name of the trained model.
        """
        self.session = session

        # Data pipeline and hyper parameters.
        print("# Prepare dataset placeholder and hyper parameters ...")
        data = TokenizedData(corpus_dir=corpus_dir, training=False)

        knowledge_base = KnowledgeBase()
        knowledge_base.load_knbase(knbase_dir)
        self.knowledge_base = knowledge_base

        self.session_data = SessionData()
        self.hparams = data.hparams

        # Source sentences are fed through a 1-D string placeholder.
        placeholder = tf.placeholder(shape=[None], dtype=tf.string)
        self.src_placeholder = placeholder
        self.infer_batch = data.get_inference_batch(
            tf.data.Dataset.from_tensor_slices(placeholder))

        # Inference model.
        print("# Creating inference model ...")
        self.model = ModelCreator(
            training=False,
            tokenized_data=data,
            batch_input=self.infer_batch,
        )

        # Trained weights.
        print("# Restoring model weights ...")
        checkpoint_path = os.path.join(result_dir, result_file)
        self.model.saver.restore(session, checkpoint_path)

        self.session.run(tf.tables_initializer())
    def __init__(self, session, corpus_dir, knbase_dir, result_dir,
                 result_file):
        """Build the inference graph and restore the trained weights into `session`.

        Args:
            session: The TensorFlow session used for restoring and for running
                the tables initializer.
            corpus_dir: Forwarded to TokenizedData as `corpus_dir`.
            knbase_dir: Forwarded to TokenizedData as `knbase_dir`.
            result_dir: Folder holding the checkpoint to restore.
            result_file: Checkpoint file name inside `result_dir`.
        """
        self.session = session

        # Data pipeline and hyper parameters.
        print("# Prepare dataset placeholder and hyper parameters ...")
        data = TokenizedData(corpus_dir=corpus_dir,
                             knbase_dir=knbase_dir,
                             training=False)
        self.tokenized_data = data
        self.hparams = data.hparams

        # Source sentences are fed through a 1-D string placeholder.
        placeholder = tf.placeholder(shape=[None], dtype=tf.string)
        self.src_placeholder = placeholder
        self.infer_batch = data.get_inference_batch(
            tf.data.Dataset.from_tensor_slices(placeholder))

        # Inference model.
        print("# Creating inference model ...")
        self.model = ModelCreator(training=False,
                                  tokenized_data=data,
                                  batch_input=self.infer_batch)

        # Trained weights.
        print("# Restoring model weights ...")
        checkpoint_path = os.path.join(result_dir, result_file)
        self.model.saver.restore(session, checkpoint_path)

        self.session.run(tf.tables_initializer())
示例#3
0
    def __init__(self,
                 session,
                 corpus_dir,
                 knbase_dir,
                 result_dir,
                 hparams_dir=None):
        """Build the inference graph and restore the latest checkpoint.

        Args:
            session: The TensorFlow session used for restoring and for running
                the tables initializer.
            corpus_dir: Forwarded to TokenizedData as `corpus_dir`.
            knbase_dir: Forwarded to TokenizedData as `knbase_dir`.
            result_dir: Folder searched with tf.train.latest_checkpoint for the
                checkpoint to restore.
            hparams_dir: Optional. When truthy, HParams(hparams_dir).hparams is
                handed to TokenizedData; otherwise None is handed over.

        Raises:
            ValueError: If no checkpoint can be found in `result_dir`.
        """
        self.session = session

        hparams = HParams(hparams_dir).hparams if hparams_dir else None

        # Prepare data and hyper parameters
        print("# Prepare dataset placeholder and hyper parameters ...")
        self.tokenized_data = TokenizedData(corpus_dir=corpus_dir,
                                            hparams=hparams,
                                            knbase_dir=knbase_dir,
                                            training=False)

        self.hparams = self.tokenized_data.hparams
        self.src_placeholder = tf.placeholder(shape=[None], dtype=tf.string)
        # NOTE(review): tf.contrib.data is the pre-1.4 location of the Dataset
        # API; kept as-is because the targeted TensorFlow version is not
        # visible from here.
        src_dataset = tf.contrib.data.Dataset.from_tensor_slices(
            self.src_placeholder)
        self.infer_batch = self.tokenized_data.get_inference_batch(src_dataset)

        # Create model
        print("# Creating inference model ...")
        self.model = ModelCreator(training=False,
                                  tokenized_data=self.tokenized_data,
                                  batch_input=self.infer_batch)

        # Restore model weights
        latest_ckpt = tf.train.latest_checkpoint(result_dir)
        print("# Restoring model weights ...")
        if latest_ckpt is None:
            # latest_checkpoint() signals "nothing found" with None; fail here
            # with the offending folder instead of inside Saver.restore().
            raise ValueError(
                "No checkpoint found in result_dir: {!r}".format(result_dir))
        self.model.saver.restore(session, latest_ckpt)
        self.session.run(tf.tables_initializer())
示例#4
0
    def __init__(self, corpus_dir):
        """Create a dedicated graph holding the training pipeline and model.

        Args:
            corpus_dir: Forwarded to TokenizedData as `corpus_dir`.
        """
        graph = tf.Graph()
        self.graph = graph

        # Everything below is built inside this trainer's own graph.
        with graph.as_default():
            data = TokenizedData(corpus_dir=corpus_dir)
            self.hparams = data.hparams

            # Per the original author's note: a BatchInput namedtuple from
            # .chatbot/tokenizedata.py.
            batch = data.get_training_batch()
            self.train_batch = batch

            self.model = ModelCreator(
                training=True,
                tokenized_data=data,
                batch_input=batch,
            )
示例#5
0
    def __init__(self, corpus_dir):
        """Create a dedicated graph holding the training pipeline and model.

        Args:
            corpus_dir: Forwarded to TokenizedData as `corpus_dir`.
        """
        graph = tf.Graph()
        self.graph = graph

        # Everything below is built inside this trainer's own graph.
        with graph.as_default():
            data = TokenizedData(corpus_dir=corpus_dir)
            self.hparams = data.hparams

            batch = data.get_training_batch()
            self.train_batch = batch

            self.model = ModelCreator(
                training=True,
                tokenized_data=data,
                batch_input=batch,
            )
示例#6
0
    def __init__(self, session, corpus_dir, knbase_dir, result_dir, aiml_dir,
                 result_file):
        """Set up the AIML retrieval kernel and the generative inference model.

        Args:
            session: The TensorFlow session.
            corpus_dir: Name of the folder storing corpus files and vocab information.
            knbase_dir: Name of the folder storing data files for the knowledge base.
            result_dir: The folder containing the trained result files.
            aiml_dir: Folder in which BRAIN_FILE and AIMLS_FILE are looked up
                (and where the brain file is written when it does not exist).
            result_file: The file name of the trained model.
        """
        self.session = session

        # Data pipeline and hyper parameters.
        print("# Prepare dataset placeholder and hyper parameters ...")
        data = TokenizedData(corpus_dir=corpus_dir, training=False)

        knowledge_base = KnowledgeBase()
        knowledge_base.load_knbase(knbase_dir)
        self.knowledge_base = knowledge_base

        self.session_data = SessionData()
        self.hparams = data.hparams

        # Source sentences are fed through a 1-D string placeholder.
        placeholder = tf.placeholder(shape=[None], dtype=tf.string)
        self.src_placeholder = placeholder
        self.infer_batch = data.get_inference_batch(
            tf.data.Dataset.from_tensor_slices(placeholder))

        # Retrieval model.
        kernel = aiml.Kernel()
        self.kmodel = kernel
        brain_path = os.path.join(aiml_dir, BRAIN_FILE)
        print(aiml_dir)

        # Reuse a saved brain when one exists; otherwise learn from the AIML
        # files and save the resulting brain for next time.
        if not os.path.exists(brain_path):
            print("# Parsing aiml files ...")
            aimls_path = os.path.abspath(os.path.join(aiml_dir, AIMLS_FILE))
            kernel.bootstrap(learnFiles=aimls_path, commands="load aiml b")
            print("# Saving brain file: " + BRAIN_FILE)
            kernel.saveBrain(brain_path)
        else:
            print("# Loading from brain file ... ")
            kernel.loadBrain(brain_path)

        # Generative model.
        print("# Creating inference model ...")
        self.model = ModelCreator(
            training=False,
            tokenized_data=data,
            batch_input=self.infer_batch,
        )

        # Trained weights.
        print("# Restoring model weights ...")
        checkpoint_path = os.path.join(result_dir, result_file)
        self.model.saver.restore(session, checkpoint_path)

        self.session.run(tf.tables_initializer())
示例#7
0
    def __init__(self, corpus_dir):
        """Constructor of the BotTrainer.

        Builds the training batch and the training model inside a graph owned
        by this instance.

        Args:
            corpus_dir: The folder to save all the training related data.
        """
        graph = tf.Graph()
        self.graph = graph

        with graph.as_default():
            data = TokenizedData(corpus_dir=corpus_dir)
            self.hparams = data.hparams

            batch = data.get_training_batch()
            self.train_batch = batch

            self.model = ModelCreator(
                training=True,
                tokenized_data=data,
                batch_input=batch,
            )
示例#8
0
class BotTrainer(object):
    """Builds the seq2seq training graph and runs the training loop."""

    # (inclusive upper perplexity bound, learning rate), in ascending bound
    # order. The first bound that the perplexity does not exceed wins.
    _LR_SCHEDULE = (
        (1.48, 9.6e-5),
        (1.64, 1e-4),
        (2.0, 1.2e-4),
        (2.4, 1.6e-4),
        (3.2, 2e-4),
        (4.8, 2.4e-4),
        (8.0, 3.2e-4),
        (16.0, 4e-4),
        (32.0, 6e-4),
    )
    # Learning rate used when the perplexity is above every bound.
    _DEFAULT_LR = 8e-4

    def __init__(self, corpus_dir):
        """Create a dedicated graph holding the training pipeline and model.

        Args:
            corpus_dir: Forwarded to TokenizedData as `corpus_dir`.
        """
        self.graph = tf.Graph()
        with self.graph.as_default():
            tokenized_data = TokenizedData(corpus_dir=corpus_dir)

            self.hparams = tokenized_data.hparams
            self.train_batch = tokenized_data.get_training_batch()
            self.model = ModelCreator(training=True,
                                      tokenized_data=tokenized_data,
                                      batch_input=self.train_batch)

    def train(self, result_dir, target=""):
        """Train a seq2seq model.

        Runs until `hparams.num_epochs` epochs are done. Step summaries and a
        per-epoch "train_perp" summary go to `<result_dir>/train_log`;
        checkpoints are written with the prefix `<result_dir>/basic`.

        Args:
            result_dir: Folder receiving the summaries and checkpoints.
            target: Passed to tf.Session as its `target` argument.
        """
        config_proto = tf.ConfigProto(
            log_device_placement=self.hparams.log_device_placement,
            allow_soft_placement=True)
        config_proto.gpu_options.allow_growth = True

        num_epochs = self.hparams.num_epochs
        ckpt_prefix = os.path.join(result_dir, "basic")

        summary_writer = tf.summary.FileWriter(
            os.path.join(result_dir, "train_log"), self.graph)
        # try/finally so the event file is flushed and closed even when a
        # training step raises.
        try:
            with tf.Session(target=target, config=config_proto,
                            graph=self.graph) as sess:
                sess.run(tf.global_variables_initializer())
                sess.run(tf.tables_initializer())
                global_step = self.model.global_step.eval(session=sess)

                # Initialize all of the iterators
                sess.run(self.train_batch.initializer)

                # Initialize the statistic variables
                ckpt_loss, ckpt_predict_count = 0.0, 0.0
                train_perp, last_record_perp = 2000.0, 2.0
                train_epoch = 0

                print("# Training loop started @ {}".format(
                    time.strftime("%Y-%m-%d %H:%M:%S")))
                epoch_start_time = time.time()
                while train_epoch < num_epochs:
                    # Each pass of this loop is one training step; the epoch
                    # counter only advances when the dataset is exhausted.
                    learning_rate = self._get_learning_rate(train_perp)

                    try:
                        step_result = self.model.train_step(
                            sess, learning_rate=learning_rate)
                        (_, step_loss, step_predict_count, step_summary,
                         global_step, _, batch_size) = step_result

                        # Write step summary.
                        summary_writer.add_summary(step_summary, global_step)

                        # update statistics
                        ckpt_loss += (step_loss * batch_size)
                        ckpt_predict_count += step_predict_count
                    except tf.errors.OutOfRangeError:
                        # Finished going through the training dataset.
                        # Go to next epoch.
                        train_epoch += 1

                        mean_loss = ckpt_loss / ckpt_predict_count
                        # Guard math.exp against overflow on huge losses.
                        train_perp = math.exp(
                            float(mean_loss)) if mean_loss < 300 else math.inf

                        epoch_dur = time.time() - epoch_start_time
                        print(
                            "# Finished epoch {:2d} @ step {:5d} @ {}. In the epoch, learning rate = {:.6f}, "
                            "mean loss = {:.4f}, perplexity = {:8.4f}, and {:.2f} seconds elapsed."
                            .format(train_epoch, global_step,
                                    time.strftime("%Y-%m-%d %H:%M:%S"),
                                    learning_rate, mean_loss, train_perp,
                                    round(epoch_dur, 2)))
                        # The start time of the next epoch
                        epoch_start_time = time.time()

                        summary = tf.Summary(value=[
                            tf.Summary.Value(tag="train_perp",
                                             simple_value=train_perp)
                        ])
                        summary_writer.add_summary(summary, global_step)

                        # Save a checkpoint only when the perplexity is below
                        # 1.6 and improves on the best one recorded so far.
                        if train_perp < 1.6 and train_perp < last_record_perp:
                            self.model.saver.save(sess, ckpt_prefix,
                                                  global_step=global_step)
                            last_record_perp = train_perp

                        ckpt_loss, ckpt_predict_count = 0.0, 0.0

                        # Rewind the dataset iterator for the next epoch.
                        sess.run(self.model.batch_input.initializer)

                # Done training
                self.model.saver.save(sess, ckpt_prefix,
                                      global_step=global_step)
        finally:
            summary_writer.close()

    @staticmethod
    def _get_learning_rate(perplexity):
        """Return the learning rate for the given training perplexity.

        Lower perplexity maps to a smaller learning rate; anything above the
        largest bound in the schedule gets the default rate.

        Args:
            perplexity: Training perplexity of the last finished epoch.

        Returns:
            The learning rate as a float.
        """
        for upper_bound, rate in BotTrainer._LR_SCHEDULE:
            if perplexity <= upper_bound:
                return rate
        return BotTrainer._DEFAULT_LR
示例#9
0
class BotPredictor(object):
    """Answers questions with the trained seq2seq model plus a knowledge base."""

    def __init__(self, session, corpus_dir, knbase_dir, result_dir, result_file):
        """
        Args:
            session: The TensorFlow session.
            corpus_dir: Name of the folder storing corpus files and vocab information.
            knbase_dir: Name of the folder storing data files for the knowledge base.
            result_dir: The folder containing the trained result files.
            result_file: The file name of the trained model.
        """
        self.session = session

        # Prepare data and hyper parameters
        print("# Prepare dataset placeholder and hyper parameters ...")
        tokenized_data = TokenizedData(corpus_dir=corpus_dir, training=False)

        self.knowledge_base = KnowledgeBase()
        self.knowledge_base.load_knbase(knbase_dir)

        self.session_data = SessionData()

        self.hparams = tokenized_data.hparams
        self.src_placeholder = tf.placeholder(shape=[None], dtype=tf.string)
        src_dataset = tf.data.Dataset.from_tensor_slices(self.src_placeholder)
        self.infer_batch = tokenized_data.get_inference_batch(src_dataset)

        # Create model
        print("# Creating inference model ...")
        self.model = ModelCreator(training=False, tokenized_data=tokenized_data,
                                  batch_input=self.infer_batch)
        # Restore model weights
        print("# Restoring model weights ...")
        self.model.saver.restore(session, os.path.join(result_dir, result_file))

        self.session.run(tf.tables_initializer())

    def predict(self, session_id, question, html_format=False):
        """Return the bot's answer to `question` within a chat session.

        When the question matches a known pattern, the rewritten sentence is
        tried first; if that answer did not come from a function call, the
        original question is run through the model instead.

        Args:
            session_id: Key handed to `session_data.get_session`.
            question: The user's input sentence.
            html_format: Forwarded to `call_function` through
                `_get_final_output`.

        Returns:
            The answer string.
        """
        chat_session = self.session_data.get_session(session_id)
        chat_session.before_prediction()  # Reset before each prediction

        if question.strip() == '':
            answer = "Don't you want to say something to me?"
            chat_session.after_prediction(question, answer)
            return answer

        pat_matched, new_sentence, para_list = check_patterns_and_replace(question)

        # At most two passes: the pattern-replaced sentence, then the original.
        for pre_time in range(2):
            tokens = nltk.word_tokenize(new_sentence.lower())
            tmp_sentence = [' '.join(tokens).strip()]

            self.session.run(self.infer_batch.initializer,
                             feed_dict={self.src_placeholder: tmp_sentence})

            outputs, _ = self.model.infer(self.session)

            # NOTE(review): presumably selects the best beam - confirm against
            # ModelCreator.infer.
            if self.hparams.beam_width > 0:
                outputs = outputs[0]

            eos_token = self.hparams.eos_token.encode("utf-8")
            outputs = outputs.tolist()[0]

            # Drop the end-of-sentence token and everything after it.
            if eos_token in outputs:
                outputs = outputs[:outputs.index(eos_token)]

            if pat_matched and pre_time == 0:
                out_sentence, if_func_val = self._get_final_output(outputs, chat_session,
                                                                   para_list=para_list,
                                                                   html_format=html_format)
                if if_func_val:
                    chat_session.after_prediction(question, out_sentence)
                    return out_sentence
                else:
                    # The pattern answer used no function; retry with the
                    # untouched question.
                    new_sentence = question
            else:
                out_sentence, _ = self._get_final_output(outputs, chat_session,
                                                         html_format=html_format)
                chat_session.after_prediction(question, out_sentence)
                return out_sentence

    def _get_final_output(self, sentence, chat_session, para_list=None, html_format=False):
        """Turn the model's output tokens into a displayable sentence.

        Args:
            sentence: Sequence of UTF-8 encoded byte tokens.
            chat_session: Forwarded to `call_function`.
            para_list: Forwarded to `call_function`.
            html_format: Forwarded to `call_function`.

        Returns:
            A `(text, if_func_val)` tuple; `if_func_val` is True when at least
            one `_func_val_` token was encountered.
        """
        sentence = b' '.join(sentence).decode('utf-8')
        if sentence == '':
            return "I don't know what to say.", False

        sentence_enders = ('.', '!', '?')
        # Tokens that attach to the word following them.
        openers = ('(', '[', '{', '``', '$')

        if_func_val = False
        last_word = None
        word_list = []
        for word in sentence.split(' '):
            word = word.strip()
            if not word:
                continue

            if word.startswith('_func_val_'):
                if_func_val = True
                # Strip the 10-character '_func_val_' prefix to get the name.
                word = call_function(word[10:], knowledge_base=self.knowledge_base,
                                     chat_session=chat_session, para_list=para_list,
                                     html_format=html_format)
                if word is None or word == '':
                    continue
            else:
                if word in self.knowledge_base.upper_words:
                    word = self.knowledge_base.upper_words[word]

                if (last_word is None or last_word in sentence_enders) and not word[0].isupper():
                    word = word.capitalize()

            needs_space = (
                not word.startswith('\'') and word != 'n\'t'
                and (word[0] not in string.punctuation or word in openers)
                and last_word not in openers
            )

            # Remember the bare token: storing the space-prefixed form would
            # make the opener check above never match on the next word.
            last_word = word
            word_list.append(' ' + word if needs_space else word)

        return ''.join(word_list).strip(), if_func_val
class BotPredictor(object):
    """Answers sentences with the trained seq2seq model."""

    def __init__(self, session, corpus_dir, knbase_dir, result_dir,
                 result_file):
        """Build the inference graph and restore the trained weights.

        Args:
            session: The TensorFlow session used for restoring and inference.
            corpus_dir: Forwarded to TokenizedData as `corpus_dir`.
            knbase_dir: Forwarded to TokenizedData as `knbase_dir`.
            result_dir: Folder holding the checkpoint to restore.
            result_file: Checkpoint file name inside `result_dir`.
        """
        self.session = session

        # Prepare data and hyper parameters
        print("# Prepare dataset placeholder and hyper parameters ...")
        self.tokenized_data = TokenizedData(corpus_dir=corpus_dir,
                                            knbase_dir=knbase_dir,
                                            training=False)

        self.hparams = self.tokenized_data.hparams
        self.src_placeholder = tf.placeholder(shape=[None], dtype=tf.string)
        src_dataset = tf.data.Dataset.from_tensor_slices(self.src_placeholder)
        self.infer_batch = self.tokenized_data.get_inference_batch(src_dataset)

        # Create model
        print("# Creating inference model ...")
        self.model = ModelCreator(training=False,
                                  tokenized_data=self.tokenized_data,
                                  batch_input=self.infer_batch)
        # Restore model weights
        print("# Restoring model weights ...")
        self.model.saver.restore(session, os.path.join(result_dir,
                                                       result_file))

        self.session.run(tf.tables_initializer())

    def predict(self, sentence, html_format=False):
        """Return the bot's answer to `sentence`.

        When the sentence matches an arithmetic pattern, the rewritten
        sentence is tried first; if that answer did not come from a function
        call, the original sentence is run through the model instead.

        Args:
            sentence: The user's input sentence.
            html_format: Forwarded to `call_function` through
                `_get_final_output`.

        Returns:
            The answer string.
        """
        if sentence.strip() == '':
            return "Don't you want to say something to me?"

        pat_matched, new_sentence, num_list = \
            FunctionData.check_arithmetic_pattern_and_replace(sentence)

        # At most two passes: the pattern-replaced sentence, then the original.
        for pre_time in range(2):
            tokens = nltk.word_tokenize(new_sentence.lower())
            tmp_sentence = [' '.join(tokens).strip()]

            self.session.run(self.infer_batch.initializer,
                             feed_dict={self.src_placeholder: tmp_sentence})

            outputs, _ = self.model.infer(self.session)

            # NOTE(review): presumably selects the best beam - confirm against
            # ModelCreator.infer.
            if self.hparams.beam_width > 0:
                outputs = outputs[0]

            eos_token = self.hparams.eos_token.encode("utf-8")
            outputs = outputs.tolist()[0]

            # Drop the end-of-sentence token and everything after it.
            if eos_token in outputs:
                outputs = outputs[:outputs.index(eos_token)]

            if pat_matched and pre_time == 0:
                out_sentence, if_func_val = self._get_final_output(
                    outputs, para_list=num_list, html_format=html_format)
                if if_func_val:
                    return out_sentence
                else:
                    # The pattern answer used no function; retry with the
                    # untouched sentence.
                    new_sentence = sentence
            else:
                out_sentence, _ = self._get_final_output(
                    outputs, html_format=html_format)
                return out_sentence

    def _get_final_output(self, sentence, para_list=None, html_format=False):
        """Turn the model's output tokens into a displayable sentence.

        Args:
            sentence: Sequence of UTF-8 encoded byte tokens.
            para_list: Forwarded to `call_function`.
            html_format: Forwarded to `call_function`.

        Returns:
            A `(text, if_func_val)` tuple; `if_func_val` is True when at least
            one `_func_val_` token was encountered.
        """
        sentence = b' '.join(sentence).decode('utf-8')
        if sentence == '':
            return "I don't know what to say.", False

        sentence_enders = ('.', '!', '?')
        # Tokens that attach to the word following them.
        openers = ('(', '[', '{', '``', '$')

        if_func_val = False
        last_word = None
        word_list = []
        for word in sentence.split(' '):
            word = word.strip()
            if not word:
                continue

            if word.startswith('_func_val_'):
                if_func_val = True
                # Strip the 10-character '_func_val_' prefix to get the name.
                word = call_function(word[10:],
                                     tokenized_data=self.tokenized_data,
                                     para_list=para_list,
                                     html_format=html_format)
                # A function may yield nothing to display; skip it rather
                # than crash on None below.
                if word is None or word == '':
                    continue
            else:
                if word in self.tokenized_data.upper_words:
                    word = self.tokenized_data.upper_words[word]

                if (last_word is None or last_word
                        in sentence_enders) and not word[0].isupper():
                    word = word.capitalize()

            needs_space = (
                not word.startswith('\'') and word != 'n\'t'
                and (word not in string.punctuation or word in openers)
                and last_word not in openers
            )

            # Remember the bare token: storing the space-prefixed form would
            # make the opener check above never match on the next word.
            last_word = word
            word_list.append(' ' + word if needs_space else word)

        return ''.join(word_list).strip(), if_func_val
示例#11
0
class BotTrainer(object):
    def __init__(self, corpus_dir):
        """Constructor of the BotTrainer.

        Builds the training batch and the training model inside a graph owned
        by this instance.

        Args:
            corpus_dir: The folder to save all the training related data.
        """
        graph = tf.Graph()
        self.graph = graph

        with graph.as_default():
            data = TokenizedData(corpus_dir=corpus_dir)
            self.hparams = data.hparams

            batch = data.get_training_batch()
            self.train_batch = batch

            self.model = ModelCreator(
                training=True,
                tokenized_data=data,
                batch_input=batch,
            )

    def train(self, result_dir, target="", last_end_file=None, last_end_epoch=0,
              last_end_lr=8e-4, verbose=False):
        """Train a seq2seq model, optionally resuming from an earlier run.

        Runs until the epoch counter reaches `hparams.num_epochs`. Step
        summaries and a per-epoch "train_perp" summary go to
        `<result_dir>/train_log`; checkpoints are written with the prefix
        `<result_dir>/basic` and numbered by epoch.

        Args:
            result_dir: Folder receiving the summaries and checkpoints.
            target: Passed to tf.Session as its `target` argument.
            last_end_file: Checkpoint file name inside `result_dir` to restore
                before training. Falsy means train from the freshly
                initialized variables.
            last_end_epoch: Value the epoch counter starts from.
            last_end_lr: Learning rate used until the first epoch finishes.
            verbose: When True, print progress messages. Defaults to False,
                i.e. silent.
        """
        config_proto = tf.ConfigProto(
            log_device_placement=self.hparams.log_device_placement,
            allow_soft_placement=True)
        config_proto.gpu_options.allow_growth = True

        num_epochs = self.hparams.num_epochs
        ckpt_prefix = os.path.join(result_dir, "basic")

        summary_writer = tf.summary.FileWriter(
            os.path.join(result_dir, "train_log"), self.graph)
        # try/finally so the event file is flushed and closed even when a
        # training step raises.
        try:
            with tf.Session(target=target, config=config_proto, graph=self.graph) as sess:
                # This initialization is useful even when the model is restored from the
                # last time because not all variables used in the model training may be saved.
                sess.run(tf.global_variables_initializer())
                if last_end_file:  # Continue training from last time
                    if verbose:
                        print("Restoring model weights from last time ...")
                    self.model.saver.restore(sess, os.path.join(result_dir, last_end_file))

                sess.run(tf.tables_initializer())
                global_step = self.model.global_step.eval(session=sess)

                # Initialize all of the iterators
                sess.run(self.train_batch.initializer)

                # Initialize the statistic variables
                ckpt_loss, ckpt_predict_count = 0.0, 0.0
                last_record_perp = 200.0
                train_epoch = last_end_epoch
                learning_rate = pre_lr = last_end_lr

                if verbose:
                    print("# Training loop started @ {}".format(time.strftime("%Y-%m-%d %H:%M:%S")))
                epoch_start_time = time.time()
                while train_epoch < num_epochs:
                    # Each pass of this loop is one training step; the epoch
                    # counter only advances when the dataset is exhausted.
                    try:
                        step_result = self.model.train_step(sess, learning_rate=learning_rate)
                        (_, step_loss, step_predict_count, step_summary, global_step,
                         _, batch_size) = step_result

                        # Write step summary.
                        summary_writer.add_summary(step_summary, global_step)

                        # update statistics
                        ckpt_loss += (step_loss * batch_size)
                        ckpt_predict_count += step_predict_count
                    except tf.errors.OutOfRangeError:
                        # Finished going through the training dataset. Go to next epoch.
                        train_epoch += 1

                        mean_loss = ckpt_loss / ckpt_predict_count
                        # Guard math.exp against overflow on huge losses.
                        train_perp = math.exp(float(mean_loss)) if mean_loss < 300 else math.inf

                        if verbose:
                            epoch_dur = time.time() - epoch_start_time
                            print("# Finished epoch {:2d} @ step {:5d} @ {}. In the epoch, learning rate = {:.6f}, "
                                  "mean loss = {:.4f}, perplexity = {:8.4f}, and {:.2f} seconds elapsed."
                                  .format(train_epoch, global_step, time.strftime("%Y-%m-%d %H:%M:%S"),
                                          learning_rate, mean_loss, train_perp, round(epoch_dur, 2)))
                        epoch_start_time = time.time()  # The start time of the next epoch

                        summary = tf.Summary(
                            value=[tf.Summary.Value(tag="train_perp", simple_value=train_perp)])
                        summary_writer.add_summary(summary, global_step)

                        # Save a checkpoint whenever the perplexity improves on
                        # the best one recorded so far.
                        if train_perp < last_record_perp:
                            self.model.saver.save(sess, ckpt_prefix, global_step=train_epoch)
                            last_record_perp = train_perp

                        ckpt_loss, ckpt_predict_count = 0.0, 0.0

                        # Pick the learning rate for the next epoch.
                        learning_rate = self._get_learning_rate(train_perp, pre_lr, train_epoch)
                        pre_lr = learning_rate

                        # Rewind the dataset iterator for the next epoch.
                        sess.run(self.model.batch_input.initializer)

                # Done training
                self.model.saver.save(sess, ckpt_prefix, global_step=train_epoch)
        finally:
            summary_writer.close()