Example #1
def init(data_dir, task_id, OOV=False):
    # load candidates
    candidates, candid2indx = load_candidates(
        data_dir, task_id)
    n_cand = len(candidates)
    print("Candidate Size", n_cand)
    indx2candid = dict(
        (candid2indx[key], key) for key in candid2indx)

    # load task data
    train_data, test_data, val_data = load_dialog_task(
        data_dir, task_id, candid2indx, OOV)
    data = train_data + test_data + val_data

    # build parameters
    word_idx, sentence_size, \
    candidate_sentence_size, memory_size, \
    vocab_size = build_vocab(data, candidates)

    # Variable(torch.from_numpy(candidates_vec)).view(len(candidates), sentence_size)
    candidates_vec = vectorize_candidates(
        candidates, word_idx, candidate_sentence_size)

    return candid2indx, \
           indx2candid, \
           candidates_vec, \
           word_idx, \
           sentence_size, \
           candidate_sentence_size, \
           memory_size, \
           vocab_size, \
           train_data, test_data, val_data
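
A minimal usage sketch for this init helper, assuming it lives next to load_candidates/load_dialog_task in the project's data utilities (the module name data_utils and the data directory are assumptions):

# Hypothetical caller; module path and data directory are assumptions.
from data_utils import init

(candid2indx, indx2candid, candidates_vec, word_idx,
 sentence_size, candidate_sentence_size, memory_size,
 vocab_size, train_data, test_data, val_data) = init(
    "data/dialog-bAbI-tasks", task_id=1)

print("vocab size:", vocab_size)
print("longest sentence:", sentence_size)
print("train/val/test dialogs:", len(train_data), len(val_data), len(test_data))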
Example #2
    def __init__(self, data_dir, model_dir, task_id, isInteractive=True, OOV=False, memory_size=50, random_state=None,
                 batch_size=32, learning_rate=0.001, epsilon=1e-8, max_grad_norm=40.0, evaluation_interval=10, hops=3,
                 epochs=200, embedding_size=20, intro_times=20):
        self.data_dir = data_dir
        self.task_id = task_id
        self.model_dir = model_dir
        # self.isTrain=isTrain
        self.isInteractive = isInteractive
        self.OOV = OOV
        self.memory_size = memory_size
        self.random_state = random_state
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.max_grad_norm = max_grad_norm
        self.evaluation_interval = evaluation_interval
        self.hops = hops
        self.epochs = epochs
        self.embedding_size = embedding_size
        self.intro_times = intro_times

        candidates, self.candid2indx = load_candidates(
            self.data_dir, self.task_id)
        self.n_cand = len(candidates)
        print("Candidate Size", self.n_cand)
        self.indx2candid = dict(
            (self.candid2indx[key], key) for key in self.candid2indx)
        # task data
        self.trainData, self.testData, self.valData = load_dialog_task(
            self.data_dir, self.task_id, self.candid2indx, self.OOV)
        data = self.trainData + self.testData + self.valData

        self.build_vocab(data, candidates)
        #build training words set
        # pdb.set_trace()
        self.train_val_wordset = self.words_set(self.valData + self.trainData)
        all_wordset = self.words_set(data)
        no_oov_word = len(self.train_val_wordset)
        with_oov_word = len(all_wordset)
        print('oov words', with_oov_word - no_oov_word)
        # new_words=[]
        # for word in all_wordset:
        #     if word not in self.train_val_wordset:
        #         new_words.append(self.idx_word[word])
        # print('These words are new:',new_words)
        # pdb.set_trace()
        # self.candidates_vec=vectorize_candidates_sparse(candidates,self.word_idx)
        self.candidates_vec = vectorize_candidates(
            candidates, self.word_idx, self.candidate_sentence_size)
        optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate, epsilon=self.epsilon)
        self.sess = tf.Session()
        self.model = MemN2NDialog(self.batch_size, self.vocab_size, self.n_cand, self.sentence_size,
                                  self.embedding_size, self.candidates_vec, session=self.sess,
                                  hops=self.hops, max_grad_norm=self.max_grad_norm, optimizer=optimizer,
                                  task_id=task_id,introspection_times=self.intro_times)
        self.saver = tf.train.Saver(max_to_keep=1)

        self.summary_writer = tf.summary.FileWriter(
            self.model.root_dir, self.model.graph_output.graph)
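
Because this constructor only creates the tf.train.Saver, restoring a previous checkpoint before interactive use is a separate step. A minimal sketch, assuming self.model_dir is where checkpoints were written (the same pattern appears, commented out, in Example #9):

        # Sketch only: restore the newest checkpoint if one exists.
        # self.model_dir is assumed to be the directory the saver writes to.
        ckpt = tf.train.get_checkpoint_state(self.model_dir)
        if ckpt and ckpt.model_checkpoint_path:
            self.saver.restore(self.sess, ckpt.model_checkpoint_path)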
Example #3
    def __init__(self,
                 data_dir,
                 model_dir,
                 task_id,
                 isInteractive=True,
                 OOV=False,
                 memory_size=50,
                 random_state=None,
                 batch_size=32,
                 learning_rate=0.001,
                 epsilon=1e-8,
                 max_grad_norm=40.0,
                 evaluation_interval=10,
                 hops=3,
                 epochs=200,
                 embedding_size=20):

        self.data_dir = data_dir
        self.task_id = task_id
        self.model_dir = model_dir
        self.isInteractive = isInteractive
        self.OOV = OOV
        self.memory_size = memory_size
        self.random_state = random_state
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.max_grad_norm = max_grad_norm
        self.evaluation_interval = evaluation_interval
        self.hops = hops
        self.epochs = epochs
        self.embedding_size = embedding_size

        candidates, self.candid2indx = load_candidates(self.data_dir,
                                                       self.task_id)
        self.n_cand = len(candidates)
        print("Candidate Size", self.n_cand)
        self.indx2candid = dict(
            (self.candid2indx[key], key) for key in self.candid2indx)
        # task data
        self.trainData, self.testData, self.valData = load_dialog_task(
            self.data_dir, self.task_id, self.candid2indx, self.OOV)
        data = self.trainData + self.testData + self.valData
        self.build_vocab(data, candidates)
        # self.candidates_vec=vectorize_candidates_sparse(candidates,self.word_idx)
        self.candidates_vec = vectorize_candidates(
            candidates, self.word_idx, self.candidate_sentence_size)
        self.model = MemN2NDialog(self.batch_size,
                                  self.vocab_size,
                                  self.n_cand,
                                  self.sentence_size,
                                  self.embedding_size,
                                  self.candidates_vec,
                                  hops=self.hops,
                                  max_grad_norm=self.max_grad_norm,
                                  task_id=task_id)
Example #4
    def __init__(self, data_dir, model_dir, task_id, isInteractive=True, OOV=False, memory_size=50, random_state=None,
                 batch_size=32, learning_rate=0.001, epsilon=1e-8, max_grad_norm=40.0, evaluation_interval=10, hops=3,
                 epochs=200, embedding_size=20):
        self.data_dir = data_dir
        self.task_id = task_id
        self.model_dir = model_dir
        # self.isTrain=isTrain
        self.isInteractive = isInteractive
        self.OOV = OOV
        self.memory_size = memory_size
        self.random_state = random_state
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.max_grad_norm = max_grad_norm
        self.evaluation_interval = evaluation_interval
        self.hops = hops
        self.epochs = epochs
        self.embedding_size = embedding_size

        candidates, self.candid2indx = load_candidates(
            self.data_dir, self.task_id)
        self.n_cand = len(candidates)
        print("Candidate Size", self.n_cand)
        self.indx2candid = dict(
            (self.candid2indx[key], key) for key in self.candid2indx)
        # task data
        self.trainData, self.testData, self.valData = load_dialog_task(
            self.data_dir, self.task_id, self.candid2indx, self.OOV)
        data = self.trainData + self.testData + self.valData
        self.build_vocab(data, candidates)
        # self.candidates_vec=vectorize_candidates_sparse(candidates,self.word_idx)
        self.candidates_vec = vectorize_candidates(
            candidates, self.word_idx, self.candidate_sentence_size)
        optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate, epsilon=self.epsilon)
        self.sess = tf.Session()
        self.model = MemN2NDialog(self.batch_size, self.vocab_size, self.n_cand, self.sentence_size,
                                  self.embedding_size, self.candidates_vec, session=self.sess,
                                  hops=self.hops, max_grad_norm=self.max_grad_norm,
                                  optimizer=optimizer, task_id=task_id)
        self.saver = tf.train.Saver(max_to_keep=50)

        self.summary_writer = tf.summary.FileWriter(
            self.model.root_dir, self.model.graph_output.graph)
Example #5
    def __init__(self, data_dir, model_dir, task_id, isInteractive=True, OOV=False, memory_size=250, random_state=None,
                 batch_size=32, learning_rate=0.001, epsilon=1e-8, max_grad_norm=40.0, evaluation_interval=10, hops=3,
                 epochs=200, embedding_size=20, save_vocab=False, load_vocab=False):
        self.data_dir = data_dir
        self.task_id = task_id
        self.model_dir = model_dir
        # self.isTrain=isTrain
        self.isInteractive = isInteractive
        self.OOV = OOV
        self.memory_size = memory_size
        self.random_state = random_state
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.max_grad_norm = max_grad_norm
        self.evaluation_interval = evaluation_interval
        self.hops = hops
        self.epochs = epochs
        self.embedding_size = embedding_size
        self.save_vocab = save_vocab
        self.load_vocab = load_vocab

        candidates, self.candid2indx = load_candidates(self.data_dir, self.task_id)
        self.n_cand = len(candidates)
        print("Candidate Size", self.n_cand)
        self.indx2candid = dict((self.candid2indx[key], key) for key in self.candid2indx)
        # task data
        self.trainData, self.testData, self.valData = load_dialog_task(
            self.data_dir, self.task_id, self.candid2indx, self.OOV)
        data = self.trainData + self.testData + self.valData
        self.build_vocab(data, candidates, self.save_vocab, self.load_vocab)
        # self.candidates_vec=vectorize_candidates_sparse(candidates,self.word_idx)
        self.candidates_vec = vectorize_candidates(
            candidates, self.word_idx, self.candidate_sentence_size)
        optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate, epsilon=self.epsilon)
        self.sess = tf.Session()
        self.model = MemN2NDialog(self.batch_size, self.vocab_size, self.n_cand, self.sentence_size,
                                  self.embedding_size, self.candidates_vec, session=self.sess,
                                  hops=self.hops, max_grad_norm=self.max_grad_norm,
                                  optimizer=optimizer, task_id=task_id)
        self.saver = tf.train.Saver(max_to_keep=50)
        
        # self.summary_writer = tf.train.SummaryWriter(self.model.root_dir, self.model.graph_output.graph)
        self.summary_writer = tf.summary.FileWriter(self.model.root_dir, self.model.graph_output.graph)
Example #6
    def __init__(self,
                 data_dir,
                 model_dir,
                 task_id,
                 OOV=False,
                 memory_size=250,
                 random_state=None,
                 batch_size=32,
                 learning_rate=0.001,
                 epsilon=1e-8,
                 max_grad_norm=40.0,
                 evaluation_interval=10,
                 hops=3,
                 epochs=10,
                 embedding_size=20,
                 save_vocab=False,
                 load_vocab=False):
        """Creates wrapper for training and testing a chatbot model.

        Args:
            data_dir: Directory containing personalized dialog tasks.

            model_dir: Directory containing memn2n model checkpoints.

            task_id: Personalized dialog task id, 1 <= id <= 5.

            OOV: If `True`, use OOV test set. Defaults to `False`.

            memory_size: The max size of the memory. Defaults to `250`.

            random_state: Random state to set graph-level random seed. Defaults to `None`.

            batch_size: Size of the batch for training. Defaults to `32`.

            learning_rate: Learning rate for Adam Optimizer. Defaults to `0.001`.

            epsilon: Epsilon value for Adam Optimizer. Defaults to `1e-8`.

            max_grad_norm: Maximum L2 norm clipping value. Defaults to `40.0`.

            evaluation_interval: Evaluate and print results every x epochs.
            Defaults to `10`.

            hops: The number of hops over memory for responding. A hop consists
            of reading and addressing a memory slot. Defaults to `3`.

            epochs: Number of training epochs. Defaults to `10`.

            embedding_size: The size of the word embedding. Defaults to `20`.

            save_vocab: If `True`, save vocabulary file. Defaults to `False`.

            load_vocab: If `True`, load vocabulary from file. Defaults to `False`.
        """

        self.data_dir = data_dir
        self.task_id = task_id
        self.model_dir = model_dir
        self.OOV = OOV
        self.memory_size = memory_size
        self.random_state = random_state
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.max_grad_norm = max_grad_norm
        self.evaluation_interval = evaluation_interval
        self.hops = hops
        self.epochs = epochs
        self.embedding_size = embedding_size
        self.save_vocab = save_vocab
        self.load_vocab = load_vocab

        candidates, self.candid2indx = load_candidates(self.data_dir,
                                                       self.task_id)
        self.n_cand = len(candidates)
        # print("Candidate Size", self.n_cand)
        self.indx2candid = dict(
            (self.candid2indx[key], key) for key in self.candid2indx)

        # Task data
        self.trainData, self.testData, self.valData = load_dialog_task(
            self.data_dir, self.task_id, self.candid2indx, self.OOV)

        # print(self.testData)
        data = self.trainData + self.testData + self.valData

        self.build_vocab(data, candidates, self.save_vocab, self.load_vocab)
        print("build_vocab", self.build_vocab)
        self.candidates_vec = vectorize_candidates(
            candidates, self.word_idx, self.candidate_sentence_size)
        print("build_vocab", self.candidates_vec)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                           epsilon=self.epsilon)

        self.sess = tf.Session()

        self.model = MemN2NDialog(self.batch_size,
                                  self.vocab_size,
                                  self.n_cand,
                                  self.sentence_size,
                                  self.embedding_size,
                                  self.candidates_vec,
                                  session=self.sess,
                                  hops=self.hops,
                                  max_grad_norm=self.max_grad_norm,
                                  optimizer=optimizer,
                                  task_id=task_id)

        self.saver = tf.train.Saver(max_to_keep=50)
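
Given the documented arguments, constructing the wrapper might look like the sketch below. The class name ChatBotWrapper and the directory paths are assumptions, since this example only shows the constructor body:

# Hypothetical instantiation; the class name and paths are assumptions.
chatbot = ChatBotWrapper(data_dir="data/personalized-dialog-dataset/full",
                         model_dir="model/task1/",
                         task_id=1,
                         OOV=False,
                         memory_size=250,
                         batch_size=32,
                         learning_rate=0.001,
                         epochs=10,
                         embedding_size=20,
                         save_vocab=True)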
Example #7
    def __init__(self,
                 data_dir,
                 model_dir,
                 task_id,
                 source,
                 resFlag,
                 wrong_conversations,
                 error,
                 acc_each_epoch,
                 acc_ten_epoch,
                 conv_wrong_right,
                 epochs,
                 OOV=False,
                 memory_size=50,
                 random_state=None,
                 batch_size=32,
                 learning_rate=0.001,
                 epsilon=1e-8,
                 max_grad_norm=40.0,
                 evaluation_interval=10,
                 hops=3,
                 embedding_size=20):
        self.data_dir = data_dir
        self.task_id = task_id
        self.model_dir = model_dir
        self.OOV = OOV
        self.memory_size = memory_size
        self.random_state = random_state
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.max_grad_norm = max_grad_norm
        self.evaluation_interval = evaluation_interval
        self.hops = hops
        self.epochs = epochs
        self.embedding_size = embedding_size
        self.source = source
        self.resFlag = resFlag
        self.wrong_conversations = wrong_conversations
        self.error = error
        self.acc_each_epoch = acc_each_epoch
        self.acc_ten_epoch = acc_ten_epoch
        candidates, self.candid2indx = load_candidates(self.data_dir,
                                                       self.task_id)
        self.n_cand = len(candidates)
        print("Candidate Size", self.n_cand)
        self.indx2candid = dict(
            (self.candid2indx[key], key) for key in self.candid2indx)

        # create train, test and validation data
        self.trainData, self.testData, self.valData = load_dialog_task(
            self.data_dir, self.task_id, self.candid2indx, self.OOV)
        data = self.trainData + self.testData + self.valData
        self.build_vocab(data, candidates)

        self.test_acc_list = []
        self.candidates_vec = vectorize_candidates(
            candidates, self.word_idx, self.candidate_sentence_size)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                           epsilon=self.epsilon)
        self.sess = tf.Session()
        self.model = MemN2NDialog(self.batch_size,
                                  self.vocab_size,
                                  self.n_cand,
                                  self.sentence_size,
                                  self.embedding_size,
                                  self.candidates_vec,
                                  session=self.sess,
                                  hops=self.hops,
                                  max_grad_norm=self.max_grad_norm,
                                  optimizer=optimizer,
                                  task_id=task_id,
                                  source=self.source,
                                  resFlag=self.resFlag,
                                  oov=self.OOV)
        self.saver = tf.train.Saver(max_to_keep=50)
        self.summary_writer = tf.summary.FileWriter(
            self.model.root_dir, self.model.graph_output.graph)
Example #8
    def __init__(self,
                 data_dir,
                 model_dir,
                 task_id,
                 isInteractive=True,
                 OOV=False,
                 memory_size=50,
                 random_state=None,
                 batch_size=32,
                 learning_rate=0.001,
                 epsilon=1e-8,
                 max_grad_norm=40.0,
                 evaluation_interval=10,
                 hops=3,
                 epochs=200,
                 embedding_size=100):
        self.data_dir = data_dir
        self.task_id = task_id
        self.model_dir = model_dir
        # self.isTrain=isTrain
        self.isInteractive = isInteractive
        self.OOV = OOV
        self.memory_size = memory_size
        self.random_state = random_state
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.max_grad_norm = max_grad_norm
        self.evaluation_interval = evaluation_interval
        self.hops = hops
        self.epochs = epochs
        self.embedding_size = embedding_size
        self.vocab = {}
        self.ivocab = {}
        self.word2vec = {}
        self.word2vec_init = True

        if self.word2vec_init:
            # assert config.embed_size == 100
            self.word2vec = load_glove(self.embedding_size)

        process_word(word="<eos>",
                     word2vec=self.word2vec,
                     vocab=self.vocab,
                     ivocab=self.ivocab,
                     word_vector_size=self.embedding_size,
                     to_return="index")

        # Define uncertain or unknown word index and vec for use later for training out-of-context data
        self.uncertain_word_index = process_word(
            word="sdfsssdf",
            word2vec=self.word2vec,
            vocab=self.vocab,
            ivocab=self.ivocab,
            word_vector_size=self.embedding_size,
            to_return="index")

        candidates, self.candid2indx = load_candidates(self.data_dir,
                                                       self.task_id)
        self.n_cand = len(candidates)
        print("Candidate Size", self.n_cand)
        self.indx2candid = dict(
            (self.candid2indx[key], key) for key in self.candid2indx)
        # task data
        self.trainData, self.testData, self.valData = load_dialog_task(
            self.data_dir, self.task_id, self.candid2indx, self.OOV)
        data = self.trainData + self.testData + self.valData

        self.build_vocab(data, candidates)
        self.set_max_sentence_length()
        # self.candidates_vec=vectorize_candidates_sparse(candidates,self.word_idx)
        self.trainS, self.trainQ, self.trainA = vectorize_data_match(
            self.trainData,
            self.word2vec,
            self.max_sentence_size,
            self.batch_size,
            self.n_cand,
            self.memory_size,
            self.vocab,
            self.ivocab,
            self.embedding_size,
            uncertain=self.uncertain_word_index)
        self.valS, self.valQ, self.valA = vectorize_data_match(
            self.valData,
            self.word2vec,
            self.max_sentence_size,
            self.batch_size,
            self.n_cand,
            self.memory_size,
            self.vocab,
            self.ivocab,
            self.embedding_size,
            uncertain_word=True,
            uncertain=self.uncertain_word_index)

        self.candidates_vec = vectorize_candidates(
            candidates, self.word2vec, self.candidate_sentence_size,
            self.vocab, self.ivocab, self.embedding_size)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                           epsilon=self.epsilon)
        self.sess = tf.Session()
        # Set max sentence vector size (note: this repeats the build_vocab call
        # made earlier in this constructor)
        self.build_vocab(data, candidates)

        answer_n_hot = np.zeros((self.vocab_size, len(self.candid2indx)))
        for ans_it in range(len(self.indx2candid)):
            ans = self.indx2candid[ans_it]
            n_hot = np.zeros((self.vocab_size, ))
            for w in tokenize(ans):
                assert w in self.word_idx
                n_hot[self.word_idx[w]] = 1
            answer_n_hot[:, ans_it] = n_hot

        # Need to understand more about sentence size. Model failing because sentence size > candidate_sentence_size? Answers longer than queries?
        self.model = MemN2NDialogHybridMatch(self.batch_size,
                                             self.vocab_size,
                                             self.max_sentence_size,
                                             self.memory_size,
                                             self.embedding_size,
                                             answer_n_hot,
                                             match=FLAGS.match,
                                             session=self.sess,
                                             hops=self.hops,
                                             max_grad_norm=self.max_grad_norm,
                                             optimizer=optimizer,
                                             task_id=self.task_id)
        # self.model = MemN2NDialogHybrid(self.batch_size, self.vocab_size, self.n_cand, self.max_sentence_size, self.embedding_size, self.candidates_vec, session=self.sess,
        #                           hops=self.hops, max_grad_norm=self.max_grad_norm, optimizer=optimizer, task_id=task_id)
        self.saver = tf.train.Saver(max_to_keep=50)

        self.summary_writer = tf.summary.FileWriter(
            self.model.root_dir, self.model.graph_output.graph)

        self.kb = parse_kb(FLAGS.kb_file)
Example #9
File: main.py  Project: aquadrop/py
def main(args):
    # parse args
    args = parse_args(args)

    # prepare data
    if args['prep_data']:
        print('\n>> Preparing Data\n')
        prepare_data(args)
        sys.exit()

    # ELSE
    # read data and metadata from pickled files
    with open(P_DATA_DIR + 'metadata.pkl', 'rb') as f:
        metadata = pkl.load(f)
    with open(P_DATA_DIR + 'data.pkl', 'rb') as f:
        data_ = pkl.load(f)

    # read content of data and metadata
    candidates = data_['candidates']
    candid2idx, idx2candid = metadata['candid2idx'], metadata['idx2candid']

    # get train/test/val data
    train, test, val = data_['train'], data_['test'], data_['val']

    # gather more information from metadata
    sentence_size = metadata['sentence_size']
    w2idx = metadata['w2idx']  # is a list
    idx2w = metadata['idx2w']
    memory_size = metadata['memory_size']
    vocab_size = metadata['vocab_size']
    n_cand = metadata['n_cand']
    candidate_sentence_size = metadata['candidate_sentence_size']
    # embeddings = metadata['embeddings']

    # vectorize candidates
    candidates_vec = data_utils.vectorize_candidates(candidates, w2idx,
                                                     candidate_sentence_size)

    print('---- memory config ----')
    print('embedding size:', EMBEDDING_SIZE)
    print('batch_size:', BATCH_SIZE)
    print('memory_size:', memory_size)
    print('vocab_size:', vocab_size)
    print('candidate_size:', n_cand)
    print('candidate_sentence_size:', candidate_sentence_size)
    print('hops:', HOPS)
    print('---- end ----')
    ###
    # create model
    # model = model['memn2n'](  # why?
    model = memn2n.MemN2NDialog(batch_size=BATCH_SIZE,
                                vocab_size=vocab_size,
                                candidates_size=n_cand,
                                sentence_size=sentence_size,
                                embedding_size=EMBEDDING_SIZE,
                                candidates_vec=candidates_vec,
                                hops=HOPS)

    # model = memn2n2.MemN2NDialog(
    #     batch_size=BATCH_SIZE,
    #     vocab_size=vocab_size,
    #     candidates_size=n_cand,
    #     sentence_size=sentence_size,
    #     embedding_size=EMBEDDING_SIZE,
    #     candidates_vec=candidates_vec,
    #     embeddings=embeddings,
    #     hops=HOPS
    # )

    # gather data in batches
    train, val, test, batches = data_utils.get_batches(train,
                                                       val,
                                                       test,
                                                       metadata,
                                                       batch_size=BATCH_SIZE)

    # for t in train['q']:
    #     print(recover_sentence(t, idx2w))

    if args['train']:
        # training starts here
        epochs = args['epochs']
        eval_interval = args['eval_interval']

        # restore from checkpoint
        _check_restore_parameters(model.get_sess(), model.saver, CKPT_DIR)
        #
        # training and evaluation loop
        print('\n>> Training started!\n')
        # write log to file
        log_handle = open(dir_path + '/../../logs/' + args['log_file'], 'w')
        cost_total = 0.
        best_cost = 100
        # best_validation_accuracy = 0.
        # saving threshold; despite the name, this tracks the best training accuracy seen so far
        lowest_val_acc = 0.8
        total_begin = time.clock()
        begin = time.clock()
        for i in range(epochs + 1):

            for start, end in batches:
                s = train['s'][start:end]
                q = train['q'][start:end]
                # print(len(q))
                a = train['a'][start:end]
                if config.MULTILABEL >= 1:
                    # convert to one hot
                    one_hot = np.zeros((end - start, n_cand))
                    for aa in range(end - start):
                        for index in a[aa]:
                            one_hot[aa][index] = 1
                    a = one_hot
                cost_total += model.batch_fit(s, q, a)
            if config.MULTILABEL >= 1:
                if i % 1 == 0 and i:
                    print('stage...', i, cost_total)
                    if cost_total < best_cost:
                        print('saving model...', i, '++',
                              str(best_cost) + '-->' + str(cost_total))
                        best_cost = cost_total
                        model.saver.save(model.get_sess(),
                                         CKPT_DIR + '/memn2n_model.ckpt',
                                         global_step=i)
            else:
                if i % 1 == 0 and i:
                    print('stage...', i)
                    if i % eval_interval == 0 and i:
                        train_preds = batch_predict(model,
                                                    train['s'],
                                                    train['q'],
                                                    len(train['s']),
                                                    batch_size=BATCH_SIZE)
                        for error in range(len(train['q'])):
                            if train_preds[error] != train['a'][error]:
                                print_out = recover(error, train['s'],
                                                    train['q'],
                                                    train_preds[error],
                                                    train['a'][error], idx2w,
                                                    idx2candid)
                                print(print_out)
                                # print(recover_sentence(train['q'][i], idx2w),
                                #       recover_cls(train_preds[i], idx2candid),
                                #       recover_cls(train['a'][i], idx2candid))
                        val_preds = batch_predict(model,
                                                  val['s'],
                                                  val['q'],
                                                  len(val['s']),
                                                  batch_size=BATCH_SIZE)
                        train_acc = metrics.accuracy_score(
                            np.array(train_preds), train['a'])
                        val_acc = metrics.accuracy_score(val_preds, val['a'])
                        end = time.clock()
                        print('Epoch[{}] : <ACCURACY>\n\t,\
                              training : {} \n\t,\
                              validation : {}\n\t,\
                              current_best_accuracy: {}'.format(
                            i, train_acc, val_acc, lowest_val_acc))
                        print('time:{}'.format(end - begin))
                        # log_handle.write('{} {} {} {}\n'.format(i, train_acc, val_acc,
                        #                                         cost_total / (eval_interval * len(batches))))
                        cost_total = 0.  # empty cost
                        begin = end
                        #
                        # save the best model, to disk
                        # if val_acc > best_validation_accuracy:
                        # best_validation_accuracy = val_acc
                        if train_acc > lowest_val_acc:
                            print('saving model...', train_acc, lowest_val_acc)
                            lowest_val_acc = train_acc
                            model.saver.save(model.get_sess(),
                                             CKPT_DIR + '/memn2n_model.ckpt',
                                             global_step=i)
        # close file
        total_end = time.clock()
        print('Total time: {} minutes.'.format((total_end - total_begin) / 60))
        log_handle.close()

    else:  # inference
        ###
        # restore checkpoint
        # ckpt = tf.train.get_checkpoint_state(CKPT_DIR)
        # if ckpt and ckpt.model_checkpoint_path:
        #     print('\n>> restoring checkpoint from', ckpt.model_checkpoint_path)
        #     model.saver.restore(model.get_sess(), ckpt.model_checkpoint_path)
        # # base(model, idx2candid, w2idx, sentence_size, BATCH_SIZE, n_cand, memory_size)
        #
        # # create an base session instance
        # isess = InteractiveSession(
        #     model, idx2candid, w2idx, n_cand, memory_size)
        #
        # if args['infer']:
        #     query = ''
        #     while query != 'exit':
        #         query = input('>> ')
        #         print('>> ' + isess.reply(query))
        # elif args['ui']:
        #     return isess
        pass
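
main() receives the raw command-line arguments and parses them itself, so the entry point for main.py can stay small. A minimal sketch, assuming parse_args understands flags that populate the keys read above ('prep_data', 'train', 'epochs', 'eval_interval', 'log_file', 'infer', 'ui'); the exact flag spellings are assumptions:

# Hypothetical entry point; exact CLI flag names are assumptions.
import sys

if __name__ == '__main__':
    # e.g. python main.py --prep_data, or python main.py --train --epochs 200
    main(sys.argv[1:])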
Example #10
    def __init__(self,
                 data_dir,
                 task_id,
                 OOV=False,
                 memory_size=50,
                 train=0,
                 batch_size=32,
                 nn=False):
        self.data_dir = data_dir
        self.task_id = task_id
        self.OOV = OOV
        self.memory_size = memory_size
        self.train = train
        self.batch_size = batch_size
        self.nn = nn
        candidates, self.candid2indx = load_candidates(self.data_dir,
                                                       self.task_id)
        self.n_cand = len(candidates)
        print("Candidate Size", self.n_cand)
        self.indx2candid = dict(
            (self.candid2indx[key], key) for key in self.candid2indx)
        self.trainData, self.testData, self.valData = load_dialog_task(
            self.data_dir, self.task_id, self.candid2indx, self.OOV)
        data = self.trainData + self.testData + self.valData
        self.build_vocab(data, candidates)
        self.candidates_vec = vectorize_candidates(
            candidates, self.word_idx, self.candidate_sentence_size)
        self.params = {
            'n_cand': self.n_cand,
            'indx2candid': self.indx2candid,
            'candid2indx': self.candid2indx,
            'candidates_vec': self.candidates_vec,
            'word_idx': self.word_idx,
            'sentence_size': self.sentence_size,
            'candidate_sentence_size': self.candidate_sentence_size,
            'vocab_size': self.vocab_size
        }

        if self.nn:
            if self.train == 0:
                self.S, self.Q, self.A = vectorize_data(self.trainData,
                                                        self.word_idx,
                                                        self.sentence_size,
                                                        self.batch_size,
                                                        self.n_cand,
                                                        self.memory_size,
                                                        nn=self.nn)
            elif self.train == 1:
                self.S, self.Q, self.A = vectorize_data(self.valData,
                                                        self.word_idx,
                                                        self.sentence_size,
                                                        self.batch_size,
                                                        self.n_cand,
                                                        self.memory_size,
                                                        nn=self.nn)
            elif self.train == 2:
                self.S, self.Q, self.A = vectorize_data(self.testData,
                                                        self.word_idx,
                                                        self.sentence_size,
                                                        self.batch_size,
                                                        self.n_cand,
                                                        self.memory_size,
                                                        nn=self.nn)
        else:
            if self.train == 0:
                self.S, self.Q, self.A = vectorize_data(
                    self.trainData, self.word_idx, self.sentence_size,
                    self.batch_size, self.n_cand, self.memory_size)
            elif self.train == 1:
                self.S, self.Q, self.A = vectorize_data(
                    self.valData, self.word_idx, self.sentence_size,
                    self.batch_size, self.n_cand, self.memory_size)
            elif self.train == 2:
                self.S, self.Q, self.A = vectorize_data(
                    self.testData, self.word_idx, self.sentence_size,
                    self.batch_size, self.n_cand, self.memory_size)
                        sentence_size) + 5  # add some space for testing data
    memory_size = min(FLAGS.memory_size, max_story_size)

    # vectorize data
    trainS, trainQ, trainA, trainID = utils.vectorize_data(
        train, word_idx, sentence_size, FLAGS.batch_size, memory_size,
        cand_idx)
    valS, valQ, valA, valID = utils.vectorize_data(val, word_idx,
                                                   sentence_size,
                                                   FLAGS.batch_size,
                                                   memory_size, cand_idx)
    testS, testQ, testA, testID = utils.vectorize_data(test, word_idx,
                                                       sentence_size,
                                                       FLAGS.batch_size,
                                                       memory_size, cand_idx)
    C, cand_idx, idx_cand = utils.vectorize_candidates(cand_idx, idx_cand,
                                                       word_idx, sentence_size)

    # params
    n_train = np.array(trainS).shape[0]
    n_test = np.array(testS).shape[0]
    n_val = np.array(valS).shape[0]
    tf.set_random_seed(FLAGS.random_state)
    batch_size = FLAGS.batch_size
    batches = zip(range(0, n_train - batch_size, batch_size),
                  range(batch_size, n_train, batch_size))
    batches = [(start, end) for start, end in batches]

    print "input data example: ", train[5]['utter_list'][0]
    print "overall bot utterance candidates: ", len(cand_idx)
    print 'vocab_size', vocab_size
    print "Longest sentence length", sentence_size
    def __init__(self,
                 data_dir,
                 model_dir,
                 task_id,
                 isInteractive=True,
                 OOV=False,
                 memory_size=250,
                 random_state=None,
                 batch_size=32,
                 learning_rate=0.001,
                 epsilon=1e-8,
                 max_grad_norm=40.0,
                 evaluation_interval=10,
                 hops=3,
                 epochs=200,
                 embedding_size=20,
                 alpha=.5,
                 save_vocab=None,
                 load_vocab=None,
                 verbose=False,
                 load_profiles=None,
                 save_profiles=None):

        self.data_dir = data_dir
        self.task_id = task_id
        self.model_dir = model_dir
        # self.isTrain=isTrain
        self.isInteractive = isInteractive
        self.OOV = OOV
        self.memory_size = memory_size
        self.random_state = random_state
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.max_grad_norm = max_grad_norm
        self.evaluation_interval = evaluation_interval
        self.hops = hops
        self.epochs = epochs
        self.embedding_size = embedding_size
        self.save_vocab = save_vocab
        self.load_vocab = load_vocab
        self.verbose = verbose
        self.alpha = alpha

        # Loading possible answers
        self.candidates, self.candid2indx = load_candidates(
            self.data_dir, self.task_id)
        self.n_cand = len(self.candidates)
        print("Candidate Size", self.n_cand)
        self.indx2candid = dict(
            (self.candid2indx[key], key) for key in self.candid2indx)

        # task data
        self.trainData, self.testData, self.valData = load_dialog_task(
            self.data_dir, self.task_id, self.candid2indx, self.OOV)
        data = self.trainData + self.testData + self.valData

        # Find profiles types
        if load_profiles:
            with open(load_profiles, 'rb') as f:
                self._profiles_mapping = pickle.load(f)
        else:
            self._profiles_mapping = generate_profile_encoding(self.trainData)
            if save_profiles:
                with open(save_profiles, 'wb') as f:
                    pickle.dump(self._profiles_mapping, f)

        profiles_idx_set = set(self._profiles_mapping.values())

        print("Profiles:", self._profiles_mapping)

        # Vocabulary
        self.build_vocab(data, self.candidates, self.save_vocab,
                         self.load_vocab)
        # self.candidates_vec=vectorize_candidates_sparse(self.candidates,self.word_idx)
        self.candidates_vec = vectorize_candidates(
            self.candidates, self.word_idx, self.candidate_sentence_size)

        # Model initialisation
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                           epsilon=self.epsilon)
        self.sess = tf.Session()
        self.model = MemN2NDialog(self.batch_size,
                                  self.vocab_size,
                                  self.n_cand,
                                  self.sentence_size,
                                  self.embedding_size,
                                  self.candidates_vec,
                                  profiles_idx_set,
                                  session=self.sess,
                                  hops=self.hops,
                                  max_grad_norm=self.max_grad_norm,
                                  alpha=alpha,
                                  optimizer=optimizer,
                                  task_id=task_id,
                                  verbose=verbose)
        self.saver = tf.train.Saver(max_to_keep=50)

        # self.summary_writer = tf.train.SummaryWriter(self.model.root_dir, self.model.graph_output.graph)
        self.summary_writer = tf.summary.FileWriter(
            self.model.root_dir, self.model.graph_output.graph)