def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file, word_embedding_file,
             train_rate, valid_rate, test_rate, algo_name, charset, mode) :
    """
    Build the style encoder-decoder wrapper.

    Sets three attributes:
        1. conf_dict: configuration of the model.
        2. cr: corpus reader that feeds the data.
        3. model: the network model.
    """
    self.conf_dict = {
        'algo_name': algo_name,
        'batch_size': 96,
        'train_valid_test_rate': [train_rate, valid_rate, test_rate],
        'split_level': 'zi',
        'pre_word_embedding': False,
        'word_embedding_dim': 128,
        'max_sentence_word_num': 150,
        'min_sentence_word_num': 1,
        'is_BEG': False,
        'is_END': True,
        'hidden_dim': 1024,
        'n_style': 4,
        'style_embedding_dim': 128,
        'charset': charset,
        'shuffle': False,
        'save_freq': 100,
    }
    # While training we use the live checkpoint; otherwise the final one.
    suffix = '.model' if mode == 'train' else '.model.final'
    model_dir = os.path.join(dataset_folder, 'model', 'dialog')
    base_name = get_params_file_name(self.conf_dict)
    self.param_path = os.path.join(model_dir, base_name + suffix)
    loaded_params = load_params_val(self.param_path)
    self.conf_path = os.path.join(model_dir, base_name + '.conf')
    save_confs_val(self.conf_dict, self.conf_path)

    # Corpus reader: only training mode reads the raw dataset file.
    self.cr = CorpusReaderDialog(dataset_file=dataset_file if mode == 'train' else None,
                                 stopwords_file=stopwords_file,
                                 dict_file=dict_file,
                                 word_embedding_file=None,
                                 train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
                                 charset=self.conf_dict['charset'],
                                 max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
                                 min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
                                 is_BEG_available=self.conf_dict['is_BEG'],
                                 is_END_available=self.conf_dict['is_END'])
    # Network: vocabulary size comes from the dictionary the reader built.
    self.model = StyleEncoderDecoderNetwork(n_words=len(self.cr.get_word_dictionary()),
                                            hidden_status_dim=self.conf_dict['hidden_dim'],
                                            word_embedding_dim=self.conf_dict['word_embedding_dim'],
                                            n_style=self.conf_dict['n_style'],
                                            style_embedding_dim=self.conf_dict['style_embedding_dim'],
                                            input_params=loaded_params)
                 
                 
                 
                 
예제 #2
0
 def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file, word_embedding_file,
              train_rate, valid_rate, test_rate, algo_name, charset, mode) :
     """
     Build the RNN encoder-decoder wrapper.

     Need to set these attributes.
         1. conf_dict: configuration of the model.
         2. cr: CorpusReader for operating on the data.
         3. model: the network model.
     """
     self.conf_dict = {'algo_name': algo_name, 'batch_size': 256,
                       'train_valid_test_rate': [train_rate, valid_rate, test_rate],
                       'split_level': 'zi', 'pre_word_embedding': False,
                       'word_embedding_dim': 128, 'max_sentence_word_num': 150,
                       'min_sentence_word_num': 1, 'is_BEG': False, 'is_END': True,
                       'hidden_dim': 512, 'charset': charset, 'shuffle': False,
                       'save_freq': 100}
     # Checkpoint to load: the live one while training, the final one otherwise.
     if mode == 'train' :
         self.param_path = \
             os.path.join(dataset_folder, 'model', 'dialog', get_params_file_name(self.conf_dict) + '.model')
     else :
         self.param_path = \
             os.path.join(dataset_folder, 'model', 'dialog', get_params_file_name(self.conf_dict) + '.model.final')
     # BUG FIX: a leftover debug line used to overwrite self.param_path with a
     # hard-coded 'ChoEncoderDecoder_..._.model302' file here, which made the
     # train/test branch above dead code and made the trainer save checkpoints
     # under that debug name.  Removed so the configured path is actually used.
     param_dict = load_params_val(self.param_path)
     self.conf_path = os.path.join(dataset_folder, 'model', 'dialog', get_params_file_name(self.conf_dict) + '.conf')
     # Restored (was commented out): keep the saved configuration in sync with
     # the checkpoint name, as the sibling wrappers do.
     save_confs_val(self.conf_dict, self.conf_path)
     # set corpus reader; only training mode reads the raw dataset file
     self.cr = CorpusReaderDialog(dataset_file=dataset_file if mode == 'train' else None,
                                  stopwords_file=stopwords_file,
                                  dict_file=dict_file,
                                  word_embedding_file=None,
                                  train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
                                  charset=self.conf_dict['charset'],
                                  max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
                                  min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
                                  is_BEG_available=self.conf_dict['is_BEG'],
                                  is_END_available=self.conf_dict['is_END'])
     # set model; vocabulary size comes from the dictionary the reader built
     self.model = RnnEncoderDecoderNetwork(n_words=len(self.cr.get_word_dictionary()),
                                           hidden_status_dim=self.conf_dict['hidden_dim'],
                                           word_embedding_dim=self.conf_dict['word_embedding_dim'],
                                           input_params=param_dict)
예제 #3
0
 def train(self):
     """
     Train a model.

     Runs an (effectively unbounded) epoch loop: each epoch iterates the
     training batches, periodically checkpointing parameters, and finishes
     with a pass over the test set plus a per-epoch checkpoint.  On a NaN
     training error the current (broken) parameters are dumped to a
     '.error' file and the process exits.
     """
     n_trainset, n_validset, n_testset = self.cr.get_size()
     # Python 2 integer division: ceil(n_trainset / batch_size), at least 1.
     n_batches = max(1, (n_trainset - 1) / self.conf_dict['batch_size'] + 1)
     # print model information
     print '#' * 100
     self.print_model_info()
     print ('Compiling model...')
     train_model = \
         self.model.get_training_function(self.cr, batch_size=self.conf_dict['batch_size'])
     valid_model = self.model.get_validing_function(self.cr, batch_size=self.conf_dict['batch_size'])
     test_model = self.model.get_testing_function(self.cr, batch_size=self.conf_dict['batch_size'])
     # start train: baseline test error before any training step
     print '#' * 100
     print ('Start to train.') 
     test_error = test_model()[0]
     print ('Now testing model. Cost Error: %.10f' % (test_error))
     # it counts individual training steps across epochs (drives save_freq).
     epoch, it, n_epochs = 0, 0, 1000000
     while (epoch < n_epochs):
         epoch += 1
         for i in xrange(n_batches):
             # train model
             train_error = train_model(i)[0]
             # print 'Step error: %f\r' % train_error,
             if math.isnan(train_error):
                 # Diverged: snapshot the broken parameters for inspection,
                 # roll back to the last good checkpoint, then abort.
                 print ('Train error is NaN in iteration %d, batch %d' % (epoch, i))
                 error_model_path = self.param_path + str(epoch) + '.error' 
                 save_params_val(error_model_path, self.model.get_parameters())
                 # NOTE(review): this message formats save_freq into the
                 # "%d batch" slot — presumably the batch index i was meant;
                 # and despite saying "skip", exit() ends the run. Confirm.
                 print ('model saved in %s , reload and skip the %d batch.' % \
                     (error_model_path, self.conf_dict['save_freq']))
                 param_dict = load_params_val(self.param_path)
                 self.model.set_parameters(param_dict)
                 exit()
             if it % self.conf_dict['save_freq'] == 0:
                 # valid model
                 # valid_error = valid_model()[0]
                 print ('@iter: %d\tTraining Error: %.10f' % (it, train_error))
                 print self.param_path
                 save_params_val(self.param_path, self.model.get_parameters())
             it = it + 1
         # test model
         # print ('Finished a epoch.')
         test_error = test_model()[0]
         print ('Now testing model. Testing Error: %.10f' % (test_error))
         # Per-epoch checkpoint under a numbered file name.
         save_params_val(self.param_path + str(epoch), self.model.get_parameters())
 def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file, word_embedding_file,
              train_rate, valid_rate, test_rate, algo_name, charset, mode) :
     """
     Build the bidirectional-encoder attention-decoder wrapper.

     Sets three attributes:
         1. conf_dict: configuration of the model.
         2. cr: corpus reader that feeds the data.
         3. model: the network model.
     In 'test_all' mode it additionally collects one model per numbered
     checkpoint found in the model directory (model_sets / model_paths).
     """
     self.conf_dict = {
         'algo_name': algo_name,
         'batch_size': 128,
         'train_valid_test_rate': [train_rate, valid_rate, test_rate],
         'split_level': 'zi',
         'pre_word_embedding': False,
         'word_embedding_dim': 128,
         'max_sentence_word_num': 150,
         'min_sentence_word_num': 1,
         'is_BEG': False,
         'is_END': True,
         'hidden_dim': 1024,
         'charset': charset,
         'shuffle': False,
         'save_freq': 100,
     }
     model_dir = os.path.join(dataset_folder, 'model', 'dialog')
     base_name = get_params_file_name(self.conf_dict)
     # While training we use the live checkpoint; otherwise the final one.
     suffix = '.model' if mode == 'train' else '.model.final'
     self.param_path = os.path.join(model_dir, base_name + suffix)
     params = load_params_val(self.param_path)
     self.conf_path = os.path.join(model_dir, base_name + '.conf')
     save_confs_val(self.conf_dict, self.conf_path)
     # Corpus reader: only training mode reads the raw dataset file.
     self.cr = CorpusReaderDialog(dataset_file=dataset_file if mode == 'train' else None,
                                  stopwords_file=stopwords_file,
                                  dict_file=dict_file,
                                  word_embedding_file=None,
                                  train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
                                  charset=self.conf_dict['charset'],
                                  max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
                                  min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
                                  is_BEG_available=self.conf_dict['is_BEG'],
                                  is_END_available=self.conf_dict['is_END'])
     # Primary network, initialized from the selected checkpoint.
     self.model = BiEncoderAttentionDecoderNetwork(n_words=len(self.cr.get_word_dictionary()),
                                                   hidden_status_dim=self.conf_dict['hidden_dim'],
                                                   word_embedding_dim=self.conf_dict['word_embedding_dim'],
                                                   input_params=params)
     # In 'test_all' mode, build one model per numbered epoch checkpoint.
     if mode == 'test_all' :
         self.model_sets, self.model_paths = list(), list()
         for parent, dirnames, filenames in os.walk(model_dir) :
             for filename in filenames :
                 if re.match(algo_name + '_(\w+).model[0-9]+', filename) is not None :
                     self.model_paths.append(filename)
                     checkpoint = load_params_val(os.path.join(model_dir, filename))
                     snapshot = BiEncoderAttentionDecoderNetwork(n_words=len(self.cr.get_word_dictionary()),
                                                                 hidden_status_dim=self.conf_dict['hidden_dim'],
                                                                 word_embedding_dim=self.conf_dict['word_embedding_dim'],
                                                                 input_params=checkpoint)
                     self.model_sets.append(snapshot)
예제 #5
0
    def train(self):
        """
        Train a model.

        Optionally shuffles the corpus, then runs an (effectively unbounded)
        epoch loop: each epoch iterates the training batches, periodically
        validating and checkpointing, then saves a per-epoch checkpoint and
        reports three test-set error aggregates (per-character, per-sentence,
        per-batch averages).  On a NaN training error the broken parameters
        are dumped to a '.error' file, the offending batch's samples are
        printed, the last good checkpoint is reloaded, and the process exits.
        """
        if self.conf_dict['shuffle'] :
            self.cr.shuffle()  # The data may be shuffle by some implement of CorpusReader, but NOT ALL.
        n_train_set, n_valid_set, n_test_set = self.cr.get_size()
        # Python 2 integer division: ceil(n_train_set / batch_size).
        n_batches = (n_train_set - 1) / self.conf_dict['batch_size'] + 1
        print 'Compiling model'
        train_model = self.model.get_training_function(self.cr, batch_size=self.conf_dict['batch_size'],
                                                       batch_repeat=1)
        valid_model = self.model.get_validing_function(self.cr)
        test_model, pr_model = self.model.get_testing_function(self.cr)
        
        # Baseline test errors before any training step.
        print ('Start to train.') 
        zae, sae, bae = test_model()
        print ('Now testing model. Test Zi Average Error: %s' % (str(zae)))
        print ('Now testing model. Test Sentence Average Error: %s' % (str(sae)))
        print ('Now testing model. Test Batch Average Error: %s' % (str(bae)))
        epoch = 0
        n_epochs = 1000000
        # it counts individual training steps across epochs (drives save_freq).
        it = 0
        while (epoch < n_epochs):
            epoch += 1
            for i in xrange(n_batches):
                # train model
                train_error = train_model(i)[0]
                # print 'Step error: %f\r' % train_error,
                
                if math.isnan(train_error):
                    # Diverged: snapshot broken parameters, print the samples
                    # of the offending batch, roll back, then abort.
                    print 'Train error is NaN in iteration %d' % i
                    error_model_path = self.param_path + str(epoch) + '.error' 
                    save_params_val(error_model_path, self.model.get_parameters())
                    # NOTE(review): the "%d batch" slot is filled with
                    # save_freq — presumably the batch index i was meant.
                    print 'model saved in %s , reload and skip the %d batch' % (error_model_path, self.conf_dict['save_freq'])
                    # Index range of the batch that produced the NaN.
                    scope = (self.conf_dict['batch_size'] * i, self.conf_dict['batch_size'] * (i + 1))
                    for j in range(scope[0], scope[1]):
                        print '%s\t%s' % (self.cr.train_set[0][j], self.cr.train_set[1][j])
                    param_dict = load_params_val(self.param_path)
                    self.model.set_parameters(param_dict)
                    exit()
#                     continue
                if it % self.conf_dict['save_freq'] == 0:
                    valid_error = valid_model()[0]
                    # valid model
                    print ('@iter: %s\tTraining Error: %s\tValid Error: %s.' % 
                                 (it, str(train_error), str(valid_error)))
                    # Save model parameters
                    print ('Saving parameters to %s.' % (self.param_path))
                    save_params_val(self.param_path, self.model.get_parameters())
                it = it + 1
            # test model: per-epoch checkpoint under a numbered file name,
            # then the three test-set error aggregates.
            print 'Finished a epoch.'
            save_params_val(self.param_path + str(epoch), self.model.get_parameters())
            zae, sae, bae = test_model()
            print ('Now testing model. Test Zi Average Error: %s' % (str(zae)))
            print ('Now testing model. Test Sentence Average Error: %s' % (str(sae)))
            print ('Now testing model. Test Batch Average Error: %s' % (str(bae)))
            '''
            pr_error = pr_model()[0]
            test_pr = 0.0
            n_samples = config.globalTestPRSamples()
            n_data = pr_error.shape[0] / n_samples
            trues = [0.0] * (n_samples / 2) + [1.0] * (n_samples / 2)
            for i in range(n_data) :
                test_pr += pearsonr(trues, list(pr_error[i * n_samples:(i + 1) * n_samples]))[0]
            test_pr /= n_data
            print ('Now testing model. Test PR: %s' % (str(test_pr)))
            '''
            print ('\n')
예제 #6
0
# Debug script: load a saved parameter file and dump it to stdout.
# The commented-out helpers below were used to scan the loaded values for
# NaNs and to re-pickle the parameters; kept for reference.
import cPickle
import math

from deep.util.parameter_operation import load_params_val

# Path of the final checkpoint to inspect (relative to the working directory).
p = load_params_val('data/human/model/dialog/StyleEncoderDecoderMulti_b498fc540bd33b3d2b33077573e8efe6.model.final')

# def check(l):
#     if isinstance(l,list):
#         for i in l:
#             c = check(i)
#             if c:
#                 return c
#         return False
#     else:
#         return math.isnan(l)
# for k,v in p.items():
#     print k, check(v.tolist())
print p
# with open('error', 'wb') as fw:
#     cPickle.dump(p, fw)  
 def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file,
              word_embedding_file, train_rate, valid_rate, test_rate,
              algo_name, charset, mode):
     """
     Need to set these attributes.
         1. conf_dict: configuration of the model.
         2. cr: CorpursReader for operate data.
         3. model: the network model.
     """
     self.conf_dict = {
         'algo_name': algo_name,
         'batch_size': 256,
         'train_valid_test_rate': [train_rate, valid_rate, test_rate],
         'split_level': 'zi',
         'pre_word_embedding': False,
         'word_embedding_dim': 128,
         'n_topics': 5000,
         'topic_embedding_dim': 256,
         'max_sentence_word_num': 150,
         'min_sentence_word_num': 1,
         'is_BEG': False,
         'is_END': True,
         'hidden_dim': 512,
         'charset': charset,
         'shuffle': False,
         'save_freq': 100
     }
     self.param_path = os.path.join(
         dataset_folder, 'model', 'dialog',
         get_params_file_name(self.conf_dict) + '.model')
     print self.param_path
     #self.param_path ='ChoEncoderDecoderTopic_5908276eb2ae513520ca72135e5b82d0.model83'
     #self.param_path='ChoEncoderDecoderDT_4575b6c5893c10a009e29b6eb2988387.model42'
     #self.param_path='ChoEncoderDecoderDT_cc7f5ed5d9e9fe5a90a012f4e017106a.model'
     param_dict = load_params_val(self.param_path)
     self.conf_path = os.path.join(
         dataset_folder, 'model', 'dialog',
         get_params_file_name(self.conf_dict) + '.conf')
     save_confs_val(self.conf_dict, self.conf_path)
     # set corpus reader
     if mode == 'train':
         self.cr = CorpusReaderDialogTopic(
             dataset_file=dataset_file,
             stopwords_file=stopwords_file,
             dict_file=dict_file,
             word_embedding_file=None,
             train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
             charset=self.conf_dict['charset'],
             max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
             min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
             is_BEG_available=self.conf_dict['is_BEG'],
             is_END_available=self.conf_dict['is_END'])
     else:
         self.cr = CorpusReaderDialogTopic(
             dataset_file=None,
             stopwords_file=stopwords_file,
             dict_file=dict_file,
             word_embedding_file=None,
             train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
             charset=self.conf_dict['charset'],
             max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
             min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
             is_BEG_available=self.conf_dict['is_BEG'],
             is_END_available=self.conf_dict['is_END'])
     # set model
     self.model = RnnEncoderDecoderNetwork(
         n_words=len(self.cr.get_word_dictionary()),
         hidden_status_dim=self.conf_dict['hidden_dim'],
         word_embedding_dim=self.conf_dict['word_embedding_dim'],
         n_topics=self.conf_dict['n_topics'],
         topic_embedding_dim=self.conf_dict['topic_embedding_dim'],
         input_params=param_dict)