def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file,
             word_embedding_file, train_rate, valid_rate, test_rate,
             algo_name, charset, mode):
    """
    Build the configuration, corpus reader and network of this manager.

    Attributes set here:
    1. conf_dict: configuration of the model.
    2. param_path / conf_path: files under <dataset_folder>/model/dialog
       holding the parameters and the saved configuration.
    3. cr: CorpusReader used to operate on data.
    4. model: the network model.

    When mode is 'train' the parameter file ends with '.model' and the
    corpus reader receives dataset_file; for any other mode the file ends
    with '.model.final' and the reader receives dataset_file=None.
    word_embedding_file is accepted but not used in this method: the
    reader is always given word_embedding_file=None.
    """
    training = (mode == 'train')

    self.conf_dict = {
        'algo_name': algo_name,
        'batch_size': 128,
        'train_valid_test_rate': [train_rate, valid_rate, test_rate],
        'split_level': 'zi',
        'pre_word_embedding': False,
        'word_embedding_dim': 128,
        'max_sentence_word_num': 150,
        'min_sentence_word_num': 1,
        'is_BEG': False,
        'is_END': True,
        'hidden_dim': 1024,
        'n_style': 4,
        'style_embedding_dim': 128,
        'n_topic': 500,
        'topic_embedding_dim': 128,
        'charset': charset,
        'shuffle': False,
        'save_freq': 200,
    }
    conf = self.conf_dict
    model_dir = os.path.join(dataset_folder, 'model', 'dialog')

    # Parameters: only the file suffix depends on the mode.
    param_suffix = '.model' if training else '.model.final'
    self.param_path = os.path.join(
        model_dir, get_params_file_name(conf) + param_suffix)
    param_dict = load_params_val(self.param_path)

    # Persist the configuration next to the parameters.
    self.conf_path = os.path.join(
        model_dir, get_params_file_name(conf) + '.conf')
    save_confs_val(conf, self.conf_path)

    # set corpus reader (the dataset is only handed over when training)
    reader_args = dict(
        dataset_file=dataset_file if training else None,
        stopwords_file=stopwords_file,
        dict_file=dict_file,
        word_embedding_file=None,
        train_valid_test_rate=conf['train_valid_test_rate'],
        charset=conf['charset'],
        max_sentence_word_num=conf['max_sentence_word_num'],
        min_sentence_word_num=conf['min_sentence_word_num'],
        is_BEG_available=conf['is_BEG'],
        is_END_available=conf['is_END'],
    )
    self.cr = CorpusReaderDialogTopic(**reader_args)

    # set model
    vocabulary_size = len(self.cr.get_word_dictionary())
    self.model = StyleEncoderDecoderNetwork(
        n_words=vocabulary_size,
        hidden_status_dim=conf['hidden_dim'],
        word_embedding_dim=conf['word_embedding_dim'],
        n_style=conf['n_style'],
        style_embedding_dim=conf['style_embedding_dim'],
        n_topic=conf['n_topic'],
        topic_embedding_dim=conf['topic_embedding_dim'],
        input_params=param_dict)
class StyleEncoderDecoderGRUMixedTopic(ModelManager):
    """
    Manager wiring a CorpusReaderDialogTopic to a StyleEncoderDecoderNetwork.

    Besides construction it offers `generate` (topic/style conditioned
    reply generation through beam search) and `style_chaos` (entropy of
    the style distribution of each input line).
    """

    def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file,
                 word_embedding_file, train_rate, valid_rate, test_rate,
                 algo_name, charset, mode):
        """
        Build the configuration, corpus reader and network of this manager.

        Attributes set here:
        1. conf_dict: configuration of the model.
        2. param_path / conf_path: files under <dataset_folder>/model/dialog
           holding the parameters and the saved configuration.
        3. cr: CorpusReader used to operate on data.
        4. model: the network model.

        When mode is 'train' the parameter file ends with '.model' and the
        corpus reader receives dataset_file; for any other mode the file
        ends with '.model.final' and the reader receives dataset_file=None.
        word_embedding_file is accepted but not used in this method: the
        reader is always given word_embedding_file=None.
        """
        training = (mode == 'train')

        self.conf_dict = {
            'algo_name': algo_name,
            'batch_size': 128,
            'train_valid_test_rate': [train_rate, valid_rate, test_rate],
            'split_level': 'zi',
            'pre_word_embedding': False,
            'word_embedding_dim': 128,
            'max_sentence_word_num': 150,
            'min_sentence_word_num': 1,
            'is_BEG': False,
            'is_END': True,
            'hidden_dim': 1024,
            'n_style': 4,
            'style_embedding_dim': 128,
            'n_topic': 500,
            'topic_embedding_dim': 128,
            'charset': charset,
            'shuffle': False,
            'save_freq': 200,
        }
        conf = self.conf_dict
        model_dir = os.path.join(dataset_folder, 'model', 'dialog')

        # Parameters: only the file suffix depends on the mode.
        param_suffix = '.model' if training else '.model.final'
        self.param_path = os.path.join(
            model_dir, get_params_file_name(conf) + param_suffix)
        param_dict = load_params_val(self.param_path)

        # Persist the configuration next to the parameters.
        self.conf_path = os.path.join(
            model_dir, get_params_file_name(conf) + '.conf')
        save_confs_val(conf, self.conf_path)

        # set corpus reader (the dataset is only handed over when training)
        self.cr = CorpusReaderDialogTopic(
            dataset_file=dataset_file if training else None,
            stopwords_file=stopwords_file,
            dict_file=dict_file,
            word_embedding_file=None,
            train_valid_test_rate=conf['train_valid_test_rate'],
            charset=conf['charset'],
            max_sentence_word_num=conf['max_sentence_word_num'],
            min_sentence_word_num=conf['min_sentence_word_num'],
            is_BEG_available=conf['is_BEG'],
            is_END_available=conf['is_END'])

        # set model
        self.model = StyleEncoderDecoderNetwork(
            n_words=len(self.cr.get_word_dictionary()),
            hidden_status_dim=conf['hidden_dim'],
            word_embedding_dim=conf['word_embedding_dim'],
            n_style=conf['n_style'],
            style_embedding_dim=conf['style_embedding_dim'],
            n_topic=conf['n_topic'],
            topic_embedding_dim=conf['topic_embedding_dim'],
            input_params=param_dict)

    @staticmethod
    def _negative_log(probability):
        """Return -log(probability), or +inf when it is not positive.

        math.log raises ValueError for arguments <= 0; a candidate with no
        probability mass gets an infinite cost instead of aborting the run.
        """
        if probability > 0:
            return -math.log(probability)
        return float('inf')

    @staticmethod
    def _entropy(probabilities):
        """Return sum(-p * log(p)) over probabilities, skipping p <= 0.

        Zero entries contribute nothing (the limit of p*log(p) at 0 is 0)
        and would otherwise make math.log raise ValueError.
        """
        entropy = 0
        for p in probabilities:
            if p > 0:
                entropy -= p * math.log(p)
        return entropy

    @staticmethod
    def _make_distribution_function(deploy_model, media_data, topic, style):
        """Return the callback handed to beam_search for one (topic, style).

        The values are bound when the callback is created, so it keeps
        working even if it is invoked after the caller's loop has advanced.
        """
        def distribution_calculate(question, question_mask, answer, answer_mask):
            # One copy of the topic id per column of `question`
            # (presumably one per sample of the batch -- TODO confirm).
            topic_vector = numpy.concatenate(
                [numpy.array([topic], dtype='int64')] * question.shape[1],
                axis=0)
            return deploy_model(question, question_mask, answer, answer_mask,
                                media_data, topic_vector, style)
        return distribution_calculate

    def generate(self, input_file, output_file):
        """
        Generate replies with topic and style modeling for each input line.

        For every line of input_file (decoded with config.globalCharSet()):
        1. echo the line to stdout and to output_file;
        2. take the k=10 most probable topics, and for each of them the
           probability of every style;
        3. run beam_search for the k (topic, style) pairs with the largest
           topic-probability * style-probability product;
        4. add -log(topic_prob * style_prob) to each beam score, sort in
           ascending order and write the first 5 results.
        Each group of results is followed by an empty line in output_file.
        """
        k = 10
        charset = config.globalCharSet()
        topic_distribution_function = self.model.get_topic_distribution_function()
        style_distribution_function = self.model.get_style_distribution_function()
        deploy_model = self.model.get_deploy_function()
        style_number = self.conf_dict['n_style']
        with codecs.open(input_file, 'r', charset) as fo:
            with open(output_file, 'w') as fw:
                for line in fo.readlines():
                    # line_word, line_zi = SegProcess(line.strip())
                    # line = line_zi.decode("gb18030")
                    line = line.strip()
                    encoded_line = line.encode(charset)
                    print(encoded_line)
                    fw.write('%s\n' % encoded_line)

                    (question, question_mask) = self.cr.transform_input_data(line)
                    # Drop the last position of the input and of its mask
                    # (presumably the END marker -- TODO confirm).
                    question = question[:-1]
                    question_mask = question_mask[:-1]
                    media_data, topic_distribution = \
                        topic_distribution_function(question, question_mask)
                    sorted_topics = sorted(enumerate(topic_distribution[0]),
                                           key=lambda x: x[1], reverse=True)

                    all_prob = list()
                    all_res = list()
                    for topic, prob in sorted_topics[0:k]:
                        # The style distribution depends on the topic only,
                        # so it is evaluated once per topic, not per style.
                        style_distribution = style_distribution_function(
                            question, question_mask,
                            numpy.array([topic], dtype='int64'))[0]
                        for style in range(style_number):
                            all_prob.append(
                                [topic, style, prob, style_distribution[0][style]])

                    best_pairs = sorted(all_prob, key=lambda x: x[2] * x[3],
                                        reverse=True)[0:k]
                    for topic, style, tp, sp in best_pairs:
                        distribution_calculate = self._make_distribution_function(
                            deploy_model, media_data, topic, style)
                        res, score = beam_search(line, self.cr, distribution_calculate,
                                                 beam_size=5, search_scope=5)
                        penalty = self._negative_log(tp * sp)
                        for idx in range(len(res)):
                            all_res.append([res[idx], score[idx] + penalty])

                    # Lower combined cost is better.
                    all_res = sorted(all_res, key=lambda x: x[1], reverse=False)
                    print(all_res)
                    res = [(' '.join(self.cr.transform_input_text(s[0])), s[1])
                           for s in all_res[0:5]]
                    for r, s in res:
                        encoded_result = r.encode(charset)
                        print('result: %s, score: %f' % (encoded_result, s))
                        fw.write('result: %s, score: %f\n' % (encoded_result, s))
                    fw.write('\n')

    def style_chaos(self, input_file, output_file):
        """
        Compute the chaos (entropy of the style distribution) of the questions.

        For every line of input_file one line is written to the file named
        output_file + 'chaos': the input line, the entropy and the style
        probabilities, separated by tabs. The same values are echoed to
        stdout. Probabilities that are not positive are ignored in the sum.
        """
        charset = config.globalCharSet()
        get_media_data_function = self.model.get_media_data_function()
        with codecs.open(input_file, 'r', charset) as fo:
            with open(output_file + 'chaos', 'w') as fw:
                for line in fo.readlines():
                    # line_word, line_zi = SegProcess(line.strip())
                    # line = line_zi.decode("gb18030")
                    line = line.strip()
                    encoded_line = line.encode(charset)
                    print(encoded_line)
                    fw.write('%s\t' % encoded_line)

                    (question, question_mask) = self.cr.transform_input_data(line)
                    # Drop the last position of the input and of its mask
                    # (presumably the END marker -- TODO confirm).
                    question = question[:-1]
                    question_mask = question_mask[:-1]
                    _, style_distribution = get_media_data_function(question, question_mask)
                    st = style_distribution.tolist()[0]
                    chaos = self._entropy(st)

                    # Same text the former `print chaos, '\t', str(st)` produced.
                    print('%s \t%s' % (chaos, str(st)))
                    output = '%f\t%s\n' % (chaos, str(st))
                    fw.write(output.encode(charset))
class RnnEncoderDecoder(ModelManager):
    """Manager wiring a CorpusReaderDialogTopic to a RnnEncoderDecoderNetwork."""

    def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file,
                 word_embedding_file, train_rate, valid_rate, test_rate,
                 algo_name, charset, mode):
        """
        Build the configuration, corpus reader and network of this manager.

        Attributes set here:
        1. conf_dict: configuration of the model.
        2. param_path / conf_path: files under <dataset_folder>/model/dialog
           holding the parameters ('.model') and the saved configuration
           ('.conf'). The parameter path is printed to stdout.
        3. cr: CorpusReader used to operate on data.
        4. model: the network model.

        The corpus reader receives dataset_file only when mode is 'train';
        otherwise it is given dataset_file=None. The parameter path does
        not depend on mode. word_embedding_file is accepted but not used
        in this method: the reader is always given word_embedding_file=None.
        """
        self.conf_dict = {
            'algo_name': algo_name,
            'batch_size': 256,
            'train_valid_test_rate': [train_rate, valid_rate, test_rate],
            'split_level': 'zi',
            'pre_word_embedding': False,
            'word_embedding_dim': 128,
            'n_topics': 5000,
            'topic_embedding_dim': 256,
            'max_sentence_word_num': 150,
            'min_sentence_word_num': 1,
            'is_BEG': False,
            'is_END': True,
            'hidden_dim': 512,
            'charset': charset,
            'shuffle': False,
            'save_freq': 100,
        }
        conf = self.conf_dict
        model_dir = os.path.join(dataset_folder, 'model', 'dialog')

        self.param_path = os.path.join(
            model_dir, get_params_file_name(conf) + '.model')
        print(self.param_path)
        # Manual overrides of param_path that were left disabled here:
        #   'ChoEncoderDecoderTopic_5908276eb2ae513520ca72135e5b82d0.model83'
        #   'ChoEncoderDecoderDT_4575b6c5893c10a009e29b6eb2988387.model42'
        #   'ChoEncoderDecoderDT_cc7f5ed5d9e9fe5a90a012f4e017106a.model'
        param_dict = load_params_val(self.param_path)

        # Persist the configuration next to the parameters.
        self.conf_path = os.path.join(
            model_dir, get_params_file_name(conf) + '.conf')
        save_confs_val(conf, self.conf_path)

        # set corpus reader (the dataset is only handed over when training)
        reader_args = dict(
            dataset_file=dataset_file if mode == 'train' else None,
            stopwords_file=stopwords_file,
            dict_file=dict_file,
            word_embedding_file=None,
            train_valid_test_rate=conf['train_valid_test_rate'],
            charset=conf['charset'],
            max_sentence_word_num=conf['max_sentence_word_num'],
            min_sentence_word_num=conf['min_sentence_word_num'],
            is_BEG_available=conf['is_BEG'],
            is_END_available=conf['is_END'],
        )
        self.cr = CorpusReaderDialogTopic(**reader_args)

        # set model
        vocabulary_size = len(self.cr.get_word_dictionary())
        self.model = RnnEncoderDecoderNetwork(
            n_words=vocabulary_size,
            hidden_status_dim=conf['hidden_dim'],
            word_embedding_dim=conf['word_embedding_dim'],
            n_topics=conf['n_topics'],
            topic_embedding_dim=conf['topic_embedding_dim'],
            input_params=param_dict)