def generate(self, input_file, output_file):
    """
    Decode every line of ``input_file`` and report the candidates.

    Each input line is stripped, echoed to stdout and to ``output_file``,
    and handed to ``beam_search`` together with ``self.cr`` and the deploy
    function obtained from ``self.model``. Every returned candidate is
    converted with ``self.cr.transform_input_text``, joined with single
    spaces, and reported with its score on stdout and in ``output_file``.

    Args:
        input_file: path of the text file to read; it is decoded with
            ``config.globalCharSet()``.
        output_file: path of the report file. It is opened in plain
            ``'w'`` mode, so all text is encoded before being written.
    """
    charset = config.globalCharSet()
    deploy_model = self.model.get_deploy_function()
    with open(output_file, 'w') as fw:
        with codecs.open(input_file, 'r', charset) as fo:
            for line in fo.readlines():
                line = line.strip()
                encoded_line = line.encode(charset)
                print(encoded_line)
                fw.write('%s\n' % encoded_line)

                res, score = beam_search(line, self.cr, deploy_model,
                                         beam_size=200, search_scope=200)
                print(res)
                res = [' '.join(self.cr.transform_input_text(s)) for s in res]
                for r, s in zip(res, score):
                    # Encode the candidate explicitly, as the sibling
                    # generate() variants do: the output file is a byte
                    # stream and must not rely on an implicit conversion.
                    report = 'result: %s, score: %f.' % (r.encode(charset), s)
                    print(report)
                    fw.write(report + '\n')
def generate(self, input_file, output_file): """ Generate a model. """ deploy_model = self.model.get_deploy_function() observe_model = self.model.get_observe_function() with open(output_file, 'w') as fw: with codecs.open(input_file, 'r', config.globalCharSet()) as fo: for line in fo.readlines() : # line_word, line_zi = SegProcess(line.strip()) # line = line_zi.decode("gb18030") line = line.strip() print (line.encode(config.globalCharSet())) fw.writelines('%s\n' % line.encode(config.globalCharSet())) sentence, score = beam_search(line, self.cr, deploy_model, beam_size=50, search_scope=50) print sentence res = [' '.join(self.cr.transform_input_text(s)) for s in sentence] for r, st, s in zip(res, sentence, score)[0:5] : (question, question_mask) = self.cr.transform_input_data(line) (tanswer, tanswer_mask) = ([[i] for i in st], [[i] for i in [1]*len(st)]) [alpha] = observe_model(question[:-1,:], question_mask[:-1,:], tanswer, tanswer_mask) print ('result: %s, score: %f' % (r.encode(config.globalCharSet()), s)) for row in range(alpha.shape[0]) : for col in range(alpha.shape[1]) : print alpha[row, col, 0, 0], print fw.writelines('result: %s, score: %f\n' % (r.encode(config.globalCharSet()), s)) fw.writelines('\n')
def generate(self, input_file, output_file):
    """
    Decode each input line and re-rank candidates with the reverse model.

    Every stripped line of ``input_file`` is echoed to stdout and to
    ``output_file`` and passed to ``beam_search`` (deploy function of
    ``self.model``). Each candidate, with its first and last id removed,
    is turned into text and scored by the evaluation function of
    ``self.reverse_model``, with the candidate as the question side and
    the input line as the answer side. Candidates are printed in
    descending order of forward score plus reverse score.

    Only the echoed input line and a trailing blank line are written to
    ``output_file``; the ranked results go to stdout only.

    Args:
        input_file: path of the input text, decoded with
            ``config.globalCharSet()``.
        output_file: path of the report file (plain ``'w'`` mode).
    """
    forward_fn = self.model.get_deploy_function()
    reverse_fn = self.reverse_model.get_evaluation_function()
    with open(output_file, 'w') as fw:
        with codecs.open(input_file, 'r', config.globalCharSet()) as fo:
            for line in fo.readlines():
                line = line.strip()
                raw_line = line.encode(config.globalCharSet())
                print(raw_line)
                fw.write('%s\n' % raw_line)

                res, score = beam_search(line, self.cr, forward_fn,
                                         beam_size=10, search_scope=10,
                                         output_size=20)
                print(res)
                texts = [' '.join(self.cr.transform_input_text(ids[1:-1]))
                         for ids in res]

                rescored = []
                for text, forward_score in zip(texts, score):
                    question, question_mask = \
                        self.cr.transform_input_data(text)
                    answer, answer_mask = self.cr.transform_input_data(line)
                    # The last row of the candidate is moved to the front
                    # of the answer side before the reverse model is run.
                    answer = np.concatenate([question[-1:], answer], axis=0)
                    answer_mask = np.concatenate(
                        [question_mask[-1:], answer_mask], axis=0)
                    sae, _, _ = reverse_fn(question[:-1, :],
                                           question_mask[:-1, :],
                                           answer, answer_mask)
                    rescored.append([text, forward_score, sae])

                rescored.sort(key=lambda item: item[1] + item[2],
                              reverse=True)
                for text, forward_score, reverse_score in rescored:
                    print('result: %s, score: %f, %f' % (
                        text.encode(config.globalCharSet()),
                        forward_score, reverse_score))
                fw.write('\n')
def generate_one_question(self, question, deploy_model, output_size=50):
    """
    Run ``beam_search`` for one question and return readable candidates.

    Args:
        question: the question passed unchanged to ``beam_search``.
        deploy_model: the decoding function passed to ``beam_search``.
        output_size: forwarded to ``beam_search`` as ``output_size``.

    Returns:
        A pair ``(texts, score)``: ``texts`` holds one space-joined string
        per candidate (via ``self.cr.transform_input_text``) and ``score``
        is the score object returned by ``beam_search``.
    """
    candidates, score = beam_search(question, self.cr, deploy_model,
                                    beam_size=200, search_scope=200,
                                    output_size=output_size)
    print(candidates)
    texts = []
    for ids in candidates:
        texts.append(' '.join(self.cr.transform_input_text(ids)))
    return texts, score
def generate(self, input_file, output_file):
    """
    Decode each input line under its most probable (topic, style) pairs.

    For every stripped line of ``input_file``:

    1. the topic distribution function of ``self.model`` is evaluated on
       the question (last row removed);
    2. for the ``k`` = 10 most probable topics the style distribution is
       evaluated, and every (topic, style) pair is weighted by
       ``topic probability * style probability``;
    3. ``beam_search`` is run for the ``k`` best pairs, and each candidate
       score has ``log(topic probability * style probability)`` subtracted;
    4. the five candidates with the lowest adjusted score are printed and
       written to ``output_file``, followed by a blank line.

    Args:
        input_file: path of the input text, decoded with
            ``config.globalCharSet()``.
        output_file: path of the report file (plain ``'w'`` mode).

    Raises:
        ValueError: from ``math.log`` if a selected pair has a zero
            probability product and ``beam_search`` returns candidates.
    """
    k = 10
    charset = config.globalCharSet()
    topic_distribution_function = \
        self.model.get_topic_distribution_function()
    style_distribution_function = \
        self.model.get_style_distribution_function()
    deploy_model = self.model.get_deploy_function()
    style_number = self.conf_dict['n_style']

    with codecs.open(input_file, 'r', charset) as fo:
        with open(output_file, 'w') as fw:
            for line in fo.readlines():
                line = line.strip()
                encoded_line = line.encode(charset)
                print(encoded_line)
                fw.write('%s\n' % encoded_line)

                question, question_mask = self.cr.transform_input_data(line)
                question = question[:-1]
                question_mask = question_mask[:-1]
                media_data, topic_distribution = \
                    topic_distribution_function(question, question_mask)
                sorted_topics = sorted(enumerate(topic_distribution[0]),
                                       key=lambda x: x[1], reverse=True)

                all_prob = []
                for topic, prob in sorted_topics[0:k]:
                    # The arguments depend on the topic only, so evaluate
                    # once per topic instead of once per (topic, style).
                    style_distribution = style_distribution_function(
                        question, question_mask,
                        numpy.array([topic], dtype='int64'))[0]
                    for style in range(style_number):
                        all_prob.append([topic, style, prob,
                                         style_distribution[0][style]])

                all_res = []
                best_pairs = sorted(all_prob, key=lambda x: x[2] * x[3],
                                    reverse=True)[0:k]
                for topic, style, tp, sp in best_pairs:
                    # Called by beam_search within this iteration, so the
                    # closure sees the current topic and style.
                    def distribution_calculate(question, question_mask,
                                               answer, answer_mask):
                        topic_vector = numpy.concatenate(
                            [numpy.array([topic], dtype='int64')]
                            * question.shape[1], axis=0)
                        return deploy_model(question, question_mask,
                                            answer, answer_mask,
                                            media_data, topic_vector, style)

                    res, score = beam_search(line, self.cr,
                                             distribution_calculate,
                                             beam_size=5, search_scope=5)
                    for candidate, candidate_score in zip(res, score):
                        all_res.append(
                            [candidate,
                             candidate_score - math.log(tp * sp)])

                # Ascending order: the lowest adjusted score comes first.
                all_res = sorted(all_res, key=lambda x: x[1], reverse=False)
                print(all_res)
                res = [(' '.join(self.cr.transform_input_text(s[0])), s[1])
                       for s in all_res[0:5]]
                for r, s in res:
                    report = 'result: %s, score: %f' % (r.encode(charset), s)
                    print(report)
                    fw.write(report + '\n')
                fw.write('\n')
def generate_one_question(self, question, media_function, deploy_function,
                          output_size=50, n_chosen_style=2):
    """
    Decode one question under every style and merge the best styles.

    ``media_function`` is evaluated on the question (last row removed) and
    yields the media data and a style distribution. ``beam_search`` is then
    run once per style. Each candidate score has the log of that style's
    probability subtracted. The candidates of the ``n_chosen_style`` most
    probable styles are merged, sorted by ascending adjusted score and
    de-duplicated by text.

    Args:
        question: the question text; it is printed encoded with
            ``config.globalCharSet()``.
        media_function: callable returning ``(media_data,
            style_distribution)`` for ``(question, question_mask)``.
        deploy_function: decoding callable that additionally receives
            ``media_data`` and the style index.
        output_size: maximum number of merged answers to return.
        n_chosen_style: number of most probable styles whose candidates
            are merged into the answer list.

    Returns:
        ``(style_candidate_list, answer_list)``.
        ``style_candidate_list[i]`` holds the ``(text, adjusted score)``
        pairs decoded under style ``i``. ``answer_list`` holds the merged,
        de-duplicated ``(text, adjusted score)`` pairs.

    Raises:
        ValueError: from ``math.log`` if a style probability is zero.
    """
    style_number = self.conf_dict['n_style']
    # One independent bucket per style. ``[[]] * style_number`` would
    # repeat a single list object, so every style would append to the
    # same bucket and the per-style separation would be lost.
    style_candidate_list = [[] for _ in range(style_number)]

    print(question.encode(config.globalCharSet()))
    question0, question_mask0 = self.cr.transform_input_data(question)
    question0 = question0[:-1]
    question_mask0 = question_mask0[:-1]
    media_data, style_distribution = media_function(question0,
                                                    question_mask0)
    print(style_distribution)
    style_score = style_distribution[0]
    # Ascending -log(p) puts the most probable styles first.
    style_sorted_index = sorted(
        range(style_number),
        key=lambda x: -math.log(style_score[x]))[:n_chosen_style]

    for style in range(style_number):
        # Called by beam_search within this iteration, so the closure
        # sees the current style.
        def distribution_calculate(question, question_mask,
                                   answer, answer_mask):
            return deploy_function(question, question_mask,
                                   answer, answer_mask, media_data, style)

        res, score = beam_search(question, self.cr, distribution_calculate,
                                 beam_size=200, search_scope=200,
                                 output_size=5)
        print(res)
        res = [' '.join(self.cr.transform_input_text(s)) for s in res[0:5]]
        for r, s in zip(res, score):
            style_candidate_list[style].append(
                (r, s - math.log(style_score[style])))

    res_list = []
    for style_index in style_sorted_index:
        res_list += style_candidate_list[style_index]
    res_list = sorted(res_list, key=lambda x: x[1])

    # Keep the first occurrence of each answer text, up to output_size.
    answer_list = []
    answer_set = set()
    for answer, score in res_list:
        if len(answer_list) >= output_size:
            break
        if answer not in answer_set:
            answer_list.append((answer, score))
            answer_set.add(answer)
    return style_candidate_list, answer_list
def generate(self, input_file, output_file):
    """
    Decode the whole input once per style, one output file per style.

    For each style index below ``self.conf_dict['n_style']`` the input
    file is re-read and the report is written to
    ``output_file + str(style)``. For every stripped line the media
    function of ``self.model`` supplies the media data and the style
    distribution, and the probability of the current style is reported.
    ``beam_search`` is then run with the deploy function bound to that
    media data and style, and the first five candidates are printed and
    written, followed by a blank line.

    Args:
        input_file: path of the input text, decoded with
            ``config.globalCharSet()``.
        output_file: prefix of the per-style report files.
    """
    media_function = self.model.get_media_data_function()
    deploy_function = self.model.get_deploy_function()
    n_styles = self.conf_dict['n_style']

    for style in range(n_styles):
        with codecs.open(input_file, 'r', config.globalCharSet()) as fo:
            with open(output_file + str(style), 'w') as fw:
                for line in fo.readlines():
                    line = line.strip()
                    raw_line = line.encode(config.globalCharSet())
                    print(raw_line)
                    fw.write('%s\n' % raw_line)

                    question, question_mask = \
                        self.cr.transform_input_data(line)
                    question = question[:-1]
                    question_mask = question_mask[:-1]
                    media_data, style_distribution = \
                        media_function(question, question_mask)
                    print(style_distribution)

                    style_report = 'style number : %d, score: %f' % (
                        style, style_distribution[0][style])
                    print(style_report)
                    fw.write(style_report + '\n')

                    # Called by beam_search within this iteration, so the
                    # closure sees the current media_data and style.
                    def distribution_calculate(question, question_mask,
                                               answer, answer_mask):
                        return deploy_function(question, question_mask,
                                               answer, answer_mask,
                                               media_data, style)

                    candidates, score = beam_search(
                        line, self.cr, distribution_calculate,
                        beam_size=200, search_scope=200)
                    print(candidates)
                    texts = [' '.join(self.cr.transform_input_text(ids))
                             for ids in candidates[0:5]]
                    for text, value in zip(texts, score):
                        report = 'result: %s, score: %f' % (
                            text.encode(config.globalCharSet()), value)
                        print(report)
                        fw.write(report + '\n')
                    fw.write('\n')
def generate_one_question_b_v(self, question, deploy_model): question_make_sense = isMakeSense(question) res, score = beam_search(question, self.cr, deploy_model, beam_size=100, search_scope=100, output_size=50) print res res = [' '.join(self.cr.transform_input_text(s)) for s in res] resorted_list = list() for r, s in zip(res, score): idf = 0.0 tokens = r.split(u' ') for token in tokens[1:-1]: idf += get_idf(token) # idf /= len(tokens) # idf_revise = 1 / (1 + np.exp(-2 / idf)) idf_revise = 4 * np.tanh(4 * idf) resorted_list.append((r, s, s)) if len(question) > 3: resorted_list = sorted(resorted_list, key=lambda x:x[2] / len(question) ** 1) else: resorted_list = sorted(resorted_list, key=lambda x:x[2]) candidates = list() if question_make_sense == 1: f = 0 for r, _, _ in resorted_list[:5]: ori_sentence = r.replace(u'<END>', u'').replace(u' ', u'') if isMakeSense(ori_sentence) == 1: f += 1 if f <= 1: question_make_sense = 0 for r, s1, s2 in resorted_list: ori_sentence = r.strip().replace(u'<END>', u'') ori_sentence = ori_sentence.replace(u' ', u'') answer_make_sense = isMakeSense(ori_sentence) r0 = r if isinstance(r, unicode) : r0 = r.encode(config.globalCharSet()) print r0, s1, s2, answer_make_sense, if len(ori_sentence) <= 3 \ and len(ori_sentence) < len(question) and ori_sentence in question: print 'continue1' continue if answer_make_sense == -1 or u'ϵͳ' in ori_sentence or u'NUM' in ori_sentence: print 'continue2' continue if question_make_sense == 1 and answer_make_sense <= 0: print 'continue3' continue # r_token_count = len(ori_sentence.strip().split(u' ')) # if question_word_count > 1 and r_token_count == 1: # print 'continue4' # continue candidates.append((r, s2)) print 'variousen' variousen_scope = 15 output_size = 5 high_fruq_left = 4 if len(candidates) == 0: return candidates, _ = zip(*candidates) # v_index = variousen_strings(candidates[:variousen_scope], output_size) # v_index = range(min(len(candidates), high_fruq_left)) + v_index # # print v_index # 
func = lambda x, y:x if y in x else x + [y] # v_index = reduce(func, [[], ] + v_index) # toReturn = [candidates[i] for i in v_index[:output_size]] toReturn = candidates[:output_size] return toReturn
def generate_b_v(self, input_file, output_file): """ Generate a model with special optimizers. """ deploy_model = self.model.get_deploy_function() with codecs.open(output_file, 'w', config.globalCharSet()) as fw: with codecs.open(input_file, 'r', config.globalCharSet()) as fo: for line in fo.readlines() : # line_word, line_zi = SegProcess(line.strip()) # line = line_word.decode("gb18030") # line = line_word line = line.strip() #question_make_sense = isMakeSense(line) question_make_sense=1 print (line.encode(config.globalCharSet())) fw.writelines('%s\n' % line) res, score = beam_search(line, self.cr, deploy_model, beam_size=1000, search_scope=1000) print res res = [' '.join(self.cr.transform_input_text(s)) for s in res] resorted_list = list() for r, s in zip(res, score): idf = 0.0 tokens = r.split(u' ') for token in tokens[1:-1]: idf += get_idf(token) # idf /= len(tokens) # idf_revise = 1 / (1 + np.exp(-2 / idf)) idf_revise = 4 * np.tanh(4 * idf) resorted_list.append((r, s, s)) if len(line) > 3: resorted_list = sorted(resorted_list, key=lambda x:x[2] / len(line) ** 1) else: resorted_list = sorted(resorted_list, key=lambda x:x[2]) candidates = list() if question_make_sense == 1: f = 0 for r, _, _ in resorted_list[:5]: ori_sentence = r.replace(u'<END>', u'').replace(u' ', u'') #if isMakeSense(ori_sentence) == 1: if 1: f += 1 if f <= 1: question_make_sense = 0 for r, s1, s2 in resorted_list: ori_sentence = r.strip().replace(u'<END>', u'') ori_sentence = ori_sentence.replace(u' ', u'') answer_make_sense = 1 #isMakeSense(ori_sentence) r0 = r if isinstance(r, unicode) : r0 = r.encode(config.globalCharSet()) print r0, s1, s2, answer_make_sense, if len(ori_sentence) <= 3 \ and len(ori_sentence) < len(line) and ori_sentence in line: print 'continue1' continue if answer_make_sense == -1: print 'continue2' continue if question_make_sense == 1 and answer_make_sense <= 0: print 'continue3' continue # r_token_count = len(ori_sentence.strip().split(u' ')) # if question_word_count 
> 1 and r_token_count == 1: # print 'continue4' # continue candidates.append((r, s2)) print 'variousen' variousen_scope = 15 output_size = 5 high_fruq_left = 4 v_index = variousen_strings(candidates[:variousen_scope], output_size) v_index = range(min(len(candidates), high_fruq_left)) + v_index # print v_index func = lambda x, y:x if y in x else x + [y] v_index = reduce(func, [[], ] + v_index) toReturn = [candidates[i] for i in v_index[:output_size]] for r, s in toReturn : print ('result: %s, score: %f.' % (r.encode(config.globalCharSet()), s)) fw.writelines('result: %s, score: %f.\n' % (r, s))