def output_embedding():
    # Prepare data.
    print("Reading data in %s" % FLAGS.data_dir)
    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test')
    data_set.read_train_product_ids(FLAGS.input_train_dir)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Read model")
        model = create_model(sess, True, data_set, data_set.train_review_size)
        print('Start Testing')
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        test_seq = [i for i in range(data_set.review_size)]
        model.setup_data_set(data_set, words_to_train)
        model.intialize_epoch(test_seq)
        model.prepare_test_epoch()
        user_idxs, product_idxs, query_word_idxs, review_idxs, word_idxs, context_word_idxs, learning_rate, has_next, uqr_pairs = model.get_test_batch()
        if len(user_idxs) > 0:
            part_1, part_2 = model.step(sess, learning_rate, user_idxs, product_idxs,
                                        query_word_idxs, review_idxs, word_idxs,
                                        context_word_idxs, True, FLAGS.test_mode)
            # Record the results.
            user_emb = part_1[0]
            product_emb = part_1[1]
            Wu = part_1[2]
            data_set.output_embedding(user_emb, FLAGS.train_dir + 'user_emb.txt')
            data_set.output_embedding(product_emb, FLAGS.train_dir + 'product_emb.txt')
            data_set.output_embedding(Wu, FLAGS.train_dir + 'Wu.txt')
    return
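# Every function in this section calls a create_model() helper that is not
# shown here. Below is a minimal sketch of such a helper, assuming the
# standard TF1 build-then-restore-checkpoint pattern; the model class name
# and its constructor arguments are illustrative assumptions, not the
# repository's actual code.
def create_model(session, forward_only, data_set, review_size):
    """Build the embedding model; restore from checkpoint or initialize fresh."""
    model = ProductSearchEmbedding_model(  # hypothetical class name
        data_set, review_size, forward_only)
    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        print("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
    return model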
def output_embedding():
    # Prepare data.
    print("Reading data in %s" % FLAGS.data_dir)
    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test')
    data_set.read_train_product_ids(FLAGS.input_train_dir)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Read model")
        model = create_model(sess, True, data_set, data_set.train_review_size)
        print('Start Testing')
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        test_seq = [i for i in range(data_set.review_size)]
        model.setup_data_set(data_set, words_to_train)
        model.intialize_epoch(test_seq)
        model.prepare_test_epoch()
        input_feed, has_next, uqr_pairs = model.get_test_batch()
        if len(uqr_pairs) > 0:
            embeddings, keys = model.step(sess, input_feed, True, FLAGS.test_mode)
            # Record the results: one embedding matrix per returned key.
            for i in range(len(keys)):
                data_set.output_embedding(embeddings[i], FLAGS.train_dir + '%s.txt' % keys[i])
    return
def get_product_scores():
    # Prepare data.
    print("Reading data in %s" % FLAGS.data_dir)
    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test')
    data_set.read_train_product_ids(FLAGS.input_train_dir)
    # Add image features.
    data_set.read_image_features(FLAGS.data_dir)
    # Add rating features.
    data_set.read_latent_factor(FLAGS.data_dir)
    current_step = 0
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Read model")
        model = create_model(sess, True, data_set, data_set.train_review_size)
        user_ranklist_map = {}
        user_ranklist_score_map = {}
        print('Start Testing')
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        test_seq = [i for i in range(data_set.review_size)]
        model.setup_data_set(data_set, words_to_train)
        model.intialize_epoch(test_seq)
        has_next = True
        while has_next:
            user_idxs, product_idxs, review_idxs, word_idxs, context_word_idxs, learning_rate, has_next = model.get_test_batch()
            if len(user_idxs) > 0:
                user_product_scores, _ = model.step(sess, learning_rate, user_idxs, product_idxs,
                                                    review_idxs, word_idxs, context_word_idxs, True)
                current_step += 1
                # Record the results: rank all products by score for each user.
                for i in range(len(user_idxs)):
                    u_idx = user_idxs[i]
                    sorted_product_idxs = sorted(range(len(user_product_scores[i])),
                                                 key=lambda k: user_product_scores[i][k],
                                                 reverse=True)
                    user_ranklist_map[u_idx], user_ranklist_score_map[u_idx] = data_set.compute_test_product_ranklist(
                        u_idx, user_product_scores[i], sorted_product_idxs, FLAGS.rank_cutoff)  # (product name, rank)
                if current_step % FLAGS.steps_per_checkpoint == 0:
                    print("Finish test review %d/%d\r" % (model.cur_review_i, model.review_size), end="")
        data_set.output_ranklist(user_ranklist_map, user_ranklist_score_map, FLAGS.train_dir, FLAGS.similarity_func)
    return
def get_doc_softmax_norm():
    # Prepare data.
    print("Reading data in %s" % FLAGS.data_dir)
    #if 'pv' in FLAGS.net_struct:
    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'train', FLAGS.DF_sampling)
    #else:
    #    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test', FLAGS.DF_sampling)
    current_step = 0
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Read model")
        model = create_model(sess, True, data_set, data_set.doc_num)
        print('Start softmax denominator computing')
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        test_seq = [i for i in range(data_set.doc_num)]
        model.setup_data_set(data_set, words_to_train)
        model.prepare_test_epoch(test_seq)
        softmax_denominators = []
        has_next = True
        while has_next:
            word_idxs, context_word_idxs, doc_idxs, doc_word_idxs, doc_lengths, learning_rate, has_next = model.get_test_batch()
            if len(word_idxs) > 0:
                doc_softmax_denominator, _ = model.step(sess, learning_rate, word_idxs, context_word_idxs,
                                                        doc_idxs, doc_word_idxs, doc_lengths, True, FLAGS.test_mode)
                current_step += 1
                # Record the results: one denominator per document.
                for i in range(len(doc_idxs)):
                    doc_idx = doc_idxs[i]
                    softmax_denominators.append((data_set.doc_info[doc_idx][0], doc_softmax_denominator[i]))
                if current_step % FLAGS.steps_per_checkpoint == 0:
                    print("Finish test doc %d/%d\r" % (model.cur_doc_i, len(model.test_seq)), end="")
    with open(FLAGS.train_dir + 'test_doc.softmax_denominators', 'w') as softmax_denominator_fout:
        for i in range(len(softmax_denominators)):
            softmax_denominator_fout.write(softmax_denominators[i][0] + '\t' + str(softmax_denominators[i][1]) + '\n')
    return
def interactive_explain_mode():
    # Prepare data.
    print("Reading data in %s" % FLAGS.data_dir)
    FLAGS.batch_size = 1
    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test')
    data_set.read_train_product_ids(FLAGS.input_train_dir)
    #data_set.read_image_features(FLAGS.data_dir)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Read model")
        model = create_model(sess, True, data_set, data_set.train_review_size)
        print('Start Interactive Process')
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        test_seq = [i for i in range(data_set.review_size)]
        model.setup_data_set(data_set, words_to_train)
        model.intialize_epoch(test_seq)
        input_feed, has_next = model.get_test_batch()
        while True:
            # Read information from stdin.
            test_feed = copy.deepcopy(input_feed)
            print('Enter rank cut:')
            rank_cut = int(sys.stdin.readline().strip())
            print('Enter mode, "product" for gathering product information and "user" for gathering user information:')
            mode = sys.stdin.readline().strip()
            # Output user+query or product?
            if mode == 'product':
                print('Enter product idx (line number starting from 0) or name ("asin"):')
                product_idx = data_set.get_idx(sys.stdin.readline().strip(), 'product')
                test_feed[model.relation_dict['product']['idxs'].name] = [product_idx]
                p_entity_list, _ = model.step(sess, test_feed, True, 'explain_product')
                # Output results.
                print('Product %d %s' % (product_idx, data_set.product_ids[product_idx]))
                for relation_name, entity_name, entity_scores in p_entity_list:
                    data_set.print_entity_list(relation_name, entity_name, entity_scores[0], rank_cut, {})
            else:
                # user + query
                print('Enter user idx (line number starting from 0) or name (user id):')
                user_idx = data_set.get_idx(sys.stdin.readline().strip(), 'user')
                test_feed[model.user_idxs.name] = [user_idx]
                up_entity_list, _ = model.step(sess, test_feed, True, 'explain_user_query')
                remove_map = {'product': data_set.user_train_product_set_list[user_idx]}
                print('User %d %s' % (user_idx, data_set.user_ids[user_idx]))
                # Output results.
                for relation_name, entity_name, entity_scores in up_entity_list:
                    data_set.print_entity_list(relation_name, entity_name, entity_scores[0], rank_cut, remove_map)
def train():
    # Prepare data.
    print("Reading data in %s" % FLAGS.data_dir)
    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'train')
    data_set.sub_sampling(FLAGS.subsampling_rate)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    #config.log_device_placement = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Creating model")
        model = create_model(sess, False, data_set, data_set.review_size)
        print('Start training')
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        previous_words = 0.0
        start_time = time.time()
        step_time, loss = 0.0, 0.0
        current_epoch = 0
        current_step = 0
        get_batch_time = 0.0
        training_seq = [i for i in range(data_set.review_size)]
        model.setup_data_set(data_set, words_to_train)
        while True:
            random.shuffle(training_seq)
            model.intialize_epoch(training_seq)
            has_next = True
            while has_next:
                time_flag = time.time()
                input_feed, has_next = model.get_train_batch()
                get_batch_time += time.time() - time_flag
                if len(input_feed[model.relation_dict['word']['idxs'].name]) > 0:
                    time_flag = time.time()
                    step_loss, _ = model.step(sess, input_feed, False)
                    loss += step_loss / FLAGS.steps_per_checkpoint
                    current_step += 1
                    step_time += time.time() - time_flag
                    # Once in a while, we print statistics.
                    if current_step % FLAGS.steps_per_checkpoint == 0:
                        print("Epoch %d Words %d/%d: lr = %5.3f loss = %6.2f words/sec = %5.2f prepare_time %.2f step_time %.2f\r"
                              % (current_epoch, model.finished_word_num, model.words_to_train,
                                 input_feed[model.learning_rate.name], loss,
                                 (model.finished_word_num - previous_words) / (time.time() - start_time),
                                 get_batch_time, step_time), end="")
                        step_time, loss = 0.0, 0.0
                        current_step = 1
                        get_batch_time = 0.0
                        sys.stdout.flush()
                        previous_words = model.finished_word_num
                        start_time = time.time()
            current_epoch += 1
            if current_epoch >= FLAGS.max_train_epoch:
                break
        checkpoint_path_best = os.path.join(FLAGS.train_dir, "ProductSearchEmbedding.ckpt")
        model.saver.save(sess, checkpoint_path_best, global_step=model.global_step)
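# The functions above and below all read a shared FLAGS object. A minimal
# sketch of how such flags are typically declared with the TF1 flags API;
# the flag names are taken from the usages in this file, but the default
# values and help strings here are illustrative assumptions.
tf.app.flags.DEFINE_string("data_dir", "/tmp", "Directory with the raw data")
tf.app.flags.DEFINE_string("input_train_dir", "", "Directory with the train/test splits")
tf.app.flags.DEFINE_string("train_dir", "/tmp", "Directory for checkpoints and outputs")
tf.app.flags.DEFINE_integer("batch_size", 64, "Batch size used during training")
tf.app.flags.DEFINE_integer("max_train_epoch", 5, "Maximum number of training epochs")
tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200, "Steps between statistics printouts")
tf.app.flags.DEFINE_integer("rank_cutoff", 100, "Rank cutoff for output ranklists")
tf.app.flags.DEFINE_float("subsampling_rate", 1e-4, "Rate for sub-sampling frequent words")
tf.app.flags.DEFINE_string("similarity_func", "product", "Similarity function for ranking")
tf.app.flags.DEFINE_string("test_mode", "product_scores", "What to compute at test time")

FLAGS = tf.app.flags.FLAGS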
def find_explanation_path():
    print("Reading data in %s" % FLAGS.data_dir)
    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test')
    data_set.read_train_product_ids(FLAGS.input_train_dir)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Read model")
        model = create_model(sess, True, data_set, data_set.train_review_size)
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        test_seq = [i for i in range(data_set.review_size)]
        model.setup_data_set(data_set, words_to_train)
        model.intialize_epoch(test_seq)
        model.prepare_test_epoch()
        input_feed, has_next, uqr_pairs = model.get_test_batch()
        test_feed = copy.deepcopy(input_feed)
        print('Generating explanations')
        with open(FLAGS.explanation_output_dir + 'explanation-output.csv', mode='w') as write_csv_file:
            csv_writer = csv.writer(write_csv_file, delimiter=',')
            csv_writer.writerow(['sample_id', 'user', 'query', 'product', 'explanation',
                                 'attention_weight', 'previous_reviews'])
            count = 0
            for (user_idx, product_idx, query_idx, review_idx) in uqr_pairs:
                sample_id = '-'.join([str(user_idx), str(product_idx), str(query_idx), str(review_idx)])
                user_history_idx_dict, user_hist_len_dict = model.get_history_and_length_dicts(review_idx)
                for key in user_history_idx_dict:
                    test_feed[model.user_history_dict[key]['idxs'].name] = [user_history_idx_dict[key]]
                    test_feed[model.user_history_dict[key]['length'].name] = [user_hist_len_dict[key]]
                test_feed[model.product_idxs.name] = [product_idx]
                query_word_idx = model.data_set.query_words[query_idx]
                test_feed[model.query_word_idxs.name] = [query_word_idx]
                attn_distribution_dict, _ = model.step(sess, test_feed, True, 'explanation_path')
                user = data_set.user_ids[user_idx]
                product = data_set.product_ids[product_idx]
                query = ' '.join([data_set.words[x] for x in query_word_idx if x < len(data_set.words)])
                review_idxs = [idx for idx, review in enumerate(data_set.review_info) if review[0] == user_idx]
                review_word_idxs = [data_set.review_text[idx] for idx in review_idxs]
                # Keep at most five of the user's previous reviews for context.
                reviews = []
                for idx, review_word_idx in enumerate(review_word_idxs):
                    if idx >= 5:
                        break
                    review_txt = ' '.join([data_set.words[w] for w in review_word_idx if w < len(data_set.words)])
                    reviews.append(str(idx + 1) + ') ' + review_txt)
                # Use the max attention from the master distribution to find
                # which slave attention is more important.
                indexed_attn_values = list(enumerate(attn_distribution_dict['master'][0]))
                top_values = sorted(indexed_attn_values, key=operator.itemgetter(1), reverse=True)[:3]
                explanation = ''
                expln_index = 1
                max_attn = top_values[0][1]
                # Generate an explanation for each of the top three attention scores.
                for index, attn_score in top_values:
                    curr_explanation = data_set.get_expln_with_max_attn(index, model.user_history_dict,
                                                                        user_history_idx_dict, attn_distribution_dict)
                    if curr_explanation:
                        explanation += str(expln_index) + '. ' + curr_explanation + '\n'
                        expln_index += 1
                csv_writer.writerow([sample_id, user, query, product, explanation, max_attn, '\n'.join(reviews)])
                count += 1
    print("Generated " + str(count) + " explanations")
def interactive_explain_mode():
    # Prepare data.
    print("Reading data in %s" % FLAGS.data_dir)
    FLAGS.batch_size = 1
    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test')
    data_set.read_train_product_ids(FLAGS.input_train_dir)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Read model")
        model = create_model(sess, True, data_set, data_set.train_review_size)
        print('Start Interactive Process')
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        test_seq = [i for i in range(data_set.review_size)]
        model.setup_data_set(data_set, words_to_train)
        model.intialize_epoch(test_seq)
        model.prepare_test_epoch()
        input_feed, has_next, uqr_pairs = model.get_test_batch()
        while True:
            # Read information from stdin.
            test_feed = copy.deepcopy(input_feed)
            print('Enter rank cut:')
            rank_cut = int(sys.stdin.readline().strip())
            print('Enter mode:')
            mode = sys.stdin.readline().strip()
            # Output user+query or product?
            if mode == 'product':
                print('Enter product idx or name:')
                product_idx = data_set.get_idx(sys.stdin.readline().strip(), 'product')
                test_feed[model.product_idxs.name] = [product_idx]
                p_entity_list, _ = model.step(sess, test_feed, True, 'explain_product')
                # Output results.
                print('Product %d %s' % (product_idx, data_set.product_ids[product_idx]))
                for relation_name, entity_name, entity_scores in p_entity_list:
                    data_set.print_entity_list(relation_name, entity_name, entity_scores[0], rank_cut, {})
            else:
                # user + query
                print('Enter user idx or name:')
                user_idx = data_set.get_idx(sys.stdin.readline().strip(), 'user')
                user_history_idx_dict = data_set.get_user_history_idx(user_idx, model.max_history_length)
                print('Enter query idx:')
                query_idx = int(sys.stdin.readline().strip())
                query_word_idx = model.data_set.query_words[query_idx]
                for key in user_history_idx_dict:
                    test_feed[model.user_history_dict[key]['idxs'].name] = user_history_idx_dict[key]
                test_feed[model.query_word_idxs.name] = [query_word_idx]
                uq_entity_list, _ = model.step(sess, test_feed, True, 'explain_user_query')
                remove_map = {'product': data_set.user_train_product_set_list[user_idx]}
                print('User %d %s' % (user_idx, data_set.user_ids[user_idx]))
                print('Query %d %s' % (query_idx, '_'.join([data_set.words[x] for x in query_word_idx if x < len(data_set.words)])))
                # Output results.
                for relation_name, entity_name, entity_scores in uq_entity_list:
                    data_set.print_entity_list(relation_name, entity_name, entity_scores[0], rank_cut, remove_map)
def self_test():
    print("Self_test")
    FLAGS.data_dir = '/mnt/scratch/aiqy/MultiViewEmbedding/working/Amazon/small_sample/min_count1/'
    # Prepare data.
    print("Reading data in %s" % FLAGS.data_dir)
    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'train')
    data_set.sub_sampling(FLAGS.subsampling_rate)
    # Add image features.
    data_set.read_image_features(FLAGS.data_dir)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Creating model")
        model = create_model(sess, False, data_set, data_set.review_size)
        # This is the training loop.
        print('Start training')
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        previous_words = 0.0
        start_time = time.time()
        step_time, loss = 0.0, 0.0
        current_epoch = 0
        current_step = 0
        get_batch_time = 0.0
        training_seq = [i for i in range(data_set.review_size)]
        model.setup_data_set(data_set, words_to_train)
        while True:
            random.shuffle(training_seq)
            model.intialize_epoch(training_seq)
            has_next = True
            while has_next:
                time_flag = time.time()
                user_idxs, product_idxs, review_idxs, word_idxs, context_word_idxs, learning_rate, has_next = model.get_train_batch()
                get_batch_time += time.time() - time_flag
                if len(word_idxs) > 0:
                    time_flag = time.time()
                    step_loss, _ = model.step(sess, learning_rate, user_idxs, product_idxs,
                                              review_idxs, word_idxs, context_word_idxs, False)
                    loss += step_loss / FLAGS.steps_per_checkpoint
                    current_step += 1
                    step_time += time.time() - time_flag
                    # Once in a while, we print statistics.
                    if current_step % FLAGS.steps_per_checkpoint == 0:
                        print("Epoch %d Words %d/%d: lr = %5.3f loss = %6.2f words/sec = %5.2f prepare_time %.2f step_time %.2f\r"
                              % (current_epoch, model.finished_word_num, model.words_to_train,
                                 learning_rate, loss,
                                 (model.finished_word_num - previous_words) / (time.time() - start_time),
                                 get_batch_time, step_time), end="")
                        step_time, loss = 0.0, 0.0
                        current_step = 1
                        get_batch_time = 0.0
                        sys.stdout.flush()
                        previous_words = model.finished_word_num
                        start_time = time.time()
            current_epoch += 1
            if current_epoch >= FLAGS.max_train_epoch:
                break
def output_embedding():
    # Prepare data.
    print("Reading data in %s" % FLAGS.data_dir)
    #if 'pv' in FLAGS.net_struct:
    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'train', FLAGS.DF_sampling)
    #else:
    #    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test', FLAGS.DF_sampling)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Read model")
        model = create_model(sess, True, data_set, data_set.doc_num)
        print('Start saving embeddings')
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        test_seq = [i for i in range(data_set.doc_num)]
        model.setup_data_set(data_set, words_to_train)
        model.prepare_test_epoch(test_seq)
        word_idxs, context_word_idxs, doc_idxs, doc_word_idxs, doc_lengths, learning_rate, has_next = model.get_test_batch()
        part_1, part_2 = model.step(sess, learning_rate, word_idxs, context_word_idxs,
                                    doc_idxs, doc_word_idxs, doc_lengths, True, FLAGS.test_mode)
        # Record the results.
        word_emb = part_1[0]
        data_set.output_embedding(word_emb, data_set.words, FLAGS.train_dir + 'word_emb.txt')
        if 'pv' in FLAGS.net_struct:
            # Paragraph-vector models carry doc embeddings directly.
            doc_emb = part_1[1]
            doc_names = [x[0] for x in data_set.doc_info]
            data_set.output_embedding(doc_emb, doc_names, FLAGS.train_dir + 'doc_emb.txt')
            if len(part_2) > 0:
                context_emb = part_2[0]
                data_set.output_embedding(context_emb, data_set.words, FLAGS.train_dir + 'context_emb.txt')
        else:
            context_emb = part_1[1]
            data_set.output_embedding(context_emb, data_set.words, FLAGS.train_dir + 'context_emb.txt')
            if FLAGS.use_local_context:
                local_context_emb = part_2[0]
                data_set.output_embedding(local_context_emb, data_set.words,
                                          FLAGS.train_dir + 'local_context_emb.txt')
            # Need to compute doc embeddings one by one.
            words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
            test_seq = [i for i in range(data_set.doc_num)]
            model.setup_data_set(data_set, words_to_train)
            model.prepare_test_epoch(test_seq)
            has_next = True
            current_step = 0
            doc_emb = [None for x in range(len(data_set.doc_info))]
            while has_next:
                word_idxs, context_word_idxs, doc_idxs, doc_word_idxs, doc_lengths, learning_rate, has_next = model.get_test_batch()
                if len(doc_idxs) > 0:
                    doc_emb_output, _ = model.step(sess, learning_rate, word_idxs, context_word_idxs,
                                                   doc_idxs, doc_word_idxs, doc_lengths, True, 'output_doc_embedding')
                    current_step += 1
                    # Record the results.
                    for i in range(len(doc_idxs)):
                        doc_idx = doc_idxs[i]
                        doc_emb[doc_idx] = doc_emb_output[i]
                    if current_step % FLAGS.steps_per_checkpoint == 0:
                        print("Finish test doc %d/%d\r" % (model.cur_doc_i, len(model.test_seq)), end="")
            doc_names = [x[0] for x in data_set.doc_info]
            data_set.output_embedding(doc_emb, doc_names, FLAGS.train_dir + 'doc_emb.txt')
    return
def train():
    # Prepare data.
    print("Reading data in %s" % FLAGS.data_dir)
    # Load and process the data set. Fields on the returned object:
    #   'product_ids', 'product_size', 'user_ids', 'user_size', 'words',
    #   'vocab_size', 'query_words', 'query_max_length', 'word_count',
    #   'vocab_distribute', 'review_info', 'review_text', 'review_size',
    #   'sub_sampling_rate', 'review_distribute', 'product_distribute',
    #   'product_query_idx'
    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'train')
    # Compute the sub-sampling probabilities that down-weight high-frequency
    # words, as used in skip-gram training.
    data_set.sub_sampling(FLAGS.subsampling_rate)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    #config.log_device_placement = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Creating model")
        model = create_model(sess, False, data_set, data_set.review_size)
        print('Start training')
        # Total number of words to train on.
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        # Number of words trained so far, and timers.
        previous_words = 0.0
        start_time = time.time()
        step_time, loss = 0.0, 0.0
        current_epoch = 0
        current_step = 0
        get_batch_time = 0.0
        # Indices of all reviews.
        training_seq = [i for i in range(data_set.review_size)]
        # Attach the data set to the model: sets self.data_set and
        # self.words_to_train, and resets self.finished_word_num to 0.
        model.setup_data_set(data_set, words_to_train)
        while True:
            # Shuffle the order of the review indices for this epoch.
            random.shuffle(training_seq)
            # Sets self.train_seq and self.review_size, and resets
            # self.cur_review_i and self.cur_word_i to 0.
            model.intialize_epoch(training_seq)
            has_next = True
            while has_next:
                time_flag = time.time()
                user_idxs, product_idxs, query_word_idxs, review_idxs, word_idxs, context_word_idxs, learning_rate, has_next = model.get_train_batch()
                get_batch_time += time.time() - time_flag
                if len(word_idxs) > 0:
                    time_flag = time.time()
                    step_loss, _ = model.step(sess, learning_rate, user_idxs, product_idxs,
                                              query_word_idxs, review_idxs, word_idxs,
                                              context_word_idxs, False)
                    loss += step_loss / FLAGS.steps_per_checkpoint
                    current_step += 1
                    step_time += time.time() - time_flag
                    # Once in a while, we print statistics.
                    if current_step % FLAGS.steps_per_checkpoint == 0:
                        print("Epoch %d Words %d/%d: lr = %5.3f loss = %6.2f words/sec = %5.2f prepare_time %.2f step_time %.2f\r"
                              % (current_epoch, model.finished_word_num, model.words_to_train,
                                 learning_rate, loss,
                                 (model.finished_word_num - previous_words) / (time.time() - start_time),
                                 get_batch_time, step_time), end="")
                        step_time, loss = 0.0, 0.0
                        current_step = 1
                        get_batch_time = 0.0
                        sys.stdout.flush()
                        previous_words = model.finished_word_num
                        start_time = time.time()
            current_epoch += 1
            if current_epoch >= FLAGS.max_train_epoch:
                break
        checkpoint_path_best = os.path.join(FLAGS.train_dir, "ProductSearchEmbedding.ckpt")
        model.saver.save(sess, checkpoint_path_best, global_step=model.global_step)
def find_explanation_path():
    print("Reading data in %s" % FLAGS.data_dir)
    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test')
    data_set.read_train_product_ids(FLAGS.input_train_dir)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Read model")
        model = create_model(sess, True, data_set, data_set.train_review_size)
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        test_seq = [i for i in range(data_set.review_size)]
        model.setup_data_set(data_set, words_to_train)
        model.intialize_epoch(test_seq)
        model.prepare_test_epoch()
        input_feed, has_next, uqr_pairs = model.get_test_batch()
        test_feed = copy.deepcopy(input_feed)
        print('Generating explanations')
        with open(FLAGS.explanation_output_dir + 'explanation-output.csv', mode='w') as write_csv_file:
            csv_writer = csv.writer(write_csv_file, delimiter=',')
            csv_writer.writerow(['sample_id', 'user', 'query', 'product', 'explanation', 'previous_reviews'])
            count = 0
            for (user_idx, product_idx, query_idx, review_idx) in uqr_pairs:
                sample_id = '-'.join([str(user_idx), str(product_idx), str(query_idx), str(review_idx)])
                test_feed[model.user_idxs.name] = [user_idx]
                test_feed[model.product_idxs.name] = [product_idx]
                query_word_idx = model.data_set.query_words[query_idx]
                test_feed[model.query_word_idxs.name] = [query_word_idx]
                up_entity_list, _ = model.step(sess, test_feed, True, 'explain_user_product')
                user = data_set.user_ids[user_idx]
                product = data_set.product_ids[product_idx]
                query = ' '.join([data_set.words[x] for x in query_word_idx if x < len(data_set.words)])
                review_idxs = [idx for idx, review in enumerate(data_set.review_info) if review[0] == user_idx]
                review_word_idxs = [data_set.review_text[idx] for idx in review_idxs]
                # Keep at most five of the user's previous reviews for context.
                reviews = []
                for idx, review_word_idx in enumerate(review_word_idxs):
                    if idx >= 5:
                        break
                    review_txt = ' '.join([data_set.words[w] for w in review_word_idx if w < len(data_set.words)])
                    reviews.append(str(idx + 1) + ') ' + review_txt)
                # Merge all entity scores into one list to find the top three overall.
                overall_tuple_list = []
                for relation_name, entity_name, entity_scores in up_entity_list:
                    entity_scores = entity_scores[0]
                    indexed_scores = list(enumerate(entity_scores))
                    curr_tuple_list = [(relation_name, entity_name, index, value)
                                       for index, value in indexed_scores]
                    overall_tuple_list.extend(curr_tuple_list)
                # Take the top three values and generate an explanation for each.
                top_valued_tuples = sorted(overall_tuple_list, key=operator.itemgetter(3), reverse=True)[:3]
                explanation = ''
                for index, top_tuple in enumerate(top_valued_tuples):
                    relation_name, entity_name, max_index, _ = top_tuple
                    word = data_set.entity_vocab[entity_name][max_index]
                    if relation_name == 'write':
                        curr_explanation = EXPLANATION_TMPL_WRITE.format(user=user, product=product, word=word)
                    elif relation_name == 'brand':
                        curr_explanation = EXPLANATION_TMPL_BRAND.format(user=user, product=product, word=word)
                    elif relation_name == 'categories':
                        curr_explanation = EXPLANATION_TMPL_CATEGORY.format(user=user, product=product, word=word)
                    else:
                        curr_explanation = EXPLANATION_TMPL_RELATED.format(user=user, product=product, word=word)
                    explanation += str(index + 1) + '. ' + curr_explanation + '\n'
                csv_writer.writerow([sample_id, user, query, product, explanation, '\n'.join(reviews)])
                count += 1
    print("Generated " + str(count) + " explanations")
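# A minimal entry-point sketch showing how scripts like these are usually
# wired together under TF1. The dispatch logic and the decode flag below are
# assumptions for illustration; each variant's actual main() may choose
# between its train/test functions differently.
def main(_):
    if FLAGS.input_train_dir == "":
        FLAGS.input_train_dir = FLAGS.data_dir
    if FLAGS.decode:  # hypothetical flag: run a test mode instead of training
        if FLAGS.test_mode == 'output_embedding':
            output_embedding()
        elif FLAGS.test_mode == 'explanation_path':
            find_explanation_path()
        elif FLAGS.test_mode == 'explain':
            interactive_explain_mode()
        else:
            get_product_scores()
    else:
        train()

if __name__ == "__main__":
    tf.app.run()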