def eps_greedy_sampling(td_ct, eval_td_ct, args, conf, summary_writer, epoch_id):
    """eps_greedy_sampling"""
    dataset = NpzDataset(conf.test_npz_list,
                         conf.npz_config_path,
                         conf.requested_npz_names,
                         if_random_shuffle=False)
    data_gen = dataset.get_data_generator(conf.batch_size)

    list_reward = []
    last_batch_data = BatchData(conf, data_gen.next())
    batch_id = 0
    for tensor_dict in data_gen:
        ### eps_greedy_sampling
        batch_data = BatchData(conf, tensor_dict)
        batch_data.set_decode_len(batch_data.seq_lens())
        batch_data.expand_candidates(last_batch_data, batch_data.seq_lens())
        fetch_dict = td_ct.eps_greedy_sampling(
            GenRLFeedConvertor.eps_greedy_sampling(batch_data, eps=0))
        sampled_id = np.array(fetch_dict['sampled_id']).reshape([-1])
        order = sequence_unconcat(sampled_id, batch_data.decode_len())

        ### get reward
        reordered_batch_data = batch_data.get_reordered(order)
        fetch_dict = eval_td_ct.inference(
            EvalFeedConvertor.inference(reordered_batch_data))
        reward = np.array(fetch_dict['click_prob'])[:, 1]

        ### logging
        list_reward.append(np.mean(reward))
        if batch_id == 100:
            break
        last_batch_data = BatchData(conf, tensor_dict)
        batch_id += 1

    add_scalar_summary(summary_writer, epoch_id,
                       'eps_greedy_sampling/reward-%s' % args.eval_exp,
                       np.mean(list_reward))

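# `eps_greedy_sampling` above (and the functions below) rely on
# `sequence_unconcat` to split the flat `sampled_id` array back into
# per-request lists according to `decode_len`. A minimal reference sketch of
# the assumed behaviour (illustrative only; the imported implementation may
# differ):
def _sequence_unconcat_reference(flat_values, seq_lens):
    """Split a flat 1-D array into a list of arrays with lengths `seq_lens`."""
    outputs, offset = [], 0
    for seq_len in seq_lens:
        outputs.append(flat_values[offset:offset + seq_len])
        offset += seq_len
    return outputs
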
def online_inference_for_test(args, epoch_id, max_steps, ct_sim, dict_gen_ct,
                              summary_writer):
    """
    Do inference on the test set.
    """
    sim_conf = ct_sim.alg.model.conf
    dataset = NpzDataset(args.test_npz_list,
                         sim_conf.npz_config_path,
                         sim_conf.requested_names,
                         if_random_shuffle=False,
                         one_pass=True)
    data_gen = dataset.get_data_generator(sim_conf.batch_size)
    thread_data_gen = threaded_generator(data_gen, capacity=100)

    list_sim_responses = []
    ### online inference
    last_batch_data = BatchData(sim_conf, thread_data_gen.next())
    for batch_id, tensor_dict in enumerate(thread_data_gen):
        if batch_id > max_steps:
            break
        np.random.seed(batch_id)
        batch_data = BatchData(sim_conf, tensor_dict)
        batch_data.set_decode_len(batch_data.seq_lens())
        batch_data.expand_candidates(last_batch_data, batch_data.seq_lens())
        np.random.seed(None)
        del batch_data.tensor_dict['click_id']

        orders, sim_responses = inference_one_batch(
            args.gen_type, ct_sim, dict_gen_ct, batch_data,
            eps=0)  # sim_responses: (b, decode_len)
        # reorder by the generated orders (no replay memory in the test pass)
        sim_batch_data = batch_data.get_reordered(orders, sim_responses)
        list_sim_responses.append(sim_responses)

        last_batch_data = BatchData(sim_conf, tensor_dict)
        if batch_id % 100 == 0:
            logging.info('inference test batch %d' % batch_id)

    list_sum_response = np.sum(np.concatenate(list_sim_responses, 0), 1)  # (b,)
    add_scalar_summary(summary_writer, epoch_id,
                       'inference/test_sim_responses',
                       np.mean(list_sum_response))

def online_inference(args, epoch_id, max_steps, data_gen, ct_sim, dict_gen_ct,
                     summary_writer, if_print=True):
    """
    Do inference for `max_steps` batches.
    """
    sim_conf = ct_sim.alg.model.conf

    replay_memory = []
    list_sim_responses = []
    ### online inference
    last_batch_data = BatchData(sim_conf, data_gen.next())
    for batch_id in range(max_steps):
        np.random.seed(epoch_id * max_steps + batch_id)
        tensor_dict = data_gen.next()
        batch_data = BatchData(sim_conf, tensor_dict)
        batch_data.set_decode_len(batch_data.seq_lens())
        batch_data.expand_candidates(last_batch_data, batch_data.seq_lens())
        np.random.seed(None)
        del batch_data.tensor_dict['click_id']

        if batch_data.batch_size() == 1:  # otherwise, rl will crash
            continue

        orders, sim_responses = inference_one_batch(
            args.gen_type, ct_sim, dict_gen_ct, batch_data,
            eps=args.infer_eps)  # sim_responses: (b, decode_len)
        # save to replay memory
        sim_batch_data = batch_data.get_reordered(orders, sim_responses)
        replay_memory.append(sim_batch_data)
        list_sim_responses.append(sim_responses)

        last_batch_data = BatchData(sim_conf, tensor_dict)
        if batch_id % 100 == 0 and if_print:
            logging.info('inference epoch %d batch %d' % (epoch_id, batch_id))

    if if_print:
        list_sum_response = np.sum(np.concatenate(list_sim_responses, 0), 1)  # (b,)
        add_scalar_summary(summary_writer, epoch_id,
                           'inference/sim_responses',
                           np.mean(list_sum_response))
    return replay_memory

def evaluate(td_ct, eval_td_ct, args, conf, epoch_id):
    """Evaluate eps-greedy sampling against best-of-n softmax sampling."""
    # IMPORTANT. Fix the seed so every evaluation sees the same candidates,
    # since the candidates are selected by np.random.choice.
    np.random.seed(0)
    dataset = NpzDataset(conf.test_npz_list,
                         conf.npz_config_path,
                         conf.requested_npz_names,
                         if_random_shuffle=False)
    batch_size = 250
    data_gen = dataset.get_data_generator(batch_size)
    max_batch_id = 200

    list_n = [1, 20, 40]
    dict_reward = {'eps_greedy': [], 'softmax': {n: [] for n in list_n}}
    p_counter = PatternCounter()

    last_batch_data = BatchData(conf, data_gen.next())
    for batch_id in range(max_batch_id):

        def get_list_wise_reward(batch_data, order):
            reordered_batch_data = batch_data.get_reordered(order)
            fetch_dict = eval_td_ct.inference(
                EvalFeedConvertor.inference(reordered_batch_data))
            reward = np.array(fetch_dict['click_prob'])[:, 1]
            reward_unconcat = sequence_unconcat(reward,
                                                [len(od) for od in order])
            return [np.sum(rw) for rw in reward_unconcat]

        def greedy_sampling(batch_data):
            fetch_dict = td_ct.eps_greedy_sampling(
                GenRLFeedConvertor.eps_greedy_sampling(batch_data, eps=0))
            sampled_id = np.array(fetch_dict['sampled_id']).reshape([-1])
            order = sequence_unconcat(sampled_id, batch_data.decode_len())
            list_wise_reward = get_list_wise_reward(batch_data, order)  # (b,)
            return order, list_wise_reward

        def softmax_sampling(batch_data, max_sampling_time):
            mat_list_wise_reward = []
            mat_order = []
            for i in range(max_sampling_time):
                fetch_dict = td_ct.softmax_sampling(
                    GenRLFeedConvertor.softmax_sampling(batch_data, eta=0.1))
                sampled_id = np.array(fetch_dict['sampled_id']).reshape([-1])
                order = sequence_unconcat(sampled_id, batch_data.decode_len())
                list_wise_reward = get_list_wise_reward(batch_data, order)
                mat_order.append(order)
                mat_list_wise_reward.append(list_wise_reward)
            mat_list_wise_reward = np.array(
                mat_list_wise_reward)  # (max_sampling_time, b)
            # mat_order: (max_sampling_time, b, var_seq_len)
            return mat_order, mat_list_wise_reward

        tensor_dict = data_gen.next()
        batch_data = BatchData(conf, tensor_dict)
        batch_data.set_decode_len(batch_data.seq_lens())
        batch_data.expand_candidates(last_batch_data, batch_data.seq_lens())
        p_counter.add_log_pattern(batch_data)

        ### eps_greedy_sampling
        order, list_wise_reward = greedy_sampling(batch_data)
        dict_reward['eps_greedy'] += list_wise_reward
        p_counter.add_sampled_pattern('eps_greedy', batch_data, order)

        ### softmax_sampling
        max_sampling_time = np.max(list_n)
        mat_order, mat_list_wise_reward = softmax_sampling(
            batch_data, max_sampling_time)
        for n in list_n:
            dict_reward['softmax'][n] += np.max(mat_list_wise_reward[:n],
                                                0).tolist()
            max_indice = np.argmax(mat_list_wise_reward[:n], 0)  # (b,)
            max_order = [
                mat_order[max_id][b_id]
                for b_id, max_id in enumerate(max_indice)
            ]
            p_counter.add_sampled_pattern('softmax_%d' % n, batch_data,
                                          max_order)

        ### log
        if batch_id % 10 == 0:
            logging.info('batch_id:%d eps_greedy %f' %
                         (batch_id, np.mean(dict_reward['eps_greedy'])))
            for n in list_n:
                logging.info('batch_id:%d softmax_%d %f' %
                             (batch_id, n, np.mean(dict_reward['softmax'][n])))
            p_counter.Print()

        last_batch_data = BatchData(conf, tensor_dict)

    ### log
    logging.info('final eps_greedy %f' % np.mean(dict_reward['eps_greedy']))
    for n in list_n:
        logging.info('final softmax_%d %f' %
                     (n, np.mean(dict_reward['softmax'][n])))
    p_counter.Print()

    ### save
    pickle_file = 'tmp/%s-eval_%s.pkl' % (args.exp, args.eval_model)
    p_counter.save(pickle_file)

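# Best-of-n selection in `evaluate` above: for every request, `np.argmax` over
# axis 0 picks the softmax rollout (among the first n) with the highest
# list-wise reward. A small worked example of the indexing, with hypothetical
# reward values for illustration only:
#
#     mat_list_wise_reward = np.array([[1.0, 3.0],
#                                      [2.0, 0.5]])   # (sampling_time=2, b=2)
#     np.argmax(mat_list_wise_reward[:2], 0)           # -> array([1, 0])
#     # i.e. request 0 keeps rollout 1's order, request 1 keeps rollout 0's.
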
def train(td_ct, eval_td_ct, args, conf, summary_writer, replay_memory,
          epoch_id):
    """train"""
    dataset = NpzDataset(conf.train_npz_list,
                         conf.npz_config_path,
                         conf.requested_npz_names,
                         if_random_shuffle=True)
    data_gen = dataset.get_data_generator(conf.batch_size)

    list_reward = []
    list_loss = []
    list_first_Q = []
    guessed_batch_num = 11500
    batch_id = 0
    last_batch_data = BatchData(conf, data_gen.next())
    for tensor_dict in data_gen:
        ### eps_greedy_sampling
        batch_data = BatchData(conf, tensor_dict)
        batch_data.set_decode_len(batch_data.seq_lens())
        batch_data.expand_candidates(last_batch_data, batch_data.seq_lens())
        fetch_dict = td_ct.eps_greedy_sampling(
            GenRLFeedConvertor.eps_greedy_sampling(batch_data, eps=0.2))
        sampled_id = np.array(fetch_dict['sampled_id']).reshape([-1])
        order = sequence_unconcat(sampled_id, batch_data.decode_len())

        ### get reward
        reordered_batch_data = batch_data.get_reordered(order)
        fetch_dict = eval_td_ct.inference(
            EvalFeedConvertor.inference(reordered_batch_data))
        reward = np.array(fetch_dict['click_prob'])[:, 1]

        ### save to replay_memory
        reordered_batch_data2 = batch_data.get_reordered_keep_candidate(order)
        reordered_batch_data2.set_decode_len(batch_data.decode_len())
        replay_memory.append((reordered_batch_data2, reward))

        ### train
        memory_batch_data, reward = replay_memory[np.random.randint(
            len(replay_memory))]
        feed_dict = GenRLFeedConvertor.train_test(memory_batch_data, reward)
        fetch_dict = td_ct.train(feed_dict)

        ### logging
        list_reward.append(np.mean(reward))
        list_loss.append(np.array(fetch_dict['loss']))
        list_first_Q.append(np.mean(np.array(fetch_dict['c_Q'])[0]))
        if batch_id % 10 == 0:
            global_batch_id = epoch_id * guessed_batch_num + batch_id
            add_scalar_summary(summary_writer, global_batch_id,
                               'train/rl_reward', np.mean(list_reward))
            add_scalar_summary(summary_writer, global_batch_id,
                               'train/rl_loss', np.mean(list_loss))
            add_scalar_summary(summary_writer, global_batch_id,
                               'train/rl_1st_Q', np.mean(list_first_Q))
            list_reward = []
            list_loss = []
            list_first_Q = []

        last_batch_data = BatchData(conf, tensor_dict)
        batch_id += 1