def main_calculate_credit_variance(args, credit_type):
    """
    Calculate the variance of credit by varying the following items.
    """
    def sampling(click_prob):
        """
        click_prob: (n, n_class)
        """
        n_class = click_prob.shape[1]
        return np.int64([np.random.choice(n_class, 1, p=p) for p in click_prob]).reshape([-1])

    assert args.gen_type == 'env'
    ct_sim = get_ct_sim(args.sim_exp, args.use_cuda, args.train_mode, args.sim_cell_type, args.output_dim)
    assert ct_sim.ckp_step > 0, (ct_sim.ckp_step)
    ct_env = get_ct_env(args.env_exp, args.use_cuda, args.train_mode, args.env_output_type, args.output_dim)
    assert ct_env.ckp_step > 0, (ct_env.ckp_step)

    ### dataset
    sim_conf = ct_sim.alg.model.conf
    dataset = NpzDataset(args.train_npz_list,
                         sim_conf.npz_config_path,
                         sim_conf.requested_names,
                         if_random_shuffle=True,
                         one_pass=True)
    data_gen = dataset.get_data_generator(sim_conf.batch_size)
    thread_data_gen = threaded_generator(data_gen, capacity=100)

    n_vary = 64
    base_batch_data = BatchData(sim_conf, thread_data_gen.next())
    batch_size = base_batch_data.batch_size()
    batch_credits = []
    for pos in range(base_batch_data.seq_lens()[0]):
        list_credits = []
        for batch_id, tensor_dict in enumerate(thread_data_gen):
            if len(list_credits) == n_vary:
                break
            ref_batch_data = BatchData(sim_conf, tensor_dict)
            if ref_batch_data.batch_size() != batch_size:
                continue
            # replace the items after `pos` with those from the reference batch
            mix_batch_data = base_batch_data.replace_following_items(pos + 1, ref_batch_data)
            sim_fetch_dict = ct_sim.inference(SimFeedConvertor.inference(mix_batch_data))
            sim_response = sampling(np.array(sim_fetch_dict['click_prob'])).reshape([-1, 1]).astype('int64')
            mix_batch_data.tensor_dict['click_id'] = FakeTensor(sim_response, mix_batch_data.seq_lens())
            credit = generate_credit_one_batch(ct_env,
                                               mix_batch_data,
                                               credit_type=credit_type,
                                               credit_gamma=args.credit_gamma,
                                               globbase=None)
            credit = credit.reshape([batch_size, -1])
            list_credits.append(credit[:, pos].reshape(-1, 1))
        list_credits = np.concatenate(list_credits, 1)    # (b, n_vary)
        batch_credits.append(list_credits)
    batch_credits = np.concatenate(batch_credits, 0)      # (seq_len*b, n_vary)

    print(credit_type)
    print(batch_credits.shape)
    print('(s,a)-wise credit variance', np.mean(np.std(batch_credits, 1)))
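# The statistic printed above is the mean, over (state, action) pairs, of the
# standard deviation across the n_vary perturbed continuations. The
# self-contained sketch below reproduces only that final computation on
# synthetic data; `batch_credits` here is a stand-in for the real
# (seq_len*b, n_vary) array assembled from generate_credit_one_batch.
def _demo_credit_variance_statistic():
    rng = np.random.RandomState(0)
    seq_len_times_b, n_vary = 32, 64
    batch_credits = rng.randn(seq_len_times_b, n_vary)  # synthetic credits
    per_sa_std = np.std(batch_credits, 1)               # std over the n_vary continuations, (seq_len*b,)
    print('(s,a)-wise credit variance', np.mean(per_sa_std))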
def main_batch_rl(args):
    """
    Include online inference and offline training.
    """
    ct_sim = get_ct_sim(args.sim_exp, args.use_cuda, args.train_mode, args.sim_cell_type, args.output_dim)
    assert ct_sim.ckp_step > 0, (ct_sim.ckp_step)

    dict_gen_ct = {}
    if args.gen_type in ['env', 'env_credit', 'env_rl']:
        if args.gen_type == 'env_rl':
            assert args.env_output_type == 'click', \
                ('env_rl only supports a click env, which will be used as a simulator', args.env_output_type)
        ct_env = get_ct_env(args.env_exp, args.use_cuda, args.train_mode, args.env_output_type, args.output_dim)
        dict_gen_ct['env'] = ct_env
    if args.gen_type in ['env_credit', 'mc_credit']:
        ct_credit = get_ct_credit(args.credit_exp, args.use_cuda, args.train_mode, args.credit_scale)
        dict_gen_ct['credit'] = ct_credit
    if args.gen_type in ['rl', 'env_rl']:
        ct_rl = get_ct_rl(args.rl_exp, args.use_cuda, args.train_mode, args.rl_gamma, args.rl_Q_type)
        dict_gen_ct['rl'] = ct_rl
    if args.gen_type == 'ddpg':
        ct_ddpg = get_ct_ddpg(args.ddpg_exp, args.use_cuda, args.train_mode, args.ddpg_gamma)
        dict_gen_ct['ddpg'] = ct_ddpg

    ### dataset
    sim_conf = ct_sim.alg.model.conf
    dataset = NpzDataset(args.train_npz_list,
                         sim_conf.npz_config_path,
                         sim_conf.requested_names,
                         if_random_shuffle=True,
                         one_pass=True)
    if args.gen_type == 'env_rl':
        # env_rl additionally needs data described by env_conf
        env_conf = ct_env.alg.model.conf
        env_dataset = NpzDataset(args.train_npz_list,
                                 env_conf.npz_config_path,
                                 env_conf.requested_names,
                                 if_random_shuffle=True,
                                 one_pass=False)

    summary_writer = tf.summary.FileWriter(args.summary_dir)
    max_test_steps = 1000
    for epoch_id in range(50):
        if args.gen_type == 'env_rl':
            env_data_gen = env_dataset.get_data_generator(env_conf.batch_size)
            thread_env_data_gen = threaded_generator(env_data_gen, capacity=10)
        else:
            thread_env_data_gen = None
        data_gen = dataset.get_data_generator(sim_conf.batch_size)
        thread_data_gen = threaded_generator(data_gen, capacity=100)
        for batch_id, tensor_dict in enumerate(thread_data_gen):
            if_save = (batch_id == 0)
            batch_data = BatchData(sim_conf, tensor_dict)
            if batch_data.batch_size() == 1:    # otherwise, rl will crash
                continue
            offline_training(args, epoch_id, [batch_data], dict_gen_ct, summary_writer,
                             if_save=if_save, env_rl_data_gen=thread_env_data_gen)
        if epoch_id % 1 == 0:   # always true: run test inference every epoch
            online_inference_for_test(args, epoch_id, max_test_steps, ct_sim, dict_gen_ct, summary_writer)
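# threaded_generator is used throughout this file to prefetch batches on a
# background thread so that data loading overlaps with training. Its actual
# implementation lives elsewhere in the repo; the sketch below is only an
# assumed equivalent: a producer thread fills a bounded queue of the given
# capacity, and a sentinel marks exhaustion. (The Queue module and .next()
# style match the Python 2 idioms used by the rest of this file.)
def threaded_generator_sketch(gen, capacity=10):
    import threading
    import Queue
    q = Queue.Queue(maxsize=capacity)
    sentinel = object()

    def producer():
        for item in gen:
            q.put(item)      # blocks while the queue is full
        q.put(sentinel)      # signal that the source generator is exhausted

    thread = threading.Thread(target=producer)
    thread.daemon = True
    thread.start()
    while True:
        item = q.get()
        if item is sentinel:
            break
        yield item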
def online_inference(args, epoch_id, max_steps, data_gen, ct_sim, dict_gen_ct, summary_writer, if_print=True):
    """
    Do inference for `max_steps` batches.
    """
    sim_conf = ct_sim.alg.model.conf

    replay_memory = []
    list_sim_responses = []
    ### online inference
    last_batch_data = BatchData(sim_conf, data_gen.next())
    for batch_id in range(max_steps):
        np.random.seed(epoch_id * max_steps + batch_id)
        tensor_dict = data_gen.next()
        batch_data = BatchData(sim_conf, tensor_dict)
        batch_data.set_decode_len(batch_data.seq_lens())
        batch_data.expand_candidates(last_batch_data, batch_data.seq_lens())
        np.random.seed(None)

        del batch_data.tensor_dict['click_id']
        if batch_data.batch_size() == 1:    # otherwise, rl will crash
            continue
        orders, sim_responses = inference_one_batch(args.gen_type, ct_sim, dict_gen_ct, batch_data,
                                                    eps=args.infer_eps)    # (b, decode_len)

        # save to replay memory
        sim_batch_data = batch_data.get_reordered(orders, sim_responses)
        replay_memory.append(sim_batch_data)
        list_sim_responses.append(sim_responses)
        last_batch_data = BatchData(sim_conf, tensor_dict)

        if batch_id % 100 == 0 and if_print:
            logging.info('inference epoch %d batch %d' % (epoch_id, batch_id))

    if if_print:
        list_sum_response = np.sum(np.concatenate(list_sim_responses, 0), 1)    # (b,)
        add_scalar_summary(summary_writer, epoch_id, 'inference/sim_responses',
                           np.mean(list_sum_response))
    return replay_memory
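# add_scalar_summary is defined elsewhere in the repo. Since this file creates
# a TF1-style tf.summary.FileWriter, a minimal assumed version would wrap the
# value in a tf.Summary protobuf, as sketched below.
def add_scalar_summary_sketch(summary_writer, step, tag, value):
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=float(value))])
    summary_writer.add_summary(summary, global_step=step)
    summary_writer.flush()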