예제 #1
0
def main_calculate_credit_variance(args, credit_type):
    """
    Calculate variance of credit by varying following items.

    One base batch is drawn from the training data. For every position `pos`
    (up to the first sequence length of the base batch), up to `n_vary` further
    batches of the same batch size are used as references: the base batch is
    combined with each reference via `replace_following_items(pos + 1, ...)`,
    clicks are sampled from the simulator's `click_prob`, and the credit at
    `pos` is computed with `generate_credit_one_batch`. The spread of that
    credit across the references is printed.

    args: namespace; reads gen_type (must be 'env'), sim_exp, env_exp,
        use_cuda, train_mode, sim_cell_type, env_output_type, output_dim,
        train_npz_list and credit_gamma.
    credit_type: forwarded to `generate_credit_one_batch` and printed.

    Raises AssertionError if `args.gen_type` is not 'env' or a loaded model
    reports `ckp_step <= 0`; raises ValueError if no usable reference batch is
    left for some position.
    """
    def sampling(click_prob):
        """
        Draw one class index per row, using the row as the probabilities.

        click_prob: (n, n_class)
        returns: int64 array of shape (n,)
        """
        n_class = click_prob.shape[1]
        draws = [np.random.choice(n_class, 1, p=p) for p in click_prob]
        return np.array(draws, dtype=np.int64).reshape([-1])

    assert args.gen_type == 'env'
    ct_sim = get_ct_sim(args.sim_exp, args.use_cuda, args.train_mode, args.sim_cell_type, args.output_dim)
    assert ct_sim.ckp_step > 0, (ct_sim.ckp_step)
    ct_env = get_ct_env(args.env_exp, args.use_cuda, args.train_mode, args.env_output_type, args.output_dim)
    assert ct_env.ckp_step > 0, (ct_env.ckp_step)

    ### dataset
    sim_conf = ct_sim.alg.model.conf
    dataset = NpzDataset(args.train_npz_list,
                         sim_conf.npz_config_path,
                         sim_conf.requested_names,
                         if_random_shuffle=True,
                         one_pass=True)
    data_gen = dataset.get_data_generator(sim_conf.batch_size)
    thread_data_gen = threaded_generator(data_gen, capacity=100)

    n_vary = 64
    # Built-in next() instead of the Python 2-only `.next()` method.
    base_batch_data = BatchData(sim_conf, next(thread_data_gen))
    batch_size = base_batch_data.batch_size()
    batch_credits = []
    for pos in range(base_batch_data.seq_lens()[0]):
        list_credits = []
        for tensor_dict in thread_data_gen:
            ref_batch_data = BatchData(sim_conf, tensor_dict)
            if ref_batch_data.batch_size() != batch_size:
                # Only references with the base batch size are used.
                continue
            mix_batch_data = base_batch_data.replace_following_items(pos + 1, ref_batch_data)
            sim_fetch_dict = ct_sim.inference(SimFeedConvertor.inference(mix_batch_data))
            sim_response = sampling(np.array(sim_fetch_dict['click_prob'])).reshape([-1, 1]).astype('int64')
            mix_batch_data.tensor_dict['click_id'] = FakeTensor(sim_response, mix_batch_data.seq_lens())
            credit = generate_credit_one_batch(ct_env,
                                               mix_batch_data,
                                               credit_type=credit_type,
                                               credit_gamma=args.credit_gamma,
                                               globbase=None)
            credit = credit.reshape([batch_size, -1])
            list_credits.append(credit[:, pos].reshape(-1, 1))
            # Stop right after the last needed sample so that no extra batch
            # is pulled from the one-pass generator and thrown away.
            if len(list_credits) == n_vary:
                break
        if not list_credits:
            raise ValueError(
                'no reference batch with batch_size %d available for position %d' % (batch_size, pos))
        list_credits = np.concatenate(list_credits, 1)  # (b, n_vary)
        batch_credits.append(list_credits)
    batch_credits = np.concatenate(batch_credits, 0)    # (seq_len*b, n_vary)
    print(credit_type)
    print(batch_credits.shape)
    # NOTE: the reported number is the mean of per-row standard deviations.
    print('(s,a)-wise credit variance', np.mean(np.std(batch_credits, 1)))
예제 #2
0
def main_batch_rl(args):
    """
    Include online inference and offline training.

    Loads the simulator and, depending on `args.gen_type`, the env / credit /
    rl / ddpg models into a dict keyed by 'env', 'credit', 'rl' and 'ddpg'.
    Then, for 50 epochs, every training batch is passed to `offline_training`
    (batches of size 1 are skipped) and `online_inference_for_test` is called
    once the epoch's batches are done.

    args: namespace; reads gen_type, the *_exp names, use_cuda, train_mode,
        the model-specific options, train_npz_list and summary_dir.

    Raises AssertionError if the simulator reports `ckp_step <= 0`, or if
    gen_type is 'env_rl' while `env_output_type` is not 'click'.
    """
    ct_sim = get_ct_sim(args.sim_exp, args.use_cuda, args.train_mode, args.sim_cell_type, args.output_dim)
    assert ct_sim.ckp_step > 0, (ct_sim.ckp_step)

    gen_type = args.gen_type
    dict_gen_ct = {}
    if gen_type in ('env', 'env_credit', 'env_rl'):
        if gen_type == 'env_rl':
            assert args.env_output_type == 'click', \
                ('env_rl only support click env, which will be used as a simulator', args.env_output_type)
        ct_env = get_ct_env(args.env_exp, args.use_cuda, args.train_mode, args.env_output_type, args.output_dim)
        dict_gen_ct['env'] = ct_env
    if gen_type in ('env_credit', 'mc_credit'):
        dict_gen_ct['credit'] = get_ct_credit(
            args.credit_exp, args.use_cuda, args.train_mode, args.credit_scale)
    if gen_type in ('rl', 'env_rl'):
        dict_gen_ct['rl'] = get_ct_rl(
            args.rl_exp, args.use_cuda, args.train_mode, args.rl_gamma, args.rl_Q_type)
    if gen_type == 'ddpg':
        dict_gen_ct['ddpg'] = get_ct_ddpg(
            args.ddpg_exp, args.use_cuda, args.train_mode, args.ddpg_gamma)

    ### dataset
    sim_conf = ct_sim.alg.model.conf
    dataset = NpzDataset(
        args.train_npz_list,
        sim_conf.npz_config_path,
        sim_conf.requested_names,
        if_random_shuffle=True,
        one_pass=True,
    )

    # env_rl additionally reads the same npz list through the env model's config.
    if gen_type == 'env_rl':
        env_conf = ct_env.alg.model.conf
        env_dataset = NpzDataset(
            args.train_npz_list,
            env_conf.npz_config_path,
            env_conf.requested_names,
            if_random_shuffle=True,
            one_pass=False,
        )

    summary_writer = tf.summary.FileWriter(args.summary_dir)
    max_test_steps = 1000
    n_epochs = 50
    for epoch_id in range(n_epochs):
        # A fresh env generator is built each epoch, but only for env_rl.
        if args.gen_type == 'env_rl':
            thread_env_data_gen = threaded_generator(
                env_dataset.get_data_generator(env_conf.batch_size), capacity=10)
        else:
            thread_env_data_gen = None

        train_batches = threaded_generator(
            dataset.get_data_generator(sim_conf.batch_size), capacity=100)
        for batch_id, tensor_dict in enumerate(train_batches):
            batch_data = BatchData(sim_conf, tensor_dict)
            if batch_data.batch_size() == 1:    # otherwise, rl will crash
                continue
            # if_save is requested only for the first batch of the epoch.
            offline_training(args, epoch_id, [batch_data], dict_gen_ct, summary_writer,
                             if_save=(batch_id == 0), env_rl_data_gen=thread_env_data_gen)

        # Test inference runs after every epoch.
        online_inference_for_test(args, epoch_id, max_test_steps, ct_sim, dict_gen_ct, summary_writer)
예제 #3
0
def online_inference(args, epoch_id, max_steps, data_gen, ct_sim, dict_gen_ct, summary_writer, if_print=True):
    """
    Do inference for `max_steps` batches.

    One batch is read up front as the initial "last" batch. Each step then
    reads a batch, expands its candidates from the last batch, drops its
    'click_id' entry, runs `inference_one_batch`, and stores the reordered
    batch in the replay memory. Batches of size 1 are skipped.

    args: namespace; reads gen_type and infer_eps.
    epoch_id: used for the per-step random seed and for logging / summary.
    max_steps: number of batches to read from `data_gen` in the loop.
    data_gen: iterator yielding tensor dicts; `max_steps + 1` items are consumed.
    ct_sim, dict_gen_ct: forwarded to `inference_one_batch`.
    summary_writer: receives the 'inference/sim_responses' scalar when
        `if_print` is true and at least one batch was processed.
    if_print: enables progress logging and the scalar summary.

    Returns the list of reordered batches (the replay memory).
    """
    sim_conf = ct_sim.alg.model.conf

    replay_memory = []
    list_sim_responses = []
    ### online inference
    # Built-in next() instead of the Python 2-only `.next()` method.
    last_batch_data = BatchData(sim_conf, next(data_gen))
    for batch_id in range(max_steps):
        # Fixed seed per (epoch, batch) for the steps up to expand_candidates;
        # seed(None) afterwards re-seeds numpy's global RNG non-deterministically.
        np.random.seed(epoch_id * max_steps + batch_id)
        tensor_dict = next(data_gen)
        batch_data = BatchData(sim_conf, tensor_dict)
        batch_data.set_decode_len(batch_data.seq_lens())
        batch_data.expand_candidates(last_batch_data, batch_data.seq_lens())
        np.random.seed(None)
        del batch_data.tensor_dict['click_id']

        if batch_data.batch_size() == 1:    # otherwise, rl will crash
            continue

        orders, sim_responses = inference_one_batch(args.gen_type, ct_sim, dict_gen_ct, batch_data, eps=args.infer_eps) # , (b, decode_len)

        # save to replay memory
        sim_batch_data = batch_data.get_reordered(orders, sim_responses)
        replay_memory.append(sim_batch_data)
        list_sim_responses.append(sim_responses)
        # Rebuilt from the raw tensor dict, i.e. without the modifications above.
        last_batch_data = BatchData(sim_conf, tensor_dict)

        if batch_id % 100 == 0 and if_print:
            logging.info('inference epoch %d batch %d', epoch_id, batch_id)

    if if_print:
        if list_sim_responses:
            list_sum_response = np.sum(np.concatenate(list_sim_responses, 0), 1)    # (b,)
            add_scalar_summary(summary_writer, epoch_id, 'inference/sim_responses', np.mean(list_sum_response))
        else:
            # np.concatenate raises on an empty list; nothing to summarize.
            logging.warning('inference epoch %d produced no batches, summary skipped', epoch_id)
    return replay_memory