Example #1
def bayes_optimize_zeta(seed,
                        mc_rep=1000,
                        T=50,
                        list_of_reward_betas=[[-10, 0.4, 0.4, -0.4],
                                              [-9.8, 0.6, 0.6, -0.4]],
                        context_mean=np.array([[0.0, 0.0, 0.0]]),
                        context_var=np.eye(3),
                        list_of_reward_vars=[1.0, 1.0]):

    np.random.seed(seed)

    sim_env = NormalCB(5,
                       list_of_reward_betas=list_of_reward_betas,
                       context_mean=context_mean,
                       context_var=context_var,
                       list_of_reward_vars=list_of_reward_vars)
    # env = NormalMAB(list_of_reward_mus=[0, 1], list_of_reward_vars=[1, 140])
    #  env = NormalMAB(list_of_reward_mus=[0.3, 0.6], list_of_reward_vars=[1**2, 1**2])
    # X = env.X
    # estimated_context_mean = np.mean(X, axis=0)
    # estimated_context_variance = np.cov(X, rowvar=False)
    # estimated_context_bounds = (np.min(X), np.max(X))
    # sim_env = NormalUniformCB(list_of_reward_betas=env.list_of_reward_betas, list_of_reward_vars=env.list_of_reward_vars,
    #                           context_bounds=env.context_bounds)
    # sim_env = NormalCB(list_of_reward_betas=[[-10, 0.4, 0.4, -0.4], [-9.8, 0.6, 0.6, -0.4]], context_mean=np.array([0.0, 0.0, 0.0]),
    #           context_var=np.array([[1.0,0,0], [0,1.,0], [0, 0, 1.]]), list_of_reward_vars=[1, 1])
    #  sim_env = NormalMAB(list_of_reward_mus=env.list_of_reward_mus, list_of_reward_vars=env.list_of_reward_vars)
    # Pre-simulate Monte Carlo trajectories once so every Bayesian-optimization
    # evaluation of the objective reuses the same simulated data.
    pre_simulated_data = sim_env.generate_mc_samples(mc_rep, T)
    rollout_function_kwargs = {'pre_simulated_data': pre_simulated_data}

    # def objective(zeta0, zeta1, zeta2, zeta3, zeta4, zeta5, zeta6, zeta7, zeta8, zeta9):
    #   zeta = np.array([zeta0, zeta1, zeta2, zeta3, zeta4, zeta5, zeta6, zeta7, zeta8, zeta9])

    def objective(zeta0, zeta1, zeta2):
        # Rollout value (to be maximized) of the epsilon-greedy policy under the
        # expit epsilon-decay schedule parameterized by (zeta0, zeta1, zeta2).
        zeta = np.array([zeta0, zeta1, zeta2])
        #    return rollout.mab_rollout_with_fixed_simulations(zeta, policies.mab_frequentist_ts_policy, T,
        #                                                      policies.expit_epsilon_decay, sim_env, **rollout_function_kwargs)
        return rollout.normal_cb_rollout_with_fixed_simulations(
            zeta, policies.linear_cb_epsilon_greedy_policy, T,
            policies.expit_epsilon_decay, sim_env, **rollout_function_kwargs)

    # bounds = {'zeta{}'.format(i): (0.0, 1.0) for i in range(10)}
    # explore_ = {'zeta{}'.format(i): [0.0] for i in range(10)}
    # Points to probe initially and box constraints on the three schedule parameters.
    explore_ = {
        'zeta0': [1.0, 0.05, 1.0, 0.1],
        'zeta1': [50.0, 49.0, 1.0, 49.0],
        'zeta2': [0.1, 2.5, 1.0, 2.5]
    }
    bounds = {'zeta0': (0.8, 2.0), 'zeta1': (1.0, 49.0), 'zeta2': (0.01, 2.5)}
    # Note: bo.explore and bo.res['max'] follow the older (pre-1.0) bayes_opt interface.
    bo = BayesianOptimization(objective, bounds)
    bo.explore(explore_)
    bo.maximize(init_points=10, n_iter=20, alpha=1e-4)
    best_param = bo.res['max']['max_params']
    best_param = np.array([best_param['zeta{}'.format(i)] for i in range(3)])
    return best_param
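A minimal driver for this function might look like the sketch below. It is illustrative only: the seed values and the reduced mc_rep are assumptions, and it presumes numpy, the project's NormalCB environment, the rollout and policies modules, and bayes_opt's BayesianOptimization are importable as in the original file.

# Illustrative sketch only: tune zeta for a few seeds and print the results.
if __name__ == '__main__':
    tuned = {seed: bayes_optimize_zeta(seed, mc_rep=100, T=50) for seed in range(3)}
    for seed, zeta in tuned.items():
        print(seed, zeta)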
Example #2
def fixed_pulls(num_initial_pulls):
  # Build an environment from num_initial_pulls initial observations, then tune zeta
  # against the resulting plug-in estimates of the reward model and context distribution.
  np.random.seed(num_initial_pulls)
  env = NormalCB(num_initial_pulls, list_of_reward_betas=[[-10, 0.4, 0.4, -0.4], [-9.8, 0.6, 0.6, -0.4]],
                 context_mean=np.array([0.0, 0.0, 0.0]), context_var=np.array([[1.0, 0, 0], [0, 1., 0], [0, 0, 1.]]),
                 list_of_reward_vars=[1, 1])
  p = bayes_optimize_zeta(num_initial_pulls, T=30, mc_rep=1000, list_of_reward_betas=env.beta_hat_list,
                          context_mean=env.estimated_context_mean[1:], context_var=env.estimated_context_cov[1:, 1:],
                          list_of_reward_vars=np.array(env.sigma_hat_list) ** 2)
  return {'num_initial_pulls': num_initial_pulls, 'theta_opt': [float(param) for param in p]}
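A simple sweep over initial-pull counts would exercise this helper; the particular counts below are assumptions chosen for illustration.

# Illustrative sketch only.
theta_by_pulls = [fixed_pulls(N) for N in (5, 10, 15)]
print(theta_by_pulls)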
Example #3
File: rollout.py  Project: lwu9/bayesRL
def mHealth_rollout(tuning_function_parameter, policy, time_horizon, estimated_context_mean,
                    tuning_function, estimated_context_variance, env, nPatients, monte_carlo_reps):

  score = 0
  rollout_env = NormalCB(list_of_reward_betas=env.beta_hat_list, list_of_reward_vars=env.sigma_hat_list,
                         context_mean=estimated_context_mean, context_var=estimated_context_variance)
  for rep in range(monte_carlo_reps):
    rollout_env.reset()
    episode_score = 0

    # Initial assignments
    for t in range(10):
      for j in range(5):
        rollout_env.step(0)
      for j in range(5):
        rollout_env.step(1)

    for time in range(time_horizon):
      beta_hat = rollout_env.beta_hat_list
      sampling_cov_list = rollout_env.sampling_cov_list
      for j in range(nPatients):
        # Draw context and take action
        # context = context_sequence[time - current_time][j]
        action = policy(beta_hat, sampling_cov_list, rollout_env.curr_context, tuning_function,
                        tuning_function_parameter, time_horizon, time, env)
        expected_reward = rollout_env.expected_reward(action, rollout_env.curr_context)
        optimal_expected_reward = np.max([rollout_env.expected_reward(a, rollout_env.curr_context)
                                          for a in range(rollout_env.number_of_actions)])
        rollout_env.step(action)

        # Accumulate the (non-positive) negative regret, so larger episode scores are better.
        regret = (expected_reward - optimal_expected_reward)
        episode_score += regret

    print(rep)
    # Running (incremental) mean of episode scores across Monte Carlo replicates.
    score += (episode_score - score) / (rep + 1)
  return score
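The final update inside the replicate loop is an incremental (running) mean of the per-replicate episode scores. The small self-contained check below, independent of the rollout code, verifies that the update m += (x_k - m) / (k + 1) reproduces np.mean.

import numpy as np

x = np.random.randn(100)
m = 0.0
for k, xk in enumerate(x):
    m += (xk - m) / (k + 1)   # running mean after k + 1 samples
assert np.isclose(m, np.mean(x))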
Example #4
def episode(policy_name,
            label,
            n_patients=15,
            list_of_reward_betas=[[-10, 0.4, 0.4, -0.4],
                                  [-9.8, 0.6, 0.6, -0.4]],
            context_mean=np.array([0.0, 0.0, 0.0]),
            context_var=np.array([[1.0, 0, 0], [0, 1., 0], [0, 0, 1.]]),
            list_of_reward_vars=[1, 1],
            T=50,
            mc_replicates=1000,
            pre_simulate=True):
    np.random.seed(label)

    # ToDo: Create policy class that encapsulates this behavior
    posterior_sample = True
    bootstrap_posterior = False
    positive_zeta = False
    if policy_name == 'eps':
        tuning_function = lambda a, b, c: 0.1  # Constant epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'random':
        tuning_function = lambda a, b, c: 1.0  # Constant epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'eps-decay-fixed':
        tuning_function = tuned_bandit.expit_epsilon_decay
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = np.array([0.8, 46.38, 1.857])
    elif policy_name == 'eps-decay':
        tuning_function = tuned_bandit.expit_epsilon_decay
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = True
        explore_ = {
            'zeta0': [1.0, 0.05, 1.0, 0.1],
            'zeta1': [30.0, 0.0, 1.0, 0.0],
            'zeta2': [0.1, 1.0, 0.01, 1.0]
        }
        bounds = {
            'zeta0': (0.025, 2.0),
            'zeta1': (0.0, 30.0),
            'zeta2': (0.01, 2)
        }
        tuning_function_parameter = np.array([0.05, 1.0, 0.01])
        posterior_sample = True
    elif policy_name == 'greedy':
        tuning_function = lambda a, b, c: 0.00  # Constant epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'worst':
        tuning_function = lambda a, b, c: 0.00
        policy = ref.linear_cb_worst_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'ts':
        tuning_function = lambda a, b, c: 1.0  # No shrinkage
        policy = tuned_bandit.linear_cb_thompson_sampling_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'ts-decay-posterior-sample':
        tuning_function = tuned_bandit.stepwise_linear_epsilon
        policy = tuned_bandit.linear_cb_thompson_sampling_policy
        tune = True
        tuning_function_parameter = np.ones(10) * 0.1
        posterior_sample = True
    elif policy_name == 'ts-decay-bootstrap-sample':
        tuning_function = tuned_bandit.stepwise_linear_epsilon
        policy = tuned_bandit.linear_cb_thompson_sampling_policy
        tune = True
        tuning_function_parameter = np.ones(10) * 0.1
        posterior_sample = True
        bootstrap_posterior = True
    elif policy_name == 'ts-decay':
        tuning_function = tuned_bandit.stepwise_linear_epsilon
        policy = tuned_bandit.linear_cb_thompson_sampling_policy
        tune = True
        tuning_function_parameter = np.ones(10) * 0.1
    elif policy_name == 'ucb-tune-posterior-sample':
        tuning_function = tuned_bandit.stepwise_linear_epsilon
        policy = tuned_bandit.linear_cb_ucb_policy
        tune = True
        tuning_function_parameter = np.ones(10) * 0.025
        posterior_sample = True
    # elif policy_name == 'ts-shrink':
    #   tuning_function = tuned_bandit.expit_truncate
    #   policy = tuned_bandit.thompson_sampling_policy
    #   tune = True
    #   tuning_function_parameter = np.array([-2, 1])
    else:
        raise ValueError('Incorrect policy name')

    env = NormalCB(list_of_reward_betas=list_of_reward_betas,
                   context_mean=context_mean,
                   context_var=context_var,
                   list_of_reward_vars=list_of_reward_vars)
    #  env = NormalUniformCB(list_of_reward_betas=[np.ones(10) + 0.05, np.ones(10)], list_of_reward_vars=[0.01, 25])
    cumulative_regret = 0.0
    # env.reset()
    tuning_parameter_sequence = []
    rewards = []
    actions = []

    # Using pre-simulated data
    # data_for_episode = env.generate_mc_samples(1, T)
    # rep_dict = data_for_episode[0]
    # initial_linear_model = rep_dict['initial_linear_model']
    # beta_hat_list = initial_linear_model['beta_hat_list']
    # Xprime_X_list = initial_linear_model['Xprime_X_list']
    # Xprime_X_inv_list = initial_linear_model['Xprime_X_inv_list']
    # X_list = initial_linear_model['X_list']
    # y_list = initial_linear_model['y_list']
    # X_dot_y_list = initial_linear_model['X_dot_y_list']
    # sampling_cov_list = initial_linear_model['sampling_cov_list']
    # sigma_hat_list = initial_linear_model['sigma_hat_list']

    # context_sequence = rep_dict['contexts']
    # regrets_sequence = rep_dict['regrets']
    # rewards_sequence = rep_dict['rewards']

    for t in range(T):
        X = env.X
        estimated_context_mean = np.mean(X, axis=0)
        estimated_context_variance = np.cov(X, rowvar=False)
        estimated_context_bounds = (np.min(X), np.max(X[:, 1:]))

        if tune:
            if pre_simulate:
                if posterior_sample:
                    gen_model_parameters = []
                    for rep in range(mc_replicates):
                        if bootstrap_posterior:
                            # Bootstrap posterior sampling is not implemented in this excerpt.
                            pass
                        else:
                            draws = env.sample_from_posterior()
                            # draws = env.sample_from_sampling_dist()
                        betas_for_each_action = []
                        vars_for_each_action = []
                        for a in range(env.number_of_actions):
                            beta_a = draws[a]['beta_draw']
                            var_a = draws[a]['var_draw']
                            betas_for_each_action.append(beta_a)
                            vars_for_each_action.append(var_a)
                        param_dict = {
                            'reward_betas': betas_for_each_action,
                            'reward_vars': vars_for_each_action,
                            'context_mean': draws['context_mu_draw'],
                            'context_var': draws['context_var_draw']
                        }
                        #                          'context_max': draws['context_max']}
                        gen_model_parameters.append(param_dict)
                else:
                    gen_model_parameters = None


                # sim_env = NormalUniformCB(list_of_reward_betas=env.beta_hat_list, list_of_reward_vars=env.sigma_hat_list,
                #                           context_bounds=estimated_context_bounds)
                sim_env = NormalCB(list_of_reward_betas=list_of_reward_betas,
                                   context_mean=context_mean,
                                   context_var=context_var,
                                   list_of_reward_vars=list_of_reward_vars)
                pre_simulated_data = sim_env.generate_mc_samples(
                    mc_replicates,
                    T,
                    n_patients=n_patients,
                    gen_model_params=gen_model_parameters)
                tuning_function_parameter = opt.bayesopt(
                    rollout.normal_cb_rollout_with_fixed_simulations,
                    policy,
                    tuning_function,
                    tuning_function_parameter,
                    T,
                    sim_env,
                    mc_replicates, {'pre_simulated_data': pre_simulated_data},
                    bounds,
                    explore_,
                    positive_zeta=positive_zeta)
                tuning_parameter_sequence.append(
                    [float(z) for z in tuning_function_parameter])
            else:
                # NOTE: linear_model_results is not defined in this excerpt of the source.
                tuning_function_parameter = tuned_bandit.random_search(
                    tuned_bandit.oracle_rollout, policy, tuning_function,
                    tuning_function_parameter, linear_model_results, T, t,
                    estimated_context_mean, estimated_context_variance, env)

        # Assign an action to each patient at this time step using the posterior mean
        # coefficients for each arm.
        for patient in range(n_patients):
            x = copy.copy(env.curr_context)
            beta_hat = np.array([
                env.posterior_params_dict[a]['beta_post']
                for a in range(env.number_of_actions)
            ])
            # print(env.posterior_params_dict)
            action = policy(beta_hat, env.sampling_cov_list, x,
                            tuning_function, tuning_function_parameter, T, t,
                            env)
            res = env.step(action)
            cumulative_regret += -env.regret(action, x)
            actions.append(action)
            u = res['Utility']
            rewards.append(u)
        print(beta_hat)

        if t == 0:
            # Only the first time step is executed before returning.
            break
    return {
        'cumulative_regret': cumulative_regret,
        'zeta_sequence': tuning_parameter_sequence,
        'rewards': rewards,
        'actions': actions
    }
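A hedged sketch of how this episode function might be driven: run a few labels for one of the untuned policies and average the cumulative regret. The policy name, labels, and shortened horizon below are illustrative choices, not values from the original source.

# Illustrative sketch only.
results = [episode('eps', label, T=10) for label in range(3)]
mean_regret = float(np.mean([r['cumulative_regret'] for r in results]))
print('mean cumulative regret over 3 replicates:', mean_regret)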
Example #5
File: mHealth.py  Project: lwu9/bayesRL
def episode(policy_name,
            label,
            save=False,
            points_per_grid_dimension=50,
            monte_carlo_reps=100):
    if save:
        base_name = 'mhealth-{}-{}'.format(label, policy_name)
        prefix = os.path.join(project_dir, 'src', 'run', 'results', base_name)
        suffix = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
        filename = '{}_{}.yml'.format(prefix, suffix)

    np.random.seed(label)
    T = 10

    # ToDo: Create policy class that encapsulates this behavior
    if policy_name == 'eps':
        tuning_function = lambda a, b, c: 0.05  # Constant epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'eps-decay-fixed':
        tuning_function = lambda a, t, c: 0.5 / (t + 1)
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'eps-decay':
        tuning_function = tuned_bandit.stepwise_linear_epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = True
        tuning_function_parameter = np.ones(10) * 0.025
    elif policy_name == 'greedy':
        tuning_function = lambda a, b, c: 0.00  # Constant epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'worst':
        tuning_function = lambda a, b, c: 0.00
        policy = ref.linear_cb_worst_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'ts':
        tuning_function = lambda a, b, c: 1.0  # No shrinkage
        policy = tuned_bandit.linear_cb_thompson_sampling_policy
        tune = False
        tuning_function_parameter = None
    # elif policy_name == 'ts-shrink':
    #   tuning_function = tuned_bandit.expit_truncate
    #   policy = tuned_bandit.thompson_sampling_policy
    #   tune = True
    #   tuning_function_parameter = np.array([-2, 1])
    else:
        raise ValueError('Incorrect policy name')

    env = NormalCB(
        list_of_reward_betas=[np.array([1.0, 1.0]),
                              np.array([2.0, -2.0])])
    cumulative_regret = 0.0
    nPatients = 10
    env.reset()

    # Initial assignments
    for t in range(10):
        for j in range(5):
            env.step(0)
        for j in range(5):
            env.step(1)

    for t in range(T):
        X = env.X
        estimated_context_mean = np.mean(X, axis=0)
        estimated_context_variance = np.cov(X, rowvar=False)
        if tune:
            # Re-tune the epsilon schedule via Bayesian optimization over mHealth rollouts.
            tuning_function_parameter = opt.bayesopt(
                rollout.mHealth_rollout, policy, tuning_function,
                tuning_function_parameter, T, estimated_context_mean,
                estimated_context_variance, env, nPatients,
                points_per_grid_dimension, monte_carlo_reps)
        # print('time {} epsilon {}'.format(t, tuning_function(T,t,tuning_function_parameter)))
        for j in range(nPatients):
            x = copy.copy(env.curr_context)

            beta_hat = env.beta_hat_list
            action = policy(beta_hat, env.sampling_cov_list, x,
                            tuning_function, tuning_function_parameter, T, t,
                            env)
            env.step(action)

            # Compute regret
            expected_rewards = [
                env.expected_reward(a, env.curr_context)
                for a in range(env.number_of_actions)
            ]
            expected_reward_at_action = expected_rewards[action]
            optimal_expected_reward = np.max(expected_rewards)
            regret = optimal_expected_reward - expected_reward_at_action
            cumulative_regret += regret

        # Save results
        if save:
            results = {'t': float(t), 'regret': float(cumulative_regret)}
            with open(filename, 'w') as outfile:
                yaml.dump(results, outfile)

    return cumulative_regret
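One way to compare the untuned policies defined above is to run each for the same label and collect the returned cumulative regrets. This is a sketch only; the chosen policy names and label are assumptions.

# Illustrative sketch only.
regret_by_policy = {name: episode(name, label=0) for name in ['eps', 'greedy', 'ts']}
print(regret_by_policy)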
Example #6
def episode(label,
            tuning_function_parameter,
            n_patients=1,
            list_of_reward_betas=[[-10, 0.4, 0.4, -0.4],
                                  [-9.8, 0.6, 0.6, -0.4]],
            context_mean=np.array([0.0, 0.0, 0.0]),
            context_var=np.array([[1.0, 0, 0], [0, 1., 0], [0, 0, 1.]]),
            list_of_reward_vars=[1, 1],
            T=30):

    tuning_function = tuned_bandit.expit_epsilon_decay
    policy = tuned_bandit.linear_cb_epsilon_greedy_policy

    env = NormalCB(1,
                   list_of_reward_betas=list_of_reward_betas,
                   context_mean=context_mean,
                   context_var=context_var,
                   list_of_reward_vars=list_of_reward_vars)
    #  env = NormalUniformCB(list_of_reward_betas=[np.ones(10) + 0.05, np.ones(10)], list_of_reward_vars=[0.01, 25])
    cumulative_regret = 0.0
    # env.reset()
    print('epsilon', tuning_function(T, 0, tuning_function_parameter))
    tuning_parameter_sequence = []
    rewards = []
    actions = []

    # Using pre-simulated data
    # data_for_episode = env.generate_mc_samples(1, T)
    # rep_dict = data_for_episode[0]
    # initial_linear_model = rep_dict['initial_linear_model']
    # beta_hat_list = initial_linear_model['beta_hat_list']
    # Xprime_X_list = initial_linear_model['Xprime_X_list']
    # Xprime_X_inv_list = initial_linear_model['Xprime_X_inv_list']
    # X_list = initial_linear_model['X_list']
    # y_list = initial_linear_model['y_list']
    # X_dot_y_list = initial_linear_model['X_dot_y_list']
    # sampling_cov_list = initial_linear_model['sampling_cov_list']
    # sigma_hat_list = initial_linear_model['sigma_hat_list']

    # context_sequence = rep_dict['contexts']
    # regrets_sequence = rep_dict['regrets']
    # rewards_sequence = rep_dict['rewards']

    # No tuning in this variant: the given epsilon-decay parameter is applied over the whole horizon.
    for t in range(T):
        X = env.X

        for patient in range(n_patients):
            x = copy.copy(env.curr_context)
            beta_hat = np.array([
                env.posterior_params_dict[a]['beta_post']
                for a in range(env.number_of_actions)
            ])
            # print(env.posterior_params_dict)
            action = policy(beta_hat, env.sampling_cov_list, x,
                            tuning_function, tuning_function_parameter, T, t,
                            env)
            res = env.step(action)
            cumulative_regret += -env.regret(action, x)
            actions.append(int(action))
            u = res['Utility']
            rewards.append(float(u))

    return {
        'cumulative_regret': float(cumulative_regret),
        'rewards': rewards,
        'actions': actions
    }
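For illustration, this variant could be called with a fixed expit epsilon-decay parameter, for example the vector used for the 'eps-decay-fixed' policy in Example #4. The call below is a sketch, not part of the original file.

# Illustrative sketch only; zeta taken from the 'eps-decay-fixed' setting in Example #4.
zeta_fixed = np.array([0.8, 46.38, 1.857])
out = episode(label=0, tuning_function_parameter=zeta_fixed, T=30)
print(out['cumulative_regret'])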
Example #7
#    for i in range(96):
#      tuning_function_parameter = doc['zeta_sequences'][i][t]
##      print(i, t, tuning_function_parameter)
#      res = episode('eps-decay-fixed', 0, tuning_function_parameter=tuning_function_parameter, T=50)
##      print(i, t, res['cumulative_regret'])
#      cumulative_regret += (res['cumulative_regret'] - cumulative_regret)/(i+1)
#      cumulative_regret_se = np.append(cumulative_regret_se, cumulative_regret)
#    cumulative_regret_different_t = np.append(cumulative_regret_different_t, cumulative_regret)
#    cumulative_regret_se_different_t = np.append(cumulative_regret_se_different_t, np.std(cumulative_regret_se)/np.sqrt(96))
#    print(t, cumulative_regret_different_t, cumulative_regret_se_different_t)
best_para = dict()
# For each number of initial pulls N, estimate the reward model and context distribution
# from the initial data, then tune zeta by Bayesian optimization against those plug-in estimates.
for N in [5, 10, 15, 20, 25]:
    env = NormalCB(num_initial_pulls=N,
                   list_of_reward_betas=[[-10, 0.4, 0.4, -0.4],
                                         [-9.8, 0.6, 0.6, -0.4]],
                   context_mean=np.array([0.0, 0.0, 0.0]),
                   context_var=np.array([[1.0, 0, 0], [0, 1., 0], [0, 0, 1.]]),
                   list_of_reward_vars=[1, 1])
    sigma_sq_hat_list = [env.sigma_hat_list[a] ** 2 for a in range(2)]
    p = bayes_optimize_zeta(0,
                            num_initial_pulls=N,
                            list_of_reward_betas=env.beta_hat_list,
                            context_mean=np.mean(env.X[:, -3:], axis=0),
                            context_var=np.cov(env.X[:, -3:], rowvar=False),
                            list_of_reward_vars=sigma_sq_hat_list,
                            mc_rep=1000,
                            T=50)
    print(p)
    best_para[str(N)] = p
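If the tuned parameters collected in best_para were to be written out with yaml.dump, as Example #5 does for its results, the numpy arrays would first need to be converted to plain floats. The sketch below shows one way to do that; the output filename is an assumption.

# Illustrative sketch only.
import yaml
with open('best_zeta_by_num_initial_pulls.yml', 'w') as outfile:  # filename assumed
    yaml.dump({k: [float(z) for z in v] for k, v in best_para.items()}, outfile)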