Example #1
import numpy as np

# NormalCB (the contextual-bandit environment class) is assumed to be importable
# from the project's environment module.


def mHealth_rollout(tuning_function_parameter, policy, time_horizon, estimated_context_mean,
                    tuning_function, estimated_context_variance, env, nPatients, monte_carlo_reps):

  score = 0
  rollout_env = NormalCB(list_of_reward_betas=env.beta_hat_list, list_of_reward_vars=env.sigma_hat_list,
                         context_mean=estimated_context_mean, context_var=estimated_context_variance)
  for rep in range(monte_carlo_reps):
    rollout_env.reset()
    episode_score = 0

    # Initial assignments: assign 5 patients to each arm at each of the first 10 time steps
    for t in range(10):
      for j in range(5):
        rollout_env.step(0)
      for j in range(5):
        rollout_env.step(1)

    for time in range(time_horizon):
      beta_hat = rollout_env.beta_hat_list
      sampling_cov_list = rollout_env.sampling_cov_list
      for j in range(nPatients):
        # Draw context and take action
        # context = context_sequence[time - current_time][j]
        action = policy(beta_hat, sampling_cov_list, rollout_env.curr_context, tuning_function,
                        tuning_function_parameter, time_horizon, time, env)
        expected_reward = rollout_env.expected_reward(action, rollout_env.curr_context)
        optimal_expected_reward = np.max([rollout_env.expected_reward(a, rollout_env.curr_context)
                                          for a in range(rollout_env.number_of_actions)])
        rollout_env.step(action)

        # Accumulate the (negative) regret of the chosen action, so a higher
        # episode_score corresponds to lower regret
        regret = (expected_reward - optimal_expected_reward)
        episode_score += regret

    print(rep)  # Progress indicator over Monte Carlo replications
    # Incrementally update the running mean of episode scores
    score += (episode_score - score) / (rep + 1)
  return score
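
For context, mHealth_rollout is the objective that opt.bayesopt evaluates in Example #2. A direct call might look like the following sketch; the argument values are copied from Example #2, and tuned_bandit and NormalCB are assumed to be importable from the project.

env = NormalCB(list_of_reward_betas=[np.array([1.0, 1.0]), np.array([2.0, -2.0])])
env.reset()
for _ in range(10):  # warm start so beta_hat_list, sigma_hat_list and X are populated
  for _ in range(5):
    env.step(0)
  for _ in range(5):
    env.step(1)

mean_score = mHealth_rollout(
    tuning_function_parameter=np.ones(10) * 0.025,
    policy=tuned_bandit.linear_cb_epsilon_greedy_policy,
    time_horizon=10,
    estimated_context_mean=np.mean(env.X, axis=0),
    tuning_function=tuned_bandit.stepwise_linear_epsilon,
    estimated_context_variance=np.cov(env.X, rowvar=False),
    env=env,
    nPatients=10,
    monte_carlo_reps=100)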
Example #2
import copy
import datetime
import os

import numpy as np
import yaml

# The project-level modules referenced below (tuned_bandit, ref, opt, rollout,
# NormalCB) and project_dir are assumed to be imported from the surrounding package.


def episode(policy_name,
            label,
            save=False,
            points_per_grid_dimension=50,
            monte_carlo_reps=100):
    if save:
        base_name = 'mhealth-{}-{}'.format(label, policy_name)
        prefix = os.path.join(project_dir, 'src', 'run', 'results', base_name)
        suffix = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
        filename = '{}_{}.yml'.format(prefix, suffix)

    np.random.seed(label)
    T = 10

    # ToDo: Create policy class that encapsulates this behavior
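    # Each branch below fixes three things: the exploration schedule (tuning_function),
    # the policy that consumes it, and whether the schedule's parameters are tuned
    # online (tune / tuning_function_parameter).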
    if policy_name == 'eps':
        tuning_function = lambda a, b, c: 0.05  # Constant epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'eps-decay-fixed':
        tuning_function = lambda a, t, c: 0.5 / (t + 1)
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'eps-decay':
        tuning_function = tuned_bandit.stepwise_linear_epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = True
        tuning_function_parameter = np.ones(10) * 0.025
    elif policy_name == 'greedy':
        tuning_function = lambda a, b, c: 0.00  # Constant epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'worst':
        tuning_function = lambda a, b, c: 0.00
        policy = ref.linear_cb_worst_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'ts':
        tuning_function = lambda a, b, c: 1.0  # No shrinkage
        policy = tuned_bandit.linear_cb_thompson_sampling_policy
        tune = False
        tuning_function_parameter = None
    # elif policy_name == 'ts-shrink':
    #   tuning_function = tuned_bandit.expit_truncate
    #   policy = tuned_bandit.thompson_sampling_policy
    #   tune = True
    #   tuning_function_parameter = np.array([-2, 1])
    else:
        raise ValueError('Incorrect policy name')

    env = NormalCB(
        list_of_reward_betas=[np.array([1.0, 1.0]),
                              np.array([2.0, -2.0])])
    cumulative_regret = 0.0
    nPatients = 10
    env.reset()

    # Initial assignments: warm-start by assigning 5 patients to each arm at each
    # of the first 10 time steps
    for t in range(10):
        for j in range(5):
            env.step(0)
        for j in range(5):
            env.step(1)

    for t in range(T):
        X = env.X
        estimated_context_mean = np.mean(X, axis=0)
        estimated_context_variance = np.cov(X, rowvar=False)
        if tune:
            tuning_function_parameter = opt.bayesopt(
                rollout.mHealth_rollout, policy, tuning_function,
                tuning_function_parameter, T, estimated_context_mean,
                estimated_context_variance, env, nPatients,
                points_per_grid_dimension, monte_carlo_reps)
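            # opt.bayesopt tunes tuning_function_parameter by optimizing the Monte
            # Carlo rollout score returned by rollout.mHealth_rollout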
        # print('time {} epsilon {}'.format(t, tuning_function(T,t,tuning_function_parameter)))
        for j in range(nPatients):
            x = copy.copy(env.curr_context)

            beta_hat = env.beta_hat_list
            action = policy(beta_hat, env.sampling_cov_list, x,
                            tuning_function, tuning_function_parameter, T, t,
                            env)
            env.step(action)

            # Compute regret at the context x on which the action was taken
            # (env.curr_context has already advanced after env.step)
            expected_rewards = [
                env.expected_reward(a, x)
                for a in range(env.number_of_actions)
            ]
            expected_reward_at_action = expected_rewards[action]
            optimal_expected_reward = np.max(expected_rewards)
            regret = optimal_expected_reward - expected_reward_at_action
            cumulative_regret += regret

        # Save results (overwritten each time step with the latest cumulative regret)
        if save:
            results = {'t': float(t), 'regret': float(cumulative_regret)}
            with open(filename, 'w') as outfile:
                yaml.dump(results, outfile)

    return cumulative_regret
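
A hypothetical way to run a few of the policies side by side (label doubles as the random seed and the results-filename tag; with save=True, project_dir and the results directory must exist):

if __name__ == '__main__':
    for name in ('eps', 'eps-decay', 'ts'):
        regret = episode(name, label=0, save=False, monte_carlo_reps=100)
        print('{}: cumulative regret {:.2f}'.format(name, regret))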