Example #1: epsilon-greedy bandit agent
def run(env_name='BanditOneHot2-v0',
        num_episodes=1,
        epsilon=0.1,
        epsilon_decay_tau=0,
        lr_R=.1,
        master_seed=42,
        write_to_disk=True,
        load=None,
        log_dir=None):
    """Play some slots!"""

    # --- Init ---
    writer = SummaryWriter(log_dir=log_dir, write_to_disk=write_to_disk)

    # -
    env = gym.make(env_name)
    env.seed(master_seed)
    num_actions = env.action_space.n
    best_action = env.best

    # -
    default_reward_value = 0.0
    R_t = default_reward_value
    critic = Critic(num_actions, default_value=default_reward_value)
    actor = EpsilonActor(num_actions,
                         epsilon=epsilon,
                         decay_tau=epsilon_decay_tau,
                         seed_value=master_seed)
    all_actions = list(range(num_actions))

    # Update with pre-loaded data. This lets you run test
    # experiments on a pre-trained model and/or continue
    # training.
    if load is not None:
        result = load_checkpoint(load)
        critic.load_state_dict(result['critic'])

    # -
    num_best = 0
    total_R = 0.0
    total_regret = 0.0

    # ------------------------------------------------------------------------
    for n in range(num_episodes):
        env.reset()

        # Choose an action, i.e. pick a bandit arm
        action = actor(list(critic.model.values()))
        if action in best_action:
            num_best += 1

        # Est. regret and save it
        regret = estimate_regret(all_actions, action, critic)

        # Pull a lever.
        state, R_t, _, _ = env.step(action)

        # Critic learns
        critic = Q_update(action, R_t, critic, lr_R)

        # Log data
        writer.add_scalar("state", int(state), n)
        writer.add_scalar("action", action, n)
        writer.add_scalar("epsilon", actor.epsilon, n)
        writer.add_scalar("regret", regret, n)
        writer.add_scalar("score_R", R_t, n)
        writer.add_scalar("value_R", critic(action), n)

        total_R += R_t
        total_regret += regret
        writer.add_scalar("total_regret", total_regret, n)
        writer.add_scalar("total_R", total_R, n)
        writer.add_scalar("p_bests", num_best / (n + 1), n)

        # Decay epsilon?
        if epsilon_decay_tau > 0:
            actor.decay_epsilon()

    # --- Build the final result, and save or return it ---
    writer.close()

    result = dict(best=env.best,
                  env_name=env_name,
                  num_episodes=num_episodes,
                  lr_R=lr_R,
                  epsilon=epsilon,
                  epsilon_decay_tau=epsilon_decay_tau,
                  critic=critic.state_dict(),
                  total_R=total_R,
                  total_regret=total_regret,
                  master_seed=master_seed)

    if write_to_disk:
        save_checkpoint(result,
                        filename=os.path.join(writer.log_dir, "result.pkl"))

    return result
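Example #1 relies on `Q_update`, which is defined elsewhere in the package. Below is a minimal sketch of the tabular update it presumably performs; the `critic(action)` getter and `critic.update(action, value)` setter are assumptions about the `Critic` API:

def Q_update(action, R_t, critic, lr):
    """Hypothetical sketch of a tabular value update:
    V(a) <- V(a) + lr * (R - V(a)).
    Assumes `critic(action)` returns the current estimate and
    `critic.update(action, value)` overwrites it."""
    delta = R_t - critic(action)  # prediction error
    critic.update(action, critic(action) + lr * delta)
    return critic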
Example #2: argmax (E, R) bandit agent
def run(env_name='BanditOneHot10-v0',
        num_episodes=1000,
        tie_break='next',
        tie_threshold=0.0,
        lr_R=.1,
        master_seed=42,
        initial_bins=None,
        write_to_disk=True,
        load=None,
        log_dir=None):
    """Bandit agent - argmax (E, R)"""

    # --- Init ---
    writer = SummaryWriter(log_dir=log_dir, write_to_disk=write_to_disk)

    # -
    env = gym.make(env_name)
    env.seed(master_seed)
    env.reset()

    num_actions = env.action_space.n
    all_actions = list(range(num_actions))
    best_action = env.best

    default_reward_value = 0
    default_info_value = entropy(np.ones(num_actions) / num_actions)
    E_t = default_info_value
    R_t = default_reward_value

    # - Init agents and memories
    critic_R = Critic(num_actions, default_value=default_reward_value)
    critic_E = Critic(num_actions, default_value=default_info_value)
    actor_R = DeterministicActor(num_actions,
                                 tie_break='first',
                                 tie_threshold=tie_threshold)
    actor_E = DeterministicActor(num_actions,
                                 tie_break=tie_break,
                                 tie_threshold=tie_threshold)

    memories = [
        DiscreteDistribution(initial_bins=initial_bins)
        for _ in range(num_actions)
    ]

    # Update with pre-loaded data. This lets you run test
    # experiments on a pre-trained model and/or continue
    # training.
    if load is not None:
        result = load_checkpoint(load)
        critic_E.load_state_dict(result['critic_E'])
        critic_R.load_state_dict(result['critic_R'])
        for i, mem in enumerate(memories):
            mem.load_state_dict(result['memories'][i])

    # -
    num_best = 0
    total_R = 0.0
    total_E = 0.0
    total_regret = 0.0

    # ------------------------------------------------------------------------
    for n in range(num_episodes):
        env.reset()

        # Meta-greedy policy selection
        if (E_t - tie_threshold) > R_t:
            critic = critic_E
            actor = actor_E
            policy = 0
        else:
            critic = critic_R
            actor = actor_R
            policy = 1

        # Choose an action, i.e. pick a bandit arm
        action = actor(list(critic.model.values()))
        if action in best_action:
            num_best += 1

        # Est. regret and save it
        regret = estimate_regret(all_actions, action, critic)

        # Pull a lever.
        state, R_t, _, _ = env.step(action)
        R_t = R_homeostasis(R_t, total_R, num_episodes)

        # Estimate E
        old = deepcopy(memories[action])
        memories[action].update((int(state), int(R_t)))
        new = deepcopy(memories[action])
        E_t = kl(new, old, default_info_value)

        # Learning, both policies.
        critic_R = R_update(action, R_t, critic_R, lr_R)
        critic_E = E_update(action, E_t, critic_E, lr=1)

        # Log data
        writer.add_scalar("policy", policy, n)
        writer.add_scalar("state", state, n)
        writer.add_scalar("action", action, n)
        writer.add_scalar("regret", regret, n)
        writer.add_scalar("score_E", E_t, n)
        writer.add_scalar("score_R", R_t, n)
        writer.add_scalar("value_E", critic_E(action), n)
        writer.add_scalar("value_R", critic_R(action), n)

        total_E += E_t
        total_R += R_t
        total_regret += regret
        writer.add_scalar("total_regret", total_regret, n)
        writer.add_scalar("total_E", total_E, n)
        writer.add_scalar("total_R", total_R, n)
        writer.add_scalar("p_bests", num_best / (n + 1), n)

        # Log whether the actor had to break a tie this episode
        writer.add_scalar("ties", int(actor.tied), n)

    # --- Build the final result, and save or return it ---
    writer.close()

    result = dict(best=env.best,
                  num_episodes=num_episodes,
                  tie_break=tie_break,
                  tie_threshold=tie_threshold,
                  critic_E=critic_E.state_dict(),
                  critic_R=critic_R.state_dict(),
                  memories=[m.state_dict() for m in memories],
                  total_E=total_E,
                  total_R=total_R,
                  total_regret=total_regret,
                  env_name=env_name,
                  lr_R=lr_R,
                  master_seed=master_seed)

    if write_to_disk:
        save_checkpoint(result,
                        filename=os.path.join(writer.log_dir, "result.pkl"))

    return result
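Example #2 reshapes the raw payout with `R_homeostasis` before the critics learn. One plausible reading, sketched here as an assumption: with the set point fixed at `num_episodes`, a payout is valued by how much it moves the running total toward that set point.

import numpy as np

def R_homeostasis(reward, total_reward, set_point):
    """Hypothetical sketch: reward as the reduction in distance
    between the running total and a homeostatic set point."""
    deviance_last = np.abs(set_point - total_reward)
    deviance = np.abs(set_point - (total_reward + reward))
    return deviance_last - deviance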
Example #3: softmax (E, R) bandit agent
def run(env_name='BanditOneHot10-v0',
        num_episodes=1000,
        temp=1.0,
        tie_threshold=0.0,
        tie_break=None,
        lr_R=.1,
        master_seed=42,
        write_to_disk=True,
        log_dir=None):
    """Bandit agent - softmax (E, R)"""

    # --- Init ---
    writer = SummaryWriter(log_dir=log_dir, write_to_disk=write_to_disk)

    # -
    env = gym.make(env_name)
    env.seed(master_seed)
    num_actions = env.action_space.n
    all_actions = list(range(num_actions))
    best_action = env.best

    default_reward_value = 0
    default_info_value = entropy(np.ones(num_actions) / num_actions)
    E_t = default_info_value
    R_t = default_reward_value

    # --- Agents and memories ---
    critic_R = Critic(num_actions, default_value=default_reward_value)
    critic_E = Critic(num_actions, default_value=default_info_value)
    actor_R = SoftmaxActor(num_actions, temp=temp, seed_value=master_seed)
    actor_E = SoftmaxActor(num_actions, temp=temp, seed_value=master_seed)
    memories = [DiscreteDistribution() for _ in range(num_actions)]

    # -
    num_best = 0
    total_R = 0.0
    total_E = 0.0
    total_regret = 0.0

    # ------------------------------------------------------------------------
    for n in range(num_episodes):
        env.reset()

        # Meta-greedy policy selection
        if (E_t - tie_threshold) > R_t:
            critic = critic_E
            actor = actor_E
            policy = 0
        else:
            critic = critic_R
            actor = actor_R
            policy = 1

        # Choose an action, i.e. pick a bandit arm
        action = actor(list(critic.model.values()))
        if action in best_action:
            num_best += 1

        # Est. regret and save it
        regret = estimate_regret(all_actions, action, critic)

        # Pull a lever.
        state, R_t, _, _ = env.step(action)
        R_t = R_homeostasis(R_t, total_R, num_episodes)

        # Estimate E
        old = deepcopy(memories[action])
        memories[action].update((int(state), int(R_t)))
        new = deepcopy(memories[action])
        E_t = kl(new, old, default_info_value)

        # Learning, both policies.
        critic_R = R_update(action, R_t, critic_R, lr_R)
        critic_E = E_update(action, E_t, critic_E, lr=1)

        # Log data
        writer.add_scalar("policy", policy, n)
        writer.add_scalar("state", int(state), n)
        writer.add_scalar("action", action, n)
        writer.add_scalar("regret", regret, n)
        writer.add_scalar("score_E", E_t, n)
        writer.add_scalar("score_R", R_t, n)
        writer.add_scalar("value_E", critic_E(action), n)
        writer.add_scalar("value_R", critic_R(action), n)

        total_E += E_t
        total_R += R_t
        total_regret += regret
        writer.add_scalar("total_regret", total_regret, n)
        writer.add_scalar("total_E", total_E, n)
        writer.add_scalar("total_R", total_R, n)
        writer.add_scalar("p_bests", num_best / (n + 1), n)

    # --- Build the final result, and save or return it ---
    writer.close()

    result = dict(best=env.best,
                  num_episodes=num_episodes,
                  temp=temp,
                  tie_threshold=tie_threshold,
                  critic_E=critic_E.state_dict(),
                  critic_R=critic_R.state_dict(),
                  total_E=total_E,
                  total_R=total_R,
                  total_regret=total_regret,
                  env_name=env_name,
                  lr_R=lr_R,
                  master_seed=master_seed)

    if write_to_disk:
        save_checkpoint(result,
                        filename=os.path.join(writer.log_dir, "result.pkl"))

    return result
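`SoftmaxActor` in Example #3 is used only through its call interface. Here is a self-contained stand-in for the sampling rule it presumably implements, Boltzmann exploration over the critic's values; the class name and seeding details are assumptions:

import numpy as np

class SoftmaxActorSketch:
    """Hypothetical stand-in: sample actions with p(a) ~ exp(V(a) / temp)."""

    def __init__(self, num_actions, temp=1.0, seed_value=None):
        self.num_actions = num_actions
        self.temp = temp
        self.rng = np.random.default_rng(seed_value)

    def __call__(self, values):
        z = np.asarray(values, dtype=float) / self.temp
        z -= z.max()  # subtract the max for numerical stability
        p = np.exp(z) / np.exp(z).sum()
        return int(self.rng.choice(self.num_actions, p=p))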
Example #4: count-bonus bandit agent (EB/UCB)
def run(env_name='BanditOneHigh2-v0',
        num_episodes=1,
        temp=1.0,
        beta=1.0,
        lr_R=.1,
        mode='EB',
        master_seed=42,
        write_to_disk=True,
        load=None,
        log_dir=None):
    """Bandit agent - sample(R + beta count^(-1/2))"""

    # --- Init ---
    writer = SummaryWriter(log_dir=log_dir, write_to_disk=write_to_disk)

    # -
    env = gym.make(env_name)
    env.seed(master_seed)
    num_actions = env.action_space.n
    all_actions = list(range(num_actions))
    best_action = env.best

    default_reward_value = 0  # Null R
    R_t = default_reward_value

    # Agents and memories
    critic = Critic(num_actions, default_value=default_reward_value)
    actor = SoftmaxActor(num_actions, temp=temp, seed_value=master_seed)
    count = CountMemory()

    # Update with pre-loaded data. This lets you run test
    # experiments on a pre-trained model and/or continue
    # training.
    if load is not None:
        result = load_checkpoint(load)
        critic.load_state_dict(result['critic'])
        count.load_state_dict(result['count'])

    # -
    total_R = 0.0
    total_regret = 0.0
    num_best = 0

    # ------------------------------------------------------------------------
    for n in range(num_episodes):
        env.reset()

        # Choose an action, i.e. pick a bandit arm
        action = actor(list(critic.model.values()))
        if action in best_action:
            num_best += 1

        # Est. regret and save it
        regret = estimate_regret(all_actions, action, critic)

        # Pull a lever.
        state, R_t, _, _ = env.step(action)

        # Apply count bonus
        if mode == "EB":
            count_bonus = count(action)**(-0.5)
        elif mode == "UCB":
            count_bonus = ((2 * np.log(n + 1)) / count(action))**(0.5)
        else:
            raise ValueError("mode must be EB or UCB")

        # Critic learns
        payout = R_t + (beta * count_bonus)
        critic = Q_update(action, payout, critic, lr_R)

        # Log data
        writer.add_scalar("state", int(state), n)
        writer.add_scalar("action", action, n)
        writer.add_scalar("regret", regret, n)
        writer.add_scalar("bonus", count_bonus, n)
        writer.add_scalar("score_R", R_t, n)
        writer.add_scalar("value_R", critic(action), n)

        total_R += R_t
        total_regret += regret
        writer.add_scalar("total_regret", total_regret, n)
        writer.add_scalar("total_R", total_R, n)
        writer.add_scalar("p_bests", num_best / (n + 1), n)

    # --- Build the final result, and save or return it ---
    writer.close()

    result = dict(best=env.best,
                  beta=beta,
                  temp=temp,
                  mode=mode,
                  env_name=env_name,
                  num_episodes=num_episodes,
                  critic=critic.state_dict(),
                  count=count.state_dict(),
                  total_R=total_R,
                  total_regret=total_regret,
                  master_seed=master_seed)

    if write_to_disk:
        save_checkpoint(result,
                        filename=os.path.join(writer.log_dir, "result.pkl"))

    return result
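The two bonuses in Example #4 decay differently with the visit count. A standalone restatement of the arithmetic, using no project code:

import numpy as np

def exploration_bonus(mode, step, visits):
    """EB: visits^(-1/2); UCB: sqrt(2 * ln(step + 1) / visits)."""
    if mode == "EB":
        return visits**(-0.5)
    if mode == "UCB":
        return np.sqrt(2 * np.log(step + 1) / visits)
    raise ValueError("mode must be EB or UCB")

# For example, after 10 visits at step 100:
# EB ~ 0.32 (independent of step), UCB ~ 0.96 (grows slowly with step).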
Example #5: curiosity-only (E) bandit agent
def run(env_name='InfoBlueYellow4b-v0',
        num_episodes=1,
        lr_E=1.0,
        actor='DeterministicActor',
        initial_count=1,
        initial_bins=None,
        initial_noise=0.0,
        master_seed=None,
        env_seed=None,
        actor_seed=None,
        critic_seed=None,
        reward_mode=False,
        log_dir=None,
        write_to_disk=True,
        **actor_kwargs):
    """Play some slots!"""

    # --- Init ---
    writer = SummaryWriter(log_dir=log_dir, write_to_disk=write_to_disk)

    # -
    if master_seed is not None:
        env_seed = master_seed
        critic_seed = master_seed
        actor_seed = master_seed

    # -
    env = gym.make(env_name)
    env.seed(env_seed)
    num_actions = env.action_space.n
    best_action = env.best
    default_info_value = entropy(np.ones(num_actions) / num_actions)
    E_t = default_info_value

    # --- Agents and memories ---
    # Critic
    all_actions = list(range(num_actions))
    critic_E = NoisyCritic(num_actions,
                           default_value=default_info_value,
                           default_noise_scale=initial_noise,
                           seed_value=critic_seed)

    # Actor
    if actor == "DeterministicActor":
        actor_E = DeterministicActor(num_actions, **actor_kwargs)
    elif actor == "SoftmaxActor":
        actor_E = SoftmaxActor(num_actions,
                               **actor_kwargs,
                               seed_value=actor_seed)
    elif actor == "RandomActor":
        actor_E = RandomActor(num_actions,
                              **actor_kwargs,
                              seed_value=actor_seed)
    elif actor == "ThresholdActor":
        actor_E = ThresholdActor(num_actions,
                                 **actor_kwargs,
                                 seed_value=actor_seed)
    else:
        raise ValueError("actor was not a valid choice")

    # Memory
    memories = [
        DiscreteDistribution(initial_bins=initial_bins,
                             initial_count=initial_count)
        for _ in range(num_actions)
    ]

    # --- Init log ---
    # num_stop is initialized here so `num_stop + 1` in the final result
    # is well-defined even if the actor halts on the first episode.
    num_best = 0
    num_stop = -1
    total_E = 0.0
    total_regret = 0.0

    # --- Main loop ---
    for n in range(num_episodes):
        # Each ep resets the env
        env.reset()

        # Choose a bandit arm
        values = list(critic_E.model.values())
        action = actor_E(values)
        if action is None:
            break
        regret = estimate_regret(all_actions, action, critic_E)

        # Pull a lever.
        state, reward, _, _ = env.step(action)

        # Estimate E, save regret
        old = deepcopy(memories[action])
        memories[action].update((int(state), int(reward)))
        new = deepcopy(memories[action])
        E_t = kl(new, old, critic_E.inital_values[action])

        # --- Learn ---
        critic_E = updateE(action, E_t, critic_E, lr=lr_E)

        # --- Log data ---
        num_stop = n
        writer.add_scalar("state", int(state), n)
        writer.add_scalar("regret", regret, n)
        writer.add_scalar("score_E", E_t, n)
        writer.add_scalar("value_E", critic_E(action), n)

        total_E += E_t
        total_regret += regret
        writer.add_scalar("total_regret", total_regret, n)
        writer.add_scalar("total_E", total_E, n)

        if action in best_action:
            num_best += 1
        writer.add_scalar("p_bests", num_best / (n + 1), n)
        writer.add_scalar("action", action, n)
        # Log whether the actor had to break a tie this episode
        writer.add_scalar("ties", int(actor_E.tied), n)

    # --- Build the final result, and save or return it ---
    writer.close()

    result = dict(best=best_action,
                  critic_E=critic_E.state_dict(),
                  initial_E=list(critic_E.inital_values.values()),
                  total_E=total_E,
                  total_regret=total_regret,
                  env_name=env_name,
                  num_episodes=num_episodes,
                  lr_E=lr_E,
                  master_seed=master_seed,
                  actor_seed=actor_seed,
                  critic_seed=critic_seed,
                  env_seed=env_seed,
                  initial_bins=initial_bins,
                  initial_count=initial_count,
                  actor=actor,
                  memories=[m.state_dict() for m in memories],
                  actor_kwargs=actor_kwargs,
                  num_stop=num_stop + 1)

    if write_to_disk:
        save_checkpoint(result,
                        filename=os.path.join(writer.log_dir, "result.pkl"))

    return result
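Example #5 scores information value with `kl`, comparing an arm's memory before and after each observation. A sketch of that computation follows; it assumes `DiscreteDistribution` exposes observed outcomes via `keys()` and probabilities via its call operator, and that the divergence falls back to a default value when undefined (all assumptions):

import numpy as np

def kl(new, old, default):
    """Hypothetical sketch: D_KL(new || old) over discrete outcomes,
    returning `default` when a zero probability makes it undefined."""
    divergence = 0.0
    for outcome in new.keys():
        p_new, p_old = new(outcome), old(outcome)
        if p_new <= 0 or p_old <= 0:
            return default
        divergence += p_new * np.log(p_new / p_old)
    return divergence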
Example #6: argmax(R + beta E) bandit agent
def run(env_name='BanditOneHigh2-v0',
        num_episodes=1,
        tie_break='next',
        tie_threshold=0.0,
        beta=1.0,
        lr_R=.1,
        master_seed=42,
        log_dir=None,
        write_to_disk=True):
    """Bandit agent - argmax(R + beta E)"""

    # --- Init ---
    writer = SummaryWriter(log_dir=log_dir, write_to_disk=write_to_disk)

    # -
    env = gym.make(env_name)
    env.seed(master_seed)
    num_actions = env.action_space.n
    best_action = env.best

    # -
    default_reward_value = 0  # Null R
    default_info_value = entropy(np.ones(num_actions) /
                                 num_actions)  # Uniform p(a)
    E_t = default_info_value
    R_t = default_reward_value

    # Agents and memories
    critic = Critic(num_actions,
                    default_value=default_reward_value +
                    (beta * default_info_value))
    actor = Actor(num_actions,
                  tie_break=tie_break,
                  tie_threshold=tie_threshold)

    memories = [DiscreteDistribution() for _ in range(num_actions)]
    all_actions = list(range(num_actions))

    # -
    total_R = 0.0
    total_E = 0.0
    total_regret = 0.0
    num_best = 0

    # ------------------------------------------------------------------------
    for n in range(num_episodes):
        env.reset()

        # Choose an action, i.e. pick a bandit arm
        action = actor(list(critic.model.values()))
        if action in best_action:
            num_best += 1

        # Est. regret and save it
        regret = estimate_regret(all_actions, action, critic)

        # Pull a lever.
        state, R_t, _, _ = env.step(action)

        # Estimate E
        old = deepcopy(memories[action])
        memories[action].update((int(state), int(R_t)))
        new = deepcopy(memories[action])
        E_t = kl(new, old, default_info_value)

        # Critic learns
        critic = Q_update(action, R_t + (beta * E_t), critic, lr_R)

        # Log data
        writer.add_scalar("state", int(state), n)
        writer.add_scalar("action", action, n)
        writer.add_scalar("regret", regret, n)
        writer.add_scalar("score_E", E_t, n)
        writer.add_scalar("score_R", R_t, n)
        writer.add_scalar("value_ER", critic(action), n)

        total_E += E_t
        total_R += R_t
        total_regret += regret
        writer.add_scalar("total_regret", total_regret, n)
        writer.add_scalar("total_E", total_E, n)
        writer.add_scalar("total_R", total_R, n)
        writer.add_scalar("p_bests", num_best / (n + 1), n)

        # Log whether the actor had to break a tie this episode
        writer.add_scalar("ties", int(actor.tied), n)

    # --- Build the final result, and save or return it ---
    writer.close()

    result = dict(best=env.best,
                  beta=beta,
                  env_name=env_name,
                  num_episodes=num_episodes,
                  tie_break=tie_break,
                  tie_threshold=tie_threshold,
                  critic=critic.state_dict(),
                  memories=[m.state_dict() for m in memories],
                  total_E=total_E,
                  total_R=total_R,
                  total_regret=total_regret,
                  master_seed=master_seed)

    if write_to_disk:
        save_checkpoint(result,
                        filename=os.path.join(writer.log_dir, "result.pkl"))

    return result
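Every example logs `estimate_regret`, which is also defined elsewhere. A minimal sketch consistent with how it is called here: regret as the gap between the critic's best current estimate and the chosen action's estimate. Note this uses the critic's own values, not the true payout probabilities:

def estimate_regret(all_actions, action, critic):
    """Hypothetical sketch: per-step regret under the critic's
    current value estimates."""
    best_value = max(critic(a) for a in all_actions)
    return best_value - critic(action)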
Example #7: random bandit agent
def run(env_name='BanditOneHot2-v0',
        num_episodes=1,
        lr_R=.1,
        master_seed=42,
        write_to_disk=True,
        log_dir=None):
    """Bandit agent - random"""

    # --- Init ---
    writer = SummaryWriter(log_dir=log_dir, write_to_disk=write_to_disk)

    # -
    env = gym.make(env_name)
    env.seed(master_seed)
    num_actions = env.action_space.n
    best_action = env.best

    # -
    default_reward_value = 0  # Null R
    R_t = default_reward_value
    critic = Critic(num_actions, default_value=default_reward_value)
    actor = RandomActor(num_actions, seed_value=master_seed)
    all_actions = list(range(num_actions))

    # ------------------------------------------------------------------------
    num_best = 0
    total_R = 0.0
    total_regret = 0.0

    for n in range(num_episodes):
        env.reset()

        # Choose an action, i.e. pick a bandit arm
        action = actor(list(critic.model.values()))
        if action in best_action:
            num_best += 1

        # Est. regret and save it
        regret = estimate_regret(all_actions, action, critic)

        # Pull a lever.
        state, R_t, _, _ = env.step(action)

        # Critic learns
        critic = Q_update(action, R_t, critic, lr_R)

        # Log data
        writer.add_scalar("state", int(state), n)
        writer.add_scalar("action", action, n)
        writer.add_scalar("regret", regret, n)
        writer.add_scalar("score_R", R_t, n)
        writer.add_scalar("value_R", critic(action), n)

        total_R += R_t
        total_regret += regret
        writer.add_scalar("total_regret", total_regret, n)
        writer.add_scalar("total_R", total_R, n)
        writer.add_scalar("p_bests", num_best / (n + 1), n)

    # --- Build the final result, and save or return it ---
    writer.close()

    result = dict(best=env.best,
                  env_name=env_name,
                  num_episodes=num_episodes,
                  lr_R=lr_R,
                  critic=critic.state_dict(),
                  total_R=total_R,
                  total_regret=total_regret,
                  master_seed=master_seed)

    if write_to_disk:
        save_checkpoint(result,
                        filename=os.path.join(writer.log_dir, "result.pkl"))

    return result
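All seven entry points share the same calling convention. A usage sketch for this random baseline, assuming the `BanditOneHot*` environments are registered with gym by the surrounding package:

# Run 100 episodes in memory only, then inspect the summary.
result = run(env_name='BanditOneHot2-v0',
             num_episodes=100,
             master_seed=7,
             write_to_disk=False)
print(result['total_R'], result['total_regret'], result['best'])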