def run(env_name='BanditOneHot2-v0',
        num_episodes=1,
        epsilon=0.1,
        epsilon_decay_tau=0,
        lr_R=.1,
        master_seed=42,
        write_to_disk=True,
        load=None,
        log_dir=None):
    """Bandit agent - epsilon-greedy"""

    # --- Init ---
    writer = SummaryWriter(log_dir=log_dir, write_to_disk=write_to_disk)

    # -
    env = gym.make(env_name)
    env.seed(master_seed)
    num_actions = env.action_space.n
    best_action = env.best

    # -
    default_reward_value = 0.0
    R_t = default_reward_value
    critic = Critic(num_actions, default_value=default_reward_value)
    actor = EpsilonActor(num_actions,
                         epsilon=epsilon,
                         decay_tau=epsilon_decay_tau,
                         seed_value=master_seed)
    all_actions = list(range(num_actions))

    # Update with pre-loaded data. This lets you run test
    # experiments on a pre-trained model and/or continue training.
    if load is not None:
        result = load_checkpoint(load)
        critic.load_state_dict(result['critic'])

    # -
    num_best = 0
    total_R = 0.0
    total_regret = 0.0

    # ------------------------------------------------------------------------
    for n in range(num_episodes):
        env.reset()

        # Choose an action; Choose a bandit
        action = actor(list(critic.model.values()))
        if action in best_action:
            num_best += 1

        # Est. regret and save it
        regret = estimate_regret(all_actions, action, critic)

        # Pull a lever.
        state, R_t, _, _ = env.step(action)

        # Critic learns
        critic = Q_update(action, R_t, critic, lr_R)

        # Log data
        writer.add_scalar("state", int(state), n)
        writer.add_scalar("action", action, n)
        writer.add_scalar("epsilon", actor.epsilon, n)
        writer.add_scalar("regret", regret, n)
        writer.add_scalar("score_R", R_t, n)
        writer.add_scalar("value_R", critic(action), n)

        total_R += R_t
        total_regret += regret
        writer.add_scalar("total_regret", total_regret, n)
        writer.add_scalar("total_R", total_R, n)
        writer.add_scalar("p_bests", num_best / (n + 1), n)

        # Decay epsilon?
        if epsilon_decay_tau > 0:
            actor.decay_epsilon()

    # --- Build the final result, and save or return it ---
    writer.close()

    result = dict(best=env.best,
                  env_name=env_name,
                  num_episodes=num_episodes,
                  lr_R=lr_R,
                  epsilon=epsilon,
                  epsilon_decay_tau=epsilon_decay_tau,
                  critic=critic.state_dict(),
                  total_R=total_R,
                  total_regret=total_regret,
                  master_seed=master_seed)

    if write_to_disk:
        save_checkpoint(result,
                        filename=os.path.join(writer.log_dir, "result.pkl"))

    return result
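# EpsilonActor is a repo-local class whose implementation is not shown here.
# As a rough, standalone sketch of the rule it is assumed to follow (not the
# repo's actual code): with probability epsilon pick a uniformly random arm,
# otherwise pick the arm with the highest estimated value.
import numpy as np


def epsilon_greedy_sketch(values, epsilon, rng=None):
    """Return argmax(values) with prob. 1 - epsilon, else a uniform random arm."""
    rng = np.random.default_rng() if rng is None else rng
    if rng.random() < epsilon:
        return int(rng.integers(len(values)))  # explore
    return int(np.argmax(values))              # exploit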
def run(env_name='BanditOneHot10-v0',
        num_episodes=1000,
        tie_break='next',
        tie_threshold=0.0,
        lr_R=.1,
        master_seed=42,
        initial_bins=None,
        write_to_disk=True,
        load=None,
        log_dir=None):
    """Bandit agent - argmax (E, R)"""

    # --- Init ---
    writer = SummaryWriter(log_dir=log_dir, write_to_disk=write_to_disk)

    # -
    env = gym.make(env_name)
    env.seed(master_seed)
    env.reset()
    num_actions = env.action_space.n
    all_actions = list(range(num_actions))
    best_action = env.best
    default_reward_value = 0
    default_info_value = entropy(np.ones(num_actions) / num_actions)
    E_t = default_info_value
    R_t = default_reward_value

    # - Init agents and memories
    critic_R = Critic(num_actions, default_value=default_reward_value)
    critic_E = Critic(num_actions, default_value=default_info_value)
    actor_R = DeterministicActor(num_actions,
                                 tie_break='first',
                                 tie_threshold=tie_threshold)
    actor_E = DeterministicActor(num_actions,
                                 tie_break=tie_break,
                                 tie_threshold=tie_threshold)
    memories = [
        DiscreteDistribution(initial_bins=initial_bins)
        for _ in range(num_actions)
    ]

    # Update with pre-loaded data. This lets you run test
    # experiments on a pre-trained model and/or continue training.
    if load is not None:
        result = load_checkpoint(load)
        critic_E.load_state_dict(result['critic_E'])
        critic_R.load_state_dict(result['critic_R'])
        for i, mem in enumerate(memories):
            mem.load_state_dict(result['memories'][i])

    # -
    num_best = 0
    total_R = 0.0
    total_E = 0.0
    total_regret = 0.0

    # ------------------------------------------------------------------------
    for n in range(num_episodes):
        env.reset()

        # Meta-greedy policy selection
        if (E_t - tie_threshold) > R_t:
            critic = critic_E
            actor = actor_E
            policy = 0
        else:
            critic = critic_R
            actor = actor_R
            policy = 1

        # Choose an action; Choose a bandit
        action = actor(list(critic.model.values()))
        if action in best_action:
            num_best += 1

        # Est. regret and save it
        regret = estimate_regret(all_actions, action, critic)

        # Pull a lever.
        state, R_t, _, _ = env.step(action)
        R_t = R_homeostasis(R_t, total_R, num_episodes)

        # Estimate E
        old = deepcopy(memories[action])
        memories[action].update((int(state), int(R_t)))
        new = deepcopy(memories[action])
        E_t = kl(new, old, default_info_value)

        # Learning, both policies.
        critic_R = R_update(action, R_t, critic_R, lr_R)
        critic_E = E_update(action, E_t, critic_E, lr=1)

        # Log data
        writer.add_scalar("policy", policy, n)
        writer.add_scalar("state", state, n)
        writer.add_scalar("action", action, n)
        writer.add_scalar("regret", regret, n)
        writer.add_scalar("score_E", E_t, n)
        writer.add_scalar("score_R", R_t, n)
        writer.add_scalar("value_E", critic_E(action), n)
        writer.add_scalar("value_R", critic_R(action), n)

        total_E += E_t
        total_R += R_t
        total_regret += regret
        writer.add_scalar("total_regret", total_regret, n)
        writer.add_scalar("total_E", total_E, n)
        writer.add_scalar("total_R", total_R, n)
        writer.add_scalar("p_bests", num_best / (n + 1), n)

        tie = 0
        if actor.tied:
            tie = 1
        writer.add_scalar("ties", tie, n)

    # --- Build the final result, and save or return it ---
    writer.close()

    result = dict(best=env.best,
                  num_episodes=num_episodes,
                  tie_break=tie_break,
                  tie_threshold=tie_threshold,
                  critic_E=critic_E.state_dict(),
                  critic_R=critic_R.state_dict(),
                  memories=[m.state_dict() for m in memories],
                  total_E=total_E,
                  total_R=total_R,
                  total_regret=total_regret,
                  env_name=env_name,
                  lr_R=lr_R,
                  master_seed=master_seed)

    if write_to_disk:
        save_checkpoint(result,
                        filename=os.path.join(writer.log_dir, "result.pkl"))

    return result
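# The information value E_t above is the divergence between an arm's memory
# just before and just after the new (state, reward) observation. The repo's
# DiscreteDistribution and kl() are not shown; as a minimal sketch of that
# idea (an assumption about their behavior, not the actual implementation),
# E_t can be computed as a KL divergence between normalized count vectors:
import numpy as np
from scipy.stats import entropy


def info_value_sketch(counts_before, counts_after):
    """KL(new || old) between normalized counts over the same outcomes."""
    old = np.asarray(counts_before, dtype=float)
    new = np.asarray(counts_after, dtype=float)
    old = old / old.sum()
    new = new / new.sum()
    return entropy(new, old)  # scipy's entropy(p, q) computes KL(p || q)

# e.g. one new observation added to an arm's two-outcome memory:
# info_value_sketch([1, 1], [2, 1]) -> ~0.06 nats of "surprise"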
def run(env_name='BanditOneHot10-v0',
        num_episodes=1000,
        temp=1.0,
        tie_threshold=0.0,
        tie_break=None,
        lr_R=.1,
        master_seed=42,
        write_to_disk=True,
        log_dir=None):
    """Bandit agent - softmax (E, R)"""

    # --- Init ---
    writer = SummaryWriter(log_dir=log_dir, write_to_disk=write_to_disk)

    # -
    env = gym.make(env_name)
    env.seed(master_seed)
    num_actions = env.action_space.n
    all_actions = list(range(num_actions))
    best_action = env.best
    default_reward_value = 0
    default_info_value = entropy(np.ones(num_actions) / num_actions)
    E_t = default_info_value
    R_t = default_reward_value

    # --- Agents and memories ---
    critic_R = Critic(num_actions, default_value=default_reward_value)
    critic_E = Critic(num_actions, default_value=default_info_value)
    actor_R = SoftmaxActor(num_actions, temp=temp, seed_value=master_seed)
    actor_E = SoftmaxActor(num_actions, temp=temp, seed_value=master_seed)
    memories = [DiscreteDistribution() for _ in range(num_actions)]

    # -
    num_best = 0
    total_R = 0.0
    total_E = 0.0
    total_regret = 0.0

    # ------------------------------------------------------------------------
    for n in range(num_episodes):
        env.reset()

        # Meta-greedy policy selection
        if (E_t - tie_threshold) > R_t:
            critic = critic_E
            actor = actor_E
            policy = 0
        else:
            critic = critic_R
            actor = actor_R
            policy = 1

        # Choose an action; Choose a bandit
        action = actor(list(critic.model.values()))
        if action in best_action:
            num_best += 1

        # Est. regret and save it
        regret = estimate_regret(all_actions, action, critic)

        # Pull a lever.
        state, R_t, _, _ = env.step(action)
        R_t = R_homeostasis(R_t, total_R, num_episodes)

        # Estimate E
        old = deepcopy(memories[action])
        memories[action].update((int(state), int(R_t)))
        new = deepcopy(memories[action])
        E_t = kl(new, old, default_info_value)

        # Learning, both policies.
        critic_R = R_update(action, R_t, critic_R, lr_R)
        critic_E = E_update(action, E_t, critic_E, lr=1)

        # Log data
        writer.add_scalar("policy", policy, n)
        writer.add_scalar("state", int(state), n)
        writer.add_scalar("action", action, n)
        writer.add_scalar("regret", regret, n)
        writer.add_scalar("score_E", E_t, n)
        writer.add_scalar("score_R", R_t, n)
        writer.add_scalar("value_E", critic_E(action), n)
        writer.add_scalar("value_R", critic_R(action), n)

        total_E += E_t
        total_R += R_t
        total_regret += regret
        writer.add_scalar("total_regret", total_regret, n)
        writer.add_scalar("total_E", total_E, n)
        writer.add_scalar("total_R", total_R, n)
        writer.add_scalar("p_bests", num_best / (n + 1), n)

    # --- Build the final result, and save or return it ---
    writer.close()

    result = dict(best=env.best,
                  num_episodes=num_episodes,
                  temp=temp,
                  tie_threshold=tie_threshold,
                  critic_E=critic_E.state_dict(),
                  critic_R=critic_R.state_dict(),
                  total_E=total_E,
                  total_R=total_R,
                  total_regret=total_regret,
                  env_name=env_name,
                  lr_R=lr_R,
                  master_seed=master_seed)

    if write_to_disk:
        save_checkpoint(result,
                        filename=os.path.join(writer.log_dir, "result.pkl"))

    return result
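# SoftmaxActor is repo-local; a standalone sketch of the temperature-controlled
# (Boltzmann) sampling it is assumed to perform, where a lower temp concentrates
# probability on the highest-valued arm:
import numpy as np


def softmax_sample_sketch(values, temp=1.0, rng=None):
    """Sample an arm with probability proportional to exp(value / temp)."""
    rng = np.random.default_rng() if rng is None else rng
    z = np.asarray(values, dtype=float) / temp
    z = z - z.max()                    # subtract max for numerical stability
    p = np.exp(z) / np.exp(z).sum()
    return int(rng.choice(len(p), p=p))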
def run(env_name='BanditOneHigh2-v0',
        num_episodes=1,
        temp=1.0,
        beta=1.0,
        lr_R=.1,
        mode='EB',
        master_seed=42,
        write_to_disk=True,
        load=None,
        log_dir=None):
    """Bandit agent - sample(R + beta * count bonus), with an EB or UCB bonus"""

    # --- Init ---
    writer = SummaryWriter(log_dir=log_dir, write_to_disk=write_to_disk)

    # -
    env = gym.make(env_name)
    env.seed(master_seed)
    num_actions = env.action_space.n
    all_actions = list(range(num_actions))
    best_action = env.best
    default_reward_value = 0  # Null R
    R_t = default_reward_value

    # Agents and memories
    critic = Critic(num_actions, default_value=default_reward_value)
    actor = SoftmaxActor(num_actions, temp=temp, seed_value=master_seed)
    count = CountMemory()

    # Update with pre-loaded data. This lets you run test
    # experiments on a pre-trained model and/or continue training.
    if load is not None:
        result = load_checkpoint(load)
        critic.load_state_dict(result['critic'])
        count.load_state_dict(result['count'])

    # -
    total_R = 0.0
    total_regret = 0.0
    num_best = 0

    # ------------------------------------------------------------------------
    for n in range(num_episodes):
        env.reset()

        # Choose an action; Choose a bandit
        action = actor(list(critic.model.values()))
        if action in best_action:
            num_best += 1

        # Est. regret and save it
        regret = estimate_regret(all_actions, action, critic)

        # Pull a lever.
        state, R_t, _, _ = env.step(action)

        # Apply count bonus
        if mode == "EB":
            count_bonus = count(action)**(-0.5)
        elif mode == "UCB":
            count_bonus = ((2 * np.log(n + 1)) / count(action))**(0.5)
        else:
            raise ValueError("mode must be EB or UCB")

        # Critic learns
        payout = R_t + (beta * count_bonus)
        critic = Q_update(action, payout, critic, lr_R)

        # Log data
        writer.add_scalar("state", int(state), n)
        writer.add_scalar("action", action, n)
        writer.add_scalar("regret", regret, n)
        writer.add_scalar("bonus", count_bonus, n)
        writer.add_scalar("score_R", R_t, n)
        writer.add_scalar("value_R", critic(action), n)

        total_R += R_t
        total_regret += regret
        writer.add_scalar("total_regret", total_regret, n)
        writer.add_scalar("total_R", total_R, n)
        writer.add_scalar("p_bests", num_best / (n + 1), n)

    # --- Build the final result, and save or return it ---
    writer.close()

    result = dict(best=env.best,
                  beta=beta,
                  temp=temp,
                  mode=mode,
                  env_name=env_name,
                  num_episodes=num_episodes,
                  critic=critic.state_dict(),
                  count=count.state_dict(),
                  total_R=total_R,
                  total_regret=total_regret,
                  master_seed=master_seed)

    if write_to_disk:
        save_checkpoint(result,
                        filename=os.path.join(writer.log_dir, "result.pkl"))

    return result
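# The two exploration bonuses used above, pulled out as a standalone helper
# that mirrors the branch inside run(). Here `count` is how many times the
# arm has been tried and `n` is the current episode index (assumptions about
# the repo-local CountMemory's bookkeeping).
import numpy as np


def count_bonus_sketch(count, n, mode="EB"):
    """EB: 1 / sqrt(count). UCB: sqrt(2 * ln(n + 1) / count)."""
    if mode == "EB":
        return count**(-0.5)
    if mode == "UCB":
        return ((2 * np.log(n + 1)) / count)**(0.5)
    raise ValueError("mode must be EB or UCB")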
def run(env_name='InfoBlueYellow4b-v0',
        num_episodes=1,
        lr_E=1.0,
        actor='DeterministicActor',
        initial_count=1,
        initial_bins=None,
        initial_noise=0.0,
        master_seed=None,
        env_seed=None,
        actor_seed=None,
        critic_seed=None,
        reward_mode=False,
        log_dir=None,
        write_to_disk=True,
        **actor_kwargs):
    """Bandit agent - curiosity (E only)"""

    # --- Init ---
    writer = SummaryWriter(log_dir=log_dir, write_to_disk=write_to_disk)

    # -
    if master_seed is not None:
        env_seed = master_seed
        critic_seed = master_seed
        actor_seed = master_seed

    # -
    env = gym.make(env_name)
    env.seed(env_seed)
    num_actions = env.action_space.n
    best_action = env.best
    default_info_value = entropy(np.ones(num_actions) / num_actions)
    E_t = default_info_value

    # --- Agents and memories ---
    # Critic
    all_actions = list(range(num_actions))
    critic_E = NoisyCritic(num_actions,
                           default_value=default_info_value,
                           default_noise_scale=initial_noise,
                           seed_value=critic_seed)

    # Actor
    if actor == "DeterministicActor":
        actor_E = DeterministicActor(num_actions, **actor_kwargs)
    elif actor == "SoftmaxActor":
        actor_E = SoftmaxActor(num_actions, **actor_kwargs, seed_value=actor_seed)
    elif actor == "RandomActor":
        actor_E = RandomActor(num_actions, **actor_kwargs, seed_value=actor_seed)
    elif actor == "ThresholdActor":
        actor_E = ThresholdActor(num_actions, **actor_kwargs, seed_value=actor_seed)
    else:
        raise ValueError("actor was not a valid choice")

    # Memory
    memories = [
        DiscreteDistribution(initial_bins=initial_bins,
                             initial_count=initial_count)
        for _ in range(num_actions)
    ]

    # --- Init log ---
    num_best = 0
    num_stop = 0  # in case the actor halts before any episode completes
    total_E = 0.0
    total_regret = 0.0

    # --- Main loop ---
    for n in range(num_episodes):
        # Each ep resets the env
        env.reset()

        # Choose a bandit arm
        values = list(critic_E.model.values())
        action = actor_E(values)
        if action is None:
            break

        regret = estimate_regret(all_actions, action, critic_E)

        # Pull a lever.
        state, reward, _, _ = env.step(action)

        # Estimate E, save regret
        old = deepcopy(memories[action])
        memories[action].update((int(state), int(reward)))
        new = deepcopy(memories[action])
        E_t = kl(new, old, critic_E.inital_values[action])

        # --- Learn ---
        critic_E = updateE(action, E_t, critic_E, lr=lr_E)

        # --- Log data ---
        num_stop = n
        writer.add_scalar("state", int(state), n)
        writer.add_scalar("regret", regret, n)
        writer.add_scalar("score_E", E_t, n)
        writer.add_scalar("value_E", critic_E(action), n)

        total_E += E_t
        total_regret += regret
        writer.add_scalar("total_regret", total_regret, n)
        writer.add_scalar("total_E", total_E, n)

        if action in best_action:
            num_best += 1
        writer.add_scalar("p_bests", num_best / (n + 1), n)
        writer.add_scalar("action", action, n)

        tie = 0
        if actor_E.tied:
            tie = 1
        writer.add_scalar("ties", tie, n)

    # --- Build the final result, and save or return it ---
    writer.close()

    result = dict(best=best_action,
                  critic_E=critic_E.state_dict(),
                  intial_E=list(critic_E.inital_values.values()),
                  total_E=total_E,
                  total_regret=total_regret,
                  env_name=env_name,
                  num_episodes=num_episodes,
                  lr_E=lr_E,
                  master_seed=master_seed,
                  actor_seed=actor_seed,
                  critic_seed=critic_seed,
                  env_seed=env_seed,
                  initial_bins=initial_bins,
                  initial_count=initial_count,
                  actor=actor,
                  memories=[m.state_dict() for m in memories],
                  actor_kwargs=actor_kwargs,
                  num_stop=num_stop + 1)

    if write_to_disk:
        save_checkpoint(result,
                        filename=os.path.join(writer.log_dir, "result.pkl"))

    return result
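# Hypothetical usage (an illustrative sketch, not from the repo): assumes the
# InfoBlueYellow environments are registered with gym and that extra keyword
# arguments such as temp are forwarded to the chosen actor via **actor_kwargs.
def demo_curiosity_run_sketch():
    """Example call to the curiosity-only agent; returns its result dict."""
    result = run(env_name='InfoBlueYellow4b-v0',
                 num_episodes=80,
                 actor='SoftmaxActor',
                 temp=0.5,  # forwarded to SoftmaxActor via **actor_kwargs
                 master_seed=3,
                 write_to_disk=False)
    print(result['total_E'], result['total_regret'], result['num_stop'])
    return result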
def run(env_name='BanditOneHigh2-v0',
        num_episodes=1,
        tie_break='next',
        tie_threshold=0.0,
        beta=1.0,
        lr_R=.1,
        master_seed=42,
        log_dir=None,
        write_to_disk=True):
    """Bandit agent - argmax(R + beta E)"""

    # --- Init ---
    writer = SummaryWriter(log_dir=log_dir, write_to_disk=write_to_disk)

    # -
    env = gym.make(env_name)
    env.seed(master_seed)
    num_actions = env.action_space.n
    best_action = env.best

    # -
    default_reward_value = 0  # Null R
    default_info_value = entropy(np.ones(num_actions) / num_actions)  # Uniform p(a)
    E_t = default_info_value
    R_t = default_reward_value

    # Agents and memories
    critic = Critic(num_actions,
                    default_value=default_reward_value +
                    (beta * default_info_value))
    actor = Actor(num_actions, tie_break=tie_break, tie_threshold=tie_threshold)
    memories = [DiscreteDistribution() for _ in range(num_actions)]
    all_actions = list(range(num_actions))

    # -
    total_R = 0.0
    total_E = 0.0
    total_regret = 0.0
    num_best = 0

    # ------------------------------------------------------------------------
    for n in range(num_episodes):
        env.reset()

        # Choose an action; Choose a bandit
        action = actor(list(critic.model.values()))
        if action in best_action:
            num_best += 1

        # Est. regret and save it
        regret = estimate_regret(all_actions, action, critic)

        # Pull a lever.
        state, R_t, _, _ = env.step(action)

        # Estimate E
        old = deepcopy(memories[action])
        memories[action].update((int(state), int(R_t)))
        new = deepcopy(memories[action])
        E_t = kl(new, old, default_info_value)

        # Critic learns
        critic = Q_update(action, R_t + (beta * E_t), critic, lr_R)

        # Log data
        writer.add_scalar("state", int(state), n)
        writer.add_scalar("action", action, n)
        writer.add_scalar("regret", regret, n)
        writer.add_scalar("score_E", E_t, n)
        writer.add_scalar("score_R", R_t, n)
        writer.add_scalar("value_ER", critic(action), n)

        total_E += E_t
        total_R += R_t
        total_regret += regret
        writer.add_scalar("total_regret", total_regret, n)
        writer.add_scalar("total_E", total_E, n)
        writer.add_scalar("total_R", total_R, n)
        writer.add_scalar("p_bests", num_best / (n + 1), n)

        tie = 0
        if actor.tied:
            tie = 1
        writer.add_scalar("ties", tie, n)

    # --- Build the final result, and save or return it ---
    writer.close()

    result = dict(best=env.best,
                  beta=beta,
                  env_name=env_name,
                  num_episodes=num_episodes,
                  tie_break=tie_break,
                  tie_threshold=tie_threshold,
                  critic=critic.state_dict(),
                  memories=[m.state_dict() for m in memories],
                  total_E=total_E,
                  total_R=total_R,
                  total_regret=total_regret,
                  master_seed=master_seed)

    if write_to_disk:
        save_checkpoint(result,
                        filename=os.path.join(writer.log_dir, "result.pkl"))

    return result
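# Q_update (and the R_update / E_update used by the other agents) is repo-local.
# A sketch of the tabular delta-rule step it is assumed to perform, applied
# here to the combined target R_t + beta * E_t (an assumption, not the repo's
# actual code):
def tabular_update_sketch(values, action, target, lr):
    """values[action] <- values[action] + lr * (target - values[action])"""
    values[action] += lr * (target - values[action])
    return values

# e.g. with R_t = 1.0, E_t = 0.05, beta = 1.0, lr_R = 0.1 and a current
# estimate of 0.5 for arm 0:
# tabular_update_sketch({0: 0.5, 1: 0.5}, 0, 1.0 + 1.0 * 0.05, 0.1)
# -> arm 0 moves toward the target: {0: 0.555, 1: 0.5} (approximately)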
def run(env_name='BanditOneHot2-v0',
        num_episodes=1,
        lr_R=.1,
        master_seed=42,
        write_to_disk=True,
        log_dir=None):
    """Bandit agent - random"""

    # --- Init ---
    writer = SummaryWriter(log_dir=log_dir, write_to_disk=write_to_disk)

    # -
    env = gym.make(env_name)
    env.seed(master_seed)
    num_actions = env.action_space.n
    best_action = env.best

    # -
    default_reward_value = 0  # Null R
    R_t = default_reward_value
    critic = Critic(num_actions, default_value=default_reward_value)
    actor = RandomActor(num_actions, seed_value=master_seed)
    all_actions = list(range(num_actions))

    # ------------------------------------------------------------------------
    num_best = 0
    total_R = 0.0
    total_regret = 0.0
    for n in range(num_episodes):
        env.reset()

        # Choose an action; Choose a bandit
        action = actor(list(critic.model.values()))
        if action in best_action:
            num_best += 1

        # Est. regret and save it
        regret = estimate_regret(all_actions, action, critic)

        # Pull a lever.
        state, R_t, _, _ = env.step(action)

        # Critic learns
        critic = Q_update(action, R_t, critic, lr_R)

        # Log data
        writer.add_scalar("state", int(state), n)
        writer.add_scalar("action", action, n)
        writer.add_scalar("regret", regret, n)
        writer.add_scalar("score_R", R_t, n)
        writer.add_scalar("value_R", critic(action), n)

        total_R += R_t
        total_regret += regret
        writer.add_scalar("total_regret", total_regret, n)
        writer.add_scalar("total_R", total_R, n)
        writer.add_scalar("p_bests", num_best / (n + 1), n)

    # --- Build the final result, and save or return it ---
    writer.close()

    result = dict(best=env.best,
                  env_name=env_name,
                  num_episodes=num_episodes,
                  lr_R=lr_R,
                  critic=critic.state_dict(),
                  total_R=total_R,
                  total_regret=total_regret,
                  master_seed=master_seed)

    if write_to_disk:
        save_checkpoint(result,
                        filename=os.path.join(writer.log_dir, "result.pkl"))

    return result
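# Hypothetical usage of the random baseline (an illustrative sketch, assuming
# the bandit environments are registered with gym); handy as a reference point
# for total_R and total_regret when comparing the learning agents above.
def demo_random_baseline_sketch():
    """Example call to the random agent; returns its result dict."""
    result = run(env_name='BanditOneHot2-v0',
                 num_episodes=200,
                 master_seed=7,
                 write_to_disk=False)
    print(result['total_R'], result['total_regret'])
    return result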