from typing import List, Tuple

import numpy as np


def generate_episode_from_Q(env: BlackjackEnv, Q, epsilon, action_count) -> List[Tuple]:
    """Generates an episode by following an epsilon-greedy policy derived from Q.

    @param env: the Blackjack environment
    @param Q: action-value table mapping state -> array of per-action values
    @param epsilon: exploration rate of the epsilon-greedy policy
    @param action_count: number of available actions
    Returns a list of (state, action, reward) tuples.
    """
    episode = []
    # stores the initial state: (sum of player cards, open dealer card, has usable ace)
    state = env.reset()
    while True:
        if state in Q:
            # choose the action with the Q-table in mind
            action = np.random.choice(np.arange(action_count),
                                      p=get_probs(Q[state], epsilon, action_count))
        else:
            # if we have never visited this state before, just throw the dice
            action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        episode.append((state, action, reward))
        state = next_state
        if done:
            break
    return episode
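# `generate_episode_from_Q` relies on a `get_probs` helper that is not part of
# this listing. A minimal sketch, assuming it returns the epsilon-greedy action
# probabilities implied by the call site (uniform epsilon mass spread over all
# actions, with the remaining weight on the greedy action):
def get_probs(q_values, epsilon, action_count):
    # start from uniform exploration probability for every action
    probs = np.ones(action_count) * epsilon / action_count
    # put the remaining probability mass on the greedy action
    probs[np.argmax(q_values)] += 1.0 - epsilon
    return probs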
def gen_episode_data(policy: DeterministicPolicy,
                     env: BlackjackEnv) -> List[Tuple[State, Action, Reward]]:
    episode_history = []
    state = env.reset()
    done = False
    while not done:
        action = policy(state)
        next_state, reward, done, _ = env.step(action)
        episode_history.append((state, action, reward))
        state = next_state
    return episode_history
def gen_stochastic_episode(policy: Policy,
                           env: BlackjackEnv) -> List[Tuple[State, Action, Reward]]:
    episode_history = []
    state = env.reset()
    done = False
    while not done:
        # sample an action in proportion to the policy's action weights
        A: ActionValue = policy[state]
        action = np.random.choice([0, 1], p=A / sum(A))
        next_state, reward, done, _ = env.step(action)
        episode_history.append((state, action, reward))
        state = next_state
    return episode_history
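# The two episode generators above use type aliases (State, Action, Reward,
# Policy, DeterministicPolicy, ActionValue) whose definitions are not included
# in this listing. A minimal sketch of plausible definitions, inferred from how
# the names are used:
from typing import Callable, Dict, List, Tuple

import numpy as np

State = Tuple[int, int, bool]            # (player sum, dealer's open card, usable ace)
Action = int                             # 0 = stick, 1 = hit
Reward = float
ActionValue = np.ndarray                 # per-action values/weights for one state
Policy = Dict[State, ActionValue]        # stochastic: state -> action weights
DeterministicPolicy = Callable[[State], Action]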
def reset_env_with_s0(env: BlackjackEnv, s0: State) -> BlackjackEnv:
    env.reset()
    player_sum = s0[0]
    oppo_sum = s0[1]
    has_usable = s0[2]
    env.dealer[0] = oppo_sum
    if has_usable:
        # a usable ace counts as 11, so the second card makes up the rest
        env.player[0] = 1
        env.player[1] = player_sum - 11
    else:
        # without a usable ace, split the sum over two non-ace cards
        if player_sum > 11:
            env.player[0] = 10
            env.player[1] = player_sum - 10
        else:
            env.player[0] = 2
            env.player[1] = player_sum - 2
    return env
from collections import defaultdict
from itertools import product


def mc_control_exploring_starts_state(env: BlackjackEnv, s_0: State, num_episodes,
                                      discount_factor=1.0) -> Tuple[ActionValue, Policy]:
    states = list(product(range(10, 22), range(1, 11), (True, False)))
    policy = {s: np.ones(env.action_space.n) / env.action_space.n for s in states}
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    for episode_i in range(1, num_episodes + 1):
        # force the episode to begin in the chosen state s_0 (exploring start)
        reset_env_with_s0(env, s_0)
        episode_history = gen_custom_s0_stochastic_episode(policy, env, s_0)
        a = episode_history[0][1]
        # discounted return following the first action taken in s_0
        G = sum(r * discount_factor ** i for i, (_, _, r) in enumerate(episode_history))
        returns_sum[s_0, a] += G
        returns_count[s_0, a] += 1.0
        Q[s_0][a] = returns_sum[s_0, a] / returns_count[s_0, a]
        # greedy policy improvement for s_0
        best_a = np.argmax(Q[s_0])
        policy[s_0][best_a] = 1.0
        policy[s_0][1 - best_a] = 0.0
    return Q, policy
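# `gen_custom_s0_stochastic_episode` is called above but not defined in this
# listing. A minimal sketch, assuming it behaves like `gen_stochastic_episode`
# except that it starts from the already-prepared state s0 instead of calling
# env.reset() (which would discard the exploring start):
def gen_custom_s0_stochastic_episode(
        policy: Policy, env: BlackjackEnv,
        s0: State) -> List[Tuple[State, Action, Reward]]:
    episode_history = []
    state = s0
    done = False
    while not done:
        # sample an action in proportion to the policy's action weights
        A = policy[state]
        action = np.random.choice([0, 1], p=A / sum(A))
        next_state, reward, done, _ = env.step(action)
        episode_history.append((state, action, reward))
        state = next_state
    return episode_history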
def test_openai_gym(self):
    self.start_tests(name='openai-gym')

    # state: box, action: discrete
    self.unittest(environment=dict(environment='gym', level='CartPole-v0'), num_episodes=2)

    # state: discrete, action: box
    # self.unittest(environment=dict(environment='gym', level='GuessingGame'), num_episodes=2)

    # state: discrete, action: tuple(discrete)
    # from gym.envs.algorithmic import ReverseEnv
    # self.unittest(environment=ReverseEnv, num_episodes=2)

    # state: discrete, action: discrete
    from gym.envs.toy_text import FrozenLakeEnv
    self.unittest(environment=FrozenLakeEnv, num_episodes=2)

    # state: tuple, action: discrete
    from gym.envs.toy_text import BlackjackEnv
    self.unittest(environment=BlackjackEnv(), num_episodes=2)

    # Classic control
    self.unittest(environment='CartPole-v1', num_episodes=2)
    self.unittest(environment='MountainCar-v0', num_episodes=2)
    self.unittest(environment='MountainCarContinuous-v0', num_episodes=2)
    self.unittest(environment='Pendulum-v1', num_episodes=2)
    self.unittest(environment='Acrobot-v1', num_episodes=2)

    # Box2d
    self.unittest(environment='LunarLander-v2', num_episodes=2)
    self.unittest(environment='LunarLanderContinuous-v2', num_episodes=2)
    self.unittest(environment='BipedalWalker-v3', num_episodes=2)
    self.unittest(environment='BipedalWalkerHardcore-v3', num_episodes=2)
    # below: self.unittest(environment='CarRacing-v0', num_episodes=2)

    # Toy text
    # above: self.unittest(environment='Blackjack-v1', num_episodes=2)
    self.unittest(environment='FrozenLake-v1', num_episodes=2)
    self.unittest(environment='FrozenLake8x8-v1', num_episodes=2)
    self.unittest(environment='CliffWalking-v0', num_episodes=2)
    self.unittest(environment='Taxi-v3', num_episodes=2)

    # Unit test
    self.unittest(environment='CubeCrash-v0', num_episodes=2)
    self.unittest(environment='CubeCrashSparse-v0', num_episodes=2)
    self.unittest(environment='CubeCrashScreenBecomesBlack-v0', num_episodes=2)
    self.unittest(environment='MemorizeDigits-v0', num_episodes=2)
def test_openai_gym(self):
    self.start_tests(name='openai-gym')

    # state: box, action: discrete
    self.unittest(environment=dict(environment='gym', level='CartPole-v0'))

    # state: discrete, action: box
    self.unittest(environment=dict(
        environment='gym', level='GuessingGame', max_episode_steps=False))

    # state: discrete, action: tuple(discrete)
    from gym.envs.algorithmic import ReverseEnv
    self.unittest(environment=ReverseEnv)

    # state: tuple, action: discrete
    from gym.envs.toy_text import BlackjackEnv
    self.unittest(environment=BlackjackEnv())
    # Find all states that we've visited in this episode
    # We convert each state to a tuple so that we can use it as a dict key
    states_in_episode = set([tuple(x[0]) for x in episode])
    for state in states_in_episode:
        # Find the first occurrence of the state in the episode
        first_occurrence_idx = next(i for i, x in enumerate(episode) if x[0] == state)
        # Sum up all rewards since the first occurrence
        G = sum([
            x[2] * (discount_factor**i)
            for i, x in enumerate(episode[first_occurrence_idx:])
        ])
        # Calculate average return for this state over all sampled episodes
        returns_sum[state] += G
        returns_count[state] += 1.0
        V[state] = returns_sum[state] / returns_count[state]

    return V


if __name__ == "__main__":
    # matplotlib.style.use('ggplot')
    env = BlackjackEnv()
    V_100k = mc_prediction(sample_policy, env, num_episodes=100000)
    print(V_100k)
    plot_value_function(V_100k, title="100,000 Steps")
    # V_500k = mc_prediction(sample_policy, env, num_episodes=500000)
    # plot_value_function(V_500k, title="500,000 Steps")
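# `sample_policy` is referenced above but not included in this listing. A
# minimal sketch, assuming the usual fixed Blackjack evaluation policy
# (stick on a player sum of 20 or 21, otherwise hit):
def sample_policy(observation):
    score, dealer_score, usable_ace = observation
    return 0 if score >= 20 else 1  # 0 = stick, 1 = hit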
def test_openai_gym(self):
    self.start_tests(name='openai-gym')

    # state: box, action: discrete
    self.unittest(environment=dict(environment='gym', level='CartPole-v0'), num_episodes=2)

    # state: discrete, action: box
    self.unittest(environment=dict(environment='gym', level='GuessingGame'), num_episodes=2)

    # state: discrete, action: tuple(discrete)
    from gym.envs.algorithmic import ReverseEnv
    self.unittest(environment=ReverseEnv, num_episodes=2)

    # state: tuple, action: discrete
    from gym.envs.toy_text import BlackjackEnv
    self.unittest(environment=BlackjackEnv(), num_episodes=2)

    # Classic control
    # above: self.unittest(environment='CartPole-v1', num_episodes=2)
    self.unittest(environment='MountainCar-v0', num_episodes=2)
    self.unittest(environment='MountainCarContinuous-v0', num_episodes=2)
    self.unittest(environment='Pendulum-v0', num_episodes=2)
    self.unittest(environment='Acrobot-v1', num_episodes=2)

    # Box2d
    self.unittest(environment='LunarLander-v2', num_episodes=2)
    self.unittest(environment='LunarLanderContinuous-v2', num_episodes=2)
    self.unittest(environment='BipedalWalker-v3', num_episodes=2)
    self.unittest(environment='BipedalWalkerHardcore-v3', num_episodes=2)
    # below: self.unittest(environment='CarRacing-v0', num_episodes=2)

    # Toy text
    # above: self.unittest(environment='Blackjack-v0', num_episodes=2)
    self.unittest(environment='KellyCoinflip-v0', num_episodes=2)
    # TODO: out-of-bounds problems!
    # self.unittest(environment=dict(
    #     environment='KellyCoinflipGeneralized-v0', clip_distributions=True
    # ), num_episodes=2)
    self.unittest(environment='FrozenLake-v0', num_episodes=2)
    self.unittest(environment='FrozenLake8x8-v0', num_episodes=2)
    self.unittest(environment='CliffWalking-v0', num_episodes=2)
    self.unittest(environment='NChain-v0', num_episodes=2)
    self.unittest(environment='Roulette-v0', num_episodes=2)
    self.unittest(environment='Taxi-v3', num_episodes=2)
    # above: self.unittest(environment='GuessingGame-v0', num_episodes=2)
    self.unittest(environment='HotterColder-v0', num_episodes=2)

    # Algorithmic
    self.unittest(environment='Copy-v0', num_episodes=2)
    self.unittest(environment='RepeatCopy-v0', num_episodes=2)
    self.unittest(environment='ReversedAddition-v0', num_episodes=2)
    self.unittest(environment='ReversedAddition3-v0', num_episodes=2)
    self.unittest(environment='DuplicatedInput-v0', num_episodes=2)
    # above: self.unittest(environment='Reverse-v0', num_episodes=2)

    # Unit test
    self.unittest(environment='CubeCrash-v0', num_episodes=2)
    self.unittest(environment='CubeCrashSparse-v0', num_episodes=2)
    self.unittest(environment='CubeCrashScreenBecomesBlack-v0', num_episodes=2)
    self.unittest(environment='MemorizeDigits-v0', num_episodes=2)
def __init__(self):
    env = BlackjackEnv()
    super().__init__(env)
    # Blackjack observations are (player sum, dealer card, usable ace):
    # Discrete(32) x Discrete(11) x Discrete(2) = 704 flattened states
    self.observation_space = spaces.Discrete(704)
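# The wrapper above declares a flat Discrete(704) space, but the flattening
# itself is not shown. A minimal sketch, assuming a gym.ObservationWrapper
# subclass (the class name `FlatBlackjackWrapper` and the row-major indexing
# over (32, 11, 2) are illustrative assumptions, not the original code):
import gym
from gym import spaces
from gym.envs.toy_text import BlackjackEnv


class FlatBlackjackWrapper(gym.ObservationWrapper):
    def __init__(self):
        super().__init__(BlackjackEnv())
        self.observation_space = spaces.Discrete(704)

    def observation(self, obs):
        player_sum, dealer_card, usable_ace = obs
        # row-major index over (player sum: 32, dealer card: 11, usable ace: 2);
        # maximum index is (31 * 11 + 10) * 2 + 1 = 703
        return (player_sum * 11 + dealer_card) * 2 + int(usable_ace)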