Example #1
def generate_episode_from_Q(env: BlackjackEnv, Q, epsilon,
                            action_count) -> List[tuple]:
    """
    Generates an episode by following an epsilon-greedy policy derived from Q.
    @param env: the Blackjack environment
    @param Q: action-value table mapping state -> array of action values
    @param epsilon: exploration rate of the epsilon-greedy policy
    @param action_count: number of available actions
    Returns a list of (state, action, reward) tuples.
    """
    episode = []
    # initial state: (player card sum, dealer's face-up card, usable-ace flag)
    state = env.reset()
    while True:
        if state in Q:
            # choose the action epsilon-greedily from the Q table
            action = np.random.choice(np.arange(action_count),
                                      p=get_probs(Q[state], epsilon,
                                                  action_count))
        else:
            # if this state has never been visited, pick a random action
            action = env.action_space.sample()

        next_state, reward, done, info = env.step(action)
        episode.append((state, action, reward))
        state = next_state
        if done:
            break
    return episode
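
The get_probs helper used above is not shown in this example; a minimal epsilon-greedy sketch consistent with how it is called (Q-values for one state in, one probability per action out) might look like this:

def get_probs(q_values, epsilon, action_count):
    # Assumed helper: every action receives epsilon / action_count,
    # and the greedy action gets the remaining probability mass.
    probs = np.ones(action_count) * epsilon / action_count
    probs[np.argmax(q_values)] += 1.0 - epsilon
    return probs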
Example #2
def gen_episode_data(policy: DeterministicPolicy,
                     env: BlackjackEnv) -> List[Tuple[State, Action, Reward]]:
    episode_history = []
    state = env.reset()
    done = False
    while not done:
        action = policy(state)
        next_state, reward, done, _ = env.step(action)
        episode_history.append((state, action, reward))
        state = next_state
    return episode_history
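
A DeterministicPolicy here is simply a callable mapping a state to an action; a hypothetical example is the common "stick on 20 or 21" rule (0 = stick, 1 = hit in Gym's Blackjack):

def stick_on_20(state: State) -> Action:
    # Illustrative policy, not part of the original example.
    player_sum, dealer_card, usable_ace = state
    return 0 if player_sum >= 20 else 1

episode = gen_episode_data(stick_on_20, BlackjackEnv())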
Example #3
def gen_stochastic_episode(
        policy: Policy,
        env: BlackjackEnv) -> List[Tuple[State, Action, Reward]]:
    episode_history = []
    state = env.reset()
    done = False
    while not done:
        # sample an action in proportion to the policy's weights for this state
        A: ActionValue = policy[state]
        action = np.random.choice([0, 1], p=A / sum(A))
        next_state, reward, done, _ = env.step(action)
        episode_history.append((state, action, reward))
        state = next_state
    return episode_history
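
For instance, a uniform random policy can be supplied as a defaultdict of equal action weights (illustrative usage, not part of the original example):

uniform_policy: Policy = defaultdict(lambda: np.ones(2) / 2)
episode = gen_stochastic_episode(uniform_policy, BlackjackEnv())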
Example #4
def reset_env_with_s0(env: BlackjackEnv, s0: State) -> BlackjackEnv:
    env.reset()
    player_sum = s0[0]
    oppo_sum = s0[1]
    has_usable = s0[2]

    # overwrite the dealt cards so the hands match the requested starting state
    env.dealer[0] = oppo_sum
    if has_usable:
        # an ace counts as 11, so pair it with a card worth player_sum - 11
        env.player[0] = 1
        env.player[1] = player_sum - 11
    else:
        # no usable ace: split the sum across two non-ace cards
        if player_sum > 11:
            env.player[0] = 10
            env.player[1] = player_sum - 10
        else:
            env.player[0] = 2
            env.player[1] = player_sum - 2
    return env
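
For example, to force an episode to start with a player sum of 15 against a dealer 10 and no usable ace (illustrative usage only):

env = reset_env_with_s0(BlackjackEnv(), (15, 10, False))
# the player's two cards now sum to 15 and the dealer shows a 10
print(env.player, env.dealer[0])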
Example #5
def mc_control_exploring_starts_state(env: BlackjackEnv, s_0: State, num_episodes, discount_factor=1.0) \
        -> Tuple[ActionValue, Policy]:
    states = list(product(range(10, 22), range(1, 11), (True, False)))
    policy = {
        s: np.ones(env.action_space.n) * 1.0 / env.action_space.n
        for s in states
    }
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    for episode_i in range(1, num_episodes + 1):
        player_sum = s_0[0]
        oppo_sum = s_0[1]
        has_usable = s_0[2]

        # force the exploring start: rebuild the hands so the episode begins in s_0
        env.reset()
        env.dealer[0] = oppo_sum
        if has_usable:
            env.player[0] = 1
            env.player[1] = player_sum - 11
        else:
            if player_sum > 11:
                env.player[0] = 10
                env.player[1] = player_sum - 10
            else:
                env.player[0] = 2
                env.player[1] = player_sum - 2

        episode_history = gen_custom_s0_stochastic_episode(policy, env, s_0)

        # first-visit Monte Carlo update for the forced start pair (s_0, a)
        a = episode_history[0][1]
        G = 0.0
        for t, s_a_r in enumerate(episode_history):
            G += (discount_factor ** t) * s_a_r[2]
        returns_sum[s_0, a] += G
        returns_count[s_0, a] += 1.0
        Q[s_0][a] = returns_sum[s_0, a] / returns_count[s_0, a]
        # greedy policy improvement for the starting state
        best_a = np.argmax(Q[s_0])
        policy[s_0][best_a] = 1.0
        policy[s_0][1 - best_a] = 0.0

    return Q, policy
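
gen_custom_s0_stochastic_episode is referenced above but not defined in this listing. A plausible sketch, assuming the environment has already been forced into s_0 and the rollout simply follows the stochastic policy as in Example #3:

def gen_custom_s0_stochastic_episode(
        policy: Policy, env: BlackjackEnv,
        s_0: State) -> List[Tuple[State, Action, Reward]]:
    # Assumed helper: env is already in s_0, so start the rollout from there.
    episode_history = []
    state = s_0
    done = False
    while not done:
        A = policy[state]
        action = np.random.choice([0, 1], p=A / sum(A))
        next_state, reward, done, _ = env.step(action)
        episode_history.append((state, action, reward))
        state = next_state
    return episode_history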
Example #6
    def test_openai_gym(self):
        self.start_tests(name='openai-gym')

        # state: box, action: discrete
        self.unittest(environment=dict(environment='gym', level='CartPole-v0'),
                      num_episodes=2)

        # state: discrete, action: box
        # self.unittest(environment=dict(environment='gym', level='GuessingGame'), num_episodes=2)

        # state: discrete, action: tuple(discrete)
        # from gym.envs.algorithmic import ReverseEnv
        # self.unittest(environment=ReverseEnv, num_episodes=2)

        # state: discrete, action: discrete
        from gym.envs.toy_text import FrozenLakeEnv
        self.unittest(environment=FrozenLakeEnv, num_episodes=2)

        # state: tuple, action: discrete
        from gym.envs.toy_text import BlackjackEnv
        self.unittest(environment=BlackjackEnv(), num_episodes=2)

        # Classic control
        self.unittest(environment='CartPole-v1', num_episodes=2)
        self.unittest(environment='MountainCar-v0', num_episodes=2)
        self.unittest(environment='MountainCarContinuous-v0', num_episodes=2)
        self.unittest(environment='Pendulum-v1', num_episodes=2)
        self.unittest(environment='Acrobot-v1', num_episodes=2)

        # Box2d
        self.unittest(environment='LunarLander-v2', num_episodes=2)
        self.unittest(environment='LunarLanderContinuous-v2', num_episodes=2)
        self.unittest(environment='BipedalWalker-v3', num_episodes=2)
        self.unittest(environment='BipedalWalkerHardcore-v3', num_episodes=2)
        # below: self.unittest(environment='CarRacing-v0', num_episodes=2)

        # Toy text
        # above: self.unittest(environment='Blackjack-v1', num_episodes=2)
        self.unittest(environment='FrozenLake-v1', num_episodes=2)
        self.unittest(environment='FrozenLake8x8-v1', num_episodes=2)
        self.unittest(environment='CliffWalking-v0', num_episodes=2)
        self.unittest(environment='Taxi-v3', num_episodes=2)

        # Unit test
        self.unittest(environment='CubeCrash-v0', num_episodes=2)
        self.unittest(environment='CubeCrashSparse-v0', num_episodes=2)
        self.unittest(environment='CubeCrashScreenBecomesBlack-v0',
                      num_episodes=2)
        self.unittest(environment='MemorizeDigits-v0', num_episodes=2)
Example #7
    def test_openai_gym(self):
        self.start_tests(name='openai-gym')

        # state: box, action: discrete
        self.unittest(environment=dict(environment='gym', level='CartPole-v0'))

        # state: discrete, action: box
        self.unittest(environment=dict(
            environment='gym', level='GuessingGame', max_episode_steps=False))

        # state: discrete, action: tuple(discrete)
        from gym.envs.algorithmic import ReverseEnv
        self.unittest(environment=ReverseEnv)

        # state: tuple, action: discrete
        from gym.envs.toy_text import BlackjackEnv
        self.unittest(environment=BlackjackEnv())
Example #8
        # Find all states that we've visited in this episode
        # We convert each state to a tuple so that we can use it as a dict key
        states_in_episode = set([tuple(x[0]) for x in episode])
        for state in states_in_episode:
            # Find the first occurrence of the state in the episode
            first_occurrence_idx = next(i for i, x in enumerate(episode)
                                        if x[0] == state)
            # Sum up all rewards since the first occurrence
            G = sum([
                x[2] * (discount_factor**i)
                for i, x in enumerate(episode[first_occurrence_idx:])
            ])
            # Calculate average return for this state over all sampled episodes
            returns_sum[state] += G
            returns_count[state] += 1.0
            V[state] = returns_sum[state] / returns_count[state]

    return V


if __name__ == "__main__":
    # matplotlib.style.use('ggplot')

    env = BlackjackEnv()
    V_10k = mc_prediction(sample_policy, env, num_episodes=10000)
    print(V_10k)
    plot_value_function(V_10k, title="10,000 Steps")

    # V_500k = mc_prediction(sample_policy, env, num_episodes=500000)
    # plotting.plot_value_function(V_500k, title="500,000 Steps")
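
sample_policy is referenced above but not shown; a minimal sketch of the usual evaluation policy ("stick on 20 or 21, otherwise hit") would be:

def sample_policy(observation):
    # Illustrative policy: stick (0) on 20 or 21, otherwise hit (1).
    score, dealer_score, usable_ace = observation
    return 0 if score >= 20 else 1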
Example #9
    def test_openai_gym(self):
        self.start_tests(name='openai-gym')

        # state: box, action: discrete
        self.unittest(environment=dict(environment='gym', level='CartPole-v0'),
                      num_episodes=2)

        # state: discrete, action: box
        self.unittest(environment=dict(environment='gym',
                                       level='GuessingGame'),
                      num_episodes=2)

        # state: discrete, action: tuple(discrete)
        from gym.envs.algorithmic import ReverseEnv
        self.unittest(environment=ReverseEnv, num_episodes=2)

        # state: tuple, action: discrete
        from gym.envs.toy_text import BlackjackEnv
        self.unittest(environment=BlackjackEnv(), num_episodes=2)

        # Classic control
        # above: self.unittest(environment='CartPole-v1', num_episodes=2)
        self.unittest(environment='MountainCar-v0', num_episodes=2)
        self.unittest(environment='MountainCarContinuous-v0', num_episodes=2)
        self.unittest(environment='Pendulum-v0', num_episodes=2)
        self.unittest(environment='Acrobot-v1', num_episodes=2)

        # Box2d
        self.unittest(environment='LunarLander-v2', num_episodes=2)
        self.unittest(environment='LunarLanderContinuous-v2', num_episodes=2)
        self.unittest(environment='BipedalWalker-v3', num_episodes=2)
        self.unittest(environment='BipedalWalkerHardcore-v3', num_episodes=2)
        # below: self.unittest(environment='CarRacing-v0', num_episodes=2)

        # Toy text
        # above: self.unittest(environment='Blackjack-v0', num_episodes=2)
        self.unittest(environment='KellyCoinflip-v0', num_episodes=2)
        # TODO: out-of-bounds problems!
        # self.unittest(environment=dict(
        #     environment='KellyCoinflipGeneralized-v0', clip_distributions=True
        # ), num_episodes=2)
        self.unittest(environment='FrozenLake-v0', num_episodes=2)
        self.unittest(environment='FrozenLake8x8-v0', num_episodes=2)
        self.unittest(environment='CliffWalking-v0', num_episodes=2)
        self.unittest(environment='NChain-v0', num_episodes=2)
        self.unittest(environment='Roulette-v0', num_episodes=2)
        self.unittest(environment='Taxi-v3', num_episodes=2)
        # above: self.unittest(environment='GuessingGame-v0', num_episodes=2)
        self.unittest(environment='HotterColder-v0', num_episodes=2)

        # Algorithmic
        self.unittest(environment='Copy-v0', num_episodes=2)
        self.unittest(environment='RepeatCopy-v0', num_episodes=2)
        self.unittest(environment='ReversedAddition-v0', num_episodes=2)
        self.unittest(environment='ReversedAddition3-v0', num_episodes=2)
        self.unittest(environment='DuplicatedInput-v0', num_episodes=2)
        # above: self.unittest(environment='Reverse-v0', num_episodes=2)

        # Unit test
        self.unittest(environment='CubeCrash-v0', num_episodes=2)
        self.unittest(environment='CubeCrashSparse-v0', num_episodes=2)
        self.unittest(environment='CubeCrashScreenBecomesBlack-v0',
                      num_episodes=2)
        self.unittest(environment='MemorizeDigits-v0', num_episodes=2)
Example #10
    def __init__(self):
        env = BlackjackEnv()
        super().__init__(env)
        self.observation_space = spaces.Discrete(704)
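
Discrete(704) covers every (player sum, dealer card, usable ace) tuple, since 32 * 11 * 2 = 704. A hedged sketch of a matching observation() method, assuming the class is a gym.ObservationWrapper (both that assumption and the flattening scheme below are illustrative, not part of the original snippet):

    def observation(self, obs):
        # Assumed flattening: player sum in [0, 31], dealer card in [0, 10], usable ace as 0/1.
        player_sum, dealer_card, usable_ace = obs
        return (player_sum * 11 + dealer_card) * 2 + int(usable_ace)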