from collections import defaultdict, namedtuple
from typing import Callable, Dict, Tuple, Union

import numpy as np
from tqdm import tqdm

from envs.blackjack import BlackjackEnv

# One transition of an episode: (state, action, reward).
Step = namedtuple("Step", ["state", "action", "reward"])


def mc_control_importance_sampling(
        env: BlackjackEnv,
        num_episodes: Union[int, float],
        behavior_policy: Callable,
        discount_factor: float = 1.0) -> Tuple[Dict, Callable]:
    """Off-policy Monte Carlo control using weighted importance sampling."""
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    C = defaultdict(lambda: np.zeros(env.action_space.n))

    target_policy = create_greedy_policy(Q)

    for _ in tqdm(range(int(num_episodes))):
        episode = []
        state = env.reset()
        is_over = False
        while not is_over:
            probs = behavior_policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            next_state, reward, is_over, _ = env.step(action)
            episode.append(Step(state, action, reward))
            state = next_state

        g = 0.0
        w = 1.0
        for ep in episode[::-1]:
            g = discount_factor * g + ep.reward
            C[ep.state][ep.action] += w
            Q[ep.state][ep.action] += (w / C[ep.state][ep.action]) * (
                g - Q[ep.state][ep.action])
            # Once the behavior action deviates from the greedy target action,
            # the importance weight for all earlier steps is zero, so stop.
            if ep.action != np.argmax(target_policy(ep.state)):
                break
            w = w * 1.0 / behavior_policy(ep.state)[ep.action]

    return Q, target_policy
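# create_greedy_policy is referenced above but not defined in this excerpt.
# A minimal sketch of the assumed helper (greedy w.r.t. Q, returned as a
# one-hot action distribution), plus a hypothetical uniform behavior policy
# and an example call; names other than create_greedy_policy are illustrative.
def create_greedy_policy(Q: Dict) -> Callable:
    def policy_fn(state):
        probs = np.zeros_like(Q[state], dtype=float)
        probs[np.argmax(Q[state])] = 1.0
        return probs
    return policy_fn


def create_random_policy(n_actions: int) -> Callable:
    probs = np.ones(n_actions, dtype=float) / n_actions
    def policy_fn(state):
        return probs
    return policy_fn


# Example usage (sketch):
# env = BlackjackEnv()
# behavior = create_random_policy(env.action_space.n)
# Q, target_policy = mc_control_importance_sampling(env, 500000, behavior)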
def mc_prediction(policy: Callable,
                  env: BlackjackEnv,
                  num_episodes: Union[int, float],
                  discount_factor: float = 1.0) -> Dict[Tuple, float]:
    """First-visit Monte Carlo prediction of V for the given deterministic policy."""
    rewards_sum = defaultdict(float)
    rewards_count = defaultdict(float)
    v = defaultdict(float)

    for _ in tqdm(range(int(num_episodes))):

        episode = []
        state = env.reset()
        is_over = False
        while not is_over:
            action = policy(state)
            next_state, reward, is_over, _ = env.step(action)
            episode.append(Step(state, action, reward))
            state = next_state

        states_in_episode = set([tuple(ep.state) for ep in episode])
        for state in states_in_episode:
            first_visit_idx = next(i for i, ep in enumerate(episode)
                                   if ep.state == state)
            g = sum([
                ep.reward * (discount_factor**i)
                for i, ep in enumerate(episode[first_visit_idx:])
            ])
            rewards_sum[state] += g
            rewards_count[state] += 1.0
            v[state] = rewards_sum[state] / rewards_count[state]

    return v
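# A usage sketch for mc_prediction (not part of the original excerpt): evaluate
# the naive "stick on 20 or 21" policy. The policy returns an action directly,
# matching how mc_prediction calls it; the name sample_policy is illustrative.
def sample_policy(observation):
    score, dealer_score, usable_ace = observation
    # Stick (0) once the player reaches 20, otherwise hit (1).
    return 0 if score >= 20 else 1


# env = BlackjackEnv()
# V_10k = mc_prediction(sample_policy, env, num_episodes=10000)
# V_500k = mc_prediction(sample_policy, env, num_episodes=500000)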
def mc_control_epsilon_greedy(
        env: BlackjackEnv,
        num_episodes: Union[int, float],
        discount_factor: float = 1.0,
        epsilon: float = 0.1) -> Tuple[Dict, Callable[[Tuple], np.ndarray]]:
    """On-policy first-visit MC control with an epsilon-greedy policy."""
    rewards_sum = defaultdict(float)
    rewards_count = defaultdict(float)

    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for _ in tqdm(range(int(num_episodes))):

        episode = []
        state = env.reset()
        is_over = False
        while not is_over:
            probs = policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            next_state, reward, is_over, _ = env.step(action)
            episode.append(Step(state, action, reward))
            state = next_state

        sa_in_episode = set([(tuple(ep.state), ep.action) for ep in episode])
        for state, action in sa_in_episode:
            sa_pair = (state, action)
            first_visit = next(i for i, ep in enumerate(episode)
                               if ep.state == state and ep.action == action)
            g = sum([
                ep.reward * (discount_factor**i)
                for i, ep in enumerate(episode[first_visit:])
            ])
            rewards_sum[sa_pair] += g
            rewards_count[sa_pair] += 1.0
            Q[state][action] = rewards_sum[sa_pair] / rewards_count[sa_pair]

    return Q, policy
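# make_epsilon_greedy_policy is referenced above but not defined in this
# excerpt. A minimal sketch of the assumed helper: with probability epsilon
# act uniformly at random, otherwise pick the greedy action under Q. Because
# the returned function closes over the same Q that the control loop updates,
# the policy improves as Q improves.
def make_epsilon_greedy_policy(Q: Dict, epsilon: float, n_actions: int) -> Callable:
    def policy_fn(state):
        probs = np.ones(n_actions, dtype=float) * epsilon / n_actions
        probs[np.argmax(Q[state])] += 1.0 - epsilon
        return probs
    return policy_fn


# Example usage (sketch): run control, then reduce Q to V for plotting.
# Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)
# V = {state: np.max(action_values) for state, action_values in Q.items()}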
Example #4
# -*- coding: utf-8 -*-

import gym
import matplotlib
import numpy as np
import sys
import matplotlib.pyplot as pl
from collections import defaultdict
from envs.blackjack import BlackjackEnv
from lib import plotting
import envs
#matplotlib.style.use('ggplot')

env = BlackjackEnv()


def mc_prediction(policy, env, num_episodes, discount_factor=1.0):
    """
    Monte Carlo prediction algorithm. Calculates the value function
    for a given policy using sampling.
    
    Args:
        policy: A function that maps an observation to action probabilities.
        env: OpenAI gym environment.
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
    
    Returns:
        A dictionary that maps from state -> value.
        The state is a tuple and the value is a float.
    """
Example #5
import numpy as np
import sys
if "../" not in sys.path:
    sys.path.append("../")
from envs.blackjack import BlackjackEnv

env = BlackjackEnv()


def print_observation(observation):
    score, dealer_score, usable_ace = observation
    print("Player Score: {} (Usable Ace: {}), Dealer Score: {}".format(
        score, usable_ace, dealer_score))


def strategy(observation):
    score, dealer_score, usable_ace = observation
    # Stick (action 0) if the score is >= 20, hit (action 1) otherwise
    return 0 if score >= 20 else 1


for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        print_observation(observation)
        action = strategy(observation)
        print("Taking action: {}".format(["Stick", "Hit"][action]))
        observation, reward, done, _ = env.step(action)
        if done:
            print_observation(observation)
            print("Game end. Reward: {}\n".format(float(reward)))