def main():
    # non-deterministic
    env = FrozenLakeEnv(is_slippery=True)

    Q, V, pi = sarsa(env, 2000)
    plot_value_func(V, "value func: sarsa - non-determ")
    plot_policy(pi, "policy: sarsa - non-determ")
    print "pi1", pi
    print "Q1:", Q

    Q, V, pi = q_learning(env, 2000)
    plot_value_func(V, "value func: q-learning - non-determ")
    plot_policy(pi, "policy: q-learning - non-determ")
    print "pi2", pi
    print "Q2:", Q

    # deterministic
    env = FrozenLakeEnv(is_slippery=False)

    Q, V, pi = sarsa(env, 1000)
    plot_value_func(V, "value func: sarsa - determ")
    plot_policy(pi, "policy: sarsa - determ")
    # print "pi4", pi
    print "Q4:", Q

    Q, V, pi = q_learning(env, 1000)
    plot_value_func(V, "value func: q-learning - determ")
    plot_policy(pi, "policy: q-learning - determ")
    # print "pi5", pi
    print "Q5:", Q
Example #2
def main():
    new_map = ["SFFF", "FHFH", "FFFH", "HFFG"]
    env = FrozenLakeEnv(desc=new_map, is_slippery=IS_SLIPPERY)
    env = env.unwrapped
    succeed_episode = 0

    for i_episode in range(1000000):

        if use_random_map and i_episode % 10 == 0:
            env.close()
            new_map = random_map(HOLE_NUM)
            env = FrozenLakeEnv(desc=new_map, is_slippery=IS_SLIPPERY)
            env = env.unwrapped

        pos = env.reset()
        state = encode_state(new_map, pos)

        ep_r = 0

        while True:
            a = select_action(state)

            pos_next, r, done, info = env.step(a)
            ep_r += r
            state = encode_state(new_map, pos_next)  # advance the encoded state

            if args.render:
                env.render()
            model.rewards.append(r)

            if done:
                break

        finish_episode()

        episode_durations.append(ep_r)

        if ep_r > 0:
            # EPSILON = 1 - 1. / ((i_episode / 500) + 10)
            succeed_episode += 1

        if i_episode % 1000 == 1:
            print('EP: {:d} succeed rate {:.4f}'.format(i_episode,
                                                        succeed_episode / 1000))
            succeed_episode = 0

        if i_episode % 5000 == 1:
            plot_durations()
Example #3
def find_good_maps(map_p=0.8):
    sizes = MAP_SIZES
    # sizes = [4, 8]
    seeds = range(20)
    best_maps = {}

    for size in sizes:
        smallest_lost_games_perc = float('inf')
        best_map = None
        for seed in seeds:
            print(f'Finding best maps with size {size} (seed {seed})...')
            np.random.seed(seed)
            map = generate_random_map(size=size, p=map_p)
            env = FrozenLakeEnv(desc=map)
            optimal_policy, optimal_value_function = value_iteration(
                env, theta=0.0000001, discount_factor=0.999)
            optimal_policy_flat = np.where(optimal_policy == 1)[1]
            mean_number_of_steps, lost_games_perc = score_frozen_lake(
                env, optimal_policy_flat)
            if lost_games_perc < smallest_lost_games_perc:
                smallest_lost_games_perc = lost_games_perc
                best_map = map
        best_maps[size] = {
            'lost_games_perc': smallest_lost_games_perc,
            'map': best_map
        }

    with open(f'best_maps_{map_p}.json', "wb") as f:
        f.write(json.dumps(best_maps).encode("utf-8"))
    return best_maps
Example #4
File: rtdp.py Project: instance01/RTDP
def run():
    env = FrozenLakeEnv(desc=MAP_20x20)
    rtdp = RTDP(env)
    tot_rewards = 0.
    eval_iter = int(1e4)
    for i in range(eval_iter):
        if i % 100 == 0:
            print(i)
        tot_rewards += evaluate(rtdp, env)
    print(tot_rewards / eval_iter)
Example #5
def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    """ loads premade env from openai gym
        desc: either none or a list of lists of custom description of map
        map_name: either none or a string containing name of premade map
          (if both desc and map name are none will load randomly
        is_slippery: bool for if ice is slippery
        returns: the env
    """
    env = FrozenLakeEnv(desc, map_name, is_slippery)
    return env
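A minimal usage sketch for the helper above (not part of the original example); the 4x4 map literal is only illustrative.

custom_map = [
    "SFFF",
    "FHFH",
    "FFFH",
    "HFFG",
]
env = load_frozen_lake(desc=custom_map, is_slippery=True)
state = env.reset()                                  # index of the start tile
print(env.observation_space.n, env.action_space.n)   # 16 states, 4 actions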
Example #6
def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    """ loads the pre-made FrozenLakeEnv evnironment from OpenAIs gym.
        Args:
            desc: (list/None) lists containing a custom description
                  of the map to load for the environment.
            map_name: (str/None) containing the pre-made map to load.
            is_slippery: (bool) boolean to determine if the ice is slippery.
        Returns:
            the environment.
    """
    return FrozenLakeEnv(desc, map_name, is_slippery)
Example #7
def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    """
    Loads a premade FrozenLakeEnv environment from OpenAI's gym
    desc: None, or a list of lists containing a custom description of the map
        to load for the environment.
    map_name: None or a string containing the pre-made map to load
    is_slippery: boolean to determine if the ice is slippery
    Returns: The environment
    """
    env = FrozenLakeEnv(desc, map_name, is_slippery)
    return env
Example #8
    def test_expected(self):
        env = FrozenLakeEnv(is_slippery=False)
        policy = UserInputPolicy(env)

        s = env.reset()
        env.render()

        for i in [RIGHT, RIGHT, DOWN, DOWN, DOWN, RIGHT]:
            with MockInputFunction(return_value=i):
                a = policy(s)

            s, r, done, info = env.step(a)
            env.render()

            if done:
                break
Example #9
def main():
    from gym.envs.toy_text.frozen_lake import FrozenLakeEnv
    env = FrozenLakeEnv(is_slippery=False, map_name=f'{SIZE[0]}x{SIZE[1]}')
    env.actions = [0, 1, 2, 3]
    #Q = np.zeros((SIZE[0]*SIZE[1], len(env.actions)))
    Q = np.random.uniform(low=0,
                          high=1,
                          size=(SIZE[0] * SIZE[1], len(env.actions)))
    print(obtain_value_function(Q))
    episodes = 10000
    eps = .3
    Qs, episode_lengths = [], []
    """for n in range(1,15):
        plt.figure()
        for alpha in np.arange(.1, 1., .1):
            Q_learned, episode_length = n_step_sarsa(Q.copy(), n, episodes, env, alpha, 0.8, eps=eps)
            Qs.append(Q_learned)
            episode_lengths.append(episode_length)
            plt.plot(smooth(episode_length), label=f'$n=${n}, $\\alpha=${alpha:.1f}')
        plt.legend()"""
    n = 4
    Q_learned, episode_lengths_qlearning = n_step_sarsa(Q.copy(),
                                                        n,
                                                        episodes,
                                                        env,
                                                        0.1,
                                                        0.8,
                                                        eps=eps)
    plt.legend()

    value = obtain_value_function(Q_learned)

    policy = obtain_policy(Q_learned)

    pretty_print(value, policy, f'${n}$-step Sarsa')
    plt.show()
Example #10
def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    '''Loads the pre-made FrozenLakeEnv environment from OpenAI’s gym
    Args:
        desc: is either None or a list of lists containing a custom description
                of the map to load for the environment
        map_name: is either None or a string containing the pre-made map to load
        Note: If both desc and map_name are None, the environment will load a
            randomly generated 8x8 map
        is_slippery: is a boolean to determine if the ice is slippery
    Returns: the environment
    '''
    if desc is None and map_name is None:
        map_name = "8x8"
    env = FrozenLakeEnv(desc, map_name, is_slippery)
    return env
Example #11
def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    """Loads the pre-made FrozenLakeEnv environment from OpenAI’s gym.

    Args:
        desc (list): is either None or a list of lists containing a custom
                     description of the map to load for the environment.
        map_name (str):  is either None or a string containing the pre-made
                         map to load.
        is_slippery (bool): determine if the ice is slippery.

    Returns:
        the environment.
    """

    env = FrozenLakeEnv(desc, map_name, is_slippery)

    return env
Example #12
File: qlearning.py Project: huyoboy/dl
import numpy as np
import gym
import random
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

# env = gym.make("FrozenLake-v0", is_slippery=True)
env = FrozenLakeEnv(desc=None, map_name="4x4", is_slippery=False)

action_size = env.action_space.n
state_size = env.observation_space.n

qtable = np.zeros((state_size, action_size))
print(qtable)

total_episodes = 10000        # Total episodes
learning_rate = 0.8           # Learning rate
max_steps = 99                # Max steps per episode
gamma = 0.95                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.2            # Minimum exploration probability
decay_rate = 0.01             # Exponential decay rate for exploration prob

# List of rewards
rewards = []

# For life or until learning is stopped
for episode in range(total_episodes):
    # Reset the environment
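    # Hedged sketch of how this truncated training loop typically continues;
    # the lines below are not the original author's code, they only reuse the
    # hyperparameters and names defined above.
    state = env.reset()
    total_rewards = 0

    for step in range(max_steps):
        # Epsilon-greedy action selection: exploit with probability (1 - epsilon)
        if random.uniform(0, 1) > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Tabular Q-learning update
        qtable[state, action] += learning_rate * (
            reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])

        total_rewards += reward
        state = new_state
        if done:
            break

    # Decay the exploration rate and record the episode return
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)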
Example #13
import gym
import random
import numpy as np
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

char_list = list('SFFFFFFFFFFFFFFG')
for i in range(2):
    char_list[random.randint(1, 14)] = 'H'
my_map = [''.join(char_list[i:i + 4]) for i in [0, 4, 8, 12]]
env = FrozenLakeEnv(desc=np.asarray(my_map, dtype='c'), is_slippery=False)
env = env.unwrapped

for i in range(10):
    b = env.render()
    a = env.step(1)
    print(a)
Example #14
#
#  train_q_agent.py
#  Training Q-learning agent in OpenAI Gym's FrozenLake env
#
import random

import gym
from q_agent import QAgent
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

# How long do we play
NUM_EPISODES = 500
# How often we print results
PRINT_EVERY_EPS = 100

environment = FrozenLakeEnv(is_slippery=False)

num_states = environment.observation_space.n
num_actions = environment.action_space.n

agent = QAgent(num_states, num_actions)

sum_reward = 0

for episode in range(NUM_EPISODES):
    done = False
    last_state = environment.reset()
    last_reward = None
    # Number of steps taken. A bit of a safeguard...
    num_steps = 0
    while not done:
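        # Hedged sketch of a typical continuation; the QAgent method names used
        # below (get_action, update) are assumptions for illustration and may
        # differ from the ones defined in q_agent.py.
        action = agent.get_action(last_state)
        state, reward, done, info = environment.step(action)
        agent.update(last_state, action, reward, state, done)
        sum_reward += reward
        last_state = state
        num_steps += 1
        if num_steps > 200:  # safeguard against endless episodes
            break

    if (episode + 1) % PRINT_EVERY_EPS == 0:
        print("Episode {}: average reward {:.3f}".format(
            episode + 1, sum_reward / PRINT_EVERY_EPS))
        sum_reward = 0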
Example #15
#!/usr/bin/env python3
#
#  learn_random_v.py
#  Learning value function of random agent in FrozenLakeEnv
#
import gym
from v_table import VTable
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

# How long do we play
NUM_EPISODES = 10000
# How often we show current V-estimate
SHOW_EVERY_EPISODES = 100

environment = FrozenLakeEnv(is_slippery=False)

num_states = environment.observation_space.n

# Create a tabular record of values
vtable = VTable(num_states)

for episode in range(NUM_EPISODES):
    done = False
    state = environment.reset()
    # Keep track of visited states and rewards
    # obtained
    states = []
    rewards = []
    while not done:
        # Store state
        states.append(state)
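        # Hedged sketch of a typical continuation; vtable.update() is an assumed
        # method name for illustration and may differ from v_table.py.
        action = environment.action_space.sample()  # random agent
        state, reward, done, info = environment.step(action)
        rewards.append(reward)

    # Monte Carlo update: each visited state is nudged towards the
    # (undiscounted) return observed from that point onwards.
    episode_return = 0.0
    for s, r in zip(reversed(states), reversed(rewards)):
        episode_return += r
        vtable.update(s, episode_return)

    if (episode + 1) % SHOW_EVERY_EPISODES == 0:
        print(vtable)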
Example #16
def random_map(holes_num):
    char_list = list('SFFFFFFFFFFFFFFG')
    for i in range(holes_num):
        char_list[random.randint(1, 14)] = 'H'
    my_map = [''.join(char_list[i:i + 4]) for i in [0, 4, 8, 12]]
    return my_map


def encode_state(map, position):
    np_map = np.asarray(map, dtype='c').reshape(16)
    holes = np.where(np_map == b'H', 1, 0)
    frozen = np.where(np_map == b'F', 1, 0)
    position = np.identity(16)[position]
    return np.hstack([holes, frozen, position])
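# Illustrative sanity check (not part of the original file): encode_state()
# stacks a 16-dim hole mask, a 16-dim frozen mask and a 16-dim one-hot agent
# position, so the encoded vector has 16 + 16 + 16 = 48 entries (N_STATES below).
_example_map = ["SFFF", "FHFH", "FFFH", "HFFG"]
assert encode_state(_example_map, 0).shape == (48,)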


env = FrozenLakeEnv(desc=random_map(HOLE_NUM), is_slippery=IS_SLIPPERY)
env = env.unwrapped
N_ACTIONS = env.action_space.n
N_STATES = 16 + 16 + 16  # 'F', 'H', 'where is the Agent'
ENV_A_SHAPE = 0 if isinstance(
    env.action_space.sample(),
    int) else env.action_space.sample().shape  # to confirm the shape

episode_durations = []


def plot_durations():
    plt.figure(1)
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    plt.title('Training...')
    plt.xlabel('Episode(1000)')
Example #17
import numpy as np
import keras_gym as km
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT

if tf.__version__ >= '2.0':
    tf.compat.v1.disable_eager_execution()  # otherwise incredibly slow


# the MDP
actions = {LEFT: 'L', RIGHT: 'R', UP: 'U', DOWN: 'D'}
env = FrozenLakeEnv(is_slippery=False)
env = km.wrappers.TrainMonitor(env)

# show logs from TrainMonitor
km.enable_logging()


class LinearFunc(km.FunctionApproximator):
    """ linear function approximator (body only does one-hot encoding) """
    def body(self, S):
        one_hot_encoding = keras.layers.Lambda(lambda x: K.one_hot(x, 16))
        return one_hot_encoding(S)


# define function approximators
func = LinearFunc(env, lr=0.01)
pi = km.SoftmaxPolicy(func, update_strategy='vanilla')
cache = km.caching.MonteCarloCache(env, gamma=0.99)
Example #18
            # env.render()
            action = agent.select_action(state, selection_method)
            next_state, reward, done, _ = env.step(action)
            score += reward
            agent.update(state, action, reward, next_state, done,
                         update_method)
            state = next_state
            if done:
                break
        scores[episode] = score
        print('Episode %s \t Return %s' % (episode, score))

    return scores


env = FrozenLakeEnv(is_slippery=False)

n_episodes = 1000
horizon = 20

scores_vpi = simulate(env, n_episodes, horizon, BQLearning.MYOPIC_VPI,
                      BQLearning.MOMENT_UPDATING)
scores_qvs = simulate(env, n_episodes, horizon, BQLearning.Q_VALUE_SAMPLING,
                      BQLearning.MOMENT_UPDATING)

plt.style.use('ggplot')
fig, ax = plt.subplots()
ax.plot(cummean(scores_vpi), label='Myopic VPI')
ax.plot(cummean(scores_qvs), label='QValue sampling')
ax.set_xlabel('episode')
ax.set_ylabel('cumulative average return')
Example #19
def V(s):
    return (1 - epsilon) * np.max(Q[s, :]) + epsilon * np.mean(Q[s, :])


"""
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3
"""
# Deterministic environment

from gym.envs.toy_text.frozen_lake import FrozenLakeEnv
#env = FrozenLakeEnv(is_slippery=False)
env = FrozenLakeEnv(map_name="8x8")
#env = gym.make("FrozenLake-v0")

num_states = env.observation_space.n
num_actions = env.action_space.n
Q = np.zeros([num_states, num_actions])
#Q = np.random.randn(num_states,num_actions)

num_episodes = 100000
rewardvector = []
gamma = 0.7
alpha = 0.4
epsilon = 0.5


#print(Q.shape)
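# Hedged sketch of a training loop that fits the setup above (not the original
# author's code): an expected-SARSA-style update that bootstraps with V(s'),
# the epsilon-greedy state value defined at the top of this example.
for episode in range(num_episodes):
    s = env.reset()
    done = False
    episode_reward = 0.0
    while not done:
        # epsilon-greedy action selection
        if np.random.rand() < epsilon:
            a = env.action_space.sample()
        else:
            a = int(np.argmax(Q[s, :]))
        s_next, r, done, info = env.step(a)
        Q[s, a] += alpha * (r + gamma * V(s_next) - Q[s, a])
        episode_reward += r
        s = s_next
    rewardvector.append(episode_reward)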
Example #20
import numpy as np
import numpy.random as rd
import matplotlib.pyplot as plt
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

# Algorithm parameters
learning_rate = 0.5
gamma = 1.
epsilon = .01
render = False
N_trial = 1000
N_trial_test = 100
trial_duration = 100

# Generate the environment
env = FrozenLakeEnv(map_name='4x4', is_slippery=False)
n_state = env.observation_space.n
n_action = env.action_space.n

# Initialize the Q values
Q_table = np.zeros((n_state, n_action))


def policy(Q_table, state, epsilon):
    '''
    Implementation of the epsilon-greedy policy.

    :param Q_table: table containing the expected return for each state-action pair
    :param state: the current state index
    :param epsilon: probability of taking a random (exploratory) action
    :return: the selected action
    '''
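    # Hedged sketch of the cut-off function body (not the original author's
    # code): with probability epsilon take a random action, otherwise act
    # greedily with respect to Q_table.
    if rd.rand() < epsilon:
        return rd.randint(Q_table.shape[1])
    return int(np.argmax(Q_table[state, :]))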
Example #21
def create_env(lake_map):
    return FrozenLakeEnv(desc=lake_map)
Example #22
def V(s):
    return (1 - epsilon) * np.max(Q[s, :]) + epsilon * np.mean(Q[s, :])


"""
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3
"""
# Deterministic environment

from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

env = FrozenLakeEnv(is_slippery=False)
#env = FrozenLakeEnv(map_name="8x8")
#env = gym.make("FrozenLake-v0")

num_states = env.observation_space.n
num_actions = env.action_space.n
Q = np.zeros([num_states, num_actions])
#Q = np.random.randn(num_states,num_actions)

num_episodes = 100
rewardvector = []
gamma = 0.5
alpha = 0.8
epsilon = 0.5

Example #23
    def __init__(self):
        super().__init__()
        self.env = FrozenLakeEnv(map_name="4x4", is_slippery=True)
Example #24
def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    """Loads a pre-made FrozenLakeEnv environment"""
    env = FrozenLakeEnv(desc, map_name, is_slippery)
    return env