Example #1
def main():
    new_map = ["SFFF", "FHFH", "FFFH", "HFFG"]
    env = FrozenLakeEnv(desc=new_map, is_slippery=IS_SLIPPERY)
    env = env.unwrapped
    succeed_episode = 0

    for i_episode in range(1000000):

        if use_random_map and i_episode % 10 == 0:
            env.close()
            new_map = random_map(HOLE_NUM)
            env = FrozenLakeEnv(desc=new_map, is_slippery=IS_SLIPPERY)
            env = env.unwrapped

        pos = env.reset()
        state = encode_state(new_map, pos)

        ep_r = 0

        while True:
            a = select_action(state)

            pos_next, r, done, info = env.step(a)
            ep_r += r
            state = encode_state(new_map, pos_next)  # advance the state; without this the policy keeps seeing the initial cell

            if args.render:
                env.render()
            model.rewards.append(r)

            if done:
                break

        finish_episode()

        episode_durations.append(ep_r)

        if ep_r > 0:
            # EPSILON = 1 - 1. / ((i_episode / 500) + 10)
            succeed_episode += 1

        if i_episode % 1000 == 1:
            print('EP: {:d} success rate {:.4f}'.format(i_episode,
                                                        succeed_episode / 1000))
            succeed_episode = 0

        if i_episode % 5000 == 1:
            plot_durations()
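Example #1 leans on helpers defined elsewhere in its repository (random_map, encode_state, select_action, finish_episode, and the model object), so it is not runnable on its own. As a hedged illustration only, encode_state could plausibly turn the map plus agent position into a feature vector; the signature matches the call sites above, but the encoding itself is an assumption:

import numpy as np

def encode_state(desc, pos):
    # Hypothetical sketch: one-hot of the agent's flattened cell index,
    # concatenated with a binary mask marking the holes in the map.
    flat = ''.join(desc)  # e.g. "SFFF" + "FHFH" + "FFFH" + "HFFG"
    holes = np.array([c == 'H' for c in flat], dtype=np.float32)
    one_hot = np.zeros(len(flat), dtype=np.float32)
    one_hot[pos] = 1.0
    return np.concatenate([one_hot, holes])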
Example #2
    def test_expected(self):
        env = FrozenLakeEnv(is_slippery=False)
        policy = UserInputPolicy(env)

        s = env.reset()
        env.render()

        # This action sequence walks S -> G on the default 4x4 map.
        for i in [RIGHT, RIGHT, DOWN, DOWN, DOWN, RIGHT]:
            with MockInputFunction(return_value=i):
                a = policy(s)

            s, r, done, info = env.step(a)
            env.render()

            if done:
                break
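MockInputFunction is a helper from the test's own codebase, not a gym API. A minimal context manager with the same observable behaviour would temporarily patch builtins.input so that UserInputPolicy reads a scripted action instead of prompting; this is an inferred sketch, not the original helper:

import builtins

class MockInputFunction:
    # Sketch: replace builtins.input for the duration of the with-block,
    # returning the scripted value, then restore the real input function.
    def __init__(self, return_value):
        self.return_value = return_value

    def __enter__(self):
        self._original = builtins.input
        builtins.input = lambda *args: str(self.return_value)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        builtins.input = self._original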
Example #3
    else:
        # SARSA target for non-terminal transitions: r + gamma * Q(s', a')
        delta = (reward + gamma * Q_table[new_state, new_action] -
                 Q_table[state, action])

    Q_table[state, action] += learning_rate * delta


reward_list = []
for k in range(N_trial + N_trial_test):

    acc_reward = 0  # Init the accumulated reward
    observation = env.reset()  # Init the state
    action = policy(Q_table, observation, epsilon)  # Init the first action

    for t in range(trial_duration):
        if render:
            env.render()

        new_observation, reward, done, info = env.step(
            action)  # Take the action
        new_action = policy(Q_table, new_observation, epsilon)
        update_Q_table(Q_table=Q_table,
                       state=observation,
                       action=action,
                       reward=reward,
                       new_state=new_observation,
                       new_action=new_action,
                       is_done=done)

        observation = new_observation  # Pass the new state to the next step
        action = new_action  # Pass the new action to the next step
        acc_reward += reward  # Accumulate the reward
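The loop above calls a policy(Q_table, observation, epsilon) helper that is not shown. A minimal epsilon-greedy implementation consistent with how it is used here (the behaviour is inferred from the call sites, not taken from the source):

import numpy as np

def policy(Q_table, state, epsilon):
    # Epsilon-greedy over the tabular Q estimates: explore with
    # probability epsilon, otherwise act greedily for this state.
    if np.random.rand() < epsilon:
        return np.random.randint(Q_table.shape[1])
    return int(np.argmax(Q_table[state, :]))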
Example #4
import gym
import random
import numpy as np
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

# Build a random 4x4 map: S at the start, G at the goal, two holes at random cells.
char_list = list('SFFFFFFFFFFFFFFG')
for i in random.sample(range(1, 15), 2):  # sample without replacement so the two holes are distinct
    char_list[i] = 'H'
my_map = [''.join(char_list[i:i + 4]) for i in [0, 4, 8, 12]]
env = FrozenLakeEnv(desc=np.asarray(my_map, dtype='c'), is_slippery=False)
env = env.unwrapped

env.reset()  # initialise the state before stepping
for i in range(10):
    env.render()
    a = env.step(1)  # action 1 = DOWN; returns the old-style (obs, reward, done, info) tuple
    print(a)
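Because this loop steps blindly for ten iterations, it can walk past the end of an episode. A common guard (a sketch, not part of the original example) is to reset whenever the environment reports done:

obs = env.reset()
for i in range(10):
    env.render()
    obs, reward, done, info = env.step(1)  # old-style gym 4-tuple; action 1 = DOWN
    print(obs, reward, done, info)
    if done:
        obs = env.reset()  # start a fresh episode after a hole or the goal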
Example #5
    #     print(episode, qtable)
    #     print(total_rewards)

print("Score over time: " + str(sum(rewards) / total_episodes))
print(qtable)

for episode in range(1):
    state = env.reset()
    print("state",state)
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        env.render()
        # Take the action (index) that has the maximum expected future reward, given this state
        action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        if done:
            print("done..........")
            # env.render()
            break
        state = new_state
env.close()
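The training loop that filled qtable is elided above (only its commented-out debug prints remain). For context, a standard tabular Q-learning loop of the kind this evaluation assumes would look roughly like the sketch below; alpha, gamma, and epsilon are assumed hyperparameters, not values from the source:

import numpy as np

alpha, gamma, epsilon = 0.1, 0.99, 0.1  # assumed hyperparameters

for episode in range(total_episodes):
    state = env.reset()
    for _ in range(max_steps):
        # Epsilon-greedy action selection over the current Q estimates.
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(qtable[state, :])
        new_state, reward, done, info = env.step(action)
        # Q-learning update: move Q(s, a) toward r + gamma * max_a' Q(s', a').
        qtable[state, action] += alpha * (
            reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
        state = new_state
        if done:
            break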