def main():
    # non-deterministic
    env = FrozenLakeEnv(is_slippery=True)
    Q, V, pi = sarsa(env, 2000)
    plot_value_func(V, "value func: sarsa - non-determ")
    plot_policy(pi, "policy: sarsa - non-determ")
    print("pi1", pi)
    print("Q1:", Q)

    Q, V, pi = q_learning(env, 2000)
    plot_value_func(V, "value func: q-learning - non-determ")
    plot_policy(pi, "policy: q-learning - non-determ")
    print("pi2", pi)
    print("Q2:", Q)

    # deterministic
    env = FrozenLakeEnv(is_slippery=False)
    Q, V, pi = sarsa(env, 1000)
    plot_value_func(V, "value func: sarsa - determ")
    plot_policy(pi, "policy: sarsa - determ")
    # print("pi4", pi)
    print("Q4:", Q)

    Q, V, pi = q_learning(env, 1000)
    plot_value_func(V, "value func: q-learning - determ")
    plot_policy(pi, "policy: q-learning - determ")
    # print("pi5", pi)
    print("Q5:", Q)
def main():
    new_map = ["SFFF", "FHFH", "FFFH", "HFFG"]
    env = FrozenLakeEnv(desc=new_map, is_slippery=IS_SLIPPERY)
    env = env.unwrapped
    succeed_episode = 0
    for i_episode in range(1000000):
        if use_random_map and i_episode % 10 == 0:
            env.close()
            new_map = random_map(HOLE_NUM)
            env = FrozenLakeEnv(desc=new_map, is_slippery=IS_SLIPPERY)
            env = env.unwrapped
        pos = env.reset()
        state = encode_state(new_map, pos)
        ep_r = 0
        while True:
            a = select_action(state)
            pos_next, r, done, info = env.step(a)
            ep_r += r
            # state_next = encode_state(new_map, pos_next)
            if args.render:
                env.render()
            model.rewards.append(r)
            if done:
                break
        finish_episode()
        episode_durations.append(ep_r)
        if ep_r > 0:
            # EPSILON = 1 - 1. / ((i_episode / 500) + 10)
            succeed_episode += 1
        if i_episode % 1000 == 1:
            print('EP: {:d} succeed rate {:.4f}'.format(
                i_episode, succeed_episode / 1000))
            succeed_episode = 0
        if i_episode % 5000 == 1:
            plot_durations()
def find_good_maps(map_p=0.8):
    sizes = MAP_SIZES
    # sizes = [4, 8]
    seeds = range(20)
    best_maps = {}
    for size in sizes:
        smallest_lost_games_perc = float('inf')
        best_map = None
        for seed in seeds:
            print(f'Finding best maps with size {size} (seed {seed})...')
            np.random.seed(seed)
            map = generate_random_map(size=size, p=map_p)
            env = FrozenLakeEnv(desc=map)
            optimal_policy, optimal_value_function = value_iteration(
                env, theta=0.0000001, discount_factor=0.999)
            optimal_policy_flat = np.where(optimal_policy == 1)[1]
            mean_number_of_steps, lost_games_perc = score_frozen_lake(
                env, optimal_policy_flat)
            if lost_games_perc < smallest_lost_games_perc:
                smallest_lost_games_perc = lost_games_perc
                best_map = map
        best_maps[size] = {
            'lost_games_perc': smallest_lost_games_perc,
            'map': best_map
        }
    with open(f'best_maps_{map_p}.json', "wb") as f:
        f.write(json.dumps(best_maps).encode("utf-8"))
    return best_maps
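# A minimal sketch of reading back the JSON file that find_good_maps() writes;
# the filename assumes the default map_p=0.8. Note that JSON serialization
# turns the integer size keys into strings.
import json

with open('best_maps_0.8.json', 'rb') as f:
    best_maps = json.loads(f.read().decode('utf-8'))
for size, entry in best_maps.items():
    print(size, entry['lost_games_perc'])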
def run():
    env = FrozenLakeEnv(desc=MAP_20x20)
    rtdp = RTDP(env)
    tot_rewards = 0.
    eval_iter = int(1e4)
    for i in range(eval_iter):
        if i % 100 == 0:
            print(i)
        tot_rewards += evaluate(rtdp, env)
    print(tot_rewards / eval_iter)
def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    """
    Loads a premade env from OpenAI Gym.

    desc: either None or a list of lists giving a custom description of the map
    map_name: either None or a string containing the name of a premade map
        (if both desc and map_name are None, a map will be loaded randomly)
    is_slippery: bool for whether the ice is slippery

    returns: the env
    """
    env = FrozenLakeEnv(desc, map_name, is_slippery)
    return env
def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    """
    Loads the pre-made FrozenLakeEnv environment from OpenAI's gym.

    Args:
        desc: (list/None) list of lists containing a custom description of
            the map to load for the environment.
        map_name: (str/None) string containing the pre-made map to load.
        is_slippery: (bool) boolean to determine if the ice is slippery.

    Returns:
        The environment.
    """
    return FrozenLakeEnv(desc, map_name, is_slippery)
def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    """
    Loads a premade FrozenLakeEnv environment from OpenAI's gym.

    desc: None, or a list of lists containing a custom description of the
        map to load for the environment.
    map_name: None, or a string containing the pre-made map to load.
    is_slippery: boolean to determine if the ice is slippery.

    Returns:
        The environment.
    """
    env = FrozenLakeEnv(desc, map_name, is_slippery)
    return env
def test_expected(self):
    env = FrozenLakeEnv(is_slippery=False)
    policy = UserInputPolicy(env)
    s = env.reset()
    env.render()
    for i in [RIGHT, RIGHT, DOWN, DOWN, DOWN, RIGHT]:
        with MockInputFunction(return_value=i):
            a = policy(s)
        s, r, done, info = env.step(a)
        env.render()
        if done:
            break
def main():
    from gym.envs.toy_text.frozen_lake import FrozenLakeEnv
    env = FrozenLakeEnv(is_slippery=False, map_name=f'{SIZE[0]}x{SIZE[1]}')
    env.actions = [0, 1, 2, 3]
    # Q = np.zeros((SIZE[0]*SIZE[1], len(env.actions)))
    Q = np.random.uniform(low=0, high=1,
                          size=(SIZE[0] * SIZE[1], len(env.actions)))
    print(obtain_value_function(Q))
    episodes = 10000
    eps = .3
    Qs, episode_lengths = [], []
    """for n in range(1, 15):
        plt.figure()
        for alpha in np.arange(.1, 1., .1):
            Q_learned, episode_length = n_step_sarsa(
                Q.copy(), n, episodes, env, alpha, 0.8, eps=eps)
            Qs.append(Q_learned)
            episode_lengths.append(episode_length)
            plt.plot(smooth(episode_length),
                     label=f'$n=${n}, $\\alpha=${alpha:.1f}')
        plt.legend()"""
    n = 4
    Q_learned, episode_lengths_qlearning = n_step_sarsa(
        Q.copy(), n, episodes, env, 0.1, 0.8, eps=eps)
    plt.legend()
    value = obtain_value_function(Q_learned)
    policy = obtain_policy(Q_learned)
    pretty_print(value, policy, f'${n}$-step Sarsa')
    plt.show()
def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    '''Loads the pre-made FrozenLakeEnv environment from OpenAI's gym.

    Args:
        desc: either None or a list of lists containing a custom description
            of the map to load for the environment
        map_name: either None or a string containing the pre-made map to load
            Note: if both desc and map_name are None, the environment will
            load a randomly generated 8x8 map
        is_slippery: a boolean to determine if the ice is slippery

    Returns:
        the environment
    '''
    if desc is None and map_name is None:
        map_name = "8x8"
    env = FrozenLakeEnv(desc, map_name, is_slippery)
    return env
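# A hedged usage sketch for the load_frozen_lake helper above; the custom map
# is an illustrative 2x2 grid, not taken from the original source.
env = load_frozen_lake(desc=['SF', 'FG'], is_slippery=True)
print(env.observation_space.n, env.action_space.n)  # 4 states, 4 actions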
def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    """Loads the pre-made FrozenLakeEnv environment from OpenAI's gym.

    Args:
        desc (list): either None or a list of lists containing a custom
            description of the map to load for the environment.
        map_name (str): either None or a string containing the pre-made
            map to load.
        is_slippery (bool): determines if the ice is slippery.

    Returns:
        the environment.
    """
    env = FrozenLakeEnv(desc, map_name, is_slippery)
    return env
import numpy as np
import gym
import random
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

# env = gym.make("FrozenLake-v0", is_slippery=True)
env = FrozenLakeEnv(desc=None, map_name="4x4", is_slippery=False)

action_size = env.action_space.n
state_size = env.observation_space.n
qtable = np.zeros((state_size, action_size))
print(qtable)

total_episodes = 10000  # Total episodes
learning_rate = 0.8     # Learning rate
max_steps = 99          # Max steps per episode
gamma = 0.95            # Discounting rate

# Exploration parameters
epsilon = 1.0           # Exploration rate
max_epsilon = 1.0       # Exploration probability at start
min_epsilon = 0.2       # Minimum exploration probability
decay_rate = 0.01       # Exponential decay rate for exploration prob

# List of rewards
rewards = []

# 2 For life or until learning is stopped
for episode in range(total_episodes):
    # Reset the environment
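# The episode loop is truncated above; a common way these exploration
# parameters are combined (an assumption about the missing code, following
# the usual pattern) is to decay epsilon exponentially toward min_epsilon
# at the end of each episode:
epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)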
import gym
import random
import numpy as np
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

# Build a random 4x4 map: start from an all-frozen lake and punch two holes
# into interior cells (indices 1-14), keeping S and G intact
char_list = list('SFFFFFFFFFFFFFFG')
for i in range(2):
    char_list[random.randint(1, 14)] = 'H'
my_map = [''.join(char_list[i:i + 4]) for i in [0, 4, 8, 12]]

env = FrozenLakeEnv(desc=np.asarray(my_map, dtype='c'), is_slippery=False)
env = env.unwrapped

for i in range(10):
    b = env.render()
    a = env.step(1)
    print(a)
#
# train_q_agent.py
# Training Q-learning agent in OpenAI Gym's FrozenLake env
#

import random
import gym
from q_agent import QAgent
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

# How long do we play
NUM_EPISODES = 500
# How often we print results
PRINT_EVERY_EPS = 100

environment = FrozenLakeEnv(is_slippery=False)
num_states = environment.observation_space.n
num_actions = environment.action_space.n
agent = QAgent(num_states, num_actions)

sum_reward = 0
for episode in range(NUM_EPISODES):
    done = False
    last_state = environment.reset()
    last_reward = None
    # Number of steps taken. A bit of a safeguard...
    num_steps = 0
    while not done:
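# The while-loop body is truncated above; the core tabular Q-learning update
# the agent would apply each step is sketched below (an assumption, not
# necessarily q_agent.QAgent's actual implementation):
def q_learning_update(Q, s, a, r, s_next, lr=0.1, gamma=0.99):
    # Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
    Q[s, a] += lr * (r + gamma * Q[s_next].max() - Q[s, a])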
#!/usr/bin/env python3
#
# learn_random_v.py
# Learning value function of random agent in FrozenLakeEnv
#

import gym
from v_table import VTable
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

# How long do we play
NUM_EPISODES = 10000
# How often we show current V-estimate
SHOW_EVERY_EPISODES = 100

environment = FrozenLakeEnv(is_slippery=False)
num_states = environment.observation_space.n

# Create a tabular record of values
vtable = VTable(num_states)

for episode in range(NUM_EPISODES):
    done = False
    state = environment.reset()
    # Keep track of visited states and rewards obtained
    states = []
    rewards = []
    while not done:
        # Store state
        states.append(state)
    char_list = list('SFFFFFFFFFFFFFFG')
    for i in range(holes_num):
        char_list[random.randint(1, 14)] = 'H'
    my_map = [''.join(char_list[i:i + 4]) for i in [0, 4, 8, 12]]
    return my_map


def encode_state(map, position):
    np_map = np.asarray(map, dtype='c').reshape(16)
    holes = np.where(np_map == b'H', 1, 0)
    frozen = np.where(np_map == b'F', 1, 0)
    position = np.identity(16)[position]
    return np.hstack([holes, frozen, position])


env = FrozenLakeEnv(desc=random_map(HOLE_NUM), is_slippery=IS_SLIPPERY)
env = env.unwrapped

N_ACTIONS = env.action_space.n
N_STATES = 16 + 16 + 16  # 'F', 'H', 'where is the Agent'
ENV_A_SHAPE = 0 if isinstance(
    env.action_space.sample(),
    int) else env.action_space.sample().shape  # to confirm the shape

episode_durations = []


def plot_durations():
    plt.figure(1)
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    plt.title('Training...')
    plt.xlabel('Episode(1000)')
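# A quick sanity check of encode_state above (a sketch; assumes random_map
# takes the number of holes, as its call site suggests): the encoded state is
# a 48-dimensional binary vector, i.e. a 16-entry hole mask, a 16-entry
# frozen mask, and a 16-entry one-hot agent position.
example_state = encode_state(random_map(2), 0)
print(example_state.shape)  # (48,)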
import numpy as np
import keras_gym as km
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT

if tf.__version__ >= '2.0':
    tf.compat.v1.disable_eager_execution()  # otherwise incredibly slow

# the MDP
actions = {LEFT: 'L', RIGHT: 'R', UP: 'U', DOWN: 'D'}
env = FrozenLakeEnv(is_slippery=False)
env = km.wrappers.TrainMonitor(env)

# show logs from TrainMonitor
km.enable_logging()


class LinearFunc(km.FunctionApproximator):
    """ linear function approximator (body only does one-hot encoding) """
    def body(self, S):
        one_hot_encoding = keras.layers.Lambda(lambda x: K.one_hot(x, 16))
        return one_hot_encoding(S)


# define function approximators
func = LinearFunc(env, lr=0.01)
pi = km.SoftmaxPolicy(func, update_strategy='vanilla')
cache = km.caching.MonteCarloCache(env, gamma=0.99)
            # env.render()
            action = agent.select_action(state, selection_method)
            next_state, reward, done, _ = env.step(action)
            score += reward
            agent.update(state, action, reward, next_state, done, update_method)
            state = next_state
            if done:
                break
        scores[episode] = score
        print('Episode %s \t Return %s' % (episode, score))
    return scores


env = FrozenLakeEnv(is_slippery=False)
n_episodes = 1000
horizon = 20

scores_vpi = simulate(env, n_episodes, horizon,
                      BQLearning.MYOPIC_VPI, BQLearning.MOMENT_UPDATING)
scores_qvs = simulate(env, n_episodes, horizon,
                      BQLearning.Q_VALUE_SAMPLING, BQLearning.MOMENT_UPDATING)

plt.style.use('ggplot')
fig, ax = plt.subplots()
ax.plot(cummean(scores_vpi), label='Myopic VPI')
ax.plot(cummean(scores_qvs), label='QValue sampling')
ax.set_xlabel('episode')
ax.set_ylabel('cumulative average return')
def V(s):
    return (1 - epsilon) * np.max(Q[s, :]) + epsilon * np.mean(Q[s, :])


"""
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3
"""

# 8x8 environment (slippery by default, i.e. stochastic;
# the deterministic variant is commented out)
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv
# env = FrozenLakeEnv(is_slippery=False)
env = FrozenLakeEnv(map_name="8x8")
# env = gym.make("FrozenLake-v0")

num_states = env.observation_space.n
num_actions = env.action_space.n

Q = np.zeros([num_states, num_actions])
# Q = np.random.randn(num_states, num_actions)

num_episodes = 100000
rewardvector = []
gamma = 0.7
alpha = 0.4
epsilon = 0.5
# print(Q.shape)
import numpy as np
import numpy.random as rd
import matplotlib.pyplot as plt
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

# Algorithm parameters
learning_rate = 0.5
gamma = 1.
epsilon = .01
render = False
N_trial = 1000
N_trial_test = 100
trial_duration = 100

# Generate the environment
env = FrozenLakeEnv(map_name='4x4', is_slippery=False)
n_state = env.observation_space.n
n_action = env.action_space.n

# Initialize the Q values
Q_table = np.zeros((n_state, n_action))


def policy(Q_table, state, epsilon):
    '''
    Implementation of the epsilon-greedy policy.

    :param Q_table: Table containing the expected return for each action and state pair
    :param state: the current state
    :return:
    '''
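# The body of policy() is truncated above; a standard epsilon-greedy sketch
# matching the signature (an assumption, not the original author's code):
def epsilon_greedy(Q_table, state, epsilon):
    # Explore uniformly with probability epsilon, otherwise act greedily.
    if rd.rand() < epsilon:
        return rd.randint(Q_table.shape[1])
    return int(np.argmax(Q_table[state, :]))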
def create_env(lake_map):
    return FrozenLakeEnv(desc=lake_map)
def V(s):
    return (1 - epsilon) * np.max(Q[s, :]) + epsilon * np.mean(Q[s, :])


"""
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3
"""

# Deterministic environment
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv
env = FrozenLakeEnv(is_slippery=False)
# env = FrozenLakeEnv(map_name="8x8")
# env = gym.make("FrozenLake-v0")

num_states = env.observation_space.n
num_actions = env.action_space.n

Q = np.zeros([num_states, num_actions])
# Q = np.random.randn(num_states, num_actions)

num_episodes = 100
rewardvector = []
gamma = 0.5
alpha = 0.8
epsilon = 0.5
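# V(s) above is the expected value of state s under an epsilon-greedy policy:
# the greedy action (the max over Q[s, :]) is taken with probability
# 1 - epsilon, and a uniformly random action (the mean over Q[s, :]) with
# probability epsilon. A tiny worked check with illustrative numbers
# (not from the original source):
#   Q[s, :] = [0.0, 1.0], epsilon = 0.5
#   V(s) = 0.5 * max + 0.5 * mean = 0.5 * 1.0 + 0.5 * 0.5 = 0.75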
def __init__(self):
    super().__init__()
    self.env = FrozenLakeEnv(map_name="4x4", is_slippery=True)
def load_frozen_lake(desc=None, map_name=None, is_slippery=False):
    """Loads a pre-made FrozenLakeEnv environment"""
    env = FrozenLakeEnv(desc, map_name, is_slippery)
    return env