def main():
    """Run the episodic training loop on FrozenLake and report progress.

    Each episode resets the environment, repeatedly asks ``select_action``
    for an action until the environment reports ``done``, appends every
    step reward to ``model.rewards`` and finally calls ``finish_episode()``.

    Reporting:
      * when ``i_episode % 1000 == 1`` the number of successful episodes
        since the last report is printed (divided by 1000) and reset;
      * when ``i_episode % 5000 == 1`` ``plot_durations()`` is called.

    When ``use_random_map`` is truthy the environment is closed and rebuilt
    from ``random_map(HOLE_NUM)`` every 10 episodes.
    """
    new_map = ["SFFF", "FHFH", "FFFH", "HFFG"]
    env = FrozenLakeEnv(desc=new_map, is_slippery=IS_SLIPPERY)
    env = env.unwrapped

    succeed_episode = 0
    for i_episode in range(1000000):
        # Swap in a freshly generated map every 10 episodes when requested.
        if use_random_map and i_episode % 10 == 0:
            env.close()
            new_map = random_map(HOLE_NUM)
            env = FrozenLakeEnv(desc=new_map, is_slippery=IS_SLIPPERY)
            env = env.unwrapped

        pos = env.reset()
        state = encode_state(new_map, pos)
        ep_r = 0

        while True:
            a = select_action(state)
            pos_next, r, done, info = env.step(a)
            ep_r += r

            if args.render:
                env.render()

            model.rewards.append(r)
            if done:
                break

            # Fix: re-encode the state from the new position.  Previously
            # ``state`` was computed once per episode and never refreshed, so
            # every action in an episode was selected from the initial state.
            state = encode_state(new_map, pos_next)

        finish_episode()

        # NOTE: despite the list's name, the value stored is the episode's
        # summed reward, not its length.
        episode_durations.append(ep_r)

        if ep_r > 0:
            # EPSILON = 1 - 1. / ((i_episode / 500) + 10)
            succeed_episode += 1

        if i_episode % 1000 == 1:
            # NOTE(review): the first report (i_episode == 1) covers only two
            # episodes but is still divided by 1000.
            # Fix: '{:4f}' set a minimum width of 4 (a no-op at the default
            # six decimals); '{:.4f}' prints four decimals.
            print('EP: {:d} succeed rate {:.4f}'.format(
                i_episode, succeed_episode / 1000))
            succeed_episode = 0

        if i_episode % 5000 == 1:
            plot_durations()
def test_expected(self):
    """Drive a non-slippery FrozenLake with a scripted sequence of inputs.

    Each scripted key is fed to ``UserInputPolicy`` through
    ``MockInputFunction``; the resulting action is stepped through the
    environment, which is rendered after every step.  The walk stops early
    if the environment reports ``done``.  The test makes no explicit
    assertions; it passes as long as nothing raises.
    """
    env = FrozenLakeEnv(is_slippery=False)
    policy = UserInputPolicy(env)

    scripted_keys = [RIGHT, RIGHT, DOWN, DOWN, DOWN, RIGHT]

    state = env.reset()
    env.render()

    for key in scripted_keys:
        # Only the policy call runs under the mocked input function.
        with MockInputFunction(return_value=key):
            action = policy(state)

        state, reward, done, info = env.step(action)
        env.render()

        if done:
            break
Q_table[state, action]) Q_table[state, action] += learning_rate * delta reward_list = [] for k in range(N_trial + N_trial_test): acc_reward = 0 # Init the accumulated reward observation = env.reset() # Init the state action = policy(Q_table, observation, epsilon) # Init the first action for t in range(trial_duration): if render: env.render() new_observation, reward, done, info = env.step( action) # Take the action new_action = policy(Q_table, new_observation, epsilon) update_Q_table(Q_table=Q_table, state=observation, action=action, reward=reward, new_state=new_observation, new_action=new_action, is_done=done) observation = new_observation # Pass the new state to the next step action = new_action # Pass the new action to the next step acc_reward += reward # Accumulate the reward if done: break # Stop the trial when you fall in a hole or when you find the goal
import gym
import random

import numpy as np
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

# Flat 16-cell layout: 'S' first, 'G' last, 'F' everywhere in between.
char_list = list('SFFFFFFFFFFFFFFG')

# Overwrite two randomly drawn interior cells (indices 1..14) with holes.
# NOTE(review): the two draws are independent, so both can land on the same
# index and yield a single hole.
for i in range(2):
    char_list[random.randint(1, 14)] = 'H'

# Cut the flat layout into four rows of four characters each.
my_map = [''.join(char_list[i:i + 4]) for i in range(0, 16, 4)]

env = FrozenLakeEnv(
    desc=np.asarray(my_map, dtype='c'),
    is_slippery=False,
)
env = env.unwrapped

# Render, take action 1, and print whatever step() returns -- ten times.
# NOTE(review): no env.reset() call is visible in this snippet.
for i in range(10):
    b = env.render()
    a = env.step(1)
    print(a)
# 3. Pick an action for the current state with an epsilon-greedy rule.
## Draw a uniform number in [0, 1] to decide between exploring and exploiting.
exp_exp_tradeoff = random.uniform(0, 1)
# print(exp_exp_tradeoff,epsilon)

## Draw above epsilon --> exploit (highest Q-value for this state);
## otherwise --> explore (sample an action from the action space).
action = (
    np.argmax(qtable[state, :])
    if exp_exp_tradeoff > epsilon
    else env.action_space.sample()
)
# print("action",action)

# Apply the action and observe the next state, the reward and the done flag.
new_state, reward, done, info = env.step(action)

if reward > 0:
    print(episode, qtable)

# Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
# qtable[new_state, :] holds the values of every action from the new state.
td_target = reward + gamma * np.max(qtable[new_state, :])
qtable[state, action] = qtable[state, action] + learning_rate * (
    td_target - qtable[state, action])
# print("qvalue",qtable[state, action])

total_rewards += reward

# Our new state is state
agent = QAgent(num_states, num_actions) sum_reward = 0 for episode in range(NUM_EPISODES): done = False last_state = environment.reset() last_reward = None # Number of steps taken. A bit of a safeguard... num_steps = 0 while not done: # Epsilon-greedy policy action = agent.get_action(last_state, environment) state, reward, done, info = environment.step(action) # A crude timeout: If we play too long without # completing the level, kill the game num_steps += 1 if num_steps > 1000: print( "Episode timeout! Could not finish in 1000 steps. Check your actions!" ) done = True # Update Q-table if we have one whole experience of # s, a, r, s', t' if last_state is not None: agent.update( last_state,
return a averageepisodelength = [] for i in range(num_episodes): episodelength = 0 state = env.reset() totalreward = 0 rand = np.random.randn(1, env.action_space.n) #action = random.randint(0,num_actions-1) done = False action = epsilon_policy(state, Q, epsilon) #print(state,action) while not done: newstate, reward, done, q = env.step(action) #print(newstate, reward, done , q) newaction = epsilon_policy(newstate, Q, epsilon) #newaction = np.argmax(Q[newstate, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)) ) #print("A:",newaction) Q[state, action] = Q[state, action] + alpha * ( reward + gamma * Q[newstate, newaction] - Q[state, action]) totalreward += reward state = newstate action = newaction episodelength += 1 rewardvector.append(totalreward) averageepisodelength.append(episodelength) if i % 500 == 0 and i is not 0: print("Average episode length", np.mean(averageepisodelength))
# Cache that collects the transitions of one episode (constructed with
# gamma=0.99) and is drained after the episode terminates.
cache = km.caching.MonteCarloCache(env, gamma=0.99)

# static parameters
num_episodes = 250
num_steps = 30

# train
for ep in range(num_episodes):
    s = env.reset()
    cache.reset()

    for t in range(num_steps):
        a = pi(s)
        s_next, r, done, info = env.step(a)

        # small incentive to keep moving: a step that leaves the state
        # unchanged has its reward replaced by -0.1
        if np.array_equal(s_next, s):
            r = -0.1

        cache.add(s, a, r, done)

        if not done:
            s = s_next
            continue

        # Episode ended: drain the cache into the policy, then stop stepping.
        while cache:
            S, A, G = cache.pop()
            pi.batch_update(S, A, G)
        break
# Visualize the value table once per this many episodes.
SHOW_EVERY_EPISODES = 100

environment = FrozenLakeEnv(is_slippery=False)
num_states = environment.observation_space.n

# Tabular record of values, one entry per environment state.
vtable = VTable(num_states)

for episode in range(NUM_EPISODES):
    state = environment.reset()

    # Trajectory of this episode: the state each action was taken from and
    # the reward that action produced.
    states, rewards = [], []

    done = False
    while not done:
        states.append(state)

        # Act uniformly at random.
        random_action = environment.action_space.sample()
        state, reward, done, info = environment.step(random_action)

        rewards.append(reward)

    # Feed the finished episode to the value table.
    vtable.process_trajectory(states, rewards)

    episode_number = episode + 1
    if episode_number % SHOW_EVERY_EPISODES == 0:
        vtable.visualize_v((4, 4))