def evaluate(goals, EQ):
    """Roll out the deterministic policy induced by ``EQ``.

    Builds a fresh ``GridWorld`` for the given goals (sharing the
    module-level ``T_states``), derives a policy table via ``EQ_P``,
    and follows it for at most 100 steps or until the episode ends.

    Returns the undiscounted sum of rewards collected along the way.
    """
    world = GridWorld(goals=goals, T_states=T_states)
    greedy = EQ_P(EQ)
    state = world.reset()
    total_return = 0
    # Cap rollouts at 100 steps so a non-terminating policy still returns.
    for _ in range(100):
        next_state, reward, finished, _info = world.step(greedy[state])
        total_return += reward
        state = next_state
        if finished:
            break
    return total_return
        # --- tail of a Critic.learn(state, reward, next_state) method ---
        # NOTE(review): the enclosing `def` lies outside this chunk; the
        # indentation here is reconstructed — confirm against the full file.
        # Bootstrap the value of the successor state, then run one combined
        # TD-error / train-op step and hand the TD error back to the caller.
        next_value = self.sess.run(self.value, {self.state: next_state})
        td_error, _ = self.sess.run(
            [self.td_error, self.train_op],
            {self.state: state, self.next_value: next_value, self.reward: reward})
        # The actor consumes this TD error as its advantage signal (see loop below).
        return td_error


# --- script: build the actor-critic pair and run the training loop ---
sess = tf.Session()
actor = Actor(sess, n_features=n_features, n_actions=n_actions, lr=lr_actor)
critic = Critic(sess, n_features=n_features, lr=lr_critic)
sess.run(tf.global_variables_initializer())

for i_episode in range(MAX_EPISODE):
    # env.reset() returns a pair here; only the second element is the
    # observation — presumably the first is auxiliary info. TODO confirm.
    _, state = env.reset()
    step = 0
    track_r = []  # per-episode reward trace
    while True:
        action = actor.choose_action(state)
        # Non-gym ordering: (unused, next_state, reward, done).
        _, next_state, reward, done = env.step(action)
        env.render()
        track_r.append(reward)
        # Critic evaluates the transition; its TD error drives the actor update.
        td_error = critic.learn(state, reward, next_state)
        actor.learn(state, action, td_error)
        state = next_state
        step += 1
        # Terminal state or step cap ends the episode; the branch body
        # continues beyond this chunk of the file.
        if done or step >= MAX_EP_STEPS:
#env = Env() grid_world = GridWorld() grid_world.set_obstacle_reward() #Functions.create_random_obstacles(grid_world, 0.05) Functions.create_fixed_obstacles(grid_world, 5) grid_world.create_grid_ui(grid_world.m, grid_world.n, (grid_world.start_x, grid_world.start_y), (grid_world.end_x, grid_world.end_y), grid_world.obstacles) agent = SARSAgent(actions=list(range(grid_world.action_size))) number_of_episodes = 10 for episode in range(number_of_episodes): # reset environment and initialize state state = grid_world.reset() # get action of state from agent action = agent.get_action(str(state)) while True: grid_world.render() # take action and proceed one step in the environment next_state, reward, done = grid_world.step(action) next_action = agent.get_action(str(next_state)) # with sample <s,a,r,s',a'>, agent learns new q function agent.learn(str(state), action, reward, str(next_state), next_action) state = next_state
__author__ = 'dot'

from GridWorld import GridWorld
from QAgent import QAgent
import numpy as np
import matplotlib.pyplot as plt

##
# Tabular Q-learning driver: 400 training episodes, each capped at
# 100 steps, against a 10x10 GridWorld. Episode lengths accumulate
# in `res` for later use.
env = GridWorld(size=10)
q_agent = QAgent(env.get_number_of_states(), GridWorld.get_number_of_actions())
res = []

for episode in range(400):
    start_state, start_reward, start_done, start_info = env.reset()
    print("Reset:st=%d,r=%f,d=%d,%s"
          % (start_state, start_reward, start_done, str(start_info)))
    for step in range(100):
        # env.render()
        # Epsilon-greedy action selection from the agent's Q-table.
        chosen = q_agent.get_action_epsilon_greedy(env.get_state())
        next_state, next_reward, finished, step_info = env.step(chosen)
        q_agent.update(chosen, next_state, next_reward)
        if finished:
            print("Episode %d finished after %d time steps" % (episode, step + 1))
            print("=============")
            res.append(step + 1)
            break

print("p1")