Example No. 1
def evaluate(goals, EQ):
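    # Roll out the policy extracted from EQ for at most 100 steps and return the accumulated reward.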
    env = GridWorld(goals=goals, T_states=T_states)
    policy = EQ_P(EQ)
    state = env.reset()
    done = False
    t = 0
    G = 0
    while not done and t < 100:
        action = policy[state]
        state_, reward, done, _ = env.step(action)
        state = state_
        G += reward
        t += 1
    return G
Example No. 2
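        # Excerpt from the Critic's learn() method: bootstrap the value of the next state,
        # then run one training step and return the resulting TD error.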
        next_value = self.sess.run(self.value, {self.state: next_state})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                    {self.state: state, self.next_value: next_value, self.reward: reward})
        return td_error


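# Assumes `import tensorflow as tf` (TF1.x graph mode) plus the Actor/Critic classes
# and the hyperparameters (n_features, n_actions, lr_actor, lr_critic) defined earlier.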
sess = tf.Session()

actor = Actor(sess, n_features=n_features, n_actions=n_actions, lr=lr_actor)
critic = Critic(sess, n_features=n_features, lr=lr_critic)

sess.run(tf.global_variables_initializer())

for i_episode in range(MAX_EPISODE):
    _, state = env.reset()
    step = 0
    track_r = []
    while True:

        action = actor.choose_action(state)
        _, next_state, reward, done = env.step(action)
        env.render()
        track_r.append(reward)

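        # The critic scores the transition (TD error); the actor uses it as the
        # advantage signal for its policy-gradient update.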
        td_error = critic.learn(state, reward, next_state)
        actor.learn(state, action, td_error)
        state = next_state
        step += 1

        if done or step >= MAX_EP_STEPS:
            break
Example No. 3
    #env = Env()
    grid_world = GridWorld()
    grid_world.set_obstacle_reward()
    #Functions.create_random_obstacles(grid_world, 0.05)
    Functions.create_fixed_obstacles(grid_world, 5)
    grid_world.create_grid_ui(grid_world.m, grid_world.n,
                              (grid_world.start_x, grid_world.start_y),
                              (grid_world.end_x, grid_world.end_y),
                              grid_world.obstacles)

    agent = SARSAgent(actions=list(range(grid_world.action_size)))
    number_of_episodes = 10
    for episode in range(number_of_episodes):
        # reset environment and initialize state

        state = grid_world.reset()
        # get action of state from agent
        action = agent.get_action(str(state))

        while True:
            grid_world.render()

            # take action and proceed one step in the environment
            next_state, reward, done = grid_world.step(action)
            next_action = agent.get_action(str(next_state))

            # with sample <s,a,r,s',a'>, agent learns new q function
            agent.learn(str(state), action, reward, str(next_state),
                        next_action)

            state = next_state
            action = next_action

            # if the episode has ended, start the next one
            if done:
                break
Example No. 4
__author__ = 'dot'

from GridWorld import GridWorld
from QAgent import QAgent
import numpy as np
import matplotlib.pyplot as plt

##
env = GridWorld(size=10)
q_agent = QAgent(env.get_number_of_states(), GridWorld.get_number_of_actions())
res = []
for idx_epoch in range(400):
    s, r, d, info = env.reset()
    print("Reset:st=%d,r=%f,d=%d,%s" % (s, r, d, str(info)))
    for t in range(100):
        # env.render()
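        # Pick an epsilon-greedy action from the Q-table, step the environment,
        # and update the Q-value estimate with the observed transition.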
        curAction = q_agent.get_action_epsilon_greedy(env.get_state())
        # print("State=%d,Action=%d"%(env.getState(),curAction))
        nxtSt, nxtR, done, info = env.step(curAction)
        # print("nxtSt=%d,nxtR=%f,d=%d,info=%s"%(nxtSt,nxtR,done,str(info)))
        q_agent.update(curAction, nxtSt, nxtR)
        if done:
            print("Episode %d finished after %d time steps" %
                  (idx_epoch, t + 1))
            #print(q_agent.show_q())
            print("=============")
            res.append(t + 1)
            break

print("p1")