Example No. 1
def execute():
    """ Execute Learning Algorithm """

    global eligibility, q_old

    assert initiated, "TOSL not initiated! setup() must be previously called"

    s = lp.s
    a = lp.a
    alpha = lp.alpha
    q = lp.q
    v = lp.v
    policy = lp.policy

    # Specific Learning Algorithm
    agent.execute_action(a)
    time.sleep(lp.step_time)
    # robot.stop()
    # time.sleep(TASK.STEP_TIME/2)
    sp = agent.observe_state()
    r = agent.obtain_reward(s, a, sp)

    ap = agent.select_action(sp)  # Exploration strategy

    diff_q = q[s, a] - q_old
    q_old = q[sp, ap]

    delta = r + exp.GAMMA * q[sp, ap] - q[s, a]  # TD error

    eligibility[s, a] = (1.0 - alpha) * eligibility[s, a] + 1

    if eli_queue.count(s) > 0:
        eli_queue.remove(s)
    assert eli_queue.count(s) == 0, ("duplicated states found in ET ", str(s))
    eli_queue.appendleft(s)

    for i in eli_queue:  # not all states are updated, just those in eli_queue
        # replace eli_queue by range(task.n_states) for non-reduced ET
        for j in range(task.n_actions):
            if eligibility[i, j] > 0.01:
                q[i, j] = q[i, j] + alpha * (delta + diff_q) * eligibility[i, j]
                eligibility[i, j] *= exp.GAMMA * exp.LAMBDA
            else:
                eligibility[i, j] = 0

            if i == s and j == a:
                q[i, j] = q[i, j] - alpha * diff_q

        # update v and policy
        v[i] = np.max(q[i])
        policy[i] = np.argmax(q[i])

    lp.s, lp.a = s, a
    lp.sp, lp.ap = sp, ap
    lp.r = r
    lp.alpha = alpha
    lp.q = q
    lp.v = v
    lp.policy = policy
    lp.delta = delta
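
Examples 1 and 4 assume module-level state (`eligibility`, `q_old`, `eli_queue`, `initiated`) prepared by a `setup()` call that is not part of the snippets. The sketch below shows a minimal initialization consistent with how those names are used above; the `trace_length` value is a placeholder and `task` is assumed to be importable from the same package.

import collections
import numpy as np

import task  # assumed module providing n_states / n_actions

eligibility = None
q_old = 0.0
eli_queue = None
initiated = False

def setup(trace_length=40):
    """Minimal setup sketch: zeroed trace matrix plus a bounded deque of
    recently visited states (only these states get their traces updated)."""
    global eligibility, q_old, eli_queue, initiated
    eligibility = np.zeros((task.n_states, task.n_actions))
    q_old = 0.0
    eli_queue = collections.deque(maxlen=trace_length)
    initiated = True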
Example No. 2
def execute():
    """Execute the learning algorithm"""
    s = lp.s
    a = lp.a
    alpha = lp.alpha
    q = lp.q
    v = lp.v
    policy = lp.policy

    agent.execute_action(a)
    time.sleep(lp.step_time)

    sp = agent.observe_state()
    r = agent.obtain_reward(s, a, sp)

    ap = agent.select_action(sp)

    # update Q
    delta = r + exp.GAMMA * q[sp, ap] - q[s, a]  # TD error (SARSA)
    q[s, a] = q[s, a] + alpha * delta  # update rule

    # Update V and Policy
    v[s] = np.max(q[s])
    policy[s] = np.argmax(q[s])

    lp.s = s
    lp.a = a
    lp.sp = sp
    lp.ap = ap
    lp.r = r
    lp.alpha = alpha
    lp.q = q
    lp.v = v
    lp.policy = policy
    lp.delta = delta
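
Example 2 is plain one-step SARSA. It stores both (s, a) and (sp, ap) back into `lp`, so the driver loop around `execute()` is expected to shift the successor pair into the current one before the next step. That loop is not shown; a minimal driver consistent with the fields used above might look like this (the `run_episode` name and step count are assumptions):

def run_episode(n_steps=1000):
    """Assumed driver loop: initialize (s, a), then repeatedly step and
    advance the stored state/action pair."""
    lp.s = agent.observe_state()
    lp.a = agent.select_action(lp.s)
    for _ in range(n_steps):
        execute()
        lp.s, lp.a = lp.sp, lp.ap  # next pair becomes the current pair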
Example No. 3
def execute():
    """Execute one step of the learning algorithm"""
    s = lp.s
    q = lp.q
    v = lp.v
    policy = lp.policy
    alpha = lp.alpha
    gamma = lp.gamma
    a = ap = agent.actionselection(s)  # same action used to act and to bootstrap
    agent.execute_action(a)
    time.sleep(lp.step_time)
    sp = agent.observestate()
    r = agent.get_reward()

    q[s, a] = q[s, a] + alpha * (r + gamma * q[sp, ap] - q[s, a])  # TD update

    v[s] = np.max(q[s])
    policy[s] = np.argmax(q[s])
    lp.s = s
    lp.a = a
    lp.sp = sp
    lp.ap = ap
    lp.r = r
    lp.q = q
    lp.v = v
    lp.policy = policy

    return
Example No. 4
def execute():
    """ Execute the learning algorithm """

    global eligibility

    assert initiated, "SL not initiated! setup() must be previously called"

    s = lp.s
    a = lp.a
    alpha = lp.alpha
    q = lp.q
    v = lp.v
    policy = lp.policy

    # Specific Learning Algorithm
    agent.execute_action(a)
    time.sleep(lp.step_time)
    sp = agent.observe_state()
    r = agent.obtain_reward(s, a, sp)

    ap = agent.select_action(sp)  # Exploration strategy

    delta = r + exp.GAMMA * q[sp, ap] - q[s, a]  # TD error

    eligibility[s, a] = 1.0  # replace trace

    if eli_queue.count(s) > 0:
        eli_queue.remove(s)
    assert eli_queue.count(s) == 0, ("duplicated states found in ET: ", str(s))
    eli_queue.appendleft(s)

    # only the states in eli_queue are updated:
    for i in eli_queue:  # not all states are updated, just those in eli_queue
        # replace eli_queue by range(task.n_states) for non-reduced ET
        for j in range(task.n_actions):
            if eligibility[i, j] > 0.01:
                q[i, j] = q[i, j] + alpha * delta * eligibility[i, j]
                eligibility[i, j] *= exp.GAMMA * exp.LAMBDA
            else:
                eligibility[i, j] = 0

        # update v and policy
        v[i] = np.max(q[i])
        policy[i] = np.argmax(q[i])

    lp.s = s
    lp.a = a
    lp.sp = sp
    lp.ap = ap
    lp.r = r
    lp.alpha = alpha
    lp.q = q
    lp.v = v
    lp.policy = policy
    lp.delta = delta
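
Example 4 is SARSA(lambda) with a replacing trace (`eligibility[s, a] = 1.0`), while Example 1 uses a dutch-style trace together with the `q_old` correction of true online SARSA(lambda). For reference, the three textbook trace updates for the visited pair are sketched below; this is standard material, not code from the repository.

import numpy as np

def trace_update(eligibility, s, a, alpha, kind="replacing"):
    """Textbook eligibility-trace updates for the just-visited pair (s, a)."""
    if kind == "accumulating":
        eligibility[s, a] += 1.0
    elif kind == "replacing":   # as in Example 4
        eligibility[s, a] = 1.0
    elif kind == "dutch":       # as in Example 1
        eligibility[s, a] = (1.0 - alpha) * eligibility[s, a] + 1.0
    return eligibility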
Example No. 5
def run_DQN():
    import agent
    import time
    import exp
    import numpy as np  # used below but not imported in the original snippet
    import rl_vrep

    rl_vrep.connect()
    rl_vrep.start()
    time.sleep(0.5)
    agent.setup_task()
    time.sleep(0.5)
    agent.setup()
    time.sleep(0.5)
    step = 0
    for epi in range(30000):
        print('step:', step)
        observation = np.empty(shape=(10, 10))
        observation_ = np.empty(shape=(10, 10))
        for i in range(10):
            observation_i = agent.observestate()
            observation_i = agent.unwrap_state(observation_i)
            observation[i] = np.array(observation_i)
        observation = observation[np.newaxis, :]
        observation = Robot.sess.run(Robot.rnn_out,
                                     feed_dict={Robot.rnn_in: observation})

        #observation = np.mean(observation,axis= 0)
        action = Robot.choose_action(observation)

        agent.execute_action(action)
        print('action:', action)
        #time.sleep(0.5)

        r = agent.get_reward()
        print('reward:', r)
        for i in range(10):
            observation_i = agent.observestate()
            observation_i = agent.unwrap_state(observation_i)
            observation_[i] = np.array(observation_i)
        #observation_ = np.mean(observation_,axis= 0)
        #.........
        observation_ = observation_[np.newaxis, :]
        observation_ = Robot.sess.run(Robot.rnn_out,
                                      feed_dict={Robot.rnn_in: observation_})
        Robot.store(observation, action, r, observation_)

        if (step > 200) and (step % 10 == 0):
            Robot.learn()

        observation = observation_

        step += 1
    print('run over!')
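
run_DQN depends on a `Robot` object (with `sess`, `rnn_in`, `rnn_out`, `choose_action`, `store`, `learn`) that is defined elsewhere in the repository. The stub below only documents the interface this snippet appears to assume; the names and shapes are inferred from the calls above, not taken from the actual implementation.

class RobotInterface:
    """Interface assumed by run_DQN (inferred, not the real implementation)."""

    sess = None      # TensorFlow session running the recurrent encoder
    rnn_in = None    # placeholder fed with a (1, 10, 10) stacked observation
    rnn_out = None   # tensor yielding the encoded observation

    def choose_action(self, observation):
        """Return a discrete action index for the encoded observation."""
        raise NotImplementedError

    def store(self, observation, action, reward, next_observation):
        """Append one transition to the replay memory."""
        raise NotImplementedError

    def learn(self):
        """Sample a minibatch from the replay memory and update the network."""
        raise NotImplementedError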
Example No. 6
    def work(self, sess, coord):
        print('starting %s...\n' % self.name)
        while not coord.should_stop():
            sess.run(self.update_local_op)
            memory = []
            s = agent.observestate()
            s = agent.unwrap_state(s)
            while True:
                p, v = sess.run([self.local_net.policy, self.local_net.value],
                                feed_dict={self.local_net.s: s[np.newaxis, :]})
                a = np.random.choice(range(N_A), p=p[0])

                agent.execute_action(a)
                print('action:', a)
                time.sleep(0.3)

                r = agent.get_reward()
                s1 = agent.observestate()
                s1 = agent.unwrap_state(s1)

                memory.append([s, a, r, s1, v[0][0]])
                s = s1
                global_steps = next(self.global_steps_counter)
                self.local_steps += 1
                #sess.run(self.anneal_lr_op)

                # np.where(s1 < 0) returns a (non-empty) tuple, so the original
                # truth test was always True; check the values directly instead.
                collide = bool(np.any(s1 < 0))

                if not collide and len(memory) == MEMORY_SIZE:
                    v1 = sess.run(self.local_net.value,
                                  feed_dict={self.local_net.s: s})
                    self.train(memory, sess, v1[0][0], global_steps)
                    memory = []
                    sess.run(self.update_local_op)
                if collide:
                    break

            if len(memory) != 0:
                self.train(memory, sess, 0.0, global_steps)

            if global_steps >= MAX_STEP:
                COORD.request_stop()
            """  
Example No. 7
rl_vrep.connect()
rl_vrep.start()
time.sleep(0.5)
agent.setup_task()
time.sleep(0.5)
agent.setup()
#robot.setup(["MOBILE_BASE", "DISTANCE_SENSOR"],[])
time.sleep(0.5)
policy = np.zeros((16384, 1), dtype=int)  # np.int is removed in recent NumPy

with open("policy_results.txt", 'r') as f:
    for line in f:
        line = line.strip()
        if len(line) == 0: continue
        results = line.split(":")
        policy[int(results[0])] = int(results[1])

learning_process.policy = policy
print('policy:')
print(learning_process.policy)
i = 0

while True:
    s = agent.observestate()
    print('%d-s:' % (i), s)
    a = learning_process.policy[s]
    print('%d-a:' % (i), a)
    agent.execute_action(a)
    time.sleep(0.5)
    i += 1
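
Example 7 loads a greedy policy from a plain-text file with one `state:action` pair per line. The writer side is not shown; a counterpart consistent with that format could be (the function name is an assumption):

def save_policy(policy, filename="policy_results.txt"):
    """Write one 'state:action' line per state, matching the reader above."""
    with open(filename, "w") as f:
        for state, action in enumerate(policy.flatten()):
            f.write("%d:%d\n" % (state, int(action)))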