Example #1
class Worker(object):
    def __init__(self, name):
        self.env = SpyndraEnv(N_S, N_A)    # environment with N_S state dims and N_A action dims
        self.name = name
        self.agent = RandomSearch(N_A)     # random-search agent over action patterns

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP

        while GLOBAL_EP < MAX_GLOBAL_EP:
            rewards = []
            for idx_pattern in range(self.agent.n_patterns):

                s = self.env._reset()
                ep_r, dist_traveled = 0, -999.
                for step in range(MAX_STEP):

                    a = self.agent.choose_action(idx_pattern,
                                                 step % self.agent.n_gates)

                    s_, r, done, info = self.env._step(a, s)
                    #done = True if ep_t == MAX_EP_STEP - 1 else False
                    if self.name == 'W_0':
                        print("Ep %4i, %2i th agent, step %4i" %
                              (GLOBAL_EP, idx_pattern, step))
                        print("distance to goal=", info, "reward=", r)
                        print("position before action=", list(s[:8]))
                        print("action=", list(a))
                        print("position after  action=", list(s_[:8]))

                    dist_traveled = 10. - info
                    print("dist_traveled", dist_traveled)

                    if done:
                        print("We get our best model!!!")
                        np.save('best_pattern.npy',
                                self.agent.patterns[idx_pattern])
                        break

                    s = s_
                rewards.append(dist_traveled)

            self.agent.update(rewards)
            GLOBAL_EP += 1
            print("Best pattern traveled for", max(rewards))
            with open('ep_reward.txt', 'a') as f:
                f.write('ep=%i, distance traveled=%f\n' %
                        (GLOBAL_EP, np.mean(sorted(rewards)[-5:])))
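
The RandomSearch agent driven by Example #1 is not shown on this page. The sketch below is a minimal, assumed version of the interface the worker relies on (n_patterns, n_gates, patterns, choose_action, update); the attribute names mirror the calls above, but the resampling rule in update is an illustrative guess, not the project's actual implementation.

import numpy as np

class RandomSearch(object):
    """Minimal sketch of the interface Worker uses in Example #1.

    Holds a population of action patterns, each indexed by gait phase
    ("gate"), and resamples the population around the best-scoring
    pattern after every episode. The resampling rule is assumed.
    """
    def __init__(self, n_actions, n_patterns=10, n_gates=4, noise_std=0.1):
        self.n_patterns = n_patterns
        self.n_gates = n_gates
        self.noise_std = noise_std
        # patterns[i, g] is the action vector pattern i takes at phase g
        self.patterns = np.random.uniform(
            -1., 1., size=(n_patterns, n_gates, n_actions))

    def choose_action(self, idx_pattern, gate):
        # Deterministic lookup: a pattern fully defines its action sequence.
        return self.patterns[idx_pattern, gate]

    def update(self, rewards):
        # Keep the best pattern and resample the others around it (assumed rule).
        best = self.patterns[int(np.argmax(rewards))]
        noise = np.random.normal(0., self.noise_std, size=self.patterns.shape)
        self.patterns = best[np.newaxis] + noise
        self.patterns[0] = best  # preserve the current best unchanged

With an interface like this, Worker.work runs every pattern for MAX_STEP steps, scores each by distance traveled, and passes the scores to update(rewards); under the assumed rule above, the next generation of patterns is resampled around the best gait found so far.
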
Example #2
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = SpyndraEnv(N_S, N_A)    # environment with N_S state dims and N_A action dims
        self.name = name
        self.AC = ACNet(name, globalAC)    # local actor-critic net synced with the global net

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        # state, action, reward buffer
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env._reset()
            ep_r, dist_traveled = 0, -999.
            for ep_t in range(MAX_EP_STEP):
                # if self.name == 'W_0':
                #     self.env.render()
                a = self.AC.choose_action(s)
                s_, r, done, info = self.env._step(a, s)
                #done = True if ep_t == MAX_EP_STEP - 1 else False
                if self.name == 'W_0':
                    print("Ep %4i, step %4i" % (GLOBAL_EP, ep_t))
                    print("distance to goal=", info, "reward=", r)
                    print("position before action=", list(s[:8]))
                    print("action=", list((np.array(a)*10).round()))
                    print("position after  action=", list(s_[:8]))
                
                ep_r += r
                # normalize state before sending into A3C
                s = np.array(s)
                s_[14:28] = normalize(s_[14:28])
                s[:14] = normalize(s[:14])
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r/10.) 
                dist_traveled = 10. - info
                print("dist_traveled", dist_traveled)
                if total_step % UPDATE_GLOBAL_ITER == 0 or done:   # update global and assign to local net
                    if done:
                        v_s_ = 0   # terminal state: bootstrap value is 0 (per the A3C pseudocode)
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:    # reverse buffer r 
                        v_s_ = r + GAMMA * v_s_ # reward discount : R <-- ri + GAMMA * R (reverse order)
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s = np.vstack(buffer_s)
                    buffer_a = np.vstack(buffer_a)
                    buffer_v_target = np.vstack(buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    a_l, c_l = self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()
                    with open('loss.txt', 'a') as f:
                        f.write("A loss="+ str(a_l) + ", C loss=" + str(c_l) + "\n")
                    print("A loss = ", a_l, "C loss = ", c_l)
                s = s_
                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:", GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                          )
                    break
            GLOBAL_EP += 1
            with open('ep_reward.txt', 'a') as f:
                f.write('ep=%i, reward=%d, distance traveled=%f\n' % (GLOBAL_EP, ep_r, dist_traveled))
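
The block guarded by total_step % UPDATE_GLOBAL_ITER == 0 or done computes the standard A3C n-step value target: the critic's estimate of the last state (or 0 at a terminal state) is propagated backwards through the reward buffer with R <- r + GAMMA * R. Pulled out of the worker, that computation looks like the hypothetical helper below (the function name and the worked numbers are illustrative, not part of the original code).

import numpy as np

def n_step_value_targets(rewards, bootstrap_value, gamma):
    """Discounted value targets as computed inside Worker.work.

    Walks the buffered rewards in reverse, accumulating
    R <- r + gamma * R, seeded with 0 at a terminal state or with the
    critic's estimate V(s') otherwise, then restores original order.
    """
    targets = []
    running_return = bootstrap_value
    for r in reversed(rewards):
        running_return = r + gamma * running_return
        targets.append(running_return)
    targets.reverse()
    return np.vstack(targets)          # column vector, matching buffer_v_target

# Worked example: three buffered rewards, bootstrap V(s') = 2.0, gamma = 0.9
print(n_step_value_targets([1.0, 0.0, 1.0], 2.0, 0.9))
# [[3.268]   = 1.0 + 0.9 * (0.0 + 0.9 * (1.0 + 0.9 * 2.0))
#  [2.52 ]   = 0.0 + 0.9 * (1.0 + 0.9 * 2.0)
#  [2.8  ]]  = 1.0 + 0.9 * 2.0

These targets, together with the buffered states and actions, form the feed_dict passed to update_global; the gradients computed locally are applied to the global network, and the updated weights are then copied back with pull_global.
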