Example #1
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = Maze(Map)
        self.ppo = GLOBAL_PPO

    def work(self):
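        # Rollout loop: gather (state, action, reward) transitions with the current
        # policy; once GLOBAL_UPDATE_COUNTER reaches MIN_BATCH_SIZE (or the episode
        # ends), compute discounted returns and push the batch into QUEUE for the
        # global PPO update thread.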
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            t = 0
            while True:
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()  # wait until PPO is updated
                    buffer_s, buffer_a, buffer_r = [], [], []  # clear history buffer and collect data with the new policy
                a = self.ppo.choose_action(s)
                baseline_a = base.choose_action(self.env, 1)
                s_, r, done = self.env.step({(0, a), (1, U.ch(baseline_a))})
                r = r[0]
                buffer_s.append(s.flatten())
                buffer_a.append(a)
                buffer_r.append(r)  # store the raw reward (no normalization applied here)
                s = s_
                ep_r += r

                t += 1
                #print('step : %d, reward : %d, done : %d' % (t, r, done))

                GLOBAL_UPDATE_COUNTER += 1  # count toward the minimum batch size; no need to wait for other workers
                if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or done:
                    print(GLOBAL_EP)
                    if done:
                        v_s_ = 0  # terminal
                    else:
                        v_s_ = self.ppo.get_v(s_)
                    discounted_r = []  # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs = np.vstack(buffer_s)
                    ba = np.vstack(buffer_a)
                    br = np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or GLOBAL_EP >= EP_MAX:
                        print('update')
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # globalPPO update
                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break

                if done:
                    # record reward changes, plot later
                    if len(GLOBAL_RUNNING_R) == 0:
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 +
                                                ep_r * 0.1)
                    GLOBAL_EP += 1
                    print(
                        '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                        '|W%i' % self.wid,
                        '|Ep_r: %.2f' % ep_r,
                    )
                    break
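
The excerpt above shows only the worker side of the synchronisation: ROLLING_EVENT gates data collection, UPDATE_EVENT wakes the updater, and finished batches go through QUEUE. The matching update routine (GLOBAL_PPO.update) is not part of these excerpts; the following is a minimal sketch of how that side typically looks in this event/queue pattern, with the event initialisation included. S_DIM, A_DIM and the placeholder gradient step are assumptions for illustration, not taken from the project.

# Sketch only - the real GLOBAL_PPO.update is not shown in these excerpts.
# Assumes the surrounding project's imports (numpy as np, threading, queue).
UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
UPDATE_EVENT.clear()    # updater sleeps until a worker requests an update
ROLLING_EVENT.set()     # workers may collect data

class PPO(object):
    def update(self):
        global GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            if GLOBAL_EP < EP_MAX:
                UPDATE_EVENT.wait()                  # blocked until a worker has filled a batch
                data = [QUEUE.get() for _ in range(QUEUE.qsize())]
                data = np.vstack(data)               # rows of [state | action | discounted return]
                s = data[:, :S_DIM]                  # S_DIM / A_DIM are assumed constants
                a = data[:, S_DIM:S_DIM + A_DIM]
                r = data[:, -1:]
                # ... run the actual PPO gradient steps on (s, a, r) here ...
                UPDATE_EVENT.clear()                 # update finished
                GLOBAL_UPDATE_COUNTER = 0            # reset the shared transition counter
                ROLLING_EVENT.set()                  # let workers collect with the new policy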
Example #2
        act[1] = 0
    else:
        if act[0] == 0 and act[1] == 0:
            return
        if act[0] == 0:
            print('Wait for P0')
        else:
            print('Wait for P1')

    print('You pressed: ' + event.keysym)


if human_play:
    Image.bind('<Key>', read_action)
else:
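    # Non-interactive mode: two baseline agents play against each other and every
    # frame returned by env.step is displayed with OpenCV.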
    while True:
        env.reset()
        stepcnt = 0
        start = time.time()
        score = [0, 0]
        if have_render:
            env.render()  # render
        while True:
            stepcnt += 1
            act[0] = baseline.choose_action(env, 0)
            act[1] = baseline.choose_action(env, 1)
            res = env.step({(0, ch(act[0])), (1, ch(act[1]))})
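            # res[0] is the next observation (an image of the board), shown with OpenCV below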
            
            cv2.imshow("Image", res[0])
            cv2.waitKey()  # block until a key is pressed before advancing to the next step
            cv2.destroyAllWindows()
            
Example #3
    GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
    GLOBAL_RUNNING_R = []
    COORD = tf.train.Coordinator()
    QUEUE = queue.Queue()  # workers putting data in this queue
    threads = []
    for worker in workers:  # worker threads
        t = threading.Thread(target=worker.work)
        t.start()  # training
        threads.append(t)
    # add a PPO updating thread
    threads.append(threading.Thread(target=GLOBAL_PPO.update))
    threads[-1].start()

    COORD.join(threads)
    print('training finished')

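    # Evaluation: rebuild the graph, reload the trained PPO, and play it against the baseline agent.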
    env = Maze(Map)
    tf.reset_default_graph()
    load_PPO = PPO(Load=True)
    while True:
        s = env.reset()
        for t in range(100):
            env.render()
            a = load_PPO.choose_action(s)
            baseline_a = base.choose_action(env, 1)
            s, r, done = env.step({(0, a), (1, U.ch(baseline_a))})
            if r[0] != -1:
                print(r)
            if done:
                break