class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = Maze(Map)
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            t = 0
            while True:
                if not ROLLING_EVENT.is_set():                  # while the global PPO is updating
                    ROLLING_EVENT.wait()                        # wait until the PPO is updated
                    buffer_s, buffer_a, buffer_r = [], [], []   # clear the history buffer; collect data with the new policy
                a = self.ppo.choose_action(s)
                baseline_a = base.choose_action(self.env, 1)    # the opponent follows the baseline policy
                s_, r, done = self.env.step({(0, a), (1, U.ch(baseline_a))})
                r = r[0]
                buffer_s.append(s.flatten())
                buffer_a.append(a)
                buffer_r.append(r)      # store the reward (reward normalization can help, but is not applied here)
                s = s_
                ep_r += r
                t += 1
                # print('step: %d, reward: %d, done: %d' % (t, r, done))

                GLOBAL_UPDATE_COUNTER += 1      # count toward the minimum batch size; no need to wait for other workers
                if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or done:
                    print(GLOBAL_EP)
                    if done:
                        v_s_ = 0                        # terminal state
                    else:
                        v_s_ = self.ppo.get_v(s_)
                    discounted_r = []                   # compute discounted rewards
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put the batch in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or GLOBAL_EP >= EP_MAX:
                        print('update')
                        ROLLING_EVENT.clear()           # stop collecting data
                        UPDATE_EVENT.set()              # trigger the global PPO update

                    if GLOBAL_EP >= EP_MAX:             # stop training
                        COORD.request_stop()
                        break

                if done:
                    # record reward changes, plot later
                    if len(GLOBAL_RUNNING_R) == 0:
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
                    GLOBAL_EP += 1
                    print(
                        '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                        '|W%i' % self.wid,
                        '|Ep_r: %.2f' % ep_r,
                    )
                    break
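# --- Sketch: the consumer side of the queue/event protocol used above ---
# The Worker relies on UPDATE_EVENT, ROLLING_EVENT, QUEUE and COORD, which are
# serviced by GLOBAL_PPO.update() running in its own thread. The function below
# is only a minimal sketch of that loop under assumed names (ppo.train() and the
# number of gradient passes are not defined in this section), not the project's
# actual implementation.
import numpy as np

def ppo_update_loop(ppo, coord, data_queue, update_event, rolling_event,
                    state_dim, update_steps=10):
    while not coord.should_stop():
        update_event.wait()                 # sleep until a worker reports a full batch
        batches = []
        while not data_queue.empty():       # drain every rollout the workers queued
            batches.append(data_queue.get())
        data = np.vstack(batches)           # rows are [flattened state | action | discounted return]
        s = data[:, :state_dim]
        a = data[:, state_dim:-1]
        r = data[:, -1:]
        for _ in range(update_steps):       # several passes over the same batch
            ppo.train(s, a, r)              # assumed training call on the shared PPO
        update_event.clear()                # done updating
        rolling_event.set()                 # let workers resume data collection
        # the real updater would also reset GLOBAL_UPDATE_COUNTER here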
        # (fragment: the opening of the read_action(event) key handler is missing from the source)
        act[1] = 0
    else:
        if act[0] == 0 and act[1] == 0:
            return
        if act[0] == 0:
            print('Wait for P0')
        else:
            print('Wait for P1')
    print('You pressed: ' + event.keysym)

if human_play:
    Image.bind('<Key>', read_action)        # key presses drive the environment in human-play mode
else:
    while True:                             # let the baseline policy play both sides
        env.reset()
        stepcnt = 0
        start = time.time()
        score = [0, 0]
        if have_render:
            env.render()                    # render the maze
        while True:
            stepcnt += 1
            act[0] = baseline.choose_action(env, 0)
            act[1] = baseline.choose_action(env, 1)
            res = env.step({(0, ch(act[0])), (1, ch(act[1]))})
            cv2.imshow("Image", res[0])     # res[0] is the state image returned by env.step
            cv2.waitKey()
            cv2.destroyAllWindows()
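# --- Sketch: setup assumed by the training launcher below ---
# The launcher references UPDATE_EVENT, ROLLING_EVENT, GLOBAL_PPO and workers
# without showing how they are built. A minimal setup might look like the
# following; PPO, Worker and N_WORKER are taken from the project code and the
# exact constructor arguments are assumptions.
import threading

UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
UPDATE_EVENT.clear()        # the updater sleeps until a worker signals a full batch
ROLLING_EVENT.set()         # workers may start collecting data immediately

GLOBAL_PPO = PPO()                                      # shared policy network
workers = [Worker(wid=i) for i in range(N_WORKER)]      # N_WORKER data-collection threads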
GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
GLOBAL_RUNNING_R = []
COORD = tf.train.Coordinator()
QUEUE = queue.Queue()                   # workers put rollout data into this queue

threads = []
for worker in workers:                  # one data-collection thread per worker
    t = threading.Thread(target=worker.work, args=())
    t.start()                           # start training
    threads.append(t)
# add a PPO updating thread
threads.append(threading.Thread(target=GLOBAL_PPO.update))
threads[-1].start()
COORD.join(threads)
print('Training finished')

# evaluate the trained policy against the baseline opponent
env = Maze(Map)
tf.reset_default_graph()
load_PPO = PPO(Load=True)
while True:
    s = env.reset()
    for t in range(100):
        env.render()
        a = load_PPO.choose_action(s)
        baseline_a = base.choose_action(env, 1)
        s, r, done = env.step({(0, a), (1, U.ch(baseline_a))})
        if r[0] != -1:                  # only print when the reward differs from -1
            print(r)
        if done:
            break
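# --- Sketch: the discounted-return computation used inside Worker.work() ---
# For reference, the backward accumulation that turns buffer_r into `br` can be
# written as a standalone helper; the value of GAMMA here is only an assumption,
# the training script defines its own.
import numpy as np

GAMMA = 0.9     # discount factor (assumed value)

def discounted_returns(rewards, v_last):
    # bootstrap from the value of the state after the last step (0 if terminal),
    # then fold each buffered reward in from the end
    returns, running = [], v_last
    for r in reversed(rewards):
        running = r + GAMMA * running
        returns.append(running)
    returns.reverse()
    return np.array(returns)[:, np.newaxis]     # column vector, same shape as `br`

# discounted_returns([1.0, 0.0, 2.0], v_last=0.5) -> [[2.9845], [2.205], [2.45]]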