class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = ArmEnv(mode=MODE[n_model])
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():                  # while the global PPO is updating
                    ROLLING_EVENT.wait()                        # wait until the PPO has been updated
                    buffer_s, buffer_a, buffer_r = [], [], []   # clear the history buffer
                a = self.ppo.choose_action(s)
                s_, r, done = self.env.step(a)
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1                      # count towards the minimum batch size
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []                           # compute discounted rewards
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()                   # stop collecting data
                        UPDATE_EVENT.set()                      # trigger the global PPO update

                    if GLOBAL_EP >= EP_MAX:                     # stop training
                        COORD.request_stop()
                        break

            # record the reward change for plotting later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
def play():
    print('Playing...')
    env = ArmEnv()
    ppo = PPO()
    ppo.load('unity-arm')
    s = env.reset()
    while True:
        a = ppo.choose_action(s)
        s, r, done = env.step(a)
        env.render()
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = ArmEnv(mode=MODE[n_model])
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():                  # while the global PPO is updating
                    ROLLING_EVENT.wait()                        # wait until the PPO has been updated
                    buffer_s, buffer_a, buffer_r = [], [], []   # clear the history buffer
                a = self.ppo.choose_action(s)
                s_, r, done = self.env.step(a)
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1                      # count towards the minimum batch size
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []                           # compute discounted rewards
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()                   # stop collecting data
                        UPDATE_EVENT.set()                      # trigger the global PPO update

                    if GLOBAL_EP >= EP_MAX:                     # stop training
                        COORD.request_stop()
                        break

            # record the reward change for plotting later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
if __name__ == '__main__':
    GLOBAL_PPO = PPO()
    UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
    UPDATE_EVENT.clear()        # no update at the start
    ROLLING_EVENT.set()         # start rolling out
    workers = [Worker(wid=i) for i in range(N_WORKER)]

    GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
    GLOBAL_RUNNING_R = []
    COORD = tf.train.Coordinator()
    QUEUE = queue.Queue()
    threads = []
    for worker in workers:      # worker threads
        t = threading.Thread(target=worker.work, args=())
        t.start()
        threads.append(t)
    # add the PPO updating thread
    threads.append(threading.Thread(target=GLOBAL_PPO.update,))
    threads[-1].start()
    COORD.join(threads)

    # plot the reward change, then test the trained policy
    plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
    plt.xlabel('Episode')
    plt.ylabel('Moving reward')
    plt.ion()
    plt.show()
    env.set_fps(30)             # 'env' is assumed to be the module-level ArmEnv instance
    while True:
        s = env.reset()
        for t in range(400):
            env.render()
            s = env.step(GLOBAL_PPO.choose_action(s))[0]
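# A minimal, self-contained sketch (stdlib only; all names below are illustrative,
# not taken from the repo) of the rollout/update hand-off used above: workers push
# fake batches into a queue and pause on ROLLING_EVENT while a single updater
# thread drains the queue whenever UPDATE_EVENT fires.
import threading
import queue

UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
ROLLING_EVENT.set()                      # workers may collect data
QUEUE = queue.Queue()
STOP = threading.Event()                 # hypothetical stop flag instead of tf.train.Coordinator

def toy_worker(wid):
    batch = 0
    while not STOP.is_set():
        if not ROLLING_EVENT.wait(timeout=0.1):   # paused: re-check the stop flag periodically
            continue
        QUEUE.put((wid, batch))          # stands in for a rollout batch
        batch += 1
        if QUEUE.qsize() >= 4:           # enough data collected: hand control to the updater
            ROLLING_EVENT.clear()
            UPDATE_EVENT.set()

def toy_updater(n_updates=3):
    for _ in range(n_updates):
        UPDATE_EVENT.wait()              # wait until the workers signal a full batch
        data = [QUEUE.get() for _ in range(QUEUE.qsize())]
        print('update on', len(data), 'mini-batches')
        UPDATE_EVENT.clear()
        ROLLING_EVENT.set()              # let the workers resume rolling out
    STOP.set()                           # shut everything down after the last update

threads = [threading.Thread(target=toy_worker, args=(i,)) for i in range(2)]
threads.append(threading.Thread(target=toy_updater))
for t in threads:
    t.start()
for t in threads:
    t.join()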
trajectory = []
total_rew = []
for step in range(num_steps):
    output = sess.run([soft_out], feed_dict={state: [s]})
    probs = output[0][0]
    if not learn_flag:
        # before the first success, explore with a uniform random policy
        a = np.random.choice(env.action_space.n,
                             p=[1. / env.action_space.n for i in range(env.action_space.n)])
        print("probs: ", probs, learn_flag, "action: ", a)
    else:
        # afterwards, sample actions from the learned policy
        a = np.random.choice(env.action_space.n, p=probs)
        print("probs: ", probs, learn_flag)
    new_state, reward, done, _ = env.step(a)
    total_rew.append(reward)
    trajectory.append((s, a, reward))
    if done:
        if reward != 0:
            learn_flag = True
            env.render()
            print(reward)
            success_counter += 1
        break
    s = new_state

disc = discount_and_norm_rewards(total_rew, gamma)
if learn_flag:
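# discount_and_norm_rewards() is called above but not defined in this snippet.
# Below is a minimal sketch of a common implementation (an assumption, not
# necessarily the author's): compute the discounted return at every timestep,
# then standardize to zero mean and unit variance.
import numpy as np

def discount_and_norm_rewards(rewards, gamma):
    """Discounted returns G_t = r_t + gamma * G_{t+1}, then normalized."""
    discounted = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    discounted -= discounted.mean()
    std = discounted.std()
    if std > 0:                  # avoid dividing by zero when all returns are equal
        discounted /= std
    return discounted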
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = ArmEnv(mode=MODE[n_model])
        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            for ep_t in range(MAX_EP_STEP):
                if self.name == 'W_0':
                    self.env.render()
                a = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if ep_t == MAX_EP_STEP - 1:
                    done = True
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # update global and assign to local net
                    if done:
                        v_s_ = 0    # terminal
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:    # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    test = self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:    # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:", GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                        '| Var:', test,
                    )
                    GLOBAL_EP += 1
                    break
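# A tiny, self-contained check (illustrative numbers only, not from a real run) of
# the reversed discounting used in work() above to build the n-step value targets.
GAMMA = 0.9
buffer_r = [1.0, 0.0, 2.0]           # rewards collected in order
v_s_ = 10.0                          # bootstrap value of the final next state
buffer_v_target = []
for r in buffer_r[::-1]:             # iterate back-to-front: 2 -> 0 -> 1
    v_s_ = r + GAMMA * v_s_
    buffer_v_target.append(v_s_)
buffer_v_target.reverse()
print(buffer_v_target)               # approximately [9.91, 9.9, 11.0]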
def train(config={}):
    tf.reset_default_graph()
    env = ArmEnv(mode='easy', should_random_target=True)
    ppo = PPO(config)
    all_ep_r = []
    lambdas = []
    should_render = 'should_render' in config.keys() and config['should_render']
    start = time.perf_counter()    # time.clock() was removed in Python 3.8
    for ep in tqdm(range(ppo.EP_MAX), desc='Training'):
        s = env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0
        for t in range(ppo.EP_LEN):    # in one episode
            if should_render:
                env.render()
            a = ppo.choose_action(s)
            s_, r, done = env.step(a)
            buffer_s.append(s)
            buffer_a.append(a)
            buffer_r.append((r + 8) / 8)    # normalize reward; found to be useful
            s = s_
            ep_r += r

            # update PPO
            if (t + 1) % ppo.BATCH == 0 or t == ppo.EP_LEN - 1:
                v_s_ = ppo.get_v(s_)
                discounted_r = []
                for r in buffer_r[::-1]:
                    v_s_ = r + ppo.GAMMA * v_s_
                    discounted_r.append(v_s_)
                discounted_r.reverse()
                bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                buffer_s, buffer_a, buffer_r = [], [], []
                ppo.update(bs, ba, br)

        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
        print(
            'Ep: %i' % ep,
            "|Ep_r: %i" % ep_r,
            ("|Lam: %.4f" % ppo.METHOD['lam']) if ppo.METHOD['name'] == 'kl_pen' else '',
        )
        if ppo.METHOD['name'] == 'kl_pen':
            lambdas.append(ppo.METHOD['lam'])

    elapsed = time.perf_counter() - start
    print('Train with method {} done!'.format(ppo.METHOD['name']))
    print('Time elapsed: {}s'.format(elapsed))

    if 'should_save' in config and config['should_save']:
        ppo.save('arm')

    return {
        'method': ppo.METHOD['name'],
        'ep_r': all_ep_r,
        'lambda': lambdas,
        'time': elapsed,     # elapsed training time in seconds
        'config': config,    # the config used for this run
    }
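# A hedged usage sketch for train(): it returns a dict with 'method', 'ep_r',
# 'lambda', 'time' and 'config', and it only reads 'should_render' and
# 'should_save' from the config dict; any other config key depends on the PPO
# class and is an assumption here.
import matplotlib.pyplot as plt

if __name__ == '__main__':
    result = train({'should_render': False, 'should_save': True})
    plt.plot(result['ep_r'])
    plt.xlabel('Episode')
    plt.ylabel('Moving averaged episode reward')
    plt.title('PPO ({}) on ArmEnv'.format(result['method']))
    plt.show()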
from arm_env import ArmEnv

MODE = ['easy', 'hard']
n_model = 1

env = ArmEnv(mode=MODE[n_model])
S_DIM = env.state_dim
A_DIM = env.action_dim
A_BOUND = env.action_bound[1]
print(S_DIM)
print(A_DIM)
print(A_BOUND)

s = env.reset()
print(s)
a = env.sample_action()
print(a)
r = env.step(a)
print(r)
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = ArmEnv(mode=MODE[n_model])
        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            for ep_t in range(MAX_EP_STEP):
                if self.name == 'W_0':
                    self.env.render()
                a = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if ep_t == MAX_EP_STEP - 1:
                    done = True
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # update global and assign to local net
                    if done:
                        v_s_ = 0    # terminal
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:    # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    test = self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:    # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:", GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                        '| Var:', test,
                    )
                    GLOBAL_EP += 1
                    break
from arm_env import ArmEnv
import pyglet
from pyglet.window import key
import random
import cv2
import numpy as np

MODE = ['easy', 'hard']
n_model = 0

env = ArmEnv(mode=MODE[n_model])
env.set_fps(30)
env.render()
env.step([3, 3])
env.render()

while True:
    env.step([0, 0])
    env.render()
    # for j in range(100000000):
    #     continue
    # cv2.imwrite('loop' + str(t) + '.jpg', im)
    # v1 = random.randint(1, 10)
    # v2 = random.randint(1, 10)
    # env.step2([100, 100])
    # env.render()
    # env.step2([100, 50])