def __init__(self, wid):
    self.wid = wid
    self.env = ReacherEnv(headless=True)  # for multi-processing, headless has to be True
    self.ppo = GLOBAL_PPO
    self.pins_x = []
    self.pins_y = []
def run():
    env = ReacherEnv(headless=True)
    s = env.reset()
    for i in range(10):
        # action = sac_trainer.policy_net.sample_action()
        action = sac_trainer.policy_net.get_action(s, deterministic=DETERMINISTIC)
        action = np.random.rand(7)  # overrides the policy action with a random one for a quick env check
        s, r, done = env.step(action)
        print(s)
class Worker():
    def __init__(self):
        self.env = ReacherEnv(headless=True)  # for multi-processing, headless has to be True

    def work(self):
        frame_idx = 0
        # training loop
        for eps in range(max_episodes):
            state = self.env.reset()
            episode_reward = 0
            for step in range(max_steps):
                if frame_idx > explore_steps:
                    action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC)
                else:
                    action = sac_trainer.policy_net.sample_action()

                try:
                    next_state, reward, done = self.env.step(action)
                except KeyboardInterrupt:
                    print('Finished')
                    sac_trainer.save_model(model_path)
                    self.env.shutdown()

                replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                frame_idx += 1

                # if len(replay_buffer) > batch_size:
                #     for i in range(update_itr):
                #         _ = self.sac_trainer.update(batch_size, reward_scale=10., auto_entropy=AUTO_ENTROPY, target_entropy=-1. * action_dim)

                if eps % 10 == 0 and eps > 0:
                    plot(rewards)
                    sac_trainer.save_model(model_path)
                if done:
                    break

            print('Episode: ', eps, '| Episode Reward: ', episode_reward)
            rewards.append(episode_reward)

        sac_trainer.save_model(model_path)
        self.env.shutdown()
def make_env(max_steps, seed):
    from reacher_sawyer_env import ReacherEnv
    # from reacher_sawyer_visual_env import ReacherEnv
    env = ReacherEnv(headless=True, control_mode='end_position')  # seed is currently unused
    return Monitor(TimeLimit(env, max_steps))
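# A hypothetical usage sketch (not part of this repo): how make_env could feed a
# vectorized setup, assuming stable-baselines3 as the backend. NUM_ENVS and
# MAX_STEPS are illustrative names, not values taken from this project.
from functools import partial

from stable_baselines3.common.vec_env import SubprocVecEnv

NUM_ENVS = 4     # assumed number of parallel environments
MAX_STEPS = 100  # assumed per-episode step limit

if __name__ == '__main__':
    # each subprocess constructs its own headless ReacherEnv via make_env
    vec_env = SubprocVecEnv([partial(make_env, MAX_STEPS, seed) for seed in range(NUM_ENVS)])
    obs = vec_env.reset()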
def __init__(self):
    self.env = ReacherEnv(headless=True)  # for multi-processing, headless has to be True
def get_v(self, s):
    if s.ndim < 2:
        s = s[np.newaxis, :]
    return self.sess.run(self.v, {self.tfs: s})[0, 0]

def save(self, path):
    saver = tf.train.Saver()
    saver.save(self.sess, path)

def load(self, path):
    saver = tf.train.Saver()
    saver.restore(self.sess, path)


if __name__ == '__main__':
    model_path = './model/ppo_single'
    env = ReacherEnv(headless=False)
    S_DIM = env.observation_space.shape[0]
    A_DIM = env.action_space.shape[0]
    ppo = PPO()  # if True, use visual-based input; otherwise use numerical input
    all_ep_r = []
    # ppo.load(model_path)
    for ep in range(EP_MAX):
        s = env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0
        for t in range(EP_LEN):  # in one episode
            a = ppo.choose_action(s)
            s_, r, done = env.step(a)
def worker(wid):
    env = ReacherEnv(headless=True)  # for multi-processing, headless has to be True
    ppo = GLOBAL_PPO
    pins_x = []
    pins_y = []
    global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
    step_set = []
    epr_set = []
    step = 0
    while not COORD.should_stop():
        s = env.reset()
        step += 1
        ep_r = 0
        buffer_s, buffer_a, buffer_r = [], [], []
        pins_x = []
        pins_y = []
        for t in range(EP_LEN):
            if not ROLLING_EVENT.is_set():  # while the global PPO is updating
                ROLLING_EVENT.wait()  # wait until PPO has been updated
                buffer_s, buffer_a, buffer_r = [], [], []  # clear history buffer; use the new policy to collect data
            a = ppo.choose_action(s)
            # print('a: ', a)
            s_, r, done = env.step(a)
            buffer_s.append(s)
            buffer_a.append(a)
            buffer_r.append(r)  # normalizing the reward was found to be useful
            s = s_
            ep_r += r

            GLOBAL_UPDATE_COUNTER += 1  # count towards the minimum batch size; no need to wait for other workers
            if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                v_s_ = ppo.get_v(s_)
                discounted_r = []  # compute discounted reward
                for r in buffer_r[::-1]:
                    v_s_ = r + GAMMA * v_s_
                    discounted_r.append(v_s_)
                discounted_r.reverse()

                bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                buffer_s, buffer_a, buffer_r = [], [], []
                QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    ROLLING_EVENT.clear()  # stop collecting data
                    UPDATE_EVENT.set()  # let the global PPO update

                if GLOBAL_EP >= EP_MAX:  # stop training
                    COORD.request_stop()
                    break

                if GLOBAL_EP % 100 == 0 and GLOBAL_EP > 0:
                    ppo.save(model_path)

        # record reward changes, plot later
        if len(GLOBAL_RUNNING_R) == 0:
            GLOBAL_RUNNING_R.append(ep_r)
        else:
            GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
        GLOBAL_EP += 1
        print(
            '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
            '|W%i' % wid,
            '|Ep_r: %.2f' % ep_r,
        )
        step_set.append(step)
        # print(step)
        epr_set.append(ep_r)
        if step % 10 == 0:  # plot every N episodes; plotting from a non-main thread can raise errors
            plt.plot(step_set, epr_set)  # no moving average
            try:
                plt.savefig('./ppo_multi.png')
            except:
                print('writing conflict!')

    env.shutdown()
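# A minimal, self-contained restatement of the bootstrapped discounted-return
# computation used inside worker() above; the function name is illustrative and
# does not exist in this repo.
def discount_with_bootstrap(rewards, v_last, gamma):
    """Return G_t = r_t + gamma * G_{t+1}, seeded with the critic value of the last state."""
    discounted = []
    running = v_last
    for r in reversed(rewards):
        running = r + gamma * running
        discounted.append(running)
    discounted.reverse()
    return discounted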
GLOBAL_RUNNING_R = []
COORD = tf.train.Coordinator()
QUEUE = queue.Queue()  # workers put collected data in this queue
threads = []
for i in range(N_WORKER):
    t = threading.Thread(target=worker, args=(i,))
    t.daemon = True  # when the main thread is killed, the sub-threads die as well
    t.start()  # training
    threads.append(t)
# add a PPO updating thread
threads.append(threading.Thread(target=GLOBAL_PPO.update))
threads[-1].start()  # start the updating thread as well
COORD.join(threads)  # wait for all threads to finish
GLOBAL_PPO.save(model_path)

if args.test:
    env = ReacherEnv(headless=True)
    env.reset()
    GLOBAL_PPO = PPO()
    GLOBAL_PPO.load(model_path)
    test_steps = 200
    test_episode = 10
    for _ in range(test_episode):
        s = env.reset()  # ReacherEnv.reset() returns only the state
        for t in range(test_steps):
            s, r, done = env.step(GLOBAL_PPO.choose_action(s))  # ReacherEnv.step() returns (state, reward, done)
    env.shutdown()
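# Assumed shape of GLOBAL_PPO.update() (the method is defined elsewhere in this repo);
# this sketch only illustrates the event/queue handshake the workers above rely on,
# not the exact implementation of the gradient step.
def update(self):
    global GLOBAL_UPDATE_COUNTER
    while not COORD.should_stop():
        if GLOBAL_EP < EP_MAX:
            UPDATE_EVENT.wait()  # blocked until the workers have filled the queue
            data = [QUEUE.get() for _ in range(QUEUE.qsize())]  # drain the collected rollouts
            data = np.vstack(data)
            s, a, r = data[:, :S_DIM], data[:, S_DIM:S_DIM + A_DIM], data[:, -1:]
            # ... run the PPO actor/critic gradient steps on (s, a, r) here ...
            UPDATE_EVENT.clear()          # update finished
            GLOBAL_UPDATE_COUNTER = 0     # reset the rollout counter
            ROLLING_EVENT.set()           # let the workers collect data again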
self.policy_net.eval()


def plot(rewards):
    clear_output(True)
    plt.figure(figsize=(20, 5))
    plt.plot(rewards)
    plt.savefig('sac_v2.png')
    # plt.show()


replay_buffer_size = 1e6
replay_buffer = ReplayBuffer(replay_buffer_size)

# choose env
env = ReacherEnv(headless=False)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
model_path = './model/sac'

# hyper-parameters for RL training
max_episodes = 10000
max_steps = 30
frame_idx = 0
batch_size = 256
explore_steps = 2000  # number of steps with random actions at the beginning of training
update_itr = 1
AUTO_ENTROPY = True
DETERMINISTIC = False
hidden_dim = 512
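# Minimal sketch of the ReplayBuffer interface assumed by the training loop above
# (push / sample / __len__); the actual class is defined elsewhere in the repo and
# may differ in detail.
import random

import numpy as np


class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = int(capacity)  # cast, since 1e6 above is a float
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity  # circular overwrite

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)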