def __init__(self, wid):
        self.wid = wid
        self.env = ReacherEnv(headless=True)  # headless must be True for multi-process training
        self.ppo = GLOBAL_PPO

        self.pins_x = []
        self.pins_y = []
Example #2
def run():
    env = ReacherEnv(headless=True)
    s = env.reset()

    for i in range(10):
        # policy actions are disabled in this sanity check; a random 7-dim action is used instead
        # action = sac_trainer.policy_net.sample_action()
        # action = sac_trainer.policy_net.get_action(s, deterministic=DETERMINISTIC)
        action = np.random.rand(7)
        s, r, done = env.step(action)
        print(s)
class Worker():
    def __init__(self):
        self.env = ReacherEnv(headless=True)  # headless must be True for multi-process training

    def work(self):
        frame_idx = 0
        # training loop
        for eps in range(max_episodes):
            state = self.env.reset()
            episode_reward = 0

            for step in range(max_steps):
                # random exploration at the start, then act with the SAC policy
                if frame_idx > explore_steps:
                    action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC)
                else:
                    action = sac_trainer.policy_net.sample_action()
        
                try:
                    next_state, reward, done = self.env.step(action)
                except KeyboardInterrupt:
                    print('Finished')
                    sac_trainer.save_model(model_path)
                    self.env.shutdown()
                    return  # stop training cleanly after shutting the env down

                replay_buffer.push(state, action, reward, next_state, done)
                
                state = next_state
                episode_reward += reward
                frame_idx += 1
                
                
                # SAC update (disabled in this snippet); sac_trainer is the shared global trainer
                # if len(replay_buffer) > batch_size:
                #     for i in range(update_itr):
                #         _ = sac_trainer.update(batch_size, reward_scale=10., auto_entropy=AUTO_ENTROPY, target_entropy=-1. * action_dim)

                if done:
                    break

            rewards.append(episode_reward)
            print('Episode: ', eps, '| Episode Reward: ', episode_reward)

            if eps % 10 == 0 and eps > 0:  # plot and checkpoint every 10 episodes instead of every step
                plot(rewards)
                sac_trainer.save_model(model_path)
        sac_trainer.save_model(model_path)
        self.env.shutdown()
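
# The "headless must be True" comments above refer to running several workers in parallel.
# Minimal launcher sketch (not part of the original snippets), assuming the globals used by
# Worker.work() (sac_trainer, replay_buffer, rewards, model_path, ...) exist in each process:
import multiprocessing as mp

def _run_worker():
    Worker().work()

if __name__ == '__main__':
    processes = [mp.Process(target=_run_worker) for _ in range(2)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()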
Example #4
def make_env(max_steps, seed):
    from reacher_sawyer_env import ReacherEnv
    # from reacher_sawyer_visual_env import ReacherEnv
    env = ReacherEnv(headless=True, control_mode='end_position')
    return Monitor(TimeLimit(env, max_steps))
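
# Usage sketch (not in the original snippets): make_env is the usual per-process factory for
# vectorized training. SubprocVecEnv from Stable-Baselines3 is an assumption about the backend.
from stable_baselines3.common.vec_env import SubprocVecEnv

if __name__ == '__main__':
    n_envs = 4
    vec_env = SubprocVecEnv([lambda i=i: make_env(max_steps=30, seed=i) for i in range(n_envs)])
    obs = vec_env.reset()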
    def __init__(self):
        self.env = ReacherEnv(headless=True)  # headless must be True for multi-process training
Example #6
    def get_v(self, s):
        if s.ndim < 2:
            s = s[np.newaxis, :]  # add a batch dimension for a single state
        return self.sess.run(self.v, {self.tfs: s})[0, 0]

    def save(self, path):
        saver = tf.train.Saver()
        saver.save(self.sess, path)

    def load(self, path):
        saver = tf.train.Saver()
        saver.restore(self.sess, path)


if __name__ == '__main__':
    model_path = './model/ppo_single'
    env = ReacherEnv(headless=False)
    S_DIM = env.observation_space.shape[0]
    A_DIM = env.action_space.shape[0]
    ppo = PPO()  # if True, use visual-based input; otherwise use numerical input
    all_ep_r = []
    # ppo.load(model_path)

    for ep in range(EP_MAX):
        s = env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0
        for t in range(EP_LEN):  # in one episode
            a = ppo.choose_action(s)
            s_, r, done = env.step(a)
Example #7
def worker(wid):
    env = ReacherEnv(headless=True)  # headless must be True for multi-process training
    ppo = GLOBAL_PPO

    pins_x = []
    pins_y = []
    global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
    step_set = []
    epr_set = []
    step = 0

    while not COORD.should_stop():
        s = env.reset()
        step += 1

        ep_r = 0
        buffer_s, buffer_a, buffer_r = [], [], []
        pins_x = []
        pins_y = []
        for t in range(EP_LEN):
            if not ROLLING_EVENT.is_set():  # while global PPO is updating
                ROLLING_EVENT.wait()  # wait until PPO is updated
                buffer_s, buffer_a, buffer_r = [], [], []  # clear history buffer, use new policy to collect data
            a = ppo.choose_action(s)
            # print('a: ', a)
            s_, r, done = env.step(a)

            buffer_s.append(s)
            buffer_a.append(a)
            buffer_r.append(r)  # reward is stored as-is; normalizing it was found to be useful in some variants
            s = s_
            ep_r += r

            GLOBAL_UPDATE_COUNTER += 1  # count toward the minimum batch size; no need to wait for other workers
            if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                v_s_ = ppo.get_v(s_)
                discounted_r = []  # compute discounted reward
                for r in buffer_r[::-1]:
                    v_s_ = r + GAMMA * v_s_
                    discounted_r.append(v_s_)
                discounted_r.reverse()

                bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                buffer_s, buffer_a, buffer_r = [], [], []
                QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    ROLLING_EVENT.clear()  # stop collecting data
                    UPDATE_EVENT.set()  # globalPPO update

                if GLOBAL_EP >= EP_MAX:  # stop training
                    COORD.request_stop()
                    break

        if GLOBAL_EP % 100 == 0 and GLOBAL_EP > 0:
            ppo.save(model_path)
        # record reward changes, plot later
        if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
        else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
        GLOBAL_EP += 1
        print(
            '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
            '|W%i' % wid,
            '|Ep_r: %.2f' % ep_r,
        )
        step_set.append(step)
        # print(step)
        epr_set.append(ep_r)
        if step % 10 == 0:  # plot every 10 episodes; plotting from a worker thread can conflict with the main thread
            plt.plot(step_set, epr_set)  # no moving average
            try:
                plt.savefig('./ppo_multi.png')
            except Exception:
                print('writing conflict!')

    env.shutdown()
Example #8
        GLOBAL_RUNNING_R = []
        COORD = tf.train.Coordinator()
        QUEUE = queue.Queue()  # workers putting data in this queue
        threads = []
        for i in range(N_WORKER):
            t = threading.Thread(target=worker, args=(i, ))
            t.daemon = True  # if the main thread is killed, the sub-threads die with it
            t.start()  # training
            threads.append(t)
        # add a PPO updating thread
        threads.append(threading.Thread(target=GLOBAL_PPO.update))
        threads[-1].start()  # start the updating thread as well
        COORD.join(threads)  # wait for all threads to finish

        GLOBAL_PPO.save(model_path)

    if args.test:
        env = ReacherEnv(headless=True)
        env.reset()
        GLOBAL_PPO = PPO()
        GLOBAL_PPO.load(model_path)
        test_steps = 200
        test_episode = 10

        for _ in range(test_episode):
            s, info = env.reset()

            for t in range(test_steps):
                s, r, done, info = env.step(GLOBAL_PPO.choose_action(s))

        env.shutdown()
Example #9
        self.policy_net.eval()


def plot(rewards):
    clear_output(True)
    plt.figure(figsize=(20, 5))
    plt.plot(rewards)
    plt.savefig('sac_v2.png')
    # plt.show()


replay_buffer_size = 1e6
replay_buffer = ReplayBuffer(replay_buffer_size)
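
# The ReplayBuffer class is not shown in these snippets. A minimal sketch of the usual interface
# (push / sample / __len__) with a ring buffer and uniform random sampling, as used by the code here:
import random
import numpy as np

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = int(capacity)
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        # overwrite the oldest entry once the buffer is full
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)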

# choose env
env = ReacherEnv(headless=False)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

model_path = './model/sac'

# hyper-parameters for RL training
max_episodes = 10000
max_steps = 30
frame_idx = 0
batch_size = 256
explore_steps = 2000  # number of steps with random action sampling at the beginning of training
update_itr = 1
AUTO_ENTROPY = True
DETERMINISTIC = False
hidden_dim = 512