Example #1
class AtariEnvironment(Process):
    def __init__(self,
                 env_name,
                 is_render,
                 env_idx,
                 child_conn,
                 history_size=4,
                 h=84,
                 w=84,
                 sticky_action=True,
                 p=0.25,
                 max_episode_steps=18000):
        super(AtariEnvironment, self).__init__()
        self.seed = np.random.randint(0, 60000)
        self.daemon = True
        self.env = ObstacleTowerEnv('../ObstacleTower/obstacletower',
                                    worker_id=self.seed,
                                    retro=True,
                                    config={
                                        'starting-floor': 10,
                                        'total-floors': 15,
                                        'tower-seed': -1
                                    },
                                    greyscale=True,
                                    timeout_wait=300)
        self.env._flattener = ActionFlattener([2, 3, 2, 1])
        self.env._action_space = self.env._flattener.action_space
        self.env_name = env_name
        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.sticky_action = sticky_action
        self.last_action = 0
        self.p = p
        self.max_episode_steps = max_episode_steps

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()
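
The constructor above replaces the environment's branched action space with a flattened discrete one built from the Obstacle Tower branch sizes [2, 3, 2, 1]. As a rough illustration of what that flattening amounts to (an approximation for explanation only, not the obstacle_tower_env.ActionFlattener source), the mapping from a flat index back to a branched action can be sketched like this:

import itertools

from gym import spaces


class SimpleActionFlattener:
    """Illustrative stand-in: enumerates every combination of per-branch choices."""

    def __init__(self, branch_sizes):
        # e.g. [2, 3, 2, 1] -> 2 * 3 * 2 * 1 = 12 flat actions
        combos = list(itertools.product(*(range(n) for n in branch_sizes)))
        self.action_lookup = {i: list(c) for i, c in enumerate(combos)}
        self.action_space = spaces.Discrete(len(combos))

    def lookup_action(self, action):
        # flat index -> branched action vector expected by the raw environment
        return self.action_lookup[action]


if __name__ == '__main__':
    flattener = SimpleActionFlattener([2, 3, 2, 1])
    print(flattener.action_space)      # Discrete(12)
    print(flattener.lookup_action(7))  # [1, 0, 1, 0]
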
Example #2
def main():
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')
    seed = np.random.randint(0, 100)

    env = ObstacleTowerEnv('../ObstacleTower/obstacletower', worker_id=seed,
                               retro=True, config={'total-floors': 12}, greyscale=True, timeout_wait=300)
    env._flattener = ActionFlattener([2, 3, 2, 1])
    env._action_space = env._flattener.action_space
    input_size = env.observation_space.shape  # observation shape reported by the env
    output_size = env.action_space.n  # flattened action count: 2 * 3 * 2 * 1 = 12

    env.close()

    is_render = False
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    model_path = os.path.join(args.save_dir, 'main.model')
    predictor_path = os.path.join(args.save_dir, 'main.pred')
    target_path = os.path.join(args.save_dir, 'main.target')

    writer = SummaryWriter()  # optionally pass log_dir=args.log_dir

    discounted_reward = RewardForwardFilter(args.ext_gamma)

    model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net)
    rnd = RNDModel(input_size, output_size)
    model = model.to(device)
    rnd = rnd.to(device)
    optimizer = optim.Adam(list(model.parameters()) + list(rnd.predictor.parameters()), lr=args.lr)
   
    if args.load_model:
        "Loading model..."
        if args.cuda:
            model.load_state_dict(torch.load(model_path))
        else:
            model.load_state_dict(torch.load(model_path, map_location='cpu'))


    works = []
    parent_conns = []
    child_conns = []
    for idx in range(args.num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(
            args.env_name,
            is_render,
            idx,
            child_conn,
            sticky_action=args.sticky_action,
            p=args.sticky_action_prob,
            max_episode_steps=args.max_episode_steps)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([args.num_worker, 4, 84, 84])

    sample_env_index = 0   # Sample Environment index to log
    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    print("Load RMS =", args.load_rms)
    if args.load_rms:
        print("Loading RMS values for observation and reward normalization")
        with open('reward_rms.pkl', 'rb') as f:
            reward_rms = dill.load(f)
        with open('obs_rms.pkl', 'rb') as f:
            obs_rms = dill.load(f)
    else:
        reward_rms = RunningMeanStd()
        obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))

        # normalize observation
        print('Initializing observation normalization...')
        next_obs = []
        for step in range(args.num_step * args.pre_obs_norm_steps):
            actions = np.random.randint(0, output_size, size=(args.num_worker,))

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            for parent_conn in parent_conns:
                next_state, reward, done, realdone, log_reward = parent_conn.recv()
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            if len(next_obs) % (args.num_step * args.num_worker) == 0:
                next_obs = np.stack(next_obs)
                obs_rms.update(next_obs)
                next_obs = []
        with open('reward_rms.pkl', 'wb') as f:
            dill.dump(reward_rms, f)
        with open('obs_rms.pkl', 'wb') as f:
            dill.dump(obs_rms, f)

    print('Training...')
    while True:
        total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_action_probs = [], [], [], [], [], [], [], [], [], []
        global_step += (args.num_worker * args.num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(args.num_step):
            actions, value_ext, value_int, action_probs = get_action(model, device, np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                next_state, reward, done, real_done, log_reward = parent_conn.recv()
                next_states.append(next_state)
                rewards.append(reward)
                dones.append(done)
                real_dones.append(real_done)
                log_rewards.append(log_reward)
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # intrinsic reward from RND on the normalized next observation
            intrinsic_reward = compute_intrinsic_reward(rnd, device,
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_index]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_action_probs.append(action_probs)

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_index]

            sample_step += 1
            if real_dones[sample_env_index]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = get_action(model, device, np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_action_probs = np.vstack(total_action_probs)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T])
        mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / args.num_worker, sample_episode)
        writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / args.num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging Max action probability
        writer.add_scalar('data/max_prob', total_logging_action_probs.max(1).mean(), sample_episode)

        # Step 3. make targets and advantages
        # extrinsic reward target/advantage
        ext_target, ext_adv = make_train_data(total_reward,
                                              total_done,
                                              total_ext_values,
                                              args.ext_gamma,
                                              args.gae_lambda,
                                              args.num_step,
                                              args.num_worker,
                                              args.use_gae)

        # intrinsic reward target/advantage
        # non-episodic: the done mask is all zeros
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values,
                                              args.int_gamma,
                                              args.gae_lambda,
                                              args.num_step,
                                              args.num_worker,
                                              args.use_gae)

        # add ext adv and int adv
        total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        train_model(args, device, output_size, model, rnd, optimizer,
                        np.float32(total_state) / 255., ext_target, int_target, total_action,
                        total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                        total_action_probs)

        if global_step % (args.num_worker * args.num_step * args.save_interval) == 0:
            print('Global step: {}'.format(global_step))
            torch.save(model.state_dict(), model_path)
            torch.save(rnd.predictor.state_dict(), predictor_path)
            torch.save(rnd.target.state_dict(), target_path)

            """
            checkpoint_list = np.array([int(re.search(r"\d+(\.\d+)?", x)[0]) for x in glob.glob(os.path.join('trained_models', args.env_name+'*.model'))])
            if len(checkpoint_list) == 0:
                last_checkpoint = -1
            else:
                last_checkpoint = checkpoint_list.max()
            next_checkpoint = last_checkpoint + 1
            print("Latest Checkpoint is #{}, saving checkpoint is #{}.".format(last_checkpoint, next_checkpoint))

            incre_model_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.model')
            incre_predictor_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.pred')
            incre_target_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.target')
            with open(incre_model_path, 'wb') as f:
                torch.save(model.state_dict(), f)
            with open(incre_predictor_path, 'wb') as f:
                torch.save(rnd.predictor.state_dict(), f)
            with open(incre_target_path, 'wb') as f:
                torch.save(rnd.target.state_dict(), f)
            """
            if args.terminate and (global_step > args.terminate_steps):
                with open('reward_rms.pkl', 'wb') as f:
                    dill.dump(reward_rms, f)
                with open('obs_rms.pkl', 'wb') as f:
                    dill.dump(obs_rms, f)
                break
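
Example #2 depends on several helpers that are not shown on this page (RunningMeanStd, RewardForwardFilter, get_action, compute_intrinsic_reward, make_train_data, train_model). To make the normalization steps easier to follow, here are minimal sketches of the first two, assuming they follow the widely used baselines/RND-style implementations; the project's own utilities may differ in detail:

import numpy as np


class RunningMeanStd:
    """Running mean/variance tracker updated from batches of samples."""

    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        self.update_from_moments(batch_mean, batch_var, x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        # combine batch moments with the running moments (parallel variance formula)
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.mean = new_mean
        self.var = m_2 / tot_count
        self.count = tot_count


class RewardForwardFilter:
    """Per-environment running discounted sum of rewards, used to scale intrinsic rewards."""

    def __init__(self, gamma):
        self.rewems = None
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems
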
Example #3
from collections import deque
from multiprocessing import Process

import numpy as np
from PIL import Image
# ObstacleTowerEnv and ActionFlattener ship with the obstacle-tower-env package
from obstacle_tower_env import ObstacleTowerEnv, ActionFlattener


class AtariEnvironment(Process):
    def __init__(self,
                 env_name,
                 is_render,
                 env_idx,
                 child_conn,
                 history_size=4,
                 h=84,
                 w=84,
                 sticky_action=True,
                 p=0.25,
                 max_episode_steps=18000):
        super(AtariEnvironment, self).__init__()
        self.seed = np.random.randint(0, 60000)
        self.daemon = True
        self.env = ObstacleTowerEnv('../ObstacleTower/obstacletower',
                                    worker_id=self.seed,
                                    retro=True,
                                    config={
                                        'total-floors': 12,
                                        'tower-seed': -1
                                    },
                                    greyscale=True,
                                    timeout_wait=300)
        self.env._flattener = ActionFlattener([2, 3, 2, 1])
        self.env._action_space = self.env._flattener.action_space
        self.env_name = env_name
        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.sticky_action = sticky_action
        self.last_action = 0
        self.p = p
        self.max_episode_steps = max_episode_steps

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(AtariEnvironment, self).run()
        while True:
            action = self.child_conn.recv()

            if 'Breakout' in self.env_name:
                action += 1

            # sticky action
            if self.sticky_action:
                if np.random.rand() <= self.p:
                    action = self.last_action
                self.last_action = action

            s, reward, done, info = self.env.step(action)

            if self.max_episode_steps < self.steps:
                done = True

            log_reward = reward
            force_done = done

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(s)

            self.rall += reward
            self.steps += 1

            if done:
                real_reward = self.rall - ((self.rall * 100) % 10) * 1.01
                self.recent_rlist.append(real_reward)
                print(
                    "[Episode {}({})] Step: {}  Reward: {}  Recent Reward: {}".
                    format(self.episode, self.env_idx, self.steps, real_reward,
                           np.mean(self.recent_rlist)))

                self.history = self.reset()

            self.child_conn.send(
                [self.history[:, :, :], reward, force_done, done, log_reward])

    def reset(self):
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        s = self.env.reset()
        self.get_init_state(self.pre_proc(s))
        return self.history[:, :, :]

    def pre_proc(self, X):
        # convert the frame to greyscale and resize it to (h, w)
        frame = Image.fromarray(X.reshape(84, 84)).convert('L')
        frame = np.array(frame.resize((self.h, self.w)))
        return frame.astype(np.float32)

    def get_init_state(self, s):
        # s is already preprocessed by reset(), so fill every history slot directly
        for i in range(self.history_size):
            self.history[i, :, :] = s
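
For completeness, a minimal usage sketch of this worker. It assumes the class above is importable in the current module and that the Obstacle Tower binary path hard-coded in __init__ exists; it mirrors, for a single environment, how main() in Example #2 drives each worker over a Pipe:

from multiprocessing import Pipe

import numpy as np


if __name__ == '__main__':
    parent_conn, child_conn = Pipe()
    # env_name is only used for logging and the Breakout special case in run()
    worker = AtariEnvironment('ObstacleTower', is_render=False, env_idx=0,
                              child_conn=child_conn)
    worker.start()

    for _ in range(100):
        action = np.random.randint(0, 12)  # 2 * 3 * 2 * 1 flattened actions
        parent_conn.send(action)
        # the worker replies with [frame history, reward, done, real_done, log_reward]
        state, reward, done, real_done, log_reward = parent_conn.recv()
        assert state.shape == (4, 84, 84)

    worker.terminate()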