Example #1
    def __init__(self,
                 envs,
                 model,
                 val_envs,
                 train_mode='nstep',
                 log_dir='logs/UnrealA2C2',
                 model_dir='models/UnrealA2C2',
                 total_steps=1000000,
                 nsteps=5,
                 normalise_obs=True,
                 validate_freq=1000000,
                 save_freq=0,
                 render_freq=0,
                 num_val_episodes=50,
                 replay_length=2000,
                 log_scalars=True,
                 gpu_growth=True):

        super().__init__(envs,
                         model,
                         val_envs,
                         train_mode=train_mode,
                         log_dir=log_dir,
                         model_dir=model_dir,
                         total_steps=total_steps,
                         nsteps=nsteps,
                         validate_freq=validate_freq,
                         save_freq=save_freq,
                         render_freq=render_freq,
                         update_target_freq=0,
                         num_val_episodes=num_val_episodes,
                         log_scalars=log_scalars,
                         gpu_growth=gpu_growth)

        self.replay = deque([], maxlen=replay_length)  #replay length per actor
        self.runner = self.Runner(self.model, self.env, self.nsteps,
                                  self.replay)

        hyper_paras = {
            'learning_rate': model.lr,
            'grad_clip': model.grad_clip,
            'nsteps': nsteps,
            'num_workers': self.num_envs,
            'total_steps': self.total_steps,
            'entropy_coefficient': model.entropy_coeff,
            'value_coefficient': model.value_coeff
        }

        if log_scalars:
            filename = log_dir + '/hyperparameters.txt'
            self.save_hyperparameters(filename, **hyper_paras)

        self.normalise_obs = normalise_obs

        if self.normalise_obs:
            self.obs_running = RunningMeanStd()
            self.state_mean = np.zeros_like(self.runner.states)
            self.state_std = np.ones_like(self.runner.states)
            self.aux_reward_rolling = RunningMeanStd()
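A note on the helper classes: the snippets in this listing repeatedly construct RunningMeanStd and unpack a (mean, std) pair from its update() call. A minimal sketch of such a helper, assuming the parallel mean/variance update popularised by OpenAI Baselines, is shown below; the class actually used by these examples may differ in detail.

import numpy as np

class RunningMeanStd:
    # Running estimate of the mean and variance of a stream of sample batches.
    # Sketch only; assumes the Chan et al. parallel-variance update.
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]

        delta = batch_mean - self.mean
        total_count = self.count + batch_count

        self.mean = self.mean + delta * batch_count / total_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        M2 = m_a + m_b + np.square(delta) * self.count * batch_count / total_count
        self.var = M2 / total_count
        self.count = total_count
        return self.mean, np.sqrt(self.var)

With this interface, a call such as self.state_mean, self.state_std = self.state_obs.update(next_states) reads as "fold the batch into the running statistics and return the new mean and standard deviation".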
Example #2
class RollingObs(object):
    def __init__(self, mean=0):
        self.rolling = RunningMeanStd()

    def update(self, x):
        if len(x.shape) == 4:  # assume image obs
            # [time*batch, height, width, stack] -> [height, width]
            return self.rolling.update(np.mean(x, axis=1, keepdims=True))
        else:
            # [time*batch, *shape] -> [*shape]
            return self.rolling.update(x)
Example #3
class rolling_obs(object):
    def __init__(self, shape=()):
        self.rolling = RunningMeanStd(shape=shape)

    def update(self, x):
        if len(x.shape) == 5:  # assume image obs
            # [time, batch, height, width, stack] -> [height, width, 1]
            return self.rolling.update(fold_batch(x[..., -1:]))
        else:
            # [time, batch, *shape] -> [*shape]
            return self.rolling.update(fold_batch(x))
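Example #3 and several trainers below rely on fold_batch (and fold_many) to merge the time and worker dimensions before a gradient step. The shape comments suggest behaviour along these lines; this is a sketch under that assumption rather than the repository's definitive implementation.

import numpy as np

def fold_batch(x):
    # [time, batch, *shape] -> [time * batch, *shape]
    time, batch = x.shape[0], x.shape[1]
    return x.reshape(time * batch, *x.shape[2:])

def fold_many(*arrays):
    # apply fold_batch to several arrays at once, returning them as a tuple
    return tuple(fold_batch(a) for a in arrays)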
Example #4
    def _train_nstep(self):
        batch_size = (self.num_envs * self.nsteps)
        num_updates = self.total_steps // batch_size
        s = 0
        rolling = RunningMeanStd(shape=())
        self.state_rolling = rolling_obs(shape=(), lastFrame=False)
        self.init_state_obs(128 * 50)
        self.runner.states = self.env.reset()
        forward_filter = RewardForwardFilter(self.gamma)

        # main loop
        start = time.time()
        for t in range(1, num_updates + 1):
            states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, old_policies, dones = self.runner.run()
            # update state normalisation statistics
            self.runner.state_mean, self.runner.state_std = self.state_rolling.update(next_states)

            policy, extr_last_values, intr_last_values = self.model.forward(
                next_states[-1])
            int_rff = np.array([
                forward_filter.update(intr_rewards[i])
                for i in range(len(intr_rewards))
            ])
            R_intr_mean, R_intr_std = rolling.update(int_rff.ravel())
            intr_rewards /= R_intr_std

            Adv_extr = self.GAE(extr_rewards,
                                values_extr,
                                extr_last_values,
                                dones,
                                gamma=0.999,
                                lambda_=self.lambda_)
            Adv_intr = self.GAE(
                intr_rewards,
                values_intr,
                intr_last_values,
                np.zeros_like(dones),
                gamma=0.99,
                lambda_=self.lambda_)  # non episodic intr reward signal
            R_extr = Adv_extr + values_extr
            R_intr = Adv_intr + values_intr
            total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

            # perform minibatch gradient descent for K epochs
            l = 0
            idxs = np.arange(len(states))
            for epoch in range(self.num_epochs):
                mini_batch_size = self.nsteps // self.num_minibatches
                np.random.shuffle(idxs)
                for batch in range(0, len(states), mini_batch_size):
                    batch_idxs = idxs[batch:batch + mini_batch_size]
                    # stack all states, next_states, actions and Rs across all workers into a single batch
                    mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = fold_batch(states[batch_idxs]), fold_batch(next_states[batch_idxs]), \
                                                    fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                                                    fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])

                    #mb_nextstates = mb_nextstates[np.where(np.random.uniform(size=(batch_size)) < self.pred_prob)]

                    mean, std = self.runner.state_mean, self.runner.state_std
                    l += self.model.backprop(mb_states, mb_nextstates,
                                             mb_Rextr, mb_Rintr, mb_Adv,
                                             mb_actions, mb_old_policies, mean,
                                             std)

            l /= (self.num_epochs * self.num_minibatches)

            if self.render_freq > 0 and t % (
                (self.validate_freq // batch_size) * self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % (self.validate_freq //
                                               batch_size) == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
                s += 1
                self.saver.save(self.sess,
                                str(self.model_dir + '/' + str(s) + ".ckpt"))
                print('saved model')
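The intrinsic-reward normalisation in the loop above follows the Random Network Distillation (RND) recipe: RewardForwardFilter keeps a discounted running sum of intrinsic rewards per environment, and the standard deviation of those sums (tracked by RunningMeanStd) rescales the raw intrinsic rewards. A sketch of such a filter, assuming it mirrors the reference RND implementation, is:

class RewardForwardFilter:
    # Discounted running sum of intrinsic rewards, one entry per environment.
    # Sketch assuming the reference RND behaviour; the helper used here may differ.
    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems

Dividing intr_rewards by the standard deviation of these filtered returns keeps the intrinsic reward scale roughly constant over training.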
Example #5
 def __init__(self, shape=(), lastFrame=False):
     self.rolling = RunningMeanStd(shape=shape)
     self.lastFrame = lastFrame
Example #6
    def __init__(self,
                 envs,
                 model,
                 val_envs,
                 train_mode='nstep',
                 log_dir='logs/',
                 model_dir='models/',
                 total_steps=1000000,
                 nsteps=5,
                 gamma_extr=0.999,
                 gamma_intr=0.99,
                 lambda_=0.95,
                 init_obs_steps=600,
                 num_epochs=4,
                 num_minibatches=4,
                 validate_freq=1000000.0,
                 save_freq=0,
                 render_freq=0,
                 num_val_episodes=50,
                 max_val_steps=10000,
                 log_scalars=True):

        super().__init__(envs,
                         model,
                         val_envs,
                         train_mode=train_mode,
                         log_dir=log_dir,
                         model_dir=model_dir,
                         total_steps=total_steps,
                         nsteps=nsteps,
                         gamma=gamma_extr,
                         lambda_=lambda_,
                         validate_freq=validate_freq,
                         save_freq=save_freq,
                         render_freq=render_freq,
                         update_target_freq=0,
                         num_val_episodes=num_val_episodes,
                         max_val_steps=max_val_steps,
                         log_scalars=log_scalars)

        self.gamma_intr = gamma_intr
        self.num_epochs = num_epochs
        self.num_minibatches = num_minibatches
        self.pred_prob = 1 / (self.num_envs / 32.0)
        self.state_obs = RunningMeanStd()
        self.forward_filter = RewardForwardFilter(gamma_intr)
        self.intr_rolling = RunningMeanStd()
        self.init_obs_steps = init_obs_steps

        hyper_paras = {
            'learning_rate': model.lr,
            'grad_clip': model.grad_clip,
            'nsteps': self.nsteps,
            'num_workers': self.num_envs,
            'total_steps': self.total_steps,
            'entropy_coefficient': 0.001,
            'value_coefficient': 1.0,
            'intrinsic_value_coefficient': model.intr_coeff,
            'extrinsic_value_coefficient': model.extr_coeff,
            'init_obs_steps': init_obs_steps,
            'gamma_intrinsic': self.gamma_intr,
            'gamma_extrinsic': self.gamma,
            'lambda': self.lambda_,
            'predictor_dropout_probability': self.pred_prob
        }

        if log_scalars:
            filename = log_dir + '/hyperparameters.txt'
            self.save_hyperparameters(filename, **hyper_paras)
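The hyperparameter logging above presumably just serialises the dictionary to a text file. A minimal sketch of what save_hyperparameters might do (an assumption, not the repository's actual implementation):

def save_hyperparameters(filename, **hyperparameters):
    # write each hyperparameter as a "name: value" line
    with open(filename, 'w') as f:
        for name, value in hyperparameters.items():
            f.write('{}: {}\n'.format(name, value))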
Example #7
class RNDTrainer(SyncMultiEnvTrainer):
    def __init__(self,
                 envs,
                 model,
                 val_envs,
                 train_mode='nstep',
                 log_dir='logs/',
                 model_dir='models/',
                 total_steps=1000000,
                 nsteps=5,
                 gamma_extr=0.999,
                 gamma_intr=0.99,
                 lambda_=0.95,
                 init_obs_steps=600,
                 num_epochs=4,
                 num_minibatches=4,
                 validate_freq=1000000.0,
                 save_freq=0,
                 render_freq=0,
                 num_val_episodes=50,
                 max_val_steps=10000,
                 log_scalars=True):

        super().__init__(envs,
                         model,
                         val_envs,
                         train_mode=train_mode,
                         log_dir=log_dir,
                         model_dir=model_dir,
                         total_steps=total_steps,
                         nsteps=nsteps,
                         gamma=gamma_extr,
                         lambda_=lambda_,
                         validate_freq=validate_freq,
                         save_freq=save_freq,
                         render_freq=render_freq,
                         update_target_freq=0,
                         num_val_episodes=num_val_episodes,
                         max_val_steps=max_val_steps,
                         log_scalars=log_scalars)

        self.gamma_intr = gamma_intr
        self.num_epochs = num_epochs
        self.num_minibatches = num_minibatches
        self.pred_prob = 1 / (self.num_envs / 32.0)
        self.state_obs = RunningMeanStd()
        self.forward_filter = RewardForwardFilter(gamma_intr)
        self.intr_rolling = RunningMeanStd()
        self.init_obs_steps = init_obs_steps

        hyper_paras = {
            'learning_rate': model.lr,
            'grad_clip': model.grad_clip,
            'nsteps': self.nsteps,
            'num_workers': self.num_envs,
            'total_steps': self.total_steps,
            'entropy_coefficient': 0.001,
            'value_coefficient': 1.0,
            'intrinsic_value_coefficient': model.intr_coeff,
            'extrinsic_value_coefficient': model.extr_coeff,
            'init_obs_steps': init_obs_steps,
            'gamma_intrinsic': self.gamma_intr,
            'gamma_extrinsic': self.gamma,
            'lambda': self.lambda_,
            'predictor_dropout_probability': self.pred_prob
        }

        if log_scalars:
            filename = log_dir + '/hyperparameters.txt'
            self.save_hyperparameters(filename, **hyper_paras)

    def init_state_obs(self, num_steps):
        states = 0
        for i in range(num_steps):
            rand_actions = np.random.randint(0,
                                             self.model.action_size,
                                             size=self.num_envs)
            next_states, rewards, dones, infos = self.env.step(rand_actions)
            # [num_envs, channels, height, width] for convolutions, assume frame stack
            next_states = next_states[:, -1] if len(next_states.shape) == 4 else next_states
            states += next_states
        return states / num_steps

    def _train_nstep(self):
        # stats for normalising states
        self.state_mean, self.state_std = self.state_obs.update(
            self.init_state_obs(self.init_obs_steps))
        self.states = self.env.reset()  # reset to state s_0

        batch_size = self.num_envs * self.nsteps
        num_updates = self.total_steps // batch_size
        s = 0
        mini_batch_size = self.nsteps // self.num_minibatches
        start = time.time()
        # main loop
        for t in range(1, num_updates + 1):
            states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, last_values_extr, last_values_intr, old_policies, dones = self.rollout()
            # update state normalisation statistics
            self.state_mean, self.state_std = self.state_obs.update(next_states)
            mean, std = self.state_mean, self.state_std

            int_rff = np.array([
                self.forward_filter.update(intr_rewards[i])
                for i in range(len(intr_rewards))
            ])
            R_intr_mean, R_intr_std = self.intr_rolling.update(
                int_rff.ravel())  # normalise intrinsic rewards
            intr_rewards /= R_intr_std

            Adv_extr = self.GAE(extr_rewards,
                                values_extr,
                                last_values_extr,
                                dones,
                                gamma=self.gamma,
                                lambda_=self.lambda_)
            Adv_intr = self.GAE(intr_rewards,
                                values_intr,
                                last_values_intr,
                                dones,
                                gamma=self.gamma_intr,
                                lambda_=self.lambda_)
            Re = Adv_extr + values_extr
            Ri = Adv_intr + values_intr
            total_Adv = Adv_extr + Adv_intr
            l = 0

            # perform minibatch gradient descent for K epochs
            idxs = np.arange(len(states))
            for epoch in range(self.num_epochs):
                np.random.shuffle(idxs)
                for batch in range(0, len(states), mini_batch_size):
                    batch_idxs = idxs[batch:batch + mini_batch_size]
                    # stack all states, actions and Rs across all workers into a single batch
                    mb_states, mb_nextstates, mb_actions, mb_Re, mb_Ri, mb_Adv, mb_old_policies = fold_many(states[batch_idxs], next_states[batch_idxs], \
                                                                                                                 actions[batch_idxs], Re[batch_idxs], Ri[batch_idxs], \
                                                                                                                 total_Adv[batch_idxs], old_policies[batch_idxs])

                    mb_nextstates = mb_nextstates[np.where(
                        np.random.uniform(
                            size=(mini_batch_size)) < self.pred_prob)]
                    l += self.model.backprop(mb_states.copy(),
                                             mb_nextstates.copy(),
                                             mb_Re.copy(), mb_Ri.copy(),
                                             mb_Adv.copy(), mb_actions.copy(),
                                             mb_old_policies.copy(),
                                             mean.copy(), std.copy())

            l /= self.num_epochs

            if self.render_freq > 0 and t % (
                (self.validate_freq // batch_size) * self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % (self.validate_freq //
                                               batch_size) == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
                s += 1
                self.save(s)
                print('saved model')

    def get_action(self, states):
        policies, values_extr, values_intr = self.model.evaluate(states)
        actions = fastsample(policies)
        return actions

    def rollout(self):
        rollout = []
        for t in range(self.nsteps):
            policies, values_extr, values_intr = self.model.evaluate(
                self.states)
            actions = fastsample(policies)
            next_states, extr_rewards, dones, infos = self.env.step(actions)

            # [num_envs, channels, height, width] for convolutions
            next_states__ = next_states[:, -1:] if len(next_states.shape) == 4 else next_states
            intr_rewards = self.model.intrinsic_reward(next_states__,
                                                       self.state_mean,
                                                       self.state_std)

            rollout.append(
                (self.states, next_states__, actions, extr_rewards,
                 intr_rewards, values_extr, values_intr, policies, dones))
            self.states = next_states

        states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones = stack_many(
            *zip(*rollout))
        last_policy, last_values_extr, last_values_intr = self.model.evaluate(self.states)
        return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, last_values_extr, last_values_intr, policies, dones
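Both advantage estimates in _train_nstep come from self.GAE(rewards, values, last_values, dones, gamma, lambda_) applied to rollouts shaped [nsteps, num_envs]. A sketch of Generalised Advantage Estimation matching that call signature (an assumption about the trainer's own method) is:

import numpy as np

def GAE(rewards, values, last_values, dones, gamma=0.99, lambda_=0.95):
    # rewards, values, dones: [T, num_envs]; last_values: [num_envs]
    T = len(rewards)
    advantages = np.zeros_like(rewards)
    lastgaelam = 0
    for t in reversed(range(T)):
        next_values = last_values if t == T - 1 else values[t + 1]
        non_terminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_values * non_terminal - values[t]
        lastgaelam = delta + gamma * lambda_ * non_terminal * lastgaelam
        advantages[t] = lastgaelam
    return advantages

The value-loss targets are then recovered as return = advantage + value, exactly as in the loop above (Re = Adv_extr + values_extr, Ri = Adv_intr + values_intr).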
Example #8
    def _train_nstep(self):
        start = time.time()
        num_updates = self.total_steps // (self.num_envs * self.nsteps)
        alpha_step = 1 / num_updates
        s = 0
        rolling = RunningMeanStd(shape=())
        self.state_rolling = rolling_obs(shape=())
        self.init_state_obs(129)
        #self.runner.state_mean, self.runner.state_std = self.state_rolling.mean, np.sqrt(self.state_rolling.var)
        self.runner.states = self.env.reset()
        forward_filter = RewardForwardFilter(self.gamma)

        # main loop
        for t in range(1, num_updates + 1):
            states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, old_policies, dones = self.runner.run()
            policy, extr_last_values, intr_last_values = self.model.forward(
                next_states[-1])
            int_rff = np.array([
                forward_filter.update(intr_rewards[i])
                for i in range(len(intr_rewards))
            ])
            #R_intr_mean, R_intr_std = rolling.update(self.discount(intr_rewards, self.gamma).ravel().mean()) #
            rolling.update(int_rff.ravel())
            R_intr_std = np.sqrt(rolling.var)
            intr_rewards /= R_intr_std
            #print('intr reward', intr_rewards)

            forward_loss = self.forward_model.backprop(
                states[0], fold_batch(next_states), fold_batch(actions),
                fold_batch(extr_rewards), self.nsteps)

            Adv_extr = self.GAE(extr_rewards,
                                values_extr,
                                extr_last_values,
                                dones,
                                gamma=0.999,
                                lambda_=self.lambda_)
            Adv_intr = self.GAE(
                intr_rewards,
                values_intr,
                intr_last_values,
                np.zeros_like(dones),
                gamma=0.99,
                lambda_=self.lambda_)  # non episodic intr reward signal
            R_extr = Adv_extr + values_extr
            R_intr = Adv_intr + values_intr
            total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

            #self.runner.state_mean, self.runner.state_std = state_rolling.update(fold_batch(next_states)[:,:,:,-1:]) # update state normalisation statistics
            self.runner.state_mean, self.runner.state_std = self.state_rolling.update(
                next_states)  # update state normalisation statistics

            # perform minibatch gradient descent for K epochs
            l = 0
            idxs = np.arange(len(states))
            for epoch in range(self.num_epochs):
                batch_size = self.nsteps // self.num_minibatches
                np.random.shuffle(idxs)
                for batch in range(0, len(states), batch_size):
                    batch_idxs = idxs[batch:batch + batch_size]
                    # stack all states, next_states, actions and Rs across all workers into a single batch
                    mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = fold_batch(states[batch_idxs]), fold_batch(next_states[batch_idxs]), \
                                                    fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                                                    fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])

                    mb_nextstates = mb_nextstates[np.where(
                        np.random.uniform(
                            size=(batch_size)) < self.pred_prob)][:, :, :, -1:]
                    #mb_nextstates = (mb_nextstates  - self.runner.state_mean[np.newaxis,:,:,np.newaxis]) / self.runner.state_std[np.newaxis,:,:,np.newaxis]
                    mean, std = self.runner.state_mean, self.runner.state_std
                    l += self.model.backprop(mb_states, mb_nextstates,
                                             mb_Rextr, mb_Rintr, mb_Adv,
                                             mb_actions, mb_old_policies,
                                             self.alpha, mean, std)

            l /= (self.num_epochs * self.num_minibatches)

            # Imagined future rollout

            hidden = self.forward_model.get_initial_hidden(self.num_envs)
            obs = next_states[-1]
            encoded_last_state = self.forward_model.encode_state(
                next_states[-1])  # o_t -> s_t
            actions = [
                np.random.choice(policy.shape[1], p=policy[i])
                for i in range(policy.shape[0])
            ]
            imagined_rollout = []
            with tf.variable_scope('forward_model/latent-space-rnn',
                                   reuse=tf.AUTO_REUSE):
                for i in range(self.nsteps):
                    next_obs, extr_rewards, encoded_last_state, hidden = self.forward_model.predict_next(
                        encoded_last_state, hidden, actions)
                    #print('imagined obs', next_obs.shape)
                    intr_rewards = self.model.intrinsic_reward(
                        next_obs[..., -1:], self.runner.state_mean,
                        self.runner.state_std)
                    policies, extr_values, intr_values = self.model.forward(
                        obs)
                    actions = [
                        np.random.choice(policies.shape[1], p=policies[i])
                        for i in range(policies.shape[0])
                    ]
                    imagined_rollout.append([
                        obs, next_obs, actions, extr_rewards[:, 0],
                        intr_rewards, extr_values, intr_values, policies
                    ])
                    obs = next_obs

            obs, next_obs, actions, extr_rewards, intr_rewards, extr_values, intr_values, old_policies = stack_many(*zip(*imagined_rollout))
            #print('imagined obs', obs.shape)
            #print('imagined extr rew', extr_rewards.shape)
            #print('imagined extr_values', extr_values.shape)
            #print('imagined intr_values', intr_values.shape)

            intr_rewards /= R_intr_std

            policies, extr_last_values, intr_last_values = self.model.forward(
                next_obs[-1])
            Adv_extr = self.GAE(extr_rewards,
                                extr_values,
                                extr_last_values,
                                np.zeros_like(dones),
                                gamma=0.999,
                                lambda_=self.lambda_)
            Adv_intr = self.GAE(
                intr_rewards,
                intr_values,
                intr_last_values,
                np.zeros_like(dones),
                gamma=0.99,
                lambda_=self.lambda_)  # non episodic intr reward signal
            R_extr = Adv_extr + extr_values
            R_intr = Adv_intr + intr_values
            total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

            for batch in range(0, len(obs), batch_size):
                batch_idxs = idxs[batch:batch + batch_size]
                # stack all states, next_states, actions and Rs across all workers into a single batch
                mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = fold_batch(obs[batch_idxs]), fold_batch(next_obs[batch_idxs]), \
                                                fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                                                fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])

                mb_nextstates = mb_nextstates[np.where(
                    np.random.uniform(
                        size=(batch_size)) < self.pred_prob)][..., -1:]
                #mb_nextstates = (mb_nextstates  - self.runner.state_mean[np.newaxis,:,:,np.newaxis]) / self.runner.state_std[np.newaxis,:,:,np.newaxis]
                mean, std = self.runner.state_mean, self.runner.state_std
                l += self.model.backprop(mb_states, mb_nextstates, mb_Rextr,
                                         mb_Rintr, mb_Adv, mb_actions,
                                         mb_old_policies, self.alpha, mean,
                                         std)

            if self.render_freq > 0 and t % (self.validate_freq *
                                             self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % self.validate_freq == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % self.save_freq == 0:
                s += 1
                self.saver.save(
                    self.sess,
                    str(self.model_dir + self.current_time + '/' + str(s) +
                        ".ckpt"))
                print('saved model')
Example #9
 def __init__(self, shape=()):
     self.rolling = RunningMeanStd(shape=shape)
Example #10
 def __init__(self, mean=0):
     self.rolling = RunningMeanStd()
Example #11
    def _train_nstep(self):
        batch_size = (self.num_envs * self.nsteps)
        num_updates = self.total_steps // batch_size
        s = 0
        rolling = RunningMeanStd()
        self.state_rolling = rolling_obs(shape=())
        self.init_state_obs(128 * 50)
        self.runner.states = self.env.reset()
        forward_filter = RewardForwardFilter(self.gamma)
        # main loop
        start = time.time()
        for t in range(1, num_updates + 1):
            states, next_states, actions, extr_rewards, intr_rewards, extr_values, intr_values, dones, infos = self.runner.run()
            policy, last_extr_values, last_intr_values = self.model.forward(
                next_states[-1])

            self.runner.state_mean, self.runner.state_std = self.state_rolling.update(
                next_states)  # update state normalisation statistics

            int_rff = np.array([
                forward_filter.update(intr_rewards[i])
                for i in range(len(intr_rewards))
            ])
            R_intr_mean, R_intr_std = rolling.update(int_rff.ravel())
            intr_rewards /= R_intr_std

            if self.return_type == 'GAE':
                R_extr = self.GAE(extr_rewards,
                                  extr_values,
                                  last_extr_values,
                                  dones,
                                  gamma=0.999,
                                  lambda_=self.lambda_) + extr_values
                R_intr = self.GAE(intr_rewards,
                                  intr_values,
                                  last_intr_values,
                                  np.zeros_like(dones),
                                  gamma=0.99,
                                  lambda_=self.lambda_) + intr_values
            else:
                R_extr = self.nstep_return(extr_rewards,
                                           last_extr_values,
                                           dones,
                                           gamma=0.999,
                                           clip=False)
                R_intr = self.nstep_return(
                    intr_rewards,
                    last_intr_values,
                    np.zeros_like(dones),
                    gamma=0.99,
                    clip=False)  # non episodic intr reward signal

            Adv = self.model.extr_coeff * (
                R_extr - extr_values) + self.model.intr_coeff * (R_intr -
                                                                 intr_values)

            # stack all states, next_states, actions and Rs across all workers into a single batch
            states, next_states, actions, R_extr, R_intr, Adv = fold_batch(
                states), fold_batch(next_states), fold_batch(
                    actions), fold_batch(R_extr), fold_batch(
                        R_intr), fold_batch(Adv)

            l = self.model.backprop(states, next_states, R_extr, R_intr, Adv,
                                    actions, self.runner.state_mean,
                                    self.runner.state_std)

            #start= time.time()
            if self.render_freq > 0 and t % (
                (self.validate_freq // batch_size) * self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % (self.validate_freq //
                                               batch_size) == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
                s += 1
                self.saver.save(self.sess,
                                str(self.model_dir + '/' + str(s) + ".ckpt"))
                print('saved model')
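The non-GAE branch above calls self.nstep_return(rewards, last_values, dones, gamma, clip). A sketch of a bootstrapped n-step return with that signature (the clip flag is assumed to clip rewards to [-1, 1]) is:

import numpy as np

def nstep_return(rewards, last_values, dones, gamma=0.99, clip=False):
    # rewards, dones: [T, num_envs]; last_values: [num_envs]
    if clip:
        rewards = np.clip(rewards, -1, 1)
    T = len(rewards)
    R = np.zeros_like(rewards)
    R[-1] = rewards[-1] + gamma * last_values * (1 - dones[-1])
    for t in reversed(range(T - 1)):
        R[t] = rewards[t] + gamma * R[t + 1] * (1 - dones[t])
    return R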
Example #12
    def _train_nstep(self):
        batch_size = self.num_envs * self.nsteps
        num_updates = self.total_steps // batch_size
        s = 0
        rolling = RunningMeanStd()
        self.init_state_obs(50 * 128)
        forward_filter = RewardForwardFilter(0.99)
        self.runner.states = self.env.reset()
        # main loop
        start = time.time()
        for t in range(1, num_updates + 1):
            states, next_states, actions, extr_rewards, intr_rewards, extr_values, intr_values, dones, infos = self.runner.run()
            policy, last_extr_values, last_intr_values = self.model.forward(
                next_states[-1])

            self.runner.state_mean, self.runner.state_std = self.state_rolling.update(
                next_states)  # update state normalisation statistics

            r_intr = np.array([
                forward_filter.update(intr_rewards[i])
                for i in range(len(intr_rewards))
            ])  # update intrinsic return estimate
            #r_intr = self.nstep_return(intr_rewards, last_intr_values, np.zeros_like(dones))
            R_intr_mean, R_intr_std = rolling.update(r_intr.ravel())
            intr_rewards /= R_intr_std  # normalise intr rewards
            #print('intr_reward', intr_rewards)

            R_extr = self.GAE(extr_rewards,
                              extr_values,
                              last_extr_values,
                              dones,
                              gamma=0.999,
                              lambda_=self.lambda_) + extr_values
            R_intr = self.GAE(intr_rewards,
                              intr_values,
                              last_intr_values,
                              np.zeros_like(dones),
                              gamma=0.99,
                              lambda_=self.lambda_) + intr_values
            #R_mean, R_std = rolling.update(R_intr.ravel())

            Adv = self.model.extr_coeff * (
                R_extr - extr_values) + self.model.intr_coeff * (R_intr -
                                                                 intr_values)

            # stack all states, next_states, actions and Rs across all workers into a single batch
            next_states = next_states[..., -1:] if len(
                next_states.shape) == 5 else next_states
            states, next_states, actions, R_extr, R_intr, Adv = fold_batch(
                states), fold_batch(next_states), fold_batch(
                    actions), fold_batch(R_extr), fold_batch(
                        R_intr), fold_batch(Adv)

            l = self.model.backprop(states, next_states, R_extr, R_intr, Adv,
                                    actions, self.runner.state_mean,
                                    self.runner.state_std)
            #print('backprop time', time.time() -start)

            #start= time.time()
            if self.render_freq > 0 and t % (
                (self.validate_freq // batch_size) * self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % (self.validate_freq //
                                               batch_size) == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
                s += 1
                self.saver.save(self.sess,
                                str(self.model_dir + '/' + str(s) + ".ckpt"))
                print('saved model')
Example #13
class RANDALTrainer(SyncMultiEnvTrainer):
    def __init__(self,
                 envs,
                 model,
                 val_envs,
                 train_mode='nstep',
                 log_dir='logs/',
                 model_dir='models/',
                 total_steps=1000000,
                 nsteps=5,
                 gamma_extr=0.999,
                 gamma_intr=0.99,
                 lambda_=0.95,
                 init_obs_steps=600,
                 num_epochs=4,
                 num_minibatches=4,
                 validate_freq=1000000.0,
                 save_freq=0,
                 render_freq=0,
                 num_val_episodes=50,
                 max_val_steps=10000,
                 replay_length=2000,
                 norm_pixel_reward=True,
                 log_scalars=True):

        super().__init__(envs,
                         model,
                         val_envs,
                         train_mode=train_mode,
                         log_dir=log_dir,
                         model_dir=model_dir,
                         total_steps=total_steps,
                         nsteps=nsteps,
                         gamma=gamma_extr,
                         lambda_=lambda_,
                         validate_freq=validate_freq,
                         save_freq=save_freq,
                         render_freq=render_freq,
                         update_target_freq=0,
                         num_val_episodes=num_val_episodes,
                         max_val_steps=max_val_steps,
                         log_scalars=log_scalars)

        self.gamma_intr = gamma_intr
        self.num_epochs = num_epochs
        self.num_minibatches = num_minibatches
        self.pred_prob = 1 / (self.num_envs / 32.0)
        self.state_obs = RunningMeanStd()
        self.forward_filter = RewardForwardFilter(gamma_intr)
        self.intr_rolling = RunningMeanStd()
        self.init_obs_steps = init_obs_steps
        self.replay = deque([],
                            maxlen=replay_length)  # replay length per actor
        self.normalise_obs = norm_pixel_reward
        self.replay_length = replay_length

        hyper_paras = {
            'learning_rate': model.lr,
            'grad_clip': model.grad_clip,
            'nsteps': self.nsteps,
            'num_workers': self.num_envs,
            'total_steps': self.total_steps,
            'entropy_coefficient': model.entropy_coeff,
            'value_coefficient': 1.0,
            'intrinsic_value_coefficient': model.intr_coeff,
            'extrinsic_value_coefficient': model.extr_coeff,
            'init_obs_steps': init_obs_steps,
            'gamma_intrinsic': self.gamma_intr,
            'gamma_extrinsic': self.gamma,
            'lambda': self.lambda_,
            'predictor_dropout_probability': self.pred_prob,
            'replay_length': replay_length,
            'normalise_pixel_reward': norm_pixel_reward,
            'replay_value_coefficient': model.VR,
            'pixel_control_coefficient': model.PC,
            'reward_prediction_coefficient': model.RP
        }

        if log_scalars:
            filename = log_dir + '/hyperparameters.txt'
            self.save_hyperparameters(filename, **hyper_paras)

    def populate_memory(self):
        for t in range(self.replay_length // self.nsteps):
            states, *_ = self.rollout()
            #self.state_mean, self.state_std = self.obs_running.update(fold_batch(states)[...,-1:])
            self.update_minmax(states)

    def update_minmax(self, obs):
        minima = obs.min()
        maxima = obs.max()
        if minima < self.state_min:
            self.state_min = minima
        if maxima > self.state_max:
            self.state_max = maxima

    def norm_obs(self, obs):
        '''normalise pixel intensities using the recorded min and max pixel observations;
           per-pixel normalisation is not used because the expected input is a single greyscale frame
        '''
        return (obs - self.state_min) * (1 / (self.state_max - self.state_min))

    def auxiliary_target(self, pixel_rewards, last_values, dones):
        T = len(pixel_rewards)
        R = np.zeros((T, *last_values.shape))
        dones = dones[:, :, np.newaxis, np.newaxis]
        R[-1] = last_values * (1 - dones[-1])

        for i in reversed(range(T - 1)):
            # restart score if done as BatchEnv automatically resets after end of episode
            R[i] = pixel_rewards[i] + 0.99 * R[i + 1] * (1 - dones[i])

        return R

    def pixel_rewards(self, prev_state, states):
        # states of rank [T, B, channels, 84, 84]
        T = len(states)  # time length
        B = states.shape[1]  # batch size
        pixel_rewards = np.zeros((T, B, 21, 21))
        states = states[:, :, -1, :, :]
        prev_state = prev_state[:, -1, :, :]
        if self.normalise_obs:
            states = self.norm_obs(states)
            #print('states, max', states.max(), 'min', states.min(), 'mean', states.mean())
            prev_state = self.norm_obs(prev_state)

        pixel_rewards[0] = np.abs(states[0] - prev_state).reshape(
            -1, 4, 4, 21, 21).mean(axis=(1, 2))
        for i in range(1, T):
            pixel_rewards[i] = np.abs(states[i] - states[i - 1]).reshape(
                -1, 4, 4, 21, 21).mean(axis=(1, 2))
        return pixel_rewards

    def sample_replay(self):
        # randomly sample two of the n workers without replacement
        workers = np.random.choice(self.num_envs, replace=False, size=2)
        sample_start = np.random.randint(1, len(self.replay) - self.nsteps - 2)
        replay_sample = []
        for i in range(sample_start, sample_start + self.nsteps):
            replay_sample.append(self.replay[i])

        replay_states = np.stack(
            [replay_sample[i][0][workers] for i in range(len(replay_sample))])
        replay_actions = np.stack(
            [replay_sample[i][1][workers] for i in range(len(replay_sample))])
        replay_rewards = np.stack(
            [replay_sample[i][2][workers] for i in range(len(replay_sample))])
        replay_values = np.stack(
            [replay_sample[i][3][workers] for i in range(len(replay_sample))])
        replay_dones = np.stack(
            [replay_sample[i][4][workers] for i in range(len(replay_sample))])
        #print('replay dones shape', replay_dones.shape)
        #print('replay_values shape', replay_values.shape)

        next_state = self.replay[sample_start +
                                 self.nsteps][0][workers]  # get state
        _, replay_last_values_extr, replay_last_values_intr = self.model.evaluate(
            next_state)
        replay_R = self.GAE(replay_rewards,
                            replay_values,
                            replay_last_values_extr,
                            replay_dones,
                            gamma=0.99,
                            lambda_=0.95) + replay_values

        if self.model.pixel_control:
            prev_states = self.replay[sample_start - 1][0][workers]
            Qaux_value = self.model.get_pixel_control(next_state)
            pixel_rewards = self.pixel_rewards(prev_states, replay_states)
            Qaux_target = self.auxiliary_target(pixel_rewards,
                                                np.max(Qaux_value, axis=1),
                                                replay_dones)
        else:
            # produce a dummy Qaux target to avoid writing unnecessary special-case code
            Qaux_target = np.zeros((len(replay_states), 1, 1, 1))

        return replay_states, replay_actions, replay_R, Qaux_target, replay_dones

    def sample_reward(self):
        # worker = np.random.randint(0,self.num_envs) # randomly sample from one of n workers
        replay_rewards = np.array(
            [self.replay[i][2] for i in range(len(self.replay))])
        worker = np.argmax(np.sum(
            replay_rewards, axis=0))  # sample experience from best worker
        nonzero_idxs = np.where(
            np.abs(replay_rewards) > 0)[0]  # idxs where |reward| > 0
        zero_idxs = np.where(replay_rewards == 0)[0]  # idxs where reward == 0

        # if there are no nonzero or no zero idxs, i.e. all rewards have the same sign
        if len(nonzero_idxs) == 0 or len(zero_idxs) == 0:
            idx = np.random.randint(len(replay_rewards))
        elif np.random.uniform() > 0.5:  # sample from zero and nonzero rewards equally
            #print('nonzero')
            idx = np.random.choice(nonzero_idxs)
        else:
            idx = np.random.choice(zero_idxs)

        reward_states = self.replay[idx][0][worker]
        reward = np.array([sign(replay_rewards[idx,
                                               worker])])  # source of error

        return reward_states[None], reward

    def init_state_obs(self, num_steps):
        states = 0
        for i in range(num_steps):
            rand_actions = np.random.randint(0,
                                             self.model.action_size,
                                             size=self.num_envs)
            next_states, rewards, dones, infos = self.env.step(rand_actions)
            # [num_envs, channels, height, width] for convolutions, assume frame stack
            next_states = next_states[:, -1] if len(next_states.shape) == 4 else next_states
            states += next_states
        return states / num_steps

    def _train_nstep(self):
        # stats for normalising states
        self.state_mean, self.state_std = self.state_obs.update(
            self.init_state_obs(self.init_obs_steps))
        self.state_min, self.state_max = 0.0, 0.0
        self.populate_memory()  # populate experience replay with random actions
        self.states = self.env.reset()  # reset to state s_0

        batch_size = self.num_envs * self.nsteps
        num_updates = self.total_steps // batch_size
        s = 0
        mini_batch_size = self.nsteps // self.num_minibatches
        start = time.time()
        # main loop
        for t in range(1, num_updates + 1):
            states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, last_values_extr, last_values_intr, old_policies, dones = self.rollout()
            # update state normalisation statistics
            self.update_minmax(states)
            self.state_mean, self.state_std = self.state_obs.update(
                next_states)
            mean, std = self.state_mean, self.state_std

            replay_states, replay_actions, replay_Re, Qaux_target, replay_dones = self.sample_replay()  # sample experience replay

            int_rff = np.array([
                self.forward_filter.update(intr_rewards[i])
                for i in range(len(intr_rewards))
            ])
            R_intr_mean, R_intr_std = self.intr_rolling.update(
                int_rff.ravel())  # normalise intrinsic rewards
            intr_rewards /= R_intr_std

            Adv_extr = self.GAE(extr_rewards,
                                values_extr,
                                last_values_extr,
                                dones,
                                gamma=self.gamma,
                                lambda_=self.lambda_)
            Adv_intr = self.GAE(intr_rewards,
                                values_intr,
                                last_values_intr,
                                dones,
                                gamma=self.gamma_intr,
                                lambda_=self.lambda_)
            Re = Adv_extr + values_extr
            Ri = Adv_intr + values_intr
            total_Adv = Adv_extr + Adv_intr
            l = 0

            # perform minibatch gradient descent for K epochs
            idxs = np.arange(len(states))
            for epoch in range(self.num_epochs):
                reward_states, sample_rewards = self.sample_reward()  # sample reward from replay memory
                np.random.shuffle(idxs)
                for batch in range(0, len(states), mini_batch_size):
                    batch_idxs = idxs[batch:batch + mini_batch_size]
                    # stack all states, actions and Rs across all workers into a single batch
                    mb_states, mb_nextstates, mb_actions, mb_Re, mb_Ri, mb_Adv, mb_old_policies = fold_many(states[batch_idxs], next_states[batch_idxs], \
                                                                                                                 actions[batch_idxs], Re[batch_idxs], Ri[batch_idxs], \
                                                                                                                 total_Adv[batch_idxs], old_policies[batch_idxs])

                    mb_replay_states, mb_replay_actions, mb_replay_Rextr, mb_Qaux_target = fold_many(replay_states[batch_idxs], replay_actions[batch_idxs], \
                                                                                                                        replay_Re[batch_idxs], Qaux_target[batch_idxs])

                    mb_nextstates = mb_nextstates[np.where(
                        np.random.uniform(
                            size=(mini_batch_size)) < self.pred_prob)]
                    # states, next_states, Re, Ri, Adv, actions, old_policy, reward_states, rewards, Qaux_target, Qaux_actions, replay_states, replay_R, state_mean, state_std
                    l += self.model.backprop(mb_states.copy(),
                                             mb_nextstates.copy(),
                                             mb_Re.copy(), mb_Ri.copy(),
                                             mb_Adv.copy(), mb_actions.copy(),
                                             mb_old_policies.copy(),
                                             reward_states.copy(),
                                             sample_rewards.copy(),
                                             mb_Qaux_target.copy(),
                                             mb_replay_actions.copy(),
                                             mb_replay_states.copy(),
                                             mb_replay_Rextr.copy(),
                                             mean.copy(), std.copy())

            l /= self.num_epochs

            if self.render_freq > 0 and t % (
                (self.validate_freq // batch_size) * self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % (self.validate_freq //
                                               batch_size) == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
                s += 1
                self.save(s)
                print('saved model')

    def get_action(self, states):
        policies, values_extr, values_intr = self.model.evaluate(states)
        actions = fastsample(policies)
        return actions

    def rollout(self):
        rollout = []
        for t in range(self.nsteps):
            policies, values_extr, values_intr = self.model.evaluate(
                self.states)
            actions = fastsample(policies)
            next_states, extr_rewards, dones, infos = self.env.step(actions)

            # [num_envs, channels, height, width] for convolutions
            next_states__ = next_states[:, -1:] if len(next_states.shape) == 4 else next_states
            intr_rewards = self.model.intrinsic_reward(next_states__,
                                                       self.state_mean,
                                                       self.state_std)

            rollout.append(
                (self.states, next_states__, actions, extr_rewards,
                 intr_rewards, values_extr, values_intr, policies, dones))
            self.replay.append((self.states, actions, extr_rewards,
                                values_extr, dones))  # add to replay memory
            self.states = next_states

        states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones = stack_many(
            *zip(*rollout))
        last_policy, last_values_extr, last_values_intr = self.model.evaluate(self.states)
        return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, last_values_extr, last_values_intr, policies, dones
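The rollout methods in examples #7 and #13 also depend on two small utilities: stack_many, which turns per-step lists into arrays with a leading time dimension, and fastsample, which draws one action per environment from a batch of categorical policies. Minimal sketches consistent with how they are called (assumptions about the repository's helpers):

import numpy as np

def stack_many(*lists):
    # each argument is a sequence of nsteps arrays shaped [num_envs, *shape];
    # stack each into [nsteps, num_envs, *shape] and return them all as a tuple
    return tuple(np.stack(seq) for seq in lists)

def fastsample(policies):
    # policies: [num_envs, num_actions], rows summing to 1
    cumulative = np.cumsum(policies, axis=-1)
    u = np.random.uniform(size=(policies.shape[0], 1))
    return np.argmax(cumulative > u, axis=-1)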