class RollingObs(object):
    def __init__(self):
        self.rolling = RunningMeanStd()

    def update(self, x):
        if len(x.shape) == 4:  # assume image obs, channels-first frame stack
            # [time*batch, stack, height, width] -> average over the frame-stack dim -> [time*batch, 1, height, width]
            return self.rolling.update(np.mean(x, axis=1, keepdims=True))
        else:
            return self.rolling.update(x)  # [time*batch, *shape] -> [*shape]
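# RunningMeanStd is used by the wrappers above/below and by the trainers, but is not defined in this
# listing. The class below is a minimal sketch, assuming the Welford-style parallel batch update used
# in OpenAI Baselines-like implementations; the actual class used here may differ (e.g. in epsilon
# handling), but its update() must return (mean, std) to match how the trainers unpack it.
import numpy as np

class RunningMeanStd(object):
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        # x: batch of samples, shape [batch, *shape]
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        M2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, M2 / tot_count, tot_count
        return self.mean, np.sqrt(self.var)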
class rolling_obs(object):
    def __init__(self, shape=()):
        self.rolling = RunningMeanStd(shape=shape)

    def update(self, x):
        if len(x.shape) == 5:  # assume image obs
            # [time, batch, height, width, stack] -> last frame only -> [time*batch, height, width, 1]
            return self.rolling.update(fold_batch(x[..., -1:]))
        else:
            return self.rolling.update(fold_batch(x))  # [time, batch, *shape] -> [*shape]
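# fold_batch, fold_many and stack_many are helpers used throughout these trainers but not shown in
# this listing. The sketches below are assumptions reconstructed from how they are called: fold_batch
# merges the leading time and batch dimensions, fold_many applies the same fold to several arrays at
# once, and stack_many stacks each transposed rollout field (from zip(*rollout)) into its own array.
import numpy as np

def fold_batch(x):
    # [time, batch, *shape] -> [time * batch, *shape]
    t, b = x.shape[:2]
    return x.reshape(t * b, *x.shape[2:])

def fold_many(*arrays):
    # fold a group of rollout arrays that share matching [time, batch] leading dims
    return tuple(fold_batch(np.asarray(a)) for a in arrays)

def stack_many(*zipped_fields):
    # zipped_fields comes from zip(*rollout): one tuple per field, each of length nsteps
    return tuple(np.stack(field) for field in zipped_fields)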
def _train_nstep(self):
    batch_size = self.num_envs * self.nsteps
    num_updates = self.total_steps // batch_size
    s = 0
    rolling = RunningMeanStd(shape=())
    self.state_rolling = rolling_obs(shape=())
    self.init_state_obs(128 * 50)
    self.runner.states = self.env.reset()
    forward_filter = RewardForwardFilter(self.gamma)
    # main loop
    start = time.time()
    for t in range(1, num_updates + 1):
        states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, old_policies, dones = self.runner.run()
        self.runner.state_mean, self.runner.state_std = self.state_rolling.update(next_states)  # update state normalisation statistics
        policy, extr_last_values, intr_last_values = self.model.forward(next_states[-1])

        # scale intrinsic rewards by the std of a running estimate of the discounted intrinsic return
        int_rff = np.array([forward_filter.update(intr_rewards[i]) for i in range(len(intr_rewards))])
        R_intr_mean, R_intr_std = rolling.update(int_rff.ravel())
        intr_rewards /= R_intr_std

        Adv_extr = self.GAE(extr_rewards, values_extr, extr_last_values, dones, gamma=0.999, lambda_=self.lambda_)
        Adv_intr = self.GAE(intr_rewards, values_intr, intr_last_values, np.zeros_like(dones),
                            gamma=0.99, lambda_=self.lambda_)  # non-episodic intrinsic reward signal
        R_extr = Adv_extr + values_extr
        R_intr = Adv_intr + values_intr
        total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

        # perform minibatch gradient descent for K epochs
        l = 0
        idxs = np.arange(len(states))
        mini_batch_size = self.nsteps // self.num_minibatches
        for epoch in range(self.num_epochs):
            np.random.shuffle(idxs)
            for batch in range(0, len(states), mini_batch_size):
                batch_idxs = idxs[batch:batch + mini_batch_size]
                # stack all states, next_states, actions and Rs across all workers into a single batch
                mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = \
                    fold_batch(states[batch_idxs]), fold_batch(next_states[batch_idxs]), \
                    fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                    fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])
                #mb_nextstates = mb_nextstates[np.where(np.random.uniform(size=(batch_size)) < self.pred_prob)]
                mean, std = self.runner.state_mean, self.runner.state_std
                l += self.model.backprop(mb_states, mb_nextstates, mb_Rextr, mb_Rintr, mb_Adv,
                                         mb_actions, mb_old_policies, mean, std)

        l /= (self.num_epochs * self.num_minibatches)

        if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
            s += 1
            self.saver.save(self.sess, str(self.model_dir + '/' + str(s) + ".ckpt"))
            print('saved model')
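# RewardForwardFilter maintains a per-environment running discounted sum of the (non-episodic)
# intrinsic rewards; its running std is what the intrinsic rewards are divided by in the loops
# above and below. A minimal sketch, assuming the form used in RND-style implementations:
class RewardForwardFilter(object):
    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None  # running discounted reward estimate, shape [num_envs]

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems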
class RNDTrainer(SyncMultiEnvTrainer):
    def __init__(self, envs, model, val_envs, train_mode='nstep', log_dir='logs/', model_dir='models/',
                 total_steps=1000000, nsteps=5, gamma_extr=0.999, gamma_intr=0.99, lambda_=0.95,
                 init_obs_steps=600, num_epochs=4, num_minibatches=4, validate_freq=1000000.0,
                 save_freq=0, render_freq=0, num_val_episodes=50, max_val_steps=10000, log_scalars=True):
        super().__init__(envs, model, val_envs, train_mode=train_mode, log_dir=log_dir, model_dir=model_dir,
                         total_steps=total_steps, nsteps=nsteps, gamma=gamma_extr, lambda_=lambda_,
                         validate_freq=validate_freq, save_freq=save_freq, render_freq=render_freq,
                         update_target_freq=0, num_val_episodes=num_val_episodes, max_val_steps=max_val_steps,
                         log_scalars=log_scalars)
        self.gamma_intr = gamma_intr
        self.num_epochs = num_epochs
        self.num_minibatches = num_minibatches
        self.pred_prob = 1 / (self.num_envs / 32.0)
        self.state_obs = RunningMeanStd()
        self.forward_filter = RewardForwardFilter(gamma_intr)
        self.intr_rolling = RunningMeanStd()
        self.init_obs_steps = init_obs_steps

        hyper_params = {
            'learning_rate': model.lr,
            'grad_clip': model.grad_clip,
            'nsteps': self.nsteps,
            'num_workers': self.num_envs,
            'total_steps': self.total_steps,
            'entropy_coefficient': 0.001,
            'value_coefficient': 1.0,
            'intrinsic_value_coefficient': model.intr_coeff,
            'extrinsic_value_coefficient': model.extr_coeff,
            'init_obs_steps': init_obs_steps,
            'gamma_intrinsic': self.gamma_intr,
            'gamma_extrinsic': self.gamma,
            'lambda': self.lambda_,
            'predictor_dropout_probability': self.pred_prob
        }

        if log_scalars:
            filename = log_dir + '/hyperparameters.txt'
            self.save_hyperparameters(filename, **hyper_params)

    def init_state_obs(self, num_steps):
        states = 0
        for i in range(num_steps):
            rand_actions = np.random.randint(0, self.model.action_size, size=self.num_envs)
            next_states, rewards, dones, infos = self.env.step(rand_actions)
            # [num_envs, channels, height, width] for convolutions; assume frame stack, keep last frame only
            next_states = next_states[:, -1] if len(next_states.shape) == 4 else next_states
            states += next_states
        return states / num_steps

    def _train_nstep(self):
        # stats for normalising states
        self.state_mean, self.state_std = self.state_obs.update(self.init_state_obs(self.init_obs_steps))
        self.states = self.env.reset()  # reset to state s_0

        batch_size = self.num_envs * self.nsteps
        num_updates = self.total_steps // batch_size
        s = 0
        mini_batch_size = self.nsteps // self.num_minibatches
        start = time.time()
        # main loop
        for t in range(1, num_updates + 1):
            states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, \
                last_values_extr, last_values_intr, old_policies, dones = self.rollout()

            self.state_mean, self.state_std = self.state_obs.update(next_states)  # update state normalisation statistics
            mean, std = self.state_mean, self.state_std

            # normalise intrinsic rewards by the std of the running discounted intrinsic return
            int_rff = np.array([self.forward_filter.update(intr_rewards[i]) for i in range(len(intr_rewards))])
            R_intr_mean, R_intr_std = self.intr_rolling.update(int_rff.ravel())
            intr_rewards /= R_intr_std

            Adv_extr = self.GAE(extr_rewards, values_extr, last_values_extr, dones, gamma=self.gamma, lambda_=self.lambda_)
            Adv_intr = self.GAE(intr_rewards, values_intr, last_values_intr, dones, gamma=self.gamma_intr, lambda_=self.lambda_)
            Re = Adv_extr + values_extr
            Ri = Adv_intr + values_intr
            total_Adv = Adv_extr + Adv_intr

            # perform minibatch gradient descent for K epochs
            l = 0
            idxs = np.arange(len(states))
            for epoch in range(self.num_epochs):
                np.random.shuffle(idxs)
                for batch in range(0, len(states), mini_batch_size):
                    batch_idxs = idxs[batch:batch + mini_batch_size]
                    # stack all states, actions and Rs across all workers into a single batch
                    mb_states, mb_nextstates, mb_actions, mb_Re, mb_Ri, mb_Adv, mb_old_policies = fold_many(
                        states[batch_idxs], next_states[batch_idxs], actions[batch_idxs],
                        Re[batch_idxs], Ri[batch_idxs], total_Adv[batch_idxs], old_policies[batch_idxs])

                    # only train the RND predictor on a random subset of observations; pred_prob keeps the
                    # effective predictor batch size roughly constant as the number of workers grows
                    mb_nextstates = mb_nextstates[np.where(np.random.uniform(size=(mini_batch_size)) < self.pred_prob)]

                    l += self.model.backprop(mb_states.copy(), mb_nextstates.copy(), mb_Re.copy(), mb_Ri.copy(),
                                             mb_Adv.copy(), mb_actions.copy(), mb_old_policies.copy(),
                                             mean.copy(), std.copy())

            l /= self.num_epochs

            if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
                s += 1
                self.save(s)
                print('saved model')

    def get_action(self, states):
        policies, values_extr, values_intr = self.model.evaluate(states)
        actions = fastsample(policies)
        return actions

    def rollout(self):
        rollout = []
        for t in range(self.nsteps):
            policies, values_extr, values_intr = self.model.evaluate(self.states)
            actions = fastsample(policies)
            next_states, extr_rewards, dones, infos = self.env.step(actions)

            # [num_envs, channels, height, width] for convolutions; intrinsic reward uses the latest frame only
            next_states__ = next_states[:, -1:] if len(next_states.shape) == 4 else next_states
            intr_rewards = self.model.intrinsic_reward(next_states__, self.state_mean, self.state_std)

            rollout.append((self.states, next_states__, actions, extr_rewards, intr_rewards,
                            values_extr, values_intr, policies, dones))
            self.states = next_states

        states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones = stack_many(*zip(*rollout))
        last_policy, last_values_extr, last_values_intr = self.model.evaluate(self.states)
        return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, \
            last_values_extr, last_values_intr, policies, dones
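# self.GAE is presumably provided by the SyncMultiEnvTrainer base class and is not shown in this
# listing. A minimal sketch of generalised advantage estimation consistent with the calls above
# (rewards/values of shape [nsteps, num_envs]; dones masks bootstrapping, so passing a zero dones
# array makes the intrinsic head non-episodic):
import numpy as np

def GAE(rewards, values, last_values, dones, gamma=0.99, lambda_=0.95):
    T = len(rewards)
    advantages = np.zeros_like(rewards)
    lastgaelam = 0
    for t in reversed(range(T)):
        next_values = last_values if t == T - 1 else values[t + 1]
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_values * nonterminal - values[t]
        lastgaelam = delta + gamma * lambda_ * nonterminal * lastgaelam
        advantages[t] = lastgaelam
    return advantages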
def _train_nstep(self):
    start = time.time()
    num_updates = self.total_steps // (self.num_envs * self.nsteps)
    alpha_step = 1 / num_updates
    s = 0
    rolling = RunningMeanStd(shape=())
    self.state_rolling = rolling_obs(shape=())
    self.init_state_obs(129)
    #self.runner.state_mean, self.runner.state_std = self.state_rolling.mean, np.sqrt(self.state_rolling.var)
    self.runner.states = self.env.reset()
    forward_filter = RewardForwardFilter(self.gamma)
    # main loop
    for t in range(1, num_updates + 1):
        states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, old_policies, dones = self.runner.run()
        policy, extr_last_values, intr_last_values = self.model.forward(next_states[-1])

        int_rff = np.array([forward_filter.update(intr_rewards[i]) for i in range(len(intr_rewards))])
        #R_intr_mean, R_intr_std = rolling.update(self.discount(intr_rewards, self.gamma).ravel().mean())
        rolling.update(int_rff.ravel())
        R_intr_std = np.sqrt(rolling.var)
        intr_rewards /= R_intr_std

        forward_loss = self.forward_model.backprop(states[0], fold_batch(next_states), fold_batch(actions),
                                                   fold_batch(extr_rewards), self.nsteps)

        Adv_extr = self.GAE(extr_rewards, values_extr, extr_last_values, dones, gamma=0.999, lambda_=self.lambda_)
        Adv_intr = self.GAE(intr_rewards, values_intr, intr_last_values, np.zeros_like(dones),
                            gamma=0.99, lambda_=self.lambda_)  # non-episodic intrinsic reward signal
        R_extr = Adv_extr + values_extr
        R_intr = Adv_intr + values_intr
        total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

        self.runner.state_mean, self.runner.state_std = self.state_rolling.update(next_states)  # update state normalisation statistics

        # perform minibatch gradient descent for K epochs
        l = 0
        idxs = np.arange(len(states))
        batch_size = self.nsteps // self.num_minibatches
        for epoch in range(self.num_epochs):
            np.random.shuffle(idxs)
            for batch in range(0, len(states), batch_size):
                batch_idxs = idxs[batch:batch + batch_size]
                # stack all states, next_states, actions and Rs across all workers into a single batch
                mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = \
                    fold_batch(states[batch_idxs]), fold_batch(next_states[batch_idxs]), \
                    fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                    fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])

                mb_nextstates = mb_nextstates[np.where(np.random.uniform(size=(batch_size)) < self.pred_prob)][:, :, :, -1:]
                #mb_nextstates = (mb_nextstates - self.runner.state_mean[np.newaxis,:,:,np.newaxis]) / self.runner.state_std[np.newaxis,:,:,np.newaxis]
                mean, std = self.runner.state_mean, self.runner.state_std
                l += self.model.backprop(mb_states, mb_nextstates, mb_Rextr, mb_Rintr, mb_Adv, mb_actions,
                                         mb_old_policies, self.alpha, mean, std)

        l /= (self.num_epochs * self.num_minibatches)

        # imagined future rollout in the latent space of the forward model
        hidden = self.forward_model.get_initial_hidden(self.num_envs)
        obs = next_states[-1]
        encoded_last_state = self.forward_model.encode_state(next_states[-1])  # o_t -> s_t
        actions = [np.random.choice(policy.shape[1], p=policy[i]) for i in range(policy.shape[0])]
        imagined_rollout = []
        with tf.variable_scope('forward_model/latent-space-rnn', reuse=tf.AUTO_REUSE):
            for i in range(self.nsteps):
                next_obs, extr_rewards, encoded_last_state, hidden = self.forward_model.predict_next(encoded_last_state, hidden, actions)
                intr_rewards = self.model.intrinsic_reward(next_obs[..., -1:], self.runner.state_mean, self.runner.state_std)
                policies, extr_values, intr_values = self.model.forward(obs)
                actions = [np.random.choice(policies.shape[1], p=policies[i]) for i in range(policies.shape[0])]
                imagined_rollout.append([obs, next_obs, actions, extr_rewards[:, 0], intr_rewards,
                                         extr_values, intr_values, policies])
                obs = next_obs

        obs, next_obs, actions, extr_rewards, intr_rewards, extr_values, intr_values, old_policies = stack_many(*zip(*imagined_rollout))

        intr_rewards /= R_intr_std
        policies, extr_last_values, intr_last_values = self.model.forward(next_obs[-1])
        Adv_extr = self.GAE(extr_rewards, extr_values, extr_last_values, np.zeros_like(dones), gamma=0.999, lambda_=self.lambda_)
        Adv_intr = self.GAE(intr_rewards, intr_values, intr_last_values, np.zeros_like(dones),
                            gamma=0.99, lambda_=self.lambda_)  # non-episodic intrinsic reward signal
        R_extr = Adv_extr + extr_values
        R_intr = Adv_intr + intr_values
        total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

        for batch in range(0, len(obs), batch_size):
            batch_idxs = idxs[batch:batch + batch_size]
            # stack all states, next_states, actions and Rs across all workers into a single batch
            mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = \
                fold_batch(obs[batch_idxs]), fold_batch(next_obs[batch_idxs]), \
                fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])

            mb_nextstates = mb_nextstates[np.where(np.random.uniform(size=(batch_size)) < self.pred_prob)][..., -1:]
            #mb_nextstates = (mb_nextstates - self.runner.state_mean[np.newaxis,:,:,np.newaxis]) / self.runner.state_std[np.newaxis,:,:,np.newaxis]
            mean, std = self.runner.state_mean, self.runner.state_std
            l += self.model.backprop(mb_states, mb_nextstates, mb_Rextr, mb_Rintr, mb_Adv, mb_actions,
                                     mb_old_policies, self.alpha, mean, std)

        if self.render_freq > 0 and t % (self.validate_freq * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % self.validate_freq == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % self.save_freq == 0:
            s += 1
            self.saver.save(self.sess, str(self.model_dir + self.current_time + '/' + str(s) + ".ckpt"))
            print('saved model')
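# The imagined rollout above samples actions with a per-environment np.random.choice loop, while the
# rollouts in RNDTrainer above (and RANDALTrainer below) call fastsample, which is not defined in this
# listing. A sketch of a vectorised categorical sampler consistent with that usage, assuming policies
# of shape [batch, num_actions]:
import numpy as np

def fastsample(policies):
    cumulative = np.cumsum(policies, axis=-1)             # per-environment CDF over actions
    u = np.random.uniform(size=(policies.shape[0], 1))    # one uniform draw per environment
    return np.argmax(cumulative > u, axis=-1)             # first action whose CDF exceeds the draw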
def _train_nstep(self):
    batch_size = self.num_envs * self.nsteps
    num_updates = self.total_steps // batch_size
    s = 0
    rolling = RunningMeanStd()
    self.state_rolling = rolling_obs(shape=())
    self.init_state_obs(128 * 50)
    self.runner.states = self.env.reset()
    forward_filter = RewardForwardFilter(self.gamma)
    # main loop
    start = time.time()
    for t in range(1, num_updates + 1):
        states, next_states, actions, extr_rewards, intr_rewards, extr_values, intr_values, dones, infos = self.runner.run()
        policy, last_extr_values, last_intr_values = self.model.forward(next_states[-1])
        self.runner.state_mean, self.runner.state_std = self.state_rolling.update(next_states)  # update state normalisation statistics

        int_rff = np.array([forward_filter.update(intr_rewards[i]) for i in range(len(intr_rewards))])
        R_intr_mean, R_intr_std = rolling.update(int_rff.ravel())
        intr_rewards /= R_intr_std

        if self.return_type == 'GAE':
            R_extr = self.GAE(extr_rewards, extr_values, last_extr_values, dones,
                              gamma=0.999, lambda_=self.lambda_) + extr_values
            R_intr = self.GAE(intr_rewards, intr_values, last_intr_values, np.zeros_like(dones),
                              gamma=0.99, lambda_=self.lambda_) + intr_values
        else:
            R_extr = self.nstep_return(extr_rewards, last_extr_values, dones, gamma=0.999, clip=False)
            R_intr = self.nstep_return(intr_rewards, last_intr_values, np.zeros_like(dones),
                                       gamma=0.99, clip=False)  # non-episodic intrinsic reward signal

        Adv = self.model.extr_coeff * (R_extr - extr_values) + self.model.intr_coeff * (R_intr - intr_values)

        # stack all states, next_states, actions and Rs across all workers into a single batch
        states, next_states, actions, R_extr, R_intr, Adv = fold_batch(states), fold_batch(next_states), \
            fold_batch(actions), fold_batch(R_extr), fold_batch(R_intr), fold_batch(Adv)

        l = self.model.backprop(states, next_states, R_extr, R_intr, Adv, actions,
                                self.runner.state_mean, self.runner.state_std)

        if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
            s += 1
            self.saver.save(self.sess, str(self.model_dir + '/' + str(s) + ".ckpt"))
            print('saved model')
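# self.nstep_return is the alternative (non-GAE) return used in the branch above, presumably
# inherited from the base trainer. A sketch of a bootstrapped n-step return with done masking,
# matching the call signature; the clip flag is an assumption here and would clip rewards to
# [-1, 1] before discounting:
import numpy as np

def nstep_return(rewards, last_values, dones, gamma=0.99, clip=False):
    if clip:
        rewards = np.clip(rewards, -1, 1)
    T = len(rewards)
    R = np.zeros_like(rewards)
    R[-1] = rewards[-1] + gamma * last_values * (1 - dones[-1])
    for t in reversed(range(T - 1)):
        R[t] = rewards[t] + gamma * R[t + 1] * (1 - dones[t])
    return R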
def _train_nstep(self):
    batch_size = self.num_envs * self.nsteps
    num_updates = self.total_steps // batch_size
    s = 0
    rolling = RunningMeanStd()
    self.init_state_obs(50 * 128)
    forward_filter = RewardForwardFilter(0.99)
    self.runner.states = self.env.reset()
    # main loop
    start = time.time()
    for t in range(1, num_updates + 1):
        states, next_states, actions, extr_rewards, intr_rewards, extr_values, intr_values, dones, infos = self.runner.run()
        policy, last_extr_values, last_intr_values = self.model.forward(next_states[-1])
        self.runner.state_mean, self.runner.state_std = self.state_rolling.update(next_states)  # update state normalisation statistics

        # update running estimate of the discounted intrinsic return
        r_intr = np.array([forward_filter.update(intr_rewards[i]) for i in range(len(intr_rewards))])
        #r_intr = self.nstep_return(intr_rewards, last_intr_values, np.zeros_like(dones))
        R_intr_mean, R_intr_std = rolling.update(r_intr.ravel())
        intr_rewards /= R_intr_std  # normalise intrinsic rewards

        R_extr = self.GAE(extr_rewards, extr_values, last_extr_values, dones,
                          gamma=0.999, lambda_=self.lambda_) + extr_values
        R_intr = self.GAE(intr_rewards, intr_values, last_intr_values, np.zeros_like(dones),
                          gamma=0.99, lambda_=self.lambda_) + intr_values

        Adv = self.model.extr_coeff * (R_extr - extr_values) + self.model.intr_coeff * (R_intr - intr_values)

        # stack all states, next_states, actions and Rs across all workers into a single batch
        next_states = next_states[..., -1:] if len(next_states.shape) == 5 else next_states
        states, next_states, actions, R_extr, R_intr, Adv = fold_batch(states), fold_batch(next_states), \
            fold_batch(actions), fold_batch(R_extr), fold_batch(R_intr), fold_batch(Adv)

        l = self.model.backprop(states, next_states, R_extr, R_intr, Adv, actions,
                                self.runner.state_mean, self.runner.state_std)

        if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
            s += 1
            self.saver.save(self.sess, str(self.model_dir + '/' + str(s) + ".ckpt"))
            print('saved model')
class RANDALTrainer(SyncMultiEnvTrainer):
    def __init__(self, envs, model, val_envs, train_mode='nstep', log_dir='logs/', model_dir='models/',
                 total_steps=1000000, nsteps=5, gamma_extr=0.999, gamma_intr=0.99, lambda_=0.95,
                 init_obs_steps=600, num_epochs=4, num_minibatches=4, validate_freq=1000000.0,
                 save_freq=0, render_freq=0, num_val_episodes=50, max_val_steps=10000,
                 replay_length=2000, norm_pixel_reward=True, log_scalars=True):
        super().__init__(envs, model, val_envs, train_mode=train_mode, log_dir=log_dir, model_dir=model_dir,
                         total_steps=total_steps, nsteps=nsteps, gamma=gamma_extr, lambda_=lambda_,
                         validate_freq=validate_freq, save_freq=save_freq, render_freq=render_freq,
                         update_target_freq=0, num_val_episodes=num_val_episodes, max_val_steps=max_val_steps,
                         log_scalars=log_scalars)
        self.gamma_intr = gamma_intr
        self.num_epochs = num_epochs
        self.num_minibatches = num_minibatches
        self.pred_prob = 1 / (self.num_envs / 32.0)
        self.state_obs = RunningMeanStd()
        self.forward_filter = RewardForwardFilter(gamma_intr)
        self.intr_rolling = RunningMeanStd()
        self.init_obs_steps = init_obs_steps
        self.replay = deque([], maxlen=replay_length)  # replay length per actor
        self.normalise_obs = norm_pixel_reward
        self.replay_length = replay_length

        hyper_params = {
            'learning_rate': model.lr,
            'grad_clip': model.grad_clip,
            'nsteps': self.nsteps,
            'num_workers': self.num_envs,
            'total_steps': self.total_steps,
            'entropy_coefficient': model.entropy_coeff,
            'value_coefficient': 1.0,
            'intrinsic_value_coefficient': model.intr_coeff,
            'extrinsic_value_coefficient': model.extr_coeff,
            'init_obs_steps': init_obs_steps,
            'gamma_intrinsic': self.gamma_intr,
            'gamma_extrinsic': self.gamma,
            'lambda': self.lambda_,
            'predictor_dropout_probability': self.pred_prob,
            'replay_length': replay_length,
            'normalise_pixel_reward': norm_pixel_reward,
            'replay_value_coefficient': model.VR,
            'pixel_control_coefficient': model.PC,
            'reward_prediction_coefficient': model.RP
        }

        if log_scalars:
            filename = log_dir + '/hyperparameters.txt'
            self.save_hyperparameters(filename, **hyper_params)

    def populate_memory(self):
        for t in range(self.replay_length // self.nsteps):
            states, *_ = self.rollout()
            #self.state_mean, self.state_std = self.obs_running.update(fold_batch(states)[...,-1:])
            self.update_minmax(states)

    def update_minmax(self, obs):
        minima = obs.min()
        maxima = obs.max()
        if minima < self.state_min:
            self.state_min = minima
        if maxima > self.state_max:
            self.state_max = maxima

    def norm_obs(self, obs):
        '''normalise pixel intensity changes by recording min and max pixel observations;
           not using per-pixel normalisation because the expected image is a single greyscale frame'''
        return (obs - self.state_min) * (1 / (self.state_max - self.state_min))

    def auxiliary_target(self, pixel_rewards, last_values, dones):
        T = len(pixel_rewards)
        R = np.zeros((T, *last_values.shape))
        dones = dones[:, :, np.newaxis, np.newaxis]
        R[-1] = last_values * (1 - dones[-1])
        for i in reversed(range(T - 1)):
            # restart the return if done, as BatchEnv automatically resets after the end of an episode
            R[i] = pixel_rewards[i] + 0.99 * R[i + 1] * (1 - dones[i])
        return R

    def pixel_rewards(self, prev_state, states):
        # states of rank [T, B, channels, 84, 84]
        T = len(states)  # time length
        B = states.shape[1]  # batch size
        pixel_rewards = np.zeros((T, B, 21, 21))
        states = states[:, :, -1, :, :]
        prev_state = prev_state[:, -1, :, :]
        if self.normalise_obs:
            states = self.norm_obs(states)
            prev_state = self.norm_obs(prev_state)
        # mean absolute pixel change over non-overlapping 4x4 cells -> 21x21 pixel-control reward map
        pixel_rewards[0] = np.abs(states[0] - prev_state).reshape(-1, 21, 4, 21, 4).mean(axis=(2, 4))
        for i in range(1, T):
            pixel_rewards[i] = np.abs(states[i] - states[i - 1]).reshape(-1, 21, 4, 21, 4).mean(axis=(2, 4))
        return pixel_rewards

    def sample_replay(self):
        workers = np.random.choice(self.num_envs, replace=False, size=2)  # randomly sample experience from 2 of the n workers
        sample_start = np.random.randint(1, len(self.replay) - self.nsteps - 2)
        replay_sample = []
        for i in range(sample_start, sample_start + self.nsteps):
            replay_sample.append(self.replay[i])

        replay_states = np.stack([replay_sample[i][0][workers] for i in range(len(replay_sample))])
        replay_actions = np.stack([replay_sample[i][1][workers] for i in range(len(replay_sample))])
        replay_rewards = np.stack([replay_sample[i][2][workers] for i in range(len(replay_sample))])
        replay_values = np.stack([replay_sample[i][3][workers] for i in range(len(replay_sample))])
        replay_dones = np.stack([replay_sample[i][4][workers] for i in range(len(replay_sample))])

        next_state = self.replay[sample_start + self.nsteps][0][workers]  # state following the sampled segment
        _, replay_last_values_extr, replay_last_values_intr = self.model.evaluate(next_state)
        replay_R = self.GAE(replay_rewards, replay_values, replay_last_values_extr, replay_dones,
                            gamma=0.99, lambda_=0.95) + replay_values

        if self.model.pixel_control:
            prev_states = self.replay[sample_start - 1][0][workers]
            Qaux_value = self.model.get_pixel_control(next_state)
            pixel_rewards = self.pixel_rewards(prev_states, replay_states)
            Qaux_target = self.auxiliary_target(pixel_rewards, np.max(Qaux_value, axis=1), replay_dones)
        else:
            Qaux_target = np.zeros((len(replay_states), 1, 1, 1))  # produce fake Qaux to save writing unnecessary code

        return replay_states, replay_actions, replay_R, Qaux_target, replay_dones

    def sample_reward(self):
        # worker = np.random.randint(0, self.num_envs)  # randomly sample from one of n workers
        replay_rewards = np.array([self.replay[i][2] for i in range(len(self.replay))])
        worker = np.argmax(np.sum(replay_rewards, axis=0))  # sample experience from the best worker
        nonzero_idxs = np.where(np.abs(replay_rewards) > 0)[0]  # idxs where |reward| > 0
        zero_idxs = np.where(replay_rewards == 0)[0]  # idxs where reward == 0

        if len(nonzero_idxs) == 0 or len(zero_idxs) == 0:
            # if nonzero or zero idxs do not exist, i.e. rewards are all zero or all nonzero
            idx = np.random.randint(len(replay_rewards))
        elif np.random.uniform() > 0.5:  # sample from zero and nonzero rewards equally
            idx = np.random.choice(nonzero_idxs)
        else:
            idx = np.random.choice(zero_idxs)

        reward_states = self.replay[idx][0][worker]
        reward = np.array([sign(replay_rewards[idx, worker])])  # source of error
        return reward_states[None], reward

    def init_state_obs(self, num_steps):
        states = 0
        for i in range(num_steps):
            rand_actions = np.random.randint(0, self.model.action_size, size=self.num_envs)
            next_states, rewards, dones, infos = self.env.step(rand_actions)
            # [num_envs, channels, height, width] for convolutions; assume frame stack, keep last frame only
            next_states = next_states[:, -1] if len(next_states.shape) == 4 else next_states
            states += next_states
        return states / num_steps

    def _train_nstep(self):
        # stats for normalising states
        self.state_mean, self.state_std = self.state_obs.update(self.init_state_obs(self.init_obs_steps))
        self.state_min, self.state_max = 0.0, 0.0
        self.populate_memory()  # populate experience replay with random actions
        self.states = self.env.reset()  # reset to state s_0

        batch_size = self.num_envs * self.nsteps
        num_updates = self.total_steps // batch_size
        s = 0
        mini_batch_size = self.nsteps // self.num_minibatches
        start = time.time()
        # main loop
        for t in range(1, num_updates + 1):
            states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, \
                last_values_extr, last_values_intr, old_policies, dones = self.rollout()

            # update state normalisation statistics
            self.update_minmax(states)
            self.state_mean, self.state_std = self.state_obs.update(next_states)
            mean, std = self.state_mean, self.state_std

            replay_states, replay_actions, replay_Re, Qaux_target, replay_dones = self.sample_replay()  # sample experience replay

            # normalise intrinsic rewards
            int_rff = np.array([self.forward_filter.update(intr_rewards[i]) for i in range(len(intr_rewards))])
            R_intr_mean, R_intr_std = self.intr_rolling.update(int_rff.ravel())
            intr_rewards /= R_intr_std

            Adv_extr = self.GAE(extr_rewards, values_extr, last_values_extr, dones, gamma=self.gamma, lambda_=self.lambda_)
            Adv_intr = self.GAE(intr_rewards, values_intr, last_values_intr, dones, gamma=self.gamma_intr, lambda_=self.lambda_)
            Re = Adv_extr + values_extr
            Ri = Adv_intr + values_intr
            total_Adv = Adv_extr + Adv_intr

            # perform minibatch gradient descent for K epochs
            l = 0
            idxs = np.arange(len(states))
            for epoch in range(self.num_epochs):
                reward_states, sample_rewards = self.sample_reward()  # sample reward-prediction batch from replay memory
                np.random.shuffle(idxs)
                for batch in range(0, len(states), mini_batch_size):
                    batch_idxs = idxs[batch:batch + mini_batch_size]
                    # stack all states, actions and Rs across all workers into a single batch
                    mb_states, mb_nextstates, mb_actions, mb_Re, mb_Ri, mb_Adv, mb_old_policies = fold_many(
                        states[batch_idxs], next_states[batch_idxs], actions[batch_idxs],
                        Re[batch_idxs], Ri[batch_idxs], total_Adv[batch_idxs], old_policies[batch_idxs])

                    mb_replay_states, mb_replay_actions, mb_replay_Rextr, mb_Qaux_target = fold_many(
                        replay_states[batch_idxs], replay_actions[batch_idxs],
                        replay_Re[batch_idxs], Qaux_target[batch_idxs])

                    # only train the RND predictor on a random subset of observations
                    mb_nextstates = mb_nextstates[np.where(np.random.uniform(size=(mini_batch_size)) < self.pred_prob)]

                    # args: states, next_states, Re, Ri, Adv, actions, old_policy, reward_states, rewards,
                    #       Qaux_target, Qaux_actions, replay_states, replay_R, state_mean, state_std
                    l += self.model.backprop(mb_states.copy(), mb_nextstates.copy(), mb_Re.copy(), mb_Ri.copy(),
                                             mb_Adv.copy(), mb_actions.copy(), mb_old_policies.copy(),
                                             reward_states.copy(), sample_rewards.copy(), mb_Qaux_target.copy(),
                                             mb_replay_actions.copy(), mb_replay_states.copy(), mb_replay_Rextr.copy(),
                                             mean.copy(), std.copy())

            l /= self.num_epochs

            if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
                s += 1
                self.save(s)
                print('saved model')

    def get_action(self, states):
        policies, values_extr, values_intr = self.model.evaluate(states)
        actions = fastsample(policies)
        return actions

    def rollout(self):
        rollout = []
        for t in range(self.nsteps):
            policies, values_extr, values_intr = self.model.evaluate(self.states)
            actions = fastsample(policies)
            next_states, extr_rewards, dones, infos = self.env.step(actions)

            # [num_envs, channels, height, width] for convolutions; intrinsic reward uses the latest frame only
            next_states__ = next_states[:, -1:] if len(next_states.shape) == 4 else next_states
            intr_rewards = self.model.intrinsic_reward(next_states__, self.state_mean, self.state_std)

            rollout.append((self.states, next_states__, actions, extr_rewards, intr_rewards,
                            values_extr, values_intr, policies, dones))
            self.replay.append((self.states, actions, extr_rewards, values_extr, dones))  # add to replay memory
            self.states = next_states

        states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones = stack_many(*zip(*rollout))
        last_policy, last_values_extr, last_values_intr = self.model.evaluate(self.states)
        return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, \
            last_values_extr, last_values_intr, policies, dones
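# model.intrinsic_reward is part of the TF model and is not shown in this listing. The snippet below
# is a rough NumPy sketch of what an RND-style intrinsic reward computes, and is an assumption about
# the model internals: the next observation is normalised with the running mean/std and clipped, then
# passed through a fixed random target network and a trained predictor; the per-sample squared
# prediction error is the intrinsic reward. The two linear "networks" are stand-ins for the real convnets.
import numpy as np

rng = np.random.RandomState(0)
obs_dim, feat_dim = 84 * 84, 128
W_target = rng.randn(obs_dim, feat_dim) / np.sqrt(obs_dim)   # fixed random target network
W_pred = rng.randn(obs_dim, feat_dim) / np.sqrt(obs_dim)     # predictor (trained elsewhere)

def intrinsic_reward(next_obs, state_mean, state_std):
    # next_obs: [batch, 1, 84, 84] single greyscale frame, as passed by rollout() above
    x = (next_obs - state_mean) / (state_std + 1e-8)
    x = np.clip(x, -5, 5).reshape(len(next_obs), -1)
    target_feat = x @ W_target
    pred_feat = x @ W_pred
    return np.mean((pred_feat - target_feat) ** 2, axis=-1)  # one intrinsic reward per environment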