def _train_nstep(self):
    ''' Episodic training loop for synchronous training over multiple environments '''
    start = time.time()
    num_updates = self.total_steps // (self.num_envs * self.nsteps)
    s = 0
    R_std = np.ones((len(self.env)))
    rolling = rolling_stats(R_std)
    # main loop
    for t in range(1, num_updates + 1):
        states, next_states, actions, extr_rewards, intr_rewards, hidden_batch, dones, infos, extr_values, intr_values = self.runner.run()
        R_extr = self.multistep_target(extr_rewards, extr_values, dones, clip=False)
        R_intr = self.multistep_target(intr_rewards, intr_values, np.zeros_like(dones), clip=False)  # intrinsic rewards are non-episodic

        R_mean, R_std = rolling.update(R_intr.mean(axis=0))
        self.runner.R_std = R_std

        # stack all states, next_states, actions and Rs across all workers into a single batch
        states, next_states, actions, R_extr, R_intr = fold_batch(states), fold_batch(next_states), fold_batch(actions), fold_batch(R_extr), fold_batch(R_intr)

        l = self.model.backprop(states, next_states, R_extr, R_intr, actions, hidden_batch[0], dones)

        if self.render_freq > 0 and t % (self.validate_freq * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % self.validate_freq == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % self.save_freq == 0:
            s += 1
            self.saver.save(self.sess, self.model_dir + self.current_time + '/' + str(s) + '.ckpt')
            print('saved model')
def run(self):
    rollout = []
    for t in range(self.num_steps):
        policies, values_extr, values_intr = self.model.forward(self.states)
        actions = [np.random.choice(policies.shape[1], p=policies[i]) for i in range(policies.shape[0])]
        next_states, extr_rewards, dones, infos = self.env.step(actions)
        rollout.append((self.states, next_states, actions, extr_rewards, values_extr, values_intr, policies, dones))
        self.states = next_states

    states, next_states, actions, extr_rewards, values_extr, values_intr, policies, dones = stack_many(zip(*rollout))
    intr_rewards = self.model.intrinsic_reward(fold_batch(states), fold_batch(actions), fold_batch(next_states), self.state_mean, self.state_std)
    intr_rewards = unfold_batch(intr_rewards, self.num_steps, len(self.env))
    return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones
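
# fold_batch, unfold_batch and stack_many are used throughout but not defined in
# this section. A minimal sketch of their assumed behaviour, under the convention
# that rollouts are laid out as [time, batch, *shape]:

# import numpy as np
#
# def fold_batch(x):
#     # [time, batch, *shape] -> [time*batch, *shape]
#     t, b = x.shape[:2]
#     return x.reshape(t * b, *x.shape[2:])
#
# def unfold_batch(x, time, batch):
#     # [time*batch, *shape] -> [time, batch, *shape]
#     return x.reshape(time, batch, *x.shape[1:])
#
# def stack_many(zipped):
#     # tuple of per-field sequences (e.g. from zip(*rollout)) -> one stacked array per field
#     return tuple(np.stack(items) for items in zipped)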
def _train_nstep(self):
    batch_size = self.num_envs * self.nsteps
    start = time.time()
    num_updates = self.total_steps // batch_size
    s = 0
    self.populate_memory()
    # main loop
    for t in range(1, num_updates + 1):
        states, actions, rewards, hidden_init, prev_acts_rewards, dones, last_values = self.rollout()
        R = self.nstep_return(rewards, last_values, dones, clip=False)

        # stack all states, actions and Rs across all workers into a single batch
        prev_acts_rewards, actions, rewards, R = fold_batch(prev_acts_rewards), fold_batch(actions), fold_batch(rewards), fold_batch(R)

        reward_states, sample_rewards = self.sample_reward()
        replay_states, replay_actions, replay_R, Qaux_target, replay_hidden, replay_actsrews, replay_dones = self.sample_replay()

        l = self.model.backprop(states, R, actions, hidden_init, dones, prev_acts_rewards,
                                reward_states, sample_rewards, Qaux_target,
                                replay_actions, replay_states, replay_R, replay_hidden, replay_dones, replay_actsrews)

        if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
            s += 1
            self.saver.save(s)
            print('saved model')
def sample_replay(self):
    workers = np.random.choice(self.num_envs, replace=False, size=2)  # randomly sample the experience of 2 of the n workers
    sample_start = np.random.randint(1, len(self.replay) - self.nsteps - 2)
    replay_sample = []
    for i in range(sample_start, sample_start + self.nsteps):
        replay_sample.append(self.replay[i])

    replay_states = np.stack([replay_sample[i][0][workers] for i in range(len(replay_sample))])
    replay_actions = np.stack([replay_sample[i][1][workers] for i in range(len(replay_sample))])
    replay_rewards = np.stack([replay_sample[i][2][workers] for i in range(len(replay_sample))])
    replay_values = np.stack([replay_sample[i][3][workers] for i in range(len(replay_sample))])
    replay_dones = np.stack([replay_sample[i][4][workers] for i in range(len(replay_sample))])

    next_state = self.replay[sample_start + self.nsteps][0][workers]  # state following the sampled sequence
    _, replay_last_values = self.model.evaluate(next_state)
    replay_R = GAE(replay_rewards, replay_values, replay_last_values, replay_dones, gamma=0.99, lambda_=0.95) + replay_values

    if self.model.pixel_control:
        prev_states = self.replay[sample_start - 1][0][workers]
        Qaux_value = self.model.get_pixel_control(next_state)
        pixel_rewards = self.pixel_rewards(prev_states, replay_states)
        Qaux_target = self.auxiliary_target(pixel_rewards, np.max(Qaux_value, axis=-1), replay_dones)  # max over the action dimension (last axis), matching sample_replay below
    else:
        Qaux_target = np.zeros((len(replay_states), 1, 1, 1))  # dummy Qaux target to avoid writing unnecessary code

    return fold_batch(replay_states), fold_batch(replay_actions), fold_batch(replay_R), fold_batch(Qaux_target), fold_batch(replay_dones)
def update(self, x):
    if self.lastFrame:  # assume image observations; use only the most recent frame of the stack
        return self.rolling.update(fold_batch(x[..., -1:]))  # [time, batch, height, width, stack] -> [height, width, 1]
    else:
        return self.rolling.update(fold_batch(x))  # [time, batch, *shape] -> [*shape]
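
# rolling_stats / RunningMeanStd are assumed to track running first and second
# moments of whatever they are fed, returning (mean, std) from update(). A minimal
# sketch using the standard parallel-moments update, assuming batched input along
# axis 0; the real class may differ in detail:

# import numpy as np
#
# class RunningMeanStd:
#     def __init__(self, shape=()):
#         self.mean = np.zeros(shape, dtype=np.float64)
#         self.var = np.ones(shape, dtype=np.float64)
#         self.count = 1e-4
#
#     def update(self, x):
#         batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
#         delta = batch_mean - self.mean
#         total = self.count + batch_count
#         self.mean = self.mean + delta * batch_count / total
#         m2 = self.var * self.count + batch_var * batch_count + delta**2 * self.count * batch_count / total
#         self.var = m2 / total
#         self.count = total
#         return self.mean, np.sqrt(self.var)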
def _train_nstep(self):
    ''' template for multi-step training loop for synchronous training over multiple environments '''
    start = time.time()
    batch_size = self.num_envs * self.nsteps
    num_updates = self.total_steps // batch_size
    # main loop
    for t in range(self.t, num_updates + 1):
        states, actions, rewards, dones, infos, values, last_values = self.runner.run()

        if self.return_type == 'nstep':
            R = self.nstep_return(rewards, last_values, dones, gamma=self.gamma)
        elif self.return_type == 'GAE':
            R = self.GAE(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) + values
        elif self.return_type == 'lambda':
            R = self.lambda_return(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_, clip=False)

        # stack all states, actions and Rs from all workers into a single batch
        states, actions, R = fold_batch(states), fold_batch(actions), fold_batch(R)
        l = self.model.backprop(states, R, actions)

        if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
            self.s += 1
            self.save(self.s)
            print('saved model')

        if self.target_freq > 0 and t % (self.target_freq // batch_size) == 0:  # update target network (for value based learning e.g. DQN)
            self.update_target()

        self.t += 1
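
# nstep_return and GAE are assumed to implement the standard n-step bootstrapped
# return and Generalised Advantage Estimation over [time, batch] rollouts, with
# bootstrapping masked at episode boundaries (GAE returns the advantage, hence
# "+ values" above to recover the return target). A minimal sketch under those
# assumptions:

# import numpy as np
#
# def nstep_return(rewards, last_values, dones, gamma=0.99, clip=False):
#     if clip:
#         rewards = np.clip(rewards, -1, 1)
#     R = np.zeros_like(rewards)
#     running = last_values
#     for t in reversed(range(len(rewards))):
#         running = rewards[t] + gamma * running * (1 - dones[t])
#         R[t] = running
#     return R
#
# def GAE(rewards, values, last_values, dones, gamma=0.99, lambda_=0.95):
#     T = len(rewards)
#     adv = np.zeros_like(rewards)
#     gae = 0
#     for t in reversed(range(T)):
#         next_values = last_values if t == T - 1 else values[t + 1]
#         delta = rewards[t] + gamma * next_values * (1 - dones[t]) - values[t]
#         gae = delta + gamma * lambda_ * (1 - dones[t]) * gae
#         adv[t] = gae
#     return adv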
def _train_nstep(self):
    batch_size = self.num_envs * self.nsteps
    start = time.time()
    num_updates = self.total_steps // batch_size
    s = 0
    # main loop
    for t in range(1, num_updates + 1):
        states, actions, rewards, hidden_batch, dones, infos, values, last_values = self.runner.run()

        if self.return_type == 'nstep':
            R = self.nstep_return(rewards, last_values, dones, gamma=self.gamma)
        elif self.return_type == 'GAE':
            R = self.GAE(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) + values
        elif self.return_type == 'lambda':
            R = self.lambda_return(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_)

        # stack all states, actions and Rs across all workers into a single batch
        states, actions, R = fold_batch(states), fold_batch(actions), fold_batch(R)
        l = self.model.backprop(states, R, actions, hidden_batch[0], dones)

        if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
            s += 1
            self.saver.save(self.sess, self.model_dir + str(s) + '.ckpt')
            print('saved model')
def train_nstep(self):
    batch_size = self.num_envs * self.nsteps
    num_updates = self.total_steps // batch_size
    # main loop
    start = time.time()
    for t in range(self.t, num_updates + 1):
        states, locs, actions, rewards, dones, infos, values, last_values = self.rollout()

        if self.return_type == 'nstep':
            R = self.nstep_return(rewards, last_values, dones, gamma=self.gamma)
        elif self.return_type == 'GAE':
            R = self.GAE(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) + values
        elif self.return_type == 'lambda':
            R = self.lambda_return(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_, clip=False)

        # stack all states, locs, actions and Rs from all workers into a single batch
        states, locs, actions, R = fold_batch(states), fold_batch(locs), fold_batch(actions), fold_batch(R)
        l = self.model.backprop(states, locs, R, actions)

        if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
            self.validation_summary(t, l, start, False)
            start = time.time()

        if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
            self.s += 1
            self.save(self.s)
            print('saved model')

        if self.target_freq > 0 and t % (self.target_freq // batch_size) == 0:  # update target network (for value based learning e.g. DQN)
            self.update_target()

        self.t += 1
def _train_nstep(self):
    batch_size = self.num_envs * self.nsteps
    num_updates = self.total_steps // batch_size
    s = 0
    self.state_min = 0
    self.state_max = 0
    self.populate_memory()
    # main loop
    start = time.time()
    for t in range(1, num_updates + 1):
        states, actions, rewards, values, dones, infos, last_values = self.runner.run()
        R = self.GAE(rewards, values, last_values, dones, gamma=0.99, lambda_=0.95) + values

        # stack all states, actions and Rs across all workers into a single batch
        states, actions, rewards, R = fold_batch(states), fold_batch(actions), fold_batch(rewards), fold_batch(R)
        self.update_minmax(states)  # update state normalisation statistics

        reward_states, sample_rewards = self.sample_reward()
        replay_states, replay_actions, replay_R, Qaux_target, replay_dones = self.sample_replay()

        l = self.model.backprop(states, R, actions, dones,
                                reward_states, sample_rewards, Qaux_target,
                                replay_actions, replay_states, replay_R, replay_dones)

        if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
            s += 1
            self.saver.save(self.sess, self.model_dir + '/' + str(s) + '.ckpt')
            print('saved model')
def _train_nstep(self):
    num_updates = self.total_steps // (self.num_envs * self.nsteps)
    s = 0
    self.state_mean, self.state_std = self.state_obs.update(self.init_state_obs(10000 // self.num_envs))
    self.states = self.env.reset()
    print(self.state_mean.shape, self.state_std.shape)
    start = time.time()
    # main loop
    batch_size = self.num_envs * self.nsteps
    for t in range(1, num_updates + 1):
        states, next_states, actions, rewards, dones, values = self.rollout()
        _, last_values = self.model.evaluate(next_states[-1])
        R = self.nstep_return(rewards, last_values, dones)
        Adv = R - values

        # stack all states, next_states, actions and Rs across all workers into a single batch
        states, next_states, actions, R, Adv = fold_batch(states), fold_batch(next_states), fold_batch(actions), fold_batch(R), fold_batch(Adv)
        mean, std = self.state_mean, self.state_std
        l = self.model.backprop(states, next_states, R, Adv, actions, mean, std)

        # scale the render check by batch_size to match the validation check below
        if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
            s += 1
            self.saver.save(self.sess, self.model_dir + self.current_time + '/' + str(s) + '.ckpt')
            print('saved model')
def run(self):
    rollout = []
    for t in range(self.num_steps):
        Qsa = self.Q.forward(self.states)
        actions = np.argmax(Qsa, axis=1)
        # epsilon-greedy exploration
        random = np.random.uniform(size=(self.num_envs))
        random_actions = np.random.randint(self.action_size, size=(self.num_envs))
        actions = np.where(random < self.epsilon, random_actions, actions)
        next_states, rewards, dones, infos = self.env.step(actions)
        rollout.append((self.states, actions, rewards, dones, infos))
        self.states = next_states
        self.schedule.step()

    states, actions, rewards, dones, infos = zip(*rollout)
    states, actions, rewards, dones = np.stack(states), np.stack(actions), np.stack(rewards), np.stack(dones)

    # Double DQN targets: select actions with the online network Q(.; theta),
    # evaluate them with the target network Q(.; theta-)
    TargetQsa = unfold_batch(self.TargetQ.forward(fold_batch(states)), self.num_steps, self.num_envs)
    values = np.sum(TargetQsa * one_hot(actions, self.action_size), axis=-1)  # Q(s, a; theta-) for the taken actions
    last_actions = np.argmax(self.Q.forward(next_states), axis=1)  # argmax_a Q(s', a; theta)
    last_TargetQsa = self.TargetQ.forward(next_states)
    last_values = np.sum(last_TargetQsa * one_hot(last_actions, self.action_size), axis=-1)  # Q(s', argmax_a Q(s', a; theta); theta-)
    return states, actions, rewards, dones, infos, values, last_values
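
# one_hot is assumed to expand integer action indices into one-hot vectors so
# Q-values for the chosen actions can be selected by elementwise product and
# sum, as above. A minimal sketch:

# import numpy as np
#
# def one_hot(indices, depth):
#     # int array of any shape -> same shape with a trailing [depth] one-hot axis
#     return np.eye(depth, dtype=np.float32)[indices]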
def _train_nstep(self):
    batch_size = self.num_envs * self.nsteps
    num_updates = self.total_steps // batch_size
    alpha_step = 1 / num_updates
    s = 0
    mini_batch_size = self.nsteps // self.num_minibatches
    start = time.time()
    # main loop
    for t in range(1, num_updates + 1):
        states, actions, rewards, values, last_values, old_policies, dones, infos = self.runner.run()
        Adv = self.GAE(rewards, values, last_values, dones, gamma=0.99, lambda_=self.lambda_)
        R = Adv + values

        # perform minibatch gradient descent for K epochs over shuffled time indices
        l = 0
        idxs = np.arange(len(states))
        for epoch in range(self.num_epochs):
            np.random.shuffle(idxs)
            for batch in range(0, len(states), mini_batch_size):
                batch_idxs = idxs[batch:batch + mini_batch_size]
                # stack all states, actions and Rs across all workers into a single batch
                mb_states, mb_actions, mb_R, mb_Adv, mb_old_policies = fold_batch(states[batch_idxs]), \
                    fold_batch(actions[batch_idxs]), fold_batch(R[batch_idxs]), \
                    fold_batch(Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])
                l += self.model.backprop(mb_states, mb_R, mb_Adv, mb_actions, mb_old_policies, self.alpha)
        l /= (self.num_epochs * self.num_minibatches)
        #self.alpha -= alpha_step  # optional linear annealing of alpha

        if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
            s += 1
            self.saver.save(self.sess, self.model_dir + '/' + str(s) + '.ckpt')
            print('saved model')
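
# model.backprop above is assumed to minimise the PPO clipped surrogate using
# mb_old_policies as the behaviour policy. For reference, a minimal NumPy sketch
# of that objective (the actual loss lives inside the model graph; the epsilon
# parameter here is an assumption):

# import numpy as np
#
# def ppo_clip_objective(policy, old_policy, actions, Adv, epsilon=0.1):
#     idx = np.arange(len(actions))
#     ratio = policy[idx, actions] / (old_policy[idx, actions] + 1e-8)
#     unclipped = ratio * Adv
#     clipped = np.clip(ratio, 1 - epsilon, 1 + epsilon) * Adv
#     return np.mean(np.minimum(unclipped, clipped))  # objective to maximise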
def _train_nstep(self):
    start = time.time()
    num_updates = self.total_steps // (self.num_envs * self.nsteps)
    s = 0
    self.populate_memory()
    # main loop
    for t in range(1, num_updates + 1):
        states, actions, rewards, values, dones, last_values, prev_state, Qaux = self.runner.run()
        self.state_mean, self.state_std = self.obs_running.update(fold_batch(states).mean(axis=0)[:, :, -1:])

        # pixel-control auxiliary targets, normalised by a rolling estimate of their scale
        pixel_rewards = self.pixel_rewards(prev_state, states)
        pix_rew_mean, pix_rew_std = self.aux_reward_rolling.update(self.auxiliary_target(pixel_rewards, np.max(Qaux, axis=-1), dones).mean())
        Qaux_target = self.auxiliary_target(pixel_rewards / pix_rew_std, np.max(Qaux, axis=-1), dones)

        Adv = self.GAE(rewards, values, last_values, dones, gamma=0.99, lambda_=0.95)
        R = Adv + values

        # stack all states, actions and Rs across all workers into a single batch
        states, actions, rewards, R, Adv, Qaux_target = fold_batch(states), fold_batch(actions), fold_batch(rewards), fold_batch(R), fold_batch(Adv), fold_batch(Qaux_target)
        reward_states, sample_rewards = self.sample_reward()

        l = self.model.backprop(states, R, Adv, actions, dones, reward_states, sample_rewards, Qaux_target)

        if self.render_freq > 0 and t % (self.validate_freq * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % self.validate_freq == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % self.save_freq == 0:
            s += 1
            self.saver.save(self.sess, self.model_dir + self.current_time + '/' + str(s) + '.ckpt')
            print('saved model')
def sample_replay(self):
    sample_start = np.random.randint(1, len(self.replay) - self.nsteps - 2)
    replay_sample = []
    for i in range(sample_start, sample_start + self.nsteps):
        replay_sample.append(self.replay[i])

    replay_states = np.stack([replay_sample[i][0] for i in range(len(replay_sample))])
    replay_actions = np.stack([replay_sample[i][1] for i in range(len(replay_sample))])
    replay_rewards = np.stack([replay_sample[i][2] for i in range(len(replay_sample))])
    replay_values = np.stack([replay_sample[i][3] for i in range(len(replay_sample))])
    replay_dones = np.stack([replay_sample[i][4] for i in range(len(replay_sample))])

    next_state = self.replay[sample_start + self.nsteps][0]  # state following the sampled sequence
    _, replay_last_values = self.model.forward(next_state)
    replay_R = self.GAE(replay_rewards, replay_values, replay_last_values, replay_dones, gamma=0.99, lambda_=0.95) + replay_values

    if self.model.pixel_control:
        prev_states = self.replay[sample_start - 1][0]
        Qaux_value = self.model.get_pixel_control(next_state)
        pixel_rewards = self.pixel_rewards(prev_states, replay_states)
        Qaux_target = self.auxiliary_target(pixel_rewards, np.max(Qaux_value, axis=-1), replay_dones)
    else:
        Qaux_target = np.zeros((len(replay_states), 1, 1, 1))  # dummy Qaux target to avoid writing unnecessary code

    return fold_batch(replay_states), fold_batch(replay_actions), fold_batch(replay_R), fold_batch(Qaux_target), fold_batch(replay_dones)
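
# auxiliary_target is assumed to compute an n-step return over the UNREAL-style
# pixel-control rewards, bootstrapping from the max auxiliary Q values and
# masking at episode ends. A minimal sketch under that assumption, with
# pixel_rewards shaped [time, batch, H, W] and dones shaped [time, batch];
# the gamma default is an assumption:

# import numpy as np
#
# def auxiliary_target(pixel_rewards, last_values, dones, gamma=0.99):
#     R = np.zeros_like(pixel_rewards)
#     running = last_values  # [batch, H, W] bootstrap values
#     for t in reversed(range(len(pixel_rewards))):
#         # broadcast the [batch] done mask over the [batch, H, W] reward cells
#         mask = (1 - dones[t]).reshape(-1, 1, 1)
#         running = pixel_rewards[t] + gamma * running * mask
#         R[t] = running
#     return R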
def _train_nstep(self):
    num_updates = self.total_steps // (self.num_envs * self.nsteps)
    s = 0
    self.runner.state_mean, self.runner.state_std = self.state_obs.update(self.init_state_obs(10000 // self.num_envs))
    self.runner.states = self.env.reset()
    rolling = rolling_stats()
    start = time.time()
    # main loop
    for t in range(1, num_updates + 1):
        states, next_states, actions, rewards, dones, values = self.runner.run()
        _, last_values = self.model.forward(next_states[-1])

        # normalise rewards by a rolling estimate of the return standard deviation
        R_mean, R_std = rolling.update(self.nstep_return(rewards, last_values, dones).ravel().mean(axis=0))
        rewards /= R_std

        R = self.nstep_return(rewards, last_values, dones)
        Adv = R - values

        # stack all states, next_states, actions and Rs across all workers into a single batch
        states, next_states, actions, R, Adv = fold_batch(states), fold_batch(next_states), fold_batch(actions), fold_batch(R), fold_batch(Adv)
        mean, std = np.stack([self.runner.state_mean for i in range(4)], -1), np.stack([self.runner.state_std for i in range(4)], -1)
        l = self.model.backprop(states, next_states, R, Adv, actions, mean, std)

        if self.render_freq > 0 and t % (self.validate_freq * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % self.validate_freq == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % self.save_freq == 0:
            s += 1
            self.saver.save(self.sess, self.model_dir + self.current_time + '/' + str(s) + '.ckpt')
            print('saved model')
def init_state_obs(self, num_steps):
    ''' step the environments with random actions to initialise observation statistics and pretrain the forward model '''
    rollout = []
    states = self.env.reset()
    for i in range(1, num_steps + 1):
        rand_actions = np.random.randint(0, self.model.action_size, size=self.num_envs)
        next_states, rewards, dones, infos = self.env.step(rand_actions)
        rollout.append([states, next_states, rand_actions, rewards])
        states = next_states
        if i % self.nsteps == 0:
            mb_states, mb_next_states, mb_actions, mb_rewards = stack_many(zip(*rollout))
            self.runner.state_mean, self.runner.state_std = self.state_rolling.update(mb_states)
            self.forward_model.backprop(mb_states[0], fold_batch(mb_next_states), fold_batch(mb_actions), fold_batch(mb_rewards), len(mb_states))
            rollout = []
def sample_replay(self):
    states, actions, rewards, dones, next_states = self.replay.sample(self.num_steps)
    # Double DQN targets on the replayed transitions
    TargetQsa = unfold_batch(self.TargetQ.forward(fold_batch(states)), self.num_steps, self.num_envs)
    values = np.sum(TargetQsa * one_hot(actions, self.action_size), axis=-1)  # Q(s, a; theta-) for the replayed actions
    last_actions = np.argmax(self.Q.forward(next_states), axis=1)  # argmax_a Q(s', a; theta)
    last_TargetQsa = self.TargetQ.forward(next_states)
    last_values = np.sum(last_TargetQsa * one_hot(last_actions, self.action_size), axis=-1)  # Q(s', argmax_a Q(s', a; theta); theta-)
    return states, actions, rewards, dones, 0, values, last_values
def _train_nstep(self):
    batch_size = self.num_envs * self.nsteps
    num_updates = self.total_steps // batch_size
    s = 0
    rolling = RunningMeanStd()
    self.state_rolling = rolling_obs(shape=())
    self.init_state_obs(128 * 50)
    self.runner.states = self.env.reset()
    forward_filter = RewardForwardFilter(self.gamma)
    # main loop
    start = time.time()
    for t in range(1, num_updates + 1):
        states, next_states, actions, extr_rewards, intr_rewards, extr_values, intr_values, dones, infos = self.runner.run()
        policy, last_extr_values, last_intr_values = self.model.forward(next_states[-1])
        self.runner.state_mean, self.runner.state_std = self.state_rolling.update(next_states)  # update state normalisation statistics

        # normalise intrinsic rewards by the rolling std of the discounted intrinsic return
        int_rff = np.array([forward_filter.update(intr_rewards[i]) for i in range(len(intr_rewards))])
        R_intr_mean, R_intr_std = rolling.update(int_rff.ravel())
        intr_rewards /= R_intr_std

        if self.return_type == 'GAE':
            R_extr = self.GAE(extr_rewards, extr_values, last_extr_values, dones, gamma=0.999, lambda_=self.lambda_) + extr_values
            R_intr = self.GAE(intr_rewards, intr_values, last_intr_values, np.zeros_like(dones), gamma=0.99, lambda_=self.lambda_) + intr_values
        else:
            R_extr = self.nstep_return(extr_rewards, last_extr_values, dones, gamma=0.999, clip=False)
            R_intr = self.nstep_return(intr_rewards, last_intr_values, np.zeros_like(dones), gamma=0.99, clip=False)  # non-episodic intr reward signal

        Adv = self.model.extr_coeff * (R_extr - extr_values) + self.model.intr_coeff * (R_intr - intr_values)

        # stack all states, next_states, actions and Rs across all workers into a single batch
        states, next_states, actions, R_extr, R_intr, Adv = fold_batch(states), fold_batch(next_states), fold_batch(actions), fold_batch(R_extr), fold_batch(R_intr), fold_batch(Adv)

        l = self.model.backprop(states, next_states, R_extr, R_intr, Adv, actions, self.runner.state_mean, self.runner.state_std)

        if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
            s += 1
            self.saver.save(self.sess, self.model_dir + '/' + str(s) + '.ckpt')
            print('saved model')
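
# RewardForwardFilter is assumed to keep a per-environment discounted running sum
# of intrinsic rewards, as in the reference RND implementation, so that the
# rolling statistics above estimate the scale of the intrinsic *return* rather
# than the per-step reward. A minimal sketch:

# class RewardForwardFilter:
#     def __init__(self, gamma):
#         self.gamma = gamma
#         self.rewems = None  # running discounted sum, one entry per environment
#
#     def update(self, rews):
#         if self.rewems is None:
#             self.rewems = rews
#         else:
#             self.rewems = self.rewems * self.gamma + rews
#         return self.rewems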
def _train_nstep(self):
    batch_size = self.num_envs * self.nsteps
    num_updates = self.total_steps // batch_size
    s = 0
    rolling = RunningMeanStd(shape=())
    self.state_rolling = rolling_obs(shape=(), lastFrame=False)
    self.init_state_obs(128 * 50)
    self.runner.states = self.env.reset()
    forward_filter = RewardForwardFilter(self.gamma)
    # main loop
    start = time.time()
    for t in range(1, num_updates + 1):
        states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, old_policies, dones = self.runner.run()
        self.runner.state_mean, self.runner.state_std = self.state_rolling.update(next_states)  # update state normalisation statistics
        policy, extr_last_values, intr_last_values = self.model.forward(next_states[-1])

        # normalise intrinsic rewards by the rolling std of the discounted intrinsic return
        int_rff = np.array([forward_filter.update(intr_rewards[i]) for i in range(len(intr_rewards))])
        R_intr_mean, R_intr_std = rolling.update(int_rff.ravel())
        intr_rewards /= R_intr_std

        Adv_extr = self.GAE(extr_rewards, values_extr, extr_last_values, dones, gamma=0.999, lambda_=self.lambda_)
        Adv_intr = self.GAE(intr_rewards, values_intr, intr_last_values, np.zeros_like(dones), gamma=0.99, lambda_=self.lambda_)  # non-episodic intr reward signal
        R_extr = Adv_extr + values_extr
        R_intr = Adv_intr + values_intr
        total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

        # perform minibatch gradient descent for K epochs
        l = 0
        idxs = np.arange(len(states))
        for epoch in range(self.num_epochs):
            mini_batch_size = self.nsteps // self.num_minibatches
            np.random.shuffle(idxs)
            for batch in range(0, len(states), mini_batch_size):
                batch_idxs = idxs[batch:batch + mini_batch_size]
                # stack all states, next_states, actions and Rs across all workers into a single batch
                mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = fold_batch(states[batch_idxs]), fold_batch(next_states[batch_idxs]), \
                    fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                    fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])
                mean, std = self.runner.state_mean, self.runner.state_std
                l += self.model.backprop(mb_states, mb_nextstates, mb_Rextr, mb_Rintr, mb_Adv, mb_actions, mb_old_policies, mean, std)
        l /= (self.num_epochs * self.num_minibatches)

        if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
            s += 1
            self.saver.save(self.sess, self.model_dir + '/' + str(s) + '.ckpt')
            print('saved model')
def _train_nstep(self):
    start = time.time()
    num_updates = self.total_steps // (self.num_envs * self.nsteps)
    alpha_step = 1 / num_updates
    s = 0
    rolling = RunningMeanStd(shape=())
    self.state_rolling = rolling_obs(shape=())
    self.init_state_obs(129)
    self.runner.states = self.env.reset()
    forward_filter = RewardForwardFilter(self.gamma)
    # main loop
    for t in range(1, num_updates + 1):
        states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, old_policies, dones = self.runner.run()
        policy, extr_last_values, intr_last_values = self.model.forward(next_states[-1])

        # normalise intrinsic rewards by the rolling std of the discounted intrinsic return
        int_rff = np.array([forward_filter.update(intr_rewards[i]) for i in range(len(intr_rewards))])
        rolling.update(int_rff.ravel())  # keep the rolling variance up to date before reading it
        R_intr_std = np.sqrt(rolling.var)
        intr_rewards /= R_intr_std

        forward_loss = self.forward_model.backprop(states[0], fold_batch(next_states), fold_batch(actions), fold_batch(extr_rewards), self.nsteps)

        Adv_extr = self.GAE(extr_rewards, values_extr, extr_last_values, dones, gamma=0.999, lambda_=self.lambda_)
        Adv_intr = self.GAE(intr_rewards, values_intr, intr_last_values, np.zeros_like(dones), gamma=0.99, lambda_=self.lambda_)  # non-episodic intr reward signal
        R_extr = Adv_extr + values_extr
        R_intr = Adv_intr + values_intr
        total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

        self.runner.state_mean, self.runner.state_std = self.state_rolling.update(next_states)  # update state normalisation statistics

        # perform minibatch gradient descent for K epochs
        l = 0
        idxs = np.arange(len(states))
        for epoch in range(self.num_epochs):
            batch_size = self.nsteps // self.num_minibatches
            np.random.shuffle(idxs)
            for batch in range(0, len(states), batch_size):
                batch_idxs = idxs[batch:batch + batch_size]
                # stack all states, next_states, actions and Rs across all workers into a single batch
                mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = fold_batch(states[batch_idxs]), fold_batch(next_states[batch_idxs]), \
                    fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                    fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])
                # predictor dropout: only a random subset of next states is used for the intrinsic model
                mb_nextstates = mb_nextstates[np.where(np.random.uniform(size=(batch_size)) < self.pred_prob)][:, :, :, -1:]
                mean, std = self.runner.state_mean, self.runner.state_std
                l += self.model.backprop(mb_states, mb_nextstates, mb_Rextr, mb_Rintr, mb_Adv, mb_actions, mb_old_policies, self.alpha, mean, std)
        l /= (self.num_epochs * self.num_minibatches)

        # Imagined future rollout
        hidden = self.forward_model.get_initial_hidden(self.num_envs)
        obs = next_states[-1]
        encoded_last_state = self.forward_model.encode_state(next_states[-1])  # o_t -> s_t
        actions = [np.random.choice(policy.shape[1], p=policy[i]) for i in range(policy.shape[0])]
        imagined_rollout = []
        with tf.variable_scope('forward_model/latent-space-rnn', reuse=tf.AUTO_REUSE):
            for i in range(self.nsteps):
                next_obs, extr_rewards, encoded_last_state, hidden = self.forward_model.predict_next(encoded_last_state, hidden, actions)
                intr_rewards = self.model.intrinsic_reward(next_obs[..., -1:], self.runner.state_mean, self.runner.state_std)
                policies, extr_values, intr_values = self.model.forward(obs)
                actions = [np.random.choice(policies.shape[1], p=policies[i]) for i in range(policies.shape[0])]  # sample from the current policies, not the stale pre-rollout policy
                imagined_rollout.append([obs, next_obs, actions, extr_rewards[:, 0], intr_rewards, extr_values, intr_values, policies])
                obs = next_obs

        obs, next_obs, actions, extr_rewards, intr_rewards, extr_values, intr_values, old_policies = stack_many(zip(*imagined_rollout))
        intr_rewards /= R_intr_std

        policies, extr_last_values, intr_last_values = self.model.forward(next_obs[-1])
        Adv_extr = self.GAE(extr_rewards, extr_values, extr_last_values, np.zeros_like(dones), gamma=0.999, lambda_=self.lambda_)
        Adv_intr = self.GAE(intr_rewards, intr_values, intr_last_values, np.zeros_like(dones), gamma=0.99, lambda_=self.lambda_)  # non-episodic intr reward signal
        R_extr = Adv_extr + extr_values  # bootstrap from the imagined values, not the real-rollout values
        R_intr = Adv_intr + intr_values
        total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

        for batch in range(0, len(obs), batch_size):
            batch_idxs = idxs[batch:batch + batch_size]
            # stack all states, next_states, actions and Rs across all workers into a single batch
            mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = fold_batch(obs[batch_idxs]), fold_batch(next_obs[batch_idxs]), \
                fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])
            mb_nextstates = mb_nextstates[np.where(np.random.uniform(size=(batch_size)) < self.pred_prob)][..., -1:]
            mean, std = self.runner.state_mean, self.runner.state_std
            l += self.model.backprop(mb_states, mb_nextstates, mb_Rextr, mb_Rintr, mb_Adv, mb_actions, mb_old_policies, self.alpha, mean, std)

        if self.render_freq > 0 and t % (self.validate_freq * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % self.validate_freq == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % self.save_freq == 0:
            s += 1
            self.saver.save(self.sess, self.model_dir + self.current_time + '/' + str(s) + '.ckpt')
            print('saved model')
def forward(self, state, hidden, validate=False):
    if validate:
        return self.validate_policy.forward(state, hidden)
    else:
        return self.train_policy.forward(fold_batch(state), hidden)
def _train_nstep(self):
    batch_size = self.num_envs * self.nsteps
    num_updates = self.total_steps // batch_size
    s = 0
    rolling = RunningMeanStd()
    self.init_state_obs(20 * 50 * 25)
    forward_filter = RewardForwardFilter(0.99)
    self.runner.states = self.env.reset()
    # main loop
    start = time.time()
    for t in range(1, num_updates + 1):
        states, next_states, actions, extr_rewards, intr_rewards, extr_values, intr_values, dones, infos = self.runner.run()
        policy, last_extr_values, last_intr_values = self.model.forward(next_states[-1])
        self.update_minmax(states)

        # pixel-control auxiliary targets
        Qaux_value = self.model.get_pixel_control(next_states[-1])
        pixel_rewards = self.pixel_rewards(states)
        Qaux_target = fold_batch(self.auxiliary_target(pixel_rewards, np.max(Qaux_value, axis=-1), dones))

        reward_states, sample_rewards = self.sample_reward(states, extr_rewards)
        self.runner.state_mean, self.runner.state_std = self.state_rolling.update(next_states)  # update state normalisation statistics

        # normalise intrinsic rewards by the rolling std of the discounted intrinsic return
        r_intr = np.array([forward_filter.update(intr_rewards[i]) for i in range(len(intr_rewards))])
        R_intr_mean, R_intr_std = rolling.update(r_intr.ravel())
        intr_rewards /= R_intr_std

        R_extr = self.GAE(extr_rewards, extr_values, last_extr_values, dones, gamma=0.999, lambda_=self.lambda_, clip=False) + extr_values
        R_intr = self.GAE(intr_rewards, intr_values, last_intr_values, np.zeros_like(dones), gamma=0.99, lambda_=self.lambda_, clip=False) + intr_values
        Adv = self.model.extr_coeff * (R_extr - extr_values) + self.model.intr_coeff * (R_intr - intr_values)

        # stack all states, next_states, actions and Rs across all workers into a single batch
        next_states = next_states[..., -1:] if len(next_states.shape) == 5 else next_states  # keep only the last frame of a stacked observation
        states, next_states, actions, R_extr, R_intr, Adv = fold_batch(states), fold_batch(next_states), fold_batch(actions), fold_batch(R_extr), fold_batch(R_intr), fold_batch(Adv)

        l = self.model.backprop(states, next_states, R_extr, R_intr, Adv, actions, Qaux_target,
                                reward_states, sample_rewards, self.runner.state_mean, self.runner.state_std)

        if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
            s += 1
            self.saver.save(self.sess, self.model_dir + '/' + str(s) + '.ckpt')
            print('saved model')