def rollout(self):
    rollout = []
    for t in range(self.nsteps):
        Qsa = self.eval_state(self.states, self.loc)
        actions = np.argmax(Qsa, axis=1)
        # epsilon-greedy exploration: replace greedy actions with random ones
        random = np.random.uniform(size=(self.num_envs))
        random_actions = np.random.randint(self.action_size, size=(self.num_envs))
        actions = np.where(random < self.epsilon, random_actions, actions)
        next_states, rewards, dones, infos = self.env.step(actions)
        values = np.sum(Qsa * one_hot(actions, self.action_size), axis=-1)
        rollout.append((self.states, self.loc, actions, rewards, dones, infos, values))
        self.states = next_states
        self.epsilon = self.scheduler.step()
        self.loc = self.get_locs()

    states, locs, actions, rewards, dones, infos, values = stack_many(*zip(*rollout))
    # bootstrap value of the final state, Q(s,a|theta)
    last_Qsa = self.eval_state(next_states, self.loc)
    last_actions = np.argmax(last_Qsa, axis=1)
    last_values = np.sum(last_Qsa * one_hot(last_actions, self.action_size), axis=-1)
    return states, locs, actions, rewards, dones, infos, values, last_values

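# `one_hot` is used above to select Q(s, a) for the chosen actions but is not
# defined in this snippet; a minimal sketch of the assumed helper (and the
# `import numpy as np` these functions rely on), returning a
# [batch, num_classes] float mask with a 1 at each action index:
import numpy as np

def one_hot(indices, num_classes):
    out = np.zeros((len(indices), num_classes), dtype=np.float32)
    out[np.arange(len(indices)), indices] = 1.0
    return out
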
def rollout(self):
    rollout = []
    for t in range(self.nsteps):
        policies, values = self.model.evaluate(self.states)
        actions = fastsample(policies)
        next_states, rewards, dones, infos = self.env.step(actions)
        rollout.append((self.states, actions, rewards, values, dones))
        self.states = next_states

    states, actions, rewards, values, dones = stack_many(*zip(*rollout))
    _, last_values = self.model.evaluate(next_states)
    return states, actions, rewards, dones, values, last_values

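# `fastsample` is assumed to draw one action per environment from a
# [num_envs, action_size] array of policy probabilities; a vectorised sketch
# using the inverse-CDF trick, equivalent to per-row np.random.choice:
def fastsample(policies):
    cdf = np.cumsum(policies, axis=1)
    uniform = np.random.uniform(size=(policies.shape[0], 1))
    return np.argmax(cdf > uniform, axis=1)  # first index where the cdf exceeds u
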
def rollout(self):
    rollout = []
    for t in range(self.nsteps):
        policies, values = self.model.evaluate(self.states)
        # Qaux = self.model.get_pixel_control(self.states, self.prev_hidden, self.prev_actions_rewards[np.newaxis])
        actions = fastsample(policies)
        next_states, rewards, dones, infos = self.env.step(actions)
        rollout.append((self.states, actions, rewards, values, dones))
        self.replay.append((self.states, actions, rewards, values, dones))  # add to replay memory
        self.states = next_states

    states, actions, rewards, values, dones = stack_many(*zip(*rollout))
    _, last_values = self.model.evaluate(next_states)
    return states, actions, rewards, values, dones, last_values

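# `stack_many(*zip(*rollout))` converts a list of per-step tuples into one
# numpy array per field, each with shape [nsteps, num_envs, ...]; a plausible
# sketch of the assumed helper. The variadic form matches the `*zip(*rollout)`
# call sites; some snippets below pass the zip object directly, suggesting a
# second variant that unpacks internally.
def stack_many(*args):
    return tuple(np.stack(arg) for arg in args)
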
def train(global_model, model, env, nsteps, num_episodes, ID):
    opt = torch.optim.RMSprop(global_model.parameters(), lr=1e-3)
    episode = 0
    episode_steps = 0
    episode_score = 0
    T = 0
    state = env.reset()
    start = time.time()
    while episode < num_episodes:
        rollout = []
        for t in range(nsteps):
            with torch.no_grad():
                policy, value = model(totorch(state[None], device='cpu'))
            policy, value = tonumpy(policy), tonumpy(value)
            action = np.random.choice(policy.shape[1], p=policy[0])
            next_state, reward, done, info = env.step(action)
            episode_score += reward
            rollout.append((state, action, reward, value, done))
            state = next_state
            T += 1
            episode_steps += 1

            # update at episode end or when the n-step rollout is full
            if done or t == nsteps - 1:
                states, actions, rewards, values, dones = stack_many(*zip(*rollout))
                with torch.no_grad():
                    _, last_values = model.forward(totorch(next_state[None], device='cpu'))
                    last_values = last_values.cpu().numpy()
                R = lambda_return(rewards, values, last_values, dones, gamma=0.9, lambda_=0.95, clip=False)
                loss = update_params(model, global_model, opt, states, actions, R)
                # self.T += t

                if done:
                    episode += 1
                    state = env.reset()
                    if episode % 1 == 0:
                        time_taken = time.time() - start
                        print(f'worker {ID}, total worker steps {T:,} local episode {episode}, episode score {episode_score} episode steps {episode_steps}, time taken {time_taken:,.1f}s, fps {episode_steps/time_taken:.2f}')
                        episode_steps = 0
                        episode_score = 0
                        start = time.time()
                    break

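# `lambda_return` above is not defined in this snippet; a sketch consistent
# with its call signature, computing a TD(lambda)-style return backwards in
# time and cutting the bootstrap at episode boundaries (assumed semantics):
def lambda_return(rewards, values, last_values, dones, gamma=0.99, lambda_=0.95, clip=False):
    T = len(rewards)
    R = np.zeros_like(rewards)
    next_return = last_values
    for t in reversed(range(T)):
        # blend the one-step bootstrap with the lambda-weighted return
        next_value = values[t + 1] if t + 1 < T else last_values
        R[t] = rewards[t] + gamma * (1 - dones[t]) * ((1 - lambda_) * next_value + lambda_ * next_return)
        next_return = R[t]
    return R
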
def run(self):
    rollout = []
    for t in range(self.num_steps):
        policies, values = self.model.forward(self.states)
        actions = [np.random.choice(policies.shape[1], p=policies[i]) for i in range(policies.shape[0])]
        next_states, rewards, dones, infos = self.env.step(actions)
        rollout.append((self.states, actions, rewards, values, dones, np.array(infos)))
        self.states = next_states

    states, actions, rewards, values, dones, infos = stack_many(zip(*rollout))
    _, last_values = self.model.forward(next_states)
    return states, actions, rewards, dones, infos, values, last_values

def rollout(self):
    rollout = []
    first_hidden = self.prev_hidden
    for t in range(self.nsteps):
        policies, values, hidden = self.model.evaluate(self.states[None], self.prev_hidden)
        actions = fastsample(policies)
        next_states, rewards, dones, infos = self.env.step(actions)
        rollout.append((self.states, actions, rewards, values, dones))
        self.states = next_states
        self.prev_hidden = self.model.mask_hidden(hidden, dones)  # reset hidden state at end of episode

    states, actions, rewards, values, dones = stack_many(*zip(*rollout))
    _, last_values, _ = self.model.evaluate(self.states[None], self.prev_hidden)
    return states, actions, rewards, first_hidden, dones, values, last_values

def rollout(self):
    rollout = []
    first_hidden = self.prev_hidden
    for t in range(self.nsteps):
        policies, values, hidden = self.model.evaluate(self.states[None], self.prev_actions_rewards, self.prev_hidden)
        # Qaux = self.model.get_pixel_control(self.states, self.prev_hidden, self.prev_actions_rewards[None])
        actions = fastsample(policies)
        next_states, rewards, dones, infos = self.env.step(actions)
        rollout.append((self.states, actions, rewards, self.prev_actions_rewards, dones, infos))
        self.replay.append((self.states, actions, rewards, self.prev_hidden, self.prev_actions_rewards, dones))  # add to replay memory
        self.states = next_states
        self.prev_hidden = self.model.mask_hidden(hidden, dones)  # reset hidden state at end of episode
        self.prev_actions_rewards = concat_action_reward(actions, rewards, self.action_size + 1)

    states, actions, rewards, prev_actions_rewards, dones, infos = stack_many(*zip(*rollout))
    _, last_values, _ = self.model.evaluate(self.states[None], self.prev_actions_rewards, self.prev_hidden)
    return states, actions, rewards, first_hidden, prev_actions_rewards, dones, last_values

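# `concat_action_reward` is assumed to build the UNREAL-style auxiliary input:
# the one-hot previous action concatenated with the previous scalar reward,
# giving [num_envs, action_size + 1] (hence the `self.action_size + 1` above):
def concat_action_reward(actions, rewards, size):
    out = np.zeros((len(actions), size), dtype=np.float32)
    out[np.arange(len(actions)), actions] = 1.0  # one-hot action
    out[:, -1] = rewards                         # reward in the final slot
    return out
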
def run(self):
    rollout = []
    for t in range(self.num_steps):
        policies, values = self.model.forward(self.states)
        actions = [np.random.choice(policies.shape[1], p=policies[i]) for i in range(policies.shape[0])]
        next_states, extr_rewards, dones, infos = self.env.step(actions)
        # tile the per-frame statistics across the 4-frame stack
        mean = np.stack([self.state_mean for i in range(4)], -1)
        std = np.stack([self.state_std for i in range(4)], -1)
        intr_rewards = self.model.intrinsic_reward(self.states, actions, next_states, mean, std)
        rewards = extr_rewards + intr_rewards
        rollout.append((self.states, next_states, actions, rewards, values, dones))
        self.states = next_states

    states, next_states, actions, rewards, values, dones = stack_many(zip(*rollout))
    return states, next_states, actions, rewards, dones, values

def rollout(self):
    rollout = []
    for t in range(self.nsteps):
        policies, values = self.model.evaluate(self.states)
        actions = fastsample(policies)
        next_states, extr_rewards, dones, infos = self.env.step(actions)
        mean, std = self.state_mean[None], self.state_std[None]
        intr_rewards = self.model.intrinsic_reward((self.states - mean) / std, actions, (next_states - mean) / std)
        rewards = extr_rewards + intr_rewards
        rollout.append((self.states, next_states, actions, rewards, values, dones))
        self.states = next_states

    states, next_states, actions, rewards, values, dones = stack_many(*zip(*rollout))
    return states, next_states, actions, rewards, dones, values

def run(self):
    rollout = []
    for t in range(self.num_steps):
        policies, values, hidden = self.model.forward(self.states, self.prev_hidden)
        actions = [np.random.choice(policies.shape[1], p=policies[i]) for i in range(policies.shape[0])]
        next_states, rewards, dones, infos = self.env.step(actions)
        # collect values per step; the original stored only the final step's values
        rollout.append((self.states, actions, rewards, values, self.prev_hidden, dones, infos))
        self.states = next_states
        self.prev_hidden = self.model.reset_batch_hidden(hidden, 1 - dones)  # reset hidden state at end of episode

    states, actions, rewards, values, hidden_batch, dones, infos = stack_many(zip(*rollout))
    _, last_values, _ = self.model.forward(next_states, self.prev_hidden)
    return states, actions, rewards, hidden_batch, dones, infos, values, last_values

def run(self):
    rollout = []
    first_state = self.first_state
    for t in range(self.num_steps):
        policies, values = self.model.forward(self.states)
        # Qaux = self.model.get_pixel_control(self.states, self.prev_hidden, self.prev_actions_rewards[np.newaxis])
        actions = [np.random.choice(policies.shape[1], p=policies[i]) for i in range(policies.shape[0])]
        next_states, rewards, dones, infos = self.env.step(actions)
        rollout.append((self.states, actions, rewards, values, dones))
        self.replay.append((self.states, actions, rewards, dones))  # add to replay memory
        self.first_state = self.states.copy()
        self.states = next_states

    states, actions, rewards, values, dones = stack_many(zip(*rollout))
    _, last_values = self.model.forward(next_states)
    Qaux = self.model.get_pixel_control(next_states)
    return states, actions, rewards, values, dones, last_values, first_state, Qaux

def init_state_obs(self, num_steps):
    rollout = []
    states = self.env.reset()
    for i in range(1, num_steps + 1):
        rand_actions = np.random.randint(0, self.model.action_size, size=self.num_envs)
        next_states, rewards, dones, infos = self.env.step(rand_actions)
        rollout.append([states, next_states, rand_actions, rewards])
        states = next_states
        if i % self.nsteps == 0:
            mb_states, mb_next_states, mb_actions, mb_rewards = stack_many(zip(*rollout))
            self.runner.state_mean, self.runner.state_std = self.state_rolling.update(mb_states)
            self.forward_model.backprop(mb_states[0], fold_batch(mb_next_states), fold_batch(mb_actions),
                                        fold_batch(mb_rewards), len(mb_states))
            rollout = []

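# `self.state_rolling.update` is assumed to maintain running observation
# statistics and return the current mean and std used to normalise states for
# the intrinsic reward; a minimal sketch of such a `rolling_obs` class using
# the parallel (chunked Welford) variance-combination formula:
class rolling_obs:
    def __init__(self, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = 1e-4

    def update(self, batch):
        batch = batch.reshape(-1, *batch.shape[2:])  # fold [nsteps, num_envs, ...]
        batch_mean, batch_var, n = batch.mean(axis=0), batch.var(axis=0), len(batch)
        delta = batch_mean - self.mean
        tot = self.count + n
        self.mean = self.mean + delta * n / tot
        # combine the variances of the two groups
        self.var = (self.var * self.count + batch_var * n + delta**2 * self.count * n / tot) / tot
        self.count = tot
        return self.mean, np.sqrt(self.var)
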
def run(self):
    rollout = []
    for t in range(self.num_steps):
        policies, values_extr, values_intr = self.model.forward(self.states)
        actions = [np.random.choice(policies.shape[1], p=policies[i]) for i in range(policies.shape[0])]
        next_states, extr_rewards, dones, infos = self.env.step(actions)
        rollout.append((self.states, next_states, actions, extr_rewards, values_extr, values_intr, policies, dones))
        self.states = next_states

    states, next_states, actions, extr_rewards, values_extr, values_intr, policies, dones = stack_many(zip(*rollout))
    intr_rewards = self.model.intrinsic_reward(fold_batch(states), fold_batch(actions), fold_batch(next_states),
                                               self.state_mean, self.state_std)
    intr_rewards = unfold_batch(intr_rewards, self.num_steps, len(self.env))
    return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones

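# `fold_batch` / `unfold_batch` are assumed reshape helpers that merge and
# split the time and environment axes, so per-step data can be fed to the
# model as one flat batch; minimal sketches:
def fold_batch(x):
    return x.reshape(-1, *x.shape[2:])  # [nsteps, num_envs, ...] -> [nsteps * num_envs, ...]

def unfold_batch(x, nsteps, num_envs):
    return x.reshape(nsteps, num_envs, *x.shape[1:])  # inverse of fold_batch
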
def run(self):
    rollout = []
    for t in range(self.num_steps):
        policies, values_extr, values_intr = self.model.forward(self.states)
        actions = [np.random.choice(policies.shape[1], p=policies[i]) for i in range(policies.shape[0])]
        next_states, extr_rewards, dones, infos = self.env.step(actions)
        # use only the newest frame of the stack for the intrinsic reward
        next_states__ = next_states[..., -1:] if len(next_states.shape) == 4 else next_states
        intr_rewards = self.model.intrinsic_reward(next_states__, self.state_mean, self.state_std)
        rollout.append((self.states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones))
        self.replay.append((self.states, actions, extr_rewards, values_extr, dones))  # add to replay memory
        self.states = next_states

    states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones = stack_many(zip(*rollout))
    return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones

def run(self):
    rollout = []
    for t in range(self.num_steps):
        policies, extr_values, intr_values = self.model.forward(self.states)
        actions = [np.random.choice(policies.shape[1], p=policies[i]) for i in range(policies.shape[0])]
        next_states, extr_rewards, dones, infos = self.env.step(actions)
        next_states__ = next_states[..., -1:] if len(next_states.shape) == 4 else next_states
        intr_rewards = self.model.intrinsic_reward(next_states__, self.state_mean, self.state_std)
        rollout.append((self.states, next_states, actions, extr_rewards, intr_rewards, extr_values, intr_values, dones, np.array(infos)))
        self.states = next_states

    states, next_states, actions, extr_rewards, intr_rewards, extr_values, intr_values, dones, infos = stack_many(zip(*rollout))
    return states, next_states, actions, extr_rewards, intr_rewards, extr_values, intr_values, dones, infos

def rollout(self):
    rollout = []
    for t in range(self.nsteps):
        policies, values_extr, values_intr = self.model.evaluate(self.states)
        actions = fastsample(policies)
        next_states, extr_rewards, dones, infos = self.env.step(actions)
        # [num_envs, channels, height, width] for convolutions
        next_states__ = next_states[:, -1:] if len(next_states.shape) == 4 else next_states
        intr_rewards = self.model.intrinsic_reward(next_states__, self.state_mean, self.state_std)
        rollout.append((self.states, next_states__, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones))
        self.states = next_states

    states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones = stack_many(*zip(*rollout))
    last_policy, last_values_extr, last_values_intr = self.model.evaluate(self.states)
    return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, last_values_extr, last_values_intr, policies, dones

def run(self):
    rollout = []
    for t in range(self.num_steps):
        policies, values_extr, values_intr = self.model.forward(self.states)
        # actions = np.argmax(policies, axis=1)
        actions = [np.random.choice(policies.shape[1], p=policies[i]) for i in range(policies.shape[0])]
        next_states, extr_rewards, dones, infos = self.env.step(actions)
        intr_rewards = self.model.intrinsic_reward(next_states[..., -1:], self.state_mean, self.state_std)
        rollout.append((self.states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones))
        self.states = next_states

    states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones = stack_many(zip(*rollout))
    return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones

def _train_nstep(self):
    start = time.time()
    num_updates = self.total_steps // (self.num_envs * self.nsteps)
    alpha_step = 1 / num_updates
    s = 0
    rolling = RunningMeanStd(shape=())
    self.state_rolling = rolling_obs(shape=())
    self.init_state_obs(129)
    # self.runner.state_mean, self.runner.state_std = self.state_rolling.mean, np.sqrt(self.state_rolling.var)
    self.runner.states = self.env.reset()
    forward_filter = RewardForwardFilter(self.gamma)

    # main loop
    for t in range(1, num_updates + 1):
        states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, old_policies, dones = self.runner.run()
        policy, extr_last_values, intr_last_values = self.model.forward(next_states[-1])

        # normalise intrinsic rewards by the running std of their discounted sum
        int_rff = np.array([forward_filter.update(intr_rewards[i]) for i in range(len(intr_rewards))])
        # R_intr_mean, R_intr_std = rolling.update(self.discount(intr_rewards, self.gamma).ravel().mean())
        rolling.update(int_rff.ravel())
        R_intr_std = np.sqrt(rolling.var)
        intr_rewards /= R_intr_std

        forward_loss = self.forward_model.backprop(states[0], fold_batch(next_states), fold_batch(actions),
                                                   fold_batch(extr_rewards), self.nsteps)

        Adv_extr = self.GAE(extr_rewards, values_extr, extr_last_values, dones, gamma=0.999, lambda_=self.lambda_)
        Adv_intr = self.GAE(intr_rewards, values_intr, intr_last_values, np.zeros_like(dones), gamma=0.99,
                            lambda_=self.lambda_)  # non-episodic intrinsic reward signal
        R_extr = Adv_extr + values_extr
        R_intr = Adv_intr + values_intr
        total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

        self.runner.state_mean, self.runner.state_std = self.state_rolling.update(next_states)  # update state normalisation statistics

        # perform minibatch gradient descent for K epochs
        l = 0
        idxs = np.arange(len(states))
        for epoch in range(self.num_epochs):
            batch_size = self.nsteps // self.num_minibatches
            np.random.shuffle(idxs)
            for batch in range(0, len(states), batch_size):
                batch_idxs = idxs[batch:batch + batch_size]
                # stack all states, next_states, actions and Rs across all workers into a single batch
                mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = \
                    fold_batch(states[batch_idxs]), fold_batch(next_states[batch_idxs]), \
                    fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                    fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])
                # predictor dropout: keep only a random fraction of next states for the predictor loss
                mb_nextstates = mb_nextstates[np.where(np.random.uniform(size=(batch_size)) < self.pred_prob)][:, :, :, -1:]
                mean, std = self.runner.state_mean, self.runner.state_std
                l += self.model.backprop(mb_states, mb_nextstates, mb_Rextr, mb_Rintr, mb_Adv, mb_actions,
                                         mb_old_policies, self.alpha, mean, std)
        l /= (self.num_epochs * self.num_minibatches)

        # imagined future rollout in the forward model's latent space
        hidden = self.forward_model.get_initial_hidden(self.num_envs)
        obs = next_states[-1]
        encoded_last_state = self.forward_model.encode_state(next_states[-1])  # o_t -> s_t
        actions = [np.random.choice(policy.shape[1], p=policy[i]) for i in range(policy.shape[0])]
        imagined_rollout = []
        with tf.variable_scope('forward_model/latent-space-rnn', reuse=tf.AUTO_REUSE):
            for i in range(self.nsteps):
                next_obs, extr_rewards, encoded_last_state, hidden = self.forward_model.predict_next(encoded_last_state, hidden, actions)
                intr_rewards = self.model.intrinsic_reward(next_obs[..., -1:], self.runner.state_mean, self.runner.state_std)
                policies, extr_values, intr_values = self.model.forward(obs)
                actions = [np.random.choice(policies.shape[1], p=policies[i]) for i in range(policies.shape[0])]
                imagined_rollout.append([obs, next_obs, actions, extr_rewards[:, 0], intr_rewards, extr_values, intr_values, policies])
                obs = next_obs

        obs, next_obs, actions, extr_rewards, intr_rewards, extr_values, intr_values, old_policies = stack_many(zip(*imagined_rollout))
        intr_rewards /= R_intr_std
        policies, extr_last_values, intr_last_values = self.model.forward(next_obs[-1])
        Adv_extr = self.GAE(extr_rewards, extr_values, extr_last_values, np.zeros_like(dones), gamma=0.999, lambda_=self.lambda_)
        Adv_intr = self.GAE(intr_rewards, intr_values, intr_last_values, np.zeros_like(dones), gamma=0.99,
                            lambda_=self.lambda_)  # non-episodic intrinsic reward signal
        R_extr = Adv_extr + extr_values
        R_intr = Adv_intr + intr_values
        total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

        for batch in range(0, len(obs), batch_size):
            batch_idxs = idxs[batch:batch + batch_size]
            # stack all states, next_states, actions and Rs across all workers into a single batch
            mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = \
                fold_batch(obs[batch_idxs]), fold_batch(next_obs[batch_idxs]), \
                fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])
            mb_nextstates = mb_nextstates[np.where(np.random.uniform(size=(batch_size)) < self.pred_prob)][..., -1:]
            mean, std = self.runner.state_mean, self.runner.state_std
            l += self.model.backprop(mb_states, mb_nextstates, mb_Rextr, mb_Rintr, mb_Adv, mb_actions,
                                     mb_old_policies, self.alpha, mean, std)

        if self.render_freq > 0 and t % (self.validate_freq * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % self.validate_freq == 0:
            self.validation_summary(t, l, start, render)
            start = time.time()

        if self.save_freq > 0 and t % self.save_freq == 0:
            s += 1
            self.saver.save(self.sess, str(self.model_dir + self.current_time + '/' + str(s) + ".ckpt"))
            print('saved model')

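# `RewardForwardFilter` and `self.GAE` are used above but not defined in this
# snippet. Sketches under assumed RND-style semantics: the filter keeps a
# running discounted sum of intrinsic rewards (whose running std normalises
# them), and GAE is the standard generalised advantage estimator computed
# backwards in time with the bootstrap cut at episode boundaries:
class RewardForwardFilter:
    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None

    def update(self, rewards):
        # running discounted accumulation across rollout steps
        self.rewems = rewards if self.rewems is None else self.rewems * self.gamma + rewards
        return self.rewems

def GAE(rewards, values, last_values, dones, gamma=0.99, lambda_=0.95):
    T = len(rewards)
    adv = np.zeros_like(rewards)
    lastgaelam = 0
    for t in reversed(range(T)):
        next_values = values[t + 1] if t + 1 < T else last_values
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_values * nonterminal - values[t]
        adv[t] = lastgaelam = delta + gamma * lambda_ * nonterminal * lastgaelam
    return adv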