def run(self):
    rollout = []
    for t in range(self.num_steps):
        Qsa = self.Q.forward(self.states)
        actions = np.argmax(Qsa, axis=1)
        random = np.random.uniform(size=(self.num_envs))
        random_actions = np.random.randint(self.action_size, size=(self.num_envs))
        actions = np.where(random < self.epsilon, random_actions, actions)  # epsilon-greedy exploration
        next_states, rewards, dones, infos = self.env.step(actions)
        rollout.append((self.states, actions, rewards, dones, infos))
        self.states = next_states
        self.schedule.step()

    states, actions, rewards, dones, infos = zip(*rollout)
    states, actions, rewards, dones = np.stack(states), np.stack(actions), np.stack(rewards), np.stack(dones)
    TargetQsa = unfold_batch(self.TargetQ.forward(fold_batch(states)), self.num_steps, self.num_envs)  # Q(s,a; theta-) from the target network
    values = np.sum(TargetQsa * one_hot(actions, self.action_size), axis=-1)  # Q(s_t, a_t; theta-) for the actions actually taken
    last_actions = np.argmax(self.Q.forward(next_states), axis=1)  # online network selects: argmax_a Q(s',a; theta)
    last_TargetQsa = self.TargetQ.forward(next_states)  # target network evaluates: Q(s',a; theta-)
    last_values = np.sum(last_TargetQsa * one_hot(last_actions, self.action_size), axis=-1)  # double-DQN bootstrap: Q(s', argmax_a Q(s',a; theta); theta-)
    return states, actions, rewards, dones, infos, values, last_values
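# A minimal sketch (not part of the class above) of how run()'s rewards, dones and
# last_values are typically combined into n-step bootstrapped Q targets; the name
# 'nstep_targets' and the discount 'gamma' are illustrative, not from this codebase.
import numpy as np

def nstep_targets(rewards, dones, last_values, gamma=0.99):
    # rewards, dones: (num_steps, num_envs); last_values: (num_envs,)
    R = last_values
    targets = np.zeros_like(rewards, dtype=np.float64)
    for t in reversed(range(rewards.shape[0])):
        # mask the bootstrap wherever the episode terminated at step t
        R = rewards[t] + gamma * R * (1.0 - dones[t])
        targets[t] = R
    return targets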
def rollout(self):
    rollout = []
    for t in range(self.nsteps):
        Qsa = self.eval_state(self.states, self.loc)
        actions = np.argmax(Qsa, axis=1)
        random = np.random.uniform(size=(self.num_envs))
        random_actions = np.random.randint(self.action_size, size=(self.num_envs))
        actions = np.where(random < self.epsilon, random_actions, actions)  # epsilon-greedy exploration
        next_states, rewards, dones, infos = self.env.step(actions)
        values = np.sum(Qsa * one_hot(actions, self.action_size), axis=-1)
        rollout.append((self.states, self.loc, actions, rewards, dones, infos, values))
        self.states = next_states
        self.epsilon = self.scheduler.step()
        self.loc = self.get_locs()

    states, locs, actions, rewards, dones, infos, values = stack_many(*zip(*rollout))
    last_Qsa = self.eval_state(next_states, self.loc)  # Q(s',a; theta)
    last_actions = np.argmax(last_Qsa, axis=1)
    last_values = np.sum(last_Qsa * one_hot(last_actions, self.action_size), axis=-1)  # max_a Q(s',a; theta)
    return states, locs, actions, rewards, dones, infos, values, last_values
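# stack_many is assumed to be a small helper that stacks each unzipped rollout field
# along a leading time axis; a minimal sketch consistent with its use above (the
# real implementation may differ):
import numpy as np

def stack_many(*args):
    return tuple(np.stack(arg) for arg in args)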
def sample_replay(self):
    states, actions, rewards, dones, next_states = self.replay.sample(self.num_steps)
    TargetQsa = unfold_batch(self.TargetQ.forward(fold_batch(states)), self.num_steps, self.num_envs)  # Q(s,a; theta-) from the target network
    values = np.sum(TargetQsa * one_hot(actions, self.action_size), axis=-1)  # Q(s_t, a_t; theta-) for the actions actually taken
    last_actions = np.argmax(self.Q.forward(next_states), axis=1)  # online network selects: argmax_a Q(s',a; theta)
    last_TargetQsa = self.TargetQ.forward(next_states)  # target network evaluates: Q(s',a; theta-)
    last_values = np.sum(last_TargetQsa * one_hot(last_actions, self.action_size), axis=-1)  # double-DQN bootstrap: Q(s', argmax_a Q(s',a; theta); theta-)
    return states, actions, rewards, dones, 0, values, last_values  # 0 stands in for infos, keeping the signature aligned with run()
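# The replay interface assumed by sample_replay(): sample(seq_len) returns a contiguous
# slice of (seq_len, num_envs, ...) states/actions/rewards/dones plus the next_states
# needed for bootstrapping. A minimal uniform-sampling sketch; the class name and
# internals here are illustrative, not this codebase's buffer.
import numpy as np

class SequenceReplay:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []  # one (state, action, reward, done, next_state) tuple per env step

    def add(self, state, action, reward, done, next_state):
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append((state, action, reward, done, next_state))

    def sample(self, seq_len):
        start = np.random.randint(0, len(self.buffer) - seq_len + 1)
        states, actions, rewards, dones, next_states = map(np.stack, zip(*self.buffer[start:start + seq_len]))
        # bootstrap only from the final next_state of the sampled sequence
        return states, actions, rewards, dones, next_states[-1]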
def backprop(self, state, next_state, R_extr, R_intr, Adv, actions, Qaux_target, reward_states, target_rewards, state_mean, state_std):
    actions_onehot = one_hot(actions, self.action_size)  # not fed below; presumably the graph encodes actions itself
    feed_dict = {self.policy.state: state,
                 self.policy.actions: actions,
                 self.policy.R_extr: R_extr,
                 self.policy.R_intr: R_intr,
                 self.policy.Advantage: Adv,
                 self.next_state: next_state,
                 self.state_mean: state_mean,
                 self.state_std: state_std,
                 self.Qaux_target: Qaux_target,
                 self.reward_target: target_rewards,
                 self.reward_state: reward_states}
    _, l = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)
    return l
def backprop(self, state, next_state, R_extr, R_intr, actions, hidden, dones):
    actions_onehot = one_hot(actions, self.action_size)  # unused here
    feed_dict = {self.train_policy.state: state,
                 self.train_policy.actions: actions,
                 self.next_state: next_state,
                 self.train_policy.R_extr: R_extr,
                 self.train_policy.R_intr: R_intr,
                 self.train_policy.hidden_in[0]: hidden[0],
                 self.train_policy.hidden_in[1]: hidden[1],
                 self.train_policy.mask: dones}
    _, l = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)
    return l
def backprop(self, states, locs, R, actions):
    x, y = zip(*locs)
    Qsa = self.forward(totorch(states, self.device),
                       torch.tensor(x).to(self.device),
                       torch.tensor(y).to(self.device))
    actions_onehot = totorch(one_hot(actions, self.action_size), self.device)
    Qvalue = torch.sum(Qsa * actions_onehot, dim=1)  # Q(s, a) for the taken actions
    loss = torch.mean(torch.square(totorch(R, self.device).float() - Qvalue))

    loss.backward()
    self.optim.step()
    self.optim.zero_grad()
    return loss.detach().cpu().numpy()
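# Illustration only: the loss minimised by backprop() above is the mean squared error
# between the return R and Q(s, a) for the taken actions. A self-contained demo with
# dummy tensors; the function name, batch size and action count are arbitrary.
import torch
import torch.nn.functional as F

def _td_loss_demo(batch_size=32, n_actions=6):
    Qsa = torch.randn(batch_size, n_actions, requires_grad=True)  # stand-in for the network output
    acts = torch.randint(n_actions, (batch_size,))
    R = torch.randn(batch_size)                                   # stand-in for n-step returns
    Qvalue = torch.sum(Qsa * F.one_hot(acts, n_actions).float(), dim=1)  # Q(s, a) for taken actions
    loss = torch.mean((R - Qvalue) ** 2)
    loss.backward()  # gradients flow back into Qsa
    return loss.item()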
def backprop(self, state, next_state, R_extr, R_intr, Adv, actions, state_mean, state_std):
    actions_onehot = one_hot(actions, self.action_size)  # unused here
    feed_dict = {self.policy.state: state,
                 self.policy.actions: actions,
                 self.policy.R_extr: R_extr,
                 self.policy.R_intr: R_intr,
                 self.policy.Advantage: Adv,
                 self.next_state: next_state,
                 self.state_mean: state_mean,
                 self.state_std: state_std}
    _, l = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)
    return l
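# The state_mean and state_std fed above are presumably running statistics used to
# normalise next_state for the intrinsic-reward (RND-style) predictor. A minimal
# running mean/std tracker in the style of OpenAI baselines' RunningMeanStd; the
# class name and interface are illustrative, not this codebase's implementation.
import numpy as np

class RunningMeanStd:
    def __init__(self, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = 1e-4

    def update(self, x):
        # parallel-variance update over the leading (batch) axis
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m2 = self.var * self.count + batch_var * batch_count + delta**2 * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total

    @property
    def std(self):
        return np.sqrt(self.var)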
def concat_action_reward(actions, rewards, num_classes):
    # one-hot encode the actions and store the reward in the final column;
    # callers are expected to pass num_classes = action_size + 1 so the reward
    # slot does not overwrite a real action bit
    concat = one_hot(actions, num_classes)
    concat[:, -1] = rewards
    return concat
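# Example (assuming one_hot returns a float array, e.g. np.eye(num_classes)[actions]):
# with action_size = 4 and num_classes = 5, the last column carries the reward.
#
#   concat_action_reward(np.array([0, 2, 3]), np.array([0.0, 1.0, 0.5]), 5)
#   # -> [[1., 0., 0., 0., 0. ],
#   #     [0., 0., 1., 0., 1. ],
#   #     [0., 0., 0., 1., 0.5]]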