Example #1
        def run(self):
            rollout = []
            for t in range(self.num_steps):
                Qsa = self.Q.forward(self.states)
                actions = np.argmax(Qsa, axis=1)
                random = np.random.uniform(size=(self.num_envs))
                random_actions = np.random.randint(self.action_size,
                                                   size=(self.num_envs))
                actions = np.where(random < self.epsilon, random_actions,
                                   actions)
                next_states, rewards, dones, infos = self.env.step(actions)
                rollout.append((self.states, actions, rewards, dones, infos))
                self.states = next_states
                self.schedule.step()
                #print('epsilon', self.epsilon)

            states, actions, rewards, dones, infos = zip(*rollout)
            states, actions, rewards, dones = np.stack(states), np.stack(
                actions), np.stack(rewards), np.stack(dones)
            TargetQsa = unfold_batch(self.TargetQ.forward(fold_batch(states)),
                                     self.num_steps,
                                     self.num_envs)  # Q(s,a; theta-1)
            values = np.sum(TargetQsa * one_hot(actions, self.action_size),
                            axis=-1)  # Q(s, argmax_a Q(s,a; theta); theta-1)

            last_actions = np.argmax(self.Q.forward(next_states), axis=1)
            last_TargetQsa = self.TargetQ.forward(
                next_states)  # Q(s,a; theta-1)
            last_values = np.sum(
                last_TargetQsa * one_hot(last_actions, self.action_size),
                axis=-1)  # Q(s, argmax_a Q(s,a; theta); theta-1)
            return states, actions, rewards, dones, infos, values, last_values
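Example #1 calls a few helpers (one_hot, fold_batch, unfold_batch) whose definitions are not shown. A minimal sketch of what they could look like, assuming fold_batch merges the (num_steps, num_envs) leading axes into a single batch axis and unfold_batch reverses it:

import numpy as np

# assumed implementations of the helpers used above
def one_hot(indices, depth):
    # integer array of any shape -> same shape with a trailing one-hot axis of size depth
    return np.eye(depth, dtype=np.float32)[indices]

def fold_batch(x):
    # (num_steps, num_envs, ...) -> (num_steps * num_envs, ...)
    t, n = x.shape[:2]
    return x.reshape(t * n, *x.shape[2:])

def unfold_batch(x, num_steps, num_envs):
    # (num_steps * num_envs, ...) -> (num_steps, num_envs, ...)
    return x.reshape(num_steps, num_envs, *x.shape[1:])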
Example #2
    def rollout(self):
        rollout = []
        for t in range(self.nsteps):
            Qsa = self.eval_state(self.states, self.loc)
            actions = np.argmax(Qsa, axis=1)
            random = np.random.uniform(size=(self.num_envs))
            random_actions = np.random.randint(self.action_size,
                                               size=(self.num_envs))
            actions = np.where(random < self.epsilon, random_actions, actions)
            next_states, rewards, dones, infos = self.env.step(actions)
            values = np.sum(Qsa * one_hot(actions, self.action_size), axis=-1)
            rollout.append((self.states, self.loc, actions, rewards, dones,
                            infos, values))
            self.states = next_states
            self.epsilon = self.scheduler.step()
            self.loc = self.get_locs()

        states, locs, actions, rewards, dones, infos, values = stack_many(*zip(
            *rollout))

        last_Qsa = self.eval_state(next_states, self.loc)  # Q(s,a|theta)
        last_actions = np.argmax(last_Qsa, axis=1)
        last_values = np.sum(last_Qsa *
                             one_hot(last_actions, self.action_size),
                             axis=-1)
        return states, locs, actions, rewards, dones, infos, values, last_values
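stack_many is likewise not shown; a plausible minimal version, assuming it simply stacks each zipped rollout field along a new leading time axis:

import numpy as np

def stack_many(*seqs):
    # stack each per-step sequence of arrays into one array with a leading time axis
    return tuple(np.stack(seq) for seq in seqs)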
Example #3
        def sample_replay(self):
            states, actions, rewards, dones, next_states = self.replay.sample(
                self.num_steps)
            TargetQsa = unfold_batch(self.TargetQ.forward(fold_batch(states)),
                                     self.num_steps,
                                     self.num_envs)  # Q(s,a; theta-1)
            values = np.sum(TargetQsa * one_hot(actions, self.action_size),
                            axis=-1)  # Q(s, argmax_a Q(s,a; theta); theta-1)

            last_actions = np.argmax(self.Q.forward(next_states), axis=1)
            last_TargetQsa = self.TargetQ.forward(
                next_states)  # Q(s,a; theta-1)
            last_values = np.sum(
                last_TargetQsa * one_hot(last_actions, self.action_size),
                axis=-1)  # Q(s, argmax_a Q(s,a; theta); theta-1)
            return states, actions, rewards, dones, 0, values, last_values
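The rollout and replay methods above all return per-step rewards, dones and a bootstrap last_values, which are typically converted into n-step targets before the backprop call. A hedged sketch of that conversion (the discount gamma and the masking convention are assumptions, not part of the examples):

import numpy as np

def nstep_return(rewards, dones, last_values, gamma=0.99):
    # rewards, dones: (num_steps, num_envs); last_values: (num_envs,)
    R = np.zeros_like(rewards, dtype=np.float32)
    running = last_values
    for t in reversed(range(rewards.shape[0])):
        # drop the bootstrap term wherever the episode terminated at step t
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        R[t] = running
    return R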
Example #4
    def backprop(self, state, next_state, R_extr, R_intr, Adv, actions,
                 Qaux_target, reward_states, target_rewards, state_mean,
                 state_std):
        actions_onehot = one_hot(actions, self.action_size)
        feed_dict = {self.policy.state: state, self.policy.actions: actions,
                     self.policy.R_extr: R_extr, self.policy.R_intr: R_intr,
                     self.policy.Advantage: Adv, self.next_state: next_state,
                     self.state_mean: state_mean, self.state_std: state_std,
                     self.Qaux_target: Qaux_target,
                     self.reward_target: target_rewards,
                     self.reward_state: reward_states}

        _, l = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)
        return l
Example #5
    def backprop(self, state, next_state, R_extr, R_intr, actions, hidden,
                 dones):
        actions_onehot = one_hot(actions, self.action_size)
        feed_dict = {self.train_policy.state: state,
                     self.train_policy.actions: actions,
                     self.next_state: next_state,
                     self.train_policy.R_extr: R_extr,
                     self.train_policy.R_intr: R_intr,
                     self.train_policy.hidden_in[0]: hidden[0],
                     self.train_policy.hidden_in[1]: hidden[1],
                     self.train_policy.mask: dones}

        _, l = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)
        return l
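The recurrent variant feeds an LSTM state tuple and an episode mask, but the corresponding placeholder declarations are not shown. A minimal TF1-style sketch under assumed names and shapes (cell_size and the placeholder names are made up for illustration):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # these examples are written against the TF1 graph API

cell_size = 256  # assumed LSTM size
hidden_in = (tf.placeholder(tf.float32, [None, cell_size], name='lstm_c'),
             tf.placeholder(tf.float32, [None, cell_size], name='lstm_h'))
mask = tf.placeholder(tf.float32, [None], name='episode_mask')  # 1.0 where the episode ended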
Example #6
    def backprop(self, states, locs, R, actions):
        x, y = zip(*locs)
        Qsa = self.forward(totorch(states, self.device),
                           torch.tensor(x).to(self.device),
                           torch.tensor(y).to(self.device))  # both location coords on the model device
        actions_onehot = totorch(one_hot(actions, self.action_size),
                                 self.device)
        Qvalue = torch.sum(Qsa * actions_onehot, dim=1)
        loss = torch.mean(torch.square(totorch(R, self.device).float() - Qvalue))

        loss.backward()
        self.optim.step()
        self.optim.zero_grad()
        return loss.detach().cpu().numpy()
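totorch is another undocumented helper; it is presumably a thin NumPy-to-tensor conversion along these lines (an assumption, not the repo's actual definition):

import numpy as np
import torch

def totorch(x, device='cpu'):
    # convert a NumPy array (or anything np.asarray accepts) to a float32 tensor on the given device
    return torch.from_numpy(np.asarray(x, dtype=np.float32)).to(device)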
Example #7
    def backprop(self, state, next_state, R_extr, R_intr, Adv, actions,
                 state_mean, state_std):
        actions_onehot = one_hot(actions, self.action_size)
        feed_dict = {
            self.policy.state: state,
            self.policy.actions: actions,
            self.policy.R_extr: R_extr,
            self.policy.R_intr: R_intr,
            self.policy.Advantage: Adv,
            self.next_state: next_state,
            self.state_mean: state_mean,
            self.state_std: state_std
        }

        _, l = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)
        return l
Example #8
def concat_action_reward(actions, rewards, num_classes):
    # one-hot encode the actions, then overwrite the last column with the reward;
    # num_classes is therefore expected to include one extra slot for the reward
    concat = one_hot(actions, num_classes)
    concat[:, -1] = rewards
    return concat
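Because the last one-hot column is overwritten rather than appended, num_classes should be the real action count plus one. A small usage sketch under that assumption (the action_size of 3 here is made up):

import numpy as np

actions = np.array([0, 2, 1])
rewards = np.array([0.0, 1.0, 0.5])
x = concat_action_reward(actions, rewards, num_classes=3 + 1)
# x[:, :3] holds the one-hot action, x[:, -1] holds the reward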