Example #1
 def policy(self, inputs, states):
     """
     We first compute the successor features and then use the goal vector to
     compute the Q values.
     """
     # Successor features: one feature vector per (state, action) pair.
     srs, states = self.sr(inputs, states)
     # Broadcast the goal vector across actions so it lines up with srs.
     goal = self.goal(inputs).unsqueeze(1).expand(-1, self.num_actions, -1)
     # Q(s, a) is the dot product of the goal vector and the successor features.
     q_value = torch.sum(torch.mul(goal, srs),
                         dim=-1).view(-1, self.num_actions)
     return dict(action=comf.q_categorical(q_value)), states
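A minimal, self-contained sketch of the shape arithmetic behind this Q-value computation; `batch_size`, `num_actions`, and `feature_dim` are illustrative assumptions, not values taken from the original code.

 import torch

 # Hypothetical sizes for illustration only.
 batch_size, num_actions, feature_dim = 4, 6, 16

 # Successor features: one feature vector per (state, action) pair.
 srs = torch.randn(batch_size, num_actions, feature_dim)

 # Goal vector: one per state, broadcast across actions to match srs.
 goal = torch.randn(batch_size, feature_dim)
 goal = goal.unsqueeze(1).expand(-1, num_actions, -1)

 # Q(s, a) = <goal(s), sr(s, a)>: multiply elementwise, sum over features.
 q_value = torch.sum(torch.mul(goal, srs), dim=-1).view(-1, num_actions)
 print(q_value.shape)  # torch.Size([4, 6])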
Example #2
 def policy(self, inputs, states):
     # The value network already outputs Q values; read them out directly.
     values, states = self.value(inputs, states)
     q_value = values["q_value"]
     return dict(action=comf.q_categorical(q_value)), states
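The exact behavior of `comf.q_categorical` comes from the surrounding framework and is not shown here; a common stand-in, sketched below purely as an assumption, is to sample an action index from a softmax over the Q values.

 import torch
 import torch.nn.functional as F

 def q_categorical_sketch(q_value, temperature=1.0):
     """Hypothetical stand-in for comf.q_categorical: sample one action per
     row from a softmax over the Q values (assumed behavior, not the
     framework's actual implementation)."""
     probs = F.softmax(q_value / temperature, dim=-1)
     return torch.multinomial(probs, num_samples=1)

 q_value = torch.randn(4, 6)          # hypothetical batch of Q values
 action = q_categorical_sketch(q_value)
 print(action.shape)                  # torch.Size([4, 1])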