Example #1
import numpy as np

    def __init__(
        self,
        env,
        qf,
        vf,
        sac_kwargs,
        tdm_kwargs,
        base_kwargs,
        policy=None,
        replay_buffer=None,
        give_terminal_reward=False,
    ):
        # Initialize both parents explicitly: SAC receives the environment,
        # networks, and replay buffer, while the TDM mixin receives only its
        # own keyword arguments.
        SoftActorCritic.__init__(self,
                                 env=env,
                                 policy=policy,
                                 qf=qf,
                                 vf=vf,
                                 replay_buffer=replay_buffer,
                                 **sac_kwargs,
                                 **base_kwargs)
        TemporalDifferenceModel.__init__(self, **tdm_kwargs)
        action_space_diff = (self.env.action_space.high -
                             self.env.action_space.low)

        # TODO(vitchyr): Maybe add this to the main SAC code.
        # The terminal bonus is the entropy of a uniform distribution over
        # the action box: -log(1 / range) == log(range), summed per dimension.
        terminal_reward = 0
        for dim in range(action_space_diff.size):
            terminal_reward += -np.log(1. / action_space_diff[dim])
        self.terminal_bonus = float(terminal_reward)
        self.give_terminal_reward = give_terminal_reward
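
As a quick sanity check on the terminal bonus computed above: for a hypothetical action space bounded by [-1, 1] in each of two dimensions (an assumed toy example, not from the original code), action_space_diff is [2., 2.], so the bonus is 2 * log(2) ≈ 1.386, the entropy of a uniform distribution over that box. A minimal standalone sketch:

import numpy as np

# Toy bounds, assumed for illustration only: actions live in [-1, 1]^2.
high = np.array([1., 1.])
low = np.array([-1., -1.])
action_space_diff = high - low  # [2., 2.]

# Same computation as in the example: sum of -log(1 / range) per dimension.
terminal_bonus = float(sum(-np.log(1. / action_space_diff[dim])
                           for dim in range(action_space_diff.size)))
print(terminal_bonus)  # 2 * log(2) ~= 1.3863
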
Example #2
    def __init__(self,
                 *args,
                 observation_key=None,
                 desired_goal_key=None,
                 **kwargs):
        # Initialize the HER mixin with the keys used to look up observations
        # and goals in the batch dictionaries, then let SAC consume the rest.
        HER.__init__(
            self,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
        )
        SoftActorCritic.__init__(self, *args, **kwargs)
        # HER relabeling only works with a goal-conditioned replay buffer.
        assert isinstance(self.replay_buffer, ObsDictRelabelingBuffer)
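
Both examples rely on the same composition pattern: each parent class's __init__ is invoked explicitly rather than through super(), so every mixin receives exactly the arguments it expects. A minimal self-contained sketch of that pattern, using hypothetical class names (Logger, Worker, and LoggingWorker are illustrative and not part of the original code):

class Logger:
    def __init__(self, prefix):
        self.prefix = prefix


class Worker:
    def __init__(self, task):
        self.task = task


class LoggingWorker(Logger, Worker):
    def __init__(self, prefix, task):
        # Call each parent explicitly, as the examples above do, so each
        # receives only its own keyword arguments.
        Logger.__init__(self, prefix=prefix)
        Worker.__init__(self, task=task)


lw = LoggingWorker(prefix="[job]", task="relabel")
print(lw.prefix, lw.task)  # [job] relabel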