def __init__(
        self,
        env,
        qf,
        vf,
        sac_kwargs,
        tdm_kwargs,
        base_kwargs,
        policy=None,
        replay_buffer=None,
        give_terminal_reward=False,
):
    # Initialize the SAC machinery first, then mix in the TDM settings.
    SoftActorCritic.__init__(
        self,
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        **sac_kwargs,
        **base_kwargs
    )
    TemporalDifferenceModel.__init__(self, **tdm_kwargs)
    action_space_diff = (
        self.env.action_space.high - self.env.action_space.low
    )
    # TODO(vitchyr): Maybe add this to the main SAC code.
    # Terminal bonus: sum of -log(1 / range) over each action dimension.
    terminal_reward = 0
    for dim in range(action_space_diff.size):
        terminal_reward += -np.log(1. / action_space_diff[dim])
    self.terminal_bonus = float(terminal_reward)
    self.give_terminal_reward = give_terminal_reward
def __init__(
        self,
        *args,
        observation_key=None,
        desired_goal_key=None,
        **kwargs
):
    HER.__init__(
        self,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    SoftActorCritic.__init__(self, *args, **kwargs)
    # HER relabeling requires a dict-based replay buffer.
    assert isinstance(self.replay_buffer, ObsDictRelabelingBuffer)