def __init__(self,
             vec_env,
             vec_episodic_memory,
             observation_embedding_fn,
             target_image_shape,
             exploration_reward='episodic_curiosity',
             scale_task_reward=1.0,
             scale_surrogate_reward=0.0,
             append_ec_reward_as_channel=False,
             bonus_reward_additive_term=0,
             exploration_reward_min_step=0,
             similarity_threshold=0.5):
  """Creates a wrapper that augments task rewards with a curiosity bonus."""
  if exploration_reward == 'episodic_curiosity':
    if len(vec_episodic_memory) != vec_env.num_envs:
      raise ValueError('Each env must have a unique episodic memory.')

  # Note: post-processing of the observation might change the [0, 255]
  # range of the observation...
  if self._should_postprocess_observation(vec_env.observation_space.shape):
    observation_space_shape = target_image_shape[:]
    if append_ec_reward_as_channel:
      observation_space_shape[-1] += 1
    observation_space = gym.spaces.Box(
        low=0, high=255, shape=observation_space_shape, dtype=float)
  else:
    observation_space = vec_env.observation_space
    assert not append_ec_reward_as_channel, (
        'append_ec_reward_as_channel not compatible with non-image-like obs.')

  VecEnvWrapper.__init__(self, vec_env, observation_space=observation_space)

  self._bonus_reward_additive_term = bonus_reward_additive_term
  self._vec_episodic_memory = vec_episodic_memory
  self._observation_embedding_fn = observation_embedding_fn
  self._target_image_shape = target_image_shape
  self._append_ec_reward_as_channel = append_ec_reward_as_channel
  self._exploration_reward = exploration_reward
  self._scale_task_reward = scale_task_reward
  self._scale_surrogate_reward = scale_surrogate_reward
  self._exploration_reward_min_step = exploration_reward_min_step

  # Oracle reward.
  self._oracles = [
      oracle.OracleExplorationReward() for _ in range(self.venv.num_envs)
  ]

  # Cumulative task reward over an episode.
  self._episode_task_reward = [0.0] * self.venv.num_envs
  self._episode_bonus_reward = [0.0] * self.venv.num_envs

  # Stats on the task and exploration reward.
  self._stats_task_reward = MovingAverage(capacity=100)
  self._stats_bonus_reward = MovingAverage(capacity=100)

  # Total number of steps so far per environment.
  self._step_count = 0

  self._similarity_threshold = similarity_threshold

  # Observers are notified each time a new time step is generated by the
  # environment. Observers implement a function "on_new_observation".
  self._observers = []
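
# A minimal sketch (not this wrapper's actual step() logic) of how the fields
# configured above are typically combined: a curiosity bonus, derived from the
# similarity of the current observation to the episodic memory, is scaled and
# added to the scaled task reward once the step count passes
# exploration_reward_min_step. The helper name `compute_mixed_rewards` and the
# `similarities` input (one similarity-to-memory score per env) are
# illustrative assumptions, not part of this module.
def compute_mixed_rewards(task_rewards, similarities, step_count,
                          scale_task_reward=1.0, scale_surrogate_reward=0.0,
                          similarity_threshold=0.5,
                          bonus_reward_additive_term=0,
                          exploration_reward_min_step=0):
  """Returns per-env rewards mixing the task reward and a curiosity bonus."""
  mixed = []
  for task_reward, similarity in zip(task_rewards, similarities):
    # Reward observations that are still "far" from everything in memory.
    bonus = (float(similarity < similarity_threshold)
             + bonus_reward_additive_term)
    # The bonus is only switched on after a warm-up period.
    scale_bonus = (scale_surrogate_reward
                   if step_count >= exploration_reward_min_step else 0.0)
    mixed.append(scale_task_reward * task_reward + scale_bonus * bonus)
  return mixed

# Example usage: two envs, the second observation is novel (low similarity):
#   compute_mixed_rewards([0.0, 1.0], [0.9, 0.1], step_count=10,
#                         scale_surrogate_reward=0.03)  -> [0.0, 1.03]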
def __init__(self, env):
  """Creates a new oracle to compute the exploration reward."""
  gym.Wrapper.__init__(self, env)
  self._oracle_exploration_reward = oracle.OracleExplorationReward()
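
# A minimal sketch of the kind of oracle reward this wrapper relies on: it
# pays a fixed bonus the first time a discretized agent position is visited.
# The class below is an illustrative stand-in under that assumption, not the
# actual oracle.OracleExplorationReward implementation; the real oracle's
# reward shape and discretization may differ.
class _SketchOracleExplorationReward(object):
  """Rewards first visits to discretized agent positions."""

  def __init__(self, cell_size=1.0, reward_per_new_cell=1.0):
    self._cell_size = cell_size
    self._reward_per_new_cell = reward_per_new_cell
    self._visited_cells = set()

  def update_position(self, position):
    """Returns the reward for a position tuple such as (x, y, z)."""
    cell = tuple(int(coord // self._cell_size) for coord in position)
    if cell in self._visited_cells:
      return 0.0
    self._visited_cells.add(cell)
    return self._reward_per_new_cell

  def reset(self):
    """Clears the set of visited cells at the start of a new episode."""
    self._visited_cells.clear()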
def __init__(
    self,
    vec_env,
    vec_episodic_memory,
    observation_embedding_fn,
    intrinsic_reward_fn,
    rlb_image_shape,
    target_image_shape,
    exploration_reward='rlb',
    scale_task_reward=1.0,
    scale_surrogate_reward=None,
    exploration_reward_min_step=0,
    ir_normalize_type=0,
    ir_clip_low=None,
    name='',
):
  """Creates the RLB vectorized environment wrapper."""
  logger.info('RLBEnvWrapper args: {}'.format(locals()))

  if exploration_reward == 'rlb':
    if len(vec_episodic_memory) != vec_env.num_envs:
      raise ValueError('Each env must have a unique episodic memory.')

  if target_image_shape is None:
    target_image_shape = rlb_image_shape

  if self._should_process_observation(vec_env.observation_space.shape):
    observation_space_shape = target_image_shape[:]
    observation_space = gym.spaces.Box(
        low=0, high=255, shape=observation_space_shape, dtype=float)
  else:
    observation_space = vec_env.observation_space

  VecEnvWrapper.__init__(self, vec_env, observation_space=observation_space)

  self._vec_episodic_memory = vec_episodic_memory
  self._observation_embedding_fn = observation_embedding_fn
  self._intrinsic_reward_fn = intrinsic_reward_fn
  self._rlb_image_shape = rlb_image_shape
  self._target_image_shape = target_image_shape
  self._exploration_reward = exploration_reward
  self._scale_task_reward = scale_task_reward
  self._scale_surrogate_reward = scale_surrogate_reward
  self._exploration_reward_min_step = exploration_reward_min_step

  # Oracle reward.
  self._oracles = [
      oracle.OracleExplorationReward() for _ in range(self.venv.num_envs)
  ]

  # Intrinsic-reward normalization setup.
  self._ir_normalize_type = ir_normalize_type
  if self._ir_normalize_type == 0:
    # Type 0: no normalization state.
    pass
  elif self._ir_normalize_type == 1:
    # Type 1: discounted forward filter plus running mean/std statistics.
    ir_normalize_gamma = 0.99
    self._irff = RewardForwardFilter(ir_normalize_gamma)
    self._irff_rms = RunningMeanStd()
  elif self._ir_normalize_type == 2:
    # Type 2: running mean/std of the raw intrinsic reward.
    self._ir_rms = RunningMeanStd()
  elif self._ir_normalize_type == 3:
    # Type 3: exponentially weighted moving mean/std.
    self._ir_rms = SimpleWeightedMovingScalarMeanStd(alpha=0.0001)
  else:
    raise ValueError(
        'Unknown ir_normalize_type: {}'.format(ir_normalize_type))
  self._ir_clip_low = ir_clip_low

  self._name = name

  # Cumulative task reward over an episode.
  self._episode_task_reward = [0.0] * self.venv.num_envs
  self._episode_bonus_reward = [0.0] * self.venv.num_envs

  # Stats on the task and exploration reward.
  self._stats_task_reward = MovingAverage(capacity=100)
  self._stats_bonus_reward = MovingAverage(capacity=100)

  # Total number of steps so far per environment.
  self._step_count = 0

  # Observers are notified each time a new time step is generated by the
  # environment.
  self._observers = []

  self._bonus_reward_raw_history = [[] for _ in range(self.venv.num_envs)]
  self._bonus_reward_history = [[] for _ in range(self.venv.num_envs)]
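
# A rough sketch of what the ir_normalize_type=1 machinery above usually
# computes. _SketchRewardForwardFilter follows the common RND-style
# definition of RewardForwardFilter (a per-env discounted sum of intrinsic
# rewards); the versions imported by this module may differ in detail, and
# the `_sketch_normalize_type_1` helper is an illustrative name, not part of
# this module. It assumes a RunningMeanStd-style object with update() and a
# `var` attribute.
import numpy as np


class _SketchRewardForwardFilter(object):
  """Keeps a per-env discounted sum of intrinsic rewards."""

  def __init__(self, gamma):
    self._gamma = gamma
    self._discounted_sum = None

  def update(self, rewards):
    rewards = np.asarray(rewards, dtype=np.float64)
    if self._discounted_sum is None:
      self._discounted_sum = rewards
    else:
      self._discounted_sum = self._discounted_sum * self._gamma + rewards
    return self._discounted_sum


def _sketch_normalize_type_1(raw_bonus, irff, irff_rms):
  """Divides the raw bonus by the running std of its discounted sum."""
  raw_bonus = np.asarray(raw_bonus, dtype=np.float64)
  discounted = irff.update(raw_bonus)
  irff_rms.update(discounted)
  return raw_bonus / np.sqrt(irff_rms.var + 1e-8)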