def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
    feed_dict: Dict[tf.Tensor, Any] = {
        self.policy.model.batch_size: len(mini_batch["actions"]),
        self.policy.model.sequence_length: self.policy.sequence_length,
    }
    if self.model.use_vail:
        # Disable the VAIL noise while evaluating rewards.
        feed_dict[self.model.use_noise] = [0]
    if self.policy.use_vec_obs:
        feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
    if self.policy.model.vis_obs_size > 0:
        for i in range(len(self.policy.model.visual_in)):
            _obs = mini_batch["visual_obs%d" % i]
            feed_dict[self.policy.model.visual_in[i]] = _obs
    if self.policy.use_continuous_act:
        feed_dict[self.policy.model.selected_actions] = mini_batch["actions"]
    else:
        feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
    feed_dict[self.model.done_policy_holder] = np.array(
        mini_batch["done"]
    ).flatten()
    unscaled_reward = self.policy.sess.run(
        self.model.intrinsic_reward, feed_dict=feed_dict
    )
    # Reward stays zero until the model has been updated at least once.
    scaled_reward = unscaled_reward * float(self.has_updated) * self.strength
    return RewardSignalResult(scaled_reward, unscaled_reward)
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
    feed_dict: Dict[tf.Tensor, Any] = {
        self.policy.batch_size_ph: len(mini_batch["actions"]),
        self.policy.sequence_length_ph: self.policy.sequence_length,
    }
    if self.policy.use_vec_obs:
        feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
        feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]
    if self.policy.vis_obs_size > 0:
        for i in range(len(self.policy.visual_in)):
            _obs = mini_batch["visual_obs%d" % i]
            _next_obs = mini_batch["next_visual_obs%d" % i]
            feed_dict[self.policy.visual_in[i]] = _obs
            feed_dict[self.model.next_visual_in[i]] = _next_obs
    if self.policy.use_continuous_act:
        feed_dict[self.policy.selected_actions] = mini_batch["actions"]
    else:
        feed_dict[self.policy.output] = mini_batch["actions"]
    unscaled_reward = self.policy.sess.run(
        self.model.intrinsic_reward, feed_dict=feed_dict
    )
    scaled_reward = np.clip(
        unscaled_reward * float(self.has_updated) * self.strength, 0, 1
    )
    return RewardSignalResult(scaled_reward, unscaled_reward)
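# Illustration only (the values are made up): the final scaling step shared by the
# two evaluate_batch variants above. `has_updated` gates the reward to zero until
# the underlying model has been trained at least once, and the second variant
# additionally clips the scaled reward to [0, 1].
import numpy as np

strength, has_updated = 0.02, True
unscaled_reward = np.array([0.3, 1.7, -0.4], dtype=np.float32)
scaled_reward = np.clip(unscaled_reward * float(has_updated) * strength, 0, 1)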
def test_add_rewards_output(dummy_config): brain_params = BrainParameters("test_brain", 1, 1, [], [2], [], 0) dummy_config["summary_path"] = "./summaries/test_trainer_summary" dummy_config["model_path"] = "./models/test_trainer_models/TestModel" trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False) rewardsout = AllRewardsOutput( reward_signals={ "extrinsic": RewardSignalResult(scaled_reward=np.array([1.0, 1.0]), unscaled_reward=np.array([1.0, 1.0])) }, environment=np.array([1.0, 1.0]), ) values = {"extrinsic": np.array([[2.0]])} agent_id = "123" idx = 0 # make sure that we're grabbing from the next_idx for rewards. If we're not, the test will fail. next_idx = 1 trainer.add_rewards_outputs( rewardsout, values=values, agent_id=agent_id, agent_idx=idx, agent_next_idx=next_idx, ) assert trainer.training_buffer[agent_id]["extrinsic_value_estimates"][ 0] == 2.0 assert trainer.training_buffer[agent_id]["extrinsic_rewards"][0] == 1.0
def evaluate(
    self, current_info: BrainInfo, next_info: BrainInfo
) -> RewardSignalResult:
    """
    Evaluates the reward for the agents present in current_info given the next_info
    :param current_info: The current BrainInfo.
    :param next_info: The BrainInfo from the next timestep.
    :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward)
        provided by the generator
    """
    if len(current_info.agents) == 0:
        return RewardSignalResult([], [])
    mini_batch: Dict[str, np.array] = {}
    # Construct the batch and use evaluate_batch
    mini_batch["actions"] = next_info.previous_vector_actions
    mini_batch["done"] = np.reshape(next_info.local_done, [-1, 1])
    for i in range(len(current_info.visual_observations)):
        mini_batch["visual_obs%d" % i] = current_info.visual_observations[i]
        mini_batch["next_visual_obs%d" % i] = next_info.visual_observations[i]
    if self.policy.use_vec_obs:
        mini_batch["vector_obs"] = current_info.vector_observations
        mini_batch["next_vector_in"] = next_info.vector_observations
    result = self.evaluate_batch(mini_batch)
    return result
def evaluate(
    self, current_info: BrainInfo, next_info: BrainInfo
) -> RewardSignalResult:
    if len(current_info.agents) == 0:
        return []
    feed_dict: Dict[tf.Tensor, Any] = {
        self.policy.model.batch_size: len(next_info.vector_observations),
        self.policy.model.sequence_length: 1,
    }
    if self.model.use_vail:
        feed_dict[self.model.use_noise] = [0]
    feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info)
    feed_dict[self.model.done_policy] = np.reshape(next_info.local_done, [-1, 1])
    if self.policy.use_continuous_act:
        feed_dict[
            self.policy.model.selected_actions
        ] = next_info.previous_vector_actions
    else:
        feed_dict[
            self.policy.model.action_holder
        ] = next_info.previous_vector_actions
    if self.policy.use_recurrent:
        if current_info.memories.shape[1] == 0:
            current_info.memories = self.policy.make_empty_memory(
                len(current_info.agents)
            )
        feed_dict[self.policy.model.memory_in] = current_info.memories
    unscaled_reward = self.policy.sess.run(
        self.model.intrinsic_reward, feed_dict=feed_dict
    )
    scaled_reward = unscaled_reward * float(self.has_updated) * self.strength
    return RewardSignalResult(scaled_reward, unscaled_reward)
def evaluate(self, current_info: BrainInfo, action: np.array, next_info: BrainInfo) -> RewardSignalResult: """ Evaluates the reward for the agents present in current_info given the next_info :param current_info: The current BrainInfo. :param next_info: The BrainInfo from the next timestep. :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator """ unscaled_reward = np.array(next_info.rewards) scaled_reward = self.strength * unscaled_reward return RewardSignalResult(scaled_reward, unscaled_reward)
def evaluate(
    self, current_info: BrainInfo, action: np.array, next_info: BrainInfo
) -> RewardSignalResult:
    if len(current_info.agents) == 0:
        return RewardSignalResult([], [])
    mini_batch: Dict[str, np.array] = {}
    # Construct the batch
    mini_batch["actions"] = action
    mini_batch["done"] = np.reshape(next_info.local_done, [-1, 1])
    for i, obs in enumerate(current_info.visual_observations):
        mini_batch["visual_obs%d" % i] = obs
    if self.policy.use_vec_obs:
        mini_batch["vector_obs"] = current_info.vector_observations
    result = self.evaluate_batch(mini_batch)
    return result
def evaluate( self, current_info: BrainInfo, next_info: BrainInfo ) -> RewardSignalResult: """ Evaluates the reward for the agents present in current_info given the next_info :param current_info: The current BrainInfo. :param next_info: The BrainInfo from the next timestep. :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator """ if len(current_info.agents) == 0: return [] feed_dict = { self.policy.model.batch_size: len(next_info.vector_observations), self.policy.model.sequence_length: 1, } feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info) if self.policy.use_continuous_act: feed_dict[ self.policy.model.selected_actions ] = next_info.previous_vector_actions else: feed_dict[ self.policy.model.action_holder ] = next_info.previous_vector_actions for i in range(self.policy.model.vis_obs_size): feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[i] if self.policy.use_vec_obs: feed_dict[self.model.next_vector_in] = next_info.vector_observations if self.policy.use_recurrent: if current_info.memories.shape[1] == 0: current_info.memories = self.policy.make_empty_memory( len(current_info.agents) ) feed_dict[self.policy.model.memory_in] = current_info.memories unscaled_reward = self.policy.sess.run( self.model.intrinsic_reward, feed_dict=feed_dict ) scaled_reward = np.clip( unscaled_reward * float(self.has_updated) * self.strength, 0, 1 ) return RewardSignalResult(scaled_reward, unscaled_reward)
def test_add_rewards_output(dummy_config):
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
    rewardsout = AllRewardsOutput(
        reward_signals={
            "extrinsic": RewardSignalResult(
                scaled_reward=np.array([1.0, 1.0], dtype=np.float32),
                unscaled_reward=np.array([1.0, 1.0], dtype=np.float32),
            )
        },
        environment=np.array([1.0, 1.0], dtype=np.float32),
    )
    values = {"extrinsic": np.array([[2.0]], dtype=np.float32)}
    agent_id = "123"
    idx = 0
    # Make sure that we're grabbing from the next_idx for rewards. If we're not,
    # the test will fail.
    next_idx = 1
    trainer.add_rewards_outputs(
        rewardsout,
        values=values,
        agent_id=agent_id,
        agent_idx=idx,
        agent_next_idx=next_idx,
    )
    assert trainer.processing_buffer[agent_id]["extrinsic_value_estimates"][0] == 2.0
    assert trainer.processing_buffer[agent_id]["extrinsic_rewards"][0] == 1.0
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
    env_rews = np.array(mini_batch["environment_rewards"])
    return RewardSignalResult(self.strength * env_rews, env_rews)
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
    env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32)
    return RewardSignalResult(self.strength * env_rews, env_rews)
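# Illustration only: the extrinsic evaluate_batch above reduces to a single
# multiplication by `strength`. The numbers below are made up; in the trainer the
# environment rewards come from the collected buffer and `strength` from the
# reward-signal configuration.
import numpy as np

strength = 0.5
env_rews = np.array([1.0, -1.0, 0.0], dtype=np.float32)
scaled_reward, unscaled_reward = strength * env_rews, env_rews
assert np.allclose(scaled_reward, [0.5, -0.5, 0.0])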