def evaluate(
    self, current_info: BrainInfo, next_info: BrainInfo
) -> RewardSignalResult:
    if len(current_info.agents) == 0:
        return []
    feed_dict: Dict[tf.Tensor, Any] = {
        self.policy.model.batch_size: len(next_info.vector_observations),
        self.policy.model.sequence_length: 1,
    }
    if self.model.use_vail:
        feed_dict[self.model.use_noise] = [0]
    feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info)
    feed_dict[self.model.done_policy] = np.reshape(next_info.local_done, [-1, 1])
    if self.policy.use_continuous_act:
        feed_dict[
            self.policy.model.selected_actions
        ] = next_info.previous_vector_actions
    else:
        feed_dict[
            self.policy.model.action_holder
        ] = next_info.previous_vector_actions
    if self.policy.use_recurrent:
        if current_info.memories.shape[1] == 0:
            current_info.memories = self.policy.make_empty_memory(
                len(current_info.agents)
            )
        feed_dict[self.policy.model.memory_in] = current_info.memories
    unscaled_reward = self.policy.sess.run(
        self.model.intrinsic_reward, feed_dict=feed_dict
    )
    scaled_reward = unscaled_reward * float(self.has_updated) * self.strength
    return RewardSignalResult(scaled_reward, unscaled_reward)
def load_demonstration(
    file_path: str,
) -> Tuple[BrainParameters, List[BrainInfo], int]:
    """
    Loads and parses a demonstration file.
    :param file_path: Location of demonstration file (.demo).
    :return: BrainParameter and list of BrainInfos containing demonstration data.
    """
    # First 32 bytes of file dedicated to meta-data.
    INITIAL_POS = 33
    file_paths = []
    if os.path.isdir(file_path):
        all_files = os.listdir(file_path)
        for _file in all_files:
            if _file.endswith(".demo"):
                file_paths.append(os.path.join(file_path, _file))
        # Check the collected .demo paths, not the raw directory listing, so a
        # directory with files but no demonstrations also raises.
        if not file_paths:
            raise ValueError("There are no '.demo' files in the provided directory.")
    elif os.path.isfile(file_path):
        file_paths.append(file_path)
        file_extension = pathlib.Path(file_path).suffix
        if file_extension != ".demo":
            raise ValueError(
                "The file is not a '.demo' file. Please provide a file with the "
                "correct extension."
            )
    else:
        raise FileNotFoundError(
            "The demonstration file or directory {} does not exist.".format(file_path)
        )

    brain_params = None
    brain_infos = []
    total_expected = 0
    for _file_path in file_paths:
        with open(_file_path, "rb") as fp:
            data = fp.read()
        next_pos, pos, obs_decoded = 0, 0, 0
        while pos < len(data):
            next_pos, pos = _DecodeVarint32(data, pos)
            if obs_decoded == 0:
                meta_data_proto = DemonstrationMetaProto()
                meta_data_proto.ParseFromString(data[pos:pos + next_pos])
                total_expected += meta_data_proto.number_steps
                pos = INITIAL_POS
            if obs_decoded == 1:
                brain_param_proto = BrainParametersProto()
                brain_param_proto.ParseFromString(data[pos:pos + next_pos])
                brain_params = BrainParameters.from_proto(brain_param_proto)
                pos += next_pos
            if obs_decoded > 1:
                agent_info = AgentInfoProto()
                agent_info.ParseFromString(data[pos:pos + next_pos])
                brain_info = BrainInfo.from_agent_proto(0, [agent_info], brain_params)
                brain_infos.append(brain_info)
                if len(brain_infos) == total_expected:
                    break
                pos += next_pos
            obs_decoded += 1
    return brain_params, brain_infos, total_expected
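# Usage sketch (not from the original module): load_demonstration accepts either
# a single .demo file or a directory containing .demo files. The path below is a
# hypothetical example, and brain_params.brain_name is assumed to be the
# BrainParameters name field.
brain_params, brain_infos, total_steps = load_demonstration("demos/ExpertCrawler.demo")
print(
    "Loaded {} of {} expected steps for brain {}".format(
        len(brain_infos), total_steps, brain_params.brain_name
    )
)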
def test_take_action_returns_nones_on_missing_values():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    policy.evaluate = MagicMock(return_value={})
    brain_info_with_agents = BrainInfo([], [], [], agents=["an-agent-id"])
    result = policy.get_action(brain_info_with_agents)
    assert result == ActionInfo(None, None, None, None, {})
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    brain_params: BrainParameters,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_process_buffer = ProcessingBuffer()
    demo_buffer = AgentBuffer()
    for idx, experience in enumerate(pair_infos):
        # Stop one short of the end: each transition needs a t+1 pair.
        if idx > len(pair_infos) - 2:
            break
        current_pair_info = pair_infos[idx]
        next_pair_info = pair_infos[idx + 1]
        current_brain_info = BrainInfo.from_agent_proto(
            0, [current_pair_info.agent_info], brain_params
        )
        next_brain_info = BrainInfo.from_agent_proto(
            0, [next_pair_info.agent_info], brain_params
        )
        # Zero action of the right shape for the first step of an episode.
        previous_action = (
            np.array(pair_infos[idx].action_info.vector_actions, dtype=np.float32) * 0
        )
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
            )
        demo_process_buffer[0].last_brain_info = current_brain_info
        demo_process_buffer[0]["done"].append(next_brain_info.local_done[0])
        demo_process_buffer[0]["rewards"].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            demo_process_buffer[0]["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0]
            )
        if brain_params.vector_observation_space_size > 0:
            demo_process_buffer[0]["vector_obs"].append(
                current_brain_info.vector_observations[0]
            )
        demo_process_buffer[0]["actions"].append(
            current_pair_info.action_info.vector_actions
        )
        demo_process_buffer[0]["prev_action"].append(previous_action)
        if next_brain_info.local_done[0]:
            demo_process_buffer.append_to_update_buffer(
                demo_buffer, 0, batch_size=None, training_length=sequence_length
            )
            demo_process_buffer.reset_local_buffers()
    demo_process_buffer.append_to_update_buffer(
        demo_buffer, 0, batch_size=None, training_length=sequence_length
    )
    return demo_buffer
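# Usage sketch (illustrative, not from the original module): given a list of
# AgentInfoActionPairProto messages parsed from a .demo file (`pair_infos` is
# assumed here) and the matching BrainParameters, build an AgentBuffer of expert
# transitions. sequence_length=1 is appropriate for non-recurrent models.
demo_buffer = make_demo_buffer(pair_infos, brain_params, sequence_length=1)
print("expert transitions:", len(demo_buffer["actions"]))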
def test_from_agent_proto_nan(mock_warning, mock_nan_to_num):
    agent_info_proto = _make_agent_info_proto([1.0, 2.0, float("nan")])
    brain_info = BrainInfo.from_agent_proto(1, [agent_info_proto], test_brain)
    # nan gets set to 0.0
    expected = [1.0, 2.0, 0.0]
    assert (brain_info.vector_observations == expected).all()
    mock_nan_to_num.assert_called()
    mock_warning.assert_called()
def test_from_agent_proto_inf(mock_warning, mock_nan_to_num):
    agent_info_proto = _make_agent_info_proto([1.0, float("inf"), 0.0])
    brain_info = BrainInfo.from_agent_proto(1, [agent_info_proto], test_brain)
    # inf should get set to float_max
    expected = [1.0, sys.float_info.max, 0.0]
    assert (brain_info.vector_observations == expected).all()
    mock_nan_to_num.assert_called()
    # We don't warn on inf, just NaN
    mock_warning.assert_not_called()
def test_from_agent_proto_fast_path(mock_warning, mock_nan_to_num):
    """
    Check that all-finite values skip the nan_to_num call.
    """
    agent_info_proto = _make_agent_info_proto([1.0, 2.0, 3.0])
    brain_info = BrainInfo.from_agent_proto(1, [agent_info_proto], test_brain)
    expected = [1.0, 2.0, 3.0]
    assert (brain_info.vector_observations == expected).all()
    mock_nan_to_num.assert_not_called()
    mock_warning.assert_not_called()
def test_take_action_returns_action_info_when_available():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    policy_eval_out = {
        "action": np.array([1.0], dtype=np.float32),
        "memory_out": np.array([[2.5]], dtype=np.float32),
        "value": np.array([1.1], dtype=np.float32),
    }
    policy.evaluate = MagicMock(return_value=policy_eval_out)
    brain_info_with_agents = BrainInfo(
        [], [], [], agents=["an-agent-id"], local_done=[False]
    )
    result = policy.get_action(brain_info_with_agents)
    expected = ActionInfo(
        policy_eval_out["action"], policy_eval_out["value"], policy_eval_out
    )
    assert result == expected
def load_demonstration(file_path):
    """
    Loads and parses a demonstration file.
    :param file_path: Location of demonstration file (.demo).
    :return: BrainParameter and list of BrainInfos containing demonstration data.
    """
    # First 32 bytes of file dedicated to meta-data.
    INITIAL_POS = 33
    if not os.path.isfile(file_path):
        raise FileNotFoundError(
            "The demonstration file {} does not exist.".format(file_path)
        )
    file_extension = pathlib.Path(file_path).suffix
    if file_extension != ".demo":
        raise ValueError(
            "The file is not a '.demo' file. Please provide a file with the "
            "correct extension."
        )

    brain_params = None
    brain_infos = []
    with open(file_path, "rb") as fp:
        data = fp.read()
    next_pos, pos, obs_decoded = 0, 0, 0
    total_expected = 0
    while pos < len(data):
        next_pos, pos = _DecodeVarint32(data, pos)
        if obs_decoded == 0:
            meta_data_proto = DemonstrationMetaProto()
            meta_data_proto.ParseFromString(data[pos:pos + next_pos])
            total_expected = meta_data_proto.number_steps
            pos = INITIAL_POS
        if obs_decoded == 1:
            brain_param_proto = BrainParametersProto()
            brain_param_proto.ParseFromString(data[pos:pos + next_pos])
            brain_params = BrainParameters.from_proto(brain_param_proto)
            pos += next_pos
        if obs_decoded > 1:
            agent_info = AgentInfoProto()
            agent_info.ParseFromString(data[pos:pos + next_pos])
            brain_info = BrainInfo.from_agent_proto([agent_info], brain_params)
            brain_infos.append(brain_info)
            if len(brain_infos) == total_expected:
                break
            pos += next_pos
        obs_decoded += 1
    return brain_params, brain_infos, total_expected
def step_result_to_brain_info(
    step_result: BatchedStepResult,
    group_spec: AgentGroupSpec,
    agent_id_prefix: int = None,
) -> BrainInfo:
    n_agents = step_result.n_agents()
    vis_obs_indices = []
    vec_obs_indices = []
    for index, observation in enumerate(step_result.obs):
        if len(observation.shape) == 2:
            vec_obs_indices.append(index)
        elif len(observation.shape) == 4:
            vis_obs_indices.append(index)
        else:
            raise UnityEnvironmentException(
                "Invalid input received from the environment, the observation should "
                "either be a vector of float or a PNG image"
            )
    if len(vec_obs_indices) == 0:
        vec_obs = np.zeros((n_agents, 0), dtype=np.float32)
    else:
        vec_obs = np.concatenate([step_result.obs[i] for i in vec_obs_indices], axis=1)
    vis_obs = [step_result.obs[i] for i in vis_obs_indices]
    mask = np.ones((n_agents, np.sum(group_spec.action_size)), dtype=np.float32)
    if group_spec.is_action_discrete():
        mask = np.ones(
            (n_agents, np.sum(group_spec.discrete_action_branches)), dtype=np.float32
        )
        if step_result.action_mask is not None:
            mask = 1 - np.concatenate(step_result.action_mask, axis=1)
    if agent_id_prefix is None:
        agent_ids = [str(ag_id) for ag_id in list(step_result.agent_id)]
    else:
        agent_ids = [f"${agent_id_prefix}-{ag_id}" for ag_id in step_result.agent_id]
    return BrainInfo(
        vis_obs,
        vec_obs,
        list(step_result.reward),
        agent_ids,
        list(step_result.done),
        list(step_result.max_step),
        mask,
    )
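# Usage sketch under the BatchedStepResult-era low-level API (`env` and
# `group_name` are illustrative names, and the accessor methods shown here are
# assumptions that vary by ml-agents version): convert one agent group's step
# data into the legacy BrainInfo format.
group_name = env.get_agent_groups()[0]
group_spec = env.get_agent_group_spec(group_name)
step_result = env.get_step_result(group_name)
brain_info = step_result_to_brain_info(step_result, group_spec, agent_id_prefix=0)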
def reset(
    self,
    config: Dict[str, float] = None,
    train_mode: bool = True,
    custom_reset_parameters: Any = None,
) -> AllBrainInfo:  # type: ignore
    self._reset_agent()

    agent_info = AgentInfoProto(
        stacked_vector_observation=[self.goal] * OBS_SIZE,
        done=False,
        max_step_reached=False,
    )
    return {
        BRAIN_NAME: BrainInfo.from_agent_proto(
            0, [agent_info], self._brains[BRAIN_NAME]
        )
    }
def get_value_estimates(
    self, brain_info: BrainInfo, idx: int, done: bool
) -> Dict[str, float]:
    """
    Generates value estimates for bootstrapping.
    :param brain_info: BrainInfo to be used for bootstrapping.
    :param idx: Index in BrainInfo of agent.
    :param done: Whether or not this is the last element of the episode,
    in which case the value estimate will be 0.
    :return: The value estimate dictionary with key being the name of the
    reward signal and the value the corresponding value estimate.
    """
    feed_dict: Dict[tf.Tensor, Any] = {
        self.model.batch_size: 1,
        self.model.sequence_length: 1,
    }
    for i in range(len(brain_info.visual_observations)):
        feed_dict[self.model.visual_in[i]] = [
            brain_info.visual_observations[i][idx]
        ]
    if self.use_vec_obs:
        feed_dict[self.model.vector_in] = [brain_info.vector_observations[idx]]
    if self.use_recurrent:
        if brain_info.memories.shape[1] == 0:
            brain_info.memories = self.make_empty_memory(len(brain_info.agents))
        feed_dict[self.model.memory_in] = [brain_info.memories[idx]]
    if not self.use_continuous_act and self.use_recurrent:
        feed_dict[self.model.prev_action] = [
            brain_info.previous_vector_actions[idx]
        ]
    value_estimates = self.sess.run(self.model.value_heads, feed_dict)

    value_estimates = {k: float(v) for k, v in value_estimates.items()}

    # If we're done, reassign all of the value estimates that need terminal states.
    if done:
        for k in value_estimates:
            if self.reward_signals[k].use_terminal_states:
                value_estimates[k] = 0.0

    return value_estimates
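# Usage sketch: at the end of a trajectory the trainer bootstraps the return
# with these estimates. An episode cut off by max_step is typically still
# bootstrapped, so `done` is passed as True only for genuine terminal states.
# `policy`, `next_info`, and `idx` are illustrative names.
value_next = policy.get_value_estimates(
    next_info,
    idx,
    done=next_info.local_done[idx] and not next_info.max_reached[idx],
)
for signal_name, estimate in value_next.items():
    print(signal_name, estimate)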
def evaluate(
    self, current_info: BrainInfo, next_info: BrainInfo
) -> RewardSignalResult:
    """
    Evaluates the reward for the agents present in current_info given the next_info.
    :param current_info: The current BrainInfo.
    :param next_info: The BrainInfo from the next timestep.
    :return: a RewardSignalResult of (scaled intrinsic reward, unscaled
    intrinsic reward) provided by the generator
    """
    if len(current_info.agents) == 0:
        return []
    feed_dict = {
        self.policy.model.batch_size: len(next_info.vector_observations),
        self.policy.model.sequence_length: 1,
    }
    feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info)
    if self.policy.use_continuous_act:
        feed_dict[
            self.policy.model.selected_actions
        ] = next_info.previous_vector_actions
    else:
        feed_dict[
            self.policy.model.action_holder
        ] = next_info.previous_vector_actions
    for i in range(self.policy.model.vis_obs_size):
        feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[i]
    if self.policy.use_vec_obs:
        feed_dict[self.model.next_vector_in] = next_info.vector_observations
    if self.policy.use_recurrent:
        if current_info.memories.shape[1] == 0:
            current_info.memories = self.policy.make_empty_memory(
                len(current_info.agents)
            )
        feed_dict[self.policy.model.memory_in] = current_info.memories
    unscaled_reward = self.policy.sess.run(
        self.model.intrinsic_reward, feed_dict=feed_dict
    )
    scaled_reward = np.clip(
        unscaled_reward * float(self.has_updated) * self.strength, 0, 1
    )
    return RewardSignalResult(scaled_reward, unscaled_reward)
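# Usage sketch: within a trainer update, the signal is evaluated on consecutive
# BrainInfo pairs and its scaled output is added to the environment reward.
# `reward_signal`, `curr_info`, and `next_info` are illustrative names, and
# RewardSignalResult is assumed to be a NamedTuple with a scaled_reward field,
# as its construction above suggests.
result = reward_signal.evaluate(curr_info, next_info)
total_reward = np.array(next_info.rewards) + result.scaled_reward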
def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo:
    """
    Constructs a BrainInfo which contains the most recent previous experiences for all
    agents which correspond to the agents in a provided next_info.
    :param next_info: A t+1 BrainInfo.
    :return: curr_info: Reconstructed BrainInfo to match agents of next_info.
    """
    visual_observations: List[List[Any]] = [
        [] for _ in next_info.visual_observations
    ]  # TODO add types to brain.py methods
    vector_observations = []
    rewards = []
    local_dones = []
    max_reacheds = []
    agents = []
    action_masks = []
    for agent_id in next_info.agents:
        agent_brain_info = self.training_buffer[agent_id].last_brain_info
        if agent_brain_info is None:
            agent_brain_info = next_info
        agent_index = agent_brain_info.agents.index(agent_id)
        for i in range(len(next_info.visual_observations)):
            visual_observations[i].append(
                agent_brain_info.visual_observations[i][agent_index]
            )
        vector_observations.append(
            agent_brain_info.vector_observations[agent_index]
        )
        rewards.append(agent_brain_info.rewards[agent_index])
        local_dones.append(agent_brain_info.local_done[agent_index])
        max_reacheds.append(agent_brain_info.max_reached[agent_index])
        agents.append(agent_brain_info.agents[agent_index])
        action_masks.append(agent_brain_info.action_masks[agent_index])
    curr_info = BrainInfo(
        visual_observations,
        vector_observations,
        rewards,
        agents,
        local_dones,
        max_reacheds,
        action_masks,
    )
    return curr_info
def step(
    self,
    vector_action: Dict[str, Any] = None,
    memory: Dict[str, Any] = None,
    value: Dict[str, Any] = None,
) -> AllBrainInfo:
    assert vector_action is not None

    if self.discrete:
        act = vector_action[BRAIN_NAME][0][0]
        delta = 1 if act else -1
    else:
        delta = vector_action[BRAIN_NAME][0][0]
    delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
    self.position += delta
    self.position = clamp(self.position, -1, 1)
    self.step_count += 1
    done = self.position >= 1.0 or self.position <= -1.0
    if done:
        reward = SUCCESS_REWARD * self.position * self.goal
    else:
        reward = -TIME_PENALTY

    vector_obs = [self.goal] * OBS_SIZE
    vector_obs_proto = ObservationProto(
        float_data=ObservationProto.FloatData(data=vector_obs),
        shape=[len(vector_obs)],
        compression_type=COMPRESSION_TYPE_NONE,
    )
    agent_info = AgentInfoProto(
        reward=reward, done=bool(done), observations=[vector_obs_proto]
    )

    if done:
        self._reset_agent()

    return {
        BRAIN_NAME: BrainInfo.from_agent_proto(
            0, [agent_info], self._brains[BRAIN_NAME]
        )
    }
def reset(
    self,
    config: Dict[str, float] = None,
    train_mode: bool = True,
    custom_reset_parameters: Any = None,
) -> AllBrainInfo:  # type: ignore
    self._reset_agent()

    vector_obs = [self.goal] * OBS_SIZE
    vector_obs_proto = ObservationProto(
        float_data=ObservationProto.FloatData(data=vector_obs),
        shape=[len(vector_obs)],
        compression_type=COMPRESSION_TYPE_NONE,
    )
    agent_info = AgentInfoProto(
        done=False, max_step_reached=False, observations=[vector_obs_proto]
    )

    return {
        BRAIN_NAME: BrainInfo.from_agent_proto(
            0, [agent_info], self._brains[BRAIN_NAME]
        )
    }
def step(
    self,
    vector_action: Dict[str, Any] = None,
    memory: Dict[str, Any] = None,
    text_action: Dict[str, Any] = None,
    value: Dict[str, Any] = None,
) -> AllBrainInfo:
    assert vector_action is not None

    if self.discrete:
        act = vector_action[BRAIN_NAME][0][0]
        delta = 1 if act else -1
    else:
        delta = vector_action[BRAIN_NAME][0][0]
    delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
    self.position += delta
    self.position = clamp(self.position, -1, 1)
    self.step_count += 1
    done = self.position >= 1.0 or self.position <= -1.0
    if done:
        reward = SUCCESS_REWARD * self.position * self.goal
    else:
        reward = -TIME_PENALTY

    agent_info = AgentInfoProto(
        stacked_vector_observation=[self.goal] * OBS_SIZE, reward=reward, done=done
    )

    if done:
        self._reset_agent()

    return {
        BRAIN_NAME: BrainInfo.from_agent_proto(
            0, [agent_info], self._brains[BRAIN_NAME]
        )
    }
def evaluate(self, brain_info: BrainInfo) -> Dict[str, np.ndarray]:
    """
    Evaluates policy for the agent experiences provided.
    :param brain_info: BrainInfo object containing inputs.
    :return: Outputs from network as defined by self.inference_dict.
    """
    feed_dict = {
        self.model.batch_size: len(brain_info.vector_observations),
        self.model.sequence_length: 1,
    }
    if self.use_recurrent:
        if not self.use_continuous_act:
            feed_dict[
                self.model.prev_action
            ] = brain_info.previous_vector_actions.reshape(
                [-1, len(self.model.act_size)]
            )
        if brain_info.memories.shape[1] == 0:
            brain_info.memories = self.make_empty_memory(len(brain_info.agents))
        feed_dict[self.model.memory_in] = brain_info.memories
    feed_dict = self.fill_eval_dict(feed_dict, brain_info)
    run_out = self._execute_model(feed_dict, self.inference_dict)
    return run_out
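# Usage sketch: evaluate() runs a single inference step (sequence_length=1) over
# the batch of agents in brain_info and returns whatever tensors the policy
# registered in self.inference_dict, typically "action", "value", and, for
# recurrent models, "memory_out". `policy` and `brain_info` are illustrative
# names.
run_out = policy.evaluate(brain_info)
actions = run_out.get("action")
values = run_out.get("value")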
def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    no_agent_brain_info = BrainInfo([], [], [], agents=[])
    result = policy.get_action(no_agent_brain_info)
    assert result == ActionInfo([], [], None)
def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo:
    """
    Constructs a BrainInfo which contains the most recent previous experiences for all
    agents which correspond to the agents in a provided next_info.
    :param next_info: A t+1 BrainInfo.
    :return: curr_info: Reconstructed BrainInfo to match agents of next_info.
    """
    visual_observations: List[List[Any]] = [
        [] for _ in next_info.visual_observations
    ]  # TODO add types to brain.py methods
    vector_observations = []
    text_observations = []
    memories = []
    rewards = []
    local_dones = []
    max_reacheds = []
    agents = []
    prev_vector_actions = []
    prev_text_actions = []
    action_masks = []
    for agent_id in next_info.agents:
        agent_brain_info = self.training_buffer[agent_id].last_brain_info
        if agent_brain_info is None:
            agent_brain_info = next_info
        agent_index = agent_brain_info.agents.index(agent_id)
        for i in range(len(next_info.visual_observations)):
            visual_observations[i].append(
                agent_brain_info.visual_observations[i][agent_index]
            )
        vector_observations.append(
            agent_brain_info.vector_observations[agent_index]
        )
        text_observations.append(agent_brain_info.text_observations[agent_index])
        if self.policy.use_recurrent:
            if len(agent_brain_info.memories) > 0:
                memories.append(agent_brain_info.memories[agent_index])
            else:
                memories.append(self.policy.make_empty_memory(1))
        rewards.append(agent_brain_info.rewards[agent_index])
        local_dones.append(agent_brain_info.local_done[agent_index])
        max_reacheds.append(agent_brain_info.max_reached[agent_index])
        agents.append(agent_brain_info.agents[agent_index])
        prev_vector_actions.append(
            agent_brain_info.previous_vector_actions[agent_index]
        )
        prev_text_actions.append(
            agent_brain_info.previous_text_actions[agent_index]
        )
        action_masks.append(agent_brain_info.action_masks[agent_index])
    # Check if memories exists (i.e. next_info is not empty) before attempting vstack
    if self.policy.use_recurrent and memories:
        memories = np.vstack(memories)
    curr_info = BrainInfo(
        visual_observations,
        vector_observations,
        text_observations,
        memories,
        rewards,
        agents,
        local_dones,
        prev_vector_actions,
        prev_text_actions,
        max_reacheds,
        action_masks,
    )
    return curr_info