    def evaluate(
        self, current_info: BrainInfo, next_info: BrainInfo
    ) -> RewardSignalResult:
        if len(current_info.agents) == 0:
            return []

        feed_dict: Dict[tf.Tensor, Any] = {
            self.policy.model.batch_size: len(next_info.vector_observations),
            self.policy.model.sequence_length: 1,
        }
        if self.model.use_vail:
            feed_dict[self.model.use_noise] = [0]

        feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info)
        feed_dict[self.model.done_policy] = np.reshape(next_info.local_done, [-1, 1])
        if self.policy.use_continuous_act:
            feed_dict[
                self.policy.model.selected_actions
            ] = next_info.previous_vector_actions
        else:
            feed_dict[
                self.policy.model.action_holder
            ] = next_info.previous_vector_actions
        if self.policy.use_recurrent:
            if current_info.memories.shape[1] == 0:
                current_info.memories = self.policy.make_empty_memory(
                    len(current_info.agents)
                )
            feed_dict[self.policy.model.memory_in] = current_info.memories
        unscaled_reward = self.policy.sess.run(
            self.model.intrinsic_reward, feed_dict=feed_dict
        )
        scaled_reward = unscaled_reward * float(self.has_updated) * self.strength
        return RewardSignalResult(scaled_reward, unscaled_reward)
Example #2
def load_demonstration(
        file_path: str) -> Tuple[BrainParameters, List[BrainInfo], int]:
    """
    Loads and parses a demonstration file.
    :param file_path: Location of demonstration file (.demo) or directory containing .demo files.
    :return: BrainParameters, a list of BrainInfos containing the demonstration data, and the
    expected number of steps.
    """

    # First 32 bytes of file dedicated to meta-data.
    INITIAL_POS = 33
    file_paths = []
    if os.path.isdir(file_path):
        all_files = os.listdir(file_path)
        for _file in all_files:
            if _file.endswith(".demo"):
                file_paths.append(os.path.join(file_path, _file))
        if not file_paths:
            raise ValueError(
                "There are no '.demo' files in the provided directory.")
    elif os.path.isfile(file_path):
        file_paths.append(file_path)
        file_extension = pathlib.Path(file_path).suffix
        if file_extension != ".demo":
            raise ValueError(
                "The file is not a '.demo' file. Please provide a file with the "
                "correct extension.")
    else:
        raise FileNotFoundError(
            "The demonstration file or directory {} does not exist.".format(
                file_path))

    brain_params = None
    brain_infos = []
    total_expected = 0
    for _file_path in file_paths:
        with open(_file_path, "rb") as demo_file:
            data = demo_file.read()
        next_pos, pos, obs_decoded = 0, 0, 0
        while pos < len(data):
            next_pos, pos = _DecodeVarint32(data, pos)
            if obs_decoded == 0:
                meta_data_proto = DemonstrationMetaProto()
                meta_data_proto.ParseFromString(data[pos:pos + next_pos])
                total_expected += meta_data_proto.number_steps
                pos = INITIAL_POS
            if obs_decoded == 1:
                brain_param_proto = BrainParametersProto()
                brain_param_proto.ParseFromString(data[pos:pos + next_pos])
                brain_params = BrainParameters.from_proto(brain_param_proto)
                pos += next_pos
            if obs_decoded > 1:
                agent_info = AgentInfoProto()
                agent_info.ParseFromString(data[pos:pos + next_pos])
                brain_info = BrainInfo.from_agent_proto(
                    0, [agent_info], brain_params)
                brain_infos.append(brain_info)
                if len(brain_infos) == total_expected:
                    break
                pos += next_pos
            obs_decoded += 1
    return brain_params, brain_infos, total_expected
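
A minimal, hedged usage sketch for the loader above; the directory name "expert_demos" is hypothetical and stands in for a folder of .demo files recorded in Unity.

# Hedged usage sketch: "expert_demos" is a hypothetical directory of .demo files.
brain_params, brain_infos, total_expected = load_demonstration("expert_demos")
print(brain_params.brain_name)                 # parameters parsed from the demonstrations
print(len(brain_infos), "/", total_expected)   # one BrainInfo per recorded step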
Example #3
def test_take_action_returns_nones_on_missing_values():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    policy.evaluate = MagicMock(return_value={})
    brain_info_with_agents = BrainInfo([], [], [], agents=["an-agent-id"])
    result = policy.get_action(brain_info_with_agents)
    assert result == ActionInfo(None, None, None, None, {})
Example #4
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    brain_params: BrainParameters,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_process_buffer = ProcessingBuffer()
    demo_buffer = AgentBuffer()
    for idx, experience in enumerate(pair_infos):
        if idx > len(pair_infos) - 2:
            break
        current_pair_info = pair_infos[idx]
        next_pair_info = pair_infos[idx + 1]
        current_brain_info = BrainInfo.from_agent_proto(
            0, [current_pair_info.agent_info], brain_params)
        next_brain_info = BrainInfo.from_agent_proto(
            0, [next_pair_info.agent_info], brain_params)
        previous_action = (np.array(pair_infos[idx].action_info.vector_actions,
                                    dtype=np.float32) * 0)
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions,
                dtype=np.float32)
        demo_process_buffer[0].last_brain_info = current_brain_info
        demo_process_buffer[0]["done"].append(next_brain_info.local_done[0])
        demo_process_buffer[0]["rewards"].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            demo_process_buffer[0]["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0])
        if brain_params.vector_observation_space_size > 0:
            demo_process_buffer[0]["vector_obs"].append(
                current_brain_info.vector_observations[0])
        demo_process_buffer[0]["actions"].append(
            current_pair_info.action_info.vector_actions)
        demo_process_buffer[0]["prev_action"].append(previous_action)
        if next_brain_info.local_done[0]:
            demo_process_buffer.append_to_update_buffer(
                demo_buffer,
                0,
                batch_size=None,
                training_length=sequence_length)
            demo_process_buffer.reset_local_buffers()
    demo_process_buffer.append_to_update_buffer(
        demo_buffer, 0, batch_size=None, training_length=sequence_length)
    return demo_buffer
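
A hedged sketch of how the resulting buffer might be used; it assumes pair_infos and brain_params were already parsed from a .demo file, and that AgentBuffer exposes its fields by key, as the appends above imply.

# Hedged usage sketch: pair_infos and brain_params are assumed to come from a parsed .demo file.
demo_buffer = make_demo_buffer(pair_infos, brain_params, sequence_length=1)
print(len(demo_buffer["actions"]))   # number of experiences available for pretraining updates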
Example #5
def test_from_agent_proto_nan(mock_warning, mock_nan_to_num):
    agent_info_proto = _make_agent_info_proto([1.0, 2.0, float("nan")])

    brain_info = BrainInfo.from_agent_proto(1, [agent_info_proto], test_brain)
    # nan gets set to 0.0
    expected = [1.0, 2.0, 0.0]
    assert (brain_info.vector_observations == expected).all()
    mock_nan_to_num.assert_called()
    mock_warning.assert_called()
Example #6
def test_from_agent_proto_inf(mock_warning, mock_nan_to_num):
    agent_info_proto = _make_agent_info_proto([1.0, float("inf"), 0.0])

    brain_info = BrainInfo.from_agent_proto(1, [agent_info_proto], test_brain)
    # inf should get set to float_max
    expected = [1.0, sys.float_info.max, 0.0]
    assert (brain_info.vector_observations == expected).all()
    mock_nan_to_num.assert_called()
    # We don't warn on inf, just NaN
    mock_warning.assert_not_called()
Example #7
def test_from_agent_proto_fast_path(mock_warning, mock_nan_to_num):
    """
    Check that the nan_to_num call is skipped when all values are finite.
    """
    agent_info_proto = _make_agent_info_proto([1.0, 2.0, 3.0])

    brain_info = BrainInfo.from_agent_proto(1, [agent_info_proto], test_brain)
    expected = [1.0, 2.0, 3.0]
    assert (brain_info.vector_observations == expected).all()
    mock_nan_to_num.assert_not_called()
    mock_warning.assert_not_called()
Example #8
def test_take_action_returns_action_info_when_available():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    policy_eval_out = {
        "action": np.array([1.0], dtype=np.float32),
        "memory_out": np.array([[2.5]], dtype=np.float32),
        "value": np.array([1.1], dtype=np.float32),
    }
    policy.evaluate = MagicMock(return_value=policy_eval_out)
    brain_info_with_agents = BrainInfo([], [], [],
                                       agents=["an-agent-id"],
                                       local_done=[False])
    result = policy.get_action(brain_info_with_agents)
    expected = ActionInfo(policy_eval_out["action"], policy_eval_out["value"],
                          policy_eval_out)
    assert result == expected
def load_demonstration(file_path):
    """
    Loads and parses a demonstration file.
    :param file_path: Location of demonstration file (.demo).
    :return: BrainParameters, a list of BrainInfos containing the demonstration data, and the
    expected number of steps.
    """

    # First 32 bytes of file dedicated to meta-data.
    INITIAL_POS = 33

    if not os.path.isfile(file_path):
        raise FileNotFoundError(
            "The demonstration file {} does not exist.".format(file_path))
    file_extension = pathlib.Path(file_path).suffix
    if file_extension != ".demo":
        raise ValueError(
            "The file is not a '.demo' file. Please provide a file with the "
            "correct extension.")

    brain_params = None
    brain_infos = []
    with open(file_path, "rb") as demo_file:
        data = demo_file.read()
    next_pos, pos, obs_decoded = 0, 0, 0
    total_expected = 0
    while pos < len(data):
        next_pos, pos = _DecodeVarint32(data, pos)
        if obs_decoded == 0:
            meta_data_proto = DemonstrationMetaProto()
            meta_data_proto.ParseFromString(data[pos:pos + next_pos])
            total_expected = meta_data_proto.number_steps
            pos = INITIAL_POS
        if obs_decoded == 1:
            brain_param_proto = BrainParametersProto()
            brain_param_proto.ParseFromString(data[pos:pos + next_pos])
            brain_params = BrainParameters.from_proto(brain_param_proto)
            pos += next_pos
        if obs_decoded > 1:
            agent_info = AgentInfoProto()
            agent_info.ParseFromString(data[pos:pos + next_pos])
            brain_info = BrainInfo.from_agent_proto([agent_info], brain_params)
            brain_infos.append(brain_info)
            if len(brain_infos) == total_expected:
                break
            pos += next_pos
        obs_decoded += 1
    return brain_params, brain_infos, total_expected
def step_result_to_brain_info(
    step_result: BatchedStepResult,
    group_spec: AgentGroupSpec,
    agent_id_prefix: int = None,
) -> BrainInfo:
    n_agents = step_result.n_agents()
    vis_obs_indices = []
    vec_obs_indices = []
    for index, observation in enumerate(step_result.obs):
        if len(observation.shape) == 2:
            vec_obs_indices.append(index)
        elif len(observation.shape) == 4:
            vis_obs_indices.append(index)
        else:
            raise UnityEnvironmentException(
                "Invalid input received from the environment, the observation should "
                "either be a vector of float or a PNG image")
    if len(vec_obs_indices) == 0:
        vec_obs = np.zeros((n_agents, 0), dtype=np.float32)
    else:
        vec_obs = np.concatenate([step_result.obs[i] for i in vec_obs_indices],
                                 axis=1)
    vis_obs = [step_result.obs[i] for i in vis_obs_indices]
    mask = np.ones((n_agents, np.sum(group_spec.action_size)),
                   dtype=np.float32)
    if group_spec.is_action_discrete():
        mask = np.ones((n_agents, np.sum(group_spec.discrete_action_branches)),
                       dtype=np.float32)
        if step_result.action_mask is not None:
            mask = 1 - np.concatenate(step_result.action_mask, axis=1)
    if agent_id_prefix is None:
        agent_ids = [str(ag_id) for ag_id in list(step_result.agent_id)]
    else:
        agent_ids = [
            f"${agent_id_prefix}-{ag_id}" for ag_id in step_result.agent_id
        ]
    return BrainInfo(
        vis_obs,
        vec_obs,
        list(step_result.reward),
        agent_ids,
        list(step_result.done),
        list(step_result.max_step),
        mask,
    )
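
A hedged sketch of feeding a live step through the converter above; it assumes the BatchedStepResult-era UnityEnvironment API (get_agent_groups, get_agent_group_spec, get_step_result), which may be named differently in other releases.

# Hedged usage sketch against the low-level UnityEnvironment API of the same era.
env.reset()
group_name = env.get_agent_groups()[0]
group_spec = env.get_agent_group_spec(group_name)
step_result = env.get_step_result(group_name)
brain_info = step_result_to_brain_info(step_result, group_spec, agent_id_prefix=0)
print(len(brain_info.agents), brain_info.vector_observations.shape)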
Example #11
    def reset(
        self,
        config: Dict[str, float] = None,
        train_mode: bool = True,
        custom_reset_parameters: Any = None,
    ) -> AllBrainInfo:  # type: ignore
        self._reset_agent()

        agent_info = AgentInfoProto(
            stacked_vector_observation=[self.goal] * OBS_SIZE,
            done=False,
            max_step_reached=False,
        )
        return {
            BRAIN_NAME:
            BrainInfo.from_agent_proto(0, [agent_info],
                                       self._brains[BRAIN_NAME])
        }
Example #12
    def get_value_estimates(self, brain_info: BrainInfo, idx: int,
                            done: bool) -> Dict[str, float]:
        """
        Generates value estimates for bootstrapping.
        :param brain_info: BrainInfo to be used for bootstrapping.
        :param idx: Index in BrainInfo of agent.
        :param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
        :return: The value estimate dictionary with key being the name of the reward signal and the value the
        corresponding value estimate.
        """

        feed_dict: Dict[tf.Tensor, Any] = {
            self.model.batch_size: 1,
            self.model.sequence_length: 1,
        }
        for i in range(len(brain_info.visual_observations)):
            feed_dict[self.model.visual_in[i]] = [
                brain_info.visual_observations[i][idx]
            ]
        if self.use_vec_obs:
            feed_dict[self.model.vector_in] = [
                brain_info.vector_observations[idx]
            ]
        if self.use_recurrent:
            if brain_info.memories.shape[1] == 0:
                brain_info.memories = self.make_empty_memory(
                    len(brain_info.agents))
            feed_dict[self.model.memory_in] = [brain_info.memories[idx]]
        if not self.use_continuous_act and self.use_recurrent:
            feed_dict[self.model.prev_action] = [
                brain_info.previous_vector_actions[idx]
            ]
        value_estimates = self.sess.run(self.model.value_heads, feed_dict)

        value_estimates = {k: float(v) for k, v in value_estimates.items()}

        # If we're done, reassign all of the value estimates that need terminal states.
        if done:
            for k in value_estimates:
                if self.reward_signals[k].use_terminal_states:
                    value_estimates[k] = 0.0

        return value_estimates
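
A short, hedged usage sketch for the bootstrapping helper above, assuming it is called from the trainer with a BrainInfo collected during rollout; treating max_step interruptions as "not done" follows the usual bootstrapping convention and is an assumption here.

    # Hedged usage sketch: bootstrap the value of agent `idx` at the end of a trajectory.
    idx = 0
    bootstrap_done = brain_info.local_done[idx] and not brain_info.max_reached[idx]
    value_estimates = policy.get_value_estimates(brain_info, idx, done=bootstrap_done)
    for name, estimate in value_estimates.items():
        print(name, estimate)   # one entry per reward signal, e.g. "extrinsic"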
    def evaluate(
        self, current_info: BrainInfo, next_info: BrainInfo
    ) -> RewardSignalResult:
        """
        Evaluates the reward for the agents present in current_info given the next_info
        :param current_info: The current BrainInfo.
        :param next_info: The BrainInfo from the next timestep.
        :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
        """
        if len(current_info.agents) == 0:
            return []

        feed_dict = {
            self.policy.model.batch_size: len(next_info.vector_observations),
            self.policy.model.sequence_length: 1,
        }
        feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info)
        if self.policy.use_continuous_act:
            feed_dict[
                self.policy.model.selected_actions
            ] = next_info.previous_vector_actions
        else:
            feed_dict[
                self.policy.model.action_holder
            ] = next_info.previous_vector_actions
        for i in range(self.policy.model.vis_obs_size):
            feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[i]
        if self.policy.use_vec_obs:
            feed_dict[self.model.next_vector_in] = next_info.vector_observations
        if self.policy.use_recurrent:
            if current_info.memories.shape[1] == 0:
                current_info.memories = self.policy.make_empty_memory(
                    len(current_info.agents)
                )
            feed_dict[self.policy.model.memory_in] = current_info.memories
        unscaled_reward = self.policy.sess.run(
            self.model.intrinsic_reward, feed_dict=feed_dict
        )
        scaled_reward = np.clip(
            unscaled_reward * float(self.has_updated) * self.strength, 0, 1
        )
        return RewardSignalResult(scaled_reward, unscaled_reward)
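
A hedged sketch of how a reward signal's evaluate() is typically consumed; RewardSignalResult is a named tuple, so the scaled and unscaled rewards unpack directly. curr_info and next_info are assumed to be consecutive BrainInfos for the same set of agents.

# Hedged usage sketch: curr_info and next_info are consecutive BrainInfos for the same agents.
scaled_rewards, unscaled_rewards = reward_signal.evaluate(curr_info, next_info)
episode_bonus = scaled_rewards.sum()   # e.g. accumulated per-signal reward for reporting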
Example #14
    def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo:
        """
        Constructs a BrainInfo which contains the most recent previous experiences for all agents
        which correspond to the agents in a provided next_info.
        :param next_info: A t+1 BrainInfo.
        :return: curr_info: Reconstructed BrainInfo to match agents of next_info.
        """
        visual_observations: List[List[Any]] = [
            [] for _ in next_info.visual_observations
        ]  # TODO add types to brain.py methods
        vector_observations = []
        rewards = []
        local_dones = []
        max_reacheds = []
        agents = []
        action_masks = []
        for agent_id in next_info.agents:
            agent_brain_info = self.training_buffer[agent_id].last_brain_info
            if agent_brain_info is None:
                agent_brain_info = next_info
            agent_index = agent_brain_info.agents.index(agent_id)
            for i in range(len(next_info.visual_observations)):
                visual_observations[i].append(
                    agent_brain_info.visual_observations[i][agent_index])
            vector_observations.append(
                agent_brain_info.vector_observations[agent_index])
            rewards.append(agent_brain_info.rewards[agent_index])
            local_dones.append(agent_brain_info.local_done[agent_index])
            max_reacheds.append(agent_brain_info.max_reached[agent_index])
            agents.append(agent_brain_info.agents[agent_index])
            action_masks.append(agent_brain_info.action_masks[agent_index])
        curr_info = BrainInfo(
            visual_observations,
            vector_observations,
            rewards,
            agents,
            local_dones,
            max_reacheds,
            action_masks,
        )
        return curr_info
Example #15
    def step(
        self,
        vector_action: Dict[str, Any] = None,
        memory: Dict[str, Any] = None,
        value: Dict[str, Any] = None,
    ) -> AllBrainInfo:
        assert vector_action is not None

        if self.discrete:
            act = vector_action[BRAIN_NAME][0][0]
            delta = 1 if act else -1
        else:
            delta = vector_action[BRAIN_NAME][0][0]
        delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
        self.position += delta
        self.position = clamp(self.position, -1, 1)
        self.step_count += 1
        done = self.position >= 1.0 or self.position <= -1.0
        if done:
            reward = SUCCESS_REWARD * self.position * self.goal
        else:
            reward = -TIME_PENALTY

        vector_obs = [self.goal] * OBS_SIZE
        vector_obs_proto = ObservationProto(
            float_data=ObservationProto.FloatData(data=vector_obs),
            shape=[len(vector_obs)],
            compression_type=COMPRESSION_TYPE_NONE,
        )
        agent_info = AgentInfoProto(reward=reward,
                                    done=bool(done),
                                    observations=[vector_obs_proto])

        if done:
            self._reset_agent()

        return {
            BRAIN_NAME:
            BrainInfo.from_agent_proto(0, [agent_info],
                                       self._brains[BRAIN_NAME])
        }
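
A hedged sketch of driving the mock environment above; it assumes a single agent taking a one-dimensional continuous action, matching the vector_action[BRAIN_NAME][0][0] indexing in step().

# Hedged usage sketch: one agent, one continuous action per step.
all_brain_info = env.step({BRAIN_NAME: [[0.1]]})
info = all_brain_info[BRAIN_NAME]
print(info.rewards[0], info.local_done[0])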
Example #16
    def reset(
        self,
        config: Dict[str, float] = None,
        train_mode: bool = True,
        custom_reset_parameters: Any = None,
    ) -> AllBrainInfo:  # type: ignore
        self._reset_agent()

        vector_obs = [self.goal] * OBS_SIZE
        vector_obs_proto = ObservationProto(
            float_data=ObservationProto.FloatData(data=vector_obs),
            shape=[len(vector_obs)],
            compression_type=COMPRESSION_TYPE_NONE,
        )
        agent_info = AgentInfoProto(done=False,
                                    max_step_reached=False,
                                    observations=[vector_obs_proto])

        return {
            BRAIN_NAME:
            BrainInfo.from_agent_proto(0, [agent_info],
                                       self._brains[BRAIN_NAME])
        }
Example #17
    def step(
        self,
        vector_action: Dict[str, Any] = None,
        memory: Dict[str, Any] = None,
        text_action: Dict[str, Any] = None,
        value: Dict[str, Any] = None,
    ) -> AllBrainInfo:
        assert vector_action is not None

        if self.discrete:
            act = vector_action[BRAIN_NAME][0][0]
            delta = 1 if act else -1
        else:
            delta = vector_action[BRAIN_NAME][0][0]
        delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
        self.position += delta
        self.position = clamp(self.position, -1, 1)
        self.step_count += 1
        done = self.position >= 1.0 or self.position <= -1.0
        if done:
            reward = SUCCESS_REWARD * self.position * self.goal
        else:
            reward = -TIME_PENALTY

        agent_info = AgentInfoProto(
            stacked_vector_observation=[self.goal] * OBS_SIZE,
            reward=reward,
            done=done,
        )

        if done:
            self._reset_agent()

        return {
            BRAIN_NAME:
            BrainInfo.from_agent_proto(0, [agent_info],
                                       self._brains[BRAIN_NAME])
        }
Example #18
    def evaluate(self, brain_info: BrainInfo) -> Dict[str, np.ndarray]:
        """
        Evaluates policy for the agent experiences provided.
        :param brain_info: BrainInfo object containing inputs.
        :return: Outputs from network as defined by self.inference_dict.
        """
        feed_dict = {
            self.model.batch_size: len(brain_info.vector_observations),
            self.model.sequence_length: 1,
        }
        if self.use_recurrent:
            if not self.use_continuous_act:
                feed_dict[self.model.prev_action] = (
                    brain_info.previous_vector_actions.reshape(
                        [-1, len(self.model.act_size)]))
            if brain_info.memories.shape[1] == 0:
                brain_info.memories = self.make_empty_memory(
                    len(brain_info.agents))
            feed_dict[self.model.memory_in] = brain_info.memories

        feed_dict = self.fill_eval_dict(feed_dict, brain_info)
        run_out = self._execute_model(feed_dict, self.inference_dict)
        return run_out
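
A hedged sketch of calling the policy's evaluate() directly; the keys of run_out depend on self.inference_dict (the mock in Example #8 shows a typical set: "action", "memory_out", "value").

# Hedged usage sketch: run inference for every agent in a BrainInfo.
run_out = policy.evaluate(brain_info)
actions = run_out.get("action")    # typically shaped (num_agents, action_size)
values = run_out.get("value")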
Example #19
def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    no_agent_brain_info = BrainInfo([], [], [], agents=[])
    result = policy.get_action(no_agent_brain_info)
    assert result == ActionInfo([], [], None)
    def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo:
        """
        Constructs a BrainInfo which contains the most recent previous experiences for all agents
        which correspond to the agents in a provided next_info.
        :param next_info: A t+1 BrainInfo.
        :return: curr_info: Reconstructed BrainInfo to match agents of next_info.
        """
        visual_observations: List[List[Any]] = [
            [] for _ in next_info.visual_observations
        ]  # TODO add types to brain.py methods
        vector_observations = []
        text_observations = []
        memories = []
        rewards = []
        local_dones = []
        max_reacheds = []
        agents = []
        prev_vector_actions = []
        prev_text_actions = []
        action_masks = []
        for agent_id in next_info.agents:
            agent_brain_info = self.training_buffer[agent_id].last_brain_info
            if agent_brain_info is None:
                agent_brain_info = next_info
            agent_index = agent_brain_info.agents.index(agent_id)
            for i in range(len(next_info.visual_observations)):
                visual_observations[i].append(
                    agent_brain_info.visual_observations[i][agent_index]
                )
            vector_observations.append(
                agent_brain_info.vector_observations[agent_index]
            )
            text_observations.append(agent_brain_info.text_observations[agent_index])
            if self.policy.use_recurrent:
                if len(agent_brain_info.memories) > 0:
                    memories.append(agent_brain_info.memories[agent_index])
                else:
                    memories.append(self.policy.make_empty_memory(1))
            rewards.append(agent_brain_info.rewards[agent_index])
            local_dones.append(agent_brain_info.local_done[agent_index])
            max_reacheds.append(agent_brain_info.max_reached[agent_index])
            agents.append(agent_brain_info.agents[agent_index])
            prev_vector_actions.append(
                agent_brain_info.previous_vector_actions[agent_index]
            )
            prev_text_actions.append(
                agent_brain_info.previous_text_actions[agent_index]
            )
            action_masks.append(agent_brain_info.action_masks[agent_index])
        # Check if memories exist (i.e. next_info is not empty) before attempting vstack
        if self.policy.use_recurrent and memories:
            memories = np.vstack(memories)
        curr_info = BrainInfo(
            visual_observations,
            vector_observations,
            text_observations,
            memories,
            rewards,
            agents,
            local_dones,
            prev_vector_actions,
            prev_text_actions,
            max_reacheds,
            action_masks,
        )
        return curr_info