Python AgentBuffer示例，mlagents.trainers.buffer.AgentBuffer Python示例

示例#1

0

显示文件

    def __init__(self, brain, trainer_parameters, training, load, seed,
                 run_id):
        """
        Set up the behavioral-cloning trainer: policy, statistics and buffers.
        :param brain: Brain object forwarded to the parent trainer and the policy.
        :param trainer_parameters: The parameters for the trainer (dictionary);
            the key "batches_per_epoch" is read here.
        :param training: Whether the trainer is set for training.
        :param load: Whether the model should be loaded.
        :param seed: The seed the model will be initialized with.
        :param run_id: The identifier of the current run.
        """
        super(BCTrainer, self).__init__(brain, trainer_parameters, training,
                                        run_id)
        self.policy = BCPolicy(seed, brain, trainer_parameters, load)
        self.n_sequences = 1

        # Per-agent bookkeeping, filled in while experiences are collected.
        self.cumulative_rewards = dict()
        self.episode_steps = dict()

        # One independent, initially empty list per reported statistic.
        stat_names = (
            "Losses/Cloning Loss",
            "Environment/Episode Length",
            "Environment/Cumulative Reward",
        )
        self.stats = {stat_name: [] for stat_name in stat_names}

        self.batches_per_epoch = trainer_parameters["batches_per_epoch"]

        self.demonstration_buffer = AgentBuffer()
        self.evaluation_buffer = ProcessingBuffer()

示例#2

0

显示文件

文件： test_buffer.py 项目： SimpleG20/ml-agents

def test_buffer():
    """
    Check batching, resetting, resequencing and mini-batching of AgentBuffer
    using the deterministic data returned by construct_fake_buffer.
    """

    def rows(*first_values):
        # Each expected observation row is three consecutive integers.
        return np.array([[v, v + 1, v + 2] for v in first_values])

    first_agent = construct_fake_buffer(1)
    second_agent = construct_fake_buffer(2)
    third_agent = construct_fake_buffer(3)
    obs_key = ObsUtil.get_name_at(0)
    action_key = BufferKey.CONTINUOUS_ACTION

    # Two sequences of length one.
    batch = first_agent[obs_key].get_batch(
        batch_size=2, training_length=1, sequential=True
    )
    assert_array(np.array(batch), rows(171, 181))

    # Two back-to-back sequences of length three.
    batch = second_agent[obs_key].get_batch(
        batch_size=2, training_length=3, sequential=True
    )
    assert_array(np.array(batch), rows(231, 241, 251, 261, 271, 281))

    # With sequential=False the two expected sequences overlap.
    batch = second_agent[obs_key].get_batch(
        batch_size=2, training_length=3, sequential=False
    )
    assert_array(np.array(batch), rows(251, 261, 271, 261, 271, 281))

    first_agent.reset_agent()
    assert first_agent.num_experiences == 0

    # Resequence two agents into one shared update buffer.
    update_buffer = AgentBuffer()
    second_agent.resequence_and_append(
        update_buffer, batch_size=None, training_length=2
    )
    third_agent.resequence_and_append(
        update_buffer, batch_size=None, training_length=2
    )
    assert len(update_buffer[action_key]) == 20
    assert np.array(update_buffer[action_key]).shape == (20, 2)

    # A one-element mini batch keeps every key of the source buffer.
    mini_batch = update_buffer.make_mini_batch(start=0, end=1)
    assert mini_batch.keys() == update_buffer.keys()
    assert np.array(mini_batch[action_key]).shape == (1, 2)

示例#3

0

显示文件

def test_buffer():
    """
    Check batching, resetting and update-buffer filling for the per-agent
    buffers held by the fake processing buffer.
    """

    def rows(*first_values):
        # Each expected observation row is three consecutive integers.
        return np.array([[v, v + 1, v + 2] for v in first_values])

    obs_key = "vector_observation"
    action_key = "action"
    b = construct_fake_processing_buffer()

    # Two sequences of length one from agent 1.
    batch = b[1][obs_key].get_batch(
        batch_size=2, training_length=1, sequential=True
    )
    assert_array(np.array(batch), rows(171, 181))

    # Two back-to-back sequences of length three from agent 2.
    batch = b[2][obs_key].get_batch(
        batch_size=2, training_length=3, sequential=True
    )
    assert_array(np.array(batch), rows(231, 241, 251, 261, 271, 281))

    # With sequential=False the two expected sequences overlap.
    batch = b[2][obs_key].get_batch(
        batch_size=2, training_length=3, sequential=False
    )
    assert_array(np.array(batch), rows(251, 261, 271, 261, 271, 281))

    b[4].reset_agent()
    assert len(b[4]) == 0

    # Push agents 3 and 2 (in that order) into a shared update buffer.
    update_buffer = AgentBuffer()
    for agent_id in (3, 2):
        b.append_to_update_buffer(
            update_buffer, agent_id, batch_size=None, training_length=2
        )
    assert len(update_buffer[action_key]) == 20
    assert np.array(update_buffer[action_key]).shape == (20, 2)

    # A one-element mini batch keeps every key of the source buffer.
    mini_batch = update_buffer.make_mini_batch(start=0, end=1)
    assert mini_batch.keys() == update_buffer.keys()
    assert np.array(mini_batch[action_key]).shape == (1, 2)

示例#4

0

显示文件

文件： rl_trainer.py 项目： solpaul/ml-agents

 def _append_to_update_buffer(self,
                              agentbuffer_trajectory: AgentBuffer) -> None:
     """
     Resequence a trajectory's AgentBuffer into the trainer's update buffer.
     Nothing is appended once the trainer should no longer train, so the
     update buffer does not keep growing (avoids a memory leak).
     :param agentbuffer_trajectory: Buffer whose contents are appended.
     """
     if not self.should_still_train:
         return
     memory_settings = self.trainer_settings.network_settings.memory
     # Without memory settings every experience is its own sequence.
     if memory_settings is None:
         sequence_length = 1
     else:
         sequence_length = memory_settings.sequence_length
     agentbuffer_trajectory.resequence_and_append(
         self.update_buffer, training_length=sequence_length)

示例#5

0

显示文件

文件： test_buffer.py 项目： terite/HexChess

def test_buffer_save_load():
    """Round-trip a fake buffer through an in-memory stream and compare fields."""
    import io

    source_buffer = construct_fake_buffer(3)

    # Write to, then read back from, the same BytesIO object.
    stream = io.BytesIO()
    source_buffer.save_to_file(stream)
    restored_buffer = AgentBuffer()
    restored_buffer.load_from_file(stream)

    assert len(source_buffer) == len(restored_buffer)
    for key in source_buffer.keys():
        assert np.allclose(source_buffer[key], restored_buffer[key])

示例#6

0

显示文件

文件： rl_trainer.py 项目： qiwulun2006/ml-agents

 def __init__(self, *args, **kwargs):
     """
     Initialise the RL trainer and verify that a reward signal is configured.
     All arguments are forwarded unchanged to the parent constructor.
     :raises UnityTrainerException: if trainer_parameters["reward_signals"]
         is falsy.
     """
     super(RLTrainer, self).__init__(*args, **kwargs)
     # At least one reward signal is required.
     reward_signals = self.trainer_parameters["reward_signals"]
     if not reward_signals:
         error_template = "No reward signals were defined. At least one must be used with {}."
         raise UnityTrainerException(
             error_template.format(self.__class__.__name__))
     # Reporting-only bookkeeping: reward-signal name -> {agent_id: cumulative
     # reward}. The environment reward is always reported to Tensorboard, no
     # matter which reward signals are actually configured.
     self.collected_rewards = {"environment": {}}
     self.processing_buffer = ProcessingBuffer()
     self.update_buffer = AgentBuffer()
     self.episode_steps = {}

示例#7

0

显示文件

文件： demo_loader.py 项目： zcemycl/ml-agents

def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    behavior_spec: BehaviorSpec,
    sequence_length: int,
) -> AgentBuffer:
    """
    Turn recorded (agent info, action) pairs into a resequenced AgentBuffer.
    :param pair_infos: Recorded pairs, in the order they were captured.
    :param behavior_spec: Spec passed to steps_from_proto to decode each pair.
    :param sequence_length: Training length used when resequencing.
    :returns: Buffer holding the resequenced demonstration experiences.
    """
    raw_buffer = AgentBuffer()
    processed_buffer = AgentBuffer()
    # Every iteration needs the following pair as well, so the last pair is
    # only ever used as a "next" step.
    for idx in range(len(pair_infos) - 1):
        current_pair_info = pair_infos[idx]
        next_pair_info = pair_infos[idx + 1]
        current_decision_step, current_terminal_step = steps_from_proto(
            [current_pair_info.agent_info], behavior_spec
        )
        next_decision_step, next_terminal_step = steps_from_proto(
            [next_pair_info.agent_info], behavior_spec
        )

        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
            )
        else:
            # No earlier pair exists: use zeros shaped like the current action.
            previous_action = (
                np.array(current_pair_info.action_info.vector_actions, dtype=np.float32)
                * 0
            )

        # A single terminal step marks the next step as the end of an episode;
        # the reward is then read from it instead of the decision step.
        next_done = len(next_terminal_step) == 1
        reward_source = next_terminal_step if next_done else next_decision_step
        next_reward = reward_source.reward[0]

        if len(current_terminal_step) == 1:
            obs_source = current_terminal_step
        else:
            obs_source = current_decision_step
        current_obs = list(obs_source.values())[0].obs

        raw_buffer["done"].append(next_done)
        raw_buffer["rewards"].append(next_reward)
        split_obs = SplitObservations.from_observations(current_obs)
        for i, visual_obs in enumerate(split_obs.visual_observations):
            raw_buffer["visual_obs%d" % i].append(visual_obs)
        raw_buffer["vector_obs"].append(split_obs.vector_observations)
        raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
        raw_buffer["prev_action"].append(previous_action)

        if next_done:
            # Flush the finished episode, then start collecting the next one.
            raw_buffer.resequence_and_append(
                processed_buffer, batch_size=None, training_length=sequence_length
            )
            raw_buffer.reset_agent()

    # Flush whatever is left after the final pair.
    raw_buffer.resequence_and_append(
        processed_buffer, batch_size=None, training_length=sequence_length
    )
    return processed_buffer

示例#8

0

显示文件

文件： test_buffer.py 项目： terite/HexChess

def construct_fake_buffer(fake_agent_id):
    """
    Build an AgentBuffer with nine steps of deterministic fake data.
    Values are derived from 100 * fake_agent_id + 10 * step, so the agent and
    the step can be read back from any stored number.
    :param fake_agent_id: Number encoded into every stored value.
    :returns: The populated AgentBuffer.
    """
    fake_buffer = AgentBuffer()
    obs_key = ObsUtil.get_name_at(0)
    for step in range(9):
        base = 100 * fake_agent_id + 10 * step
        observation = np.array([base + 1, base + 2, base + 3], dtype=np.float32)
        fake_buffer[obs_key].append(observation)

        action = np.array([base + 4, base + 5], dtype=np.float32)
        fake_buffer[BufferKey.CONTINUOUS_ACTION].append(action)

        # The group entry holds the same array object three times.
        group_action = np.array([base + 4, base + 5], dtype=np.float32)
        fake_buffer[BufferKey.GROUP_CONTINUOUS_ACTION].append([group_action] * 3)
    return fake_buffer

示例#9

0

显示文件

文件： utils.py 项目： rdekleijn/SequentialReacherRevision

def create_agent_buffer(behavior_spec: BehaviorSpec,
                        number: int,
                        reward: float = 0.0) -> AgentBuffer:
    """
    Create an AgentBuffer holding `number` copies of one random experience.
    :param behavior_spec: Supplies the observation shapes and the action spec.
    :param number: How many experiences to append.
    :param reward: Reward stored for every experience.
    :returns: The populated AgentBuffer; its "done" field is all zeros.
    """
    agent_buffer = AgentBuffer()
    shapes = behavior_spec.observation_shapes
    # Current observations are sampled before next observations.
    curr_observations = [
        np.random.normal(size=shape).astype(np.float32) for shape in shapes
    ]
    next_observations = [
        np.random.normal(size=shape).astype(np.float32) for shape in shapes
    ]

    # One random action, stored per action type that the spec actually uses.
    action_spec = behavior_spec.action_spec
    sampled_action = action_spec.random_action(1)
    action = {}
    if action_spec.continuous_size > 0:
        action["continuous_action"] = sampled_action.continuous
    if action_spec.discrete_size > 0:
        action["discrete_action"] = sampled_action.discrete

    for _ in range(number):
        curr_split = SplitObservations.from_observations(curr_observations)
        next_split = SplitObservations.from_observations(next_observations)
        for i, curr_visual in enumerate(curr_split.visual_observations):
            agent_buffer["visual_obs%d" % i].append(curr_visual)
            agent_buffer["next_visual_obs%d" % i].append(
                next_split.visual_observations[i])
        agent_buffer["vector_obs"].append(curr_split.vector_observations)
        agent_buffer["next_vector_in"].append(next_split.vector_observations)
        for action_type, action_values in action.items():
            agent_buffer[action_type].append(action_values[0, :])
        agent_buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
        agent_buffer["masks"].append(np.ones(1, dtype=np.float32))

    agent_buffer["done"] = np.zeros(number, dtype=np.float32)
    return agent_buffer

示例#10

0

显示文件

def test_sac_rnn_policy(dummy_config):
    """Evaluate and update a recurrent, discrete-action SAC policy mock."""
    tf.reset_default_graph()
    policy = create_sac_policy_mock(
        dummy_config, use_rnn=True, use_discrete=True, use_visual=False
    )

    # Evaluation: the action output has one row per agent and one column per
    # discrete action branch.
    step = mb.create_batchedstep_from_brainparams(
        policy.brain, num_agents=NUM_AGENTS
    )
    run_out = policy.evaluate(step, list(step.agent_id))
    assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))

    # Update: simulate a rollout and reuse the environment rewards as the
    # extrinsic reward signal so no reward-signal evaluation is needed.
    rollout = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, policy.brain, memory_size=8
    )
    rollout["extrinsic_rewards"] = rollout["environment_rewards"]
    update_buffer = AgentBuffer()
    rollout.resequence_and_append(
        update_buffer, training_length=policy.sequence_length
    )
    sequence_count = update_buffer.num_experiences // policy.sequence_length
    run_out = policy.update(update_buffer, num_sequences=sequence_count)

示例#11

0

显示文件

def test_obsutil_group_from_buffer():
    """
    Check that GroupObsUtil.from_buffer returns per-agent observations as long
    as the buffer, with NaN rows where an agent has no observation.
    """
    buff = AgentBuffer()
    group_key = GroupObsUtil.get_name_at(0)
    # (number of steps, number of agents present): three steps with three
    # agents, then two steps after some agents have died.
    for step_count, agent_count in ((3, 3), (2, 1)):
        for _ in range(step_count):
            buff[group_key].append(
                agent_count * [np.ones((5, ), dtype=np.float32)])

    group_obs = GroupObsUtil.from_buffer(buff, 1)

    # Agent 0 is present at every step, so none of its rows are padded.
    for obs in group_obs[0]:
        assert obs.shape == (buff.num_experiences, 5)
        assert not np.isnan(obs).any()

    # Agent 1 is present for the first three steps only.
    for obs in group_obs[1]:
        assert obs.shape == (buff.num_experiences, 5)
        for row_index, row in enumerate(obs):
            row_is_nan = np.isnan(row)
            if row_index < 3:
                assert not row_is_nan.any()
            else:
                assert row_is_nan.all()

示例#12

0

显示文件

文件： utils.py 项目： zereyak13/ml-agents

def create_agent_buffer(behavior_spec: BehaviorSpec,
                        number: int,
                        reward: float = 0.0) -> AgentBuffer:
    """
    Create an AgentBuffer holding `number` copies of one random experience.
    :param behavior_spec: Supplies the sensor specs and the action spec.
    :param number: How many experiences to append.
    :param reward: Reward stored for every experience.
    :returns: The populated AgentBuffer; its "done" field is all zeros.
    """
    agent_buffer = AgentBuffer()
    sensor_specs = behavior_spec.sensor_specs
    # Current observations are sampled before next observations.
    curr_obs = [
        np.random.normal(size=spec.shape).astype(np.float32)
        for spec in sensor_specs
    ]
    next_obs = [
        np.random.normal(size=spec.shape).astype(np.float32)
        for spec in sensor_specs
    ]

    # One random action, stored per action type that the spec actually uses.
    action_spec = behavior_spec.action_spec
    sampled_action = action_spec.random_action(1)
    action = {}
    if action_spec.continuous_size > 0:
        action["continuous_action"] = sampled_action.continuous
    if action_spec.discrete_size > 0:
        action["discrete_action"] = sampled_action.discrete

    for _ in range(number):
        for index, observation in enumerate(curr_obs):
            agent_buffer[ObsUtil.get_name_at(index)].append(observation)
        for index, observation in enumerate(next_obs):
            agent_buffer[ObsUtil.get_name_at_next(index)].append(observation)
        # The whole action dictionary is stored under "actions" in addition to
        # the per-type entries below.
        agent_buffer["actions"].append(action)
        for action_type, action_values in action.items():
            agent_buffer[action_type].append(action_values[0, :])
        agent_buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
        agent_buffer["masks"].append(np.ones(1, dtype=np.float32))

    agent_buffer["done"] = np.zeros(number, dtype=np.float32)
    return agent_buffer

示例#13

0

显示文件

文件： trajectory.py 项目： chenh1001/Basketball-ML-Unity

    def to_agentbuffer(self) -> AgentBuffer:
        """
        Convert this trajectory into an AgentBuffer, one entry per step.
        The "next" observation of a step is the observation of the following
        step; for the final step it is taken from self.next_obs.
        :returns: AgentBuffer filled from self.steps.
        """
        trajectory_buffer = AgentBuffer()
        final_index = len(self.steps) - 1
        current_split = SplitObservations.from_observations(self.steps[0].obs)
        for index, experience in enumerate(self.steps):
            if index >= final_index:
                following_obs = self.next_obs
            else:
                following_obs = self.steps[index + 1].obs
            next_split = SplitObservations.from_observations(following_obs)

            for i, visual_obs in enumerate(current_split.visual_observations):
                trajectory_buffer["visual_obs%d" % i].append(visual_obs)
                trajectory_buffer["next_visual_obs%d" % i].append(
                    next_split.visual_observations[i])
            trajectory_buffer["vector_obs"].append(
                current_split.vector_observations)
            trajectory_buffer["next_vector_in"].append(
                next_split.vector_observations)
            if experience.memory is not None:
                trajectory_buffer["memory"].append(experience.memory)

            trajectory_buffer["masks"].append(1.0)
            trajectory_buffer["done"].append(experience.done)
            # Outputs of the last evaluation; action_pre is optional.
            if experience.action_pre is not None:
                trajectory_buffer["actions_pre"].append(experience.action_pre)

            trajectory_buffer["actions"].append(experience.action)
            trajectory_buffer["action_probs"].append(experience.action_probs)

            # In the buffer 1 means "active", whereas in the experience False
            # means "active", hence the inversion. A missing mask (which should
            # only happen if the environment supplies none) is all ones.
            if experience.action_mask is not None:
                mask = 1 - np.concatenate(experience.action_mask)
            else:
                mask = np.ones(experience.action_probs.shape, dtype=np.float32)
            trajectory_buffer["action_mask"].append(mask, padding_value=1)

            trajectory_buffer["prev_action"].append(experience.prev_action)
            trajectory_buffer["environment_rewards"].append(experience.reward)

            # The next observation becomes the current one for the next step.
            current_split = next_split
        return trajectory_buffer

示例#14

0

显示文件

def test_agent_action_group_from_buffer():
    """
    Check that AgentAction.group_from_buffer returns per-agent actions as long
    as the buffer, with zeros where an agent has no action.
    """
    buff = AgentBuffer()
    # (number of steps, number of agents present): three steps with three
    # agents, then two steps after some agents have died.
    for step_count, agent_count in ((3, 3), (2, 1)):
        for _ in range(step_count):
            buff[BufferKey.GROUP_CONTINUOUS_ACTION].append(
                agent_count * [np.ones((5,), dtype=np.float32)]
            )
            buff[BufferKey.GROUP_DISCRETE_ACTION].append(
                agent_count * [np.ones((4,), dtype=np.float32)]
            )

    def assert_full_length(agent_action):
        # Every agent's tensors span the whole buffer.
        assert agent_action.continuous_tensor.shape == (buff.num_experiences, 5)
        assert agent_action.discrete_tensor.shape == (buff.num_experiences, 4)

    group_actions = AgentAction.group_from_buffer(buff)

    # Agent 0 is present at every step.
    assert_full_length(group_actions[0])

    # Agent 1 is present for the first three steps; later rows are zero.
    second_agent = group_actions[1]
    assert_full_length(second_agent)
    for tensor in (second_agent.continuous_tensor, second_agent.discrete_tensor):
        assert (tensor[0:3] > 0).all()
        assert (tensor[3:] == 0).all()

示例#15

0

显示文件

    def __init__(self, *args, **kwargs):
        """
        Initialise reward bookkeeping, the update buffer and the model saver.
        All arguments are forwarded unchanged to the parent constructor.
        :raises UnityTrainerException: if the PyTorch framework is selected
            but torch_utils reports it as unavailable.
        """
        super().__init__(*args, **kwargs)
        self.cumulative_returns_since_policy_update: List[float] = []
        # Reporting-only bookkeeping: reward-signal name -> {agent_id:
        # cumulative reward}. The environment reward is always reported to
        # Tensorboard, no matter which reward signals are configured.
        environment_rewards = defaultdict(lambda: 0)
        self.collected_rewards: Dict[str, Dict[str, int]] = {
            "environment": environment_rewards
        }
        self.update_buffer: AgentBuffer = AgentBuffer()
        self._stats_reporter.add_property(
            StatsPropertyType.HYPERPARAMETERS, self.trainer_settings.as_dict()
        )

        self.framework = self.trainer_settings.framework
        pytorch_selected = self.framework == FrameworkType.PYTORCH
        if pytorch_selected and not torch_utils.is_available():
            raise UnityTrainerException(
                "To use the experimental PyTorch backend, install the PyTorch Python package first."
            )
        logger.debug(f"Using framework {self.framework.value}")

        self._next_save_step = 0
        self._next_summary_step = 0
        self.model_saver = self.create_model_saver(
            self.framework,
            self.trainer_settings,
            self.artifact_path,
            self.load,
        )

示例#16

0

显示文件

def create_agent_buffer(behavior_spec: BehaviorSpec,
                        number: int,
                        reward: float = 0.0) -> AgentBuffer:
    """
    Create an AgentBuffer holding `number` copies of one random experience.
    :param behavior_spec: Supplies the observation shapes and the action spec.
    :param number: How many experiences to append.
    :param reward: Reward stored for every experience.
    :returns: The populated AgentBuffer; its "done" field is all zeros.
    """
    agent_buffer = AgentBuffer()
    shapes = behavior_spec.observation_shapes
    # Current observations are sampled before next observations.
    curr_observations = [
        np.random.normal(size=shape).astype(np.float32) for shape in shapes
    ]
    next_observations = [
        np.random.normal(size=shape).astype(np.float32) for shape in shapes
    ]
    # First row of a random action batch of size one.
    action = behavior_spec.action_spec.random_action(1)[0, :]

    for _ in range(number):
        curr_split = SplitObservations.from_observations(curr_observations)
        next_split = SplitObservations.from_observations(next_observations)
        for i, curr_visual in enumerate(curr_split.visual_observations):
            agent_buffer["visual_obs%d" % i].append(curr_visual)
            agent_buffer["next_visual_obs%d" % i].append(
                next_split.visual_observations[i])
        agent_buffer["vector_obs"].append(curr_split.vector_observations)
        agent_buffer["next_vector_in"].append(next_split.vector_observations)
        agent_buffer["actions"].append(action)
        agent_buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
        agent_buffer["masks"].append(np.ones(1, dtype=np.float32))

    agent_buffer["done"] = np.zeros(number, dtype=np.float32)
    return agent_buffer

示例#17

0

显示文件

文件： utils.py 项目： SancySwachitha/Drone

def create_agent_buffer(behavior_spec: BehaviorSpec,
                        number: int,
                        reward: float = 0.0) -> AgentBuffer:
    """
    Create an AgentBuffer holding `number` copies of one random experience.
    :param behavior_spec: Supplies the observation specs and the action spec.
    :param number: How many experiences to append.
    :param reward: Reward stored for every experience.
    :returns: The populated AgentBuffer; its DONE field is all zeros.
    """
    agent_buffer = AgentBuffer()
    observation_specs = behavior_spec.observation_specs
    # Current observations are sampled before next observations.
    curr_obs = [
        np.random.normal(size=spec.shape).astype(np.float32)
        for spec in observation_specs
    ]
    next_obs = [
        np.random.normal(size=spec.shape).astype(np.float32)
        for spec in observation_specs
    ]

    # One random action, stored per action type that the spec actually uses.
    action_spec = behavior_spec.action_spec
    sampled_action = action_spec.random_action(1)
    action = {}
    if action_spec.continuous_size > 0:
        action[BufferKey.CONTINUOUS_ACTION] = sampled_action.continuous
    if action_spec.discrete_size > 0:
        action[BufferKey.DISCRETE_ACTION] = sampled_action.discrete

    for _ in range(number):
        for index, observation in enumerate(curr_obs):
            agent_buffer[ObsUtil.get_name_at(index)].append(observation)
        for index, observation in enumerate(next_obs):
            agent_buffer[ObsUtil.get_name_at_next(index)].append(observation)
        # TODO: the combined action is no longer stored under a single
        # "actions" key; only the per-type entries below are written.
        for action_type, action_values in action.items():
            agent_buffer[action_type].append(action_values[0, :])
        # TODO: this field was previously stored under "rewards".
        agent_buffer[BufferKey.ENVIRONMENT_REWARDS].append(
            np.ones(1, dtype=np.float32) * reward)
        agent_buffer[BufferKey.MASKS].append(np.ones(1, dtype=np.float32))

    agent_buffer[BufferKey.DONE] = np.zeros(number, dtype=np.float32)
    return agent_buffer

示例#18

0

显示文件

    def to_agentbuffer(self) -> AgentBuffer:
        """
        Convert this trajectory into an AgentBuffer, one entry per step.
        The "next" observation of a step is the observation of the following
        step; for the final step it is taken from self.next_obs.
        :returns: AgentBuffer filled from self.steps.
        """
        trajectory_buffer = AgentBuffer()
        final_index = len(self.steps) - 1
        current_obs = self.steps[0].obs
        for index, experience in enumerate(self.steps):
            if index >= final_index:
                following_obs = self.next_obs
            else:
                following_obs = self.steps[index + 1].obs

            for i, single_obs in enumerate(current_obs):
                trajectory_buffer[ObsUtil.get_name_at(i)].append(single_obs)
                trajectory_buffer[ObsUtil.get_name_at_next(i)].append(
                    following_obs[i])

            if experience.memory is not None:
                trajectory_buffer["memory"].append(experience.memory)

            trajectory_buffer["masks"].append(1.0)
            trajectory_buffer["done"].append(experience.done)

            # Continuous and discrete parts of the action and of its log
            # probabilities are stored in separate fields.
            chosen_action = experience.action
            log_probs = experience.action_probs
            trajectory_buffer["continuous_action"].append(
                chosen_action.continuous)
            trajectory_buffer["discrete_action"].append(
                chosen_action.discrete)
            trajectory_buffer["continuous_log_probs"].append(
                log_probs.continuous)
            trajectory_buffer["discrete_log_probs"].append(
                log_probs.discrete)

            # In the buffer 1 means "active", whereas in the experience False
            # means "active", hence the inversion. A missing mask (which should
            # only happen if the environment supplies none) is all ones.
            if experience.action_mask is not None:
                mask = 1 - np.concatenate(experience.action_mask)
            else:
                mask = np.ones(experience.action.discrete.shape,
                               dtype=np.float32)
            trajectory_buffer["action_mask"].append(mask, padding_value=1)

            trajectory_buffer["prev_action"].append(experience.prev_action)
            trajectory_buffer["environment_rewards"].append(experience.reward)

            # The next observation becomes the current one for the next step.
            current_obs = following_obs
        return trajectory_buffer

示例#19

0

显示文件

文件： test_buffer.py 项目： terite/HexChess

def test_buffer_truncate():
    """Check AgentBuffer.truncate with and without a sequence length."""
    source_buffers = (construct_fake_buffer(1), construct_fake_buffer(2))
    update_buffer = AgentBuffer()

    def append_all_sources():
        # Resequence both fake agents into the shared update buffer.
        for source in source_buffers:
            source.resequence_and_append(
                update_buffer, batch_size=None, training_length=2
            )

    # Without a sequence length (non-LSTM) the requested size is kept exactly.
    append_all_sources()
    update_buffer.truncate(2)
    assert update_buffer.num_experiences == 2

    # With a sequence length (LSTM) the result is a multiple of that length.
    append_all_sources()
    update_buffer.truncate(4, sequence_length=3)
    assert update_buffer.num_experiences == 3
    for field in update_buffer.values():
        assert isinstance(field, AgentBufferField)

示例#20

0

显示文件

文件： demo_loader.py 项目： zer05um2017/ml-agents

def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    group_spec: AgentGroupSpec,
    sequence_length: int,
) -> AgentBuffer:
    """
    Turn recorded (agent info, action) pairs into a resequenced AgentBuffer.
    Each pair is combined with the pair that follows it, so the last pair is
    only ever used as a "next" step.
    :param pair_infos: Recorded pairs, in the order they were captured.
    :param group_spec: Spec passed to batched_step_result_from_proto to decode
        each pair.
    :param sequence_length: Training length used when resequencing.
    :returns: Buffer holding the resequenced demonstration experiences.
    """
    # Create and populate buffer using experiences
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    for idx, current_pair_info in enumerate(pair_infos):
        if idx > len(pair_infos) - 2:
            break
        next_pair_info = pair_infos[idx + 1]
        current_step_info = batched_step_result_from_proto(
            [current_pair_info.agent_info], group_spec)
        next_step_info = batched_step_result_from_proto(
            [next_pair_info.agent_info], group_spec)

        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions,
                dtype=np.float32)
        else:
            # No earlier pair exists: use zeros shaped like the current action.
            previous_action = (np.array(
                current_pair_info.action_info.vector_actions,
                dtype=np.float32) * 0)

        # Each batched result is built from a single agent info, so the first
        # agent id selects that agent's step result.
        curr_agent_id = current_step_info.agent_id[0]
        current_agent_step_info = current_step_info.get_agent_step_result(
            curr_agent_id)
        next_agent_id = next_step_info.agent_id[0]
        next_agent_step_info = next_step_info.get_agent_step_result(
            next_agent_id)

        # Use the per-agent flag both for the stored "done" value and for the
        # episode-boundary check below, instead of relying on the truth value
        # of the batched `done` field.
        next_done = next_agent_step_info.done
        demo_raw_buffer["done"].append(next_done)
        demo_raw_buffer["rewards"].append(next_agent_step_info.reward)
        split_obs = SplitObservations.from_observations(
            current_agent_step_info.obs)
        for i, obs in enumerate(split_obs.visual_observations):
            demo_raw_buffer["visual_obs%d" % i].append(obs)
        demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
        demo_raw_buffer["actions"].append(
            current_pair_info.action_info.vector_actions)
        demo_raw_buffer["prev_action"].append(previous_action)
        if next_done:
            # Flush the finished episode, then start collecting the next one.
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer,
                batch_size=None,
                training_length=sequence_length)
            demo_raw_buffer.reset_agent()
    # Flush whatever is left after the final pair.
    demo_raw_buffer.resequence_and_append(demo_processed_buffer,
                                          batch_size=None,
                                          training_length=sequence_length)
    return demo_processed_buffer

示例#21

0

显示文件

def test_clear_update_buffer():
    """Check that clear_update_buffer() leaves every update-buffer field empty."""
    trainer = create_rl_trainer()
    trainer.processing_buffer = construct_fake_processing_buffer()
    trainer.update_buffer = AgentBuffer()

    # Put some data into the update buffer first so there is something to clear.
    trainer.processing_buffer.append_to_update_buffer(
        trainer.update_buffer, 2, batch_size=None, training_length=2
    )

    trainer.clear_update_buffer()

    remaining_lengths = [len(field) for _, field in trainer.update_buffer.items()]
    assert all(length == 0 for length in remaining_lengths)

示例#22

0

显示文件

文件： demo_loader.py 项目： zouhunter/ml-agents

def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    brain_params: BrainParameters,
    sequence_length: int,
) -> AgentBuffer:
    """
    Build an AgentBuffer of demonstration experiences from recorded pairs.

    Each experience combines the observations and action of one pair with the
    reward and done flag read from the pair that follows it, so N pairs yield
    N - 1 experiences.
    :param pair_infos: Recorded agent-info/action pairs, in order.
    :param brain_params: Brain parameters used to decode the agent infos and to
        decide which observation fields are stored.
    :param sequence_length: Forwarded as training_length when resequencing.
    :return: The buffer that received the resequenced experiences.
    """
    raw_buffer = AgentBuffer()
    processed_buffer = AgentBuffer()

    # The last pair has no successor, so it never starts an experience.
    for idx in range(len(pair_infos) - 1):
        this_pair = pair_infos[idx]
        following_pair = pair_infos[idx + 1]
        this_info = BrainInfo.from_agent_proto(
            0, [this_pair.agent_info], brain_params
        )
        following_info = BrainInfo.from_agent_proto(
            0, [following_pair.agent_info], brain_params
        )

        if idx == 0:
            # No earlier action exists: use the first action multiplied by 0.
            prior_action = (
                np.array(this_pair.action_info.vector_actions, dtype=np.float32) * 0
            )
        else:
            prior_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
            )

        episode_ended = following_info.local_done[0]
        raw_buffer["done"].append(episode_ended)
        raw_buffer["rewards"].append(following_info.rewards[0])
        for camera in range(brain_params.number_visual_observations):
            raw_buffer["visual_obs%d" % camera].append(
                this_info.visual_observations[camera][0]
            )
        if brain_params.vector_observation_space_size > 0:
            raw_buffer["vector_obs"].append(this_info.vector_observations[0])
        raw_buffer["actions"].append(this_pair.action_info.vector_actions)
        raw_buffer["prev_action"].append(prior_action)

        if episode_ended:
            # Flush the finished episode before the next one starts accumulating.
            raw_buffer.resequence_and_append(
                processed_buffer, batch_size=None, training_length=sequence_length
            )
            raw_buffer.reset_agent()

    # Flush whatever is still held in the raw buffer after the last pair.
    raw_buffer.resequence_and_append(
        processed_buffer, batch_size=None, training_length=sequence_length
    )
    return processed_buffer

示例#23

0

显示文件

 def __init__(self, *args, **kwargs):
     """
     Set up the reward bookkeeping and the update buffer, then report the
     trainer settings as hyperparameters. All arguments are forwarded
     unchanged to the parent initializer.
     """
     super(RLTrainer, self).__init__(*args, **kwargs)
     self.cumulative_returns_since_policy_update: List[float] = []
     # Maps reward-signal name -> (agent_id -> cumulative reward); reporting only.
     # The "environment" entry is always present so the environment reward can be
     # reported to Tensorboard whichever reward signals are configured.
     environment_rewards = defaultdict(lambda: 0)
     self.collected_rewards: Dict[str, Dict[str, int]] = {
         "environment": environment_rewards
     }
     self.update_buffer: AgentBuffer = AgentBuffer()
     hyperparameters = self.trainer_settings.as_dict()
     self._stats_reporter.add_property(
         StatsPropertyType.HYPERPARAMETERS, hyperparameters
     )

示例#24

0

显示文件

    def to_agentbuffer(self) -> AgentBuffer:
        """
        Converts this Trajectory into an AgentBuffer.
        Every step is stored together with the observation that follows it; for
        the final step that observation is taken from self.next_obs.
        :returns: AgentBuffer with one entry per step in the always-written
        fields. The optional fields (memory, actions_pre, action_mask) only
        receive an entry for steps where the corresponding value is not None.
        """
        buffer = AgentBuffer()
        final_index = len(self.steps) - 1
        current_obs = SplitObservations.from_observations(self.steps[0].obs)
        for index, experience in enumerate(self.steps):
            # Look one step ahead; past the last step fall back to next_obs.
            if index < final_index:
                upcoming_raw = self.steps[index + 1].obs
            else:
                upcoming_raw = self.next_obs
            upcoming_obs = SplitObservations.from_observations(upcoming_raw)

            for cam, visual in enumerate(current_obs.visual_observations):
                buffer["visual_obs%d" % cam].append(visual)
                buffer["next_visual_obs%d" % cam].append(
                    upcoming_obs.visual_observations[cam])
            buffer["vector_obs"].append(current_obs.vector_observations)
            buffer["next_vector_in"].append(upcoming_obs.vector_observations)

            memory = experience.memory
            if memory is not None:
                buffer["memory"].append(memory)

            buffer["masks"].append(1.0)
            buffer["done"].append(experience.done)

            # Pre-action outputs are only recorded when the step carries them.
            pre_action = experience.action_pre
            if pre_action is not None:
                buffer["actions_pre"].append(pre_action)

            buffer["actions"].append(experience.action)
            buffer["action_probs"].append(experience.action_probs)

            # Action masks are optional; when present they are appended with a
            # padding value of 1.
            mask = experience.action_mask
            if mask is not None:
                buffer["action_mask"].append(mask, padding_value=1)

            buffer["prev_action"].append(experience.prev_action)
            buffer["environment_rewards"].append(experience.reward)

            # The look-ahead observation becomes the current one for the next step.
            current_obs = upcoming_obs
        return buffer

示例#25

0

显示文件

def test_buffer_truncate():
    """Exercise AgentBuffer.truncate with and without a sequence length."""
    processing_buffer = construct_fake_processing_buffer()
    update_buffer = AgentBuffer()

    def fill_update_buffer():
        # Append the data stored under keys 3 and 2, in that order.
        for key in (3, 2):
            processing_buffer.append_to_update_buffer(
                update_buffer, key, batch_size=None, training_length=2
            )

    # Non-LSTM case: truncating to 2 leaves exactly 2 experiences.
    fill_update_buffer()
    update_buffer.truncate(2)
    assert update_buffer.num_experiences == 2

    # LSTM case: the kept length must be a multiple of sequence_length, so a
    # request for 4 with sequence_length=3 keeps 3.
    fill_update_buffer()
    update_buffer.truncate(4, sequence_length=3)
    assert update_buffer.num_experiences == 3

示例#26

0

显示文件

文件： test_buffer.py 项目： chenh1001/Basketball-ML-Unity

def construct_fake_buffer(fake_agent_id):
    """
    Build an AgentBuffer holding nine steps of recognisable fake data.

    Each stored number is 100 * fake_agent_id + 10 * step + k, where k runs
    over 1-3 for "vector_observation" and 4-5 for "action".
    :param fake_agent_id: Number used as the hundreds component of each value.
    :return: The populated AgentBuffer.
    """
    fake_buffer = AgentBuffer()
    for step in range(9):
        base = 100 * fake_agent_id + 10 * step
        fake_buffer["vector_observation"].append(
            [base + offset for offset in (1, 2, 3)])
        fake_buffer["action"].append([base + offset for offset in (4, 5)])
    return fake_buffer

示例#27

0

显示文件

文件： test_buffer.py 项目： SimpleG20/ml-agents

def construct_fake_buffer(fake_agent_id):
    """
    Build an AgentBuffer holding nine steps of recognisable fake data.

    Each stored number is 100 * fake_agent_id + 10 * step + k, where k runs
    over 1-3 for the field named by ObsUtil.get_name_at(0) and over 4-5 for
    the BufferKey.CONTINUOUS_ACTION field.
    :param fake_agent_id: Number used as the hundreds component of each value.
    :return: The populated AgentBuffer.
    """
    fake_buffer = AgentBuffer()
    for step in range(9):
        base = 100 * fake_agent_id + 10 * step
        fake_buffer[ObsUtil.get_name_at(0)].append(
            [base + offset for offset in (1, 2, 3)])
        fake_buffer[BufferKey.CONTINUOUS_ACTION].append(
            [base + offset for offset in (4, 5)])
    return fake_buffer

示例#28

0

显示文件

文件： __init__.py 项目： chenh1001/Basketball-ML-Unity

 def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
     """
     Produce an all-zero reward for every entry of a mini batch drawn from a
     Buffer.
     :param mini_batch: A Dict of numpy arrays (the format used by our Buffer)
         when drawing from the update buffer.
     :return: a RewardSignalResult of (scaled reward, unscaled reward); both are
         float32 zero arrays as long as the first field of mini_batch, the
         scaled one multiplied by self.strength.
     """
     # The batch length is read from whichever field the mapping yields first.
     first_field = next(iter(mini_batch.values()))
     batch_length = len(first_field)
     scaled_reward = self.strength * np.zeros(batch_length, dtype=np.float32)
     unscaled_reward = np.zeros(batch_length, dtype=np.float32)
     return RewardSignalResult(scaled_reward, unscaled_reward)

示例#29

0

显示文件

文件： mock_brain.py 项目： qiwulun2006/ml-agents

def create_buffer(brain_infos, brain_params, sequence_length, memory_size=8):
    """
    Build a mock update buffer from a list of brain infos.

    Every brain info except the last becomes one experience stored under key 0
    of a ProcessingBuffer: observations come from that brain info, reward and
    done from the one after it, and the remaining fields are filled with
    constant placeholder arrays. The "next" observation fields receive the
    current observations.
    :param brain_infos: Brain infos, in order.
    :param brain_params: Brain parameters describing observation and action spaces.
    :param sequence_length: Forwarded as training_length when appending to the
        update buffer.
    :param memory_size: Length of the all-ones memory vector stored per step.
    :return: The AgentBuffer that received the processed experiences.
    """
    buffer = ProcessingBuffer()
    update_buffer = AgentBuffer()

    # The last brain info has no successor to supply reward/done.
    for idx in range(len(brain_infos) - 1):
        this_info = brain_infos[idx]
        following_info = brain_infos[idx + 1]

        agent_buffer = buffer[0]
        agent_buffer.last_brain_info = this_info
        agent_buffer["done"].append(following_info.local_done[0])
        agent_buffer["rewards"].append(following_info.rewards[0])

        for cam in range(brain_params.number_visual_observations):
            frame = this_info.visual_observations[cam][0]
            agent_buffer["visual_obs%d" % cam].append(frame)
            agent_buffer["next_visual_obs%d" % cam].append(frame)
        if brain_params.vector_observation_space_size > 0:
            vector = this_info.vector_observations[0]
            agent_buffer["vector_obs"].append(vector)
            agent_buffer["next_vector_in"].append(vector)

        # Continuous spaces use the first size entry; otherwise the number of
        # size entries is used.
        action_sizes = brain_params.vector_action_space_size
        if brain_params.vector_action_space_type == "continuous":
            fake_action_size = action_sizes[0]
        else:
            fake_action_size = len(action_sizes)
        agent_buffer["actions"].append(np.zeros(fake_action_size, dtype=np.float32))
        agent_buffer["prev_action"].append(
            np.zeros(fake_action_size, dtype=np.float32)
        )
        agent_buffer["masks"].append(1.0)
        agent_buffer["advantages"].append(1.0)

        first_action_shape = agent_buffer["actions"][0].shape
        if brain_params.vector_action_space_type == "discrete":
            probs = np.ones(sum(action_sizes), dtype=np.float32)
        else:
            probs = np.ones(first_action_shape, dtype=np.float32)
        agent_buffer["action_probs"].append(probs)
        agent_buffer["actions_pre"].append(
            np.ones(first_action_shape, dtype=np.float32)
        )
        agent_buffer["action_mask"].append(
            np.ones(np.sum(action_sizes), dtype=np.float32)
        )
        agent_buffer["memory"].append(np.ones(memory_size, dtype=np.float32))

    buffer.append_to_update_buffer(
        update_buffer, 0, batch_size=None, training_length=sequence_length
    )
    return update_buffer

示例#30

0

显示文件

文件： test_buffer.py 项目： terite/HexChess

def test_num_experiences():
    """Check num_experiences before and after appending two fake agent buffers."""
    source_buffers = (construct_fake_buffer(1), construct_fake_buffer(2))
    update_buffer = AgentBuffer()

    # A fresh buffer reports no experiences.
    assert len(update_buffer[BufferKey.CONTINUOUS_ACTION]) == 0
    assert update_buffer.num_experiences == 0

    for agent_buffer in source_buffers:
        agent_buffer.resequence_and_append(update_buffer,
                                           batch_size=None,
                                           training_length=2)

    # After both appends the buffer is expected to hold 20 entries.
    assert len(update_buffer[BufferKey.CONTINUOUS_ACTION]) == 20
    assert update_buffer.num_experiences == 20