Example #1
def _compare_two_policies(policy1: TorchPolicy, policy2: TorchPolicy) -> None:
    """
    Make sure two policies have the same output for the same input.
    """
    policy1.actor = policy1.actor.to(default_device())
    policy2.actor = policy2.actor.to(default_device())

    decision_step, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1)
    np_obs = decision_step.obs
    masks = policy1._extract_masks(decision_step)
    memories = torch.as_tensor(
        policy1.retrieve_memories(list(decision_step.agent_id))).unsqueeze(0)
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

    with torch.no_grad():
        _, log_probs1, _, _ = policy1.sample_actions(tensor_obs,
                                                     masks=masks,
                                                     memories=memories)
        _, log_probs2, _, _ = policy2.sample_actions(tensor_obs,
                                                     masks=masks,
                                                     memories=memories)
    np.testing.assert_array_equal(
        ModelUtils.to_numpy(log_probs1.all_discrete_tensor),
        ModelUtils.to_numpy(log_probs2.all_discrete_tensor),
    )
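
The snippet above assumes both policies were already constructed identically; the underlying determinism check can be reproduced with plain torch modules. A minimal, self-contained sketch (the helper and shapes below are illustrative, not part of ml-agents):

import numpy as np
import torch

def make_net(seed: int) -> torch.nn.Module:
    # Seeding before construction makes the weight initialization deterministic.
    torch.manual_seed(seed)
    return torch.nn.Linear(4, 2)

net1, net2 = make_net(seed=0), make_net(seed=0)
obs = torch.ones(1, 4)
with torch.no_grad():
    out1, out2 = net1(obs), net2(obs)
# Two networks built from the same seed give identical outputs for the same input.
np.testing.assert_array_equal(out1.numpy(), out2.numpy())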
Example #2
    def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):
        """
        Takes a Policy and a TrainerSettings object and creates an Optimizer around the policy.
        The PPO optimizer has a value estimator and a loss function.
        :param policy: A TorchPolicy object that will be updated by this PPO Optimizer.
        :param trainer_settings: Trainer settings that specify the properties of the trainer.
        """

        super().__init__(policy, trainer_settings)
        reward_signal_configs = trainer_settings.reward_signals
        reward_signal_names = [
            key.value for key, _ in reward_signal_configs.items()
        ]

        if policy.shared_critic:
            self._critic = policy.actor
        else:
            self._critic = ValueNetwork(
                reward_signal_names,
                policy.behavior_spec.observation_specs,
                network_settings=trainer_settings.network_settings,
            )
            self._critic.to(default_device())

        params = list(self.policy.actor.parameters()) + list(
            self._critic.parameters())
        self.hyperparameters: PPOSettings = cast(
            PPOSettings, trainer_settings.hyperparameters)
        self.decay_learning_rate = ModelUtils.DecayedValue(
            self.hyperparameters.learning_rate_schedule,
            self.hyperparameters.learning_rate,
            1e-10,
            self.trainer_settings.max_steps,
        )
        self.decay_epsilon = ModelUtils.DecayedValue(
            self.hyperparameters.learning_rate_schedule,
            self.hyperparameters.epsilon,
            0.1,
            self.trainer_settings.max_steps,
        )
        self.decay_beta = ModelUtils.DecayedValue(
            self.hyperparameters.learning_rate_schedule,
            self.hyperparameters.beta,
            1e-5,
            self.trainer_settings.max_steps,
        )

        self.optimizer = torch.optim.Adam(
            params, lr=self.trainer_settings.hyperparameters.learning_rate)
        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
        }

        self.stream_names = list(self.reward_signals.keys())
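
The three DecayedValue objects above anneal the learning rate, epsilon, and beta from their configured values down to a floor over max_steps. An illustrative linear-decay helper showing the same idea (a sketch only, not ModelUtils.DecayedValue's actual API; the numbers are assumed defaults):

def linear_decay(initial: float, floor: float, max_steps: int, step: int) -> float:
    # Interpolate from `initial` at step 0 down to `floor` at `max_steps`.
    fraction = min(step / float(max_steps), 1.0)
    return initial + fraction * (floor - initial)

# Halfway through training with illustrative PPO settings:
lr_now = linear_decay(initial=3.0e-4, floor=1e-10, max_steps=500_000, step=250_000)  # ~1.5e-4
eps_now = linear_decay(initial=0.2, floor=0.1, max_steps=500_000, step=250_000)      # 0.15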
Example #3
    def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None:
        super().__init__(specs, settings)
        self._ignore_done = True
        self._discriminator_network = DiscriminatorNetwork(specs, settings)
        self._discriminator_network.to(default_device())
        _, self._demo_buffer = demo_to_buffer(
            settings.demo_path, 1, specs
        )  # The 1 stands in for the sequence length, which is not available here
        params = list(self._discriminator_network.parameters())
        self.optimizer = torch.optim.Adam(params, lr=settings.learning_rate)
Example #4
    def __init__(self, specs: BehaviorSpec,
                 settings: CuriositySettings) -> None:
        super().__init__(specs, settings)
        self._ignore_done = True
        self._network = CuriosityNetwork(specs, settings)
        self._network.to(default_device())

        self.optimizer = torch.optim.Adam(self._network.parameters(),
                                          lr=settings.learning_rate)
        self._has_updated_once = False
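
Examples #3 and #4 follow the same pattern: build a torch module, move it to the trainer's device, then hand its parameters to an Adam optimizer. A minimal sketch of that pattern (MyNetwork and the learning rate are placeholders; the default_device import path is assumed):

import torch
from mlagents.torch_utils import default_device  # assumed import path

class MyNetwork(torch.nn.Module):  # illustrative placeholder module
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(8, 1)

    def forward(self, x):
        return self.layer(x)

network = MyNetwork()
network.to(default_device())  # CPU unless a CUDA device was configured
optimizer = torch.optim.Adam(network.parameters(), lr=3e-4)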
Example #5
def test_set_torch_device(
    mock_set_default_tensor_type,
    device_str,
    expected_type,
    expected_index,
    expected_tensor_type,
):
    try:
        torch_settings = TorchSettings(device=device_str)
        set_torch_config(torch_settings)
        assert default_device().type == expected_type
        if expected_index is None:
            assert default_device().index is None
        else:
            assert default_device().index == expected_index
        mock_set_default_tensor_type.assert_called_once_with(
            expected_tensor_type)
    finally:
        # restore the defaults
        torch_settings = TorchSettings(device=None)
        set_torch_config(torch_settings)
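
As written, the test expects device_str, expected_type, expected_index, and expected_tensor_type to be supplied, so it is presumably parametrized with the tensor-type setter mocked out. A hedged sketch of how such wiring could look (the cases and patch target are assumptions, not the project's actual test matrix):

import pytest
from unittest import mock
import torch

@pytest.mark.parametrize(
    "device_str, expected_type, expected_index, expected_tensor_type",
    [
        ("cpu", "cpu", None, torch.FloatTensor),        # illustrative case
        ("cuda:0", "cuda", 0, torch.cuda.FloatTensor),  # only meaningful with a GPU
    ],
)
@mock.patch.object(torch, "set_default_tensor_type")
def test_set_torch_device(
    mock_set_default_tensor_type, device_str, expected_type, expected_index, expected_tensor_type
):
    ...  # body as in the example above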
Example #6
    def environment_initialized(self, run_options: RunOptions) -> None:
        self.run_options = run_options
        # Tuple of (major, minor, patch)
        vi = sys.version_info
        env_params = run_options.environment_parameters

        msg = TrainingEnvironmentInitialized(
            python_version=f"{vi[0]}.{vi[1]}.{vi[2]}",
            mlagents_version=mlagents.trainers.__version__,
            mlagents_envs_version=mlagents_envs.__version__,
            torch_version=torch_utils.torch.__version__,
            torch_device_type=torch_utils.default_device().type,
            num_envs=run_options.env_settings.num_envs,
            num_environment_parameters=len(env_params) if env_params else 0,
        )

        any_message = Any()
        any_message.Pack(msg)

        env_init_msg = OutgoingMessage()
        env_init_msg.set_raw_bytes(any_message.SerializeToString())
        super().queue_message_to_send(env_init_msg)
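
The message is wrapped in a protobuf Any and serialized before being queued. For reference, a receiver can recover the typed message by reversing those two steps; a minimal sketch (unpack_outgoing is a hypothetical helper):

from google.protobuf.any_pb2 import Any

def unpack_outgoing(raw_bytes: bytes, msg):
    """Recover a typed protobuf (e.g. a TrainingEnvironmentInitialized instance)
    from bytes produced like env_init_msg above."""
    any_message = Any()
    any_message.ParseFromString(raw_bytes)  # bytes -> Any wrapper
    any_message.Unpack(msg)                 # Any -> the concrete message type
    return msg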
Example #7
    def __init__(
        self,
        seed: int,
        behavior_spec: BehaviorSpec,
        trainer_settings: TrainerSettings,
        tanh_squash: bool = False,
        reparameterize: bool = False,
        separate_critic: bool = True,
        condition_sigma_on_obs: bool = True,
    ):
        """
        Policy that uses a multilayer perceptron to map the observations to actions. Could
        also use a CNN to encode visual input prior to the MLP. Supports discrete and
        continuous actions, as well as recurrent networks.
        :param seed: Random seed.
        :param behavior_spec: Assigned BehaviorSpec object.
        :param trainer_settings: Defined training parameters.
        :param tanh_squash: Whether to use a tanh function on the continuous output,
        or a clipped output.
        :param reparameterize: Whether to use the reparameterization trick to update the
        policy for continuous actions.
        :param separate_critic: Whether to use a critic separate from the actor
        (SeparateActorCritic) or one that shares its body with the actor (SharedActorCritic).
        :param condition_sigma_on_obs: Whether the standard deviation of the continuous
        action distribution is conditioned on the observations.
        """
        super().__init__(
            seed,
            behavior_spec,
            trainer_settings,
            tanh_squash,
            reparameterize,
            condition_sigma_on_obs,
        )
        self.global_step = GlobalSteps()  # could be much simpler if TorchPolicy is nn.Module
        self.grads = None

        reward_signal_configs = trainer_settings.reward_signals
        reward_signal_names = [
            key.value for key, _ in reward_signal_configs.items()
        ]

        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
        }
        if separate_critic:
            ac_class = SeparateActorCritic
        else:
            ac_class = SharedActorCritic
        self.actor_critic = ac_class(
            sensor_specs=self.behavior_spec.sensor_specs,
            network_settings=trainer_settings.network_settings,
            action_spec=behavior_spec.action_spec,
            stream_names=reward_signal_names,
            conditional_sigma=self.condition_sigma_on_obs,
            tanh_squash=tanh_squash,
        )
        # Save the m_size needed for export
        self._export_m_size = self.m_size
        # m_size needed for training is determined by network, not trainer settings
        self.m_size = self.actor_critic.memory_size

        self.actor_critic.to(default_device())
        self._clip_action = not tanh_squash
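
After the final `.to(default_device())` call, every parameter of the actor-critic lives on the configured device. A quick, hedged way to verify this on a constructed policy (`policy` is assumed to be a TorchPolicy instance):

# Inspect where the policy's weights ended up.
first_param = next(policy.actor_critic.parameters())
print(first_param.device)  # e.g. device(type='cpu') or device(type='cuda', index=0)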
Example #8
    def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
        super().__init__(policy, trainer_params)
        hyperparameters: SACSettings = cast(SACSettings,
                                            trainer_params.hyperparameters)
        self.tau = hyperparameters.tau
        self.init_entcoef = hyperparameters.init_entcoef

        self.policy = policy
        policy_network_settings = policy.network_settings

        self.burn_in_ratio = 0.0

        # Non-exposed SAC parameters
        self.discrete_target_entropy_scale = 0.2  # Roughly equal to e-greedy 0.05
        self.continuous_target_entropy_scale = 1.0

        self.stream_names = list(self.reward_signals.keys())
        # Used to reduce "survivor bonus" when using Curiosity or GAIL.
        self.gammas = [
            _val.gamma for _val in trainer_params.reward_signals.values()
        ]
        self.use_dones_in_backup = {
            name: int(not self.reward_signals[name].ignore_done)
            for name in self.stream_names
        }
        self._action_spec = self.policy.behavior_spec.action_spec

        self.value_network = TorchSACOptimizer.PolicyValueNetwork(
            self.stream_names,
            self.policy.behavior_spec.sensor_specs,
            policy_network_settings,
            self._action_spec,
        )

        self.target_network = ValueNetwork(
            self.stream_names,
            self.policy.behavior_spec.sensor_specs,
            policy_network_settings,
        )
        ModelUtils.soft_update(self.policy.actor_critic.critic,
                               self.target_network, 1.0)

        # We create one entropy coefficient per action, whether discrete or continuous.
        _disc_log_ent_coef = torch.nn.Parameter(
            torch.log(
                torch.as_tensor([self.init_entcoef] *
                                len(self._action_spec.discrete_branches))),
            requires_grad=True,
        )
        _cont_log_ent_coef = torch.nn.Parameter(torch.log(
            torch.as_tensor([self.init_entcoef])),
                                                requires_grad=True)
        self._log_ent_coef = TorchSACOptimizer.LogEntCoef(
            discrete=_disc_log_ent_coef, continuous=_cont_log_ent_coef)
        _cont_target = (
            -1 * self.continuous_target_entropy_scale *
            np.prod(self._action_spec.continuous_size).astype(np.float32))
        _disc_target = [
            self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
            for i in self._action_spec.discrete_branches
        ]
        self.target_entropy = TorchSACOptimizer.TargetEntropy(
            continuous=_cont_target, discrete=_disc_target)
        policy_params = list(
            self.policy.actor_critic.network_body.parameters()) + list(
                self.policy.actor_critic.action_model.parameters())
        value_params = list(self.value_network.parameters()) + list(
            self.policy.actor_critic.critic.parameters())

        logger.debug("value_vars")
        for param in value_params:
            logger.debug(param.shape)
        logger.debug("policy_vars")
        for param in policy_params:
            logger.debug(param.shape)

        self.decay_learning_rate = ModelUtils.DecayedValue(
            hyperparameters.learning_rate_schedule,
            hyperparameters.learning_rate,
            1e-10,
            self.trainer_settings.max_steps,
        )
        self.policy_optimizer = torch.optim.Adam(
            policy_params, lr=hyperparameters.learning_rate)
        self.value_optimizer = torch.optim.Adam(
            value_params, lr=hyperparameters.learning_rate)
        self.entropy_optimizer = torch.optim.Adam(
            self._log_ent_coef.parameters(), lr=hyperparameters.learning_rate)
        self._move_to_device(default_device())
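
The target-entropy values built above set how much stochasticity SAC tries to keep per action. A worked example of the same arithmetic for a hypothetical action spec with a 3-dimensional continuous action and discrete branches of sizes 2 and 3:

import numpy as np

continuous_size = 3
discrete_branches = (2, 3)
continuous_target_entropy_scale = 1.0
discrete_target_entropy_scale = 0.2

cont_target = -1 * continuous_target_entropy_scale * np.prod(continuous_size).astype(np.float32)
disc_target = [discrete_target_entropy_scale * np.log(b).astype(np.float32) for b in discrete_branches]

print(cont_target)  # -3.0: one nat of entropy per continuous dimension, negated
print(disc_target)  # [~0.139, ~0.220]: a fraction of each branch's maximum entropy log(n)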