Example #1
    def __init__(
        self,
        seed: int,
        brain: BrainParameters,
        trainer_params: Dict[str, Any],
        is_training: bool,
        load: bool,
    ):
        """
        Policy for Proximal Policy Optimization Networks.
        :param seed: Random seed.
        :param brain: Assigned Brain object.
        :param trainer_params: Defined training parameters.
        :param is_training: Whether the model should be trained.
        :param load: Whether a pre-trained model will be loaded or a new one created.
        """
        super().__init__(seed, brain, trainer_params)

        reward_signal_configs = trainer_params["reward_signals"]
        self.inference_dict: Dict[str, tf.Tensor] = {}
        self.update_dict: Dict[str, tf.Tensor] = {}
        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
        }

        self.create_model(
            brain, trainer_params, reward_signal_configs, is_training, load, seed
        )
        self.create_reward_signals(reward_signal_configs)

        with self.graph.as_default():
            self.bc_module: Optional[BCModule] = None
            # Create pretrainer if needed
            if "behavioral_cloning" in trainer_params:
                BCModule.check_config(trainer_params["behavioral_cloning"])
                self.bc_module = BCModule(
                    self,
                    policy_learning_rate=trainer_params["learning_rate"],
                    default_batch_size=trainer_params["batch_size"],
                    default_num_epoch=3,
                    **trainer_params["behavioral_cloning"],
                )

        if load:
            self._load_graph()
        else:
            self._initialize_graph()
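
For orientation, the only trainer_params keys this constructor touches directly are "reward_signals", "learning_rate", "batch_size", and the optional "behavioral_cloning" section. Below is a minimal sketch of such a dictionary; the top-level keys come from the code above, while every value and all nested sub-keys are purely illustrative:

# Illustrative configuration sketch. Only the top-level keys are taken from the
# constructor above; the nested keys and all values are hypothetical.
trainer_params = {
    "reward_signals": {"extrinsic": {"strength": 1.0, "gamma": 0.99}},
    "learning_rate": 3.0e-4,
    "batch_size": 1024,
    # Optional: its presence is what triggers creation of the BCModule.
    "behavioral_cloning": {"demo_path": "demos/expert.demo", "strength": 0.5},
}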
Example #2
    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
        self.sess = policy.sess
        self.policy = policy
        self.update_dict: Dict[str, tf.Tensor] = {}
        self.value_heads: Dict[str, tf.Tensor] = {}
        self.create_reward_signals(trainer_params["reward_signals"])
        self.memory_in: tf.Tensor = None
        self.memory_out: tf.Tensor = None
        self.m_size: int = 0
        self.bc_module: Optional[BCModule] = None
        # Create pretrainer if needed
        if "behavioral_cloning" in trainer_params:
            BCModule.check_config(trainer_params["behavioral_cloning"])
            self.bc_module = BCModule(
                self.policy,
                policy_learning_rate=trainer_params["learning_rate"],
                default_batch_size=trainer_params["batch_size"],
                default_num_epoch=3,
                **trainer_params["behavioral_cloning"],
            )
Example #3
    def __init__(self, seed, brain, trainer_params, is_training, load):
        """
        Policy for Proximal Policy Optimization Networks.
        :param seed: Random seed.
        :param brain: Assigned Brain object.
        :param trainer_params: Defined training parameters.
        :param is_training: Whether the model should be trained.
        :param load: Whether a pre-trained model will be loaded or a new one created.
        """
        super().__init__(seed, brain, trainer_params)

        reward_signal_configs = trainer_params["reward_signals"]

        self.reward_signals = {}
        with self.graph.as_default():
            self.model = PPOModel(
                brain,
                lr=float(trainer_params["learning_rate"]),
                h_size=int(trainer_params["hidden_units"]),
                epsilon=float(trainer_params["epsilon"]),
                beta=float(trainer_params["beta"]),
                max_step=float(trainer_params["max_steps"]),
                normalize=trainer_params["normalize"],
                use_recurrent=trainer_params["use_recurrent"],
                num_layers=int(trainer_params["num_layers"]),
                m_size=self.m_size,
                seed=seed,
                stream_names=list(reward_signal_configs.keys()),
                vis_encode_type=EncoderType(
                    trainer_params.get("vis_encode_type", "simple")),
            )
            self.model.create_ppo_optimizer()

            # Create reward signals
            for reward_signal, config in reward_signal_configs.items():
                self.reward_signals[reward_signal] = create_reward_signal(
                    self, reward_signal, config)

            # Create pretrainer if needed
            if "pretraining" in trainer_params:
                BCModule.check_config(trainer_params["pretraining"])
                self.bc_module = BCModule(
                    self,
                    policy_learning_rate=trainer_params["learning_rate"],
                    default_batch_size=trainer_params["batch_size"],
                    default_num_epoch=trainer_params["num_epoch"],
                    **trainer_params["pretraining"],
                )
            else:
                self.bc_module = None

        if load:
            self._load_graph()
        else:
            self._initialize_graph()

        self.inference_dict = {
            "action": self.model.output,
            "log_probs": self.model.all_log_probs,
            "value": self.model.value_heads,
            "entropy": self.model.entropy,
            "learning_rate": self.model.learning_rate,
        }
        if self.use_continuous_act:
            self.inference_dict["pre_action"] = self.model.output_pre
        if self.use_recurrent:
            self.inference_dict["memory_out"] = self.model.memory_out
        if (is_training and self.use_vec_obs and trainer_params["normalize"]
                and not load):
            self.inference_dict["update_mean"] = self.model.update_normalization

        self.total_policy_loss = self.model.policy_loss

        self.update_dict = {
            "value_loss": self.model.value_loss,
            "policy_loss": self.total_policy_loss,
            "update_batch": self.model.update_batch,
        }
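
Both inference_dict and update_dict built above map result names to tf.Tensor nodes, so a whole inference pass is a single sess.run over the dictionary. A minimal sketch, assuming self.sess from the base policy class and a feed_dict mapping the model's input placeholders to a batch (neither is shown in the excerpt):

# Sketch only: sess.run accepts a dict of fetches and returns a dict with the
# same keys, so every inference output comes back from one call.
run_out = self.sess.run(self.inference_dict, feed_dict=feed_dict)
action = run_out["action"]          # sampled action from self.model.output
value_estimates = run_out["value"]  # per-reward-signal value heads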
Example #4
    def __init__(
        self,
        seed: int,
        brain: BrainParameters,
        trainer_params: Dict[str, Any],
        is_training: bool,
        load: bool,
    ) -> None:
        """
        Policy for Soft Actor-Critic (SAC) Networks.
        :param seed: Random seed.
        :param brain: Assigned Brain object.
        :param trainer_params: Defined training parameters.
        :param is_training: Whether the model should be trained.
        :param load: Whether a pre-trained model will be loaded or a new one created.
        """
        super().__init__(seed, brain, trainer_params)

        reward_signal_configs = {}
        for key, rsignal in trainer_params["reward_signals"].items():
            if isinstance(rsignal, dict):
                reward_signal_configs[key] = rsignal

        self.inference_dict: Dict[str, tf.Tensor] = {}
        self.update_dict: Dict[str, tf.Tensor] = {}
        self.create_model(
            brain, trainer_params, reward_signal_configs, is_training, load, seed
        )
        self.create_reward_signals(reward_signal_configs)

        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
            "Losses/Q1 Loss": "q1_loss",
            "Losses/Q2 Loss": "q2_loss",
            "Policy/Entropy Coeff": "entropy_coef",
        }

        with self.graph.as_default():
            # Create pretrainer if needed
            self.bc_module: Optional[BCModule] = None
            if "pretraining" in trainer_params:
                BCModule.check_config(trainer_params["pretraining"])
                self.bc_module = BCModule(
                    self,
                    policy_learning_rate=trainer_params["learning_rate"],
                    default_batch_size=trainer_params["batch_size"],
                    default_num_epoch=1,
                    samples_per_update=trainer_params["batch_size"],
                    **trainer_params["pretraining"],
                )
                # SAC-specific setting - we don't want to do a whole epoch each update!
                if "samples_per_update" in trainer_params["pretraining"]:
                    logger.warning(
                        "Pretraining: Samples Per Update is not a valid setting for SAC."
                    )
                    self.bc_module.samples_per_update = 1

        if load:
            self._load_graph()
        else:
            self._initialize_graph()
            self.sess.run(self.model.target_init_op)

        # Disable terminal states for certain reward signals to avoid survivor bias
        for name, reward_signal in self.reward_signals.items():
            if not reward_signal.use_terminal_states:
                self.sess.run(self.model.disable_use_dones[name])
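
The final loop only requires each reward signal object to expose a use_terminal_states flag; when it is False, the matching disable_use_dones op is run to avoid the survivor bias the comment mentions. A minimal stub of that contract; the class name and everything besides the flag name are hypothetical:

# Hypothetical stub: the loop above reads nothing but this attribute.
class SomeRewardSignal:
    # False makes the constructor run sess.run(model.disable_use_dones[name]).
    use_terminal_states = False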