Example #1
    def __init__(self, sess, brain, reward_buff_cap, trainer_parameters,
                 training, seed, run_id):
        """
        Responsible for collecting experiences and training the PPO model.
        :param sess: Tensorflow session.
        :param reward_buff_cap: Max reward history to track in the reward buffer.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        :param seed: The seed the model will be initialized with.
        :param run_id: The identifier of the current run.
        """
        super(PPOTrainer, self).__init__(sess, brain.brain_name,
                                         trainer_parameters, training, run_id)

        self.param_keys = [
            'batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma',
            'hidden_units', 'lambd', 'learning_rate', 'max_steps', 'normalize',
            'num_epoch', 'num_layers', 'time_horizon', 'sequence_length',
            'summary_freq', 'use_recurrent', 'graph_scope', 'summary_path',
            'memory_size', 'use_curiosity', 'curiosity_strength',
            'curiosity_enc_size'
        ]

        for k in self.param_keys:
            if k not in trainer_parameters:
                raise UnityTrainerException(
                    "The hyperparameter {0} could not be found for the PPO trainer of "
                    "brain {1}.".format(k, brain.brain_name))

        self.use_curiosity = bool(trainer_parameters['use_curiosity'])

        self.step = 0

        self.policy = PPOPolicy(seed, brain, trainer_parameters, sess,
                                self.is_training)

        stats = {
            'cumulative_reward': [],
            'episode_length': [],
            'value_estimate': [],
            'entropy': [],
            'value_loss': [],
            'policy_loss': [],
            'learning_rate': []
        }
        if self.use_curiosity:
            stats['forward_loss'] = []
            stats['inverse_loss'] = []
            stats['intrinsic_reward'] = []
            self.intrinsic_rewards = {}
        self.stats = stats

        self.training_buffer = Buffer()

        self.cumulative_rewards = {}
        self._reward_buffer = deque(maxlen=reward_buff_cap)
        self.episode_steps = {}
        self.summary_path = trainer_parameters['summary_path']
        if not os.path.exists(self.summary_path):
            os.makedirs(self.summary_path)

        self.summary_writer = tf.summary.FileWriter(self.summary_path)
Example #2
    def __init__(self, sess, brain, trainer_parameters, training, seed, run_id):
        """
        Responsible for collecting experiences and training the behavioral cloning model.
        :param sess: Tensorflow session.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        """
        super(BehavioralCloningTrainer, self).__init__(sess, brain, trainer_parameters, training, run_id)

        self.param_keys = ['brain_to_imitate', 'batch_size', 'time_horizon',
                           'graph_scope', 'summary_freq', 'max_steps',
                           'batches_per_epoch', 'use_recurrent',
                           'hidden_units', 'learning_rate', 'num_layers',
                           'sequence_length', 'memory_size']

        for k in self.param_keys:
            if k not in trainer_parameters:
                raise UnityTrainerException("The hyperparameter {0} could not be found for the Imitation trainer of "
                                            "brain {1}.".format(k, brain.brain_name))

        self.policy = BCPolicy(seed, brain, trainer_parameters, sess)
        self.brain_name = brain.brain_name
        self.brain_to_imitate = trainer_parameters['brain_to_imitate']
        self.batches_per_epoch = trainer_parameters['batches_per_epoch']
        self.n_sequences = max(int(trainer_parameters['batch_size'] / self.policy.sequence_length), 1)
        self.cumulative_rewards = {}
        self.episode_steps = {}
        self.stats = {'losses': [], 'episode_length': [], 'cumulative_reward': []}

        self.training_buffer = Buffer()
        self.summary_path = trainer_parameters['summary_path']
        if not os.path.exists(self.summary_path):
            os.makedirs(self.summary_path)

        self.summary_writer = tf.summary.FileWriter(self.summary_path)
Example #3
def _check_resolution_for_encoder(camera_res: CameraResolution,
                                  vis_encoder_type: EncoderType) -> None:
    min_res = LearningModel.MIN_RESOLUTION_FOR_ENCODER[vis_encoder_type]
    if camera_res.height < min_res or camera_res.width < min_res:
        raise UnityTrainerException(
            f"Visual observation resolution ({camera_res.width}x{camera_res.height}) is too small for "
            f"the provided EncoderType ({vis_encoder_type.value}). The min dimension is {min_res}."
        )
Example #4
    def __init__(self, brain, reward_buff_cap, trainer_parameters, training,
                 load, seed, run_id):
        """
        Responsible for collecting experiences and training PPO model.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param reward_buff_cap: Max reward history to track in the reward buffer
        :param training: Whether the trainer is set for training.
        :param load: Whether the model should be loaded.
        :param seed: The seed the model will be initialized with
        :param run_id: The identifier of the current run
        """
        super().__init__(brain, trainer_parameters, training, run_id,
                         reward_buff_cap)
        self.param_keys = [
            "batch_size",
            "beta",
            "buffer_size",
            "epsilon",
            "hidden_units",
            "lambd",
            "learning_rate",
            "max_steps",
            "normalize",
            "num_epoch",
            "num_layers",
            "time_horizon",
            "sequence_length",
            "summary_freq",
            "use_recurrent",
            "summary_path",
            "memory_size",
            "model_path",
            "reward_signals",
        ]
        self.check_param_keys()

        # Make sure we have at least one reward_signal
        if not self.trainer_parameters["reward_signals"]:
            raise UnityTrainerException(
                "No reward signals were defined. At least one must be used with {}."
                .format(self.__class__.__name__))

        self.step = 0
        self.policy = PPOPolicy(seed, brain, trainer_parameters,
                                self.is_training, load)

        stats = defaultdict(list)
        # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
        # used for reporting only. We always want to report the environment reward to Tensorboard, regardless
        # of what reward signals are actually present.
        self.collected_rewards = {"environment": {}}
        for _reward_signal in self.policy.reward_signals.keys():
            self.collected_rewards[_reward_signal] = {}

        self.stats = stats

        self.training_buffer = Buffer()
        self.episode_steps = {}
Example #5
def create_reward_signal(policy: TFPolicy, name: str,
                         config_entry: Dict[str, Any]) -> RewardSignal:
    """
    Creates a reward signal class based on the name and config entry provided as a dict.
    :param policy: The policy class which the reward will be applied to.
    :param name: The name of the reward signal
    :param config_entry: The config entries for that reward signal
    :return: The reward signal class instantiated
    """
    rcls = NAME_TO_CLASS.get(name)
    if not rcls:
        raise UnityTrainerException(
            "Unknown reward signal type {0}".format(name))
    rcls.check_config(config_entry)
    try:
        class_inst = rcls(policy, **config_entry)
    except TypeError:
        raise UnityTrainerException(
            "Unknown parameters given for reward signal {0}".format(name))
    return class_inst
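A short usage sketch for create_reward_signal. The policy object and the config values are placeholders: "extrinsic" is assumed to be a registered key in NAME_TO_CLASS, and the "strength"/"gamma" entries are illustrative, not values taken from the source.

# 'policy' is assumed to be an already-constructed TFPolicy.
extrinsic_config = {"strength": 1.0, "gamma": 0.99}  # illustrative config entry
extrinsic_signal = create_reward_signal(policy, "extrinsic", extrinsic_config)

# An unregistered name falls through NAME_TO_CLASS.get() and raises UnityTrainerException.
create_reward_signal(policy, "not_a_signal", {})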
Example #6
def check_config(config_dict: Dict[str, Any]) -> None:
    """
    Check the behavioral_cloning config for the required keys.
    :param config_dict: Pretraining section of trainer_config
    """
    param_keys = ["strength", "demo_path", "steps"]
    for k in param_keys:
        if k not in config_dict:
            raise UnityTrainerException(
                "The required pre-training hyper-parameter {0} was not defined. "
                "Please check your trainer YAML file.".format(k))
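A quick sketch of how this check behaves; the values and the demo path below are placeholders, not taken from any real config.

pretraining_config = {
    "strength": 0.5,                   # placeholder value
    "demo_path": "demos/Expert.demo",  # hypothetical path
    "steps": 10000,
}
check_config(pretraining_config)                 # passes: all required keys present
check_config({"strength": 0.5, "steps": 10000})  # raises UnityTrainerException (no demo_path)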
Example #7
    def check_config(cls,
                     config_dict: Dict[str, Any],
                     param_keys: List[str] = None) -> None:
        """
        Check the config dict, and throw an error if there are missing hyperparameters.
        """
        param_keys = param_keys or []
        for k in param_keys:
            if k not in config_dict:
                raise UnityTrainerException(
                    "The hyper-parameter {0} could not be found for {1}.".format(
                        k, cls.__name__))
Example #8
    def add_policy_outputs(self, take_action_outputs: ActionInfoOutputs,
                           agent_id: str, agent_idx: int) -> None:
        """
        Takes the output of the last action and stores it into the training buffer.
        We break this out from add_experiences since it is very highly dependent
        on the type of trainer.
        :param take_action_outputs: The outputs of the Policy's get_action method.
        :param agent_id: the Agent we're adding to.
        :param agent_idx: the index of the Agent agent_id
        """
        raise UnityTrainerException(
            "The add_policy_outputs method was not implemented.")
Example #9
    def __init__(self, *args, **kwargs):
        super(RLTrainer, self).__init__(*args, **kwargs)
        # Make sure we have at least one reward_signal
        if not self.trainer_parameters["reward_signals"]:
            raise UnityTrainerException(
                "No reward signals were defined. At least one must be used with {}."
                .format(self.__class__.__name__))
        # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
        # used for reporting only. We always want to report the environment reward to Tensorboard, regardless
        # of what reward signals are actually present.
        self.collected_rewards = {"environment": {}}
        self.training_buffer = Buffer()
        self.episode_steps = {}
Example #10
    def __init__(self, brain, trainer_parameters, training, load, seed,
                 run_id):
        """
        Responsible for collecting experiences and training the offline behavioral cloning model.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        :param load: Whether the model should be loaded.
        :param seed: The seed the model will be initialized with
        :param run_id: The identifier of the current run
        """
        super(OfflineBCTrainer, self).__init__(brain, trainer_parameters,
                                               training, load, seed, run_id)

        self.param_keys = [
            "batch_size",
            "summary_freq",
            "max_steps",
            "batches_per_epoch",
            "use_recurrent",
            "hidden_units",
            "learning_rate",
            "num_layers",
            "sequence_length",
            "memory_size",
            "model_path",
            "demo_path",
        ]

        self.check_param_keys()
        self.batches_per_epoch = trainer_parameters["batches_per_epoch"]
        self.n_sequences = max(
            int(trainer_parameters["batch_size"] /
                self.policy.sequence_length), 1)

        brain_params, self.demonstration_buffer = demo_to_buffer(
            trainer_parameters["demo_path"], self.policy.sequence_length)

        policy_brain = copy.deepcopy(brain.__dict__)
        expert_brain = copy.deepcopy(brain_params.__dict__)
        policy_brain.pop("brain_name")
        expert_brain.pop("brain_name")
        policy_brain.pop("vector_action_descriptions")
        expert_brain.pop("vector_action_descriptions")
        if expert_brain != policy_brain:
            raise UnityTrainerException(
                "The provided demonstration is not compatible with the "
                "brain being used for performance evaluation.")
Example #11
def create_learning_rate(
    lr_schedule: LearningRateSchedule,
    lr: float,
    global_step: tf.Tensor,
    max_step: int,
) -> tf.Tensor:
    if lr_schedule == LearningRateSchedule.CONSTANT:
        learning_rate = tf.Variable(lr)
    elif lr_schedule == LearningRateSchedule.LINEAR:
        learning_rate = tf.train.polynomial_decay(
            lr, global_step, max_step, 1e-10, power=1.0
        )
    else:
        raise UnityTrainerException(
            "The learning rate schedule {} is invalid.".format(lr_schedule)
        )
    return learning_rate
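A minimal usage sketch, assuming TF1-style graph mode and that LearningRateSchedule and create_learning_rate are importable from the trainers' models module; the learning rate and max_step values are arbitrary illustrations.

import tensorflow as tf

# Build a step counter and a linearly decaying learning rate tensor.
global_step = tf.Variable(0, trainable=False, dtype=tf.int32)
learning_rate = create_learning_rate(
    LearningRateSchedule.LINEAR, 3e-4, global_step, max_step=500000
)

# LearningRateSchedule.CONSTANT would instead return a plain tf.Variable holding 3e-4.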
Example #12
    def add_rewards_outputs(
        self,
        rewards_out: AllRewardsOutput,
        values: Dict[str, np.ndarray],
        agent_id: str,
        agent_idx: int,
        agent_next_idx: int,
    ) -> None:
        """
        Takes the value and evaluated rewards output of the last action and stores it
        into the training buffer. We break this out from add_experiences since it is very
        highly dependent on the type of trainer.
        :param rewards_out: Rewards from all reward signals after evaluation
        :param values: Dict of value estimates
        :param agent_id: the Agent we're adding to.
        :param agent_idx: the index of the Agent agent_id in the current brain info
        :param agent_next_idx: the index of the Agent agent_id in the next brain info
        """
        raise UnityTrainerException(
            "The add_rewards_outputs method was not implemented.")
Example #13
def initialize_trainer(
    trainer_config: Any,
    brain_parameters: BrainParameters,
    summaries_dir: str,
    run_id: str,
    model_path: str,
    keep_checkpoints: int,
    train_model: bool,
    load_model: bool,
    seed: int,
    meta_curriculum: MetaCurriculum = None,
    multi_gpu: bool = False,
) -> Trainer:
    """
    Initializes a trainer given a provided trainer configuration and brain parameters, as well as
    some general training session options.

    :param trainer_config: Original trainer configuration loaded from YAML
    :param brain_parameters: BrainParameters provided by the Unity environment
    :param summaries_dir: Directory to store trainer summary statistics
    :param run_id: Run ID to associate with this training run
    :param model_path: Path to save the model
    :param keep_checkpoints: How many model checkpoints to keep
    :param train_model: Whether to train the model (vs. run inference)
    :param load_model: Whether to load the model or randomly initialize
    :param seed: The random seed to use
    :param meta_curriculum: Optional meta_curriculum, used to determine a reward buffer length for PPOTrainer
    :param multi_gpu: Whether to use multi-GPU training
    :return: The initialized Trainer for the given brain
    """
    brain_name = brain_parameters.brain_name
    if "default" not in trainer_config and brain_name not in trainer_config:
        raise TrainerConfigError(
            f'Trainer config must have either a "default" section, or a section for the brain name ({brain_name}). '
            "See config/trainer_config.yaml for an example.")

    trainer_parameters = trainer_config.get("default", {}).copy()
    trainer_parameters["summary_path"] = str(run_id) + "_" + brain_name
    trainer_parameters["model_path"] = "{basedir}/{name}".format(
        basedir=model_path, name=brain_name)
    trainer_parameters["keep_checkpoints"] = keep_checkpoints
    if brain_name in trainer_config:
        _brain_key: Any = brain_name
        while not isinstance(trainer_config[_brain_key], dict):
            _brain_key = trainer_config[_brain_key]
        trainer_parameters.update(trainer_config[_brain_key])

    min_lesson_length = 1
    if meta_curriculum:
        if brain_name in meta_curriculum.brains_to_curriculums:
            min_lesson_length = meta_curriculum.brains_to_curriculums[
                brain_name].min_lesson_length
        else:
            logger.warning(
                f"Metacurriculum enabled, but no curriculum for brain {brain_name}. "
                f"Brains with curricula: {meta_curriculum.brains_to_curriculums.keys()}. "
            )

    trainer: Trainer = None  # type: ignore  # will be set to one of these, or raise
    if "trainer" not in trainer_parameters:
        raise TrainerConfigError(
            f'The "trainer" key must be set in your trainer config for brain {brain_name} (or the default brain).'
        )
    trainer_type = trainer_parameters["trainer"]

    if trainer_type == "offline_bc":
        raise UnityTrainerException(
            "The offline_bc trainer has been removed. To train with demonstrations, "
            "please use a PPO or SAC trainer with the GAIL Reward Signal and/or the "
            "Behavioral Cloning feature enabled.")
    elif trainer_type == "ppo":
        trainer = PPOTrainer(
            brain_parameters,
            min_lesson_length,
            trainer_parameters,
            train_model,
            load_model,
            seed,
            run_id,
            multi_gpu,
        )
    elif trainer_type == "sac":
        trainer = SACTrainer(
            brain_parameters,
            min_lesson_length,
            trainer_parameters,
            train_model,
            load_model,
            seed,
            run_id,
        )
    else:
        raise TrainerConfigError(
            f'The trainer config contains an unknown trainer type "{trainer_type}" for brain {brain_name}'
        )
    return trainer
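A call sketch based on the documented parameters; the config objects, paths, and run ID below are placeholders rather than values from the source.

# 'trainer_config' is assumed to be the dict loaded from a trainer_config.yaml,
# and 'brain_parameters' the BrainParameters object provided by the Unity environment.
trainer = initialize_trainer(
    trainer_config=trainer_config,
    brain_parameters=brain_parameters,
    summaries_dir="./summaries",        # placeholder path
    run_id="run-001",                   # placeholder run ID
    model_path="./models/run-001",      # placeholder path
    keep_checkpoints=5,
    train_model=True,
    load_model=False,
    seed=0,
)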