def __init__(self, sess, brain, reward_buff_cap, trainer_parameters, training, seed, run_id):
    """
    Responsible for collecting experiences and training PPO model.
    :param sess: Tensorflow session.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param training: Whether the trainer is set for training.
    """
    super(PPOTrainer, self).__init__(sess, brain.brain_name, trainer_parameters, training, run_id)
    self.param_keys = [
        'batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units', 'lambd',
        'learning_rate', 'max_steps', 'normalize', 'num_epoch', 'num_layers', 'time_horizon',
        'sequence_length', 'summary_freq', 'use_recurrent', 'graph_scope', 'summary_path',
        'memory_size', 'use_curiosity', 'curiosity_strength', 'curiosity_enc_size'
    ]
    for k in self.param_keys:
        if k not in trainer_parameters:
            raise UnityTrainerException(
                "The hyperparameter {0} could not be found for the PPO trainer of "
                "brain {1}.".format(k, brain.brain_name))
    self.use_curiosity = bool(trainer_parameters['use_curiosity'])
    self.step = 0
    self.policy = PPOPolicy(seed, brain, trainer_parameters, sess, self.is_training)
    stats = {
        'cumulative_reward': [], 'episode_length': [], 'value_estimate': [], 'entropy': [],
        'value_loss': [], 'policy_loss': [], 'learning_rate': []
    }
    if self.use_curiosity:
        stats['forward_loss'] = []
        stats['inverse_loss'] = []
        stats['intrinsic_reward'] = []
        self.intrinsic_rewards = {}
    self.stats = stats
    self.training_buffer = Buffer()
    self.cumulative_rewards = {}
    self._reward_buffer = deque(maxlen=reward_buff_cap)
    self.episode_steps = {}
    self.summary_path = trainer_parameters['summary_path']
    if not os.path.exists(self.summary_path):
        os.makedirs(self.summary_path)
    self.summary_writer = tf.summary.FileWriter(self.summary_path)
def __init__(self, sess, brain, trainer_parameters, training, seed, run_id):
    """
    Responsible for collecting experiences and training the behavioral cloning model.
    :param sess: Tensorflow session.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param training: Whether the trainer is set for training.
    """
    super(BehavioralCloningTrainer, self).__init__(sess, brain, trainer_parameters, training, run_id)
    self.param_keys = ['brain_to_imitate', 'batch_size', 'time_horizon', 'graph_scope',
                       'summary_freq', 'max_steps', 'batches_per_epoch', 'use_recurrent',
                       'hidden_units', 'learning_rate', 'num_layers', 'sequence_length',
                       'memory_size']
    for k in self.param_keys:
        if k not in trainer_parameters:
            raise UnityTrainerException(
                "The hyperparameter {0} could not be found for the Imitation trainer of "
                "brain {1}.".format(k, brain.brain_name))
    self.policy = BCPolicy(seed, brain, trainer_parameters, sess)
    self.brain_name = brain.brain_name
    self.brain_to_imitate = trainer_parameters['brain_to_imitate']
    self.batches_per_epoch = trainer_parameters['batches_per_epoch']
    self.n_sequences = max(int(trainer_parameters['batch_size'] / self.policy.sequence_length), 1)
    self.cumulative_rewards = {}
    self.episode_steps = {}
    self.stats = {'losses': [], 'episode_length': [], 'cumulative_reward': []}
    self.training_buffer = Buffer()
    self.summary_path = trainer_parameters['summary_path']
    if not os.path.exists(self.summary_path):
        os.makedirs(self.summary_path)
    self.summary_writer = tf.summary.FileWriter(self.summary_path)
def _check_resolution_for_encoder(camera_res: CameraResolution, vis_encoder_type: EncoderType) -> None:
    min_res = LearningModel.MIN_RESOLUTION_FOR_ENCODER[vis_encoder_type]
    if camera_res.height < min_res or camera_res.width < min_res:
        raise UnityTrainerException(
            f"Visual observation resolution ({camera_res.width}x{camera_res.height}) is too small for "
            f"the provided EncoderType ({vis_encoder_type.value}). The minimum dimension is {min_res}."
        )
def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id):
    """
    Responsible for collecting experiences and training PPO model.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param reward_buff_cap: Max reward history to track in the reward buffer
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded.
    :param seed: The seed the model will be initialized with
    :param run_id: The identifier of the current run
    """
    super().__init__(brain, trainer_parameters, training, run_id, reward_buff_cap)
    self.param_keys = [
        "batch_size",
        "beta",
        "buffer_size",
        "epsilon",
        "hidden_units",
        "lambd",
        "learning_rate",
        "max_steps",
        "normalize",
        "num_epoch",
        "num_layers",
        "time_horizon",
        "sequence_length",
        "summary_freq",
        "use_recurrent",
        "summary_path",
        "memory_size",
        "model_path",
        "reward_signals",
    ]
    self.check_param_keys()

    # Make sure we have at least one reward_signal
    if not self.trainer_parameters["reward_signals"]:
        raise UnityTrainerException(
            "No reward signals were defined. At least one must be used with {}.".format(
                self.__class__.__name__
            )
        )

    self.step = 0
    self.policy = PPOPolicy(seed, brain, trainer_parameters, self.is_training, load)
    stats = defaultdict(list)
    # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
    # used for reporting only. We always want to report the environment reward to Tensorboard, regardless
    # of what reward signals are actually present.
    self.collected_rewards = {"environment": {}}
    for _reward_signal in self.policy.reward_signals.keys():
        self.collected_rewards[_reward_signal] = {}

    self.stats = stats
    self.training_buffer = Buffer()
    self.episode_steps = {}
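# Illustrative only: a minimal trainer_parameters dict containing the keys the PPOTrainer
# above validates with check_param_keys(). Values are placeholders rather than recommended
# hyperparameters, and the "extrinsic" reward signal entry assumes the usual strength/gamma
# fields rather than quoting the library's exact schema.
example_ppo_trainer_parameters = {
    "batch_size": 1024,
    "beta": 5.0e-3,
    "buffer_size": 10240,
    "epsilon": 0.2,
    "hidden_units": 128,
    "lambd": 0.95,
    "learning_rate": 3.0e-4,
    "max_steps": 5.0e5,
    "normalize": False,
    "num_epoch": 3,
    "num_layers": 2,
    "time_horizon": 64,
    "sequence_length": 64,
    "summary_freq": 1000,
    "use_recurrent": False,
    "summary_path": "./summaries/run-1_SomeBrain",
    "memory_size": 256,
    "model_path": "./models/run-1/SomeBrain",
    "reward_signals": {"extrinsic": {"strength": 1.0, "gamma": 0.99}},
}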
def create_reward_signal(policy: TFPolicy, name: str, config_entry: Dict[str, Any]) -> RewardSignal:
    """
    Creates a reward signal class based on the name and config entry provided as a dict.
    :param policy: The policy class which the reward will be applied to.
    :param name: The name of the reward signal
    :param config_entry: The config entries for that reward signal
    :return: The reward signal class instantiated
    """
    rcls = NAME_TO_CLASS.get(name)
    if not rcls:
        raise UnityTrainerException("Unknown reward signal type {0}".format(name))
    rcls.check_config(config_entry)
    try:
        class_inst = rcls(policy, **config_entry)
    except TypeError:
        raise UnityTrainerException(
            "Unknown parameters given for reward signal {0}".format(name)
        )
    return class_inst
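# Hypothetical usage sketch for create_reward_signal() above: instantiate every signal named
# in a reward_signals config section. `policy` is a placeholder for an existing TFPolicy, and
# the config values are illustrative; only names present in NAME_TO_CLASS are accepted.
reward_signal_configs = {"extrinsic": {"strength": 1.0, "gamma": 0.99}}
reward_signals = {
    name: create_reward_signal(policy, name, cfg)
    for name, cfg in reward_signal_configs.items()
}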
def check_config(config_dict: Dict[str, Any]) -> None:
    """
    Check the behavioral_cloning config for the required keys.
    :param config_dict: Pretraining section of trainer_config
    """
    param_keys = ["strength", "demo_path", "steps"]
    for k in param_keys:
        if k not in config_dict:
            raise UnityTrainerException(
                "The required pre-training hyper-parameter {0} was not defined. "
                "Please check your trainer YAML file.".format(k)
            )
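# Illustrative pretraining (behavioral cloning) section that would pass check_config() above.
# The demo path and values are placeholders, not defaults.
example_pretraining_config = {
    "strength": 0.5,
    "demo_path": "./demos/ExpertDemo.demo",
    "steps": 10000,
}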
def check_config(cls, config_dict: Dict[str, Any], param_keys: List[str] = None) -> None:
    """
    Check the config dict, and throw an error if there are missing hyperparameters.
    """
    param_keys = param_keys or []
    for k in param_keys:
        if k not in config_dict:
            raise UnityTrainerException(
                "The hyper-parameter {0} could not be found for {1}.".format(k, cls.__name__)
            )
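# Hypothetical subclass sketch: a concrete reward signal narrowing the shared check_config()
# above to the keys it needs before instantiation. The class name and key list are assumptions
# for illustration, not the library's exact definitions.
class ExampleRewardSignal(RewardSignal):
    @classmethod
    def check_config(cls, config_dict, param_keys=None):
        # Require this signal's hyperparameters; missing keys raise UnityTrainerException.
        super().check_config(config_dict, ["strength", "gamma"])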
def add_policy_outputs(self, take_action_outputs: ActionInfoOutputs, agent_id: str, agent_idx: int) -> None:
    """
    Takes the output of the last action and stores it into the training buffer.
    We break this out from add_experiences since it is very highly dependent
    on the type of trainer.
    :param take_action_outputs: The outputs of the Policy's get_action method.
    :param agent_id: the Agent we're adding to.
    :param agent_idx: the index of the Agent agent_id
    """
    raise UnityTrainerException("The add_policy_outputs method was not implemented.")
def __init__(self, *args, **kwargs):
    super(RLTrainer, self).__init__(*args, **kwargs)
    # Make sure we have at least one reward_signal
    if not self.trainer_parameters["reward_signals"]:
        raise UnityTrainerException(
            "No reward signals were defined. At least one must be used with {}.".format(
                self.__class__.__name__
            )
        )
    # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
    # used for reporting only. We always want to report the environment reward to Tensorboard, regardless
    # of what reward signals are actually present.
    self.collected_rewards = {"environment": {}}
    self.training_buffer = Buffer()
    self.episode_steps = {}
def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
    """
    Responsible for collecting experiences and training an offline behavioral cloning model.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded.
    :param seed: The seed the model will be initialized with
    :param run_id: The identifier of the current run
    """
    super(OfflineBCTrainer, self).__init__(brain, trainer_parameters, training, load, seed, run_id)
    self.param_keys = [
        "batch_size",
        "summary_freq",
        "max_steps",
        "batches_per_epoch",
        "use_recurrent",
        "hidden_units",
        "learning_rate",
        "num_layers",
        "sequence_length",
        "memory_size",
        "model_path",
        "demo_path",
    ]
    self.check_param_keys()
    self.batches_per_epoch = trainer_parameters["batches_per_epoch"]
    self.n_sequences = max(
        int(trainer_parameters["batch_size"] / self.policy.sequence_length), 1
    )

    brain_params, self.demonstration_buffer = demo_to_buffer(
        trainer_parameters["demo_path"], self.policy.sequence_length
    )

    policy_brain = copy.deepcopy(brain.__dict__)
    expert_brain = copy.deepcopy(brain_params.__dict__)
    policy_brain.pop("brain_name")
    expert_brain.pop("brain_name")
    policy_brain.pop("vector_action_descriptions")
    expert_brain.pop("vector_action_descriptions")
    if expert_brain != policy_brain:
        raise UnityTrainerException(
            "The provided demonstration is not compatible with the "
            "brain being used for performance evaluation."
        )
def create_learning_rate(
    lr_schedule: LearningRateSchedule,
    lr: float,
    global_step: tf.Tensor,
    max_step: int,
) -> tf.Tensor:
    if lr_schedule == LearningRateSchedule.CONSTANT:
        learning_rate = tf.Variable(lr)
    elif lr_schedule == LearningRateSchedule.LINEAR:
        learning_rate = tf.train.polynomial_decay(
            lr, global_step, max_step, 1e-10, power=1.0
        )
    else:
        raise UnityTrainerException(
            "The learning rate schedule {} is invalid.".format(lr_schedule)
        )
    return learning_rate
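# Minimal usage sketch for create_learning_rate() above, assuming a TF1-style graph in which
# global_step is the counter the optimizer increments. Values are illustrative only.
global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name="global_step")
learning_rate = create_learning_rate(
    LearningRateSchedule.LINEAR, lr=3.0e-4, global_step=global_step, max_step=500000
)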
def add_rewards_outputs(
    self,
    rewards_out: AllRewardsOutput,
    values: Dict[str, np.ndarray],
    agent_id: str,
    agent_idx: int,
    agent_next_idx: int,
) -> None:
    """
    Takes the value and evaluated rewards output of the last action and stores it
    into the training buffer. We break this out from add_experiences since it is
    very highly dependent on the type of trainer.
    :param rewards_out: The evaluated rewards output of the last action
    :param values: Dict of value estimates of the last action
    :param agent_id: the Agent we're adding to.
    :param agent_idx: the index of the Agent agent_id in the current brain info
    :param agent_next_idx: the index of the Agent agent_id in the next brain info
    """
    raise UnityTrainerException("The add_rewards_outputs method was not implemented.")
def initialize_trainer(
    trainer_config: Any,
    brain_parameters: BrainParameters,
    summaries_dir: str,
    run_id: str,
    model_path: str,
    keep_checkpoints: int,
    train_model: bool,
    load_model: bool,
    seed: int,
    meta_curriculum: MetaCurriculum = None,
    multi_gpu: bool = False,
) -> Trainer:
    """
    Initializes a trainer given a provided trainer configuration and brain parameters,
    as well as some general training session options.

    :param trainer_config: Original trainer configuration loaded from YAML
    :param brain_parameters: BrainParameters provided by the Unity environment
    :param summaries_dir: Directory to store trainer summary statistics
    :param run_id: Run ID to associate with this training run
    :param model_path: Path to save the model
    :param keep_checkpoints: How many model checkpoints to keep
    :param train_model: Whether to train the model (vs. run inference)
    :param load_model: Whether to load the model or randomly initialize
    :param seed: The random seed to use
    :param meta_curriculum: Optional meta_curriculum, used to determine a reward buffer length for PPOTrainer
    :param multi_gpu: Whether to use multi-GPU training
    :return: The initialized trainer for the given brain
    """
    brain_name = brain_parameters.brain_name
    if "default" not in trainer_config and brain_name not in trainer_config:
        raise TrainerConfigError(
            f'Trainer config must have either a "default" section, or a section for the brain name ({brain_name}). '
            "See config/trainer_config.yaml for an example."
        )

    trainer_parameters = trainer_config.get("default", {}).copy()
    trainer_parameters["summary_path"] = str(run_id) + "_" + brain_name
    trainer_parameters["model_path"] = "{basedir}/{name}".format(
        basedir=model_path, name=brain_name
    )
    trainer_parameters["keep_checkpoints"] = keep_checkpoints
    if brain_name in trainer_config:
        _brain_key: Any = brain_name
        while not isinstance(trainer_config[_brain_key], dict):
            _brain_key = trainer_config[_brain_key]
        trainer_parameters.update(trainer_config[_brain_key])

    min_lesson_length = 1
    if meta_curriculum:
        if brain_name in meta_curriculum.brains_to_curriculums:
            min_lesson_length = meta_curriculum.brains_to_curriculums[brain_name].min_lesson_length
        else:
            logger.warning(
                f"Metacurriculum enabled, but no curriculum for brain {brain_name}. "
                f"Brains with curricula: {meta_curriculum.brains_to_curriculums.keys()}. "
            )

    trainer: Trainer = None  # type: ignore  # will be set to one of these, or raise
    if "trainer" not in trainer_parameters:
        raise TrainerConfigError(
            f'The "trainer" key must be set in your trainer config for brain {brain_name} (or the default brain).'
        )
    trainer_type = trainer_parameters["trainer"]

    if trainer_type == "offline_bc":
        raise UnityTrainerException(
            "The offline_bc trainer has been removed. To train with demonstrations, "
            "please use a PPO or SAC trainer with the GAIL Reward Signal and/or the "
            "Behavioral Cloning feature enabled."
        )
    elif trainer_type == "ppo":
        trainer = PPOTrainer(
            brain_parameters,
            min_lesson_length,
            trainer_parameters,
            train_model,
            load_model,
            seed,
            run_id,
            multi_gpu,
        )
    elif trainer_type == "sac":
        trainer = SACTrainer(
            brain_parameters,
            min_lesson_length,
            trainer_parameters,
            train_model,
            load_model,
            seed,
            run_id,
        )
    else:
        raise TrainerConfigError(
            f'The trainer config contains an unknown trainer type "{trainer_type}" for brain {brain_name}'
        )
    return trainer
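# Illustrative trainer_config layout that initialize_trainer() above can consume: a "default"
# section plus optional per-brain overrides. The brain name and values are placeholders; a brain
# entry may also be a string alias to another section, which the while-loop above resolves.
example_trainer_config = {
    "default": {
        "trainer": "ppo",
        "batch_size": 1024,
        "max_steps": 5.0e5,
    },
    "SomeBrain": {
        "trainer": "sac",
        "batch_size": 256,
    },
}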