# Imports follow the usual ML-Agents test-suite layout:
import os

import pytest

from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.tests.mock_brain import setup_test_behavior_specs


def test_demo_mismatch():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    # observation size mismatch
    with pytest.raises(RuntimeError):
        mismatch_obs = setup_test_behavior_specs(
            False, False, vector_action_space=2, vector_obs_space=9
        )
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, mismatch_obs)
    # action mismatch
    with pytest.raises(RuntimeError):
        mismatch_act = setup_test_behavior_specs(
            False, False, vector_action_space=3, vector_obs_space=9
        )
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, mismatch_act)
    # action type mismatch
    with pytest.raises(RuntimeError):
        mismatch_act_type = setup_test_behavior_specs(
            True, False, vector_action_space=[2], vector_obs_space=9
        )
        _, demo_buffer = demo_to_buffer(
            path_prefix + "/test.demo", 1, mismatch_act_type
        )
    # number obs mismatch
    with pytest.raises(RuntimeError):
        mismatch_obs_number = setup_test_behavior_specs(
            False, True, vector_action_space=2, vector_obs_space=9
        )
        _, demo_buffer = demo_to_buffer(
            path_prefix + "/test.demo", 1, mismatch_obs_number
        )

def test_demo_mismatch():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    # observation mismatch
    with pytest.raises(RuntimeError):
        brain_params_obs = BrainParameters(
            brain_name="test_brain",
            vector_observation_space_size=9,
            camera_resolutions=[],
            vector_action_space_size=[2],
            vector_action_descriptions=[],
            vector_action_space_type=1,
        )
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, brain_params_obs)
    # action mismatch
    with pytest.raises(RuntimeError):
        brain_params_act = BrainParameters(
            brain_name="test_brain",
            vector_observation_space_size=8,
            camera_resolutions=[],
            vector_action_space_size=[3],
            vector_action_descriptions=[],
            vector_action_space_type=1,
        )
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, brain_params_act)
    # action type mismatch
    with pytest.raises(RuntimeError):
        brain_params_type = BrainParameters(
            brain_name="test_brain",
            vector_observation_space_size=8,
            camera_resolutions=[],
            vector_action_space_size=[2],
            vector_action_descriptions=[],
            vector_action_space_type=0,
        )
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, brain_params_type)
    # vis obs mismatch
    with pytest.raises(RuntimeError):
        brain_params_vis = BrainParameters(
            brain_name="test_brain",
            vector_observation_space_size=8,
            camera_resolutions=[[30, 40]],
            vector_action_space_size=[2],
            vector_action_descriptions=[],
            vector_action_space_type=1,
        )
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, brain_params_vis)

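# For contrast, a spec that matches test.demo should load without raising. The
# values below are inferred from the mismatch cases above (8 vector
# observations, one continuous action branch of size 2, no cameras); this
# happy-path test is a sketch, not part of the original suite:
def test_demo_match():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    brain_params_match = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=8,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=1,
    )
    _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, brain_params_match)
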
def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None:
    super().__init__(specs, settings)
    self._ignore_done = True
    self._discriminator_network = DiscriminatorNetwork(specs, settings)
    _, self._demo_buffer = demo_to_buffer(
        settings.demo_path, 1, specs
    )  # This is supposed to be the sequence length but we do not have access here
    params = list(self._discriminator_network.parameters())
    self.optimizer = torch.optim.Adam(params, lr=settings.learning_rate)

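# If the trainer's sequence length were reachable here, the call above would
# presumably pass it instead of the hard-coded 1. A hedged sketch of threading
# it through (the extra `sequence_length` parameter is an assumption, not the
# library's actual signature):
#
#     def __init__(self, specs, settings, sequence_length: int = 1) -> None:
#         ...
#         _, self._demo_buffer = demo_to_buffer(
#             settings.demo_path, sequence_length, specs
#         )
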
def test_load_demo_dir():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    behavior_spec, pair_infos, total_expected = load_demonstration(
        path_prefix + "/test_demo_dir"
    )
    assert np.sum(behavior_spec.observation_shapes[0]) == 8
    assert len(pair_infos) == total_expected
    _, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1)
    assert len(demo_buffer["actions"]) == total_expected - 1

def test_load_demo():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    brain_parameters, pair_infos, total_expected = load_demonstration(
        path_prefix + "/test.demo"
    )
    assert brain_parameters.brain_name == "Ball3DBrain"
    assert brain_parameters.vector_observation_space_size == 8
    assert len(pair_infos) == total_expected
    _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1)
    assert len(demo_buffer["actions"]) == total_expected - 1

def test_load_demo_dir():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    behavior_spec, pair_infos, total_expected = load_demonstration(
        path_prefix + "/test_demo_dir"
    )
    assert np.sum(behavior_spec.observation_shapes[0]) == 8
    assert len(pair_infos) == total_expected
    _, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1, BEHAVIOR_SPEC)
    assert (
        len(demo_buffer["continuous_action"]) == total_expected - 1
        or len(demo_buffer["discrete_action"]) == total_expected - 1
    )

def test_load_demo_dir():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    behavior_spec, pair_infos, total_expected = load_demonstration(
        path_prefix + "/test_demo_dir"
    )
    assert np.sum(behavior_spec.observation_specs[0].shape) == 8
    assert len(pair_infos) == total_expected
    _, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1, BEHAVIOR_SPEC)
    assert (
        len(demo_buffer[BufferKey.CONTINUOUS_ACTION]) == total_expected - 1
        or len(demo_buffer[BufferKey.DISCRETE_ACTION]) == total_expected - 1
    )

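# Hedged sketch of consuming the loaded buffer: this assumes AgentBuffer's
# sample_mini_batch helper is available in this version; the helper name and
# the batch/sequence sizes below are illustrative.
def sample_expert_batch(demo_buffer, batch_size=64, sequence_length=1):
    # Returns an AgentBuffer holding a randomly drawn mini batch of experiences.
    return demo_buffer.sample_mini_batch(batch_size, sequence_length)
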
def __init__(
    self,
    policy: TFPolicy,
    strength: float,
    gamma: float,
    demo_path: str,
    encoding_size: int = 64,
    learning_rate: float = 3e-4,
    use_actions: bool = False,
    use_vail: bool = False,
):
    """
    The GAIL Reward signal generator. https://arxiv.org/abs/1606.03476
    :param policy: The policy of the learning model
    :param strength: The scaling parameter for the reward. The scaled reward will be the
        unscaled reward multiplied by the strength parameter
    :param gamma: The time discounting factor used for this reward.
    :param demo_path: The path to the demonstration file
    :param encoding_size: The size of the hidden layers of the discriminator
    :param learning_rate: The Learning Rate used during GAIL updates.
    :param use_actions: Whether or not to use the actions for the discriminator.
    :param use_vail: Whether or not to use a variational bottleneck for the discriminator.
        See https://arxiv.org/abs/1810.00821.
    """
    super().__init__(policy, strength, gamma)
    self.use_terminal_states = False
    self.model = GAILModel(
        policy, 128, learning_rate, encoding_size, use_actions, use_vail
    )
    _, self.demonstration_buffer = demo_to_buffer(
        demo_path, policy.sequence_length, policy.brain
    )
    self.has_updated = False
    self.update_dict: Dict[str, tf.Tensor] = {
        "gail_loss": self.model.loss,
        "gail_update_batch": self.model.update_batch,
        "gail_policy_estimate": self.model.mean_policy_estimate,
        "gail_expert_estimate": self.model.mean_expert_estimate,
    }
    if self.model.use_vail:
        self.update_dict["kl_loss"] = self.model.kl_loss
        self.update_dict["z_log_sigma_sq"] = self.model.z_log_sigma_sq
        self.update_dict["z_mean_expert"] = self.model.z_mean_expert
        self.update_dict["z_mean_policy"] = self.model.z_mean_policy
        self.update_dict["beta_update"] = self.model.update_beta
    self.stats_name_to_update_name = {
        "Losses/GAIL Loss": "gail_loss",
        "Policy/GAIL Policy Estimate": "gail_policy_estimate",
        "Policy/GAIL Expert Estimate": "gail_expert_estimate",
    }

def __init__(
    self,
    policy: TFPolicy,
    policy_learning_rate: float,
    default_batch_size: int,
    default_num_epoch: int,
    strength: float,
    demo_path: str,
    steps: int,
    batch_size: int = None,
    num_epoch: int = None,
    samples_per_update: int = 0,
):
    """
    A BC trainer that can be used inline with RL.
    :param policy: The policy of the learning model
    :param policy_learning_rate: The initial Learning Rate of the policy. Used to set an
        appropriate learning rate for the pretrainer.
    :param default_batch_size: The default batch size to use if batch_size isn't provided.
    :param default_num_epoch: The default num_epoch to use if num_epoch isn't provided.
    :param strength: The proportion of learning rate used to update through BC.
    :param demo_path: The path to the demonstration file.
    :param steps: The number of steps to anneal BC training over. 0 for continuous training.
    :param batch_size: The batch size to use during BC training.
    :param num_epoch: Number of epochs to train for during each update.
    :param samples_per_update: Maximum number of samples to train on during each BC update.
    """
    self.policy = policy
    self.current_lr = policy_learning_rate * strength
    self.model = BCModel(policy, self.current_lr, steps)
    _, self.demonstration_buffer = demo_to_buffer(
        demo_path, policy.sequence_length, policy.brain
    )
    self.batch_size = batch_size if batch_size else default_batch_size
    self.num_epoch = num_epoch if num_epoch else default_num_epoch
    self.n_sequences = max(
        min(self.batch_size, self.demonstration_buffer.num_experiences)
        // policy.sequence_length,
        1,
    )
    self.has_updated = False
    self.use_recurrent = self.policy.use_recurrent
    self.samples_per_update = samples_per_update
    self.out_dict = {
        "loss": self.model.loss,
        "update": self.model.update_batch,
        "learning_rate": self.model.annealed_learning_rate,
    }

def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
    """
    Responsible for collecting experiences and training an offline
    behavioral cloning model.
    :param brain: The brain the trainer is learning from.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded.
    :param seed: The seed the model will be initialized with
    :param run_id: The identifier of the current run
    """
    super(OfflineBCTrainer, self).__init__(
        brain, trainer_parameters, training, load, seed, run_id
    )
    self.param_keys = [
        "batch_size",
        "summary_freq",
        "max_steps",
        "batches_per_epoch",
        "use_recurrent",
        "hidden_units",
        "learning_rate",
        "num_layers",
        "sequence_length",
        "memory_size",
        "model_path",
        "demo_path",
    ]
    self.check_param_keys()
    self.batches_per_epoch = trainer_parameters["batches_per_epoch"]
    self.n_sequences = max(
        int(trainer_parameters["batch_size"] / self.policy.sequence_length), 1
    )
    brain_params, self.demonstration_buffer = demo_to_buffer(
        trainer_parameters["demo_path"], self.policy.sequence_length
    )
    # Compare the policy brain and the expert brain field by field, ignoring
    # fields that do not affect compatibility.
    policy_brain = copy.deepcopy(brain.__dict__)
    expert_brain = copy.deepcopy(brain_params.__dict__)
    policy_brain.pop("brain_name")
    expert_brain.pop("brain_name")
    policy_brain.pop("vector_action_descriptions")
    expert_brain.pop("vector_action_descriptions")
    if expert_brain != policy_brain:
        raise UnityTrainerException(
            "The provided demonstration is not compatible with the "
            "brain being used for performance evaluation."
        )

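# The compatibility check above is a field-by-field comparison of the two
# BrainParameters that ignores brain_name and vector_action_descriptions.
# The same idea as a reusable helper (a sketch; the name is illustrative):
def brains_compatible(policy_brain, expert_brain):
    ignored = ("brain_name", "vector_action_descriptions")
    policy_fields = {k: v for k, v in policy_brain.__dict__.items() if k not in ignored}
    expert_fields = {k: v for k, v in expert_brain.__dict__.items() if k not in ignored}
    return policy_fields == expert_fields
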
def __init__(
    self,
    policy: TorchPolicy,
    settings: BehavioralCloningSettings,
    policy_learning_rate: float,
    default_batch_size: int,
    default_num_epoch: int,
):
    """
    A BC trainer that can be used inline with RL.
    :param policy: The policy of the learning model
    :param settings: The settings for BehavioralCloning including LR strength, batch_size,
        num_epochs, samples_per_update and LR annealing steps.
    :param policy_learning_rate: The initial Learning Rate of the policy. Used to set an
        appropriate learning rate for the pretrainer.
    """
    self.policy = policy
    self._anneal_steps = settings.steps
    self.current_lr = policy_learning_rate * settings.strength
    learning_rate_schedule: ScheduleType = (
        ScheduleType.LINEAR if self._anneal_steps > 0 else ScheduleType.CONSTANT
    )
    self.decay_learning_rate = ModelUtils.DecayedValue(
        learning_rate_schedule, self.current_lr, 1e-10, self._anneal_steps
    )
    params = self.policy.actor_critic.parameters()
    self.optimizer = torch.optim.Adam(params, lr=self.current_lr)
    _, self.demonstration_buffer = demo_to_buffer(
        settings.demo_path, policy.sequence_length, policy.behavior_spec
    )
    self.batch_size = (
        settings.batch_size if settings.batch_size else default_batch_size
    )
    self.num_epoch = settings.num_epoch if settings.num_epoch else default_num_epoch
    self.n_sequences = max(
        min(self.batch_size, self.demonstration_buffer.num_experiences)
        // policy.sequence_length,
        1,
    )
    self.has_updated = False
    self.use_recurrent = self.policy.use_recurrent
    self.samples_per_update = settings.samples_per_update

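# Worked example of the n_sequences arithmetic above (the numbers are
# illustrative): with batch_size=512, 10_000 recorded experiences and
# sequence_length=64, n_sequences = max(min(512, 10_000) // 64, 1) = 8;
# with a tiny demo of only 32 experiences it falls back to
# max(32 // 64, 1) = 1, so at least one sequence is always trained on.
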
def __init__(self, policy: TFPolicy, settings: GAILSettings):
    """
    The GAIL Reward signal generator. https://arxiv.org/abs/1606.03476
    :param policy: The policy of the learning model
    :param settings: The settings for this GAILRewardSignal. See
        https://arxiv.org/abs/1810.00821 for the variational bottleneck (VAIL) option.
    """
    super().__init__(policy, settings)
    self.use_terminal_states = False
    self.model = GAILModel(
        policy,
        128,
        settings.learning_rate,
        settings.encoding_size,
        settings.use_actions,
        settings.use_vail,
    )
    _, self.demonstration_buffer = demo_to_buffer(
        settings.demo_path, policy.sequence_length, policy.brain
    )
    self.has_updated = False
    self.update_dict: Dict[str, tf.Tensor] = {
        "gail_loss": self.model.loss,
        "gail_update_batch": self.model.update_batch,
        "gail_policy_estimate": self.model.mean_policy_estimate,
        "gail_expert_estimate": self.model.mean_expert_estimate,
    }
    if self.model.use_vail:
        self.update_dict["kl_loss"] = self.model.kl_loss
        self.update_dict["z_log_sigma_sq"] = self.model.z_log_sigma_sq
        self.update_dict["z_mean_expert"] = self.model.z_mean_expert
        self.update_dict["z_mean_policy"] = self.model.z_mean_policy
        self.update_dict["beta_update"] = self.model.update_beta
    self.stats_name_to_update_name = {
        "Losses/GAIL Loss": "gail_loss",
        "Policy/GAIL Policy Estimate": "gail_policy_estimate",
        "Policy/GAIL Expert Estimate": "gail_expert_estimate",
    }

def __init__(
    self,
    policy: TFPolicy,
    strength: float,
    gamma: float,
    demo_path: str,
    num_epoch: int = 3,
    encoding_size: int = 64,
    learning_rate: float = 3e-4,
    samples_per_update: int = 0,
    use_actions: bool = False,
    use_vail: bool = False,
):
    """
    The GAIL Reward signal generator. https://arxiv.org/abs/1606.03476
    :param policy: The policy of the learning model
    :param strength: The scaling parameter for the reward. The scaled reward will be the
        unscaled reward multiplied by the strength parameter
    :param gamma: The time discounting factor used for this reward.
    :param demo_path: The path to the demonstration file
    :param num_epoch: The number of epochs to train over the training buffer for the
        discriminator.
    :param encoding_size: The size of the hidden layers of the discriminator
    :param learning_rate: The Learning Rate used during GAIL updates.
    :param samples_per_update: The maximum number of samples to update during GAIL updates.
    :param use_actions: Whether or not to use the actions for the discriminator.
    :param use_vail: Whether or not to use a variational bottleneck for the discriminator.
        See https://arxiv.org/abs/1810.00821.
    """
    super().__init__(policy, strength, gamma)
    self.num_epoch = num_epoch
    self.samples_per_update = samples_per_update
    self.use_terminal_states = False
    self.model = GAILModel(
        policy.model, 128, learning_rate, encoding_size, use_actions, use_vail
    )
    _, self.demonstration_buffer = demo_to_buffer(demo_path, policy.sequence_length)
    self.has_updated = False

def load_demo(self):
    # Assumes `demo_path` is defined in the enclosing scope (e.g. a module-level
    # fixture path); this uses the old two-value demo_to_buffer return.
    brain_params, demo_buffer = demo_to_buffer(demo_path, 1)
    update_buffer = demo_buffer.update_buffer
    return update_buffer

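# A self-contained equivalent of the helper above, under the assumption that
# the old two-argument demo_to_buffer API is in use (the default file name is
# illustrative):
def load_update_buffer(demo_file="expert.demo"):
    # The old API returns a Buffer whose update_buffer holds every experience.
    brain_params, demo_buffer = demo_to_buffer(demo_file, 1)
    return demo_buffer.update_buffer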