def __init__(
    self,
    venv: VecEnv,
    gen_policy: BaseRLModel,
    discrim: discrim_net.DiscrimNet,
    expert_demos: types.Transitions,
    *,
    log_dir: str = "output/",
    disc_batch_size: int = 2048,
    disc_minibatch_size: int = 256,
    disc_opt_cls: Type[tf.train.Optimizer] = tf.train.AdamOptimizer,
    disc_opt_kwargs: dict = {},
    gen_replay_buffer_capacity: Optional[int] = None,
    init_tensorboard: bool = False,
    init_tensorboard_graph: bool = False,
    debug_use_ground_truth: bool = False,
):
    """Builds Trainer.

    Args:
        venv: The vectorized environment to train in.
        gen_policy: The generator policy that is trained to maximize
            discriminator confusion. The generator batch size
            `self.gen_batch_size` is inferred from `gen_policy.n_batch`.
        discrim: The discriminator network. For GAIL, use a DiscrimNetGAIL.
            For AIRL, use a DiscrimNetAIRL.
        expert_demos: Transitions from an expert dataset.
        log_dir: Directory to store TensorBoard logs, plots, etc. in.
        disc_batch_size: The default number of expert and generator transition
            samples to feed to the discriminator in each call to
            `self.train_disc()`. (Half of the samples are expert and half of
            the samples are generator.)
        disc_minibatch_size: The discriminator minibatch size. Each
            discriminator batch is split into minibatches and an Adam update
            is applied on the gradient resulting from each minibatch. Must
            evenly divide `disc_batch_size`. Must be an even number.
        disc_opt_cls: The optimizer for discriminator training.
        disc_opt_kwargs: Parameters for discriminator training.
        gen_replay_buffer_capacity: The capacity of the generator replay
            buffer (the number of obs-action-obs samples from the generator
            that can be stored). By default this is equal to
            `self.gen_batch_size`, meaning that we sample only from the most
            recent batch of generator samples.
        init_tensorboard: If True, makes various discriminator TensorBoard
            summaries.
        init_tensorboard_graph: If both this and `init_tensorboard` are True,
            then write a TensorBoard graph summary to disk.
        debug_use_ground_truth: If True, use the ground truth reward for
            `self.train_env`. This disables the reward wrapping that would
            normally replace the environment reward with the learned reward.
            This is useful for sanity checking that the policy training is
            functional.
    """
    assert (
        logger.is_configured()
    ), "Requires call to imitation.util.logger.configure"
    self._sess = tf.get_default_session()
    self._global_step = tf.train.create_global_step()

    assert disc_batch_size % disc_minibatch_size == 0
    assert disc_minibatch_size % 2 == 0, (
        "discriminator minibatch size must be even "
        "(equal split between generator and expert samples)"
    )
    self.disc_batch_size = disc_batch_size
    self.disc_minibatch_size = disc_minibatch_size
    self.debug_use_ground_truth = debug_use_ground_truth
    self.venv = venv
    self._expert_demos = expert_demos
    self._gen_policy = gen_policy
    self._log_dir = log_dir

    # Create graph for optimising/recording stats on discriminator
    self._discrim = discrim
    self._disc_opt_cls = disc_opt_cls
    self._disc_opt_kwargs = disc_opt_kwargs
    self._init_tensorboard = init_tensorboard
    self._init_tensorboard_graph = init_tensorboard_graph
    self._build_graph()
    self._sess.run(tf.global_variables_initializer())

    if debug_use_ground_truth:
        # Would use an identity reward fn here, but RewardFns can't see rewards.
        self.reward_train = self.reward_test = None
        self.venv_train = self.venv_test = self.venv
    else:
        self.reward_train = partial(
            self.discrim.reward_train,
            # The generator policy uses normalized observations, but the
            # reward function (self.reward_train) and discriminator use and
            # receive unnormalized observations. Therefore, to get the right
            # log action probs for AIRL's ent bonus, we need to normalize obs.
            gen_log_prob_fn=self._gen_log_action_prob_from_unnormalized,
        )
        self.reward_test = self.discrim.reward_test
        self.venv_train = reward_wrapper.RewardVecEnvWrapper(
            self.venv, self.reward_train)
        self.venv_test = reward_wrapper.RewardVecEnvWrapper(
            self.venv, self.reward_test)

    self.venv_train_buffering = wrappers.BufferingWrapper(self.venv_train)
    self.venv_train_norm = VecNormalize(self.venv_train_buffering)
    self.gen_policy.set_env(self.venv_train_norm)

    if gen_replay_buffer_capacity is None:
        gen_replay_buffer_capacity = self.gen_batch_size
    self._gen_replay_buffer = buffer.ReplayBuffer(
        gen_replay_buffer_capacity, self.venv)
    self._exp_replay_buffer = buffer.ReplayBuffer.from_data(expert_demos)
    if self.disc_batch_size // 2 > len(self._exp_replay_buffer):
        warn(
            "The discriminator batch size is more than twice the number of "
            "expert samples. This means that we will be reusing samples every "
            "discrim batch.")
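
# Illustrative sketch, not part of the original trainer. The assertions above
# require that `disc_batch_size` is a multiple of `disc_minibatch_size` and
# that each minibatch splits evenly between expert and generator samples.
# The helper name `split_disc_batch` is hypothetical; it only demonstrates
# the arithmetic implied by those assertions.
def split_disc_batch(disc_batch_size: int = 2048, disc_minibatch_size: int = 256):
    """Return (n_minibatches, expert_per_minibatch, gen_per_minibatch)."""
    assert disc_batch_size % disc_minibatch_size == 0
    assert disc_minibatch_size % 2 == 0
    n_minibatches = disc_batch_size // disc_minibatch_size
    half = disc_minibatch_size // 2
    return n_minibatches, half, half


# With the defaults: 8 minibatches of 128 expert + 128 generator samples each.
assert split_disc_batch() == (8, 128, 128)
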
def __init__(
    self,
    venv: vec_env.VecEnv,
    gen_algo: on_policy_algorithm.OnPolicyAlgorithm,
    discrim: discrim_nets.DiscrimNet,
    expert_data: Union[Iterable[Mapping], types.Transitions],
    expert_batch_size: int,
    n_disc_updates_per_round: int = 2,
    *,
    log_dir: str = "output/",
    normalize_obs: bool = True,
    normalize_reward: bool = True,
    disc_opt_cls: Type[th.optim.Optimizer] = th.optim.Adam,
    disc_opt_kwargs: Optional[Mapping] = None,
    gen_replay_buffer_capacity: Optional[int] = None,
    init_tensorboard: bool = False,
    init_tensorboard_graph: bool = False,
    debug_use_ground_truth: bool = False,
):
    """Builds AdversarialTrainer.

    Args:
        venv: The vectorized environment to train in.
        gen_algo: The generator RL algorithm that is trained to maximize
            discriminator confusion. The generator batch size
            `self.gen_batch_size` is inferred from `gen_algo.n_steps`.
        discrim: The discriminator network. This will be moved to the same
            device as `gen_algo`.
        expert_data: Either a `torch.utils.data.DataLoader`-like object or an
            instance of `Transitions` which is automatically converted into a
            shuffled version of the former type.

            If the argument passed is a `DataLoader`, then it must yield
            batches of expert data via its `__iter__` method. Each batch is a
            dictionary whose keys "obs", "acts", "next_obs", and "dones"
            correspond to Tensor or NumPy array values, each with batch
            dimension equal to `expert_batch_size`. If any batch dimension
            doesn't equal `expert_batch_size` then a `ValueError` is raised.

            If the argument is a `Transitions` instance, then
            `len(expert_data)` must be at least `expert_batch_size`.
        expert_batch_size: The number of samples in each batch yielded from
            the expert data loader. The discriminator batch size is twice
            this number because each discriminator batch contains a generator
            sample for every expert sample.
        n_disc_updates_per_round: The number of discriminator updates after
            each round of generator updates in `AdversarialTrainer.learn()`.
        log_dir: Directory to store TensorBoard logs, plots, etc. in.
        normalize_obs: Whether to normalize observations with `VecNormalize`.
        normalize_reward: Whether to normalize rewards with `VecNormalize`.
        disc_opt_cls: The optimizer for discriminator training.
        disc_opt_kwargs: Parameters for discriminator training.
        gen_replay_buffer_capacity: The capacity of the generator replay
            buffer (the number of obs-action-obs samples from the generator
            that can be stored). By default this is equal to
            `self.gen_batch_size`, meaning that we sample only from the most
            recent batch of generator samples.
        init_tensorboard: If True, makes various discriminator TensorBoard
            summaries.
        init_tensorboard_graph: If both this and `init_tensorboard` are True,
            then write a TensorBoard graph summary to disk.
        debug_use_ground_truth: If True, use the ground truth reward for
            `self.train_env`. This disables the reward wrapping that would
            normally replace the environment reward with the learned reward.
            This is useful for sanity checking that the policy training is
            functional.
    """
    assert (
        logger.is_configured()
    ), "Requires call to imitation.util.logger.configure"
    self._global_step = 0
    self._disc_step = 0
    self.n_disc_updates_per_round = n_disc_updates_per_round

    if expert_batch_size <= 0:
        raise ValueError(f"expert_batch_size={expert_batch_size} must be positive.")
    self.expert_batch_size = expert_batch_size

    if isinstance(expert_data, types.Transitions):
        if len(expert_data) < expert_batch_size:
            raise ValueError(
                "Provided Transitions instance as `expert_data` argument but "
                "len(expert_data) < expert_batch_size. "
                f"({len(expert_data)} < {expert_batch_size})."
            )
        self.expert_data_loader = th_data.DataLoader(
            expert_data,
            batch_size=expert_batch_size,
            collate_fn=types.transitions_collate_fn,
            shuffle=True,
            drop_last=True,
        )
    else:
        self.expert_data_loader = expert_data
    self._endless_expert_iterator = util.endless_iter(self.expert_data_loader)

    self.debug_use_ground_truth = debug_use_ground_truth
    self.venv = venv
    self.gen_algo = gen_algo
    self._log_dir = log_dir

    # Create graph for optimising/recording stats on discriminator
    self.discrim = discrim.to(self.gen_algo.device)
    self._disc_opt_cls = disc_opt_cls
    self._disc_opt_kwargs = disc_opt_kwargs or {}
    self._init_tensorboard = init_tensorboard
    self._init_tensorboard_graph = init_tensorboard_graph
    self._disc_opt = self._disc_opt_cls(
        self.discrim.parameters(), **self._disc_opt_kwargs
    )

    if self._init_tensorboard:
        logging.info("building summary directory at " + self._log_dir)
        summary_dir = os.path.join(self._log_dir, "summary")
        os.makedirs(summary_dir, exist_ok=True)
        self._summary_writer = thboard.SummaryWriter(summary_dir)

    self.venv_buffering = wrappers.BufferingWrapper(self.venv)
    self.venv_norm_obs = vec_env.VecNormalize(
        self.venv_buffering,
        norm_reward=False,
        norm_obs=normalize_obs,
    )

    if debug_use_ground_truth:
        # Would use an identity reward fn here, but RewardFns can't see rewards.
        self.venv_wrapped = self.venv_norm_obs
        self.gen_callback = None
    else:
        self.venv_wrapped = reward_wrapper.RewardVecEnvWrapper(
            self.venv_norm_obs, self.discrim.predict_reward_train
        )
        self.gen_callback = self.venv_wrapped.make_log_callback()
    self.venv_train = vec_env.VecNormalize(
        self.venv_wrapped, norm_obs=False, norm_reward=normalize_reward
    )
    self.gen_algo.set_env(self.venv_train)

    if gen_replay_buffer_capacity is None:
        gen_replay_buffer_capacity = self.gen_batch_size
    self._gen_replay_buffer = buffer.ReplayBuffer(
        gen_replay_buffer_capacity, self.venv
    )
def __init__(
    self,
    venv: vec_env.VecEnv,
    gen_algo: base_class.BaseAlgorithm,
    discrim: discrim_nets.DiscrimNet,
    expert_data: Union[datasets.Dataset[types.Transitions], types.Transitions],
    *,
    log_dir: str = "output/",
    disc_batch_size: int = 2048,
    disc_minibatch_size: int = 256,
    disc_opt_cls: Type[th.optim.Optimizer] = th.optim.Adam,
    disc_opt_kwargs: Optional[Mapping] = None,
    gen_replay_buffer_capacity: Optional[int] = None,
    init_tensorboard: bool = False,
    init_tensorboard_graph: bool = False,
    debug_use_ground_truth: bool = False,
    device: Union[str, th.device] = "auto",
):
    """Builds AdversarialTrainer.

    Args:
        venv: The vectorized environment to train in.
        gen_algo: The generator RL algorithm that is trained to maximize
            discriminator confusion. The generator batch size
            `self.gen_batch_size` is inferred from `gen_algo.n_steps`.
        discrim: The discriminator network. This will be moved to the same
            device as `gen_algo`.
        expert_data: Either a `Dataset` of expert `Transitions`, or an
            instance of `Transitions` to be automatically converted into a
            `Dataset[Transitions]`.
        log_dir: Directory to store TensorBoard logs, plots, etc. in.
        disc_batch_size: The default number of expert and generator transition
            samples to feed to the discriminator in each call to
            `self.train_disc()`. (Half of the samples are expert and half of
            the samples are generator.)
        disc_minibatch_size: The discriminator minibatch size. Each
            discriminator batch is split into minibatches and an Adam update
            is applied on the gradient resulting from each minibatch. Must
            evenly divide `disc_batch_size`. Must be an even number.
        disc_opt_cls: The optimizer for discriminator training.
        disc_opt_kwargs: Parameters for discriminator training.
        gen_replay_buffer_capacity: The capacity of the generator replay
            buffer (the number of obs-action-obs samples from the generator
            that can be stored). By default this is equal to
            `self.gen_batch_size`, meaning that we sample only from the most
            recent batch of generator samples.
        init_tensorboard: If True, makes various discriminator TensorBoard
            summaries.
        init_tensorboard_graph: If both this and `init_tensorboard` are True,
            then write a TensorBoard graph summary to disk.
        debug_use_ground_truth: If True, use the ground truth reward for
            `self.train_env`. This disables the reward wrapping that would
            normally replace the environment reward with the learned reward.
            This is useful for sanity checking that the policy training is
            functional.
    """
    assert (
        logger.is_configured()
    ), "Requires call to imitation.util.logger.configure"
    self._global_step = 0
    self._disc_step = 0

    assert disc_batch_size % disc_minibatch_size == 0
    assert disc_minibatch_size % 2 == 0, (
        "discriminator minibatch size must be even "
        "(equal split between generator and expert samples)"
    )
    self.disc_batch_size = disc_batch_size
    self.disc_minibatch_size = disc_minibatch_size
    self.debug_use_ground_truth = debug_use_ground_truth
    self.venv = venv
    self.gen_algo = gen_algo
    self._log_dir = log_dir

    # Create graph for optimising/recording stats on discriminator
    self.discrim = discrim.to(self.gen_algo.device)
    self._disc_opt_cls = disc_opt_cls
    self._disc_opt_kwargs = disc_opt_kwargs or {}
    self._init_tensorboard = init_tensorboard
    self._init_tensorboard_graph = init_tensorboard_graph
    self._disc_opt = self._disc_opt_cls(
        self.discrim.parameters(), **self._disc_opt_kwargs
    )

    if self._init_tensorboard:
        logging.info("building summary directory at " + self._log_dir)
        summary_dir = os.path.join(self._log_dir, "summary")
        os.makedirs(summary_dir, exist_ok=True)
        self._summary_writer = thboard.SummaryWriter(summary_dir)

    if debug_use_ground_truth:
        # Would use an identity reward fn here, but RewardFns can't see rewards.
        self.venv_train = self.venv_test = self.venv
    else:
        self.venv_train = reward_wrapper.RewardVecEnvWrapper(
            self.venv, self.discrim.predict_reward_train
        )
        self.venv_test = reward_wrapper.RewardVecEnvWrapper(
            self.venv, self.discrim.predict_reward_test
        )

    self.venv_train_buffering = wrappers.BufferingWrapper(self.venv_train)
    self.venv_train_norm = vec_env.VecNormalize(self.venv_train_buffering)
    self.gen_algo.set_env(self.venv_train_norm)

    if gen_replay_buffer_capacity is None:
        gen_replay_buffer_capacity = self.gen_batch_size
    self._gen_replay_buffer = buffer.ReplayBuffer(
        gen_replay_buffer_capacity, self.venv
    )

    if isinstance(expert_data, types.Transitions):
        # Somehow, pytype doesn't recognize that `expert_data` is Transitions.
        expert_data = datasets.TransitionsDictDatasetAdaptor(
            expert_data,  # pytype: disable=wrong-arg-types
        )
    self._expert_dataset = expert_data

    expert_ds_size = self._expert_dataset.size()
    if expert_ds_size is not None and self.disc_batch_size // 2 > expert_ds_size:
        warnings.warn(
            "The discriminator batch size is more than twice the number of "
            "expert samples. This means that we will be reusing expert samples "
            "every discrim batch.",
            category=RuntimeWarning,
        )