Example #1
    def __init__(self,
                 env,
                 policy,
                 dynamics_model,
                 scope=None,
                 n_itr=500,
                 start_itr=0,
                 batch_size_env_samples=5000,
                 initial_random_samples=None,
                 max_path_length=500,
                 discount=0.99,
                 dynamic_model_max_epochs=(1000, 1000),
                 reinit_model_cycle=0,
                 plot=False,
                 pause_for_plot=False,
                 center_adv=True,
                 positive_adv=False,
                 store_paths=False,
                 whole_paths=True,
                 fixed_horizon=False,
                 sampler_cls=None,
                 sampler_args=None,
                 force_batch_sampler=False,
                 **kwargs):
        """
        :param env: Environment
        :param policy: Policy
        :param dynamics_model: Dynamics Model
        :param scope: Scope for identifying the algorithm. Must be specified if running multiple algorithms
        simultaneously, each using different environments and policies
        :param n_itr: Number of iterations.
        :param start_itr: Starting iteration.
        :param batch_size_env_samples: Number of samples from the environment per iteration.
        :param initial_random_samples: either None -> use the initial policy to sample from the env,
                                       or int -> number of random samples collected in the first iteration
                                                 to train the dynamics model; if provided, no samples are
                                                 generated with the policy in the first iteration
        :param max_path_length: Maximum length of a single rollout.
        :param discount: Discount.
        :param dynamic_model_max_epochs: (2-tuple) number of epochs to train the dynamics model
                                         (n_epochs_at_first_iter, n_epochs_after_first_iter)
        :param reinit_model_cycle: number of iterations before re-initializing the dynamics model (if 0, the dynamics model is never re-initialized)
        :param plot: Plot evaluation run after each iteration.
        :param pause_for_plot: Whether to pause before continuing when plotting.
        :param center_adv: Whether to rescale the advantages so that they have mean 0 and standard deviation 1.
        :param positive_adv: Whether to shift the advantages so that they are always positive. When used in
        conjunction with center_adv the advantages will be standardized before shifting.
        :param store_paths: Whether to save all paths data to the snapshot.
        """
        self.env = env
        self.policy = policy
        self.dynamics_model = dynamics_model
        self.scope = scope
        self.n_itr = n_itr
        self.plot = plot
        self.start_itr = start_itr
        self.batch_size = batch_size_env_samples
        self.initial_random_samples = initial_random_samples
        self.max_path_length = max_path_length
        self.discount = discount
        self.dynamic_model_max_epochs = dynamic_model_max_epochs
        self.pause_for_plot = pause_for_plot
        self.center_adv = center_adv
        self.positive_adv = positive_adv
        self.store_paths = store_paths
        self.whole_paths = whole_paths
        self.fixed_horizon = fixed_horizon
        self.reinit_model = reinit_model_cycle

        # sampler for the environment
        if sampler_cls is None:
            if self.policy.vectorized and not force_batch_sampler:
                sampler_cls = EnvVectorizedSampler
            else:
                sampler_cls = BatchSampler  # TODO: use batch sampler rather than Vectorized Sampler
        if sampler_args is None:
            sampler_args = dict()
        self.env_sampler = sampler_cls(self, **sampler_args)

        # sample processor for the (imaginary) rollouts with the estimated dynamics model
        self.model_sampler_processor = ModelBaseSampler(self).process_samples

        if self.initial_random_samples:
            self.random_sampler = RandomVectorizedSampler(self)
        else:
            self.random_sampler = None
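        # e.g., passing initial_random_samples=5000 makes the first iteration train the
        # dynamics model on 5000 random env samples instead of rollouts collected with
        # the (still untrained) policy (illustrative value)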

        self.init_opt()

Example #2

    def __init__(
            self,
            env,
            policy,
            dynamics_model,
            baseline,
            scope=None,
            n_itr=20,
            start_itr=0,
            # Note that the number of trajectories for a grad update = batch_size
            # Defaults are 10 trajectories of length 100 per gradient update
            batch_size_env_samples=10,
            batch_size_dynamics_samples=100,
            meta_batch_size=None,
            initial_random_samples=None,
            max_path_length_env=100,
            max_path_length_dyn=None,
            num_grad_updates=1,
            discount=0.99,
            entropy_bonus=0,
            gae_lambda=1,
            dynamic_model_max_epochs=(1000, 1000),
            num_maml_steps_per_iter=10,
            reset_from_env_traj=False,
            dynamics_data_buffer_size=1e5,
            retrain_model_when_reward_decreases=True,
            reset_policy_std=False,
            reinit_model_cycle=0,
            center_adv=True,
            positive_adv=False,
            store_paths=False,
            sampler_cls=None,
            sampler_args=None,
            load_policy=None,
            frac_gpu=0.85,
            log_real_performance=True,
            clip_obs=False,
            tailored_exploration=True,
            **kwargs
    ):
        """
        :param env: Environment
        :param policy: Policy
        :param dynamics_model: Dynamics model ensemble
        :param baseline: Baseline
        :param scope: Scope for identifying the algorithm. Must be specified if running multiple algorithms
        simultaneously, each using different environments and policies
        :param n_itr: Number of iterations.
        :param start_itr: Starting iteration.
        :param batch_size_env_samples: Number of policy rollouts for each model/policy
        :param batch_size_dynamics_samples: Number of (imaginary) policy rollouts with each dynamics model
        :param meta_batch_size: Number of meta-tasks (default: meta_batch_size = dynamics_model.num_models)
        :param initial_random_samples: either None -> use the initial policy to sample from the env,
                                       or int -> number of random samples collected in the first iteration
                                                 to train the dynamics model; if provided, no samples are
                                                 generated with the policy in the first iteration
        :param max_path_length_env: Maximum length of a single rollout in the environment
        :param max_path_length_dyn: Maximum path length of a single (imaginary) rollout with the dynamics model
        :param num_grad_updates: Number of fast gradient updates
        :param discount: Discount.
        :param entropy_bonus: Entropy bonus coefficient
        :param gae_lambda: Lambda used for generalized advantage estimation.
        :param dynamic_model_max_epochs: (2-tuple) maximum number of epochs for training the dynamics model
                                         (n_epochs_at_first_iter, n_epochs_after_first_iter)
        :param num_maml_steps_per_iter: number of policy gradients steps before retraining dynamics model
        :param reset_from_env_traj: (boolean) whether to use the real environment observations for resetting the imaginary dynamics model rollouts
        :param dynamics_data_buffer_size: (int) size of the queue/buffer that holds data for the model training
        :param retrain_model_when_reward_decreases: (boolean) if True, stop the inner gradient steps when performance decreases
        :param reset_policy_std: whether to reset the policy std after each iteration
        :param reinit_model_cycle: number of iterations before re-initializing the dynamics model (if 0, the dynamics model is never re-initialized)
        :param store_paths: Whether to save all paths data to the snapshot.
        :param frac_gpu: fraction of the GPU memory that shall be used for this task
        :param log_real_performance: (boolean) if True, the pre-update and post-update performance in the real env is evaluated and logged
        :param clip_obs: (boolean) whether to clip the predicted next observations of the dynamics model in order to avoid numerical instabilities
        """
        self.env = env
        self.policy = policy
        self.dynamics_model = dynamics_model
        self.load_policy = load_policy
        self.baseline = baseline
        self.scope = scope
        self.n_itr = n_itr
        self.start_itr = start_itr
        self.tailored_exploration = tailored_exploration

        # meta batch size and number of dynamics models
        self.num_models = dynamics_model.num_models
        if meta_batch_size is None:
            self.meta_batch_size = self.num_models # set meta_batch_size to number of dynamic models
        else:
            assert meta_batch_size % self.num_models == 0, "meta_batch_size must be a multiple of the number of models in the dynamics ensemble"
            self.meta_batch_size = meta_batch_size
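        # e.g., with a 5-model ensemble: meta_batch_size defaults to 5; passing
        # meta_batch_size=10 also satisfies the assertion above (10 % 5 == 0),
        # whereas 7 would not (illustrative numbers, not prescribed by the code)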

        self.max_path_length = max_path_length_env
        self.max_path_length_dyn = max_path_length_dyn if max_path_length_dyn is not None else max_path_length_env

        # batch_size is the number of trajectories for one fast grad update.
        self.batch_size = batch_size_env_samples * max_path_length_env * self.meta_batch_size # batch_size for env sampling
        self.batch_size_dynamics_samples = batch_size_dynamics_samples * self.max_path_length_dyn * self.meta_batch_size # batch_size for model sampling
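        # Illustrative arithmetic, assuming a 5-model ensemble so that meta_batch_size
        # defaults to 5: with the defaults batch_size_env_samples=10 and
        # max_path_length_env=100, the env batch is 10 * 100 * 5 = 5000 transitions per
        # iteration; with batch_size_dynamics_samples=100 and max_path_length_dyn
        # falling back to 100, the model batch is 100 * 100 * 5 = 50000 imaginary transitions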
        if initial_random_samples is None:
            self.initial_random_samples = self.batch_size
        else:
            self.initial_random_samples = initial_random_samples
        self.discount = discount
        self.entropy_bonus = entropy_bonus
        self.gae_lambda = gae_lambda

        # dynamics model config
        self.dynamic_model_epochs = dynamic_model_max_epochs
        self.num_maml_steps_per_iter = num_maml_steps_per_iter
        self.reset_from_env_traj = reset_from_env_traj
        self.dynamics_data_buffer_size = dynamics_data_buffer_size
        self.retrain_model_when_reward_decreases = retrain_model_when_reward_decreases
        self.reset_policy_std = reset_policy_std
        self.reinit_model = reinit_model_cycle
        self.log_real_performance = log_real_performance

        self.center_adv = center_adv
        self.positive_adv = positive_adv
        self.store_paths = store_paths
        self.num_grad_updates = num_grad_updates # number of gradient steps during training
        self.frac_gpu = frac_gpu

        # setup sampler classes

        # env sampler - get samples from environment using the policy
        if sampler_cls is None:
            sampler_cls = MAMLVectorizedSampler
            sampler_args = dict(n_tasks=self.meta_batch_size, n_envs=self.meta_batch_size * batch_size_env_samples, parallel=False)
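            # e.g., with a 5-model ensemble (meta_batch_size = 5) and the default
            # batch_size_env_samples = 10, this instantiates 5 * 10 = 50 parallel envs
            # (illustrative numbers)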
        self.env_sampler = sampler_cls(self, **sampler_args)

        # model sampler - makes (imaginary) rollouts with the estimated dynamics model ensemble
        self.model_sampler = MAMLModelVectorizedSampler(self, max_path_length=max_path_length_dyn, clip_obs=clip_obs)

        # random sampler - (initially) collects random samples from the environment to train the dynamics model
        if self.initial_random_samples:
            self.random_sampler = RandomVectorizedSampler(self)
        else:
            self.random_sampler = None

Example #3

    def __init__(
            self,
            env,
            policy,
            dynamics_model,
            baseline,
            scope=None,
            n_itr=20,
            start_itr=0,
            # Note that the number of trajectories for a grad update = batch_size
            # Defaults are 10 trajectories of length 100 per gradient update
            batch_size_env_samples=10,
            batch_size_dynamics_samples=100,
            meta_batch_size=None,
            initial_random_samples=None,
            max_path_length_env=100,
            max_path_length_dyn=None,
            num_grad_updates=1,
            discount=0.99,
            gae_lambda=1,
            dynamic_model_epochs=(30, 10),
            num_maml_steps_per_iter=10,
            reset_from_env_traj=False,
            retrain_model_when_reward_decreases=True,
            reset_policy_std=False,
            reinit_model_cycle=0,
            plot=False,
            pause_for_plot=False,
            center_adv=True,
            positive_adv=False,
            store_paths=False,
            whole_paths=True,
            fixed_horizon=False,
            sampler_cls=None,
            sampler_args=None,
            force_batch_sampler=False,
            use_maml=True,
            load_policy=None,
            frac_gpu=0.85,
            log_real_performance=False,
            resample_output_bias=True,
            **kwargs):
        """
        :param env: Environment
        :param policy: Policy
        :param dynamics_model: Dynamics model ensemble
        :param baseline: Baseline
        :param scope: Scope for identifying the algorithm. Must be specified if running multiple algorithms
        simultaneously, each using different environments and policies
        :param n_itr: Number of iterations.
        :param start_itr: Starting iteration.
        :param batch_size_env_samples: Number of policy rollouts for each model/policy
        :param batch_size_dynamics_samples: Number of (imaginary) policy rollouts with each dynamics model
        :param meta_batch_size: Number of meta-tasks (default: meta_batch_size = dynamics_model.num_models)
        :param initial_random_samples: either None -> use the initial policy to sample from the env,
                                       or int -> number of random samples collected in the first iteration
                                                 to train the dynamics model; if provided, no samples are
                                                 generated with the policy in the first iteration
        :param max_path_length_env: Maximum length of a single rollout in the environment
        :param max_path_length_dyn: Maximum path length of a single (imaginary) rollout with the dynamics model
        :param num_grad_updates: Number of fast gradient updates
        :param discount: Discount.
        :param gae_lambda: Lambda used for generalized advantage estimation.
        :param dynamic_model_epochs: (2-tuple) number of epochs to train the dynamics model
                                        (n_epochs_at_first_iter, n_epochs_after_first_iter)
        :param num_maml_steps_per_iter: number of policy gradients steps before retraining dynamics model
        :param reset_from_env_traj: (boolean) whether to use the real environment observations for resetting the imaginary dynamics model rollouts
        :param retrain_model_when_reward_decreases: (boolean) if True, stop the inner gradient steps when performance decreases
        :param reset_policy_std: whether to reset the policy std after each iteration
        :param reinit_model_cycle: number of iterations before re-initializing the dynamics model (if 0, the dynamics model is never re-initialized)
        :param plot: Plot evaluation run after each iteration.
        :param pause_for_plot: Whether to pause before continuing when plotting.
        :param center_adv: Whether to rescale the advantages so that they have mean 0 and standard deviation 1.
        :param positive_adv: Whether to shift the advantages so that they are always positive. When used in
        conjunction with center_adv the advantages will be standardized before shifting.
        :param store_paths: Whether to save all paths data to the snapshot.
        :param frac_gpu: fraction of the GPU memory that shall be used for this task
        :return:
        """
        self.env = env
        self.policy = policy
        self.dynamics_model = dynamics_model
        self.load_policy = load_policy
        self.baseline = baseline
        self.scope = scope
        self.n_itr = n_itr
        self.start_itr = start_itr

        # meta batch size and number of dynamics models
        self.num_models = dynamics_model.num_models
        if meta_batch_size is None:
            self.meta_batch_size = self.num_models  # set meta_batch_size to number of dynamic models
        else:
            assert meta_batch_size % self.num_models == 0, "meta_batch_size must be a multiple of the number of models in the dynamics ensemble"
            self.meta_batch_size = meta_batch_size

        self.max_path_length = max_path_length_env
        self.max_path_length_dyn = max_path_length_dyn if max_path_length_dyn is not None else max_path_length_env

        # batch_size is the number of trajectories for one fast grad update.
        self.batch_size = batch_size_env_samples * max_path_length_env * self.meta_batch_size  # batch_size for env sampling
        self.batch_size_dynamics_samples = batch_size_dynamics_samples * self.max_path_length_dyn * self.meta_batch_size  # batch_size for model sampling
        if initial_random_samples is None:
            self.initial_random_samples = self.batch_size
        else:
            self.initial_random_samples = initial_random_samples
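        # Note: when initial_random_samples is None it falls back to the env batch size
        # computed above, e.g. 10 * 100 * 5 = 5000 random transitions in the first
        # iteration assuming a 5-model ensemble with the default rollout settings
        # (illustrative numbers)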
        self.discount = discount
        self.gae_lambda = gae_lambda

        self.dynamic_model_epochs = dynamic_model_epochs
        self.num_maml_steps_per_iter = num_maml_steps_per_iter
        self.reset_from_env_traj = reset_from_env_traj
        self.retrain_model_when_reward_decreases = retrain_model_when_reward_decreases
        self.reset_policy_std = reset_policy_std
        self.reinit_model = reinit_model_cycle
        self.log_real_performance = log_real_performance

        self.plot = plot
        self.pause_for_plot = pause_for_plot
        self.center_adv = center_adv
        self.positive_adv = positive_adv
        self.store_paths = store_paths
        self.whole_paths = whole_paths
        self.fixed_horizon = fixed_horizon
        self.num_grad_updates = num_grad_updates  # number of gradient steps during training
        self.frac_gpu = frac_gpu
        self.resample_output_bias = resample_output_bias

        # setup sampler classes

        # env sampler - get samples from environment using the policy
        if sampler_cls is None:
            sampler_cls = MAMLVectorizedSampler
            sampler_args = dict(n_tasks=self.meta_batch_size,
                                n_envs=self.meta_batch_size *
                                batch_size_env_samples)
        self.env_sampler = sampler_cls(self, **sampler_args)

        # model sampler - makes (imaginary) rollouts with the estimated dynamics model ensemble
        self.model_sampler = MAMLModelVectorizedSampler(
            self, max_path_length=max_path_length_dyn)

        # random sampler - (initially) collects random samples from the environment to train the dynamics model
        if self.initial_random_samples:
            self.random_sampler = RandomVectorizedSampler(self)
        else:
            self.random_sampler = None