Example #1
    def __init__(self,
                 inner_algo,
                 env,
                 policy,
                 sampler,
                 task_sampler,
                 meta_optimizer,
                 meta_batch_size=40,
                 inner_lr=0.1,
                 outer_lr=1e-3,
                 num_grad_updates=1,
                 meta_evaluator=None,
                 evaluate_every_n_epochs=1):
        self._sampler = sampler

        self.max_episode_length = inner_algo.max_episode_length

        self._meta_evaluator = meta_evaluator
        self._policy = policy
        self._env = env
        self._task_sampler = task_sampler
        self._value_function = copy.deepcopy(inner_algo._value_function)
        self._initial_vf_state = self._value_function.state_dict()
        self._num_grad_updates = num_grad_updates
        self._meta_batch_size = meta_batch_size
        self._inner_algo = inner_algo
        self._inner_optimizer = DifferentiableSGD(self._policy, lr=inner_lr)
        self._meta_optimizer = make_optimizer(meta_optimizer,
                                              module=policy,
                                              lr=_Default(outer_lr),
                                              eps=_Default(1e-5))
        self._evaluate_every_n_epochs = evaluate_every_n_epochs
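
This constructor only wires hyperparameters together; the learning happens in the inner/outer loop it sets up. As a rough illustration of what inner_lr, num_grad_updates, meta_batch_size and the meta optimizer control, here is a minimal, self-contained PyTorch sketch of a MAML-style update on a toy regression task (illustrative only, not garage's implementation; the adapt function and toy data are made up):

import torch

def adapt(theta, task, inner_lr=0.1, num_grad_updates=1):
    """One inner-loop adaptation by differentiable SGD (the role of DifferentiableSGD)."""
    x, y = task
    for _ in range(num_grad_updates):
        loss = ((x @ theta - y) ** 2).mean()
        (grad,) = torch.autograd.grad(loss, theta, create_graph=True)
        theta = theta - inner_lr * grad            # keeps the graph for the meta-gradient
    return theta

# Outer (meta) step across a batch of tasks: the role of meta_optimizer,
# outer_lr and meta_batch_size. Real MAML evaluates the adapted parameters
# on fresh trajectories from each task; this toy version reuses the same data.
theta = torch.zeros(3, requires_grad=True)
meta_opt = torch.optim.Adam([theta], lr=1e-3)
tasks = [(torch.randn(8, 3), torch.randn(8)) for _ in range(4)]
meta_loss = sum(((x @ adapt(theta, (x, y)) - y) ** 2).mean() for x, y in tasks)
meta_opt.zero_grad()
meta_loss.backward()                               # gradient flows through the inner steps
meta_opt.step()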
Example #2
    def __init__(self,
                 optimizer=None,
                 learning_rate=None,
                 max_optimization_epochs=1000,
                 tolerance=1e-6,
                 batch_size=32,
                 callback=None,
                 verbose=False,
                 name='FirstOrderOptimizer'):
        self._opt_fun = None
        self._target = None
        self._callback = callback
        if optimizer is None:
            optimizer = tf.compat.v1.train.AdamOptimizer
        learning_rate = learning_rate or dict(learning_rate=_Default(1e-3))
        if not isinstance(learning_rate, dict):
            learning_rate = dict(learning_rate=learning_rate)

        self._tf_optimizer = optimizer
        self._learning_rate = learning_rate
        self._max_optimization_epochs = max_optimization_epochs
        self._tolerance = tolerance
        self._batch_size = batch_size
        self._verbose = verbose
        self._input_vars = None
        self._train_op = None
        self._name = name
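
Note how this constructor stores only the optimizer class and a learning-rate dict instead of instantiating anything: the concrete tf.compat.v1 optimizer and train op are built later, once a loss is available. A minimal sketch of that deferred-construction pattern (plain TF1-compat code, not garage's FirstOrderOptimizer API; a plain float stands in for the _Default wrapper):

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

optimizer_cls = tf.compat.v1.train.AdamOptimizer   # what `optimizer` defaults to
learning_rate = dict(learning_rate=1e-3)           # plain float in place of _Default(1e-3)

x = tf.compat.v1.get_variable('x', initializer=5.0)
loss = tf.square(x - 2.0)

tf_optimizer = optimizer_cls(**learning_rate)      # instantiated only once a loss exists
train_op = tf_optimizer.minimize(loss)

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    for _ in range(200):
        sess.run(train_op)
    print(sess.run(loss))                          # close to 0 after optimization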
Example #3
    def __init__(self,
                 env,
                 policy,
                 value_function,
                 sampler,
                 task_sampler,
                 inner_lr=_Default(1e-2),
                 outer_lr=1e-3,
                 max_kl_step=0.01,
                 discount=0.99,
                 gae_lambda=1,
                 center_adv=True,
                 positive_adv=False,
                 policy_ent_coeff=0.0,
                 use_softplus_entropy=False,
                 stop_entropy_gradient=False,
                 entropy_method='no_entropy',
                 meta_batch_size=40,
                 num_grad_updates=1,
                 meta_evaluator=None,
                 evaluate_every_n_epochs=1):

        policy_optimizer = OptimizerWrapper(
            (torch.optim.Adam, dict(lr=inner_lr)), policy)
        vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)),
                                        value_function)

        inner_algo = VPG(env.spec,
                         policy,
                         value_function,
                         None,
                         policy_optimizer=policy_optimizer,
                         vf_optimizer=vf_optimizer,
                         num_train_per_epoch=1,
                         discount=discount,
                         gae_lambda=gae_lambda,
                         center_adv=center_adv,
                         positive_adv=positive_adv,
                         policy_ent_coeff=policy_ent_coeff,
                         use_softplus_entropy=use_softplus_entropy,
                         stop_entropy_gradient=stop_entropy_gradient,
                         entropy_method=entropy_method)

        meta_optimizer = (ConjugateGradientOptimizer,
                          dict(max_constraint_value=max_kl_step))

        super().__init__(inner_algo=inner_algo,
                         env=env,
                         policy=policy,
                         sampler=sampler,
                         task_sampler=task_sampler,
                         meta_optimizer=meta_optimizer,
                         meta_batch_size=meta_batch_size,
                         inner_lr=inner_lr,
                         outer_lr=outer_lr,
                         num_grad_updates=num_grad_updates,
                         meta_evaluator=meta_evaluator,
                         evaluate_every_n_epochs=evaluate_every_n_epochs)
Example #4
    def __init__(self,
                 env_spec,
                 policy,
                 baseline,
                 max_path_length=500,
                 discount=0.99,
                 gae_lambda=1,
                 center_adv=True,
                 positive_adv=False,
                 fixed_horizon=False,
                 epsilon=0.5,
                 l2_reg_dual=0.,
                 l2_reg_loss=0.,
                 optimizer=LbfgsOptimizer,
                 optimizer_args=None,
                 dual_optimizer=scipy.optimize.fmin_l_bfgs_b,
                 dual_optimizer_args=None,
                 name='REPS'):
        optimizer_args = optimizer_args or dict(max_opt_itr=_Default(50))
        dual_optimizer_args = dual_optimizer_args or dict(maxiter=50)

        self.policy = policy
        self.max_path_length = max_path_length

        self._env_spec = env_spec
        self._baseline = baseline
        self._discount = discount
        self._gae_lambda = gae_lambda
        self._center_adv = center_adv
        self._positive_adv = positive_adv
        self._fixed_horizon = fixed_horizon

        self._name = name
        self._name_scope = tf.name_scope(self._name)
        self._old_policy = policy.clone('old_policy')
        self._old_policy.parameters = self.policy.parameters

        self._feat_diff = None
        self._param_eta = None
        self._param_v = None
        self._f_dual = None
        self._f_dual_grad = None
        self._f_policy_kl = None
        self._policy_network = None
        self._old_policy_network = None

        self._optimizer = make_optimizer(optimizer, **optimizer_args)
        self._dual_optimizer = dual_optimizer
        self._dual_optimizer_args = dual_optimizer_args
        self._epsilon = float(epsilon)
        self._l2_reg_dual = float(l2_reg_dual)
        self._l2_reg_loss = float(l2_reg_loss)

        self._episode_reward_mean = collections.deque(maxlen=100)
        self.sampler_cls = RaySampler

        self.init_opt()
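
The REPS-specific fields (_param_eta, _param_v, _f_dual, dual_optimizer=scipy.optimize.fmin_l_bfgs_b) refer to the dual problem solved at each iteration. A small numpy sketch of that step, assuming the standard sample-based REPS dual g(eta, v) = eta*epsilon + eta*log mean exp(delta/eta) with delta = r + v.(phi(s') - phi(s)) (illustrative, not copied from garage):

import numpy as np
import scipy.optimize

def dual(params, feat_diff, rewards, epsilon):
    """g(eta, v) = eta*epsilon + eta*log mean exp(delta/eta), delta = r + v.(phi' - phi)."""
    eta, v = params[0], params[1:]
    delta = rewards + feat_diff @ v
    max_d = delta.max()                            # log-sum-exp trick for stability
    return eta * epsilon + max_d + eta * np.log(np.mean(np.exp((delta - max_d) / eta)))

rng = np.random.default_rng(0)
feat_diff = rng.normal(size=(256, 4))              # phi(s') - phi(s) per sample (_feat_diff)
rewards = rng.normal(size=256)
x0 = np.r_[1.0, np.zeros(4)]                       # initial [eta, v]
bounds = [(1e-6, None)] + [(None, None)] * 4       # eta must stay positive
opt, _, _ = scipy.optimize.fmin_l_bfgs_b(dual, x0, args=(feat_diff, rewards, 0.5),
                                         bounds=bounds, approx_grad=True)
eta, v = opt[0], opt[1:]
# Samples are then reweighted by exp(delta/eta) for the weighted policy fit.
weights = np.exp((rewards + feat_diff @ v) / eta)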
Example #5
File: dqn.py Project: thanhkaist/garage
    def __init__(self,
                 env_spec,
                 policy,
                 qf,
                 replay_buffer,
                 exploration_policy=None,
                 steps_per_epoch=20,
                 min_buffer_size=int(1e4),
                 buffer_batch_size=64,
                 rollout_batch_size=1,
                 n_train_steps=50,
                 max_path_length=None,
                 max_eval_path_length=None,
                 qf_lr=_Default(0.001),
                 qf_optimizer=tf.compat.v1.train.AdamOptimizer,
                 discount=1.0,
                 target_network_update_freq=5,
                 grad_norm_clipping=None,
                 double_q=False,
                 reward_scale=1.,
                 smooth_return=True,
                 name='DQN'):
        self._qf_optimizer = qf_optimizer
        self._qf_lr = qf_lr
        self._name = name
        self._target_network_update_freq = target_network_update_freq
        self._grad_norm_clipping = grad_norm_clipping
        self._double_q = double_q

        # clone a target q-function
        self._target_qf = qf.clone('target_qf')

        self._min_buffer_size = min_buffer_size
        self._qf = qf
        self._steps_per_epoch = steps_per_epoch
        self._n_train_steps = n_train_steps
        self._buffer_batch_size = buffer_batch_size
        self._discount = discount
        self._reward_scale = reward_scale
        self._smooth_return = smooth_return
        self.max_path_length = max_path_length
        self._max_eval_path_length = max_eval_path_length

        # used by OffPolicyVectorizedSampler
        self.env_spec = env_spec
        self.rollout_batch_size = rollout_batch_size
        self.replay_buffer = replay_buffer
        self.policy = policy
        self.exploration_policy = exploration_policy

        self.sampler_cls = OffPolicyVectorizedSampler

        self.init_opt()
Example #6
    def __init__(self,
                 env,
                 policy,
                 value_function,
                 inner_lr=_Default(1e-1),
                 outer_lr=1e-3,
                 lr_clip_range=5e-1,
                 max_episode_length=100,
                 discount=0.99,
                 gae_lambda=1.0,
                 center_adv=True,
                 positive_adv=False,
                 policy_ent_coeff=0.0,
                 use_softplus_entropy=False,
                 stop_entropy_gradient=False,
                 entropy_method='no_entropy',
                 meta_batch_size=20,
                 num_grad_updates=1,
                 meta_evaluator=None,
                 evaluate_every_n_epochs=1):

        policy_optimizer = OptimizerWrapper(
            (torch.optim.Adam, dict(lr=inner_lr)), policy)
        vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)),
                                        value_function)

        inner_algo = PPO(env.spec,
                         policy,
                         value_function,
                         policy_optimizer=policy_optimizer,
                         vf_optimizer=vf_optimizer,
                         lr_clip_range=lr_clip_range,
                         max_episode_length=max_episode_length,
                         num_train_per_epoch=1,
                         discount=discount,
                         gae_lambda=gae_lambda,
                         center_adv=center_adv,
                         positive_adv=positive_adv,
                         policy_ent_coeff=policy_ent_coeff,
                         use_softplus_entropy=use_softplus_entropy,
                         stop_entropy_gradient=stop_entropy_gradient,
                         entropy_method=entropy_method)

        super().__init__(inner_algo=inner_algo,
                         env=env,
                         policy=policy,
                         meta_optimizer=torch.optim.Adam,
                         meta_batch_size=meta_batch_size,
                         inner_lr=inner_lr,
                         outer_lr=outer_lr,
                         num_grad_updates=num_grad_updates,
                         meta_evaluator=meta_evaluator,
                         evaluate_every_n_epochs=evaluate_every_n_epochs)
Example #7
File: dqn.py Project: geyang/garage
    def __init__(self,
                 env_spec,
                 policy,
                 qf,
                 replay_buffer,
                 exploration_policy=None,
                 steps_per_epoch=20,
                 min_buffer_size=int(1e4),
                 buffer_batch_size=64,
                 max_episode_length_eval=None,
                 n_train_steps=50,
                 qf_lr=_Default(0.001),
                 qf_optimizer=tf.compat.v1.train.AdamOptimizer,
                 discount=1.0,
                 target_network_update_freq=5,
                 grad_norm_clipping=None,
                 double_q=False,
                 reward_scale=1.,
                 name='DQN'):
        self._qf_optimizer = qf_optimizer
        self._qf_lr = qf_lr
        self._name = name
        self._target_network_update_freq = target_network_update_freq
        self._grad_norm_clipping = grad_norm_clipping
        self._double_q = double_q

        # clone a target q-function
        self._target_qf = qf.clone('target_qf')

        self._min_buffer_size = min_buffer_size
        self._qf = qf
        self._steps_per_epoch = steps_per_epoch
        self._n_train_steps = n_train_steps
        self._buffer_batch_size = buffer_batch_size
        self._discount = discount
        self._reward_scale = reward_scale
        self.max_episode_length = env_spec.max_episode_length
        self._max_episode_length_eval = env_spec.max_episode_length

        if max_episode_length_eval is not None:
            self._max_episode_length_eval = max_episode_length_eval

        self._eval_env = None

        self.env_spec = env_spec
        self.replay_buffer = replay_buffer
        self.policy = policy
        self.exploration_policy = exploration_policy

        self.sampler_cls = LocalSampler
        self.worker_cls = FragmentWorker

        self.init_opt()
Example #8
File: dqn.py Project: seraliilhan/garage
    def __init__(self,
                 env_spec,
                 policy,
                 qf,
                 replay_buffer,
                 exploration_policy=None,
                 steps_per_epoch=20,
                 min_buffer_size=int(1e4),
                 buffer_batch_size=64,
                 rollout_batch_size=1,
                 n_train_steps=50,
                 max_path_length=None,
                 qf_lr=_Default(0.001),
                 qf_optimizer=tf.compat.v1.train.AdamOptimizer,
                 discount=1.0,
                 target_network_update_freq=5,
                 grad_norm_clipping=None,
                 double_q=False,
                 reward_scale=1.,
                 smooth_return=True,
                 name='DQN'):
        self._qf_optimizer = qf_optimizer
        self._qf_lr = qf_lr
        self._name = name
        self._target_network_update_freq = target_network_update_freq
        self._grad_norm_clipping = grad_norm_clipping
        self._double_q = double_q

        # clone a target q-function
        self._target_qf = qf.clone('target_qf')

        super(DQN, self).__init__(env_spec=env_spec,
                                  policy=policy,
                                  qf=qf,
                                  exploration_policy=exploration_policy,
                                  min_buffer_size=min_buffer_size,
                                  n_train_steps=n_train_steps,
                                  steps_per_epoch=steps_per_epoch,
                                  buffer_batch_size=buffer_batch_size,
                                  rollout_batch_size=rollout_batch_size,
                                  replay_buffer=replay_buffer,
                                  max_path_length=max_path_length,
                                  discount=discount,
                                  reward_scale=reward_scale,
                                  smooth_return=smooth_return)
Example #9
    def __init__(
        self,
        env_spec,
        learner,
        *,
        batch_size,
        source=None,
        sampler=None,
        policy_optimizer=torch.optim.Adam,
        policy_lr=_Default(1e-3),
        loss='log_prob',
        minibatches_per_epoch=16,
        name='BC',
    ):
        self._source = source
        self.learner = learner
        self._optimizer = make_optimizer(policy_optimizer,
                                         module=self.learner,
                                         lr=policy_lr)
        if loss not in ('log_prob', 'mse'):
            raise ValueError('Loss should be either "log_prob" or "mse".')
        self._loss = loss
        self._minibatches_per_epoch = minibatches_per_epoch
        self._eval_env = None
        self._batch_size = batch_size
        self._name = name

        # For plotting
        self.policy = self.learner

        # Public fields for sampling.
        self._env_spec = env_spec
        self.exploration_policy = None
        self.policy = None
        self.max_episode_length = env_spec.max_episode_length
        self._sampler = sampler
        if isinstance(self._source, Policy):
            self.exploration_policy = self._source
            self._source = source
            if not isinstance(self._sampler, Sampler):
                raise TypeError('Source is a policy. Missing a sampler.')
        else:
            self._source = itertools.cycle(iter(source))
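
The loss argument above only selects between two objectives. A toy PyTorch sketch of what those two choices compute for a Gaussian learner (shapes and the toy tensors are assumptions, not garage's internal loss code):

import torch
from torch.distributions import Normal

def bc_loss(mean, log_std, expert_actions, loss='log_prob'):
    """Behavioral-cloning loss on one minibatch of expert actions."""
    if loss == 'log_prob':
        dist = Normal(mean, log_std.exp())
        return -dist.log_prob(expert_actions).sum(dim=-1).mean()
    if loss == 'mse':
        return ((mean - expert_actions) ** 2).mean()
    raise ValueError('Loss should be either "log_prob" or "mse".')

mean = torch.zeros(32, 6, requires_grad=True)      # learner's predicted action means
log_std = torch.zeros(32, 6)                       # learner's predicted log-stddevs
expert_actions = torch.randn(32, 6)                # actions from the source/expert
bc_loss(mean, log_std, expert_actions, loss='log_prob').backward()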
Example #10
    def __init__(
        self,
        env_spec,
        learner,
        *,
        batch_size,
        source=None,
        max_path_length=None,
        policy_optimizer=torch.optim.Adam,
        policy_lr=_Default(1e-3),
        loss='log_prob',
        minibatches_per_epoch=16,
        name='BC',
    ):
        self._source = source
        self.learner = learner
        self._optimizer = make_optimizer(policy_optimizer,
                                         module=self.learner,
                                         lr=policy_lr)
        if loss not in ('log_prob', 'mse'):
            raise ValueError('Loss should be either "log_prob" or "mse".')
        self._loss = loss
        self._minibatches_per_epoch = minibatches_per_epoch
        self._eval_env = None
        self._batch_size = batch_size
        self._name = name

        # Public fields for sampling.
        self.env_spec = env_spec
        self.policy = None
        self.max_path_length = max_path_length
        self.sampler_cls = None
        if isinstance(self._source, Policy):
            if max_path_length is None:
                raise ValueError('max_path_length must be passed if the '
                                 'source is a policy')
            self.policy = self._source
            self.sampler_cls = RaySampler
            self._source = source
        else:
            self._source = itertools.cycle(iter(source))
Example #11
    def __init__(
            self,
            env_spec,
            policy,
            qf,
            qf2,
            replay_buffer,
            *,  # Everything after this is numbers.
            target_update_tau=0.01,
            policy_weight_decay=0,
            qf_weight_decay=0,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            policy_lr=_Default(1e-4),
            qf_lr=_Default(1e-3),
            clip_pos_returns=False,
            clip_return=np.inf,
            discount=0.99,
            max_episode_length_eval=None,
            max_action=None,
            name='TD3',
            steps_per_epoch=20,
            n_train_steps=50,
            buffer_batch_size=64,
            min_buffer_size=1e4,
            reward_scale=1.,
            exploration_policy_sigma=0.2,
            actor_update_period=2,
            exploration_policy_clip=0.5,
            exploration_policy=None):
        action_bound = env_spec.action_space.high
        self._max_action = action_bound if max_action is None else max_action
        self._tau = target_update_tau
        self._policy_weight_decay = policy_weight_decay
        self._qf_weight_decay = qf_weight_decay
        self._name = name
        self._clip_pos_returns = clip_pos_returns
        self._clip_return = clip_return

        self._episode_policy_losses = []
        self._episode_qf_losses = []
        self._epoch_ys = []
        self._epoch_qs = []

        self._target_policy = policy.clone('target_policy')
        self._target_qf = qf.clone('target_qf')

        self.qf2 = qf2
        self.qf = qf

        self._exploration_policy_sigma = exploration_policy_sigma
        self._exploration_policy_clip = exploration_policy_clip
        self._actor_update_period = actor_update_period
        self._action_loss = None

        self._target_qf2 = qf2.clone('target_qf2')
        self._policy_optimizer = policy_optimizer
        self._qf_optimizer = qf_optimizer
        self._policy_lr = policy_lr
        self._qf_lr = qf_lr

        self._policy = policy
        self._n_train_steps = n_train_steps

        self._min_buffer_size = min_buffer_size
        self._qf = qf
        self._steps_per_epoch = steps_per_epoch
        self._n_train_steps = n_train_steps
        self._buffer_batch_size = buffer_batch_size
        self._discount = discount
        self._reward_scale = reward_scale
        self.max_episode_length = env_spec.max_episode_length
        self._max_episode_length_eval = env_spec.max_episode_length

        if max_episode_length_eval is not None:
            self._max_episode_length_eval = max_episode_length_eval

        self._eval_env = None

        self._env_spec = env_spec
        self._replay_buffer = replay_buffer
        self.policy = policy
        self.exploration_policy = exploration_policy

        self.sampler_cls = LocalSampler
        self.worker_cls = FragmentWorker

        self._init_opt()
Example #12
File: td3.py Project: yangyi0318/garage
    def __init__(
            self,
            env_spec,
            policy,
            qf1,
            qf2,
            replay_buffer,
            *,  # Everything after this is numbers.
            max_episode_length_eval=None,
            grad_steps_per_env_step,
            exploration_policy,
            uniform_random_policy=None,
            max_action=None,
            target_update_tau=0.005,
            discount=0.99,
            reward_scaling=1.,
            update_actor_interval=2,
            buffer_batch_size=64,
            replay_buffer_size=1e6,
            min_buffer_size=1e4,
            exploration_noise=0.1,
            policy_noise=0.2,
            policy_noise_clip=0.5,
            clip_return=np.inf,
            policy_lr=_Default(1e-4),
            qf_lr=_Default(1e-3),
            policy_optimizer=torch.optim.Adam,
            qf_optimizer=torch.optim.Adam,
            num_evaluation_episodes=10,
            steps_per_epoch=20,
            start_steps=10000,
            update_after=1000,
            use_deterministic_evaluation=False):

        self._env_spec = env_spec
        action_bound = self._env_spec.action_space.high[0]
        self._max_action = action_bound if max_action is None else max_action
        self._action_dim = self._env_spec.action_space.shape[0]
        self._tau = target_update_tau
        self._discount = discount
        self._reward_scaling = reward_scaling
        self._exploration_noise = exploration_noise
        self._policy_noise = policy_noise
        self._policy_noise_clip = policy_noise_clip
        self._clip_return = clip_return
        self._replay_buffer_size = replay_buffer_size
        self._min_buffer_size = min_buffer_size
        self._buffer_batch_size = buffer_batch_size
        self._grad_steps_per_env_step = grad_steps_per_env_step
        self._update_actor_interval = update_actor_interval
        self._steps_per_epoch = steps_per_epoch
        self._start_steps = start_steps
        self._update_after = update_after
        self._num_evaluation_episodes = num_evaluation_episodes
        self.max_episode_length = env_spec.max_episode_length
        self._max_episode_length_eval = env_spec.max_episode_length

        if max_episode_length_eval is not None:
            self._max_episode_length_eval = max_episode_length_eval
        self._use_deterministic_evaluation = use_deterministic_evaluation

        self._episode_policy_losses = []
        self._episode_qf_losses = []
        self._epoch_ys = []
        self._epoch_qs = []
        self._eval_env = None
        self.exploration_policy = exploration_policy
        self._uniform_random_policy = uniform_random_policy
        self.worker_cls = FragmentWorker
        self.sampler_cls = LocalSampler

        self._replay_buffer = replay_buffer
        self.policy = policy
        self._qf_1 = qf1
        self._qf_2 = qf2
        self._target_policy = copy.deepcopy(self.policy)
        self._target_qf_1 = copy.deepcopy(self._qf_1)
        self._target_qf_2 = copy.deepcopy(self._qf_2)

        self._policy_optimizer = make_optimizer(policy_optimizer,
                                                module=self.policy,
                                                lr=policy_lr)
        self._qf_optimizer_1 = make_optimizer(qf_optimizer,
                                              module=self._qf_1,
                                              lr=qf_lr)
        self._qf_optimizer_2 = make_optimizer(qf_optimizer,
                                              module=self._qf_2,
                                              lr=qf_lr)
        self._actor_loss = torch.zeros(1)
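
The TD3-specific arguments here (policy_noise, policy_noise_clip, update_actor_interval, target_update_tau) map onto target policy smoothing, delayed actor updates and Polyak averaging. A self-contained PyTorch sketch of one such update step, with made-up toy networks and batch (illustrative, not the library's train loop):

import copy
import torch
import torch.nn as nn

obs_dim, act_dim, gamma, tau = 4, 2, 0.99, 0.005
policy = nn.Sequential(nn.Linear(obs_dim, 32), nn.Tanh(),
                       nn.Linear(32, act_dim), nn.Tanh())
qf1 = nn.Sequential(nn.Linear(obs_dim + act_dim, 32), nn.Tanh(), nn.Linear(32, 1))
qf2 = copy.deepcopy(qf1)
target_policy, target_qf1, target_qf2 = map(copy.deepcopy, (policy, qf1, qf2))
policy_opt = torch.optim.Adam(policy.parameters(), lr=1e-4)
qf_opt = torch.optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=1e-3)

def td3_update(obs, act, rew, next_obs, step,
               policy_noise=0.2, policy_noise_clip=0.5, update_actor_interval=2):
    with torch.no_grad():
        # Target policy smoothing: clipped Gaussian noise on the target action.
        noise = (torch.randn_like(act) * policy_noise).clamp(-policy_noise_clip,
                                                             policy_noise_clip)
        next_act = (target_policy(next_obs) + noise).clamp(-1.0, 1.0)
        next_sa = torch.cat([next_obs, next_act], dim=-1)
        # Clipped double-Q target: the minimum of the two target critics.
        y = rew + gamma * torch.min(target_qf1(next_sa), target_qf2(next_sa)).squeeze(-1)
    sa = torch.cat([obs, act], dim=-1)
    qf_loss = ((qf1(sa).squeeze(-1) - y) ** 2).mean() + ((qf2(sa).squeeze(-1) - y) ** 2).mean()
    qf_opt.zero_grad()
    qf_loss.backward()
    qf_opt.step()
    if step % update_actor_interval == 0:
        # Delayed actor update, then Polyak (soft) target updates with tau.
        actor_loss = -qf1(torch.cat([obs, policy(obs)], dim=-1)).mean()
        policy_opt.zero_grad()
        actor_loss.backward()
        policy_opt.step()
        for net, target in ((policy, target_policy), (qf1, target_qf1), (qf2, target_qf2)):
            for p, tp in zip(net.parameters(), target.parameters()):
                tp.data.mul_(1 - tau).add_(tau * p.data)

batch = (torch.randn(64, obs_dim), torch.rand(64, act_dim) * 2 - 1,
         torch.randn(64), torch.randn(64, obs_dim))
td3_update(*batch, step=0)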
Example #13
File: td3.py Project: seraliilhan/garage
    def __init__(
            self,
            env_spec,
            policy,
            qf,
            qf2,
            replay_buffer,
            *,  # Everything after this is numbers.
            target_update_tau=0.01,
            policy_weight_decay=0,
            qf_weight_decay=0,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            policy_lr=_Default(1e-4),
            qf_lr=_Default(1e-3),
            clip_pos_returns=False,
            clip_return=np.inf,
            discount=0.99,
            max_action=None,
            name='TD3',
            steps_per_epoch=20,
            max_path_length=None,
            max_eval_path_length=None,
            n_train_steps=50,
            buffer_batch_size=64,
            min_buffer_size=1e4,
            rollout_batch_size=1,
            reward_scale=1.,
            exploration_policy_sigma=0.2,
            actor_update_period=2,
            exploration_policy_clip=0.5,
            smooth_return=True,
            exploration_policy=None):
        action_bound = env_spec.action_space.high
        self._max_action = action_bound if max_action is None else max_action
        self._tau = target_update_tau
        self._policy_weight_decay = policy_weight_decay
        self._qf_weight_decay = qf_weight_decay
        self._name = name
        self._clip_pos_returns = clip_pos_returns
        self._clip_return = clip_return
        self._success_history = deque(maxlen=100)

        self._episode_rewards = []
        self._episode_policy_losses = []
        self._episode_qf_losses = []
        self._epoch_ys = []
        self._epoch_qs = []

        self._target_policy = policy.clone('target_policy')
        self._target_qf = qf.clone('target_qf')

        self.qf2 = qf2
        self._exploration_policy_sigma = exploration_policy_sigma
        self._exploration_policy_clip = exploration_policy_clip
        self._actor_update_period = actor_update_period
        self._action_loss = None

        self._target_qf2 = qf2.clone('target_qf2')
        self._policy_optimizer = policy_optimizer
        self._qf_optimizer = qf_optimizer
        self._policy_lr = policy_lr
        self._qf_lr = qf_lr

        super(TD3, self).__init__(env_spec=env_spec,
                                  policy=policy,
                                  qf=qf,
                                  replay_buffer=replay_buffer,
                                  discount=discount,
                                  steps_per_epoch=steps_per_epoch,
                                  max_path_length=max_path_length,
                                  max_eval_path_length=max_eval_path_length,
                                  n_train_steps=n_train_steps,
                                  buffer_batch_size=buffer_batch_size,
                                  min_buffer_size=min_buffer_size,
                                  rollout_batch_size=rollout_batch_size,
                                  reward_scale=reward_scale,
                                  smooth_return=smooth_return,
                                  exploration_policy=exploration_policy)
Example #14
File: dqn.py Project: ziyiwu9494/garage
    def __init__(
            self,
            env_spec,
            policy,
            qf,
            replay_buffer,
            sampler,
            exploration_policy=None,
            eval_env=None,
            double_q=True,
            qf_optimizer=torch.optim.Adam,
            *,  # Everything after this is numbers.
            steps_per_epoch=20,
            n_train_steps=50,
            max_episode_length_eval=None,
            deterministic_eval=False,
            buffer_batch_size=64,
            min_buffer_size=int(1e4),
            num_eval_episodes=10,
            discount=0.99,
            qf_lr=_Default(1e-3),
            clip_rewards=None,
            clip_gradient=10,
            target_update_freq=5,
            reward_scale=1.):
        self._clip_reward = clip_rewards
        self._clip_grad = clip_gradient

        self._steps_per_epoch = steps_per_epoch
        self._target_update_freq = target_update_freq
        self._episode_qf_losses = []
        self._epoch_ys = []
        self._epoch_qs = []

        self._policy = policy
        self._qf = qf
        self._n_train_steps = n_train_steps

        self._min_buffer_size = min_buffer_size
        self._qf = qf
        self._steps_per_epoch = steps_per_epoch
        self._n_train_steps = n_train_steps
        self._buffer_batch_size = buffer_batch_size
        self._double_q = double_q
        self._discount = discount
        self._reward_scale = reward_scale
        self.max_episode_length = env_spec.max_episode_length
        self._max_episode_length_eval = (max_episode_length_eval
                                         or self.max_episode_length)
        self._episode_reward_mean = collections.deque(maxlen=100)
        self._num_eval_episodes = num_eval_episodes
        self._deterministic_eval = deterministic_eval

        self.env_spec = env_spec
        self.replay_buffer = replay_buffer
        self.policy = policy
        self.exploration_policy = exploration_policy

        self._target_qf = copy.deepcopy(self._qf)
        self._qf_optimizer = make_optimizer(qf_optimizer,
                                            module=self._qf,
                                            lr=qf_lr)
        self._eval_env = eval_env

        self._sampler = sampler
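
In this PyTorch variant, double_q, target_update_freq and clip_gradient control the target computation, the periodic hard sync of the target network and gradient clipping. A minimal PyTorch sketch of that training step (illustrative; the network sizes and Huber loss choice are assumptions, not garage's exact code):

import copy
import torch
import torch.nn as nn

obs_dim, n_actions, gamma = 4, 3, 0.99
qf = nn.Sequential(nn.Linear(obs_dim, 32), nn.ReLU(), nn.Linear(32, n_actions))
target_qf = copy.deepcopy(qf)
optimizer = torch.optim.Adam(qf.parameters(), lr=1e-3)

def train_once(obs, actions, rewards, next_obs, step,
               double_q=True, target_update_freq=5, clip_gradient=10):
    with torch.no_grad():
        if double_q:
            # Double DQN: action chosen by the online net, valued by the target net.
            best = qf(next_obs).argmax(dim=1, keepdim=True)
            next_q = target_qf(next_obs).gather(1, best).squeeze(1)
        else:
            next_q = target_qf(next_obs).max(dim=1).values
        target = rewards + gamma * next_q
    q = qf(obs).gather(1, actions.unsqueeze(1)).squeeze(1)
    loss = nn.functional.smooth_l1_loss(q, target)
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(qf.parameters(), clip_gradient)   # clip_gradient
    optimizer.step()
    if step % target_update_freq == 0:
        target_qf.load_state_dict(qf.state_dict())   # periodic hard target sync

train_once(torch.randn(64, obs_dim), torch.randint(n_actions, (64,)),
           torch.randn(64), torch.randn(64, obs_dim), step=0)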
Example #15
File: ddpg.py Project: ziyiwu9494/garage
    def __init__(
            self,
            env_spec,
            policy,
            qf,
            replay_buffer,
            sampler,
            *,  # Everything after this is numbers.
            steps_per_epoch=20,
            n_train_steps=50,
            buffer_batch_size=64,
            min_buffer_size=int(1e4),
            max_episode_length_eval=None,
            exploration_policy=None,
            target_update_tau=0.01,
            discount=0.99,
            policy_weight_decay=0,
            qf_weight_decay=0,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            policy_lr=_Default(1e-4),
            qf_lr=_Default(1e-3),
            clip_pos_returns=False,
            clip_return=np.inf,
            max_action=None,
            reward_scale=1.,
            name='DDPG'):
        action_bound = env_spec.action_space.high
        self._max_action = action_bound if max_action is None else max_action
        self._tau = target_update_tau
        self._policy_weight_decay = policy_weight_decay
        self._qf_weight_decay = qf_weight_decay
        self._name = name
        self._clip_pos_returns = clip_pos_returns
        self._clip_return = clip_return

        self._episode_policy_losses = []
        self._episode_qf_losses = []
        self._epoch_ys = []
        self._epoch_qs = []

        self._target_policy = policy.clone('target_policy')
        self._target_qf = qf.clone('target_qf')
        self._policy_optimizer = policy_optimizer
        self._qf_optimizer = qf_optimizer
        self._policy_lr = policy_lr
        self._qf_lr = qf_lr

        self._min_buffer_size = min_buffer_size
        self._qf = qf
        self._steps_per_epoch = steps_per_epoch
        self._n_train_steps = n_train_steps
        self._buffer_batch_size = buffer_batch_size
        self._discount = discount
        self._reward_scale = reward_scale
        self.max_episode_length = env_spec.max_episode_length
        self._max_episode_length_eval = max_episode_length_eval

        if max_episode_length_eval is None:
            self._max_episode_length_eval = env_spec.max_episode_length

        self._eval_env = None

        self._env_spec = env_spec
        self._replay_buffer = replay_buffer
        self.policy = policy
        self.exploration_policy = exploration_policy

        self._sampler = sampler

        self._init_opt()
Example #16
    def __init__(
            self,
            env_spec,
            policy,
            qf,
            replay_buffer,
            *,  # Everything after this is numbers.
            max_path_length,
            steps_per_epoch=20,
            n_train_steps=50,
            max_eval_path_length=None,
            buffer_batch_size=64,
            min_buffer_size=int(1e4),
            exploration_policy=None,
            target_update_tau=0.01,
            discount=0.99,
            policy_weight_decay=0,
            qf_weight_decay=0,
            policy_optimizer=torch.optim.Adam,
            qf_optimizer=torch.optim.Adam,
            policy_lr=_Default(1e-4),
            qf_lr=_Default(1e-3),
            clip_pos_returns=False,
            clip_return=np.inf,
            max_action=None,
            reward_scale=1.,
            smooth_return=True):
        action_bound = env_spec.action_space.high
        self._tau = target_update_tau
        self._policy_weight_decay = policy_weight_decay
        self._qf_weight_decay = qf_weight_decay
        self._clip_pos_returns = clip_pos_returns
        self._clip_return = clip_return
        self._max_action = action_bound if max_action is None else max_action

        self._steps_per_epoch = steps_per_epoch
        self._episode_rewards = []
        self._episode_policy_losses = []
        self._episode_qf_losses = []
        self._epoch_ys = []
        self._epoch_qs = []

        self._policy = policy
        self._qf = qf
        self._n_train_steps = n_train_steps

        self._min_buffer_size = min_buffer_size
        self._qf = qf
        self._steps_per_epoch = steps_per_epoch
        self._n_train_steps = n_train_steps
        self._buffer_batch_size = buffer_batch_size
        self._discount = discount
        self._reward_scale = reward_scale
        self._smooth_return = smooth_return
        self.max_path_length = max_path_length
        self._max_eval_path_length = max_eval_path_length

        self.env_spec = env_spec
        self.replay_buffer = replay_buffer
        self.policy = policy
        self.exploration_policy = exploration_policy

        self._target_policy = copy.deepcopy(self.policy)
        self._target_qf = copy.deepcopy(self._qf)
        self._policy_optimizer = make_optimizer(policy_optimizer,
                                                module=self.policy,
                                                lr=policy_lr)
        self._qf_optimizer = make_optimizer(qf_optimizer,
                                            module=self._qf,
                                            lr=qf_lr)
        self._eval_env = None
        self.sampler_cls = LocalSampler
Example #17
File: ddpg.py Project: irisliucy/garage
    def __init__(
            self,
            env_spec,
            policy,
            qf,
            replay_buffer,
            *,  # Everything after this is numbers.
            steps_per_epoch=20,
            n_train_steps=50,
            max_path_length=None,
            max_eval_path_length=None,
            buffer_batch_size=64,
            min_buffer_size=int(1e4),
            rollout_batch_size=1,
            exploration_policy=None,
            target_update_tau=0.01,
            discount=0.99,
            policy_weight_decay=0,
            qf_weight_decay=0,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            policy_lr=_Default(1e-4),
            qf_lr=_Default(1e-3),
            clip_pos_returns=False,
            clip_return=np.inf,
            max_action=None,
            reward_scale=1.,
            smooth_return=True,
            name='DDPG'):
        action_bound = env_spec.action_space.high
        self._max_action = action_bound if max_action is None else max_action
        self._tau = target_update_tau
        self._policy_weight_decay = policy_weight_decay
        self._qf_weight_decay = qf_weight_decay
        self._name = name
        self._clip_pos_returns = clip_pos_returns
        self._clip_return = clip_return
        self._success_history = deque(maxlen=100)

        self._episode_rewards = []
        self._episode_policy_losses = []
        self._episode_qf_losses = []
        self._epoch_ys = []
        self._epoch_qs = []

        self._target_policy = policy.clone('target_policy')
        self._target_qf = qf.clone('target_qf')
        self._policy_optimizer = policy_optimizer
        self._qf_optimizer = qf_optimizer
        self._policy_lr = policy_lr
        self._qf_lr = qf_lr

        self._min_buffer_size = min_buffer_size
        self._qf = qf
        self._steps_per_epoch = steps_per_epoch
        self._n_train_steps = n_train_steps
        self._buffer_batch_size = buffer_batch_size
        self._discount = discount
        self._reward_scale = reward_scale
        self._smooth_return = smooth_return
        self.max_path_length = max_path_length
        self._max_eval_path_length = max_eval_path_length

        # used by OffPolicyVectorizedSampler
        self.rollout_batch_size = rollout_batch_size
        self.env_spec = env_spec
        self.replay_buffer = replay_buffer
        self.policy = policy
        self.exploration_policy = exploration_policy

        self.sampler_cls = OffPolicyVectorizedSampler

        self.init_opt()
Example #18
    def __init__(
            self,
            env_spec,
            policy,
            qf,
            replay_buffer,
            *,  # Everything after this is numbers.
            steps_per_epoch=20,
            n_train_steps=50,
            max_path_length=None,
            max_eval_path_length=None,
            buffer_batch_size=64,
            min_buffer_size=int(1e4),
            rollout_batch_size=1,
            exploration_policy=None,
            target_update_tau=0.01,
            discount=0.99,
            policy_weight_decay=0,
            qf_weight_decay=0,
            policy_optimizer=torch.optim.Adam,
            qf_optimizer=torch.optim.Adam,
            policy_lr=_Default(1e-4),
            qf_lr=_Default(1e-3),
            clip_pos_returns=False,
            clip_return=np.inf,
            max_action=None,
            reward_scale=1.,
            smooth_return=True):
        action_bound = env_spec.action_space.high
        self._tau = target_update_tau
        self._policy_weight_decay = policy_weight_decay
        self._qf_weight_decay = qf_weight_decay
        self._clip_pos_returns = clip_pos_returns
        self._clip_return = clip_return
        self._max_action = action_bound if max_action is None else max_action

        self._success_history = deque(maxlen=100)
        self._episode_rewards = []
        self._episode_policy_losses = []
        self._episode_qf_losses = []
        self._epoch_ys = []
        self._epoch_qs = []

        super().__init__(env_spec=env_spec,
                         policy=policy,
                         qf=qf,
                         n_train_steps=n_train_steps,
                         steps_per_epoch=steps_per_epoch,
                         max_path_length=max_path_length,
                         max_eval_path_length=max_eval_path_length,
                         buffer_batch_size=buffer_batch_size,
                         min_buffer_size=min_buffer_size,
                         rollout_batch_size=rollout_batch_size,
                         exploration_policy=exploration_policy,
                         replay_buffer=replay_buffer,
                         use_target=True,
                         discount=discount,
                         reward_scale=reward_scale,
                         smooth_return=smooth_return)

        self._target_policy = copy.deepcopy(self.policy)
        self._target_qf = copy.deepcopy(self.qf)
        self._policy_optimizer = make_optimizer(policy_optimizer,
                                                module=self.policy,
                                                lr=policy_lr)
        self._qf_optimizer = make_optimizer(qf_optimizer,
                                            module=self.qf,
                                            lr=qf_lr)