Example #1
    def __init__(self,
                 name,
                 args,
                 env_args,
                 sess_config=None,
                 save=True,
                 log=False,
                 log_tensorboard=False,
                 log_params=False,
                 log_stats=False,
                 device=None):
        super().__init__(name,
                         args,
                         env_args,
                         sess_config=sess_config,
                         save=save,
                         log=log,
                         log_tensorboard=log_tensorboard,
                         log_params=log_params,
                         log_stats=log_stats,
                         device=device)
        del self.buffer

        outside_value = float(args['ac']['policy_end_lr'])
        points = [(0, float(args['ac']['policy_lr'])),
                  (args['ac']['policy_decay_steps'], outside_value)]
        self.policy_lr_scheduler = PiecewiseSchedule(
            points, outside_value=outside_value)

        outside_value = float(args['ac']['value_end_lr'])
        points = [(0, float(args['ac']['value_lr'])),
                  (args['ac']['value_decay_steps'], outside_value)]
        self.value_lr_scheduler = PiecewiseSchedule(
            points, outside_value=outside_value)
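
All of the examples on this page construct a PiecewiseSchedule from a list of (step, value) points plus an outside_value. For orientation, here is a minimal sketch of such a schedule, assuming linear interpolation between consecutive points; it is illustrative only and not the utility class used by these projects.

# Minimal sketch of a piecewise-linear schedule (illustrative, not the
# projects' actual PiecewiseSchedule implementation).
class SimplePiecewiseSchedule:
    def __init__(self, endpoints, outside_value=None):
        # endpoints: [(step, value), ...] sorted by step
        assert all(l[0] < r[0] for l, r in zip(endpoints[:-1], endpoints[1:]))
        self._endpoints = endpoints
        self._outside_value = outside_value

    def value(self, t):
        for (lt, lv), (rt, rv) in zip(self._endpoints[:-1], self._endpoints[1:]):
            if lt <= t < rt:
                frac = (t - lt) / (rt - lt)
                return lv + frac * (rv - lv)
        return self._outside_value  # t falls outside the covered range

# Hypothetical policy-lr points in the spirit of the __init__ above
sched = SimplePiecewiseSchedule([(0, 3e-4), (1_000_000, 1e-5)], outside_value=1e-5)
print(sched.value(0), sched.value(500_000), sched.value(2_000_000))
# -> roughly 3e-4, 1.55e-4, 1e-5
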
Example #2
    def __init__(self, args, state_shape, action_dim):
        super().__init__(args, state_shape, action_dim)
        self.data_structure = None            

        # params for prioritized replay
        self.alpha = float(args['alpha']) if 'alpha' in args else .5
        self.beta = float(args['beta0']) if 'beta0' in args else .4
        self.beta_schedule = PiecewiseSchedule([(0, args['beta0']), (float(args['beta_steps']), 1.)], 
                                                outside_value=1.)
        self.epsilon = float(args['epsilon']) if 'epsilon' in args else 1e-4

        self.top_priority = 2.
        self.to_update_priority = args['to_update_priority'] if 'to_update_priority' in args else True

        self.sample_i = 0   # count how many times self.sample is called

        init_buffer(self.memory, self.capacity, state_shape, action_dim, self.n_steps == 1)

        # Code for single agent
        if self.n_steps > 1:
            self.tb_capacity = args['tb_capacity']
            self.tb_idx = 0
            self.tb_full = False
            self.tb = {}
            init_buffer(self.tb, self.tb_capacity, state_shape, action_dim, True)
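
The beta_schedule above anneals the importance-sampling exponent from beta0 toward 1 over beta_steps calls to sample (see _update_beta in Example #12). A small sketch of that annealing with made-up numbers; beta0=0.4 and beta_steps=100000 are illustrative, not this project's config.

# Sketch of the beta annealing driven by sample_i; numbers are illustrative.
beta0, beta_steps = 0.4, 100_000

def beta_at(sample_i):
    frac = min(sample_i / beta_steps, 1.)     # clipped at 1., like outside_value=1.
    return beta0 + frac * (1. - beta0)

for i in (0, 25_000, 50_000, 100_000, 200_000):
    print(i, round(beta_at(i), 3))
# 0.4, 0.55, 0.7, 1.0, 1.0 -- IS corrections grow stronger as training proceeds
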
Example #3
File: per.py  Project: xlnwel/d2rl
    def _add_attributes(self):
        super()._add_attributes()
        self._top_priority = 1.
        self._data_structure = None
        self._use_is_ratio = getattr(self, '_use_is_ratio', True)
        self._beta = float(getattr(self, 'beta0', .4))
        if getattr(self, '_beta_schedule', None):
            assert isinstance(self._beta_schedule, list)
            self._beta_schedule = PiecewiseSchedule(self._beta_schedule)
        self._sample_i = 0   # count how many times self._sample is called
Example #4

    def __init__(self, args, state_space, action_dim):
        super().__init__(args, state_space, action_dim)
        self.data_structure = None

        # params for prioritized replay
        self.alpha = float(args['alpha']) if 'alpha' in args else .5
        self.beta = float(args['beta0']) if 'beta0' in args else .4
        self.beta_schedule = PiecewiseSchedule(
            [(0, args['beta0']), (float(args['beta_steps']), 1.)],
            outside_value=1.)
        self.epsilon = float(args['epsilon']) if 'epsilon' in args else 1e-4

        self.top_priority = 2.

        self.sample_i = 0  # count how many times self.sample is called
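
The alpha, epsilon, and top_priority attributes above feed the priority computation that other snippets on this page call _compute_priority. That method is not shown here, so the following is only a plausible proportional-PER sketch; all names and numbers are hypothetical.

import numpy as np

# Hypothetical priority computation consistent with the attributes above:
# p = (|TD error| + epsilon) ** alpha, optionally capped by top_priority.
def compute_priority(td_error, alpha=.5, epsilon=1e-4, top_priority=2.):
    priority = (np.abs(td_error) + epsilon) ** alpha
    return np.minimum(priority, top_priority)

print(compute_priority(np.array([0., .25, 1., 10.])))
# -> [0.01, ~0.5, ~1.0, 2.0]; the last value is clipped by top_priority
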
Example #5
    def _schedule_act_epsilon(self, env):
        """ Schedules action epsilon """
        if self._schedule_act_eps:
            if isinstance(self._act_eps, (list, tuple)):
                logger.info(f'Schedule action epsilon: {self._act_eps}')
                self._act_eps = PiecewiseSchedule(self._act_eps)
            else:
                self._act_eps = compute_act_eps(
                    self._act_eps_type, self._act_eps,
                    getattr(self, '_id', None),
                    getattr(self, '_n_workers', getattr(env, 'n_workers', 1)),
                    env.n_envs)
                if env.action_shape != ():
                    self._act_eps = self._act_eps.reshape(-1, 1)
                self._schedule_act_eps = False  # not run-time scheduling
        print('Action epsilon:', np.reshape(self._act_eps, -1))
        if not isinstance(getattr(self, '_act_eps', None), PiecewiseSchedule):
            self._act_eps = tf.convert_to_tensor(self._act_eps, tf.float32)
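
When _act_eps stays a PiecewiseSchedule, it is queried at run time (see _get_eps in Example #7). Below is a hedged sketch of how a scheduled epsilon is typically consumed in an epsilon-greedy acting loop; the schedule points and the greedy action are made up for illustration.

import numpy as np

# Consuming a scheduled action epsilon while acting (illustrative points).
steps, values = zip(*[(0, 1.0), (100_000, 0.1), (1_000_000, 0.01)])

rng = np.random.default_rng(0)
n_actions = 4
for env_step in (0, 50_000, 500_000, 2_000_000):
    eps = np.interp(env_step, steps, values)   # piecewise-linear, clamped outside
    greedy_action = 2                          # stand-in for argmax_a Q(s, a)
    action = rng.integers(n_actions) if rng.random() < eps else greedy_action
    print(env_step, round(float(eps), 3), int(action))
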
Example #6
    def __init__(self,
                 name,
                 args,
                 env_args,
                 buffer_args,
                 sess_config=None,
                 save=False,
                 log=False,
                 log_tensorboard=False,
                 log_params=False,
                 log_stats=False,
                 device=None):
        self.critic_loss_type = args['critic']['loss_type']
        self.polyak = args['polyak'] if 'polyak' in args else .995

        # learning rate schedule
        self.schedule_lr = 'schedule_lr' in args and args['schedule_lr']
        if self.schedule_lr:
            self.actor_lr_scheduler = PiecewiseSchedule([(0, 1e-4),
                                                         (150000, 1e-4),
                                                         (300000, 5e-5)],
                                                        outside_value=5e-5)
            self.critic_lr_scheduler = PiecewiseSchedule([(0, 3e-4),
                                                          (150000, 3e-4),
                                                          (300000, 5e-5)],
                                                         outside_value=5e-5)

        super().__init__(name,
                         args,
                         env_args,
                         buffer_args,
                         sess_config=sess_config,
                         save=save,
                         log=log,
                         log_tensorboard=log_tensorboard,
                         log_params=log_params,
                         log_stats=log_stats,
                         device=device)
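
The two schedulers created above only produce Python floats; in this TF1-style codebase the values typically reach the graph through learning-rate placeholders fed at each training step (Example #9's _get_feeddict shows the pattern). A minimal, self-contained sketch of that wiring, with a toy loss and hard-coded stand-ins for the scheduler values:

import tensorflow as tf
tf.compat.v1.disable_eager_execution()

# Toy graph: the learning rate is a placeholder fed from a Python-side schedule.
lr_ph = tf.compat.v1.placeholder(tf.float32, (), name='actor_lr')
w = tf.Variable(1.0)
loss = tf.square(w)
train_op = tf.compat.v1.train.AdamOptimizer(lr_ph).minimize(loss)

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    for t in (0, 150_000, 300_000):
        lr = 1e-4 if t <= 150_000 else 5e-5   # stand-in for actor_lr_scheduler.value(t)
        sess.run(train_op, feed_dict={lr_ph: lr})
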
Example #7
class ActionScheduler:
    def _setup_action_schedule(self, env):
        # eval action epsilon and temperature
        self._eval_act_eps = tf.convert_to_tensor(
            getattr(self, '_eval_act_eps', 0), tf.float32)
        self._eval_act_temp = tf.convert_to_tensor(
            getattr(self, '_eval_act_temp', .5), tf.float32)

        self._schedule_act_eps = getattr(self, '_schedule_act_eps', False)
        self._schedule_act_temp = getattr(self, '_schedule_act_temp', False)

        self._schedule_act_epsilon(env)
        self._schedule_act_temperature(env)

    def _schedule_act_epsilon(self, env):
        """ Schedules action epsilon """
        if self._schedule_act_eps:
            if isinstance(self._act_eps, (list, tuple)):
                logger.info(f'Schedule action epsilon: {self._act_eps}')
                self._act_eps = PiecewiseSchedule(self._act_eps)
            else:
                self._act_eps = compute_act_eps(
                    self._act_eps_type, self._act_eps,
                    getattr(self, '_id', None),
                    getattr(self, '_n_workers', getattr(env, 'n_workers', 1)),
                    env.n_envs)
                if env.action_shape != ():
                    self._act_eps = self._act_eps.reshape(-1, 1)
                self._schedule_act_eps = False  # not run-time scheduling
        print('Action epsilon:', np.reshape(self._act_eps, -1))
        if not isinstance(getattr(self, '_act_eps', None), PiecewiseSchedule):
            self._act_eps = tf.convert_to_tensor(self._act_eps, tf.float32)

    def _schedule_act_temperature(self, env):
        """ Schedules action temperature """
        if self._schedule_act_temp:
            self._act_temp = compute_act_temp(
                self._min_temp, self._max_temp,
                getattr(self, '_n_exploit_envs',
                        0), getattr(self, '_id', None),
                getattr(self, '_n_workers', getattr(env, 'n_workers', 1)),
                env.n_envs)
            self._act_temp = self._act_temp.reshape(-1, 1)
            self._schedule_act_temp = False  # not run-time scheduling
        else:
            self._act_temp = getattr(self, '_act_temp', 1)
        print('Action temperature:', np.reshape(self._act_temp, -1))
        self._act_temp = tf.convert_to_tensor(self._act_temp, tf.float32)

    def _get_eps(self, evaluation):
        """ Gets action epsilon """
        if evaluation:
            eps = self._eval_act_eps
        else:
            if self._schedule_act_eps:
                eps = self._act_eps.value(self.env_step)
                self.store(act_eps=eps)
                eps = tf.convert_to_tensor(eps, tf.float32)
            else:
                eps = self._act_eps
        return eps

    def _get_temp(self, evaluation):
        """ Gets action temperature """
        return self._eval_act_temp if evaluation else self._act_temp
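
compute_act_eps and compute_act_temp are not shown on this page. For the epsilon branch, a common per-worker assignment is the Ape-X recipe, where worker i of N explores with eps_i = base_eps ** (1 + i / (N - 1) * alpha); the sketch below is an assumption in that spirit, not the repository's function.

import numpy as np

# Hypothetical per-env exploration rates in the Ape-X style; base_eps and
# alpha are the usual published defaults, not values taken from this repo.
def apex_epsilons(base_eps=0.4, alpha=7., n_workers=8, n_envs=4):
    i = np.arange(n_workers * n_envs)
    return base_eps ** (1 + i / (len(i) - 1) * alpha)

print(np.round(apex_epsilons(), 5))   # decays from 0.4 down to ~0.00066
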
Example #8
class Learner(Agent):
    def __init__(self,
                 name,
                 args,
                 env_args,
                 sess_config=None,
                 save=True,
                 log=False,
                 log_tensorboard=False,
                 log_params=False,
                 log_stats=False,
                 device=None):
        super().__init__(name,
                         args,
                         env_args,
                         sess_config=sess_config,
                         save=save,
                         log=log,
                         log_tensorboard=log_tensorboard,
                         log_params=log_params,
                         log_stats=log_stats,
                         device=device)
        del self.buffer

        outside_value = float(args['ac']['policy_end_lr'])
        points = [(0, float(args['ac']['policy_lr'])),
                  (args['ac']['policy_decay_steps'], outside_value)]
        self.policy_lr_scheduler = PiecewiseSchedule(
            points, outside_value=outside_value)

        outside_value = float(args['ac']['value_end_lr'])
        points = [(0, float(args['ac']['value_lr'])),
                  (args['ac']['value_decay_steps'], outside_value)]
        self.value_lr_scheduler = PiecewiseSchedule(
            points, outside_value=outside_value)

    def apply_gradients(self, timestep, *grads):
        policy_lr = self.policy_lr_scheduler.value(timestep)
        val_lr = self.value_lr_scheduler.value(timestep)
        print('policy learning rate:', policy_lr)
        print('value learning rate:', val_lr)

        grads = np.mean(grads, axis=0)

        feed_dict = {g_var: g for g_var, g in zip(self.ac.grads, grads)}

        feed_dict.update({self.ac.policy_lr: policy_lr, self.ac.v_lr: val_lr})

        fetches = [self.ac.opt_step]

        fetches.append([self.ac.policy_optop, self.ac.v_optop])

        # do not log_tensorboard, use record_stats if required
        learn_step, _ = self.sess.run(fetches, feed_dict=feed_dict)

        if hasattr(self, 'saver') and learn_step % 100 == 0:
            self.save()

        return self.get_weights()

    def get_weights(self):
        return self.variables.get_flat()

    def record_stats(self, score_mean, score_std, epslen_mean, entropy,
                     approx_kl, clip_frac):
        log_info = dict(score_mean=score_mean,
                        score_std=score_std,
                        epslen_mean=epslen_mean,
                        entropy=entropy,
                        approx_kl=approx_kl,
                        clip_frac=clip_frac)
        # a wrapper since ray does not support (*args)
        super().record_stats(**log_info)

    def print_construction_complete(self):
        pwc('Learner has been constructed.', color='cyan')
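
apply_gradients above averages the gradients shipped by the workers with np.mean(grads, axis=0). A small sketch of that averaging, written position-wise with zip so that variables of different shapes average cleanly; the shapes are made up.

import numpy as np

# Each worker sends one gradient per variable; the learner averages per position.
worker_grads = [
    [np.ones((3, 2)), np.ones(2) * 0.5],    # gradients from worker 0
    [np.zeros((3, 2)), np.ones(2) * 1.5],   # gradients from worker 1
]
avg = [np.mean(per_var, axis=0) for per_var in zip(*worker_grads)]
print([a.mean() for a in avg])   # -> [0.5, 1.0]
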
Example #9
class Agent(OffPolicyOperation):
    """ Interface """
    def __init__(self,
                 name,
                 args,
                 env_args,
                 buffer_args,
                 sess_config=None,
                 save=False,
                 log=False,
                 log_tensorboard=False,
                 log_params=False,
                 log_stats=False,
                 device=None):
        self.critic_loss_type = args['critic']['loss_type']
        self.polyak = args['polyak'] if 'polyak' in args else .995

        # learning rate schedule
        self.schedule_lr = 'schedule_lr' in args and args['schedule_lr']
        if self.schedule_lr:
            self.actor_lr_scheduler = PiecewiseSchedule([(0, 1e-4),
                                                         (150000, 1e-4),
                                                         (300000, 5e-5)],
                                                        outside_value=5e-5)
            self.critic_lr_scheduler = PiecewiseSchedule([(0, 3e-4),
                                                          (150000, 3e-4),
                                                          (300000, 5e-5)],
                                                         outside_value=5e-5)

        super().__init__(name,
                         args,
                         env_args,
                         buffer_args,
                         sess_config=sess_config,
                         save=save,
                         log=log,
                         log_tensorboard=log_tensorboard,
                         log_params=log_params,
                         log_stats=log_stats,
                         device=device)

    @property
    def main_variables(self):
        return self.actor.trainable_variables + self.critic.trainable_variables

    @property
    def target_variables(self):
        return self.target_actor.trainable_variables + self.target_critic.trainable_variables

    """ Implementation """

    def _build_graph(self):
        if self.device and 'GPU' in self.device:
            with tf.device('/CPU: 0'):
                self.data = self._prepare_data(self.buffer)
        else:
            self.data = self._prepare_data(self.buffer)

        self.actor, self.critic, self.target_actor, self.target_critic = self._create_main_target_actor_critic(
        )
        self.action_det = self.action = self.actor.action

        self._compute_loss()

        _, self.actor_lr, self.opt_step, _, self.actor_opt_op = self.actor._optimization_op(
            self.actor_loss, opt_step=True, schedule_lr=self.schedule_lr)
        _, self.critic_lr, _, _, self.critic_opt_op = self.critic._optimization_op(
            self.critic_loss, schedule_lr=self.schedule_lr)
        self.opt_op = tf.group(self.actor_opt_op, self.critic_opt_op)

        # target net operations
        self.init_target_op, self.update_target_op = self._target_net_ops()

        self._log_loss()

    def _create_main_target_actor_critic(self):
        # main actor-critic
        actor, critic = self._create_actor_critic(is_target=False)
        # target actor-critic
        target_actor, target_critic = self._create_actor_critic(is_target=True)

        return actor, critic, target_actor, target_critic

    def _create_actor_critic(self, is_target):
        log_tensorboard = False if is_target else self.log_tensorboard
        log_params = False if is_target else self.log_params

        scope_name = 'target' if is_target else 'main'
        state = self.data['next_state'] if is_target else self.data['state']
        scope_prefix = self.name + '/' + scope_name
        self.args['actor'][
            'max_action_repetitions'] = self.max_action_repetitions

        with tf.variable_scope(scope_name):
            actor = Actor('actor',
                          self.args['actor'],
                          self.graph,
                          state,
                          self.action_dim,
                          scope_prefix=scope_prefix,
                          log_tensorboard=log_tensorboard,
                          log_params=log_params)

            critic = DoubleCritic('critic',
                                  self.args['critic'],
                                  self.graph,
                                  state,
                                  self.data['action'],
                                  actor.action,
                                  self.action_dim,
                                  scope_prefix=scope_prefix,
                                  log_tensorboard=log_tensorboard,
                                  log_params=log_params)

        return actor, critic

    def _compute_loss(self):
        with tf.name_scope('loss'):
            self.actor_loss = self._actor_loss()
            self.priority, self.critic_loss = self._critic_loss()
            self.loss = self.actor_loss + self.critic_loss

    def _actor_loss(self):
        with tf.name_scope('actor_loss'):
            return -tf.reduce_mean(
                self.data['IS_ratio'] * self.critic.Q1_with_actor)

    def _critic_loss(self):
        with tf.name_scope('critic_loss'):
            target_Q = n_step_target(self.data['reward'], self.data['done'],
                                     self.target_critic.Q_with_actor,
                                     self.gamma, self.data['steps'])

            Q1_error = tf.abs(target_Q - self.critic.Q1, name='Q1_error')
            Q2_error = tf.abs(target_Q - self.critic.Q2, name='Q2_error')

            loss_func = huber_loss if self.critic_loss_type == 'huber' else tf.square
            TD_squared = (loss_func(Q1_error) + loss_func(Q2_error))

            critic_loss = tf.reduce_mean(self.data['IS_ratio'] * TD_squared)

        priority = self._compute_priority((Q1_error + Q2_error) / 2.)

        return priority, critic_loss

    def _target_net_ops(self):
        with tf.name_scope('target_net_op'):
            target_main_var_pairs = list(
                zip(self.target_variables, self.main_variables))
            init_target_op = list(
                map(lambda v: tf.assign(v[0], v[1], name='init_target_op'),
                    target_main_var_pairs))
            update_target_op = list(
                map(
                    lambda v: tf.assign(v[0],
                                        self.polyak * v[0] +
                                        (1. - self.polyak) * v[1],
                                        name='update_target_op'),
                    target_main_var_pairs))

        return init_target_op, update_target_op

    def _initialize_target_net(self):
        self.sess.run(self.init_target_op)

    def _update_target_net(self):
        self.sess.run(self.update_target_op)

    def _log_loss(self):
        if self.log_tensorboard:
            with tf.name_scope('info'):
                tf.compat.v1.summary.scalar('actor_loss_', self.actor_loss)
                tf.compat.v1.summary.scalar('critic_loss_', self.critic_loss)
                stats_summary('Q_with_actor',
                              self.critic.Q_with_actor,
                              max=True,
                              hist=True)
                stats_summary('reward',
                              self.data['reward'],
                              min=True,
                              hist=True)
                stats_summary('priority', self.priority, hist=True, max=True)

    def _get_feeddict(self, t):
        return {
            self.actor_lr: self.actor_lr_scheduler.value(t),
            self.critic_lr: self.critic_lr_scheduler.value(t)
        }
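
n_step_target and huber_loss used in _critic_loss above are helpers from the project; the sketch below rewrites them from their standard definitions (bootstrapped n-step return and the Huber penalty) purely for reference, so details may differ from the actual utilities.

import numpy as np

def n_step_target(reward, done, next_value, gamma, steps):
    # r + gamma**n * (1 - done) * V(s_{t+n}); standard definition, not the repo's code
    return reward + (1. - done) * gamma**steps * next_value

def huber_loss(x, delta=1.):
    # quadratic near zero, linear for |x| > delta
    return np.where(np.abs(x) <= delta,
                    .5 * x**2,
                    delta * (np.abs(x) - .5 * delta))

target = n_step_target(reward=np.array([1., 0.]),
                       done=np.array([0., 1.]),
                       next_value=np.array([10., 10.]),
                       gamma=.99, steps=np.array([3., 1.]))
print(target)                            # -> [~10.703, 0.]
print(huber_loss(np.array([.5, 3.])))    # -> [0.125, 2.5]
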
Example #10
class Agent(OffPolicyOperation):
    """ Interface """
    def __init__(self,
                 name,
                 args,
                 env_args,
                 buffer_args,
                 sess_config=None,
                 save=False,
                 log=False,
                 log_tensorboard=False,
                 log_params=False,
                 log_stats=False,
                 device=None):
        self.raw_temperature = args['temperature']
        self.critic_loss_type = args['loss_type']

        # learning rate schedule
        self.schedule_lr = 'schedule_lr' in args and args['schedule_lr']
        if self.schedule_lr:
            self.actor_lr_scheduler = PiecewiseSchedule([(0, 1e-4),
                                                         (150000, 1e-4),
                                                         (300000, 5e-5)],
                                                        outside_value=5e-5)
            self.Q_lr_scheduler = PiecewiseSchedule([(0, 3e-4), (150000, 3e-4),
                                                     (300000, 1e-4)],
                                                    outside_value=1e-4)
            self.alpha_lr_scheduler = PiecewiseSchedule([(0, 1e-4),
                                                         (150000, 1e-4),
                                                         (300000, 5e-5)],
                                                        outside_value=5e-5)

        super().__init__(name,
                         args,
                         env_args,
                         buffer_args,
                         sess_config=sess_config,
                         save=save,
                         log=log,
                         log_tensorboard=log_tensorboard,
                         log_params=log_params,
                         log_stats=log_stats,
                         device=device)

    @override(OffPolicyOperation)
    def _build_graph(self):
        if 'gpu' in self.device:
            with tf.device('/cpu: 0'):
                self.data = self._prepare_data(self.buffer)
        else:
            self.data = self._prepare_data(self.buffer)

        self.actor = self._actor()

        self._action_surrogate()

        self.critic = self._critic()

        if self.raw_temperature == 'auto':
            self.temperature = self._auto_temperature()
            self.alpha = self.temperature.alpha
            self.next_alpha = self.temperature.next_alpha
        else:
            # reward scaling indirectly affects the policy temperature
            # we neutralize the effect by scaling the temperature here
            # see my blog for more info https://xlnwel.github.io/blog/reinforcement%20learning/SAC/
            self.alpha = self.raw_temperature * self.buffer.reward_scale
            self.next_alpha = self.alpha

        self._compute_loss()
        self._optimize()

        self._log_loss()

    def _actor(self):
        policy_args = self.args['Policy']
        policy_args['max_action_repetitions'] = self.max_action_repetitions
        policy_args['polyak'] = self.args['polyak']
        return SoftPolicy('SoftPolicy',
                          policy_args,
                          self.graph,
                          self.data['state'],
                          self.data['next_state'],
                          self.action_dim,
                          scope_prefix=self.name,
                          log_tensorboard=self.log_tensorboard,
                          log_params=self.log_params)

    def _action_surrogate(self):
        self.action = self.actor.action
        self.action_det = self.actor.action_det
        self.next_action = self.actor.next_action
        self.logpi = self.actor.logpi
        self.next_logpi = self.actor.next_logpi

    def _critic(self):
        q_args = self.args['Q']
        q_args['polyak'] = self.args['polyak']
        return SoftQ('SoftQ',
                     q_args,
                     self.graph,
                     self.data['state'],
                     self.data['next_state'],
                     self.data['action'],
                     self.action,
                     self.next_action,
                     scope_prefix=self.name,
                     log_tensorboard=self.log_tensorboard,
                     log_params=self.log_params)

    def _auto_temperature(self):
        return Temperature('Temperature',
                           self.args['Temperature'],
                           self.graph,
                           self.data['state'],
                           self.data['next_state'],
                           self.action,
                           self.next_action,
                           scope_prefix=self.name,
                           log_tensorboard=self.log_tensorboard,
                           log_params=self.log_params)

    def _compute_loss(self):
        with tf.name_scope('loss'):
            if self.raw_temperature == 'auto':
                self.alpha_loss = self._alpha_loss()
                self.loss = self.alpha_loss
            else:
                self.loss = 0
            self.actor_loss = self._actor_loss()
            self.priority, self.Q1_loss, self.Q2_loss, self.critic_loss = self._critic_loss(
            )
            self.loss += self.actor_loss + self.critic_loss

    def _alpha_loss(self):
        target_entropy = -self.action_dim
        with tf.name_scope('alpha_loss'):
            return -tf.reduce_mean(
                self.data['IS_ratio'] * self.temperature.log_alpha *
                tf.stop_gradient(self.logpi + target_entropy))

    def _actor_loss(self):
        with tf.name_scope('actor_loss'):
            return tf.reduce_mean(
                self.data['IS_ratio'] *
                (self.alpha * self.logpi - self.critic.Q1_with_actor))

    def _critic_loss(self):
        with tf.name_scope('critic_loss'):
            n_V = tf.subtract(self.critic.next_Q_with_actor,
                              self.next_alpha * self.next_logpi,
                              name='n_V')
            target_Q = n_step_target(self.data['reward'], self.data['done'],
                                     n_V, self.gamma, self.data['steps'])
            Q1_error = tf.abs(target_Q - self.critic.Q1, name='Q1_error')
            Q2_error = tf.abs(target_Q - self.critic.Q2, name='Q2_error')

            Q1_loss = tf.reduce_mean(self.data['IS_ratio'] * Q1_error**2)
            Q2_loss = tf.reduce_mean(self.data['IS_ratio'] * Q2_error**2)
            critic_loss = Q1_loss + Q2_loss

        priority = self._compute_priority((Q1_error + Q2_error) / 2.)

        return priority, Q1_loss, Q2_loss, critic_loss

    def _optimize(self):
        with tf.name_scope('optimizer'):
            opt_ops = []
            if self.raw_temperature == 'auto':
                _, self.alpha_lr, _, _, temp_op = self.temperature._optimization_op(
                    self.alpha_loss, schedule_lr=self.schedule_lr)
                opt_ops.append(temp_op)
            _, self.actor_lr, self.opt_step, _, actor_opt_op = self.actor._optimization_op(
                self.actor_loss, opt_step=True, schedule_lr=self.schedule_lr)
            _, self.Q_lr, _, _, Q_opt_op = self.critic._optimization_op(
                self.critic_loss, schedule_lr=self.schedule_lr)
            opt_ops += [actor_opt_op, Q_opt_op]
            self.opt_op = tf.group(*opt_ops)

    @override(OffPolicyOperation)
    def _initialize_target_net(self):
        self.sess.run(self.actor.init_target_op + self.critic.init_target_op)

    @override(OffPolicyOperation)
    def _update_target_net(self):
        self.sess.run(self.actor.update_target_op +
                      self.critic.update_target_op)

    @override(OffPolicyOperation)
    def _get_feeddict(self, t):
        return {
            self.actor_lr: self.actor_lr_scheduler.value(t),
            self.Q_lr: self.Q_lr_scheduler.value(t),
            self.alpha_lr: self.alpha_lr_scheduler.value(t)
        }

    def _log_loss(self):
        if self.log_tensorboard:
            with tf.name_scope('info'):
                stats_summary('reward',
                              self.data['reward'],
                              min=True,
                              max=True,
                              hist=True)
                with tf.name_scope('actor'):
                    stats_summary('orig_action', self.actor.orig_action)
                    stats_summary('entropy',
                                  self.actor.action_distribution.entropy())
                    stats_summary('action_std',
                                  self.actor.action_distribution.std)
                    stats_summary('orig_logpi', self.actor.orig_logpi)
                    tf.compat.v1.summary.scalar('orig_logpi_0',
                                                self.actor.orig_logpi[0][0])
                    stats_summary('action', self.actor.action)
                    stats_summary('logpi', self.actor.logpi)
                    tf.compat.v1.summary.scalar('actor_loss_', self.actor_loss)
                with tf.name_scope('critic'):
                    stats_summary('Q1_with_actor',
                                  self.critic.Q1_with_actor,
                                  min=True,
                                  max=True)
                    stats_summary('Q2_with_actor',
                                  self.critic.Q2_with_actor,
                                  min=True,
                                  max=True)
                    if self.buffer_type == 'proportional':
                        stats_summary('priority',
                                      self.priority,
                                      std=True,
                                      max=True,
                                      hist=True)
                    tf.compat.v1.summary.scalar('Q1_loss_', self.Q1_loss)
                    tf.compat.v1.summary.scalar('Q2_loss_', self.Q2_loss)
                    tf.compat.v1.summary.scalar('critic_loss_',
                                                self.critic_loss)
                if self.raw_temperature == 'auto':
                    with tf.name_scope('alpha'):
                        stats_summary('alpha', self.alpha, std=True)
                        tf.compat.v1.summary.scalar('alpha_loss',
                                                    self.alpha_loss)
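
When raw_temperature == 'auto', _alpha_loss above implements the standard SAC temperature objective with target_entropy = -action_dim. A plain-numpy illustration of what that objective does, using made-up log-probabilities (nothing below comes from the repository):

import numpy as np

action_dim = 6
target_entropy = -action_dim                 # -6, the usual SAC heuristic
log_alpha = np.log(0.2)
logpi = np.array([-4.0, -7.5, -6.0])         # made-up log-probs of sampled actions

alpha_loss = -np.mean(log_alpha * (logpi + target_entropy))
grad_log_alpha = -np.mean(logpi + target_entropy)
print(round(alpha_loss, 3), round(grad_log_alpha, 3))
# The entropy estimate -mean(logpi) ~ 5.8 sits above the target of -6, so the
# positive gradient on log_alpha drives the temperature down during training.
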
Example #11
    def __init__(self,
                 name,
                 args,
                 env_args,
                 sess_config=None,
                 save=False,
                 log=False,
                 log_tensorboard=False,
                 log_params=False,
                 log_stats=False,
                 device=None,
                 reuse=None,
                 graph=None):
        # hyperparameters
        self.gamma = args['gamma']
        self.gae_discount = self.gamma * args['lam']
        self.n_minibatches = args['n_minibatches']

        self.use_lstm = args['ac']['use_lstm']
        self.entropy_coef = args['ac']['entropy_coef']
        self.n_value_updates = args['ac']['n_value_updates']
        self.minibatch_idx = 0

        # environment info
        self.env_vec = create_gym_env(env_args)
        self.seq_len = self.env_vec.max_episode_steps

        self.buffer = PPOBuffer(env_args['n_workers'] * env_args['n_envs'],
                                self.seq_len, self.n_minibatches,
                                self.env_vec.state_shape, np.float32,
                                self.env_vec.action_shape, np.float32)

        super().__init__(name,
                         args,
                         sess_config=sess_config,
                         save=save,
                         log=log,
                         log_tensorboard=log_tensorboard,
                         log_params=log_params,
                         log_stats=log_stats,
                         device=device,
                         reuse=reuse,
                         graph=graph)

        self.schedule_lr = 'schedule_lr' in args and args['schedule_lr']
        if self.schedule_lr:
            self.actor_lr_scheduler = PiecewiseSchedule([(0, 1e-4),
                                                         (400000, 1e-4),
                                                         (600000, 5e-5)],
                                                        outside_value=5e-5)
            self.critic_lr_scheduler = PiecewiseSchedule([(0, 3e-4),
                                                          (400000, 3e-4),
                                                          (600000, 5e-5)],
                                                         outside_value=5e-5)

        if self.use_lstm:
            # don't distinguish lstm at training from that at running
            # since training is done after running
            self.last_lstm_state = None

        with self.graph.as_default():
            self.variables = TensorFlowVariables(
                [self.ac.policy_loss, self.ac.V_loss], self.sess)
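
The gae_discount = gamma * lam set above is the factor in the GAE recursion that the PPO buffer applies when computing advantages. A standard-definition sketch of that recursion with made-up transitions (illustrative; the actual PPOBuffer code is not shown on this page):

import numpy as np

gamma, lam = 0.99, 0.95                      # so gae_discount = gamma * lam
rewards     = np.array([1., 1., 1.])
values      = np.array([5., 4., 3.])
next_values = np.array([4., 3., 0.])         # bootstrap value after each step
dones       = np.array([0., 0., 1.])

deltas = rewards + gamma * (1. - dones) * next_values - values
advantages = np.zeros_like(deltas)
next_adv = 0.
for t in reversed(range(len(deltas))):       # adv_t = delta_t + gamma*lam*adv_{t+1}
    next_adv = deltas[t] + gamma * lam * (1. - dones[t]) * next_adv
    advantages[t] = next_adv
print(np.round(advantages, 3))               # -> [-1.837, -1.911, -2.0]
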
Example #12
class PrioritizedReplay(Replay):
    """ Interface """
    def __init__(self, args, state_shape, action_dim):
        super().__init__(args, state_shape, action_dim)
        self.data_structure = None            

        # params for prioritized replay
        self.alpha = float(args['alpha']) if 'alpha' in args else .5
        self.beta = float(args['beta0']) if 'beta0' in args else .4
        self.beta_schedule = PiecewiseSchedule([(0, args['beta0']), (float(args['beta_steps']), 1.)], 
                                                outside_value=1.)
        self.epsilon = float(args['epsilon']) if 'epsilon' in args else 1e-4

        self.top_priority = 2.
        self.to_update_priority = args['to_update_priority'] if 'to_update_priority' in args else True

        self.sample_i = 0   # count how many times self.sample is called

        init_buffer(self.memory, self.capacity, state_shape, action_dim, self.n_steps == 1)

        # Code for single agent
        if self.n_steps > 1:
            self.tb_capacity = args['tb_capacity']
            self.tb_idx = 0
            self.tb_full = False
            self.tb = {}
            init_buffer(self.tb, self.tb_capacity, state_shape, action_dim, True)

    @override(Replay)
    def sample(self):
        assert_colorize(self.good_to_learn, 'There are not sufficient transitions to start learning --- '
                                            f'transitions in buffer: {len(self)}\t'
                                            f'minimum required size: {self.min_size}')
        with self.locker:        
            samples = self._sample()
            self.sample_i += 1
            self._update_beta()

        return samples

    @override(Replay)
    def add(self, state, action, reward, done):
        if self.n_steps > 1:
            self.tb['priority'][self.tb_idx] = self.top_priority
        else:
            self.memory['priority'][self.mem_idx] = self.top_priority
            self.data_structure.update(self.top_priority, self.mem_idx)
        super()._add(state, action, reward, done)

    def update_priorities(self, priorities, saved_mem_idxs):
        with self.locker:
            if self.to_update_priority:
                self.top_priority = max(self.top_priority, np.max(priorities))
            for priority, mem_idx in zip(priorities, saved_mem_idxs):
                self.data_structure.update(priority, mem_idx)

    """ Implementation """
    def _update_beta(self):
        self.beta = self.beta_schedule.value(self.sample_i)

    @override(Replay)
    def _merge(self, local_buffer, length):
        end_idx = self.mem_idx + length
        assert np.all(local_buffer['priority'][: length])
        for idx, mem_idx in enumerate(range(self.mem_idx, end_idx)):
            self.data_structure.update(local_buffer['priority'][idx], mem_idx % self.capacity)
            
        super()._merge(local_buffer, length)
        
    def _compute_IS_ratios(self, probabilities):
        IS_ratios = (np.min(probabilities) / probabilities)**self.beta

        return IS_ratios
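
self.data_structure above is only set to None in __init__; the sampling and update calls suggest it is later assigned a sum tree (or similar) that supports priority-proportional sampling. A minimal array-backed sum-tree sketch, for orientation only and not the project's class:

import numpy as np

# Minimal sum tree: leaves hold priorities, internal nodes hold subtree sums,
# so a priority-proportional sample reduces to a prefix-sum descent.
class SumTree:
    def __init__(self, capacity):
        self._capacity = capacity
        self._tree = np.zeros(2 * capacity)        # nodes 1..2C-1; leaves at C..2C-1

    def update(self, priority, mem_idx):
        idx = mem_idx + self._capacity
        self._tree[idx] = priority
        idx //= 2
        while idx >= 1:                            # propagate the new sum to the root
            self._tree[idx] = self._tree[2 * idx] + self._tree[2 * idx + 1]
            idx //= 2

    def find(self, prefix_sum):
        """ Returns (priority, mem_idx) of the leaf containing prefix_sum. """
        idx = 1
        while idx < self._capacity:
            left = 2 * idx
            if prefix_sum <= self._tree[left]:
                idx = left
            else:
                prefix_sum -= self._tree[left]
                idx = left + 1
        return self._tree[idx], idx - self._capacity

tree = SumTree(4)
for i, p in enumerate([1., 2., 3., 4.]):
    tree.update(p, i)
print(tree.find(5.5))   # -> (3.0, 2): cumulative sums are 1, 3, 6, 10
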
Example #13
File: per.py  Project: xlnwel/d2rl
class PERBase(Replay):
    """ Base class for PER, left in case one day I implement rank-based PER """
    def _add_attributes(self):
        super()._add_attributes()
        self._top_priority = 1.
        self._data_structure = None            
        self._use_is_ratio = getattr(self, '_use_is_ratio', True)
        self._beta = float(getattr(self, 'beta0', .4))
        if getattr(self, '_beta_schedule', None):
            assert isinstance(self._beta_schedule, list)
            self._beta_schedule = PiecewiseSchedule(self._beta_schedule)
        self._sample_i = 0   # count how many times self._sample is called

    @override(Replay)
    def sample(self, batch_size=None):
        assert self.good_to_learn(), (
            'There are not sufficient transitions to start learning --- '
            f'transitions in buffer({len(self)}) vs '
            f'minimum required size({self._min_size})')
        samples = self._sample(batch_size=batch_size)
        self._sample_i += 1
        if hasattr(self, '_beta_schedule'):
            self._update_beta()
        return samples

    @override(Replay)
    def add(self, **kwargs):
        super().add(**kwargs)
        # super().add updates self._mem_idx 
        if self._n_envs == 1:
            self._data_structure.update(self._mem_idx - 1, self._top_priority)

    def update_priorities(self, priorities, idxes):
        assert not np.any(np.isnan(priorities)), priorities
        np.testing.assert_array_less(0, priorities)
        if self._to_update_top_priority:
            self._top_priority = max(self._top_priority, np.max(priorities))
        self._data_structure.batch_update(idxes, priorities)

    """ Implementation """
    def _update_beta(self):
        self._beta = self._beta_schedule.value(self._sample_i)

    @override(Replay)
    def _merge(self, local_buffer, length):    
        priority = local_buffer.pop('priority')[:length] \
            if 'priority' in local_buffer else self._top_priority * np.ones(length)
        np.testing.assert_array_less(0, priority)
        # update sum tree
        mem_idxes = np.arange(self._mem_idx, self._mem_idx + length) % self._capacity
        self._data_structure.batch_update(mem_idxes, priority)
        # update memory
        super()._merge(local_buffer, length)
        
    def _compute_IS_ratios(self, probabilities):
        """
        w = (N * p)**(-beta)
        max(w) = max((N * p)**(-beta)) = (N * min(p))**(-beta)
        norm_w = w / max(w) = (N*p)**(-beta) / (N * min(p))**(-beta)
               = (min(p) / p)**beta
        """
        IS_ratios = (np.min(probabilities) / probabilities)**self._beta

        return IS_ratios
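
A quick numeric check of the docstring's derivation above: the shortcut (min(p) / p)**beta matches the conventional normalized importance weight (N * p)**(-beta) / max(w). The probabilities are made up.

import numpy as np

p = np.array([0.1, 0.02, 0.005, 0.3])       # made-up sampling probabilities
beta, N = 0.4, len(p)

w = (N * p) ** (-beta)
normalized = w / w.max()
shortcut = (p.min() / p) ** beta
print(np.allclose(normalized, shortcut))    # -> True
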
Example #14
class PrioritizedReplay(Replay):
    """ Interface """
    def __init__(self, args, state_space, action_dim):
        super().__init__(args, state_space, action_dim)
        self.data_structure = None

        # params for prioritized replay
        self.alpha = float(args['alpha']) if 'alpha' in args else .5
        self.beta = float(args['beta0']) if 'beta0' in args else .4
        self.beta_schedule = PiecewiseSchedule(
            [(0, args['beta0']), (float(args['beta_steps']), 1.)],
            outside_value=1.)
        self.epsilon = float(args['epsilon']) if 'epsilon' in args else 1e-4

        self.top_priority = 2.

        self.sample_i = 0  # count how many times self.sample is called

    @override(Replay)
    def sample(self):
        assert_colorize(
            self.good_to_learn,
            'There are not sufficient transitions to start learning --- '
            f'transitions in buffer: {len(self)}\t'
            f'minimum required size: {self.min_size}')
        with self.locker:
            samples = self._sample()
            self.sample_i += 1
            self._update_beta()

        return samples

    @override(Replay)
    def add(self, state, action, reward, done):
        if self.n_steps > 1:
            self.tb['priority'][self.tb_idx] = self.top_priority
        else:
            self.memory['priority'][self.mem_idx] = self.top_priority
        super()._add(state, action, reward, done)

    def update_priorities(self, priorities, saved_mem_idxs):
        with self.locker:
            for priority, mem_idx in zip(priorities, saved_mem_idxs):
                self.data_structure.update(priority, mem_idx)

    """ Implementation """

    def _update_beta(self):
        self.beta = self.beta_schedule.value(self.sample_i)

    @override(Replay)
    def _merge(self, local_buffer, length, start=0):
        end_idx = self.mem_idx + length
        for idx, mem_idx in enumerate(range(self.mem_idx, end_idx)):
            self.data_structure.update(local_buffer['priority'][idx],
                                       mem_idx % self.capacity)

        super()._merge(local_buffer, length, start)

    def _compute_IS_ratios(self, N, probabilities):
        IS_ratios = np.power(probabilities * N, -self.beta)
        IS_ratios /= np.max(
            IS_ratios)  # normalize ratios to avoid scaling the update upward

        return IS_ratios
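
For completeness, the modulo arithmetic in _merge above is what lets a local buffer land contiguously in the circular memory even when the write position wraps past capacity. A tiny sketch with made-up sizes:

import numpy as np

capacity, mem_idx, length = 8, 6, 5
mem_idxes = np.arange(mem_idx, mem_idx + length) % capacity
print(mem_idxes)   # -> [6 7 0 1 2]; the write position wraps around the end
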