Example #1
class DDPG(object):
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.save = None
        self.load = None

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                      self.actor_tf,
                                                      reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(
            target_critic(normalized_obs1, target_actor(normalized_obs1)),
            self.ret_rms)
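        # 1-step TD target: r + gamma * (1 - done) * Q'(s1, mu'(s1)),
        # computed with the slowly-updated target actor and target critic.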
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure a separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
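        # Mean action-space distance between the unperturbed actor and the
        # adaptively perturbed copy; adapt_param_noise() compares it against
        # the noise spec's target scale to grow or shrink the perturbation stddev.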
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if var.name.endswith('/w:0') and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
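        # PopArt preserves the critic's denormalized outputs when the return
        # statistics change, by rescaling the output layer:
        #   W' = W * sigma_old / sigma_new
        #   b' = (sigma_old * b + mu_old - mu_new) / sigma_new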
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def step(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action[0].shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale

        B = obs0.shape[0]
        for b in range(B):
            self.memory.append(obs0[b], action[b], reward[b], obs1[b],
                               terminal1[b])
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q,
                                     feed_dict={
                                         self.obs1:
                                         batch['obs1'],
                                         self.rewards:
                                         batch['rewards'],
                                         self.terminals1:
                                         batch['terminals1'].astype('float32'),
                                     })

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.save = functools.partial(save_variables, sess=self.sess)
        self.load = functools.partial(load_variables, sess=self.sess)
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0: self.stats_sample['obs0'],
                                   self.actions: self.stats_sample['actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(
                distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        else:
            mean_distance = distance

        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })


def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]):
    rms = RunningMeanStd(shape=x.shape[1:])
    norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range),
                              max(clip_range))
    return norm_x, rms
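
For orientation, the DDPG class above is normally driven by an outer loop that alternates environment interaction with gradient updates. The following is a minimal sketch under assumptions not shown in the snippet: `agent` is an already-constructed instance of the DDPG class above, `env` is a Gym-style environment whose observation/action shapes match the ones the agent was built with, and the helper name `run_ddpg` is hypothetical. Only methods defined above (plus NumPy/TensorFlow) are used.

import numpy as np
import tensorflow as tf


def run_ddpg(agent, env, nb_epochs=10, nb_rollout_steps=100, nb_train_steps=50):
    # Hypothetical driver loop for the DDPG class above.
    with tf.Session() as sess:
        agent.initialize(sess)
        agent.reset()
        obs = env.reset()
        for _ in range(nb_epochs):
            for _ in range(nb_rollout_steps):
                # step() works on batched inputs/outputs; keep a batch dim of 1.
                action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)
                new_obs, reward, done, _ = env.step(action[0])
                agent.store_transition(np.array([obs]), action,
                                       np.array([reward]), np.array([new_obs]),
                                       np.array([float(done)]))
                obs = new_obs
                if done:
                    agent.reset()
                    obs = env.reset()
            # In practice, start training only once the memory holds at least
            # batch_size transitions.
            for _ in range(nb_train_steps):
                critic_loss, actor_loss = agent.train()
                agent.update_target_net()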
Example #3
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
                 batch_size=128, observation_range=(-1., 1.), action_range=[0.2, 0.2, 0.2, 0.2, 0.2, 0.2], return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True, critic_l2_reg=0.,
                 adaptive_param_noise_policy_threshold=.1, actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., restore=False):

        # Inputs.
        # self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs0 = tf.placeholder(tf.float32, shape=(None, observation_shape), name='obs0')
        # self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.obs1 = tf.placeholder(tf.float32, shape=(None, observation_shape), name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None, action_shape), name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        """Filewriter summary"""
        monitor_directory = os.path.join("Experiment_data")
        self.summary_dir = os.path.join(monitor_directory, "summary")
        # if restore:
        #     dirname = 'run20' # The last name
        #     self.summary_dir = os.path.join(self.summary_dir, dirname)
        # else:
        self.summary_dir = utils.new_summary_dir(self.summary_dir)

        # record the detailed parameters
        utils.log_params(self.summary_dir, {
            "actor learning rate": self.actor_lr,
            "critic learning rate": self.critic_lr,
            "batch size": self.batch_size,
            "actor update rate": self.tau,
            "critic update rate": self.tau,
            "action noise": self.action_noise,
            "param noise": self.param_noise,
            "reward function": 'General reward function',
            "result_function": 'The second 100'
        })

        self.merged = tf.summary.merge_all()
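
The `self.merged` op assembled above only becomes useful once it is evaluated and written to an event file. The lines below are a hedged sketch, not part of the original snippet: they assume the agent stores its TF session as `sess` (as in Example #1), the helper name and feed_dict are hypothetical, and only standard TF1 summary APIs are used.

import tensorflow as tf


def write_summaries(agent, feed_dict, step):
    # Hypothetical helper: evaluate the merged summary op built in __init__
    # and append it to an event file under agent.summary_dir.
    # (merge_all() returns None if no summaries were registered; guard if needed.)
    # In practice the FileWriter would be created once and reused.
    writer = tf.summary.FileWriter(agent.summary_dir, graph=agent.sess.graph)
    summary = agent.sess.run(agent.merged, feed_dict=feed_dict)
    writer.add_summary(summary, global_step=step)
    writer.flush()
    writer.close()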
Example #4
    def _init(self,
              np_random,
              flavor,
              dim,
              hid_size=32,
              n_hid=2,
              alpha_sysid=0.1,
              test=False):

        print("obs dim:", dim.ob)

        # inputs & hyperparameters
        self.flavor = flavor
        self.dim = dim
        self.alpha_sysid = alpha_sysid
        self.ob = U.get_placeholder(name="ob",
                                    dtype=tf.float32,
                                    shape=(None, dim.ob_concat))
        self.ob_traj = U.get_placeholder(name="ob_traj",
                                         dtype=tf.float32,
                                         shape=[None, dim.window, dim.ob])
        self.ac_traj = U.get_placeholder(name="ac_traj",
                                         dtype=tf.float32,
                                         shape=[None, dim.window, dim.ac])

        # regular inputs whitening
        ob, sysid = tf.split(self.ob, [dim.ob, dim.sysid], axis=1)
        with tf.variable_scope("ob_filter"):
            self.ob_rms = RunningMeanStd(shape=(dim.ob_concat,))
            obz_all = tf.clip_by_value(
                (self.ob - self.ob_rms.mean) / self.ob_rms.std,
                -5.0,
                5.0,
                name="ob_normalizer")
        obz, sysidz = tf.split(obz_all, [dim.ob, dim.sysid], axis=1)
        print("obz dim:", obz.shape, "sysidz dim:", sysidz.shape)
        with tf.variable_scope("ob_white"):
            obz = tf.identity(obz)
        with tf.variable_scope("sysid_white"):
            self.sysidz = tf.identity(sysidz)

        # trajectory inputs for SysID
        # NOTE: the environment should be defined such that
        # actions are relatively close to Normal(0,1)
        ob_trajz = tf.clip_by_value(
            (self.ob_traj - self.ob_rms.mean[:dim.ob]) /
            self.ob_rms.std[:dim.ob],
            -5.0,
            5.0,
            name="ob_traj_white")
        trajs = tf.concat([ob_trajz, self.ac_traj], axis=2)

        # these rewards will be optimized via direct gradient-based optimization
        # (not RL reward), in the same place as e.g. the entropy regularization
        self.extra_rewards = []
        self.extra_reward_names = []

        with tf.variable_scope("sysid"):
            if flavor in (PLAIN, EXTRA):
                self.traj2sysid = sysid_convnet(np_random, trajs, dim.sysid)
            elif flavor == EMBED:
            elif flavor == EMBED:
                self.traj2embed = sysid_convnet(np_random, trajs, dim.embed)

        EMBED_N_HID = 2
        EMBED_HID_SZ = 2 * dim.sysid

        # policy
        with tf.variable_scope("pol"):
            if flavor == BLIND:
                policy_input = obz
                self.sysid_err_supervised = tf.constant(0.0)
            elif flavor == PLAIN:
                self.sysid_err_supervised = tf.losses.mean_squared_error(
                    tf.stop_gradient(sysidz), self.traj2sysid)
                policy_input = tf.concat([obz, self.traj2sysid],
                                         axis=1) if test else obz_all
            elif flavor == EXTRA:
                sysid_processor_input = self.traj2sysid if test else sysidz
                sysid_processor = MLPModule(np_random, sysid_processor_input,
                                            EMBED_N_HID, EMBED_HID_SZ, 1.0,
                                            dim.embed, "sysid_processor")
                policy_input = tf.concat([obz, sysid_processor],
                                         axis=1,
                                         name="input_concat")
                self.sysid_err_supervised = tf.losses.mean_squared_error(
                    tf.stop_gradient(sysidz), self.traj2sysid)
            elif flavor == EMBED:
                self.embed = MLPModule(np_random, sysidz, EMBED_N_HID,
                                       EMBED_HID_SZ, 1.0, dim.embed, "embed")
                embed_input = self.traj2embed if test else self.embed
                policy_input = tf.concat([obz, embed_input],
                                         axis=1,
                                         name="input_concat")
                self.sysid_err_supervised = tf.losses.mean_squared_error(
                    tf.stop_gradient(self.embed), self.traj2embed)
                mean, var = tf.nn.moments(self.embed, 0)
                dist = tf.distributions.Normal(loc=mean, scale=tf.sqrt(var))
                std_dist = tf.distributions.Normal(loc=0.0, scale=1.0)
                embed_KL = tf.reduce_mean(
                    tf.distributions.kl_divergence(dist, std_dist))
                self.extra_rewards.append(-0.1 * embed_KL)
                self.extra_reward_names.append("neg_embed_KL")
            elif flavor == TRAJ:
                self.traj_conv = sysid_convnet(np_random, trajs, dim.embed)
                policy_input = tf.concat([obz, self.traj_conv],
                                         axis=1,
                                         name="input_concat")
                self.sysid_err_supervised = tf.constant(0.0)
            else:
                raise ValueError("flavor '{}' does not exist".format(flavor))

            # main policy MLP. outputs mean and logstd of stochastic Gaussian policy
            with tf.variable_scope("policy"):
                print("policy input dimensionality:",
                      policy_input.get_shape().as_list())
                mean = MLPModule(np_random, policy_input, n_hid, hid_size,
                                 0.01, dim.ac, "pol")
                logstd = tf.maximum(
                    tf.get_variable(name="logstd",
                                    shape=[1, dim.ac],
                                    initializer=tf.constant_initializer(-0.3)),
                    -1.0)

            with tf.variable_scope("policy_to_gaussian"):
                pdparam = tf.concat([mean, mean * 0.0 + logstd], 1)
                self.pdtype = DiagGaussianPdType(dim.ac)
                self.pd = self.pdtype.pdfromflat(pdparam)

        # value function
        with tf.variable_scope("vf"):
            self.vpred = MLPModule(np_random, tf.stop_gradient(policy_input),
                                   n_hid, hid_size, 0.1, 1, "vf")[:, 0]

        # switch between stochastic and deterministic policy
        with tf.variable_scope("stochastic_switch"):
            self.stochastic = tf.placeholder(dtype=tf.bool,
                                             shape=(),
                                             name="stochastic")
            self.ac = U.switch(self.stochastic, self.pd.sample(),
                               self.pd.mode())

        # function we'll call when interacting with environment
        self._act = U.function([self.stochastic, self.ob],
                               [self.ac, self.vpred])

        # for test time, the trajectory is fed in
        self._act_traj = U.function(
            [self.stochastic, self.ob, self.ob_traj, self.ac_traj],
            [self.ac, self.vpred])
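
The two U.function handles created at the end are the policy's runtime interface. As a small hedged sketch (assuming this _init belongs to a policy object `pi` and that `ob` is the concatenated [ob, sysid] vector of length dim.ob_concat; the helper name is hypothetical), acting for a single timestep would look like:

import numpy as np


def act_once(pi, ob, stochastic=True):
    # Hypothetical usage of pi._act built above: it takes the stochastic flag
    # and a batch of observations, and returns a batch of actions and value
    # predictions; indexing [0] strips the batch dimension again.
    ac, vpred = pi._act(stochastic, ob[None])
    return ac[0], vpred[0]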
Example #5
class TransitionClassifier(object):
    def __init__(self,
                 ob_size,
                 ac_size,
                 hidden_size=100,
                 log_reward=False,
                 entcoeff=0.001,
                 scope="adversary",
                 dyn_norm=True):
        self.scope = scope
        self.ob_size = ob_size
        self.ac_size = ac_size
        # self.input_size = ob_size + ac_size
        self.hidden_size = hidden_size
        self.log_reward = log_reward
        self.dyn_norm = dyn_norm
        self.build_ph()
        # Build graph
        generator_logits = self.build_graph(self.generator_obs_ph,
                                            self.generator_acs_ph)
        expert_logits = self.build_graph(self.expert_obs_ph,
                                         self.expert_acs_ph)
        # Build accuracy
        generator_acc = tf.reduce_mean(
            tf.cast(tf.nn.sigmoid(generator_logits) < 0.5, tf.float32))
        expert_acc = tf.reduce_mean(
            tf.cast(tf.nn.sigmoid(expert_logits) > 0.5, tf.float32))

        # Build regression loss
        # let x = logits, z = targets.
        # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
        generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=generator_logits, labels=tf.zeros_like(generator_logits))
        generator_loss = tf.reduce_mean(generator_loss)
        expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=expert_logits, labels=tf.ones_like(expert_logits))
        expert_loss = tf.reduce_mean(expert_loss)
        # Build entropy loss
        logits = tf.concat([generator_logits, expert_logits], 0)
        entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
        entropy_loss = -entcoeff * entropy
        # Loss + Accuracy terms
        self.losses = [
            generator_loss, expert_loss, entropy, entropy_loss, generator_acc,
            expert_acc
        ]
        self.loss_name = [
            "generator_loss", "expert_loss", "entropy", "entropy_loss",
            "generator_acc", "expert_acc"
        ]
        self.total_loss = generator_loss + expert_loss + entropy_loss
        # Build Reward for policy
        if log_reward:
            reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8)
        else:
            reward_op = tf.nn.sigmoid(generator_logits)

        self.reward = U.function(
            [self.generator_obs_ph, self.generator_acs_ph], reward_op)

        lr = tf.placeholder(tf.float32, None)
        self.trainer = tf.train.AdamOptimizer(learning_rate=lr)
        gvs = self.trainer.compute_gradients(self.total_loss,
                                             self.get_trainable_variables())
        self._train = U.function([
            self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph,
            self.expert_acs_ph, lr
        ],
                                 self.losses,
                                 updates=[self.trainer.apply_gradients(gvs)])

    def build_ph(self):
        self.generator_obs_ph = tf.placeholder(tf.float32,
                                               (None, self.ob_size),
                                               name="observations_ph")
        self.generator_acs_ph = tf.placeholder(tf.float32,
                                               (None, self.ac_size),
                                               name="actions_ph")
        self.expert_obs_ph = tf.placeholder(tf.float32, (None, self.ob_size),
                                            name="expert_observations_ph")
        self.expert_acs_ph = tf.placeholder(tf.float32, (None, self.ac_size),
                                            name="expert_actions_ph")

    def build_graph(self, obs_ph, acs_ph):
        with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
            with tf.variable_scope("obfilter"):
                self.obs_rms = RunningMeanStd(shape=[self.ob_size])
            obs = normalize(obs_ph, self.obs_rms)
            _input = tf.concat(
                [obs, acs_ph],
                axis=1)  # concatenate observation and action to form a transition
            p_h1 = tf.contrib.layers.fully_connected(_input,
                                                     self.hidden_size,
                                                     activation_fn=tf.nn.tanh)
            p_h2 = tf.contrib.layers.fully_connected(p_h1,
                                                     self.hidden_size,
                                                     activation_fn=tf.nn.tanh)
            logits = tf.contrib.layers.fully_connected(p_h2,
                                                       1,
                                                       activation_fn=None)
        return logits

    def get_trainable_variables(self):
        return tf.trainable_variables(self.scope)

    def get_reward(self, obs, acs):
        return np.squeeze(self.reward(obs, acs))

    def build_reward_op(self, obs_ph, acs_ph):
        logits = self.build_graph(obs_ph, acs_ph)
        if self.log_reward:
            return -tf.log(1 - tf.nn.sigmoid(logits) + 1e-8)
        return tf.nn.sigmoid(logits)

    def set_expert_data(self, data):
        self.data = Dataset(data, deterministic=False)

    def train(self, rl_ob, rl_ac, steps=1, lr=3e-4):
        n = rl_ob.shape[0]
        loss_buf = []
        batch_size = rl_ob.shape[0] // steps
        for batch in iterbatches([rl_ob, rl_ac],
                                 include_final_partial_batch=False,
                                 batch_size=batch_size):
            exp_ob, exp_ac = self.data.next_batch(batch_size)
            if self.obs_rms and self.dyn_norm:
                self.obs_rms.update(np.concatenate([exp_ob, rl_ob], axis=0))
            loss_buf.append(self._train(*batch, exp_ob, exp_ac, lr))
        logger.info(fmt_row(13, self.loss_name))
        logger.info(fmt_row(13, np.mean(loss_buf, axis=0)))
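
As a rough usage sketch for the classifier above (the exact structure Dataset expects for the expert data is not shown, so `expert_data` below is an assumption, as is the helper name), a GAIL-style loop would alternate discriminator updates with policy updates that consume its reward:

import numpy as np


def discriminator_step(disc, expert_data, rl_ob, rl_ac, lr=3e-4):
    # Hypothetical driver for TransitionClassifier: refresh the expert dataset,
    # take one discriminator update on the freshly collected rollout, then
    # return rewards for the policy update (sigmoid(D) or -log(1 - D),
    # depending on the log_reward flag passed to the constructor).
    disc.set_expert_data(expert_data)
    disc.train(rl_ob, rl_ac, steps=1, lr=lr)
    return disc.get_reward(rl_ob, rl_ac)

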
class MADDPG(object):
    def __init__(self,
                 name,
                 actor,
                 critic,
                 memory,
                 obs_space_n,
                 act_space_n,
                 agent_index,
                 obs_rms,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.):
        self.name = name
        self.num_agents = len(obs_space_n)
        self.agent_index = agent_index

        from gym import spaces
        continuous_ctrl = not isinstance(act_space_n[0], spaces.Discrete)
        # TODO: remove after testing
        assert continuous_ctrl

        # Multi-agent inputs
        # self.obs0 = []
        # self.obs1 = []
        self.actions = []
        # self.norm_obs0_ph = []
        # self.norm_obs1_ph = []

        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(
                                       self.num_agents,
                                       None,
                                   ) + obs_space_n[self.agent_index].shape,
                                   name="obs0")
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(
                                       self.num_agents,
                                       None,
                                   ) + obs_space_n[self.agent_index].shape,
                                   name="obs1")

        # if continuous_ctrl:
        #     self.actions = tf.placeholder(tf.float32, shape=(self.num_agents, None,) + act_space_n[self.agent_index].shape, name="action")
        # else:
        #     act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        #     self.actions = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]

        # this is required to reshape obs and actions for concatenation
        obs_shape_list = [self.num_agents] + list(
            obs_space_n[self.agent_index].shape)
        act_shape_list = [self.num_agents] + list(
            act_space_n[self.agent_index].shape)
        self.obs_shape_prod = np.prod(obs_shape_list)
        self.act_shape_prod = np.prod(act_shape_list)

        for i in range(self.num_agents):
            # each obs in obs0,obs1 contains info about ego agent and relative pos/vel of other agents
            # self.obs0.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="obs0_"+str(i)))
            # self.obs1.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="obs1_"+str(i)))

            if continuous_ctrl:
                self.actions.append(
                    tf.placeholder(tf.float32,
                                   shape=[None] + list(act_space_n[i].shape),
                                   name="action" + str(i)))
            else:
                self.actions.append(
                    make_pdtype(act_space_n[i]).sample_placeholder(
                        [None], name="action" + str(i)))

            # self.norm_obs0_ph.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="norm_obs0_"+str(i)))
            # self.norm_obs1_ph.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="norm_obs1_"+str(i)))

        # self.norm_obs0_ph = tf.placeholder(tf.float32, shape=[self.num_agents, None] + list(obs_space_n[self.agent_index].shape), name="norm_obs0")
        # self.norm_obs1_ph = tf.placeholder(tf.float32, shape=[self.num_agents, None] + list(obs_space_n[self.agent_index].shape), name="norm_obs1")

        # we only provide single agent inputs for these placeholders
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')

        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        # TODO: need to update the replay buffer storage function to account for multiple agents
        if self.normalize_observations:
            self.obs_rms = obs_rms
        else:
            self.obs_rms = None

        # Need to transpose observations so we can normalize them
        # converts tensor to shape (batch_size, num_agents, space_size)
        # transpose dims 0 and 1, leave dim 2 unchanged
        obs0_t = tf.transpose(self.obs0, perm=[1, 0, 2])
        obs1_t = tf.transpose(self.obs1, perm=[1, 0, 2])
        actions_t = tf.transpose(self.actions, perm=[1, 0, 2])

        # each entry in obs_t is normalized wrt the agent
        normalized_obs0 = tf.clip_by_value(normalize(obs0_t, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(obs1_t, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # convert the obs to original shape after normalization for convenience
        normalized_act_obs0 = tf.transpose(normalized_obs0, perm=[1, 0, 2])
        normalized_act_obs1 = tf.transpose(normalized_obs1, perm=[1, 0, 2])

        # need to specify the exact shape, since we don't always pass a full batch of obs/actions
        normalized_obs0_flat = tf.reshape(normalized_obs0,
                                          [-1, self.obs_shape_prod])
        normalized_obs1_flat = tf.reshape(normalized_obs1,
                                          [-1, self.obs_shape_prod])
        actions_t_flat = tf.reshape(actions_t, [-1, self.act_shape_prod])

        # Return normalization.
        # TODO: update this to handle multiple agents if required
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        # Each agent gets its own observation
        self.actor_tf = actor(normalized_act_obs0[self.agent_index])
        self.target_actor_tf = target_actor(
            normalized_act_obs1[self.agent_index])

        # Critic gets all observations
        self.normalized_critic_tf = critic(normalized_obs0_flat,
                                           actions_t_flat)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)

        # need to provide critic() with all agents' actions
        act_input_n = list(self.actions)  # copy the action placeholder list
        # replace this agent's action with its actor output
        act_input_n[self.agent_index] = self.actor_tf
        act_input_n_t = tf.transpose(act_input_n, perm=[1, 0, 2])
        act_input_n_t_flat = tf.reshape(act_input_n_t,
                                        [-1, self.act_shape_prod])
        self.normalized_critic_with_actor_tf = critic(normalized_obs0_flat,
                                                      act_input_n_t_flat,
                                                      reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)

        # the target critic also needs actions for all agents
        target_act_input_n = list(self.actions)  # copy the action placeholder list
        # replace this agent's action with its target-actor output
        target_act_input_n[self.agent_index] = self.target_actor_tf
        target_act_input_n_t = tf.transpose(target_act_input_n, perm=[1, 0, 2])
        target_act_input_n_t_flat = tf.reshape(target_act_input_n_t,
                                               [-1, self.act_shape_prod])
        Q_obs1 = denormalize(
            target_critic(normalized_obs1_flat, target_act_input_n_t_flat),
            self.ret_rms)
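        # Same 1-step TD target as single-agent DDPG, but Q' comes from the
        # centralized target critic fed all agents' next observations and the
        # joint action (this agent's target-actor output plus the other agents'
        # replayed actions).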
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            # param noise is added to actor; hence obs for current agent is required
            self.setup_param_noise(normalized_act_obs0[self.agent_index])
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure a separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if var.name.endswith('/w:0') and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    # TODO: need to provide all observations to compute q
    def step(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        # feed_dict={ph: [data] for ph, data in zip(self.obs0, obs)}
        # feed_dict = {self.obs0: [obs]}

        # Get the normalized obs first
        # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict)
        # use the normalized obs for training
        # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)}

        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action[0].shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action[0], q, None, None

    # TODO: test this
    # Computing this every time step may slow things down
    def get_q_value(self, obs_n, act_n):
        # assumes the q value is computed for a single state, hence the [] around each entry
        feed_dict = {ph: [data] for ph, data in zip(self.obs0, obs_n)}
        act_dict = {ph: [data] for ph, data in zip(self.actions, act_n)}
        feed_dict.update(act_dict)
        q = self.sess.run(self.critic_with_actor_tf, feed_dict=feed_dict)
        return q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        # print(action)
        B = obs0.shape[0]
        a_idx = self.agent_index
        for b in range(B):
            self.memory.append(obs0[b][a_idx], action[b][a_idx],
                               reward[b][a_idx], obs1[b][a_idx],
                               terminal1[b][a_idx])

            # NOTE: calling update for each agent is ok, since the mean and std are unaffected;
            # the same obs are repeated num_agents times, which doesn't change either statistic.
            if self.normalize_observations:
                # provide full obs for obs_rms update
                obs0_shape = (len(obs0[b]), ) + obs0[b][a_idx].shape
                assert obs0_shape == (self.num_agents, ) + obs0[b][a_idx].shape
                self.obs_rms.update(np.array([obs0[b]]))

    # TODO: not using this right now
    def update_obs_rms(self, obs0):
        if not self.normalize_observations:
            return
        B = obs0.shape[0]
        for b in range(B):
            # provide full obs for obs_rms update
            self.obs_rms.update(np.array([obs0[b]]))
        return

    def train(self, agents):
        # generate indices to access batches from all agents
        replay_sample_index = self.memory.generate_index(self.batch_size)

        # collect replay sample from all agents
        obs0_n = []
        obs1_n = []
        rewards_n = []
        act_n = []
        terminals1_n = []
        for i in range(self.num_agents):
            # Get a batch.
            batch = agents[i].memory.sample(batch_size=self.batch_size,
                                            index=replay_sample_index)
            obs0_n.append(batch['obs0'])
            obs1_n.append(batch['obs1'])
            act_n.append(batch['actions'])
            # rewards_n.append(batch['rewards'])
            # terminals1_n.append(batch['terminals1'])
        batch = self.memory.sample(batch_size=self.batch_size,
                                   index=replay_sample_index)

        # fill placeholders in obs1 with corresponding obs from each agent's replay buffer
        # self.obs1 and obs1_n are lists of size num_agents
        # feed_dict={ph: data for ph, data in zip(self.obs1, obs1_n)}
        feed_dict = {self.obs1: obs1_n}

        # TODO: find a better way to do this
        # Get the normalized obs first
        # norm_obs1 = self.sess.run(self.norm_obs1, feed_dict=feed_dict)
        # use the normalized obs for training
        # feed_dict = {self.norm_obs1_ph: norm_obs1}
        # feed_dict = {ph: data for ph, data in zip(self.norm_obs1_ph, norm_obs1)}

        # actions required for critic
        act_dict = {ph: data for ph, data in zip(self.actions, act_n)}
        feed_dict.update(act_dict)
        feed_dict.update({self.rewards: batch['rewards']})
        feed_dict.update(
            {self.terminals1: batch['terminals1'].astype('float32')})

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict=feed_dict)
            # old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })

            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })
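            # PopArt (https://arxiv.org/abs/1602.07714): once the return statistics
            # move, the critic's output layer is rescaled with
            #   W <- W * old_std / new_std
            #   b <- (b * old_std + old_mean - new_mean) / new_std
            # so the denormalized Q values it produces are left unchanged.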

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict=feed_dict)
            # target_Q = self.sess.run(self.target_Q, feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]

        # generate feed_dict for multiple observations and actions
        # feed_dict={ph: data for ph, data in zip(self.obs0, obs0_n)}
        feed_dict = {self.obs0: obs0_n}

        # Get the normalized obs first
        # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict)
        # use the normalized obs for training
        # feed_dict = {self.norm_obs0_ph: norm_obs0}
        # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)}

        # act_dict={ph: data for ph, data in zip(self.actions, act_n)}
        feed_dict.update(act_dict)
        feed_dict.update({self.critic_target: target_Q})

        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops, feed_dict=feed_dict)
        # actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
        #     self.obs0: batch['obs0'],
        #     self.actions: batch['actions'],
        #     self.critic_target: target_Q,
        # })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess

    def agent_initialize(self, sess):
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)
        # setup saving and loading functions
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self, agents):
        if self.stats_sample is None:
            replay_sample_index = self.memory.generate_index(self.batch_size)
            # collect replay sample from all agents
            obs0_n, act_n = [], []
            for i in range(self.num_agents):
                batch = agents[i].memory.sample(batch_size=self.batch_size,
                                                index=replay_sample_index)
                obs0_n.append(batch['obs0'])
                act_n.append(batch['actions'])
            # generate feed_dict for multiple observations and actions
            # feed_dict={ph: data for ph, data in zip(self.obs0, obs0_n)}
            feed_dict = {self.obs0: obs0_n}

            # Get the normalized obs first
            # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict)
            # use the normalized obs for training
            # feed_dict = {self.norm_obs0_ph: norm_obs0}
            # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)}

            actions_dict = {ph: data for ph, data in zip(self.actions, act_n)}
            feed_dict.update(actions_dict)

            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = feed_dict
        values = self.sess.run(self.stats_ops, feed_dict=self.stats_sample)

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self, agents):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        replay_sample_index = self.memory.generate_index(self.batch_size)
        obs0_n = []
        for i in range(self.num_agents):
            batch = agents[i].memory.sample(batch_size=self.batch_size,
                                            index=replay_sample_index)
            obs0_n.append(batch['obs0'])
        # feed_dict={ph: data for ph, data in zip(self.obs0, obs0_n)}
        feed_dict = {self.obs0: obs0_n}

        # Get the normalized obs first
        # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict)
        # use the normalized obs for training
        # feed_dict = {self.norm_obs0_ph: norm_obs0}
        # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)}

        feed_dict.update(
            {self.param_noise_stddev: self.param_noise.current_stddev})

        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict=feed_dict)
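        # 'distance' is the RMS difference between unperturbed and perturbed
        # actions on this batch; param_noise.adapt() below nudges the
        # perturbation stddev so that this distance tracks the desired
        # action-space noise level.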
        # distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
        #     self.obs0: batch['obs0'],
        #     self.param_noise_stddev: self.param_noise.current_stddev,
        # })

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(
                distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        else:
            mean_distance = distance

        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers,
              gaussian_fixed_var=True, obs_name='ob', obrms=True,
              final_std=0.01, init_logstd=0.0, observation_permutation=None,
              action_permutation=None, soft_mirror=False):
        assert isinstance(ob_space, gym.spaces.Box)

        obs_perm_mat = np.zeros(
            (len(observation_permutation), len(observation_permutation)),
            dtype=np.float32)
        self.obs_perm_mat = obs_perm_mat
        for i, perm in enumerate(observation_permutation):
            obs_perm_mat[i][int(np.abs(perm))] = np.sign(perm)
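        # Each entry maps output row i to source column |perm| with sign(perm),
        # e.g. perm = -2 negates the mirrored entry. Note that np.sign(0) == 0,
        # so a permutation entry of exactly 0 would zero out its row; a small
        # positive value is typically used to mean "keep index 0 as-is".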

        if isinstance(ac_space, gym.spaces.Box):
            act_perm_mat = np.zeros(
                (len(action_permutation), len(action_permutation)),
                dtype=np.float32)
            self.act_perm_mat = act_perm_mat
            for i, perm in enumerate(action_permutation):
                self.act_perm_mat[i][int(np.abs(perm))] = np.sign(perm)
        elif isinstance(ac_space, gym.spaces.MultiDiscrete):
            total_dim = int(np.sum(ac_space.nvec))
            dim_index = np.concatenate([[0], np.cumsum(ac_space.nvec)])
            act_perm_mat = np.zeros((total_dim, total_dim), dtype=np.float32)
            self.act_perm_mat = act_perm_mat
            for i, perm in enumerate(action_permutation):
                perm_mat = np.identity(ac_space.nvec[i])
                if np.sign(perm) < 0:
                    perm_mat = np.flipud(perm_mat)
                # place the (possibly flipped) block for every permutation entry,
                # not only the mirrored ones
                src = int(np.abs(perm))
                self.act_perm_mat[dim_index[i]:dim_index[i] + ac_space.nvec[i],
                                  dim_index[src]:dim_index[src] +
                                  ac_space.nvec[src]] = perm_mat

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None
        print(self.pdtype)
        print([sequence_length] + list(ob_space.shape))
        ob = U.get_placeholder(name=obs_name,
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        mirror_ob = tf.matmul(ob, obs_perm_mat)
        mirror_obz = tf.clip_by_value(
            (mirror_ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        if not obrms:
            obz = ob
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "vffc%i" % (i + 1),
                      weight_init=U.normc_initializer(1.0)))
        self.vpred = dense(last_out,
                           1,
                           "vffinal",
                           weight_init=U.normc_initializer(1.0))[:, 0]

        if isinstance(ac_space, gym.spaces.Box):
            pol_net = GenericFF('pol_net', ob_space.shape[0], [],
                                pdtype.param_shape()[0] // 2, hid_size,
                                num_hid_layers)
        elif isinstance(ac_space, gym.spaces.MultiDiscrete):
            pol_net = GenericFF('pol_net', ob_space.shape[0], [],
                                pdtype.param_shape()[0], hid_size,
                                num_hid_layers)

        orig_out = pol_net.get_output_tensor(obz, None, tf.nn.tanh)
        mirr_out = tf.matmul(
            pol_net.get_output_tensor(mirror_obz, None, tf.nn.tanh),
            act_perm_mat)
        if not soft_mirror:
            mean = orig_out + mirr_out
        else:
            mean = orig_out
            self.additional_loss = tf.reduce_mean(
                tf.abs(orig_out - mirr_out)) * 1.0

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.constant_initializer(init_logstd))
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = mean

        self.pd = pdtype.pdfromflat(pdparam)
        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        self._act = U.function([stochastic, ob], [ac, self.vpred])
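
The policy above enforces mirror symmetry: with hard mirroring the action mean is f(s) + M_a f(M_o s), while soft_mirror keeps f(s) and instead penalizes |f(s) - M_a f(M_o s)| through additional_loss. A minimal NumPy sketch of the hard-mirror composition (the function and argument names are illustrative, not from the source):

import numpy as np

def mirrored_mean(policy_fn, obs, obs_perm_mat, act_perm_mat):
    """Symmetrized policy mean: policy_fn(s) + M_a * policy_fn(M_o * s)."""
    orig = policy_fn(obs)                                    # unmirrored pass
    mirrored = policy_fn(obs @ obs_perm_mat) @ act_perm_mat  # mirror obs, map actions back
    return orig + mirrored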

    def __init__(self, actor, critic, experts, obs_dim, memory, observation_shape, action_shape,
        expert_is_np=False,
        param_noise=None, action_noise=None,
        gamma=0.95, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
        batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
        critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = copy(actor)
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.experts = experts
        self.obs_dim = obs_dim
        # self.critic_obs0 = self.experts[0].obs0
        # self.critic_obs1 = self.experts[0].obs1
        # self.critic_actor = self.experts[0].use_tf_actor

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(self.actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        expert0_normalize_obs0 = [tf.clip_by_value(normalize(self.obs0[:, :self.obs_dim], self.experts[i].obs_rms),
            self.observation_range[0], self.observation_range[1]) for i in range(len(self.experts))]
        expert_qv0 = tf.squeeze(tf.stack([experts[i].critic(expert0_normalize_obs0[i], self.actions)\
         for i in range(len(self.experts))]), axis=2)
        # expert_qv0 = tf.Print(expert_qv0, [expert_qv0], '>>>> qv0 :', summarize=10)
        expert_qv0 = tf.reduce_sum(self.obs0[:, self.obs_dim:] * tf.transpose(expert_qv0), axis=1)

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = self.actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions, tf.stop_gradient(expert_qv0))
        self.critic_tf = tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1])

        expert_qv0_with_actor_tf = tf.squeeze(tf.stack([experts[i].critic(expert0_normalize_obs0[i], self.actor_tf) for i in range(len(self.experts))]),
            axis=2)
        expert_qv0_with_actor_tf = tf.reduce_sum(self.obs0[:, self.obs_dim:] * tf.transpose(expert_qv0_with_actor_tf), axis=1)

        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, tf.stop_gradient(expert_qv0_with_actor_tf))
        self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)

        action1 = target_actor(normalized_obs1)
        expert0_normalize_obs1 = [tf.clip_by_value(normalize(self.obs1[:, :self.obs_dim], self.experts[i].obs_rms),
            self.observation_range[0], self.observation_range[1]) for i in range(len(self.experts))]
        expert_qv1 = tf.squeeze(tf.stack([(experts[i].critic(expert0_normalize_obs1[i], action1)) for i in range(len(self.experts))]), axis=2)
        expert_qv1 = tf.reduce_sum(self.obs1[:, self.obs_dim:] * tf.transpose(expert_qv1), axis=1)

        self.Q_obs1 = target_critic(normalized_obs1, action1, tf.stop_gradient(expert_qv1))
        # self.Q_obs1 = tf.Print(self.Q_obs1, [self.Q_obs1], '>>>> Q :', summarize=10)
        # self.terminals1 = tf.Print(self.terminals1, [self.terminals1], '>>>> terminal :', summarize=10)

        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * self.Q_obs1

        self.expert_qv1 = expert_qv1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)

        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None # recurrent architectures not supported yet
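
The constructor above blends frozen expert critics into one scalar: the first obs_dim entries of the observation go through every expert's critic, and the trailing entries of obs0 act as per-expert mixing weights. A hedged NumPy sketch of that reduction (the helper names are illustrative, not from the source):

import numpy as np

def mix_expert_q(obs, actions, expert_critics, obs_dim):
    """Weighted sum of expert Q values, mirroring the tf.reduce_sum above."""
    core_obs = obs[:, :obs_dim]        # the part every expert critic consumes
    weights = obs[:, obs_dim:]         # (batch, n_experts) mixing weights
    q_values = np.stack([q(core_obs, actions) for q in expert_critics], axis=1)
    return np.sum(weights * q_values, axis=1)    # shape (batch,)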
Example #9
class MlpPolicy(object):
    recurrent = False

    def __init__(self, name, *args, **kwargs):
        self.scope = name
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            self._init(*args, **kwargs)

    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=False,
              popart=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[None] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope("popart"):
            self.v_rms = RunningMeanStd(shape=[1])

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "vffc%i" % (i + 1),
                      weight_init=U.normc_initializer(1.0)))
        self.norm_vpred = dense(last_out,
                                1,
                                "vffinal",
                                weight_init=U.normc_initializer(1.0))[:, 0]
        if popart:
            self.vpred = denormalize(self.norm_vpred, self.v_rms)
        else:
            self.vpred = self.norm_vpred

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "polfc%i" % (i + 1),
                      weight_init=U.normc_initializer(1.0)))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense(last_out,
                         pdtype.param_shape()[0] // 2, "polfinal",
                         U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = dense(last_out,
                            pdtype.param_shape()[0], "polfinal",
                            U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # change for BC
        stochastic = U.get_placeholder(name="stochastic",
                                       dtype=tf.bool,
                                       shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = ac
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        self.use_popart = popart
        if popart:
            self.init_popart()

        ret = tf.placeholder(tf.float32, [None])
        vferr = tf.reduce_mean(tf.square(self.vpred - ret))
        self.vlossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr,
                                                  self.get_vf_variable()))

    def init_popart(self):
        old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.v_rms.std
        old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.v_rms.mean

        renormalize_Q_outputs_op = []
        vs = self.output_vars
        M, b = vs
        renormalize_Q_outputs_op += [M.assign(M * old_std / new_std)]
        renormalize_Q_outputs_op += [
            b.assign((b * old_std + old_mean - new_mean) / new_std)
        ]
        self.renorm_v = U.function([old_std, old_mean], [],
                                   updates=renormalize_Q_outputs_op)

    def act(self, stochastic, ob):
        ac1, vpred1 = self._act(stochastic, ob[None])
        return ac1[0], vpred1[0]

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.trainable_variables(self.scope)

    def get_initial_state(self):
        return []

    def get_vf_variable(self):
        return tf.trainable_variables(self.scope + "/vf")

    def update_popart(self, v_targets):
        old_mean, old_std = U.get_session().run(
            [self.v_rms.mean, self.v_rms.std])
        self.v_rms.update(v_targets)
        self.renorm_v(old_std, old_mean)

    @property
    def output_vars(self):
        output_vars = [
            var for var in self.get_vf_variable() if 'vffinal' in var.name
        ]
        return output_vars

    def save_policy(self, name):
        U.save_variables(name, variables=self.get_variables())

    def load_policy(self, name):
        U.load_variables(name, variables=self.get_variables())
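
A hedged usage sketch for the PopArt value head above: update_popart refits v_rms and rescales the vffinal layer so the denormalized vpred is preserved, and vlossandgrad then gives the flat gradient of the value loss (pi, obs_batch and returns are assumptions supplied by the caller's rollouts):

v_targets = returns.reshape(-1, 1).astype('float32')
pi.update_popart(v_targets)                    # refit v_rms, rescale the vffinal head
vf_grad = pi.vlossandgrad(obs_batch, returns)  # flat gradient of (vpred - ret)^2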
Example #10
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.):

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.observation_shape = observation_shape
        self.critic = critic
        self.actor = actor
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.actor_lr = tf.constant(actor_lr)
        self.critic_lr = tf.constant(critic_lr)

        # Observation normalization.
        if self.normalize_observations:
            with tf.name_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        # Return normalization.
        if self.normalize_returns:
            with tf.name_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        self.target_critic = Critic(actor.nb_actions,
                                    observation_shape,
                                    name='target_critic',
                                    network=critic.network,
                                    **critic.network_kwargs)
        self.target_actor = Actor(actor.nb_actions,
                                  observation_shape,
                                  name='target_actor',
                                  network=actor.network,
                                  **actor.network_kwargs)

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise()

        if MPI is not None:
            comm = MPI.COMM_WORLD
            self.actor_optimizer = MpiAdamOptimizer(
                comm, self.actor.trainable_variables)
            self.critic_optimizer = MpiAdamOptimizer(
                comm, self.critic.trainable_variables)
        else:
            self.actor_optimizer = tf.keras.optimizers.Adam(
                learning_rate=actor_lr)
            self.critic_optimizer = tf.keras.optimizers.Adam(
                learning_rate=critic_lr)

        logger.info('setting up actor optimizer')
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_variables
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        logger.info('setting up critic optimizer')
        critic_shapes = [
            var.get_shape().as_list()
            for var in self.critic.trainable_variables
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = []
            for layer in self.critic.network_builder.layers[1:]:
                critic_reg_vars.append(layer.kernel)
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))

        logger.info('setting up critic target updates ...')
        for var, target_var in zip(self.critic.variables,
                                   self.target_critic.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))
        logger.info('setting up actor target updates ...')
        for var, target_var in zip(self.actor.variables,
                                   self.target_actor.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))

        if self.param_noise:
            logger.info('setting up param noise')
            for var, perturbed_var in zip(self.actor.variables,
                                          self.perturbed_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(
                        perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name,
                                                    var.name))
            for var, perturbed_var in zip(
                    self.actor.variables,
                    self.perturbed_adaptive_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(
                        perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name,
                                                    var.name))

        if self.normalize_returns and self.enable_popart:
            self.setup_popart()

        self.initial_state = None  # recurrent architectures not supported yet
Example #11
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None # None: the observation placeholder accepts batches of any length

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        share_layer = obz

        for i in range(num_hid_layers):
            share_layer = tf.nn.tanh(tf.layers.dense(share_layer, hid_size, name="fc%i" % (i + 1),
                                                       kernel_initializer=U.normc_initializer(1.0)))

        self.lstm_input = tf.expand_dims(share_layer, [0])

        step_size = tf.shape(obz)[:1]

        # add the recurrent layers or combined with one Mlp
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(hid_size, state_is_tuple=True)
        c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
        h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
        self.lstm_state_init = [c_init, h_init]

        c_in = U.get_placeholder(name="cin", dtype=tf.float32, shape=[1, lstm_cell.state_size.c])
        h_in = U.get_placeholder(name="hin", dtype=tf.float32, shape=[1, lstm_cell.state_size.h])

        self.state_in = [c_in, h_in]
        lstm_state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)

        self.lstm_out1, lstm_state_out = tf.nn.dynamic_rnn(lstm_cell, self.lstm_input,
                                                           initial_state=lstm_state_in,
                                                           sequence_length=step_size,
                                                           time_major=False)

        lstm_c, lstm_h = lstm_state_out
        self.state_out = [lstm_c[:1, :], lstm_h[:1, :]]
        self.lstm_out = tf.reshape(self.lstm_out1, [-1, hid_size])

        with tf.variable_scope('vf'):

            self.last_out = self.lstm_out
            # for i in range(num_hid_layers):
            #     self.last_out = tf.nn.tanh(tf.layers.dense(self.last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(self.last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]

        with tf.variable_scope('pol'):
            self.last_out_pol = self.lstm_out
            # for i in range(num_hid_layers):
            #     self.last_out_pol = tf.nn.tanh(tf.layers.dense(self.last_out_pol, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(self.last_out_pol, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
                self.pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                self.pdparam = tf.layers.dense(self.last_out_pol, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(self.pdparam)

        # self.state_in = []
        # self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob, self.state_in[0], self.state_in[1]], [ac, self.vpred, self.state_out])
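
A hedged usage sketch for the recurrent policy above: the caller keeps the LSTM state between steps and feeds it back in on the next call (pi and obs are assumptions; _act was built with U.function over [stochastic, ob, c_in, h_in] just above):

state = pi.lstm_state_init                    # [c_init, h_init], each of shape (1, hid_size)
ac, vpred, state = pi._act(True, obs[None], state[0], state[1])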
Example #12
class DDPG(tf.Module):
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.):

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.observation_shape = observation_shape
        self.critic = critic
        self.actor = actor
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.actor_lr = tf.constant(actor_lr)
        self.critic_lr = tf.constant(critic_lr)

        # Observation normalization.
        if self.normalize_observations:
            with tf.name_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        # Return normalization.
        if self.normalize_returns:
            with tf.name_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        self.target_critic = Critic(actor.nb_actions,
                                    observation_shape,
                                    name='target_critic',
                                    network=critic.network,
                                    **critic.network_kwargs)
        self.target_actor = Actor(actor.nb_actions,
                                  observation_shape,
                                  name='target_actor',
                                  network=actor.network,
                                  **actor.network_kwargs)

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise()

        if MPI is not None:
            comm = MPI.COMM_WORLD
            self.actor_optimizer = MpiAdamOptimizer(
                comm, self.actor.trainable_variables)
            self.critic_optimizer = MpiAdamOptimizer(
                comm, self.critic.trainable_variables)
        else:
            self.actor_optimizer = tf.keras.optimizers.Adam(
                learning_rate=actor_lr)
            self.critic_optimizer = tf.keras.optimizers.Adam(
                learning_rate=critic_lr)

        logger.info('setting up actor optimizer')
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_variables
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        logger.info('setting up critic optimizer')
        critic_shapes = [
            var.get_shape().as_list()
            for var in self.critic.trainable_variables
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = []
            for layer in self.critic.network_builder.layers[1:]:
                critic_reg_vars.append(layer.kernel)
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))

        logger.info('setting up critic target updates ...')
        for var, target_var in zip(self.critic.variables,
                                   self.target_critic.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))
        logger.info('setting up actor target updates ...')
        for var, target_var in zip(self.actor.variables,
                                   self.target_actor.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))

        if self.param_noise:
            logger.info('setting up param noise')
            for var, perturbed_var in zip(self.actor.variables,
                                          self.perturbed_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(
                        perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name,
                                                    var.name))
            for var, perturbed_var in zip(
                    self.actor.variables,
                    self.perturbed_adaptive_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(
                        perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name,
                                                    var.name))

        if self.normalize_returns and self.enable_popart:
            self.setup_popart()

        self.initial_state = None  # recurrent architectures not supported yet

    def setup_param_noise(self):
        assert self.param_noise is not None

        # Configure perturbed actor.
        self.perturbed_actor = Actor(self.actor.nb_actions,
                                     self.observation_shape,
                                     name='param_noise_actor',
                                     network=self.actor.network,
                                     **self.actor.network_kwargs)

        # Configure separate copy for stddev adaptation.
        self.perturbed_adaptive_actor = Actor(
            self.actor.nb_actions,
            self.observation_shape,
            name='adaptive_param_noise_actor',
            network=self.actor.network,
            **self.actor.network_kwargs)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1

    @tf.function
    def step(self, obs, apply_noise=True, compute_Q=True):
        normalized_obs = tf.clip_by_value(normalize(obs, self.obs_rms),
                                          self.observation_range[0],
                                          self.observation_range[1])
        actor_tf = self.actor(normalized_obs)
        if self.param_noise is not None and apply_noise:
            action = self.perturbed_actor(normalized_obs)
        else:
            action = actor_tf

        if compute_Q:
            normalized_critic_with_actor_tf = self.critic(
                normalized_obs, actor_tf)
            q = denormalize(
                tf.clip_by_value(normalized_critic_with_actor_tf,
                                 self.return_range[0], self.return_range[1]),
                self.ret_rms)
        else:
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            action += noise
        action = tf.clip_by_value(action, self.action_range[0],
                                  self.action_range[1])

        return action, q, None, None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale

        B = obs0.shape[0]
        for b in range(B):
            self.memory.append(obs0[b], action[b], reward[b], obs1[b],
                               terminal1[b])
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def train(self):
        batch = self.memory.sample(batch_size=self.batch_size)
        obs0, obs1 = tf.constant(batch['obs0']), tf.constant(batch['obs1'])
        actions, rewards, terminals1 = tf.constant(
            batch['actions']), tf.constant(batch['rewards']), tf.constant(
                batch['terminals1'], dtype=tf.float32)
        normalized_obs0, target_Q = self.compute_normalized_obs0_and_target_Q(
            obs0, obs1, rewards, terminals1)

        if self.normalize_returns and self.enable_popart:
            old_mean = self.ret_rms.mean
            old_std = self.ret_rms.std
            self.ret_rms.update(target_Q.flatten())
            # renormalize Q outputs
            new_mean = self.ret_rms.mean
            new_std = self.ret_rms.std
            for vs in [
                    self.critic.output_vars, self.target_critic.output_vars
            ]:
                kernel, bias = vs
                kernel.assign(kernel * old_std / new_std)
                bias.assign((bias * old_std + old_mean - new_mean) / new_std)

        actor_grads, actor_loss = self.get_actor_grads(normalized_obs0)
        critic_grads, critic_loss = self.get_critic_grads(
            normalized_obs0, actions, target_Q)

        if MPI is not None:
            self.actor_optimizer.apply_gradients(actor_grads, self.actor_lr)
            self.critic_optimizer.apply_gradients(critic_grads, self.critic_lr)
        else:
            self.actor_optimizer.apply_gradients(
                zip(actor_grads, self.actor.trainable_variables))
            self.critic_optimizer.apply_gradients(
                zip(critic_grads, self.critic.trainable_variables))

        return critic_loss, actor_loss

    @tf.function
    def compute_normalized_obs0_and_target_Q(self, obs0, obs1, rewards,
                                             terminals1):
        normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        Q_obs1 = denormalize(
            self.target_critic(normalized_obs1,
                               self.target_actor(normalized_obs1)),
            self.ret_rms)
        target_Q = rewards + (1. - terminals1) * self.gamma * Q_obs1
        return normalized_obs0, target_Q

    @tf.function
    def get_actor_grads(self, normalized_obs0):
        with tf.GradientTape() as tape:
            actor_tf = self.actor(normalized_obs0)
            normalized_critic_with_actor_tf = self.critic(
                normalized_obs0, actor_tf)
            critic_with_actor_tf = denormalize(
                tf.clip_by_value(normalized_critic_with_actor_tf,
                                 self.return_range[0], self.return_range[1]),
                self.ret_rms)
            actor_loss = -tf.reduce_mean(critic_with_actor_tf)
        actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
        if self.clip_norm:
            actor_grads = [
                tf.clip_by_norm(grad, clip_norm=self.clip_norm)
                for grad in actor_grads
            ]
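        # MpiAdamOptimizer consumes a single flat gradient vector, so under MPI
        # the per-variable gradients are reshaped and concatenated before train()
        # passes them to apply_gradients.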
        if MPI is not None:
            actor_grads = tf.concat(
                [tf.reshape(g, (-1, )) for g in actor_grads], axis=0)
        return actor_grads, actor_loss

    @tf.function
    def get_critic_grads(self, normalized_obs0, actions, target_Q):
        with tf.GradientTape() as tape:
            normalized_critic_tf = self.critic(normalized_obs0, actions)
            normalized_critic_target_tf = tf.clip_by_value(
                normalize(target_Q, self.ret_rms), self.return_range[0],
                self.return_range[1])
            critic_loss = tf.reduce_mean(
                tf.square(normalized_critic_tf - normalized_critic_target_tf))
            if self.critic_l2_reg > 0.:
                # The first (input) layer is skipped.
                for layer in self.critic.network_builder.layers[1:]:
                    # The original l2_regularizer takes half of sum square.
                    critic_loss += (self.critic_l2_reg / 2.) * tf.reduce_sum(
                        tf.square(layer.kernel))
        critic_grads = tape.gradient(critic_loss,
                                     self.critic.trainable_variables)
        if self.clip_norm:
            critic_grads = [
                tf.clip_by_norm(grad, clip_norm=self.clip_norm)
                for grad in critic_grads
            ]
        if MPI is not None:
            critic_grads = tf.concat(
                [tf.reshape(g, (-1, )) for g in critic_grads], axis=0)
        return critic_grads, critic_loss

    def initialize(self):
        if MPI is not None:
            sync_from_root(self.actor.trainable_variables +
                           self.critic.trainable_variables)
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

    @tf.function
    def update_target_net(self):
        for var, target_var in zip(self.actor.variables,
                                   self.target_actor.variables):
            target_var.assign((1. - self.tau) * target_var + self.tau * var)
        for var, target_var in zip(self.critic.variables,
                                   self.target_critic.variables):
            target_var.assign((1. - self.tau) * target_var + self.tau * var)

    def get_stats(self):

        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        obs0 = self.stats_sample['obs0']
        actions = self.stats_sample['actions']
        normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_critic_tf = self.critic(normalized_obs0, actions)
        critic_tf = denormalize(
            tf.clip_by_value(normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        actor_tf = self.actor(normalized_obs0)
        normalized_critic_with_actor_tf = self.critic(normalized_obs0,
                                                      actor_tf)
        critic_with_actor_tf = denormalize(
            tf.clip_by_value(normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)

        stats = {}
        if self.normalize_returns:
            stats['ret_rms_mean'] = self.ret_rms.mean
            stats['ret_rms_std'] = self.ret_rms.std
        if self.normalize_observations:
            stats['obs_rms_mean'] = tf.reduce_mean(self.obs_rms.mean)
            stats['obs_rms_std'] = tf.reduce_mean(self.obs_rms.std)
        stats['reference_Q_mean'] = tf.reduce_mean(critic_tf)
        stats['reference_Q_std'] = reduce_std(critic_tf)
        stats['reference_actor_Q_mean'] = tf.reduce_mean(critic_with_actor_tf)
        stats['reference_actor_Q_std'] = reduce_std(critic_with_actor_tf)
        stats['reference_action_mean'] = tf.reduce_mean(actor_tf)
        stats['reference_action_std'] = reduce_std(actor_tf)

        if self.param_noise:
            perturbed_actor_tf = self.perturbed_actor(normalized_obs0)
            stats['reference_perturbed_action_mean'] = tf.reduce_mean(
                perturbed_actor_tf)
            stats['reference_perturbed_action_std'] = reduce_std(
                perturbed_actor_tf)
            stats.update(self.param_noise.get_stats())
        return stats

    def adapt_param_noise(self, obs0):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        if self.param_noise is None:
            return 0.

        mean_distance = self.get_mean_distance(obs0).numpy()

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(
                mean_distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()

        self.param_noise.adapt(mean_distance)
        return mean_distance

    @tf.function
    def get_mean_distance(self, obs0):
        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        update_perturbed_actor(self.actor, self.perturbed_adaptive_actor,
                               self.param_noise.current_stddev)

        normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        actor_tf = self.actor(normalized_obs0)
        adaptive_actor_tf = self.perturbed_adaptive_actor(normalized_obs0)
        mean_distance = tf.sqrt(
            tf.reduce_mean(tf.square(actor_tf - adaptive_actor_tf)))
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            update_perturbed_actor(self.actor, self.perturbed_actor,
                                   self.param_noise.current_stddev)
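
A hedged end-to-end sketch of driving the eager DDPG above (env, actor, critic, memory, nb_steps and nb_warmup are assumptions; the Actor/Critic/Memory constructors follow the baselines API, and np/tf are imported as in the examples):

agent = DDPG(actor, critic, memory,
             observation_shape=env.observation_space.shape,
             action_shape=env.action_space.shape)
agent.initialize()
obs = env.reset()
for t in range(nb_steps):
    action, q, _, _ = agent.step(tf.constant(obs[None], dtype=tf.float32))
    action = action.numpy()
    new_obs, reward, done, _ = env.step(action[0])
    agent.store_transition(obs[None], action, np.array([[reward]], dtype=np.float32),
                           new_obs[None], np.array([[done]], dtype=np.float32))
    obs = env.reset() if done else new_obs
    if done:
        agent.reset()
    if t >= nb_warmup:
        critic_loss, actor_loss = agent.train()
        agent.update_target_net()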
Example #13
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True,
                 adaptive_param_noise_policy_threshold=.1,
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.,
                 td3_variant=False,
                 td3_policy_freq=1,
                 td3_policy_noise=0.0,
                 td3_noise_clip=0.5):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Added: parameters for using the TD3 variant of DDPG
        # https://arxiv.org/abs/1802.09477
        self.td3_variant = td3_variant
        self.td3_policy_freq = td3_policy_freq
        self.td3_policy_noise = td3_policy_noise
        self.td3_noise_clip = td3_noise_clip

        # Note: borrow the normalization code from HER.
        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)

        if self.td3_variant:
            logger.info('using TD3 variant model')
            self.normalized_critic_tf, self.normalized_critic_tf2 = critic(
                normalized_obs0, self.actions)
            self.critic_tf = denormalize(
                tf.clip_by_value(self.normalized_critic_tf,
                                 self.return_range[0], self.return_range[1]),
                self.ret_rms)
            self.normalized_critic_with_actor_tf, _ = critic(normalized_obs0,
                                                             self.actor_tf,
                                                             reuse=True)
            self.critic_with_actor_tf = denormalize(
                tf.clip_by_value(self.normalized_critic_with_actor_tf,
                                 self.return_range[0], self.return_range[1]),
                self.ret_rms)
            out_q1, out_q2 = target_critic(normalized_obs1,
                                           target_actor(normalized_obs1))
            min_q1 = tf.minimum(out_q1, out_q2)  # take the smaller of the two Q estimates
            Q_obs1 = denormalize(min_q1, self.ret_rms)
            # Difference from vanilla DDPG: the critic is trained as two networks and the smaller Q value is used.
            # --> Both Q functions share a single target, computed from the smaller of the two Q estimates,
            # and both are then learned by regressing onto that target, which prevents overestimation of Q.
            # The result is denormalized with ret_rms and passed on as Q_obs1.
            # --> This follows the Clipped Double-Q Learning algorithm.
        else:
            self.normalized_critic_tf = critic(normalized_obs0, self.actions)
            self.critic_tf = denormalize(
                tf.clip_by_value(self.normalized_critic_tf,
                                 self.return_range[0], self.return_range[1]),
                self.ret_rms)
            self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                          self.actor_tf,
                                                          reuse=True)
            self.critic_with_actor_tf = denormalize(
                tf.clip_by_value(self.normalized_critic_with_actor_tf,
                                 self.return_range[0], self.return_range[1]),
                self.ret_rms)
            Q_obs1 = denormalize(
                target_critic(normalized_obs1, target_actor(normalized_obs1)),
                self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet
Example #14
class DDPG(object):
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True,
                 adaptive_param_noise_policy_threshold=.1,
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.,
                 td3_variant=False,
                 td3_policy_freq=1,
                 td3_policy_noise=0.0,
                 td3_noise_clip=0.5):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Added: parameters for using the TD3 variant of DDPG
        # https://arxiv.org/abs/1802.09477
        self.td3_variant = td3_variant
        self.td3_policy_freq = td3_policy_freq
        self.td3_policy_noise = td3_policy_noise
        self.td3_noise_clip = td3_noise_clip

        # Note: borrow the normalization code from HER.
        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)

        if self.td3_variant:
            logger.info('using TD3 variant model')
            self.normalized_critic_tf, self.normalized_critic_tf2 = critic(
                normalized_obs0, self.actions)
            self.critic_tf = denormalize(
                tf.clip_by_value(self.normalized_critic_tf,
                                 self.return_range[0], self.return_range[1]),
                self.ret_rms)
            self.normalized_critic_with_actor_tf, _ = critic(normalized_obs0,
                                                             self.actor_tf,
                                                             reuse=True)
            self.critic_with_actor_tf = denormalize(
                tf.clip_by_value(self.normalized_critic_with_actor_tf,
                                 self.return_range[0], self.return_range[1]),
                self.ret_rms)
            out_q1, out_q2 = target_critic(normalized_obs1,
                                           target_actor(normalized_obs1))
            min_q1 = tf.minimum(out_q1, out_q2)  # take the smaller of the two Q estimates
            Q_obs1 = denormalize(min_q1, self.ret_rms)
            # Difference from vanilla DDPG: the critic is trained as two networks and the smaller Q value is used.
            # --> Both Q functions share a single target, computed from the smaller of the two Q estimates,
            # and both are then learned by regressing onto that target, which prevents overestimation of Q.
            # The result is denormalized with ret_rms and passed on as Q_obs1.
            # --> This follows the Clipped Double-Q Learning algorithm.
        else:
            self.normalized_critic_tf = critic(normalized_obs0, self.actions)
            self.critic_tf = denormalize(
                tf.clip_by_value(self.normalized_critic_tf,
                                 self.return_range[0], self.return_range[1]),
                self.ret_rms)
            self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                          self.actor_tf,
                                                          reuse=True)
            self.critic_with_actor_tf = denormalize(
                tf.clip_by_value(self.normalized_critic_with_actor_tf,
                                 self.return_range[0], self.return_range[1]),
                self.ret_rms)
            Q_obs1 = denormalize(
                target_critic(normalized_obs1, target_actor(normalized_obs1)),
                self.ret_rms)
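        # One-step Bellman target: r + gamma * (1 - done) * Q_target(s', pi_target(s')).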
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        # self.target_soft_updates = [actor_soft_updates, critic_soft_updates] DDPG
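        # Keep the actor and critic soft updates separate so the actor/target updates can be delayed (TD3-style).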
        self.actor_target_soft_updates = actor_soft_updates
        self.critic_target_soft_updates = critic_soft_updates

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure a separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
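        # DPG actor objective: maximize Q(s, pi(s)) by minimizing the negative mean critic value.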
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        if self.td3_variant:
            logger.info('using TD3 variant loss')
            self.critic_loss = tf.losses.mean_squared_error(normalized_critic_target_tf, self.normalized_critic_tf) \
                               + tf.losses.mean_squared_error(normalized_critic_target_tf, self.normalized_critic_tf2)
        else:
            self.critic_loss = tf.reduce_mean(
                tf.square(self.normalized_critic_tf -
                          normalized_critic_target_tf))

        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
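        # Pop-Art rescales the critic's output layer so that denormalized Q values are preserved when ret_rms is updated.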
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def step(self, obs, apply_noise=True, compute_Q=True):
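        # Use the parameter-noise-perturbed actor for exploration when requested; otherwise use the plain actor.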

        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {
            self.obs0: U.adjust_shape(self.obs0, [obs])
        }  # feed the observation into obs0 only

        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None

        # Add noise to the action for exploration.
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action[0].shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale

        B = obs0.shape[0]
        for b in range(B):
            self.memory.append(obs0[b], action[b], reward[b], obs1[b],
                               terminal1[b])
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def train(self, train_iter):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q,
                                     feed_dict={
                                         self.obs1:
                                         batch['obs1'],
                                         self.rewards:
                                         batch['rewards'],
                                         self.terminals1:
                                         batch['terminals1'].astype('float32'),
                                     })

        if self.td3_policy_noise > 0:
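            # Perturb the sampled batch actions with clipped Gaussian noise before the update (TD3-style smoothing noise).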
            noise = np.random.normal(loc=0.0,
                                     scale=self.td3_policy_noise,
                                     size=np.shape(batch['actions']))
            noise = np.clip(noise, -self.td3_noise_clip, self.td3_noise_clip)
            # Get all gradients and perform a synced update.
            ops = [
                self.actor_grads, self.actor_loss, self.critic_grads,
                self.critic_loss
            ]
            actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
                ops,
                feed_dict={
                    self.obs0:
                    batch['obs0'],
                    self.actions:
                    np.clip(batch['actions'] + noise, self.action_range[0],
                            self.action_range[1]),
                    self.critic_target:
                    target_Q,
                })
        else:
            # Get all gradients and perform a synced update.
            ops = [
                self.actor_grads, self.actor_loss, self.critic_grads,
                self.critic_loss
            ]
            actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
                ops,
                feed_dict={
                    self.obs0: batch['obs0'],
                    self.actions: batch['actions'],
                    self.critic_target: target_Q,
                })

        # TD3 has a hyperparameter for how frequently the actor policy and target networks are updated.
        if train_iter % self.td3_policy_freq == 0:
            self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)

        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self, train_iter):
        # TD3 has a hyperparameter for how frequently the actor policy and target networks are updated.
        if train_iter % self.td3_policy_freq == 0:
            self.sess.run(self.actor_target_soft_updates)
            self.sess.run(self.critic_target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0: self.stats_sample['obs0'],
                                   self.actions: self.stats_sample['actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    # Why add noise to the parameters?
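    # Parameter-space noise perturbs the actor's weights directly for more consistent exploration; its stddev is
    # adapted so that the induced change in action space stays close to a target threshold.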
    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })

        mean_distance = MPI.COMM_WORLD.allreduce(
            distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })
Example #15
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
        gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
        batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
        adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1,
        critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()
Example #16
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)

        last_out = obz
        last_out = tf.nn.tanh(
            U.conv2d(last_out, 64, 'vfconv1', (7, 7), (3, 3), pad='VALID'))
        last_out = tf.nn.tanh(
            U.conv2d(last_out, 64, 'vfconv2', (5, 5), (2, 2), pad='VALID'))
        last_out = tf.nn.tanh(
            U.conv2d(last_out, 64, 'vfconv3', (3, 3), (1, 1), pad='VALID'))
        last_out = tf.nn.tanh(
            U.conv2d(last_out, 64, 'vfconv4', (3, 3), (1, 1), pad='VALID'))
        last_out = tf.reshape(last_out, tf.convert_to_tensor([-1, 784 * 4]))
        last_out = tf.nn.tanh(
            tf.layers.dense(last_out,
                            512,
                            kernel_initializer=U.normc_initializer(1.0)))

        self.vpred = tf.layers.dense(
            last_out, 1, kernel_initializer=U.normc_initializer(1.0))[:, 0]

        last_out = obz
        last_out = tf.nn.tanh(
            U.conv2d(last_out, 64, 'polconv1', (7, 7), (3, 3), pad='VALID'))
        last_out = tf.nn.tanh(
            U.conv2d(last_out, 64, 'polconv2', (5, 5), (2, 2), pad='VALID'))
        last_out = tf.nn.tanh(
            U.conv2d(last_out, 64, 'polconv3', (3, 3), (1, 1), pad='VALID'))
        last_out = tf.nn.tanh(
            U.conv2d(last_out, 64, 'polconv4', (3, 3), (1, 1), pad='VALID'))
        last_out = tf.reshape(last_out, tf.convert_to_tensor([-1, 784 * 4]))
        last_out = tf.nn.tanh(
            tf.layers.dense(last_out,
                            512,
                            kernel_initializer=U.normc_initializer(1.0)))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(
                last_out,
                pdtype.param_shape()[0] // 2,
                kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(
                last_out,
                pdtype.param_shape()[0],
                kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # change for BC
        stochastic = U.get_placeholder(name="stochastic",
                                       dtype=tf.bool,
                                       shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = ac
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #17
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            last_out = obz
            num_policy = 1
            state_embedding = tf.tile(tf.expand_dims(obz, axis=1),
                                      [1, num_policy, 1])

            rnn_cell = rnn.BasicLSTMCell(num_units=pdtype.param_shape()[0])

            self.sub_policies, states = tf.nn.dynamic_rnn(
                cell=rnn_cell,
                inputs=state_embedding,
                dtype=tf.float32,
                scope='subpolicy')

            lstm_cell = rnn.BasicLSTMCell(num_units=num_policy)

            concatenated = tf.concat([self.sub_policies, state_embedding],
                                     axis=2)

            self.out, states = tf.nn.dynamic_rnn(cell=lstm_cell,
                                                 inputs=concatenated,
                                                 dtype=tf.float32,
                                                 scope='master')
            last_output = self.out[:, -1, :]

            self.chosen_index = tf.argmax(last_output, axis=1)
            # self.weights = tf.nn.softmax(logits=last_output, dim=
            self.weights = tf.one_hot(indices=self.chosen_index,
                                      depth=num_policy)

            last_out = tf.reduce_sum(tf.expand_dims(self.weights, axis=2) *
                                     self.sub_policies,
                                     axis=1)

            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #18
    def _init(self, ob_space, sensor_space, ac_space, hid_size, num_hid_layers,
              kind, elm_mode):
        assert isinstance(ob_space, gym.spaces.Box)
        assert isinstance(sensor_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        ob_sensor = U.get_placeholder(name="ob_sensor",
                                      dtype=tf.float32,
                                      shape=[sequence_length] +
                                      list(sensor_space.shape))

        x = ob / 255.0
        if kind == 'small':  # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = ODEBlock(32, (3, 3))(x)
            x = U.flattenallbut0(x)
            x = tf.nn.relu(
                tf.layers.dense(x,
                                256,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))

        ## Obfilter on sensor output
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=sensor_space.shape)

        obz_sensor = tf.clip_by_value(
            (ob_sensor - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        # x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))

        last_out = obz_sensor
        if not elm_mode:
            ## Adapted from mlp_policy
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="vffc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            y = tf.layers.dense(last_out,
                                64,
                                name="vffinal",
                                kernel_initializer=U.normc_initializer(1.0))
        else:
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out,
                                hid_size,
                                name="vffc1",
                                kernel_initializer=U.normc_initializer(1.0),
                                trainable=False))
            y = tf.layers.dense(last_out,
                                64,
                                name="vffinal",
                                kernel_initializer=U.normc_initializer(1.0))

        x = tf.concat([x, y], 1)
        logits = tf.layers.dense(x,
                                 pdtype.param_shape()[0],
                                 name="logits",
                                 kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        self.vpred = tf.layers.dense(
            x, 1, name="value", kernel_initializer=U.normc_initializer(1.0))[:,
                                                                             0]

        # self.session.run(logits.kernel)
        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
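        # Note: the stochastic flag is not wired into the graph here; actions are always sampled from the policy.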
        ac = self.pd.sample()  # XXX
        self._act = U.function([stochastic, ob, ob_sensor],
                               [ac, self.vpred, logits])
Example #19
    def __init__(self,
                 name,
                 actor,
                 critic,
                 memory,
                 obs_space_n,
                 act_space_n,
                 agent_index,
                 obs_rms,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.):
        self.name = name
        self.num_agents = len(obs_space_n)
        self.agent_index = agent_index

        from gym import spaces
        continuous_ctrl = not isinstance(act_space_n[0], spaces.Discrete)
        # TODO: remove after testing
        assert continuous_ctrl

        # Multi-agent inputs
        # self.obs0 = []
        # self.obs1 = []
        self.actions = []
        # self.norm_obs0_ph = []
        # self.norm_obs1_ph = []

        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(
                                       self.num_agents,
                                       None,
                                   ) + obs_space_n[self.agent_index].shape,
                                   name="obs0")
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(
                                       self.num_agents,
                                       None,
                                   ) + obs_space_n[self.agent_index].shape,
                                   name="obs1")

        # if continuous_ctrl:
        #     self.actions = tf.placeholder(tf.float32, shape=(self.num_agents, None,) + act_space_n[self.agent_index].shape, name="action")
        # else:
        #     act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        #     self.actions = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]

        # this is required to reshape obs and actions for concatenation
        obs_shape_list = [self.num_agents] + list(
            obs_space_n[self.agent_index].shape)
        act_shape_list = [self.num_agents] + list(
            act_space_n[self.agent_index].shape)
        self.obs_shape_prod = np.prod(obs_shape_list)
        self.act_shape_prod = np.prod(act_shape_list)

        for i in range(self.num_agents):
            # each obs in obs0,obs1 contains info about ego agent and relative pos/vel of other agents
            # self.obs0.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="obs0_"+str(i)))
            # self.obs1.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="obs1_"+str(i)))

            if continuous_ctrl:
                self.actions.append(
                    tf.placeholder(tf.float32,
                                   shape=[None] + list(act_space_n[i].shape),
                                   name="action" + str(i)))
            else:
                self.actions.append(
                    make_pdtype(act_space_n[i]).sample_placeholder(
                        [None], name="action" + str(i)))

            # self.norm_obs0_ph.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="norm_obs0_"+str(i)))
            # self.norm_obs1_ph.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="norm_obs1_"+str(i)))

        # self.norm_obs0_ph = tf.placeholder(tf.float32, shape=[self.num_agents, None] + list(obs_space_n[self.agent_index].shape), name="norm_obs0")
        # self.norm_obs1_ph = tf.placeholder(tf.float32, shape=[self.num_agents, None] + list(obs_space_n[self.agent_index].shape), name="norm_obs1")

        # we only provide single agent inputs for these placeholders
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')

        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        # TODO: need to update the replay buffer storage function to account for multiple agents
        if self.normalize_observations:
            self.obs_rms = obs_rms
        else:
            self.obs_rms = None

        # Need to transpose observations so we can normalize them
        # converts tensor to shape (batch_size, num_agents, space_size)
        # transpose dims 0 and 1, leave dim 2 unchanged
        obs0_t = tf.transpose(self.obs0, perm=[1, 0, 2])
        obs1_t = tf.transpose(self.obs1, perm=[1, 0, 2])
        actions_t = tf.transpose(self.actions, perm=[1, 0, 2])

        # each entry in obs_t is normalized wrt the agent
        normalized_obs0 = tf.clip_by_value(normalize(obs0_t, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(obs1_t, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # convert the obs to original shape after normalization for convenience
        normalized_act_obs0 = tf.transpose(normalized_obs0, perm=[1, 0, 2])
        normalized_act_obs1 = tf.transpose(normalized_obs1, perm=[1, 0, 2])

        # need to specify the exact shape, since we don't always pass a full batch of obs/actions
        normalized_obs0_flat = tf.reshape(normalized_obs0,
                                          [-1, self.obs_shape_prod])
        normalized_obs1_flat = tf.reshape(normalized_obs1,
                                          [-1, self.obs_shape_prod])
        actions_t_flat = tf.reshape(actions_t, [-1, self.act_shape_prod])

        # Return normalization.
        # TODO: update this to handle multiple agents if required
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        # Each agent gets its own observation
        self.actor_tf = actor(normalized_act_obs0[self.agent_index])
        self.target_actor_tf = target_actor(
            normalized_act_obs1[self.agent_index])

        # Critic gets all observations
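        # (centralized critic over the flattened joint observations and actions, while each actor only sees its own observation)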
        self.normalized_critic_tf = critic(normalized_obs0_flat,
                                           actions_t_flat)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)

        # need to provide critic() with all actions
        act_input_n = self.actions + []  # copy actions
        act_input_n[
            self.
            agent_index] = self.actor_tf  # update current agent action using its actor
        act_input_n_t = tf.transpose(act_input_n, perm=[1, 0, 2])
        act_input_n_t_flat = tf.reshape(act_input_n_t,
                                        [-1, self.act_shape_prod])
        self.normalized_critic_with_actor_tf = critic(normalized_obs0_flat,
                                                      act_input_n_t_flat,
                                                      reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)

        # we need to use actions for all agents
        target_act_input_n = self.actions + []  # copy actions
        target_act_input_n[
            self.
            agent_index] = self.target_actor_tf  # update current agent action using its target actor
        target_act_input_n_t = tf.transpose(target_act_input_n, perm=[1, 0, 2])
        target_act_input_n_t_flat = tf.reshape(target_act_input_n_t,
                                               [-1, self.act_shape_prod])
        Q_obs1 = denormalize(
            target_critic(normalized_obs1_flat, target_act_input_n_t_flat),
            self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            # param noise is added to actor; hence obs for current agent is required
            self.setup_param_noise(normalized_act_obs0[self.agent_index])
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet
Example #20
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, num_options=2,dc=0):
        assert isinstance(ob_space, gym.spaces.Box)

        # determine the dimensions of the state space and observation space
        self.ac_space_dim = ac_space.shape[0]
        self.ob_space_dim = ob_space.shape[0]
        self.dc = dc
        self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
        option =  U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        # create a filter for the pure shape, meaning excluding u[k-1]
        obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim),)

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope("obfilter_pure"):
            self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        obz_pure = tf.clip_by_value((ob[:,:-self.ac_space_dim] - self.ob_rms_only.mean) / self.ob_rms_only.std, -5.0, 5.0)

        # define the Q-function network here
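        # one value head per option; U.switch below selects Q(s, o) for the requested option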
        last_out0 = obz_pure # for option 0
        last_out1 = obz_pure # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(U.dense(last_out0, hid_size, "vffc0%i"%(i+1), weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(U.dense(last_out1, hid_size, "vffc1%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0, 1, "vfff0", weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1, 1, "vfff1", weight_init=U.normc_initializer(1.0))

        # return the Q function value Q(s,o)
        self.vpred = U.switch(option[0], last_out1, last_out0)[:,0]

        
        # define the policy over options here
        last_out0 = obz_pure # for option 0
        last_out1 = obz_pure # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(U.dense(last_out0, hid_size, "oppi0%i"%(i+1), weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(U.dense(last_out1, hid_size, "oppi1%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0, 1, "oppif0", weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1, 1, "oppif1", weight_init=U.normc_initializer(1.0))
        last_out = tf.concat([last_out0, last_out1], 1)
        # return the probabilities for executing the options
        self.op_pi = tf.nn.softmax(last_out)

        self.tpred = tf.nn.sigmoid(dense3D2(tf.stop_gradient(last_out), 1, "termhead", option, num_options=num_options, weight_init=U.normc_initializer(1.0)))[:,0]
        # we always terminate
        termination_sample = tf.constant([True])
        
        # implement the intra option policy
        last_out = obz_pure
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out, pdtype.param_shape()[0]//2, "polfinal", option, num_options=num_options, weight_init=U.normc_initializer(0.01),bias=False)
            mean = tf.nn.tanh(mean)
            logstd = tf.get_variable(name="logstd", shape=[num_options, 1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # no zero-order hold (ZOH) is applied here; both option policies remain fully functional
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        ac = tf.clip_by_value(ac,-1.0,1.0)

        self.last_action = tf.stop_gradient(ac)
        self._act = U.function([stochastic, ob, option], [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])        
        self._get_op = U.function([ob], [self.op_pi])
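
The two value heads above are merged with U.switch on the active option. A minimal sketch of the same selection written directly with tf.cond (the helper name switch_by_option is illustrative and not part of the listing; shapes assumed):

import tensorflow as tf

# Hedged stand-in for U.switch(option[0], last_out1, last_out0): pick the value
# head that corresponds to the currently active option (0 or 1).
def switch_by_option(option_idx, q_head1, q_head0):
    # option_idx: scalar int32 tensor; q_head*: [batch, 1] value tensors
    return tf.cond(tf.cast(option_idx, tf.bool),
                   lambda: q_head1,
                   lambda: q_head0)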
Example #21
class ActorLearner(object):
    def __init__(self, name, actor, memory, observation_shape, action_shape,
        gamma=0.95, tau=0.001, normalize_observations=True,
        batch_size=128, observation_range=(-5., 5.), return_range=(-np.inf, np.inf),
        actor_l2_reg=0., actor_lr=5e-5, clip_norm=None, ):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='expert_actor_obs0')
        self.action_target = tf.placeholder(tf.float32, shape=(None,) + action_shape, name=name+'action_target')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.return_range = return_range
        self.observation_range = observation_range
        self.clip_norm = clip_norm
        self.batch_size = batch_size
        self.stats_sample = None
        self.actor_l2_reg = actor_l2_reg
        self.actor = actor
        self.actor_lr = actor_lr

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope(name + 'obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
            self.observation_range[0], self.observation_range[1])

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)

        # Set up parts.
        self.setup_actor_optimizer()
        self.setup_stats()

        self.initial_state = None # recurrent architectures not supported yet

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = tf.reduce_mean(tf.square(self.actor_tf - self.action_target))
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = tf.train.AdamOptimizer(learning_rate=self.actor_lr)
        self.optimize_expr = self.actor_optimizer.minimize(self.actor_loss, var_list=self.actor.trainable_vars)

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        self.stats_ops = ops
        self.stats_names = names

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)
        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss]
        actor_grads, actor_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.action_target: batch['actions'],
        })
        # with self.graph.as_default():
        self.optimize_expr.run(session=self.sess,
            feed_dict={
                self.obs0: batch['obs0'],
                self.action_target: batch['actions'],
            }
            )

        return actor_loss

    def initialize(self, sess):
        self.sess = sess

    def save(self, path):
        save_variables(path)

    def load(self, path):
        load_variables(path)

    def store_transition(self, obs0, action):
        # B = obs0.shape[0]
        # for b in range(B):
        self.memory.append(obs0, action)
        if self.normalize_observations:
            self.obs_rms.update(obs0)
        print("Stored ", obs0.shape)

    def __call__(self, obs):
        # with self.graph.as_default():
        print("Expert Actor call")
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, obs)}
        # import IPython; IPython.embed()
        action = self.sess.run([self.actor_tf], feed_dict=feed_dict)
        print("Expert Actor return")
        return action
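
ActorLearner regresses the actor onto expert actions with a mean-squared-error loss. A self-contained sketch of that behavioural-cloning objective in plain TensorFlow 1.x (layer sizes and shapes are assumptions, not taken from the listing):

import tensorflow as tf

# Assumed shapes: 11-dim observations, 3-dim actions.
obs = tf.placeholder(tf.float32, [None, 11], name='obs')
expert_action = tf.placeholder(tf.float32, [None, 3], name='expert_action')
hidden = tf.layers.dense(obs, 64, activation=tf.nn.relu)
pred_action = tf.layers.dense(hidden, 3, activation=tf.nn.tanh)
# Same form as self.actor_loss above: MSE between actor output and expert target.
loss = tf.reduce_mean(tf.square(pred_action - expert_action))
train_op = tf.train.AdamOptimizer(learning_rate=5e-5).minimize(loss)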
Example #22
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        # assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            last_out = obz
            last_out = tf.one_hot(indices=tf.cast(last_out, dtype=tf.int32),
                                  depth=ob_space.n)
            print(last_out)
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            last_out = obz
            last_out = tf.one_hot(indices=tf.cast(last_out, dtype=tf.int32),
                                  depth=ob_space.n)
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name='fc%i' % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
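
This variant one-hot encodes a discrete observation before feeding the MLP. A minimal illustration of that encoding step (the depth of 16 stands in for ob_space.n and is an assumption):

import tensorflow as tf

# A discrete observation index becomes a one-hot vector of length ob_space.n,
# which then enters the tanh MLP exactly like a Box observation would.
ob = tf.placeholder(tf.int32, shape=[None], name='discrete_ob')
ob_onehot = tf.one_hot(indices=ob, depth=16)   # shape [batch, 16]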
Example #23
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              bound_by_sigmoid=False,
              sigmoid_coef=1.,
              activation='tanh',
              normalize_obs=True,
              actions='gaussian',
              avg_norm_symmetry=False,
              symmetric_interpretation=False,
              stdclip=5.0,
              gaussian_bias=False,
              gaussian_from_binary=False,
              parallel_value=False,
              pv_layers=2,
              pv_hid_size=512,
              three=False):
        assert isinstance(ob_space, gym.spaces.Box)

        if actions == 'binary':
            self.pdtype = pdtype = MultiCategoricalPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32))
        elif actions == 'beta':
            self.pdtype = pdtype = BetaPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32))
        elif actions == 'bernoulli':
            self.pdtype = pdtype = BernoulliPdType(ac_space.low.size)
        elif actions == 'gaussian':
            self.pdtype = pdtype = make_pdtype(ac_space)
        elif actions == 'cat_3':
            self.pdtype = pdtype = MultiCategoricalPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32) * 2)
        elif actions == 'cat_5':
            self.pdtype = pdtype = MultiCategoricalPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32) * 4)
        else:
            assert False

        sequence_length = None

        self.ob = U.get_placeholder(name="ob",
                                    dtype=tf.float32,
                                    shape=[sequence_length] +
                                    list(ob_space.shape))
        self.st = U.get_placeholder(name="st", dtype=tf.int32, shape=[None])

        if normalize_obs:
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=ob_space.shape)
            if avg_norm_symmetry:
                # Warning: only valid for the standard observation vector (41 numbers)
                ob_mean = (tf.gather(self.ob_rms.mean, ORIG_SYMMETRIC_IDS) +
                           self.ob_rms.mean) / 2
                ob_std = (tf.gather(self.ob_rms.std, ORIG_SYMMETRIC_IDS) +
                          self.ob_rms.std) / 2  # Pretty crude
            else:
                ob_mean = self.ob_rms.mean
                ob_std = self.ob_rms.std

            obz = tf.clip_by_value((self.ob - ob_mean) / ob_std, -stdclip,
                                   stdclip)

            #obz = tf.Print(obz, [self.ob_rms.mean], message='rms_mean', summarize=41)
            #obz = tf.Print(obz, [self.ob_rms.std], message='rms_std', summarize=41)
        else:
            obz = self.ob

        vpreds = []
        pparams = []

        for part in range(1 if not three else 3):
            part_prefix = "" if part == 0 else "part_" + str(part)

            # Predicted value
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    U.dense(last_out,
                            hid_size,
                            part_prefix + "vffc%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0)))

            vpreds.append(
                U.dense(last_out,
                        1,
                        part_prefix + "vffinal",
                        weight_init=U.normc_initializer(1.0)))
            vpreds[-1] = vpreds[-1][:, 0]

            if parallel_value:
                last_out_2 = obz
                for i in range(pv_layers):
                    last_out_2 = tf.nn.tanh(
                        U.dense(last_out_2,
                                pv_hid_size,
                                part_prefix + "pv_vffc%i" % (i + 1),
                                weight_init=U.normc_initializer(1.0)))
                last_out_2 = U.dense(last_out_2,
                                     1,
                                     part_prefix + "pv_vffinal",
                                     weight_init=U.normc_initializer(1.0))
                vpreds[-1] += last_out_2[:, 0]

            last_out = obz
            if activation == 'tanh': activation = tf.nn.tanh
            elif activation == 'relu': activation = tf.nn.relu
            for i in range(num_hid_layers):
                dense = U.dense(last_out,
                                hid_size,
                                part_prefix + "polfc%i" % (i + 1),
                                weight_init=U.normc_initializer(1.0))
                last_out = activation(dense)

            if actions == 'gaussian':
                if gaussian_fixed_var:
                    mean = U.dense(last_out,
                                   pdtype.param_shape()[0] // 2,
                                   part_prefix + "polfinal",
                                   U.normc_initializer(0.01))
                    if bound_by_sigmoid:
                        mean = tf.nn.sigmoid(mean * sigmoid_coef)
                    logstd = tf.get_variable(
                        name=part_prefix + "logstd",
                        shape=[1, pdtype.param_shape()[0] // 2],
                        initializer=tf.zeros_initializer())
                    logstd = mean * 0.0 + logstd
                else:
                    mean = U.dense(last_out,
                                   pdtype.param_shape()[0] // 2,
                                   part_prefix + "polfinal",
                                   U.normc_initializer(0.01))
                    logstd = U.dense(last_out,
                                     pdtype.param_shape()[0] // 2,
                                     part_prefix + "polfinal_2",
                                     U.normc_initializer(0.01))
                if gaussian_bias:
                    mean = mean + 0.5

                pdparam = U.concatenate([mean, logstd], axis=1)
            elif actions == 'beta':
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "beta_lastlayer",
                                  U.normc_initializer(0.01))
                pdparam = tf.nn.softplus(pdparam)
            elif actions in ['bernoulli', 'binary']:
                if bound_by_sigmoid:
                    raise NotImplementedError(
                        "bound by sigmoid not implemented here")
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "polfinal",
                                  U.normc_initializer(0.01))
            elif actions in ['cat_3']:
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "cat3_lastlayer",
                                  U.normc_initializer(0.01))
                # prob = tf.reshape(pdparam, [18, -1])
                # prob = tf.nn.softmax(prob)
                # elogit = tf.exp(pdparam)
                # pdparam = tf.Print(pdparam, [prob], summarize=18)
            elif actions in ['cat_5']:
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "cat5_lastlayer",
                                  U.normc_initializer(0.01))
                # prob = tf.reshape(pdparam, [18, -1])
                # prob = tf.nn.softmax(prob)
                # elogit = tf.exp(pdparam)
                # pdparam = tf.Print(pdparam, [prob], summarize=18)
            else:
                assert False

            pparams.append(pdparam)

        pparams = tf.stack(pparams)
        vpreds = tf.stack(vpreds)
        pparams = tf.transpose(pparams,
                               perm=(1, 0, 2))  # [batchsize, networks, values]
        vpreds = tf.transpose(vpreds,
                              perm=(1, 0))  # [batchsize, networks]

        self.stochastic = tf.placeholder(name="stochastic",
                                         dtype=tf.bool,
                                         shape=())

        if three:
            batchsize = tf.shape(pdparam)[0]
            NO_OBSTACLES_ID = 5
            OBST_DIST = [278, 279, 280, 281, 282, 283, 284,
                         285]  # TODO: Alternative approach
            distances = [self.ob[:, i] for i in OBST_DIST]
            distances = tf.stack(distances, axis=1)
            no_obstacles = tf.cast(tf.equal(self.ob[:, NO_OBSTACLES_ID], 1.0),
                                   tf.int32)
            distances = tf.cast(tf.reduce_all(tf.equal(distances, 3), axis=1),
                                tf.int32)
            no_obstacles_ahead = distances * no_obstacles  # 0 if obstacles, 1 if no obstacles
            begin = tf.cast(tf.less(self.st, 75), tf.int32)
            take_id = (1 - begin) * (
                1 + no_obstacles_ahead
            )  # begin==1 => 0, begin==0 => 1 + no_obstacles_ahead

            take_id = tf.stack((tf.range(batchsize), take_id), axis=1)
            pdparam = tf.gather_nd(pparams, take_id)

            self.vpred = tf.gather_nd(vpreds, take_id)
            #self.vpred = tf.Print(self.vpred, [take_id])
        else:
            self.vpred = vpreds[:, 0]
            pdparam = pparams[:, 0]

        self.pd = pdtype.pdfromflat(pdparam)

        if hasattr(self.pd, 'real_mean'):
            real_mean = self.pd.real_mean()
            ac = U.switch(self.stochastic, self.pd.sample(), real_mean)
        else:
            ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

        self._act = U.function([self.stochastic, self.ob, self.st],
                               [ac, self.vpred, ob_mean, ob_std])

        if actions == 'binary':
            self._binary_f = U.function([self.stochastic, self.ob, self.st],
                                        [ac, self.pd.flat, self.vpred])
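
In the `three` branch, the per-sample head index is paired with its row index so tf.gather_nd can pick one policy head per sample. A standalone sketch of that indexing pattern (tensor shapes are illustrative):

import tensorflow as tf

# pparams: [batch, heads, param_dim]; head_id: [batch] int32 choosing one head per sample.
pparams = tf.placeholder(tf.float32, shape=[None, 3, 4])
head_id = tf.placeholder(tf.int32, shape=[None])
rows = tf.range(tf.shape(pparams)[0])
take_id = tf.stack((rows, head_id), axis=1)   # [[0, h0], [1, h1], ...]
selected = tf.gather_nd(pparams, take_id)     # [batch, param_dim]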
Example #24
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, num_options=2, dc=0, kind='small'):
        assert isinstance(ob_space, gym.spaces.Box)

        self.dc = dc
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        x = ob / 255.0
        if kind == 'small': # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
        elif kind == 'large': # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))
        else:
            raise NotImplementedError


        # Network to compute value function and termination probabilities
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = x
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:,0]

        self.vpred_ent = dense3D2(last_out, 1, "vffinal_ent", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:,0]

        self.tpred = tf.nn.sigmoid(dense3D2(tf.stop_gradient(last_out), 1, "termhead", option, num_options=num_options, weight_init=U.normc_initializer(1.0)))[:,0]
        termination_sample = tf.greater(self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),maxval=1.))


        # Network to compute policy over options and intra_option policies
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        # if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Discrete):
        #     mean = dense3D2(last_out, pdtype.param_shape()[0]//2, "polfinal", option, num_options=num_options, weight_init=U.normc_initializer(0.01))
        #     logstd = tf.get_variable(name="logstd", shape=[num_options, 1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
        #     pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
        # else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        self.pd = pdtype.pdfromflat(pdparam)


        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob, option], [ac, self.vpred, self.vpred_ent, last_out])
        self._get_logits = U.function([stochastic, ob, option], [self.pd.logits] )


        self._get_v = U.function([ob, option], [self.vpred])
        self._get_v_ent = U.function([ob, option], [self.vpred_ent])  # Entropy value estimate
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self.get_vpred_ent = U.function([ob, option], [self.vpred_ent]) # Entropy value estimate
        self._get_op = U.function([ob], [self.op_pi])
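
termination_sample above draws a Bernoulli termination event per sample by comparing the predicted termination probability against uniform noise. The same trick in isolation (placeholder shape assumed):

import tensorflow as tf

# terminate is True with probability tpred for each sample, False otherwise.
tpred = tf.placeholder(tf.float32, shape=[None], name='termination_prob')
terminate = tf.greater(tpred, tf.random_uniform(shape=tf.shape(tpred), maxval=1.))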
Example #25
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "vffc%i" % (i + 1),
                      weight_init=U.normc_initializer(1.0)))
        self.vpred = dense(last_out,
                           1,
                           "vffinal",
                           weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "polfc%i" % (i + 1),
                      weight_init=U.normc_initializer(1.0)))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense(last_out,
                         pdtype.param_shape()[0] // 2, "polfinal",
                         U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = dense(last_out,
                            pdtype.param_shape()[0], "polfinal",
                            U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # change for BC
        stochastic = U.get_placeholder(name="stochastic",
                                       dtype=tf.bool,
                                       shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = ac
        self._act = U.function([stochastic, ob], [ac, self.vpred])
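
With gaussian_fixed_var=True the policy emits a state-dependent mean plus a state-independent logstd, and pd.sample() draws from the resulting diagonal Gaussian. A small NumPy sketch of that sampling step (the action dimension of 6 is assumed):

import numpy as np

mean = np.zeros(6)                   # state-dependent in the real policy
logstd = np.full(6, -0.5)            # learned, shared across states
action = mean + np.exp(logstd) * np.random.randn(6)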
Example #26
class DDPG(object):
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
        gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
        batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
        adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1,
        critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., expert=None, save_networks=False,
        supervise=False, actor_only=False, critic_only=False, both_ours_sup=False, gail=False, pofd=False):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.expert = expert
        self.save_networks = save_networks
        self.supervise = supervise
        self.actor_only = actor_only
        self.critic_only = critic_only
        self.both_ours_sup = both_ours_sup
        self.gail = gail
        self.pofd = pofd

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        print(target_actor.vars)

        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
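        # Bellman target: y = r + (1 - terminal) * gamma * Q'(s', mu'(s'));
        # terminals1 masks out the bootstrap term at episode ends.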
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        if self.expert is not None:
            self.expert.set_tf(actor=actor, critic=critic, obs0=normalized_obs0, actions=self.actions, obs_rms=self.obs_rms, ret_rms=self.ret_rms,
                               observation_range=self.observation_range, return_range=self.return_range, 
                               supervise=self.supervise, actor_only=self.actor_only, critic_only=self.critic_only, 
                               both_ours_sup=self.both_ours_sup, gail=self.gail, pofd=self.pofd)

        training_step = tf.get_variable('training_step', shape=[1], initializer=tf.ones_initializer)
        self.training_step_run = training_step.assign(training_step + 1)
        self.training_step = 0

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

    def setup_target_network_updates(self):
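        # Polyak averaging: the init ops copy the online weights into the targets, while the
        # soft ops blend them as theta_target <- (1 - tau) * theta_target + tau * theta.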
        actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))
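        # The RMS distance between unperturbed and adaptively perturbed actions is measured
        # in adapt_param_noise() and fed to param_noise.adapt() to rescale the noise stddev.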

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)

        if self.expert is not None:
            self.expert_actor_loss = self.expert.actor_loss + self.actor_loss
            if self.gail:
                self.expert_actor_loss = self.expert.actor_loss
            if self.pofd:
                self.expert_actor_loss = self.expert.actor_loss + self.actor_loss

        # self.expert_actor_loss = self.actor_loss

        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        if self.expert is not None:
            self.expert_actor_grads = U.flatgrad(self.expert_actor_loss, self.actor.trainable_vars,
                                                 clip_norm=self.clip_norm)
        else:
            self.expert_actor_grads = None
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg

        if self.expert is not None:
            self.expert_critic_loss = self.expert.critic_loss + self.critic_loss
            if self.gail:
                self.expert_critic_loss = self.expert.discriminator_loss
            if self.pofd:
                self.expert_critic_loss = self.expert.discriminator_loss + self.critic_loss
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        if self.expert is not None:
            self.expert_critic_grads = U.flatgrad(self.expert_critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        else:
            self.expert_critic_grads = None
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean
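        # Pop-Art rescaling: with W' = W * sigma_old / sigma_new and
        # b' = (sigma_old * b + mu_old - mu_new) / sigma_new, the denormalized output
        # sigma_new * (W'x + b') + mu_new equals the previous sigma_old * (Wx + b) + mu_old.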
        
        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        ops = []
        names = []
        
        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']
        
        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']
        
        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']
        
        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self, pretrain):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std : np.array([old_std]),
                self.old_mean : np.array([old_mean]),
            })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        if self.expert is not None and pretrain:
            # self.training_step < self.expert_steps:
            expert_batch = self.expert.sample(batch_size=self.batch_size)
            ops = [self.training_step_run, self.expert_actor_grads, self.expert_actor_loss,
                   self.expert_critic_grads, self.expert_critic_loss]
            training_step, actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
                self.expert.expert_state: expert_batch['obs0'],
                self.expert.expert_action: expert_batch['actions']
            })
        else:
            ops = [self.training_step_run, self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
            training_step, actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)
        self.training_step = training_step[0]

        if self.save_networks:
            if self.training_step.astype(int) % self.save_steps == 0:
                logger.info('Saved network with {} training steps'.format(self.training_step.astype(int)))
                self.saver.save(self.sess, self.ckp_dir, global_step=self.training_step.astype(int))

        return critic_loss, actor_loss

    def initialize(self, sess, saver, ckp_dir, save_steps, expert_steps):
        self.sess = sess
        self.saver = saver
        self.ckp_dir = ckp_dir
        self.save_steps = save_steps
        self.expert_steps = expert_steps

        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

        checkpoint = tf.train.get_checkpoint_state(self.ckp_dir)
        if self.saver and checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            logger.info('Successfully loaded {}'.format(checkpoint.model_checkpoint_path))
        else:
            logger.info('Could not find old network weights')

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))
        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.
        
        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        mean_distance = mpi_mean(distance)
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })
Example #27
class DDPG(object):
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
                 batch_size=128, observation_range=(-1., 1.), action_range= [0.2, 0.2, 0.2, 0.2, 0.2, 0.2], return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True, critic_l2_reg=0.,
                 adaptive_param_noise_policy_threshold=.1, actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., restore=False):

        # Inputs.
        # self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs0 = tf.placeholder(tf.float32, shape=(None, observation_shape), name='obs0')
        # self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.obs1 = tf.placeholder(tf.float32, shape=(None, observation_shape), name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None, action_shape), name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        """Filewriter summary"""
        monitor_directory = os.path.join("Experiment_data")
        self.summary_dir = os.path.join(monitor_directory, "summary")
        # if restore:
        #     dirname = 'run20' # The last name
        #     self.summary_dir = os.path.join(self.summary_dir, dirname)
        # else:
        self.summary_dir = utils.new_summary_dir(self.summary_dir)

        # record the detailed parameters
        utils.log_params(self.summary_dir, {
            "actor learning rate": self.actor_lr,
            "critic learning rate": self.critic_lr,
            "batch size": self.batch_size,
            "actor update rate": self.tau,
            "critic update rate": self.tau,
            "action noise": self.action_noise,
            "param noise": self.param_noise,
            "reward function": 'General reward function',
            "result_function": 'The second 100'
        })

        self.merged = tf.summary.merge_all()

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None  # make sure this assumption holds

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        mean_distance = mpi_mean(distance)
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean
        
        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        ops = []
        names = []
        
        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']
        
        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']
        
        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']
        
        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.multiply(action, self.action_range)
        # action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def save_data(self):
        self.memory.save_data()

    def train(self, dec_actor_lr, dec_critic_lr):
        # Update the learning rates with the externally decayed values.
        self.actor_lr = dec_actor_lr
        self.critic_lr = dec_critic_lr
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std : np.array([old_std]),
                self.old_mean : np.array([old_mean]),
            })
            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)
        # Write the graph so it can be viewed in TensorBoard.
        self.summary_writer = tf.summary.FileWriter(self.summary_dir, self.sess.graph)

    def restore_model(self, model_directory, saver, sess):
        ckpt = tf.train.get_checkpoint_state(model_directory)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            self.sess = sess
            logger.info('Loaded the saved model from the checkpoint directory.')
            self.summary_writer = tf.summary.FileWriter(self.summary_dir)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def feedback_adptive_explore(self):
        self.param_noise.adapt_variance()

    def ou_adaptive_explore(self):
        self.action_noise.adapt_decrease()

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })

    def log_scalar(self, name, value, index):
        summary_value = summary_pb2.Summary.Value(tag=name, simple_value=value)
        summary_2 = summary_pb2.Summary(value=[summary_value])
        self.summary_writer.add_summary(summary_2, global_step=index)
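For orientation, here is a minimal sketch of how an agent exposing the interface above might be driven; the Gym-style env, the step budget, and the linear learning-rate decay are illustrative assumptions (as is the requirement that auxiliary attributes such as summary_dir have already been configured), not part of the original example.

# Hypothetical training loop for the DDPG variant above (illustrative only).
import tensorflow as tf

def run_training(agent, env, nb_steps=100000, actor_lr=1e-4, critic_lr=1e-3):
    with tf.Session() as sess:
        agent.initialize(sess)
        agent.reset()
        obs = env.reset()
        for t in range(nb_steps):
            action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
            new_obs, reward, done, _ = env.step(action)
            agent.store_transition(obs, action, reward, new_obs, done)
            obs = new_obs
            # train() expects already-decayed learning rates; decay them linearly here.
            frac = 1.0 - t / float(nb_steps)
            critic_loss, actor_loss = agent.train(actor_lr * frac, critic_lr * frac)
            agent.update_target_net()
            if done:
                agent.reset()
                obs = env.reset()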
Example #28
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
        gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
        batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
        adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1,
        critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., expert=None, save_networks=False,
        supervise=False, actor_only=False, critic_only=False, both_ours_sup=False, gail=False, pofd=False):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.expert = expert
        self.save_networks = save_networks
        self.supervise = supervise
        self.actor_only = actor_only
        self.critic_only = critic_only
        self.both_ours_sup = both_ours_sup
        self.gail = gail
        self.pofd = pofd

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        print(target_actor.vars)

        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        if self.expert is not None:
            self.expert.set_tf(actor=actor, critic=critic, obs0=normalized_obs0, actions=self.actions, obs_rms=self.obs_rms, ret_rms=self.ret_rms,
                               observation_range=self.observation_range, return_range=self.return_range, 
                               supervise=self.supervise, actor_only=self.actor_only, critic_only=self.critic_only, 
                               both_ours_sup=self.both_ours_sup, gail=self.gail, pofd=self.pofd)

        training_step = tf.get_variable('training_step', shape=[1], initializer=tf.ones_initializer)
        self.training_step_run = training_step.assign(training_step + 1)
        self.training_step = 0

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()
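get_target_updates is not shown in this excerpt; the following is a minimal sketch of the hard-copy initialization and Polyak (soft) update it is expected to build for setup_target_network_updates, with tau as passed to the constructor.

# Illustrative sketch of the target-update ops (the real get_target_updates lives elsewhere).
import tensorflow as tf

def get_target_updates_sketch(vars, target_vars, tau):
    assert len(vars) == len(target_vars)
    init_updates, soft_updates = [], []
    for var, target_var in zip(vars, target_vars):
        # Hard copy at initialization; Polyak averaging afterwards:
        # target <- (1 - tau) * target + tau * online.
        init_updates.append(tf.assign(target_var, var))
        soft_updates.append(tf.assign(target_var, (1. - tau) * target_var + tau * var))
    return tf.group(*init_updates), tf.group(*soft_updates)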
Example #29
def _mlpPolicy(hiddens,
               ob,
               ob_space,
               ac_space,
               scope,
               gaussian_fixed_var=True,
               reuse=False):
    assert isinstance(ob_space, gym.spaces.Box)

    with tf.variable_scope(scope, reuse=reuse):
        pdtype = make_pdtype(ac_space)
        sequence_length = None

        with tf.variable_scope("obfilter"):
            ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - ob_rms.mean) / ob_rms.std, -5.0, 5.0)
            last_out = obz
            for i, hidden in enumerate(hiddens):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hidden,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            last_out = obz
            for i, hidden in enumerate(hiddens):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hidden,
                        name='fc%i' % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        pd = pdtype.pdfromflat(pdparam)

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, pd.sample(), pd.mode())
        _act = U.function([stochastic, ob], [ac, vpred])

        return pd.logits, _act
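The returned _act is a U.function mapping (stochastic, ob) to (action, value). A hedged usage sketch follows, assuming gym and the baselines-style make_pdtype / U helpers imported by this module; the environment name and hidden sizes are illustrative.

# Hypothetical usage of _mlpPolicy (illustrative only).
import gym
import tensorflow as tf

env = gym.make('CartPole-v0')  # Box observations, Discrete actions
ob_ph = tf.placeholder(tf.float32, shape=(None,) + env.observation_space.shape, name='ob')
_, act_fn = _mlpPolicy(hiddens=[64, 64], ob=ob_ph, ob_space=env.observation_space,
                       ac_space=env.action_space, scope='pi')

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    ob = env.reset()
    # stochastic=True samples an action; stochastic=False takes the distribution's mode.
    action, value = act_fn(True, ob[None])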
Example #30
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out,
                             1,
                             "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            alpha = tf.nn.softplus(
                U.dense(last_out,
                        ac_space.high.size,
                        'polfc_alpha',
                        weight_init=U.normc_initializer(0.001))) + 1.0
            beta = tf.nn.softplus(
                U.dense(last_out,
                        ac_space.high.size,
                        'polfc_beta',
                        weight_init=U.normc_initializer(0.001))) + 1.0
        else:
            raise NotImplementedError

        self.pd = tfp.distributions.Beta(alpha, beta)

        self.state_in = []
        self.state_out = []

        # compute sampled action
        sampled_action = self.pd.sample()

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, sampled_action, self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
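A side note on the Beta head above: because softplus(x) + 1 keeps both concentration parameters above 1, the density is unimodal on (0, 1), so pd.mode() is always well defined. A small numpy illustration with arbitrary pre-activation values (not part of the original code):

# Numeric illustration of the alpha/beta parameterization (values are arbitrary).
import numpy as np

def softplus(x):
    return np.log1p(np.exp(x))

raw_alpha, raw_beta = 0.3, -0.7               # pretend outputs of polfc_alpha / polfc_beta
alpha = softplus(raw_alpha) + 1.0             # > 1 by construction
beta = softplus(raw_beta) + 1.0               # > 1 by construction
mode = (alpha - 1.0) / (alpha + beta - 2.0)   # deterministic action when stochastic=False
sample = np.random.beta(alpha, beta)          # stochastic action, always in (0, 1)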
Example #31
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.save = None
        self.load = None

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                      self.actor_tf,
                                                      reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(
            target_critic(normalized_obs1, target_actor(normalized_obs1)),
            self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet
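The target_Q op defined above is the one-step Bellman backup r + gamma * (1 - terminal) * Q'(s', mu'(s')); a quick numeric check with made-up numbers:

# Numeric illustration of the Bellman target (made-up values).
import numpy as np

gamma = 0.99
rewards = np.array([[1.0], [0.5]])
terminals1 = np.array([[0.0], [1.0]])   # the second transition ends the episode
Q_obs1 = np.array([[10.0], [10.0]])     # target critic's estimate of the next state
target_Q = rewards + (1. - terminals1) * gamma * Q_obs1
# -> [[10.9], [0.5]]: the bootstrap term is masked out at terminal states.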
Example #32
class DDPG(object):
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
        gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
        batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
        adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1,
        critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure a separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean
        
        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        ops = []
        names = []
        
        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']
        
        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']
        
        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']
        
        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std : np.array([old_std]),
                self.old_mean : np.array([old_mean]),
            })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.
        
        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

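        # Average the perturbation distance across MPI workers so the
        # parameter-noise stddev is adapted consistently on every worker.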
        mean_distance = mpi_mean(distance)
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })