Example #1
class MLPModel(Model):
    def __init__(self,
                 ob_space,
                 ac_space,
                 ob_filter=True,
                 gaussian_fixed_var=True):
        self.ob_filter = ob_filter
        self.gaussian_fixed_var = gaussian_fixed_var
        super(MLPModel, self).__init__(ob_space, ac_space)

    def _create_network(self):
        x = self.ob

        # create ob filter
        if self.ob_filter:
            self.ob_rms = RunningMeanStd(shape=self.ob_space.shape)
            x = tf.clip_by_value(
                (self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # actor
        l = x
        l = tf.nn.tanh(
            U.dense(l, 32, "a_1", weight_init=U.normc_initializer(1.0)))
        l = tf.nn.tanh(
            U.dense(l, 32, "a_2", weight_init=U.normc_initializer(1.0)))
        action_layer = l

        # critic
        l = x
        l = tf.nn.tanh(
            U.dense(l, 32, "c_1", weight_init=U.normc_initializer(1.0)))
        l = tf.nn.tanh(
            U.dense(l, 32, "c_2", weight_init=U.normc_initializer(1.0)))
        value_layer = l

        self._create_logit_value(action_layer, value_layer,
                                 self.gaussian_fixed_var)

    def update_ob_norm(self, ob):
        if not hasattr(self, 'ob_rms'): return
        self.ob_rms.update(ob)
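# The RunningMeanStd filter used by MLPModel above is not shown in this example. As a
# point of reference, a minimal NumPy sketch of that kind of observation filter
# (assuming the standard parallel mean/variance update; the names below are
# illustrative, not the actual class used above) might look like this:
import numpy as np

class SimpleRunningMeanStd(object):
    """Running mean/std over batches of observations."""
    def __init__(self, shape, epsilon=1e-4):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m2 = self.var * self.count + batch_var * batch_count + \
             np.square(delta) * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total

    @property
    def std(self):
        return np.sqrt(self.var)

def filter_ob(ob, rms, clip=5.0):
    # same transform as in _create_network: standardize (with a small epsilon) and clip to [-clip, clip]
    return np.clip((ob - rms.mean) / (rms.std + 1e-8), -clip, clip)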
Example #2
class CnnPolicy(object):
    recurrent = False
    def __init__(self, name, ob_space, ac_space, hid_size, num_hid_layers, kind='large'):
        with tf.variable_scope(name):
            self._init(ob_space, ac_space, hid_size, num_hid_layers, kind)
            self.scope = tf.get_variable_scope().name
            self.recurrent = False

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, kind):
        assert isinstance(ob_space, tuple)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None
        
        ob_p = U.get_placeholder(name="ob_physics", dtype=tf.float32, shape=[sequence_length] + list(ob_space[0].shape))
        ob_f = U.get_placeholder(name="ob_frames", dtype=tf.float32, shape=[sequence_length] + list(ob_space[1].shape))

        self.ob = [ob_p, ob_f]

        # process ob_p
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space[0].shape)
        obpz = tf.clip_by_value((ob_p - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # process ob_f
        x = ob_f / 255.0
        x = self.img_encoder(x, kind)

        ob_last = tf.concat((obpz, x), axis=-1)

        with tf.variable_scope("vf"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.relu(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            self.vpred_ext = tf.layers.dense(last_out, 1, name='vf_ext', kernel_initializer=U.normc_initializer(1.0))[:,0]
            self.vpred_int = tf.layers.dense(last_out, 1, name='vf_int', kernel_initializer=U.normc_initializer(1.0))[:,0]


        with tf.variable_scope("pol"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            logits = tf.layers.dense(last_out, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)

        self.state_in = []
        self.state_out = []
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob_p, ob_f], [ac, self.vpred_ext, self.vpred_int])

    def img_encoder(self, x, kind):
        if kind == 'small': # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large': # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError
        return x

    
    def act(self, stochastic, ob):
        ob1, ob2 = ob
        ob2 = np.array(ob2)
        ac1, vpred_ext, vpred_int = self._act(stochastic, ob1, ob2)
        norm_ac1 = np.tanh(ac1)
        return norm_ac1[0], ac1[0], vpred_ext[0], vpred_int[0]
    
    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
    def get_initial_state(self):
        return []

    def update_obs_rms(self, ob):
        # take the physics part (first element) of each observation; wrap zip in list() for Python 3
        obp = np.array(list(zip(*ob.tolist()))[0])
        self.ob_rms.update(obp)
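# The 'large' encoder above is the Nature-DQN convolution stack. As a quick sanity
# check of the tensor sizes it produces, here is a small sketch (assuming 84x84 input
# frames, which is a common choice; the actual size depends on ob_space[1] above):
def conv_out_size(size, kernel, stride):
    # VALID padding: floor((size - kernel) / stride) + 1
    return (size - kernel) // stride + 1

side = 84
for kernel, stride in [(8, 4), (4, 2), (3, 1)]:
    side = conv_out_size(side, kernel, stride)
    print(side)  # 20, then 9, then 7
# The final 7x7x64 feature map is flattened into 7 * 7 * 64 = 3136 units
# before the 512-unit dense layer.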
Example #3
class EnsembleDDPG(object):
    def __init__(self, actor, critics, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
                 gamma=0.99, tau=0.005, normalize_returns=False, enable_popart=False, normalize_observations=False,
                 batch_size=100, observation_range=(-np.inf, np.inf), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, action_noise_scale=0.2,
                 action_noise_clip=0.5,
                 critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., use_mpi_adam=False):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range

        # settings
        self.use_mpi_adam = use_mpi_adam

        # set the list of critic
        self.critics = critics

        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # remember the noise scale and clip
        self.action_noise_scale = action_noise_scale
        self.action_noise_clip = action_noise_clip

        # Observation normalization.

        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor

        # set up different target critics, primary and supplementary

        target_critics = []
        for critic in critics:
            target_critic = copy(critic)
            target_critic.name = 'target_' + critic.name
            target_critics.append(target_critic)

        self.target_critics = target_critics

        # Create networks and core TF parts that are shared across setup parts.
        
        # actor_tf pi(s) is built from the actor and normalized_obs0 
        self.actor_tf = actor(normalized_obs0)

        # normalized_critic_tf normalized Q(s,a) is built from the observation and action 
        self.normalized_critic_tfs = [critic(normalized_obs0, self.actions) for critic in critics]
        self.normalized_critic_tf_main = self.normalized_critic_tfs[0]

        # critic_tf Q(s,a) is built from de-normalization and clipping from normalized Q(s,a)
        self.critic_tfs = [ denormalize(tf.clip_by_value(normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
              for normalized_critic_tf in self.normalized_critic_tfs]

        self.critic_tf_mean = 0
        for critic_tf in self.critic_tfs:
            self.critic_tf_mean += critic_tf

        self.critic_tf_mean *= 1.0 / len(self.critic_tfs)

        # normalized_critic_with_actor_tf normalized Q(s,pi(s)) is built from the observation,
        # and action provided by actor
        self.normalized_critic_with_actor_tfs = [
            critic(normalized_obs0, self.actor_tf, reuse=True) for critic in critics
        ]

        # critic_with_actor_tf is built from de-normalization and clipping from normalized Q(s,pi(s))
        self.critic_with_actor_tfs = [denormalize(
            tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]),
            self.ret_rms)
            for normalized_critic_with_actor_tf in self.normalized_critic_with_actor_tfs
        ]
        self.critic_with_actor_tf_main = self.critic_with_actor_tfs[0]
        self.critic_with_actor_tf_mean = 0
        for critic_with_actor_tf in self.critic_with_actor_tfs:
            self.critic_with_actor_tf_mean += critic_with_actor_tf
        self.critic_with_actor_tf_mean *= 1.0 / len(self.critics)

        # Q_obs1 Q(s',pi'(s)) is built from next state s'(observation), target actor pi',
        # and de-normalization
        target_action = target_actor(normalized_obs1)
        self.target_Q_vals = []
        self.target_Q_val_mean = 0

        # add noise in target critic functions
        for target_critic in target_critics:
            target_action_noise = tf.clip_by_value(tf.random_normal(
                tf.shape(target_action), mean=0.0, stddev=action_noise_scale, dtype=tf.float32),
                clip_value_min=-action_noise_clip, clip_value_max=action_noise_clip)
            noisy_target_action = tf.clip_by_value(target_action + target_action_noise,
                                                   clip_value_min=action_range[0], clip_value_max=action_range[1])
            target_Q_obs = denormalize(target_critic(normalized_obs1, noisy_target_action), self.ret_rms)
            target_Q_val = self.rewards + (1. - self.terminals1) * gamma * target_Q_obs

            self.target_Q_vals.append(target_Q_val)
            self.target_Q_val_mean += target_Q_val

        self.target_Q_val_mean *= 1.0 / (len(critics))

        # merge trainable variables into one set
        self.target_critic_vars = []
        self.critic_vars = []
        self.critic_trainable_vars = []
        for critic in critics:
            self.critic_vars += critic.vars
            self.critic_trainable_vars += critic.trainable_vars
        for target_critic in target_critics:
            self.target_critic_vars += target_critic.vars

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        
        # setup optimizer 
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()

        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

    def setup_target_network_updates(self):
        if self.use_mpi_adam:
            actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars,
                                                                        self.tau)
            critic_init_updates, critic_soft_updates = get_target_updates(self.critic_vars, self.target_critic_vars,
                                                                          self.tau)
            self.target_init_updates = [actor_init_updates, critic_init_updates]
            self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

            self.target_soft_update_actor = actor_soft_updates
            self.target_soft_update_critic = critic_soft_updates
        else:
            actor_init_updates, actor_soft_updates = get_target_updates(self.actor.trainable_vars,
                                                                        self.target_actor.vars, self.tau)
            critic_init_updates, critic_soft_updates = get_target_updates(self.critic_trainable_vars,
                                                                          self.target_critic_vars, self.tau)
            self.target_init_updates = [actor_init_updates, critic_init_updates]
            self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

            self.target_soft_update_actor = actor_soft_updates
            self.target_soft_update_critic = critic_soft_updates

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adoption.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')

        # Here use the Q(s,pi(s)) as the loss function
        #   use primary critic function to generate policy updates
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf_mean)

        self.actor_loss_array = []
        for critic_with_actor_tf in self.critic_with_actor_tfs:
            self.actor_loss_array.append(-tf.reduce_mean(critic_with_actor_tf))

        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]

        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))

        self.actor_grads = []
        for actor_loss in self.actor_loss_array:
            self.actor_grads.append(tf.reshape(
                U.flatgrad(actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm),
                shape=[-1,1]))

        self.actor_grad_array = tf.concat(self.actor_grads,axis=1)
        self.actor_grad_array = tf.reshape(self.actor_grad_array, shape=[-1, len(self.critics)])
        self.actor_grad_mean = tf.reduce_mean(self.actor_grad_array, axis=1)
        self.actor_grad_var = reduce_var(self.actor_grad_array, axis=1)

        # sum up the gradients
        self.actor_grad_var_std = tf.sqrt(tf.reduce_sum(self.actor_grad_var))

        # log the shapes of the per-critic gradient statistics
        logger.info('  actor grad array shape: {}'.format(self.actor_grad_array.shape))
        logger.info('  actor grad mean shape: {}'.format(self.actor_grad_mean.shape))
        logger.info('  actor grad variance shape: {}'.format(self.actor_grad_var.shape))
        logger.info('  actor grad var-std shape: {}'.format(self.actor_grad_var_std.shape))

        # add support to single-threaded adam
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        if self.use_mpi_adam:
            self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                beta1=0.9, beta2=0.999, epsilon=1e-08)
        else:
            self.actor_grads = list(
                zip(tf.gradients(self.actor_loss, self.actor.trainable_vars), self.actor.trainable_vars))

            self.actor_optimizer = tf.train.AdamOptimizer(learning_rate=self.actor_lr,beta1=0.9, beta2=0.999, epsilon=1e-08)
            self.actor_train = self.actor_optimizer.apply_gradients(self.actor_grads)


    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')

        # normalize critic target, normalized y ( not sure we need to use different target values here. )
        # TODO: abandon static evaluation of critic target values, use dynamic computing method

        # Use square error between normalized_critic_tf normalized Q(s,a) and normalized critic_target y
        # ( not use denormalized version ) as loss function, for two different critic, we need to train them both
        self.critic_loss = 0

        # merge the critic loss for all the Q value functions
        for normalized_critic_tf, critic_target_tf in zip(self.normalized_critic_tfs,self.target_Q_vals):
            normalized_critic_target_tf = tf.clip_by_value(normalize(tf.stop_gradient(critic_target_tf), self.ret_rms),
                                                           self.return_range[0], self.return_range[1])
            self.critic_loss += tf.reduce_mean(tf.square(normalized_critic_tf - normalized_critic_target_tf))

        # apply l2_regularization on some trainable variables and add them into loss function
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [var for var in self.critic_trainable_vars if 'kernel' in var.name and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))

            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))

            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg

        critic_shapes = [var.get_shape().as_list() for var in self.critic_trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic_trainable_vars, clip_norm=self.clip_norm)

        # un-flatten the gradients for several critics, and compute moment
        self.critic_grad_array = tf.reshape(self.critic_grads,shape=[-1,len(self.critics)])
        self.critic_grad_mean = tf.reduce_mean(self.critic_grad_array,axis=1)
        self.critic_grad_var = reduce_var(self.critic_grad_array,axis=1)

        # sum up the gradients
        self.critic_grad_var_std = tf.sqrt(tf.reduce_sum(self.critic_grad_var))

        # log the shapes of the per-critic gradient statistics
        logger.info('  critic grad array shape: {}'.format(self.critic_grad_array.shape))
        logger.info('  critic grad mean shape: {}'.format(self.critic_grad_mean.shape))
        logger.info('  critic grad variance shape: {}'.format(self.critic_grad_var.shape))
        logger.info('  critic grad var-std shape: {}'.format(self.critic_grad_var_std.shape))

        # add support to single-thread adam
        if self.use_mpi_adam:
            self.critic_optimizer = MpiAdam(var_list=self.critic_trainable_vars,
                                            beta1=0.9, beta2=0.999, epsilon=1e-08)
        else:
            self.critic_grads = list(
                zip(tf.gradients(self.critic_loss, self.critic_trainable_vars), self.critic_trainable_vars))
            self.critic_optimizer = tf.train.AdamOptimizer(learning_rate=self.critic_lr, beta1=0.9, beta2=0.999,
                                                           epsilon=1e-08)
            self.critic_train = self.critic_optimizer.apply_gradients(self.critic_grads)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []

        self.critic_output_vars = []
        self.target_output_vars = []
        for critic, target_critic in zip(self.critics, self.target_critics):
            self.critic_output_vars += critic.output_vars
            self.target_output_vars += target_critic.output_vars

            # renormalize the output layer of every critic / target-critic pair;
            # each output_vars is a (kernel, bias) pair, so check the pairs here rather
            # than on the merged lists, which hold 2 * len(critics) variables
            for vs in [critic.output_vars, target_critic.output_vars]:
                assert len(vs) == 2
                M, b = vs
                assert 'kernel' in M.name
                assert 'bias' in b.name
                assert M.get_shape()[-1] == 1
                assert b.get_shape()[-1] == 1
                self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
                self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        # TODO: compute the variance of values and gradient, for both J and Q
        ops += [tf.reduce_mean(self.critic_tf_mean)]
        names += ['MeanQ_mean_over_states']
        ops += [reduce_std(self.critic_tf_mean)]
        names += ['MeanQ_std_over_states']

        # actor gradient statistics
        ops += [self.actor_grad_var_std]
        names += ['Actor Grad Variance Std']
        ops += [tf.norm(self.actor_grad_mean)]
        names += ['Actor Grad Mean Norm']

        # critic gradient statistics
        ops += [self.critic_grad_var_std]
        names += ['Critic Grad Variance Std']
        ops += [tf.norm(self.critic_grad_mean)]
        names += ['Critic Grad Mean Norm']

        # TODO: outdated stats need to be re-arranged
        # ops += [tf.reduce_mean(self.critic_tf0)]
        # names += ['reference_Q0_mean']
        # ops += [reduce_std(self.critic_tf0)]
        # names += ['reference_Q0_std']
        #
        # ops += [tf.reduce_mean(self.critic_tf1)]
        # names += ['reference_Q1_mean']
        # ops += [reduce_std(self.critic_tf1)]
        # names += ['reference_Q1_std']
        #
        # ops += [tf.reduce_mean(self.critic_with_actor_tf0)]
        # names += ['reference_actor_Q0_mean']
        # ops += [reduce_std(self.critic_with_actor_tf0)]
        # names += ['reference_actor_Q0_std']
        #
        # ops += [tf.reduce_mean(self.critic_with_actor_tf1)]
        # names += ['reference_actor_Q1_mean']
        # ops += [reduce_std(self.critic_with_actor_tf1)]
        # names += ['reference_actor_Q1_std']
        #
        # ops += [tf.reduce_mean(self.actor_tf)]
        # names += ['reference_action_mean']
        # ops += [reduce_std(self.actor_tf)]
        # names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    # compute the action from the observation pi(s)
    #   has an option to compute the q function at the same time 
    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            # TODO: not sure what to do for this critic_with_actor_tf, set to critic_with_actor_tf0
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf_main], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self, take_update=True, stop_critic_training=False, stop_actor_training=False):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        # if self.normalize_returns and self.enable_popart:
        #     # compute old mean, old std and target Q values
        #     # old mean and std is used for normalization
        #     # and target Q values for
        #     old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={
        #         self.obs1: batch['obs1'],
        #         self.rewards: batch['rewards'],
        #         self.terminals1: batch['terminals1'].astype('float32'),
        #     })
        #
        #     # compute something
        #     self.ret_rms.update(target_Q.flatten())
        #     self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
        #         self.old_std : np.array([old_std]),
        #         self.old_mean : np.array([old_mean]),
        #     })
        #
        #     # Run sanity check. Disabled by default since it slows down things considerably.
        #     # print('running sanity check')
        #     # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
        #     #     self.obs1: batch['obs1'],
        #     #     self.rewards: batch['rewards'],
        #     #     self.terminals1: batch['terminals1'].astype('float32'),
        #     # })
        #     # print(target_Q_new, target_Q, new_mean, new_std)
        #     # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        # else:
        #     # compute target Q value functions ( ( 1 - terminal ) * gamma * Q(s,pi(s)) + r )
        #     target_Q = self.sess.run([self.target_Q], feed_dict={
        #         self.obs1: batch['obs1'],
        #         self.rewards: batch['rewards'],
        #         self.terminals1: batch['terminals1'].astype('float32'),
        #     })

        # Get all gradients and perform a "synced update".
        # compute the gradients of actor and critic


        if self.use_mpi_adam:
            ops = [self.critic_grads, self.critic_loss]

            critic_grads, critic_loss = self.sess.run(ops, feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })
            if not stop_critic_training:
                self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

            if take_update:
                ops = [self.actor_grads, self.actor_loss]
                actor_grads, actor_loss = self.sess.run(ops, feed_dict={
                    self.obs0: batch['obs0'],
                })

                if not stop_actor_training:
                    self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
                return critic_loss, actor_loss

        else:
            # when critic training is stopped, fetch the gradients twice so the
            # three-way unpacking below still works without running the train op
            if stop_critic_training:
                ops = [self.critic_grads, self.critic_grads, self.critic_loss]
            else:
                ops = [self.critic_train, self.critic_grads, self.critic_loss]

            _, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })
            if take_update:
                # same trick for the actor: duplicate the gradient fetch when training is stopped
                if stop_actor_training:
                    ops = [self.actor_grads, self.actor_grads, self.actor_loss]
                else:
                    ops = [self.actor_train, self.actor_grads, self.actor_loss]
                _, actor_grads, actor_loss = self.sess.run(ops, feed_dict={
                    self.obs0: batch['obs0'],
                })
                return critic_loss, actor_loss

        return critic_loss, 0

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        if self.use_mpi_adam:
            self.actor_optimizer.sync()
            self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
            self.obs1: self.stats_sample['obs1'],
            self.rewards: self.stats_sample['rewards'],
            self.terminals1: self.stats_sample['terminals1'].astype('float32'),
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })
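# The target computation in EnsembleDDPG adds clipped Gaussian noise to the target
# action before bootstrapping (the "target policy smoothing" idea popularized by TD3).
# A minimal single-critic NumPy sketch of that target, with hypothetical target_actor
# and target_critic callables standing in for the TF graphs above:
import numpy as np

def noisy_target_q(rewards, terminals, next_obs, target_actor, target_critic,
                   gamma=0.99, noise_scale=0.2, noise_clip=0.5, action_range=(-1.0, 1.0)):
    action = target_actor(next_obs)                                   # pi'(s')
    noise = np.clip(np.random.normal(0.0, noise_scale, size=action.shape),
                    -noise_clip, noise_clip)
    noisy_action = np.clip(action + noise, action_range[0], action_range[1])
    q_next = target_critic(next_obs, noisy_action)                    # Q'(s', a~)
    return rewards + (1.0 - terminals) * gamma * q_next               # y = r + (1 - d) * gamma * Q'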
Example #4
class DDPG(tf.Module):
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
        gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
        batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
        critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.observation_shape = observation_shape
        self.critic = critic
        self.actor = actor
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.actor_lr = tf.constant(actor_lr)
        self.critic_lr = tf.constant(critic_lr)

        # Observation normalization.
        if self.normalize_observations:
            with tf.name_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        # Return normalization.
        if self.normalize_returns:
            with tf.name_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        self.target_critic = Critic(actor.nb_actions, observation_shape, name='target_critic', network=critic.network, **critic.network_kwargs)
        self.target_actor = Actor(actor.nb_actions, observation_shape, name='target_actor', network=actor.network, **actor.network_kwargs)

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise()

        if MPI is not None:
            comm = MPI.COMM_WORLD
            self.actor_optimizer = MpiAdamOptimizer(comm, self.actor.trainable_variables)
            self.critic_optimizer = MpiAdamOptimizer(comm, self.critic.trainable_variables)
        else:
            self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
            self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)

        logger.info('setting up actor optimizer')
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_variables]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        logger.info('setting up critic optimizer')
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_variables]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = []
            for layer in self.critic.network_builder.layers[1:]:
                critic_reg_vars.append(layer.kernel)
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))

        logger.info('setting up critic target updates ...')
        for var, target_var in zip(self.critic.variables, self.target_critic.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))
        logger.info('setting up actor target updates ...')
        for var, target_var in zip(self.actor.variables, self.target_actor.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))

        if self.param_noise:
            logger.info('setting up param noise')
            for var, perturbed_var in zip(self.actor.variables, self.perturbed_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
            for var, perturbed_var in zip(self.actor.variables, self.perturbed_adaptive_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name, var.name))

        if self.normalize_returns and self.enable_popart:
            self.setup_popart()

        self.initial_state = None # recurrent architectures not supported yet


    def setup_param_noise(self):
        assert self.param_noise is not None

        # Configure perturbed actor.
        self.perturbed_actor = Actor(self.actor.nb_actions, self.observation_shape, name='param_noise_actor', network=self.actor.network, **self.actor.network_kwargs)

        # Configure separate copy for stddev adoption.
        self.perturbed_adaptive_actor = Actor(self.actor.nb_actions, self.observation_shape, name='adaptive_param_noise_actor', network=self.actor.network, **self.actor.network_kwargs)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1

    @tf.function
    def step(self, obs, apply_noise=True, compute_Q=True):
        normalized_obs = tf.clip_by_value(normalize(obs, self.obs_rms), self.observation_range[0], self.observation_range[1])
        actor_tf = self.actor(normalized_obs)
        if self.param_noise is not None and apply_noise:
            action = self.perturbed_actor(normalized_obs)
        else:
            action = actor_tf

        if compute_Q:
            normalized_critic_with_actor_tf = self.critic(normalized_obs, actor_tf)
            q = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        else:
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            action += noise
        action = tf.clip_by_value(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale

        B = obs0.shape[0]
        for b in range(B):
            self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b])
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def train(self):
        batch = self.memory.sample(batch_size=self.batch_size)
        obs0, obs1 = tf.constant(batch['obs0']), tf.constant(batch['obs1'])
        actions, rewards, terminals1 = tf.constant(batch['actions']), tf.constant(batch['rewards']), tf.constant(batch['terminals1'], dtype=tf.float32)
        normalized_obs0, target_Q = self.compute_normalized_obs0_and_target_Q(obs0, obs1, rewards, terminals1)

        if self.normalize_returns and self.enable_popart:
            old_mean = self.ret_rms.mean
            old_std = self.ret_rms.std
            self.ret_rms.update(target_Q.flatten())
            # renormalize Q outputs
            new_mean = self.ret_rms.mean
            new_std = self.ret_rms.std
            for vs in [self.critic.output_vars, self.target_critic.output_vars]:
                kernel, bias = vs
                kernel.assign(kernel * old_std / new_std)
                bias.assign((bias * old_std + old_mean - new_mean) / new_std)


        actor_grads, actor_loss = self.get_actor_grads(normalized_obs0)
        critic_grads, critic_loss = self.get_critic_grads(normalized_obs0, actions, target_Q)

        if MPI is not None:
            self.actor_optimizer.apply_gradients(actor_grads, self.actor_lr)
            self.critic_optimizer.apply_gradients(critic_grads, self.critic_lr)
        else:
            self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))
            self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))

        return critic_loss, actor_loss

    @tf.function
    def compute_normalized_obs0_and_target_Q(self, obs0, obs1, rewards, terminals1):
        normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(obs1, self.obs_rms), self.observation_range[0], self.observation_range[1])
        Q_obs1 = denormalize(self.target_critic(normalized_obs1, self.target_actor(normalized_obs1)), self.ret_rms)
        target_Q = rewards + (1. - terminals1) * self.gamma * Q_obs1
        return normalized_obs0, target_Q

    @tf.function
    def get_actor_grads(self, normalized_obs0):
        with tf.GradientTape() as tape:
            actor_tf = self.actor(normalized_obs0)
            normalized_critic_with_actor_tf = self.critic(normalized_obs0, actor_tf)
            critic_with_actor_tf = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
            actor_loss = -tf.reduce_mean(critic_with_actor_tf)
        actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
        if self.clip_norm:
            actor_grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in actor_grads]
        if MPI is not None:
            actor_grads = tf.concat([tf.reshape(g, (-1,)) for g in actor_grads], axis=0)
        return actor_grads, actor_loss

    @tf.function
    def get_critic_grads(self, normalized_obs0, actions, target_Q):
        with tf.GradientTape() as tape:
            normalized_critic_tf = self.critic(normalized_obs0, actions)
            normalized_critic_target_tf = tf.clip_by_value(normalize(target_Q, self.ret_rms), self.return_range[0], self.return_range[1])
            critic_loss = tf.reduce_mean(tf.square(normalized_critic_tf - normalized_critic_target_tf))
            if self.critic_l2_reg > 0.:
                # Skip the first (input) layer; the original l2_regularizer takes half of the sum of squares.
                for layer in self.critic.network_builder.layers[1:]:
                    critic_loss += (self.critic_l2_reg / 2.) * tf.reduce_sum(tf.square(layer.kernel))
        critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)
        if self.clip_norm:
            critic_grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in critic_grads]
        if MPI is not None:
            critic_grads = tf.concat([tf.reshape(g, (-1,)) for g in critic_grads], axis=0)
        return critic_grads, critic_loss


    def initialize(self):
        if MPI is not None:
            sync_from_root(self.actor.trainable_variables + self.critic.trainable_variables)
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

    @tf.function
    def update_target_net(self):
        for var, target_var in zip(self.actor.variables, self.target_actor.variables):
            target_var.assign((1. - self.tau) * target_var + self.tau * var)
        for var, target_var in zip(self.critic.variables, self.target_critic.variables):
            target_var.assign((1. - self.tau) * target_var + self.tau * var)

    def get_stats(self):

        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        obs0 = self.stats_sample['obs0']
        actions = self.stats_sample['actions']
        normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1])
        normalized_critic_tf = self.critic(normalized_obs0, actions)
        critic_tf = denormalize(tf.clip_by_value(normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        actor_tf = self.actor(normalized_obs0)
        normalized_critic_with_actor_tf = self.critic(normalized_obs0, actor_tf)
        critic_with_actor_tf = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)

        stats = {}
        if self.normalize_returns:
            stats['ret_rms_mean'] = self.ret_rms.mean
            stats['ret_rms_std'] = self.ret_rms.std
        if self.normalize_observations:
            stats['obs_rms_mean'] = tf.reduce_mean(self.obs_rms.mean)
            stats['obs_rms_std'] = tf.reduce_mean(self.obs_rms.std)
        stats['reference_Q_mean'] = tf.reduce_mean(critic_tf)
        stats['reference_Q_std'] = reduce_std(critic_tf)
        stats['reference_actor_Q_mean'] = tf.reduce_mean(critic_with_actor_tf)
        stats['reference_actor_Q_std'] = reduce_std(critic_with_actor_tf)
        stats['reference_action_mean'] = tf.reduce_mean(actor_tf)
        stats['reference_action_std'] = reduce_std(actor_tf)

        if self.param_noise:
            perturbed_actor_tf = self.perturbed_actor(normalized_obs0)
            stats['reference_perturbed_action_mean'] = tf.reduce_mean(perturbed_actor_tf)
            stats['reference_perturbed_action_std'] = reduce_std(perturbed_actor_tf)
            stats.update(self.param_noise.get_stats())
        return stats


    
    def adapt_param_noise(self, obs0):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        if self.param_noise is None:
            return 0.

        mean_distance = self.get_mean_distance(obs0).numpy()

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(mean_distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()

        self.param_noise.adapt(mean_distance)
        return mean_distance

    @tf.function
    def get_mean_distance(self, obs0):
        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        update_perturbed_actor(self.actor, self.perturbed_adaptive_actor, self.param_noise.current_stddev)

        normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1])
        actor_tf = self.actor(normalized_obs0)
        adaptive_actor_tf = self.perturbed_adaptive_actor(normalized_obs0)
        mean_distance = tf.sqrt(tf.reduce_mean(tf.square(actor_tf - adaptive_actor_tf)))
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            update_perturbed_actor(self.actor, self.perturbed_actor, self.param_noise.current_stddev)
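# update_target_net above performs a Polyak (soft) update of the target networks:
# target <- (1 - tau) * target + tau * online. A small standalone sketch of the same
# rule (plain NumPy, illustrative names only):
import numpy as np

def polyak_update(target_weights, online_weights, tau=0.001):
    return [(1.0 - tau) * t + tau * w for t, w in zip(target_weights, online_weights)]

# After roughly 1/tau updates the target has mostly caught up with the online weights:
online = [np.ones(3)]
target = [np.zeros(3)]
for _ in range(1000):
    target = polyak_update(target, online, tau=0.001)
print(target[0])  # ~0.632 everywhere, i.e. 1 - (1 - tau)**1000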
Example #5
class Model(object):
    def __init__(self, network, env, gamma=1, tau=0.01, total_timesteps=1e6,
                 normalize_observations=True, normalize_returns=False, enable_popart=False,
                 noise_type='adaptive-param_0.2', clip_norm=None, reward_scale=1.,
                 batch_size=128, l2_reg_coef=0.2, actor_lr=1e-4, critic_lr=1e-3,
                 observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
                 **network_kwargs):
        # logger.info('Using agent with the following configuration:')
        # logger.info(str(self.__dict__.items()))
        observation_shape = env.observation_space.shape
        action_shape = env.action_space.shape

        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.env = env
        self.gamma = gamma
        self.tau = tau
        self.total_timesteps = total_timesteps
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.enable_popart = enable_popart
        self.clip_norm = clip_norm
        self.reward_scale = reward_scale
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.batch_size = batch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.l2_reg_coef = l2_reg_coef

        self.stats_sample = None

        self.action_noise = None
        self.param_noise = None
        nb_actions = self.env.action_space.shape[-1]
        if noise_type is not None:
            for current_noise_type in noise_type.split(','):
                current_noise_type = current_noise_type.strip()
                if current_noise_type == 'none':
                    pass
                elif 'adaptive-param' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    self.param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                         desired_action_stddev=float(stddev))
                elif 'normal' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    self.action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
                elif 'ou' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    self.action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                                sigma=float(stddev) * np.ones(nb_actions))
                else:
                    raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

        assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
        self.memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                             observation_shape=env.observation_space.shape)
        self.critic = Critic(network=network, **network_kwargs)
        self.actor = Actor(nb_actions, network=network, **network_kwargs)

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(self.actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(self.critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = self.actor(normalized_obs0)
        self.normalized_critic_tf = self.critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = self.critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet
        self.def_path_pre = os.path.dirname(os.path.abspath(__file__)) + '/tmp/'

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars,
                                                                      self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adoption.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor,
                                                                       self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                       self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.l2_reg_coef > 0.:
            critic_reg_vars = [var for var in self.critic.trainable_vars if
                               var.name.endswith('/w:0') and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.l2_reg_coef))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.l2_reg_coef),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
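        # Pop-Art rescales the critic's final linear layer whenever the return statistics
        # change, so that the denormalized prediction new_std * (W'x + b') + new_mean equals
        # the old prediction old_std * (Wx + b) + old_mean.  Solving gives
        # W' = W * old_std / new_std and b' = (old_std * b + old_mean - new_mean) / new_std,
        # which is exactly what the assign ops below implement for both the online and the
        # target critic.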
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def train_step(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action[0].shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def step(self, obs, compute_Q=True):
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        if compute_Q:
            action, q = self.sess.run([self.actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(self.actor_tf, feed_dict=feed_dict)
            q = None

        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale

        B = obs0.shape[0]
        for b in range(B):
            self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b])
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                                                        feed_dict={
                                                            self.obs1: batch['obs1'],
                                                            self.rewards: batch['rewards'],
                                                            self.terminals1: batch['terminals1'].astype('float32'),
                                                        })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std: np.array([old_std]),
                self.old_mean: np.array([old_mean]),
            })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
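        # The actual scale adjustment happens in self.param_noise.adapt() below; in baselines'
        # AdaptiveParamNoiseSpec (assumed here to be the noise spec passed in) the stddev is
        # divided by the adoption coefficient when the measured action-space distance exceeds
        # the desired stddev, and multiplied by it otherwise.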
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        else:
            mean_distance = distance

        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })

    def learn(self,
              total_timesteps=None,
              seed=None,
              nb_epochs=None,  # with default settings, perform 1M steps total
              nb_epoch_cycles=20,
              nb_rollout_steps=100,
              render=False,
              nb_train_steps=50,  # per epoch cycle and MPI worker,
              batch_size=64,  # per MPI worker
              param_noise_adaption_interval=50,):
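
        # Training is organized as nb_epochs x nb_epoch_cycles; each cycle collects
        # nb_rollout_steps steps from every parallel environment and then performs
        # nb_train_steps gradient updates, adapting the parameter noise every
        # param_noise_adaption_interval training steps.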

        set_global_seeds(seed)

        if total_timesteps is not None:
            assert nb_epochs is None
            nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
        else:
            nb_epochs = 500

        if MPI is not None:
            rank = MPI.COMM_WORLD.Get_rank()
        else:
            rank = 0

        # eval_episode_rewards_history = deque(maxlen=100)
        episode_rewards_history = deque(maxlen=100)
        sess = U.get_session()
        # Prepare everything.
        self.initialize(sess)
        sess.graph.finalize()
        self.reset()

        obs = self.env.reset()
        # if eval_env is not None:
        #     eval_obs = eval_env.reset()
        nenvs = obs.shape[0]

        episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
        episode_step = np.zeros(nenvs, dtype=int)  # vector
        episodes = 0  # scalar
        t = 0  # scalar

        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                if nenvs > 1:
                    # When simulating multiple envs in parallel, the agent cannot be reset at
                    # the end of each individual episode, so reset it here instead.
                    self.reset()
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q, _, _ = self.train_step(obs, apply_noise=True, compute_Q=True)

                    # Execute next action.
                    if rank == 0 and render:
                        self.env.render()

                    # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch
                    # new_obs, r, done, info = self.env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, r, done, info = self.env.step(action)
                    # note these outputs are batched from vecenv

                    t += 1
                    if rank == 0 and render:
                        self.env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    # the batched data is unrolled in store_transition before being appended to memory.
                    self.store_transition(obs, action, r, new_obs, done)

                    obs = new_obs

                    for d in range(len(done)):
                        if done[d]:
                            # Episode done.
                            epoch_episode_rewards.append(episode_reward[d])
                            episode_rewards_history.append(episode_reward[d])
                            epoch_episode_steps.append(episode_step[d])
                            episode_reward[d] = 0.
                            episode_step[d] = 0
                            epoch_episodes += 1
                            episodes += 1
                            if nenvs == 1:
                                self.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if self.memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = self.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = self.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    self.update_target_net()
            #
            # # Evaluate.
            # eval_episode_rewards = []
            # eval_qs = []
            # if eval_env is not None:
            #     eval_obs = eval_env.reset()
            #     nenvs_eval = eval_obs.shape[0]
            #     eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
            #     for t_rollout in range(nb_eval_steps):
            #         eval_action, eval_q, _, _ = self.train_step(eval_obs, apply_noise=False, compute_Q=True)
            #         # eval_obs, eval_r, eval_done, eval_info = eval_env.step(
            #         #     max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
            #         eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
            #
            #         if render_eval:
            #             eval_env.render()
            #         eval_episode_reward += eval_r
            #
            #         eval_qs.append(eval_q)
            #         for d in range(len(eval_done)):
            #             if eval_done[d]:
            #                 eval_episode_rewards.append(eval_episode_reward[d])
            #                 eval_episode_rewards_history.append(eval_episode_reward[d])
            #                 eval_episode_reward[d] = 0.0

            if MPI is not None:
                mpi_size = MPI.COMM_WORLD.Get_size()
            else:
                mpi_size = 1

            # save trainable variables
            file_name = time.strftime('Y%YM%mD%d_h%Hm%Ms%S', time.localtime(time.time()))
            model_save_path = self.def_path_pre + file_name
            self.save(model_save_path)

            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = self.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_std'] = np.std(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/return_history_std'] = np.std(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            # combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            # combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            # combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            # if eval_env is not None:
            #     combined_stats['eval/return'] = eval_episode_rewards
            #     combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            #     combined_stats['eval/Q'] = eval_qs
            #     combined_stats['eval/episodes'] = len(eval_episode_rewards)

            combined_stats_sums = np.array([np.array(x).flatten()[0] for x in combined_stats.values()])
            if MPI is not None:
                combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)

            combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])

            if rank == 0:
                logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(self.env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(self.env.get_state(), f)
                # if eval_env and hasattr(eval_env, 'get_state'):
                #     with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                #         pickle.dump(eval_env.get_state(), f)
        self.sess.graph._unsafe_unfinalize()
        return self

    def save(self, save_path=None):
        save_variables(save_path=save_path, sess=self.sess)
        print('save model variables to', save_path)

    def load_newest(self, load_path=None):
        file_list = os.listdir(self.def_path_pre)
        file_list.sort(key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)))
        if load_path is None:
            load_path = os.path.join(self.def_path_pre, file_list[-1])
        load_variables(load_path=load_path, sess=self.sess)
        print('load_path: ', load_path)

    def load_index(self, index, load_path=None):
        file_list = os.listdir(self.def_path_pre)
        file_list.sort(key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)), reverse=True)
        if load_path is None:
            load_path = os.path.join(self.def_path_pre, file_list[index])
        load_variables(load_path=load_path, sess=self.sess)
        print('load_path: ', load_path)
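
# The DDPG agents in these examples keep their target networks in sync with Polyak averaging;
# get_target_updates builds the corresponding TF assign ops.  A minimal NumPy sketch of the
# same update rule (toy parameter lists, not the actual networks):
import numpy as np

def soft_update(target_params, source_params, tau=0.001):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    return [(1.0 - tau) * t + tau * s for t, s in zip(target_params, source_params)]

target = soft_update(target_params=[np.zeros((2, 2)), np.ones(2)],
                     source_params=[np.ones((2, 2)), np.zeros(2)],
                     tau=0.5)
print(target[0])  # [[0.5 0.5]
                  #  [0.5 0.5]]
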
Example #6
class DDPG(object):
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
        gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
        batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
        adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1,
        critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.,
                 td3_variant=False, td3_policy_freq=1, td3_policy_noise=0.0, td3_noise_clip=0.5):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        
        # Added content:
        #parameters for using TD3 variant of DDPG
        #https://arxiv.org/abs/1802.09477
        self.td3_variant = td3_variant
        self.td3_policy_freq = td3_policy_freq
        self.td3_policy_noise = td3_policy_noise
        self.td3_noise_clip = td3_noise_clip
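        # TD3 (Fujimoto et al., 2018) modifies DDPG with three tricks, all toggled above:
        # clipped double-Q learning (td3_variant uses the minimum of two target critics),
        # clipped smoothing noise on the actions used for the critic update (td3_policy_noise,
        # td3_noise_clip), and delayed actor / target-network updates (td3_policy_freq).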

        # Let's bring the normalization code over from HER.
        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)

        if self.td3_variant:
            logger.info('using TD3 variant model')
            self.normalized_critic_tf, self.normalized_critic_tf2 = critic(normalized_obs0, self.actions)
            self.critic_tf = denormalize(
                tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
            self.normalized_critic_with_actor_tf, _ = critic(normalized_obs0, self.actor_tf, reuse=True)
            self.critic_with_actor_tf = denormalize(
                tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]),
                self.ret_rms)
            out_q1, out_q2 = target_critic(normalized_obs1, target_actor(normalized_obs1))
            min_q1 = tf.minimum(out_q1,out_q2)
            Q_obs1 = denormalize(min_q1, self.ret_rms)
        else:
            self.normalized_critic_tf = critic(normalized_obs0, self.actions)
            self.critic_tf = denormalize(
                tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
            self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
            self.critic_with_actor_tf = denormalize(
                tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]),
                self.ret_rms)
            Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None # recurrent architectures not supported yet

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.actor_target_soft_updates = actor_soft_updates
        self.critic_target_soft_updates = critic_soft_updates

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure a separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                       self.return_range[0], self.return_range[1])
        if self.td3_variant:
            logger.info('using TD3 variant loss')
            self.critic_loss = tf.losses.mean_squared_error(normalized_critic_target_tf, self.normalized_critic_tf) \
                               + tf.losses.mean_squared_error(normalized_critic_target_tf, self.normalized_critic_tf2)
        else:
            self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))

        if self.critic_l2_reg > 0.:
            critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def step(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action[0].shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale

        B = obs0.shape[0]
        for b in range(B):
            self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b])
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def train(self, train_iter):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std : np.array([old_std]),
                self.old_mean : np.array([old_mean]),
            })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        if self.td3_policy_noise > 0:
            # Note: canonical TD3 adds clipped Gaussian noise to the *target* policy's action
            # when forming target_Q (target-policy smoothing); this implementation instead
            # perturbs the sampled batch actions fed to the critic, which acts as a related
            # smoothing regularizer on the critic update.
            noise = np.random.normal(loc=0.0, scale=self.td3_policy_noise, size=np.shape(batch['actions']))
            noise = np.clip(noise, -self.td3_noise_clip, self.td3_noise_clip)
            # Get all gradients and perform a synced update.
            ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
            actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
                self.obs0: batch['obs0'],
                self.actions: np.clip(batch['actions'] + noise, self.action_range[0], self.action_range[1]),
                self.critic_target: target_Q,
            })
        else:
            # Get all gradients and perform a synced update.
            ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
            actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
            })


        # TD3 has a hyperparameter controlling how frequently to update the actor policy and target networks.
        if train_iter % self.td3_policy_freq == 0:
            self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)

        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self, train_iter):
        # TD3 has a hyperparameter controlling how frequently to update the actor policy and target networks.
        if train_iter % self.td3_policy_freq == 0:
            self.sess.run(self.actor_target_soft_updates)
            self.sess.run(self.critic_target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })
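
# A minimal NumPy sketch of the clipped double-Q (TD3) target built above when td3_variant is
# enabled: the bootstrap term uses the elementwise minimum of the two target critics
# (toy values; q1_next and q2_next stand in for the target critic outputs):
import numpy as np

gamma = 0.99
rewards = np.array([[1.0], [0.5]])
terminals = np.array([[0.0], [1.0]])
q1_next = np.array([[10.0], [8.0]])
q2_next = np.array([[9.0], [12.0]])

target_q = rewards + (1.0 - terminals) * gamma * np.minimum(q1_next, q2_next)
print(target_q)  # rows: 9.91 and 0.5
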
Example #7
class SAC(RLAlgorithm):
    """Soft Actor-Critic (SAC)

    References
    ----------
    [1] Tuomas Haarnoja*, Aurick Zhou*, Kristian Hartikainen*, George Tucker,
        Sehoon Ha, Jie Tan, Vikash Kumar, Henry Zhu, Abhishek Gupta, Pieter
        Abbeel, and Sergey Levine. Soft Actor-Critic Algorithms and
        Applications. arXiv preprint arXiv:1812.05905. 2018.
    """

    def __init__(
            self,
            training_environment,
            evaluation_environment,
            policy,
            Qs,
            pool,
            plotter=None,

            lr=3e-4,
            reward_scale=1.0,
            target_entropy='auto',
            discount=0.99,
            tau=5e-3,
            target_update_interval=1,
            action_prior='uniform',
            reparameterize=False,
            store_extra_policy_info=False,

            save_full_state=False,
            **kwargs,
    ):
        """
        Args:
            env (`SoftlearningEnv`): Environment used for training.
            policy: A policy function approximator.
            initial_exploration_policy: ('Policy'): A policy that we use
                for initial exploration which is not trained by the algorithm.
            Qs: Q-function approximators. The min of these
                approximators will be used. Usage of at least two Q-functions
                improves performance by reducing overestimation bias.
            pool (`PoolBase`): Replay pool to add gathered samples to.
            plotter (`QFPolicyPlotter`): Plotter instance to be used for
                visualizing Q-function during training.
            lr (`float`): Learning rate used for the function approximators.
            discount (`float`): Discount factor for Q-function updates.
            tau (`float`): Soft value function target update weight.
            target_update_interval ('int'): Frequency at which target network
                updates occur in iterations.
            reparameterize ('bool'): If True, we use a gradient estimator for
                the policy derived using the reparameterization trick. We use
                a likelihood ratio based estimator otherwise.
        """

        super(SAC, self).__init__(**kwargs)

        self._training_environment = training_environment
        self._evaluation_environment = evaluation_environment
        self._policy = policy

        self._Qs = Qs
        self._Q_targets = tuple(tf.keras.models.clone_model(Q) for Q in Qs)

        self._pool = pool
        self._plotter = plotter

        self._policy_lr = lr
        self._Q_lr = lr
        self.value_rms = RunningMeanStd(shape=(1,))

        self._reward_scale = reward_scale
        self._target_entropy = (
            -np.prod(self._training_environment.action_space.shape)
            if target_entropy == 'auto'
            else target_entropy)

        self._discount = discount
        self._tau = tau
        self._target_update_interval = target_update_interval
        self._action_prior = action_prior

        self._reparameterize = reparameterize
        self._store_extra_policy_info = store_extra_policy_info

        self._save_full_state = save_full_state

        observation_shape = self._training_environment.active_observation_shape
        action_shape = self._training_environment.action_space.shape

        assert len(observation_shape) == 1, observation_shape
        self._observation_shape = observation_shape
        assert len(action_shape) == 1, action_shape
        self._action_shape = action_shape

        self._build()

    def _build(self):
        self._training_ops = {}

        self._init_global_step()
        self._init_placeholders()
        self._init_actor_update()
        self._init_critic_update()
        self._init_diagnostics_ops()

    def _init_placeholders(self):
        """Create input placeholders for the SAC algorithm.

        Creates `tf.placeholder`s for:
            - observation
            - next observation
            - action
            - reward
            - terminals
        """
        self._iteration_ph = tf.placeholder(
            tf.int64, shape=None, name='iteration')

        self._observations_ph = tf.placeholder(
            tf.float32,
            shape=(None, *self._observation_shape),
            name='observation',
        )

        self._next_observations_ph = tf.placeholder(
            tf.float32,
            shape=(None, *self._observation_shape),
            name='next_observation',
        )

        self._actions_ph = tf.placeholder(
            tf.float32,
            shape=(None, *self._action_shape),
            name='actions',
        )

        self._rewards_ph = tf.placeholder(
            tf.float32,
            shape=(None, 1),
            name='rewards',
        )

        self._terminals_ph = tf.placeholder(
            tf.float32,
            shape=(None, 1),
            name='terminals',
        )

        if self._store_extra_policy_info:
            self._log_pis_ph = tf.placeholder(
                tf.float32,
                shape=(None, 1),
                name='log_pis',
            )
            self._raw_actions_ph = tf.placeholder(
                tf.float32,
                shape=(None, *self._action_shape),
                name='raw_actions',
            )

    def _get_Q_target(self):
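        # Soft Bellman target: y = reward_scale * r + discount * (1 - done)
        # * (min_i Q_target_i(s', a') - alpha * log pi(a'|s')).  The value_rms factors used
        # here to denormalize the target Qs (and to normalize the first returned target)
        # appear to be an addition of this variant and are not part of the reference SAC
        # update in [1].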
        next_actions = self._policy.actions([self._next_observations_ph])
        next_log_pis = self._policy.log_pis(
            [self._next_observations_ph], next_actions)

        next_Qs_values = tuple(
            Q([self._next_observations_ph, next_actions]) * self.value_rms.std + self.value_rms.mean
            for Q in self._Q_targets)

        min_next_Q = tf.reduce_min(next_Qs_values, axis=0)
        next_value = min_next_Q - self._alpha * next_log_pis

        Q_target = td_target(
            reward=self._reward_scale * self._rewards_ph,
            discount=self._discount,
            next_value=(1 - self._terminals_ph) * next_value)
        return (Q_target - self.value_rms.mean)/self.value_rms.std, Q_target

    def _init_critic_update(self):
        """Create minimization operation for critic Q-function.

        Creates a `tf.optimizer.minimize` operation for updating
        critic Q-function with gradient descent, and appends it to
        `self._training_ops` attribute.

        See Equations (5, 6) in [1], for further information of the
        Q-function update rule.
        """
        Q_target, self.raw_q_target = [q[0] for q in tf.split(tf.stop_gradient(self._get_Q_target()), axis=0, num_or_size_splits=2)]

        assert Q_target.shape.as_list() == [None, 1]

        Q_values = self._Q_values = tuple(
            Q([self._observations_ph, self._actions_ph])
            for Q in self._Qs)

        Q_losses = self._Q_losses = tuple(
            tf.losses.mean_squared_error(
                labels=Q_target, predictions=Q_value, weights=0.5)
            for Q_value in Q_values)

        self._Q_optimizers = tuple(
            tf.train.AdamOptimizer(
                learning_rate=self._Q_lr,
                name='{}_{}_optimizer'.format(Q._name, i)
            ) for i, Q in enumerate(self._Qs))

        Q_training_ops = tuple(
            Q_optimizer.minimize(loss=Q_loss, var_list=Q.trainable_variables)
            for i, (Q, Q_loss, Q_optimizer)
            in enumerate(zip(self._Qs, Q_losses, self._Q_optimizers)))
        self._training_ops.update({'Q': tf.group(Q_training_ops)})

    def _init_actor_update(self):
        """Create minimization operations for policy and entropy.

        Creates a `tf.optimizer.minimize` operations for updating
        policy and entropy with gradient descent, and adds them to
        `self._training_ops` attribute.

        See Section 4.2 in [1], for further information of the policy update,
        and Section 5 in [1] for further information of the entropy update.
        """

        actions = self._policy.actions([self._observations_ph])
        log_pis = self._policy.log_pis([self._observations_ph], actions)

        assert log_pis.shape.as_list() == [None, 1]

        log_alpha = self._log_alpha = tf.get_variable(
            'log_alpha',
            dtype=tf.float32,
            initializer=0.0)
        alpha = tf.exp(log_alpha)
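        # alpha = exp(log_alpha) is the entropy temperature.  When a numeric target entropy is
        # given, log_alpha is trained below to minimize -E[log_alpha * (log_pi + target_entropy)],
        # which raises alpha when the policy's entropy drops below the target and lowers it
        # otherwise.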

        if isinstance(self._target_entropy, Number):
            alpha_loss = -tf.reduce_mean(
                log_alpha * tf.stop_gradient(log_pis + self._target_entropy))

            self._alpha_optimizer = tf.train.AdamOptimizer(
                self._policy_lr, name='alpha_optimizer')
            self._alpha_train_op = self._alpha_optimizer.minimize(
                loss=alpha_loss, var_list=[log_alpha])

            self._training_ops.update({
                'temperature_alpha': self._alpha_train_op
            })

        self._alpha = alpha

        if self._action_prior == 'normal':
            policy_prior = tfp.distributions.MultivariateNormalDiag(
                loc=tf.zeros(self._action_shape),
                scale_diag=tf.ones(self._action_shape))
            policy_prior_log_probs = policy_prior.log_prob(actions)
        elif self._action_prior == 'uniform':
            policy_prior_log_probs = 0.0

        Q_log_targets = tuple(
            Q([self._observations_ph, actions])
            for Q in self._Qs)
        min_Q_log_target = tf.reduce_min(Q_log_targets, axis=0)

        if self._reparameterize:
            policy_kl_losses = (
                alpha * log_pis
                - min_Q_log_target
                - policy_prior_log_probs)
        else:
            raise NotImplementedError

        assert policy_kl_losses.shape.as_list() == [None, 1]

        self._policy_losses = policy_kl_losses
        policy_loss = tf.reduce_mean(policy_kl_losses)

        self._policy_optimizer = tf.train.AdamOptimizer(
            learning_rate=self._policy_lr,
            name="policy_optimizer")

        policy_train_op = self._policy_optimizer.minimize(
            loss=policy_loss,
            var_list=self._policy.trainable_variables)

        self._training_ops.update({'policy_train_op': policy_train_op})

    def _init_diagnostics_ops(self):
        diagnosables = OrderedDict((
            ('Q_value', self._Q_values),
            ('Q_loss', self._Q_losses),
            ('policy_loss', self._policy_losses),
            ('alpha', self._alpha)
        ))

        diagnostic_metrics = OrderedDict((
            ('mean', tf.reduce_mean),
            ('std', lambda x: tfp.stats.stddev(x, sample_axis=None)),
        ))

        self._diagnostics_ops = OrderedDict([
            (f'{key}-{metric_name}', metric_fn(values))
            for key, values in diagnosables.items()
            for metric_name, metric_fn in diagnostic_metrics.items()
        ])

    def _init_training(self):
        self._update_target(tau=1.0)

    def _update_target(self, tau=None):
        tau = tau or self._tau

        for Q, Q_target in zip(self._Qs, self._Q_targets):
            source_params = Q.get_weights()
            target_params = Q_target.get_weights()
            Q_target.set_weights([
                tau * source + (1.0 - tau) * target
                for source, target in zip(source_params, target_params)
            ])

    def _do_training(self, iteration, batch):
        """Runs the operations for updating training and target ops."""

        feed_dict = self._get_feed_dict(iteration, batch)
        val = self._session.run([self.raw_q_target, self._training_ops], feed_dict)
        self.value_rms.update(val[0])
        if iteration % self._target_update_interval == 0:
            # Run target ops here.
            self._update_target()

    def _get_feed_dict(self, iteration, batch):
        """Construct TensorFlow feed_dict from sample batch."""

        feed_dict = {
            self._observations_ph: batch['observations'],
            self._actions_ph: batch['actions'],
            self._next_observations_ph: batch['next_observations'],
            self._rewards_ph: batch['rewards'],
            self._terminals_ph: batch['terminals'],
        }

        if self._store_extra_policy_info:
            feed_dict[self._log_pis_ph] = batch['log_pis']
            feed_dict[self._raw_actions_ph] = batch['raw_actions']

        if iteration is not None:
            feed_dict[self._iteration_ph] = iteration

        return feed_dict

    def get_diagnostics(self,
                        iteration,
                        batch,
                        training_paths,
                        evaluation_paths):
        """Return diagnostic information as ordered dictionary.

        Records mean and standard deviation of Q-function and state
        value function, and TD-loss (mean squared Bellman error)
        for the sample batch.

        Also calls the `draw` method of the plotter, if plotter defined.
        """

        feed_dict = self._get_feed_dict(iteration, batch)
        diagnostics = self._session.run(self._diagnostics_ops, feed_dict)

        diagnostics.update(OrderedDict([
            (f'policy/{key}', value)
            for key, value in
            self._policy.get_diagnostics(batch['observations']).items()
        ]))

        if self._plotter:
            self._plotter.draw()

        return diagnostics

    @property
    def tf_saveables(self):
        saveables = {
            '_policy_optimizer': self._policy_optimizer,
            **{
                f'Q_optimizer_{i}': optimizer
                for i, optimizer in enumerate(self._Q_optimizers)
            },
            '_log_alpha': self._log_alpha,
        }

        if hasattr(self, '_alpha_optimizer'):
            saveables['_alpha_optimizer'] = self._alpha_optimizer

        return saveables
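
# A minimal NumPy sketch of the entropy-regularized target that _get_Q_target above builds
# (toy values; min_next_q and log_pi stand in for the minimum target-Q and the policy's
# log-probability of the sampled next action):
import numpy as np

alpha, discount, reward_scale = 0.2, 0.99, 1.0
rewards = np.array([[1.0]])
terminals = np.array([[0.0]])
min_next_q = np.array([[5.0]])
log_pi = np.array([[-1.5]])

next_value = min_next_q - alpha * log_pi  # 5.0 - 0.2 * (-1.5) = 5.3
q_target = reward_scale * rewards + discount * (1.0 - terminals) * next_value
print(q_target)  # [[6.247]]
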
Example #8
class DDPG(object):
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True,
                 adaptive_param_noise_policy_threshold=.1,
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.,
                 aux_apply='both',
                 aux_tasks=[],
                 aux_lambdas={}):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        self.norm_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                          self.observation_range[0],
                                          self.observation_range[1])
        self.norm_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                          self.observation_range[0],
                                          self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Aux Inputs.
        self.aux_apply = aux_apply
        self.aux_tasks = aux_tasks
        self.aux_lambdas = aux_lambdas

        if 'prop' in self.aux_tasks or 'caus' in self.aux_tasks or 'repeat' in self.aux_tasks:
            self.obs100 = tf.placeholder(tf.float32,
                                         shape=(None, ) + observation_shape,
                                         name='obs100')
            self.obs101 = tf.placeholder(tf.float32,
                                         shape=(None, ) + observation_shape,
                                         name='obs101')
            self.actions100 = tf.placeholder(tf.float32,
                                             shape=(None, ) + action_shape,
                                             name='actions100')
            self.norm_obs100 = tf.clip_by_value(
                normalize(self.obs100, self.obs_rms),
                self.observation_range[0], self.observation_range[1])
            self.norm_obs101 = tf.clip_by_value(
                normalize(self.obs101, self.obs_rms),
                self.observation_range[0], self.observation_range[1])
        if 'caus' in self.aux_tasks:
            self.rewards100 = tf.placeholder(tf.float32,
                                             shape=(None, 1),
                                             name='rewards100')

        # Create target networks.
        target_actor = deepcopy(actor)
        target_actor.name = 'target_actor'
        target_actor.repr.name = 'target_actor_repr'
        self.target_actor = target_actor
        target_critic = deepcopy(critic)
        target_critic.name = 'target_critic'
        target_critic.repr.name = 'target_critic_repr'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(self.norm_obs0)
        self.normalized_critic_tf = critic(self.norm_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(self.norm_obs0,
                                                      self.actor_tf,
                                                      reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)

        Q_obs1 = denormalize(
            target_critic(self.norm_obs1, target_actor(self.norm_obs1)),
            self.ret_rms)
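        # 1-step TD target: r_t + gamma * (1 - done) * Q'(s_{t+1}, mu'(s_{t+1})),
        # where Q' and mu' are the target critic and target actor (Q_obs1 above).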
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(self.norm_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        if self.aux_tasks:
            logger.info("aux_tasks:{}".format(self.aux_tasks))
            self.setup_aux_optimizer()

    def setup_aux_optimizer(self):
        logger.info('setting up aux optimizer...')
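
        # The tc/prop/caus/repeat tasks are state-representation priors in the spirit of
        # Jonschkowski & Brock's robotic priors (temporal coherence, proportionality,
        # causality, repeatability); 'predict' is a forward-model reconstruction loss.
        # All of them reuse the owner's representation sub-network.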

        # check if unknown or duplicate aux tasks have been given
        for task in self.aux_tasks:
            if task not in ("tc", "prop", "caus", "repeat", "predict"):
                raise ValueError("!! task {} not implemented !!".format(task))
            if self.aux_tasks.count(task) > 1:
                raise ValueError(
                    "!! multiple tasks {} given, not valid !!".format(task))

        self.aux_ops = []
        # scalar accumulator for the combined auxiliary loss (built up below)
        self.aux_losses = tf.constant(0.0, dtype=tf.float32, name="aux_loss")
        self.aux_vars = set([])

        reprowners = []
        if self.aux_apply in ('actor', 'both'):
            reprowners.append(self.actor)
        if self.aux_apply in ('critic', 'both'):
            reprowners.append(self.critic)

        for owner in reprowners:
            if any(task in self.aux_tasks
                   for task in ("tc", "prop", "caus", "repeat")):
                representation = Representation(name=owner.repr.name,
                                                layer_norm=owner.layer_norm)
                self.aux_vars.update(set(representation.trainable_vars))
                s0 = representation(self.norm_obs0, reuse=True)

            if any(task in self.aux_tasks
                   for task in ("tc", "prop", "repeat")):
                s1 = representation(self.norm_obs1, reuse=True)

            if any(task in self.aux_tasks
                   for task in ("prop", "caus", "repeat")):
                s100 = representation(self.norm_obs100, reuse=True)

            if any(task in self.aux_tasks for task in ("prop", "repeat")):
                s101 = representation(self.norm_obs101, reuse=True)

            if 'tc' in self.aux_tasks:
                # temporal coherence loss is the sum of two terms:
                #   a - penalizes small state changes produced by large actions
                #   b - penalizes large state changes produced by small actions
                #       (similarity() acts as the inversion mechanism here)
                tc_loss_a = similarity(magnitude(s1 - s0)) * magnitude(
                    self.actions)
                tc_loss_b = similarity(magnitude(
                    self.actions)) * magnitude(s1 - s0)
                self.tc_loss = tf.reduce_mean(tc_loss_a + tc_loss_b)
                self.aux_losses += normalize_loss(self.tc_loss)

            if 'prop' in self.aux_tasks:
                # proportionality loss:
                #   punish the difference in magnitude of state change, given action similarity
                #   for two unrelated steps
                dsmag0 = magnitude(s1 - s0)
                dsmag100 = magnitude(s101 - s100)
                dsmagdiff = tf.square(dsmag100 - dsmag0)
                actmagsim = similarity(
                    magnitude(self.actions100 - self.actions))
                self.prop_loss = tf.reduce_mean(dsmagdiff * actmagsim)
                self.aux_losses += normalize_loss(self.prop_loss)

            if 'caus' in self.aux_tasks:
                # causality loss:
                #   punish similarity in state, given action similarity and reward difference
                #   for two unrelated steps
                s_sim = similarity(magnitude(s100 - s0))
                a_sim = similarity(magnitude(self.actions100 - self.actions))
                r_diff = magnitude(self.rewards100 - self.rewards)
                self.caus_loss = tf.reduce_mean(s_sim * a_sim * r_diff)
                self.aux_losses += normalize_loss(self.caus_loss)

            if 'repeat' in self.aux_tasks:
                # repeatability loss:
                #   punish difference in state change, given state and action similarity
                #   for two unrelated steps
                ds0 = s1 - s0
                ds100 = s101 - s100
                dsdiff = magnitude(ds100 - ds0)
                s_sim = similarity(magnitude(s100 - s0))
                a_sim = similarity(magnitude(self.actions100 - self.actions))
                self.repeat_loss = tf.reduce_mean(dsdiff * s_sim * a_sim)
                self.aux_losses += normalize_loss(self.repeat_loss)

            if 'predict' in self.aux_tasks:
                # prediction loss:
                #   punish the difference between the actual and predicted next step
                predictor = Predictor(name=owner.name,
                                      layer_norm=owner.layer_norm)
                reconstr = predictor(self.norm_obs0, self.actions, reuse=True)
                self.pred_loss = tf.nn.l2_loss(self.norm_obs1 - reconstr)
                self.aux_losses += normalize_loss(self.pred_loss)
                self.aux_vars.update(set(predictor.trainable_vars))

        # average over representation owners and tasks
        self.aux_losses = self.aux_losses / (len(reprowners) * len(self.aux_tasks))
        self.aux_vars = list(self.aux_vars)
        self.aux_grads = U.flatgrad(self.aux_losses,
                                    self.aux_vars,
                                    clip_norm=self.clip_norm)
        self.aux_optimizer = MpiAdam(var_list=self.aux_vars,
                                     beta1=0.9,
                                     beta2=0.999,
                                     epsilon=1e-08)

    def setup_target_network_updates(self):
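        # get_target_updates (defined elsewhere in this code base) is assumed to return two op
        # groups: a hard copy for initialization (target <- source) and a Polyak soft update
        # (target <- (1 - tau) * target + tau * source) to be run after each training step.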
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        param_noise_actor.repr.name = 'param_noise_actor_repr'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure a separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_param_noise_actor.repr.name = 'adaptive_param_noise_actor_repr'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(normalize_loss(self.actor_loss),
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg

        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(normalize_loss(self.critic_loss),
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean
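        # Pop-Art keeps the denormalized critic output invariant when the return statistics change:
        # since Q = std * (W.h + b) + mean, rescaling the output layer with
        #   W <- W * old_std / new_std
        #   b <- (b * old_std + old_mean - new_mean) / new_std
        # preserves old_std * (W.h + b) + old_mean for every input h.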

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        if self.aux_tasks:
            batch = self.memory.sampletwice(batch_size=self.batch_size)
        else:
            batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q,
                                     feed_dict={
                                         self.obs1:
                                         batch['obs1'],
                                         self.rewards:
                                         batch['rewards'],
                                         self.terminals1:
                                         batch['terminals1'].astype('float32'),
                                     })

        # Get gradients DDPG
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]
        feed_dict = {
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q
        }
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops, feed_dict=feed_dict)

        #print("actor grads norm: {}".format(np.linalg.norm(actor_grads)))
        #print("critic grads norm: {}".format(np.linalg.norm(critic_grads)))
        # Perform a synced update.
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        auxoutputs = []
        # Get gradients AUX
        if self.aux_tasks:
            aux_dict = {}
            aux_ops = {'aux_grads': self.aux_grads}
            for index, auxtask in enumerate(self.aux_tasks):
                if auxtask == 'tc':
                    aux_dict.update({
                        self.obs0: batch['obs0'],
                        self.obs1: batch['obs1'],
                        self.actions: batch['actions']
                    })
                    aux_ops.update({'tc': self.tc_loss})
                if auxtask == 'prop':
                    aux_dict.update({
                        self.obs0: batch['obs0'],
                        self.obs1: batch['obs1'],
                        self.obs100: batch['obs100'],
                        self.obs101: batch['obs101'],
                        self.actions: batch['actions'],
                        self.actions100: batch['actions100']
                    })
                    aux_ops.update({'prop': self.prop_loss})
                if auxtask == 'caus':
                    aux_dict.update({
                        self.obs0: batch['obs0'],
                        self.obs100: batch['obs100'],
                        self.actions: batch['actions'],
                        self.actions100: batch['actions100'],
                        self.rewards: batch['rewards'],
                        self.rewards100: batch['rewards100']
                    })
                    aux_ops.update({'caus': self.caus_loss})
                if auxtask == 'repeat':
                    aux_dict.update({
                        self.obs0: batch['obs0'],
                        self.obs1: batch['obs1'],
                        self.obs100: batch['obs100'],
                        self.obs101: batch['obs101'],
                        self.actions: batch['actions'],
                        self.actions100: batch['actions100']
                    })
                    aux_ops.update({'repeat': self.repeat_loss})
                if auxtask == 'predict':
                    aux_dict.update({
                        self.obs0: batch['obs0'],
                        self.obs1: batch['obs1'],
                        self.actions: batch['actions']
                    })
                    aux_ops.update({'predict': self.pred_loss})
            auxoutputs = self.sess.run(aux_ops, feed_dict=aux_dict)
            auxgrads = auxoutputs['aux_grads']
            # add act and crit grads to auxoutputs
            auxoutputs['actor_grads'] = actor_grads
            auxoutputs['critic_grads'] = critic_grads
            #print("aux grads norm: {}".format(np.linalg.norm(auxgrads)))
            self.aux_optimizer.update(auxgrads, stepsize=self.actor_lr)

        return critic_loss, actor_loss, auxoutputs

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0: self.stats_sample['obs0'],
                                   self.actions: self.stats_sample['actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })

        mean_distance = MPI.COMM_WORLD.allreduce(
            distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })
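
# The auxiliary losses above rely on three small helpers -- magnitude(), similarity() and
# normalize_loss() -- that are defined elsewhere in this code base. A minimal sketch of what
# they might look like (an illustrative guess, not the original implementations):
def magnitude(x):
    # squared L2 norm along the last (feature) axis
    return tf.reduce_sum(tf.square(x), axis=-1)

def similarity(x):
    # maps a non-negative magnitude to (0, 1]; small magnitudes give values close to 1
    return tf.exp(-x)

def normalize_loss(loss):
    # rescale a scalar loss by its own (stop-gradient) value so each term enters the
    # combined objective on a comparable scale
    return loss / (tf.stop_gradient(loss) + 1e-8)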

class ActorLearner(object):
    def __init__(self, name, actor, memory, observation_shape, action_shape,
                 gamma=0.95, tau=0.001, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.),
                 return_range=(-np.inf, np.inf),
                 actor_l2_reg=0., actor_lr=5e-5, clip_norm=None):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='expert_actor_obs0')
        self.action_target = tf.placeholder(tf.float32, shape=(None,) + action_shape, name=name+'action_target')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.return_range = return_range
        self.observation_range = observation_range
        self.clip_norm = clip_norm
        self.batch_size = batch_size
        self.stats_sample = None
        self.actor_l2_reg = actor_l2_reg
        self.actor = actor
        self.actor_lr = actor_lr

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope(name + 'obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
            self.observation_range[0], self.observation_range[1])

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)

        # Set up parts.
        self.setup_actor_optimizer()
        self.setup_stats()

        self.initial_state = None # recurrent architectures not supported yet

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = tf.reduce_mean(tf.square(self.actor_tf - self.action_target))
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = tf.train.AdamOptimizer(learning_rate=self.actor_lr)
        self.optimize_expr = self.actor_optimizer.minimize(self.actor_loss, var_list=self.actor.trainable_vars)

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        self.stats_ops = ops
        self.stats_names = names

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)
        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss]
        actor_grads, actor_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.action_target: batch['actions'],
        })
        self.optimize_expr.run(session=self.sess,
                               feed_dict={
                                   self.obs0: batch['obs0'],
                                   self.action_target: batch['actions'],
                               })

        return actor_loss

    def initialize(self, sess):
        self.sess = sess

    def save(self, path):
        save_variables(path)

    def load(self, path):
        load_variables(path)

    def store_transition(self, obs0, action):
        self.memory.append(obs0, action)
        if self.normalize_observations:
            self.obs_rms.update(obs0)
        logger.debug("stored expert transition with obs shape {}".format(obs0.shape))

    def __call__(self, obs):
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, obs)}
        action = self.sess.run([self.actor_tf], feed_dict=feed_dict)
        return action
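
# Usage sketch for ActorLearner (behaviour cloning of an expert policy). Illustrative only:
# `build_actor`, `expert_pairs` and the hyper-parameters below are assumptions, not part of the
# original example; `memory` must expose the append()/sample() interface used above, and
# `expert_pairs` is assumed to yield batched (obs, action) arrays.
def _train_expert_actor_sketch(build_actor, memory, expert_pairs, obs_dim, nb_actions,
                               nb_train_steps=1000):
    learner = ActorLearner('expert_', build_actor(nb_actions), memory,
                           observation_shape=(obs_dim, ),
                           action_shape=(nb_actions, ))
    loss = None
    with U.single_threaded_session() as sess:
        learner.initialize(sess)
        sess.run(tf.global_variables_initializer())
        for obs, act in expert_pairs:
            learner.store_transition(obs, act)
        for _ in range(nb_train_steps):
            loss = learner.train()
    return loss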
Example #10
class MADDPG(object):
    def __init__(self,
                 name,
                 actor,
                 critic,
                 memory,
                 obs_space_n,
                 act_space_n,
                 agent_index,
                 obs_rms,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.):
        self.name = name
        self.num_agents = len(obs_space_n)
        self.agent_index = agent_index

        from gym import spaces
        continuous_ctrl = not isinstance(act_space_n[0], spaces.Discrete)
        # TODO: remove after testing
        assert continuous_ctrl

        # Multi-agent inputs
        # self.obs0 = []
        # self.obs1 = []
        self.actions = []
        # self.norm_obs0_ph = []
        # self.norm_obs1_ph = []

        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(
                                       self.num_agents,
                                       None,
                                   ) + obs_space_n[self.agent_index].shape,
                                   name="obs0")
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(
                                       self.num_agents,
                                       None,
                                   ) + obs_space_n[self.agent_index].shape,
                                   name="obs1")

        # if continuous_ctrl:
        #     self.actions = tf.placeholder(tf.float32, shape=(self.num_agents, None,) + act_space_n[self.agent_index].shape, name="action")
        # else:
        #     act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        #     self.actions = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]

        # this is required to reshape obs and actions for concatenation
        obs_shape_list = [self.num_agents] + list(
            obs_space_n[self.agent_index].shape)
        act_shape_list = [self.num_agents] + list(
            act_space_n[self.agent_index].shape)
        self.obs_shape_prod = np.prod(obs_shape_list)
        self.act_shape_prod = np.prod(act_shape_list)

        for i in range(self.num_agents):
            # each obs in obs0,obs1 contains info about ego agent and relative pos/vel of other agents
            # self.obs0.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="obs0_"+str(i)))
            # self.obs1.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="obs1_"+str(i)))

            if continuous_ctrl:
                self.actions.append(
                    tf.placeholder(tf.float32,
                                   shape=[None] + list(act_space_n[i].shape),
                                   name="action" + str(i)))
            else:
                self.actions.append(
                    make_pdtype(act_space_n[i]).sample_placeholder(
                        [None], name="action" + str(i)))

            # self.norm_obs0_ph.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="norm_obs0_"+str(i)))
            # self.norm_obs1_ph.append(tf.placeholder(tf.float32, shape=[None] + list(obs_space_n[i].shape), name="norm_obs1_"+str(i)))

        # self.norm_obs0_ph = tf.placeholder(tf.float32, shape=[self.num_agents, None] + list(obs_space_n[self.agent_index].shape), name="norm_obs0")
        # self.norm_obs1_ph = tf.placeholder(tf.float32, shape=[self.num_agents, None] + list(obs_space_n[self.agent_index].shape), name="norm_obs1")

        # we only provide single agent inputs for these placeholders
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')

        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        # TODO: need to update the replay buffer storage function to account for multiple agents
        if self.normalize_observations:
            self.obs_rms = obs_rms
        else:
            self.obs_rms = None

        # Need to transpose observations so we can normalize them
        # converts tensor to shape (batch_size, num_agents, space_size)
        # transpose dims 0 and 1, leave dim 2 unchanged
        obs0_t = tf.transpose(self.obs0, perm=[1, 0, 2])
        obs1_t = tf.transpose(self.obs1, perm=[1, 0, 2])
        actions_t = tf.transpose(self.actions, perm=[1, 0, 2])

        # each entry in obs_t is normalized wrt the agent
        normalized_obs0 = tf.clip_by_value(normalize(obs0_t, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(obs1_t, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # convert the obs to original shape after normalization for convenience
        normalized_act_obs0 = tf.transpose(normalized_obs0, perm=[1, 0, 2])
        normalized_act_obs1 = tf.transpose(normalized_obs1, perm=[1, 0, 2])

        # need to specify the exact feature size, since we don't always pass a full batch of obs/actions
        normalized_obs0_flat = tf.reshape(normalized_obs0,
                                          [-1, self.obs_shape_prod])
        normalized_obs1_flat = tf.reshape(normalized_obs1,
                                          [-1, self.obs_shape_prod])
        actions_t_flat = tf.reshape(actions_t, [-1, self.act_shape_prod])
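
        # e.g. with num_agents = 3, per-agent obs dim 18 and act dim 5 (illustrative numbers):
        #   obs0 has shape (3, B, 18) -> transpose -> (B, 3, 18) -> reshape -> (B, 54)
        #   actions is a list of 3 placeholders of shape (B, 5) -> stacked (3, B, 5) -> (B, 3, 5) -> (B, 15)
        # so the centralized critic sees the concatenation of every agent's observation and action.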

        # Return normalization.
        # TODO: update this to handle multiple agents if required
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        # Each agent gets its own observation
        self.actor_tf = actor(normalized_act_obs0[self.agent_index])
        self.target_actor_tf = target_actor(
            normalized_act_obs1[self.agent_index])

        # Critic gets all observations
        self.normalized_critic_tf = critic(normalized_obs0_flat,
                                           actions_t_flat)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)

        # need to provide critic() with all actions
        act_input_n = list(self.actions)  # copy actions
        # replace the current agent's action with its actor output
        act_input_n[self.agent_index] = self.actor_tf
        act_input_n_t = tf.transpose(act_input_n, perm=[1, 0, 2])
        act_input_n_t_flat = tf.reshape(act_input_n_t,
                                        [-1, self.act_shape_prod])
        self.normalized_critic_with_actor_tf = critic(normalized_obs0_flat,
                                                      act_input_n_t_flat,
                                                      reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)

        # we need to use actions for all agents
        target_act_input_n = list(self.actions)  # copy actions
        # replace the current agent's action with its target actor output
        target_act_input_n[self.agent_index] = self.target_actor_tf
        target_act_input_n_t = tf.transpose(target_act_input_n, perm=[1, 0, 2])
        target_act_input_n_t_flat = tf.reshape(target_act_input_n_t,
                                               [-1, self.act_shape_prod])
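        # centralized 1-step TD target: the target critic scores the joint next observation and all
        # agents' target actions, while rewards/terminals are this agent's own.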
        Q_obs1 = denormalize(
            target_critic(normalized_obs1_flat, target_act_input_n_t_flat),
            self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            # param noise is added to actor; hence obs for current agent is required
            self.setup_param_noise(normalized_act_obs0[self.agent_index])
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure a separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if var.name.endswith('/w:0') and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    # TODO: need to provide all observations to compute q
    def step(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        # feed_dict={ph: [data] for ph, data in zip(self.obs0, obs)}
        # feed_dict = {self.obs0: [obs]}

        # Get the normalized obs first
        # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict)
        # use the normalized obs for training
        # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)}

        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action[0].shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action[0], q, None, None

    # TODO: test this
    # Computing this every time step may slow things
    def get_q_value(self, obs_n, act_n):
        # assuming computing q value for one state; hence need [] around data
        feed_dict = {ph: [data] for ph, data in zip(self.obs0, obs_n)}
        act_dict = {ph: [data] for ph, data in zip(self.actions, act_n)}
        feed_dict.update(act_dict)
        q = self.sess.run(self.critic_with_actor_tf, feed_dict=feed_dict)
        return q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        # print(action)
        B = obs0.shape[0]
        a_idx = self.agent_index
        for b in range(B):
            self.memory.append(obs0[b][a_idx], action[b][a_idx],
                               reward[b][a_idx], obs1[b][a_idx],
                               terminal1[b][a_idx])

            # NOTE: calling update for each agent is fine, since the mean and std are unaffected:
            # the same observations are repeated num_agents times, which does not change their statistics
            if self.normalize_observations:
                # provide full obs for obs_rms update
                obs0_shape = (len(obs0[b]), ) + obs0[b][a_idx].shape
                assert obs0_shape == (self.num_agents, ) + obs0[b][a_idx].shape
                self.obs_rms.update(np.array([obs0[b]]))

    # TODO: not using this right now
    def update_obs_rms(self, obs0):
        if not self.normalize_observations:
            return
        B = obs0.shape[0]
        for b in range(B):
            # provide full obs for obs_rms update
            self.obs_rms.update(np.array([obs0[b]]))
        return

    def train(self, agents):
        # generate indices to access batches from all agents
        replay_sample_index = self.memory.generate_index(self.batch_size)

        # collect replay sample from all agents
        obs0_n = []
        obs1_n = []
        rewards_n = []
        act_n = []
        terminals1_n = []
        for i in range(self.num_agents):
            # Get a batch.
            batch = agents[i].memory.sample(batch_size=self.batch_size,
                                            index=replay_sample_index)
            obs0_n.append(batch['obs0'])
            obs1_n.append(batch['obs1'])
            act_n.append(batch['actions'])
            # rewards_n.append(batch['rewards'])
            # terminals1_n.append(batch['terminals1'])
        batch = self.memory.sample(batch_size=self.batch_size,
                                   index=replay_sample_index)

        # fill placeholders in obs1 with corresponding obs from each agent's replay buffer
        # self.obs1 and obs1_n are lists of size num_agents
        # feed_dict={ph: data for ph, data in zip(self.obs1, obs1_n)}
        feed_dict = {self.obs1: obs1_n}

        # TODO: find a better way to do this
        # Get the normalized obs first
        # norm_obs1 = self.sess.run(self.norm_obs1, feed_dict=feed_dict)
        # use the normalized obs for training
        # feed_dict = {self.norm_obs1_ph: norm_obs1}
        # feed_dict = {ph: data for ph, data in zip(self.norm_obs1_ph, norm_obs1)}

        # actions required for critic
        act_dict = {ph: data for ph, data in zip(self.actions, act_n)}
        feed_dict.update(act_dict)
        feed_dict.update({self.rewards: batch['rewards']})
        feed_dict.update(
            {self.terminals1: batch['terminals1'].astype('float32')})

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict=feed_dict)
            # old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })

            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict=feed_dict)
            # target_Q = self.sess.run(self.target_Q, feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]

        # generate feed_dict for multiple observations and actions
        # feed_dict={ph: data for ph, data in zip(self.obs0, obs0_n)}
        feed_dict = {self.obs0: obs0_n}

        # Get the normalized obs first
        # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict)
        # use the normalized obs for training
        # feed_dict = {self.norm_obs0_ph: norm_obs0}
        # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)}

        # act_dict={ph: data for ph, data in zip(self.actions, act_n)}
        feed_dict.update(act_dict)
        feed_dict.update({self.critic_target: target_Q})

        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops, feed_dict=feed_dict)
        # actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
        #     self.obs0: batch['obs0'],
        #     self.actions: batch['actions'],
        #     self.critic_target: target_Q,
        # })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess

    def agent_initialize(self, sess):
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)
        # setup saving and loading functions
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self, agents):
        if self.stats_sample is None:
            replay_sample_index = self.memory.generate_index(self.batch_size)
            # collect replay sample from all agents
            obs0_n, act_n = [], []
            for i in range(self.num_agents):
                batch = agents[i].memory.sample(batch_size=self.batch_size,
                                                index=replay_sample_index)
                obs0_n.append(batch['obs0'])
                act_n.append(batch['actions'])
            # generate feed_dict for multiple observations and actions
            # feed_dict={ph: data for ph, data in zip(self.obs0, obs0_n)}
            feed_dict = {self.obs0: obs0_n}

            # Get the normalized obs first
            # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict)
            # use the normalized obs for training
            # feed_dict = {self.norm_obs0_ph: norm_obs0}
            # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)}

            actions_dict = {ph: data for ph, data in zip(self.actions, act_n)}
            feed_dict.update(actions_dict)

            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = feed_dict
        values = self.sess.run(self.stats_ops, feed_dict=self.stats_sample)

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self, agents):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        replay_sample_index = self.memory.generate_index(self.batch_size)
        obs0_n = []
        for i in range(self.num_agents):
            batch = agents[i].memory.sample(batch_size=self.batch_size,
                                            index=replay_sample_index)
            obs0_n.append(batch['obs0'])
        # feed_dict={ph: data for ph, data in zip(self.obs0, obs0_n)}
        feed_dict = {self.obs0: obs0_n}

        # Get the normalized obs first
        # norm_obs0 = self.sess.run(self.norm_obs0, feed_dict=feed_dict)
        # use the normalized obs for training
        # feed_dict = {self.norm_obs0_ph: norm_obs0}
        # feed_dict = {ph: data for ph, data in zip(self.norm_obs0_ph, norm_obs0)}

        feed_dict.update(
            {self.param_noise_stddev: self.param_noise.current_stddev})

        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict=feed_dict)
        # distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
        #     self.obs0: batch['obs0'],
        #     self.param_noise_stddev: self.param_noise.current_stddev,
        # })

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(
                distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        else:
            mean_distance = distance

        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })
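
# A minimal driver sketch (an assumption, not part of the original source): each
# multi-agent wrapper above adapts its parameter-noise scale against replay batches
# drawn with an index shared across all agents, then re-perturbs its policy for the
# next episode. `agents` is assumed to be a list of these wrapper instances.
def end_of_episode_update(agents):
    distances = [agent.adapt_param_noise(agents) for agent in agents]
    for agent in agents:
        agent.reset()
    return distances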
Example #11
class DDPG(object):
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-1., 1.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True,
                 adaptive_param_noise_policy_threshold=.1,
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                      self.actor_tf,
                                                      reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(
            target_critic(normalized_obs1, target_actor(normalized_obs1)),
            self.ret_rms)
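        # Bellman target: y = r + gamma * (1 - terminal) * Q_target(s1, mu_target(s1))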
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()
        self.saver = self.get_saver()

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def get_saver(self):
        exclude = [
            'Variable/ExponentialMovingAverage:0',
            'Variable/Adam:0',
            'Variable/Adam_1:0',
            'Variable_8/ExponentialMovingAverage:0',
            'Variable_8/Adam:0',
            'Variable_8/Adam_1:0',
        ]
        nodes = tf.trainable_variables()
        mapping = {
            var.name.split(':')[0]: var
            for var in nodes if var.name not in exclude
        }

        return tf.train.Saver(mapping)

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def save_model(self, path, episode):
        filename = os.path.join(path, "model.ckpt")
        self.saver.save(self.sess, filename, episode)
        print("Saved model to ", filename)

    def restore_model(self, path):
        try:
            checkpoint = tf.train.latest_checkpoint(path)
            self.saver.restore(self.sess, checkpoint)
            print("Restored model from ", checkpoint)
        except Exception as e:
            print(e)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
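        # Rescaling the output kernel M by old_std / new_std and re-shifting the bias b
        # as (old_std * b + old_mean - new_mean) / new_std keeps the denormalized critic
        # predictions unchanged when the return statistics are updated.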
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def q(self, obs):
        """Compute the q value for some observation"""
        return self.sess.run(self.critic_with_actor_tf,
                             feed_dict={self.obs0: obs})

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q,
                                     feed_dict={
                                         self.obs1:
                                         batch['obs1'],
                                         self.rewards:
                                         batch['rewards'],
                                         self.terminals1:
                                         batch['terminals1'].astype('float32'),
                                     })

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0: self.stats_sample['obs0'],
                                   self.actions: self.stats_sample['actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })

        mean_distance = mpi_mean(distance)
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })
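
# A minimal single-environment training-loop sketch (an assumption, not part of the
# original source). It only exercises the public methods of the DDPG class above:
# initialize(), reset(), pi(), store_transition(), train() and update_target_net().
# `env` is assumed to be a Gym-style environment with step()/reset().
def run_ddpg(env, agent, nb_epochs=10, nb_rollout_steps=100, nb_train_steps=50):
    with tf.Session() as sess:
        agent.initialize(sess)
        agent.reset()
        obs = env.reset()
        for _ in range(nb_epochs):
            # Collect experience with the (possibly noise-perturbed) policy.
            for _ in range(nb_rollout_steps):
                action, q = agent.pi(obs, apply_noise=True, compute_Q=False)
                new_obs, reward, done, _ = env.step(action)
                agent.store_transition(obs, action, reward, new_obs, done)
                obs = new_obs
                if done:
                    agent.reset()
                    obs = env.reset()
            # Fit the critic/actor on replayed batches and track the target networks.
            for _ in range(nb_train_steps):
                critic_loss, actor_loss = agent.train()
                agent.update_target_net()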
Example #12
class DDPG(object):
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None,
                 action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False,
                 normalize_observations=True, batch_size=128, observation_range=(-5., 5.),
                 action_range=(-1., 1.), return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1,
                 critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):
        self.obs0 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name="obs0")
        self.obs1 = tf.placeholder(tf.float32, shape=(None, ) + observation_shape, name="obs1")
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name="terminals1")
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name="rewards")
        self.actions = tf.placeholder(tf.float32, shape=(None, ) + action_shape, name="actions")
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name="critic_target")
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name="param_noise_stddev")

        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        if self.normalize_observations:
            with tf.variable_scope("obs_rms"):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])

        if self.normalize_returns:
            with tf.variable_scope("ret_rms"):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        target_actor = copy(actor)
        target_actor.name = "target_actor"
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = "target_critic"
        self.target_critic = target_critic

        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf,
                                                      self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf,
                                                                 self.return_range[0], self.return_range[1]),
                                                self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1


        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()


    def setup_stats(self):
        ops = []
        names = []
        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ["ret_rms_mean", "ret_rms_std"]
        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ["obs_rms_mean", "obs_rms_std"]

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ["reference_Q_mean"]
        ops += [reduce_std(self.critic_tf)]
        names += ["reference_Q_std"]

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ["reference_actor_Q_mean"]
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ["reference_actor_Q_std"]

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ["reference_action_mean"]
        ops += [reduce_std(self.actor_tf)]
        names += ["reference_action_std"]

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ["reference_perturbed_action_mean"]
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ["reference_perturbed_action_std"]

        self.stats_ops = ops
        self.stats_names = names

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]


    def setup_popart(self):
        self.old_std = tf.placeholder(tf.float32, shape=[1], name="old_std")
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name="old_mean")
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert "kernel" in M.name
            assert "bias" in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean)/ new_std)]


    def setup_critic_optimizer(self):
        logger.info("setting up critic optimizer")
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                       self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0:
            critic_reg_vars = [var for var in self.critic.trainable_vars if "kernel" in var.name and "output" not in var.name]
            for var in critic_reg_vars:
                logger.info(" regularizing: {}".format(var.name))
            logger.info(" applying l2 regularization with {}".format(self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg

        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info(" critic shapes: {}".format(critic_shapes))
        logger.info(" critic params: {}".format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_actor_optimizer(self):
        logger.info("setting up actor optimizer")
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info(" actor shapes: {}".format(actor_shapes))
        logger.info(" actor params: {}".format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        param_noise_actor = copy(self.actor)
        param_noise_actor.name = "param_noise_actor"
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info("setting up param noise")
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = "adaptive_param_noise_actor"
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor,
                                                                       adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))



    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def reset(self):
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev
            })

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={self.param_noise_stddev: self.param_noise.current_stddev})
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0 : batch["obs0"],
            self.param_noise_stddev: self.param_noise.current_stddev
        })
        mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def train(self):
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std,
                                                         self.target_Q], feed_dict = {
                self.obs1: batch["obs1"],
                self.rewards: batch["rewards"],
                self.terminals1: batch["terminals1"].astype("float32")
            })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std: np.array([old_std]),
                self.old_mean: np.array([old_mean]),
            })

        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch["obs1"],
                self.rewards: batch["rewards"],
                self.terminals1: batch["terminals1"].astype("float32")
            })

        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch["obs0"],
            self.actions: batch["actions"],
            self.critic_target: target_Q
        })

        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample["obs0"],
            self.actions: self.stats_sample["actions"],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats
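
# A sketch (assumption) of the `get_target_updates` helper referenced by the classes
# in this document: it builds one op that hard-copies the online variables into the
# target network and one op that applies the Polyak soft update
# theta_target <- tau * theta + (1 - tau) * theta_target.
def get_target_updates(vars, target_vars, tau):
    init_updates = []
    soft_updates = []
    assert len(vars) == len(target_vars)
    for var, target_var in zip(vars, target_vars):
        init_updates.append(tf.assign(target_var, var))
        soft_updates.append(tf.assign(target_var,
                                      (1. - tau) * target_var + tau * var))
    return tf.group(*init_updates), tf.group(*soft_updates)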
Example #13
class DDPG(object):
    def __init__(self, **params):
        for k in params:
            setattr(self, k, params[k])
        self.init_args = copy(params)

        if self.her:
            # self.obs_to_goal = None
            # self.goal_idx = None
            # self.reward_fn = None
            self.memory = HERBuffer(limit=int(self.buffer_size),
                                    action_shape=self.action_shape,
                                    observation_shape=self.observation_shape,
                                    obs_to_goal=self.obs_to_goal,
                                    goal_slice=self.goal_idx,
                                    reward_fn=self.reward_fn)
        else:
            self.memory = Memory(limit=int(self.buffer_size),
                                 action_shape=self.action_shape,
                                 observation_shape=self.observation_shape)

        self.critic = Critic(layer_norm=self.layer_norm)
        self.actor = Actor(self.action_shape[-1], layer_norm=self.layer_norm)

        self.action_noise = NormalActionNoise(mu=np.zeros(self.action_shape),
                                              sigma=float(self.noise_sigma) *
                                              np.ones(self.action_shape))
        self.param_noise = None

        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + self.observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + self.observation_shape,
                                   name='obs1')
        # self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + self.action_shape,
                                      name='actions')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=self.observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(self.actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(self.critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = self.actor(normalized_obs0)
        self.normalized_critic_tf = self.critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = self.critic(normalized_obs0,
                                                           self.actor_tf,
                                                           reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(
            target_critic(normalized_obs1, target_actor(normalized_obs1)),
            self.ret_rms)
        # self.target_Q = self.rewards + (1. - self.terminals1) * self.gamma * Q_obs1
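        # The terminal mask from the commented line above is not applied here, so the
        # bootstrap term is always included in the target.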
        self.target_Q = self.rewards + self.gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    # self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(
                self.target_Q,
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    # self.terminals1: batch['terminals1'].astype('float32'),
                })

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0: self.stats_sample['obs0'],
                                   self.actions: self.stats_sample['actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })

        mean_distance = mpi_mean(distance)
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })
        self.flush()

    def flush(self):
        if self.her:
            self.memory.flush()

    def get_save_tf(self):
        all_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        return self.sess.run(all_variables)

    def restore_tf(self, save):
        all_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        restore_ops = []
        for x, y in zip(all_variables, save):
            restore_ops.append(tf.assign(x, y))
        self.sess.run(restore_ops)

    def __getstate__(self):
        exclude_vars = set(["env"])
        args = {}
        for k in self.init_args:
            if k not in exclude_vars:
                args[k] = self.init_args[k]
        return {'tf': self.get_save_tf(), 'init': args}

    def __setstate__(self, state):
        self.__init__(**state['init'])

        self.sess = tf.InteractiveSession()  # for now, just make ourselves a session
        self.sess.run(tf.global_variables_initializer())
        self.restore_tf(state['tf'])
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
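
# A hypothetical round-trip sketch for the __getstate__/__setstate__ hooks above:
# pickling stores the constructor arguments plus the trainable-variable values
# (assuming the agent has already been initialized with a session), and unpickling
# rebuilds the graph in a fresh session and restores those values.
import pickle

def save_agent(agent, path):
    with open(path, 'wb') as f:
        pickle.dump(agent, f)

def load_agent(path):
    with open(path, 'rb') as f:
        return pickle.load(f)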
Example #14
class DDPG(object):
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 dropout_on_v,
                 dropout_tau_V,
                 override_reg,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=64,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True,
                 adaptive_param_noise_policy_threshold=.1,
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        ###  MAIN CHANGES
        self.override_reg = override_reg
        self.dropout_on_v = dropout_on_v
        self.dropout_tau_V = dropout_tau_V
        self.observation_shape = observation_shape
        self.b = tf.placeholder(tf.float32)
        ### END

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)

        ### MAIN CHANGES
        ## Q(s,a)
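        # When dropout on V is enabled, the critic is assumed to return a tuple of
        # heads; judging by the names used below, the extra outputs are a
        # dropout-averaged estimate and a Monte-Carlo-sample estimate.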
        if self.dropout_on_v is not None:
            self.normalized_critic_tf, _, self.normalized_critic_tf_mc = critic(
                normalized_obs0, self.actions)

        else:
            self.normalized_critic_tf = critic(normalized_obs0, self.actions)

        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)

        ## Q(s, mu(s))

        if self.dropout_on_v is not None:
            self.normalized_critic_with_actor_tf, self.normalized_critic_with_actor_tf_avg, _ = critic(
                normalized_obs0, self.actor_tf, reuse=True)
            self.critic_with_actor_tf = denormalize(
                tf.clip_by_value(self.normalized_critic_with_actor_tf_avg,
                                 self.return_range[0], self.return_range[1]),
                self.ret_rms)
        else:
            self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                          self.actor_tf,
                                                          reuse=True)
            self.critic_with_actor_tf = denormalize(
                tf.clip_by_value(self.normalized_critic_with_actor_tf,
                                 self.return_range[0], self.return_range[1]),
                self.ret_rms)

        self.Q = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)
        if dropout_on_v is not None:
            # The dropout target critic returns a tuple; take its first output
            # as the (normalized) target Q before denormalizing.
            normalized_Q_obs1, _, _ = target_critic(
                normalized_obs1, target_actor(normalized_obs1))
            Q_obs1 = denormalize(normalized_Q_obs1, self.ret_rms)
        else:
            Q_obs1 = denormalize(
                target_critic(normalized_obs1, target_actor(normalized_obs1)),
                self.ret_rms)

        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1
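        # i.e. the usual DDPG TD target:
        #     target_Q(s, a) = r + (1 - done) * gamma * Q'(s', mu'(s'))
        # with Q' and mu' the target critic and target actor.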
        ### END OF CHANGES

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()
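        # `replace_masks` and `update_dropout_masks` are assumed to come from the
        # surrounding module (they are not defined in this snippet); they resample
        # the dropout masks used by the MC-dropout critic.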
        if replace_masks:
            update_dropout_masks([
                x for x in self.critic.vars
                if 'dropout' in x.name and 'mask' in x.name
            ],
                                 self.critic.keep_prob,
                                 execute=False)
            update_dropout_masks([
                x for x in self.target_critic.vars
                if 'dropout' in x.name and 'mask' in x.name
            ],
                                 self.critic.keep_prob,
                                 execute=False)

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.trainable_vars, self.target_critic.trainable_vars,
            self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure a separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')

        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])

        ### MAIN CHANGES

        ### Eq. (10) of the black-box alpha-divergence dropout objective
        if self.dropout_on_v is not None:
            self.alpha = 0.5
            x = normalized_critic_target_tf
            self.flat = self.normalized_critic_tf_mc
            flat_stacked = tf.stack(self.flat)  # K x M x outsize
            # M x B X outsize
            sumsq = U.sum(tf.square(x - flat_stacked), -1)
            sumsq *= (-.5 * self.alpha * self.dropout_tau_V)
            self.critic_loss = (-1.0 * self.alpha**-1.0) * logsumexp(sumsq, 0)
            self.l2_value = self.critic.keep_prob * float(
                self.batch_size) / (float(self.memory.nb_entries) + 1)
            self.critic_l2_reg = tf.Variable(self.l2_value, trainable=False)

        else:
            self.critic_loss = tf.reduce_mean(
                tf.square(self.normalized_critic_tf -
                          normalized_critic_target_tf))
            if self.override_reg is not None:
                self.critic_l2_reg = self.override_reg

        ### END OF CHANGES
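
        # For reference, the dropout branch above implements (up to constants)
        #     L = -(1/alpha) * logsumexp_k( -0.5 * alpha * tau * ||y - f_k(x)||^2 )
        # where the f_k are the K stochastic MC-dropout critic passes stacked in
        # `flat_stacked`, y is the normalized critic target and tau is
        # `dropout_tau_V`.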

        if self.override_reg is not None:

            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]
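        # The two assigns above keep the denormalized critic output unchanged:
        #     new_std * (W_new * h + b_new) + new_mean
        #         == old_std * (W_old * h + b_old) + old_mean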

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            ### MAIN CHANGES
            action, q = self.sess.run([actor_tf, self.Q], feed_dict=feed_dict)
            # action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
            ### END OF CHANGES
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q,
                                     feed_dict={
                                         self.obs1:
                                         batch['obs1'],
                                         self.rewards:
                                         batch['rewards'],
                                         self.terminals1:
                                         batch['terminals1'].astype('float32'),
                                     })

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)
        if replace_masks:
            update_dropout_masks([
                x for x in self.critic.vars
                if 'dropout' in x.name and 'mask' in x.name
            ], self.critic.keep_prob)
            update_dropout_masks([
                x for x in self.target_critic.vars
                if 'dropout' in x.name and 'mask' in x.name
            ], self.critic.keep_prob)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0: self.stats_sample['obs0'],
                                   self.actions: self.stats_sample['actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })

        mean_distance = mpi_mean(distance)
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })
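
# A minimal driver sketch for the agent above, assuming a Gym-style `env`, an
# `agent` built with the constructor above (the actor, critic and memory objects
# come from the surrounding project), and that the module-level helpers the class
# references (e.g. `replace_masks`) are available.
import tensorflow as tf

def run_agent(agent, env, nb_epochs=100, nb_rollout_steps=100, nb_train_steps=50):
    with tf.Session() as sess:
        agent.initialize(sess)
        agent.reset()
        obs = env.reset()
        for _ in range(nb_epochs):
            # Collect experience with exploration noise.
            for _ in range(nb_rollout_steps):
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                new_obs, reward, done, _ = env.step(action)
                agent.store_transition(obs, action, reward, new_obs, done)
                obs = new_obs
                if done:
                    agent.reset()
                    obs = env.reset()
            # Update the critic/actor and the target networks.
            for _ in range(nb_train_steps):
                critic_loss, actor_loss = agent.train()
                agent.update_target_net()
            agent.adapt_param_noise()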
示例#15
0
class TransitionClassifier(object):
    def __init__(self,
                 ob_size,
                 ac_size,
                 hidden_size=100,
                 log_reward=False,
                 entcoeff=0.001,
                 scope="adversary",
                 dyn_norm=True):
        self.scope = scope
        self.ob_size = ob_size
        self.ac_size = ac_size
        # self.input_size = ob_size + ac_size
        self.hidden_size = hidden_size
        self.log_reward = log_reward
        self.dyn_norm = dyn_norm
        self.build_ph()
        # Build graph
        generator_logits = self.build_graph(self.generator_obs_ph,
                                            self.generator_acs_ph)
        expert_logits = self.build_graph(self.expert_obs_ph,
                                         self.expert_acs_ph)
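        # Note: build_graph uses a single variable scope with tf.AUTO_REUSE, so
        # the generator and expert logits are produced by one shared discriminator.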
        # Build accuracy
        generator_acc = tf.reduce_mean(
            tf.cast(tf.nn.sigmoid(generator_logits) < 0.5, tf.float32))
        expert_acc = tf.reduce_mean(
            tf.cast(tf.nn.sigmoid(expert_logits) > 0.5, tf.float32))

        # Build the discriminator (sigmoid cross-entropy) loss
        # let x = logits, z = targets.
        # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
        generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=generator_logits, labels=tf.zeros_like(generator_logits))
        generator_loss = tf.reduce_mean(generator_loss)
        expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=expert_logits, labels=tf.ones_like(expert_logits))
        expert_loss = tf.reduce_mean(expert_loss)
        # Build entropy loss
        logits = tf.concat([generator_logits, expert_logits], 0)
        entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
        entropy_loss = -entcoeff * entropy
        # Loss + Accuracy terms
        self.losses = [
            generator_loss, expert_loss, entropy, entropy_loss, generator_acc,
            expert_acc
        ]
        self.loss_name = [
            "generator_loss", "expert_loss", "entropy", "entropy_loss",
            "generator_acc", "expert_acc"
        ]
        self.total_loss = generator_loss + expert_loss + entropy_loss
        # Build Reward for policy
        if log_reward:
            reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8)
        else:
            reward_op = tf.nn.sigmoid(generator_logits)
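        # With log_reward=True the policy reward is -log(1 - D(s, a)), the usual
        # GAIL-style surrogate; otherwise the raw discriminator output D(s, a)
        # in (0, 1) is used directly.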

        self.reward = U.function(
            [self.generator_obs_ph, self.generator_acs_ph], reward_op)

        lr = tf.placeholder(tf.float32, None)
        self.trainer = tf.train.AdamOptimizer(learning_rate=lr)
        gvs = self.trainer.compute_gradients(self.total_loss,
                                             self.get_trainable_variables())
        self._train = U.function([
            self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph,
            self.expert_acs_ph, lr
        ],
                                 self.losses,
                                 updates=[self.trainer.apply_gradients(gvs)])

    def build_ph(self):
        self.generator_obs_ph = tf.placeholder(tf.float32,
                                               (None, self.ob_size),
                                               name="observations_ph")
        self.generator_acs_ph = tf.placeholder(tf.float32,
                                               (None, self.ac_size),
                                               name="actions_ph")
        self.expert_obs_ph = tf.placeholder(tf.float32, (None, self.ob_size),
                                            name="expert_observations_ph")
        self.expert_acs_ph = tf.placeholder(tf.float32, (None, self.ac_size),
                                            name="expert_actions_ph")

    def build_graph(self, obs_ph, acs_ph):
        with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
            with tf.variable_scope("obfilter"):
                self.obs_rms = RunningMeanStd(shape=[self.ob_size])
            obs = normalize(obs_ph, self.obs_rms)
            _input = tf.concat(
                [obs, acs_ph],
                axis=1)  # concatenate observation and action to form a transition
            p_h1 = tf.contrib.layers.fully_connected(_input,
                                                     self.hidden_size,
                                                     activation_fn=tf.nn.tanh)
            p_h2 = tf.contrib.layers.fully_connected(p_h1,
                                                     self.hidden_size,
                                                     activation_fn=tf.nn.tanh)
            logits = tf.contrib.layers.fully_connected(p_h2,
                                                       1,
                                                       activation_fn=None)
        return logits

    def get_trainable_variables(self):
        return tf.trainable_variables(self.scope)

    def get_reward(self, obs, acs):
        return np.squeeze(self.reward(obs, acs))

    def build_reward_op(self, obs_ph, acs_ph):
        logits = self.build_graph(obs_ph, acs_ph)
        if self.log_reward:
            return -tf.log(1 - tf.nn.sigmoid(logits) + 1e-8)
        return tf.nn.sigmoid(logits)

    def set_expert_data(self, data):
        self.data = Dataset(data, deterministic=False)

    def train(self, rl_ob, rl_ac, steps=1, lr=3e-4):
        n = rl_ob.shape[0]
        loss_buf = []
        batch_size = rl_ob.shape[0] // steps
        for batch in iterbatches([rl_ob, rl_ac],
                                 include_final_partial_batch=False,
                                 batch_size=batch_size):
            exp_ob, exp_ac = self.data.next_batch(batch_size)
            if self.obs_rms and self.dyn_norm:
                self.obs_rms.update(np.concatenate([exp_ob, rl_ob], axis=0))
            loss_buf.append(self._train(*batch, exp_ob, exp_ac, lr))
        logger.info(fmt_row(13, self.loss_name))
        logger.info(fmt_row(13, np.mean(loss_buf, axis=0)))
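
# A minimal usage sketch for the classifier above, assuming `expert_data` is in
# whatever format the project's Dataset helper expects and that `policy_obs` /
# `policy_acs` are arrays of shape (N, ob_size) / (N, ac_size) collected from
# the current policy.
import tensorflow as tf

def example_discriminator_update(expert_data, policy_obs, policy_acs):
    with tf.Session() as sess:
        classifier = TransitionClassifier(ob_size=policy_obs.shape[1],
                                          ac_size=policy_acs.shape[1],
                                          log_reward=True)
        sess.run(tf.global_variables_initializer())
        classifier.set_expert_data(expert_data)
        classifier.train(policy_obs, policy_acs, steps=3, lr=3e-4)
        # Surrogate rewards that a policy-gradient learner would maximize.
        return classifier.get_reward(policy_obs, policy_acs)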
示例#16
0
class DDPG(object):
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.):
        """
        Deep Deterministic Policy Gradient (DDPG) model

        DDPG: https://arxiv.org/pdf/1509.02971.pdf

        :param actor: (TensorFlow Tensor) the actor model
        :param critic: (TensorFlow Tensor) the critic model
        :param memory: (Memory) the replay buffer
        :param observation_shape: (tuple) the shape of the observation space
        :param action_shape: (tuple) the shape of the action space
        :param param_noise: (AdaptiveParamNoiseSpec) the parameter noise type (can be None)
        :param action_noise: (ActionNoise) the action noise type (can be None)
        :param gamma: (float) the discount rate
        :param tau: (float) the soft update coefficient (keep old values, between 0 and 1)
        :param normalize_returns: (bool) should the critic output be normalized
        :param enable_popart: (bool) enable pop-art normalization of the critic output
            (https://arxiv.org/pdf/1602.07714.pdf)
        :param normalize_observations: (bool) should the observation be normalized
        :param batch_size: (int) the size of the batch for learning the policy
        :param observation_range: (tuple) the bounding values for the observation
        :param action_range: (tuple) the bounding values for the actions
        :param return_range: (tuple) the bounding values for the critic output
        :param critic_l2_reg: (float) l2 regularizer coefficient
        :param actor_lr: (float) the actor learning rate
        :param critic_lr: (float) the critic learning rate
        :param clip_norm: (float) clip the gradients (disabled if None)
        :param reward_scale: (float) the value the reward should be scaled by
        """
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.target_init_updates = None
        self.target_soft_updates = None
        self.critic_loss = None
        self.critic_grads = None
        self.critic_optimizer = None
        self.sess = None

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                      self.actor_tf,
                                                      reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)
        q_obs1 = denormalize(
            target_critic(normalized_obs1, target_actor(normalized_obs1)),
            self.ret_rms)
        self.target_q = self.rewards + (1. - self.terminals1) * gamma * q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

    def setup_target_network_updates(self):
        """
        set the target update operations
        """
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        """
        set the parameter noise operations
        :param normalized_obs0: (TensorFlow Tensor) the normalized observation
        """
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure a separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        """
        setup the optimizer for the actor
        """
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = tf_util.flatgrad(self.actor_loss,
                                            self.actor.trainable_vars,
                                            clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):
        """
        setup the optimizer for the critic
        """
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = tf_util.flatgrad(self.critic_loss,
                                             self.critic.trainable_vars,
                                             clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        """
        setup pop-art normalization of the critic output

        See https://arxiv.org/pdf/1602.07714.pdf for details
        ("Pop-Art": Preserving Outputs Precisely, while Adaptively Rescaling Targets).
        """
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_q_outputs_op = []
        for out_vars in [
                self.critic.output_vars, self.target_critic.output_vars
        ]:
            assert len(out_vars) == 2
            # weight and bias of the last layer
            weight, bias = out_vars
            assert 'kernel' in weight.name
            assert 'bias' in bias.name
            assert weight.get_shape()[-1] == 1
            assert bias.get_shape()[-1] == 1
            self.renormalize_q_outputs_op += [
                weight.assign(weight * self.old_std / new_std)
            ]
            self.renormalize_q_outputs_op += [
                bias.assign(
                    (bias * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        """
        setup the running means and std of the inputs and outputs of the model
        """
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def policy(self, obs, apply_noise=True, compute_q=True):
        """
        Get the action and critic output from a given observation

        :param obs: ([float] or [int]) the observation
        :param apply_noise: (bool) enable the noise
        :param compute_q: (bool) compute the critic output
        :return: ([float], float) the action and critic value
        """
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_q:
            action, q_value = self.sess.run(
                [actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q_value = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q_value

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        """
        Store a transition in the replay buffer

        :param obs0: ([float] or [int]) the last observation
        :param action: ([float]) the action
        :param reward: (float) the reward
        :param obs1: ([float] or [int]) the current observation
        :param terminal1: (bool) is the episode done
        """
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        """
        run a step of training from a sampled batch

        :return: (float, float) critic loss, actor loss
        """
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_q.flatten())
            self.sess.run(self.renormalize_q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

        else:
            target_q = self.sess.run(self.target_q,
                                     feed_dict={
                                         self.obs1:
                                         batch['obs1'],
                                         self.rewards:
                                         batch['rewards'],
                                         self.terminals1:
                                         batch['terminals1'].astype('float32'),
                                     })

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_q,
            })
        self.actor_optimizer.update(actor_grads, learning_rate=self.actor_lr)
        self.critic_optimizer.update(critic_grads,
                                     learning_rate=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        """
        initialize the model parameters and optimizers

        :param sess: (TensorFlow Session) the current TensorFlow session
        """
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        """
        run target soft update operation
        """
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        """
        Get the mean and standard deviation of the model's inputs and outputs

        :return: (dict) the means and stds
        """
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0: self.stats_sample['obs0'],
                                   self.actions: self.stats_sample['actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        """
        calculate the adaptation for the parameter noise

        :return: (float) the mean distance for the parameter noise
        """
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })

        mean_distance = MPI.COMM_WORLD.allreduce(
            distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        """
        Reset internal state after an episode is complete.
        """
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })
示例#17
0
class DDPG(object):
     def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
                 batch_size=128, observation_range=(-1., 1.), action_range=[0.2, 0.2, 0.2, 0.2, 0.2, 0.2], return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True, critic_l2_reg=0.,
                 adaptive_param_noise_policy_threshold=.1, actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., restore=False):

        # Inputs.
        # self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs0 = tf.placeholder(tf.float32, shape=(None, observation_shape), name='obs0')
        # self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.obs1 = tf.placeholder(tf.float32, shape=(None, observation_shape), name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None, action_shape), name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        """Filewriter summary"""
        monitor_directory = os.path.join("Experiment_data")
        self.summary_dir = os.path.join(monitor_directory, "summary")
        # if restore:
        #     dirname = 'run20' # The last name
        #     self.summary_dir = os.path.join(self.summary_dir, dirname)
        # else:
        self.summary_dir = utils.new_summary_dir(self.summary_dir)

        # record the detailed parameters
        utils.log_params(self.summary_dir, {
            "actor learning rate": self.actor_lr,
            "critic learning rate": self.critic_lr,
            "batch size": self.batch_size,
            "actor update rate": self.tau,
            "critic update rate": self.tau,
            "action noise": self.action_noise,
            "param noise": self.param_noise,
            "reward function": 'General reward function',
            "result_function": 'The second 100'
        })

        self.merged = tf.summary.merge_all()

     def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

     def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None  # make sure the assumption holds

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure a separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

     def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        mean_distance = mpi_mean(distance)
        self.param_noise.adapt(mean_distance)
        return mean_distance

     def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)

     def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)

     def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean
        
        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

     def setup_stats(self):
        ops = []
        names = []
        
        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']
        
        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']
        
        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']
        
        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

     def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.multiply(action, self.action_range)
        # action = np.clip(action, self.action_range[0], self.action_range[1])
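        # Unlike the variants above, this version rescales each action dimension
        # by the corresponding entry of `action_range` (a per-dimension scale
        # list) instead of clipping to a (low, high) pair.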
        return action, q

     def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

     def save_data(self):
        self.memory.save_data()

     def train(self, dec_actor_lr, dec_critic_lr):
        # change the learning rate
        self.actor_lr = dec_actor_lr
        self.critic_lr = dec_critic_lr
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std : np.array([old_std]),
                self.old_mean : np.array([old_mean]),
            })
            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)
        # write the graph to the summary writer for TensorBoard
        self.summary_writer = tf.summary.FileWriter(self.summary_dir, self.sess.graph)

    def restore_model(self, model_directory, saver, sess):
        ckpt = tf.train.get_checkpoint_state(model_directory)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            self.sess = sess
            logger.info('Loaded the saved model from the directory.')
            self.summary_writer = tf.summary.FileWriter(self.summary_dir)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def feedback_adptive_explore(self):
        self.param_noise.adapt_variance()

    def ou_adaptive_explore(self):
        self.action_noise.adapt_decrease()

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })

    def log_scalar(self, name, value, index):
        summary_value = summary_pb2.Summary.Value(tag=name, simple_value=value)
        summary_2 = summary_pb2.Summary(value=[summary_value])
        self.summary_writer.add_summary(summary_2, global_step=index)
Example #18
class Model(object):
    def __init__(self, sess, policy, dynamics, ob_space, ac_space, nenvs,
                 nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr,
                 rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c,
                 trust_region, alpha, delta, scope, goal_shape, residual):
        self.sess = sess
        self.nenv = nenvs
        self.residual = residual
        self.goal_shape = goal_shape
        self.goal_as_image = goal_as_image = len(goal_shape) == 3
        if self.goal_as_image:
            assert self.goal_shape == ob_space.shape
        else:
            logger.info("normalize goal using RunningMeanStd")
            with tf.variable_scope("RunningMeanStd", reuse=tf.AUTO_REUSE):
                self.goal_rms = RunningMeanStd(epsilon=1e-4,
                                               shape=self.goal_shape)

        nact = ac_space.n
        nbatch = nenvs * nsteps
        eps = 1e-6

        self.dynamics = dynamics

        self.scope = scope
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            self.A = tf.placeholder(tf.int32, [nbatch],
                                    name="action")  # actions
            self.D = tf.placeholder(tf.float32, [nbatch],
                                    name="dones")  # dones
            self.R = tf.placeholder(tf.float32, [nbatch],
                                    name="rewards")  # rewards, not returns
            self.MU = tf.placeholder(tf.float32, [nbatch, nact],
                                     name="mus")  # mu's
            self.LR = tf.placeholder(tf.float32, [], name="lr")
            self.V_NEXT = tf.placeholder(tf.float32, [nbatch], name="v_next")

            step_ob_placeholder = tf.placeholder(ob_space.dtype,
                                                 (nenvs, ) + ob_space.shape,
                                                 "step_ob")
            if self.dynamics.dummy:
                step_goal_placeholder, concat_on_latent, step_goal_encoded = None, None, None
            else:
                if goal_as_image:
                    step_goal_placeholder = tf.placeholder(
                        ob_space.dtype, (nenvs, ) + ob_space.shape,
                        "step_goal")
                    concat_on_latent, train_goal_encoded, step_goal_encoded = False, None, None
                else:
                    step_goal_placeholder = tf.placeholder(
                        tf.float32, (nenvs, ) + goal_shape, "step_goal")
                    step_goal_encoded = tf.clip_by_value(
                        (step_goal_placeholder - self.goal_rms.mean) /
                        self.goal_rms.std, -5., 5.)

            train_ob_placeholder = tf.placeholder(
                ob_space.dtype, (nenvs * nsteps, ) + ob_space.shape,
                "train_ob")
            if self.dynamics.dummy:
                train_goal_placeholder, concat_on_latent, train_goal_encoded = None, None, None
            else:
                if goal_as_image:
                    train_goal_placeholder = tf.placeholder(
                        ob_space.dtype, (nenvs * nsteps, ) + ob_space.shape,
                        "train_goal")
                    concat_on_latent, train_goal_encoded = False, None
                else:
                    train_goal_placeholder = tf.placeholder(
                        tf.float32, (nenvs * nsteps, ) + goal_shape,
                        "train_goal")
                    concat_on_latent = True
                    train_goal_encoded = tf.clip_by_value(
                        (train_goal_placeholder - self.goal_rms.mean) /
                        self.goal_rms.std, -5., 5.)
            self.step_model = policy(nbatch=nenvs,
                                     nsteps=1,
                                     observ_placeholder=step_ob_placeholder,
                                     sess=self.sess,
                                     goal_placeholder=step_goal_placeholder,
                                     concat_on_latent=concat_on_latent,
                                     goal_encoded=step_goal_encoded)
            self.train_model = policy(nbatch=nbatch,
                                      nsteps=nsteps,
                                      observ_placeholder=train_ob_placeholder,
                                      sess=self.sess,
                                      goal_placeholder=train_goal_placeholder,
                                      concat_on_latent=concat_on_latent,
                                      goal_encoded=train_goal_encoded)

        variables = find_trainable_variables
        self.params = params = variables(scope)
        logger.info(
            "========================== {} =============================".
            format(scope))
        for var in params:
            logger.info(var)
        logger.info(
            "========================== {} =============================\n".
            format(scope))

        logger.info(
            "======================={}: Aux & Dyna =========================".
            format(scope))
        for var in self.dynamics.params:
            logger.info(var)
        logger.info(
            "======================={}: Aux & Dyna =========================\n"
            .format(scope))

        # create polyak averaged model
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        # print("========================== Ema =============================")

        def custom_getter(getter, *args, **kwargs):
            v = ema.average(getter(*args, **kwargs))
            # print(v.name)
            return v

        # print("========================== Ema =============================")

        with tf.variable_scope(scope, custom_getter=custom_getter, reuse=True):
            self.polyak_model = policy(nbatch=nbatch,
                                       nsteps=nsteps,
                                       observ_placeholder=train_ob_placeholder,
                                       goal_placeholder=train_goal_placeholder,
                                       sess=self.sess,
                                       concat_on_latent=concat_on_latent,
                                       goal_encoded=train_goal_encoded)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i

        # action probability distributions according to self.train_model, self.polyak_model and self.step_model
        # policy.pi holds the distribution parameters (logits); to obtain a distribution that sums to 1, take the softmax
        train_model_p = tf.nn.softmax(self.train_model.pi)
        polyak_model_p = tf.nn.softmax(self.polyak_model.pi)
        self.step_model_p = tf.nn.softmax(self.step_model.pi)
        v = self.v = tf.reduce_sum(train_model_p * self.train_model.q,
                                   axis=-1)  # shape is [nenvs * (nsteps)]

        # strip off last step
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps),
                          [train_model_p, polyak_model_p, self.train_model.q])
        # Get pi and q values for actions taken
        f_i = get_by_index(f, self.A)
        q_i = get_by_index(q, self.A)

        # Compute ratios for importance truncation
        rho = f / (self.MU + eps)
        rho_i = get_by_index(rho, self.A)

        # Calculate Q_retrace targets
        qret = q_retrace(self.R, self.D, q_i, self.V_NEXT, rho_i, nenvs,
                         nsteps, gamma)
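        # Assuming q_retrace follows the standard ACER recursion, qret is the
        # Retrace target, computed backwards through the segment with truncated
        # importance ratios rho_bar = min(1, rho_i) to correct for off-policy data,
        # and bootstrapped from V_NEXT.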

        # Calculate losses
        # Entropy
        # entropy = tf.reduce_mean(strip(self.train_model.pd.entropy(), nenvs, nsteps))
        entropy = tf.reduce_mean(cat_entropy_softmax(f))

        # Policy gradient loss, with truncated importance sampling & bias correction
        v = strip(v, nenvs, nsteps, True)
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 3)

        # Truncated importance sampling
        adv = qret - v
        logf = tf.log(f_i + eps)
        gain_f = logf * tf.stop_gradient(
            adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(gain_f)

        # Bias correction for the truncation
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])
                  )  # [nenvs * nsteps, nact]
        logf_bc = tf.log(f + eps)  # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
        gain_bc = tf.reduce_sum(
            logf_bc *
            tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f),
            axis=1)  # IMP: This is sum, as expectation wrt f
        loss_bc = -tf.reduce_mean(gain_bc)
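        # loss_f uses importance weights truncated at c (min(c, rho_i));
        # loss_bc compensates for that truncation with an expectation over
        # actions under f, each weighted by relu(1 - c/rho), i.e. ACER's
        # truncation-with-bias-correction trick.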

        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance
        check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]),
                                  tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

        # Net loss
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)

        # Total loss
        loss = loss_policy + q_coef * loss_q - ent_coef * entropy

        if trust_region:
            g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps *
                             nenvs, f)  # [nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = -f_pol / (
                f + eps
            )  # [nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
            k_dot_g = tf.reduce_sum(k * g, axis=-1)
            adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) /
                             (tf.reduce_sum(tf.square(k), axis=-1) +
                              eps))  # [nenvs * nsteps]
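            # adj = max(0, (k.g - delta) / ||k||^2) is the step needed to project g
            # back onto the linearized KL constraint k.g <= delta; g is shifted by
            # -adj * k below, after the logging stats are computed.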

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            grads_f = -g / (
                nenvs * nsteps
            )  # These are trust region adjusted gradients wrt f, i.e. the statistics of policy pi
            grads_policy = tf.gradients(f, params, grads_f)
            grads_q = tf.gradients(loss_q * q_coef, params)
            # print("=========================== gards add ==============================")
            grads = [
                gradient_add(g1, g2, param)
                for (g1, g2, param) in zip(grads_policy, grads_q, params)
            ]
            # print("=========================== gards add ==============================\n")
            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.global_norm(grads_q)
            norm_grads_policy = tf.global_norm(grads_policy)
        else:
            grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=self.LR,
                                            decay=rprop_alpha,
                                            epsilon=rprop_epsilon)
        _policy_opt_op = trainer.apply_gradients(grads)
        if not self.dynamics.dummy:
            _train_dynamics = trainer.minimize(self.dynamics.loss)
            self.run_ops_dynamics = [
                _train_dynamics,
                self.dynamics.aux_loss,
                self.dynamics.dyna_loss,
            ]
            self.name_ops_dynamics = ["aux_loss", "dyna_loss"]
        # so when you call _train, you first do the gradient step, then you apply ema
        with tf.control_dependencies([_policy_opt_op]):
            _train_policy = tf.group(ema_apply_op)

        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging
        self.run_ops_policy = [
            _train_policy, loss, loss_q, entropy, loss_policy, loss_f, loss_bc,
            ev, norm_grads
        ]
        self.names_ops_policy = [
            'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc',
            'explained_variance', 'norm_grads'
        ]
        if trust_region:
            self.run_ops_policy = self.run_ops_policy + [
                norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k,
                avg_norm_g, avg_norm_k_dot_g, avg_norm_adj
            ]
            self.names_ops_policy = self.names_ops_policy + [
                'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f',
                'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj'
            ]
        self.names_ops_policy = [
            scope + "_" + x for x in self.names_ops_policy
        ]  # scope as prefix

        self.save = functools.partial(save_variables,
                                      sess=self.sess,
                                      variables=params)

        self.initial_state = self.step_model.initial_state
        tf.global_variables_initializer().run(session=self.sess)

    def train_policy(self,
                     obs,
                     next_obs,
                     actions,
                     rewards,
                     dones,
                     mus,
                     states,
                     masks,
                     steps,
                     goal_obs,
                     verbose=False):
        cur_lr = self.lr.value_steps(steps)
        # 1. calculate v_{t+1} using obs_{t+1} and g_t
        td_map = {self.train_model.X: next_obs}
        if not self.dynamics.dummy:
            assert hasattr(self.train_model, "goals")
            if self.residual:
                td_map[self.train_model.goals] = goal_obs - next_obs
            else:
                td_map[self.train_model.goals] = goal_obs
        v_next = self.sess.run(self.v, feed_dict=td_map)
        # 2. use obs_t, goal_t, v_{t+1} to train policy
        td_map = {
            self.train_model.X: obs,
            self.polyak_model.X: obs,
            self.A: actions,
            self.R: rewards,
            self.D: dones,
            self.MU: mus,
            self.LR: cur_lr,
            self.V_NEXT: v_next
        }
        if not self.dynamics.dummy:
            assert hasattr(self.train_model, "goals")
            assert hasattr(self.polyak_model, "goals")
            if hasattr(self, "goal_rms"):
                self.goal_rms.update(goal_obs)
            if self.residual:
                td_map[self.train_model.goals] = goal_obs - obs
                td_map[self.polyak_model.goals] = goal_obs - obs
            else:
                td_map[self.train_model.goals] = goal_obs
                td_map[self.polyak_model.goals] = goal_obs
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks
            td_map[self.polyak_model.S] = states
            td_map[self.polyak_model.M] = masks
        if verbose:
            names_ops_policy = self.names_ops_policy.copy()
            values_ops_policy = self.sess.run(self.run_ops_policy,
                                              td_map)[1:]  # strip off _train
        else:
            names_ops_policy = self.names_ops_policy.copy(
            )[:8]  # not including trust region
            values_ops_policy = self.sess.run(self.run_ops_policy,
                                              td_map)[1:][:8]

        unimportant_key = ["loss_f", "loss_bc"]
        for name in names_ops_policy.copy():
            for suffix in unimportant_key:
                if name.endswith(suffix):
                    index = names_ops_policy.index(name)
                    names_ops_policy.pop(index)
                    values_ops_policy.pop(index)
                    break

        return names_ops_policy, values_ops_policy

    def train_dynamics(self, obs, actions, next_obs, steps, nb_epoch=1):
        value_ops_dynamics = []
        for epoch in range(nb_epoch):
            cur_lr = self.lr.value_steps(steps)
            td_map = {
                self.dynamics.obs: obs,
                self.dynamics.next_obs: next_obs,
                self.dynamics.ac: actions,
                self.LR: cur_lr
            }
            value = self.sess.run(self.run_ops_dynamics, td_map)[1:]
            value_ops_dynamics.append(value)
        value_ops_dynamics = np.asarray(value_ops_dynamics)
        value_ops_dynamics = list(np.mean(value_ops_dynamics, axis=0))
        return self.name_ops_dynamics.copy(), value_ops_dynamics

    def step(self, observation, **kwargs):
        if self.residual and not self.dynamics.dummy:
            kwargs["goals"] = kwargs["goals"] - observation
        return self.step_model.evaluate(
            [self.step_model.action, self.step_model_p, self.step_model.state],
            observation, **kwargs)
Example #19
class MlpPolicy(object):
    recurrent = False

    def __init__(self, name, *args, **kwargs):
        self.scope = name
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            self._init(*args, **kwargs)

    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=False,
              popart=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[None] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope("popart"):
            self.v_rms = RunningMeanStd(shape=[1])

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "vffc%i" % (i + 1),
                      weight_init=U.normc_initializer(1.0)))
        self.norm_vpred = dense(last_out,
                                1,
                                "vffinal",
                                weight_init=U.normc_initializer(1.0))[:, 0]
        if popart:
            self.vpred = denormalize(self.norm_vpred, self.v_rms)
        else:
            self.vpred = self.norm_vpred

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "polfc%i" % (i + 1),
                      weight_init=U.normc_initializer(1.0)))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense(last_out,
                         pdtype.param_shape()[0] // 2, "polfinal",
                         U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = dense(last_out,
                            pdtype.param_shape()[0], "polfinal",
                            U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # change for BC
        stochastic = U.get_placeholder(name="stochastic",
                                       dtype=tf.bool,
                                       shape=())

        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.mean_and_logstd = U.function([ob], [self.pd.mean, self.pd.logstd])

        self.ac = ac
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        self.use_popart = popart
        if popart:
            self.init_popart()

        ret = tf.placeholder(tf.float32, [None])
        vferr = tf.reduce_mean(tf.square(self.vpred - ret))
        self.vlossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr,
                                                  self.get_vf_variable()))

    def init_popart(self):
        old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.v_rms.std
        old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.v_rms.mean
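        # The assign ops below rescale the final value layer so that the
        # denormalized prediction is unchanged when v_rms is updated (Pop-Art):
        # M <- M*old_std/new_std, b <- (old_std*b + old_mean - new_mean)/new_std.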

        renormalize_Q_outputs_op = []
        vs = self.output_vars
        M, b = vs
        renormalize_Q_outputs_op += [M.assign(M * old_std / new_std)]
        renormalize_Q_outputs_op += [
            b.assign((b * old_std + old_mean - new_mean) / new_std)
        ]
        self.renorm_v = U.function([old_std, old_mean], [],
                                   updates=renormalize_Q_outputs_op)

    def act(self, stochastic, ob):
        ac1, vpred1 = self._act(stochastic, ob[None])
        return ac1[0], vpred1[0]

    def get_mu_logstd(self, ob):
        mean, logstd = self.mean_and_logstd(ob[None])
        return mean[0], logstd[0]

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.trainable_variables(self.scope)

    def get_initial_state(self):
        return []

    def get_vf_variable(self):
        return tf.trainable_variables(self.scope + "/vf")

    def update_popart(self, v_targets):
        old_mean, old_std = U.get_session().run(
            [self.v_rms.mean, self.v_rms.std])
        self.v_rms.update(v_targets)
        self.renorm_v(old_std, old_mean)

    @property
    def output_vars(self):
        output_vars = [
            var for var in self.get_vf_variable() if 'vffinal' in var.name
        ]
        return output_vars

    def save_policy(self, name):
        U.save_variables(name, variables=self.get_variables())

    def load_policy(self, name):
        U.load_variables(name, variables=self.get_variables())
Example #20
class DDPG(object):
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.):

        concat_z = np.zeros((observation_shape[0] - 2))
        z = np.zeros((119))

        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')

        self.encoding = tf.placeholder(tf.float32,
                                       shape=(None, ) + z.shape,
                                       name='encoding')
        self.kls = tf.placeholder(tf.float32, shape=(None, 1), name='kls')

        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')

        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        self.normalized_obs0 = tf.clip_by_value(
            normalize(self.obs0, self.obs_rms), self.observation_range[0],
            self.observation_range[1])
        self.normalized_obs1 = tf.clip_by_value(
            normalize(self.obs1, self.obs_rms), self.observation_range[0],
            self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor

        # Create target networks.
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        # Main Actor Network
        self.actor_tf = actor(self.normalized_obs0)
        # Main Actor network based kl for initial state
        self.encoder_tf_mu, self.encoder_tf_sigma = actor(
            self.normalized_obs0, True)

        # Target Actor network based kl for initial state
        self.t_encoder_tf_mu, self.t_encoder_tf_sigma = target_actor(
            self.normalized_obs0, True)

        # Main Critic Network
        self.normalized_critic_tf = critic(self.normalized_obs0,
                                           self.encoder_tf_mu, self.actions)
        self.normalized_critic_with_actor_tf = critic(self.normalized_obs0,
                                                      self.encoder_tf_mu,
                                                      self.actor_tf,
                                                      reuse=True)

        # only for stats
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)

        # Main network based kl of initial state for actor optimization
        self.kl = tf.reduce_sum(tf.exp(self.encoder_tf_sigma) +
                                tf.square(self.encoder_tf_mu) - 1. -
                                self.encoder_tf_sigma,
                                axis=1)
        # Target network based kl of initial state for actor optimization
        self.t0_kl = tf.reshape(
            tf.reduce_sum(tf.exp(self.t_encoder_tf_sigma) +
                          tf.square(self.t_encoder_tf_mu) - 1. -
                          self.t_encoder_tf_sigma,
                          axis=1), (-1, 1))
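        # Assuming encoder_tf_sigma is the log-variance of the encoder's Gaussian,
        # sum(exp(sigma) + mu^2 - 1 - sigma) equals 2 * KL(N(mu, exp(sigma)) || N(0, I));
        # the missing 1/2 can be absorbed into the KL coefficient (currently 0.0 below).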

        Q_obs1 = denormalize(
            target_critic(self.normalized_obs1, self.t_encoder_tf_mu,
                          self.target_actor(self.normalized_obs1)),
            self.ret_rms)

        self.target_Q = self.rewards + 0.0 * self.kls + (
            1. - self.terminals1) * gamma * Q_obs1
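        # One-step Bellman target: r + gamma * (1 - terminal) * Q'(s', mu'(s')),
        # with the KL bonus currently weighted by 0.0 (disabled).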

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(self.normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet

    def setup_target_network_updates(self):
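        # Assuming the baselines-style get_target_updates helper: the init ops
        # hard-copy main -> target once, and the soft ops perform Polyak averaging
        # target <- (1 - tau) * target + tau * main on every update_target_net().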
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))
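        # RMS distance in action space between the unperturbed actor and the
        # adaptively perturbed copy; adapt_param_noise() feeds it to
        # param_noise.adapt() to grow or shrink the perturbation stddev.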

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')

        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)

        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))

        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) \
                           - 0.0 * U.flatgrad(self.kl, self.actor.trainable_vars)

        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):

        logger.info('setting up critic optimizer')

        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])

        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))

        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if var.name.endswith('/w:0') and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg

        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))

        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []

        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def step(self, obs, apply_noise=True, compute_Q=True):

        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf

        # find z here as first thing

        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}

        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action[0].shape
            action += noise

        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def learnt_step(self, obs):

        actor_tf = self.actor_tf
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        action = self.sess.run(actor_tf, feed_dict=feed_dict)
        q = None
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q, None, None

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:

            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })

            self.ret_rms.update(target_Q.flatten())

            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

            # Run sanity check. Note: this slows training down considerably.
            print('running sanity check')

            target_Q_new, new_mean, new_std = self.sess.run(
                [self.target_Q, self.ret_rms.mean, self.ret_rms.std],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })

            print(target_Q_new, target_Q, new_mean, new_std)

            assert (np.abs(target_Q - target_Q_new) < 1e-3).all()

        else:
            t0_kl = self.sess.run(self.t0_kl,
                                  feed_dict={
                                      self.obs0: batch['obs0'],
                                  })

            target_Q = self.sess.run(self.target_Q,
                                     feed_dict={
                                         self.obs1:
                                         batch['obs1'],
                                         self.obs0:
                                         batch['obs0'],
                                         self.rewards:
                                         batch['rewards'],
                                         self.terminals1:
                                         batch['terminals1'].astype('float32'),
                                         self.kls:
                                         t0_kl,
                                     })

        # Get all gradients and perform a synced update.
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            [
                self.actor_grads, self.actor_loss, self.critic_grads,
                self.critic_loss
            ],
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0: self.stats_sample['obs0'],
                                   self.actions: self.stats_sample['actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })

        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(
                distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        else:
            mean_distance = distance

        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })

    def save(self, save_path):
        U.save_variables(save_path, None, self.sess)

    def load(self, sess, save_path):
        self.sess = sess
        U.load_variables(save_path, None, self.sess)
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        B = obs0.shape[0]
        for b in range(B):
            self.memory.append(obs0[b], action[b], reward[b], obs1[b],
                               terminal1[b])
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)
Example #21
class DDPG_paramnoise(object):
    """
    Implicit Policy Optimization for DDPG
    Noise is injected in the middle of the black box (parameter noise).
    """
    def __init__(self,
                 maxactor,
                 maxentactor,
                 critic,
                 classifier,
                 memory,
                 fifomemory,
                 observation_shape,
                 action_shape,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 critic_l2_reg=0.,
                 classifier_l2_reg=0.,
                 maxactor_lr=1e-4,
                 maxentactor_lr=1e-4,
                 critic_lr=1e-3,
                 classifier_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.,
                 entropy_coeff=1.,
                 beta=0.0):
        # Inputs.
        self.obs0_act = tf.placeholder(tf.float32,
                                       shape=(1, ) + observation_shape,
                                       name='obs0_act')
        self.obs0_train = tf.placeholder(tf.float32,
                                         shape=(batch_size, ) +
                                         observation_shape,
                                         name='obs0_train')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(batch_size, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions_act = tf.placeholder(tf.float32,
                                          shape=(1, ) + action_shape,
                                          name='actions_act')
        self.actions_train = tf.placeholder(tf.float32,
                                            shape=(64, ) + action_shape,
                                            name='actions_train')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.fifomemory = fifomemory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.action_shape = action_shape
        self.critic = critic
        self.maxactor = maxactor
        self.maxentactor = maxentactor
        self.classifier = classifier
        self.maxactor_lr = maxactor_lr
        self.maxentactor_lr = maxentactor_lr
        self.critic_lr = critic_lr
        self.classifier_lr = classifier_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.classifier_l2_reg = classifier_l2_reg
        self.entropy_coeff = entropy_coeff

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0_act = tf.clip_by_value(
            normalize(self.obs0_act, self.obs_rms), self.observation_range[0],
            self.observation_range[1])
        normalized_obs0_train = tf.clip_by_value(
            normalize(self.obs0_train, self.obs_rms),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        self.normalized_obs0_act = normalized_obs0_act  # record normalized_obs0
        self.normalized_obs0_train = normalized_obs0_train
        self.normalized_obs1 = normalized_obs1

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_maxactor = copy(maxactor)
        target_maxentactor = copy(maxentactor)
        target_maxactor.name = 'target_maxactor'
        self.target_maxactor = target_maxactor
        target_maxentactor.name = 'target_maxentactor'
        self.target_maxentactor = target_maxentactor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.maxactor_tf_act = maxactor(normalized_obs0_act)
        self.maxentactor_tf_act = maxentactor(normalized_obs0_act)
        self.maxactor_tf_train = maxactor(normalized_obs0_train, reuse=True)
        self.maxentactor_tf_train = maxentactor(normalized_obs0_train,
                                                reuse=True)
        nb_actions = maxactor.nb_actions

        # Create interpolated action for act
        batch_act = self.maxactor_tf_act.get_shape().as_list()[0]
        mask_act = tf.random_uniform(
            tf.stack([batch_act]), minval=0, maxval=1, dtype=tf.float32) < beta
        self.actor_tf_act = tf.where(mask_act, self.maxactor_tf_act,
                                     self.maxentactor_tf_act)

        # Create interpolated action for train
        batch_train = self.maxactor_tf_train.get_shape().as_list()[0]
        mask_train = tf.random_uniform(
            tf.stack([batch_train
                      ]), minval=0, maxval=1, dtype=tf.float32) < beta
        self.actor_tf_train = tf.where(mask_train, self.maxactor_tf_train,
                                       self.maxentactor_tf_train)
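        # Per-sample Bernoulli(beta) mixture: with probability beta the action is
        # taken from maxactor, otherwise from maxentactor (presumably the
        # entropy-regularized actor, as the names suggest).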

        # Create graphs for critic for train
        self.normalized_critic_tf = critic(normalized_obs0_train,
                                           self.actions_train)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_maxactor_tf = critic(
            normalized_obs0_train, self.maxactor_tf_train, reuse=True)
        self.normalized_critic_with_maxentactor_tf = critic(
            normalized_obs0_train, self.maxentactor_tf_train, reuse=True)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0_act,
                                                      self.actor_tf_act,
                                                      reuse=True)  # act
        self.critic_with_maxactor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_maxactor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)
        self.critic_with_maxentactor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_maxentactor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)

        # Create interpolated target action for train
        batch_train = normalized_obs0_train.get_shape().as_list()[0]
        mask_train = tf.random_uniform(
            tf.stack([batch_train
                      ]), minval=0, maxval=1, dtype=tf.float32) < beta
        self.target_actions = tf.where(
            mask_train, self.target_maxactor(normalized_obs1),
            self.target_maxentactor(normalized_obs1))
        Q_obs1 = denormalize(
            target_critic(normalized_obs1, self.target_actions), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Create graphs for critic for act
        #self.normalized_critic_tf_act = critic(normalized_obs0_act, self.actions_act)
        #self.critic_tf_act = denormalize(tf.clip_by_value(self.normalized_critic_tf_act, self.return_range[0], self.return_range[1]), self.ret_rms)

        # Classifier Network
        self.random_actions = tf.placeholder(tf.float32,
                                             shape=(None, ) + action_shape,
                                             name='random_actions')
        #self.logit = classifier(normalized_obs0_train, self.actor_tf_train)  # actions produced by policy for backprop
        self.logit = classifier(normalized_obs0_train,
                                self.maxentactor_tf_train)
        self.random_logit = classifier(normalized_obs0_train,
                                       self.random_actions,
                                       reuse=True)

        # Set up parts.
        self.setup_approx_entropy()
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        self.setup_classifier_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

    def setup_target_network_updates(self):
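        # get_target_updates returns a hard copy (run once at init) and a
        # Polyak soft update, target <- (1 - tau) * target + tau * source;
        # both actors and the critic each get their own pair below.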
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.maxactor.vars, self.target_maxactor.vars, self.tau)
        actor_init_updates_, actor_soft_updates_ = get_target_updates(
            self.maxentactor.vars, self.target_maxentactor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [
            actor_init_updates, actor_init_updates_, critic_init_updates
        ]
        self.target_soft_updates = [
            actor_soft_updates, actor_soft_updates_, critic_soft_updates
        ]

    def setup_approx_entropy(self):
        logger.info('setting up approx entropy')
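        # Density-ratio view: with the classifier trained to separate the
        # maxent actor's actions (positives) from uniform random actions
        # (negatives), the optimal logit approaches log(pi(a|s) / p_uniform),
        # so -E[logit] tracks the policy entropy up to an additive constant.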
        self.approx_entropy = -tf.reduce_mean(self.logit)

    def setup_actor_optimizer(self):
        # maxactor
        logger.info('setting up maxactor optimizer')
        self.maxactor_loss = -tf.reduce_mean(self.critic_with_maxactor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.maxactor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        # Add entropy into actor loss
        self.maxactor_grads = U.flatgrad(self.maxactor_loss,
                                         self.maxactor.trainable_vars,
                                         clip_norm=self.clip_norm)
        self.maxactor_optimizer = MpiAdam(
            var_list=self.maxactor.trainable_vars,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08)

        # maxentactor
        logger.info('setting up maxentactor optimizer')
        self.maxentactor_loss = -tf.reduce_mean(
            self.critic_with_maxentactor_tf)
        actor_shapes = [
            var.get_shape().as_list()
            for var in self.maxentactor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        logger.info('using entropy coeff {}'.format(self.entropy_coeff))
        self.maxentactor_loss += -self.entropy_coeff * self.approx_entropy
        # Add entropy into actor loss
        self.maxentactor_grads = U.flatgrad(self.maxentactor_loss,
                                            self.maxentactor.trainable_vars,
                                            clip_norm=self.clip_norm)
        self.maxentactor_optimizer = MpiAdam(
            var_list=self.maxentactor.trainable_vars,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_classifier_optimizer(self):
        logger.info('setting up classifier optimizer')
        #self.classifier_loss = - (tf.reduce_mean(tf.log(1e-8 + tf.sigmoid(self.logit)))
        #                          + tf.reduce_mean(tf.log(1e-8 + 1 - tf.sigmoid(self.random_logit))))
        # Binary discriminator with a single logit per (state, action) pair:
        # softmax over a single class is degenerate (the loss is identically
        # zero), so use sigmoid cross-entropy instead, labelling the policy's
        # actions as positives and uniform random actions as negatives, as in
        # the commented-out formulation above.
        self.classifier_loss = (
            tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=self.logit, labels=tf.ones_like(self.logit))) +
            tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=self.random_logit,
                    labels=tf.zeros_like(self.random_logit))))
        if self.classifier_l2_reg > 0.:
            classifier_reg_vars = [
                var for var in self.classifier.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            for var in classifier_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.classifier_l2_reg))
            classifier_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.classifier_l2_reg),
                weights_list=classifier_reg_vars)
            self.classifier_loss += classifier_reg
        classifier_shapes = [
            var.get_shape().as_list() for var in self.classifier.trainable_vars
        ]
        classifier_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in classifier_shapes])
        logger.info('  classifier shapes: {}'.format(classifier_shapes))
        logger.info('  classifier params: {}'.format(classifier_nb_params))
        self.classifier_grads = U.flatgrad(self.classifier_loss,
                                           self.classifier.trainable_vars,
                                           clip_norm=self.clip_norm)
        self.classifier_optimizer = MpiAdam(
            var_list=self.classifier.trainable_vars,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
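        # PopArt keeps the de-normalized Q unchanged when the return statistics
        # move: W <- W * old_std / new_std and
        # b <- (old_std * b + old_mean - new_mean) / new_std are applied to the
        # critic's output layer (and its target copy).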
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        #ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        #names += ['reference_actor_Q_mean']
        #ops += [reduce_std(self.critic_with_actor_tf)]
        #names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf_train)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf_train)]
        names += ['reference_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if apply_noise:
            actor_tf = self.actor_tf_act
        else:
            # TODO: without noise this should arguably return a deterministic
            # (mean) action; for now both branches use the stochastic mixture.
            actor_tf = self.actor_tf_act
        feed_dict = {self.obs0_act: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        self.fifomemory.append(obs0, action)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q,
                                     feed_dict={
                                         self.obs1:
                                         batch['obs1'],
                                         self.rewards:
                                         batch['rewards'],
                                         self.terminals1:
                                         batch['terminals1'].astype('float32'),
                                     })

        # Get a batch from recent policy then update classifier
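        # The classifier sees recent observations from the FIFO buffer; its
        # positives are the maxentactor's actions on those observations (see
        # self.logit) and its negatives are uniform random actions, so
        # approx_entropy tracks the *current* policy rather than the full
        # replay history.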
        batch_recent = self.fifomemory.sample(batch_size=self.batch_size)
        random_actions = np.random.uniform(
            low=self.action_range[0],
            high=self.action_range[1],
            size=[self.batch_size,
                  np.prod(np.array(self.action_shape))]).astype('float32')
        ops = [
            self.classifier_grads, self.classifier_loss, self.approx_entropy
        ]
        classifier_grads, classifier_loss, approx_entropy = self.sess.run(
            ops,
            feed_dict={
                self.obs0_train: batch_recent['obs0'],
                self.random_actions: random_actions
            })
        self.classifier_optimizer.update(classifier_grads,
                                         stepsize=self.classifier_lr)

        # Get all gradients and perform a synced update.
        ops = [
            self.maxactor_grads, self.maxactor_loss, self.maxentactor_grads,
            self.maxentactor_loss, self.critic_grads, self.critic_loss
        ]
        maxactor_grads, maxactor_loss, maxentactor_grads, maxentactor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0_train: batch['obs0'],
                self.actions_train: batch['actions'],
                self.critic_target: target_Q,
            })
        self.maxactor_optimizer.update(maxactor_grads,
                                       stepsize=self.maxactor_lr)
        self.maxentactor_optimizer.update(maxentactor_grads,
                                          stepsize=self.maxentactor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, maxactor_loss, maxentactor_loss, classifier_loss, approx_entropy

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.maxactor_optimizer.sync()
        self.maxentactor_optimizer.sync()
        self.critic_optimizer.sync()
        self.classifier_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
            #print(self.stats_sample['obs0'].shape, self.stats_sample['actions'].shape)
            #print(self.obs0_train, self.actions_train)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0_train:
                                   self.stats_sample['obs0'],
                                   self.actions_train:
                                   self.stats_sample['actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))
        return stats

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
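
# --- Illustrative aside (not part of the original example) -------------------
# A minimal, self-contained numpy sketch of the per-sample actor mixing used in
# the __init__ above: each batch row follows the "max" actor with probability
# beta and the "max-entropy" actor otherwise. The function name and shapes are
# assumptions for illustration only.
import numpy as np


def mix_actions(max_actions, maxent_actions, beta, rng=np.random):
    """Row i comes from max_actions with probability beta, else from
    maxent_actions (mirrors the tf.random_uniform < beta mask + tf.where)."""
    mask = rng.uniform(size=(max_actions.shape[0], 1)) < beta
    return np.where(mask, max_actions, maxent_actions)


# e.g. mix_actions(np.zeros((4, 2)), np.ones((4, 2)), beta=0.25) keeps roughly
# one row in four from the first (all-zero) actor.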
Example #22
0
class BDDPG(object):
    def __init__(self,
                 actor,
                 critic,
                 obs_dim,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.95,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.expert_qv = tf.placeholder(tf.float32,
                                        shape=(None, 1),
                                        name='expert_qv')
        self.expert_qv1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='expert_qv1')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.expert_actions = tf.placeholder(tf.float32,
                                             shape=(None, ) + action_shape,
                                             name='expert_actions')
        self.expert_actions1 = tf.placeholder(tf.float32,
                                              shape=(None, ) + action_shape,
                                              name='expert_actions1')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = copy(actor)
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.obs_dim = obs_dim
        # self.critic_obs0 = self.experts[0].obs0
        # self.critic_obs1 = self.experts[0].obs1
        # self.critic_actor = self.experts[0].use_tf_actor

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(self.actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        bel0 = self.obs0[:, obs_dim:]
        bel_dim = observation_shape[0] - obs_dim
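        # bel0 is the belief slice of the observation; its normalized entropy
        # (scaled by 0.1 below) weights the learned actor's action against the
        # supplied expert action when forming actor_tf.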
        entropy = -tf.reduce_sum(
            bel0 * tf.log(bel0 + 1e-3) / math.log(bel_dim), axis=1) / bel_dim
        # entropy = tf.Print(entropy, [entropy], '>>>> entropy :', summarize=10)
        entropy = tf.expand_dims(0.1 * entropy, 1)
        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = entropy * self.actor(
            normalized_obs0,
            self.expert_qv) + (1 - entropy) * self.expert_actions
        self.normalized_critic_tf = critic(normalized_obs0, self.actions,
                                           self.expert_qv)
        self.critic_tf = tf.clip_by_value(self.normalized_critic_tf,
                                          self.return_range[0],
                                          self.return_range[1])

        self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                      self.actor_tf,
                                                      self.expert_qv)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)

        bel1 = self.obs1[:, obs_dim:]
        entropy1 = -tf.reduce_sum(
            bel1 * tf.log(bel1 + 1e-3) / math.log(bel_dim), axis=1) / bel_dim
        entropy1 = tf.expand_dims(0.1 * entropy1, 1)
        action1 = entropy1 * target_actor(normalized_obs1, self.expert_qv1) + (
            1 - entropy1) * self.expert_actions1
        self.Q_obs1 = target_critic(normalized_obs1, action1, self.expert_qv1)
        # self.Q_obs1 = tf.Print(self.Q_obs1, [self.Q_obs1], '>>>> Q :', summarize=10)
        # self.terminals1 = tf.Print(self.terminals1, [self.terminals1], '>>>> terminal :', summarize=10)

        self.target_Q = self.rewards + (1. -
                                        self.terminals1) * gamma * self.Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0, self.expert_qv)

        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet

    def setup_target_network_updates(self):
        # import IPython; IPython.embed() ; import sys; sys.exit(0)
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.perturbable_vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0, expert_qv0):
        assert self.param_noise is not None
        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)

        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0,
                                                    expert_qv0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0,
                                                       expert_qv0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if var.name.endswith('/w:0') and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def step(self,
             obs,
             expert_qv,
             expert_action,
             apply_noise=True,
             compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {
            self.obs0:
            U.adjust_shape(self.obs0, [obs]),
            self.expert_qv:
            U.adjust_shape(self.expert_qv, [expert_qv]),
            self.expert_actions:
            U.adjust_shape(self.expert_actions, [expert_action])
        }
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action[0].shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def store_transition(self, obs0, expert_qv, action, expert_action, reward,
                         obs1, expert_qv1, expert_action1, terminal1):
        reward *= self.reward_scale

        # B = obs0.shape[0]
        # for b in range(B):
        #     self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b])
        #     if self.normalize_observations:
        #         self.obs_rms.update(np.array([obs0[b]]))
        self.memory.append(obs0, expert_qv, action, expert_action, reward,
                           obs1, expert_qv1, expert_action1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(obs0)

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)
        # import IPython; IPython.embed(); import sys; sys.exit(0)
        target_Q, Q_obs1 = self.sess.run(
            [self.target_Q, self.Q_obs1],
            feed_dict={
                self.obs1: batch['obs1'],
                self.expert_qv1: batch['expert_qv1'],
                self.expert_actions1: batch['expert_actions1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.expert_qv: batch['expert_qv'],
                self.actions: batch['actions'],
                self.expert_actions: batch['expert_actions'],
                self.critic_target: target_Q,
            })

        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        # import IPython; IPython.embed(); exit(0)
        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        # self.graph = graph
        self.sess.run(tf.global_variables_initializer())
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()

        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0:
                                   self.stats_sample['obs0'],
                                   self.actions:
                                   self.stats_sample['actions'],
                                   self.expert_qv:
                                   self.stats_sample['expert_qv'],
                                   self.expert_actions:
                                   self.stats_sample['expert_actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.expert_actions:
                                     batch['expert_actions'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(
                distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        else:
            mean_distance = distance

        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })

    def save(self, path):
        save_variables(path)

    def load(self, path):
        load_variables(path)
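
# --- Illustrative aside (not part of the original example) -------------------
# A minimal, self-contained numpy version of the belief-entropy blend built in
# BDDPG.__init__ above: a small entropy-derived weight goes to the learned
# actor's action and the remainder to the expert action. Names and shapes are
# assumptions for illustration only (assumes a belief dimension >= 2).
import math

import numpy as np


def blend_with_expert(actor_action, expert_action, belief, scale=0.1):
    """w = scale * (-sum(b * log(b + 1e-3)) / log(dim)) / dim, then
    w * actor_action + (1 - w) * expert_action, as in BDDPG above."""
    bel_dim = belief.shape[1]
    ent = -np.sum(belief * np.log(belief + 1e-3), axis=1, keepdims=True)
    w = scale * ent / (math.log(bel_dim) * bel_dim)
    return w * actor_action + (1.0 - w) * expert_action

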
class DDPG(object):
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 critic_l2_reg=0.,
                 actor_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.,
                 bc_teacher_lambda=0.0,
                 use_qfilter=False):
        """DDPG.

        I changed observation_range to (0, 255) for the image-based RL part
        because we don't divide our images by 255 until later. The action and
        return range should be OK.
        """
        # Inputs. Daniel: for images, cast to a new variable which gets cast to the float.
        # Assumes we detect via observation space; I think MuJoCo envs have obs shape length 1.
        # Then we let the remainder be input to subsequent code that uses observations.
        if len(observation_shape) > 1:
            self.obs0 = tf.placeholder(tf.int32,
                                       shape=(None, ) + observation_shape,
                                       name='obs0_imgs')
            self.obs1 = tf.placeholder(tf.int32,
                                       shape=(None, ) + observation_shape,
                                       name='obs1_imgs')
            self.obs0_f_imgs = tf.cast(self.obs0, tf.float32) / 255.0
            self.obs1_f_imgs = tf.cast(self.obs1, tf.float32) / 255.0
            assert not normalize_observations, 'Why normalize if we already divide by 255?'
            # We don't want to clip raw pixels here.
            observation_range = (-np.inf, np.inf)
            self.use_images = True
            self.bc_teacher_lambda = bc_teacher_lambda
            self.use_qfilter = use_qfilter
        else:
            # Assuming default MuJoCo settings here.
            self.obs0 = tf.placeholder(tf.float32,
                                       shape=(None, ) + observation_shape,
                                       name='obs0')
            self.obs1 = tf.placeholder(tf.float32,
                                       shape=(None, ) + observation_shape,
                                       name='obs1')
            self.use_images = False
            self.bc_teacher_lambda = bc_teacher_lambda
            self.actor_l2_reg = 0.0
            self.use_qfilter = use_qfilter

        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')
        # Daniel: new for demos.
        self.flag_teacher = tf.placeholder(tf.float32,
                                           shape=(None, 1),
                                           name='flag_teacher')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.actor_l2_reg = actor_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        # Daniel: this is where all the obs are subsequently passed, thus handle image case.
        # That way our feed_dicts in later code can still use self.{obs0,obs1}.
        if self.use_images:
            normalized_obs0 = tf.clip_by_value(
                normalize(self.obs0_f_imgs, self.obs_rms),
                self.observation_range[0], self.observation_range[1])
            normalized_obs1 = tf.clip_by_value(
                normalize(self.obs1_f_imgs, self.obs_rms),
                self.observation_range[0], self.observation_range[1])
        else:
            normalized_obs0 = tf.clip_by_value(
                normalize(self.obs0, self.obs_rms), self.observation_range[0],
                self.observation_range[1])
            normalized_obs1 = tf.clip_by_value(
                normalize(self.obs1, self.obs_rms), self.observation_range[0],
                self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        # One actor. Two critics: action can be from:
        #   (1) itself (supplied via placeholder) -- for critic update, Q(s,a) sampled from RBuffer.
        #   (2) from actor_tf, supplied by the actor -- for actor update which maximizes Q(s,pi(o)).
        # Then create two de-normalized versions of those critics.
        #   self.critic_tf            : Q(s,a) where a is supplied by placeholder
        #   self.critic_with_actor_tf : Q(s,pi(s)) where pi(s) is the actor
        # Finally, get target Q values from target critic/actor.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                      self.actor_tf,
                                                      reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(
            target_critic(normalized_obs1, target_actor(normalized_obs1)),
            self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Daniel: add a Q-filter, 1 if Q(s,a) > Q(s,pi(s)) where former has `a` from *demonstrator*. Only after pre-training?
        self.flag_qfilter = tf.cast(self.critic_tf > self.critic_with_actor_tf,
                                    tf.float32)
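        # Consumed in setup_actor_optimizer(): outside of pre-training, the
        # behaviour-cloning term is masked by this flag, so only demonstrator
        # actions that out-score the current policy get imitated.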
        self.during_pretrain = tf.placeholder(tf.float32, (),
                                              name="during_pretrain_flag")

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        """Make actor loss, grads, and optimizer. Several changes:

        We use a behavior cloning loss (with a Q-filter on top of that), using
        actor_tf for the current actor's output given the state, and actions as
        placeholder for what was sampled from the buffer. The latter might have
        student actions, in which case we ignore these w/the flag.

        We apply L2 reg if desired (following DeepMind's DDPGfD). Careful
        w/variable names if we switch network construction code!!

        (Nair et al., 2018) set the `bc_teacher_lambda` term I'm using to 1,
        and average out the BC loss by all items in the batch, *regardless* of
        whether the item passed the Q-filter or not. We're doing the same here
        by dividing by the sum of the number of teacher flags.
        """
        logger.info('\nsetting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)

        if self.bc_teacher_lambda > 0.:
            # Daniel: add Behavior cloning loss to the actor, but only on teacher samples!
            # I'm doing a reduce_sum and dividing by the total in the flag teacher.
            self._diff_m = self.actor_tf - self.actions
            self._diff_v = tf.reduce_mean(tf.square(self._diff_m),
                                          axis=1,
                                          keepdims=True)
            self._diff_f = self._diff_v * self.flag_teacher
            # Daniel: another idea is to apply q-filters only if we are past pre-training.
            if self.use_qfilter:
                logger.info('  applying Q-filter flag: {}'.format(
                    self.flag_qfilter))
                self._diff_f = tf.cond(
                    self.during_pretrain > 0.5,
                    lambda: self._diff_f,  # pretrain? identity
                    lambda: self._diff_f * self.flag_qfilter
                )  # else? apply filter
            self.bc_loss = tf.reduce_sum(
                self._diff_f) / (tf.reduce_sum(self.flag_teacher) + 1e-6)
            self.actor_loss += self.bc_loss
            logger.info('  applying BC loss to actor with {}'.format(
                self.bc_teacher_lambda))
            logger.info('  diff_matrix: {}'.format(self._diff_m))
            logger.info('  diff_vector: {}'.format(self._diff_v))
            logger.info('  diff_filter: {}'.format(self._diff_f))

        if self.actor_l2_reg > 0.:
            actor_reg_vars = [
                var for var in self.actor.trainable_vars
                if ((var.name.endswith('/w:0') or var.name.endswith(
                    '/kernel:0')) and 'output' not in var.name)
            ]
            for var in actor_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.actor_l2_reg))
            actor_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.actor_l2_reg),
                weights_list=actor_reg_vars)
            self.actor_loss += actor_reg

        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}\n'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):
        """Make critic loss, grads, and optimizer. Minor change w/L2 regularization.

        I didn't realize that our custom code would name the variables a bit different.
        It actually makes a huge difference, as the critic's default L2 is 0.01. Just be
        careful if we decide to re-name the variables or use a different TF construction.
        """
        logger.info('\nsetting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))

        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if ((var.name.endswith('/w:0') or var.name.endswith(
                    '/kernel:0')) and 'output' not in var.name)
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg

        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}\n'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def step(self, obs, apply_noise=True, compute_Q=True):
        """Apply the policy.

        Note the noise: for DDPG if we are *deploying* it, we should probably
        set the noise to False, such as for the `--play` option.
        """
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            #assert noise.shape == action[0].shape # daniel: with my fix, both are (numenv, acdim)
            assert noise.shape == action.shape, '{} {}'.format(
                noise.shape, action.shape)
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def store_transition(self,
                         obs0,
                         action,
                         reward,
                         obs1,
                         terminal1,
                         is_teacher=False):
        """Store transitions for DDPG.

        Daniel: transitions are collected via a VecEnv, so we iterate over the
        batch dimension and append the individual components. It's serial but
        shouldn't be a time bottleneck. Note that everything here uses one-step
        returns; there are no n-step returns anywhere. The `is_teacher` flag
        marks whether the transition came from the teacher.
        """
        reward *= self.reward_scale
        B = obs0.shape[0]
        for b in range(B):
            self.memory.append(obs0[b],
                               action[b],
                               reward[b],
                               obs1[b],
                               terminal1[b],
                               is_teacher=is_teacher)
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def train(self, during_pretrain=False):
        """Daniel: added during_pretrain in case we want to do anything different there.

        By default it's false (and float(during_pretrain)=0.0) to maintain backwards compatibility.
        """
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q,
                                     feed_dict={
                                         self.obs1:
                                         batch['obs1'],
                                         self.rewards:
                                         batch['rewards'],
                                         self.terminals1:
                                         batch['terminals1'].astype('float32'),
                                     })

        ## Daniel: use this for debugging extra DDPG features we implemented:
        #ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss,
        #        self.critic_tf, self.critic_with_actor_tf, self.flag_teacher,
        #        self.flag_qfilter, self._diff_f, self.actor_tf, self.actions]
        #actor_grads, actor_loss, critic_grads, critic_loss, Q_demo, Q_actor, flag_t, flag_q, diff_f, act_tf, act_ph = \
        #            self.sess.run(ops, feed_dict={
        #    self.obs0: batch['obs0'],
        #    self.actions: batch['actions'],
        #    self.critic_target: target_Q,
        #    self.flag_teacher: batch['flag_teacher'],
        #    self.during_pretrain: float(during_pretrain),
        #})
        #print('\nQ(s,a), Q(s,pi(s)), act_tf, act_ph, diff_f, flag_q, flag_t')
        #print(Q_demo.T)
        #print(Q_actor.T)
        #print('now actors:')
        #print(act_tf.T)
        #print(act_ph.T)
        #print('now diff/flags:')
        #print(diff_f.T)
        #print(flag_q.T)
        #print(flag_t.T)

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
                self.flag_teacher: batch['flag_teacher'],
                self.during_pretrain: float(during_pretrain),
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        # Daniel: following PPO2 code outline, hoping to save/load models.
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        # Daniel: back to normal.
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0: self.stats_sample['obs0'],
                                   self.actions: self.stats_sample['actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(
                distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        else:
            mean_distance = distance

        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })
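# A minimal (hypothetical) driver sketch for this agent, assuming it is
# constructed elsewhere with compatible actor/critic/memory objects and a
# VecEnv `venv` (so observations and actions are batched over environments):
#
#   agent.initialize(U.get_session())
#   obs = venv.reset()
#   for t in range(total_steps):
#       action, q, _, _ = agent.step(obs, apply_noise=True)
#       new_obs, rew, done, _ = venv.step(action)
#       agent.store_transition(obs, action, rew, new_obs, done)
#       obs = new_obs
#       critic_loss, actor_loss = agent.train()
#       agent.update_target_net()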
Example #24
def learn(
        *,
        network,
        env,
        total_timesteps,
        timesteps_per_batch=1024,  # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        novelty_reward='AE',
        normalize_int_rew=False,
        **network_kwargs):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for adam optimizer used to optimize value function loss

    vf_iters                number of value function optimization iterations per policy optimization step

    total_timesteps         max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''
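    # A minimal (hypothetical) usage sketch, assuming a Gym/VecEnv environment
    # built elsewhere and that this module exposes `learn` directly:
    #
    #   pi = learn(network='mlp', env=env, total_timesteps=1_000_000,
    #              timesteps_per_batch=1024, max_kl=0.001, cg_iters=10,
    #              gamma=0.99, lam=0.98, novelty_reward='AE',
    #              normalize_int_rew=True)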

    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0

    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)
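    # `fvp` is the Fisher-vector product computed via the double-gradient
    # trick: gvp = sum_i (dKL/dtheta_i) * v_i, so grad(gvp) w.r.t. theta is the
    # Hessian of the mean KL times the tangent vector v, which is all that
    # conjugate gradient needs without ever forming the matrix explicitly.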

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))
    rff_rms_int = RunningMeanStd()
    nr = NOVELTY_REWARDS[novelty_reward](env.observation_space)

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= nworkers
        else:
            out = np.copy(x)

        return out

    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)

    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()

        # Calculate novelty rewards
        bonus = nr.get_batch_bonus_and_update(seg["ob"])
        if normalize_int_rew:
            rff_rms_int.update(bonus.ravel())
            bonus = bonus / rff_rms_int.std.eval()

        seg["orig_rew"] = seg["rew"]
        seg["rew"] = seg["rew"] + bonus
        add_vtarg_and_adv(seg, gamma, lam)
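        # GAE advantages and TD(lambda) returns are now computed on the
        # combined reward (extrinsic + normalized novelty bonus) above.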

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
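            # With shs = 0.5 * s^T H s, dividing the search direction by
            # lm = sqrt(shs / max_kl) scales the full step so that its
            # quadratic KL estimate, 0.5 * fullstep^T H fullstep, equals
            # max_kl; the backtracking loop below then halves the step until
            # the exact KL and the surrogate improvement are acceptable.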
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        if MPI is not None:
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        else:
            listoflrpairs = [lrlocal]

        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()

    return pi
Example #25
class RND(object):
    def __init__(self, name, ph_ob, args):
        self.convfeat = args.convfeat
        self.rep_size = args.rep_size
        self.enlargement = args.enlargement
        self.proportion_of_exp_used_for_predictor_update = args.proportion_of_exp_used_for_predictor_update
        self.scope = name

        with tf.variable_scope(self.scope):
            self.build_graph(ph_ob)

    def build_graph(self, ph_ob):
        ob = ph_ob[-1]
        assert len(ob.shape.as_list()) == 4  #B, H, W, C
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob.shape.as_list()[1:3] + [1])

        ob_norm = ob[:, :, :, -1:]
        ob_norm = tf.cast(ob_norm, tf.float32)
        ob_norm = tf.clip_by_value(
            (ob_norm - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # Random target network
        xr = tf.nn.leaky_relu(
            conv(ob_norm,
                 "c1r",
                 nf=self.convfeat * 1,
                 rf=8,
                 stride=4,
                 init_scale=np.sqrt(2)))
        xr = tf.nn.leaky_relu(
            conv(xr,
                 'c2r',
                 nf=self.convfeat * 2 * 1,
                 rf=4,
                 stride=2,
                 init_scale=np.sqrt(2)))
        xr = tf.nn.leaky_relu(
            conv(xr,
                 'c3r',
                 nf=self.convfeat * 2 * 1,
                 rf=3,
                 stride=1,
                 init_scale=np.sqrt(2)))
        rgbr = [to2d(xr)]
        X_r = fc(rgbr[0], 'fc1r', nh=self.rep_size, init_scale=np.sqrt(2))

        # Predictor network
        xrp = tf.nn.leaky_relu(
            conv(ob_norm,
                 'c1rp_pred',
                 nf=self.convfeat,
                 rf=8,
                 stride=4,
                 init_scale=np.sqrt(2)))
        xrp = tf.nn.leaky_relu(
            conv(xrp,
                 'c2rp_pred',
                 nf=self.convfeat * 2,
                 rf=4,
                 stride=2,
                 init_scale=np.sqrt(2)))
        xrp = tf.nn.leaky_relu(
            conv(xrp,
                 'c3rp_pred',
                 nf=self.convfeat * 2,
                 rf=3,
                 stride=1,
                 init_scale=np.sqrt(2)))
        rgbrp = to2d(xrp)

        X_r_hat = tf.nn.relu(
            fc(rgbrp,
               'fc1r_hat1_pred',
               nh=256 * self.enlargement,
               init_scale=np.sqrt(2)))
        X_r_hat = tf.nn.relu(
            fc(X_r_hat,
               'fc1r_hat2_pred',
               nh=256 * self.enlargement,
               init_scale=np.sqrt(2)))
        X_r_hat = fc(X_r_hat,
                     'fc1r_hat3_pred',
                     nh=self.rep_size,
                     init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
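        # RND intrinsic reward: the per-example MSE between the fixed, randomly
        # initialized target features X_r and the trained predictor features
        # X_r_hat. States the predictor has rarely seen give large errors and
        # therefore a large novelty bonus.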

        targets = tf.stop_gradient(X_r)
        # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat))
        self.aux_loss = tf.reduce_mean(tf.square(targets - X_r_hat), -1)

        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
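        # Only a random fraction of each batch (given by
        # proportion_of_exp_used_for_predictor_update) contributes to the
        # predictor loss, which effectively slows the predictor down relative
        # to the amount of experience collected.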
        self._predictor = U.function([ob], [self.int_rew])

    def predict(self, ob):
        obf = ob[-1]
        if len(obf.shape) == 3:
            obf = np.expand_dims(obf, 0)
        int_rew = self._predictor(obf)[0]
        return int_rew

    def update_obs_rms(self, ob):
        obf = np.array(list(zip(*ob.tolist()))[1])
        self.ob_rms.update(obf)

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
Example #26
class DDPG(object):
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 state_shape,
                 aux_shape,
                 lambda_obj_conf_predict,
                 lambda_gripper_predict,
                 lambda_target_predict,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 enable_popart=False,
                 normalize_observations=True,
                 normalize_state=True,
                 normalize_aux=True,
                 batch_size=128,
                 observation_range=(-10., 10.),
                 action_range=(-1., 1.),
                 state_range=(-4, 4),
                 return_range=(-250, 10),
                 aux_range=(-10, 10),
                 critic_l2_reg=0.001,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.,
                 replay_beta=0.4,
                 lambda_1step=1.0,
                 lambda_nstep=1.0,
                 nsteps=10,
                 run_name="unnamed_run",
                 lambda_pretrain=0.0,
                 target_policy_noise=0.2,
                 target_policy_noise_clip=0.5,
                 policy_and_target_update_period=2,
                 num_critics=2,
                 **kwargs):

        # Inputs.
        self.obs0 = tf.placeholder(
            tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(
            tf.float32, shape=(None,) + observation_shape, name='obs1')

        self.state0 = tf.placeholder(
            tf.float32, shape=(None,) + state_shape, name='state0')
        self.state1 = tf.placeholder(
            tf.float32, shape=(None,) + state_shape, name='state1')

        self.terminals1 = tf.placeholder(
            tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(
            tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(
            tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(
            tf.float32, shape=(None, 1), name='critic_target')

        self.nstep_steps = tf.placeholder(
            tf.float32, shape=(None, 1), name='nstep_reached')
        self.nstep_critic_target = tf.placeholder(
            tf.float32, shape=(None, 1), name='nstep_critic_target')

        #  Memory debug variables - memory and resident set size. Used
        #  for tensorboard plotting.
        self.memory_size = tf.placeholder(
            tf.float32, shape=None, name='memory_size')
        self.rss = tf.placeholder(tf.float32, shape=None, name='rss')

        self.aux0 = tf.placeholder(
            tf.float32, shape=(None,) + aux_shape, name='aux0')
        self.aux1 = tf.placeholder(
            tf.float32, shape=(None,) + aux_shape, name='aux1')

        self.pretraining_tf = tf.placeholder(
            tf.float32, shape=(None, 1),
            name='pretraining_tf')

        self.aux_shape = aux_shape
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_state = normalize_state
        self.normalize_aux = normalize_aux
        self.action_noise = action_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.actor = actor
        self.actor_lr = actor_lr
        self.state_range = state_range
        self.aux_range = aux_range
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.lambda_nstep = lambda_nstep
        self.lambda_1step = lambda_1step
        self.lambda_obj_conf_predict = lambda_obj_conf_predict
        self.lambda_gripper_predict = lambda_gripper_predict
        self.lambda_target_predict = lambda_target_predict
        self.nsteps = nsteps
        self.replay_beta = replay_beta
        self.run_name = run_name
        self.lambda_pretrain = lambda_pretrain
        self.target_policy_noise = target_policy_noise
        self.target_policy_noise_clip = target_policy_noise_clip
        self.ep = 0
        self.policy_and_target_update_period = policy_and_target_update_period

        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        if self.normalize_state:
            with tf.variable_scope('state_rms'):
                self.state_rms = RunningMeanStd(shape=state_shape)
        else:
            self.state_rms = None

        if self.normalize_aux:
            with tf.variable_scope('normalize_aux'):
                self.aux_rms = RunningMeanStd(shape=aux_shape)
        else:
            self.aux_rms = None
        with tf.name_scope('obs_preprocess'):
            self.normalized_obs0 = tf.clip_by_value(
                normalize(self.obs0, self.obs_rms), self.observation_range[0],
                self.observation_range[1])
            self.normalized_obs1 = tf.clip_by_value(
                normalize(self.obs1, self.obs_rms), self.observation_range[0],
                self.observation_range[1])
        with tf.name_scope('state_preprocess'):
            self.normalized_state0 = tf.clip_by_value(
                normalize(self.state0, self.state_rms), self.state_range[0],
                self.state_range[1])
            self.normalized_state1 = tf.clip_by_value(
                normalize(self.state1, self.state_rms), self.state_range[0],
                self.state_range[1])
        with tf.name_scope('aux_preprocess'):
            self.normalized_aux0 = tf.clip_by_value(
                normalize(self.aux0, self.aux_rms), self.aux_range[0],
                self.aux_range[1])
            self.normalized_aux1 = tf.clip_by_value(
                normalize(self.aux1, self.aux_rms), self.aux_range[0],
                self.aux_range[1])

        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor

        self.actor_tf, self.obj_conf, self.gripper, self.target = actor(
            self.normalized_obs0, self.normalized_aux0)
        next_actions, _, _, _ = target_actor(self.normalized_obs1,
                                             self.normalized_aux1)
        noise = tf.distributions.Normal(
            tf.zeros_like(next_actions), self.target_policy_noise).sample()
        noise = tf.clip_by_value(
            noise,
            -self.target_policy_noise_clip,
            self.target_policy_noise_clip,
        )
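        # Target policy smoothing (as in TD3): clipped Gaussian noise is added
        # to the target actor's actions before they are evaluated by the target
        # critics, which regularizes the bootstrapped Q targets.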

        # Initialize single/twin critics.
        self.num_critics = num_critics
        assert (num_critics == 1 or num_critics == 2)
        self.critics = [None] * num_critics
        self.target_critics = [None] * num_critics
        self.critic_tfs = [None] * num_critics
        self.critic_with_actor_tfs = [None] * num_critics
        self.step_1_td_losses = [None] * num_critics
        self.n_step_td_losses = [None] * num_critics
        self.td_errors = [None] * num_critics
        self.critic_losses = [None] * num_critics
        self.critic_grads = [None] * num_critics
        self.critic_optimizers = [None] * num_critics
        Q_obs1s = [None] * num_critics
        for i in range(num_critics):
            current_critic = copy(critic)
            current_critic.name = "critic" + str(i)
            self.critics[i] = current_critic
            self.target_critics[i] = copy(current_critic)
            self.target_critics[i].name = 'target_critic' + str(i)
            self.critic_tfs[i] = tf.clip_by_value(
                current_critic(self.normalized_state0, self.actions,
                               self.normalized_aux0), self.return_range[0],
                self.return_range[1])
            self.critic_with_actor_tfs[i] = tf.clip_by_value(
                current_critic(
                    self.normalized_state0,
                    self.actor_tf,
                    self.normalized_aux0,
                    reuse=True), self.return_range[0], self.return_range[1])
            Q_obs1s[i] = self.target_critics[i](self.normalized_state1,
                                                next_actions + noise,
                                                self.normalized_aux1)
        if num_critics == 2:
            minQ = tf.minimum(Q_obs1s[0], Q_obs1s[1])
        else:
            minQ = Q_obs1s[0]
        self.target_Q = self.rewards + \
                        (1. - self.terminals1) * tf.pow(gamma, self.nstep_steps) * minQ
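        # Clipped double-Q learning: the bootstrap uses the minimum of the twin
        # target critics (when num_critics == 2), and tf.pow(gamma, nstep_steps)
        # discounts over however many steps the sampled n-step transition
        # actually spans.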
        self.importance_weights = tf.placeholder(
            tf.float32, shape=(None, 1), name='importance_weights')
        self.setup_actor_optimizer()
        self.setup_stats()
        self.setup_target_network_updates()
        for i in range(num_critics):
            self.setup_critic_optimizer(i)
        self.setup_summaries()

    def setup_target_network_updates(self):
        with tf.name_scope('target_net_updates'):
            actor_init_updates, actor_soft_updates = get_target_updates(
                self.actor.vars, self.target_actor.vars, self.tau)
            target_init_updates = [actor_init_updates]
            target_soft_updates = [actor_soft_updates]
            for i in range(self.num_critics):
                init, soft = get_target_updates(self.critics[i].vars,
                                                self.target_critics[i].vars,
                                                self.tau)
                target_init_updates.append(init)
                target_soft_updates.append(soft)
            self.target_init_updates = target_init_updates
            self.target_soft_updates = target_soft_updates

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        with tf.name_scope('actor_optimizer'):
            self.action_diffs = tf.reduce_mean(
                tf.square(self.actions - self.actor_tf), 1)
            demo_better_than_actor = (
                self.critic_tfs[0] > self.critic_with_actor_tfs[0])
            demo_better_than_actor = self.pretraining_tf * \
                                     tf.cast(demo_better_than_actor, tf.float32)
            self.bc_loss = (
                    tf.reduce_sum(demo_better_than_actor * self.action_diffs) *
                    self.lambda_pretrain /
                    (tf.reduce_sum(self.pretraining_tf) + 1e-6))
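            # Behavior-cloning loss with a Q-filter: demonstration actions are
            # imitated only where the critic rates the demo action higher than
            # the actor's own action, averaged over the demo samples in the
            # batch and scaled by lambda_pretrain.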
            self.original_actor_loss = - tf.reduce_mean(self.critic_with_actor_tfs[0])
            self.obj_conf_loss = tf.reduce_mean(
                tf.square(self.obj_conf -
                          self.state0[:, 8:11])) * self.lambda_obj_conf_predict
            self.gripper_loss = tf.reduce_mean(
                tf.square(self.gripper -
                          self.state0[:, 0:3])) * self.lambda_gripper_predict
            self.target_loss = tf.reduce_mean(
                tf.square(self.target -
                          self.state0[:, 3:6])) * self.lambda_target_predict
            self.actor_loss = self.original_actor_loss + self.bc_loss + \
                              self.obj_conf_loss + self.gripper_loss + self.target_loss
            self.number_of_demos_better = tf.reduce_sum(
                demo_better_than_actor)
            actor_shapes = [
                var.get_shape().as_list() for var in self.actor.trainable_vars
            ]
            actor_nb_params = sum(
                [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
            logger.info('  actor shapes: {}'.format(actor_shapes))
            logger.info('  actor params: {}'.format(actor_nb_params))
            self.actor_grads = U.flatgrad(
                self.actor_loss,
                self.actor.trainable_vars,
                clip_norm=self.clip_norm)
            self.actor_optimizer = MpiAdam(
                var_list=self.actor.trainable_vars,
                beta1=0.9,
                beta2=0.999,
                epsilon=1e-08)

    def setup_critic_optimizer(self, i):
        with tf.name_scope('critic_optimizer' + str(i)):
            critic_target_tf = tf.clip_by_value(
                self.critic_target, self.return_range[0], self.return_range[1])
            nstep_critic_target_tf = tf.clip_by_value(self.nstep_critic_target,
                                                      self.return_range[0],
                                                      self.return_range[1])
            td_error = tf.square(self.critic_tfs[i] - critic_target_tf)
            self.step_1_td_losses[i] = tf.reduce_mean(
                self.importance_weights * td_error) * self.lambda_1step

            nstep_td_error = tf.square(self.critic_tfs[i] -
                                       nstep_critic_target_tf)

            self.n_step_td_losses[i] = tf.reduce_mean(
                self.importance_weights * nstep_td_error) * self.lambda_nstep

            self.td_errors[i] = td_error + nstep_td_error
            self.critic_losses[i] = self.step_1_td_losses[i] + \
                                    self.n_step_td_losses[i]
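            # The critic objective mixes the 1-step and n-step TD errors, each
            # weighted by the prioritized-replay importance weights and by
            # lambda_1step / lambda_nstep respectively; their sum is also what
            # drives the priority updates in train().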

            if self.critic_l2_reg > 0.:
                critic_reg_vars = [
                    var for var in self.critics[i].trainable_vars
                    if 'kernel' in var.name and 'output' not in var.name
                ]
                for var in critic_reg_vars:
                    logger.info('  regularizing: {}'.format(var.name))
                logger.info('  applying l2 regularization with {}'.format(
                    self.critic_l2_reg))
                critic_reg = tc.layers.apply_regularization(
                    tc.layers.l2_regularizer(self.critic_l2_reg),
                    weights_list=critic_reg_vars)
                self.critic_losses[i] += critic_reg
            critic_shapes = [
                var.get_shape().as_list()
                for var in self.critics[i].trainable_vars
            ]
            critic_nb_params = sum(
                [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
            logger.info('  critic shapes: {}'.format(critic_shapes))
            logger.info('  critic params: {}'.format(critic_nb_params))
            self.critic_grads[i] = U.flatgrad(
                self.critic_losses[i],
                self.critics[i].trainable_vars,
                clip_norm=self.clip_norm)
            self.critic_optimizers[i] = MpiAdam(
                var_list=self.critics[i].trainable_vars,
                beta1=0.9,
                beta2=0.999,
                epsilon=1e-08)

    def setup_summaries(self):
        tf.summary.scalar("actor_loss", self.actor_loss)
        for i in range(self.num_critics):
            name_suffix = str(i)
            tf.summary.scalar("critic_loss" + name_suffix,
                              self.critic_losses[i])
            tf.summary.scalar("1step_loss" + name_suffix,
                              self.step_1_td_losses[i])
            tf.summary.scalar("nstep_loss" + name_suffix,
                              self.n_step_td_losses[i])

        tf.summary.scalar("percentage_of_demonstrations",
                          tf.reduce_sum(self.pretraining_tf) / self.batch_size)
        tf.summary.scalar("number_of_demos_better_than_actor",
                          self.number_of_demos_better)
        tf.summary.histogram("pretrain_samples", self.pretraining_tf)
        tf.summary.scalar("bc_loss", self.bc_loss)
        tf.summary.scalar("obj_conf_loss", self.obj_conf_loss)
        tf.summary.scalar("target_loss", self.target_loss)
        tf.summary.scalar("gripper_loss", self.gripper_loss)
        tf.summary.scalar("original_actor_loss", self.original_actor_loss)
        tf.summary.scalar("memory_size", self.memory_size)
        tf.summary.scalar("rss", self.rss)
        self.scalar_summaries = tf.summary.merge_all()
        # reward
        self.r_plot_in = tf.placeholder(tf.float32, name='r_plot_in')
        self.r_plot = tf.summary.scalar("returns", self.r_plot_in)
        self.r_plot_in_eval = tf.placeholder(tf.float32, name='r_plot_in_eval')
        self.r_plot_eval = tf.summary.scalar("returns_eval",
                                             self.r_plot_in_eval)

        self.obj_conf_in_eval = tf.placeholder(
            tf.float32, name='obj_conf_in_eval')
        self.obj_conf_eval = tf.summary.scalar("obj_conf_eval",
                                               self.obj_conf_in_eval)

        self.grip_in_eval = tf.placeholder(tf.float32, name='grip_in_eval')
        self.grip_eval = tf.summary.scalar("grip_eval", self.grip_in_eval)

        self.target_in_eval = tf.placeholder(tf.float32, name='target_in_eval')
        self.target_eval = tf.summary.scalar("target_eval",
                                             self.target_in_eval)

        self.writer = tf.summary.FileWriter(
            tmp + '/summaries/' + self.run_name, graph=tf.get_default_graph())

    def save_reward(self, r):
        self.ep += 1
        summary = self.sess.run(self.r_plot, feed_dict={self.r_plot_in: r})
        self.writer.add_summary(summary, self.ep)

    def save_aux_prediction(self, obj_conf, grip, target):
        self.ep += 1
        obj_conf_summ, grip_summ, target_summ = self.sess.run(
            [self.obj_conf_eval, self.grip_eval, self.target_eval],
            feed_dict={
                self.obj_conf_in_eval: obj_conf,
                self.grip_in_eval: grip,
                self.target_in_eval: target
            })
        self.writer.add_summary(obj_conf_summ, self.ep)
        self.writer.add_summary(grip_summ, self.ep)
        self.writer.add_summary(target_summ, self.ep)

    def save_eval_reward(self, r, ep):
        summary = self.sess.run(
            self.r_plot_eval, feed_dict={self.r_plot_in_eval: r})
        self.writer.add_summary(summary, ep)

    def setup_stats(self):
        with tf.name_scope('stats'):
            ops = []
            names = []

            if self.normalize_observations:
                ops += [
                    tf.reduce_mean(self.obs_rms.mean),
                    tf.reduce_mean(self.obs_rms.std)
                ]
                names += ['obs_rms_mean', 'obs_rms_std']

            ops += [tf.reduce_mean(self.critic_tfs[0])]
            names += ['reference_Q_mean']
            ops += [reduce_std(self.critic_tfs[0])]
            names += ['reference_Q_std']

            ops += [tf.reduce_mean(self.critic_with_actor_tfs[0])]
            names += ['reference_actor_Q_mean']
            ops += [reduce_std(self.critic_with_actor_tfs[0])]
            names += ['reference_actor_Q_std']

            ops += [tf.reduce_mean(self.actor_tf)]
            names += ['reference_action_mean']
            ops += [reduce_std(self.actor_tf)]
            names += ['reference_action_std']

            self.stats_ops = ops
            self.stats_names = names

    def pi(self, obs, aux, state0, apply_noise=True, compute_Q=True):
        actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs], self.aux0: [aux], self.state0: [state0]}
        if compute_Q:
            action, q, obj_conf, gripper, target = self.sess.run(
                [
                    actor_tf, self.critic_with_actor_tfs[0], self.obj_conf,
                    self.gripper, self.target
                ],
                feed_dict=feed_dict)
        else:
            action, obj_conf, gripper, target = self.sess.run(
                [actor_tf, self.obj_conf, self.gripper, self.target],
                feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q, obj_conf, gripper, target

    def store_transition(self,
                         state,
                         obs0,
                         action,
                         reward,
                         state1,
                         obs1,
                         terminal1,
                         aux0,
                         aux1,
                         i,
                         demo=False):
        reward *= self.reward_scale
        if demo:
            self.memory.append_demonstration(state, obs0, action, reward,
                                             state1, obs1, terminal1, aux0, aux1, i)
        else:
            assert i is None
            self.memory.append(state, obs0, action, reward, state1, obs1,
                               terminal1, aux0, aux1, i)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

        if self.normalize_state:
            self.state_rms.update(np.array([state]))

        if self.normalize_aux:
            self.aux_rms.update(np.array([aux0]))

    def train(self, iteration, pretrain=False):
        batch, n_step_batch, percentage = self.memory.sample_rollout(
            batch_size=self.batch_size,
            nsteps=self.nsteps,
            beta=self.replay_beta,
            gamma=self.gamma,
            pretrain=pretrain)

        target_Q_1step = self.sess.run(
            self.target_Q,
            feed_dict={
                self.obs1: batch['obs1'],
                self.state1: batch['states1'],
                self.aux1: batch['aux1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
                self.nstep_steps: np.ones((self.batch_size, 1)),
            })

        target_Q_nstep = self.sess.run(
            self.target_Q,
            feed_dict={
                self.obs1: n_step_batch['obs1'],
                self.state1: n_step_batch['states1'],
                self.aux1: n_step_batch['aux1'],
                self.rewards: n_step_batch['rewards'],
                self.nstep_steps: n_step_batch['step_reached'],
                self.terminals1: n_step_batch['terminals1'].astype('float32'),
            })
        critic_grads = [None] * self.num_critics
        critic_losses = [None] * self.num_critics
        td_errors = [None] * self.num_critics
        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, *self.critic_grads,
            *self.critic_losses, *self.td_errors, self.scalar_summaries
        ]
        ret = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.importance_weights: batch['weights'],
                self.state0: batch['states0'],
                self.aux0: batch['aux0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q_1step,
                self.nstep_critic_target: target_Q_nstep,
                self.pretraining_tf: batch['demos'].astype('float32'),
                self.memory_size: len(self.memory.storage),
                self.rss: resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            })
        if self.num_critics == 2:
            (actor_grads, actor_loss,
             critic_grads[0], critic_grads[1],
             critic_losses[0], critic_losses[1],
             td_errors[0], td_errors[1],
             scalar_summaries) = ret
        else:
            (actor_grads, actor_loss, critic_grads[0],
             critic_losses[0], td_errors[0], scalar_summaries) = ret
        self.memory.update_priorities(batch['idxes'], td_errors[0])
        for i in range(self.num_critics):
            self.critic_optimizers[i].update(
                critic_grads[i], stepsize=self.critic_lr)
        self.writer.add_summary(scalar_summaries, iteration)
        if iteration % self.policy_and_target_update_period == 0:
            self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        return critic_losses[0], actor_loss

    def set_sess(self, sess):
        self.sess = sess

    def initialize_vars(self):
        self.sess.run(tf.global_variables_initializer())

    def sync_optimizers(self):
        self.actor_optimizer.sync()
        for i in range(self.num_critics):
            self.critic_optimizers[i].sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            self.stats_sample = self.memory.sample_prioritized(
                batch_size=self.batch_size, replay_beta=self.replay_beta)
        values = self.sess.run(
            self.stats_ops,
            feed_dict={
                self.obs0: self.stats_sample['obs0'],
                self.actions: self.stats_sample['actions'],
                self.aux0: self.stats_sample['aux0'],
                self.state0: self.stats_sample['states0'],
            })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        return stats

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()

    def write_summary(self, summary):
        agent_summary = {
            "gamma": self.gamma,
            "tau": self.tau,
            "normalize_observations": self.normalize_observations,
            "normalize_state": self.normalize_state,
            "normalize_aux": self.normalize_aux,
            "action_noise": self.action_noise,
            "action_range": self.action_range,
            "return_range": self.return_range,
            "observation_range": self.observation_range,
            "actor_lr": self.actor_lr,
            "state_range": self.state_range,
            "critic_lr": self.critic_lr,
            "clip_norm": self.clip_norm,
            "enable_popart": self.enable_popart,
            "reward_scale": self.reward_scale,
            "batch_size": self.batch_size,
            "critic_l2_reg": self.critic_l2_reg,
            "lambda_nstep": self.lambda_nstep,
            "lambda_1step": self.lambda_1step,
            "nsteps": self.nsteps,
            "replay_beta": self.replay_beta,
            "run_name": self.run_name,
            "lambda_pretrain": self.lambda_pretrain,
            "target_policy_noise": self.target_policy_noise,
            "target_policy_noise_clip": self.target_policy_noise_clip,
            "lambda_obj_conf_predict": self.lambda_obj_conf_predict,
            "lambda_target_predict": self.lambda_target_predict,
            "lambda_gripper_predict": self.lambda_gripper_predict,
        }
        summary["agent_summary"] = agent_summary
        md_string = self._markdownize_summary(summary)
        summary_op = tf.summary.text("param_info",
                                     tf.convert_to_tensor(md_string))
        text = self.sess.run(summary_op)
        self.writer.add_summary(text)
        self.writer.flush()
        print(md_string)

    @staticmethod
    def _markdownize_summary(data):
        result = []
        for section, params in data.items():
            result.append("### " + section)
            for param, value in params.items():
                result.append("* {} : {}".format(str(param), str(value)))
        return "\n".join(result)
class DDPG(object):
    def __init__(self,
                 prefix,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 dis_batch_size=512,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True,
                 adaptive_param_noise_policy_threshold=.1,
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 actor_dis_lr=1e-4,
                 critic_lr=1e-3,
                 exp_scale=1.0,
                 clip_norm=None,
                 reward_scale=1.):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0_' + prefix)
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1_' + prefix)
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1_' + prefix)
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards_' + prefix)
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions_' + prefix)
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target_' + prefix)
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev_' +
                                                 prefix)
        self.EXP_SCALE = tf.placeholder(tf.float32, [])
        # For distillation
        #self.dis_obs = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='dis_obs_' + prefix)
        self.dis_actions = tf.placeholder(tf.float32,
                                          shape=(None, ) + action_shape,
                                          name='dis_actions_' + prefix)
        self.dis_qs = tf.placeholder(tf.float32,
                                     shape=(None, 1),
                                     name='dis_qs_' + prefix)

        self.prefix = prefix

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.actor_dis_lr = actor_dis_lr
        self.critic_lr = critic_lr
        self.exp_scale = exp_scale
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.dis_batch_size = dis_batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms_' + self.prefix):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms_' + self.prefix):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor_' + self.prefix
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic_' + self.prefix
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                      self.actor_tf,
                                                      reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(
            target_critic(normalized_obs1, target_actor(normalized_obs1)),
            self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_actor_dis_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor_' + self.prefix
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure a separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor_' + self.prefix
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
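        # Action-space distance between the unperturbed and the adaptively perturbed policies;
        # adapt_param_noise() averages it over MPI workers and passes it to param_noise.adapt()
        # to grow or shrink the perturbation stddev.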
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_actor_dis_optimizer(self):
        logger.info('setting up actor distillation optimizer')
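        # Per-transition weights: exp(exp_scale * (partner Q - own Q)), clipped to [0.01, 100]
        # and normalized to sum to 1, so transitions where the partner's critic expects a higher
        # value dominate the action-matching loss below. stop_gradient keeps these weights from
        # back-propagating into the critic.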
        self.weights = tf.stop_gradient(
            tf.clip_by_value(
                tf.exp(
                    tf.math.scalar_mul(self.EXP_SCALE,
                                       self.dis_qs - self.critic_tf)), 0.01,
                100))
        self.weights = self.weights / tf.reduce_sum(self.weights)
        self.actor_dis_loss = tf.reduce_sum(
            tf.math.multiply(
                self.weights,
                tf.reduce_mean(tf.square(self.actor_tf - self.dis_actions),
                               axis=1)))
        actor_dis_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        logger.info('  actor (distillation) shapes: {}'.format(actor_dis_shapes))
        self.actor_dis_grads = U.flatgrad(self.actor_dis_loss,
                                          self.actor.trainable_vars,
                                          clip_norm=self.clip_norm)
        self.actor_dis_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                           beta1=0.9,
                                           beta2=0.999,
                                           epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
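        # Pop-Art: whenever ret_rms changes, rescale the critic's output layer (kernel M and
        # bias b below) so that the *denormalized* Q predictions stay unchanged.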
        self.old_std = tf.placeholder(tf.float32,
                                      shape=[1],
                                      name='old_std_' + self.prefix)
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32,
                                       shape=[1],
                                       name='old_mean_' + self.prefix)
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def pi_batch(self, obs_batch):
        actor_tf = self.actor_tf
        feed_dict = {self.obs0: obs_batch}
        action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                  feed_dict=feed_dict)
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q,
                                     feed_dict={
                                         self.obs1:
                                         batch['obs1'],
                                         self.rewards:
                                         batch['rewards'],
                                         self.terminals1:
                                         batch['terminals1'].astype('float32'),
                                     })

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
            })

        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def dis_train(self):
        if self.partner_agent.memory.nb_entries > 0:
            batch = self.partner_agent.memory.sample(
                batch_size=self.dis_batch_size)

            #print('############ Checking ##############')
            #print('Batch: ', batch)
            #print('Batch shape: ', batch['obs0'].shape)

            obs_batch = batch['obs0']
            # Note that, here, the q is denormalized
            partner_action_batch, partner_q_batch = self.partner_agent.pi_batch(
                obs_batch)

            # Actor Distillation
            ops = [
                self.actor_dis_grads, self.actor_dis_loss, self.weights,
                self.critic_tf
            ]
            actor_dis_grads, actor_dis_loss, weights, qs = self.sess.run(
                ops,
                feed_dict={
                    self.obs0: batch['obs0'],
                    self.dis_actions: partner_action_batch,
                    self.actions: partner_action_batch,
                    self.dis_qs: partner_q_batch,
                    self.EXP_SCALE: self.exp_scale,
                })
            #print('########## Checking ###########')
            #for i in range(weights.shape[0]):
            #    print(weights[i], ' ', partner_q_batch[0], ' ', qs[0])
            #print('Sum: ', np.sum(weights))

            #print('Action Batch: ', action_batch)
            #print('Actor Distiallation Loss: ', actor_dis_loss)
            #print('Action Batch: ', action_batch)
            #print('Q Batch: ', q_batch)
            self.actor_dis_optimizer.update(actor_dis_grads,
                                            stepsize=self.actor_dis_lr)

            return actor_dis_loss
        return 0.0

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0: self.stats_sample['obs0'],
                                   self.actions: self.stats_sample['actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })

        mean_distance = MPI.COMM_WORLD.allreduce(
            distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })

    ### For dual imitation
    def set_partner_agent(self, agent):
        self.partner_agent = agent
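The distillation machinery above only does anything once two agents are cross-wired. A minimal usage sketch follows; env, actor_a/critic_a/memory_a, their *_b counterparts, noise_a/noise_b and total_steps are hypothetical placeholders, not defined in this snippet:

agent_a = DDPG('a', actor_a, critic_a, memory_a,
               env.observation_space.shape, env.action_space.shape,
               action_noise=noise_a)
agent_b = DDPG('b', actor_b, critic_b, memory_b,
               env.observation_space.shape, env.action_space.shape,
               action_noise=noise_b)
agent_a.set_partner_agent(agent_b)
agent_b.set_partner_agent(agent_a)

sess = tf.Session()
agent_a.initialize(sess)
agent_b.initialize(sess)                 # initialize() re-runs the global initializer,
sess.run(agent_a.target_init_updates)    # so re-sync agent_a's target networks afterwards
agent_a.reset()
agent_b.reset()

obs = env.reset()
for step in range(total_steps):
    action, q = agent_a.pi(obs, apply_noise=True, compute_Q=True)
    new_obs, reward, done, _ = env.step(action)
    agent_a.store_transition(obs, action, reward, new_obs, done)
    obs = env.reset() if done else new_obs
    if done:
        agent_a.reset()
    if memory_a.nb_entries >= agent_a.batch_size:
        critic_loss, actor_loss = agent_a.train()   # standard DDPG update from its own memory
        dis_loss = agent_a.dis_train()              # Q-weighted imitation of the partner's actions
        agent_a.update_target_net()
    # agent_b is stepped and trained symmetrically (omitted here).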
Example #28
class DDPG(object):
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True,
                 adaptive_param_noise_policy_threshold=.1,
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                      self.actor_tf,
                                                      reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(
            critic(normalized_obs1,
                   actor(normalized_obs1, reuse=True),
                   reuse=True), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1
        # Learned Lagrange-style multiplier. Its name prefix ('lag_mult') is what
        # setup_optimizer() matches when fetching it from the TRAINABLE_VARIABLES
        # collection with scope='lag_mult'.
        self.lag_mult = tf.Variable(1.0, name='lag_mult', trainable=True)

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure a separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_optimizer(self):
        logger.info('setting up actor and critic optimizer')
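        # Unlike the per-network optimizers in the other variants, this one drives the actor,
        # the critic and the learned multiplier with a single joint objective,
        #     actor_loss + lag_mult * critic_loss,
        # computed below and differentiated w.r.t. each variable group separately.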
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.target_Q, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))

        self.actor_grads = U.flatgrad(self.actor_loss +
                                      self.lag_mult * self.critic_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)
        self.critic_grads = U.flatgrad(self.actor_loss +
                                       self.lag_mult * self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)
        self.lag_grads = U.flatgrad(
            self.actor_loss + self.lag_mult * self.critic_loss,
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                              scope='lag_mult'))
        self.lag_optimizer = MpiAdam(var_list=tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='lag_mult'),
                                     beta1=0.9,
                                     beta2=0.999,
                                     epsilon=1e-08)

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.target_Q, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q,
                                     feed_dict={
                                         self.obs1:
                                         batch['obs1'],
                                         self.rewards:
                                         batch['rewards'],
                                         self.terminals1:
                                         batch['terminals1'].astype('float32'),
                                     })

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss, self.lag_grads, self.lag_mult
        ]
        actor_grads, actor_loss, critic_grads, critic_loss, lag_grads, lag_mult = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)
        self.lag_optimizer.update(lag_grads, stepsize=self.actor_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.lag_optimizer.sync()

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0: self.stats_sample['obs0'],
                                   self.actions: self.stats_sample['actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })

        mean_distance = mpi_mean(distance)
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })
Example #29
class DDPG(object):
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
        gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
        batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
        adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1,
        critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma # discount factor
        self.tau = tau # step size for the soft updates of the target network weights
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr # learning rate for the actor network
        self.critic_lr = critic_lr # learning rate for the critic network
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg # L2 regularization coefficient for the critic network

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
            self.observation_range[0], self.observation_range[1])

        # Return (i.e. Q-target) normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1  # (1 - terminals1) drops the bootstrap term on terminal transitions

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure a separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1]) # TODO: maybe comment this line
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std : np.array([old_std]),
                self.old_mean : np.array([old_mean]),
            })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        # self.sess.run(tf.global_variables_initializer()) # is done in /baselines/ddpg/training.py
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats.update(self.param_noise.get_stats())

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })
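For reference, a minimal single-agent training loop against the class above; env, actor, critic, memory, action_noise and nb_steps are assumed to be constructed elsewhere (e.g. as in baselines/ddpg/training.py):

agent = DDPG(actor, critic, memory,
             observation_shape=env.observation_space.shape,
             action_shape=env.action_space.shape,
             action_noise=action_noise)

sess = tf.Session()
sess.run(tf.global_variables_initializer())   # this variant's initialize() skips the global init
agent.initialize(sess)
agent.reset()

obs = env.reset()
for t in range(nb_steps):
    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
    new_obs, reward, done, _ = env.step(action)
    agent.store_transition(obs, action, reward, new_obs, done)
    obs = new_obs
    if done:
        agent.reset()
        obs = env.reset()
    if memory.nb_entries >= agent.batch_size:
        critic_loss, actor_loss = agent.train()
        agent.update_target_net()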
Example #30
class DDPG(object):
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True,
                 adaptive_param_noise_policy_threshold=.1,
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.,
                 sigma=None,
                 surrogate=False,
                 expected=False,
                 sigma_num_samples=10,
                 random_actor=False,
                 grad_num_samples=10):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.noises = tf.placeholder(tf.float32,
                                     shape=(None, ) + action_shape,
                                     name='noises')
        self.prev_noises = tf.placeholder(tf.float32,
                                          shape=(None, ) + action_shape,
                                          name='prev_noises')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.sigma = sigma
        self.expected = expected
        self.surrogate = surrogate
        self.random_actor = random_actor

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        if self.sigma is not None and self.action_noise is not None and expected and self.random_actor:
            critic_with_actor_tf_list = []
            for i in range(grad_num_samples):
                # noise = self.action_noise.memory_noise(self.prev_noises)
                noise = tf.random_normal(tf.shape(self.actor_tf),
                                         mean=0.0,
                                         stddev=0.2)
                noisy_action = self.actor_tf + noise
                clipped_action = tf.clip_by_value(noisy_action,
                                                  self.action_range[0],
                                                  self.action_range[1])
                current_critic = critic(normalized_obs0,
                                        clipped_action,
                                        reuse=True)
                critic_with_actor_tf = denormalize(
                    tf.clip_by_value(current_critic, self.return_range[0],
                                     self.return_range[1]), self.ret_rms)
                critic_with_actor_tf_list.append(critic_with_actor_tf)
            self.critic_with_actor_tf = tf.reduce_mean(
                tf.concat(critic_with_actor_tf_list, axis=1),
                axis=1, keepdims=True)
        else:
            self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                          self.actor_tf,
                                                          reuse=True)
            self.critic_with_actor_tf = denormalize(
                tf.clip_by_value(self.normalized_critic_with_actor_tf,
                                 self.return_range[0], self.return_range[1]),
                self.ret_rms)
        action = target_actor(normalized_obs1)
        if self.sigma is not None and self.action_noise is not None and expected:
            # noise = self.action_noise.memory_noise(self.noises, num_samples=num_samples) # preparation for OU noise

            Q_obs1_list = []
            for i in range(sigma_num_samples):
                reuse = i > 0  # build target-critic variables only on the first sample
                # noise = self.action_noise.memory_noise(self.noises)
                noise = tf.random_normal(tf.shape(action),
                                         mean=0.0,
                                         stddev=0.2)
                noisy_action = action + noise
                clipped_action = tf.clip_by_value(noisy_action,
                                                  self.action_range[0],
                                                  self.action_range[1])
                Q_obs1_list.append(
                    denormalize(
                        target_critic(normalized_obs1,
                                      clipped_action,
                                      reuse=reuse), self.ret_rms))
            Q_obs1 = tf.reduce_mean(tf.concat(Q_obs1_list, axis=1),
                                    axis=1,
                                    keepdims=True)
        else:
            action = tf.clip_by_value(action, self.action_range[0],
                                      self.action_range[1])
            Q_obs1 = denormalize(target_critic(normalized_obs1, action),
                                 self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)

        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        action = self.sess.run(actor_tf, feed_dict=feed_dict)
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            prev_noise = self.action_noise.prev_noise()
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        else:
            noise = None
            prev_noise = None
        action = np.clip(action, self.action_range[0], self.action_range[1])
        if compute_Q:
            # Evaluate the critic on the final (clipped) action; running
            # actor_tf here would only return the action again, not a Q value.
            feed_dict = {self.obs0: [obs], self.actions: [action]}
            q = self.sess.run([self.critic_tf], feed_dict=feed_dict)
        else:
            q = None
        return action, q, noise, prev_noise

    def pi_surrogate(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        actor_action = self.sess.run(actor_tf, feed_dict=feed_dict)
        actor_action = actor_action.flatten()
        if self.action_noise is not None and apply_noise:
            prev_noise = self.action_noise.prev_noise()
            noise = self.action_noise()
            assert noise.shape == actor_action.shape
            action = actor_action + noise
        else:
            noise = None
            prev_noise = None
            action = actor_action  # no noise applied; avoids an undefined name below

        action = np.clip(action, self.action_range[0], self.action_range[1])
        if compute_Q:
            # As in pi(), query the critic on the clipped action rather than
            # re-running the actor.
            feed_dict = {self.obs0: [obs], self.actions: [action]}
            q = self.sess.run([self.critic_tf], feed_dict=feed_dict)
        else:
            q = None
        return action, q, noise, prev_noise, actor_action

    #
    # def pi(self, obs, apply_noise=True, compute_Q=True):
    #     if self.param_noise is not None and apply_noise:
    #         actor_tf = self.perturbed_actor_tf
    #     else:
    #         actor_tf = self.actor_tf
    #     feed_dict = {self.obs0: [obs]}
    #     action = self.sess.run(actor_tf, feed_dict=feed_dict)
    #     action = action.flatten()
    #     if self.action_noise is not None and apply_noise:
    #         noise = self.action_noise()
    #         assert noise.shape == action.shape
    #     else:
    #         noise = None
    #     action = np.clip(action, self.action_range[0], self.action_range[1])
    #     if compute_Q:
    #         feed_dict = {self.obs0: [obs], self.actions: [action]}
    #         q = self.sess.run([actor_tf], feed_dict=feed_dict)
    #     else:
    #         q = None
    #     if self.action_noise is not None and apply_noise:
    #         noise = self.action_noise()
    #         assert noise.shape == action.shape
    #         action += noise
    #         action = np.clip(action, self.action_range[0], self.action_range[1])
    #     return action , q, noise

    # def pi(self, obs, apply_noise=True, compute_Q=True):
    #     if self.param_noise is not None and apply_noise:
    #         actor_tf = self.perturbed_actor_tf
    #     else:
    #         actor_tf = self.actor_tf
    #     feed_dict = {self.obs0: [obs]}
    #     if compute_Q:
    #         self.sess.run
    #         action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
    #     else:
    #         action = self.sess.run(actor_tf, feed_dict=feed_dict)
    #         q = None
    #     action = action.flatten()
    #     if self.action_noise is not None and apply_noise:
    #         noise = self.action_noise()
    #         assert noise.shape == action.shape
    #         action += noise
    #     action = np.clip(action, self.action_range[0], self.action_range[1])
    #     return action, q, noise

    def store_transition(self, obs0, action, reward, obs1, terminal1, noise,
                         prev_noise):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1, noise,
                           prev_noise)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                    self.noises: batch['noises'],
                    self.prev_noises: batch['prev_noises']
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
                self.noises: batch['noises'],
                self.prev_noises: batch['prev_noises'],
            })

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.noises: batch['noises'],
                self.prev_noises: batch['prev_noises'],
                self.critic_target: target_Q,
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
            self.noises: self.stats_sample['noises'],
            self.prev_noises: self.stats_sample['prev_noises'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        mean_distance = MPI.COMM_WORLD.allreduce(
            distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })
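When sigma, action_noise and expected are all set, the class above replaces the single target-critic evaluation with an average over sigma_num_samples Gaussian noise samples, which smooths the bootstrap target. A minimal NumPy sketch of that smoothed target follows; q_target stands in for the target critic and is a hypothetical callable, and the 0.2 stddev and action clipping mirror the TF code above.

import numpy as np

def smoothed_target_q(q_target, obs1, target_action, rewards, terminals,
                      gamma=0.99, num_samples=10, noise_std=0.2,
                      action_range=(-1.0, 1.0)):
    # Average the target critic over sampled action noise (sketch).
    q_samples = []
    for _ in range(num_samples):
        noise = np.random.normal(0.0, noise_std, size=target_action.shape)
        clipped = np.clip(target_action + noise,
                          action_range[0], action_range[1])
        q_samples.append(q_target(obs1, clipped))
    q_obs1 = np.mean(q_samples, axis=0)                  # expectation over noise
    return rewards + (1.0 - terminals) * gamma * q_obs1  # Bellman target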
Example #31
class DDPG(object):
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
        gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
        batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
        adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1,
        critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean
        
        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        ops = []
        names = []
        
        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']
        
        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']
        
        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']
        
        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std : np.array([old_std]),
                self.old_mean : np.array([old_mean]),
            })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.
        
        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        mean_distance = mpi_mean(distance)
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })
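The methods of the class above compose into the usual DDPG loop: act with exploration noise, store transitions, then alternate gradient updates with soft target updates. A minimal driver sketch follows; env is assumed to be a Gym-style environment, agent an instance of this class, and the step counts are illustrative.

import tensorflow as tf

def train_loop(env, agent, nb_epochs=100, nb_rollout_steps=100, nb_train_steps=50):
    with tf.Session() as sess:
        agent.initialize(sess)
        obs = env.reset()
        agent.reset()
        for epoch in range(nb_epochs):
            # Collect experience with exploration noise.
            for _ in range(nb_rollout_steps):
                action, q = agent.pi(obs, apply_noise=True, compute_Q=False)
                new_obs, reward, done, _ = env.step(action)
                agent.store_transition(obs, action, reward, new_obs, done)
                obs = new_obs
                if done:
                    agent.reset()
                    obs = env.reset()
            # Fit critic/actor, adapt parameter noise, and move the targets.
            for _ in range(nb_train_steps):
                agent.adapt_param_noise()  # no-op unless param_noise is set
                critic_loss, actor_loss = agent.train()
                agent.update_target_net()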