Example #1
def flatten_grads(var_list, grads, clip_grad_range=None):
    """Flattens a variables and their gradients.
    """
    if clip_grad_range is not None:
        return tf.concat([
            tf.reshape(tf.clip_by_value(grad, *clip_grad_range), [U.numel(v)])
            for (v, grad) in zip(var_list, grads)
        ], 0)
    else:
        return tf.concat([
            tf.reshape(grad, [U.numel(v)])
            for (v, grad) in zip(var_list, grads)
        ], 0)
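Every snippet on this page leans on a small set of flat-parameter helpers, imported as U or tf_util. The following is only a rough sketch of what numel and its companions usually look like in this style of codebase (an assumption about the helper module, not the exact library source):

import numpy as np

def var_shape(tensor):
    # Static shape of a TF variable as a plain list of ints.
    out = tensor.get_shape().as_list()
    assert all(isinstance(dim, int) for dim in out), \
        "shape should be fully known at graph-construction time"
    return out

def intprod(shape):
    # Total number of elements implied by a shape.
    return int(np.prod(shape))

def numel(tensor):
    # Number of scalar entries in a variable; used to size the flat reshapes above.
    return intprod(var_shape(tensor))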
Example #2
    def __init__(self,
                 var_list,
                 *,
                 decay=0.9,
                 momentum=0.0,
                 epsilon=1e-08,
                 centered=False,
                 scale_grad_by_procs=True,
                 comm=None):
        self.var_list = var_list
        self.decay = decay
        self.momentum = momentum
        # self.beta1 = beta1
        # self.beta2 = beta2
        self.epsilon = epsilon
        self.centered = centered
        self.scale_grad_by_procs = scale_grad_by_procs
        size = sum(U.numel(v) for v in var_list)

        self.mean_square = np.zeros(size, 'float32')
        self.mom = np.zeros(size, 'float32')
        if centered:
            self.mean_grad = np.zeros(size, 'float32')
        self.t = 0
        self.setfromflat = U.SetFromFlat(var_list)
        self.getflat = U.GetFlat(var_list)
        self.comm = MPI.COMM_WORLD if comm is None else comm
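The state initialized above (mean_square, mom and, when centered, mean_grad) is exactly what a flat, MPI-averaged RMSProp step consumes. The update method below is only a sketch of how such a step usually looks, assuming the same module-level imports (numpy as np, mpi4py's MPI) and the getflat/setfromflat helpers created in __init__; it is not taken from the original class:

    def update(self, local_grad, stepsize):
        # Average the flat gradient across MPI workers.
        local_grad = local_grad.astype('float32')
        global_grad = np.zeros_like(local_grad)
        self.comm.Allreduce(local_grad, global_grad, op=MPI.SUM)
        if self.scale_grad_by_procs:
            global_grad /= self.comm.Get_size()

        self.t += 1
        # Running average of squared gradients (and of gradients, if centered).
        self.mean_square = self.decay * self.mean_square + (1 - self.decay) * np.square(global_grad)
        denom = self.mean_square
        if self.centered:
            self.mean_grad = self.decay * self.mean_grad + (1 - self.decay) * global_grad
            denom = denom - np.square(self.mean_grad)
        # Momentum step scaled by the RMS denominator, applied to the flat parameters.
        self.mom = self.momentum * self.mom + stepsize * global_grad / (np.sqrt(denom) + self.epsilon)
        self.setfromflat(self.getflat() - self.mom)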
Example #3
 def __init__(self, var_list, beta1=0.9, beta2=0.999, epsilon=1e-08):
     self.var_list = var_list
     self.beta1 = beta1
     self.beta2 = beta2
     self.epsilon = epsilon
     size = sum(U.numel(v) for v in var_list)
     self.m = np.zeros(size, 'float32')
     self.v = np.zeros(size, 'float32')
     self.t = 0
     self.setfromflat = U.SetFromFlat(var_list)
     self.getflat = U.GetFlat(var_list)
Example #4
def flatten_grads(var_list, grads):
    """
    Flattens the gradients of the given variables into a single 1-D tensor.

    :param var_list: ([TensorFlow Tensor]) the variables
    :param grads: ([TensorFlow Tensor]) the gradients
    :return: (TensorFlow Tensor) the flattened gradients concatenated into one tensor
    """
    return tf.concat([
        tf.reshape(grad, [tf_util.numel(v)])
        for (v, grad) in zip(var_list, grads)
    ], 0)
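As a hedged usage sketch, flatten_grads is normally paired with tf.gradients so that a flat optimizer (such as the MpiAdam-style classes in the other examples) can consume one 1-D gradient vector; loss and var_list below are placeholders coming from the surrounding model code:

grads = tf.gradients(loss, var_list)          # per-variable gradients
flat_grad = flatten_grads(var_list, grads)    # single 1-D tensor, same layout as GetFlat/SetFromFlat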
Example #5
 def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None):
     self.var_list = var_list
     self.beta1 = beta1
     self.beta2 = beta2
     self.epsilon = epsilon
     self.scale_grad_by_procs = scale_grad_by_procs
     size = sum(U.numel(v) for v in var_list)
     self.m = np.zeros(size, 'float32')
     self.v = np.zeros(size, 'float32')
     self.t = 0
     self.setfromflat = U.SetFromFlat(var_list)
     self.getflat = U.GetFlat(var_list)
     self.comm = MPI.COMM_WORLD if comm is None else comm
Example #6
 def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None):
     self.var_list = var_list
     self.beta1 = beta1
     self.beta2 = beta2
     self.epsilon = epsilon
     self.scale_grad_by_procs = scale_grad_by_procs
     size = sum(U.numel(v) for v in var_list)
     self.m = np.zeros(size, 'float32')
     self.v = np.zeros(size, 'float32')
     self.t = 0
     self.setfromflat = U.SetFromFlat(var_list)
     self.getflat = U.GetFlat(var_list)
     self.comm = MPI.COMM_WORLD if comm is None and MPI is not None else comm
Example #7
    def __init__(self, name, reuse=False, *args, **kwargs):
        with tf.variable_scope(name):
            if reuse:
                tf.get_variable_scope().reuse_variables()
            self._init(*args, **kwargs)
            self.scope = tf.get_variable_scope().name

        # Set up functions to get and set the policy parameters:
        var = self.get_trainable_variables()
        var_list = [v for v in var]
        self.flatten_var = tf.concat(
            axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in var_list])

        self.get_flat = U.GetFlat(var_list)
        self.set_from_flat = U.SetFromFlat(var_list)

        self.setupadam(self.l2_reg)
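The flat getters and setters created above let the whole policy be treated as one parameter vector, e.g. when synchronizing workers or saving snapshots. A hypothetical round-trip, where policy stands for an instance of this class:

theta = policy.get_flat()        # 1-D numpy array holding every trainable parameter
policy.set_from_flat(theta)      # write the same (or an updated) vector back into the graph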
Example #8
 def __init__(self, var_list, **_3to2kwargs):
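     # _3to2kwargs emulates Python 3 keyword-only arguments in Python 2 syntax
     # (output of the 3to2 translator): each block below pops one keyword and
     # falls back to the same default used by the Python 3 signatures above.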
     if 'comm' in _3to2kwargs:
         comm = _3to2kwargs['comm']
         del _3to2kwargs['comm']
     else:
         comm = None
     if 'scale_grad_by_procs' in _3to2kwargs:
         scale_grad_by_procs = _3to2kwargs['scale_grad_by_procs']
         del _3to2kwargs['scale_grad_by_procs']
     else:
         scale_grad_by_procs = True
     if 'epsilon' in _3to2kwargs:
         epsilon = _3to2kwargs['epsilon']
         del _3to2kwargs['epsilon']
     else:
         epsilon = 1e-08
     if 'beta2' in _3to2kwargs:
         beta2 = _3to2kwargs['beta2']
         del _3to2kwargs['beta2']
     else:
         beta2 = 0.999
     if 'beta1' in _3to2kwargs:
         beta1 = _3to2kwargs['beta1']
         del _3to2kwargs['beta1']
     else:
         beta1 = 0.9
     self.var_list = var_list
     self.beta1 = beta1
     self.beta2 = beta2
     self.epsilon = epsilon
     self.scale_grad_by_procs = scale_grad_by_procs
     size = sum(U.numel(v) for v in var_list)
     self.m = np.zeros(size, u'float32')
     self.v = np.zeros(size, u'float32')
     self.t = 0
     self.setfromflat = U.SetFromFlat(var_list)
     self.getflat = U.GetFlat(var_list)
     self.comm = MPI.COMM_WORLD if comm is None else comm
Example #9
    def __init__(self,
                 var_list,
                 *,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-08,
                 scale_grad_by_procs=True,
                 comm=None,
                 sess=None):
        """
        A parallel MPI implementation of the Adam optimizer for TensorFlow
        https://arxiv.org/abs/1412.6980

        :param var_list: ([TensorFlow Tensor]) the variables
        :param beta1: (float) Adam beta1 parameter
        :param beta2: (float) Adam beta2 parameter
        :param epsilon: (float) to help with preventing arithmetic issues
        :param scale_grad_by_procs: (bool) if the scaling should be done by processes
        :param comm: (MPI Communicators) if None, MPI.COMM_WORLD
        :param sess: (TensorFlow Session) if None, tf.get_default_session()
        """
        self.var_list = var_list
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.scale_grad_by_procs = scale_grad_by_procs
        size = sum(tf_utils.numel(v) for v in var_list)
        # Exponential moving average of gradient values
        # "first moment estimate" m in the paper
        self.exp_avg = np.zeros(size, 'float32')
        # Exponential moving average of squared gradient values
        # "second raw moment estimate" v in the paper
        self.exp_avg_sq = np.zeros(size, 'float32')
        self.step = 0
        self.setfromflat = tf_utils.SetFromFlat(var_list, sess=sess)
        self.getflat = tf_utils.GetFlat(var_list, sess=sess)
        self.comm = MPI.COMM_WORLD if comm is None else comm
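For completeness, here is a sketch of the update step that typically accompanies an __init__ like the one above, modelled on baselines-style MpiAdam optimizers and using only the attributes defined here; the original method is not shown in this example:

    def update(self, local_grad, learning_rate):
        # Sum the flat gradient over MPI workers, optionally averaging by process count.
        local_grad = local_grad.astype('float32')
        global_grad = np.zeros_like(local_grad)
        self.comm.Allreduce(local_grad, global_grad, op=MPI.SUM)
        if self.scale_grad_by_procs:
            global_grad /= self.comm.Get_size()

        self.step += 1
        # Bias-corrected step size (Kingma & Ba, 2015).
        step_size = learning_rate * np.sqrt(1 - self.beta2 ** self.step) / (1 - self.beta1 ** self.step)
        self.exp_avg = self.beta1 * self.exp_avg + (1 - self.beta1) * global_grad
        self.exp_avg_sq = self.beta2 * self.exp_avg_sq + (1 - self.beta2) * np.square(global_grad)
        delta = -step_size * self.exp_avg / (np.sqrt(self.exp_avg_sq) + self.epsilon)
        self.setfromflat(self.getflat() + delta)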
Example #10
    def __init__(self, env, world, policies, nsteps, load_path, rho, max_kl,
                 ent_coef, vf_coef, max_grad_norm, sync):
        self.sess = sess = U.get_session()
        self.env = env
        self.world = world
        self.sync = sync
        self.max_kl = max_kl
        if hasattr(env, 'num_envs'):
            self.n_batches = n_batches = nsteps * env.num_envs
        else:
            self.n_batches = n_batches = nsteps

        if MPI is not None:
            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
        else:
            self.nworkers = 1
            self.rank = 0

        cpus_per_worker = 1
        U.get_session(config=tf.ConfigProto(
            allow_soft_placement=True,
            inter_op_parallelism_threads=cpus_per_worker,
            intra_op_parallelism_threads=cpus_per_worker))

        # GLOBAL PLACEHOLDERS
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        self.pi_n, self.oldpi_n, self.vfadam_n, self.exchange_n, self.to_exchange_n = [], [], [], [], []
        self.compute_jtvp_n, self.compute_fvp_n, self.compute_losses_n, self.compute_vfloss_n = [], [], [], []
        self.set_from_flat_n, self.get_flat_n = [], []
        for i in range(world.n):
            name_scope = world.agents[i].name.replace(' ', '')
            with tf.variable_scope(name_scope):
                # OBSERVATION PLACEHOLDER
                ob_dtype = env.observation_space[i].dtype
                ob_shape = env.observation_space[i].shape
                OB = tf.placeholder(dtype=ob_dtype, shape=(None, ) + ob_shape)
                # Policy
                with tf.variable_scope("pi"):
                    pi = policies[i](n_batches, observ_placeholder=OB)
                with tf.variable_scope("oldpi"):
                    oldpi = policies[i](n_batches, observ_placeholder=OB)

                # CREATE OTHER PLACEHOLDERS
                AC = pi.pdtype.sample_placeholder([None])
                ADV = tf.placeholder(dtype=tf.float32, shape=[None])
                R = tf.placeholder(dtype=tf.float32, shape=[None])
                OLDVPRED = tf.placeholder(dtype=tf.float32, shape=[None])
                NB = tf.placeholder(dtype=tf.int32, shape=None)
                A = tf.placeholder(dtype=tf.float32, shape=None)

                ratio = tf.exp(
                    pi.pd.logp(AC) - oldpi.pd.logp(AC)
                )  # Be careful about the dimensionality!
                surrgain = tf.reduce_mean(ADV * ratio)
                kloldnew = oldpi.pd.kl(pi.pd)
                meankl = tf.reduce_mean(kloldnew)
                sync_err = A * tf.reshape(ratio,
                                          (self.n_batches, )) - tf.reshape(
                                              tf.gather(pi.net.z, NB),
                                              (self.n_batches, ))
                sync_loss = tf.reduce_sum(tf.reshape(tf.gather(pi.net.z, NB), (self.n_batches,)) * sync_err) + \
                            0.5 * rho * tf.reduce_sum(tf.square(sync_err))
                lagrange_loss = -surrgain + sync_loss
                losses = [lagrange_loss, surrgain, meankl]
                dist = meankl

                var_list = pi.net.w
                klgrads = tf.gradients(dist, var_list)
                flat_tangent = tf.placeholder(dtype=tf.float32,
                                              shape=[None],
                                              name="flat_tan")

                shapes = [var.get_shape().as_list() for var in var_list]
                start = 0
                tangents = []
                for shape in shapes:
                    sz = U.intprod(shape)
                    tangents.append(
                        tf.reshape(flat_tangent[start:start + sz], shape))
                    start += sz

                jjvp = [tf.zeros(shape, dtype=tf.float32) for shape in shapes]
                jtvp = [tf.zeros(shape, dtype=tf.float32) for shape in shapes]
                right_b = -ADV + A * tf.gather(
                    pi.net.p, NB) - rho * A * tf.gather(pi.net.z, NB)
                # Use a separate index so the agent loop variable `i` above is not shadowed.
                for batch_idx in range(self.n_batches):
                    ratio_grad_b = tf.gradients(ratio[batch_idx], var_list)
                    jvp_b = tf.add_n([
                        tf.reduce_sum(g * tangent)
                        for (g, tangent) in zipsame(ratio_grad_b, tangents)
                    ])
                    jjvp = [
                        tf.add_n([jj, gg * jvp_b])
                        for (jj, gg) in zipsame(jjvp, ratio_grad_b)
                    ]
                    jtvp = [
                        tf.add_n([jt, gt * right_b[batch_idx]])
                        for (jt, gt) in zipsame(jtvp, ratio_grad_b)
                    ]
                    print(batch_idx)

                jjvp = tf.concat(
                    axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in jjvp])
                jtvp = tf.concat(
                    axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in jtvp])
                gvp = tf.add_n([
                    tf.reduce_sum(g * tangent)
                    for (g, tangent) in zipsame(klgrads, tangents)
                ])  #pylint: disable=E1111
                fvp = tf.add_n([U.flatgrad(gvp, var_list), rho * jjvp])

                # Define the value loss
                vpredclipped = OLDVPRED + tf.clip_by_value(
                    pi.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE)
                # vpredclipped = tf.clip_by_value(pi.vf, OLDVPRED*(1-CLIPRANGE), OLDVPRED*(1+CLIPRANGE))
                vferr = tf.square(pi.vf - R)
                vferr2 = tf.square(vpredclipped - R)
                vf_loss = .5 * tf.reduce_mean(tf.maximum(vferr, vferr2))
                vfadam = MpiAdam(pi.net.v)

                compute_jtvp = U.function([OB, AC, ADV, A, NB], jtvp)
                compute_fvp = U.function([flat_tangent, OB, AC, ADV], fvp)
                compute_losses = U.function([OB, AC, ADV, A, NB], losses)
                compute_vfloss = U.function([OB, R, OLDVPRED, CLIPRANGE],
                                            vf_loss)
                exchange = pi.net.exchange(sess, OB, AC, CLIPRANGE, NB, rho)
                to_exchange = U.function(
                    [OB, AC, ADV, NB, CLIPRANGE],
                    [ratio, tf.gather(pi.net.p, NB)])

                get_flat = U.GetFlat(var_list)
                set_from_flat = U.SetFromFlat(var_list)

            self.pi_n.append(pi)
            self.oldpi_n.append(oldpi)
            self.get_flat_n.append(get_flat)
            self.set_from_flat_n.append(set_from_flat)
            self.vfadam_n.append(vfadam)
            self.exchange_n.append(exchange)
            self.to_exchange_n.append(to_exchange)
            self.compute_jtvp_n.append(compute_jtvp)
            self.compute_fvp_n.append(compute_fvp)
            self.compute_losses_n.append(compute_losses)
            self.compute_vfloss_n.append(compute_vfloss)

        # Update the old policy networks
        updates = []
        for i in range(len(world.agents)):
            name_scope = world.agents[i].name.replace(' ', '')
            old_vars = get_trainable_variables("{}/oldpi".format(name_scope))
            now_vars = get_trainable_variables("{}/pi".format(name_scope))
            updates += [
                tf.assign(oldv, nowv)
                for (oldv, nowv) in zipsame(old_vars, now_vars)
            ]
            updates += [
                tf.assign(self.pi_n[i].net.z, tf.ones_like(self.pi_n[i].net.z))
            ]
        self.assign_old_eq_new = U.function([], [], updates=updates)

        @contextmanager
        def timed(msg):
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))

        self.timed = timed

        def allmean(x):
            assert isinstance(x, np.ndarray)
            if MPI is not None:
                out = np.empty_like(x)
                MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
                out /= self.nworkers
            else:
                out = np.copy(x)

            return out

        self.allmean = allmean

        # Initialization
        U.initialize()
        if load_path is not None:
            self.load(load_path)

            # Re-apply each policy's flat parameters and log their sum as a quick consistency check.
            for i in range(len(self.pi_n)):
                th_init = self.get_flat_n[i]()
                self.set_from_flat_n[i](th_init)
                print("Init param sum", th_init.sum(), flush=True)

        for vfadam in self.vfadam_n:
            vfadam.sync()
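Example #10 constructs compute_fvp (a Fisher-vector-product function) but does not show how it is consumed; in TRPO-style code the search direction is obtained by running conjugate gradient against exactly such a product. The following is a generic sketch of that solver, i.e. the standard algorithm rather than code from this file:

import numpy as np

def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    # Solve A x = b when only the matrix-vector product f_Ax(p) = A p is available.
    p = b.copy()
    r = b.copy()
    x = np.zeros_like(b)
    r_dot_r = r.dot(r)
    for _ in range(cg_iters):
        z = f_Ax(p)
        alpha = r_dot_r / p.dot(z)
        x += alpha * p
        r -= alpha * z
        new_r_dot_r = r.dot(r)
        p = r + (new_r_dot_r / r_dot_r) * p
        r_dot_r = new_r_dot_r
        if r_dot_r < residual_tol:
            break
    return x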
Example #11
def flatten_grads(var_list, grads):
    """Flattens a variables and their gradients.
    """
    return tf.concat(
        [tf.reshape(grad, [U.numel(v)]) for (v, grad) in zip(var_list, grads)],
        0)
Example #12
def flatten_grads(var_list, grads):
    """Flattens a variables and their gradients.
    """
    return tf.concat([tf.reshape(grad, [U.numel(v)])
                      for (v, grad) in zip(var_list, grads)], 0)
Example #13
def get_layer_flat(var_list):
    op = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in var_list])
    return tf.get_default_session().run(op)
Example #14
 def nograd(self, var_list):
     return tf.concat(axis=0, values=[
         tf.reshape(tf.zeros_like(v), [U.numel(v)])
         for v in var_list
     ])