def flatten_grads(var_list, grads, clip_grad_range=None):
    """Flattens variables and their gradients."""
    if clip_grad_range is not None:
        return tf.concat([
            tf.reshape(tf.clip_by_value(grad, *clip_grad_range), [U.numel(v)])
            for (v, grad) in zip(var_list, grads)
        ], 0)
    else:
        return tf.concat([
            tf.reshape(grad, [U.numel(v)])
            for (v, grad) in zip(var_list, grads)
        ], 0)
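# Illustrative usage sketch (an assumption, not part of the original source):
# given a scalar loss over two variables, flatten_grads concatenates the
# per-variable gradients into one flat vector, clipping each element to
# [-1, 1] when a clip range is supplied (TF1-style graph mode).
w = tf.Variable(tf.ones((2, 3)))
b = tf.Variable(tf.zeros((3,)))
loss = tf.reduce_sum(tf.square(tf.matmul(tf.ones((4, 2)), w) + b))
grads = tf.gradients(loss, [w, b])
flat_grad = flatten_grads([w, b], grads, clip_grad_range=(-1.0, 1.0))
# flat_grad is a rank-1 tensor of length numel(w) + numel(b) == 9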
def __init__(self, var_list, *, decay=0.9, momentum=0.0, epsilon=1e-08,
             centered=False, scale_grad_by_procs=True, comm=None):
    self.var_list = var_list
    self.decay = decay
    self.momentum = momentum
    # self.beta1 = beta1
    # self.beta2 = beta2
    self.epsilon = epsilon
    self.centered = centered
    self.scale_grad_by_procs = scale_grad_by_procs
    size = sum(U.numel(v) for v in var_list)
    self.mean_square = np.zeros(size, 'float32')
    self.mom = np.zeros(size, 'float32')
    if centered:
        self.mean_grad = np.zeros(size, 'float32')
    self.t = 0
    self.setfromflat = U.SetFromFlat(var_list)
    self.getflat = U.GetFlat(var_list)
    self.comm = MPI.COMM_WORLD if comm is None else comm
def __init__(self, var_list, beta1=0.9, beta2=0.999, epsilon=1e-08):
    self.var_list = var_list
    self.beta1 = beta1
    self.beta2 = beta2
    self.epsilon = epsilon
    size = sum(U.numel(v) for v in var_list)
    self.m = np.zeros(size, 'float32')
    self.v = np.zeros(size, 'float32')
    self.t = 0
    self.setfromflat = U.SetFromFlat(var_list)
    self.getflat = U.GetFlat(var_list)
def flatten_grads(var_list, grads):
    """
    Flattens variables and their gradients.

    :param var_list: ([TensorFlow Tensor]) the variables
    :param grads: ([TensorFlow Tensor]) the gradients
    :return: (TensorFlow Tensor) the flattened variable and gradient
    """
    return tf.concat([
        tf.reshape(grad, [tf_util.numel(v)])
        for (v, grad) in zip(var_list, grads)
    ], 0)
def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08,
             scale_grad_by_procs=True, comm=None):
    self.var_list = var_list
    self.beta1 = beta1
    self.beta2 = beta2
    self.epsilon = epsilon
    self.scale_grad_by_procs = scale_grad_by_procs
    size = sum(U.numel(v) for v in var_list)
    self.m = np.zeros(size, 'float32')
    self.v = np.zeros(size, 'float32')
    self.t = 0
    self.setfromflat = U.SetFromFlat(var_list)
    self.getflat = U.GetFlat(var_list)
    self.comm = MPI.COMM_WORLD if comm is None else comm
def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08,
             scale_grad_by_procs=True, comm=None):
    self.var_list = var_list
    self.beta1 = beta1
    self.beta2 = beta2
    self.epsilon = epsilon
    self.scale_grad_by_procs = scale_grad_by_procs
    size = sum(U.numel(v) for v in var_list)
    self.m = np.zeros(size, 'float32')
    self.v = np.zeros(size, 'float32')
    self.t = 0
    self.setfromflat = U.SetFromFlat(var_list)
    self.getflat = U.GetFlat(var_list)
    self.comm = MPI.COMM_WORLD if comm is None and MPI is not None else comm
def __init__(self, name, reuse=False, *args, **kwargs):
    with tf.variable_scope(name):
        if reuse:
            tf.get_variable_scope().reuse_variables()
        self._init(*args, **kwargs)
        self.scope = tf.get_variable_scope().name
        # print(self.scope)
        # Set up functions to get and set the policy parameters.
        var = self.get_trainable_variables()
        var_list = [v for v in var]
        self.flatten_var = tf.concat(
            axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in var_list])
        self.get_flat = U.GetFlat(var_list)
        self.set_from_flat = U.SetFromFlat(var_list)
        self.setupadam(self.l2_reg)
def __init__(self, var_list, **_3to2kwargs):
    # Keyword-only arguments unpacked by the 3to2 conversion.
    if 'comm' in _3to2kwargs:
        comm = _3to2kwargs['comm']
        del _3to2kwargs['comm']
    else:
        comm = None
    if 'scale_grad_by_procs' in _3to2kwargs:
        scale_grad_by_procs = _3to2kwargs['scale_grad_by_procs']
        del _3to2kwargs['scale_grad_by_procs']
    else:
        scale_grad_by_procs = True
    if 'epsilon' in _3to2kwargs:
        epsilon = _3to2kwargs['epsilon']
        del _3to2kwargs['epsilon']
    else:
        epsilon = 1e-08
    if 'beta2' in _3to2kwargs:
        beta2 = _3to2kwargs['beta2']
        del _3to2kwargs['beta2']
    else:
        beta2 = 0.999
    if 'beta1' in _3to2kwargs:
        beta1 = _3to2kwargs['beta1']
        del _3to2kwargs['beta1']
    else:
        beta1 = 0.9
    self.var_list = var_list
    self.beta1 = beta1
    self.beta2 = beta2
    self.epsilon = epsilon
    self.scale_grad_by_procs = scale_grad_by_procs
    size = sum(U.numel(v) for v in var_list)
    self.m = np.zeros(size, u'float32')
    self.v = np.zeros(size, u'float32')
    self.t = 0
    self.setfromflat = U.SetFromFlat(var_list)
    self.getflat = U.GetFlat(var_list)
    self.comm = MPI.COMM_WORLD if comm is None else comm
def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08,
             scale_grad_by_procs=True, comm=None, sess=None):
    """
    A parallel MPI implementation of the Adam optimizer for TensorFlow
    https://arxiv.org/abs/1412.6980

    :param var_list: ([TensorFlow Tensor]) the variables
    :param beta1: (float) Adam beta1 parameter
    :param beta2: (float) Adam beta2 parameter
    :param epsilon: (float) to help with preventing arithmetic issues
    :param scale_grad_by_procs: (bool) if the scaling should be done by processes
    :param comm: (MPI Communicators) if None, MPI.COMM_WORLD
    :param sess: (TensorFlow Session) if None, tf.get_default_session()
    """
    self.var_list = var_list
    self.beta1 = beta1
    self.beta2 = beta2
    self.epsilon = epsilon
    self.scale_grad_by_procs = scale_grad_by_procs
    size = sum(tf_utils.numel(v) for v in var_list)
    # Exponential moving average of gradient values
    # ("first moment estimate" m in the paper)
    self.exp_avg = np.zeros(size, 'float32')
    # Exponential moving average of squared gradient values
    # ("second raw moment estimate" v in the paper)
    self.exp_avg_sq = np.zeros(size, 'float32')
    self.step = 0
    self.setfromflat = tf_utils.SetFromFlat(var_list, sess=sess)
    self.getflat = tf_utils.GetFlat(var_list, sess=sess)
    self.comm = MPI.COMM_WORLD if comm is None else comm
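# Hedged sketch of the corresponding update step (an assumption, not part of the
# original source: the method name and signature `update(local_grad, learning_rate)`
# are illustrative). It averages the flat gradient across MPI workers, then applies
# the standard bias-corrected Adam moment estimates to the flat parameter vector.
def update(self, local_grad, learning_rate):
    local_grad = local_grad.astype('float32')
    # Sum gradients across all ranks, optionally averaging by the number of processes.
    global_grad = np.zeros_like(local_grad)
    self.comm.Allreduce(local_grad, global_grad, op=MPI.SUM)
    if self.scale_grad_by_procs:
        global_grad /= self.comm.Get_size()

    self.step += 1
    # Bias-corrected step size: lr * sqrt(1 - beta2^t) / (1 - beta1^t)
    step_size = learning_rate * np.sqrt(1 - self.beta2 ** self.step) / (1 - self.beta1 ** self.step)
    # Update first and second moment estimates.
    self.exp_avg = self.beta1 * self.exp_avg + (1 - self.beta1) * global_grad
    self.exp_avg_sq = self.beta2 * self.exp_avg_sq + (1 - self.beta2) * np.square(global_grad)
    # Apply the Adam step to the flattened parameters.
    delta = (-step_size) * self.exp_avg / (np.sqrt(self.exp_avg_sq) + self.epsilon)
    self.setfromflat(self.getflat() + delta)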
def __init__(self, env, world, policies, nsteps, load_path, rho, max_kl,
             ent_coef, vf_coef, max_grad_norm, sync):
    self.sess = sess = U.get_session()
    self.env = env
    self.world = world
    self.sync = sync
    self.max_kl = max_kl
    if hasattr(env, 'num_envs'):
        self.n_batches = n_batches = nsteps * env.num_envs
    else:
        self.n_batches = n_batches = nsteps

    if MPI is not None:
        self.nworkers = MPI.COMM_WORLD.Get_size()
        self.rank = MPI.COMM_WORLD.Get_rank()
    else:
        self.nworkers = 1
        self.rank = 0

    cpus_per_worker = 1
    U.get_session(config=tf.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=cpus_per_worker,
        intra_op_parallelism_threads=cpus_per_worker))

    # GLOBAL PLACEHOLDERS
    self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

    self.pi_n, self.oldpi_n, self.vfadam_n, self.exchange_n, self.to_exchange_n = [], [], [], [], []
    self.compute_jtvp_n, self.compute_fvp_n, self.compute_losses_n, self.compute_vfloss_n = [], [], [], []
    self.set_from_flat_n, self.get_flat_n = [], []

    for i in range(world.n):
        name_scope = world.agents[i].name.replace(' ', '')
        with tf.variable_scope(name_scope):
            # OBSERVATION PLACEHOLDER
            ob_dtype = env.observation_space[i].dtype
            ob_shape = env.observation_space[i].shape
            OB = tf.placeholder(dtype=ob_dtype, shape=(None,) + ob_shape)

            # Policy
            with tf.variable_scope("pi"):
                pi = policies[i](n_batches, observ_placeholder=OB)
            with tf.variable_scope("oldpi"):
                oldpi = policies[i](n_batches, observ_placeholder=OB)

            # CREATE OTHER PLACEHOLDERS
            AC = pi.pdtype.sample_placeholder([None])
            ADV = tf.placeholder(dtype=tf.float32, shape=[None])
            R = tf.placeholder(dtype=tf.float32, shape=[None])
            OLDVPRED = tf.placeholder(dtype=tf.float32, shape=[None])
            NB = tf.placeholder(dtype=tf.int32, shape=None)
            A = tf.placeholder(dtype=tf.float32, shape=None)

            # Importance sampling ratio; be careful about its dimensionality.
            ratio = tf.exp(pi.pd.logp(AC) - oldpi.pd.logp(AC))
            surrgain = tf.reduce_mean(ADV * ratio)
            kloldnew = oldpi.pd.kl(pi.pd)
            meankl = tf.reduce_mean(kloldnew)

            # Synchronization error between A * ratio and the gathered consensus
            # variable z, plus the augmented (rho) penalty term.
            sync_err = A * tf.reshape(ratio, (self.n_batches,)) - tf.reshape(
                tf.gather(pi.net.z, NB), (self.n_batches,))
            sync_loss = tf.reduce_sum(
                tf.reshape(tf.gather(pi.net.z, NB), (self.n_batches,)) * sync_err) + \
                0.5 * rho * tf.reduce_sum(tf.square(sync_err))
            lagrange_loss = -surrgain + sync_loss
            losses = [lagrange_loss, surrgain, meankl]

            dist = meankl
            var_list = pi.net.w
            klgrads = tf.gradients(dist, var_list)
            flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
            shapes = [var.get_shape().as_list() for var in var_list]
            start = 0
            tangents = []
            for shape in shapes:
                sz = U.intprod(shape)
                tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
                start += sz

            jjvp = [tf.zeros(shape, dtype=tf.float32) for shape in shapes]
            jtvp = [tf.zeros(shape, dtype=tf.float32) for shape in shapes]
            right_b = -ADV + A * tf.gather(pi.net.p, NB) - rho * A * tf.gather(pi.net.z, NB)
            # Accumulate J^T(Jv) and J^T b per sample; use a separate loop variable
            # so the outer agent index `i` is not shadowed.
            for j in range(self.n_batches):
                ratio_j_grad = tf.gradients(ratio[j], var_list)
                jvp_j = tf.add_n([
                    tf.reduce_sum(g * tangent)
                    for (g, tangent) in zipsame(ratio_j_grad, tangents)
                ])
                jjvp = [
                    tf.add_n([jj, gg * jvp_j])
                    for (jj, gg) in zipsame(jjvp, ratio_j_grad)
                ]
                jtvp = [
                    tf.add_n([jt, gt * right_b[j]])
                    for (jt, gt) in zipsame(jtvp, ratio_j_grad)
                ]
                print(j)
            jjvp = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in jjvp])
            jtvp = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in jtvp])
            gvp = tf.add_n([
                tf.reduce_sum(g * tangent)
                for (g, tangent) in zipsame(klgrads, tangents)
            ])  # pylint: disable=E1111
            # Fisher-vector product: KL Hessian-vector product plus rho * J^T(Jv).
            fvp = tf.add_n([U.flatgrad(gvp, var_list), rho * jjvp])

            # Define the value loss
            vpredclipped = OLDVPRED + tf.clip_by_value(pi.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE)
            # vpredclipped = tf.clip_by_value(pi.vf, OLDVPRED*(1-CLIPRANGE), OLDVPRED*(1+CLIPRANGE))
            vferr = tf.square(pi.vf - R)
            vferr2 = tf.square(vpredclipped - R)
            vf_loss = .5 * tf.reduce_mean(tf.maximum(vferr, vferr2))
            vfadam = MpiAdam(pi.net.v)

            compute_jtvp = U.function([OB, AC, ADV, A, NB], jtvp)
            compute_fvp = U.function([flat_tangent, OB, AC, ADV], fvp)
            compute_losses = U.function([OB, AC, ADV, A, NB], losses)
            compute_vfloss = U.function([OB, R, OLDVPRED, CLIPRANGE], vf_loss)
            exchange = pi.net.exchange(sess, OB, AC, CLIPRANGE, NB, rho)
            to_exchange = U.function(
                [OB, AC, ADV, NB, CLIPRANGE],
                [ratio, tf.gather(pi.net.p, NB)])
            get_flat = U.GetFlat(var_list)
            set_from_flat = U.SetFromFlat(var_list)

            self.pi_n.append(pi)
            self.oldpi_n.append(oldpi)
            self.get_flat_n.append(get_flat)
            self.set_from_flat_n.append(set_from_flat)
            self.vfadam_n.append(vfadam)
            self.exchange_n.append(exchange)
            self.to_exchange_n.append(to_exchange)
            self.compute_jtvp_n.append(compute_jtvp)
            self.compute_fvp_n.append(compute_fvp)
            self.compute_losses_n.append(compute_losses)
            self.compute_vfloss_n.append(compute_vfloss)

    # Update old policy network
    updates = []
    for i in range(len(world.agents)):
        name_scope = world.agents[i].name.replace(' ', '')
        old_vars = get_trainable_variables("{}/oldpi".format(name_scope))
        now_vars = get_trainable_variables("{}/pi".format(name_scope))
        updates += [
            tf.assign(oldv, nowv)
            for (oldv, nowv) in zipsame(old_vars, now_vars)
        ]
        updates += [
            tf.assign(self.pi_n[i].net.z, tf.ones_like(self.pi_n[i].net.z))
        ]
    self.assign_old_eq_new = U.function([], [], updates=updates)

    @contextmanager
    def timed(msg):
        print(colorize(msg, color='magenta'))
        tstart = time.time()
        yield
        print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
    self.timed = timed

    def allmean(x):
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= self.nworkers
        else:
            out = np.copy(x)
        return out

    self.allmean = allmean

    # Initialization
    U.initialize()
    if load_path is not None:
        self.load(load_path)
    for i in range(len(self.pi_n)):
        th_init = self.get_flat_n[i]()
        self.set_from_flat_n[i](th_init)
        print("Init param sum", th_init.sum(), flush=True)
    for vfadam in self.vfadam_n:
        vfadam.sync()
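# Hedged standalone sketch (an assumption, not part of the original source): the
# allmean helper above is the usual MPI pattern of summing a NumPy array across
# all ranks and dividing by the worker count, so every process ends up with the
# same averaged statistics (losses, flat gradients, etc.).
from mpi4py import MPI
import numpy as np

def allmean_standalone(x, comm=None):
    comm = MPI.COMM_WORLD if comm is None else comm
    out = np.empty_like(x)
    comm.Allreduce(x, out, op=MPI.SUM)  # element-wise sum across all ranks
    out /= comm.Get_size()              # divide by the number of workers
    return out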
def flatten_grads(var_list, grads):
    """Flattens variables and their gradients."""
    return tf.concat(
        [tf.reshape(grad, [U.numel(v)]) for (v, grad) in zip(var_list, grads)], 0)
def get_layer_flat(var_list):
    op = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in var_list])
    return tf.get_default_session().run(op)
def nograd(self, var_list):
    return tf.concat(axis=0, values=[
        tf.reshape(tf.zeros_like(v), [U.numel(v)]) for v in var_list
    ])