import numpy as np
import tensorflow as tf
# Helper imports as used in OpenAI-baselines-style code; adjust the paths to
# wherever nature_cnn, fc, sample, batch_to_seq, seq_to_batch and lstm live
# in this repo.
from baselines.a2c.utils import batch_to_seq, seq_to_batch, lstm, fc, sample
from baselines.ppo2.policies import nature_cnn


def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
    nbatch = nenv * nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc * nstack)  # frame-stacked observations
    nact = ac_space.n
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # LSTM cell + hidden states
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X)

        # lstm: reshape the flat batch into per-env sequences, run the
        # recurrent core, then flatten back to a batch
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)

        pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
        pi = tf.nn.softmax(pi_logits)
        q = fc(h5, 'q', nact)

    a = sample(pi_logits)  # could change this to use self.pi instead
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
    self.X = X
    self.M = M
    self.S = S
    self.pi = pi  # actual policy params now
    self.q = q

    def step(ob, state, mask, *args, **kwargs):
        # returns actions, mus (behaviour probabilities), new LSTM states
        a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
        return a0, pi0, s

    self.step = step
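# Usage sketch (not part of the source): how the recurrent policy above is
# typically driven. The class name `AcerLstmPolicy` and the concrete sizes are
# illustrative assumptions; the key point is that `step` takes and returns the
# LSTM state, which the caller threads through time together with the done-mask.
def _demo_recurrent_step(sess, ob_space, ac_space):
    nenv, nstack = 4, 4
    policy = AcerLstmPolicy(sess, ob_space, ac_space, nenv=nenv, nsteps=1, nstack=nstack)  # hypothetical class name
    sess.run(tf.global_variables_initializer())
    nh, nw, nc = ob_space.shape
    obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8)  # placeholder data
    state = policy.initial_state               # zeros of shape (nenv, 2 * nlstm)
    masks = np.zeros(nenv, dtype=np.float32)   # 1.0 where the previous step ended an episode
    for _ in range(3):
        actions, mus, state = policy.step(obs, state, masks)  # carry state forward
    return actions, mus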
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
    nbatch = nenv * nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc * nstack)
    nact = ac_space.n
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X)
        pi_logits = fc(h, 'pi', nact, init_scale=0.01)
        pi = tf.nn.softmax(pi_logits)
        q = fc(h, 'q', nact)

    a = sample(pi_logits)  # could change this to use self.pi instead
    self.initial_state = []  # not stateful
    self.X = X
    self.pi = pi  # actual policy params now
    self.q = q

    def step(ob, *args, **kwargs):
        # returns actions, mus, states
        a0, pi0 = sess.run([a, pi], {X: ob})
        return a0, pi0, []  # dummy state

    def out(ob, *args, **kwargs):
        pi0, q0 = sess.run([pi, q], {X: ob})
        return pi0, q0

    def act(ob, *args, **kwargs):
        return sess.run(a, {X: ob})

    self.step = step
    self.out = out
    self.act = act
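# Usage sketch (not part of the source): besides `step`, this policy exposes
# `out`, which returns the policy distribution and Q-values for a batch (e.g.
# to recompute targets for stored off-policy transitions), and `act`, which
# returns sampled actions only. `policy` is assumed to be an instance of the
# class wrapping the constructor above.
def _demo_out_and_act(policy, obs):
    pi0, q0 = policy.out(obs)   # each of shape (nbatch, nact)
    a0 = policy.act(obs)        # shape (nbatch,), sampled actions
    return pi0, q0, a0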
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs):  # pylint: disable=W0613
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc)
    self.pdtype = make_pdtype(ac_space)
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    with tf.variable_scope("model", reuse=reuse):
        # h = custom_cnn(X, **conv_kwargs)
        h = policies.nature_cnn(X, **conv_kwargs)
        vf = fc(h, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
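# Usage sketch (not part of the source): collecting one transition with the
# feed-forward actor-critic above. `CnnPolicy` is a stand-in name for the
# class wrapping this constructor; nbatch must match the placeholder's
# leading dimension.
def _demo_actor_critic_step(sess, ob_space, ac_space, nbatch=8):
    policy = CnnPolicy(sess, ob_space, ac_space, nbatch=nbatch, nsteps=1)  # hypothetical class name
    sess.run(tf.global_variables_initializer())
    nh, nw, nc = ob_space.shape
    obs = np.zeros((nbatch, nh, nw, nc), dtype=np.uint8)  # placeholder data
    actions, values, _, neglogps = policy.step(obs)  # sampled actions, V(s), -log pi(a|s)
    last_values = policy.value(obs)                  # critic alone, e.g. for bootstrapping
    return actions, values, neglogps, last_values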
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
    super().__init__(sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=reuse)
    # Rebuilds the policy graph after the parent constructor; this relies on
    # the parent either building under a different scope or on reuse=True to
    # avoid variable collisions in the "model" scope.
    nbatch = nenv * nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc * nstack)
    nact = ac_space.n
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X)
        pi_logits = fc(h, 'pi', nact, init_scale=0.01)
        pi = tf.nn.softmax(pi_logits)
        q = fc(h, 'q', nact)

    self.a = sample(pi_logits)  # could change this to use self.pi instead
    self.initial_state = []  # not stateful
    self.X = X
    self.pi = pi  # actual policy params now
    self.q = q
    self.sess = sess
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False): nbatch = nenv * nsteps nh, nw, nc = ob_space.shape ob_shape = (nbatch, nh, nw, nc * nstack) nact = ac_space.n X = tf.placeholder(tf.uint8, ob_shape) # obs with tf.variable_scope("model", reuse=reuse): with tf.variable_scope("acer"): h = nature_cnn(X) pi_logits = fc(h, 'pi', nact, init_scale=0.01) pi = tf.nn.softmax(pi_logits) q = fc(h, 'q', nact) with tf.variable_scope("explore"): # for explore nogradient_h = tf.stop_gradient(h) e_pi_logits = fc(nogradient_h, 'e_pi', nact, init_scale=0.01) e_pi = tf.nn.softmax(e_pi_logits) e_v = fc(nogradient_h, 'e_v', 1)[:, 0] a = sample(pi_logits) # could change this to use self.pi instead self.initial_state = [] # not stateful self.X = X self.pi = pi # actual policy params now self.q = q # for explore e_a = sample(e_pi_logits) # could change this to use self.pi instead self.e_pi_logits = e_pi_logits self.e_pi = e_pi self.e_v = e_v def step(ob, *args, **kwargs): # returns actions, mus, states a0, pi0 = sess.run([a, pi], {X: ob}) return a0, pi0, [] # dummy state # for explore def e_step(ob, *args, **kwargs): e_a0, pi0, e_v0 = sess.run([e_a, pi, e_v], {X: ob}) return e_a0, pi0, e_v0, [] # dummy state self.e_step = e_step def out(ob, *args, **kwargs): pi0, q0 = sess.run([pi, q], {X: ob}) return pi0, q0 def act(ob, *args, **kwargs): return sess.run(a, {X: ob}) self.step = step self.out = out self.act = act
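# Usage sketch (not part of the source): the two heads above expose two
# samplers. `step` draws from the main ACER policy `pi`; `e_step` draws from
# the exploration head `e_pi` while still returning `pi` as the behaviour
# probabilities, which is what off-policy corrections against the main policy
# would consume. `policy` is assumed to be an instance of the class wrapping
# this constructor.
def _demo_exploration_heads(policy, obs):
    a0, mu0, _ = policy.step(obs)             # exploit: sample from pi
    e_a0, mu0, e_v0, _ = policy.e_step(obs)   # explore: sample from e_pi, plus e_v baseline
    return a0, e_a0, e_v0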
def _init(self, ob_space, ac_space, gaussian_fixed_var=True, use_bias=True,
          use_critic=True, seed=None,
          hidden_W_init=U.normc_initializer(1.0),
          hidden_b_init=tf.zeros_initializer(),
          output_W_init=U.normc_initializer(0.01),
          output_b_init=tf.zeros_initializer()):
    """Params:
        ob_space: task observation space
        ac_space: task action space
        gaussian_fixed_var: True -> separate parameter for logstd,
            False -> two-headed mlp
        use_bias: whether to include bias in neurons
        use_critic: whether to build a value head (currently unsupported)
        seed: optional TensorFlow random seed
    """
    assert isinstance(ob_space, gym.spaces.Box)

    if seed is not None:
        tf.set_random_seed(seed)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    # Critic
    if use_critic:
        raise Exception("Critic still not supported")
        # NOTE: unreachable until the critic is supported; hid_size and
        # num_hid_layers are not parameters of this CNN variant.
        with tf.variable_scope('vf'):
            obz = tf.clip_by_value(
                (ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(last_out, hid_size[i], name="fc%i" % (i + 1),
                                    kernel_initializer=hidden_W_init))
            self.vpred = tf.layers.dense(
                last_out, 1, name='final',
                kernel_initializer=hidden_W_init)[:, 0]

    # Actor
    with tf.variable_scope('pol'):
        last_out = nature_cnn(ob)
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            self.mean = mean = tf.layers.dense(
                last_out, pdtype.param_shape()[0] // 2, name='final',
                kernel_initializer=output_W_init, use_bias=use_bias)
            self.logstd = logstd = tf.get_variable(
                name="pol_logstd",
                shape=[1, pdtype.param_shape()[0] // 2],
                initializer=output_b_init)
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(
                last_out, pdtype.param_shape()[0], name='final',
                kernel_initializer=output_W_init)

    # Acting
    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    if use_critic:
        self._act = U.function([stochastic, ob], [ac, self.vpred])
    else:
        self._act = U.function([stochastic, ob], [ac, tf.zeros(1)])

    # Evaluating
    self.ob = ob
    self.ac_in = self.pdtype.sample_placeholder(
        [sequence_length] + list(ac_space.shape), name='ac_in')
    self.gamma = U.get_placeholder(name="gamma", dtype=tf.float32, shape=[])
    self.rew = U.get_placeholder(name="rew", dtype=tf.float32,
                                 shape=[sequence_length] + [1])
    self.logprobs = self.pd.logp(self.ac_in)  # [\log\pi(a|s)]

    # Fisher
    with tf.variable_scope('pol') as vs:
        self.weights = weights = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
    self.flat_weights = flat_weights = tf.concat(
        [tf.reshape(w, [-1]) for w in weights], axis=0)
    self.n_weights = flat_weights.shape[0].value
    self.score = score = U.flatgrad(self.logprobs, weights)  # \nabla\log p(\tau)
    self.fisher = tf.einsum('i,j->ij', score, score)

    # Performance graph initializations
    self._setting = []
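# Usage sketch (not part of the source): evaluating the score and the
# rank-one Fisher estimate built above. Feeding a trajectory of observations
# and the actions taken yields g = \nabla \sum_t \log\pi(a_t|s_t) (U.flatgrad
# sums over the batch, matching the \nabla\log p(\tau) comment) and
# F = g g^T, the outer-product Fisher estimate for that trajectory. `pol` is
# a stand-in for an initialized instance of this policy; the zero arrays are
# placeholder data.
def _demo_fisher(sess, pol, ob_space, ac_space, horizon=16):
    obs = np.zeros((horizon,) + ob_space.shape, dtype=np.float32)
    acs = np.zeros((horizon,) + ac_space.shape, dtype=np.float32)
    g, F = sess.run([pol.score, pol.fisher], {pol.ob: obs, pol.ac_in: acs})
    assert F.shape == (g.size, g.size)  # rank-one outer product g g^T
    return g, F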
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, num_nonspatial, reuse=False):
    nbatch = nenv * nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc * nstack)
    nact = ac_space.n
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    NonspatialX = tf.placeholder(tf.float32, (nbatch, num_nonspatial))
    with tf.variable_scope("model", reuse=reuse):
        with tf.variable_scope("acer"):
            h = nature_cnn(X)
            h = tf.concat([h, NonspatialX], axis=1)  # append non-spatial features
            pi_logits = fc(h, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h, 'q', nact)
        with tf.variable_scope("explore"):
            # exploration head; stop_gradient keeps it from updating the
            # shared features
            nogradient_h = tf.stop_gradient(h)
            e_pi_logits = fc(nogradient_h, 'e_pi', nact, init_scale=0.01)
            e_pi = tf.nn.softmax(e_pi_logits)
            # e_v = fc(nogradient_h, 'e_v', 1)[:, 0]
            e_q = fc(nogradient_h, 'e_q', nact)

    # a = sample(pi_logits)  # could change this to use self.pi instead
    a = tf.squeeze(tf.multinomial(pi_logits, 1), 1)
    evaluate_a = tf.argmax(pi_logits, 1)  # greedy action for evaluation
    self.initial_state = []  # not stateful
    self.X = X
    self.NonspatialX = NonspatialX
    self.pi = pi  # actual policy params now
    self.q = q

    # for explore
    # e_a = sample(e_pi_logits)  # could change this to use self.e_pi instead
    e_a = tf.squeeze(tf.multinomial(e_pi_logits, 1), 1)
    self.e_pi_logits = e_pi_logits
    self.e_pi = e_pi
    # self.e_v = e_v
    self.e_q = e_q

    def step(ob, nonspatial, *args, **kwargs):
        # returns actions, mus, states
        a0, pi0, e_pi0 = sess.run([a, pi, e_pi],
                                  {X: ob, NonspatialX: nonspatial})
        return a0, pi0, e_pi0, []  # dummy state

    def evaluate_step(ob, nonspatial, *args, **kwargs):
        evaluate_a0, pi0, e_pi0 = sess.run([evaluate_a, pi, e_pi],
                                           {X: ob, NonspatialX: nonspatial})
        return evaluate_a0, pi0, e_pi0, []  # dummy state

    self.evaluate_step = evaluate_step

    # for explore
    def e_step(ob, nonspatial, *args, **kwargs):
        e_a0, pi0, e_pi0 = sess.run([e_a, pi, e_pi],
                                    {X: ob, NonspatialX: nonspatial})
        return e_a0, pi0, e_pi0, []  # dummy state

    self.e_step = e_step

    def out(ob, nonspatial, *args, **kwargs):
        # pi and q depend on NonspatialX through the concat above, so it
        # must be fed here as well
        pi0, q0 = sess.run([pi, q], {X: ob, NonspatialX: nonspatial})
        return pi0, q0

    def act(ob, nonspatial, *args, **kwargs):
        return sess.run(a, {X: ob, NonspatialX: nonspatial})

    self.step = step
    self.out = out
    self.act = act
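# Usage sketch (not part of the source): every entry point of the policy
# above must feed both placeholders, since the non-spatial vector is
# concatenated into the shared trunk. Shapes follow the constructor with
# nsteps=1; `policy` is assumed to be an instance of the class wrapping this
# constructor, and the zero arrays are placeholder data.
def _demo_nonspatial_step(policy, ob_space, nenv=4, nstack=4, num_nonspatial=8):
    nh, nw, nc = ob_space.shape
    obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8)
    nonspatial = np.zeros((nenv, num_nonspatial), dtype=np.float32)
    a0, pi0, e_pi0, _ = policy.step(obs, nonspatial)             # stochastic, from pi
    greedy_a0, _, _, _ = policy.evaluate_step(obs, nonspatial)   # greedy argmax for evaluation
    return a0, greedy_a0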