def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
    """Build the feed-forward CNN policy/value graph and bind its run helpers.

    Args:
        sess: Session object whose ``run`` evaluates the graph in ``step``
            and ``value``.
        ob_space: Observation space; ``ob_space.shape`` must unpack into
            exactly three dimensions.
        ac_space: Action space; ``ac_space.n`` is the width of the ``'pi'``
            head and the space is passed to ``make_pdtype``.
        nbatch: Leading dimension of the observation placeholder.
        nsteps: Not used by this constructor.
        reuse: Forwarded to ``tf.variable_scope("model", ...)``.

    Sets ``X``, ``pi``, ``vf``, ``pdtype``, ``pd``, ``initial_state``
    (always ``None``), ``step`` and ``value`` on the instance.
    """
    height, width, channels = ob_space.shape
    num_actions = ac_space.n
    obs_ph = tf.placeholder(tf.uint8, (nbatch, height, width, channels))  # obs

    # Both heads share the features produced by nature_cnn.
    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(obs_ph)
        pi_flat = fc(features, 'pi', num_actions, init_scale=0.01)
        # fc yields one column here; keep it as a 1-D tensor.
        value_out = fc(features, 'v', 1)[:, 0]

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi_flat)

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        """Return (action, value, initial_state, neglogp) for ``ob``."""
        fetches = [sampled_action, value_out, sampled_neglogp]
        action, value_est, neglogp = sess.run(fetches, {obs_ph: ob})
        return action, value_est, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        """Return only the value output for ``ob``."""
        return sess.run(value_out, {obs_ph: ob})

    self.X = obs_ph
    self.pi = pi_flat
    self.vf = value_out
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **kwargs): #pylint: disable=W0613
    """Build an MLP policy/value graph whose hidden width is configurable.

    Args:
        sess: Session object whose ``run`` evaluates the graph in ``step``
            and ``value``.
        ob_space: Observation space; its ``shape`` follows ``nbatch`` in the
            observation placeholder shape.
        ac_space: Action space; ``ac_space.shape[0]`` is the action dimension.
        nbatch: Leading dimension of the observation placeholder.
        nsteps: Not used by this constructor.
        reuse: Forwarded to ``tf.variable_scope(TF_VAR_SCOPE, ...)``.
        **kwargs: Must contain ``'mlp_width'`` (hidden layer size); a missing
            key raises ``KeyError``.

    Sets ``width``, ``p_h1``, ``X``, ``pi``, ``vf``, ``action_probs0``,
    ``pdtype``, ``pd``, ``initial_state`` (always ``None``), ``step`` and
    ``value`` on the instance.
    """
    self.width = kwargs['mlp_width']
    action_dim = ac_space.shape[0]
    obs_ph = tf.placeholder(tf.float32, (nbatch,) + ob_space.shape, name='Ob')  # obs

    with tf.variable_scope(TF_VAR_SCOPE, reuse=reuse):
        # Notes from earlier experiments with other activations:
        #   tf.nn.relu       - diverges even at super low learning rates
        #   tf.nn.leaky_relu - diverges
        activ = tf.tanh

        def _hidden(inputs, scope_name):
            # One fully connected layer of self.width units followed by activ.
            return activ(fc(inputs, scope_name, nh=self.width, init_scale=np.sqrt(2)))

        # Policy tower.
        p_h1 = _hidden(obs_ph, 'pi_fc1')
        p_h2 = _hidden(p_h1, 'pi_fc2')
        pi_out = fc(p_h2, 'pi', action_dim, init_scale=0.01)

        # Value tower: separate layers, same input.
        v_h1 = _hidden(obs_ph, 'vf_fc1')
        v_h2 = _hidden(v_h1, 'vf_fc2')
        value_out = fc(v_h2, 'vf', 1)[:, 0]

        logstd = tf.get_variable(name="logstd", shape=[1, action_dim],
                                 initializer=tf.zeros_initializer())

    # pi_out * 0.0 + logstd broadcasts the single logstd row to pi_out's shape.
    pdparam = tf.concat([pi_out, pi_out * 0.0 + logstd], axis=1)

    self.p_h1 = p_h1

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    # Idea: to benefit from actions learned via supervised learning (SL), average the SL action
    # with the one sampled below, using an SL weight that decays over time. The optimization is
    # still with respect to these parameters, but the nudge towards the correct action helps
    # narrow the exploration space; decaying the SL weight leaves room to learn actions better
    # than SL. This would need balancing against the decay of other hyperparameters (lr, etc.).
    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)
    # NOTE(review): exp(-neglogp) may be a density rather than a probability if pd is a
    # continuous distribution - confirm against make_pdtype.
    action_probs0 = tf.exp(-sampled_neglogp)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        """Return (action, value, initial_state, neglogp, action_probs) for ``ob``."""
        fetches = [sampled_action, value_out, sampled_neglogp, action_probs0]
        action, value_est, neglogp, action_probs = sess.run(fetches, {obs_ph: ob})
        # For deepdrive we expect outputs to be between -1 and 1.
        # Squashing with np.tanh(action) here doesn't work very well; clipping and letting
        # the network learn the valid range is probably better.
        return action, value_est, self.initial_state, neglogp, action_probs

    def value(ob, *_args, **_kwargs):
        """Return only the value output for ``ob``."""
        return sess.run(value_out, {obs_ph: ob})

    self.X = obs_ph
    self.pi = pi_out
    self.action_probs0 = action_probs0
    self.vf = value_out
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """Build the CNN + LSTM policy/value graph and bind its run helpers.

    Args:
        sess: Session object whose ``run`` evaluates the graph in ``step``
            and ``value``.
        ob_space: Observation space; ``ob_space.shape`` must unpack into
            exactly three dimensions.
        ac_space: Action space; ``ac_space.n`` is the width of the ``'pi'``
            head and the space is passed to ``make_pdtype``.
        nbatch: Leading dimension of the observation and mask placeholders.
        nsteps: Divisor of ``nbatch`` giving the number of environments
            (``nbatch // nsteps``).
        nlstm: Passed to ``lstm`` as ``nh``; the state placeholder has
            ``nlstm * 2`` columns.
        reuse: Forwarded to ``tf.variable_scope("model", ...)``.

    Sets ``X``, ``M``, ``S``, ``pi``, ``vf``, ``pdtype``, ``pd``,
    ``initial_state`` (float32 zeros of shape ``(nenv, nlstm * 2)``),
    ``step`` and ``value`` on the instance.
    """
    num_envs = nbatch // nsteps
    height, width, channels = ob_space.shape
    num_actions = ac_space.n
    state_width = nlstm * 2

    obs_ph = tf.placeholder(tf.uint8, (nbatch, height, width, channels))  # obs
    mask_ph = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    state_ph = tf.placeholder(tf.float32, [num_envs, state_width])  # states

    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(obs_ph)
        # Split the flat batch into sequences for the recurrent layer.
        feature_seq = batch_to_seq(features, num_envs, nsteps)
        mask_seq = batch_to_seq(mask_ph, num_envs, nsteps)
        lstm_out, new_state = lstm(feature_seq, mask_seq, state_ph, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        pi_out = fc(lstm_out, 'pi', num_actions)
        value_2d = fc(lstm_out, 'v', 1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi_out)

    # self.vf keeps the 2-D head output; the run helpers use the first column.
    value_col = value_2d[:, 0]
    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)
    self.initial_state = np.zeros((num_envs, state_width), dtype=np.float32)

    def _feed(ob, state, mask):
        # Feed dict shared by both run helpers.
        return {obs_ph: ob, state_ph: state, mask_ph: mask}

    def step(ob, state, mask):
        """Return [action, value, new_state, neglogp] as produced by ``sess.run``."""
        fetches = [sampled_action, value_col, new_state, sampled_neglogp]
        return sess.run(fetches, _feed(ob, state, mask))

    def value(ob, state, mask):
        """Return only the value output for the given inputs."""
        return sess.run(value_col, _feed(ob, state, mask))

    self.X = obs_ph
    self.M = mask_ph
    self.S = state_ph
    self.pi = pi_out
    self.vf = value_2d
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """Build an LSTM policy/value graph over flat (1-D) observations.

    Two kinds of action space are handled. For a ``spaces.Box`` (or a
    subclass of it) the ``'pi'`` head has ``ac_space.shape[0]`` outputs and
    is concatenated with a ``logstd`` variable before being passed to the
    distribution type. For any other space the head has ``ac_space.n``
    outputs and is passed on unchanged.

    Args:
        sess: Session object whose ``run`` evaluates the graph in ``step``
            and ``value``.
        ob_space: Observation space; only ``ob_space.shape[0]`` is used.
        ac_space: Action space (see above); also passed to ``make_pdtype``.
        nbatch: Leading dimension of the observation and mask placeholders.
        nsteps: Divisor of ``nbatch`` giving the number of environments
            (``nbatch // nsteps``).
        nlstm: Passed to ``lstm`` as ``nh``; the state placeholder has
            ``nlstm * 2`` columns.
        reuse: Forwarded to ``tf.variable_scope(TF_VAR_SCOPE, ...)``.

    Sets ``X``, ``M``, ``S``, ``pi``, ``vf``, ``pdtype``, ``pd``,
    ``initial_state`` (float32 zeros of shape ``(nenv, nlstm * 2)``),
    ``step`` and ``value`` on the instance.
    """
    nenv = nbatch // nsteps
    ob_shape = (nbatch, ob_space.shape[0])

    # Decide the action-space kind once. isinstance (instead of an exact
    # type comparison) also accepts subclasses of Box.
    is_box = isinstance(ac_space, spaces.Box)
    nact = ac_space.shape[0] if is_box else ac_space.n

    # NOTE(review): observations are declared uint8 and cast to float32 below;
    # non-integer observation values would not survive a uint8 feed - confirm
    # this dtype is intended for flat observations.
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    with tf.variable_scope(TF_VAR_SCOPE, reuse=reuse):
        h = tf.cast(X, tf.float32)
        # Split the flat batch into sequences for the recurrent layer.
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        pi = fc(h5, 'pi', nact)
        vf = fc(h5, 'v', 1)
        # The logstd variable is only created for Box action spaces.
        if is_box:
            logstd = tf.get_variable(name="logstd", shape=[1, nact],
                                     initializer=tf.zeros_initializer())
        else:
            logstd = None

    if is_box:
        # pi * 0.0 + logstd broadcasts the single logstd row to pi's shape.
        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
    else:
        pdparam = pi

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    # self.vf keeps the 2-D head output; the run helpers use the first column.
    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        """Return [action, value, new_state, neglogp] as produced by ``sess.run``."""
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        """Return only the value output for the given inputs."""
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
    """Build an MLP policy/value graph with two 64-unit tanh layers per tower.

    Args:
        sess: Session object whose ``run`` evaluates the graph in ``step``
            and ``value``.
        ob_space: Observation space; its ``shape`` follows ``nbatch`` in the
            observation placeholder shape.
        ac_space: Action space; ``ac_space.shape[0]`` is the action dimension
            and the space is passed to ``make_pdtype``.
        nbatch: Leading dimension of the observation placeholder.
        nsteps: Not used by this constructor.
        reuse: Forwarded to ``tf.variable_scope("model", ...)``.

    Sets ``X``, ``pi``, ``vf``, ``pdtype``, ``pd``, ``initial_state``
    (always ``None``), ``step`` and ``value`` on the instance.
    """
    action_dim = ac_space.shape[0]
    obs_ph = tf.placeholder(tf.float32, (nbatch,) + ob_space.shape, name='Ob')  # obs

    with tf.variable_scope("model", reuse=reuse):
        activ = tf.tanh

        def _hidden(inputs, scope_name):
            # One fully connected layer of 64 units followed by activ.
            return activ(fc(inputs, scope_name, nh=64, init_scale=np.sqrt(2)))

        # Policy tower.
        pi_hidden = _hidden(_hidden(obs_ph, 'pi_fc1'), 'pi_fc2')
        pi_out = fc(pi_hidden, 'pi', action_dim, init_scale=0.01)

        # Value tower: separate layers, same input.
        vf_hidden = _hidden(_hidden(obs_ph, 'vf_fc1'), 'vf_fc2')
        value_out = fc(vf_hidden, 'vf', 1)[:, 0]

        logstd = tf.get_variable(name="logstd", shape=[1, action_dim],
                                 initializer=tf.zeros_initializer())

    # pi_out * 0.0 + logstd broadcasts the single logstd row to pi_out's shape.
    pdparam = tf.concat([pi_out, pi_out * 0.0 + logstd], axis=1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        """Return (action, value, initial_state, neglogp) for ``ob``."""
        fetches = [sampled_action, value_out, sampled_neglogp]
        action, value_est, neglogp = sess.run(fetches, {obs_ph: ob})
        return action, value_est, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        """Return only the value output for ``ob``."""
        return sess.run(value_out, {obs_ph: ob})

    self.X = obs_ph
    self.pi = pi_out
    self.vf = value_out
    self.step = step
    self.value = value