def network_fn(X, nenv=1):
    """
    Build a CNN -> dense -> LSTM graph on top of ``X``.

    ``conv_kwargs``, ``nh``, ``nlstm`` and ``layer_norm`` are free variables
    that must be defined outside this function.

    :param X: input tensor; its first dimension is used as the batch size
    :param nenv: number of environments the batch is divided between
    :return: ``(fm, h, extras)`` where ``fm`` is the ``nature_cnn`` output,
        ``h`` is the recurrent output converted back to a batch, and
        ``extras`` is a dict with the ``S`` and ``M`` placeholders, the new
        state tensor and a zero-filled initial state array
    """
    batch_size = X.shape[0]
    steps_per_env = batch_size // nenv

    # Convolutional trunk followed by one ReLU dense layer.
    fm = nature_cnn(X, **conv_kwargs)
    dense_in = conv_to_fc(fm)
    h = tf.nn.relu(fc(dense_in, 'fc1', nh=nh, init_scale=np.sqrt(2)))

    M = tf.placeholder(tf.float32, [batch_size])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

    xs = batch_to_seq(h, nenv, steps_per_env)
    ms = batch_to_seq(M, nenv, steps_per_env)

    # Pick the layer-normalised cell only when requested.
    if layer_norm:
        cell, cell_scope = utils.lnlstm, 'lnlstm'
    else:
        cell, cell_scope = utils.lstm, 'lstm'
    h5, snew = cell(xs, ms, S, scope=cell_scope, nh=nlstm)

    h = seq_to_batch(h5)
    extras = {
        'S': S,
        'M': M,
        'state': snew,
        'initial_state': np.zeros(S.shape.as_list(), dtype=float),
    }
    return fm, h, extras
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """
    Build the graph of a recurrent actor-critic policy (CNN -> LSTM).

    :param sess: session used by ``step`` and ``value`` to run the graph
    :param ob_space: observation space handed to ``observation_input``
    :param ac_space: action space handed to ``make_pdtype``
    :param nbatch: total batch size (environments times steps)
    :param nsteps: number of steps per environment in one batch
    :param nlstm: LSTM size; the state placeholder is ``[nenv, nlstm*2]``
    :param reuse: forwarded to ``tf.variable_scope("model", ...)``
    """
    nenv = nbatch // nsteps
    self.pdtype = make_pdtype(ac_space)

    # Only the raw placeholder is consumed below; the processed tensor that
    # observation_input also returns is not used.
    X, _processed = observation_input(ob_space, nbatch)
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states

    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(X)
        feature_seq = batch_to_seq(features, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(feature_seq, mask_seq, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        vf = fc(lstm_out, 'v', 1)
        self.pd, self.pi = self.pdtype.pdfromlatent(lstm_out)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

    def _feed(ob, state, mask):
        # Map runtime arrays onto the three placeholders.
        return {X: ob, S: state, M: mask}

    def step(ob, state, mask):
        """Run one forward pass: actions, values, new state, neglogp."""
        fetches = [a0, v0, snew, neglogp0]
        return sess.run(fetches, _feed(ob, state, mask))

    def value(ob, state, mask):
        """Evaluate only the value head."""
        return sess.run(v0, _feed(ob, state, mask))

    self.X, self.M, self.S = X, M, S
    self.vf = vf
    self.step = step
    self.value = value
def q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma):
    """
    Compute the Q-retrace targets by a backward recursion over the steps.

    :param R: Rewards
    :param D: Dones
    :param q_i: Q values for the actions taken
    :param v: V values; split into ``nsteps + 1`` steps, the last one being
        the bootstrap value
    :param rho_i: Importance weight for each action (truncated at 1.0)
    :param nenvs: number of environments
    :param nsteps: number of steps per environment
    :param gamma: discount factor
    :return: Q_retrace values, as a flat batch
    """
    def per_step(tensor, length=nsteps):
        # Flat batch -> list with one [nenvs] tensor per step.
        return batch_to_seq(tensor, nenvs, length, True)

    rho_bar = per_step(tf.minimum(1.0, rho_i))
    rs = per_step(R)
    ds = per_step(D)
    q_is = per_step(q_i)
    vs = per_step(v, nsteps + 1)

    running = vs[-1]  # bootstrap from the value after the final step
    targets = []
    for t in reversed(range(nsteps)):
        check_shape([running, ds[t], rs[t], rho_bar[t], q_is[t], vs[t]], [[nenvs]] * 6)
        running = rs[t] + gamma * running * (1.0 - ds[t])
        # The target is recorded before the truncated-importance correction
        # that is carried to the previous step.
        targets.append(running)
        running = (rho_bar[t] * (running - q_is[t])) + vs[t]

    targets.reverse()  # collected last-to-first; restore time order
    return seq_to_batch(targets, flat=True)
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
    """
    Build a recurrent policy with a softmax policy head and a Q head.

    :param sess: session used by ``step`` to run the graph
    :param ob_space: observation space; ``shape`` is unpacked as (nh, nw, nc)
    :param ac_space: action space; ``n`` gives the number of actions
    :param nenv: number of environments
    :param nsteps: number of steps per environment
    :param nstack: multiplier applied to the channel dimension of the input
    :param reuse: forwarded to ``tf.variable_scope("model", ...)``
    :param nlstm: LSTM size; the state placeholder is ``[nenv, nlstm*2]``
    """
    nbatch = nenv * nsteps
    height, width, channels = ob_space.shape
    nact = ac_space.n

    X = tf.placeholder(tf.uint8, (nbatch, height, width, channels * nstack))  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states

    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(X)

        # lstm
        feature_seq = batch_to_seq(features, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(feature_seq, mask_seq, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)

        pi_logits = fc(lstm_out, 'pi', nact, init_scale=0.01)
        pi = tf.nn.softmax(pi_logits)
        q = fc(lstm_out, 'q', nact)

    a = sample(pi_logits)  # could change this to use self.pi instead
    self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

    def step(ob, state, mask, *args, **kwargs):
        """Return actions, policy probabilities (mus) and the new state."""
        actions, mus, new_state = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
        return actions, mus, new_state

    self.X, self.M, self.S = X, M, S
    self.pi = pi  # actual policy params now
    self.q = q
    self.step = step
def network_fn(X, nenv=1):
    """
    Build a flatten -> LSTM graph on top of ``X`` (with debug printing).

    ``nlstm`` and ``layer_norm`` are free variables defined outside this
    function.

    :param X: input tensor; its first dimension is used as the batch size
    :param nenv: number of environments the batch is divided between
    :return: ``(h, extras)`` where ``h`` is the recurrent output converted
        back to a batch and ``extras`` holds the ``S``/``M`` placeholders,
        the new state tensor and a zero-filled initial state array
    """
    print("")
    print("IN HERE LSTM and this is X ",str(X))

    batch_size = X.shape[0]
    steps_per_env = batch_size // nenv

    h = tf.layers.flatten(X)

    M = tf.placeholder(tf.float32, [batch_size])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, 2*nlstm])  # states

    xs = batch_to_seq(h, nenv, steps_per_env)
    ms = batch_to_seq(M, nenv, steps_per_env)

    if layer_norm:
        cell, cell_scope = utils.lnlstm, 'lnlstm'
    else:
        cell, cell_scope = utils.lstm, 'lstm'
    h5, snew = cell(xs, ms, S, scope=cell_scope, nh=nlstm)

    h = seq_to_batch(h5)

    # TODO: need to change initialization of state! (Earlier experiments
    # with a learned / task-descriptor-driven initial state are disabled;
    # the state currently starts at zeros.)
    initial_state = np.zeros(S.shape.as_list(), dtype=float)

    print("")
    print("HHHHH ",str(S.shape.as_list()))
    print(nenv)

    extras = {'S': S, 'M': M, 'state': snew, 'initial_state': initial_state}
    return h, extras
def network_fn(X, nenv=1):
    """
    Build a flatten -> LSTM graph on top of ``X``.

    ``nlstm`` and ``layer_norm`` are free variables defined outside this
    function.

    :param X: input tensor; its first dimension is used as the batch size
    :param nenv: number of environments the batch is divided between
    :return: ``(h, extras)`` where ``h`` is the recurrent output converted
        back to a batch and ``extras`` holds the ``S``/``M`` placeholders,
        the new state tensor and a zero-filled initial state array
    """
    batch_size = X.shape[0]
    steps_per_env = batch_size // nenv

    h = tf.layers.flatten(X)

    M = tf.placeholder(tf.float32, [batch_size])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

    xs = batch_to_seq(h, nenv, steps_per_env)
    ms = batch_to_seq(M, nenv, steps_per_env)

    # Layer-normalised cell on request, plain LSTM otherwise.
    if layer_norm:
        cell, cell_scope = utils.lnlstm, 'lnlstm'
    else:
        cell, cell_scope = utils.lstm, 'lstm'
    h5, snew = cell(xs, ms, S, scope=cell_scope, nh=nlstm)

    h = seq_to_batch(h5)
    extras = {
        'S': S,
        'M': M,
        'state': snew,
        'initial_state': np.zeros(S.shape.as_list(), dtype=float),
    }
    return h, extras
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """
    Build a recurrent actor-critic policy: three convs -> dense -> LSTM.

    :param sess: session used by ``step`` and ``value`` to run the graph
    :param ob_space: observation space; ``shape`` is unpacked as (nh, nw, nc)
    :param ac_space: action space; ``n`` gives the size of the policy head
    :param nbatch: total batch size (environments times steps)
    :param nsteps: number of steps per environment in one batch
    :param nlstm: LSTM size; the state placeholder is ``[nenv, nlstm * 2]``
    :param reuse: forwarded to ``tf.variable_scope("model", ...)``
    """
    nenv = nbatch // nsteps
    height, width, channels = ob_space.shape
    nact = ac_space.n
    gain = np.sqrt(2)

    X = tf.placeholder(tf.uint8, (nbatch, height, width, channels))  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    with tf.variable_scope("model", reuse=reuse):
        # Pixels are cast to float and divided by 255 before the conv stack.
        scaled = tf.cast(X, tf.float32) / 255.
        c1 = conv(scaled, 'c1', nf=32, rf=8, stride=4, init_scale=gain)
        c2 = conv(c1, 'c2', nf=64, rf=4, stride=2, init_scale=gain)
        c3 = conv(c2, 'c3', nf=64, rf=3, stride=1, init_scale=gain)
        flat = conv_to_fc(c3)
        dense = fc(flat, 'fc1', nh=512, init_scale=gain)

        dense_seq = batch_to_seq(dense, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(dense_seq, mask_seq, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)

        # Both heads pass an identity function as their activation.
        pi = fc(lstm_out, 'pi', nact, act=lambda x: x)
        vf = fc(lstm_out, 'v', 1, act=lambda x: x)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def _feed(ob, state, mask):
        return {X: ob, S: state, M: mask}

    def step(ob, state, mask):
        """Run one forward pass: actions, values, new state, neglogp."""
        return sess.run([a0, v0, snew, neglogp0], _feed(ob, state, mask))

    def value(ob, state, mask):
        """Evaluate only the value head."""
        return sess.run(v0, _feed(ob, state, mask))

    self.X, self.M, self.S = X, M, S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=64, reuse=False):
    """
    Build a Gaussian policy with a recurrent mean and a feed-forward critic.

    The policy mean goes dense -> LSTM -> dense; the value function is a
    separate two-layer tanh MLP applied directly to the observations.

    :param sess: session used by the exposed callables to run the graph
    :param ob_space: observation space; ``shape`` defines the input shape
    :param ac_space: action space; ``shape[0]`` is the action dimension
    :param nbatch: total batch size (environments times steps)
    :param nsteps: number of steps per environment in one batch
    :param nlstm: LSTM size; the state placeholder is ``[nenv, nlstm*2]``
    :param reuse: forwarded to ``tf.variable_scope("model", ...)``
    """
    nenv = nbatch // nsteps
    print(f'{nlstm}')
    actdim = ac_space.shape[0]
    gain = np.sqrt(2)

    X = tf.placeholder(tf.float32, (nbatch,) + ob_space.shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states

    with tf.variable_scope("model", reuse=reuse):
        activ = tf.tanh

        # Policy branch: dense -> LSTM -> linear mean.
        pi_hidden = activ(fc(X, 'pi_fc1', nh=64, init_scale=gain))
        pi_seq = batch_to_seq(pi_hidden, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(pi_seq, mask_seq, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        pi = fc(lstm_out, 'pi', actdim, init_scale=0.01)
        logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                 initializer=tf.zeros_initializer())

        # Value branch: does not pass through the LSTM.
        vf_hidden = activ(fc(X, 'vf_fc1', nh=64, init_scale=gain))
        vf_hidden = activ(fc(vf_hidden, 'vf_fc2', nh=64, init_scale=gain))
        vf = fc(vf_hidden, 'vf', 1)

    # ``pi * 0.0 + logstd`` broadcasts logstd to the shape of pi.
    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    a0 = self.pd.sample()
    v0 = vf[:, 0]
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

    def _feed(ob, state, mask):
        return {X: ob, S: state, M: mask}

    def step(ob, state, mask):
        """Run one forward pass: actions, values, new state, neglogp."""
        return sess.run([a0, v0, snew, neglogp0], _feed(ob, state, mask))

    def value(ob, state, mask):
        """Evaluate only the value head."""
        return sess.run(v0, _feed(ob, state, mask))

    def get_act(ob, state, mask):
        """Return sampled actions only."""
        return sess.run(a0, _feed(ob, state, mask))

    def get_mean(ob, state, mask):
        """Return the policy mean and the new recurrent state."""
        mean, state_new = sess.run([pi, snew], _feed(ob, state, mask))
        return mean, state_new

    self.X, self.M, self.S = X, M, S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
    self.act = get_act
    self.mean = get_mean
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
    """
    Build a three-tower convolutional front end feeding an LSTM with policy
    and Q heads.

    Reads ``self.lstm_units`` and ``self.depth``, which are expected to be
    available once the parent ``__init__`` has run.

    :param sess: session stored on the instance as ``self.sess``
    :param ob_space: observation space; ``shape`` is unpacked as (nh, nw, nc)
    :param ac_space: action space; ``n`` gives the number of actions
    :param nenv: number of environments
    :param nsteps: number of steps per environment
    :param nstack: multiplier applied to the channel dimension of the input
    :param reuse: forwarded to the variable scopes
    """
    super().__init__(sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=reuse)

    nbatch = nenv * nsteps
    height, width, channels = ob_space.shape
    nact = ac_space.n
    nlstm = self.lstm_units

    X = tf.placeholder(tf.uint8, (nbatch, height, width, channels * nstack))  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states

    def conv3x3(inputs, filters):
        # 3x3, stride-1, SAME-padded convolution shared by all towers.
        return tf.layers.conv2d(inputs=inputs, filters=filters, kernel_size=(3, 3),
                                strides=(1, 1), padding='SAME',
                                kernel_initializer=tf.initializers.variance_scaling)

    with tf.variable_scope("model", reuse=reuse):
        # NOTE(review): X is rebound to the float cast here, so the
        # ``self.X`` stored below is this cast tensor rather than the uint8
        # placeholder -- confirm callers feed it as intended.
        X = tf.cast(X, tf.float32)

        with tf.variable_scope("Towers", reuse=reuse):
            with tf.variable_scope("tower_1"):
                tower1 = conv3x3(X, 64)
                tower1 = conv3x3(tower1, 32)
                tower1 = tf.layers.max_pooling2d(tower1, pool_size=(22, 80), strides=(22, 80))

            with tf.variable_scope("tower_2"):
                tower2 = tf.layers.max_pooling2d(X, pool_size=(2, 2), strides=(2, 2))
                for _ in range(self.depth):
                    tower2 = tf.nn.relu(conv3x3(tower2, 32))
                tower2 = tf.layers.max_pooling2d(tower2, pool_size=(11, 40), strides=(11, 40))

            with tf.variable_scope("tower_3"):
                tower3 = tf.layers.max_pooling2d(X, pool_size=(3, 6), strides=(3, 6), padding='SAME')
                for _ in range(self.depth):
                    tower3 = tf.nn.relu(conv3x3(tower3, 32))
                tower3 = tf.layers.max_pooling2d(tower3, pool_size=(8, 14), strides=(8, 14),
                                                 padding='SAME')

        # Towers are joined along the last (channel) axis.
        concat = tf.concat([tower1, tower2, tower3], axis=-1)

        # lstm
        feature_seq = batch_to_seq(concat, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(feature_seq, mask_seq, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)

        pi_logits = fc(lstm_out, 'pi', nact, init_scale=0.01)
        pi = tf.nn.softmax(pi_logits)
        q = fc(lstm_out, 'q', nact)

    self.a = sample(pi_logits)  # could change this to use self.pi instead
    self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
    self.snew = snew
    self.X, self.M, self.S = X, M, S
    self.pi = pi  # actual policy params now
    self.q = q
    self.sess = sess
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=128, reuse=False):
    """
    Build a recurrent actor-critic policy (three convs -> dense -> LSTM) with
    checkpoint save/load helpers.

    :param sess: session used by ``step``, ``value``, ``save`` and ``load``
    :param ob_space: observation space; ``shape`` is unpacked as (nh, nw, nc)
    :param ac_space: action space; ``n`` gives the size of the policy head
    :param nenv: number of environments
    :param nsteps: number of steps per environment
    :param nstack: multiplier applied to the channel dimension of the input
    :param nlstm: LSTM size; the state placeholder is ``[nenv, nlstm*2]``
    :param reuse: forwarded to ``tf.variable_scope``
    """
    scope = "model"
    nbatch = nenv*nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc*nstack)
    nact = ac_space.n
    X = tf.placeholder(tf.uint8, ob_shape, name="observations")  # obs
    M = tf.placeholder(tf.float32, [nbatch], name="mask")  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm*2], name="states")  # states
    with tf.variable_scope(scope, reuse=reuse):
        # Pixels are cast to float and divided by 255 before the conv stack.
        h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
        h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
        h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
        h3 = conv_to_fc(h3)
        h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
        xs = batch_to_seq(h4, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        pi = fc(h5, 'pi', nact, act=lambda x:x)
        vf = fc(h5, 'v', 1, act=lambda x:x)

    # The saver only tracks the trainable variables created under ``scope``.
    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
    self._saver = tf.train.Saver(trainable_vars)

    v0 = vf[:, 0]
    a0 = sample(pi)
    self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

    def step(ob, state, mask):
        """Return sampled actions, values and the new recurrent state."""
        a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
        return a, v, s

    def value(ob, state, mask):
        """Evaluate only the value head."""
        return sess.run(v0, {X:ob, S:state, M:mask})

    def save(path, name):
        """Write a checkpoint to ``path + name``, creating ``path`` if needed."""
        os.makedirs(path, exist_ok=True)
        self._saver.save(sess, path+name)

    def load(path, name):
        """Restore from ``path + name`` if its index file exists; warn otherwise."""
        if os.path.exists(path+name+'.index'):
            self._saver.restore(sess, path+name)
        else:
            tf.logging.warn('Failed restoring vars from %s' % path)

    self.X = X
    # Expose the mask and state placeholders as well: without them a caller
    # cannot feed the recurrent inputs of this model from outside.
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
    self.save = save
    self.load = load
def network_fn(X, nenv=1):
    """
    Build an input MLP -> LSTM -> output MLP graph on top of ``X``.

    ``num_layers_in``, ``num_hidden_in``, ``layer_norm_in``,
    ``num_layers_out``, ``num_hidden_out``, ``layer_norm_out``,
    ``layer_norm_lstm``, ``nlstm`` and ``activation`` are free variables
    defined outside this function.

    :param X: input tensor; its first dimension is used as the batch size
    :param nenv: number of environments the batch is divided between
    :return: ``(h, extras)`` where ``h`` is the output of the last stage and
        ``extras`` holds the ``S``/``M`` placeholders, the new state tensor
        and a zero-filled initial state array
    """
    nbatch = X.shape[0]
    nsteps = nbatch // nenv

    h = X
    with tf.variable_scope('mlp_in', reuse=tf.AUTO_REUSE):
        for i in range(num_layers_in):
            h = fc(h, 'mlp_in_fc{}'.format(i), nh=num_hidden_in, init_scale=np.sqrt(2))
            if layer_norm_in:
                h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)

    # Flatten the output of the input MLP. Flattening ``X`` here instead
    # would throw the 'mlp_in' layers away and feed raw observations to the
    # LSTM. With zero input layers ``h`` is still ``X``, so that case is
    # unchanged.
    h = tf.layers.flatten(h)

    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

    xs = batch_to_seq(h, nenv, nsteps)
    ms = batch_to_seq(M, nenv, nsteps)

    if layer_norm_lstm:
        h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
    else:
        h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

    h = seq_to_batch(h5)

    with tf.variable_scope('mlp_out', reuse=tf.AUTO_REUSE):
        for i in range(num_layers_out):
            h = fc(h, 'mlp_out_fc{}'.format(i), nh=num_hidden_out, init_scale=np.sqrt(2))
            if layer_norm_out:
                h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)

    initial_state = np.zeros(S.shape.as_list(), dtype=float)

    return h, {'S': S, 'M': M, 'state': snew, 'initial_state': initial_state}
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
    """
    Build a recurrent policy (three convs -> dense -> LSTM) with a softmax
    policy head and a Q head.

    :param sess: session used by ``step`` to run the graph
    :param ob_space: observation space; ``shape`` is unpacked as (nh, nw, nc)
    :param ac_space: action space; ``n`` gives the number of actions
    :param nenv: number of environments
    :param nsteps: number of steps per environment
    :param nstack: multiplier applied to the channel dimension of the input
    :param reuse: forwarded to ``tf.variable_scope("model", ...)``
    :param nlstm: LSTM size; the state placeholder is ``[nenv, nlstm * 2]``
    """
    nbatch = nenv * nsteps
    height, width, channels = ob_space.shape
    nact = ac_space.n
    gain = np.sqrt(2)

    X = tf.placeholder(tf.uint8, (nbatch, height, width, channels * nstack))  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    with tf.variable_scope("model", reuse=reuse):
        # Pixels are cast to float and divided by 255 before the conv stack.
        scaled = tf.cast(X, tf.float32) / 255.
        c1 = conv(scaled, 'c1', nf=32, rf=8, stride=4, init_scale=gain)
        c2 = conv(c1, 'c2', nf=64, rf=4, stride=2, init_scale=gain)
        c3 = conv(c2, 'c3', nf=64, rf=3, stride=1, init_scale=gain)
        flat = conv_to_fc(c3)
        dense = fc(flat, 'fc1', nh=512, init_scale=gain)

        # lstm
        dense_seq = batch_to_seq(dense, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(dense_seq, mask_seq, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)

        pi_logits = fc(lstm_out, 'pi', nact, act=lambda x: x, init_scale=0.01)
        pi = tf.nn.softmax(pi_logits)
        q = fc(lstm_out, 'q', nact, act=lambda x: x)

    a = sample(pi_logits)  # could change this to use self.pi instead
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask, *args, **kwargs):
        """Return actions, policy probabilities (mus) and the new state."""
        actions, mus, new_state = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
        return actions, mus, new_state

    self.X, self.M, self.S = X, M, S
    self.pi = pi  # actual policy params now
    self.q = q
    self.step = step

# For Mujoco. Taken from PPOSGD
def strip(var, n_envs, n_steps, flat=False):
    """
    Drop the final step from a batch that holds ``n_steps + 1`` steps.

    :param var: (TensorFlow Tensor) The input Tensor
    :param n_envs: (int) The number of environments
    :param n_steps: (int) The number of steps to keep for each environment
    :param flat: (bool) If the input Tensor is flat
    :return: (TensorFlow Tensor) the input tensor, without the last step in the batch
    """
    # Split into n_steps + 1 per-step tensors, then rebuild from all but the last.
    step_tensors = batch_to_seq(var, n_envs, n_steps + 1, flat)
    kept = step_tensors[:-1]
    return seq_to_batch(kept, flat)
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, create_additional=True, nlstm=256):
    """
    Build a recurrent policy whose value head and neglogp op are optional.

    :param sess: session used by ``step`` and ``value`` to run the graph
    :param ob_space: observation space handed to ``observation_input``
    :param ac_space: action space handed to ``make_pdtype``
    :param nbatch: total batch size (environments times steps)
    :param nsteps: number of steps per environment in one batch
    :param create_additional: when true, also build the value head and the
        neglogp op and expose ``vf`` / ``value``; when false ``step``
        returns zero arrays in their place
    :param nlstm: LSTM size; the state placeholder is ``[nenv, nlstm * 2]``
    """
    nenv = nbatch // nsteps
    self.pdtype = make_pdtype(ac_space)

    X, processed_x = observation_input(ob_space, nbatch)
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        features, self.dropout_assign_ops = choose_cnn(processed_x)
        feature_seq = batch_to_seq(features, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(feature_seq, mask_seq, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        if create_additional:
            vf = fc(lstm_out, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(lstm_out)

    a0 = self.pd.sample()
    if create_additional:
        neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def _feed(ob, state, mask):
        return {X: ob, S: state, M: mask}

    def step(ob, state, mask):
        """Return actions, values, new state and neglogp for one forward pass."""
        if not create_additional:
            # No value head / neglogp op exists: substitute zeros shaped like
            # the sampled actions.
            a, s = sess.run([a0, snew], _feed(ob, state, mask))
            return a, np.zeros_like(a), s, np.zeros_like(a)
        a, v, s, neglogp = sess.run([a0, vf, snew, neglogp0], _feed(ob, state, mask))
        return a, v, s, neglogp

    def value(ob, state, mask):
        """Evaluate only the value head (exposed only with create_additional)."""
        return sess.run(vf, _feed(ob, state, mask))

    self.X, self.M, self.S = X, M, S
    if create_additional:
        self.vf = vf
        self.value = value
    self.step = step
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, size_mem=256, reuse=False):
    """
    Build a recurrent actor-critic policy from the instance's own
    ``preprocess`` and ``memory_fn`` hooks.

    :param sess: session used by ``step`` and ``value`` to run the graph
    :param ob_space: observation space; ``shape`` defines the input shape
    :param ac_space: action space; ``n`` gives the size of the policy head
    :param nbatch: total batch size (environments times steps)
    :param nsteps: number of steps per environment in one batch
    :param size_mem: memory size passed to ``memory_fn`` as ``nh``; the
        state placeholder is ``[nenv, size_mem * 2]``
    :param reuse: forwarded to ``tf.variable_scope("model", ...)``
    """
    nenv = nbatch // nsteps
    # The observation shape is taken as-is rather than unpacked as an image.
    ob_shape = (nbatch, ) + ob_space.shape
    nact = ac_space.n

    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, size_mem * 2])  # states

    with tf.variable_scope("model", reuse=reuse):
        features = self.preprocess(X)
        features = fc(features, 'fc1', nh=512, init_scale=np.sqrt(2))

        feature_seq = batch_to_seq(features, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        mem_out, snew = self.memory_fn(feature_seq, mask_seq, S, nh=size_mem)
        mem_out = seq_to_batch(mem_out)

        # Both heads pass an identity function as their activation.
        pi = fc(mem_out, 'pi', nact, act=lambda x: x)
        vf = fc(mem_out, 'v', 1, act=lambda x: x)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, size_mem * 2), dtype=np.float32)

    def _feed(ob, state, mask):
        return {X: ob, S: state, M: mask}

    def step(ob, state, mask):
        """Run one forward pass: actions, values, new state, neglogp."""
        return sess.run([a0, v0, snew, neglogp0], _feed(ob, state, mask))

    def value(ob, state, mask):
        """Evaluate only the value head."""
        return sess.run(v0, _feed(ob, state, mask))

    self.X, self.M, self.S = X, M, S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False, param=None):
    """
    Build a recurrent actor-critic policy (``nature_cnn`` -> LSTM).

    :param sess: session used by ``step`` and ``value`` to run the graph
    :param ob_space: observation space; ``shape`` is unpacked as (nh, nw, nc)
    :param ac_space: action space; ``n`` gives the size of the policy head
    :param nbatch: total batch size (environments times steps)
    :param nsteps: number of steps per environment in one batch
    :param nlstm: LSTM size; the state placeholder is ``[nenv, nlstm * 2]``
    :param reuse: forwarded to ``tf.variable_scope("model", ...)``
    :param param: accepted but not used by this constructor
    """
    nenv = nbatch // nsteps
    height, width, channels = ob_space.shape
    nact = ac_space.n

    X = tf.placeholder(tf.uint8, (nbatch, height, width, channels))  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(X)
        feature_seq = batch_to_seq(features, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(feature_seq, mask_seq, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        pi = fc(lstm_out, 'pi', nact)
        vf = fc(lstm_out, 'v', 1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def _feed(ob, state, mask):
        return {X: ob, S: state, M: mask}

    def step(ob, state, mask):
        """Run one forward pass: actions, values, new state, neglogp."""
        return sess.run([a0, v0, snew, neglogp0], _feed(ob, state, mask))

    def value(ob, state, mask):
        """Evaluate only the value head."""
        return sess.run(v0, _feed(ob, state, mask))

    self.X, self.M, self.S = X, M, S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=8, reuse=False):
    """
    Build a small recurrent actor-critic policy (dense -> LSTM) for
    non-image observations.

    Assumes ob_space and ac_space are flattened, e.g. an original
    action_space of (3,2,3) becomes a new action_space of (36).

    :param sess: session used by ``step`` and ``value`` to run the graph
    :param ob_space: observation space; ``shape`` defines the input shape
    :param ac_space: action space handed to ``make_pdtype``
    :param nbatch: total batch size (environments times steps)
    :param nsteps: number of steps per environment in one batch
    :param nlstm: LSTM size; the state placeholder is ``[nenv, nlstm*2]``
    :param reuse: forwarded to ``tf.variable_scope("model", ...)``
    """
    nenv = nbatch // nsteps
    print ("envs and steps and batch:", nenv, nsteps, nbatch)

    pdtype = make_pdtype(ac_space)

    X = tf.placeholder(tf.float32, (nbatch,) + ob_space.shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states

    with tf.variable_scope("model", reuse=reuse):
        # A convolutional front end (c1/c2/c3 + 512-unit fc1) used to sit
        # here; observations now go through a single 16-unit dense layer.
        dense = fc(X, 'fc1', nh=16, init_scale=np.sqrt(2))
        dense_seq = batch_to_seq(dense, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(dense_seq, mask_seq, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)

        # The distribution parameters come straight out of the 'pi' layer;
        # its width is the first entry of the pdtype's parameter shape.
        pdparam = fc(lstm_out, 'pi', pdtype.param_shape()[0], act=lambda x:x)
        vf = fc(lstm_out, 'v', 1, act=lambda x:x)

    self.pdtype = pdtype
    self.pd = self.pdtype.pdfromflat(pdparam)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

    def _feed(ob, state, mask):
        return {X: ob, S: state, M: mask}

    def step(ob, state, mask):
        """Run one forward pass: actions, values, new state, neglogp."""
        return sess.run([a0, v0, snew, neglogp0], _feed(ob, state, mask))

    def value(ob, state, mask):
        """Evaluate only the value head."""
        return sess.run(v0, _feed(ob, state, mask))

    self.X, self.M, self.S = X, M, S
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=64, reuse=False):
    """
    Build a policy whose heads see the ``lnlstm`` output concatenated with
    the raw observations.

    :param sess: session used by ``step`` and ``value`` to run the graph
    :param ob_space: observation space; ``shape`` is passed to
        ``add_batch_dimension``
    :param ac_space: action space; ``n`` sizes the policy head and logstd
    :param nbatch: total batch size (environments times steps)
    :param nsteps: number of steps per environment in one batch
    :param nlstm: LSTM size; the state placeholder is ``[nenv, nlstm*2]``
    :param reuse: forwarded to ``tf.variable_scope("model", ...)``
    """
    nenv = nbatch // nsteps
    ob_shape = add_batch_dimension(ob_space.shape, nbatch)
    nact = ac_space.n
    gain = np.sqrt(2)

    X = tf.placeholder(tf.float32, ob_shape, name="X")  # obs
    M = tf.placeholder(tf.float32, [nbatch], name="M")  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm*2], name="S")  # states

    with tf.variable_scope("model", reuse=reuse):
        obs_seq = batch_to_seq(X, nenv, nsteps)  # Observation sequences
        done_seq = batch_to_seq(M, nenv, nsteps)  # Done sequences
        memory, snew = lnlstm(obs_seq, done_seq, S, 'lstm1', nh=nlstm)
        memory = seq_to_batch(memory)
        # Skip connection: heads receive both the memory and the raw input.
        joint = tf.concat([memory, X], 1)

        # Policy
        pi_hidden = fc(joint, 'pi_fc1', nh=128, init_scale=gain, act=tf.nn.relu)
        pi = fc(pi_hidden, 'pi', nact, act=tf.tanh, init_scale=0.01)

        # Value function
        vf_hidden = fc(joint, 'vf_fc1', nh=128, init_scale=gain, act=tf.nn.relu)
        vf = fc(vf_hidden, 'vf', 1, act=lambda x:x)

        # Current policy variance
        logstd = tf.get_variable(name="logstd", shape=[1, nact],
                                 initializer=tf.zeros_initializer())

    # ``pi * 0.0 + logstd`` broadcasts logstd to the shape of pi.
    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

    def _feed(ob, state, mask):
        return {X: ob, S: state, M: mask}

    def step(ob, state, mask):
        """Run one forward pass: actions, values, new state, neglogp."""
        return sess.run([a0, v0, snew, neglogp0], _feed(ob, state, mask))

    def value(ob, state, mask):
        """Evaluate only the value head."""
        return sess.run(v0, _feed(ob, state, mask))

    self.X, self.M, self.S = X, M, S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
    """
    Build a recurrent actor-critic policy: three convs -> dense -> ``lnlstm``.

    :param sess: session used by ``step`` and ``value`` to run the graph
    :param ob_space: observation space; ``shape`` is unpacked as (nh, nw, nc)
    :param ac_space: action space; ``n`` gives the size of the policy head
    :param nenv: number of environments
    :param nsteps: number of steps per environment
    :param nstack: multiplier applied to the channel dimension of the input
    :param nlstm: LSTM size; the state placeholder is ``[nenv, nlstm*2]``
    :param reuse: forwarded to ``tf.variable_scope("model", ...)``
    """
    nbatch = nenv*nsteps
    height, width, channels = ob_space.shape
    nact = ac_space.n
    gain = np.sqrt(2)

    X = tf.placeholder(tf.uint8, (nbatch, height, width, channels*nstack))  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states

    with tf.variable_scope("model", reuse=reuse):
        # Pixels are cast to float and divided by 255 before the conv stack.
        scaled = tf.cast(X, tf.float32)/255.
        c1 = conv(scaled, 'c1', nf=32, rf=8, stride=4, init_scale=gain)
        c2 = conv(c1, 'c2', nf=64, rf=4, stride=2, init_scale=gain)
        c3 = conv(c2, 'c3', nf=64, rf=3, stride=1, init_scale=gain)
        flat = conv_to_fc(c3)
        dense = fc(flat, 'fc1', nh=512, init_scale=gain)

        dense_seq = batch_to_seq(dense, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lnlstm(dense_seq, mask_seq, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)

        pi = fc(lstm_out, 'pi', nact, act=lambda x:x)
        vf = fc(lstm_out, 'v', 1, act=lambda x:x)

    v0 = vf[:, 0]
    a0 = sample(pi)
    self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

    def _feed(ob, state, mask):
        return {X: ob, S: state, M: mask}

    def step(ob, state, mask):
        """Return sampled actions, values and the new recurrent state."""
        a, v, s = sess.run([a0, v0, snew], _feed(ob, state, mask))
        return a, v, s

    def value(ob, state, mask):
        """Evaluate only the value head."""
        return sess.run(v0, _feed(ob, state, mask))

    self.X, self.M, self.S = X, M, S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
    """
    Build a recurrent actor-critic policy: three convs -> dense -> ``lnlstm``.

    :param sess: session used by ``step`` and ``value`` to run the graph
    :param ob_space: observation space; ``shape`` is unpacked as (nh, nw, nc)
    :param ac_space: action space; ``n`` gives the size of the policy head
    :param nenv: number of environments
    :param nsteps: number of steps per environment
    :param nstack: multiplier applied to the channel dimension of the input
    :param nlstm: LSTM size; the state placeholder is ``[nenv, nlstm*2]``
    :param reuse: forwarded to ``tf.variable_scope("model", ...)``
    """
    nbatch = nenv*nsteps
    height, width, channels = ob_space.shape
    nact = ac_space.n
    gain = np.sqrt(2)

    X = tf.placeholder(tf.uint8, (nbatch, height, width, channels*nstack))  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states

    with tf.variable_scope("model", reuse=reuse):
        scaled = tf.cast(X, tf.float32)/255.
        c1 = conv(scaled, 'c1', nf=32, rf=8, stride=4, init_scale=gain)
        c2 = conv(c1, 'c2', nf=64, rf=4, stride=2, init_scale=gain)
        c3 = conv(c2, 'c3', nf=64, rf=3, stride=1, init_scale=gain)
        flat = conv_to_fc(c3)
        dense = fc(flat, 'fc1', nh=512, init_scale=gain)

        # Author's shape notes (Fei), kept as written:
        #   the step sequence is a list of nsteps items, each nenv * nh;
        #   the mask sequence is a list of nsteps items, each an nenv vector;
        #   the recurrent output has the same dimensions as its input, and
        #   snew is the new S.
        dense_seq = batch_to_seq(dense, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lnlstm(dense_seq, mask_seq, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)  # back to nbatch * nh, like the dense output

        pi = fc(lstm_out, 'pi', nact, act=lambda x:x)  # nbatch * nact
        vf = fc(lstm_out, 'v', 1, act=lambda x:x)  # nbatch * 1

    v0 = vf[:, 0]  # nbatch vector: one value estimate per state
    # nbatch vector of actions produced by ``sample`` from ``pi``.
    # NOTE(review): the original note called this the "best choice" of
    # action; whether ``sample`` is greedy or stochastic is not visible
    # here -- confirm against its definition.
    a0 = sample(pi)
    self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

    def _feed(ob, state, mask):
        return {X: ob, S: state, M: mask}

    def step(ob, state, mask):
        """Return actions, values and the new recurrent state."""
        a, v, s = sess.run([a0, v0, snew], _feed(ob, state, mask))
        return a, v, s

    def value(ob, state, mask):
        """Evaluate only the value head."""
        return sess.run(v0, _feed(ob, state, mask))

    self.X, self.M, self.S = X, M, S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """
    Build a recurrent actor-critic policy (``nature_cnn`` -> ``lnlstm``).

    :param sess: session used by ``step`` and ``value`` to run the graph
    :param ob_space: observation space; ``shape`` is unpacked as (nh, nw, nc)
    :param ac_space: action space; ``n`` gives the size of the policy head
    :param nbatch: total batch size (environments times steps)
    :param nsteps: number of steps per environment in one batch
    :param nlstm: LSTM size; the state placeholder is ``[nenv, nlstm*2]``
    :param reuse: forwarded to ``tf.variable_scope("model", ...)``
    """
    nenv = nbatch // nsteps
    height, width, channels = ob_space.shape
    nact = ac_space.n

    X = tf.placeholder(tf.uint8, (nbatch, height, width, channels))  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states

    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(X)
        feature_seq = batch_to_seq(features, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lnlstm(feature_seq, mask_seq, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        pi = fc(lstm_out, 'pi', nact)
        vf = fc(lstm_out, 'v', 1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

    def _feed(ob, state, mask):
        return {X: ob, S: state, M: mask}

    def step(ob, state, mask):
        """Run one forward pass: actions, values, new state, neglogp."""
        return sess.run([a0, v0, snew, neglogp0], _feed(ob, state, mask))

    def value(ob, state, mask):
        """Evaluate only the value head."""
        return sess.run(v0, _feed(ob, state, mask))

    self.X, self.M, self.S = X, M, S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def network_fn(X, nenv=1):
    """
    Build a two-stream ``vggm1234`` encoder followed by an LSTM and a
    4-unit tanh layer.

    ``nlstm`` and ``layer_norm`` are free variables defined outside this
    function.

    :param X: input tensor; split into two halves along axis 1, each then
        squeezed on that axis
    :param nenv: number of environments the batch is divided between
    :return: ``((feat, h), extras)`` where ``feat`` is the concatenated
        encoder output, ``h`` is the output of the final dense layer and
        ``extras`` holds the ``S``/``M`` placeholders, the new state tensor
        and a zero-filled initial state array
    """
    batch_size = X.shape[0]
    steps_per_env = batch_size // nenv

    # Separate the two observation streams and shift each by 128.
    stream_g, stream_l = tf.split(X, 2, axis=1)
    stream_g = tf.squeeze(stream_g, axis=1) - 128.0
    stream_l = tf.squeeze(stream_l, axis=1) - 128.0

    # Conv layer
    net_g = vggm1234(stream_g)
    net_l = vggm1234(stream_l)
    feat = tf.concat([net_g, net_l], 1)

    # LSTM
    M = tf.placeholder(tf.float32, [batch_size])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

    xs = batch_to_seq(feat, nenv, steps_per_env)
    ms = batch_to_seq(M, nenv, steps_per_env)

    if layer_norm:
        cell, cell_scope = utils.lnlstm, 'lnlstm'
    else:
        cell, cell_scope = utils.lstm, 'lstm'
    h5, snew = cell(xs, ms, S, scope=cell_scope, nh=nlstm)

    h = seq_to_batch(h5)
    initial_state = np.zeros(S.shape.as_list(), dtype=float)

    # FC
    h = slim.fully_connected(h, 4, scope='fc', activation_fn=tf.nn.tanh)

    extras = {'S': S, 'M': M, 'state': snew, 'initial_state': initial_state}
    return (feat, h), extras
def q_retrace(rewards, dones, q_i, values, rho_i, n_envs, n_steps, gamma):
    """
    Compute the target Q-retrace by a backward recursion over the steps.

    :param rewards: ([TensorFlow Tensor]) The rewards
    :param dones: ([TensorFlow Tensor]) The done flags
    :param q_i: ([TensorFlow Tensor]) The Q values for actions taken
    :param values: ([TensorFlow Tensor]) The output of the value functions;
        split into ``n_steps + 1`` steps, the last being the bootstrap value
    :param rho_i: ([TensorFlow Tensor]) The importance weight for each action
        (truncated at 1.0)
    :param n_envs: (int) The number of environments
    :param n_steps: (int) The number of steps to run for each environment
    :param gamma: (float) The discount value
    :return: ([TensorFlow Tensor]) the target Q-retrace
    """
    def per_step(tensor, length=n_steps):
        # Flat batch -> list with one [n_envs] tensor per step.
        return batch_to_seq(tensor, n_envs, length, True)

    rho_bar = per_step(tf.minimum(1.0, rho_i))
    reward_seq = per_step(rewards)
    done_seq = per_step(dones)
    q_is = per_step(q_i)
    value_sequence = per_step(values, n_steps + 1)

    running = value_sequence[-1]  # bootstrap from the value after the last step
    targets = []
    for step in reversed(range(n_steps)):
        check_shape(
            [running, done_seq[step], reward_seq[step], rho_bar[step], q_is[step],
             value_sequence[step]],
            [[n_envs]] * 6)
        running = reward_seq[step] + gamma * running * (1.0 - done_seq[step])
        # Record the target before applying the truncated-importance
        # correction that is carried to the previous step.
        targets.append(running)
        running = (rho_bar[step] * (running - q_is[step])) + value_sequence[step]

    targets.reverse()  # collected last-to-first; restore time order
    return seq_to_batch(targets, flat=True)
def network_fn(X, nenv=1):
    """
    Build an MLP followed by an LSTM, with all sizes taken from ``hiddens``.

    Every entry of ``hiddens`` except the last is the width of one dense
    layer; the last entry is the LSTM size. ``hiddens``, ``layer_norm`` and
    ``activation`` are free variables defined outside this function.

    :param X: input tensor; its first dimension is used as the batch size
    :param nenv: number of environments the batch is divided between
    :return: ``(h, extras)`` where ``h`` is the recurrent output converted
        back to a batch and ``extras`` holds the ``S``/``M`` placeholders,
        the new state tensor and a zero-filled initial state array
    """
    batch_size = X.shape[0]
    steps_per_env = batch_size // nenv

    h = tf.layers.flatten(X)
    for idx, width in enumerate(hiddens[:-1]):
        h = utils.fc(h, 'mlp_fc{}'.format(idx), nh=width, init_scale=np.sqrt(2))
        if layer_norm:
            h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
        h = activation(h)

    nlstm = hiddens[-1]

    M = tf.placeholder(tf.float32, [batch_size])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

    xs = utils.batch_to_seq(h, nenv, steps_per_env)
    ms = utils.batch_to_seq(M, nenv, steps_per_env)

    # The same flag controls layer norm in the MLP and in the recurrent cell.
    if layer_norm:
        cell, cell_scope = utils.lnlstm, 'lnlstm'
    else:
        cell, cell_scope = utils.lstm, 'lstm'
    h5, snew = cell(xs, ms, S, scope=cell_scope, nh=nlstm)

    h = utils.seq_to_batch(h5)
    extras = {
        'S': S,
        'M': M,
        'state': snew,
        'initial_state': np.zeros(S.shape.as_list(), dtype=float),
    }
    return h, extras
def strip(var, nenvs, nsteps, flat=False):
    """
    Drop the final step from a batch that holds ``nsteps + 1`` steps.

    :param var: tensor to trim
    :param nenvs: number of environments
    :param nsteps: number of steps to keep for each environment
    :param flat: forwarded to ``batch_to_seq`` and ``seq_to_batch``
    :return: the batch rebuilt from all steps except the last
    """
    step_tensors = batch_to_seq(var, nenvs, nsteps + 1, flat)
    kept = step_tensors[:-1]
    return seq_to_batch(kept, flat)
def strip(var, nenvs, nsteps, flat=False):
    """
    Round-trip ``var`` through ``batch_to_seq`` and ``seq_to_batch``.

    :param var: tensor to convert
    :param nenvs: number of environments
    :param nsteps: number of steps for each environment
    :param flat: forwarded to ``batch_to_seq`` and ``seq_to_batch``
    :return: the batch rebuilt from every step of the sequence
    """
    # NOTE(review): despite the name, no step is dropped here -- the input
    # is split into exactly nsteps steps and all of them are kept. Confirm
    # that callers pass nsteps (not nsteps + 1) worth of data.
    step_tensors = batch_to_seq(var, nenvs, nsteps, flat)
    return seq_to_batch(step_tensors, flat)
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False, feature_mlp=True):
    """
    Build a recurrent actor-critic policy for discrete or continuous actions.

    The action space counts as discrete when ``ac_space.shape`` is empty.
    In the continuous case a state-independent ``logstd`` variable is added
    and concatenated with the mean.

    :param sess: session used by ``step`` and ``value`` to run the graph
    :param ob_space: observation space; ``shape`` defines the input shape
    :param ac_space: action space (``n`` or ``shape[0]`` sizes the head)
    :param nbatch: total batch size (environments times steps)
    :param nsteps: number of steps per environment in one batch
    :param nlstm: LSTM size, also the width of the optional feature layers;
        the state placeholder is ``[nenv, nlstm * 2]``
    :param reuse: forwarded to ``tf.variable_scope("model", ...)``
    :param feature_mlp: when true, two tanh dense layers precede the LSTM;
        otherwise the observations feed the LSTM directly
    """
    nenv = nbatch // nsteps
    discrete = len(ac_space.shape) == 0
    # Width of the 'pi' layer: number of actions or action dimension.
    head_size = ac_space.n if discrete else ac_space.shape[0]
    gain = np.sqrt(2)

    X = tf.placeholder(tf.float32, (nbatch, ) + ob_space.shape, name="Ob")
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    with tf.variable_scope("model", reuse=reuse):
        activ = tf.tanh
        if feature_mlp:
            print("Using feature network in front of LSTM")
            hidden = activ(fc(X, "fc1", nh=nlstm, init_scale=gain))
            hidden = activ(fc(hidden, "fc2", nh=nlstm, init_scale=gain))
            lstm_in = batch_to_seq(hidden, nenv, nsteps)
        else:
            print("No feature network in front of LSTM")
            lstm_in = batch_to_seq(X, nenv, nsteps)

        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(lstm_in, mask_seq, S, "lstm1", nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)

        vf = fc(lstm_out, "vf", 1)
        pi = fc(lstm_out, "pi", head_size, init_scale=0.01)
        if not discrete:
            logstd = tf.get_variable(name="logstd", shape=[1, head_size],
                                     initializer=tf.zeros_initializer())

    self.pdtype = make_pdtype(ac_space)
    if discrete:
        pdparam = pi
    else:
        # ``pi * 0.0 + logstd`` broadcasts logstd to the shape of pi.
        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
    self.pd = self.pdtype.pdfromflat(pdparam)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def _feed(ob, state, mask):
        return {X: ob, S: state, M: mask}

    def step(ob, state, mask):
        """Run one forward pass: actions, values, new state, neglogp."""
        return sess.run([a0, v0, snew, neglogp0], _feed(ob, state, mask))

    def value(ob, state, mask):
        """Evaluate only the value head."""
        return sess.run(v0, _feed(ob, state, mask))

    self.X, self.M, self.S = X, M, S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, size_mem=256, reuse=False):  # pylint: disable=W0613
    """
    Build a Gaussian policy whose mean and value both read the LSTM output.

    :param sess: session used by ``step`` and ``value`` to run the graph
    :param ob_space: observation space; ``shape`` defines the input shape
    :param ac_space: action space; the action dimension is 1 when its
        ``shape`` is ``()`` and ``shape[0]`` otherwise
    :param nbatch: total batch size (environments times steps)
    :param nsteps: number of steps per environment in one batch
    :param size_mem: LSTM size; the state placeholder is
        ``[nenv, size_mem * 2]``
    :param reuse: forwarded to ``tf.variable_scope("model", ...)``
    """
    ob_shape = (nbatch, ) + ob_space.shape
    actdim = 1 if ac_space.shape == () else ac_space.shape[0]
    gain = np.sqrt(2)

    X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # obs
    nenv = nbatch // nsteps
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, size_mem * 2])  # states

    with tf.variable_scope("model", reuse=reuse):
        # The policy's former dense front end (pi_fc1 / pi_fc2) is disabled;
        # observations go to the LSTM directly.
        lstm_in = tf.cast(X, tf.float32)
        obs_seq = batch_to_seq(lstm_in, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(obs_seq, mask_seq, S, 'lstm', nh=size_mem)
        lstm_out = seq_to_batch(lstm_out)
        pi = fc(lstm_out, 'pi', actdim, act=lambda x: x, init_scale=0.01)

        # NOTE(review): these two layers are created but their output is
        # never consumed -- the value head below reads the LSTM output.
        # Kept so the set of variables is unchanged; confirm the intent.
        vf_hidden = fc(X, 'vf_fc1', nh=64, init_scale=gain, act=tf.tanh)
        vf_hidden = fc(vf_hidden, 'vf_fc2', nh=64, init_scale=gain, act=tf.tanh)
        vf = fc(lstm_out, 'vf', 1, act=lambda x: x)[:, 0]

        logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                 initializer=tf.zeros_initializer())

    # ``pi * 0.0 + logstd`` broadcasts logstd to the shape of pi.
    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, size_mem * 2), dtype=np.float32)

    def _feed(ob, state, mask):
        return {X: ob, S: state, M: mask}

    def step(ob, state, mask):
        """Run one forward pass: actions, values, new state, neglogp."""
        return sess.run([a0, vf, snew, neglogp0], _feed(ob, state, mask))

    def value(ob, state, mask):
        """Evaluate only the value head."""
        return sess.run(vf, _feed(ob, state, mask))

    self.X, self.M, self.S = X, M, S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, add_flownet, reuse=False, flownet=None,
             train_from_scratch=False, recurrent=None, large_cnn=False, nlstm=64,
             add_predicted_flow_to_vec=False, diff_frames=False):
    """Build a Gaussian policy over an image observation plus a vector observation.

    The image is encoded by ``mujoco_cnn`` and the vector by one tanh layer;
    both encodings are concatenated and passed either through an LSTM /
    layer-norm LSTM (``recurrent``) or through one more tanh layer.

    :param sess: session object; its ``run`` method is used by ``step`` and ``value``
    :param ob_space: mapping with ``"vector"`` and ``"image"`` entries, each
        exposing ``.shape`` (the image shape must unpack into three values)
    :param ac_space: action space; ``.shape[0]`` is the action dimension
    :param nbatch: leading dimension of every placeholder
    :param nsteps: sequence length (only used when ``recurrent`` is set)
    :param add_flownet: when true, a previous-image placeholder is created and
        fed from ``ob["last_image"]``
    :param reuse: forwarded to the ``"model"`` variable scope
    :param flownet, train_from_scratch, large_cnn, diff_frames: forwarded
        unchanged to ``mujoco_cnn`` / ``get_flow_vec``
    :param recurrent: falsy for a feed-forward policy, ``'lstm'`` or
        ``'lnlstm'`` for a recurrent one
    :param nlstm: recurrent layer width
    :param add_predicted_flow_to_vec: when true, the output of
        ``get_flow_vec`` is concatenated to the vector observation instead of
        enabling the flownet path inside ``mujoco_cnn``

    ``self.M`` / ``self.S`` are the mask/state placeholders for recurrent
    policies and ``None`` otherwise (mirroring ``self.initial_state``).
    ``step(..., remove_noise=True)`` returns the distribution mode and the
    negative log-probability of that mode.
    """
    ob_shape_vec = (nbatch,) + ob_space["vector"].shape
    nh, nw, nc = ob_space["image"].shape
    ob_shape_im = (nbatch, nh, nw, nc)
    actdim = ac_space.shape[0]

    X_vec = tf.placeholder(tf.float32, ob_shape_vec, name='Ob_vec')  # obs
    X_im = tf.placeholder(tf.uint8, ob_shape_im, name='Ob_im')
    # Previous image (obs t-1) is only needed when the flownet is in use.
    X_p = tf.placeholder(tf.uint8, ob_shape_im, name='Ob_p') if add_flownet else None

    # Defined up front so they are always bound, even for feed-forward policies.
    M = S = None
    if recurrent:
        nenv = nbatch // nsteps
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states

    with tf.variable_scope("model", reuse=reuse):
        activ = tf.tanh
        h_im = mujoco_cnn(
            X_im, 'pi', nbatch, add_flownet and not add_predicted_flow_to_vec,
            X_p, flownet, train_from_scratch, large_cnn, diff_frames)

        vec_input = X_vec
        if add_predicted_flow_to_vec:
            flow_vec = get_flow_vec(
                X_im, 'pi', nbatch, add_flownet,
                X_p, flownet, train_from_scratch, large_cnn, diff_frames)
            vec_input = tf.concat([X_vec, flow_vec], axis=-1)
        h_vec = activ(fc(vec_input, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))

        h1 = tf.concat([h_im, h_vec], 1)

        if recurrent:
            xs = batch_to_seq(h1, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            if recurrent == 'lstm':
                h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            else:
                assert recurrent == 'lnlstm'
                h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h2 = seq_to_batch(h5)
        else:
            h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))

        pi = fc(h2, 'pi', actdim, init_scale=0.01)
        vf = fc(h2, 'vf', 1)
        logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                 initializer=tf.zeros_initializer())

    # ``pi * 0.0 + logstd`` broadcasts the single logstd row to pi's shape.
    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    a0_r = self.pd.mode()
    neglogp0 = self.pd.neglogp(a0)
    # The noise-free path must report the likelihood of the action it returns,
    # not of an unrelated sample.
    neglogp0_r = self.pd.neglogp(a0_r)

    if not recurrent:
        self.initial_state = None
    else:
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

    self.placeholder_dict = {
        "image": X_im,
        "vector": X_vec
    }
    if add_flownet:
        self.placeholder_dict["last_image"] = X_p

    def _feed(ob, state=None, mask=None):
        """Map each observation entry to its placeholder; add state/mask when recurrent."""
        feed_dict = {ph: ob[key] for key, ph in self.placeholder_dict.items()}
        if recurrent:
            feed_dict[S] = state
            feed_dict[M] = mask
        return feed_dict

    def _action_ops(remove_noise):
        """Pick the (action, neglogp) tensors for sampled or noise-free acting."""
        return (a0_r, neglogp0_r) if remove_noise else (a0, neglogp0)

    if not recurrent:
        def step(ob, *_args, remove_noise=False, **_kwargs):
            action_op, neglogp_op = _action_ops(remove_noise)
            a, v, neglogp = sess.run([action_op, v0, neglogp_op], feed_dict=_feed(ob))
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(v0, feed_dict=_feed(ob))
    else:
        def step(ob, state, mask, remove_noise=False):
            action_op, neglogp_op = _action_ops(remove_noise)
            a, v, s, neglogp = sess.run([action_op, v0, snew, neglogp_op],
                                        feed_dict=_feed(ob, state, mask))
            return a, v, s, neglogp

        def value(ob, state, mask):
            return sess.run(v0, feed_dict=_feed(ob, state, mask))

    self.X_im = X_im
    self.X_vec = X_vec
    self.X_p = X_p
    self.pi = pi
    # Feed-forward policies expose the flattened value, recurrent ones the raw head.
    if not recurrent:
        self.vf = v0
    else:
        self.vf = vf
    self.M = M
    self.S = S
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
    """Build a two-conv-layer + dense + LSTM network with policy and Q heads.

    :param sess: stored on ``self.sess``
    :param ob_space: observation space; ``.shape`` must unpack into (h, w, c)
    :param ac_space: action space; ``.n`` is the number of actions
    :param nenv: number of environments (first dimension of the state placeholder)
    :param nsteps: steps per environment; the batch is ``nenv * nsteps``
    :param nstack: multiplier applied to the channel dimension of the input
    :param reuse: forwarded to the ``"model"`` variable scope

    ``self.lstm_units`` and ``self.dense_units`` are read from the instance;
    presumably they are provided by the parent class - verify there.
    """
    super().__init__(sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=reuse)

    batch_size = nenv * nsteps
    height, width, channels = ob_space.shape
    input_shape = (batch_size, height, width, channels * nstack)
    n_actions = ac_space.n
    lstm_width = self.lstm_units

    obs_ph = tf.placeholder(tf.uint8, input_shape)  # obs
    M = tf.placeholder(tf.float32, [batch_size])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, lstm_width * 2])  # states

    with tf.variable_scope("model", reuse=reuse):
        obs_float = tf.cast(obs_ph, tf.float32)

        # Two 3x3, stride-1, SAME-padded conv layers, each followed by ReLU.
        features = obs_float
        for scope_name, n_filters in (('c1', 16), ('c2', 32)):
            features = conv(features, scope_name, nf=n_filters, rf=3, stride=1, pad='SAME',
                            init_scale=np.sqrt(2))
            features = tf.nn.relu(features)

        features = conv_to_fc(features)
        features = tf.nn.relu(fc(features, 'fc1', nh=self.dense_units, init_scale=np.sqrt(2)))

        # lstm
        input_seq = batch_to_seq(features, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_seq, snew = lstm(input_seq, mask_seq, S, 'lstm1', nh=lstm_width)
        lstm_out = seq_to_batch(lstm_seq)

        pi_logits = fc(lstm_out, 'pi', n_actions, init_scale=0.01)
        pi = tf.nn.softmax(pi_logits)
        q = fc(lstm_out, 'q', n_actions)

    self.a = sample(pi_logits)  # could change this to use self.pi instead
    self.initial_state = np.zeros((nenv, lstm_width * 2), dtype=np.float32)
    self.snew = snew
    # NOTE: self.X is the float32 cast of the observation placeholder, not the
    # uint8 placeholder itself.
    self.X = obs_float
    self.M = M
    self.S = S
    self.pi = pi  # actual policy params now
    self.q = q
    self.sess = sess
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):
    """Build a policy from a belief filter (``FilterNet``) and a planner (``PlannerNet``).

    Each input row holds an observation followed by the previous action; the
    row is split at ``ob_space.shape[0] - ac_space.n``.

    :param sess: session object; its ``run`` method is used by ``step`` and ``value``
    :param ob_space: observation space; ``.shape`` describes one input row
    :param ac_space: action space; ``.n`` is the number of actions
    :param nbatch: leading dimension of the input and mask placeholders
    :param nsteps: sequence length handed to ``batch_to_seq``
    :param reuse: forwarded to the ``"model"`` variable scope
    """
    n_envs = nbatch // nsteps

    qmdp_param = {
        'obs_len': ob_space.shape[0] - ac_space.n,
        'num_action': ac_space.n,
        'num_state': 32,
        'num_obs': 17,
    }
    obs_len = qmdp_param['obs_len']
    num_state = qmdp_param['num_state']

    self.pdtype = make_pdtype(ac_space)

    X = tf.placeholder(tf.float32, (nbatch, ) + ob_space.shape)  # [nbatch, obs + prev action]
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [n_envs, num_state])  # beliefs

    with tf.variable_scope("model", reuse=reuse):
        # Turn the flat batch into a per-step list, then split every step
        # into its observation part and its previous-action part.
        step_inputs = batch_to_seq(X, n_envs, nsteps)
        obs = [step_in[:, 0:obs_len] for step_in in step_inputs]
        acts = [step_in[:, obs_len:] for step_in in step_inputs]
        mask_seq = batch_to_seq(M, n_envs, nsteps)

        # Build the planner and filter sub-networks.
        self.planner_net = PlannerNet("planner", qmdp_param)
        self.filter_net = FilterNet("filter", qmdp_param)

        # Belief history over the sequence and the belief after the last step.
        belief_seq, snew = self.filter_net.beliefupdate(obs, acts, mask_seq, S)
        Q = self.planner_net.VI(nbatch)

        # Action values from the planner, then the policy and value heads.
        belief_batch = seq_to_batch(belief_seq)
        q = self.planner_net.policy(Q, belief_batch)
        self.pd, self.pi = self.pdtype.pdfromlatent(q)
        vf = fc(q, 'v', 1)  # critic value function

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)

    # Start from a uniform belief over the states.
    self.initial_state = np.ones((n_envs, num_state), dtype=np.float32) / num_state

    def step(ob, state, mask):
        """Run one forward pass; returns [action, value, new belief, neglogp]."""
        feed = {X: ob, S: state, M: mask}
        return sess.run([a0, v0, snew, neglogp0], feed)

    def value(ob, state, mask):
        """Evaluate only the value head."""
        feed = {X: ob, S: state, M: mask}
        return sess.run(v0, feed)

    self.X = X
    self.M = M
    self.S = S
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, tf_session, ob_space, ac_space, nbatch, reward_redistribution_config, observation_network_config,
             lstm_network_config, training_config, exploration_config, nsteps, nlstm=64, reuse=False):
    r"""LSTM policy network, as described in the RUDDER paper

    Based on baselines.ppo2.policies.py; the LSTM layer sees features from its own trainable observation
    network and the (gradient-stopped) features from the reward redistribution observation network;

    Parameters
    -------
    tf_session : tensorflow session
        tensorflow session to compute the graph in
    ob_space
        Baselines ob_space object (see ppo2_rudder.py); must provide .shape attribute for (x, y, c) shapes;
    ac_space
        Baselines ac_space object (see ppo2_rudder.py); must provide .n attribute for number of possible actions;
    nbatch : int
        Batchsize
    nsteps : int
        Fixed number of timesteps to process at once
    reward_redistribution_config : dict
        Dictionary containing config for reward redistribution:
        -----
        lambda_eligibility_trace : float
            Eligibility trace value for redistributed reward
        vf_contrib : float
            Weighting of original value function (vf) vs. redistributed reward (rr), s.t.
            :math:`reward = vf \cdot vf\_contrib + rr \cdot (1-vf\_contrib)`
        use_reward_redistribution_quality_threshold : float
            Quality of reward redistribution has to exceed use_reward_redistribution_quality_threshold to be used;
            use_reward_redistribution_quality_threshold range is [0,1]; Quality measure is the squared prediction
            error, as described in the RUDDER paper;
        use_reward_redistribution : bool
            Use reward redistribution?
        rr_junksize : int
            Chunk ("junk") size for reward redistribution; chunks overlap by one half each
        cont_pred_w : float
            Weighting of continuous prediction loss vs. prediction loss of final return at last timestep
        intgrd_steps : int
            Stepsize for integrated gradients
        intgrd_batchsize : int
            Integrated gradients is computed batch-wise if intgrd_batchsize > 1
    observation_network_config : dict
        Dictionary containing config for observation network that processes observations and feeds them to LSTM
        network:
        -----
        show_states : bool
            Show frames to network?
        show_statedeltas : bool
            Show frame deltas to network?
        prepoc_states : list of dicts
            Network config to preprocess frames
        prepoc_deltas : list of dicts
            Network config to preprocess frame deltas
        prepoc_observations : list of dicts
            Network config to preprocess features from frame and frame-delta preprocessing networks
    lstm_network_config : dict
        Dictionary containing config for LSTM network:
        -----
        show_actions : bool
            Show taken actions to LSTM?
        reversed : bool
            Process game sequence in reversed order?
        layers : list of dicts
            Network config for LSTM network and optional additional dense layers
        initializations : dict
            Initialization config for LSTM network
        timestep_encoding : dict
            Set "max_value" and "triangle_span" for TeLL.utiltiy.misc_tensorflow.TriangularValueEncoding class
    training_config : dict
        Dictionary containing config for training and update procedure:
        -----
        n_no_rr_updates : int
            Number of updates to perform without training or using reward redistribution network
        n_pretrain_games : int
            Number of games to pretrain the reward redistribution network without using it;
        downscale_lr_policylag : bool
            Downscale learning rate permanently if policy lag gets too large?
        optimizer : tf.train optimizer
            Optimizer in tf.train, e.g. "AdamOptimizer"
        optimizer_params : dict
            Kwargs for optimizer
        l1 : float
            Weighting for l1 weight regularization
        l2 : float
            Weighting for l2 weight regularization
        clip_gradients : float
            Threshold for clipping gradients (clipping by norm)
    exploration_config : dict
        Dictionary containing config for exploration:
        -----
        sample_actions_from_softmax : bool
            True: Apply softmax to policy network output and use it as probabilities to pick an action
            False: Use the max. policy network output as action
        temporal_safe_exploration : bool
            Use RUDDER safe exploration
        save_pi_threshold : float
            Threshold value in range [0,1] for safe actions in RUDDER safe exploration
    nlstm : int
        Number of LSTM units (=memory cells)
    reuse : bool
        Reuse tensorflow variables?
    """
    #
    # Shapes
    #
    nenv = nbatch // nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc)
    # Per-environment sequences of single-channel frames; -1 lets reshape infer the time axis.
    seq_ob_shape = (nenv, -1, nh, nw, 1)
    nact = ac_space.n

    #
    # Placeholders for inputs
    #
    X = tf.placeholder(tf.uint8, ob_shape) #obs
    M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states

    #
    # Prepare input
    #
    # Last channel of the observation, and its difference to the second-to-last channel.
    single_frames = tf.cast(tf.reshape(X[..., -1:], shape=seq_ob_shape), dtype=tf.float32)
    delta_frames = single_frames - tf.cast(tf.reshape(X[..., -2:-1], shape=seq_ob_shape), dtype=tf.float32)

    #
    # Get observation features from RR model
    #
    rr_model = RewardRedistributionModel(reward_redistribution_config=reward_redistribution_config,
                                         observation_network_config=observation_network_config,
                                         lstm_network_config=lstm_network_config, training_config=training_config,
                                         scopename="RR")
    self.rr_observation_model = rr_model
    rr_observation_layer = rr_model.get_visual_features(single_frame=single_frames, delta_frame=delta_frames,
                                                        additional_inputs=[])

    #
    # Build policy network
    #
    with tf.variable_scope("model", reuse=reuse):
        # Non-trainable temperature variable (initialised to 1) used when sampling actions.
        temperature = tf.get_variable(initializer=tf.constant(1, dtype=tf.float32), trainable=False,
                                      name='temperature')

        # RR features are passed through a stop-gradient layer before entering the policy's observation network.
        additional_inputs = [StopGradientLayer(rr_observation_layer)]
        observation_layers, observation_features = observation_network(
                single_frame=single_frames, delta_frame=delta_frames, additional_inputs=additional_inputs,
                observation_network_config=observation_network_config)

        self.observation_features_shape = observation_features.get_output_shape()

        # Split the last observation layer's output into a list of nsteps per-step tensors for the LSTM.
        xs = [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps,
                                                   value=tf.reshape(observation_layers[-1].get_output(),
                                                                    [nenv, nsteps, -1]))]
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        h6 = h5
        pi = fc(h6, 'pi', nact)
        vf = fc(h6, 'v', 1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    # Either sample with the temperature variable or take the arg-max of the policy output.
    if exploration_config['sample_actions_from_softmax']:
        a0 = self.pd.sample_temp(temperature=temperature)
    else:
        a0 = tf.argmax(pi, axis=-1)

    v0 = vf[:, 0]
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

    def step(ob, state, mask):
        """Return action, value, new LSTM state and neglogp for the given inputs"""
        a, v, s, neglogp = tf_session.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
        return a, v, s, neglogp

    def value(ob, state, mask):
        """Return only the value estimate for the given inputs"""
        return tf_session.run(v0, {X:ob, S:state, M:mask})

    def action(ob, state, mask, *_args, **_kwargs):
        """Return action, new LSTM state and neglogp (no value); extra arguments are ignored"""
        a, s, neglogp = tf_session.run([a0, snew, neglogp0], {X:ob, S:state, M:mask})
        return a, s, neglogp

    #
    # Placeholders for exploration
    #
    # n_envs is the leading (batch) dimension of the policy output.
    n_envs = pi.shape.as_list()[0]
    exploration_timesteps_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
    prev_actions_pl = tf.placeholder(dtype=tf.int64, shape=(n_envs,))
    gamelengths_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
    keep_prev_action_pl = tf.placeholder(dtype=tf.bool, shape=(n_envs,))
    prev_action_count_pl = tf.placeholder(dtype=tf.int64, shape=(n_envs,))
    exploration_durations_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))

    #
    # Setting up safe exploration
    #
    # Explore where the game length lies in [exploration_timestep, exploration_timestep + duration]
    # and the exploration timestep is not the sentinel value -1.
    explore = tf.logical_and(tf.logical_and(tf.less_equal(exploration_timesteps_pl, gamelengths_pl),
                                            tf.less_equal(gamelengths_pl,
                                                          exploration_timesteps_pl + exploration_durations_pl)),
                             tf.not_equal(exploration_timesteps_pl, tf.constant(-1, dtype=tf.float32)))

    # Min-max normalise the policy output per row.
    safe_pi = pi - tf.reduce_min(pi, axis=-1, keep_dims=True)
    safe_pi /= tf.reduce_max(safe_pi, axis=-1, keep_dims=True)
    # Per-row thresholds running linearly from 1 (row 0) down to save_pi_threshold (last row);
    # the (n_envs == 1) term keeps the denominator non-zero for a single row.
    save_pi_thresholds = (1 - (tf.expand_dims(tf.range(n_envs, dtype=tf.float32), axis=1)
                               / (n_envs + (n_envs == 1) - 1)) * (1 - exploration_config['save_pi_threshold']))
    # 0/1 mask of actions at or above the row's threshold.
    safe_pi = tf.cast(tf.greater_equal(safe_pi, save_pi_thresholds), dtype=tf.float32)
    # NOTE(review): reduce_sum has no axis here, so this divides by the sum over all rows - confirm intended.
    safe_pi /= tf.reduce_sum(safe_pi)

    # NOTE(review): tf.multinomial is documented as taking unnormalized log-probabilities; here it receives
    # the scaled 0/1 mask - confirm that this is the intended sampling distribution.
    rand_safe_a = tf.multinomial(safe_pi, 1)[:, 0]

    # Look up each row's previous action in the flattened mask (index = action + row * n_actions).
    safe_pi_flat = tf.reshape(safe_pi, (-1,))
    prev_action_is_safe = tf.gather(safe_pi_flat,
                                    prev_actions_pl + tf.range(safe_pi.shape.as_list()[0], dtype=tf.int64)
                                    * safe_pi.shape.as_list()[1])
    prev_action_is_safe = tf.greater(prev_action_is_safe, tf.constant(0, dtype=tf.float32))

    # Repeat the previous action if requested, not at the first exploration step, and still safe;
    # otherwise draw a random safe action.
    a_explore = tf.where(tf.logical_and(tf.logical_and(keep_prev_action_pl,
                                                       tf.not_equal(gamelengths_pl, exploration_timesteps_pl)),
                                        prev_action_is_safe),
                         prev_actions_pl, rand_safe_a)

    # Outside the exploration window fall back to the regular policy action.
    a_explore = tf.where(explore, a_explore, a0)

    # Make sure the actor doesn't repeat an action too often (otherwise screensaver might start)
    rand_a = tf.random_uniform(shape=a0.get_shape(), minval=0, maxval=ac_space.n, dtype=a0.dtype)
    a_explore = tf.where(tf.greater(prev_action_count_pl, tf.constant(20, dtype=tf.int64)), rand_a, a_explore)

    # With safe exploration disabled the exploration action is just the regular policy action.
    if not exploration_config['temporal_safe_exploration']:
        a_explore = a0

    neglogp_explore = self.pd.neglogp(a_explore)

    def action_exploration(ob, state, mask, *_args, exploration_timesteps, prev_actions, gamelengths,
                           keep_prev_action, prev_action_count, exploration_durations, **_kwargs):
        """Get actions with exploration for long-term reward

        Returns the exploration action, the new LSTM state and the neglogp of the exploration action;
        the exploration inputs are keyword-only and fed to the corresponding placeholders.
        """
        a, s, neglogp = tf_session.run([a_explore, snew, neglogp_explore],
                                       {X: ob, S:state, M:mask, exploration_timesteps_pl: exploration_timesteps,
                                        prev_actions_pl: prev_actions,
                                        gamelengths_pl: gamelengths, exploration_durations_pl: exploration_durations,
                                        keep_prev_action_pl: keep_prev_action,
                                        prev_action_count_pl: prev_action_count})
        return a, s, neglogp

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
    self.action = action
    self.action_exploration = action_exploration
    self.seq_ob_shape = seq_ob_shape
    self.exploration_config = exploration_config
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """Build an instruction-conditioned LSTM policy with gated attention.

    The image goes through ``nature_cnn``; a 5-token instruction is embedded
    and encoded by a GRU; the GRU state gates the image features through a
    sigmoid attention vector before the LSTM and the policy/value heads.

    :param sess: session object; its ``run`` method is used by ``step`` and ``value``
    :param ob_space: observation space; ``.shape`` must unpack into (h, w, c)
    :param ac_space: action space; ``.n`` is the number of actions
    :param nbatch: leading dimension of all per-sample placeholders
    :param nsteps: sequence length handed to ``batch_to_seq``
    :param nlstm: LSTM width
    :param reuse: forwarded to the ``"model"`` variable scope; when true,
        ``self.var_summary`` is also invoked at the end
    """
    def small_uniform():
        # Fresh initializer object per use, as in the per-argument construction.
        return tf.random_uniform_initializer(-1e-3, 1e-3)

    n_envs = nbatch // nsteps
    height, width, channels = ob_space.shape
    n_actions = ac_space.n

    X = tf.placeholder(tf.float32, (nbatch, height, width, channels))  # obs
    I = tf.placeholder(tf.int32, [nbatch, 5])  # instruction token ids
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [n_envs, nlstm * 2])  # states

    # Model
    with tf.variable_scope("model", reuse=reuse):
        # Image processing
        with tf.variable_scope("cnn"):
            image_features = nature_cnn(X)

        # Instruction processing
        with tf.variable_scope("GRU"):
            embedding = tf.get_variable('word_embedding', shape=[12, 32],
                                        initializer=small_uniform())
            gru_cell = tf.contrib.rnn.GRUCell(num_units=256,
                                              kernel_initializer=small_uniform(),
                                              bias_initializer=small_uniform())
            gru_state = gru_cell.zero_state(nbatch, dtype=tf.float32)
            for token_pos in range(5):
                token_embedding = tf.nn.embedding_lookup(embedding, I[:, token_pos])
                _, gru_state = gru_cell.call(token_embedding, gru_state)
            instruction_features = gru_state

        # Gated-attention layers
        with tf.variable_scope("x-attn"):
            gate = tf.sigmoid(fc(instruction_features, 'x-attn', 64, init_scale=1.0))
            # Insert two singleton axes after the batch axis so the gate
            # broadcasts against the image feature map.
            gate = tf.expand_dims(gate, 1)
            gate = tf.expand_dims(gate, 2)

        with tf.variable_scope("Gated-Attention"):
            gated = conv_to_fc(image_features * gate)
            gated = tf.nn.relu(fc(gated, 'x-Ga', 256, init_scale=1.0))

        input_seq = batch_to_seq(gated, n_envs, nsteps)
        mask_seq = batch_to_seq(M, n_envs, nsteps)
        lstm_seq, snew = lstm(input_seq, mask_seq, S, 'lstm1', nh=nlstm, init_scale=1.0)
        lstm_out = seq_to_batch(lstm_seq)

        with tf.variable_scope("pi"):
            pi = tf.layers.dense(lstm_out, n_actions,
                                 kernel_initializer=normalized_columns_initializer(0.01))

        with tf.variable_scope("vf"):
            vf = tf.layers.dense(lstm_out, 1,
                                 kernel_initializer=normalized_columns_initializer(0.01))

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((n_envs, nlstm * 2), dtype=np.float32)

    def step(ob, insts, state, mask):
        """Run one forward pass; returns [action, value, new state, neglogp]."""
        feed = {X: ob, I: insts, S: state, M: mask}
        return sess.run([a0, v0, snew, neglogp0], feed)

    def value(ob, insts, state, mask):
        """Evaluate only the value head."""
        feed = {X: ob, I: insts, S: state, M: mask}
        return sess.run(v0, feed)

    self.X = X
    self.I = I
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value

    # start logging
    # =============
    if reuse:
        self.var_summary('./Asset/logdir', sess)
def __init__(self, sess, ob_space, ac_space, nenvs, nsteps, nlstm=256, reuse=False, feature_mlp=True):
    """Build an LSTM actor-critic graph for a single trajectory (one environment).

    The environment count handed to the sequence helpers and the state
    placeholder is fixed to 1; ``nenvs`` is accepted but not used.

    :param sess: session object; its ``run`` method is used by ``step`` and ``value``
    :param ob_space: observation space; only ``.shape`` is read
    :param ac_space: action space; an empty ``.shape`` selects the discrete
        branch (``.n`` policy outputs), otherwise ``.shape[0]`` outputs and a
        ``logstd`` variable are created
    :param nenvs: unused
    :param nsteps: trajectory length; ``None`` gives placeholders with an
        unspecified leading dimension
    :param nlstm: LSTM width; also used as the width of the feature layers
    :param reuse: forwarded to the ``"model"`` variable scope
    :param feature_mlp: when true, observations go through ``fc1``/``fc2``
        before the LSTM; otherwise they are fed to the LSTM directly
    """
    # Here the batch size is 1, i.e. one trajectory; nsteps (possibly None)
    # is the leading dimension of both the mask and observation placeholders.
    input_shape = (nsteps, ) + ob_space.shape
    M = tf.placeholder(tf.float32, [nsteps])

    # An action space with an empty shape is handled as a discrete set of actions.
    is_discrete = len(ac_space.shape) == 0
    pi_width = ac_space.n if is_discrete else ac_space.shape[0]

    X = tf.placeholder(tf.float32, input_shape, name="Ob")
    S = tf.placeholder(tf.float32, [1, nlstm * 2])  # states

    with tf.variable_scope("model", reuse=reuse):
        lstm_input = X
        if feature_mlp:
            for layer_name in ("fc1", "fc2"):
                lstm_input = tf.tanh(fc(lstm_input, layer_name, nh=nlstm, init_scale=np.sqrt(2)))

        input_seq = batch_to_seq(lstm_input, 1, nsteps)
        mask_seq = batch_to_seq(M, 1, nsteps)
        lstm_seq, snew = lstm(input_seq, mask_seq, S, "lstm1", nh=nlstm)
        lstm_out = seq_to_batch(lstm_seq)

        vf = fc(lstm_out, "vf", 1)
        pi = fc(lstm_out, "pi", pi_width, init_scale=0.01)
        if not is_discrete:
            logstd = tf.get_variable(name="logstd", shape=[1, pi_width],
                                     initializer=tf.zeros_initializer())

    self.pdtype = make_pdtype(ac_space)
    if is_discrete:
        flat_params = pi
    else:
        # ``pi * 0.0 + logstd`` broadcasts the single logstd row to pi's shape.
        flat_params = tf.concat([pi, pi * 0.0 + logstd], axis=1)
    self.pd = self.pdtype.pdfromflat(flat_params)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((1, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        """Run one forward pass; returns [action, value, new state, neglogp]."""
        feed = {X: ob, S: state, M: mask}
        return sess.run([a0, v0, snew, neglogp0], feed)

    def value(ob, state, mask):
        """Evaluate only the value head."""
        feed = {X: ob, S: state, M: mask}
        return sess.run(v0, feed)

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value