def network_fn(X, nenv=1):
    nbatch = X.shape[0]
    nsteps = nbatch // nenv
    h = tf.layers.flatten(X)

    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states
    # T = tf.get_variable(name='init', shape=[1, 2],
    #                     initializer=tf.constant_initializer(1))  # task descriptor

    xs = batch_to_seq(h, nenv, nsteps)
    ms = batch_to_seq(M, nenv, nsteps)

    if layer_norm:
        h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
    else:
        h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

    h = seq_to_batch(h5)

    # TODO: need to change initialization of state!
    initial_state = np.zeros(S.shape.as_list(), dtype=float)
    # initial_state = utils.fc(T, 'pi_init', [nenv, 48], init_scale=0.01, init_bias=0.01)
    # initial_state = tf.get_variable(name='init_state', shape=initial_state.shape,
    #                                 initializer=tf.zeros_initializer(), trainable=True)  # task descriptor

    return h, {'S': S, 'M': M, 'state': snew, 'initial_state': initial_state}

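# The TODO above asks for a learnable initial state instead of all-zeros.
# A minimal sketch, assuming the [nenv, 2*nlstm] state layout used here;
# `init_state_var` and `initial_state_op` are hypothetical names, not from
# the original code:
init_state_var = tf.get_variable(name='init_state', shape=[1, 2 * nlstm],
                                 initializer=tf.zeros_initializer(),
                                 trainable=True)
initial_state_op = tf.tile(init_state_var, [nenv, 1])  # -> [nenv, 2*nlstm]
# A runner would then sess.run(initial_state_op) at episode starts instead of
# reusing a fixed numpy array, since the learned value changes over training.
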
def network_fn(X, nenv=1):
    nbatch = X.shape[0]
    nsteps = nbatch // nenv

    ob_g, ob_l = tf.split(X, 2, axis=1)
    ob_g = tf.squeeze(ob_g, axis=1) - 128.0
    ob_l = tf.squeeze(ob_l, axis=1) - 128.0

    # Conv layers
    net_g = vggm1234(ob_g)
    net_l = vggm1234(ob_l)
    feat = tf.concat([net_g, net_l], 1)

    # LSTM
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

    xs = batch_to_seq(feat, nenv, nsteps)
    ms = batch_to_seq(M, nenv, nsteps)

    if layer_norm:
        h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
    else:
        h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

    h = seq_to_batch(h5)
    initial_state = np.zeros(S.shape.as_list(), dtype=float)

    return (feat, h), {'S': S, 'M': M, 'state': snew, 'initial_state': initial_state}

def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    nenv = nbatch // nsteps
    X, processed_x = observation_input(ob_space, nbatch)

    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
    self.pdtype = make_pdtype(ac_space)

    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(processed_x)
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        vf = fc(h5, 'v', 1)
        self.pd, self.pi = self.pdtype.pdfromlatent(h5)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.vf = vf
    self.step = step
    self.value = value

def network_fn(X, nenv=1):
    nbatch = X.shape[0]
    nsteps = nbatch // nenv
    h = nature_cnn(X, **conv_kwargs)

    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

    xs = batch_to_seq(h, nenv, nsteps)
    ms = batch_to_seq(M, nenv, nsteps)

    if layer_norm:
        h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
    else:
        h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

    h = seq_to_batch(h5)
    initial_state = np.zeros(S.shape.as_list(), dtype=float)

    return h, {'S': S, 'M': M, 'state': snew, 'initial_state': initial_state}

def network_fn(X, nenv=1):
    nbatch = X.shape[0]
    nsteps = nbatch // nenv
    h = tf.layers.flatten(X)

    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

    xs = batch_to_seq(h, nenv, nsteps)
    ms = batch_to_seq(M, nenv, nsteps)

    if layer_norm:
        h5, snew = utils.lnlstm(xs, ms, S, scope="lnlstm", nh=nlstm)
    else:
        h5, snew = utils.lstm(xs, ms, S, scope="lstm", nh=nlstm)

    h = seq_to_batch(h5)
    initial_state = np.zeros(S.shape.as_list(), dtype=float)

    return h, {"S": S, "M": M, "state": snew, "initial_state": initial_state}

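# A hedged usage sketch of the contract these network_fn variants share: the
# caller threads S (carried state), M (done mask), and 'state' (the new state)
# through each session call. `env_step`, `obs`, and `horizon` are illustrative
# names, and the graph is assumed to be built with nbatch == nenv (one-step
# inference):
latent_op, extra = network_fn(X, nenv=nenv)
state = extra['initial_state']            # zeros, shape [nenv, 2*nlstm]
dones = np.zeros(nenv, dtype=np.float32)  # "done at t-1"; 1.0 resets the LSTM

for t in range(horizon):
    feed = {X: obs, extra['S']: state, extra['M']: dones}
    latent, state = sess.run([latent_op, extra['state']], feed)
    # A policy/value head (not shown) maps `latent` to actions; the env
    # returns the next `obs` and `dones`, which gate the state next call.
    obs, dones = env_step(latent)
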
def network_fn(X, nenv=1):
    # TODO(akadian): modify the below code to adapt for depth
    nbatch = X[0].shape[0]
    nsteps = nbatch // nenv

    if X[0].shape[3] == 3:
        h = nature_cnn(X[0], **conv_kwargs)  # rgb
    elif X[0].shape[3] == 1:
        h = depth_cnn(X[0], **conv_kwargs)  # depth
    else:
        raise ValueError("expected 1 (depth) or 3 (rgb) channels, got {}".format(X[0].shape[3]))

    h = tf.concat([h, X[1]], 1)

    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

    xs = batch_to_seq(h, nenv, nsteps)
    ms = batch_to_seq(M, nenv, nsteps)

    if layer_norm:
        h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
    else:
        h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

    h = seq_to_batch(h5)
    initial_state = np.zeros(S.shape.as_list(), dtype=float)

    return h, {'S': S, 'M': M, 'state': snew, 'initial_state': initial_state}

def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256,
             reuse=False, scope_name="model"):
    nenv = nbatch // nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc)
    nact = ac_space.n

    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    with tf.variable_scope(scope_name, reuse=reuse):
        h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
        h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
        h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
        h3 = conv_to_fc(h3)
        h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
        xs = batch_to_seq(h4, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        pi = fc(h5, 'pi', nact, act=lambda x: x)
        vf = fc(h5, 'v', 1, act=lambda x: x)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value

def network_fn(X, nenv=1):
    nbatch = X.shape[0]
    nsteps = nbatch // nenv

    # Flatten first, then apply the input MLP. (The original flattened X
    # *after* this loop, which silently discarded the mlp_in stack.)
    h = tf.layers.flatten(X)
    with tf.variable_scope('mlp_in', reuse=tf.AUTO_REUSE):
        for i in range(num_layers_in):
            h = fc(h, 'mlp_in_fc{}'.format(i), nh=num_hidden_in, init_scale=np.sqrt(2))
            if layer_norm_in:
                h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)

    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

    xs = batch_to_seq(h, nenv, nsteps)
    ms = batch_to_seq(M, nenv, nsteps)

    if layer_norm_lstm:
        h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
    else:
        h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

    h = seq_to_batch(h5)

    with tf.variable_scope('mlp_out', reuse=tf.AUTO_REUSE):
        for i in range(num_layers_out):
            h = fc(h, 'mlp_out_fc{}'.format(i), nh=num_hidden_out, init_scale=np.sqrt(2))
            if layer_norm_out:
                h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)

    initial_state = np.zeros(S.shape.as_list(), dtype=float)

    return h, {'S': S, 'M': M, 'state': snew, 'initial_state': initial_state}

def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256,
             reuse=False, param=None):
    nenv = nbatch // nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc)
    nact = ac_space.n

    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X)
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        pi = fc(h5, 'pi', nact)
        vf = fc(h5, 'v', 1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value

def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=64, reuse=False):
    nenv = nbatch // nsteps
    ob_shape = add_batch_dimension(ob_space.shape, nbatch)
    nact = ac_space.n

    X = tf.placeholder(tf.float32, ob_shape, name="X")  # obs
    M = tf.placeholder(tf.float32, [nbatch], name="M")  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2], name="S")  # states

    with tf.variable_scope("model", reuse=reuse):
        xs = batch_to_seq(X, nenv, nsteps)  # observation sequences
        ms = batch_to_seq(M, nenv, nsteps)  # done sequences
        h0, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
        h0 = seq_to_batch(h0)
        h0 = tf.concat([h0, X], 1)

        # Policy
        h1 = fc(h0, 'pi_fc1', nh=128, init_scale=np.sqrt(2), act=tf.nn.relu)
        pi = fc(h1, 'pi', nact, act=tf.tanh, init_scale=0.01)

        # Value function
        h1 = fc(h0, 'vf_fc1', nh=128, init_scale=np.sqrt(2), act=tf.nn.relu)
        vf = fc(h1, 'vf', 1, act=lambda x: x)

        # Current policy variance
        logstd = tf.get_variable(name="logstd", shape=[1, nact],
                                 initializer=tf.zeros_initializer())

    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value

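# The logstd trick above builds diagonal-Gaussian parameters from a learned,
# state-independent log-std. A hedged shape check of that construction:
# pi:     [nbatch, nact]   state-dependent mean
# logstd: [1, nact]        learned variable
# pi * 0.0 + logstd broadcasts logstd to [nbatch, nact], so that
# pdparam: [nbatch, 2*nact] = [mean | logstd], the flat layout that
# make_pdtype(...).pdfromflat expects for continuous (Box) action spaces.
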
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
    nbatch = nenv * nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc * nstack)
    nact = ac_space.n

    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    with tf.variable_scope("model", reuse=reuse):
        h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
        h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
        h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
        h3 = conv_to_fc(h3)
        h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
        xs = batch_to_seq(h4, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        pi = fc(h5, 'pi', nact, act=lambda x: x)
        vf = fc(h5, 'v', 1, act=lambda x: x)

    v0 = vf[:, 0]
    a0 = sample(pi)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        a, v, s = sess.run([a0, v0, snew], {X: ob, S: state, M: mask})
        return a, v, s

    def value(ob, state, mask):
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value

def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
    nbatch = nenv * nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc * nstack)
    nact = ac_space.n

    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    with tf.variable_scope("model", reuse=reuse):
        h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
        h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
        h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
        h3 = conv_to_fc(h3)
        h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
        xs = batch_to_seq(h4, nenv, nsteps)  # list of nsteps tensors, each nenv x nh
        ms = batch_to_seq(M, nenv, nsteps)   # list of nsteps vectors, each of length nenv
        h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)  # h5 has the same layout as xs, transformed by the LSTM; snew is the new S
        h5 = seq_to_batch(h5)  # back to nbatch x nh, like h4
        pi = fc(h5, 'pi', nact, act=lambda x: x)  # nbatch x nact
        vf = fc(h5, 'v', 1, act=lambda x: x)      # nbatch x 1

    v0 = vf[:, 0]    # nbatch vector of state values
    a0 = sample(pi)  # nbatch vector of actions sampled from the policy logits
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        a, v, s = sess.run([a0, v0, snew], {X: ob, S: state, M: mask})
        return a, v, s

    def value(ob, state, mask):
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value

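# The shape comments above describe the batch<->sequence reshuffling done by
# batch_to_seq/seq_to_batch. A hedged paraphrase of those helpers for
# reference; see baselines/a2c/utils.py for the authoritative versions:
def batch_to_seq(h, nbatch, nsteps, flat=False):
    # [nbatch*nsteps, ...] -> list of nsteps tensors, each [nbatch, ...]
    if flat:
        h = tf.reshape(h, [nbatch, nsteps])
    else:
        h = tf.reshape(h, [nbatch, nsteps, -1])
    return [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps, value=h)]

def seq_to_batch(h, flat=False):
    # list of nsteps tensors, each [nbatch, nh] -> [nbatch*nsteps, nh]
    shape = h[0].get_shape().as_list()
    if not flat:
        assert len(shape) > 1
        nh = h[0].get_shape()[-1].value
        return tf.reshape(tf.concat(axis=1, values=h), [-1, nh])
    else:
        return tf.reshape(tf.stack(values=h), [-1])
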
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    nenv = nbatch // nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc)
    nact = ac_space.n

    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X)
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        pi = fc(h5, 'pi', nact)
        vf = fc(h5, 'v', 1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value

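# A hedged runner sketch showing how step()/value() are typically driven for
# a policy like the one above; `policy`, `vec_env`, and `obs` are illustrative
# stand-ins, not names from the original:
states = policy.initial_state             # [nenv, nlstm*2] zeros
dones = np.zeros(nenv, dtype=np.float32)  # done flags from the previous step

for t in range(nsteps):
    actions, values, states, neglogps = policy.step(obs, states, dones)
    obs, rewards, done_bools, infos = vec_env.step(actions)
    dones = done_bools.astype(np.float32)  # fed as M on the next call; resets
                                           # the LSTM state of finished episodes
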
def network_fn(X, nenv=1):
    nbatch = X.shape[0]
    nsteps = nbatch // nenv
    h = tf.layers.flatten(X)

    # All but the last entry of `hiddens` are MLP layer sizes; the last is the LSTM size.
    for i in range(len(hiddens) - 1):
        h = utils.fc(h, 'mlp_fc{}'.format(i), nh=hiddens[i], init_scale=np.sqrt(2))
        if layer_norm:
            h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
        h = activation(h)
    nlstm = hiddens[-1]

    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

    xs = utils.batch_to_seq(h, nenv, nsteps)
    ms = utils.batch_to_seq(M, nenv, nsteps)

    if layer_norm:
        h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
    else:
        h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

    h = utils.seq_to_batch(h5)
    initial_state = np.zeros(S.shape.as_list(), dtype=float)

    return h, {'S': S, 'M': M, 'state': snew, 'initial_state': initial_state}

def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, add_flownet,
             reuse=False, flownet=None, train_from_scratch=False,
             recurrent=None, large_cnn=False, nlstm=64,
             add_predicted_flow_to_vec=False, diff_frames=False):
    ob_shape_vec = (nbatch,) + ob_space["vector"].shape
    nh, nw, nc = ob_space["image"].shape
    ob_shape_im = (nbatch, nh, nw, nc)
    actdim = ac_space.shape[0]

    X_vec = tf.placeholder(tf.float32, ob_shape_vec, name='Ob_vec')  # obs
    X_im = tf.placeholder(tf.uint8, ob_shape_im, name='Ob_im')
    if add_flownet:
        # adding previous image placeholder:
        X_p = tf.placeholder(tf.uint8, ob_shape_im, name='Ob_p')  # obs t-1
    else:
        X_p = None

    if recurrent:
        nenv = nbatch // nsteps
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    with tf.variable_scope("model", reuse=reuse):
        activ = tf.tanh
        h_im = mujoco_cnn(X_im, 'pi', nbatch,
                          add_flownet and not add_predicted_flow_to_vec,
                          X_p, flownet, train_from_scratch, large_cnn, diff_frames)
        if add_predicted_flow_to_vec:
            flow_vec = get_flow_vec(X_im, 'pi', nbatch, add_flownet, X_p,
                                    flownet, train_from_scratch, large_cnn, diff_frames)
            h_vec = tf.concat([X_vec, flow_vec], axis=-1)
            h_vec = activ(fc(h_vec, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
        else:
            h_vec = activ(fc(X_vec, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
        h1 = tf.concat([h_im, h_vec], 1)

        if recurrent:
            xs = batch_to_seq(h1, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            if recurrent == 'lstm':
                h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            else:
                assert recurrent == 'lnlstm'
                h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h2 = seq_to_batch(h5)
        else:
            h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))

        pi = fc(h2, 'pi', actdim, init_scale=0.01)
        vf = fc(h2, 'vf', 1)
        logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                 initializer=tf.zeros_initializer())

    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    a0_r = self.pd.mode()
    neglogp0 = self.pd.neglogp(a0)
    if not recurrent:
        self.initial_state = None
    else:
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    self.placeholder_dict = {"image": X_im, "vector": X_vec}
    if add_flownet:
        self.placeholder_dict["last_image"] = X_p

    if not recurrent:
        def step(ob, *_args, remove_noise=False, **_kwargs):
            feed_dict = {}
            for key, value in self.placeholder_dict.items():
                feed_dict[value] = ob[key]
            if not remove_noise:
                a, v, neglogp = sess.run([a0, v0, neglogp0], feed_dict=feed_dict)
            else:
                a, v, neglogp = sess.run([a0_r, v0, neglogp0], feed_dict=feed_dict)
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            feed_dict = {}
            for key, value in self.placeholder_dict.items():
                feed_dict[value] = ob[key]
            return sess.run(v0, feed_dict=feed_dict)
    else:
        def step(ob, state, mask, remove_noise=False):
            feed_dict = {}
            for key, value in self.placeholder_dict.items():
                feed_dict[value] = ob[key]
            feed_dict[S] = state
            feed_dict[M] = mask
            if not remove_noise:
                a, v, s, neglogp = sess.run([a0, v0, snew, neglogp0], feed_dict=feed_dict)
            else:
                a, v, s, neglogp = sess.run([a0_r, v0, snew, neglogp0], feed_dict=feed_dict)
            return a, v, s, neglogp

        def value(ob, state, mask):
            feed_dict = {}
            for key, value in self.placeholder_dict.items():
                feed_dict[value] = ob[key]
            feed_dict[S] = state
            feed_dict[M] = mask
            return sess.run(v0, feed_dict=feed_dict)

    self.X_im = X_im
    self.X_vec = X_vec
    self.X_p = X_p
    self.pi = pi
    if not recurrent:
        self.vf = v0
    else:
        self.vf = vf
    if recurrent:
        # M and S only exist in the recurrent branch; the original assigned
        # them unconditionally, which raised NameError when recurrent was falsy.
        self.M = M
        self.S = S
    self.step = step
    self.value = value

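# A hedged usage sketch for the dict-observation interface above; the batch
# variables are illustrative. Observation keys must match placeholder_dict:
ob = {"image": img_batch,    # uint8,   [nbatch, nh, nw, nc]
      "vector": vec_batch}   # float32, [nbatch, vec_dim]
if add_flownet:
    ob["last_image"] = prev_img_batch
a, v, s, neglogp = policy.step(ob, state, mask)  # recurrent-branch signature
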
def memory_fn(xs, ms, S, nh):
    return lnlstm(xs, ms, S, 'lnlstm1', nh=nh)

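# A hedged wiring sketch: memory_fn takes per-step sequences plus the carried
# state and returns (outputs, new_state), mirroring the lstm/lnlstm interface
# used throughout these snippets:
xs = batch_to_seq(h, nenv, nsteps)  # list of nsteps tensors, each [nenv, nh_in]
ms = batch_to_seq(M, nenv, nsteps)  # list of nsteps tensors, each [nenv]
h5, snew = memory_fn(xs, ms, S, nh=nlstm)
h = seq_to_batch(h5)                # back to [nenv*nsteps, nlstm]
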
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
    nbatch = nenv * nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc * nstack)
    nact = ac_space.n

    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    with tf.variable_scope("model", reuse=reuse):
        h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
        h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
        h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
        h3 = conv_to_fc(h3)
        h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
        xs = batch_to_seq(h4, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        pi = fc(h5, 'pi', nact, act=lambda x: x)
        pix = fc(h5, 'pix', FLAGS.screen_resolution, act=lambda x: x)
        piy = fc(h5, 'piy', FLAGS.screen_resolution, act=lambda x: x)
        vf = fc(h5, 'v', 1, act=lambda x: x)

    v0 = vf[:, 0]
    a0 = sample(pi)
    x0 = sample(pix)
    y0 = sample(piy)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        a, x, y, v, s = sess.run([a0, x0, y0, v0, snew], {X: ob, S: state, M: mask})
        return a, x, y, v, s

    def value(ob, state, mask):
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.pix = pix
    self.piy = piy
    self.vf = vf
    self.step = step
    self.value = value

def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    nenv = nbatch // nsteps
    ob_shape = (nbatch, ob_space.shape[0])  # flat observations, unlike the (nh, nw, nc) image case
    actdim = ac_space.shape[0]

    X = tf.placeholder(tf.float32, ob_shape, name='phOb')
    M = tf.placeholder(tf.float32, [nbatch], name='phMaskDone')  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2], name='phCellState')  # state and output: (c, h)

    with tf.variable_scope("model", reuse=reuse):
        # h = tf.add(X, 0, name='h')  # a bare identity isn't expressive enough; use an MLP
        h = mlp(X)
        xs = batch_to_seq(h, nenv, nsteps)  # list of nsteps tensors, each of shape [nenv, -1]
        ms = batch_to_seq(M, nenv, nsteps)  # list of nsteps tensors, each of shape [nenv, 1]
        h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        pi = fc(h5, 'fc_pi', actdim)
        # Alternate bounded action head, kept for reference:
        # acs = fc(h5, 'actions', actdim, init_scale=0.01)
        # move = tf.multiply(tf.nn.sigmoid(acs[:, 1:2]), 20, name='movement')
        # pi = tf.concat([acs[:, 0:1], move], axis=1, name='pi')
        vf = fc(h5, 'v', 1)
        logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                 initializer=tf.zeros_initializer())

    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    action = tf.add(a0, 0, name='action')  # use this tensor as the action at inference
    newState = tf.add(snew, 0, name='newCellState')
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value