def __init__(self, sess, ob_space, ac_space, ob_spaces, ac_spaces, nenv, nsteps, nstack, reuse=False, name='model'):
    """Build a binned-action policy head and a centralised value head.

    The policy network reads this agent's own observation placeholder and
    emits ``nbins`` logits for each of the ``ac_space.shape[0]`` action
    dimensions. The value network reads a placeholder as wide as all agents'
    observations combined and, when ``ob_spaces`` holds more than one entry,
    the ``A_v`` placeholder as well.

    Args:
        sess: session used by ``step`` and ``value`` to run the graph.
        ob_space: this agent's observation space; only ``shape[0]`` is read.
        ac_space: this agent's action space; only ``shape[0]`` is read.
        ob_spaces: observation spaces of all agents.
        ac_spaces: action spaces of all agents.
        nenv: multiplied by ``nsteps`` to obtain the batch size.
        nsteps: see ``nenv``.
        nstack: multiplier applied to every feature width.
        reuse: forwarded to ``tf.variable_scope``.
        name: suffix of the ``policy_`` / ``value_`` variable scopes.
    """
    nbins = 11
    batch = nenv * nsteps
    own_ob_shape = (batch, ob_space.shape[0] * nstack)
    joint_ob_shape = (batch, sum(space.shape[0] for space in ob_spaces) * nstack)
    nact = ac_space.shape[0]
    joint_ac_total = sum(space.shape[0] for space in ac_spaces)
    # A_v is sized as the total action width of all agents minus this agent's.
    others_ac_shape = (batch, (joint_ac_total - nact) * nstack)

    obs_x = tf.placeholder(tf.float32, own_ob_shape)  # obs
    X = obs_x
    X_v = tf.placeholder(tf.float32, joint_ob_shape)
    A_v = tf.placeholder(tf.float32, others_ac_shape)

    def linear(x):
        """Identity activation for the output layers."""
        return x

    gain = np.sqrt(2)

    with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
        actor_hidden = fc(X, 'fc1', nh=128, init_scale=gain)
        actor_hidden = fc(actor_hidden, 'fc2', nh=128, init_scale=gain)
        flat_logits = fc(actor_hidden, 'pi', nact * nbins, act=linear)

    with tf.variable_scope('value_{}'.format(name), reuse=reuse):
        critic_in = tf.concat([X_v, A_v], axis=1) if len(ob_spaces) > 1 else X_v
        critic_hidden = fc(critic_in, 'fc3', nh=256, init_scale=gain)
        critic_hidden = fc(critic_hidden, 'fc4', nh=256, init_scale=gain)
        vf = fc(critic_hidden, 'v', 1, act=linear)

    v0 = vf[:, 0]
    # One row of nbins logits per action dimension.
    pi = tf.reshape(flat_logits, [batch, nact, nbins])
    a0 = sample(pi, axis=2)
    self.initial_state = []  # not stateful

    def transform(a):
        """Map bin indices 0..nbins-1 linearly onto [-1, 1]."""
        bins = np.array(a, dtype=np.float32)
        top = nbins - 1
        return (bins - top / 2) / top * 2.0

    def step(ob, obs, a_v, *_args, **_kwargs):
        """Return (continuous actions in [-1, 1], values, dummy state)."""
        feed = {X: ob, X_v: obs}
        if a_v is not None:
            feed[A_v] = a_v
        a, v = sess.run([a0, v0], feed)
        return transform(a), v, []  # dummy state

    def value(ob, a_v, *_args, **_kwargs):
        """Return the value estimates; ``ob`` is fed to the ``X_v`` placeholder."""
        feed = {X_v: ob}
        if a_v is not None:
            feed[A_v] = a_v
        return sess.run(v0, feed)

    self.obs_x = obs_x
    self.X = X
    self.X_v = X_v
    self.A_v = A_v
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, ob_spaces, ac_spaces, nenv, nsteps, nstack, reuse=False, name='model'):
    """Build a categorical policy head and a centralised value head.

    The policy network maps the ``X`` placeholder to ``ac_space.n`` logits.
    The value network reads the ``X_v`` placeholder and, when ``ob_spaces``
    holds more than one entry, the ``A_v`` placeholder concatenated to it.

    Args:
        sess: session used by ``step`` and ``value`` to run the graph.
        ob_space: this agent's observation space; only ``shape[0]`` is read.
        ac_space: this agent's action space; only ``n`` is read.
        ob_spaces: observation spaces of all agents.
        ac_spaces: action spaces of all agents (``n`` is read from each).
        nenv: multiplied by ``nsteps`` to obtain the batch size.
        nsteps: see ``nenv``.
        nstack: multiplier applied to every feature width.
        reuse: forwarded to ``tf.variable_scope``.
        name: suffix of the ``policy_`` / ``value_`` variable scopes.
    """
    batch = nenv * nsteps
    own_ob_shape = (batch, ob_space.shape[0] * nstack)
    joint_ob_shape = (batch, sum(space.shape[0] for space in ob_spaces) * nstack)
    nact = ac_space.n
    joint_ac_total = sum(space.n for space in ac_spaces)
    # A_v is sized as the total action count of all agents minus this agent's.
    others_ac_shape = (batch, (joint_ac_total - nact) * nstack)

    X = tf.placeholder(tf.float32, own_ob_shape, name='X')  # obs
    X_v = tf.placeholder(tf.float32, joint_ob_shape, name='X_v')
    A_v = tf.placeholder(tf.float32, others_ac_shape, name='A_v')

    def linear(x):
        """Identity activation for the output layers."""
        return x

    gain = np.sqrt(2)

    with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
        actor_hidden = fc(X, 'fc1', nh=128, init_scale=gain)
        actor_hidden = fc(actor_hidden, 'fc2', nh=128, init_scale=gain)
        pi = fc(actor_hidden, 'pi', nact, act=linear)

    with tf.variable_scope('value_{}'.format(name), reuse=reuse):
        critic_in = tf.concat([X_v, A_v], axis=1) if len(ob_spaces) > 1 else X_v
        critic_hidden = fc(critic_in, 'fc3', nh=256, init_scale=gain)
        critic_hidden = fc(critic_hidden, 'fc4', nh=256, init_scale=gain)
        vf = fc(critic_hidden, 'v', 1, act=linear)

    v0 = vf[:, 0]
    a0 = sample(pi)
    self.initial_state = []  # not stateful

    def step(ob, obs, a_v, *_args, **_kwargs):
        """Return (sampled actions, values, dummy state)."""
        feed = {X: ob, X_v: obs}
        if a_v is not None:
            feed[A_v] = a_v
        a, v = sess.run([a0, v0], feed)
        return a, v, []  # dummy state

    def value(ob, a_v, *_args, **_kwargs):
        """Return the value estimates; ``ob`` is fed to the ``X_v`` placeholder."""
        feed = {X_v: ob}
        if a_v is not None:
            feed[A_v] = a_v
        return sess.run(v0, feed)

    self.X = X
    self.X_v = X_v
    self.A_v = A_v
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, ob_spaces, ac_spaces, nenv, nsteps, nstack, reuse=False, name='model'):
    """Build a categorical policy with log-probability queries and a value head.

    The policy network maps the ``X`` placeholder to ``ac_space.n`` logits.
    The value network reads the ``X_v`` placeholder and, when ``ob_spaces``
    holds more than one entry, the ``A_v`` placeholder concatenated to it.
    ``self.log_prob`` is the log-probability the softmax over the logits
    assigns to the integer actions fed through an internal placeholder.

    Args:
        sess: session used by the ``step*`` / ``value`` closures.
        ob_space: this agent's observation space; only ``shape[0]`` is read.
        ac_space: this agent's action space; only ``n`` is read.
        ob_spaces: observation spaces of all agents.
        ac_spaces: action spaces of all agents (``n`` is read from each).
        nenv: multiplied by ``nsteps`` to obtain the batch size.
        nsteps: see ``nenv``.
        nstack: multiplier applied to every feature width.
        reuse: forwarded to ``tf.variable_scope``.
        name: suffix of the ``policy_`` / ``value_`` variable scopes.
    """
    nbatch = nenv * nsteps
    ob_shape = (nbatch, ob_space.shape[0] * nstack)
    all_ob_shape = (nbatch, sum(obs.shape[0] for obs in ob_spaces) * nstack)
    nact = ac_space.n
    actions = tf.placeholder(tf.int32, (nbatch,))
    # A_v is sized as the total action count of all agents minus this agent's.
    all_ac_shape = (nbatch, (sum(ac.n for ac in ac_spaces) - nact) * nstack)
    obs_x = tf.placeholder(tf.float32, ob_shape)  # obs
    X = obs_x
    X_v = tf.placeholder(tf.float32, all_ob_shape)
    A_v = tf.placeholder(tf.float32, all_ac_shape)

    with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
        h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
        h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
        pi = fc(h2, 'pi', nact, act=lambda x: x)

    with tf.variable_scope('value_{}'.format(name), reuse=reuse):
        if len(ob_spaces) > 1:
            Y = tf.concat([X_v, A_v], axis=1)
        else:
            Y = X_v
        h3 = fc(Y, 'fc3', nh=256, init_scale=np.sqrt(2))
        h4 = fc(h3, 'fc4', nh=256, init_scale=np.sqrt(2))
        vf = fc(h4, 'v', 1, act=lambda x: x)

    # Negated cross-entropy against integer labels == log softmax(pi)[action].
    # (The debug print of `pi` / `actions` that used to sit here was removed.)
    self.log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=actions)
    v0 = vf[:, 0]
    a0 = sample(pi)
    self.initial_state = []  # not stateful

    def step_log_prob(ob, acts):
        """Return the log-probabilities of ``acts`` given ``ob`` as a column."""
        log_prob = sess.run(self.log_prob, {X: ob, actions: acts})
        return log_prob.reshape([-1, 1])

    def step(ob, obs, a_v, *_args, **_kwargs):
        """Return (sampled actions, values, dummy state)."""
        feed = {X: ob, X_v: obs}
        if a_v is not None:
            feed[A_v] = a_v
        a, v = sess.run([a0, v0], feed)
        return a, v, []  # dummy state

    def value(ob, a_v, *_args, **_kwargs):
        """Return the value estimates; ``ob`` is fed to the ``X_v`` placeholder."""
        feed = {X_v: ob}
        if a_v is not None:
            feed[A_v] = a_v
        return sess.run(v0, feed)

    self.obs_x = obs_x
    self.X = X
    self.X_v = X_v
    self.A_v = A_v
    self.pi = pi
    self.vf = vf
    self.step_log_prob = step_log_prob
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, ob_spaces, ac_spaces, nenv, nsteps, nstack, reuse=False, name='model', agent_id=None):
    """Build an opponent model that predicts the other agents' binned actions.

    A shared two-layer trunk reads this agent's own observation. For every
    agent index ``k != agent_id`` a separate head emits ``nbins`` logits per
    action dimension of that agent. The heads are joined along the
    action-dimension axis, so ``self.pi`` has shape
    ``[nbatch, total opponent action dims, nbins]``.

    Args:
        sess: session used by ``step`` to run the graph.
        ob_space: this agent's observation space; only ``shape[0]`` is read.
        ac_space: this agent's action space; only ``shape[0]`` is read.
        ob_spaces: observation spaces of all agents.
        ac_spaces: action spaces of all agents (``shape[0]`` is read).
        nenv: multiplied by ``nsteps`` to obtain the batch size.
        nsteps: see ``nenv``.
        nstack: multiplier applied to every feature width.
        reuse: forwarded to ``tf.variable_scope``.
        name: suffix of the ``oppo_`` variable scope.
        agent_id: index of the agent that owns this model; no head is built
            for it. Previously this name was read without being a parameter,
            which raised ``NameError``; it must now be passed explicitly.

    Raises:
        ValueError: if ``agent_id`` is not supplied.
    """
    if agent_id is None:
        raise ValueError(
            'agent_id is required: it selects the agent that is skipped '
            'when building the opponent heads')
    self.agent_id = agent_id
    nbins = 11
    nbatch = nenv * nsteps
    ob_shape = (nbatch, ob_space.shape[0] * nstack)
    all_ob_shape = (nbatch, sum(obs.shape[0] for obs in ob_spaces) * nstack)
    nact = ac_space.shape[0]
    all_ac_shape = (nbatch, (sum(ac.shape[0] for ac in ac_spaces) - nact) * nstack)
    obs_x = tf.placeholder(tf.float32, ob_shape)  # obs
    X = obs_x
    # X_v and A_v do not feed the opponent heads; they are only created and
    # exposed as attributes.
    X_v = tf.placeholder(tf.float32, all_ob_shape)
    A_v = tf.placeholder(tf.float32, all_ac_shape)

    heads = []  # (flat logits, number of action dims) per modelled opponent
    with tf.variable_scope('oppo_{}'.format(name), reuse=reuse):
        h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
        h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
        for k in range(len(ob_spaces)):
            if k == agent_id:
                continue
            # Use the opponent's action width; the space object itself
            # cannot be multiplied by an int.
            k_dims = ac_spaces[k].shape[0]
            heads.append((fc(h2, 'pi%d' % k, k_dims * nbins, act=lambda x: x), k_dims))

    # Reshape each head on its own and join on the action-dim axis. With a
    # single opponent whose action width equals `nact` this yields the same
    # [nbatch, nact, nbins] tensor the former list-reshape produced, and it
    # stays well-formed for more (or differently sized) opponents.
    pi = tf.concat(
        [tf.reshape(logits, [nbatch, k_dims, nbins]) for logits, k_dims in heads],
        axis=1)
    a0 = sample(pi, axis=2)
    self.initial_state = []  # not stateful

    def step(ob, obs, *_args, **_kwargs):
        """Return the sampled opponent bin indices (not rescaled)."""
        a = sess.run(a0, {X: ob, X_v: obs})
        return a

    self.obs_x = obs_x
    self.X = X
    self.X_v = X_v
    self.A_v = A_v
    self.pi = pi
    self.step = step
def __init__(self, sess, obe_space, obn_space, ac_space, nenv, nsteps, nstack, nedges, nnodes, reuse=False, name='model'):
    """Build a graph-structured policy head and a matching value head.

    Both heads stack two ``graphlayer`` calls followed by a ``graphblock``.
    Each head has its own set of nine input placeholders (sender-node,
    receiver-node, edge and node features plus five matrices named e2ns,
    e2nr, ns2e, b2e and b2n); the sets are exposed as the dicts ``self.X``
    and ``self.X_v``.

    Args:
        sess: session used by ``step`` and ``value`` to run the graph.
        obe_space: edge feature width before stacking (multiplied by ``nstack``).
        obn_space: node feature width before stacking (multiplied by ``nstack``).
        ac_space: action space; only ``n`` is read.
        nenv: multiplied by ``nsteps`` to obtain the batch size.
        nsteps: see ``nenv``.
        nstack: multiplier applied to the feature widths.
        nedges: edges per batch element.
        nnodes: nodes per batch element.
        reuse: forwarded to ``tf.variable_scope``.
        name: suffix of the ``policy_`` / ``value_`` variable scopes.
    """
    nbatch = nenv * nsteps
    edge_rows = nbatch * nedges
    node_width = obn_space * nstack
    edge_width = obe_space * nstack
    node_rows = nbatch * nnodes

    endpoint_shape = (edge_rows, node_width)  # sender / receiver node features
    edge_shape = (edge_rows, edge_width)
    node_shape = (node_rows, node_width)

    # Matrix placeholders leave a dimension unspecified when the count is falsy.
    nbnn = node_rows if nnodes else None
    nbne = edge_rows if nedges else None

    nact = ac_space.n
    nfs = efs = 12
    f32 = tf.float32

    # Policy-side inputs.
    X_ns = tf.placeholder(f32, endpoint_shape)
    X_nr = tf.placeholder(f32, endpoint_shape)
    X_e = tf.placeholder(f32, edge_shape)
    X_n = tf.placeholder(f32, node_shape)
    e2ns = tf.placeholder(f32, (nbne, nbnn))
    e2nr = tf.placeholder(f32, (nbne, nbnn))
    ns2e = tf.placeholder(f32, (nbnn, nbne))
    b2e = tf.placeholder(f32, (nbatch, nbne))
    b2n = tf.placeholder(f32, (nbatch, nbnn))

    # Value-side inputs (same shapes as the policy side).
    X_ns_v = tf.placeholder(f32, endpoint_shape)
    X_nr_v = tf.placeholder(f32, endpoint_shape)
    X_e_v = tf.placeholder(f32, edge_shape)
    X_n_v = tf.placeholder(f32, node_shape)
    e2ns_v = tf.placeholder(f32, (nbne, nbnn))
    e2nr_v = tf.placeholder(f32, (nbne, nbnn))
    ns2e_v = tf.placeholder(f32, (nbnn, nbne))
    b2e_v = tf.placeholder(f32, (nbatch, nbne))
    b2n_v = tf.placeholder(f32, (nbatch, nbnn))

    # Created and exposed as an attribute, but not wired into either head.
    A_v = tf.placeholder(f32, (nbatch, nact * nstack))

    gain = np.sqrt(2)

    with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
        actor_graph = [X_ns, X_nr, X_e, X_n, e2ns, e2nr, ns2e]
        actor_graph = graphlayer(actor_graph, 'gl1', 128, nfs, efs, init_scale=gain)
        actor_graph = graphlayer(actor_graph, 'gl2', 128, nfs, efs, init_scale=gain)
        # Entries 2 and 3 of the layer output are passed on with b2e / b2n.
        actor_readout = [actor_graph[2], actor_graph[3], b2e, b2n]
        pi = graphblock(actor_readout, 'pi', 128, nact, init_scale=gain)

    with tf.variable_scope('value_{}'.format(name), reuse=reuse):
        critic_graph = [X_ns_v, X_nr_v, X_e_v, X_n_v, e2ns_v, e2nr_v, ns2e_v]
        critic_graph = graphlayer(critic_graph, 'gl3', 128, nfs, efs, init_scale=gain)
        critic_graph = graphlayer(critic_graph, 'gl4', 128, nfs, efs, init_scale=gain)
        critic_readout = [critic_graph[2], critic_graph[3], b2e_v, b2n_v]
        vf = graphblock(critic_readout, 'v', 128, 1, init_scale=gain)

    v0 = vf[:, 0]
    a0 = sample(pi)
    self.initial_state = []  # not stateful

    def step(ob, obs, a_v, *_args, **_kwargs):
        """Return (sampled actions, values, dummy state).

        ``ob`` and ``obs`` are each unpacked into nine arrays, in placeholder
        order, for the policy and value inputs respectively. ``a_v`` is
        accepted but not used.
        """
        (p_ns, p_nr, p_e, p_n, p_e2ns, p_e2nr, p_ns2e, p_b2e, p_b2n) = ob
        (c_ns, c_nr, c_e, c_n, c_e2ns, c_e2nr, c_ns2e, c_b2e, c_b2n) = obs
        feed = {
            X_ns: p_ns, X_nr: p_nr, X_e: p_e, X_n: p_n,
            e2ns: p_e2ns, e2nr: p_e2nr, ns2e: p_ns2e,
            b2e: p_b2e, b2n: p_b2n,
        }
        feed.update({
            X_ns_v: c_ns, X_nr_v: c_nr, X_e_v: c_e, X_n_v: c_n,
            e2ns_v: c_e2ns, e2nr_v: c_e2nr, ns2e_v: c_ns2e,
            b2e_v: c_b2e, b2n_v: c_b2n,
        })
        a, v = sess.run([a0, v0], feed)
        return a, v, []  # dummy state

    def value(ob, a_v, *_args, **_kwargs):
        """Return the value estimates; ``ob`` holds the nine value-side inputs."""
        (c_ns, c_nr, c_e, c_n, c_e2ns, c_e2nr, c_ns2e, c_b2e, c_b2n) = ob
        feed = {
            X_ns_v: c_ns, X_nr_v: c_nr, X_e_v: c_e, X_n_v: c_n,
            e2ns_v: c_e2ns, e2nr_v: c_e2nr, ns2e_v: c_ns2e,
            b2e_v: c_b2e, b2n_v: c_b2n,
        }
        return sess.run(v0, feed)

    self.X = {
        "X_ns": X_ns,
        "X_nr": X_nr,
        "X_e": X_e,
        "X_n": X_n,
        "e2ns": e2ns,
        "e2nr": e2nr,
        "ns2e": ns2e,
        "b2e": b2e,
        "b2n": b2n,
    }
    self.X_v = {
        "X_ns": X_ns_v,
        "X_nr": X_nr_v,
        "X_e": X_e_v,
        "X_n": X_n_v,
        "e2ns": e2ns_v,
        "e2nr": e2nr_v,
        "ns2e": ns2e_v,
        "b2e": b2e_v,
        "b2n": b2n_v,
    }
    self.A_v = A_v
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, oppo_policy, ob_space, ac_space, op_ac_n, ob_spaces, ac_spaces, nenv, nsteps, nstack, reuse=False, name='model'):
    """Build a categorical policy conditioned on an opponent model's output.

    The policy input is this agent's observation concatenated with the
    opponent model's heads (``oppo_policy.pi``, joined on axis 1). The value
    network reads the ``X_v`` placeholder and, when ``ob_spaces`` holds more
    than one entry, the ``A_v`` placeholder concatenated to it.

    Args:
        sess: session used by the ``step*`` / ``value`` closures.
        oppo_policy: opponent model; must expose ``pi`` (a non-empty list of
            tensors) and ``obs_x`` (its observation placeholder).
        ob_space: this agent's observation space; only ``shape[0]`` is read.
        ac_space: this agent's action space; only ``n`` is read.
        op_ac_n: kept for interface compatibility; not used by the graph.
        ob_spaces: observation spaces of all agents.
        ac_spaces: action spaces of all agents (``n`` is read from each).
        nenv: multiplied by ``nsteps`` to obtain the batch size.
        nsteps: see ``nenv``.
        nstack: multiplier applied to every feature width.
        reuse: forwarded to ``tf.variable_scope``.
        name: suffix of the ``policy_`` / ``value_`` variable scopes.
    """
    # nstack always = 1
    self.oppo_policy = oppo_policy
    nbatch = nenv * nsteps
    ob_shape = (nbatch, ob_space.shape[0] * nstack)
    all_ob_shape = (nbatch, sum(obs.shape[0] for obs in ob_spaces) * nstack)
    nact = ac_space.n
    actions = tf.placeholder(tf.int32, (nbatch,))
    all_ac_shape = (nbatch, (sum(ac.n for ac in ac_spaces) - nact) * nstack)

    # (k, batch, act_nums) -> (batch, sum_k(act_nums))
    oppo_a_list = self.oppo_policy.pi
    oppo_a0 = oppo_a_list[0]
    for head in oppo_a_list[1:]:
        oppo_a0 = tf.concat([oppo_a0, head], axis=1)

    obs_x = tf.placeholder(tf.float32, ob_shape)  # obs, not state (all obs)
    op_act_x = oppo_a0  # opponents' act; a graph tensor that is fed at run time
    X = tf.concat([obs_x, op_act_x], axis=1)  # policy input
    X_v = tf.placeholder(tf.float32, all_ob_shape)
    A_v = tf.placeholder(tf.float32, all_ac_shape)

    with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
        h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
        h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
        pi = fc(h2, 'pi', nact, act=lambda x: x)

    with tf.variable_scope('value_{}'.format(name), reuse=reuse):
        if len(ob_spaces) > 1:
            Y = tf.concat([X_v, A_v], axis=1)
        else:
            Y = X_v
        h3 = fc(Y, 'fc3', nh=256, init_scale=np.sqrt(2))
        h4 = fc(h3, 'fc4', nh=256, init_scale=np.sqrt(2))
        vf = fc(h4, 'v', 1, act=lambda x: x)

    # Negated cross-entropy against integer labels == log softmax(pi)[action].
    self.log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi, labels=actions)
    v0 = vf[:, 0]
    a0 = sample(pi)
    self.initial_state = []  # not stateful

    def predict_opponents(ob):
        """Run the opponent model on ``ob`` and return its joined heads."""
        return sess.run(oppo_a0, {oppo_policy.obs_x: ob})

    def step_log_prob(ob, acts):
        """Return the log-probabilities of ``acts`` given ``ob`` as a column.

        ``X`` is the concat of ``obs_x`` and the opponent output, so it is
        wider than ``ob`` and cannot be fed with it directly. Feed the two
        parts separately, exactly as ``step`` does.
        """
        oppo_a = predict_opponents(ob)
        log_prob = sess.run(
            self.log_prob, {obs_x: ob, op_act_x: oppo_a, actions: acts})
        return log_prob.reshape([-1, 1])

    def step(ob, obs, a_v, *_args, **_kwargs):
        """Return (sampled actions, values, dummy state)."""
        feed = {obs_x: ob, op_act_x: predict_opponents(ob), X_v: obs}
        if a_v is not None:
            feed[A_v] = a_v
        a, v = sess.run([a0, v0], feed)
        return a, v, []  # dummy state

    def value(ob, obs, a_v, *_args, **_kwargs):
        """Return the value estimates for ``obs`` (fed to ``X_v``)."""
        # NOTE(review): v0 is built only from X_v / A_v, so the opponent feed
        # looks redundant here; it is kept to leave this path unchanged.
        feed = {X_v: obs, op_act_x: predict_opponents(ob)}
        if a_v is not None:
            feed[A_v] = a_v
        return sess.run(v0, feed)

    self.obs_x = obs_x
    self.op_act_x = op_act_x
    self.X = obs_x
    self.X_v = X_v
    self.A_v = A_v
    self.pi = pi
    self.vf = vf
    self.step_log_prob = step_log_prob
    self.step = step
    self.value = value
    self.oppo_a = oppo_a0
def __init__(self, sess, agent_id, ob_space, ac_space, ob_spaces, ac_spaces, nenv, nsteps, nstack, reuse=False, name='model'):
    """Build an opponent model that predicts the other agents' discrete actions.

    A shared two-layer trunk reads this agent's own observation. For every
    agent index ``k != agent_id`` a separate head emits ``ac_spaces[k].n``
    logits. ``self.pi`` and ``self.log_prob`` are lists with one entry per
    modelled opponent, in increasing ``k``.

    Args:
        sess: session used by the ``step*`` closures.
        agent_id: index of the agent that owns this model; no head is built
            for it.
        ob_space: this agent's observation space; only ``shape[0]`` is read.
        ac_space: this agent's action count, used directly as a number.
        ob_spaces: observation spaces of all agents.
        ac_spaces: action spaces of all agents (``n`` is read from each).
        nenv: multiplied by ``nsteps`` to obtain the batch size.
        nsteps: see ``nenv``.
        nstack: multiplier applied to every feature width.
        reuse: forwarded to ``tf.variable_scope``.
        name: suffix of the ``oppo_`` variable scope.
    """
    self.agent_id = agent_id
    nbatch = nenv * nsteps
    ob_shape = (nbatch, ob_space.shape[0] * nstack)
    all_ob_shape = (nbatch, sum(obs.shape[0] for obs in ob_spaces) * nstack)
    nact = ac_space
    # One integer-label placeholder per modelled opponent.
    actions = [
        tf.placeholder(tf.int32, (nbatch,)) for _ in range(len(ob_spaces) - 1)
    ]
    all_ac_shape = (nbatch, (sum(ac.n for ac in ac_spaces) - nact) * nstack)
    obs_x = tf.placeholder(tf.float32, ob_shape)  # obs
    X = obs_x
    # X_v and A_v do not feed the opponent heads; they are only created and
    # exposed as attributes.
    X_v = tf.placeholder(tf.float32, all_ob_shape)
    A_v = tf.placeholder(tf.float32, all_ac_shape)

    with tf.variable_scope('oppo_{}'.format(name), reuse=reuse):
        h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
        h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
        pi = []
        for k in range(len(ob_spaces)):
            if k == agent_id:
                continue
            pi.append(fc(h2, 'pi_%d' % k, ac_spaces[k].n, act=lambda x: x))

    # Negated cross-entropy against integer labels == log softmax(logits)[action].
    self.log_prob = [
        -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
        for logits, labels in zip(pi, actions)
    ]
    a0 = [sample(logits) for logits in pi]
    self.initial_state = []  # not stateful

    def step_log_prob(ob, acts_n):
        """Return the log-probabilities of the opponents' actions as a column.

        ``acts_n`` holds one action batch per agent; the entry at
        ``agent_id`` is dropped. ``sess.run`` on the list ``self.log_prob``
        returns a list of arrays, so it is converted to an array before
        reshaping. With several opponents their rows are stacked one
        opponent after another.
        """
        acts = [a for i, a in enumerate(acts_n) if i != self.agent_id]
        feed_dict = {X: ob}
        feed_dict.update(zip(actions, acts))
        log_prob = sess.run(self.log_prob, feed_dict)
        return np.asarray(log_prob).reshape([-1, 1])

    def step(ob, obs, a_v, *_args, **_kwargs):
        """Return a list with one batch of sampled actions per opponent."""
        a = sess.run(a0, {X: ob, X_v: obs})
        return a

    self.obs_x = obs_x
    self.X = X
    self.X_v = X_v
    self.A_v = A_v
    self.pi = pi
    self.step_log_prob = step_log_prob
    self.step = step