Example #1
    def __init__(self, sess, ob_space, ac_space, ob_spaces, ac_spaces,
                 nenv, nsteps, nstack, reuse=False, name='model'):
        nbins = 11
        nbatch = nenv * nsteps
        ob_shape = (nbatch, ob_space.shape[0] * nstack)
        all_ob_shape = (nbatch, sum([obs.shape[0] for obs in ob_spaces]) * nstack)
        nact = ac_space.shape[0]
        all_ac_shape = (nbatch, (sum([ac.shape[0] for ac in ac_spaces]) - nact) * nstack)
        obs_x = tf.placeholder(tf.float32, ob_shape)  # obs
        X = obs_x
        X_v = tf.placeholder(tf.float32, all_ob_shape)
        A_v = tf.placeholder(tf.float32, all_ac_shape)
        with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
            h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
            h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
            pi = fc(h2, 'pi', nact * nbins, act=lambda x: x)

        with tf.variable_scope('value_{}'.format(name), reuse=reuse):
            if len(ob_spaces) > 1:
                Y = tf.concat([X_v, A_v], axis=1)
            else:
                Y = X_v
            h3 = fc(Y, 'fc3', nh=256, init_scale=np.sqrt(2))
            h4 = fc(h3, 'fc4', nh=256, init_scale=np.sqrt(2))
            vf = fc(h4, 'v', 1, act=lambda x: x)

        v0 = vf[:, 0]
        pi = tf.reshape(pi, [nbatch, nact, nbins])
        a0 = sample(pi, axis=2)
        self.initial_state = []  # not stateful

        def step(ob, obs, a_v, *_args, **_kwargs):
            # output continuous actions within [-1, 1]
            if a_v is not None:
                a, v = sess.run([a0, v0], {X: ob, X_v: obs, A_v: a_v})
            else:
                a, v = sess.run([a0, v0], {X: ob, X_v: obs})
            a = transform(a)
            return a, v, []  # dummy state

        def value(ob, a_v, *_args, **_kwargs):
            if a_v is not None:
                return sess.run(v0, {X_v: ob, A_v: a_v})
            else:
                return sess.run(v0, {X_v: ob})

        def transform(a):
            # map bin indices in [0, nbins - 1] to actions in [-1, 1]
            a = np.array(a, dtype=np.float32)
            a = (a - (nbins - 1) / 2) / (nbins - 1) * 2.0
            return a

        self.obs_x = obs_x
        self.X = X
        self.X_v = X_v
        self.A_v = A_v
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
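
All of these examples lean on two helpers, fc and sample, that are not shown in the snippets. The sketch below is only an assumption of what baselines-style versions look like (an orthogonal-initialized dense layer and Gumbel-max categorical sampling), not the project's actual code. Example #1 uses them to discretize each continuous action dimension into nbins = 11 bins: pi is reshaped to (nbatch, nact, nbins), sample(pi, axis=2) draws a bin index per dimension, and transform maps that index onto [-1, 1].

import numpy as np
import tensorflow as tf

def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
    # dense layer: act(x @ w + b), orthogonal init scaled by init_scale
    with tf.variable_scope(scope):
        nin = x.get_shape()[1].value
        w = tf.get_variable('w', [nin, nh],
                            initializer=tf.orthogonal_initializer(init_scale))
        b = tf.get_variable('b', [nh],
                            initializer=tf.constant_initializer(0.0))
        return act(tf.matmul(x, w) + b)

def sample(logits, axis=1):
    # Gumbel-max trick: draw one category per row from unnormalized logits
    noise = tf.random_uniform(tf.shape(logits))
    return tf.argmax(logits - tf.log(-tf.log(noise)), axis=axis)
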
Example #2
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 ob_spaces,
                 ac_spaces,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False,
                 name='model'):
        nbatch = nenv * nsteps
        ob_shape = (nbatch, ob_space.shape[0] * nstack)
        all_ob_shape = (nbatch, sum([obs.shape[0]
                                     for obs in ob_spaces]) * nstack)
        nact = ac_space.n
        all_ac_shape = (nbatch, (sum([ac.n
                                      for ac in ac_spaces]) - nact) * nstack)
        X = tf.placeholder(tf.float32, ob_shape, name='X')  # obs
        X_v = tf.placeholder(tf.float32, all_ob_shape, name='X_v')
        A_v = tf.placeholder(tf.float32, all_ac_shape, name='A_v')
        with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
            h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
            h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
            pi = fc(h2, 'pi', nact, act=lambda x: x)

        with tf.variable_scope('value_{}'.format(name), reuse=reuse):
            if len(ob_spaces) > 1:
                Y = tf.concat([X_v, A_v], axis=1)
            else:
                Y = X_v
            h3 = fc(Y, 'fc3', nh=256, init_scale=np.sqrt(2))
            h4 = fc(h3, 'fc4', nh=256, init_scale=np.sqrt(2))
            vf = fc(h4, 'v', 1, act=lambda x: x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = []  # not stateful

        def step(ob, obs, a_v, *_args, **_kwargs):
            if a_v is not None:
                a, v = sess.run([a0, v0], {X: ob, X_v: obs, A_v: a_v})
            else:
                a, v = sess.run([a0, v0], {X: ob, X_v: obs})
            return a, v, []  # dummy state

        def value(ob, a_v, *_args, **_kwargs):
            if a_v is not None:
                return sess.run(v0, {X_v: ob, A_v: a_v})
            else:
                return sess.run(v0, {X_v: ob})

        self.X = X
        self.X_v = X_v
        self.A_v = A_v
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
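
A hypothetical usage sketch for the discrete-action variant above. The enclosing class name (CategoricalPolicy), the gym spaces, and the zero-filled inputs are all assumptions made up for illustration; only the constructor arguments and the step signature come from the snippet.

import gym
import numpy as np
import tensorflow as tf

ob_spaces = [gym.spaces.Box(low=-1.0, high=1.0, shape=(8,)) for _ in range(2)]
ac_spaces = [gym.spaces.Discrete(5) for _ in range(2)]
nenv, nsteps, nstack = 4, 1, 1
nbatch = nenv * nsteps

sess = tf.Session()
# assumes the __init__ above belongs to a class named CategoricalPolicy
policy = CategoricalPolicy(sess, ob_spaces[0], ac_spaces[0], ob_spaces, ac_spaces,
                           nenv, nsteps, nstack, name='agent_0')
sess.run(tf.global_variables_initializer())

ob = np.zeros((nbatch, 8), dtype=np.float32)    # this agent's stacked observation
obs = np.zeros((nbatch, 16), dtype=np.float32)  # all agents' observations (X_v)
a_v = np.zeros((nbatch, 5), dtype=np.float32)   # other agents' one-hot actions (A_v)
a, v, _ = policy.step(ob, obs, a_v)             # sampled actions and value estimates
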
Example #3
    def __init__(self, sess, ob_space, ac_space, ob_spaces, ac_spaces,
                 nenv, nsteps, nstack, reuse=False, name='model'):
        nbatch = nenv * nsteps
        ob_shape = (nbatch, ob_space.shape[0] * nstack)
        all_ob_shape = (nbatch, sum([obs.shape[0] for obs in ob_spaces]) * nstack)
        nact = ac_space.n
        actions = tf.placeholder(tf.int32, (nbatch))
        all_ac_shape = (nbatch, (sum([ac.n for ac in ac_spaces]) - nact) * nstack)
        obs_x = tf.placeholder(tf.float32, ob_shape)  # obs
        X = obs_x
        X_v = tf.placeholder(tf.float32, all_ob_shape)
        A_v = tf.placeholder(tf.float32, all_ac_shape)
        with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
            h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
            h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
            pi = fc(h2, 'pi', nact, act=lambda x: x)

        with tf.variable_scope('value_{}'.format(name), reuse=reuse):
            if len(ob_spaces) > 1:
                Y = tf.concat([X_v, A_v], axis=1)
            else:
                Y = X_v
            h3 = fc(Y, 'fc3', nh=256, init_scale=np.sqrt(2))
            h4 = fc(h3, 'fc4', nh=256, init_scale=np.sqrt(2))
            vf = fc(h4, 'v', 1, act=lambda x: x)

        print(pi, actions)
        self.log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=actions)
        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = []  # not stateful

        def step_log_prob(ob, acts):
            log_prob = sess.run(self.log_prob, {X: ob, actions: acts})
            return log_prob.reshape([-1, 1])

        def step(ob, obs, a_v, *_args, **_kwargs):
            if a_v is not None:
                a, v = sess.run([a0, v0], {X: ob, X_v: obs, A_v: a_v})
            else:
                a, v = sess.run([a0, v0], {X: ob, X_v: obs})
            return a, v, []  # dummy state

        def value(ob, a_v, *_args, **_kwargs):
            if a_v is not None:
                return sess.run(v0, {X_v: ob, A_v: a_v})
            else:
                return sess.run(v0, {X_v: ob})

        self.obs_x = obs_x
        self.X = X
        self.X_v = X_v
        self.A_v = A_v
        self.pi = pi
        self.vf = vf
        self.step_log_prob = step_log_prob
        self.step = step
        self.value = value
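
The log_prob tensor in Example #3 is simply the log-probability of the taken action under the softmax policy, since -sparse_softmax_cross_entropy_with_logits(logits, a) = log softmax(logits)[a]. A NumPy-only check of that identity (not part of the original code):

import numpy as np

logits = np.array([[2.0, 0.5, -1.0]], dtype=np.float32)
a = np.array([0])
log_softmax = logits - np.log(np.sum(np.exp(logits), axis=1, keepdims=True))
log_prob = log_softmax[np.arange(len(a)), a]   # what step_log_prob returns per row
print(log_prob.reshape([-1, 1]))               # [[-0.24...]] = log softmax(logits)[0]
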
Example #4
    def __init__(self,
                 sess,
                 agent_id,
                 ob_space,
                 ac_space,
                 ob_spaces,
                 ac_spaces,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False,
                 name='model'):
        self.agent_id = agent_id
        nbins = 11
        nbatch = nenv * nsteps
        ob_shape = (nbatch, ob_space.shape[0] * nstack)
        all_ob_shape = (nbatch, sum([obs.shape[0]
                                     for obs in ob_spaces]) * nstack)
        nact = ac_space.shape[0]
        all_ac_shape = (nbatch, (sum([ac.shape[0]
                                      for ac in ac_spaces]) - nact) * nstack)
        obs_x = tf.placeholder(tf.float32, ob_shape)  # obs
        X = obs_x
        X_v = tf.placeholder(tf.float32, all_ob_shape)
        A_v = tf.placeholder(tf.float32, all_ac_shape)
        with tf.variable_scope('oppo_{}'.format(name), reuse=reuse):
            h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
            h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
            pi = []
            for k in range(len(ob_spaces)):
                if k == agent_id:
                    continue
                pi.append(
                    fc(h2, 'pi%d' % k, ac_spaces[k].shape[0] * nbins,
                       act=lambda x: x))

        # concatenate the per-opponent logits, then split each action dimension into its bins
        pi = tf.concat(pi, axis=1)
        pi = tf.reshape(pi, [nbatch, -1, nbins])
        a0 = sample(pi, axis=2)
        self.initial_state = []  # not stateful

        def step(ob, obs, *_args, **_kwargs):
            a = sess.run(a0, {X: ob, X_v: obs})
            a = transform(a)  # map sampled bin indices to continuous actions
            return a

        def transform(a):
            # map bin indices in [0, nbins - 1] to actions in [-1, 1]
            a = np.array(a, dtype=np.float32)
            a = (a - (nbins - 1) / 2) / (nbins - 1) * 2.0
            return a

        self.obs_x = obs_x
        self.X = X
        self.X_v = X_v
        self.A_v = A_v
        self.pi = pi
        self.step = step
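
A NumPy-only check (not from the original code) of the bin-to-action mapping that transform implements with nbins = 11: bin indices 0..10 land evenly on [-1, 1].

import numpy as np

nbins = 11
bins = np.arange(nbins, dtype=np.float32)
actions = (bins - (nbins - 1) / 2) / (nbins - 1) * 2.0
print(actions)   # [-1.  -0.8 -0.6 -0.4 -0.2  0.   0.2  0.4  0.6  0.8  1. ]
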
Example #5
    def __init__(self,
                 sess,
                 obe_space,
                 obn_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 nedges,
                 nnodes,
                 reuse=False,
                 name='model'):
        nbatch = nenv * nsteps
        obns_shape = (nbatch * nedges, obn_space * nstack)
        all_obns_shape = obns_shape
        obnr_shape = (nbatch * nedges, obn_space * nstack)
        all_obnr_shape = obnr_shape
        obe_shape = (nbatch * nedges, obe_space * nstack)
        all_obe_shape = obe_shape
        obn_shape = (nbatch * nnodes, obn_space * nstack)
        all_obn_shape = obn_shape
        nbnn = nbatch * nnodes if nnodes else None
        nbne = nbatch * nedges if nedges else None

        nact = ac_space.n
        all_ac_shape = (nbatch, nact * nstack)
        #all_ac_shape = (nbatch, (sum([ac.n for ac in ac_spaces]) - nact) * nstack)
        nfs = efs = 12
        X_ns = tf.placeholder(tf.float32, obns_shape)
        X_nr = tf.placeholder(tf.float32, obnr_shape)
        X_e = tf.placeholder(tf.float32, obe_shape)
        X_n = tf.placeholder(tf.float32, obn_shape)
        e2ns = tf.placeholder(tf.float32, (nbne, nbnn))
        e2nr = tf.placeholder(tf.float32, (nbne, nbnn))
        ns2e = tf.placeholder(tf.float32, (nbnn, nbne))
        b2e = tf.placeholder(tf.float32, (nbatch, nbne))
        b2n = tf.placeholder(tf.float32, (nbatch, nbnn))

        X_ns_v = tf.placeholder(tf.float32, all_obns_shape)
        X_nr_v = tf.placeholder(tf.float32, all_obnr_shape)
        X_e_v = tf.placeholder(tf.float32, all_obe_shape)
        X_n_v = tf.placeholder(tf.float32, all_obn_shape)
        e2ns_v = tf.placeholder(tf.float32, (nbne, nbnn))
        e2nr_v = tf.placeholder(tf.float32, (nbne, nbnn))
        ns2e_v = tf.placeholder(tf.float32, (nbnn, nbne))
        b2e_v = tf.placeholder(tf.float32, (nbatch, nbne))
        b2n_v = tf.placeholder(tf.float32, (nbatch, nbnn))

        A_v = tf.placeholder(tf.float32, all_ac_shape)

        with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
            x = [X_ns, X_nr, X_e, X_n, e2ns, e2nr, ns2e]
            g1 = graphlayer(x, 'gl1', 128, nfs, efs, init_scale=np.sqrt(2))
            g2 = graphlayer(g1, 'gl2', 128, nfs, efs, init_scale=np.sqrt(2))
            f_e, f_n = g2[2], g2[3]
            y = [f_e, f_n, b2e, b2n]
            pi = graphblock(y, 'pi', 128, nact, init_scale=np.sqrt(2))

        with tf.variable_scope('value_{}'.format(name), reuse=reuse):
            Y = [X_ns_v, X_nr_v, X_e_v, X_n_v, e2ns_v, e2nr_v, ns2e_v]
            g3 = graphlayer(Y, 'gl3', 128, nfs, efs, init_scale=np.sqrt(2))
            g4 = graphlayer(g3, 'gl4', 128, nfs, efs, init_scale=np.sqrt(2))
            f_e_v, f_n_v = g4[2], g4[3]
            Z = [f_e_v, f_n_v, b2e_v, b2n_v]
            vf = graphblock(Z, 'v', 128, 1, init_scale=np.sqrt(2))

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = []  # not stateful

        def step(ob, obs, a_v, *_args, **_kwargs):
            ob_ns, ob_nr, ob_e, ob_n, ob_e2ns, ob_e2nr, ob_ns2e, ob_b2e, ob_b2n = ob
            ob_ns_v, ob_nr_v, ob_e_v, ob_n_v, ob_e2ns_v, ob_e2nr_v, ob_ns2e_v, ob_b2e_v, ob_b2n_v = obs

            a, v = sess.run(
                [a0, v0], {
                    X_ns: ob_ns,
                    X_nr: ob_nr,
                    X_e: ob_e,
                    X_n: ob_n,
                    e2ns: ob_e2ns,
                    e2nr: ob_e2nr,
                    ns2e: ob_ns2e,
                    b2e: ob_b2e,
                    b2n: ob_b2n,
                    X_ns_v: ob_ns_v,
                    X_nr_v: ob_nr_v,
                    X_e_v: ob_e_v,
                    X_n_v: ob_n_v,
                    e2ns_v: ob_e2ns_v,
                    e2nr_v: ob_e2nr_v,
                    ns2e_v: ob_ns2e_v,
                    b2e_v: ob_b2e_v,
                    b2n_v: ob_b2n_v
                })
            return a, v, []  # dummy state

        def value(ob, a_v, *_args, **_kwargs):
            ob_ns_v, ob_nr_v, ob_e_v, ob_n_v, ob_e2ns_v, ob_e2nr_v, ob_ns2e_v, ob_b2e_v, ob_b2n_v = ob
            return sess.run(
                v0, {
                    X_ns_v: ob_ns_v,
                    X_nr_v: ob_nr_v,
                    X_e_v: ob_e_v,
                    X_n_v: ob_n_v,
                    e2ns_v: ob_e2ns_v,
                    e2nr_v: ob_e2nr_v,
                    ns2e_v: ob_ns2e_v,
                    b2e_v: ob_b2e_v,
                    b2n_v: ob_b2n_v
                })

        self.X = {
            "X_ns": X_ns,
            "X_nr": X_nr,
            "X_e": X_e,
            "X_n": X_n,
            "e2ns": e2ns,
            "e2nr": e2nr,
            "ns2e": ns2e,
            "b2e": b2e,
            "b2n": b2n
        }
        self.X_v = {
            "X_ns": X_ns_v,
            "X_nr": X_nr_v,
            "X_e": X_e_v,
            "X_n": X_n_v,
            "e2ns": e2ns_v,
            "e2nr": e2nr_v,
            "ns2e": ns2e_v,
            "b2e": b2e_v,
            "b2n": b2n_v
        }
        self.A_v = A_v
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
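
graphlayer and graphblock are not shown anywhere in these snippets. The sketch below is only a shape-consistent guess at graphblock (the readout): b2e and b2n act as pooling matrices that sum per-edge and per-node features back into one row per batch element, followed by a small MLP head built with the fc helper sketched earlier. Treat every signature and detail here as an assumption.

import tensorflow as tf

def graphblock(inputs, scope, nh, nout, init_scale=1.0):
    # hypothetical readout: [edge feats, node feats, batch-to-edge map, batch-to-node map]
    f_e, f_n, b2e, b2n = inputs
    with tf.variable_scope(scope):
        pooled_e = tf.matmul(b2e, f_e)   # (nbatch, edge feature dim)
        pooled_n = tf.matmul(b2n, f_n)   # (nbatch, node feature dim)
        h = fc(tf.concat([pooled_e, pooled_n], axis=1), 'fc',
               nh=nh, init_scale=init_scale)
        return fc(h, 'out', nout, act=lambda x: x, init_scale=init_scale)
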
Example #6
    def __init__(self,
                 sess,
                 oppo_policy,
                 ob_space,
                 ac_space,
                 op_ac_n,
                 ob_spaces,
                 ac_spaces,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False,
                 name='model'):
        # nstack always = 1
        self.oppo_policy = oppo_policy
        nbatch = nenv * nsteps
        ob_shape = (nbatch, ob_space.shape[0] * nstack)
        op_ac_shape = (nbatch, op_ac_n * nstack)
        all_ob_shape = (nbatch, sum([obs.shape[0]
                                     for obs in ob_spaces]) * nstack)
        nact = ac_space.n
        actions = tf.placeholder(tf.int32, (nbatch))
        all_ac_shape = (nbatch, (sum([ac.n
                                      for ac in ac_spaces]) - nact) * nstack)

        # oppo_a0 = [sample(_) for _ in self.oppo_policy.pi]
        oppo_a_list = self.oppo_policy.pi
        # (k, batch, act_nums) -> (batch, \sum_k(act_nums))
        oppo_a0 = oppo_a_list[0]
        for k in range(1, len(oppo_a_list)):
            oppo_a0 = tf.concat([oppo_a0, oppo_a_list[k]], axis=1)

        obs_x = tf.placeholder(tf.float32, ob_shape)  # obs, not state(all obs)
        op_act_x = oppo_a0  # tf.placeholder(tf.float32, op_ac_shape)  # opponents' act
        X = tf.concat([obs_x, op_act_x], axis=1)  # input
        X_v = tf.placeholder(tf.float32, all_ob_shape)
        A_v = tf.placeholder(tf.float32, all_ac_shape)
        # A_v = tf.concat([tf.expand_dims(actions, axis=1), op_act_x], axis=1)
        with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
            h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
            h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
            pi = fc(h2, 'pi', nact, act=lambda x: x)

        with tf.variable_scope('value_{}'.format(name), reuse=reuse):
            if len(ob_spaces) > 1:
                Y = tf.concat([X_v, A_v], axis=1)
            else:
                Y = X_v
            h3 = fc(Y, 'fc3', nh=256, init_scale=np.sqrt(2))
            h4 = fc(h3, 'fc4', nh=256, init_scale=np.sqrt(2))
            vf = fc(h4, 'v', 1, act=lambda x: x)

        self.log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=pi, labels=actions)
        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = []  # not stateful

        def step_log_prob(ob, acts):
            log_prob = sess.run(self.log_prob, {X: ob, actions: acts})
            return log_prob.reshape([-1, 1])

        def step(ob, obs, a_v, *_args, **_kwargs):
            oppo_a = sess.run(oppo_a0, {oppo_policy.obs_x: ob})

            if a_v is not None:
                a, v = sess.run([a0, v0], {
                    obs_x: ob,
                    op_act_x: oppo_a,
                    X_v: obs,
                    A_v: a_v
                })
            else:
                a, v = sess.run([a0, v0], {
                    obs_x: ob,
                    op_act_x: oppo_a,
                    X_v: obs
                })
            return a, v, []  # dummy state

        def value(ob, obs, a_v, *_args, **_kwargs):
            oppo_a = sess.run(oppo_a0, {oppo_policy.obs_x: ob})

            if a_v is not None:
                return sess.run(v0, {X_v: obs, A_v: a_v, op_act_x: oppo_a})
            else:
                return sess.run(v0, {X_v: obs, op_act_x: oppo_a})

        self.obs_x = obs_x
        self.op_act_x = op_act_x
        self.X = obs_x
        self.X_v = X_v
        self.A_v = A_v
        self.pi = pi
        self.vf = vf
        self.step_log_prob = step_log_prob
        self.step = step
        self.value = value
        self.oppo_a = oppo_a0
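
Example #6 relies on a TF1 detail: op_act_x is not a placeholder but the concatenated opponent-logit tensor, and step still overrides it through feed_dict. In graph-mode TF1 any feedable tensor can be fed, which short-circuits the subgraph that would otherwise compute it. A standalone illustration of just that mechanism (not from the original code):

import tensorflow as tf

a = tf.placeholder(tf.float32, [None, 2])
h = a * 2.0        # intermediate tensor, not a placeholder
y = h + 1.0
with tf.Session() as sess:
    # feed h directly; a never needs a value
    print(sess.run(y, feed_dict={h: [[5.0, 6.0]]}))   # [[11. 13.]]
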
Example #7
    def __init__(self,
                 sess,
                 agent_id,
                 ob_space,
                 ac_space,
                 ob_spaces,
                 ac_spaces,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False,
                 name='model'):
        self.agent_id = agent_id
        nbatch = nenv * nsteps
        ob_shape = (nbatch, ob_space.shape[0] * nstack)
        all_ob_shape = (nbatch, sum([obs.shape[0]
                                     for obs in ob_spaces]) * nstack)
        nact = ac_space.n
        actions = [
            tf.placeholder(tf.int32, (nbatch))
            for _ in range(len(ob_spaces) - 1)
        ]
        all_ac_shape = (nbatch, (sum([ac.n
                                      for ac in ac_spaces]) - nact) * nstack)
        obs_x = tf.placeholder(tf.float32, ob_shape)  # obs
        X = obs_x
        X_v = tf.placeholder(tf.float32, all_ob_shape)
        A_v = tf.placeholder(tf.float32, all_ac_shape)
        with tf.variable_scope('oppo_{}'.format(name), reuse=reuse):
            h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
            h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
            pi = []
            for k in range(len(ob_spaces)):
                if k == agent_id:
                    continue
                pi.append(fc(h2, 'pi_%d' % k, ac_spaces[k].n, act=lambda x: x))
        self.log_prob = [
            -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi[i],
                                                            labels=actions[i])
            for i in range(len(pi))
        ]
        a0 = [sample(_) for _ in pi]
        self.initial_state = []  # not stateful

        def step_log_prob(ob, acts_n):
            acts = [
                acts_n[i] for i in range(len(acts_n)) if i != self.agent_id
            ]
            feed_dict = {X: ob}
            feed_dict.update(zip(actions, acts))
            log_prob = sess.run(self.log_prob, feed_dict)
            # sess.run on a list of tensors returns a list of (nbatch,) arrays, one per opponent
            return np.asarray(log_prob).reshape([-1, 1])

        def step(ob, obs, a_v, *_args, **_kwargs):
            a = sess.run(a0, {X: ob, X_v: obs})
            return a

        self.obs_x = obs_x
        self.X = X
        self.X_v = X_v
        self.A_v = A_v
        self.pi = pi
        self.step_log_prob = step_log_prob
        self.step = step