Example #1
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            X, processed_x = observation_input(ob_space, nbatch)
            activ = tf.tanh
            processed_x = tf.layers.flatten(processed_x)
            pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=4, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=4, init_scale=np.sqrt(2)))

            vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=4, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=4, init_scale=np.sqrt(2)))

            vf = fc(vf_h2, 'vf', 1)[:,0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

        a0 = self.pd.sample()

        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
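
A minimal sketch of how a feed-forward policy like this is driven during rollout collection; policy, env and nsteps are assumptions for illustration, not part of the snippet:

    # hypothetical usage sketch, assuming `policy` is an instance of the class
    # above and `env` is a vectorized Gym environment with nbatch parallel copies
    obs = env.reset()
    for _ in range(nsteps):
        # step() returns (action, value estimate, recurrent state, -log p(a));
        # the recurrent state is None for this feed-forward policy
        actions, values, _, neglogpacs = policy.step(obs)
        obs, rewards, dones, infos = env.step(actions)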
Example #2
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            pi = fc(h, 'pi', nact, init_scale=0.01)
            vf = fc(h, 'v', 1)[:, 0]

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
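
For the discrete action space above, self.pd.sample() draws from the softmax over the pi logits; baselines implements this with the Gumbel-max trick, which a small NumPy sketch reproduces:

    import numpy as np

    logits = np.array([2.0, 0.5, -1.0])              # one row of pi
    u = np.random.uniform(size=logits.shape)
    action = np.argmax(logits - np.log(-np.log(u)))  # Gumbel-max sample
    print(action)                                    # 0 most of the time, since logits[0] is largest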
Example #3
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        #obz = ob

        #with tf.variable_scope("obfilter"):
        #    self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        #obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = ob
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out,
                             1,
                             "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = ob
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out,
                           pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer)
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
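
The mean * 0.0 + logstd idiom above tiles the shared [1, ac_dim] log-std row to the batch shape of mean, so the two halves can be concatenated into one flat parameter vector; a small NumPy sketch of the same broadcast:

    import numpy as np

    mean = np.zeros((5, 3))     # batch of 5 action means, ac_dim = 3
    logstd = np.zeros((1, 3))   # one log-std row shared across the batch
    pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)
    print(pdparam.shape)        # (5, 6): [mean | logstd] per row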
Example #4
    def build_policy_network(self, sy_ob_no):

        net = sy_ob_no
        net = tf.layers.dense(
            net,
            64,
            activation=tf.nn.tanh,
            kernel_initializer=tf.truncated_normal_initializer(stddev=1.0))
        net = tf.layers.dense(
            net,
            64,
            activation=tf.nn.tanh,
            kernel_initializer=tf.truncated_normal_initializer(stddev=1.0))
        sy_mean_na = tf.layers.dense(
            net,
            self.ac_dim,
            activation=None,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))

        sy_logstd = tf.Variable(
            tf.zeros([self.ac_dim]), name='action/logstd', dtype=tf.float32
        )  # logstd should just be a trainable variable, not a network output.
        # construct distribution
        pdparam = tf.concat([sy_mean_na, sy_mean_na * 0.0 + sy_logstd], axis=1)
        pdtype = make_pdtype(self.ac_dim)
        pd = pdtype.pdfromflat(pdparam)

        return pd
Example #5
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
        X, processed_x = observation_input(ob_space, nbatch)
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(processed_x)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
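
Unlike the feed-forward policies, this recurrent policy must be fed its previous LSTM state and the episode-done mask on every call; a sketch of the calling pattern, with env, policy, nenv and nsteps assumed for illustration:

    import numpy as np

    obs = env.reset()
    state = policy.initial_state              # zeros of shape (nenv, 2 * nlstm)
    dones = np.zeros(nenv, dtype=np.float32)  # a 1.0 entry resets that env's LSTM state
    for _ in range(nsteps):
        actions, values, state, neglogpacs = policy.step(obs, state, dones)
        obs, rewards, dones, infos = env.step(actions)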
Example #6
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            X = tf.placeholder(shape=(nbatch,) + ob_space.shape, dtype=tf.float32)
            activ = tf.tanh
            processed_x = tf.layers.flatten(X)
            pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:,0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'model')

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        def neg_log_prob(actions):
            return self.pd.neglogp(actions)

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
        self.neg_log_prob = neg_log_prob
        self.entropy = self.pd.entropy()
Example #7
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, deterministic=False): #pylint: disable=W0613
        
        # Assign action as Gaussian Distribution
        self.pdtype = make_pdtype(ac_space)
        #print("action_space: {}".format(ac_space))
        with tf.variable_scope("model", reuse=reuse):
            phero_values = tf.placeholder(shape=(None, 8), dtype=tf.float32, name="phero_values")
            #velocities = tf.placeholder(shape=(None, 2), dtype=tf.float32, name="velocities")

            # Actor neural net
            pi_net = self.net(phero_values)
            # Critic neural net
            vf_h2 = self.net(phero_values)
            vf = fc(vf_h2, 'vf', 1)[:,0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_net, init_scale=0.01)

        if deterministic:
            a0 = self.pd.mode()
        else:
            a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None
        self.phero = phero_values
        #self.velocities = velocities

        self.vf = vf

        def step(ob, *_args, **_kwargs):
            '''
            Run one batch of observations through the policy network and
            return the action, value estimate and negative log-probability.
            '''
            phero = ob
            a, v, neglogp = sess.run([a0, vf, neglogp0], {self.phero: phero})
            # Optional action clipping to keep actions within (-1, 1) for more
            # stable training; disabled here so the policy learns the bounds itself.
            # for i in range(a.shape[1]):
            #     a[0][i] = min(1.0, max(-1.0, a[0][i]))
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            phero = ob
            return sess.run(vf, {self.phero: phero})

        self.step = step
        self.value = value
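
The disabled clipping loop in step above is equivalent to a single vectorised np.clip:

    import numpy as np

    a = np.array([[1.7, -0.3, -2.2]])
    print(np.clip(a, -1.0, 1.0))  # [[ 1.  -0.3 -1. ]]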
Example #8
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 scope="policy"):
        if layernormalize:
            print(
                "Warning: policy is operating on top of layer-normed features. It might slow down the training."
            )
        self.layernormalize = layernormalize
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std
        self.ob_space = ob_space
        self.ac_space = ac_space
        self.ac_pdtype = make_pdtype(ac_space)

        self.pd = self.vpred = None
        self.hidsize = hidsize
        self.feat_dim = feat_dim
        self.scope = scope
        pdparamsize = self.ac_pdtype.param_shape()[0]

        self.features_model = small_convnet(self.ob_space,
                                            nl=self.nl,
                                            feat_dim=self.feat_dim,
                                            last_nl=None,
                                            layernormalize=self.layernormalize)

        self.pd_hidden = torch.nn.Sequential(
            torch.nn.Linear(feat_dim, hidsize),
            torch.nn.ReLU(),
            torch.nn.Linear(hidsize, hidsize),
            torch.nn.ReLU(),
        )
        self.pd_head = torch.nn.Linear(hidsize, pdparamsize)
        self.vf_head = torch.nn.Linear(hidsize, 1)

        self.param_list = [
            dict(params=self.features_model.parameters()),
            dict(params=self.pd_hidden.parameters()),
            dict(params=self.pd_head.parameters()),
            dict(params=self.vf_head.parameters())
        ]

        self.flat_features = None
        self.pd = None
        self.vpred = None
        self.ac = None
        self.ob = None
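
This is the only PyTorch variant in the listing; the heads are plain modules applied to convnet features. A self-contained sketch of the forward pass the attributes imply, with the sizes and the random feats stand-in as assumptions:

    import torch

    feat_dim, hidsize, pdparamsize, batch = 512, 256, 4, 8
    pd_hidden = torch.nn.Sequential(
        torch.nn.Linear(feat_dim, hidsize), torch.nn.ReLU(),
        torch.nn.Linear(hidsize, hidsize), torch.nn.ReLU())
    pd_head = torch.nn.Linear(hidsize, pdparamsize)
    vf_head = torch.nn.Linear(hidsize, 1)

    feats = torch.randn(batch, feat_dim)  # stand-in for small_convnet features
    hidden = pd_hidden(feats)
    pdparam = pd_head(hidden)             # distribution parameters
    vpred = vf_head(hidden)[:, 0]         # value estimate per observation
    print(pdparam.shape, vpred.shape)     # torch.Size([8, 4]) torch.Size([8])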
Example #9
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False,
                 deterministic=False):  #pylint: disable=W0613

        # Assign action as Gaussian Distribution
        self.pdtype = make_pdtype(ac_space)
        self.num_obs = 8
        #print("action_space: {}".format(ac_space))
        with tf.variable_scope("model", reuse=reuse):
            phero_values = tf.placeholder(shape=(None, self.num_obs),
                                          dtype=tf.float32,
                                          name="phero_values")
            #velocities = tf.placeholder(shape=(None, 2), dtype=tf.float32, name="velocities")

            # Actor neural net
            pi_net = self.net(phero_values)
            # Critic neural net
            vf_h2 = self.net(phero_values)
            vf = fc(vf_h2, 'vf', 1)[:, 0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_net,
                                                        init_scale=0.01)

        if deterministic:
            a0 = self.pd.mode()
        else:
            a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None
        self.phero = phero_values
        #self.velocities = velocities

        self.vf = vf

        def step(ob, *_args, **_kwargs):
            '''
            Run one batch of observations through the policy network and
            return the action, value estimate and negative log-probability.
            '''
            phero = list(ob)
            a, v, neglogp = sess.run([a0, vf, neglogp0], {self.phero: phero})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            phero = list(ob)
            return sess.run(vf, {self.phero: phero})

        self.step = step
        self.value = value
Example #10
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False):
        nenv = nbatch // nsteps

        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact)
            vf = fc(h5, 'v', 1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #11
    def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None, sess=None, **tensors):
        """
        Parameters:
        ----------
        env             RL environment

        observations    tensorflow placeholder in which the observations will be fed

        latent          latent state from which policy distribution parameters should be inferred

        vf_latent       latent state from which value function should be inferred (if None, then latent is used)

        sess            tensorflow session to run calculations in (if None, default session is used)

        **tensors       tensorflow tensors for additional attributes such as state or mask

        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        self.__dict__.update(tensors)

        vf_latent = vf_latent if vf_latent is not None else latent

        vf_latent = tf.layers.flatten(vf_latent)
        latent = tf.layers.flatten(latent)

        # Based on the action space, select the appropriate probability distribution type
        self.pdtype = make_pdtype(env.action_space)

        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

        # Take an action
        self.action = self.pd.sample()

        # Calculate the negative log-probability of the sampled action
        self.neglogp = self.pd.neglogp(self.action)
        self.sess = sess

        if estimate_q:
            assert isinstance(env.action_space, gym.spaces.Discrete)
            self.q = fc(vf_latent, 'q', env.action_space.n)
            self.vf = self.q
        else:
            self.vf = fc(vf_latent, 'vf', 1)
            self.vf = self.vf[:,0]
Example #12
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, deterministic=False): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            laser = tf.placeholder(shape=(None, 512, 3), dtype=tf.float32, name="laser")
            rel_goal = tf.placeholder(shape=(None, 2), dtype=tf.float32, name="rel_goal")
            velocities = tf.placeholder(shape=(None, 2), dtype=tf.float32, name="velocities")

            pi_net = self.net(laser, rel_goal, velocities)
            vf_h2 = self.net(laser, rel_goal, velocities)
            vf = fc(vf_h2, 'vf', 1)[:,0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_net, init_scale=0.01)

        if deterministic:
            a0 = self.pd.mode()
        else:
            a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        self.laser = laser
        self.rel_goal = rel_goal
        self.velocities = velocities

        self.vf = vf

        def step(ob, *_args, **_kwargs):
            lb = [o["laser"] for o in ob]
            rb = [o["rel_goal"] for o in ob]
            vb = [o["velocities"] for o in ob]

            a, v, neglogp = sess.run([a0, vf, neglogp0],
                                     {self.laser: lb, self.rel_goal: rb, self.velocities: vb})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            lb = [o["laser"] for o in ob]
            rb = [o["rel_goal"] for o in ob]
            vb = [o["velocities"] for o in ob]
            return sess.run(vf, {self.laser: lb, self.rel_goal: rb, self.velocities: vb})

        self.step = step
        self.value = value
Example #13
    def _init(self, ob_space, ac_space):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        sy_ob = U.get_placeholder(name="sy_ob",
                                  dtype=tf.float32,
                                  shape=[sequence_length] +
                                  list(ob_space.shape))

        obscaled = sy_ob / 255.0

        with tf.variable_scope("pol"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            logits = U.dense(x,
                             pdtype.param_shape()[0], "logits",
                             U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)
        with tf.variable_scope("vf"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
            self.vpredz = self.vpred

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        sy_ac = self.pd.sample()  # XXX: 'stochastic' is accepted but ignored; the action is always sampled
        self._act = U.function([stochastic, sy_ob], [sy_ac, self.vpred])
Example #14
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(processed_x, **conv_kwargs)
            vf = fc(h, 'v', 1)[:,0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #15
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 scope="policy"):
        if layernormalize:
            print(
                "Warning: policy is operating on top of layer-normed features. It might slow down the training."
            )
        self.layernormalize = layernormalize
        self.bool_actionclip = True  #TODO Need to make this flexible
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std
        #self.ac_range = ac_range
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(
                ac_space
            )  # RS: for a continuous-action env this should yield a continuous pd type
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(None, None) + ob_space.shape,
                                        name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder([None, None],
                                                           name='ac')
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            pdparamsize = self.ac_pdtype.param_shape()[0]

            sh = tf.shape(self.ph_ob)
            x = flatten_two_dims(self.ph_ob)
            self.flat_features = self.get_features(x, reuse=False)
            self.features = unflatten_first_dim(self.flat_features, sh)

            with tf.variable_scope(scope, reuse=False):
                x = fc(self.flat_features, units=hidsize, activation=activ)
                x = fc(x, units=hidsize, activation=activ)
                pdparam = fc(x,
                             name='pd',
                             units=pdparamsize,
                             activation=tf.nn.tanh)
                vpred = fc(x,
                           name='value_function_output',
                           units=1,
                           activation=None)
            pdparam = unflatten_first_dim(pdparam, sh)
            self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
            self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
            self.a_samp = pd.sample()
            self.a_samp = self.clip_action(
                self.a_samp) if self.bool_actionclip else self.a_samp
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)
            self.pd_logstd = pd.logstd
            self.pd_std = pd.std
            self.pd_mean = pd.mean
Example #16
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 reuse=False,
                 training=True):  # pylint: disable=W0613
        ob_shape = (nbatch, ) + ob_space.shape  # prepend the nbatch batch dimension
        actdim = ac_space.shape[0]
        # X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # obs
        X = tf.placeholder(tf.float32, [None, ob_space.shape[0]], name='Ob')
        with tf.variable_scope("model", reuse=reuse):
            # activ = tf.tanh
            bn = tf.layers.batch_normalization
            activ = lkrelu
            # h1 = activ(bn(fc(X, 'pi_fc1', nh=512, init_scale=np.sqrt(2)), training=training))
            # h2 = activ(bn(fc(h1, 'pi_fc2', nh=512, init_scale=np.sqrt(2)), training=training))
            # h3 = activ(bn(fc(h2, 'pi_fc3', nh=256, init_scale=np.sqrt(2)), training=training))
            h1 = activ(fc(X, 'pi_fc1', nh=100, init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, 'pi_fc2', nh=100, init_scale=np.sqrt(2)))
            # pi0 = tf.nn.tanh(fc(h2, 'pi0', 1, init_scale=0.01))*3  # (-3, 3)
            # pi1 = tf.nn.sigmoid(fc(h2, 'pi1', 1, init_scale=0.01))*10  # (0, 10)
            # pi = tf.concat([pi0, pi1], axis=1, name='pi')
            pi = tf.nn.tanh(fc(h2, 'pi', nh=actdim)) * 10
            # h1 = activ(bn(fc(X, 'vf_fc1', nh=512, init_scale=np.sqrt(2)), training=training))
            # h2 = activ(bn(fc(h1, 'vf_fc2', nh=512, init_scale=np.sqrt(2)), training=training))
            # h3 = activ(bn(fc(h2, 'vf_fc3', nh=256, init_scale=np.sqrt(2)), training=training))
            h1 = activ(fc(X, 'vf_fc1', nh=100, init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, 'vf_fc2', nh=100, init_scale=np.sqrt(2)))
            vf = fc(h2, 'vf', 1)[:, 0]
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, actdim],
                                     initializer=tf.zeros_initializer())
            # logstd = tf.layers.dense(inputs=h2, activation=None, units=actdim, name='logstd')

        pdparam = tf.concat([pi, pi * 0.0 + logstd],
                            axis=1)  # pi * 0.0 + logstd broadcasts logstd to the shape of pi
        self.pdtype = make_pdtype(
            ac_space)  # probability distribution type matching the action space
        # pdfromflat returns a DiagGaussianPd for this continuous action space
        self.pd = self.pdtype.pdfromflat(pdparam)
        a0 = self.pd.sample()
        self.action = tf.identity(
            a0, name='action')  # use this tensor as the action at inference time
        # optional action clipping (disabled):
        # a1 = tf.clip_by_value(a0[:, 0:1], -3, 3)
        # a2 = tf.clip_by_value(a0[:, 1:2], 0, 10)
        # a0 = tf.concat([a1, a2], axis=1)
        neglogp0 = self.pd.neglogp(a0)

        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            # {X: ob} feeds the observation into the placeholder
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
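
tf.nn.tanh(...) * 10 squashes the raw policy head into (-10, 10) before it is used as the Gaussian mean; the bound is easy to verify in NumPy:

    import numpy as np

    raw = np.array([-100.0, -1.0, 0.0, 1.0, 100.0])
    print(np.tanh(raw) * 10)  # values are bounded in (-10, 10)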
Example #17
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False):
        nenv = nbatch // nsteps
        # nh, nw, nc = ob_space.shape  # (nh, nw, nc) = (height, width, channels)
        ob_shape = (nbatch, ob_space.shape[0])
        # nact = ac_space.n
        # X = tf.placeholder(tf.uint8, ob_shape)  # obs
        actdim = ac_space.shape[0]
        X = tf.placeholder(tf.float32, ob_shape, name='phOb')
        M = tf.placeholder(tf.float32, [nbatch],
                           name='phMaskDone')  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2],
                           name='phCellState')  # states and output: (c, h)
        with tf.variable_scope("model", reuse=reuse):
            # h = nature_cnn(X)
            # h = tf.add(X, 0, name='h')  # a plain passthrough is not expressive enough
            h = mlp(X)
            xs = batch_to_seq(
                h, nenv,
                nsteps)  # a list of nsteps tensors, each with shape [nenv, -1]
            ms = batch_to_seq(
                M, nenv,
                nsteps)  # a list of nsteps tensors, each with shape [nenv, 1]
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            # pi = fc(h5, 'fc_pi', actdim)
            pi0 = tf.nn.tanh(h5[:, :1]) * 3
            pi1 = tf.nn.sigmoid(h5[:, 1:2]) * 10
            pi = tf.concat([pi0, pi1], axis=1, name='pi')

            vf = fc(h5, 'v', 1)
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, actdim],
                                     initializer=tf.zeros_initializer())
        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        action = tf.add(
            a0, 0, name='action')  # use this tensor as the action at inference time
        newState = tf.add(snew, 0, name='newCellState')
        print('self.pd.shape', self.pd.shape, a0.shape)
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #18
    def __init__(self, sess, ob_space, loc_space, ac_space, nbatch, nsteps, max_timesteps, reuse=False, seed=0):
        nenv = nbatch // nsteps
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            G = tf.placeholder(tf.float32, [nbatch, max_timesteps, loc_space])
            X = tf.placeholder(tf.float32, (nbatch, )+ob_space.shape)
            Y = tf.placeholder(tf.float32, [nbatch, loc_space])
            M = tf.placeholder(tf.float32, [nbatch])
            S = tf.placeholder(tf.float32, [nenv, 128])
            ys = batch_to_seq(Y, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)

            tf.set_random_seed(seed)
            self.embed_W = tf.get_variable("embed_w", [loc_space, 64], initializer=ortho_init(1.0, seed))
            self.embed_b = tf.get_variable("embed_b", [64,])
            self.wa = tf.get_variable("wa", [128, 128], initializer=ortho_init(1.0, seed))
            self.wb = tf.get_variable("wb", [128,])
            self.ua = tf.get_variable("ua", [128, 128], initializer=ortho_init(1.0, seed))
            self.ub = tf.get_variable("ub", [128,])
            self.va = tf.get_variable("va", [128])
            self.rnn = tf.nn.rnn_cell.GRUCell(128, kernel_initializer=ortho_init(1.0, seed))
            enc_hidden = tf.zeros((nbatch, 128))
            embed_G = tf.matmul(tf.reshape(G, (-1, loc_space)),self.embed_W)+self.embed_b
            embed_G = tf.reshape(embed_G, (nbatch, max_timesteps, -1))
            enc_output, _ = tf.nn.dynamic_rnn(cell=self.rnn, inputs=embed_G, dtype=tf.float32)
            gs = batch_to_seq(enc_output, nenv, nsteps)
            dec_hidden = S
            h = []
            for idx, (y, m, g) in enumerate(zip(ys, ms, gs)):
                dec_hidden = dec_hidden*(1-m)
                embed_y = tf.matmul(y,self.embed_W)+self.embed_b
                dec_output, dec_hidden = tf.nn.dynamic_rnn(cell=self.rnn, inputs=tf.expand_dims(embed_y,axis=1), initial_state=dec_hidden)

                tmp = tf.reshape(tf.matmul(tf.reshape(g, (-1, 128)), self.ua)+self.ub,(nenv, max_timesteps, 128))
                tmp = tf.tanh(tf.expand_dims(tf.matmul(dec_hidden, self.wa)+self.wb,axis=1) + tmp)
                score = tf.reduce_sum(tmp*tf.expand_dims(tf.expand_dims(self.va, axis=0), axis=1), axis=2, keepdims=True)
                attention_weights = tf.nn.softmax(score, axis=1)
                context_vector = attention_weights * g
                context_vector = tf.reduce_sum(context_vector, axis=1)
                x = tf.concat([context_vector, dec_hidden], axis=-1)
                h.append(x)
            h = seq_to_batch(h)
            vf = fc(h, 'v', 1, seed=seed)[:,0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, seed=seed, init_scale=0.01)
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv,128))

        def step(ob, loc, goal, state, mask):
            a, v, state, neglogp = sess.run([a0, vf, dec_hidden, neglogp0], {X:ob, Y:loc, G:goal, M:mask, S:state})
            return a, v, state, neglogp

        def value(ob, loc, goal, state, mask):
            return sess.run(vf, {X:ob, Y:loc, G:goal, M:mask, S:state})

        self.G = G
        self.X = X
        self.Y = Y
        self.S = S
        self.M = M
        self.vf = vf
        self.step = step
        self.value = value
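
Every example above funnels through make_pdtype, which picks the distribution family from the Gym action space. A simplified sketch of the dispatch, following baselines' common/distributions.py (a paraphrase, not the verbatim source; the PdType classes come from that same module):

    from gym import spaces

    def make_pdtype(ac_space):
        if isinstance(ac_space, spaces.Box):            # continuous actions
            return DiagGaussianPdType(ac_space.shape[0])
        elif isinstance(ac_space, spaces.Discrete):     # one discrete action
            return CategoricalPdType(ac_space.n)
        elif isinstance(ac_space, spaces.MultiDiscrete):
            return MultiCategoricalPdType(ac_space.nvec)
        elif isinstance(ac_space, spaces.MultiBinary):
            return BernoulliPdType(ac_space.n)
        else:
            raise NotImplementedError(type(ac_space))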