Example #1

# These examples follow the OpenAI-baselines policy style (TensorFlow 1.x
# graph mode); the `fc` layer helper and `CategoricalPd` distribution are
# assumed to come from baselines.
import tensorflow as tf
from baselines.a2c.utils import fc
from baselines.common.distributions import CategoricalPd
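
If baselines is not installed, `fc` can be replaced by a small stand-in like the sketch below; the orthogonal initializer mirrors baselines' default and is an assumption, not part of the original code:

def fc(x, scope, nh, init_scale=1.0, init_bias=0.0):
    # Fully connected layer y = xW + b, one variable scope per layer.
    with tf.variable_scope(scope):
        nin = x.get_shape()[1].value
        w = tf.get_variable('w', [nin, nh],
                            initializer=tf.orthogonal_initializer(init_scale))
        b = tf.get_variable('b', [nh],
                            initializer=tf.constant_initializer(init_bias))
        return tf.matmul(x, w) + b
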
class MLPPolicy(object):
    def __init__(self, sess, ob_space, ac_space, nenvs, nsteps, units_per_hlayer, reuse=False, activ_fcn='relu6'):  # pylint: disable=W0613
        # this method is called with nbatch = nenvs*nsteps

        # Input and Output dimensions
        nd, = ob_space.shape
        nbatch = nenvs * nsteps
        ob_shape = (nbatch, nd)
        nact = ac_space.n
        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # obs
        with tf.variable_scope("model", reuse=reuse):
            # Select activations; 'mixed' uses relu6 in the shared trunk and
            # tanh in the policy head.
            if activ_fcn == 'relu6':
                activ, activ_out = tf.nn.relu6, tf.nn.relu6
            elif activ_fcn == 'elu':
                activ, activ_out = tf.nn.elu, tf.nn.elu
            elif activ_fcn == 'mixed':
                activ, activ_out = tf.nn.relu6, tf.nn.tanh
            else:
                raise ValueError('Unknown activation function: %s' % activ_fcn)

            # Shared trunk used by both the policy and the value head.
            h1 = activ(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))
            h2 = activ(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))
            # Policy-only layer on top of the shared trunk.
            h3 = activ_out(fc(h2, 'pi_fc1', nh=units_per_hlayer[2]))

            pi_logit = fc(h3, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logit)

            # Value head branches off the shared trunk h2 rather than the policy layer h3.
            vf = fc(h2, 'vf', 1)[:, 0]  # predicted state value

        self.pd = CategoricalPd(pi_logit)  # categorical distribution over the discrete actions
        a0 = self.pd.sample()  # sampled action index
        # a0 = tf.argmax(pi, axis=1)  # greedy alternative
        neglogp0 = self.pd.neglogp(a0)  # negative log-probability of the sampled action

        self.initial_state = None  # feed-forward policy keeps no recurrent state

        def step(ob, *_args, **_kwargs):
            a, pi, v, neglogp = sess.run([a0, pi_logit, vf, neglogp0], {X: ob})
            return a, pi, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.pi_logit = pi_logit
        self.vf = vf
        self.ac = a0
        self.step = step
        self.value = value
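
A minimal usage sketch for MLPPolicy, assuming gym's CartPole-v1 (any environment with a 1-D observation space and discrete actions works), the baselines helpers imported above, and illustrative layer widths that are not taken from the original code:

import gym

env = gym.make('CartPole-v1')
nenvs, nsteps = 1, 1  # one environment, one step per batch

with tf.Session() as sess:
    policy = MLPPolicy(sess, env.observation_space, env.action_space,
                       nenvs=nenvs, nsteps=nsteps,
                       units_per_hlayer=[64, 64, 64])  # illustrative widths
    sess.run(tf.global_variables_initializer())

    ob = env.reset()  # classic gym API: reset() returns only the observation
    # step() expects a batch of nenvs * nsteps observations, so expand to (1, nd)
    a, logits, v, _, neglogp = policy.step(ob[None, :])

Because the placeholder is built with a fixed batch of nenvs * nsteps, one policy instance cannot be fed a different batch size; baselines-style code typically builds separate step and train models with reuse=True.
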
Example #2

# Same assumed baselines helpers as in Example #1 (TensorFlow 1.x graph mode).
import numpy as np
import tensorflow as tf
from baselines.a2c.utils import fc
from baselines.common.distributions import CategoricalPd
class LSTMPolicy(object):
    def __init__(self, sess, ob_space, ac_space, nenvs, nsteps, units_per_hlayer, reuse=False, activ_fcn='relu6'):  # pylint: disable=W0613
        # this method is called with nbatch = nenvs*nsteps

        # Input and Output dimensions
        nd, = ob_space.shape
        nbatch = nenvs * nsteps
        ob_shape = (nbatch, nd)
        nact = ac_space.n
        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # obs
        with tf.variable_scope("model", reuse=reuse):
            # Select activations; 'mixed' uses relu6 in the shared trunk and
            # tanh in the layer after the recurrent cell.
            if activ_fcn == 'relu6':
                activ, activ_out = tf.nn.relu6, tf.nn.relu6
            elif activ_fcn == 'elu':
                activ, activ_out = tf.nn.elu, tf.nn.elu
            elif activ_fcn == 'mixed':
                activ, activ_out = tf.nn.relu6, tf.nn.tanh
            else:
                raise ValueError('Unknown activation function: %s' % activ_fcn)

            # Shared trunk: two fully connected layers before the recurrent cell.
            h1 = activ(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))
            h2 = activ(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))

            # The [nenvs * nsteps, h_units] output of layer 2 is reshaped to
            # [nenvs, nsteps, h_units] so the LSTM can process each env's trace.
            rnn_cell = tf.contrib.rnn.BasicLSTMCell(num_units=units_per_hlayer[1], state_is_tuple=True)
            rnn_input = tf.reshape(h2, shape=[nenvs, nsteps, units_per_hlayer[1]])
            rnn_state_in = rnn_cell.zero_state(batch_size=nenvs, dtype=tf.float32)  # feedable initial state; defaults to zeros
            rnn_output, rnn_state_out = tf.nn.dynamic_rnn(inputs=rnn_input,
                                                          cell=rnn_cell,
                                                          initial_state=rnn_state_in,
                                                          dtype=tf.float32,
                                                          scope='model_rnn')
            # The output of the recurrent cell then needs to be reshaped to the original matrix shape.
            rnn_output = tf.reshape(rnn_output, shape=[-1, units_per_hlayer[1]])

            # Policy-only layer on top of the recurrent output.
            h3 = activ_out(fc(rnn_output, 'pi_fc1', nh=units_per_hlayer[2]))
            pi_logit = fc(h3, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logit)

            # Value head branches off the recurrent output rather than the policy layer h3.
            vf = fc(rnn_output, 'vf', 1)[:, 0]  # predicted state value

        self.pd = CategoricalPd(pi_logit)  # categorical distribution over the discrete actions
        a0 = self.pd.sample()  # sampled action index
        # a0 = tf.argmax(pi_logit, axis=1)  # greedy alternative
        neglogp0 = self.pd.neglogp(a0)  # negative log-probability of the sampled action

        # The LSTM state is a pair: the cell state c and the hidden state h_{t-1}.
        self.initial_state = (np.zeros([nenvs, units_per_hlayer[1]]), np.zeros([nenvs, units_per_hlayer[1]]))

        def step(ob, r_state, *_args, **_kwargs):
            a, pi, v, r_state_out, neglogp = sess.run([a0, pi_logit, vf, rnn_state_out, neglogp0], {X: ob, rnn_state_in: r_state})
            return a, pi, v, r_state_out, neglogp

        def value(ob, r_state, *_args, **_kwargs):
            return sess.run(vf, {X: ob, rnn_state_in: r_state})

        self.X = X
        self.pi = pi
        self.pi_logit = pi_logit
        self.vf = vf
        self.ac = a0
        self.rnn_state_in = rnn_state_in
        self.rnn_state_out = rnn_state_out
        self.step = step
        self.value = value
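
A minimal sketch of threading the recurrent state through successive calls, under the same assumptions as the sketch after Example #1 (gym's CartPole-v1, the baselines helpers, illustrative layer widths):

import gym

env = gym.make('CartPole-v1')
nenvs, nsteps = 1, 1

with tf.Session() as sess:
    policy = LSTMPolicy(sess, env.observation_space, env.action_space,
                        nenvs=nenvs, nsteps=nsteps,
                        units_per_hlayer=[64, 64, 64])  # illustrative widths
    sess.run(tf.global_variables_initializer())

    ob = env.reset()
    r_state = policy.initial_state  # (c, h) pair of zeros
    for _ in range(5):
        # each call consumes and returns the LSTM state, so recurrent
        # memory carries over between environment steps
        a, logits, v, r_state, neglogp = policy.step(ob[None, :], r_state)
        ob, rew, done, _ = env.step(a[0])
        if done:
            ob = env.reset()
            r_state = policy.initial_state  # reset memory at the episode boundary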