Python seq_to_batch примеры, baselines.a2c.utils.seq_to_batch Python примеры использования

Пример #1

0

Показать файл

    def network_fn(X, nenv=1):
        """Build a CNN -> dense -> LSTM network on top of ``X``.

        ``conv_kwargs``, ``nh``, ``nlstm`` and ``layer_norm`` are read from the
        enclosing scope, which is not visible in this snippet.

        :param X: input tensor; its first dimension is used as the batch size
        :param nenv: number of environments the batch is divided across
        :return: tuple of (``nature_cnn`` output, LSTM output as a batch, dict
            holding the state placeholder ``'S'``, the mask placeholder ``'M'``,
            the new LSTM state ``'state'`` and a zero ``'initial_state'`` array)
        """
        batch_size = X.shape[0]
        steps_per_env = batch_size // nenv

        fm = nature_cnn(X, **conv_kwargs)
        dense = fc(conv_to_fc(fm), 'fc1', nh=nh, init_scale=np.sqrt(2))
        h = tf.nn.relu(dense)

        M = tf.placeholder(tf.float32, [batch_size])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

        # Split the flat batch into per-step sequences for the recurrent cell.
        xs = batch_to_seq(h, nenv, steps_per_env)
        ms = batch_to_seq(M, nenv, steps_per_env)

        # Pick the layer-normalised variant only when requested.
        if layer_norm:
            cell, scope_name = utils.lnlstm, 'lnlstm'
        else:
            cell, scope_name = utils.lstm, 'lstm'
        h5, snew = cell(xs, ms, S, scope=scope_name, nh=nlstm)

        h = seq_to_batch(h5)
        recurrent_info = {
            'S': S,
            'M': M,
            'state': snew,
            'initial_state': np.zeros(S.shape.as_list(), dtype=float),
        }
        return fm, h, recurrent_info

Пример #2

0

Показать файл

Файл: policies.py Проект: Divyankpandey/baselines

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        """Build the recurrent policy/value graph and attach ``step``/``value``.

        :param sess: session whose ``run`` is called by ``step`` and ``value``
        :param ob_space: observation space, forwarded to ``observation_input``
        :param ac_space: action space, forwarded to ``make_pdtype``
        :param nbatch: total batch size; ``nbatch // nsteps`` gives the env count
        :param nsteps: number of steps per environment in one batch
        :param nlstm: LSTM hidden size (the state placeholder is ``2 * nlstm`` wide)
        :param reuse: forwarded to the ``"model"`` variable scope
        """
        nenv = nbatch // nsteps
        self.pdtype = make_pdtype(ac_space)
        X, _ = observation_input(ob_space, nbatch)

        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states
        with tf.variable_scope("model", reuse=reuse):
            features = nature_cnn(X)
            feature_seq = batch_to_seq(features, nenv, nsteps)
            mask_seq = batch_to_seq(M, nenv, nsteps)
            lstm_out, snew = lstm(feature_seq, mask_seq, S, 'lstm1', nh=nlstm)
            lstm_out = seq_to_batch(lstm_out)
            vf = fc(lstm_out, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(lstm_out)

        value_op = vf[:, 0]
        action_op = self.pd.sample()
        neglogp_op = self.pd.neglogp(action_op)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            # One forward pass: action, value, new LSTM state, -log p(action).
            feed = {X: ob, S: state, M: mask}
            return sess.run([action_op, value_op, snew, neglogp_op], feed)

        def value(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run(value_op, feed)

        self.X, self.M, self.S = X, M, S
        self.vf = vf
        self.step = step
        self.value = value

Пример #3

0

Показать файл

Файл: acer.py Проект: MrGoogol/baselines

def q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma):
    """
    Compute the Q_retrace targets by a backward recursion over the steps.

    :param R: Rewards
    :param D: Dones
    :param q_i: Q values for actions taken
    :param v: V values (split into nsteps + 1 entries; the last one bootstraps)
    :param rho_i: Importance weight for each action
    :param nenvs: Number of environments
    :param nsteps: Number of steps per environment
    :param gamma: Discount factor
    :return: Q_retrace values
    """
    # Each entry below is a list over steps of tensors checked to be [nenvs].
    clipped_rho = batch_to_seq(tf.minimum(1.0, rho_i), nenvs, nsteps, True)
    rewards = batch_to_seq(R, nenvs, nsteps, True)
    dones = batch_to_seq(D, nenvs, nsteps, True)
    q_taken = batch_to_seq(q_i, nenvs, nsteps, True)
    values = batch_to_seq(v, nenvs, nsteps + 1, True)

    running = values[-1]  # bootstrap from the value after the final step
    targets = [None] * nsteps
    for t in reversed(range(nsteps)):
        check_shape(
            [running, dones[t], rewards[t], clipped_rho[t], q_taken[t], values[t]],
            [[nenvs]] * 6)
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        targets[t] = running
        # Correct the carried value with the truncated importance weight.
        running = (clipped_rho[t] * (running - q_taken[t])) + values[t]
    return seq_to_batch(targets, flat=True)

Пример #4

0

Показать файл

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        """Construct the LSTM policy graph and expose ``step`` and ``value``.

        :param sess: session used by the ``step``/``value`` closures
        :param ob_space: observation space handed to ``observation_input``
        :param ac_space: action space handed to ``make_pdtype``
        :param nbatch: total batch size (environments times steps)
        :param nsteps: steps per environment in a batch
        :param nlstm: LSTM hidden size; the state width is ``nlstm * 2``
        :param reuse: ``reuse`` flag for the ``"model"`` variable scope
        """
        nenv = nbatch // nsteps
        self.pdtype = make_pdtype(ac_space)
        X, _unused_processed = observation_input(ob_space, nbatch)

        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states
        with tf.variable_scope("model", reuse=reuse):
            cnn_out = nature_cnn(X)
            seq_in = batch_to_seq(cnn_out, nenv, nsteps)
            seq_mask = batch_to_seq(M, nenv, nsteps)
            seq_out, snew = lstm(seq_in, seq_mask, S, 'lstm1', nh=nlstm)
            latent = seq_to_batch(seq_out)
            vf = fc(latent, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(latent)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def _feed(ob, state, mask):
            # Shared feed dict for both closures below.
            return {X: ob, S: state, M: mask}

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], _feed(ob, state, mask))

        def value(ob, state, mask):
            return sess.run(v0, _feed(ob, state, mask))

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value

Пример #5

0

Показать файл

Файл: policies.py Проект: Divyankpandey/baselines

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
        """Build a CNN+LSTM graph with softmax policy and Q heads.

        :param sess: session used by ``step`` to run the graph
        :param ob_space: observation space; ``shape`` must unpack to three values
        :param ac_space: action space; ``n`` gives the number of actions
        :param nenv: number of environments
        :param nsteps: steps per environment in a batch
        :param nstack: multiplier applied to the last observation dimension
        :param reuse: ``reuse`` flag for the ``"model"`` variable scope
        :param nlstm: LSTM hidden size; the state width is ``nlstm * 2``
        """
        nbatch = nenv * nsteps
        height, width, channels = ob_space.shape
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, (nbatch, height, width, channels * nstack))  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states
        with tf.variable_scope("model", reuse=reuse):
            features = nature_cnn(X)

            # Recurrent core over per-step sequences.
            feature_seq = batch_to_seq(features, nenv, nsteps)
            mask_seq = batch_to_seq(M, nenv, nsteps)
            lstm_out, snew = lstm(feature_seq, mask_seq, S, 'lstm1', nh=nlstm)
            lstm_out = seq_to_batch(lstm_out)

            pi_logits = fc(lstm_out, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(lstm_out, 'q', nact)

        action_op = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        self.X, self.M, self.S = X, M, S
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, state, mask, *args, **kwargs):
            # Returns actions, mus, states; extra arguments are ignored.
            actions, mus, states = sess.run(
                [action_op, pi, snew], {X: ob, S: state, M: mask})
            return actions, mus, states

        self.step = step

Пример #6

0

Показать файл

    def network_fn(X, nenv=1):
        """Flatten ``X`` and run it through an LSTM, printing debug information.

        ``nlstm`` and ``layer_norm`` are read from the enclosing scope, which is
        not visible in this snippet.

        :param X: input tensor; its first dimension is used as the batch size
        :param nenv: number of environments the batch is divided across
        :return: (LSTM output as a batch, dict with ``'S'``, ``'M'``, ``'state'``
            and a zero-filled ``'initial_state'``)
        """
        print("")
        print("IN HERE LSTM and this is X ",str(X))
        batch_size = X.shape[0]
        steps_per_env = batch_size // nenv

        flat_input = tf.layers.flatten(X)

        M = tf.placeholder(tf.float32, [batch_size])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2*nlstm])  # states

        xs = batch_to_seq(flat_input, nenv, steps_per_env)
        ms = batch_to_seq(M, nenv, steps_per_env)

        # Pick the layer-normalised variant only when requested.
        if layer_norm:
            cell, scope_name = utils.lnlstm, 'lnlstm'
        else:
            cell, scope_name = utils.lstm, 'lstm'
        h5, snew = cell(xs, ms, S, scope=scope_name, nh=nlstm)

        h = seq_to_batch(h5)

        # TODO: need to change initialization of state!
        state_shape = S.shape.as_list()
        initial_state = np.zeros(state_shape, dtype=float)

        print("")
        print("HHHHH ",str(state_shape))
        print(nenv)

        return h, {'S': S, 'M': M, 'state': snew, 'initial_state': initial_state}

Пример #7

0

Показать файл

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
        """Create the recurrent policy/Q graph and attach a ``step`` callable.

        :param sess: session used by ``step``
        :param ob_space: observation space with a three-element ``shape``
        :param ac_space: action space exposing ``n``
        :param nenv: number of environments
        :param nsteps: steps per environment in a batch
        :param nstack: factor applied to the observation's last dimension
        :param reuse: ``reuse`` flag for the ``"model"`` variable scope
        :param nlstm: LSTM hidden size; the state width is ``nlstm * 2``
        """
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        nact = ac_space.n
        state_shape = (nenv, nlstm*2)

        X = tf.placeholder(tf.uint8, (nbatch, nh, nw, nc * nstack))  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, list(state_shape))  # states
        with tf.variable_scope("model", reuse=reuse):
            cnn_out = nature_cnn(X)

            # lstm
            seq_out, snew = lstm(
                batch_to_seq(cnn_out, nenv, nsteps),
                batch_to_seq(M, nenv, nsteps),
                S, 'lstm1', nh=nlstm)
            latent = seq_to_batch(seq_out)

            pi_logits = fc(latent, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(latent, 'q', nact)

        a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros(state_shape, dtype=np.float32)
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, state, mask, *args, **kwargs):
            # Returns actions, mus, states; extra arguments are accepted and ignored.
            feed = {X: ob, S: state, M: mask}
            a0, pi0, s = sess.run([a, pi, snew], feed)
            return a0, pi0, s

        self.step = step

Пример #8

0

Показать файл

def q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma):
    """
    Build the Q_retrace targets, walking the steps from last to first.

    :param R: Rewards
    :param D: Dones
    :param q_i: Q values for actions taken
    :param v: V values (one more step than the other inputs)
    :param rho_i: Importance weight for each action
    :param nenvs: Number of environments
    :param nsteps: Number of steps per environment
    :param gamma: Discount factor
    :return: Q_retrace values
    """
    # Lists of length nsteps (nsteps + 1 for the values), entries of shape [nenvs].
    rho_steps = batch_to_seq(tf.minimum(1.0, rho_i), nenvs, nsteps, True)
    reward_steps = batch_to_seq(R, nenvs, nsteps, True)
    done_steps = batch_to_seq(D, nenvs, nsteps, True)
    q_steps = batch_to_seq(q_i, nenvs, nsteps, True)
    value_steps = batch_to_seq(v, nenvs, nsteps + 1, True)

    carry = value_steps[-1]
    reversed_targets = []
    step = nsteps - 1
    while step >= 0:
        check_shape(
            [carry, done_steps[step], reward_steps[step],
             rho_steps[step], q_steps[step], value_steps[step]],
            [[nenvs]] * 6)
        carry = reward_steps[step] + gamma * carry * (1.0 - done_steps[step])
        reversed_targets.append(carry)
        carry = (rho_steps[step] * (carry - q_steps[step])) + value_steps[step]
        step -= 1

    # Targets were collected last-to-first; restore chronological order.
    ordered_targets = list(reversed(reversed_targets))
    return seq_to_batch(ordered_targets, flat=True)

Пример #9

0

Показать файл

Файл: models.py Проект: fiorenza2/baselines

    def network_fn(X, nenv=1):
        """Flatten ``X`` and feed it through an LSTM (optionally layer-normalised).

        ``nlstm`` and ``layer_norm`` are read from the enclosing scope, which is
        not visible in this snippet.

        :param X: input tensor; its first dimension is used as the batch size
        :param nenv: number of environments the batch is divided across
        :return: (LSTM output as a batch, dict with the state placeholder ``'S'``,
            mask placeholder ``'M'``, new state ``'state'`` and a zero-filled
            ``'initial_state'`` array)
        """
        batch_size = X.shape[0]
        steps_per_env = batch_size // nenv

        flat_input = tf.layers.flatten(X)

        M = tf.placeholder(tf.float32, [batch_size])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

        input_seq = batch_to_seq(flat_input, nenv, steps_per_env)
        mask_seq = batch_to_seq(M, nenv, steps_per_env)

        if layer_norm:
            cell, scope_name = utils.lnlstm, 'lnlstm'
        else:
            cell, scope_name = utils.lstm, 'lstm'
        seq_out, snew = cell(input_seq, mask_seq, S, scope=scope_name, nh=nlstm)

        recurrent_info = {
            'S': S,
            'M': M,
            'state': snew,
            'initial_state': np.zeros(S.shape.as_list(), dtype=float),
        }
        return seq_to_batch(seq_out), recurrent_info

Пример #10

0

Показать файл

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        """Build a conv -> fc -> LSTM graph with policy and value heads.

        :param sess: session used by the ``step``/``value`` closures
        :param ob_space: observation space with a three-element ``shape``
        :param ac_space: action space exposing ``n``; also given to ``make_pdtype``
        :param nbatch: total batch size; ``nbatch // nsteps`` gives the env count
        :param nsteps: steps per environment in a batch
        :param nlstm: LSTM hidden size; the state width is ``nlstm * 2``
        :param reuse: ``reuse`` flag for the ``"model"`` variable scope
        """
        nenv = nbatch // nsteps

        height, width, channels = ob_space.shape
        nact = ac_space.n
        gain = np.sqrt(2)

        X = tf.placeholder(tf.uint8, (nbatch, height, width, channels))  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
        with tf.variable_scope("model", reuse=reuse):
            scaled = tf.cast(X, tf.float32) / 255.
            h = conv(scaled, 'c1', nf=32, rf=8, stride=4, init_scale=gain)
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=gain)
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=gain)
            h4 = fc(conv_to_fc(h3), 'fc1', nh=512, init_scale=gain)

            feature_seq = batch_to_seq(h4, nenv, nsteps)
            mask_seq = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(feature_seq, mask_seq, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            # Both heads pass an identity activation to ``fc``.
            pi = fc(h5, 'pi', nact, act=lambda x: x)
            vf = fc(h5, 'v', 1, act=lambda x: x)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run([a0, v0, snew, neglogp0], feed)

        def value(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run(v0, feed)

        self.X, self.M, self.S = X, M, S
        self.pi, self.vf = pi, vf
        self.step = step
        self.value = value

Пример #11

0

Показать файл

Файл: policies.py Проект: ShikhaSurana/bp_tr-ppo-rb

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=64, reuse=False):
        """Build an LSTM policy with a separate feed-forward value network.

        The policy mean comes from fc -> LSTM -> fc; the value estimate comes
        from two tanh layers applied directly to the observations.

        :param sess: session used by the closures attached to the instance
        :param ob_space: observation space; ``shape`` is appended to the batch dim
        :param ac_space: action space; ``shape[0]`` is the action dimension
        :param nbatch: total batch size; ``nbatch // nsteps`` gives the env count
        :param nsteps: steps per environment in a batch
        :param nlstm: LSTM hidden size; the state width is ``nlstm * 2``
        :param reuse: ``reuse`` flag for the ``"model"`` variable scope
        """
        nenv = nbatch // nsteps
        print(f'{nlstm}')
        actdim = ac_space.shape[0]
        X = tf.placeholder(tf.float32, (nbatch,) + ob_space.shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states
        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh

            # Policy branch.
            pi_hidden = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_seq = batch_to_seq(pi_hidden, nenv, nsteps)
            mask_seq = batch_to_seq(M, nenv, nsteps)
            lstm_out, snew = lstm(pi_seq, mask_seq, S, 'lstm1', nh=nlstm)
            lstm_out = seq_to_batch(lstm_out)
            pi = fc(lstm_out, 'pi', actdim, init_scale=0.01)
            logstd = tf.get_variable(
                name="logstd", shape=[1, actdim], initializer=tf.zeros_initializer())

            # Value branch, fed from the raw observations.
            vf_hidden1 = activ(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_hidden2 = activ(fc(vf_hidden1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_hidden2, 'vf', 1)

        # ``pi * 0.0 + logstd`` gives logstd the same leading dimension as pi.
        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        a0 = self.pd.sample()
        v0 = vf[:, 0]
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run([a0, v0, snew, neglogp0], feed)

        def value(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run(v0, feed)

        def get_act(ob, state, mask):
            # Sampled action only.
            return sess.run(a0, {X: ob, S: state, M: mask})

        def get_mean(ob, state, mask):
            # ``pi`` output together with the new LSTM state.
            mean, state_new = sess.run([pi, snew], {X: ob, S: state, M: mask})
            return mean, state_new

        self.X, self.M, self.S = X, M, S
        self.pi, self.vf = pi, vf
        self.step = step
        self.value = value
        self.act = get_act
        self.mean = get_mean

Пример #12

0

Показать файл

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        """Build a three-tower convolutional network feeding an LSTM with policy and Q heads.

        Reads ``self.lstm_units`` and ``self.depth``; presumably both are set by
        the parent ``__init__`` called first -- TODO confirm.

        :param sess: session, stored on the instance as ``self.sess``
        :param ob_space: observation space; ``shape`` must unpack to three values
        :param ac_space: action space; ``n`` gives the number of actions
        :param nenv: number of environments
        :param nsteps: steps per environment in a batch
        :param nstack: multiplier applied to the last observation dimension
        :param reuse: ``reuse`` flag for the ``"model"`` and ``"Towers"`` scopes
        """
        super().__init__(sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=reuse)
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        nlstm = self.lstm_units
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch]) # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) # states
        with tf.variable_scope("model", reuse=reuse):
            # NOTE(review): X is rebound to the float cast here, so self.X below
            # is the cast tensor rather than the uint8 placeholder -- confirm intended.
            X = tf.cast(X, tf.float32)
            with tf.variable_scope("Towers", reuse=reuse):
                # Tower 1: two 3x3 convolutions followed by a (22, 80) max-pool.
                with tf.variable_scope("tower_1"):
                    tower1 = tf.layers.conv2d(inputs=X, filters=64, kernel_size=(3, 3), strides=(1, 1),
                                              padding='SAME', kernel_initializer=tf.initializers.variance_scaling)
                    tower1 = tf.layers.conv2d(inputs=tower1, filters=32, kernel_size=(3, 3), strides=(1, 1),
                                              padding='SAME', kernel_initializer=tf.initializers.variance_scaling)
                    tower1 = tf.layers.max_pooling2d(tower1, pool_size=(22, 80), strides=(22, 80))

                # Tower 2: 2x2 pool, self.depth conv+relu layers, then an (11, 40) pool.
                with tf.variable_scope("tower_2"):
                    tower2 = tf.layers.max_pooling2d(X, pool_size=(2, 2), strides=(2, 2))
                    for _ in range(self.depth):
                        tower2 = tf.layers.conv2d(inputs=tower2, filters=32, kernel_size=(3, 3), strides=(1, 1),
                                                  padding='SAME', kernel_initializer=tf.initializers.variance_scaling)
                        tower2 = tf.nn.relu(tower2)
                    tower2 = tf.layers.max_pooling2d(tower2, pool_size=(11, 40), strides=(11, 40))

                # Tower 3: (3, 6) pool, self.depth conv+relu layers, then an (8, 14) pool.
                with tf.variable_scope("tower_3"):
                    tower3 = tf.layers.max_pooling2d(X, pool_size=(3, 6), strides=(3, 6), padding='SAME')
                    for _ in range(self.depth):
                        tower3 = tf.layers.conv2d(inputs=tower3, filters=32, kernel_size=(3, 3), strides=(1, 1),
                                                  padding='SAME', kernel_initializer=tf.initializers.variance_scaling)
                        tower3 = tf.nn.relu(tower3)
                    tower3 = tf.layers.max_pooling2d(tower3, pool_size=(8, 14), strides=(8, 14), padding='SAME')

                # Join the towers along the last axis; this assumes their other
                # dimensions agree after pooling -- TODO confirm for the input size used.
                concat = tf.concat([tower1, tower2, tower3], axis=-1)

            # lstm
            xs = batch_to_seq(concat, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h5, 'q', nact)

        self.a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        self.snew = snew
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi  # actual policy params now
        self.q = q
        self.sess = sess

Пример #13

0

Показать файл

Файл: acktr.py Проект: Washeh/msc_thesis

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=128, reuse=False):
        """Build a conv -> fc -> LSTM policy/value graph with save/load helpers.

        :param sess: session used by ``step``, ``value``, ``save`` and ``load``
        :param ob_space: observation space; ``shape`` must unpack to three values
        :param ac_space: action space; ``n`` gives the number of actions
        :param nenv: number of environments
        :param nsteps: steps per environment in a batch
        :param nstack: multiplier applied to the last observation dimension
        :param nlstm: LSTM hidden size; the state width is ``nlstm * 2``
        :param reuse: ``reuse`` flag for the model variable scope
        """
        scope = "model"
        nbatch = nenv*nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc*nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape, name="observations")  # obs
        M = tf.placeholder(tf.float32, [nbatch], name="mask")  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2], name="states")  # states
        with tf.variable_scope(scope, reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact, act=lambda x:x)
            vf = fc(h5, 'v', 1, act=lambda x:x)

            # The saver only covers the trainable variables collected for this scope.
            trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
            self._saver = tf.train.Saver(trainable_vars)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            # Returns action, value and new LSTM state.
            a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
            return a, v, s

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        def save(path, name):
            # exist_ok tolerates an existing directory but still raises if
            # ``path`` exists as a regular file, instead of hiding that case.
            os.makedirs(path, exist_ok=True)
            # ``path`` is concatenated as-is, so it must end with a separator.
            self._saver.save(sess, path+name)

        def load(path, name):
            # Restore only if the checkpoint's ``.index`` file exists;
            # otherwise log a warning and leave the variables untouched.
            if os.path.exists(path+name+'.index'):
                self._saver.restore(sess, path+name)
            else:
                tf.logging.warn('Failed restoring vars from %s' % path)

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
        self.save = save
        self.load = load

Пример #14

0

Показать файл

            def network_fn(X, nenv=1):
                """Build an MLP -> LSTM -> MLP network on top of ``X``.

                ``num_layers_in``, ``num_hidden_in``, ``layer_norm_in``,
                ``activation``, ``nlstm``, ``layer_norm_lstm``, ``num_layers_out``,
                ``num_hidden_out`` and ``layer_norm_out`` are read from the
                enclosing scope, which is not visible in this snippet.

                :param X: input tensor; its first dimension is used as the batch size
                :param nenv: number of environments the batch is divided across
                :return: (output of the last layer, dict with the state placeholder
                    ``'S'``, mask placeholder ``'M'``, new LSTM state ``'state'``
                    and a zero-filled ``'initial_state'`` array)
                """
                nbatch = X.shape[0]
                nsteps = nbatch // nenv

                h = X
                with tf.variable_scope('mlp_in', reuse=tf.AUTO_REUSE):
                    for i in range(num_layers_in):
                        h = fc(h,
                               'mlp_in_fc{}'.format(i),
                               nh=num_hidden_in,
                               init_scale=np.sqrt(2))
                        if layer_norm_in:
                            h = tf.contrib.layers.layer_norm(h,
                                                             center=True,
                                                             scale=True)
                        h = activation(h)

                # Flatten the output of the input MLP. The original flattened X
                # here, which discarded every 'mlp_in' layer built above. With
                # num_layers_in == 0, h is still X and the result is unchanged.
                h = tf.layers.flatten(h)

                M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
                S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

                xs = batch_to_seq(h, nenv, nsteps)
                ms = batch_to_seq(M, nenv, nsteps)

                if layer_norm_lstm:
                    h5, snew = utils.lnlstm(xs,
                                            ms,
                                            S,
                                            scope='lnlstm',
                                            nh=nlstm)
                else:
                    h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

                h = seq_to_batch(h5)

                with tf.variable_scope('mlp_out', reuse=tf.AUTO_REUSE):
                    for i in range(num_layers_out):
                        h = fc(h,
                               'mlp_out_fc{}'.format(i),
                               nh=num_hidden_out,
                               init_scale=np.sqrt(2))
                        if layer_norm_out:
                            h = tf.contrib.layers.layer_norm(h,
                                                             center=True,
                                                             scale=True)
                        h = activation(h)

                initial_state = np.zeros(S.shape.as_list(), dtype=float)

                return h, {
                    'S': S,
                    'M': M,
                    'state': snew,
                    'initial_state': initial_state
                }

Пример #15

0

Показать файл

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
        """Build a conv -> fc -> LSTM graph with softmax policy and Q heads.

        :param sess: session used by ``step``
        :param ob_space: observation space; ``shape`` must unpack to three values
        :param ac_space: action space; ``n`` gives the number of actions
        :param nenv: number of environments
        :param nsteps: steps per environment in a batch
        :param nstack: multiplier applied to the last observation dimension
        :param reuse: ``reuse`` flag for the ``"model"`` variable scope
        :param nlstm: LSTM hidden size; the state width is ``nlstm * 2``
        """
        nbatch = nenv * nsteps
        height, width, channels = ob_space.shape
        nact = ac_space.n
        gain = np.sqrt(2)

        X = tf.placeholder(tf.uint8, (nbatch, height, width, channels * nstack))  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
        with tf.variable_scope("model", reuse=reuse):
            scaled = tf.cast(X, tf.float32) / 255.
            h = conv(scaled, 'c1', nf=32, rf=8, stride=4, init_scale=gain)
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=gain)
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=gain)
            h4 = fc(conv_to_fc(h3), 'fc1', nh=512, init_scale=gain)

            # lstm
            feature_seq = batch_to_seq(h4, nenv, nsteps)
            mask_seq = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(feature_seq, mask_seq, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            pi_logits = fc(h5, 'pi', nact, act=lambda x: x, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h5, 'q', nact, act=lambda x: x)

        action_op = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
        self.X, self.M, self.S = X, M, S
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, state, mask, *args, **kwargs):
            # Returns actions, mus, states; extra arguments are ignored.
            feed = {X: ob, S: state, M: mask}
            actions, mus, states = sess.run([action_op, pi, snew], feed)
            return actions, mus, states

        self.step = step


# For Mujoco. Taken from PPOSGD

Пример #16

0

Показать файл

def strip(var, n_envs, n_steps, flat=False):
    """
    Drop the final step from a batch that holds ``n_steps + 1`` steps.

    :param var: (TensorFlow Tensor) The input Tensor
    :param n_envs: (int) The number of environments
    :param n_steps: (int) The number of steps to keep for each environment
    :param flat: (bool) If the input Tensor is flat
    :return: (TensorFlow Tensor) the input tensor, without the last step in the batch
    """
    # Split into n_steps + 1 per-step entries, then rebuild from all but the last.
    per_step = batch_to_seq(var, n_envs, n_steps + 1, flat)
    kept_steps = per_step[:-1]
    return seq_to_batch(kept_steps, flat)

Пример #17

0

Показать файл

Файл: policies.py Проект: oehm/coinrun

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, create_additional=True, nlstm=256):
        """Build a CNN+LSTM policy, optionally with value and neglogp outputs.

        :param sess: session used by the ``step``/``value`` closures
        :param ob_space: observation space handed to ``observation_input``
        :param ac_space: action space handed to ``make_pdtype``
        :param nbatch: total batch size; ``nbatch // nsteps`` gives the env count
        :param nsteps: steps per environment in a batch
        :param create_additional: when true, also build the value head and the
            neglogp op and expose ``self.vf``/``self.value``; when false,
            ``step`` returns zero arrays in their place
        :param nlstm: LSTM hidden size; the state width is ``nlstm * 2``
        """
        nenv = nbatch // nsteps
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            features, self.dropout_assign_ops = choose_cnn(processed_x)
            feature_seq = batch_to_seq(features, nenv, nsteps)
            mask_seq = batch_to_seq(M, nenv, nsteps)
            lstm_out, snew = lstm(feature_seq, mask_seq, S, 'lstm1', nh=nlstm)
            lstm_out = seq_to_batch(lstm_out)
            if create_additional:
                vf = fc(lstm_out, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(lstm_out)

        a0 = self.pd.sample()
        if create_additional:
            neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            if not create_additional:
                # No value/neglogp ops exist; return zeros shaped like the actions.
                a, s = sess.run([a0, snew], feed)
                return a, np.zeros_like(a), s, np.zeros_like(a)
            a, v, s, neglogp = sess.run([a0, vf, snew, neglogp0], feed)
            return a, v, s, neglogp

        def value(ob, state, mask):
            # Only attached to the instance when the value head was built.
            return sess.run(vf, {X: ob, S: state, M: mask})

        self.X, self.M, self.S = X, M, S
        if create_additional:
            self.vf = vf
            self.value = value
        self.step = step

Пример #18

0

Показать файл

Файл: policies.py Проект: ethanabrooks/baselines

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, size_mem=256, reuse=False):
        """Build a preprocess -> fc -> memory graph with policy and value heads.

        ``self.preprocess`` and ``self.memory_fn`` are defined outside this
        snippet and supply the feature extractor and the recurrent core.

        :param sess: session used by the ``step``/``value`` closures
        :param ob_space: observation space; ``shape`` is appended to the batch dim
        :param ac_space: action space exposing ``n``; also given to ``make_pdtype``
        :param nbatch: total batch size; ``nbatch // nsteps`` gives the env count
        :param nsteps: steps per environment in a batch
        :param size_mem: memory size; the state placeholder is ``size_mem * 2`` wide
        :param reuse: ``reuse`` flag for the ``"model"`` variable scope
        """
        nenv = nbatch // nsteps
        nact = ac_space.n
        state_shape = (nenv, size_mem * 2)

        X = tf.placeholder(tf.uint8, (nbatch, ) + ob_space.shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, list(state_shape))  # states
        with tf.variable_scope("model", reuse=reuse):
            features = self.preprocess(X)
            features = fc(features, 'fc1', nh=512, init_scale=np.sqrt(2))
            feature_seq = batch_to_seq(features, nenv, nsteps)
            mask_seq = batch_to_seq(M, nenv, nsteps)
            mem_out, snew = self.memory_fn(feature_seq, mask_seq, S, nh=size_mem)
            mem_out = seq_to_batch(mem_out)
            pi = fc(mem_out, 'pi', nact, act=lambda x: x)
            vf = fc(mem_out, 'v', 1, act=lambda x: x)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros(state_shape, dtype=np.float32)

        def step(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run([a0, v0, snew, neglogp0], feed)

        def value(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run(v0, feed)

        self.X, self.M, self.S = X, M, S
        self.pi, self.vf = pi, vf
        self.step = step
        self.value = value

Пример #19

0

Показать файл

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False, param=None):
        """Build a ``nature_cnn`` -> LSTM graph with policy and value heads.

        :param sess: session used by the ``step``/``value`` closures
        :param ob_space: observation space; ``shape`` must unpack to three values
        :param ac_space: action space exposing ``n``; also given to ``make_pdtype``
        :param nbatch: total batch size; ``nbatch // nsteps`` gives the env count
        :param nsteps: steps per environment in a batch
        :param nlstm: LSTM hidden size; the state width is ``nlstm * 2``
        :param reuse: ``reuse`` flag for the ``"model"`` variable scope
        :param param: accepted but not used in this method
        """
        nenv = nbatch // nsteps

        height, width, channels = ob_space.shape
        nact = ac_space.n
        state_shape = (nenv, nlstm * 2)

        X = tf.placeholder(tf.uint8, (nbatch, height, width, channels))  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, list(state_shape))  # states
        with tf.variable_scope("model", reuse=reuse):
            features = nature_cnn(X)
            feature_seq = batch_to_seq(features, nenv, nsteps)
            mask_seq = batch_to_seq(M, nenv, nsteps)
            lstm_out, snew = lstm(feature_seq, mask_seq, S, 'lstm1', nh=nlstm)
            lstm_out = seq_to_batch(lstm_out)
            pi = fc(lstm_out, 'pi', nact)
            vf = fc(lstm_out, 'v', 1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros(state_shape, dtype=np.float32)

        def step(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run([a0, v0, snew, neglogp0], feed)

        def value(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run(v0, feed)

        self.X, self.M, self.S = X, M, S
        self.pi, self.vf = pi, vf
        self.step = step
        self.value = value

Пример #20

0

Показать файл

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=8, reuse=False):
        """Build a small fc -> LSTM policy/value graph for flat observations.

        Assumes ``ob_space`` and ``ac_space`` are already flattened,
        e.g. original action_space (3,2,3) -> new action_space (36).

        :param sess: session used by the ``step``/``value`` closures
        :param ob_space: observation space; ``shape`` is appended to the batch dim
        :param ac_space: action space handed to ``make_pdtype``
        :param nbatch: total batch size; ``nbatch // nsteps`` gives the env count
        :param nsteps: steps per environment in a batch
        :param nlstm: LSTM hidden size; the state width is ``nlstm * 2``
        :param reuse: ``reuse`` flag for the ``"model"`` variable scope
        """
        nenv = nbatch // nsteps
        print ("envs and steps and batch:", nenv, nsteps, nbatch)
        pdtype = make_pdtype(ac_space)
        state_shape = (nenv, nlstm*2)

        X = tf.placeholder(tf.float32, (nbatch,) + ob_space.shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, list(state_shape))  # states
        with tf.variable_scope("model", reuse=reuse):
            # A single dense layer stands in for a convolutional front end.
            hidden = fc(X, 'fc1', nh=16, init_scale=np.sqrt(2))
            hidden_seq = batch_to_seq(hidden, nenv, nsteps)
            mask_seq = batch_to_seq(M, nenv, nsteps)
            lstm_out, snew = lstm(hidden_seq, mask_seq, S, 'lstm1', nh=nlstm)
            lstm_out = seq_to_batch(lstm_out)
            pdparam = fc(lstm_out, 'pi', pdtype.param_shape()[0], act=lambda x:x)
            vf = fc(lstm_out, 'v', 1, act=lambda x:x)

        self.pdtype = pdtype
        self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros(state_shape, dtype=np.float32)

        def step(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run([a0, v0, snew, neglogp0], feed)

        def value(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run(v0, feed)

        self.X, self.M, self.S = X, M, S
        self.vf = vf
        self.step = step
        self.value = value

Пример #21

0

Показать файл

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=64, reuse=False):
        """Build a layer-norm LSTM policy whose heads also see the raw input.

        The observation placeholder is fed straight into ``lnlstm``; the LSTM
        output is concatenated with the observation again before separate
        policy and value MLP heads.

        :param sess: session used by ``step`` and ``value``
        :param ob_space: observation space; ``.shape`` is passed to
            ``add_batch_dimension``
        :param ac_space: action space; ``.n`` sizes the policy head and it is
            handed to ``make_pdtype``
        :param nbatch: batch size of the X and M placeholders
        :param nsteps: steps per environment; ``nenv = nbatch // nsteps``
        :param nlstm: ``nh`` given to ``lnlstm``
        :param reuse: forwarded to ``tf.variable_scope("model", ...)``
        """
        n_envs = nbatch // nsteps
        batched_shape = add_batch_dimension(ob_space.shape, nbatch)
        nact = ac_space.n
        root_two = np.sqrt(2)

        X = tf.placeholder(tf.float32, batched_shape, name="X")  # observations
        M = tf.placeholder(tf.float32, [nbatch], name="M")  # mask (done t-1)
        S = tf.placeholder(tf.float32, [n_envs, nlstm * 2], name="S")  # states

        with tf.variable_scope("model", reuse=reuse):
            obs_seq = batch_to_seq(X, n_envs, nsteps)
            done_seq = batch_to_seq(M, n_envs, nsteps)
            recurrent_out, snew = lnlstm(obs_seq, done_seq, S, 'lstm1', nh=nlstm)
            # Heads receive the LSTM output joined with the raw observation.
            joint = tf.concat([seq_to_batch(recurrent_out), X], 1)

            # Policy head
            pi_hidden = fc(joint, 'pi_fc1', nh=128, init_scale=root_two, act=tf.nn.relu)
            pi = fc(pi_hidden, 'pi', nact, act=tf.tanh, init_scale=0.01)

            # Value head
            vf_hidden = fc(joint, 'vf_fc1', nh=128, init_scale=root_two, act=tf.nn.relu)
            vf = fc(vf_hidden, 'vf', 1, act=lambda x: x)

            # Log-std variable appended to the policy output below.
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, nact],
                initializer=tf.zeros_initializer(),
            )

        # `pi * 0.0 + logstd` expands logstd to the same shape as pi.
        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((n_envs, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run([a0, v0, snew, neglogp0], feed)

        def value(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run(v0, feed)

        self.X, self.M, self.S = X, M, S
        self.pi, self.vf = pi, vf
        self.step, self.value = step, value

Пример #22

0

Показать файл

Файл: policies.py Проект: IcarusTan/baselines

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
        """Build a conv + layer-norm LSTM actor-critic graph for image input.

        :param sess: session used by ``step`` and ``value``
        :param ob_space: observation space whose ``.shape`` unpacks to
            (height, width, channels)
        :param ac_space: action space; ``.n`` sizes the policy head
        :param nenv: number of environments (first dim of the state placeholder)
        :param nsteps: steps per environment; ``nbatch = nenv * nsteps``
        :param nstack: multiplier applied to the channel count of the input
        :param nlstm: ``nh`` given to ``lnlstm``
        :param reuse: forwarded to ``tf.variable_scope("model", ...)``
        """
        def identity(tensor):
            return tensor

        nbatch = nenv * nsteps
        height, width, channels = ob_space.shape
        nact = ac_space.n
        gain = np.sqrt(2)

        X = tf.placeholder(tf.uint8, (nbatch, height, width, channels * nstack))  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

        with tf.variable_scope("model", reuse=reuse):
            # uint8 pixels are cast to float and divided by 255 before the convs.
            scaled = tf.cast(X, tf.float32) / 255.
            c1 = conv(scaled, 'c1', nf=32, rf=8, stride=4, init_scale=gain)
            c2 = conv(c1, 'c2', nf=64, rf=4, stride=2, init_scale=gain)
            c3 = conv(c2, 'c3', nf=64, rf=3, stride=1, init_scale=gain)
            dense = fc(conv_to_fc(c3), 'fc1', nh=512, init_scale=gain)

            dense_seq = batch_to_seq(dense, nenv, nsteps)
            mask_seq = batch_to_seq(M, nenv, nsteps)
            rnn_out, snew = lnlstm(dense_seq, mask_seq, S, 'lstm1', nh=nlstm)
            rnn_flat = seq_to_batch(rnn_out)

            pi = fc(rnn_flat, 'pi', nact, act=identity)
            vf = fc(rnn_flat, 'v', 1, act=identity)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            # Unpacking enforces exactly three results, returned as a tuple.
            action, val, new_state = sess.run([a0, v0, snew], {X: ob, S: state, M: mask})
            return action, val, new_state

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X, self.M, self.S = X, M, S
        self.pi, self.vf = pi, vf
        self.step, self.value = step, value

Пример #23

0

Показать файл

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
        """Build a conv + layer-norm LSTM actor-critic graph for image input.

        Shape remarks in the inline comments are carried over from the
        original author's annotations and are not checked by this code.

        :param sess: session used by ``step`` and ``value``
        :param ob_space: observation space whose ``.shape`` unpacks to
            (height, width, channels)
        :param ac_space: action space; ``.n`` sizes the policy head
        :param nenv: number of environments
        :param nsteps: steps per environment; ``nbatch = nenv * nsteps``
        :param nstack: multiplier applied to the input channel count
        :param nlstm: ``nh`` given to ``lnlstm``
        :param reuse: forwarded to ``tf.variable_scope("model", ...)``
        """
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        nact = ac_space.n
        init = np.sqrt(2)
        no_act = lambda x: x

        X = tf.placeholder(tf.uint8, (nbatch, nh, nw, nc * nstack))  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

        with tf.variable_scope("model", reuse=reuse):
            pixels = tf.cast(X, tf.float32) / 255.
            feat = conv(pixels, 'c1', nf=32, rf=8, stride=4, init_scale=init)
            feat = conv(feat, 'c2', nf=64, rf=4, stride=2, init_scale=init)
            feat = conv(feat, 'c3', nf=64, rf=3, stride=1, init_scale=init)
            feat = conv_to_fc(feat)
            h4 = fc(feat, 'fc1', nh=512, init_scale=init)

            # Per original notes: a list of nsteps items, one per time step.
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            # snew is the recurrent state produced after the sequence.
            step_outputs, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            # Back to a single batch-major tensor for the heads.
            h5 = seq_to_batch(step_outputs)
            pi = fc(h5, 'pi', nact, act=no_act)  # one output per action
            vf = fc(h5, 'v', 1, act=no_act)  # single value output

        v0 = vf[:, 0]  # drop the trailing size-1 axis of the value head
        a0 = sample(pi)  # action tensor obtained from `sample` on the policy output
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            a, v, s = sess.run([a0, v0, snew], {X: ob, S: state, M: mask})
            return a, v, s

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value

Пример #24

0

Показать файл

Файл: policies.py Проект: xzblueofsky/baselines

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        """Build a ``nature_cnn`` + layer-norm LSTM actor-critic graph.

        :param sess: session used by ``step`` and ``value``
        :param ob_space: observation space whose ``.shape`` unpacks to three
            dimensions
        :param ac_space: action space; ``.n`` sizes the policy head and it is
            handed to ``make_pdtype``
        :param nbatch: batch size of the X and M placeholders
        :param nsteps: steps per environment; ``nenv = nbatch // nsteps``
        :param nlstm: ``nh`` given to ``lnlstm``
        :param reuse: forwarded to ``tf.variable_scope("model", ...)``
        """
        n_envs = nbatch // nsteps
        nh, nw, nc = ob_space.shape
        nact = ac_space.n
        state_shape = (n_envs, nlstm * 2)

        X = tf.placeholder(tf.uint8, (nbatch, nh, nw, nc))  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, list(state_shape))  # states

        with tf.variable_scope("model", reuse=reuse):
            cnn_out = nature_cnn(X)
            cnn_seq = batch_to_seq(cnn_out, n_envs, nsteps)
            mask_seq = batch_to_seq(M, n_envs, nsteps)
            rnn_seq, snew = lnlstm(cnn_seq, mask_seq, S, 'lstm1', nh=nlstm)
            latent = seq_to_batch(rnn_seq)
            pi = fc(latent, 'pi', nact)
            vf = fc(latent, 'v', 1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros(state_shape, dtype=np.float32)

        def step(ob, state, mask):
            fetches = [a0, v0, snew, neglogp0]
            return sess.run(fetches, {X: ob, S: state, M: mask})

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X, self.M, self.S = X, M, S
        self.pi, self.vf = pi, vf
        self.step, self.value = step, value

Пример #25

0

Показать файл

    def network_fn(X, nenv=1):
        """Two-stream conv features followed by an (optionally layer-norm) LSTM.

        ``X`` is split in two along axis 1; each half is squeezed, shifted by
        -128 and passed through ``vggm1234``. The concatenated features drive
        the LSTM, whose output goes through a 4-unit tanh layer.

        :param X: input tensor; ``X.shape[0]`` is taken as the batch size
        :param nenv: number of environments; ``nsteps = nbatch // nenv``
        :return: ``((feat, h), info)`` where ``info`` holds the state and mask
            placeholders, the new state tensor and a zero initial state
        """
        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        # Split into the two streams and centre each around zero.
        ob_g, ob_l = [
            tf.squeeze(half, axis=1) - 128.0
            for half in tf.split(X, 2, axis=1)
        ]

        # Conv layer
        net_g = vggm1234(ob_g)
        net_l = vggm1234(ob_l)
        feat = tf.concat([net_g, net_l], 1)

        # LSTM
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

        feat_seq = batch_to_seq(feat, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)

        # Pick the recurrent cell and its variable scope together.
        cell, cell_scope = (
            (utils.lnlstm, 'lnlstm') if layer_norm else (utils.lstm, 'lstm')
        )
        rnn_seq, snew = cell(feat_seq, mask_seq, S, scope=cell_scope, nh=nlstm)

        rnn_flat = seq_to_batch(rnn_seq)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        # FC
        h = slim.fully_connected(rnn_flat, 4, scope='fc', activation_fn=tf.nn.tanh)

        info = {
            'S': S,
            'M': M,
            'state': snew,
            'initial_state': initial_state,
        }
        return (feat, h), info

Пример #26

0

Показать файл

def q_retrace(rewards, dones, q_i, values, rho_i, n_envs, n_steps, gamma):
    """
    Calculates the target Q-retrace

    The batch tensors are split into per-step lists, then the target is
    accumulated backwards from the final value estimate.

    :param rewards: ([TensorFlow Tensor]) The rewards
    :param dones: ([TensorFlow Tensor]) The done flags
    :param q_i: ([TensorFlow Tensor]) The Q values for actions taken
    :param values: ([TensorFlow Tensor]) The output of the value functions
        (split into ``n_steps + 1`` steps)
    :param rho_i: ([TensorFlow Tensor]) The importance weight for each action
    :param n_envs: (int) The number of environments
    :param n_steps: (int) The number of steps to run for each environment
    :param gamma: (float) The discount value
    :return: ([TensorFlow Tensor]) the target Q-retrace
    """
    def per_step(tensor, steps=n_steps):
        # Flat split: a list of `steps` tensors, each checked below as [n_envs].
        return batch_to_seq(tensor, n_envs, steps, True)

    rho_bar = per_step(tf.minimum(1.0, rho_i))  # importance weights clipped at 1
    reward_seq = per_step(rewards)
    done_seq = per_step(dones)
    q_is = per_step(q_i)
    value_sequence = per_step(values, n_steps + 1)

    # Start from the value of the step after the last one.
    qret = value_sequence[-1]
    targets = []
    for t in reversed(range(n_steps)):
        check_shape(
            [qret, done_seq[t], reward_seq[t], rho_bar[t], q_is[t], value_sequence[t]],
            [[n_envs]] * 6,
        )
        qret = reward_seq[t] + gamma * qret * (1.0 - done_seq[t])
        targets.append(qret)
        # Correction carried over to the previous step.
        qret = (rho_bar[t] * (qret - q_is[t])) + value_sequence[t]

    # Targets were collected last-to-first; restore chronological order.
    return seq_to_batch(targets[::-1], flat=True)

Пример #27

0

Показать файл

    def network_fn(X, nenv=1):
        """MLP feature layers followed by an (optionally layer-norm) LSTM.

        All but the last entry of ``hiddens`` size the fully-connected layers;
        the last entry is the LSTM width.

        :param X: input tensor; ``X.shape[0]`` is taken as the batch size
        :param nenv: number of environments; ``nsteps = nbatch // nenv``
        :return: ``(h, info)`` where ``info`` holds the state and mask
            placeholders, the new state tensor and a zero initial state
        """
        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        h = tf.layers.flatten(X)
        for idx, width in enumerate(hiddens[:-1]):
            h = utils.fc(h, 'mlp_fc{}'.format(idx), nh=width, init_scale=np.sqrt(2))
            if layer_norm:
                h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)

        nlstm = hiddens[-1]

        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

        feat_seq = utils.batch_to_seq(h, nenv, nsteps)
        mask_seq = utils.batch_to_seq(M, nenv, nsteps)

        # Pick the recurrent cell and its variable scope together.
        cell, cell_scope = (
            (utils.lnlstm, 'lnlstm') if layer_norm else (utils.lstm, 'lstm')
        )
        rnn_seq, snew = cell(feat_seq, mask_seq, S, scope=cell_scope, nh=nlstm)

        h = utils.seq_to_batch(rnn_seq)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        info = {
            'S': S,
            'M': M,
            'state': snew,
            'initial_state': initial_state,
        }
        return h, info

Пример #28

0

Показать файл

Файл: acer.py Проект: MrGoogol/baselines

def strip(var, nenvs, nsteps, flat = False):
    """Drop the last time step from a batch covering ``nsteps + 1`` steps.

    :param var: batch tensor to split into ``nsteps + 1`` steps
    :param nenvs: number of environments
    :param nsteps: number of steps kept in the result
    :param flat: forwarded to ``batch_to_seq`` and ``seq_to_batch``
    :return: the re-batched tensor without the final step
    """
    steps = batch_to_seq(var, nenvs, nsteps + 1, flat)
    kept = steps[:-1]
    return seq_to_batch(kept, flat)

Пример #29

0

Показать файл

def strip(var, nenvs, nsteps, flat=False):
    """Split ``var`` into ``nsteps`` steps and join them back into a batch.

    NOTE(review): unlike what the name suggests, no step is removed here; the
    sequence is re-batched in full. Confirm this is intentional.

    :param var: batch tensor to split
    :param nenvs: number of environments
    :param nsteps: number of steps to split into
    :param flat: forwarded to ``batch_to_seq`` and ``seq_to_batch``
    :return: the result of ``seq_to_batch`` on the full step list
    """
    steps = batch_to_seq(var, nenvs, nsteps, flat)
    return seq_to_batch(steps, flat)

Пример #30

0

Показать файл

Файл: base.py Проект: wwxFromTju/rl-generalization

    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False,
                 feature_mlp=True):
        """Build an LSTM actor-critic graph for discrete or continuous actions.

        An action space with an empty ``.shape`` is treated as discrete (the
        policy head has ``ac_space.n`` outputs); otherwise the head has
        ``ac_space.shape[0]`` outputs and a ``logstd`` variable is appended.

        :param sess: session used by ``step`` and ``value``
        :param ob_space: observation space; only ``.shape`` is read
        :param ac_space: action space, also handed to ``make_pdtype``
        :param nbatch: batch size of the X and M placeholders
        :param nsteps: steps per environment; ``nenv = nbatch // nsteps``
        :param nlstm: LSTM width, also used for the two feature layers
        :param reuse: forwarded to ``tf.variable_scope("model", ...)``
        :param feature_mlp: when true, two tanh fc layers precede the LSTM
        """
        nenv = nbatch // nsteps
        discrete = len(ac_space.shape) == 0
        # Width of the policy head for either kind of action space.
        pi_dim = ac_space.n if discrete else ac_space.shape[0]

        X = tf.placeholder(tf.float32, (nbatch, ) + ob_space.shape, name="Ob")
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh
            if feature_mlp:
                print("Using feature network in front of LSTM")
                hidden = activ(fc(X, "fc1", nh=nlstm, init_scale=np.sqrt(2)))
                hidden = activ(fc(hidden, "fc2", nh=nlstm, init_scale=np.sqrt(2)))
                lstm_input = hidden
            else:
                print("No feature network in front of LSTM")
                lstm_input = X
            xs = batch_to_seq(lstm_input, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            rnn_seq, snew = lstm(xs, ms, S, "lstm1", nh=nlstm)
            latent = seq_to_batch(rnn_seq)

            vf = fc(latent, "vf", 1)
            pi = fc(latent, "pi", pi_dim, init_scale=0.01)
            if not discrete:
                logstd = tf.get_variable(name="logstd",
                                         shape=[1, pi_dim],
                                         initializer=tf.zeros_initializer())

        self.pdtype = make_pdtype(ac_space)
        # Continuous case: `pi * 0.0 + logstd` expands logstd to pi's shape.
        flat_params = (
            pi if discrete else tf.concat([pi, pi * 0.0 + logstd], axis=1)
        )
        self.pd = self.pdtype.pdfromflat(flat_params)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run([a0, v0, snew, neglogp0], feed)

        def value(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run(v0, feed)

        self.X, self.M, self.S = X, M, S
        self.pi, self.vf = pi, vf
        self.step, self.value = step, value

Пример #31

0

Показать файл

Файл: policies.py Проект: ethanabrooks/baselines

    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 size_mem=256,
                 reuse=False):  # pylint: disable=W0613
        """Build an LSTM policy that feeds observations directly to the LSTM.

        Both the policy and the value head read the LSTM output. ``self.vf``
        is already sliced to its first column.

        :param sess: session used by ``step`` and ``value``
        :param ob_space: observation space; only ``.shape`` is read
        :param ac_space: action space; an empty ``.shape`` gives ``actdim = 1``
        :param nbatch: batch size of the X and M placeholders
        :param nsteps: steps per environment; ``nenv = nbatch // nsteps``
        :param size_mem: ``nh`` given to ``lstm``
        :param reuse: forwarded to ``tf.variable_scope("model", ...)``
        """
        ob_shape = (nbatch, ) + ob_space.shape
        actdim = 1 if ac_space.shape == () else ac_space.shape[0]
        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # obs

        nenv = nbatch // nsteps
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, size_mem * 2])  # states

        def identity(tensor):
            return tensor

        with tf.variable_scope("model", reuse=reuse):
            lstm_in = tf.cast(X, tf.float32)
            xs = batch_to_seq(lstm_in, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            rnn_seq, snew = lstm(xs, ms, S, 'lstm', nh=size_mem)
            h5 = seq_to_batch(rnn_seq)

            pi = fc(h5, 'pi', actdim, act=identity, init_scale=0.01)
            # NOTE(review): these two layers are created but their output is
            # never consumed; the value head below reads the LSTM output.
            # Kept so the set of graph variables stays the same; confirm intent.
            vf_hidden = fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
            vf_hidden = fc(vf_hidden, 'vf_fc2', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
            vf = fc(h5, 'vf', 1, act=identity)[:, 0]
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, actdim],
                                     initializer=tf.zeros_initializer())

        # `pi * 0.0 + logstd` expands logstd to the same shape as pi.
        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, size_mem * 2), dtype=np.float32)

        def step(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run([a0, vf, snew, neglogp0], feed)

        def value(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run(vf, feed)

        self.X, self.M, self.S = X, M, S
        self.pi, self.vf = pi, vf
        self.step, self.value = step, value

Пример #32

0

Показать файл

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, add_flownet,
                 reuse=False,
                 flownet=None, train_from_scratch=False,
                 recurrent=None,
                 large_cnn=False, nlstm=64, add_predicted_flow_to_vec=False, diff_frames=False):
        """Build an image + vector policy, optionally recurrent and flow-aware.

        Observations are dicts with keys ``"image"`` and ``"vector"`` (plus
        ``"last_image"`` when ``add_flownet`` is set); ``step`` and ``value``
        look these keys up through ``self.placeholder_dict``.

        :param sess: session used by ``step`` and ``value``
        :param ob_space: mapping with ``"vector"`` and ``"image"`` spaces
        :param ac_space: action space; ``.shape[0]`` sizes the policy head
        :param nbatch: batch size of all placeholders
        :param nsteps: steps per environment (used only when recurrent)
        :param add_flownet: adds a previous-image placeholder when true
        :param reuse: forwarded to ``tf.variable_scope("model", ...)``
        :param flownet: forwarded to ``mujoco_cnn`` / ``get_flow_vec``
        :param train_from_scratch: forwarded to ``mujoco_cnn`` / ``get_flow_vec``
        :param recurrent: falsy for a feed-forward net, else ``'lstm'`` or
            ``'lnlstm'``
        :param large_cnn: forwarded to ``mujoco_cnn`` / ``get_flow_vec``
        :param nlstm: recurrent width when ``recurrent`` is set
        :param add_predicted_flow_to_vec: concatenate ``get_flow_vec`` output
            onto the vector observation instead of using flow in the CNN
        :param diff_frames: forwarded to ``mujoco_cnn`` / ``get_flow_vec``
        """
        vec_shape = (nbatch,) + ob_space["vector"].shape
        nh, nw, nc = ob_space["image"].shape
        im_shape = (nbatch, nh, nw, nc)

        actdim = ac_space.shape[0]
        X_vec = tf.placeholder(tf.float32, vec_shape, name='Ob_vec')  # obs
        X_im = tf.placeholder(tf.uint8, im_shape, name='Ob_im')

        # Previous-image placeholder (obs t-1) exists only with flownet.
        X_p = tf.placeholder(tf.uint8, im_shape, name='Ob_p') if add_flownet else None

        if recurrent:
            nenv = nbatch // nsteps
            M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
            S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states

        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh
            cnn_uses_flow = add_flownet and not add_predicted_flow_to_vec
            h_im = mujoco_cnn(
                X_im, 'pi', nbatch, cnn_uses_flow,
                X_p, flownet,
                train_from_scratch,
                large_cnn, diff_frames)

            if add_predicted_flow_to_vec:
                flow_vec = get_flow_vec(
                    X_im, 'pi', nbatch, add_flownet,
                    X_p, flownet,
                    train_from_scratch,
                    large_cnn, diff_frames)
                vec_in = tf.concat([X_vec, flow_vec], axis=-1)
            else:
                vec_in = X_vec
            h_vec = activ(fc(vec_in, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            h1 = tf.concat([h_im, h_vec], 1)

            if recurrent:
                xs = batch_to_seq(h1, nenv, nsteps)
                ms = batch_to_seq(M, nenv, nsteps)
                if recurrent != 'lstm':
                    assert recurrent == 'lnlstm'
                    cell = lnlstm
                else:
                    cell = lstm
                rnn_seq, snew = cell(xs, ms, S, 'lstm1', nh=nlstm)
                h2 = seq_to_batch(rnn_seq)
            else:
                h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            pi = fc(h2, 'pi', actdim, init_scale=0.01)

            vf = fc(h2, 'vf', 1)
            logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                     initializer=tf.zeros_initializer())

        # `pi * 0.0 + logstd` expands logstd to the same shape as pi.
        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        a0_r = self.pd.mode()  # used instead of a0 when remove_noise is set
        neglogp0 = self.pd.neglogp(a0)
        if recurrent:
            self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        else:
            self.initial_state = None

        self.placeholder_dict = {
            "image": X_im,
            "vector": X_vec
        }
        if add_flownet:
            self.placeholder_dict["last_image"] = X_p

        def observation_feed(ob):
            # Pair every registered placeholder with the matching entry of ob.
            return {ph: ob[key] for key, ph in self.placeholder_dict.items()}

        if recurrent:
            def recurrent_feed(ob, state, mask):
                feed = observation_feed(ob)
                feed[S] = state
                feed[M] = mask
                return feed

            def step(ob, state, mask, remove_noise=False):
                action_op = a0_r if remove_noise else a0
                a, v, s, neglogp = sess.run(
                    [action_op, v0, snew, neglogp0],
                    feed_dict=recurrent_feed(ob, state, mask))
                return a, v, s, neglogp

            def value(ob, state, mask):
                return sess.run(v0, feed_dict=recurrent_feed(ob, state, mask))
        else:
            def step(ob, *_args, remove_noise=False, **_kwargs):
                action_op = a0_r if remove_noise else a0
                a, v, neglogp = sess.run(
                    [action_op, v0, neglogp0],
                    feed_dict=observation_feed(ob))
                # No recurrent state: initial_state (None) fills the state slot.
                return a, v, self.initial_state, neglogp

            def value(ob, *_args, **_kwargs):
                return sess.run(v0, feed_dict=observation_feed(ob))

        self.X_im = X_im
        self.X_vec = X_vec
        self.X_p = X_p
        self.pi = pi
        if recurrent:
            self.vf = vf
            self.M = M
            self.S = S
        else:
            # The feed-forward variant exposes the sliced value tensor.
            self.vf = v0
        self.step = step
        self.value = value

Пример #33

0

Показать файл

Файл: cnn_lstm.py Проект: zjucsphd/openai_acer

    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False):
        """Build a two-conv + LSTM network with policy and Q heads.

        LSTM and dense widths come from ``self.lstm_units`` and
        ``self.dense_units``, read after the parent constructor has run.

        :param sess: stored on ``self.sess``
        :param ob_space: observation space whose ``.shape`` unpacks to
            (height, width, channels)
        :param ac_space: action space; ``.n`` sizes both heads
        :param nenv: number of environments
        :param nsteps: steps per environment; ``nbatch = nenv * nsteps``
        :param nstack: multiplier applied to the input channel count
        :param reuse: forwarded to ``tf.variable_scope("model", ...)``
        """
        super().__init__(sess, ob_space, ac_space, nenv, nsteps, nstack,
                         reuse=reuse)
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        nact = ac_space.n
        nlstm = self.lstm_units

        obs_ph = tf.placeholder(tf.uint8, (nbatch, nh, nw, nc * nstack))  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

        # Settings shared by both convolution layers.
        conv_opts = dict(rf=3, stride=1, pad='SAME', init_scale=np.sqrt(2))

        with tf.variable_scope("model", reuse=reuse):
            # NOTE(review): X is the float cast, not the uint8 placeholder,
            # and that cast tensor is what ends up on self.X. Confirm that
            # callers are meant to feed the cast tensor.
            X = tf.cast(obs_ph, tf.float32)
            hidden = tf.nn.relu(conv(X, 'c1', nf=16, **conv_opts))
            hidden = tf.nn.relu(conv(hidden, 'c2', nf=32, **conv_opts))
            hidden = conv_to_fc(hidden)
            hidden = tf.nn.relu(
                fc(hidden, 'fc1', nh=self.dense_units, init_scale=np.sqrt(2)))

            # lstm
            xs = batch_to_seq(hidden, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            rnn_seq, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(rnn_seq)

            pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h5, 'q', nact)

        self.a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
        self.snew = snew
        self.X, self.M, self.S = X, M, S
        self.pi = pi  # softmax probabilities rather than logits
        self.q = q
        self.sess = sess

Пример #34

0

Показать файл

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):
        """Build a QMDP-style policy from a belief filter and a planner net.

        Each input row is split into an observation part (the first
        ``obs_len`` columns) and a previous-action part (the rest). The filter
        network updates the belief, the planner network turns beliefs into
        action values, and those drive both the policy and the value head.

        :param sess: session used by ``step`` and ``value``
        :param ob_space: observation space; ``.shape[0]`` minus ``ac_space.n``
            gives the observation length
        :param ac_space: action space; ``.n`` is the number of actions
        :param nbatch: batch size of the X and M placeholders
        :param nsteps: steps per environment; ``nenv = nbatch // nsteps``
        :param reuse: forwarded to ``tf.variable_scope("model", ...)``
        """
        nenv = nbatch // nsteps

        qmdp_param = {
            'obs_len': ob_space.shape[0] - ac_space.n,
            'num_action': ac_space.n,
            'num_state': 32,
            'num_obs': 17,
        }
        obs_len = qmdp_param["obs_len"]
        num_state = qmdp_param['num_state']

        self.pdtype = make_pdtype(ac_space)
        # Rows of X hold the observation followed by the previous action.
        X = tf.placeholder(tf.float32, (nbatch, ) + ob_space.shape)
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, num_state])  # beliefs

        with tf.variable_scope("model", reuse=reuse):
            # One entry per time step (per the original notes).
            step_inputs = batch_to_seq(X, nenv, nsteps)
            # Split every step into its observation and previous-action columns.
            obs = [x[:, 0:obs_len] for x in step_inputs]
            acts = [x[:, obs_len:] for x in step_inputs]
            ms = batch_to_seq(M, nenv, nsteps)

            # Build variables
            self.planner_net = PlannerNet("planner", qmdp_param)
            self.filter_net = FilterNet("filter", qmdp_param)

            # Belief history and the new belief after the sequence.
            belief_seq, snew = self.filter_net.beliefupdate(obs, acts, ms, S)
            Q = self.planner_net.VI(nbatch)

            # Action values from the batch-major beliefs.
            beliefs = seq_to_batch(belief_seq)
            q = self.planner_net.policy(Q, beliefs)

            self.pd, self.pi = self.pdtype.pdfromlatent(q)
            vf = fc(q, 'v', 1)  # critic value function

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        # Initial belief: every entry equals 1 / num_state.
        self.initial_state = np.ones(
            (nenv, num_state), dtype=np.float32) / num_state

        def step(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run([a0, v0, snew, neglogp0], feed)

        def value(ob, state, mask):
            feed = {X: ob, S: state, M: mask}
            return sess.run(v0, feed)

        self.X, self.M, self.S = X, M, S
        self.vf = vf
        self.step = step
        self.value = value

Пример #35

0

Показать файл

Файл: policies.py Проект: renly/baselines-rudder

    def __init__(self, tf_session, ob_space, ac_space, nbatch,
                 reward_redistribution_config, observation_network_config, lstm_network_config, training_config,
                 exploration_config, nsteps, nlstm=64, reuse=False):
        """LSTM policy network, as described in RUDDER paper
        
        Based on baselines.ppo2.policies.py; LSTM layer sees features from it's own trainable observation network and
        the features from the reward redistribution observation network;
        
        Parameters
        -------
        tf_session : tensorflow session
            tensorflow session to compute the graph in
        ob_space
            Baselines ob_space object (see ppo2_rudder.py); must provide .shape attribute for (x, y, c) shapes;
        ac_space
            Baselines ac_space object (see ppo2_rudder.py); must provide .n attribute for number of possible actions;
        nbatch : int
            Batchsize
        nsteps : int
            Fixed number of timesteps to process at once
        reward_redistribution_config : dict
            Dictionary containing config for reward redistribution:
            -----
            lambda_eligibility_trace : float
                Eligibility trace value for redistributed reward
            vf_contrib : float
                Weighting of original value function (vf) vs. redistributed reward (rr), s.t.
                :math:`reward = vf \cdot vf\_contrib + rr \cdot (1-vf\_contrib)`
            use_reward_redistribution_quality_threshold : float
                Quality of reward redistribution has to exceed use_reward_redistribution_quality_threshold to be used;
                use_reward_redistribution_quality_threshold range is [0,1]; Quality measure is the squared prediction
                error, as described in RUDDER paper;
            use_reward_redistribution : bool
                Use reward redistribution?
            rr_junksize : int
                Junksize for reward redistribution; Junks overlap by 1 half each
            cont_pred_w : float
                Weighting of continous prediciton loss vs. prediction loss of final return at last timestep
            intgrd_steps : int
                Stepsize for integrated gradients
            intgrd_batchsize : int
                Integrated gradients is computed batch-wise if intgrd_batchsize > 1
        observation_network_config : dict
            Dictionary containing config for observation network that processes observations and feeds them to LSTM
            network:
            -----
            show_states : bool
                Show frames to network?
            show_statedeltas : bool
                Show frame deltas to network?
            prepoc_states : list of dicts
                Network config to preprocess frames
            prepoc_deltas : list of dicts
                Network config to preprocess frame deltas
            prepoc_observations : list of dicts
                Network config to preprocess features from frame and frame-delta preprocessing networks
        lstm_network_config : dict
            Dictionary containing config for LSTM network:
            -----
            show_actions : bool
                Show taken actions to LSTM?
            reversed : bool
                Process game sequence in reversed order?
            layers : list of dicts
                Network config for LSTM network and optional additional dense layers
            initializations : dict
                Initialization config for LSTM network
            timestep_encoding : dict
                Set "max_value" and "triangle_span" for TeLL.utiltiy.misc_tensorflow.TriangularValueEncoding class
        training_config : dict
            Dictionary containing config for training and update procedure:
            -----
            n_no_rr_updates : int
                Number of updates to perform without training or using reward redistribution network
            n_pretrain_games : int
                Number of games to pretrain the reward redistribution network without using it;
            downscale_lr_policylag : bool
                Downscale learningrate permanently if policy lag gets too large?
            optimizer : tf.train optimizer
                Optimizer in tf.train, e.g. "AdamOptimizer"
            optimizer_params : dict
                Kwargs for optimizer
            l1 : float
                Weighting for l1 weight regularization
            l2 : float
                Weighting for l2 weight regularization
            clip_gradients : float
                Threshold for clipping gradients (clipping by norm)
        exploration_config : dict
            Dictionary containing config for exploration:
            -----
            sample_actions_from_softmax : bool
                True: Apply softmax to policy network output and use it as probabilities to pick an action
                False: Use the max. policy network output as action
            temporal_safe_exploration : bool
                User RUDDER safe exploration
            save_pi_threshold : float
                Threshold value in range [0,1] for safe actions in RUDDER safe exploration
        nlstm : int
            Number of LSTM units (=memory cells)
        reuse : bool
            Reuse tensorflow variables?
        """
        #
        # Shapes
        #
        nenv = nbatch // nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        seq_ob_shape = (nenv, -1, nh, nw, 1)
        nact = ac_space.n
        
        #
        # Placeholders for inputs
        #
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        
        #
        # Prepare input
        #
        single_frames = tf.cast(tf.reshape(X[..., -1:], shape=seq_ob_shape), dtype=tf.float32)
        delta_frames = single_frames - tf.cast(tf.reshape(X[..., -2:-1], shape=seq_ob_shape), dtype=tf.float32)
        
        #
        #  Get observation features from RR model
        #
        rr_model = RewardRedistributionModel(reward_redistribution_config=reward_redistribution_config,
                                             observation_network_config=observation_network_config,
                                             lstm_network_config=lstm_network_config, training_config=training_config,
                                             scopename="RR")
        self.rr_observation_model = rr_model
        rr_observation_layer = rr_model.get_visual_features(single_frame=single_frames, delta_frame=delta_frames,
                                                            additional_inputs=[])
        
        #
        #  Build policy network
        #
        with tf.variable_scope("model", reuse=reuse):
            temperature = tf.get_variable(initializer=tf.constant(1, dtype=tf.float32), trainable=False,
                                          name='temperature')
            
            additional_inputs = [StopGradientLayer(rr_observation_layer)]
            observation_layers, observation_features = observation_network(
                    single_frame=single_frames, delta_frame=delta_frames, additional_inputs=additional_inputs,
                    observation_network_config=observation_network_config)
            
            self.observation_features_shape = observation_features.get_output_shape()
            
            xs = [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps,
                                                       value=tf.reshape(observation_layers[-1].get_output(),
                                                                        [nenv, nsteps, -1]))]
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            h6 = h5
            pi = fc(h6, 'pi', nact)
            vf = fc(h6, 'v', 1)
        
        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)
        
        if exploration_config['sample_actions_from_softmax']:
            a0 = self.pd.sample_temp(temperature=temperature)
        else:
            a0 = tf.argmax(pi, axis=-1)
        
        v0 = vf[:, 0]
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        
        def step(ob, state, mask):
            a, v, s, neglogp = tf_session.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
            return a, v, s, neglogp
        
        def value(ob, state, mask):
            return tf_session.run(v0, {X:ob, S:state, M:mask})
        
        def action(ob, state, mask, *_args, **_kwargs):
            a, s, neglogp = tf_session.run([a0, snew, neglogp0], {X:ob, S:state, M:mask})
            return a, s, neglogp
        
        #
        # Placeholders for exploration
        #
        n_envs = pi.shape.as_list()[0]
        exploration_timesteps_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
        prev_actions_pl = tf.placeholder(dtype=tf.int64, shape=(n_envs,))
        gamelengths_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
        keep_prev_action_pl = tf.placeholder(dtype=tf.bool, shape=(n_envs,))
        prev_action_count_pl = tf.placeholder(dtype=tf.int64, shape=(n_envs,))
        exploration_durations_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
        
        #
        # Setting up safe exploration
        #
        explore = tf.logical_and(tf.logical_and(tf.less_equal(exploration_timesteps_pl, gamelengths_pl),
                                                tf.less_equal(gamelengths_pl,
                                                              exploration_timesteps_pl + exploration_durations_pl)),
                                 tf.not_equal(exploration_timesteps_pl, tf.constant(-1, dtype=tf.float32)))

        safe_pi = pi - tf.reduce_min(pi, axis=-1, keep_dims=True)
        safe_pi /= tf.reduce_max(safe_pi, axis=-1, keep_dims=True)
        save_pi_thresholds = (1 - (tf.expand_dims(tf.range(n_envs, dtype=tf.float32), axis=1)
                                   / (n_envs + (n_envs == 1) - 1)) * (1 - exploration_config['save_pi_threshold']))
        safe_pi = tf.cast(tf.greater_equal(safe_pi, save_pi_thresholds), dtype=tf.float32)
        safe_pi /= tf.reduce_sum(safe_pi)
        
        rand_safe_a = tf.multinomial(safe_pi, 1)[:, 0]
        
        safe_pi_flat = tf.reshape(safe_pi, (-1,))
        prev_action_is_safe = tf.gather(safe_pi_flat,
                                        prev_actions_pl + tf.range(safe_pi.shape.as_list()[0], dtype=tf.int64)
                                        * safe_pi.shape.as_list()[1])
        prev_action_is_safe = tf.greater(prev_action_is_safe, tf.constant(0, dtype=tf.float32))
        
        a_explore = tf.where(tf.logical_and(tf.logical_and(keep_prev_action_pl,
                                                           tf.not_equal(gamelengths_pl, exploration_timesteps_pl)),
                                            prev_action_is_safe),
                             prev_actions_pl, rand_safe_a)
        
        a_explore = tf.where(explore, a_explore, a0)
        
        # Make sure the actor doesn't repeat an action too often (otherwise screensaver might start)
        rand_a = tf.random_uniform(shape=a0.get_shape(), minval=0, maxval=ac_space.n, dtype=a0.dtype)
        a_explore = tf.where(tf.greater(prev_action_count_pl, tf.constant(20, dtype=tf.int64)), rand_a, a_explore)
        
        if not exploration_config['temporal_safe_exploration']:
            a_explore = a0
            
        neglogp_explore = self.pd.neglogp(a_explore)
        
        def action_exploration(ob, state, mask, *_args, exploration_timesteps, prev_actions, gamelengths,
                               keep_prev_action, prev_action_count, exploration_durations, **_kwargs):
            """Get actions with exploration for long-term reward"""
            a, s, neglogp = tf_session.run([a_explore, snew, neglogp_explore],
                                  {X: ob, S:state, M:mask, exploration_timesteps_pl: exploration_timesteps,
                                   prev_actions_pl: prev_actions,
                                   gamelengths_pl: gamelengths, exploration_durations_pl: exploration_durations,
                                   keep_prev_action_pl: keep_prev_action, prev_action_count_pl: prev_action_count})
            return a, s, neglogp
        
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
        self.action = action
        self.action_exploration = action_exploration
        self.seq_ob_shape = seq_ob_shape
        self.exploration_config = exploration_config

Пример #36

0

Показать файл

Файл: policy2.py Проект: Recharrs/NavRobot

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256,
                 reuse=False):
        """Build the image + instruction recurrent policy/value graph.

        The observation is passed through ``nature_cnn``; the instruction
        (5 integer tokens per sample) is embedded and folded through a GRU
        cell; the final GRU state is turned into a sigmoid gate that
        multiplies the CNN output. The gated features feed an LSTM whose
        output drives the policy and value heads.

        Parameters
        ----------
        sess :
            Session used by ``step``/``value`` to run the graph; also handed
            to ``self.var_summary`` when ``reuse`` is truthy.
        ob_space :
            Observation space; ``ob_space.shape`` must unpack into three
            entries.
        ac_space :
            Action space; ``ac_space.n`` is the width of the policy head.
        nbatch : int
            Leading dimension of the observation, instruction and mask
            placeholders.
        nsteps : int
            Sequence length; ``nbatch // nsteps`` is used as the number of
            environments.
        nlstm : int
            ``nh`` of the LSTM; the state placeholder is ``nlstm * 2`` wide.
        reuse : bool
            Forwarded to the ``"model"`` variable scope.
        """
        n_envs = nbatch // nsteps
        height, width, channels = ob_space.shape
        n_actions = ac_space.n

        def small_uniform():
            # Fresh initializer per use, exactly as many instances as before.
            return tf.random_uniform_initializer(-1e-3, 1e-3)

        # Graph inputs (created in the same order as the public attributes).
        obs_ph = tf.placeholder(tf.float32,
                                (nbatch, height, width, channels))  # obs
        inst_ph = tf.placeholder(tf.int32, [nbatch, 5])  # instruction tokens
        mask_ph = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        state_ph = tf.placeholder(tf.float32, [n_envs, nlstm * 2])  # states

        with tf.variable_scope("model", reuse=reuse):
            # Image branch.
            with tf.variable_scope("cnn"):
                image_rep = nature_cnn(obs_ph)

            # Instruction branch: embed each token column and step the GRU.
            with tf.variable_scope("GRU"):
                embedding_table = tf.get_variable(
                    'word_embedding',
                    shape=[12, 32],
                    initializer=small_uniform())
                cell = tf.contrib.rnn.GRUCell(
                    num_units=256,
                    kernel_initializer=small_uniform(),
                    bias_initializer=small_uniform())

                hidden = cell.zero_state(nbatch, dtype=tf.float32)
                for column in range(5):
                    token_vec = tf.nn.embedding_lookup(embedding_table,
                                                       inst_ph[:, column])
                    # Only the carried state is needed; the output is unused.
                    _, hidden = cell.call(token_vec, hidden)
                inst_rep = hidden

            # Sigmoid gate from the instruction, with two singleton axes
            # inserted (at 1 and 2) so it can multiply the CNN output.
            with tf.variable_scope("x-attn"):
                gate = tf.sigmoid(fc(inst_rep, 'x-attn', 64, init_scale=1.0))
                gate = tf.expand_dims(tf.expand_dims(gate, 1), 2)

            with tf.variable_scope("Gated-Attention"):
                fused = conv_to_fc(image_rep * gate)
                fused = tf.nn.relu(fc(fused, 'x-Ga', 256, init_scale=1.0))

            # Recurrent core.
            seq_inputs = batch_to_seq(fused, n_envs, nsteps)
            seq_masks = batch_to_seq(mask_ph, n_envs, nsteps)
            lstm_out, snew = lstm(seq_inputs, seq_masks, state_ph, 'lstm1',
                                  nh=nlstm, init_scale=1.0)
            lstm_out = seq_to_batch(lstm_out)

            # Output heads.
            with tf.variable_scope("pi"):
                pi = tf.layers.dense(
                    lstm_out, n_actions,
                    kernel_initializer=normalized_columns_initializer(0.01))
            with tf.variable_scope("vf"):
                vf = tf.layers.dense(
                    lstm_out, 1,
                    kernel_initializer=normalized_columns_initializer(0.01))

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((n_envs, nlstm * 2), dtype=np.float32)

        def step(ob, insts, state, mask):
            """Run one step: returns [action, value, new state, neglogp]."""
            feed = {obs_ph: ob, inst_ph: insts, state_ph: state, mask_ph: mask}
            return sess.run([a0, v0, snew, neglogp0], feed)

        def value(ob, insts, state, mask):
            """Evaluate only the value head for the given inputs."""
            feed = {obs_ph: ob, inst_ph: insts, state_ph: state, mask_ph: mask}
            return sess.run(v0, feed)

        # Public handles.
        self.X = obs_ph
        self.I = inst_ph
        self.M = mask_ph
        self.S = state_ph
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value

        # Variable summaries are only set up when the scope is reused.
        if reuse:
            self.var_summary('./Asset/logdir', sess)

Пример #37

0

Показать файл

    def __init__(self, sess, ob_space, ac_space, nenvs, nsteps, nlstm=256,
                 reuse=False, feature_mlp=True):
        """Build a single-trajectory LSTM policy/value graph.

        The graph is wired for one environment: sequences are split with a
        batch of 1 and the LSTM state placeholder has a single row.
        ``nenvs`` is accepted but not read here.

        Parameters
        ----------
        sess :
            Session used by ``step``/``value`` to run the graph.
        ob_space :
            Observation space; ``ob_space.shape`` is appended to the time
            dimension to form the observation placeholder shape.
        ac_space :
            Action space. An empty ``ac_space.shape`` is treated as discrete
            (``ac_space.n`` outputs); otherwise ``ac_space.shape[0]`` outputs
            plus a ``logstd`` variable are created.
        nenvs :
            Unused.
        nsteps : int or None
            Leading (time) dimension of the observation and mask
            placeholders; ``None`` leaves it unspecified.
        nlstm : int
            Width of the optional MLP layers and ``nh`` of the LSTM.
        reuse : bool
            Forwarded to the ``"model"`` variable scope.
        feature_mlp : bool
            If true, two tanh ``fc`` layers precede the LSTM.
        """
        # Both the fixed-length and the None case use nsteps verbatim as the
        # leading dimension, so no branching is needed.
        ob_shape = (nsteps,) + ob_space.shape
        mask_ph = tf.placeholder(tf.float32, [nsteps])

        # An empty action shape marks a discrete set of actions.
        discrete = len(ac_space.shape) == 0
        out_dim = ac_space.n if discrete else ac_space.shape[0]

        obs_ph = tf.placeholder(tf.float32, ob_shape, name="Ob")
        state_ph = tf.placeholder(tf.float32, [1, nlstm * 2])  # states

        with tf.variable_scope("model", reuse=reuse):
            features = obs_ph
            if feature_mlp:
                for layer_name in ("fc1", "fc2"):
                    features = tf.tanh(fc(features, layer_name, nh=nlstm,
                                          init_scale=np.sqrt(2)))

            seq_inputs = batch_to_seq(features, 1, nsteps)
            seq_masks = batch_to_seq(mask_ph, 1, nsteps)
            lstm_out, snew = lstm(seq_inputs, seq_masks, state_ph, "lstm1",
                                  nh=nlstm)
            lstm_out = seq_to_batch(lstm_out)

            vf = fc(lstm_out, "vf", 1)
            pi = fc(lstm_out, "pi", out_dim, init_scale=0.01)
            if not discrete:
                logstd = tf.get_variable(name="logstd",
                                         shape=[1, out_dim],
                                         initializer=tf.zeros_initializer())

        self.pdtype = make_pdtype(ac_space)
        if discrete:
            pd_params = pi
        else:
            # pi * 0.0 + logstd expands the single logstd row to pi's shape
            # before it is concatenated next to the means.
            pd_params = tf.concat([pi, pi * 0.0 + logstd], axis=1)
        self.pd = self.pdtype.pdfromflat(pd_params)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((1, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            """Run one step: returns [action, value, new state, neglogp]."""
            feed = {obs_ph: ob, state_ph: state, mask_ph: mask}
            return sess.run([a0, v0, snew, neglogp0], feed)

        def value(ob, state, mask):
            """Evaluate only the value head for the given inputs."""
            feed = {obs_ph: ob, state_ph: state, mask_ph: mask}
            return sess.run(v0, feed)

        # Public handles.
        self.X = obs_ph
        self.M = mask_ph
        self.S = state_ph
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value

Python seq_to_batch примеры использования