Example #1
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  #pylint: disable=W0613
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  #obs
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            pi = fc(h, 'pi', nact, init_scale=0.01)
            vf = fc(h, 'v', 1)[:, 0]

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
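
These snippets assume numpy, TensorFlow 1.x, and OpenAI-baselines-style helpers (fc, nature_cnn, lstm, batch_to_seq, seq_to_batch, make_pdtype) are importable; the exact module paths vary across baselines versions. For a Discrete action space like the one above, I assume make_pdtype yields a categorical distribution over the `pi` logits. A toy, self-contained numpy sketch of that sampling and neglogp (shapes and values are made up):

import numpy as np

logits = np.array([[1.0, 0.5, -0.5, 0.0]])                 # stands in for `pi`: (nbatch, nact) logits
u = np.random.uniform(size=logits.shape)
a = np.argmax(logits - np.log(-np.log(u)), axis=-1)        # Gumbel-max trick: samples ~ softmax(logits)

z = logits - logits.max(axis=-1, keepdims=True)            # stabilized logits
logp = z - np.log(np.exp(z).sum(axis=-1, keepdims=True))   # log-softmax
neglogp = -logp[np.arange(len(a)), a]                      # negative log-probability of the sampled action
print(a, neglogp)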
Example #2
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **kwargs): #pylint: disable=W0613
        self.width = kwargs['mlp_width']
        ob_shape = (nbatch,) + ob_space.shape
        actdim = ac_space.shape[0]
        X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
        with tf.variable_scope(TF_VAR_SCOPE, reuse=reuse):
            activ = tf.tanh
            # activ = tf.nn.relu  # Diverges even at super low learning rates
            # activ = tf.nn.leaky_relu  # Diverges
            p_h1 = activ(fc(X, 'pi_fc1', nh=self.width, init_scale=np.sqrt(2)))
            p_h2 = activ(fc(p_h1, 'pi_fc2', nh=self.width, init_scale=np.sqrt(2)))
            pi = fc(p_h2, 'pi', actdim, init_scale=0.01)
            v_h1 = activ(fc(X, 'vf_fc1', nh=self.width, init_scale=np.sqrt(2)))
            v_h2 = activ(fc(v_h1, 'vf_fc2', nh=self.width, init_scale=np.sqrt(2)))
            vf = fc(v_h2, 'vf', 1)[:, 0]
            logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                     initializer=tf.zeros_initializer())

        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.p_h1 = p_h1
        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)


        # One idea to benefit from SL-learned actions would be to average the SL action with the one drawn here,
        # where the weight of the SL action decays over time. That way you are still optimizing with respect to the
        # parameters, but there is a nudge towards taking the correct action, which helps narrow down the exploration
        # space. As you decay the SL weight, you offer a chance to learn actions better than SL. This needs to be
        # balanced with decaying other hyperparameters (learning rate, etc.), though.
        a0 = self.pd.sample()

        neglogp0 = self.pd.neglogp(a0)
        action_probs0 = tf.exp(-neglogp0)

        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp, action_probs = sess.run([a0, vf, neglogp0, action_probs0], {X: ob})

            # For deepdrive we expect outputs to be between -1 and 1.
            # a = np.tanh(a)  # Doesn't work very well; clipping and letting the network learn the valid range is probably better.

            return a, v, self.initial_state, neglogp, action_probs

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.action_probs0 = action_probs0
        self.vf = vf
        self.step = step
        self.value = value
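
Because the action space here is continuous, action_probs0 = tf.exp(-neglogp0) is a probability density rather than a probability (it can exceed 1). A minimal numpy sketch of the diagonal-Gaussian neglogp I assume self.pd computes from pdparam = [mean | logstd] (values below are made up):

import numpy as np

def diag_gaussian_neglogp(x, mean, logstd):
    # negative log-density of N(mean, diag(exp(logstd))^2), summed over action dimensions
    std = np.exp(logstd)
    return (0.5 * np.sum(np.square((x - mean) / std), axis=-1)
            + 0.5 * np.log(2.0 * np.pi) * x.shape[-1]
            + np.sum(logstd, axis=-1))

mean = np.array([[0.1, -0.2]], dtype=np.float32)                 # the `pi` head: (nbatch, actdim) means
logstd = np.zeros((1, 2), dtype=np.float32)                      # state-independent log std
action = mean + np.exp(logstd) * np.random.randn(*mean.shape)    # a0 = mean + std * N(0, I)
print(np.exp(-diag_gaussian_neglogp(action, mean, logstd)))      # analogue of action_probs0: a density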
Example #3
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False):
        nenv = nbatch // nsteps

        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  #obs
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact)
            vf = fc(h5, 'v', 1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
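
The recurrent plumbing above depends on how the flat batch of nbatch = nenv * nsteps rows is turned into a per-timestep sequence and back. A toy numpy sketch of the layout I assume batch_to_seq and seq_to_batch rely on (env-major batches; the real helpers operate on TF tensors):

import numpy as np

nenv, nsteps, d = 2, 3, 4
h = np.arange(nenv * nsteps * d).reshape(nenv * nsteps, d)   # (nbatch, d), each env's nsteps rows contiguous

# batch_to_seq: one (nenv, d) array per timestep
xs = [x.squeeze(axis=1) for x in np.split(h.reshape(nenv, nsteps, d), nsteps, axis=1)]

# seq_to_batch: stitch the per-timestep arrays back into a single (nbatch, d) batch
h_back = np.stack(xs, axis=1).reshape(nenv * nsteps, d)
assert np.array_equal(h, h_back)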
Example #4
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
        ob_shape = (nbatch, ob_space.shape[0])
        if type(ac_space) is spaces.Box:
            nact = ac_space.shape[0]
        else:
            nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope(TF_VAR_SCOPE, reuse=reuse):
            h = tf.cast(X, tf.float32)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact)
            vf = fc(h5, 'v', 1)
            if type(ac_space) is spaces.Box:
                logstd = tf.get_variable(name="logstd", shape=[1, nact],
                                         initializer=tf.zeros_initializer())
            else:
                logstd = None

        if logstd is not None:
            pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
        else:
            pdparam = pi
        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
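
Unlike the feed-forward examples, step and value here must be fed the running LSTM state S and the done mask M, and self.initial_state is a (nenv, nlstm * 2) zero array presumably packing the LSTM cell and hidden vectors. A tiny numpy sketch of the mask semantics I assume the lstm helper implements, namely zeroing the state of any env whose previous step ended an episode:

import numpy as np

state = np.ones((2, 8), dtype=np.float32)        # (nenv, nlstm * 2) running LSTM state (toy values)
mask = np.array([0.0, 1.0], dtype=np.float32)    # env 1 just finished an episode
state = state * (1.0 - mask)[:, None]            # env 1's state is reset to zeros; env 0's is kept
print(state)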
Example #5
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  #pylint: disable=W0613
        ob_shape = (nbatch, ) + ob_space.shape
        actdim = ac_space.shape[0]
        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  #obs
        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh
            h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            pi = fc(h2, 'pi', actdim, init_scale=0.01)
            h1 = activ(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(h2, 'vf', 1)[:, 0]
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, actdim],
                                     initializer=tf.zeros_initializer())

        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
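
The pi * 0.0 + logstd idiom above just broadcasts the single (1, actdim) logstd row across the batch, so every row of pdparam carries [mean | logstd] for the diagonal Gaussian. A tiny numpy sketch of that construction (shapes and values are made up):

import numpy as np

pi = np.array([[0.1, -0.2], [0.3, 0.4]], dtype=np.float32)   # (nbatch, actdim) action means
logstd = np.zeros((1, 2), dtype=np.float32)                  # (1, actdim) learned log std
pdparam = np.concatenate([pi, pi * 0.0 + logstd], axis=1)    # (nbatch, 2 * actdim): [mean | logstd] per row
print(pdparam)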