예제 #1
0
    def __init__(self,
                 name,
                 ob,
                 ac_space,
                 hid_size,
                 num_hid_layers,
                 num_subpolicies,
                 gaussian_fixed_var=True):
        """Build the master-policy graph: a value head plus a sub-policy selector.

        Two separate tanh MLPs are built on the normalized observation: one
        ends in a scalar value prediction (``self.vpred``), the other in
        ``num_subpolicies`` outputs (``self.selector``) that parameterize a
        categorical distribution (``self.pd``).

        Args:
            name: Variable scope under which all variables are created.
            ob: Observation tensor fed to the compiled functions. Only
                ``ob.get_shape()[1]`` is used to size the running statistics,
                so this presumably is a (batch, features) placeholder --
                TODO confirm with the caller.
            ac_space: Not used by this constructor.
            hid_size: Width of every hidden layer.
            num_hid_layers: Number of tanh hidden layers in each network.
            num_subpolicies: Number of selector outputs / categories.
            gaussian_fixed_var: Stored on the instance; not otherwise used
                here.
        """
        self.hid_size = hid_size
        self.num_hid_layers = num_hid_layers
        self.num_subpolicies = num_subpolicies
        self.gaussian_fixed_var = gaussian_fixed_var

        def tanh_stack(inputs, layer_name_fmt):
            """Apply ``num_hid_layers`` tanh dense layers named via ``layer_name_fmt``."""
            out = inputs
            # Layer names are 1-based: e.g. "vffc1", "vffc2", ...
            for layer in range(1, num_hid_layers + 1):
                out = tf.nn.tanh(
                    U.dense(out,
                            hid_size,
                            layer_name_fmt % layer,
                            weight_init=U.normc_initializer(1.0)))
            return out

        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1], ))
            # Standardize with the running mean/std, then clip to [-5, 5].
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)

            # value function
            self.vpred = U.dense(tanh_stack(obz, "vffc%i"),
                                 1,
                                 "vffinal",
                                 weight_init=U.normc_initializer(1.0))[:, 0]

            # master policy
            self.selector = U.dense(tanh_stack(obz, "masterpol%i"),
                                    num_subpolicies, "masterpol_final",
                                    U.normc_initializer(0.01))
            self.pdtype = pdtype = CategoricalPdType(num_subpolicies)
            self.pd = pdtype.pdfromflat(self.selector)

        # sample actions: `stochastic` switches between sampling and the mode
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        # debug: expose the raw selector outputs next to the chosen action
        self._debug = U.function([stochastic, ob], [ac, self.selector])
        # `self.selector` is listed as an input here, so the caller supplies
        # the selector values directly instead of having them computed.
        self._act_forced = U.function([stochastic, ob, self.selector],
                                      [ac, self.vpred])
예제 #2
0
    def __init__(self,
                 name,
                 ob,
                 ac_space,
                 network='mlp',
                 gaussian_fixed_var=True,
                 nsteps=None,
                 nbatch=None,
                 nlstm=256,
                 states=None,
                 masks=None,
                 reuse=False):
        """Build a policy graph using an MLP, CNN or LSTM body.

        The observation is normalized with running statistics, passed to the
        builder selected by ``network``, and an action-sampling function is
        compiled into ``self._act``. The builders (``_mlp``, ``_cnn``,
        ``_lstm``) are expected to set ``self.pd`` and ``self.vpred`` (and
        ``self.snew`` for the LSTM), since those attributes are read below.

        Args:
            name: Variable scope under which all variables are created.
            ob: Observation tensor; every dimension after the first is used
                as the shape of the running statistics.
            ac_space: Action space forwarded to the network builder.
            network: One of ``'mlp'``, ``'cnn'`` or ``'lstm'``.
            gaussian_fixed_var: Forwarded to the MLP/CNN builders.
            nsteps: Positive int, required when ``network == 'lstm'``.
            nbatch: Positive int, required when ``network == 'lstm'``.
            nlstm: Forwarded to ``_lstm``.
            states: Required for ``'lstm'``; also an input of ``self._act``.
            masks: Required for ``'lstm'``; also an input of ``self._act``.
            reuse: Passed to ``tf.variable_scope`` for the ``name`` scope.

        Raises:
            ValueError: If ``network`` is not a supported architecture.
            AssertionError: If the LSTM-specific arguments are missing or
                invalid when ``network == 'lstm'``.
        """
        # Fail fast: previously an unknown value built part of the graph and
        # then crashed with an unrelated AttributeError on `self.pd`.
        if network not in ('mlp', 'cnn', 'lstm'):
            raise ValueError("unsupported network type: %r" % (network, ))
        self.network = network

        # Per-sample observation shape: all dimensions of `ob` but the first.
        shape = [ob.shape[d] for d in range(1, len(ob.shape))]

        with tf.variable_scope(name, reuse=reuse):
            self.scope = tf.get_variable_scope().name

            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=shape)
            # Standardize with the running mean/std, then clip to [-5, 5].
            obs = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)

            if network == 'mlp':
                # The MLP size is fixed here rather than taken from arguments.
                hid_size = 64
                num_hid_layers = 2
                self.hid_size = hid_size
                self.num_hid_layers = num_hid_layers
                self.gaussian_fixed_var = gaussian_fixed_var
                self._mlp(obs, hid_size, num_hid_layers, ac_space,
                          gaussian_fixed_var)
            elif network == 'cnn':
                self._cnn(obs, ac_space, gaussian_fixed_var)
            else:  # 'lstm' (validated above)
                assert nsteps is not None and nbatch is not None
                assert states is not None and masks is not None
                assert isinstance(nsteps, int) and isinstance(nbatch, int)
                assert nsteps > 0 and nbatch > 0
                self._lstm(obs, states, masks, nlstm, ac_space, nbatch, nsteps)

        # sample actions: `stochastic` switches between sampling and the mode
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        if network == 'lstm':
            # The recurrent variant takes states/masks as extra inputs and
            # additionally returns `self.snew`.
            self._act = U.function([stochastic, ob, states, masks],
                                   [ac, self.vpred, self.snew])
        else:
            self._act = U.function([stochastic, ob], [ac, self.vpred])
예제 #3
0
    def __init__(self,
                 name,
                 ob,
                 ac_space,
                 num_subpolicies,
                 network='mlp',
                 gaussian_fixed_var=True):
        """Build a master-policy graph with an MLP or CNN body.

        The observation is normalized with running statistics and passed to
        the builder selected by ``network``. The builders (``_mlp``,
        ``_cnn``) are expected to set ``self.pd``, ``self.vpred`` and
        ``self.selector``, since those attributes are read below.

        Args:
            name: Variable scope for the network variables.
            ob: Observation tensor; every dimension after the first is used
                as the shape of the running statistics.
            ac_space: Action space forwarded to ``_mlp``.
            num_subpolicies: Forwarded to the network builder and stored on
                the instance.
            network: Either ``'mlp'`` or ``'cnn'``.
            gaussian_fixed_var: Stored on the instance and forwarded to
                ``_mlp``.

        Raises:
            ValueError: If ``network`` is not a supported architecture.
        """
        # Fail fast: previously an unknown value built part of the graph and
        # then crashed with an unrelated AttributeError on `self.pd`.
        if network not in ('mlp', 'cnn'):
            raise ValueError("unsupported network type: %r" % (network, ))

        self.num_subpolicies = num_subpolicies
        self.gaussian_fixed_var = gaussian_fixed_var

        # Per-sample observation shape: all dimensions of `ob` but the first.
        shape = [ob.shape[d] for d in range(1, len(ob.shape))]

        # The filter scope is opened outside the `name` scope with
        # AUTO_REUSE. NOTE(review): presumably so that several policies share
        # one set of observation statistics -- confirm against RunningMeanStd.
        with tf.variable_scope("obfilter", reuse=tf.AUTO_REUSE):
            self.ob_rms = RunningMeanStd(shape=shape)
        # Standardize with the running mean/std, then clip to [-5, 5].
        obs = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)

        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name

            if network == 'mlp':
                # The MLP size is fixed here rather than taken from arguments.
                hid_size = 64
                num_hid_layers = 2
                self.hid_size = hid_size
                self.num_hid_layers = num_hid_layers
                self._mlp(obs, num_subpolicies, hid_size, num_hid_layers,
                          ac_space, gaussian_fixed_var)
            else:  # 'cnn' (validated above)
                self._cnn(obs, num_subpolicies)

        # sample actions: `stochastic` switches between sampling and the mode
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        # debug: expose the raw selector outputs next to the chosen action
        self._debug = U.function([stochastic, ob], [ac, self.selector])
        # `self.selector` is listed as an input here, so the caller supplies
        # the selector values directly instead of having them computed.
        self._act_forced = U.function([stochastic, ob, self.selector],
                                      [ac, self.vpred])
예제 #4
0
    def __init__(self, name, ob, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        """Build the sub-policy graph: a value head and an action distribution.

        Two separate tanh MLPs are built on the normalized observation: one
        ends in a scalar value prediction (``self.vpred``), the other in the
        flat distribution parameters (``self.pdparam``) for the distribution
        type returned by ``make_pdtype(ac_space)``.

        Args:
            name: Variable scope under which all variables are created.
            ob: Observation tensor fed to the compiled function. Only
                ``ob.get_shape()[1]`` is used to size the running statistics,
                so this presumably is a (batch, features) placeholder --
                TODO confirm with the caller.
            ac_space: Action space passed to ``make_pdtype``.
            hid_size: Width of every hidden layer.
            num_hid_layers: Number of tanh hidden layers in each network.
            gaussian_fixed_var: When true and ``ac_space`` is a
                ``gym.spaces.Box``, the second half of the distribution
                parameters comes from a standalone ``logstd`` variable rather
                than from the network output.
        """
        self.hid_size = hid_size
        self.num_hid_layers = num_hid_layers
        self.gaussian_fixed_var = gaussian_fixed_var

        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name

            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1],))
            # Standardize with the running mean/std, then clip to [-5, 5].
            centered = ob - self.ob_rms.mean
            standardized = centered / self.ob_rms.std
            obz = tf.clip_by_value(standardized, -5.0, 5.0)

            # value function
            value_hidden = obz
            for layer in range(1, num_hid_layers + 1):
                pre_activation = U.dense(value_hidden, hid_size, "vffc%i" % layer,
                                         weight_init=U.normc_initializer(1.0))
                value_hidden = tf.nn.tanh(pre_activation)
            value_out = U.dense(value_hidden, 1, "vffinal",
                                weight_init=U.normc_initializer(1.0))
            self.vpred = value_out[:, 0]

            # sub policy
            pdtype = make_pdtype(ac_space)
            self.pdtype = pdtype
            policy_hidden = obz
            for layer in range(1, num_hid_layers + 1):
                pre_activation = U.dense(policy_hidden, hid_size, "pol%i" % layer,
                                         weight_init=U.normc_initializer(1.0))
                policy_hidden = tf.nn.tanh(pre_activation)

            use_fixed_var = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
            if use_fixed_var:
                # Half of the flat parameters are produced by the network; the
                # other half is a single zero-initialized `logstd` row.
                half = pdtype.param_shape()[0] // 2
                mean = U.dense(policy_hidden, half, "polfinal", U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, half],
                                         initializer=tf.zeros_initializer())
                # `mean * 0.0 + logstd` expands logstd to the shape of `mean`.
                tiled_logstd = mean * 0.0 + logstd
                self.pdparam = U.concatenate([mean, tiled_logstd], axis=1)
            else:
                self.pdparam = U.dense(policy_hidden, pdtype.param_shape()[0],
                                       "polfinal", U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(self.pdparam)

        # sample actions: `stochastic` switches between sampling and the mode
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        sampled = self.pd.sample()
        greedy = self.pd.mode()
        ac = U.switch(stochastic, sampled, greedy)
        self._act = U.function([stochastic, ob], [ac, self.vpred])
예제 #5
0
    def __init__(self,
                 name,
                 ob,
                 ac_space,
                 hid_size,
                 num_hid_layers,
                 num_subpolicies,
                 gaussian_fixed_var=True):
        """Build the convolutional master-policy graph.

        A three-layer convolutional trunk followed by a 512-unit dense layer
        is built on ``ob / 255.0``. Both the scalar value prediction
        (``self.vpred``) and the ``num_subpolicies`` selector outputs
        (``self.selector``) are computed from that shared trunk output.

        Args:
            name: Variable scope under which all variables are created.
            ob: Observation tensor fed to the compiled functions; presumably a
                batch of 8-bit images given the 1/255 scaling and conv layers
                -- TODO confirm with the caller.
            ac_space: Not used by this constructor.
            hid_size: Stored on the instance; not otherwise used here.
            num_hid_layers: Stored on the instance; not otherwise used here.
            num_subpolicies: Number of selector outputs / categories.
            gaussian_fixed_var: Stored on the instance; not otherwise used
                here.
        """
        self.hid_size = hid_size
        self.num_hid_layers = num_hid_layers
        self.num_subpolicies = num_subpolicies
        self.gaussian_fixed_var = gaussian_fixed_var

        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name
            with tf.variable_scope("obfilter"):
                # The running statistics are created but not used to normalize
                # the input below.
                # NOTE(review): the shape uses only ob's second dimension; if
                # `ob` is an image batch this is not the per-sample shape --
                # confirm whether ob_rms is ever updated for this policy.
                self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1], ))
            # Observations are scaled by 1/255 instead of being standardized.
            obz = ob / 255.0

            # Conv2d trunk shared by the value function and the master policy
            features = tf.nn.relu(
                U.conv2d(obz, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            features = tf.nn.relu(
                U.conv2d(features, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            features = tf.nn.relu(
                U.conv2d(features, 32, "l3", [3, 3], [1, 1], pad="VALID"))
            features = U.flattenallbut0(features)
            features = tf.nn.relu(
                tf.layers.dense(features,
                                512,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))

            # value function
            self.vpred = U.dense(features,
                                 1,
                                 "vffinal",
                                 weight_init=U.normc_initializer(1.0))[:, 0]

            # master policy
            self.selector = U.dense(features,
                                    num_subpolicies, "masterpol_final",
                                    U.normc_initializer(0.01))
            self.pdtype = pdtype = CategoricalPdType(num_subpolicies)
            self.pd = pdtype.pdfromflat(self.selector)

        # sample actions: `stochastic` switches between sampling and the mode
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        # debug: expose the raw selector outputs next to the chosen action
        self._debug = U.function([stochastic, ob], [ac, self.selector])
        # `self.selector` is listed as an input here, so the caller supplies
        # the selector values directly instead of having them computed.
        self._act_forced = U.function([stochastic, ob, self.selector],
                                      [ac, self.vpred])