Exemplo n.º 1
0
    def _mlp(self, obs, num_subpolicies, hid_size, num_hid_layers, ac_space,
             gaussian_fixed_var):
        """Build the value head and the master (selector) head on ``obs``.

        Two independent stacks of ``num_hid_layers`` ``U.dense`` layers of
        size ``hid_size``, each followed by ``tanh``, are built from ``obs``.
        One feeds a single-unit value output, the other feeds
        ``num_subpolicies`` selector outputs.

        Sets ``self.vpred``, ``self.selector``, ``self.pdtype`` and
        ``self.pd``. ``ac_space`` and ``gaussian_fixed_var`` are accepted but
        not used by this method.
        """
        # Value tower: layers "vffc1" .. "vffcN", then "vffinal".
        value_hidden = obs
        for layer_no in range(1, num_hid_layers + 1):
            value_pre = U.dense(value_hidden,
                                hid_size,
                                "vffc%i" % layer_no,
                                weight_init=U.normc_initializer(1.0))
            value_hidden = tf.nn.tanh(value_pre)
        value_out = U.dense(value_hidden,
                            1,
                            "vffinal",
                            weight_init=U.normc_initializer(1.0))
        # Index column 0 of the single-unit output.
        self.vpred = value_out[:, 0]

        # Master tower: layers "masterpol1" .. "masterpolN", then
        # "masterpol_final" with one output per sub-policy.
        master_hidden = obs
        for layer_no in range(1, num_hid_layers + 1):
            master_pre = U.dense(master_hidden,
                                 hid_size,
                                 "masterpol%i" % layer_no,
                                 weight_init=U.normc_initializer(1.0))
            master_hidden = tf.nn.tanh(master_pre)
        self.selector = U.dense(master_hidden, num_subpolicies,
                                "masterpol_final", U.normc_initializer(0.01))

        # Categorical distribution over sub-policies, parameterised by the
        # selector output.
        pdtype = CategoricalPdType(num_subpolicies)
        self.pdtype = pdtype
        self.pd = pdtype.pdfromflat(self.selector)
Exemplo n.º 2
0
    def __init__(self,
                 name,
                 ob,
                 ac_space,
                 hid_size,
                 num_hid_layers,
                 num_subpolicies,
                 gaussian_fixed_var=True):
        """Build the master policy: a value head and a sub-policy selector.

        Args:
            name: Variable-scope name under which the network is created.
            ob: Observation input. ``ob.get_shape()[1]`` sizes the running
                mean/std filter, and ``ob`` is an input of the compiled
                functions.
            ac_space: Accepted but not used by this constructor.
            hid_size: Size passed to every hidden ``U.dense`` layer.
            num_hid_layers: Number of hidden layers in each of the two stacks.
            num_subpolicies: Number of selector outputs / categories.
            gaussian_fixed_var: Stored on the instance; not otherwise used
                here.
        """
        self.hid_size = hid_size
        self.num_hid_layers = num_hid_layers
        self.num_subpolicies = num_subpolicies
        self.gaussian_fixed_var = gaussian_fixed_var

        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1], ))
            # Normalise observations with the running statistics and clip
            # the result to [-5, 5].
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)

            # value function
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    U.dense(last_out,
                            hid_size,
                            "vffc%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0)))
            self.vpred = U.dense(last_out,
                                 1,
                                 "vffinal",
                                 weight_init=U.normc_initializer(1.0))[:, 0]

            # master policy (separate hidden stack, also fed from obz)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    U.dense(last_out,
                            hid_size,
                            "masterpol%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0)))
            self.selector = U.dense(last_out,
                                    num_subpolicies, "masterpol_final",
                                    U.normc_initializer(0.01))
            self.pdtype = pdtype = CategoricalPdType(num_subpolicies)
            self.pd = pdtype.pdfromflat(self.selector)

        # sample actions: `stochastic` switches between pd.sample() and
        # pd.mode()
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        # debug: same inputs as _act, but returns the selector output instead
        # of the value prediction
        self._debug = U.function([stochastic, ob], [ac, self.selector])
        # Lists self.selector as an input; presumably this lets a caller feed
        # the selector values directly -- TODO confirm against U.function.
        self._act_forced = U.function([stochastic, ob, self.selector],
                                      [ac, self.vpred])
Exemplo n.º 3
0
 def _cnn(self, obs, num_subpolicies):
     """Build the value and selector heads on top of ``feature_net(obs)``.

     Both heads read the same feature output. Sets ``self.vpred``,
     ``self.selector``, ``self.pdtype`` and ``self.pd``.
     """
     shared = feature_net(obs)

     # Single-unit value head; index column 0 of its output.
     value_out = U.dense(shared,
                         1,
                         "vffinal",
                         weight_init=U.normc_initializer(1.0))
     self.vpred = value_out[:, 0]

     # Selector head with one output per sub-policy.
     self.selector = U.dense(shared, num_subpolicies, "masterpol_final",
                             U.normc_initializer(0.01))

     # Categorical distribution parameterised by the selector output.
     pdtype = CategoricalPdType(num_subpolicies)
     self.pdtype = pdtype
     self.pd = pdtype.pdfromflat(self.selector)
Exemplo n.º 4
0
    def __init__(self,
                 name,
                 ob,
                 hid_size,
                 num_hid_layers,
                 gaussian_fixed_var=True):
        """Build a value-only network under variable scope ``name``.

        Args:
            name: Variable-scope name under which the network is created.
            ob: Observation input; also the single input of ``self._act``.
            hid_size: Size passed to every hidden ``U.dense`` layer.
            num_hid_layers: Number of hidden ``U.dense`` + ``tanh`` layers.
            gaussian_fixed_var: Stored on the instance; not otherwise used
                here.
        """
        self.hid_size = hid_size
        self.num_hid_layers = num_hid_layers
        self.gaussian_fixed_var = gaussian_fixed_var

        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name
            with tf.variable_scope("obfilter"):
                # The filter is sized from the non-leading dimensions of `ob`.
                # NOTE(review): for rank 3 and 4 the shape is passed as a bare
                # product, not a 1-tuple as in the rank-2 case, and no filter
                # is created for any other rank -- confirm this is intended.
                rank = len(ob.shape)
                if rank == 2:
                    self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1], ))
                elif rank in (3, 4):
                    dims = ob.get_shape()
                    flat_size = dims[1] * dims[2]
                    if rank == 4:
                        flat_size = flat_size * dims[3]
                    self.ob_rms = RunningMeanStd(shape=flat_size)

            # Observation normalisation is not applied here: the raw
            # observation is fed straight into the network.
            hidden = ob

            # Value function: layers "vffc1" .. "vffcN", then "vffinal".
            for layer_no in range(1, num_hid_layers + 1):
                pre_activation = U.dense(hidden,
                                         hid_size,
                                         "vffc%i" % layer_no,
                                         weight_init=U.normc_initializer(1.0))
                hidden = tf.nn.tanh(pre_activation)
            raw_value = U.dense(hidden,
                                1,
                                "vffinal",
                                weight_init=U.normc_initializer(1.0))[:, 0]
            # The value output is passed through U.sum and then clipped to
            # [0, 1000].
            self.vpred = tf.clip_by_value(U.sum(raw_value), 0.0, 1000.0)

        # Compiled function mapping an observation to the value prediction.
        self._act = U.function([ob], [self.vpred])
Exemplo n.º 5
0
    def _cnn(self, obs, ac_space, gaussian_fixed_var):
        """Build the value and action-distribution heads on ``feature_net(obs)``.

        Sets ``self.vpred``, ``self.pdtype``, ``self.pdparam`` and ``self.pd``.
        When ``gaussian_fixed_var`` is truthy and ``ac_space`` is a
        ``gym.spaces.Box``, the second half of ``pdparam`` comes from a
        standalone ``logstd`` variable rather than from the network.
        """
        shared = feature_net(obs)

        # Single-unit value head; index column 0 of its output.
        value_out = U.dense(shared,
                            1,
                            "vffinal",
                            weight_init=U.normc_initializer(1.0))
        self.vpred = value_out[:, 0]

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype

        fixed_var = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
        if not fixed_var:
            # All distribution parameters come from one dense layer.
            self.pdparam = U.dense(shared,
                                   pdtype.param_shape()[0], "polfinal",
                                   U.normc_initializer(0.01))
        else:
            # Half of the parameter vector is the network-produced mean; the
            # other half is a zero-initialised variable of shape [1, half].
            half = pdtype.param_shape()[0] // 2
            mean = U.dense(shared, half, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, half],
                                     initializer=tf.zeros_initializer())
            # `mean * 0.0 + logstd` presumably broadcasts logstd to the shape
            # of `mean` before concatenation -- TODO confirm.
            self.pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        self.pd = pdtype.pdfromflat(self.pdparam)
Exemplo n.º 6
0
    def __init__(self, name, ob):
        """Build a convolutional feature extractor under scope ``name``.

        The observation is normalised with a running mean/std filter, clipped
        to [-5, 5], passed through two ``U.conv2d`` + ReLU layers, flattened,
        and passed through a 64-unit ``U.dense`` + ReLU layer. The result is
        stored in ``self.ob``.
        """
        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name

            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1],))

            # Normalise with the running statistics, then clip.
            centered = ob - self.ob_rms.mean
            scaled = centered / self.ob_rms.std
            obz = tf.clip_by_value(scaled, -5.0, 5.0)

            # Two VALID-padded conv layers: "l1" then "l2".
            conv1 = U.conv2d(obz, 16, "l1", [8, 8], [4, 4], pad="VALID")
            act1 = tf.nn.relu(conv1)
            conv2 = U.conv2d(act1, 16, "l2", [4, 4], [2, 2], pad="VALID")
            act2 = tf.nn.relu(conv2)

            # Flatten and project with the dense layer "lin".
            flat = U.flattenallbut0(act2)
            projected = U.dense(flat, 64, 'lin', U.normc_initializer(1.0))

            self.ob = tf.nn.relu(projected)
Exemplo n.º 7
0
    def _mlp(self, obs, hid_size, num_hid_layers, ac_space,
             gaussian_fixed_var):
        """Build the value head and the sub-policy action head on ``obs``.

        Two independent stacks of ``num_hid_layers`` ``U.dense`` layers of
        size ``hid_size``, each followed by ``tanh``, are built from ``obs``:
        one for the value function and one for the policy.

        Sets ``self.vpred``, ``self.pdtype``, ``self.pdparam`` and ``self.pd``.
        When ``gaussian_fixed_var`` is truthy and ``ac_space`` is a
        ``gym.spaces.Box``, the second half of ``pdparam`` comes from a
        standalone ``logstd`` variable rather than from the network.
        """
        # Value tower: layers "vffc1" .. "vffcN", then "vffinal".
        value_hidden = obs
        for layer_no in range(1, num_hid_layers + 1):
            value_pre = U.dense(value_hidden,
                                hid_size,
                                "vffc%i" % layer_no,
                                weight_init=U.normc_initializer(1.0))
            value_hidden = tf.nn.tanh(value_pre)
        value_out = U.dense(value_hidden,
                            1,
                            "vffinal",
                            weight_init=U.normc_initializer(1.0))
        # Index column 0 of the single-unit output.
        self.vpred = value_out[:, 0]

        # Policy tower: layers "pol1" .. "polN", then "polfinal".
        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype
        policy_hidden = obs
        for layer_no in range(1, num_hid_layers + 1):
            policy_pre = U.dense(policy_hidden,
                                 hid_size,
                                 "pol%i" % layer_no,
                                 weight_init=U.normc_initializer(1.0))
            policy_hidden = tf.nn.tanh(policy_pre)

        fixed_var = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
        if not fixed_var:
            # All distribution parameters come from one dense layer.
            self.pdparam = U.dense(policy_hidden,
                                   pdtype.param_shape()[0], "polfinal",
                                   U.normc_initializer(0.01))
        else:
            # Half of the parameter vector is the network-produced mean; the
            # other half is a zero-initialised variable of shape [1, half].
            half = pdtype.param_shape()[0] // 2
            mean = U.dense(policy_hidden, half, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, half],
                                     initializer=tf.zeros_initializer())
            # `mean * 0.0 + logstd` presumably broadcasts logstd to the shape
            # of `mean` before concatenation -- TODO confirm.
            self.pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        self.pd = pdtype.pdfromflat(self.pdparam)
Exemplo n.º 8
0
    def __init__(self, name, ob, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        """Build a sub-policy: a value head and an action-distribution head.

        Args:
            name: Variable-scope name under which the network is created.
            ob: Observation input. ``ob.get_shape()[1]`` sizes the running
                mean/std filter, and ``ob`` is an input of ``self._act``.
            ac_space: Action space handed to ``make_pdtype``. If it is a
                ``gym.spaces.Box`` and ``gaussian_fixed_var`` is truthy, a
                standalone ``logstd`` variable is used.
            hid_size: Size passed to every hidden ``U.dense`` layer.
            num_hid_layers: Number of hidden layers in each of the two stacks.
            gaussian_fixed_var: See ``ac_space``.
        """
        self.hid_size = hid_size
        self.num_hid_layers = num_hid_layers
        self.gaussian_fixed_var = gaussian_fixed_var

        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name

            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1],))

            # Normalise with the running statistics, then clip to [-5, 5].
            centered = ob - self.ob_rms.mean
            scaled = centered / self.ob_rms.std
            obz = tf.clip_by_value(scaled, -5.0, 5.0)

            # Value tower: layers "vffc1" .. "vffcN", then "vffinal".
            value_hidden = obz
            for layer_no in range(1, num_hid_layers + 1):
                value_pre = U.dense(value_hidden,
                                    hid_size,
                                    "vffc%i" % layer_no,
                                    weight_init=U.normc_initializer(1.0))
                value_hidden = tf.nn.tanh(value_pre)
            value_out = U.dense(value_hidden,
                                1,
                                "vffinal",
                                weight_init=U.normc_initializer(1.0))
            # Index column 0 of the single-unit output.
            self.vpred = value_out[:, 0]

            # Policy tower: layers "pol1" .. "polN", then "polfinal".
            pdtype = make_pdtype(ac_space)
            self.pdtype = pdtype
            policy_hidden = obz
            for layer_no in range(1, num_hid_layers + 1):
                policy_pre = U.dense(policy_hidden,
                                     hid_size,
                                     "pol%i" % layer_no,
                                     weight_init=U.normc_initializer(1.0))
                policy_hidden = tf.nn.tanh(policy_pre)

            fixed_var = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
            if not fixed_var:
                # All distribution parameters come from one dense layer.
                self.pdparam = U.dense(policy_hidden,
                                       pdtype.param_shape()[0],
                                       "polfinal",
                                       U.normc_initializer(0.01))
            else:
                # Half of the parameter vector is the network-produced mean;
                # the other half is a zero-initialised [1, half] variable.
                half = pdtype.param_shape()[0] // 2
                mean = U.dense(policy_hidden, half, "polfinal",
                               U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd",
                                         shape=[1, half],
                                         initializer=tf.zeros_initializer())
                # `mean * 0.0 + logstd` presumably broadcasts logstd to the
                # shape of `mean` before concatenation -- TODO confirm.
                self.pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
            self.pd = pdtype.pdfromflat(self.pdparam)

        # Action selection: the boolean placeholder switches between
        # pd.sample() and pd.mode().
        is_stochastic = tf.placeholder(dtype=tf.bool, shape=())
        sampled = self.pd.sample()
        greedy = self.pd.mode()
        action = U.switch(is_stochastic, sampled, greedy)
        self._act = U.function([is_stochastic, ob], [action, self.vpred])
Exemplo n.º 9
0
    def __init__(self,
                 name,
                 ob,
                 ac_space,
                 hid_size,
                 num_hid_layers,
                 num_subpolicies,
                 gaussian_fixed_var=True):
        """Build a convolutional master policy: value head plus selector.

        The observation is divided by 255, passed through three ``U.conv2d``
        + ReLU layers, flattened, and passed through a 512-unit dense + ReLU
        layer. The value head and the selector head both read that output.

        Args:
            name: Variable-scope name under which the network is created.
            ob: Observation input. ``ob.get_shape()[1]`` sizes the running
                mean/std filter, and ``ob`` is an input of the compiled
                functions.
            ac_space: Accepted but not used by this constructor.
            hid_size: Stored on the instance; not used to build this network.
            num_hid_layers: Stored on the instance; not used to build this
                network.
            num_subpolicies: Number of selector outputs / categories.
            gaussian_fixed_var: Stored on the instance; not otherwise used
                here.
        """
        self.hid_size = hid_size
        self.num_hid_layers = num_hid_layers
        self.num_subpolicies = num_subpolicies
        self.gaussian_fixed_var = gaussian_fixed_var

        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name
            with tf.variable_scope("obfilter"):
                # Created although the scaling below does not use it; kept
                # because removing it may change what is created under this
                # scope -- TODO confirm against RunningMeanStd.
                self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1], ))
            # Scale by 255 instead of using the running mean/std filter.
            obz = ob / 255.0

            # Shared convolutional trunk: "l1", "l2", "l3", then dense "lin".
            last_out = obz
            last_out = tf.nn.relu(
                U.conv2d(last_out, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            last_out = tf.nn.relu(
                U.conv2d(last_out, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            last_out = tf.nn.relu(
                U.conv2d(last_out, 32, "l3", [3, 3], [1, 1], pad="VALID"))
            last_out = U.flattenallbut0(last_out)
            last_out = tf.nn.relu(
                tf.layers.dense(last_out,
                                512,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))

            # value function
            self.vpred = U.dense(last_out,
                                 1,
                                 "vffinal",
                                 weight_init=U.normc_initializer(1.0))[:, 0]

            # master policy: reads the same trunk output as the value head
            self.selector = U.dense(last_out,
                                    num_subpolicies, "masterpol_final",
                                    U.normc_initializer(0.01))
            self.pdtype = pdtype = CategoricalPdType(num_subpolicies)
            self.pd = pdtype.pdfromflat(self.selector)

        # sample actions: `stochastic` switches between pd.sample() and
        # pd.mode()
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        # debug: same inputs as _act, but returns the selector output instead
        # of the value prediction
        self._debug = U.function([stochastic, ob], [ac, self.selector])
        # Lists self.selector as an input; presumably this lets a caller feed
        # the selector values directly -- TODO confirm against U.function.
        self._act_forced = U.function([stochastic, ob, self.selector],
                                      [ac, self.vpred])