Python switch примеры, baselines.common.tf_util.switch Python примеры использования

Пример #1

0

Показать файл

Файл: stc_policy.py Проект: Wellan89/rl-board

    def _init(self, ob_space, ac_space):
        """Assemble the policy and value graph for a flat integer observation.

        The observation is split column-wise into a 16-wide "next blocks" part
        and two parts of ``12 * 6`` columns each (own grid and opponent grid).
        Each part is encoded separately, the encodings are concatenated, and
        the shared trunk feeds a logits head and a scalar value head.

        Args:
            ob_space: Observation space; only its ``shape`` is read.
            ac_space: Action space used to select the distribution type.
        """
        self.pdtype = distributions.make_pdtype(ac_space)

        ob_shape = [None] + list(ob_space.shape)
        ob = U.get_placeholder(name='ob', dtype=tf.int32, shape=ob_shape)
        grid_width = 12 * 6
        next_blocks, my_grid, opp_grid = tf.split(ob, [16, grid_width, grid_width], axis=1)

        def _leaky_dense(inputs, units, name):
            # Dense layer followed by a leaky ReLU with slope 0.1.
            pre_activation = tf.layers.dense(inputs, units, name=name, kernel_initializer=U.normc_initializer(1.0))
            return tf.nn.leaky_relu(pre_activation, alpha=0.1)

        with tf.variable_scope('next_blocks'):
            blocks_feat = tf.one_hot(next_blocks, depth=5)
            blocks_feat = U.flattenallbut0(blocks_feat)
            for layer_name in ('l1', 'l2'):
                blocks_feat = _leaky_dense(blocks_feat, 12, layer_name)

        # The same 'grids' scope is entered twice: the first pass creates the
        # variables, the second pass (reuse=True) shares them.
        with tf.variable_scope('grids', reuse=False):
            my_feat = _grid_cnn(my_grid)

        with tf.variable_scope('grids', reuse=True):
            opp_feat = _grid_cnn(opp_grid)

        trunk = tf.concat([blocks_feat, my_feat, opp_feat], axis=1)
        trunk = _leaky_dense(trunk, 64, 'lin')

        logits = tf.layers.dense(trunk, self.pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
        self.pd = self.pdtype.pdfromflat(logits)
        value = tf.layers.dense(trunk, 1, name='value', kernel_initializer=U.normc_initializer(1.0))
        self.vpred = value[:, 0]

        self.state_in = []
        self.state_out = []

        # Boolean scalar choosing between a sampled action and the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

Пример #2

0

Показать файл

Файл: mlp_policy.py Проект: IcarusTan/baselines

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        """Build separate tanh MLPs for the value function and the policy.

        Args:
            ob_space: Observation space; must be a ``gym.spaces.Box``.
            ac_space: Action space used to select the distribution type.
            hid_size: Width of every hidden layer.
            num_hid_layers: Number of hidden layers in each tower.
            gaussian_fixed_var: When true and ``ac_space`` is a Box, the log
                standard deviation is a free variable rather than a network
                output.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # Standardize with the running statistics and clip to [-5, 5].
        standardized = (ob - self.ob_rms.mean) / self.ob_rms.std
        obz = tf.clip_by_value(standardized, -5.0, 5.0)

        def _tanh_tower(prefix):
            # Stack of tanh layers named "<prefix>1", "<prefix>2", ...
            out = obz
            for layer_no in range(1, num_hid_layers + 1):
                out = tf.nn.tanh(U.dense(out, hid_size, "%s%i" % (prefix, layer_no), weight_init=U.normc_initializer(1.0)))
            return out

        value_hidden = _tanh_tower("vffc")
        self.vpred = U.dense(value_hidden, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

        policy_hidden = _tanh_tower("polfc")
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(policy_hidden, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
            # mean * 0.0 + logstd gives logstd the same leading dimension as mean.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(policy_hidden, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

Пример #3

0

Показать файл

    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        """Create the value and policy MLPs and the ``_act`` function.

        Both networks read the clipped, standardized observation. The value
        network ends in a single unit; the policy network ends in the flat
        parameters of the action distribution.

        Args:
            ob_space: Observation space; asserted to be a ``gym.spaces.Box``.
            ac_space: Action space passed to ``make_pdtype``.
            hid_size: Units per hidden layer.
            num_hid_layers: Hidden layers per network.
            gaussian_fixed_var: Use a standalone ``logstd`` variable when the
                action space is a Box.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = make_pdtype(ac_space)
        pdtype = self.pdtype
        sequence_length = None

        ob_shape = [sequence_length] + list(ob_space.shape)
        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=ob_shape)

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        centered = ob - self.ob_rms.mean
        obz = tf.clip_by_value(centered / self.ob_rms.std, -5.0, 5.0)

        # Value network.
        vf_out = obz
        for idx in range(num_hid_layers):
            layer_name = "vffc%i" % (idx + 1)
            vf_pre = U.dense(vf_out, hid_size, layer_name,
                             weight_init=U.normc_initializer(1.0))
            vf_out = tf.nn.tanh(vf_pre)
        vf_final = U.dense(vf_out, 1, "vffinal",
                           weight_init=U.normc_initializer(1.0))
        self.vpred = vf_final[:, 0]

        # Policy network.
        pol_out = obz
        for idx in range(num_hid_layers):
            layer_name = "polfc%i" % (idx + 1)
            pol_pre = U.dense(pol_out, hid_size, layer_name,
                              weight_init=U.normc_initializer(1.0))
            pol_out = tf.nn.tanh(pol_pre)

        fixed_var = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
        if not fixed_var:
            pdparam = U.dense(pol_out, pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))
        else:
            mean = U.dense(pol_out, pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            # Adding to mean * 0.0 expands logstd along mean's first axis.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

Пример #4

0

Показать файл

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, rnn_hid_units, gaussian_fixed_var=True):
        """Build value and policy heads on top of ``self.rnn`` encodings.

        Each head runs ``self.rnn`` on the observation inside its own variable
        scope and then applies ``num_hid_layers`` dense layers; no activation
        is applied between those dense layers.

        Args:
            ob_space: Observation space; ``shape`` is read (the Box check is
                disabled).
            ac_space: Action space; must be a ``gym.spaces.Box``.
            hid_size: Units of each dense layer.
            num_hid_layers: Dense layers per head.
            rnn_hid_units: Forwarded to ``self.rnn``.
            gaussian_fixed_var: Must be truthy; only the fixed-variance
                Gaussian head is implemented.
        """
        # The Box assertion on ob_space is intentionally disabled here.

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
        ob_dim = ob_space.shape[0]

        # Value head: apply the RNN to reduce the history, then dense layers.
        with tf.variable_scope("vf"):
            vf_feat = self.rnn(ob, ob_dim, rnn_hid_units)
            for layer_idx in range(num_hid_layers):
                vf_feat = U.dense(vf_feat, hid_size, "vf_dense%i" % layer_idx, weight_init=U.normc_initializer(1.0))
            self.vpred = U.dense(vf_feat, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

        # Policy head: apply the RNN to reduce the history, then dense layers.
        with tf.variable_scope("pf"):
            pf_feat = self.rnn(ob, ob_dim, rnn_hid_units)
            for layer_idx in range(num_hid_layers):
                pf_feat = U.dense(pf_feat, hid_size, "pf_dense%i" % layer_idx, weight_init=U.normc_initializer(1.0))

            assert gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
            mean = U.dense(pf_feat, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

Пример #5

0

Показать файл

Файл: model.py Проект: tornadoyi/gamescholar

    def _create_logit_value(self,
                            action_layer,
                            value_layer,
                            gaussian_fixed_var=False):
        """Attach the actor and critic output layers.

        Sets ``self.pd`` and ``self.ac`` from ``action_layer`` and
        ``self.vpred`` from ``value_layer``.

        Args:
            action_layer: Tensor feeding the policy output layer.
            value_layer: Tensor feeding the value output layer.
            gaussian_fixed_var: When true and ``self.ac_space`` is a Box, use
                a standalone ``logstd`` variable instead of predicting it.
        """
        # Actor.
        fixed_var = gaussian_fixed_var and isinstance(self.ac_space, gym.spaces.Box)
        if not fixed_var:
            pdparam = U.dense(action_layer, self.pdtype.param_shape()[0],
                              "polfinal", U.normc_initializer(0.01))
        else:
            mean = U.dense(action_layer, self.pdtype.param_shape()[0] // 2,
                           "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, self.pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            # mean * 0.0 + logstd expands logstd along mean's first axis.
            tiled_logstd = mean * 0.0 + logstd
            pdparam = U.concatenate([mean, tiled_logstd], axis=1)

        self.pd = self.pdtype.pdfromflat(pdparam)
        # self.stochastic selects between sampling and taking the mode.
        self.ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

        # Critic.
        value_out = U.dense(value_layer, 1, "vffinal",
                            weight_init=U.normc_initializer(1.0))
        self.vpred = value_out[:, 0]

Пример #6

0

Показать файл

Файл: cnnpolicy.py Проект: zhaokang1228/my_git

    def _policy_nn(self, odim, adim, train):
        """Build a fixed-variance Gaussian policy head on top of ``self.x``.

        Sets ``self.pd``, ``self.ac`` and ``self._act``. The ``odim``, ``adim``
        and ``train`` parameters are not used by the live code; they only
        appear in the commented-out variants kept below.
        """
        # Earlier experimental variants, kept for reference:
        # activ = tf.nn.tanh
        # self.pdtype = make_pdtype(self.ac_space)
        # self.pdtype = DiagGaussianPdType(self.ac_space.shape[0])
        # hid1_size = 64
        # out = tf.layers.dense(self.x, adim, trainable=train,
        #                       kernel_initializer=tf.random_normal_initializer(stddev=np.sqrt(1/adim)), name='out')
        #
        # self.pd = self.pdtype.pdfromflat(out)
        # self._act = U.function([self.ob], self.pd.sample()) # [self.pd.sample(), mean, logstd]
        # self .ac = self.pd.sample()
        # logits = tf.layers.dense(self.x, self.pdtype.param_shape()[0], name='logits',
        #                               kernel_initializer=U.normc_initializer(0.01))
        # self.pd = self.pdtype.pdfromflat(logits)
        # Mean takes half of the flat distribution parameters.
        mean = tf.layers.dense(self.x,
                               self.pdtype.param_shape()[0] // 2,
                               name="polfinal",
                               kernel_initializer=U.normc_initializer(0.01))
        # Log standard deviation is a free variable, initialized to zero.
        logstd = tf.get_variable(name="logstd",
                                 shape=[1,
                                        self.pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        # Concatenate mean and logstd; mean * 0.0 + logstd expands logstd
        # along mean's first axis.
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        self.pd = self.pdtype.pdfromflat(pdparam)

        # Boolean scalar choosing between a sampled action and the mode.
        stochastic = U.get_placeholder(dtype=tf.bool,
                                       shape=(),
                                       name="stochastic")
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        self._act = U.function([stochastic, self.ob], ac)
        self.ac = ac

Пример #7

0

Показать файл

    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              exploration_rate,
              gaussian_fixed_var=True):
        """Build a Keras-based policy with pre-loaded weights plus a value net.

        The policy is a two-hidden-layer tanh network whose weights are loaded
        from ``"neural_kick"``; its output width is hard-coded to 23. The value
        network has the same hidden layout followed by ``self.dense``.
        Observation normalization is disabled: the raw observation is fed to
        both networks.

        Args:
            ob_space: Observation space; asserted to be a ``gym.spaces.Box``.
            ac_space: Action space passed to ``make_pdtype``.
            hid_size: Not read by this method (layer widths are fixed to 64).
            num_hid_layers: Not read by this method.
            exploration_rate: Constant used to initialize ``logstd``.
            gaussian_fixed_var: When true and ``ac_space`` is a Box, the
                loaded model output is used directly as the Gaussian mean.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        # with tf.variable_scope("obfilter"):
        #     self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        obz = ob

        valueFunction = Sequential()
        valueFunction.add(InputLayer(input_tensor=obz))
        valueFunction.add(Dense(64, activation='tanh'))
        valueFunction.add(Dense(64, activation='tanh'))

        self.vpred = self.dense(x=valueFunction.output,
                                size=1,
                                name="vffinal",
                                weight_init=U.normc_initializer(1.0),
                                bias=True)[:, 0]

        model = Sequential()
        model.add(InputLayer(input_tensor=obz))
        model.add(Dense(64, activation='tanh'))
        model.add(Dense(64, activation='tanh'))
        # NOTE(review): output width is hard-coded; presumably it must equal
        # pdtype.param_shape()[0] // 2 for the Gaussian branch - confirm.
        model.add(Dense(23))
        model.load_weights("neural_kick")

        # Bound before the branch so the export node below can be built no
        # matter which branch runs (previously `mean` was unbound in the
        # else branch and the slice raised UnboundLocalError).
        policy_out = model.output

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = policy_out
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.constant_initializer(exploration_rate))
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            # Keyword arguments are required here: the third and fourth
            # positional parameters of tf.layers.dense are `activation` and
            # `use_bias`, not the layer name and the kernel initializer.
            pdparam = tf.layers.dense(
                policy_out,
                pdtype.param_shape()[0],
                name="polfinal",
                kernel_initializer=U.normc_initializer(0.01))

        # Expose the first row of the policy output under a fixed node name;
        # the op is created for its name only, so the result is not bound.
        my_var = tf.strided_slice(policy_out, [0], [1], [1], shrink_axis_mask=1)
        tf.identity(my_var, name='output_node')

        self.pd = pdtype.pdfromflat(pdparam)
        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

Пример #8

0

Показать файл

Файл: cnn_policy.py Проект: bukysun/pipeline_following

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, kind):
        """Build value and policy MLPs on top of an image encoder.

        The observation is divided by 255.0, encoded with
        ``self.img_encoder(x, kind)``, and the encoding feeds two separate
        towers: a ReLU tower for the value and a tanh tower for the policy
        logits.

        Args:
            ob_space: Observation space; asserted to be a ``gym.spaces.box.Box``.
            ac_space: Action space passed to ``make_pdtype``.
            hid_size: Units per hidden layer in both towers.
            num_hid_layers: Hidden layers per tower.
            kind: Forwarded to ``self.img_encoder``.
        """
        # Call form instead of the Python 2 print statement: it prints the
        # same text on Python 2 and is valid syntax on Python 3.
        print(type(ob_space))
        assert isinstance(ob_space, gym.spaces.box.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
        self.ob = [ob]

        # Scale the observation; assumes 8-bit pixel values - TODO confirm.
        x = ob / 255.0

        ob_last = self.img_encoder(x, kind)

        with tf.variable_scope("vf"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.relu(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]

        with tf.variable_scope("pol"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            logits = tf.layers.dense(last_out, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)

        self.state_in = []
        self.state_out = []
        # Boolean scalar choosing between a sampled action and the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) # XXX
        self._act = U.function([stochastic, ob], [ac, self.vpred])

Пример #9

0

Показать файл

    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              tau,
              gaussian_fixed_var=True):
        """Build the value network and action heads of this policy.

        Two observation placeholders are created (``ob_adv`` and ``adv_ob_``);
        the value network reads only the first one, while ``self.build_action``
        is applied to both, the second time with ``reuse=True``.

        Args:
            ob_space: Observation space; asserted to be a ``gym.spaces.Box``.
            ac_space: Action space passed to ``make_pdtype``.
            hid_size: Units per hidden layer (stored on ``self``).
            num_hid_layers: Number of hidden layers (stored on ``self``).
            tau: Not read inside this method.
            gaussian_fixed_var: Stored on ``self``; presumably consumed by
                ``build_action`` - verify against that method.
        """
        assert isinstance(ob_space, gym.spaces.Box)
        print('use zpmpl_Adv')
        self.ac_space = ac_space
        self.hid_size = hid_size
        self.num_hid_layers = num_hid_layers
        self.gaussian_fixed_var = gaussian_fixed_var

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        self.ob = U.get_placeholder(name="ob_adv",
                                    dtype=tf.float32,
                                    shape=[sequence_length] +
                                    list(ob_space.shape))
        self.ob_ = U.get_placeholder(name="adv_ob_",
                                     dtype=tf.float32,
                                     shape=[sequence_length] +
                                     list(ob_space.shape))

        with tf.variable_scope("obfilter_adv"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('adv_vf'):
            # Standardize with the running statistics and clip to [-5, 5].
            self.obz = tf.clip_by_value(
                (self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = self.obz
            for i in range(self.num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        self.hid_size,
                        name="adv_vffc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name="adv_vffinal",
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        # Distribution parameters for both placeholders; the second call
        # passes reuse=True.
        self.pdparam = self.build_action(self.ob)
        self.pdparam_ = self.build_action(self.ob_, reuse=True)

        self.pd = pdtype.pdfromflat(self.pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        # self.ac is always a sample, independent of the `stochastic` switch.
        self.ac = self.pd.sample()
        # NOTE(review): sample_() presumably samples from pdparam_ - confirm.
        self.ac_, _ = self.sample_()
        self._act = U.function([stochastic, self.ob], [ac, self.vpred])

Пример #10

0

Показать файл

Файл: mlp_policy.py Проект: rish987/baselines

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        """Build the value ('vf') and policy ('pol') MLPs on the raw observation.

        The running-statistics object is still created under 'obfilter', but
        observation normalization is switched off: both networks consume the
        placeholder directly.

        Args:
            ob_space: Observation space; asserted to be a ``gym.spaces.Box``.
            ac_space: Action space passed to ``make_pdtype``.
            hid_size: Units per hidden layer.
            num_hid_layers: Hidden layers per network.
            gaussian_fixed_var: When true and ``ac_space`` is a Box, use a
                state-independent ``logstd`` variable and keep the mean in
                ``self.mean``.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype
        sequence_length = None

        # Placeholder holding the current input observation.
        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        def _tanh_layers(inputs):
            # Hidden layers "fc1".."fcN" inside the currently active scope.
            out = inputs
            for layer_no in range(1, num_hid_layers + 1):
                pre_activation = tf.layers.dense(out, hid_size, name="fc%i" % layer_no, kernel_initializer=U.normc_initializer(1.0))
                out = tf.nn.tanh(pre_activation)
            return out

        # Value function estimator.
        with tf.variable_scope('vf'):
            # Clipped normalization is disabled; the raw observation is used.
            obz = ob
            value_hidden = _tanh_layers(obz)
            value_out = tf.layers.dense(value_hidden, 1, name='final', kernel_initializer=U.normc_initializer(1.0))
            self.vpred = value_out[:, 0]

        # Policy network.
        with tf.variable_scope('pol'):
            policy_hidden = _tanh_layers(obz)
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                # Continuous actions with a state-independent variance.
                mean = tf.layers.dense(policy_hidden, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01))
                self.mean = mean
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(policy_hidden, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        # Apparently unnecessary, kept as empty lists.
        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

Пример #11

0

Показать файл

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, activation='tanh', gaussian_fixed_var=True, keep=1.0):
        """Build value and policy MLPs with a selectable activation and dropout.

        When the module-level ``PREPROCESS`` flag is set, the placeholder has
        ``OBSERVATION_DIM`` columns and is fed to the networks as is;
        otherwise the observation is standardized with running statistics and
        clipped to [-5, 5]. Both networks read the same prepared input.

        Args:
            ob_space: Observation space; asserted to be a ``gym.spaces.Box``.
            ac_space: Action space passed to ``make_pdtype``.
            hid_size: Units per hidden layer.
            num_hid_layers: Hidden layers per network.
            activation: One of ``'tanh'``, ``'elu'`` or ``'lrelu'``.
            gaussian_fixed_var: When true and ``ac_space`` is a Box, use a
                standalone ``logstd`` variable.
            keep: ``keep_prob`` passed to every dropout layer.

        Raises:
            NotImplementedError: If ``activation`` is not a supported name.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob_shape = OBSERVATION_DIM if PREPROCESS else ob_space.shape[0]
        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length, ob_shape])

        if activation == 'tanh':
            activ = tf.nn.tanh
        elif activation == 'elu':
            activ = tf.nn.elu
        elif activation == 'lrelu':
            # Leaky ReLU with slope 0.01 on the negative side.
            activ = lambda x: tf.maximum(x, 0.01 * x)
        else:
            raise NotImplementedError("Not available activation: " + activation)

        if PREPROCESS:
            net_input = ob
        else:
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=ob_space.shape)
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            net_input = obz

        # Value network.
        last_out = net_input
        for i in range(num_hid_layers):
            last_out = activ(U.dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
            last_out = tf.nn.dropout(last_out, keep_prob=keep, name="vdrop%i" % (i + 1))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

        # Policy network. It starts from the same prepared input as the value
        # network; it previously restarted from the raw placeholder, which
        # skipped the observation normalization whenever PREPROCESS was false.
        last_out = net_input
        for i in range(num_hid_layers):
            last_out = activ(U.dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
            last_out = tf.nn.dropout(last_out, keep_prob=keep, name="pdrop%i" % (i + 1))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            # mean * 0.0 + logstd expands logstd along mean's first axis.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

Пример #12

0

Показать файл

    def __init__(self,
                 name,
                 observation_shape,
                 action_shape,
                 hid_size,
                 num_hid_layers,
                 stochastic=True):
        """Create a diagonal-Gaussian MLP policy inside variable scope ``name``.

        Args:
            name: Variable scope that holds everything built here.
            observation_shape: Shape of one observation (without batch axis).
            action_shape: Shape of one action; its first entry sizes the
                distribution.
            hid_size: Units per hidden layer.
            num_hid_layers: Number of tanh hidden layers.
            stochastic: Stored on ``self.stochastic``; the graph itself uses a
                separate boolean placeholder.
        """
        with tf.variable_scope(name):
            self.stochastic = stochastic
            self.hid_size = hid_size
            self.num_hid_layers = num_hid_layers
            self.action_shape = action_shape
            self.observation_shape = observation_shape
            self.scope = tf.get_variable_scope().name
            self.pdtype = DiagGaussianPdType(action_shape[0])

            ob_shape = [None] + list(observation_shape)
            observations_ph = U.get_placeholder(name='ob', dtype=tf.float32,
                                                shape=ob_shape)
            stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())

            with tf.variable_scope('obfilter'):
                self.ob_rms = RunningMeanStd(shape=observation_shape)

            with tf.variable_scope('pol'):
                # Standardize with the running statistics and clip to [-5, 5].
                standardized = (observations_ph - self.ob_rms.mean) / self.ob_rms.std
                hidden = tf.clip_by_value(standardized, -5.0, 5.0)
                for layer_no in range(1, num_hid_layers + 1):
                    pre_activation = tf.layers.dense(
                        hidden,
                        hid_size,
                        name='fc%i' % layer_no,
                        kernel_initializer=U.normc_initializer(1.0))
                    hidden = tf.nn.tanh(pre_activation)

                mean = tf.layers.dense(
                    hidden,
                    self.pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name='logstd',
                    shape=[1, self.pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # mean * 0.0 + logstd expands logstd along mean's first axis.
                tiled_logstd = mean * 0.0 + logstd
                pdparam = tf.concat([mean, tiled_logstd], axis=1)

            self.pd = self.pdtype.pdfromflat(pdparam)

            sampled = self.pd.sample()
            greedy = self.pd.mode()
            action_op = U.switch(stochastic_ph, sampled, greedy)
            self._act = U.function([stochastic_ph, observations_ph], action_op)

Пример #13

0

Показать файл

Файл: mlp_policy.py Проект: mklissa/DAVF

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        """Build three tanh MLPs: value ('vf'), sigma ('sigma') and policy ('pol').

        Args:
            ob_space: Observation space; asserted to be a ``gym.spaces.Box``.
            ac_space: Action space passed to ``make_pdtype``.
            hid_size: Either one int applied to every hidden layer or a
                sequence indexed per layer.
            num_hid_layers: Hidden layers per network.
            gaussian_fixed_var: When true and ``ac_space`` is a Box, use a
                standalone ``logstd`` variable.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        # A single int is broadcast to one width per layer.
        if isinstance(hid_size, int):
            hid_size = [hid_size] * num_hid_layers

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        def _normalized_ob():
            # Standardize with the running statistics and clip to [-5, 5].
            return tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        def _tanh_layers(inputs):
            # Hidden layers "fc1".."fcN" inside the currently active scope.
            out = inputs
            for idx, width in enumerate(hid_size[i] for i in range(num_hid_layers)):
                out = tf.nn.tanh(tf.layers.dense(out, width, name="fc%i" % (idx + 1), kernel_initializer=U.normc_initializer(1.0)))
            return out

        def _scalar_head(hidden):
            # Single-unit output layer with the unit axis dropped.
            return tf.layers.dense(hidden, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('vf'):
            obz = _normalized_ob()
            self.vpred = _scalar_head(_tanh_layers(obz))

        with tf.variable_scope('sigma'):
            # The normalization ops are built again inside this scope.
            obz = _normalized_ob()
            self.sigmapred = _scalar_head(_tanh_layers(obz))

        with tf.variable_scope('pol'):
            policy_hidden = _tanh_layers(obz)
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(policy_hidden, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(policy_hidden, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred, self.sigmapred])

Пример #14

0

Показать файл

Файл: lstm_fc_discrete_policy.py Проект: ByzanTine/baselines

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        """Build a value MLP and a policy made of four LSTM sub-policies.

        The observation is one-hot encoded with depth ``ob_space.n`` in both
        scopes. The value network is a tanh MLP; the policy concatenates the
        outputs of four single-step LSTMs before the final dense layer.

        Args:
            ob_space: Observation space; ``shape`` and ``n`` are read
                (presumably a Discrete-like space - verify against caller).
            ac_space: Action space passed to ``make_pdtype``.
            hid_size: Units per hidden layer of the value network.
            num_hid_layers: Hidden layers of the value network.
            gaussian_fixed_var: When true and ``ac_space`` is a Box, use a
                standalone ``logstd`` variable.
        """

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = obz
            # NOTE(review): the one-hot indices come from the normalized,
            # clipped observation cast to int32, not from the raw observation
            # - confirm this is intended.
            last_out = tf.one_hot(indices=tf.cast(last_out, dtype=tf.int32), depth=ob_space.n)
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]

        with tf.variable_scope('pol'):
            last_out = obz
            last_out = tf.one_hot(indices=tf.cast(last_out, dtype=tf.int32), depth=ob_space.n)
            def sub_pol(input_m, scope):
                # Insert a length-1 axis at position 1 so the input can be fed
                # to dynamic_rnn; the tile multiples are all 1, so tiling
                # leaves the tensor unchanged.
                state_embedding = tf.tile(tf.expand_dims(input_m, axis=1), [1, 1, 1])
                rnn_cell = rnn.BasicLSTMCell(
                  num_units=pdtype.param_shape()[0])
                last_out, states = tf.nn.dynamic_rnn(
                  cell=rnn_cell,
                  inputs=state_embedding,
                  dtype=tf.float32, scope=scope)
                # Drop the length-1 axis again.
                return tf.squeeze(last_out, axis=1)
            # Four sub-policies, each under its own rnn scope "pol/<i>".
            ppsl = []
            for i in range(4):
                ppsl.append(sub_pol(last_out, 'pol' + '/' + str(i)))
            last_out = tf.concat(ppsl, axis=1) 

            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01), activity_regularizer=tf.contrib.layers.l2_regularizer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01), activity_regularizer=tf.contrib.layers.l2_regularizer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # Boolean scalar choosing between a sampled action and the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

Пример #15

0

Показать файл

    def _init(self, ob_space, ac_space):
        """Build a policy whose value and action heads are external models.

        The clipped, standardized observation is passed to
        ``discriminator_model`` (value, under scope 'vf') and to
        ``generator_model`` (distribution parameters, under scope 'pol'), both
        with ``drop_rate=0.5``. The original MLP layers are kept below as
        commented-out reference code.

        Args:
            ob_space: Observation space; asserted to be a ``gym.spaces.Box``.
            ac_space: Action space passed to ``make_pdtype``.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            # Standardize with the running statistics and clip to [-5, 5].
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            last_out = obz
            # for i in range(num_hid_layers):
            #     last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            # self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]
            # NOTE(review): unlike the commented-out version, no [:, 0] slice
            # is applied here - confirm the model already returns one value
            # per row.
            self.vpred = discriminator_model([last_out], drop_rate=0.5)

        with tf.variable_scope('pol'):
            last_out = obz
            # for i in range(num_hid_layers):
            #     last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))

            # The second argument is the flat size of the distribution
            # parameters.
            pdparam = generator_model([last_out],
                                      pdtype.param_shape()[0],
                                      drop_rate=0.5)

            # if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            #     mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
            #     logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            #     pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            # else:
            #     pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # Boolean scalar choosing between a sampled action and the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

Пример #16

0

Показать файл

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, summaries = False, should_act = True):
        """Build an ELU policy MLP and, optionally, the value net and act function.

        The observation tensor is looked up in the default graph under the
        name ``observations:0``; if it does not exist yet a new placeholder
        named ``observations`` is created.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of every hidden layer.
            num_hid_layers: number of hidden layers in each network.
            gaussian_fixed_var: when True and ``ac_space`` is a Box, the
                log-std is a free variable instead of a network output.
            summaries: accepted for interface compatibility; not read here.
            should_act: when False, only the policy distribution is built;
                ``ob_rms``, ``vpred``, ``state_in``/``state_out`` and ``_act``
                are not created.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None
        # get_tensor_by_name never returns None: it raises KeyError when the
        # tensor is absent, so the fallback has to live in an except clause.
        try:
            ob = tf.get_default_graph().get_tensor_by_name("observations:0")
        except KeyError:
            ob = U.get_placeholder(name="observations", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope('pol'):
            last_out = ob
            for i in range(num_hid_layers):
                last_out = tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0))
                last_out = tf.nn.elu(last_out)
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
                # ones_like works with an unknown batch dimension, whereas
                # tf.ones(shape=mean.shape) needs a fully defined static shape.
                pdparam = tf.concat([mean, tf.ones_like(mean) * logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

        with tf.variable_scope("distribution"):
            self.pd = pdtype.pdfromflat(pdparam)

        if should_act:
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=ob_space.shape)

            with tf.variable_scope('vf'):
                # The value net consumes the raw observation; ob_rms is
                # created above but not applied to the input here.
                last_out = ob
                for i in range(num_hid_layers):
                    last_out = tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0))
                    last_out = tf.nn.tanh(last_out)
                self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]

            self.state_in = []
            self.state_out = []

            with tf.variable_scope("distribution"):
                # Boolean scalar: True -> sample, False -> distribution mode.
                stochastic = tf.placeholder(dtype=tf.bool, shape=(), name = "stochastic")
                ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

            self._act = U.function([stochastic, ob], [ac, self.vpred])

Пример #17

0

Показать файл

    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        """Build a three-layer tanh MLP whose trunk feeds both mean and value.

        Args:
            sess: session used by ``step`` and ``value`` to run the graph.
            ob_space: observation space; only ``shape[0]`` is read.
            ac_space: action space; ``shape[0]`` gives the action dimension.
            nenv: number of environments.
            nsteps: steps per environment; the batch size is ``nenv * nsteps``.
            nstack: multiplier applied to the observation width (presumably
                the number of stacked observations -- confirm with callers).
            reuse: forwarded to every ``tf.variable_scope`` opened here.
        """
        batch_size = nenv * nsteps
        flat_ob_shape = (batch_size, ob_space.shape[0] * nstack)
        action_dim = ac_space.shape[0]
        X = tf.placeholder(tf.float32, flat_ob_shape)  # observations

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype
        with tf.variable_scope("obfilter", reuse=reuse):
            self.ob_rms = RunningMeanStd(shape=flat_ob_shape[1:])
        with tf.variable_scope("retfilter", reuse=reuse):
            self.ret_rms = RunningMeanStd(shape=(1,))

        # Standardise observations and clip them to [-5, 5].
        normalized_ob = tf.clip_by_value((X - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        with tf.variable_scope("model", reuse=reuse):
            trunk = normalized_ob
            for layer_name in ("fc1", "fc2", "fc3"):
                pre_activation = dense(trunk, 128, layer_name, weight_init=U.normc_initializer(1.0), bias_init=0.0)
                trunk = tf.nn.tanh(pre_activation)

            mean = dense(trunk, action_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0)
            logstd_var = tf.get_variable("logstd", [action_dim], tf.float32, tf.zeros_initializer())
            logstd_row = tf.expand_dims(logstd_var, 0)
            # `mean * 0.0 + logstd_row` broadcasts the single row across the batch.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd_row], axis=1)
            vf = dense(trunk, 1, "v", weight_init=U.normc_initializer(1.0), bias_init=0.0)

        v0 = vf[:, 0]
        self.pd = pdtype.pdfromflat(pdparam)
        # Boolean scalar: True -> sample from the distribution, False -> mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        sampled_action = self.pd.sample()
        mode_action = self.pd.mode()
        a0 = U.switch(stochastic, sampled_action, mode_action)

        self.initial_state = []  # the policy keeps no recurrent state

        def step(stoch, ob, *_args, **_kwargs):
            """Return (action, value, empty state) for a batch of observations."""
            feed = {stochastic: stoch, X: ob}
            action, value_estimate = sess.run([a0, v0], feed)
            return action, value_estimate, []

        def value(ob, *_args, **_kwargs):
            """Return the value estimates for a batch of observations."""
            return sess.run(v0, {X: ob})

        self.X = X
        self.vf = vf
        # Value output standardised with the running return statistics.
        self.vnorm = (self.vf - self.ret_rms.mean) / self.ret_rms.std
        self.step = step
        self.value = value

Пример #18

0

Показать файл

Файл: cnn_policy.py Проект: bukysun/pipeline_following

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, kind):
        """Build a two-input (physics vector + frames) policy with two value heads.

        Args:
            ob_space: tuple of two spaces; element 0 describes the physics
                observation and element 1 the frame observation.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of every fully connected hidden layer.
            num_hid_layers: number of hidden layers in each of ``vf`` and ``pol``.
            kind: forwarded unchanged to ``self.img_encoder``.
        """
        assert isinstance(ob_space, tuple)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype

        physics_space, frames_space = ob_space[0], ob_space[1]
        # The leading dimension of both placeholders is left unspecified.
        ob_p = U.get_placeholder(name="ob_physics", dtype=tf.float32, shape=[None] + list(physics_space.shape))
        ob_f = U.get_placeholder(name="ob_frames", dtype=tf.float32, shape=[None] + list(frames_space.shape))
        self.ob = [ob_p, ob_f]

        # Physics branch: standardise with running statistics, clip to [-5, 5].
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=physics_space.shape)
        physics_features = tf.clip_by_value((ob_p - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # Frame branch: divide by 255, then run the image encoder.
        scaled_frames = ob_f / 255.0
        frame_features = self.img_encoder(scaled_frames, kind)

        joint_features = tf.concat((physics_features, frame_features), axis=-1)

        with tf.variable_scope("vf"):
            hidden = joint_features
            for layer_idx in range(num_hid_layers):
                dense_out = tf.layers.dense(hidden, hid_size, name="fc%i"%(layer_idx+1), kernel_initializer=U.normc_initializer(1.0))
                hidden = tf.nn.relu(dense_out)
            # Two scalar heads on the same trunk.
            ext_head = tf.layers.dense(hidden, 1, name='vf_ext', kernel_initializer=U.normc_initializer(1.0))
            self.vpred_ext = ext_head[:,0]
            int_head = tf.layers.dense(hidden, 1, name='vf_int', kernel_initializer=U.normc_initializer(1.0))
            self.vpred_int = int_head[:,0]

        with tf.variable_scope("pol"):
            hidden = joint_features
            for layer_idx in range(num_hid_layers):
                dense_out = tf.layers.dense(hidden, hid_size, name='fc%i'%(layer_idx+1), kernel_initializer=U.normc_initializer(1.0))
                hidden = tf.nn.tanh(dense_out)
            logits = tf.layers.dense(hidden, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)

        # No recurrent state is exposed.
        self.state_in = []
        self.state_out = []

        # Boolean scalar: True -> sample from the distribution, False -> mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        sampled_action = self.pd.sample()
        mode_action = self.pd.mode()
        ac = U.switch(stochastic, sampled_action, mode_action)
        self._act = U.function([stochastic, ob_p, ob_f], [ac, self.vpred_ext, self.vpred_int])

Пример #19

0

Показать файл

    def _init(self, ob_space, ac_space, layers_val, layers_pol, gaussian_fixed_var=True,
              dist='gaussian', ):
        """Build value and policy MLPs with per-layer sizes and a selectable pd.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.
            layers_val: iterable of hidden-layer widths for the value net (ReLU).
            layers_pol: iterable of hidden-layer widths for the policy net (tanh).
            gaussian_fixed_var: when True and ``ac_space`` is a Box, the second
                half of the flat parameters is a free ``logstd`` variable.
            dist: forwarded to ``make_pdtype``. ``self._act`` is only created
                for ``'gaussian'`` and ``'beta'``; any other value leaves it
                undefined.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.dist = dist
        pdtype = make_pdtype(ac_space, dist=dist)
        self.pdtype = pdtype

        # The leading dimension of the placeholder is left unspecified.
        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            # Standardise and clip; the policy net below reuses this tensor.
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            hidden = obz
            for layer_no, width in enumerate(layers_val, start=1):
                dense_out = tf.layers.dense(hidden, width, name="fc%i" % layer_no, kernel_initializer=U.normc_initializer(1.0))
                hidden = tf.nn.relu(dense_out)
            value_head = tf.layers.dense(hidden, 1, name='final', kernel_initializer=U.normc_initializer(1.0))
            self.vpred = value_head[:, 0]

        with tf.variable_scope('pol'):
            hidden = obz
            for layer_no, width in enumerate(layers_pol, start=1):
                dense_out = tf.layers.dense(hidden, width, name='fc%i' % layer_no, kernel_initializer=U.normc_initializer(1.0))
                hidden = tf.nn.tanh(dense_out)
            use_free_logstd = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
            if not use_free_logstd:
                pdparam = tf.layers.dense(hidden, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))
            else:
                mean = tf.layers.dense(hidden, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
                # `mean * 0.0 + logstd` broadcasts the single row across the batch.
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

        self.pd = pdtype.pdfromflat(pdparam)

        # No recurrent state is exposed.
        self.state_in = []
        self.state_out = []

        # Boolean scalar: True -> sample from the distribution, False -> mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        # Besides action and value, the act function returns the
        # distribution-specific tensors for the supported distributions.
        if dist == 'gaussian':
            extra_outputs = [self.pd.std, self.pd.mean, self.pd.logstd]
        elif dist == 'beta':
            extra_outputs = [self.pd.alpha, self.pd.beta, self.pd.alpha_beta]
        else:
            extra_outputs = None
        if extra_outputs is not None:
            self._act = U.function([stochastic, ob], [ac, self.vpred] + extra_outputs)

Пример #20

0

Показать файл

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, exploration_rate, gaussian_fixed_var=True):
        """Build tanh MLP value/policy nets with a configurable initial log-std.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of every hidden layer.
            num_hid_layers: number of hidden layers in each network.
            exploration_rate: constant used to initialise the ``logstd``
                variable in the fixed-variance branch.
            gaussian_fixed_var: when True and ``ac_space`` is a Box, the second
                half of the flat parameters is the free ``logstd`` variable.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            # Standardise and clip; the policy net below reuses this tensor.
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]

        # `mean` only exists in the fixed-variance Box branch; start from None
        # so the export below can tell whether it was built.
        mean = None
        with tf.variable_scope('pol'):
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(0.01)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.constant_initializer(exploration_rate))
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

        if mean is not None:
            # Expose row 0 of the mean under the op name 'output_node'
            # (presumably for graph export -- confirm with the consumer).
            # Previously this ran unconditionally and raised
            # UnboundLocalError whenever the else-branch above was taken.
            first_row_mean = tf.strided_slice(mean, [0], [1], [1], shrink_axis_mask=1)
            tf.identity(first_row_mean, name='output_node')
        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # Boolean scalar: True -> sample from the distribution, False -> mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

Пример #21

0

Показать файл

    def _build(self):
        """Build the primitive policy: per-key inputs, value net and policy net.

        One placeholder is created for every entry of ``self._ob_shape``; the
        observations listed in ``self.ob_type`` are standardised, clipped to
        [-5, 5] and concatenated along the last axis before feeding both nets.
        """
        ac_space = self._ac_space
        n_layers = self._num_hid_layers
        width = self._hid_size
        fixed_var = self._gaussian_fixed_var

        # One float placeholder per observation key, batch dimension unspecified.
        self._obs = {
            ob_name: U.get_placeholder(
                name="ob_{}_primitive".format(ob_name),
                dtype=tf.float32,
                shape=[None] + ob_shape)
            for ob_name, ob_shape in self._ob_shape.items()
        }

        # Running statistics for every observation actually used.
        self.ob_rms = {}
        for ob_name in self.ob_type:
            with tf.variable_scope("ob_rms_{}".format(ob_name)):
                self.ob_rms[ob_name] = RunningMeanStd(shape=self._ob_shape[ob_name])

        standardized = []
        for ob_name in self.ob_type:
            rms = self.ob_rms[ob_name]
            standardized.append((self._obs[ob_name] - rms.mean) / rms.std)
        clipped = [tf.clip_by_value(tensor, -5.0, 5.0) for tensor in standardized]
        obz = tf.concat(clipped, -1)

        # Value function.
        with tf.variable_scope("vf"):
            hidden = obz
            for layer_idx in range(n_layers):
                dense_out = tf.layers.dense(
                    hidden, width, name="fc%i" % (layer_idx + 1),
                    kernel_initializer=U.normc_initializer(1.0))
                hidden = self._activation(dense_out)
            value_head = tf.layers.dense(
                hidden, 1, name="final",
                kernel_initializer=U.normc_initializer(1.0))
            self.vpred = value_head[:, 0]

        # Primitive policy.
        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype
        with tf.variable_scope("pol"):
            hidden = obz
            for layer_idx in range(n_layers):
                dense_out = tf.layers.dense(
                    hidden, width, name="fc%i" % (layer_idx + 1),
                    kernel_initializer=U.normc_initializer(1.0))
                hidden = self._activation(dense_out)

            if fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    hidden, pdtype.param_shape()[0] // 2, name="final",
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # `mean * 0.0 + logstd` broadcasts the single row across the batch.
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    hidden, pdtype.param_shape()[0], name="final",
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        # Action selection: True -> sample from the distribution, False -> mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        sampled_action = self.pd.sample()
        mode_action = self.pd.mode()
        ac = U.switch(stochastic, sampled_action, mode_action)

        self.obs = [self._obs[ob_name] for ob_name in self.ob_type]
        self._act = U.function([stochastic] + self.obs, [ac, self.vpred])
        self._value = U.function(self.obs, self.vpred)

Пример #22

0

Показать файл

Файл: mlp_policy.py Проект: yimingpeng/primal_dual_baseline

    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        """Build a tanh MLP value net and a linear policy on normalised inputs.

        The policy scope has no hidden layers: the distribution parameters are
        a single dense layer on the normalised observation. In the
        fixed-variance branch the second half of the flat parameters is a
        constant 0.05 (not a trainable variable). The flat parameters are
        clipped to [-10, 10] before the distribution is created.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of every hidden layer of the value net.
            num_hid_layers: number of hidden layers of the value net.
            gaussian_fixed_var: when True and ``ac_space`` is a Box, use the
                constant second half described above.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        self.std = tf.constant(1.0)
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            # Standardise and clip; the policy scope below reuses this tensor.
            # (An earlier polynomial / Fourier / RBF feature-expansion
            # experiment was disabled here; its unused scaffolding is removed.)
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(0.1))[:, 0]

        with tf.variable_scope('pol'):
            # No hidden layers: the output layer acts directly on obz.
            last_out = obz
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                # Constant row filled with 0.05.
                logstd = tf.multiply(
                    tf.ones(shape=[1, pdtype.param_shape()[0] // 2]),
                    tf.constant(0.05))
                # `mean * 0.0 + logstd` broadcasts the single row across the batch.
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        pdparam = tf.clip_by_value(pdparam, -10.0, 10.0)
        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # Boolean scalar: True -> sample from the distribution, False -> mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

Пример #23

0

Показать файл

    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        """Build Q-function, value-function and policy networks.

        Three tanh MLPs are created in the scopes ``qf``, ``vf`` and ``pol``.
        The Q-network reads the ``next_ob`` and ``act`` placeholders; the
        value and policy networks read ``ob``.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``; its shape also
                sizes the ``act`` placeholder.
            hid_size: width of every hidden layer.
            num_hid_layers: number of hidden layers in each network.
            gaussian_fixed_var: when True and ``ac_space`` is a Box, the second
                half of the flat parameters is derived from the free
                ``logstd`` variable instead of a network output.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        next_ob = U.get_placeholder(name="next_ob",
                                    dtype=tf.float32,
                                    shape=[sequence_length] +
                                    list(ob_space.shape))

        act = U.get_placeholder(name="act",
                                dtype=tf.float32,
                                shape=[sequence_length] + list(ac_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('qf'):
            # The Q-network input is the standardised, clipped *next* observation.
            obz = tf.clip_by_value(
                (next_ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = obz

            for i in range(num_hid_layers):
                # The action is concatenated just before the last hidden layer;
                # with num_hid_layers == 0 the loop never runs and `act` is unused.
                if i == num_hid_layers - 1:
                    last_out = tf.concat([last_out, act], axis=-1)
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.qpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('vf'):
            # `obz` is rebound here to the standardised *current* observation.
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            # out_std = tf.exp(0.5*logstd + 0.0)
            # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            # The policy reuses the `obz` assigned in the 'vf' scope (from `ob`,
            # not from `next_ob`).
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name='fc%i' % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
                import numpy as np
                # NOTE(review): np.random.randn is a plain NumPy call evaluated
                # once while the graph is being built, so `logstd` is scaled by
                # a random vector that stays fixed afterwards -- confirm this is
                # intended rather than per-step noise.
                pdparam = tf.concat([
                    mean, mean * 0.0 +
                    np.random.randn(pdtype.param_shape()[0] // 2) * logstd
                ],
                                    axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        # No recurrent state is exposed.
        self.state_in = []
        self.state_out = []

        # Boolean scalar: True -> sample from the distribution, False -> mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

Пример #24

0

Показать файл

Файл: mlp_policy_noscale.py Проект: wil3/openai-baseline

    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        """Build tanh MLP value and policy nets on the raw (unscaled) observation.

        ``self.ob_rms`` is still created, but the networks consume the
        placeholder directly without standardisation or clipping.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of every hidden layer.
            num_hid_layers: number of hidden layers in each network.
            gaussian_fixed_var: when True and ``ac_space`` is a Box, the second
                half of the flat parameters is a free ``logstd`` variable.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype

        # The leading dimension of the placeholder is left unspecified.
        ob = U.get_placeholder(name="ob", dtype=tf.float32,
                               shape=[None] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            # No scaling: the raw placeholder is the network input.
            obz = ob
            hidden = obz
            for layer_idx in range(num_hid_layers):
                dense_out = tf.layers.dense(
                    hidden, hid_size, name="fc%i" % (layer_idx + 1),
                    kernel_initializer=U.normc_initializer(1.0))
                hidden = tf.nn.tanh(dense_out)
            value_head = tf.layers.dense(
                hidden, 1, name='final',
                kernel_initializer=U.normc_initializer(1.0))
            self.vpred = value_head[:, 0]

        with tf.variable_scope('pol'):
            hidden = obz
            for layer_idx in range(num_hid_layers):
                dense_out = tf.layers.dense(
                    hidden, hid_size, name='fc%i' % (layer_idx + 1),
                    kernel_initializer=U.normc_initializer(1.0))
                hidden = tf.nn.tanh(dense_out)
            use_free_logstd = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
            if not use_free_logstd:
                pdparam = tf.layers.dense(
                    hidden, pdtype.param_shape()[0], name='final',
                    kernel_initializer=U.normc_initializer(0.01))
            else:
                mean = tf.layers.dense(
                    hidden, pdtype.param_shape()[0] // 2, name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # `mean * 0.0 + logstd` broadcasts the single row across the batch.
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

        # The distribution class is whatever make_pdtype chose for ac_space.
        self.pd = pdtype.pdfromflat(pdparam)

        # No recurrent state is exposed.
        self.state_in = []
        self.state_out = []

        # If `stochastic` is true the action is sampled from the distribution;
        # otherwise the distribution's mode is used.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        sampled_action = self.pd.sample()
        mode_action = self.pd.mode()
        ac = U.switch(stochastic, sampled_action, mode_action)
        self._act = U.function([stochastic, ob], [ac, self.vpred])

Пример #25

0

Показать файл

    def _build(self):
        """Build the meta policy that selects one of ``num_primitives`` options.

        Inputs are one placeholder per entry of ``self._ob_shape`` plus an
        int32 ``prev_primitive`` placeholder. The observations listed in
        ``self.ob_type`` are standardised, clipped to [-5, 5], concatenated,
        and extended with a one-hot encoding of the previous primitive.
        """
        n_primitives = self.num_primitives
        n_layers = self._num_hid_layers
        width = self._hid_size

        # One float placeholder per observation key, batch dimension unspecified.
        self._obs = {
            ob_name: U.get_placeholder(
                name="ob_{}".format(ob_name),
                dtype=tf.float32,
                shape=[None] + ob_shape)
            for ob_name, ob_shape in self._ob_shape.items()
        }
        prev_primitive = U.get_placeholder(
            name="prev_primitive", dtype=tf.int32, shape=[None])
        self._prev_primitive = prev_primitive

        with tf.variable_scope(self.name):
            self._scope = tf.get_variable_scope().name

            # Running statistics for every observation actually used.
            self.ob_rms = {}
            for ob_name in self.ob_type:
                with tf.variable_scope("ob_rms_{}".format(ob_name)):
                    self.ob_rms[ob_name] = RunningMeanStd(shape=self._ob_shape[ob_name])

            standardized = []
            for ob_name in self.ob_type:
                rms = self.ob_rms[ob_name]
                standardized.append((self._obs[ob_name] - rms.mean) / rms.std)
            clipped = [tf.clip_by_value(tensor, -5.0, 5.0) for tensor in standardized]
            obz = tf.concat(clipped, -1)

            # Append the one-hot encoded previous primitive to the features.
            prev_primitive_one_hot = tf.one_hot(
                prev_primitive, n_primitives, name="prev_primitive_one_hot")
            obz = tf.concat([obz, prev_primitive_one_hot], -1)

            # Value function.
            with tf.variable_scope("vf"):
                hidden = obz
                for layer_idx in range(n_layers):
                    dense_out = tf.layers.dense(
                        hidden, width, name="fc%d" % (layer_idx + 1),
                        kernel_initializer=U.normc_initializer(1.0))
                    hidden = self._activation(dense_out)
                value_head = tf.layers.dense(
                    hidden, 1, name="vpred",
                    kernel_initializer=U.normc_initializer(1.0))
                self.vpred = value_head[:, 0]

            # Meta policy: one output per primitive, fed to a categorical pd.
            with tf.variable_scope("pol"):
                hidden = obz
                for layer_idx in range(n_layers):
                    dense_out = tf.layers.dense(
                        hidden, width, name="fc%i" % (layer_idx + 1),
                        kernel_initializer=U.normc_initializer(1.0))
                    hidden = self._activation(dense_out)
                self.selector = tf.layers.dense(
                    hidden, n_primitives, name="action",
                    kernel_initializer=U.normc_initializer(0.01))
                pdtype = CategoricalPdType(n_primitives)
                self.pdtype = pdtype
                self.pd = pdtype.pdfromflat(self.selector)

        # Action selection: True -> sample from the distribution, False -> mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        sampled_action = self.pd.sample()
        mode_action = self.pd.mode()
        ac = U.switch(stochastic, sampled_action, mode_action)

        self.obs = [self._obs[ob_name] for ob_name in self.ob_type]
        act_inputs = [stochastic, self._prev_primitive] + self.obs
        self._act = U.function(act_inputs, [ac, self.vpred])

Пример #26

0

Показать файл

Файл: policies_bc.py Проект: mxgiuliani00/irl_real_life

    def __init__(self, observations, action_space, latent, optimizer=None, sess=None, train=True, beta=1.0,
                 l2=0., lr=0.001, init_scale=0.01, init_bias=0.0, trainable_variance=True, trainable_bias=True,
                 init_logstd=0., scope_name="pi", clip=None, state_dependent_variance=True, **tensors):
        """
        Build the action distribution on top of `latent` together with the
        gradient / debug / accuracy helpers that operate on it.

        Parameters:
        ----------
        observations    tensorflow placeholder in which the observations will be fed

        action_space    action space used to select the distribution type (via make_pdtype)

        latent          latent state from which policy distribution parameters should be inferred

        optimizer       optimizer to store on the policy (if None, Adam with learning rate `lr` is created)

        sess            tensorflow session to run calculations in (if None, default session is used)

        train           if True, `self._build_train` is called at the end of construction

        beta            forwarded to `pdfromlatent`; when it differs from 1.0, `self.prob` is computed
                        with `boltzmann` instead of a plain softmax

        l2              stored as `self.gamma` before `_build_train` is called

        lr              learning rate of the default Adam optimizer

        scope_name      scope whose trainable variables are collected into `self.vars`

        init_scale, init_bias, trainable_variance, trainable_bias, init_logstd, clip,
        state_dependent_variance
                        forwarded unchanged to `pdtype.pdfromlatent`

        **tensors       tensorflow tensors for additional attributes such as state or mask

        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        # Extra tensors become attributes of the policy object.
        self.__dict__.update(tensors)

        latent = tf.layers.flatten(latent)

        self.action_space = action_space
        self.pdtype = make_pdtype(action_space)
        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=init_scale,
                                                    init_bias=init_bias,
                                                    trainable_variance=trainable_variance,
                                                    state_dependent_variance=state_dependent_variance,
                                                    trainable_bias=trainable_bias,
                                                    init_logstd=init_logstd,
                                                    clip=clip, beta=beta)  # init_bias=0.0

        # Scalar boolean fed at run time: True -> sample from the distribution, False -> take its mode.
        self.stochastic = tf.placeholder(dtype=tf.bool, shape=())
        self.action = tf_util.switch(self.stochastic, self.pd.sample(), self.pd.mode())
        self.neglogp = self.pd.neglogp(self.action)
        if beta == 1.0:
            self.prob = tf.nn.softmax(self.pd.flatparam())
        else:
            self.prob = boltzmann(self.pd.flatparam(), beta=beta)
        if optimizer is None:
            self.optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        else:
            self.optimizer = optimizer
        self.sess = sess or tf.get_default_session()
        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope_name)
        # First try the construction that needs `action_space.n` (one-hot targets); if anything in
        # it fails, fall back to float targets and the gradient of the log-likelihood.
        # Only `Exception` is caught so that KeyboardInterrupt / SystemExit are no longer swallowed.
        try:
            self.action_ph = tf.placeholder(tf.int64, [None], name='targets_placeholder')
            self.action_selected = action_selected = tf.one_hot(self.action_ph, self.action_space.n)
            # out = tf.reduce_sum(tf.reduce_sum(tf.log(self.logits+1e-5)*action_selected, axis=1))
            out = tf.reduce_mean(tf.log(tf.reduce_sum(self.prob*action_selected, axis=1)))
            gradients = tf.gradients(out, self.vars)
        except Exception:
            self.action_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + action_space.shape,
                                            name='targets_placeholder')
            gradients = tf.gradients(-self.pd.neglogp(self.action_ph), self.vars)
        self.cont = cont = not isinstance(self.action_space, Discrete)

        # Returns [gradients, likelihood, log-likelihood, distribution mean] for (observations, actions).
        self.compute_gradients = tf_util.function(
            inputs=[self.X, self.action_ph],
            outputs=[gradients, tf.exp(- self.pd.neglogp(self.action_ph)), - self.pd.neglogp(self.action_ph),
                     self.pd.mean]
        )
        # self.compute_cont_gradients = tf_util.function(
        #     inputs=[self.X, self.action_ph],
        #     outputs=tf.gradients(-self.pd.neglogp(self.action_ph), self.vars)
        # )
        self.debug = tf_util.function(
            inputs=[self.X, self.action_ph],
            outputs=[gradients, self.prob, self.action_ph]
        )
        self.set_from_flat = tf_util.SetFromFlat(self.vars)
        if self.cont:
            # Non-discrete spaces: accuracy is R^2 of the distribution mean against the fed actions.
            total_error = tf.reduce_sum(tf.square(self.action_ph - tf.reduce_mean(self.action_ph, axis=0)), axis=0)
            unexplained_error = tf.reduce_sum(tf.square(self.action_ph - self.pd.mean), axis=0)
            R_squared = 1 - (unexplained_error / total_error)
            self.accuracy = accuracy = R_squared
        else:
            # Discrete spaces: accuracy is the fraction of fed actions equal to the distribution mode.
            self.accuracy = accuracy = tf.reduce_mean(tf.cast(tf.math.equal(self.pd.mode(), self.action_ph), tf.float32))
        self.entropy = entropy = tf.reduce_mean(self.pd.entropy())
        if train:
            self.gamma = l2
            self._build_train(cont=cont, state_dependent_variance=state_dependent_variance)
        self.pdf = tf.exp(self.pd.logp(self.action_ph))

Пример #27

0

Показать файл

    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, lstm_hid_size, kind):
        """Build a recurrent actor-critic graph fed by a physics vector and image frames.

        `ob_space` must be a tuple: element 0 describes the physics observation and
        element 1 the frame observation. Frames go through a small or large conv
        stack (selected by `kind`), then through one LSTM cell step; the LSTM output
        is concatenated with the normalized physics observation and fed to separate
        value ("vf") and policy ("pol") MLPs with `num_hid_layers` layers of
        `hid_size` units.

        Sets `pdtype`, `ob_rms`, `state_init`, `state_in`, `state_out`, `vpred`,
        `pd` and `_act` on `self`.

        Raises:
            NotImplementedError: if `kind` is neither 'small' nor 'large'.
        """
        print("This is lstm policy for only sensors.")
        assert isinstance(ob_space, tuple)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob_p = U.get_placeholder(name="ob_physics", dtype=tf.float32, shape=[sequence_length] + list(ob_space[0].shape))
        ob_f= U.get_placeholder(name="ob_frames", dtype=tf.float32, shape=[sequence_length]+list(ob_space[1].shape))

        # process ob_p: normalize with running statistics, then clip to [-5, 5]
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape = ob_space[0].shape)
        obpz = tf.clip_by_value((ob_p - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)


        # process ob_f: divide by 255 (presumably 8-bit pixel values -- confirm against caller)
        x = ob_f / 255.0

        if kind == 'small': # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large': # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        # LSTM layer for memory; the cell/hidden state is fed in and returned explicitly
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_hid_size, state_is_tuple=True, name = "rnn")
        # zero initial state for a batch of one
        c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
        h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
        self.state_init = (c_init, h_init)
        c_in = U.get_placeholder(name="state_c", dtype=tf.float32,shape=(None, lstm_cell.state_size.c))
        h_in = U.get_placeholder(name="state_h", dtype=tf.float32,shape=(None, lstm_cell.state_size.h))
        self.state_in = (c_in, h_in)

        state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
        # the cell is called once on the conv features (a single step, not an unrolled sequence)
        lstm_outputs, lstm_states = lstm_cell(x, state_in)
        lstm_c, lstm_h = lstm_states
        self.state_out = (lstm_c, lstm_h)

        rnn_out = tf.reshape(lstm_outputs, (-1, lstm_hid_size))

        # concatenate the LSTM output with the normalized physics observation
        ob_last = tf.concat((rnn_out, obpz), axis = -1)

        # value network (ReLU hidden layers, scalar output per row)
        with tf.variable_scope("vf"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.relu(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]

        # policy network (tanh hidden layers, flat distribution parameters as output)
        with tf.variable_scope("pol"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            logits = tf.layers.dense(last_out, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)

        # scalar flag: True -> sample an action, False -> take the distribution mode
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        # _act returns the action, the value prediction and the new LSTM state
        self._act = U.function([stochastic, ob_p, ob_f, c_in, h_in], [ac, self.vpred, lstm_c, lstm_h])

Пример #28

0

Показать файл

Файл: mlp_policy.py Проект: HineWAN/learning_event_triggered_control

    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0):
        """Build the two-option policy graph: value heads, option scores,
        termination head and action distribution.

        The observation is normalized twice: once in full (`obz`) and once with
        the last `ac_space.shape[0]` entries removed (`obz_pure`). Separate MLPs
        on the two views produce the value prediction and the option scores; the
        action policy is built on `obz_pure` only. `option[0]` selects, through
        `U.switch`, between the head built on `obz_pure` and the head built on
        `obz`, and between the policy action and the trailing slice of the
        observation.

        Sets `vpred`, `op_pi`, `tpred`, `pd`, `last_action` and the callables
        `_act`, `_get_v`, `get_term`, `get_tpred`, `get_vpred`, `_get_op` on `self`.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.ac_space_dim = ac_space.shape[0]
        self.ob_space_dim = ob_space.shape[0]
        self.dc = dc
        self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        # create a filter for the pure shape, meaning excluding u[k-1]
        obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope("obfilter_pure"):
            self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

        # normalized full observation, clipped to [-5, 5]
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        # normalized observation without its last ac_space_dim entries, clipped to [-5, 5]
        obz_pure = tf.clip_by_value(
            (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) /
            self.ob_rms_only.std, -5.0, 5.0)

        # --- value function: one MLP per option ---
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(
                U.dense(last_out0,
                        hid_size,
                        "vffc0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(
                U.dense(last_out1,
                        hid_size,
                        "vffc1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "vfff0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "vfff1",
                            weight_init=U.normc_initializer(1.0))

        #self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:,0]
        #last_out0 = tf.Print(last_out0,[tf.size(last_out0[:,0])])
        # only the first entry of `option` decides which value head is used for the whole batch
        self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

        #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        # --- policy over options: one scalar score per option, softmaxed together ---
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(
                U.dense(last_out0,
                        hid_size,
                        "oppi0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(
                U.dense(last_out1,
                        hid_size,
                        "oppi1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "oppif0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "oppif1",
                            weight_init=U.normc_initializer(1.0))
        last_out = tf.concat([last_out0, last_out1], 1)
        self.op_pi = tf.nn.softmax(last_out)

        # --- termination head: built on the (gradient-stopped) option scores ---
        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        #termination_sample = tf.greater(self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),maxval=1.))
        # sampling is disabled: the termination sample is the constant [True]
        termination_sample = tf.constant([True])

        # define the angle
        #ctrl_in = tf.reshape([(tf.math.atan2(ob[:,1],ob[:,0])),(ob[:,2])], [-1,2])
        #last_out = ctrl_in
        # --- action policy: MLP on the observation without its trailing action entries ---
        last_out = obz_pure
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01),
                            bias=False)
            # squash the mean into (-1, 1)
            mean = tf.nn.tanh(mean)
            # one state-independent log-std row per option
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            # `mean * 0.0 + ...` broadcasts the selected log-std to the shape of `mean`
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # scalar flag: True -> sample an action, False -> take the distribution mode
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        #ac = tf.Print (ac, [ac,option,ob], "action and option before selecting: ")
        # depending on option[0], use either the policy action or the trailing
        # ac_space_dim entries of the observation (gradient-stopped)
        ac = U.switch(option[0], ac,
                      tf.stop_gradient(ob[:, -self.ac_space_dim:]))
        ac = tf.clip_by_value(ac, -1.0, 1.0)
        #ac = U.switch(option[0], tf.constant(1.0), tf.constant(0.0))
        #ac = tf.Print (ac, [ac], "action after selection: ")
        self.last_action = tf.stop_gradient(ac)
        # NOTE(review): `logstd` is only assigned in the fixed-variance Box branch
        # above; if the else branch is taken, the next statement fails because the
        # local name is unbound.
        self._act = U.function([stochastic, ob, option],
                               [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op = U.function([ob], [self.op_pi])

Пример #29

0

Показать файл

    def _build(self):
        """Create value and policy networks for every context and expose the
        ones belonging to this policy's own context (`self._id`).

        For each context a separate set of observation placeholders is created,
        while the 'vf' and 'pol' variables are shared through AUTO_REUSE. Fills
        `self.obs` / `self.pds` (one entry per context) and sets `self.ob`,
        `self.pd`, `self.pdtype`, `self.vpred`, `self._act` and `self._value`.
        """
        action_space = self._ac_space
        layer_count = self._num_hid_layers
        layer_widths = self._hid_size
        fixed_variance = self._gaussian_fixed_var

        # Make the widths a list and pad it with its last entry so that every
        # hidden layer has a width (an existing list is extended in place).
        if not isinstance(layer_widths, list):
            layer_widths = [layer_widths]
        if len(layer_widths) != layer_count:
            layer_widths.extend([layer_widths[-1]] * (layer_count - len(layer_widths)))

        def hidden_stack(tensor):
            # Dense layers "fc1".."fcN" with the configured activation.
            for depth in range(layer_count):
                tensor = self._activation(
                    tf.layers.dense(
                        tensor,
                        layer_widths[depth],
                        name="fc%i" % (depth + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            return tensor

        self.obs = []
        self.pds = []

        for ctx in range(self._config.num_contexts):
            # One placeholder per observation entry, named after the context.
            placeholders = {}
            for key, shape in self._ob_shape.items():
                placeholders[key] = U.get_placeholder(
                    name="ob_{}/from_{}".format(key, ctx),
                    dtype=tf.float32,
                    shape=[None] + shape)

            # Optional normalisation with learned running statistics.
            if self._config.obs_norm == 'learn':
                inputs = []
                for key in self.ob_type:
                    stats = self.ob_rms[key]
                    inputs.append((placeholders[key] - stats.mean) / stats.std)
            else:
                inputs = [placeholders[key] for key in self.ob_type]

            clipped = [tf.clip_by_value(tensor, -5.0, 5.0) for tensor in inputs]
            features = tf.concat(clipped, -1)

            # Value head.
            with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
                value_hidden = hidden_stack(features)
                value_out = tf.layers.dense(
                    value_hidden,
                    1,
                    name="final",
                    kernel_initializer=U.normc_initializer(1.0))[:, 0]
                if ctx == self._id:
                    self.vpred = value_out

            # Policy head.
            pdtype = make_pdtype(action_space)
            if ctx == self._id:
                self.pdtype = pdtype
            with tf.variable_scope('pol', reuse=tf.AUTO_REUSE):
                policy_hidden = hidden_stack(features)

                if fixed_variance and isinstance(action_space, gym.spaces.Box):
                    # State-independent log-std, broadcast to the shape of the mean.
                    mean = tf.layers.dense(
                        policy_hidden,
                        pdtype.param_shape()[0] // 2,
                        name="final",
                        kernel_initializer=U.normc_initializer(0.01))
                    logstd = tf.get_variable(
                        name="logstd",
                        shape=[1, pdtype.param_shape()[0] // 2],
                        initializer=tf.zeros_initializer())
                    flat_params = tf.concat([mean, mean * 0.0 + logstd], axis=1)
                else:
                    flat_params = tf.layers.dense(
                        policy_hidden,
                        pdtype.param_shape()[0],
                        name="final",
                        kernel_initializer=U.normc_initializer(0.01))

            self.obs.append([placeholders[key] for key in self.ob_type])
            self.pds.append(pdtype.pdfromflat(flat_params))

        self.ob = self.obs[self._id]
        self.pd = self.pds[self._id]

        # Scalar flag: True -> sample an action, False -> take the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        action = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        act_inputs = [stochastic] + self.ob
        self._act = U.function(act_inputs, [action, self.vpred])
        self._value = U.function([stochastic] + self.ob, self.vpred)

Пример #30

0

Показать файл

Файл: policies.py Проект: ShikhaSurana/bp_tr-ppo-rb

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False,
                 name='policy', args=None): #pylint: disable=W0613
        """Build an MLP actor-critic with optionally shared layers and an
        optional next-observation prediction head.

        Parameters
        ----------
        sess        session used by the `step*` / `value` closures
        ob_space    observation space, passed to `observation_input`
        ac_space    action space, used to pick the distribution type
        nbatch      batch size passed to `observation_input`
        nsteps      unused
        reuse       forwarded to the enclosing `tf.variable_scope`
        name        name of the enclosing variable scope
        args        configuration object; the attributes read here are
                    `policy_variance_state_dependent`, `ac_fn`, `hidden_sizes`,
                    `num_sharing_layers`, `num_layers`, `logstd` and
                    `coef_predict_task`

        The first `num_sharing_layers` layers are shared; the remaining layers
        are built separately for the policy, the value function, the optional
        state-dependent log-std and the optional prediction head.
        """
        policy_variance_state_dependent = args.policy_variance_state_dependent
        ac_fn = args.ac_fn
        hidden_sizes = args.hidden_sizes
        num_sharing_layers = args.num_sharing_layers
        num_layers = args.num_layers
        assert ac_fn in ['tanh', 'sigmoid', 'relu']

        # Accept either a single width (repeated num_layers times) or an explicit list.
        if isinstance(hidden_sizes, int):
            assert num_layers is not None
            hidden_sizes = [hidden_sizes] * num_layers
        if num_layers is None:
            num_layers = len(hidden_sizes)
        assert num_layers == len(hidden_sizes)


        # print(f'Policy hidden_sizes:{hidden_sizes}')

        self.pdtype = make_pdtype(ac_space)

        with tf.variable_scope(name, reuse=reuse):
            X, processed_x = observation_input(ob_space, nbatch)

            activ = getattr( tf.nn, ac_fn )
            processed_x = tf.layers.flatten(processed_x)

            # --- share layers
            for ind_layer in range(num_sharing_layers):
                processed_x = activ( fc(processed_x, f'share_fc{ind_layer}', nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)) )

            # --- policy
            pi_h = processed_x
            for ind_layer in range( num_sharing_layers, num_layers ):
                pi_h = activ(fc(pi_h, f'pi_fc{ind_layer}', nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))

            from gym import spaces
            params_addtional = {}
            if policy_variance_state_dependent and isinstance( ac_space, spaces.Box ):
                # Separate branch producing the latent for a state-dependent log-std.
                latent_logstd = processed_x
                for ind_layer in range(num_sharing_layers, num_layers):
                    latent_logstd = activ(fc(latent_logstd, f'logstd_fc{ind_layer}', nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))
                params_addtional['latent_logstd'] = latent_logstd

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h, init_scale=0.01, logstd_initial=args.logstd, **params_addtional)


            # --- value function
            vf_h = processed_x
            for ind_layer in range( num_sharing_layers, num_layers ):
                vf_h = activ(fc(vf_h, f'vf_fc{ind_layer}', nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))
            vf = fc(vf_h, 'vf', 1)[:,0]



            a_sample = self.pd.sample()
            neglogp_sample = self.pd.neglogp(a_sample)
            self.initial_state = None


            # --- predict function
            # The action fed to the predictor is chosen at run time through A_type:
            #   'pl'  -> use the action placeholder (default)
            #   'det' -> use the deterministic action (distribution mode)
            #   other -> use the sampled action
            if args.coef_predict_task > 0:
                import tensorflow.contrib.distributions as dists
                # `spaces` is imported just above; a bare `Box` is not guaranteed to be in scope.
                assert isinstance( ac_space, spaces.Box ), 'Only Implement for Box action space'
                # placeholder_with_default takes (input, shape); the dtype comes from the default value.
                A_type = tf.placeholder_with_default('pl', shape=())
                A_pl = self.pdtype.sample_placeholder([None])
                self.A = A_pl
                self.A_type = A_type

                A_input_1 = U.switch( tf.equal( A_type, 'det' ), self.pd.mode(), a_sample )
                A_input = U.switch( tf.equal( A_type, 'pl' ), A_pl,A_input_1)
                # Join observation features and action along the feature axis (axis is mandatory).
                predict_h = tf.concat( (processed_x, A_input), axis=1)
                for ind_layer in range(num_sharing_layers, num_layers):
                    predict_h = activ(fc(predict_h, f'predict_fc{ind_layer}', nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))
                # Own scope name for the output layer: reusing the last hidden layer's
                # name would try to create the same variables twice.
                predict_mean = fc(predict_h, 'predict_mean', nh=ob_space.shape[0], init_scale=np.sqrt(2))

                # Identity matrix as the initial lower-triangular scale; np.identity takes the size n.
                predict_cov_init_value = np.identity( ob_space.shape[0], dtype=np.float32 )
                predict_cov = tf.get_variable( name='predict_cov', shape=predict_cov_init_value.shape, initializer=tf.constant_initializer(predict_cov_init_value) )
                predict_dist = dists.MultivariateNormalTriL( predict_mean, predict_cov )
                self.predict_dist = predict_dist

            scope_model = tf.get_variable_scope().name
            self.variables_all = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope_model)
            self.variables_trainable = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope_model)


        #--- set logstd
        # if isinstance( ac_space, Box ):
        # if not policy_variance_state_dependent:
        #     logstd_pl, _ = observation_input( ac_space, batch_size=1, name='ac' )
        #     assign_logstd = tf.assign( self.pdtype.logstd, logstd_pl )
        #     set_logstd_entity = U.function([logstd_pl], assign_logstd)
        #     def set_logstd(logstd_new):
        #         # if isinstance( logstd_new, float  ):
        #         #     logstd_new = [[logstd_new] * ac_space.shape[0]]
        #         set_logstd_entity(logstd_new)
        #     self.set_logstd = set_logstd
        # self.get_logstd = U.function([], self.pdtype.logstd)

        def step(ob, *_args, **_kwargs):
            # Sampled action, value estimate, (empty) recurrent state, neg-log-prob of the action.
            a, v, neglogp = sess.run([a_sample, vf, neglogp_sample], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        def step_policyflat(ob, *_args, **_kwargs):
            # Like `step`, but additionally returns the flat distribution parameters.
            a, v, neglogp, polciyflat = sess.run([a_sample, vf, neglogp_sample, self.pd.flatparam()], {X:ob}) #TODO: TEST flat for discrete action space
            return a, v, self.initial_state, neglogp, polciyflat

        def step_test(ob, *_args, **_kwargs):
            # Deterministic action (mode); returned as a one-element list because a list is fetched.
            a = sess.run([self.pd.mode()], {X:ob})
            return a

        self.X = X
        self.vf = vf
        self.step = step
        self.step_policyflat = step_policyflat
        self.value = value
        self.step_test = step_test

Пример #31

0

Показать файл

Файл: mlp_policy.py Проект: kkhetarpal/ioc

    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0,
              w_intfc=True):
        """Assemble the option-critic graph with interest functions.

        Five independent tanh MLPs are built on the normalized observation: the
        per-option value head, the termination head, the action policy, the
        interest function (sigmoid, one output per option) and the policy over
        options (softmax). Results are exposed as `vpred`, `tpred`, `pd`,
        `intfc`, `op_pi` and as the callables `_act`, `get_term`, `get_tpred`,
        `get_vpred`, `_get_op_int`, `_get_intfc` and `_get_op`.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.w_intfc = w_intfc
        self.state_in = []
        self.state_out = []
        self.dc = dc
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        obs_shape = [sequence_length] + list(ob_space.shape)
        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=obs_shape)
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # Observation normalized with running statistics and clipped to [-5, 5].
        standardized = (ob - self.ob_rms.mean) / self.ob_rms.std
        obz = tf.clip_by_value(standardized, -5.0, 5.0)

        def tanh_stack(name_format):
            # num_hid_layers tanh layers on obz; layers are numbered from 1.
            hidden = obz
            for layer_no in range(1, num_hid_layers + 1):
                hidden = tf.nn.tanh(
                    U.dense(hidden,
                            hid_size,
                            name_format % layer_no,
                            weight_init=U.normc_initializer(1.0)))
            return hidden

        # Value head: one value per option, selected by `option`.
        value_hidden = tanh_stack("vffc%i")
        self.vpred = dense3D2(value_hidden,
                              1,
                              "vffinal",
                              option,
                              num_options=num_options,
                              weight_init=U.normc_initializer(1.0))[:, 0]

        # Termination head: the final layer sees gradient-stopped features.
        term_hidden = tanh_stack("termfc%i")
        term_logits = dense3D2(tf.stop_gradient(term_hidden),
                               1,
                               "termhead",
                               option,
                               num_options=num_options,
                               weight_init=U.normc_initializer(1.0))
        self.tpred = tf.nn.sigmoid(term_logits)[:, 0]
        # Terminate where the predicted probability exceeds a uniform draw.
        uniform_draw = tf.random_uniform(shape=tf.shape(self.tpred), maxval=1.)
        termination_sample = tf.greater(self.tpred, uniform_draw)

        # Action policy.
        policy_hidden = tanh_stack("polfc%i")
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(policy_hidden,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01))
            # One state-independent log-std row per option.
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            # `mean * 0.0 + ...` broadcasts the selected log-std to the shape of `mean`.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(policy_hidden,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(pdparam)
        # Scalar flag: True -> sample an action, False -> take the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        # Interest function: a sigmoid output per option.
        interest_hidden = tanh_stack("intfc%i")
        self.intfc = tf.sigmoid(
            U.dense(interest_hidden,
                    num_options,
                    "intfcfinal",
                    weight_init=U.normc_initializer(1.0)))

        # Policy over options: softmax over per-option scores.
        option_hidden = tanh_stack("OP%i")
        self.op_pi = tf.nn.softmax(
            U.dense(option_hidden,
                    num_options,
                    "OPfinal",
                    weight_init=U.normc_initializer(1.0)))

        self._act = U.function([stochastic, ob, option], [ac])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
        self._get_intfc = U.function([ob], [self.intfc])
        self._get_op = U.function([ob], [self.op_pi])

Python switch примеры использования