Example #1
    def _create_logit_value(self,
                            action_layer,
                            value_layer,
                            gaussian_fixed_var=False):
        # actor
        if gaussian_fixed_var and isinstance(self.ac_space, gym.spaces.Box):
            mean = U.dense(action_layer,
                           self.pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, self.pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(action_layer,
                              self.pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        self.pd = self.pdtype.pdfromflat(pdparam)
        self.ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

        # critic
        self.vpred = U.dense(value_layer,
                             1,
                             "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]
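A note on the mean * 0.0 + logstd idiom above (it recurs in most of these examples): adding mean * 0.0 broadcasts the (1, d) logstd variable to the batch dimension of mean, so the two halves of pdparam have matching shapes. A minimal NumPy sketch of the same broadcast; the names here are illustrative:

import numpy as np

batch, d = 4, 3
mean = np.random.randn(batch, d).astype(np.float32)  # per-sample action means
logstd = np.zeros((1, d), dtype=np.float32)          # one shared log-std row

# mean * 0.0 + logstd tiles logstd across the batch without an explicit tile op
tiled = mean * 0.0 + logstd
assert tiled.shape == (batch, d)

pdparam = np.concatenate([mean, tiled], axis=1)      # flat [mean | logstd] layout
assert pdparam.shape == (batch, 2 * d)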
Example #2
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, rnn_hid_units, gaussian_fixed_var=True):
        #assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        # Apply an RNN to reduce the observation history
        with tf.variable_scope("vf"):
            last_out = self.rnn(ob, ob_space.shape[0], rnn_hid_units)
            for i in range(num_hid_layers):
                last_out = U.dense(last_out, hid_size, "vf_dense%i"%i, weight_init=U.normc_initializer(1.0))
            self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]

        # Apply an RNN to reduce the observation history
        with tf.variable_scope("pf"):
            last_out = self.rnn(ob, ob_space.shape[0], rnn_hid_units)
            for i in range(num_hid_layers):
                last_out = U.dense(last_out, hid_size, "pf_dense%i"%i, weight_init=U.normc_initializer(1.0))

            assert gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
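U.normc_initializer, used for every dense layer in these examples, is (in the OpenAI baselines utilities) a column-normalized random initializer: Gaussian weights rescaled so each column has a fixed norm. A hedged NumPy re-implementation, assuming the baselines definition:

import numpy as np

def normc_init(shape, std=1.0):
    # sample Gaussian weights, then rescale each column to norm `std`
    out = np.random.randn(*shape).astype(np.float32)
    out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
    return out

w = normc_init((64, 32), std=0.01)  # e.g. the "polfinal" layer scale above
assert np.allclose(np.linalg.norm(w, axis=0), 0.01, atol=1e-5)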
Example #3
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out,
                             1,
                             "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out,
                           pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
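The line ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) chooses between exploration (a sample from the diagonal Gaussian) and exploitation (its mode, which for a Gaussian is the mean). A small NumPy sketch of the two branches, assuming the diagonal-Gaussian parameterization built above:

import numpy as np

def act(mean, logstd, stochastic, rng=np.random.default_rng(0)):
    # sample ~ N(mean, exp(logstd)^2) when stochastic, else return the mode (= mean)
    if stochastic:
        return mean + np.exp(logstd) * rng.standard_normal(mean.shape)
    return mean

mean, logstd = np.zeros(3), np.zeros(3)
print(act(mean, logstd, stochastic=True))   # exploratory action
print(act(mean, logstd, stochastic=False))  # deterministic action (all zeros)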
Example #4
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
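RunningMeanStd maintains running estimates of the observation mean and standard deviation for the obz normalization above. A minimal sketch of such a filter using the standard parallel-variance update; this illustrates the idea, not the exact baselines class:

import numpy as np

class RunningMeanStdSketch:
    def __init__(self, shape):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = 1e-4  # avoids division by zero on the first update

    def update(self, x):
        # fold a batch's statistics into the running ones (parallel variance formula)
        b_mean, b_var, b_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = b_mean - self.mean
        tot = self.count + b_count
        self.mean = self.mean + delta * b_count / tot
        self.var = (self.var * self.count + b_var * b_count
                    + delta**2 * self.count * b_count / tot) / tot
        self.count = tot

    @property
    def std(self):
        return np.sqrt(self.var)

rms = RunningMeanStdSketch(shape=(3,))
rms.update(np.random.randn(128, 3) * 2.0 + 5.0)
obz = np.clip((np.random.randn(4, 3) - rms.mean) / rms.std, -5.0, 5.0)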
Example #5
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, activation='tanh', gaussian_fixed_var=True, keep=1.0):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob_shape = OBSERVATION_DIM if PREPROCESS else ob_space.shape[0]
        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length, ob_shape])

        if activation == 'tanh':
            activ = tf.nn.tanh
        elif activation == 'elu':
            activ = tf.nn.elu
        elif activation == 'lrelu':
            activ = lambda x: tf.maximum(x, 0.01 * x)
        else:
            raise NotImplementedError("Unsupported activation: " + activation)

        if PREPROCESS:
            last_out = ob
        else:
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=ob_space.shape)
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = obz

        for i in range(num_hid_layers):
            last_out = activ(U.dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
            last_out = tf.nn.dropout(last_out, keep_prob=keep, name="vdrop%i" % (i + 1))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = ob
        for i in range(num_hid_layers):
            last_out = activ(U.dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
            last_out = tf.nn.dropout(last_out, keep_prob=keep, name="pdrop%i" % (i + 1))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
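The lrelu branch in Example #5 writes leaky ReLU as tf.maximum(x, 0.01 * x). A quick NumPy check that this matches the usual piecewise definition:

import numpy as np

x = np.linspace(-2, 2, 9)
lrelu_max = np.maximum(x, 0.01 * x)             # the formulation used above
lrelu_piecewise = np.where(x > 0, x, 0.01 * x)  # textbook piecewise form
assert np.allclose(lrelu_max, lrelu_piecewise)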
Example #6
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        nbatch = nenv*nsteps
        ob_shape = (nbatch, ob_space.shape[0]*nstack)
        nact = ac_space.shape[0]
        X = tf.placeholder(tf.float32, ob_shape) #obs
        self.pdtype = pdtype = make_pdtype(ac_space)
        with tf.variable_scope("obfilter", reuse=reuse):
            self.ob_rms = RunningMeanStd(shape=ob_shape[1:])
        with tf.variable_scope("retfilter", reuse=reuse):
            self.ret_rms = RunningMeanStd(shape=(1,))

        obz = tf.clip_by_value((X - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        #obz = X

        with tf.variable_scope("model", reuse=reuse):
            h1 = tf.nn.tanh(dense(obz, 128, "fc1", weight_init=U.normc_initializer(1.0), bias_init=0.0))
            h2 = tf.nn.tanh(dense(h1, 128, "fc2", weight_init=U.normc_initializer(1.0), bias_init=0.0))
            h3 = tf.nn.tanh(dense(h2, 128, "fc3", weight_init=U.normc_initializer(1.0), bias_init=0.0))

            mean = dense(h3, nact, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0)
            logstd = tf.get_variable("logstd", [nact], tf.float32, tf.zeros_initializer())
            logstd = tf.expand_dims(logstd, 0)
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
            vf = dense(h3, 1, "v", weight_init=U.normc_initializer(1.0), bias_init=0.0)

        v0 = vf[:, 0]
        self.pd = pdtype.pdfromflat(pdparam)
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        a0 = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        self.initial_state = [] #not stateful

        def step(stoch, ob, *_args, **_kwargs):
            a, v = sess.run([a0, v0], {stochastic:stoch, X:ob})
            return a, v, [] #dummy state

        def value(ob, *_args, **_kwargs):
            return sess.run(v0, {X:ob})

        self.X = X
        self.vf = vf
        self.vnorm = (self.vf - self.ret_rms.mean) / self.ret_rms.std
        self.step = step
        self.value = value
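Example #6 additionally keeps a running filter over returns (ret_rms) and exposes vnorm, the value prediction expressed in normalized-return units. A sketch of the normalize/denormalize round trip, with illustrative numbers standing in for ret_rms.mean and ret_rms.std:

import numpy as np

ret_mean, ret_std = 10.0, 4.0      # stand-ins for ret_rms.mean / ret_rms.std
vf = np.array([12.0, 6.0, 10.0])   # raw value predictions

vnorm = (vf - ret_mean) / ret_std  # normalized values, as in self.vnorm
recovered = vnorm * ret_std + ret_mean
assert np.allclose(recovered, vf)  # denormalization recovers the raw values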
Example #7
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, num_options=2,dc=0):
        assert isinstance(ob_space, gym.spaces.Box)

        # determine the dimensions of the action space and observation space
        self.ac_space_dim = ac_space.shape[0]
        self.ob_space_dim = ob_space.shape[0]
        self.dc = dc
        self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        # create a filter for the pure shape, meaning excluding u[k-1]
        obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim),)

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope("obfilter_pure"):
            self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        obz_pure = tf.clip_by_value((ob[:,:-self.ac_space_dim] - self.ob_rms_only.mean) / self.ob_rms_only.std, -5.0, 5.0)

        # define the Q-function network here
        last_out0 = obz_pure # for option 0
        last_out1 = obz_pure # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(U.dense(last_out0, hid_size, "vffc0%i"%(i+1), weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(U.dense(last_out1, hid_size, "vffc1%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0, 1, "vfff0", weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1, 1, "vfff1", weight_init=U.normc_initializer(1.0))

        # return the Q function value Q(s,o)
        self.vpred = U.switch(option[0], last_out1, last_out0)[:,0]

        # define the policy over options here
        last_out0 = obz_pure # for option 0
        last_out1 = obz_pure # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(U.dense(last_out0, hid_size, "oppi0%i"%(i+1), weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(U.dense(last_out1, hid_size, "oppi1%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0, 1, "oppif0", weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1, 1, "oppif1", weight_init=U.normc_initializer(1.0))
        last_out = tf.concat([last_out0, last_out1], 1)
        # return the probabilities for executing the options
        self.op_pi = tf.nn.softmax(last_out)

        self.tpred = tf.nn.sigmoid(dense3D2(tf.stop_gradient(last_out), 1, "termhead", option, num_options=num_options, weight_init=U.normc_initializer(1.0)))[:,0]
        # we always terminate
        termination_sample = tf.constant([True])
        
        # implement the intra option policy
        last_out = obz_pure
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out, pdtype.param_shape()[0]//2, "polfinal", option, num_options=num_options, weight_init=U.normc_initializer(0.01),bias=False)
            mean = tf.nn.tanh(mean)
            logstd = tf.get_variable(name="logstd", shape=[num_options, 1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # now we never perform the ZOH, both policies are fully functional
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        ac = tf.clip_by_value(ac, -1.0, 1.0)

        self.last_action = tf.stop_gradient(ac)
        self._act = U.function([stochastic, ob, option], [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])        
        self._get_op = U.function([ob], [self.op_pi])
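The Q-function in Example #7 is built from two separate value heads and selected by the first option index, one option for the whole batch. A NumPy sketch of that selection:

import numpy as np

def q_value(option, head0, head1):
    # mirrors U.switch(option[0], last_out1, last_out0)[:, 0]:
    # option 0 -> head0, any nonzero option -> head1, then drop the trailing dim
    chosen = head1 if option[0] else head0
    return chosen[:, 0]

head0 = np.array([[1.0], [2.0]])
head1 = np.array([[5.0], [6.0]])
print(q_value(np.array([0]), head0, head1))  # [1. 2.]
print(q_value(np.array([1]), head0, head1))  # [5. 6.]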
Example #8
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0,
              w_intfc=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.w_intfc = w_intfc
        self.state_in = []
        self.state_out = []
        self.dc = dc
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = dense3D2(last_out,
                              1,
                              "vffinal",
                              option,
                              num_options=num_options,
                              weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "termfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        termination_sample = tf.greater(
            self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),
                                          maxval=1.))

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(pdparam)
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        # self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OP", weight_init=U.normc_initializer(1.0)))
        # pdb.set_trace()
        # self.op_pi = tf.constant(1./num_options)

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "intfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.intfc = tf.sigmoid(
            U.dense(last_out,
                    num_options,
                    "intfcfinal",
                    weight_init=U.normc_initializer(1.0)))

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "OP%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.op_pi = tf.nn.softmax(
            U.dense(last_out,
                    num_options,
                    "OPfinal",
                    weight_init=U.normc_initializer(1.0)))

        self._act = U.function([stochastic, ob, option], [ac])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
        self._get_intfc = U.function([ob], [self.intfc])
        self._get_op = U.function([ob], [self.op_pi])
Example #9
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0):
        assert isinstance(ob_space, gym.spaces.Box)

        self.ac_space_dim = ac_space.shape[0]
        self.ob_space_dim = ob_space.shape[0]
        self.dc = dc
        self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        # create a filter for the pure shape, meaning excluding u[k-1]
        obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope("obfilter_pure"):
            self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        obz_pure = tf.clip_by_value(
            (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) /
            self.ob_rms_only.std, -5.0, 5.0)

        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(
                U.dense(last_out0,
                        hid_size,
                        "vffc0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(
                U.dense(last_out1,
                        hid_size,
                        "vffc1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "vfff0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "vfff1",
                            weight_init=U.normc_initializer(1.0))

        #self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:,0]
        #last_out0 = tf.Print(last_out0,[tf.size(last_out0[:,0])])
        self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

        #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(
                U.dense(last_out0,
                        hid_size,
                        "oppi0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(
                U.dense(last_out1,
                        hid_size,
                        "oppi1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "oppif0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "oppif1",
                            weight_init=U.normc_initializer(1.0))
        last_out = tf.concat([last_out0, last_out1], 1)
        self.op_pi = tf.nn.softmax(last_out)

        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        #termination_sample = tf.greater(self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),maxval=1.))
        termination_sample = tf.constant([True])

        # define the angle
        #ctrl_in = tf.reshape([(tf.math.atan2(ob[:,1],ob[:,0])),(ob[:,2])], [-1,2])
        #last_out = ctrl_in
        last_out = obz_pure
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01),
                            bias=False)
            mean = tf.nn.tanh(mean)
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        #ac = tf.Print (ac, [ac,option,ob], "action and option before selecting: ")
        ac = U.switch(option[0], ac,
                      tf.stop_gradient(ob[:, -self.ac_space_dim:]))
        ac = tf.clip_by_value(ac, -1.0, 1.0)
        #ac = U.switch(option[0], tf.constant(1.0), tf.constant(0.0))
        #ac = tf.Print (ac, [ac], "action after selection: ")
        self.last_action = tf.stop_gradient(ac)
        self._act = U.function([stochastic, ob, option],
                               [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op = U.function([ob], [self.op_pi])
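The 'pure' observation filter above assumes the last ac_space_dim entries of each observation hold the previous control input u[k-1]; ob[:, :-self.ac_space_dim] strips them off before normalization. A minimal slicing sketch with made-up dimensions:

import numpy as np

ob_dim, ac_dim = 5, 2
ob = np.arange(10, dtype=np.float32).reshape(2, ob_dim)

pure = ob[:, :-ac_dim]         # state portion, excluding u[k-1]
last_action = ob[:, -ac_dim:]  # the appended previous action
assert pure.shape == (2, ob_dim - ac_dim)
assert last_action.shape == (2, ac_dim)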
Example #10
    def _init(self, ob_space, ac_space, hid_size_V, hid_size_actor,
              num_hid_layers, V_keep_prob, pol_keep_prob, mc_samples,
              layer_norm, activation_critic, activation_actor, dropout_on_V,
              dropout_on_policy, tau, gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)
        self.dropout_on_policy = dropout_on_policy
        # self.pdtype = pdtype = make_pdtype(ac_space, dropout_on_policy)
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz

        self.mc_samples = mc_samples
        self.pol_keep_prob = pol_keep_prob
        self.V_keep_prob = V_keep_prob

        ### MAIN CHANGES
        #######################
        # Value function

        with tf.variable_scope("value_function"):

            dropout_networks = [last_out] * self.mc_samples
            #            dropout_networks = generate_dropout_layer(lambda x: x, dropout_networks, self.pol_keep_prob)

            for i in range(num_hid_layers):
                if layer_norm:
                    last_out = activation_critic(
                        tc.layers.layer_norm(tf.layers.dense(
                            last_out,
                            hid_size_V,
                            name="vffc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)),
                                             center=True,
                                             scope="vffc_activation%i" % (i + 1),
                                             scale=True))

                    apply_layer = lambda x: activation_critic(
                        tc.layers.layer_norm(tf.layers.dense(
                            x, hid_size_V, name="vffc%i" %
                            (i + 1), reuse=True),
                                             center=True,
                                             scope="vffc_activation%i" %
                                             (i + 1),
                                             scale=True,
                                             reuse=True))
                else:
                    last_out = activation_critic(
                        tf.layers.dense(
                            last_out,
                            hid_size_V,
                            name="vffc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))

                    apply_layer = lambda x: activation_critic(
                        tf.layers.dense(
                            x, hid_size_V, name="vffc%i" %
                            (i + 1), reuse=True))

                dropout_networks = generate_dropout_layer(
                    apply_layer, dropout_networks, self.V_keep_prob)

            ## final layer
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name="vffinal",
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

            apply_layer = lambda x: tf.layers.dense(
                x, 1, activation=None, name="vffinal", reuse=True)[:, 0]
            dropout_networks = generate_dropout_layer(apply_layer,
                                                      dropout_networks,
                                                      self.V_keep_prob)

            self.vpred_mc_mean = tf.add_n(dropout_networks) / float(
                len(dropout_networks))
            self.vpred_dropout_networks = dropout_networks

        #######################
        ## Policy
        last_out = obz
        with tf.variable_scope("policy"):
            if not self.dropout_on_policy:
                for i in range(num_hid_layers):
                    last_out = U.dense(last_out,
                                       hid_size_actor,
                                       "polfc%i" % (i + 1),
                                       weight_init=U.normc_initializer(1.0))
                    last_out = activation_actor(last_out)
                if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                    mean = U.dense(last_out,
                                   pdtype.param_shape()[0] // 2, "polfinal",
                                   U.normc_initializer(0.01))
                    logstd = tf.get_variable(
                        name="logstd",
                        shape=[1, pdtype.param_shape()[0] // 2],
                        initializer=tf.zeros_initializer())
                    pdparam = U.concatenate([mean, mean * 0.0 + logstd],
                                            axis=1)
                else:
                    pdparam = U.dense(last_out,
                                      pdtype.param_shape()[0], "polfinal",
                                      U.normc_initializer(0.01))
                self.pd = pdtype.pdfromflat(pdparam)
            else:
                dropout_networks = [last_out] * mc_samples
                dropout_networks = generate_dropout_layer(
                    lambda x: x, dropout_networks, 1.0)

                for i in range(num_hid_layers):
                    last_out = activation_actor(
                        tf.layers.dense(
                            last_out,
                            hid_size_actor,
                            activation=None,
                            name="polfc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0),
                            bias_initializer=tf.zeros_initializer()))
                    apply_layer = lambda x: activation_actor(
                        tf.layers.dense(
                            x,
                            hid_size_actor,
                            activation=None,
                            name="polfc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0),
                            bias_initializer=tf.zeros_initializer(),
                            reuse=True))
                    dropout_networks = generate_dropout_layer(
                        apply_layer, dropout_networks, pol_keep_prob)

                net = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name="polfinal",
                    activation=None,
                    kernel_initializer=U.normc_initializer(0.01))
                apply_layer = lambda x: tf.layers.dense(
                    x,
                    pdtype.param_shape()[0] // 2,
                    activation=None,
                    name="polfinal",
                    kernel_initializer=U.normc_initializer(0.01),
                    reuse=True)
                dropout_networks = generate_dropout_layer(
                    apply_layer, dropout_networks, pol_keep_prob)

                self.pd = pdtype.pdfromflat(dropout_networks, tau)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        last_out = obz

        ### MAIN CHANGES
        ## if dropout:

        if dropout_on_V:
            vact = self.vpred_mc_mean
        else:
            vact = self.vpred

        if dropout_on_policy:
            self._actsfunc = [
                U.function([ob], [x, vact]) for x in dropout_networks
            ]

            self._act = self.dropout_act
        else:
            self._actfunc = U.function([stochastic, ob], [ac, vact])
            self._act = self.reg_act
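The value head in Example #10 averages mc_samples dropout forward passes into vpred_mc_mean, a Monte-Carlo dropout estimate. The generate_dropout_layer helper is not shown in these examples, so the sketch below uses a stand-in inverted-dropout function to illustrate the averaging:

import numpy as np

rng = np.random.default_rng(0)

def dropout(x, keep_prob):
    # inverted dropout: zero units with prob (1 - keep_prob), rescale the rest
    mask = rng.random(x.shape) < keep_prob
    return x * mask / keep_prob

def mc_value(x, weight, keep_prob, mc_samples):
    # average several stochastic forward passes through the same weights
    preds = [float(dropout(x, keep_prob) @ weight) for _ in range(mc_samples)]
    return sum(preds) / len(preds)

x = rng.standard_normal(8)
w = rng.standard_normal(8)
print(mc_value(x, w, keep_prob=0.9, mc_samples=32))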
Example #11
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              gmm_comp=1,
              mirror_loss=False,
              observation_permutation=[],
              action_permutation=[]):
        assert isinstance(ob_space, gym.spaces.Box)
        if mirror_loss:
            assert gaussian_fixed_var  # assume fixed std for now

        # pd = probability distribution over possible actions
        self.pdtype = pdtype = make_pdtype(ac_space, gmm_comp)
        sequence_length = None
        self.mirror_loss = mirror_loss
        if mirror_loss:
            # construct signed permutation matrices that mirror observations
            # and actions (e.g. so paired limbs learn symmetric behavior)
            obs_perm_mat = np.zeros(
                (len(observation_permutation), len(observation_permutation)),
                dtype=np.float32)
            act_perm_mat = np.zeros(
                (len(action_permutation), len(action_permutation)),
                dtype=np.float32)
            # entry i of each permutation list names the source index; its sign
            # flips the feature (right-multiplying by the matrix permutes columns)
            for i, perm in enumerate(observation_permutation):
                obs_perm_mat[i][int(np.abs(perm))] = np.sign(perm)
            for i, perm in enumerate(action_permutation):
                act_perm_mat[i][int(np.abs(perm))] = np.sign(perm)

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(
                shape=ob_space.shape
            )  #obs gathered from the env, use the Root Means Square of obs matrix as input to NN

        # z-score the observations, Z = (X - μ) / σ, then clip to [-5, 5]
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz  # input to the value network
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        # value head: predicts the value V of the state (the value function)
        self.vpred = U.dense(last_out,
                             1,
                             "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = obz  # input to the policy network
        params = []
        for i in range(num_hid_layers):
            # policy network: maps observations (the state) to action parameters;
            # keep the layer weights/biases for the mirrored pass below
            rt, pw, pb = U.dense_wparams(
                last_out,
                hid_size,
                "polfc%i" % (i + 1),
                weight_init=U.normc_initializer(1.0))
            last_out = tf.nn.tanh(rt)
            params.append([pw, pb])
        if gaussian_fixed_var and isinstance(ac_space,
                                             gym.spaces.Box):  # usually True
            if gmm_comp == 1:  # usually True
                # final layer: the mean of the Gaussian distribution over actions
                mean, pw, pb = U.dense_wparams(
                    last_out,
                    pdtype.param_shape()[0] // 2, "polfinal",
                    U.normc_initializer(0.01))
                params.append([pw, pb])
                self.mean = mean
                # log standard deviation of the action distribution
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # flat parameters [mean | logstd] of the action distribution
                pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
            else:
                means = U.dense(last_out,
                                (pdtype.param_shape()[0] - gmm_comp) // 2,
                                "polfinal", U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    initializer=tf.constant(
                        np.ones((1, (pdtype.param_shape()[0] - gmm_comp) // 2),
                                dtype=np.float32) * (-1.0)))
                weights = tf.nn.softmax(
                    U.dense(last_out, gmm_comp, "gmmweights",
                            U.normc_initializer(0.01)))
                pdparam = U.concatenate([means, means * 0.0 + logstd, weights],
                                        axis=1)
        elif gmm_comp == 1:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))
        else:
            meanstd = U.dense(last_out,
                              pdtype.param_shape()[0] - gmm_comp, "polfinal",
                              U.normc_initializer(0.01))
            weights = tf.nn.softmax(
                U.dense(last_out, gmm_comp, "gmmweights",
                        U.normc_initializer(0.01)))
            pdparam = U.concatenate([meanstd, weights], axis=1)

        if mirror_loss:
            # for the mirror loss the input is the permuted (mirrored) observation;
            # per the paper, gait symmetry is encouraged by comparing actions
            # (rather than states), avoiding the delayed-reward issue
            mirrored_obz = tf.matmul(obz, obs_perm_mat)
            last_val = mirrored_obz
            # re-run the policy layers on the mirrored observation with shared weights
            for i in range(len(params) - 1):
                last_val = tf.nn.tanh(
                    tf.matmul(last_val, params[i][0]) + params[i][1])
            # mean action in the mirrored state, mapped back through the action permutation
            mean_mir_obs = tf.matmul(last_val, params[-1][0]) + params[-1][1]
            self.mirrored_mean = tf.matmul(mean_mir_obs, act_perm_mat)

        if gmm_comp == 1:  # usually True
            self.pd = pdtype.pdfromflat(
                pdparam)  # build the distribution from the flat parameters
        else:
            self.pd = pdtype.pdfromflat([pdparam, gmm_comp])

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(
            stochastic, self.pd.sample(), self.pd.mode()
        )  # sample from the diagonal Gaussian if stochastic, else take its mode
        self._act = U.function(
            [stochastic, ob], [ac, self.vpred]
        )  # given (stochastic, ob), returns the sampled action and the predicted state value
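The mirror-loss construction above encodes each permutation list as a signed permutation matrix. A NumPy sketch matching the construction loop; note that np.sign(0) == 0, so a feature that stays at index 0 has to be encoded as a small signed float to carry its sign:

import numpy as np

def perm_matrix(permutation):
    # perm_mat[i][|perm|] = sign(perm), exactly as in the loops above
    n = len(permutation)
    mat = np.zeros((n, n), dtype=np.float32)
    for i, perm in enumerate(permutation):
        mat[i][int(np.abs(perm))] = np.sign(perm)
    return mat

# swap features 0 and 1, negate feature 2 (0.0001 encodes "index 0, positive")
x = np.array([[3.0, 4.0, 5.0]], dtype=np.float32)
mirrored = x @ perm_matrix([1, 0.0001, -2])
assert np.allclose(mirrored, [[4.0, 3.0, -5.0]])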
Example #12
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0):
        assert isinstance(ob_space, gym.spaces.Box)

        # init
        self.ac_space_dim = ac_space.shape[0]
        self.ob_space_dim = ob_space.shape[0]
        self.dc = dc
        self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        # create a filter for the pure shape, meaning excluding u[k-1]
        obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope("obfilter_pure"):
            self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        obz_pure = tf.clip_by_value(
            (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) /
            self.ob_rms_only.std, -5.0, 5.0)

        # return Q-function value
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.relu(
                U.dense(last_out0,
                        hid_size,
                        "vffc0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.relu(
                U.dense(last_out1,
                        hid_size,
                        "vffc1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "vfff0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "vfff1",
                            weight_init=U.normc_initializer(1.0))

        self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

        # policy over options:
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.relu(
                U.dense(last_out0,
                        hid_size,
                        "oppi0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.relu(
                U.dense(last_out1,
                        hid_size,
                        "oppi1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "oppif0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "oppif1",
                            weight_init=U.normc_initializer(1.0))
        last_out = tf.concat([last_out0, last_out1], 1)

        # besides the softmax, also expose self.op_pi_orig, the raw difference
        # between the two option logits
        self.op_pi_orig = last_out0 - last_out1
        self.op_pi = tf.nn.softmax(last_out)

        # still always terminate
        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        #termination_sample = tf.greater(self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),maxval=1.))
        termination_sample = tf.constant([True])

        # choose the appropriate action
        last_out = obz_pure
        for i in range(num_hid_layers):
            last_out = tf.nn.relu(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01),
                            bias=False)
            # hard-clip the mean to [-1, 1] via ReLUs (the tanh squashing
            # alternative is kept below, commented out)
            mean = (-tf.nn.relu(-(mean - 1)) + tf.nn.relu(-(mean + 1))) + 1
            # mean = tf.nn.tanh(mean)
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # keep the variable which only incorporates the mean
        ac_mean = mean
        ac_mean = U.switch(option[0], ac_mean,
                           tf.stop_gradient(ob[:, -self.ac_space_dim:]))
        self.ac_mean = tf.clip_by_value(ac_mean, -1.0, 1.0)

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        #ac = tf.Print (ac, [ac,option,ob], "action and option before selecting: ")
        ac = U.switch(option[0], ac,
                      tf.stop_gradient(ob[:, -self.ac_space_dim:]))
        ac = tf.clip_by_value(ac, -1.0, 1.0)
        #ac = U.switch(option[0], tf.constant(1.0), tf.constant(0.0))
        #ac = tf.Print (ac, [ac], "action after selection: ")
        self.last_action = tf.stop_gradient(ac)
        self._act = U.function([stochastic, ob, option],
                               [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op = U.function([ob], [self.op_pi])

        # additional functions that return the action mean and the special values for the policy over options
        self._act_mean = U.function([ob, option], [ac_mean])
        self._get_op_orig = U.function([ob], [self.op_pi_orig])
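The mean-squashing expression in Example #12, (-relu(-(mean - 1)) + relu(-(mean + 1))) + 1, is algebraically a hard clip of the mean to [-1, 1] (the tanh alternative is left commented out). A NumPy check:

import numpy as np

def relu(x):
    return np.maximum(x, 0.0)

m = np.linspace(-3, 3, 13)
squashed = (-relu(-(m - 1)) + relu(-(m + 1))) + 1
assert np.allclose(squashed, np.clip(m, -1.0, 1.0))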
Example #13
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0):
        assert isinstance(ob_space, gym.spaces.Box)

        # Define the dimensions
        self.ac_space_dim = ac_space.shape[0]
        self.ob_space_dim = ob_space.shape[0]
        self.dc = dc
        self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        # create a filter for the pure shape, meaning excluding u[k-1]
        obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope("obfilter_pure"):
            self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        obz_pure = tf.clip_by_value(
            (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) /
            self.ob_rms_only.std, -5.0, 5.0)

        # implementation of the Q-function:
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(
                U.dense(last_out0,
                        hid_size,
                        "vffc0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(
                U.dense(last_out1,
                        hid_size,
                        "vffc1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "vfff0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "vfff1",
                            weight_init=U.normc_initializer(1.0))

        # the value of the (state, option) pair, denoted Q(s, o) in the report
        self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

        # Implementation of the policy over options:
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(
                U.dense(last_out0,
                        hid_size,
                        "oppi0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(
                U.dense(last_out1,
                        hid_size,
                        "oppi1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "oppif0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "oppif1",
                            weight_init=U.normc_initializer(1.0))
        last_out = tf.concat([last_out0, last_out1], 1)
        # this is the output of the policy over options:
        self.op_pi = tf.nn.softmax(last_out)

        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]

        # Always terminate
        termination_sample = tf.constant([True])

        # calculate the control action: the intra-option policy
        last_out = obz_pure
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01),
                            bias=False)
            mean = tf.nn.tanh(mean)
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # if stochastic is true, we sample around the mean; this corresponds to exploration at the action level
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        # determine the control action to apply; for ZOH (option 0), just reuse u[k-1]
        ac = U.switch(option[0], ac,
                      tf.stop_gradient(ob[:, -self.ac_space_dim:]))
        ac = tf.clip_by_value(ac, -1.0, 1.0)

        self.last_action = tf.stop_gradient(ac)
        self._act = U.function([stochastic, ob, option],
                               [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op = U.function([ob], [self.op_pi])
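Both option-critic snippets in this listing call a dense3D2 helper that is not included. Below is a minimal sketch of what its call sites imply (one weight slice per option, selected by the current option id); the argument names and defaults are assumptions, not the verbatim helper.

import tensorflow as tf

# Hypothetical sketch of dense3D2, inferred from its call sites: a dense layer
# that keeps one [in_dim, size] weight slice per option and selects the slice
# for the current option id. The real helper may differ.
def dense3D2(x, size, name, option, num_options=1, weight_init=None, bias=True):
    with tf.variable_scope(name):
        w = tf.get_variable("w", [num_options, x.get_shape()[1], size],
                            initializer=weight_init)
        ret = tf.matmul(x, w[option[0]])
        if bias:
            b = tf.get_variable("b", [size], initializer=tf.zeros_initializer())
            ret = ret + b
        return ret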
Example #14
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, gmm_comp=1,
              mirror_loss=False, observation_permutation=[], action_permutation=[]):
        assert isinstance(ob_space, gym.spaces.Box)
        if mirror_loss:
            assert gaussian_fixed_var  # assume fixed std for now

        self.pdtype = pdtype = make_pdtype(ac_space, gmm_comp)
        sequence_length = None
        self.mirror_loss = mirror_loss
        if mirror_loss:
            # construct permutation matrices
            obs_perm_mat = np.zeros((len(observation_permutation), len(observation_permutation)), dtype=np.float32)
            act_perm_mat = np.zeros((len(action_permutation), len(action_permutation)), dtype=np.float32)
            for i, perm in enumerate(observation_permutation):
                obs_perm_mat[i][int(np.abs(perm))] = np.sign(perm)
            for i, perm in enumerate(action_permutation):
                act_perm_mat[i][int(np.abs(perm))] = np.sign(perm)

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        last_out = ob
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = ob
        params = []
        for i in range(num_hid_layers):
            rt, pw, pb = U.dense_wparams(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0))
            last_out = tf.nn.tanh(rt)
            params.append([pw, pb])
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            if gmm_comp == 1:
                mean, pw, pb = U.dense_wparams(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                                               U.normc_initializer(0.01))
                params.append([pw, pb])
                self.mean = mean
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                         initializer=tf.zeros_initializer())
                pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
            else:
                means = U.dense(last_out, (pdtype.param_shape()[0] - gmm_comp) // 2, "polfinal",
                                U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd",
                                         initializer=tf.constant(np.ones((1, (pdtype.param_shape()[0] - gmm_comp) // 2),
                                                                         dtype=np.float32) * (-1.0)))
                weights = tf.nn.softmax(U.dense(last_out, gmm_comp, "gmmweights", U.normc_initializer(0.01)))
                pdparam = U.concatenate([means, means * 0.0 + logstd, weights], axis=1)
        elif gmm_comp == 1:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
        else:
            meanstd = U.dense(last_out, pdtype.param_shape()[0] - gmm_comp, "polfinal", U.normc_initializer(0.01))
            weights = tf.nn.softmax(U.dense(last_out, gmm_comp, "gmmweights", U.normc_initializer(0.01)))
            pdparam = U.concatenate([meanstd, weights], axis=1)

        if mirror_loss:
            mirrored_ob = tf.matmul(ob, obs_perm_mat)
            last_val = mirrored_ob
            for i in range(len(params) - 1):
                last_val = tf.nn.tanh(tf.matmul(last_val, params[i][0]) + params[i][1])
            mean_mir_obs = tf.matmul(last_val, params[-1][0]) + params[-1][1]
            self.mirrored_mean = tf.matmul(mean_mir_obs, act_perm_mat)

        if gmm_comp == 1:
            self.pd = pdtype.pdfromflat(pdparam)
        else:
            self.pd = pdtype.pdfromflat([pdparam, gmm_comp])

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
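A small standalone check of the permutation-matrix construction used for the mirror loss above (toy 3-dimensional observation; encoding a sign flip on index 0 as -0.0001 is an assumption, since np.sign(0) == 0):

import numpy as np

# Toy check: dimension 0 flips sign under mirroring, dimensions 1 and 2 swap.
observation_permutation = [-0.0001, 2, 1]
n = len(observation_permutation)
obs_perm_mat = np.zeros((n, n), dtype=np.float32)
for i, perm in enumerate(observation_permutation):
    obs_perm_mat[i][int(np.abs(perm))] = np.sign(perm)

ob = np.array([[0.5, 1.0, 2.0]], dtype=np.float32)
print(ob @ obs_perm_mat)  # [[-0.5  2.   1. ]]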
Example #15
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              bound_by_sigmoid=False,
              sigmoid_coef=1.,
              activation='tanh',
              normalize_obs=True,
              actions='gaussian',
              avg_norm_symmetry=False,
              symmetric_interpretation=False,
              stdclip=5.0,
              gaussian_bias=False,
              gaussian_from_binary=False,
              parallel_value=False,
              pv_layers=2,
              pv_hid_size=512,
              three=False):
        assert isinstance(ob_space, gym.spaces.Box)

        if actions == 'binary':
            self.pdtype = pdtype = MultiCategoricalPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32))
        elif actions == 'beta':
            self.pdtype = pdtype = BetaPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32))
        elif actions == 'bernoulli':
            self.pdtype = pdtype = BernoulliPdType(ac_space.low.size)
        elif actions == 'gaussian':
            self.pdtype = pdtype = make_pdtype(ac_space)
        elif actions == 'cat_3':
            self.pdtype = pdtype = MultiCategoricalPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32) * 2)
        elif actions == 'cat_5':
            self.pdtype = pdtype = MultiCategoricalPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32) * 4)
        else:
            assert False

        sequence_length = None

        self.ob = U.get_placeholder(name="ob",
                                    dtype=tf.float32,
                                    shape=[sequence_length] +
                                    list(ob_space.shape))
        self.st = U.get_placeholder(name="st", dtype=tf.int32, shape=[None])

        if normalize_obs:
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=ob_space.shape)
            if avg_norm_symmetry:
                # Warning: only works for standard observations (41 values)
                ob_mean = (tf.gather(self.ob_rms.mean, ORIG_SYMMETRIC_IDS) +
                           self.ob_rms.mean) / 2
                ob_std = (tf.gather(self.ob_rms.std, ORIG_SYMMETRIC_IDS) +
                          self.ob_rms.std) / 2  # Pretty crude
            else:
                ob_mean = self.ob_rms.mean
                ob_std = self.ob_rms.std

            obz = tf.clip_by_value((self.ob - ob_mean) / ob_std, -stdclip,
                                   stdclip)

            #obz = tf.Print(obz, [self.ob_rms.mean], message='rms_mean', summarize=41)
            #obz = tf.Print(obz, [self.ob_rms.std], message='rms_std', summarize=41)
        else:
            obz = self.ob

        vpreds = []
        pparams = []

        for part in range(1 if not three else 3):
            part_prefix = "" if part == 0 else "part_" + str(part)

            # Predicted value
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    U.dense(last_out,
                            hid_size,
                            part_prefix + "vffc%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0)))

            vpreds.append(
                U.dense(last_out,
                        1,
                        part_prefix + "vffinal",
                        weight_init=U.normc_initializer(1.0)))
            vpreds[-1] = vpreds[-1][:, 0]

            if parallel_value:
                last_out_2 = obz
                for i in range(pv_layers):
                    last_out_2 = tf.nn.tanh(
                        U.dense(last_out_2,
                                pv_hid_size,
                                part_prefix + "pv_vffc%i" % (i + 1),
                                weight_init=U.normc_initializer(1.0)))
                last_out_2 = U.dense(last_out_2,
                                     1,
                                     part_prefix + "pv_vffinal",
                                     weight_init=U.normc_initializer(1.0))
                vpreds[-1] += last_out_2[:, 0]

            last_out = obz
            if activation == 'tanh':
                activation = tf.nn.tanh
            elif activation == 'relu':
                activation = tf.nn.relu
            for i in range(num_hid_layers):
                dense = U.dense(last_out,
                                hid_size,
                                part_prefix + "polfc%i" % (i + 1),
                                weight_init=U.normc_initializer(1.0))
                last_out = activation(dense)

            if actions == 'gaussian':
                if gaussian_fixed_var:
                    mean = U.dense(last_out,
                                   pdtype.param_shape()[0] // 2,
                                   part_prefix + "polfinal",
                                   U.normc_initializer(0.01))
                    if bound_by_sigmoid:
                        mean = tf.nn.sigmoid(mean * sigmoid_coef)
                    logstd = tf.get_variable(
                        name=part_prefix + "logstd",
                        shape=[1, pdtype.param_shape()[0] // 2],
                        initializer=tf.zeros_initializer())
                    logstd = mean * 0.0 + logstd
                else:
                    mean = U.dense(last_out,
                                   pdtype.param_shape()[0] // 2,
                                   part_prefix + "polfinal",
                                   U.normc_initializer(0.01))
                    logstd = U.dense(last_out,
                                     pdtype.param_shape()[0] // 2,
                                     part_prefix + "polfinal_2",
                                     U.normc_initializer(0.01))
                if gaussian_bias:
                    mean = mean + 0.5

                pdparam = U.concatenate([mean, logstd], axis=1)
            elif actions == 'beta':
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "beta_lastlayer",
                                  U.normc_initializer(0.01))
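                # softplus keeps the Beta concentration parameters (alpha, beta) positive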
                pdparam = tf.nn.softplus(pdparam)
            elif actions in ['bernoulli', 'binary']:
                if bound_by_sigmoid:
                    raise NotImplementedError(
                        "bound by sigmoid not implemented here")
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "polfinal",
                                  U.normc_initializer(0.01))
            elif actions in ['cat_3']:
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "cat3_lastlayer",
                                  U.normc_initializer(0.01))
                # prob = tf.reshape(pdparam, [18, -1])
                # prob = tf.nn.softmax(prob)
                # elogit = tf.exp(pdparam)
                # pdparam = tf.Print(pdparam, [prob], summarize=18)
            elif actions in ['cat_5']:
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "cat5_lastlayer",
                                  U.normc_initializer(0.01))
                # prob = tf.reshape(pdparam, [18, -1])
                # prob = tf.nn.softmax(prob)
                # elogit = tf.exp(pdparam)
                # pdparam = tf.Print(pdparam, [prob], summarize=18)
            else:
                assert False

            pparams.append(pdparam)

        pparams = tf.stack(pparams)
        vpreds = tf.stack(vpreds)
        pparams = tf.transpose(pparams,
                               perm=(1, 0, 2))  # [batchsize, networks, values]
        vpreds = tf.transpose(vpreds,
                              perm=(1, 0))  # [batchsize, networks]

        self.stochastic = tf.placeholder(name="stochastic",
                                         dtype=tf.bool,
                                         shape=())

        if three:
            batchsize = tf.shape(pdparam)[0]
            NO_OBSTACLES_ID = 5
            OBST_DIST = [278, 279, 280, 281, 282, 283, 284,
                         285]  # TODO: Alternative approach
            distances = [self.ob[:, i] for i in OBST_DIST]
            distances = tf.stack(distances, axis=1)
            no_obstacles = tf.cast(tf.equal(self.ob[:, NO_OBSTACLES_ID], 1.0),
                                   tf.int32)
            distances = tf.cast(tf.reduce_all(tf.equal(distances, 3), axis=1),
                                tf.int32)
            no_obstacles_ahead = distances * no_obstacles  # 0 if obstacles, 1 if no obstacles
            begin = tf.cast(tf.less(self.st, 75), tf.int32)
            take_id = (1 - begin) * (
                1 + no_obstacles_ahead
            )  # begin==1 => 0, begin==0 => 1 + no_obstacles_ahead

            take_id = tf.stack((tf.range(batchsize), take_id), axis=1)
            pdparam = tf.gather_nd(pparams, take_id)

            self.vpred = tf.gather_nd(vpreds, take_id)
            #self.vpred = tf.Print(self.vpred, [take_id])
        else:
            self.vpred = vpreds[:, 0]
            pdparam = pparams[:, 0]

        self.pd = pdtype.pdfromflat(pdparam)

        if hasattr(self.pd, 'real_mean'):
            real_mean = self.pd.real_mean()
            ac = U.switch(self.stochastic, self.pd.sample(), real_mean)
        else:
            ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

        self._act = U.function([self.stochastic, self.ob, self.st],
                               [ac, self.vpred, ob_mean, ob_std])

        if actions == 'binary':
            self._binary_f = U.function([self.stochastic, self.ob, self.st],
                                        [ac, self.pd.flat, self.vpred])
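To see what the three=True branch above selects, here is a small numpy analogue of the tf.gather_nd lookup (toy shapes, not the original observation layout):

import numpy as np

# Pick one sub-network's parameters per batch row; take_id = [[row, network], ...].
pparams = np.arange(2 * 3 * 4).reshape(2, 3, 4)   # [batchsize, networks, values]
take_id = np.array([[0, 2], [1, 0]])              # row 0 -> net 2, row 1 -> net 0
selected = pparams[take_id[:, 0], take_id[:, 1]]  # numpy analogue of tf.gather_nd
print(selected.shape)  # (2, 4)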
Example #16
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        bins = ac_space.high[0] - ac_space.low[0] + 1
        print('making policy bins size {}'.format(bins))
        assert bins is not None
        act_dim = len(ac_space.high)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out,
                             1,
                             "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            raise NotImplementedError
            mean = U.dense(last_out,
                           pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            m = U.dense(last_out,
                        pdtype.param_shape()[0], "polfinal",
                        U.normc_initializer(0.01))
            norm_softm = tf.nn.sigmoid(
                m
            )  # of size [batchsize, num-actions*bins], initialized to be about uniform
            norm_softm = tf.reshape(
                norm_softm, [-1, act_dim, bins]
            )  # of size [batchsize, num-actions, bins], initialized to be about uniform

            norm_softm_tiled = tf.tile(tf.expand_dims(norm_softm, axis=-1),
                                       [1, 1, 1, bins])

            # construct the mask
            am_numpy = construct_mask(bins)
            am_tf = tf.constant(am_numpy, dtype=tf.float32)

            # construct pdparam
            pdparam = tf.reduce_sum(
                tf.math.log(norm_softm_tiled + 1e-8) * am_tf +
                tf.math.log(1 - norm_softm_tiled + 1e-8) * (1 - am_tf),
                axis=-1)
            pdparam = tf.reshape(pdparam, [-1, act_dim * bins])

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
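The binned parametrization above depends on construct_mask, which is not part of the snippet. Under an ordinal-bin reading of the reduction (each bin's logit accumulates log-probabilities of the per-bin sigmoids), one plausible sketch is a triangular 0/1 matrix; this is an assumption, and the original helper may differ.

import numpy as np

# Hypothetical construct_mask: a lower-triangular 0/1 matrix of shape
# [bins, bins], inferred from the masked log-probability sum above; not the
# verbatim helper.
def construct_mask(bins):
    return np.tril(np.ones((bins, bins), dtype=np.float32))

print(construct_mask(3))
# [[1. 0. 0.]
#  [1. 1. 0.]
#  [1. 1. 1.]]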
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0):
        assert isinstance(ob_space, gym.spaces.Box)

        # define action and observation space
        self.ac_space_dim = ac_space.shape[0]
        self.ob_space_dim = ob_space.shape[0]
        self.dc = dc
        self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        # create a filter for the pure observation shape, i.e. excluding u[k-1]
        obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope("obfilter_pure"):
            self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        obz_pure = tf.clip_by_value(
            (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) /
            self.ob_rms_only.std, -5.0, 5.0)

        # implement Q-function approximation
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.relu(
                U.dense(last_out0,
                        hid_size,
                        "vffc0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.relu(
                U.dense(last_out1,
                        hid_size,
                        "vffc1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "vfff0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "vfff1",
                            weight_init=U.normc_initializer(1.0))

        # return the Q-function value
        self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

        # implement the parametrization of the policy over options
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.relu(
                U.dense(last_out0,
                        hid_size,
                        "oppi0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.relu(
                U.dense(last_out1,
                        hid_size,
                        "oppi1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "oppif0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "oppif1",
                            weight_init=U.normc_initializer(1.0))
        last_out = tf.concat([last_out0, last_out1], 1)
        # return probabilities for the options
        self.op_pi = tf.nn.softmax(last_out)

        # termination probability head (options are set to always terminate below)
        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        termination_sample = tf.constant([True])  # always terminate

        # define the control policy / intra-option policy
        last_out = obz_pure
        for i in range(num_hid_layers):
            last_out = tf.nn.relu(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01),
                            bias=False)
            # use ReLUs to clip the mean to [-1, 1]
            mean = (-tf.nn.relu(-(mean - 1)) + tf.nn.relu(-(mean + 1))) + 1
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # sample stochastically; this corresponds to exploration at the action level
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        # choose the action to apply; option 0 (ZOH) holds the previous action u[k-1]
        ac = U.switch(option[0], ac,
                      tf.stop_gradient(ob[:, -self.ac_space_dim:]))
        ac = tf.clip_by_value(ac, -1.0, 1.0)

        self.last_action = tf.stop_gradient(ac)
        self._act = U.function([stochastic, ob, option],
                               [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op = U.function([ob], [self.op_pi])
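The ReLU expression above for squashing the mean is just clip(m, -1, 1) written with ReLUs; a quick numeric check:

import numpy as np

# Verify that -relu(-(m - 1)) + relu(-(m + 1)) + 1 == clip(m, -1, 1).
def relu(x):
    return np.maximum(x, 0.0)

m = np.linspace(-3.0, 3.0, 13)
assert np.allclose(-relu(-(m - 1)) + relu(-(m + 1)) + 1, np.clip(m, -1.0, 1.0))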
    def _init(self, ob_space, ac_space, hid_size_V, hid_size_actor, num_hid_layers,
              V_keep_prob, mc_samples, layer_norm, activation_critic, activation_actor,
              dropout_on_V, gaussian_fixed_var=True, sample_dropout=False):
        assert isinstance(ob_space, gym.spaces.Box)
        self.sample_dropout = sample_dropout

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz

        self.mc_samples = mc_samples
        self.V_keep_prob = V_keep_prob

        ### MAIN CHANGES
        #######################
        # Value function

        with tf.variable_scope("value_function"):

            dropout_networks = [last_out] * self.mc_samples
            # dropout_networks = generate_dropout_layer(lambda x: x, dropout_networks, self.V_keep_prob)
            
            for i in range(num_hid_layers):
                if layer_norm:
                    last_out = activation_critic(tc.layers.layer_norm(
                        tf.layers.dense(last_out, hid_size_V, name="vffc%i" % (i + 1),
                                        kernel_initializer=U.normc_initializer(1.0)),
                        center=True, scope="vffc_activation%i" % (i + 1), scale=True))

                    apply_layer = lambda x: activation_critic(tc.layers.layer_norm(
                        tf.layers.dense(x, hid_size_V, name="vffc%i" % (i + 1), reuse=True),
                        center=True, scope="vffc_activation%i" % (i + 1), scale=True, reuse=True))
                else:
                    last_out = activation_critic(tf.layers.dense(
                        last_out, hid_size_V, name="vffc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))

                    apply_layer = lambda x: activation_critic(tf.layers.dense(
                        x, hid_size_V, name="vffc%i" % (i + 1), reuse=True))

                dropout_networks = generate_dropout_layer(apply_layer, dropout_networks, self.V_keep_prob)
            
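            # Monte Carlo dropout: averaging the stochastic forward passes below
            # gives the value estimate; their spread estimates its uncertainty.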
            ## final layer
            self.vpred = tf.layers.dense(last_out, 1, name="vffinal",
                                         kernel_initializer=U.normc_initializer(1.0))[:, 0]

            apply_layer = lambda x: tf.layers.dense(x, 1, activation=None,
                                                    name="vffinal", reuse=True)[:, 0]
            dropout_networks = generate_dropout_layer(apply_layer, dropout_networks, self.V_keep_prob)

            mean, variance = tf.nn.moments(tf.stack(dropout_networks), 0)

            self.vpred_mc_mean = tf.add_n(dropout_networks) / float(len(dropout_networks))
            self.vpred_dropout_networks = dropout_networks

            self.variance = variance
            LAMBDA = tf.placeholder(dtype=tf.float32, shape=())
            self.v_lambda_variance = self.vpred_mc_mean + LAMBDA * tf.sqrt(variance)

        #######################
        ## Policy
        last_out = obz
        with tf.variable_scope("policy"):
            for i in range(num_hid_layers):
                last_out = U.dense(last_out, hid_size_actor, "polfc%i" % (i + 1),
                                   weight_init=U.normc_initializer(1.0))
                last_out = activation_actor(last_out)
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
                pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        last_out = obz

        ## Building the action-value function Q(s, a)
#        last_out2=self.pd.sample()
#        activation=tf.nn.relu
#        #######################
#        # Action Value function  
#        with tf.variable_scope("Q"):        
#            dropout_networks = [last_out] * self.mc_samples
#            dropout_networks = generate_dropout_layer(lambda x: x, dropout_networks, self.keep_prob)
#                 
#            ## concatenate state and action
#            last_out = tf.concat([last_out, last_out2], axis=-1)
#            
#            new_networks = []
#            for dropout_network in dropout_networks:
#                dropout_network = tf.concat([dropout_network, last_out2], axis=-1)
#                dropout_network, mask = U.bayes_dropout(dropout_network, self.keep_prob)
#                new_networks.append(dropout_network)
#            dropout_networks = new_networks
#            
#            ### hidden layers
#            for i in range(num_hid_layers):
#                
#                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="Q%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
#                apply_layer = lambda x : activation(tf.layers.dense(x, hid_size, activation=None, \
#                        name="Q%i"%(i+1), reuse=True))
#                dropout_networks=generate_dropout_layer(apply_layer,dropout_networks,self.keep_prob)
#            
#            ## final layer
#            self.qpred = tf.layers.dense(last_out, 1, name="Qfinal", kernel_initializer=U.normc_initializer(1.0))[:,0]
#            
#            apply_layer = lambda x : tf.layers.dense(x, 1, activation=None, \
#                        name="Qfinal", reuse=True)[:,0]
#            dropout_networks=generate_dropout_layer(apply_layer,dropout_networks,self.keep_prob)
#            
#            self.qpred_mc_mean=tf.add_n(dropout_networks) / float(len(dropout_networks))
#            self.qpred_dropout_networks=dropout_networks
        
        ### MAIN CHANGES
        ## if dropout:
        if dropout_on_V:
            if self.sample_dropout:
                self._act = [U.function([stochastic, ob], [ac, x]) for x in dropout_networks]
            else:
                self._act = U.function([stochastic, ob], [ac, self.vpred_mc_mean])
        else:
            self._act = U.function([stochastic, ob], [ac, self.vpred])
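The value network above relies on generate_dropout_layer, which is not part of the snippet. Below is a minimal sketch consistent with its call sites (apply the given layer to every parallel network, then an independent dropout mask); the original helper, possibly built on U.bayes_dropout as in the commented-out Q block, may differ.

import tensorflow as tf

# Hypothetical generate_dropout_layer, inferred from its call sites: push each
# parallel network through apply_layer, then drop units with keep probability
# keep_prob. Not the verbatim helper.
def generate_dropout_layer(apply_layer, dropout_networks, keep_prob):
    new_networks = []
    for network in dropout_networks:
        out = apply_layer(network)
        out = tf.nn.dropout(out, keep_prob=keep_prob)
        new_networks.append(out)
    return new_networks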