Example #1
    def _init(self, ob_space, ac_space):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        obscaled = ob / 255.0

        with tf.variable_scope("pol"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)
        with tf.variable_scope("vf"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
            self.vpredz = self.vpred

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample() # XXX: always samples; the stochastic placeholder is unused here
        self._act = U.function([stochastic, ob], [ac, self.vpred])
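Across these examples, `U.function` (baselines-style tf_util) compiles the listed placeholders and outputs into a feed-dict callable, so acting is a single call. A minimal usage sketch, assuming that convention and that `ob[None]` adds the batch dimension the placeholder expects:

    def act(policy, stochastic, ob):
        # batch of one observation in, batch of one action/value out
        ac, vpred = policy._act(stochastic, ob[None])
        return ac[0], vpred[0]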
Example #2
    def _init(self, ob_space, ac_space, kind):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        x = ob / 255.0
        if kind == 'small': # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large': # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:,0]

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample() # XXX
        self._act = U.function([stochastic, ob], [ac, self.vpred])
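For context, `make_pdtype` (from baselines-style common/distributions) selects the probability-distribution type from the action space, which is what `pdtype.param_shape()` reflects above. A rough sketch of that mapping, hedged because the real helper lives outside these examples:

    def make_pdtype_sketch(ac_space):
        # Discrete -> categorical over n logits, so param_shape() == [n];
        # Box -> diagonal Gaussian over [mean, logstd], so param_shape() == [2 * act_dim].
        if isinstance(ac_space, gym.spaces.Discrete):
            return CategoricalPdType(ac_space.n)
        elif isinstance(ac_space, gym.spaces.Box):
            return DiagGaussianPdType(ac_space.shape[0])
        raise NotImplementedError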
Example #3
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
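The fixed-variance head concatenates the mean with a broadcast `logstd` variable, so `pdparam` packs `[mean, logstd]` along the last axis. A NumPy illustration of how sampling uses that layout (a sketch, not the actual `pd.sample()` implementation):

    import numpy as np

    def sample_diag_gaussian(pdparam):
        mean, logstd = np.split(pdparam, 2, axis=-1)
        return mean + np.exp(logstd) * np.random.randn(*mean.shape)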
Example #4
    def _init(self, ob_space, ac_space, kind):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        x = ob / 255.0
        if kind == 'small':  # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(
                tf.layers.dense(x,
                                256,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large':  # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(
                tf.layers.dense(x,
                                512,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        logits = tf.layers.dense(x,
                                 pdtype.param_shape()[0],
                                 name='logits',
                                 kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        self.vpred = tf.layers.dense(
            x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:, 0]

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()  # XXX
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #5
    def _init(self, ob_space, ac_space):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
        logits, self.vpred = keras_net(ob)
        self.pd = pdtype.pdfromflat(logits)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample() # XXX
        self._act = U.function([stochastic, ob], [ac, self.vpred])
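`keras_net` is defined outside this example; any network returning `(logits, vpred)` for the given observation batch fits. A hypothetical stand-in (the layer sizes and the `num_actions` parameter are assumptions):

    def keras_net(ob, num_actions=6):
        # Hypothetical Nature-DQN-style trunk with a logits head and a value head.
        x = tf.keras.layers.Conv2D(32, 8, strides=4, activation='relu')(ob / 255.0)
        x = tf.keras.layers.Conv2D(64, 4, strides=2, activation='relu')(x)
        x = tf.keras.layers.Flatten()(x)
        x = tf.keras.layers.Dense(256, activation='relu')(x)
        logits = tf.keras.layers.Dense(num_actions)(x)
        vpred = tf.keras.layers.Dense(1)(x)[:, 0]
        return logits, vpred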
Example #6
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, noisy_nets=False, gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz

        for i in range(num_hid_layers):
            last_out = tf.nn.selu(U.dense(last_out, hid_size, "vffc%i"%(i + 1), weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]
        
        last_out = obz
        for i in range(num_hid_layers):
            if noisy_nets:
                last_out = tf.nn.selu(U.noisy_dense(last_out, hid_size, "noisy_polfc%i"%(i + 1), weight_init=U.normc_initializer(1.0)))
            else:
                last_out = tf.nn.selu(U.dense(last_out, hid_size, "polfc%i"%(i + 1), weight_init=U.normc_initializer(1.0)))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            assert noisy_nets is False
            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            if noisy_nets:
                pdparam = U.noisy_dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
            else:
                pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pdparam = pdparam
        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        self._vpred_pdparam = U.function([ob], [self.vpred, self.pdparam])
        self.ob = ob
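`U.noisy_dense` is not part of stock baselines; the example presumably implements NoisyNet-style exploration (Fortunato et al., 2017), where weight and bias means get learnable Gaussian perturbations. A minimal TF1 sketch of such a layer (the variable names and the sigma0 default are assumptions):

    def noisy_dense_sketch(x, size, name, weight_init, sigma0=0.017):
        with tf.variable_scope(name):
            in_dim = x.get_shape()[1].value
            w_mu = tf.get_variable("w_mu", [in_dim, size], initializer=weight_init)
            w_sigma = tf.get_variable("w_sigma", [in_dim, size],
                                      initializer=tf.constant_initializer(sigma0))
            b_mu = tf.get_variable("b_mu", [size], initializer=tf.zeros_initializer())
            b_sigma = tf.get_variable("b_sigma", [size],
                                      initializer=tf.constant_initializer(sigma0))
            # fresh noise each forward pass perturbs the effective weights
            w = w_mu + w_sigma * tf.random_normal([in_dim, size])
            b = b_mu + b_sigma * tf.random_normal([size])
            return tf.matmul(x, w) + b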
Example #7
    def _init(self, ob_space, ac_space):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            last_out = obz
            # for i in range(num_hid_layers):
            #     last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            # self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]
            self.vpred = discriminator_model([last_out], drop_rate=0.5)

        with tf.variable_scope('pol'):
            last_out = obz
            # for i in range(num_hid_layers):
            #     last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))

            pdparam = generator_model([last_out],
                                      pdtype.param_shape()[0],
                                      drop_rate=0.5)

            # if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            #     mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
            #     logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            #     pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            # else:
            #     pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #8
 def _register_placeholder(self,
                           *,
                           name=None,
                           dtype=None,
                           shape=None,
                           placeholder=None):
     if placeholder is None:
         # honor a caller-supplied dtype; fall back to float32 when none is given
         placeholder = U.get_placeholder(name=name,
                                         dtype=dtype if dtype is not None else tf.float32,
                                         shape=shape)
     elif name is None:
         name = placeholder.name
     if name in self._tf_placeholders:
         raise ValueError(
             "Placeholder with name {} already exists".format(name))
     self._tf_placeholders[name] = placeholder
     return placeholder
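Hypothetical call sites for `_register_placeholder`: it either builds and caches a new placeholder or registers one created elsewhere, and refuses duplicate names:

    ob = self._register_placeholder(name="ob", dtype=tf.float32, shape=[None, 64])
    ac = self._register_placeholder(placeholder=existing_ac_placeholder)
    # self._register_placeholder(name="ob", shape=[None, 64])  # raises ValueError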
Example #9
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, summaries = False, should_act = True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None
        # Reuse an existing "observations" placeholder if the graph already has one;
        # get_tensor_by_name raises KeyError rather than returning None when it is absent.
        try:
            ob = tf.get_default_graph().get_tensor_by_name("observations:0")
        except KeyError:
            ob = U.get_placeholder(name="observations", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope('pol'):
            last_out = ob
            for i in range(num_hid_layers):
                last_out = tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0))
                last_out = tf.nn.elu(last_out)
                #last_out = tf.nn.tanh(last_out)
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, tf.ones(shape=mean.shape)* logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

        with tf.variable_scope("distribution"):
            self.pd = pdtype.pdfromflat(pdparam)

        if should_act:
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=ob_space.shape)

            with tf.variable_scope('vf'):
                #obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
                last_out = ob
                for i in range(num_hid_layers):
                    last_out = tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0))
                    last_out = tf.nn.tanh(last_out)
                self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]


            self.state_in = []
            self.state_out = []

            with tf.variable_scope("distribution"):
                stochastic = tf.placeholder(dtype=tf.bool, shape=(), name = "stochastic")
                ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

            self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #10
    def _init(self, ob_space, ac_space, embedding_space_size):
        assert isinstance(ob_space, gym.spaces.Box)

        # self.input = tf.placeholder(dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
        self.input = U.get_placeholder(name="ob_f",
                                       dtype=tf.float32,
                                       shape=[None] + list(ob_space.shape))
        self.embedding_space = embedding_space_size

        # x = self.input / 255.0
        x = tf.nn.relu(
            conv2d(self.input, 32, "cnn1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(conv2d(x, 64, "cnn2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(conv2d(x, 64, "cnn3", [3, 3], [1, 1], pad="VALID"))
        x = flatten(x)
        self.output = tf.nn.relu(
            linear(x, self.embedding_space, 'linlast',
                   normalized_columns_initializer(1.0)))
Example #11
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers):
        assert isinstance(ob_space, gym.spaces.Box)

        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('rew'):
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            self.reward = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]

        self._rew = U.function([ob], [self.reward])
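This variant exposes a learned reward rather than a policy. A usage sketch, assuming `U.function` returns its outputs as a list:

    rew = reward_net._rew(ob[None])[0][0]  # batch of one observation -> scalar reward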
Example #12
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, rnn_hid_units, gaussian_fixed_var=True):
        #assert isinstance(ob_space, gym.spaces.Box)
        print("Constructing policy for observation space",ob_space)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        #with tf.variable_scope("obfilter"):
        #    self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        #obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        obz = ob

        # Apply an RNN to reduce the history
        with tf.variable_scope("vf"):
            state = self.rnn(obz, ob_space.shape[0], rnn_hid_units)
            for i in range(num_hid_layers):
                last_out = resnet(state, hid_size, "vf%i"%(i+1))
            self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]

        # Apply an RNN to reduce the history
        with tf.variable_scope("pf"):
            state = self.rnn(obz, ob_space.shape[0], rnn_hid_units)
            for i in range(num_hid_layers):
                last_out = resnet(state, hid_size, "pf%i"%(i+1))

            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
                pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
            else:
                raise NotImplementedError
                # pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #13
    def _init(self, ob_space, ac_space, hid_sizes, gaussian_fixed_var=True, use_obfilter=False):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        if use_obfilter:
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=ob_space.shape)

            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        else:
            obz = ob

        last_out = obz
        for i, hid_size in enumerate(hid_sizes):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(0.01)))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(0.01))[:,0]

        last_out = obz
        for i, hid_size in enumerate(hid_sizes):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(0.01)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
            # mean = tf.clip_by_value(mean, ac_space.low, ac_space.high)
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        pdparam = tf.identity(pdparam, name="pdparam")
        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=(), name="stoch")
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        ac = tf.identity(ac, name="pi")
        self._act = U.function([stochastic, ob], [ac, self.vpred])
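The `tf.identity` wrappers above ('pdparam', 'pi') and the named 'stoch' placeholder give the graph stable node names, which is the usual preparation for freezing and exporting. A sketch using the standard TF1 freezing API (the output path is a placeholder, and the node name may carry an enclosing scope prefix if `_init` runs inside one):

    from tensorflow.python.framework import graph_util

    sess = U.get_session()
    frozen = graph_util.convert_variables_to_constants(
        sess, sess.graph_def, output_node_names=["pi"])
    with tf.gfile.GFile("policy.pb", "wb") as f:
        f.write(frozen.SerializeToString())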
Example #14
    def _init(self, ob_space, ac_space, layers_val, layers_pol, gaussian_fixed_var=True,
              dist='gaussian', ):
        assert isinstance(ob_space, gym.spaces.Box)

        self.dist = dist
        self.pdtype = pdtype = make_pdtype(ac_space, dist=dist)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = obz
            for i, size in enumerate(layers_val):
                last_out = tf.nn.relu(tf.layers.dense(last_out, size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            last_out = obz
            for i, size in enumerate(layers_pol):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        if dist == 'gaussian':
            self._act = U.function([stochastic, ob], [ac, self.vpred, self.pd.std, self.pd.mean, self.pd.logstd])
        elif dist == 'beta':
            self._act = U.function([stochastic, ob], [ac, self.vpred, self.pd.alpha, self.pd.beta, self.pd.alpha_beta])
Example #15
    def _init(self, ac_space, joint_training, emb_size=None, emb_network=None):
        self.pdtype = pdtype = make_pdtype(ac_space)

        self.emb_network = emb_network
        self.joint_training = joint_training
        size = 256
        if self.joint_training:
            self.input, output = emb_network.get_input_and_last_layer()
            x = tf.nn.relu(linear(output, size, 'lin1', normalized_columns_initializer(1.0)))
        else:
            self.input = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, emb_size])
            x = tf.nn.relu(linear(self.input, size, 'lin1', normalized_columns_initializer(1.0)))

        # x = tf.nn.relu(linear(x, 32, 'lin2', normalized_columns_initializer(1.0)))
        logits = linear(x, pdtype.param_shape()[0], "logits", normalized_columns_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        self.ac = self.pd.sample()
        # self.probs = tf.nn.softmax(logits, dim=-1)[0, :]
        self.vpred = linear(x, 1, "value", normalized_columns_initializer(1.0))

        self._act = U.function([self.input], [self.ac, self.vpred])
Example #16
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, exploration_rate, gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]

        with tf.variable_scope('pol'):
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(0.01)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.constant_initializer(exploration_rate))
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

        # Export hook: name the first action mean "output_node" for graph freezing.
        # Note: 'mean' is only defined on the gaussian_fixed_var branch above.
        my_var = tf.strided_slice(mean, [0], [1], [1], shrink_axis_mask=1)
        my_var_out = tf.identity(my_var, name='output_node')
        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #17
    def _init(self, ob_space, ac_space):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        #obscaled = ob / 255.0
        obscaled = ob

        with tf.variable_scope("pol"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [2, 2], [1, 1], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            logits = U.dense(x,
                             pdtype.param_shape()[0], "logits",
                             U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)
        with tf.variable_scope("vf"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [2, 2], [1, 1], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
            self.vpredz = self.vpred

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #18
    def _build(self):
        num_primitives = self.num_primitives
        num_hid_layers = self._num_hid_layers
        hid_size = self._hid_size

        self._obs = {}
        for ob_name, ob_shape in self._ob_shape.items():
            self._obs[ob_name] = U.get_placeholder(
                name="ob_{}".format(ob_name),
                dtype=tf.float32,
                shape=[None] + self._ob_shape[ob_name])
        self._prev_primitive = prev_primitive = U.get_placeholder(
            name="prev_primitive", dtype=tf.int32, shape=[None])

        with tf.variable_scope(self.name):
            self._scope = tf.get_variable_scope().name

            self.ob_rms = {}
            for ob_name in self.ob_type:
                with tf.variable_scope("ob_rms_{}".format(ob_name)):
                    self.ob_rms[ob_name] = RunningMeanStd(
                        shape=self._ob_shape[ob_name])
            obz = [(self._obs[ob_name] - self.ob_rms[ob_name].mean) /
                   self.ob_rms[ob_name].std for ob_name in self.ob_type]
            obz = [tf.clip_by_value(ob, -5.0, 5.0) for ob in obz]
            obz = tf.concat(obz, -1)

            prev_primitive_one_hot = tf.one_hot(prev_primitive,
                                                num_primitives,
                                                name="prev_primitive_one_hot")
            obz = tf.concat([obz, prev_primitive_one_hot], -1)

            # value function
            with tf.variable_scope("vf"):
                _ = obz
                for i in range(num_hid_layers):
                    _ = self._activation(
                        tf.layers.dense(
                            _,
                            hid_size,
                            name="fc%d" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))
                self.vpred = tf.layers.dense(
                    _,
                    1,
                    name="vpred",
                    kernel_initializer=U.normc_initializer(1.0))[:, 0]

            # meta policy
            with tf.variable_scope("pol"):
                _ = obz
                for i in range(num_hid_layers):
                    _ = self._activation(
                        tf.layers.dense(
                            _,
                            hid_size,
                            name="fc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))
                self.selector = tf.layers.dense(
                    _,
                    num_primitives,
                    name="action",
                    kernel_initializer=U.normc_initializer(0.01))
                self.pdtype = pdtype = CategoricalPdType(num_primitives)
                self.pd = pdtype.pdfromflat(self.selector)

        # sample action
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.obs = [self._obs[ob_name] for ob_name in self.ob_type]
        self._act = U.function([stochastic, self._prev_primitive] + self.obs,
                               [ac, self.vpred])
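A hedged call-site sketch for this meta policy: `_act` takes the stochastic flag, a batch of previous-primitive ids, then one batch per observation modality in `self.ob_type` order:

    ac, vpred = meta._act(True, np.array([prev_primitive]),
                          *[ob[None] for ob in per_modality_obs])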
Example #19
def learn(env, policy_func, med_func, expert_dataset, pretrained, pretrained_weight, g_step, m_step, e_step, inner_iters, save_per_iter,
          ckpt_dir, log_dir, timesteps_per_batch, task_name, max_kl=0.01, max_timesteps=0, max_episodes=0, max_iters=0,
          batch_size=64, med_stepsize=1e-3, pi_stepsize=1e-3, callback=None, writer=None):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    med = med_func("mediator", ob_space, ac_space)
    pi_var_list = pi.get_trainable_variables()
    med_var_list = med.get_trainable_variables()
    g_ob = U.get_placeholder(name="g_ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))
    g_ac = U.get_placeholder(name='g_ac', dtype=tf.float32, shape=[None] + list(ac_space.shape))
    e_ob = U.get_placeholder(name='e_ob', dtype=tf.float32, shape=[None] + list(ob_space.shape))
    e_ac = U.get_placeholder(name='e_ac', dtype=tf.float32, shape=[None] + list(ac_space.shape))
    med_loss = -tf.reduce_mean(med.g_pd.logp(g_ac) + med.e_pd.logp(e_ac)) * 0.5
    #pi_loss = -0.5 * (tf.reduce_mean(pi.pd.logp(ac) - med.pd.logp(ac)))
    g_pdf = tfd.MultivariateNormalDiag(loc=pi.pd.mean, scale_diag=pi.pd.std)
    m_pdf = tfd.MultivariateNormalDiag(loc=med.g_pd.mean, scale_diag=med.g_pd.std)
    pi_loss = tf.reduce_mean(g_pdf.cross_entropy(m_pdf) - g_pdf.entropy())  # tf.reduce_mean(pi.pd.kl(med.pd))
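    # Since cross_entropy(g, m) = H(g) + KL(g || m), the expression above reduces
    # to E[KL(g || m)]: the mean KL from the generator policy to the mediator's
    # generator head, matching the commented pi.pd.kl(med.pd) form.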
    kloldnew = oldpi.pd.kl(pi.pd)
    meankl = tf.reduce_mean(kloldnew)
    dist = meankl
    expert_loss = -tf.reduce_mean(pi.pd.logp(e_ac))


    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])

    compute_med_loss = U.function([g_ob, g_ac, e_ob, e_ac], med_loss)
    compute_pi_loss = U.function([g_ob], pi_loss)
    compute_exp_loss = U.function([e_ob, e_ac], expert_loss)
    # compute_kl_loss = U.function([ob], dist)
    # compute_fvp = U.function([flat_tangent, ob, ac], fvp)
    compute_med_lossandgrad = U.function([g_ob, g_ac, e_ob, e_ac], [med_loss, U.flatgrad(med_loss, med_var_list)])
    compute_pi_lossandgrad = U.function([g_ob],  [pi_loss, U.flatgrad(pi_loss, pi_var_list)])
    compute_exp_lossandgrad = U.function([e_ob, e_ac], [expert_loss, U.flatgrad(expert_loss, pi_var_list)])
    get_flat = U.GetFlat(pi_var_list)
    set_from_flat = U.SetFromFlat(pi_var_list)
    med_adam = MpiAdam(med_var_list)
    pi_adam = MpiAdam(pi_var_list)

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    # th_init = get_flat()
    # MPI.COMM_WORLD.Bcast(th_init, root=0)
    # set_from_flat(th_init)

    med_adam.sync()
    pi_adam.sync()
    # if rank == 0:
    #     print("Init pi param sum %d, init med param sum %d." % (th_pi_init.sum(), th_med_init.sum()), flush=True)

    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1
    loss_stats = stats(["med_loss", "pi_loss"])
    ep_stats = stats(["True_rewards", "Episode_length"])

    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi_var_list)

    med_losses = []
    pi_losses = []

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            fname = os.path.join(ckpt_dir, task_name)
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)

        logger.log("********** Iteration %i ************" % iters_so_far)
        # ======= Optimize Mediator=========

        seg = seg_gen.__next__()
        g_ob, g_ac = seg['ob'], seg['ac']
        #assign_old_eq_new()
        #stepsize = 3e-4
        # thbefore = get_flat()
        d = dataset.Dataset(dict(ob=g_ob, ac=g_ac))
        optim_batchsize = min(batch_size, len(g_ob))
        g_loss = []
        logger.log("Optimizing Generator...")
        for _ in range(1):
            g_batch = d.next_batch(optim_batchsize)
            g_batch_ob, g_batch_ac = g_batch['ob'], g_batch['ac']
            if hasattr(pi, "obs_rms"):
                pi.obs_rms.update(g_batch_ob)
            pi_loss, g = compute_pi_lossandgrad(g_batch_ob)
            # kl = compute_kl_loss(g_ob)
            # if kl > max_kl * 1.5:
            #     logger.log("violated KL constraint. Shrinking step.")
            #     # stepsize *= 0.1
            #     break
            # else:
            #     logger.log("Stepsize OK!")
            pi_adam.update(allmean(g), pi_stepsize)
            g_loss.append(pi_loss)
        pi_losses.append(np.mean(np.array(g_loss)))
        med_loss = []
        logger.log("Optimizing Mediator...")
        for g_ob_batch, g_ac_batch in dataset.iterbatches((seg['ob'], seg['ac']), include_final_partial_batch=False, batch_size=batch_size):
            # g_batch = d.next_batch(optim_batchsize)
            # g_ob_batch, g_ac_batch = g_batch['ob'], g_batch['ac']
            e_ob_batch, e_ac_batch = expert_dataset.get_next_batch(optim_batchsize)
            if hasattr(med, "obs_rms"):
                med.obs_rms.update(np.concatenate((g_ob_batch, e_ob_batch), 0))

            newlosses, g = compute_med_lossandgrad(g_ob_batch, g_ac_batch, e_ob_batch, e_ac_batch)
            med_adam.update(allmean(g), med_stepsize)
            med_loss.append(newlosses)
        med_losses.append(np.mean(np.array(med_loss)))
        #logger.record_tabular("med_loss_each_iter", np.mean(np.array(med_losses)))



            #logger.record_tabular("gen_loss_each_iter", np.mean(np.array(pi_losses)))
        #logger.record_tabular("expert_loss_each_iter", np.mean(np.array(exp_losses)))
        logger.record_tabular("med_loss_each_iter", np.mean(np.array(med_losses)))
        logger.record_tabular("gen_loss_each_iter", np.mean(np.array(pi_losses)))

        lrlocal = (seg["ep_lens"], seg["ep_true_rets"])
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
        lens, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if writer is not None:
            loss_stats.add_all_summary(writer, [np.mean(np.array(med_losses)), np.mean(np.array(pi_losses))], episodes_so_far)
            ep_stats.add_all_summary(writer, [np.mean(true_rewbuffer), np.mean(lenbuffer)], episodes_so_far)
        if rank == 0:
            logger.dump_tabular()
Example #20
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        num_options=1,
        app='',
        saves=False,
        wsaves=False,
        epoch=-1,
        seed=1,
        dc=0):

    optim_batchsize_ideal = optim_batchsize
    np.random.seed(seed)
    tf.set_random_seed(seed)
    env.seed(seed)

    ### Book-keeping
    gamename = env.spec.id[:-3].lower()
    gamename += 'seed' + str(seed)
    gamename += app
    version_name = 'FINAL_NORM-ACT-LOWER-LR-len-400-wNoise-update1-ppo-ESCH-1-2-5-nI'

    dirname = '{}_{}_{}opts_saves/'.format(version_name, gamename, num_options)
    print(dirname)
    #input ("wait here after dirname")

    if wsaves:
        first = True
        if not os.path.exists(dirname):
            os.makedirs(dirname)
            first = False
        # while os.path.exists(dirname) and first:
        #     dirname += '0'

        files = ['pposgd_simple.py', 'mlp_policy.py', 'run_mujoco.py']
        first = True
        for i in range(len(files)):
            src = os.path.join(
                '/home/nfunk/Code_MA/ppoc_off_tryout/baselines/baselines/ppo1/'
            ) + files[i]
            print(src)
            #dest = os.path.join('/home/nfunk/results_NEW/ppo1/') + dirname
            dest = dirname + "src_code/"
            if (first):
                os.makedirs(dest)
                first = False
            print(dest)
            shutil.copy2(src, dest)
        # brute force copy normal env file at end of copying process:
        src = os.path.join(
            '/home/nfunk/Code_MA/ppoc_off_tryout/nfunk/envs_nf/pendulum_nf.py')
        shutil.copy2(src, dest)
    ###

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    max_action = env.action_space.high

    # add the dimension in the observation space!
    ob_space.shape = ((ob_space.shape[0] + ac_space.shape[0]), )
    print(ob_space.shape)
    print(ac_space.shape)
    #input ("wait here where the spaces are printed!!!")
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    pol_ov_op_ent = tf.placeholder(dtype=tf.float32,
                                   shape=None)  # entropy coefficient for the policy over options

    # option = tf.placeholder(dtype=tf.int32, shape=[None])

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    # pdb.set_trace()
    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv',
                                 dtype=tf.float32,
                                 shape=[None])

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    atarg_clip = atarg  #tf.clip_by_value(atarg,-10,10)
    surr1 = ratio * atarg_clip  #atarg # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param,
                   1.0 + clip_param) * atarg_clip  #atarg #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    #vf_loss = U.mean(tf.square(tf.clip_by_value(pi.vpred - ret, -10.0, 10.0)))
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    term_loss = pi.tpred * term_adv

    force_pi_loss = U.mean(
        tf.square(
            tf.clip_by_value(pi.op_pi, 1e-5, 1.0) -
            tf.constant([[0.05, 0.95]])))

    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-5, 1.0))
    #log_pi = tf.Print(log_pi, [log_pi, tf.shape(tf.transpose(log_pi))])
    old_log_pi = tf.log(tf.clip_by_value(oldpi.op_pi, 1e-5, 1.0))
    entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1)

    ratio_pol_ov_op = tf.exp(
        tf.transpose(log_pi)[option[0]] -
        tf.transpose(old_log_pi)[option[0]])  # pnew / pold
    term_adv_clip = term_adv  #tf.clip_by_value(term_adv,-10,10)
    surr1_pol_ov_op = ratio_pol_ov_op * term_adv_clip  # surrogate from conservative policy iteration
    surr2_pol_ov_op = U.clip(ratio_pol_ov_op, 1.0 - clip_param,
                             1.0 + clip_param) * term_adv_clip  #
    pol_surr_pol_ov_op = -U.mean(
        tf.minimum(surr1_pol_ov_op,
                   surr2_pol_ov_op))  # PPO's pessimistic surrogate (L^CLIP)

    op_loss = pol_surr_pol_ov_op - pol_ov_op_ent * tf.reduce_sum(entropy)
    #op_loss = pol_surr_pol_ov_op

    #total_loss += force_pi_loss
    total_loss += op_loss

    var_list = pi.get_trainable_variables()
    term_list = var_list[6:8]

    lossandgrad = U.function(
        [ob, ac, atarg, ret, lrmult, option, term_adv, pol_ov_op_ent],
        losses + [U.flatgrad(total_loss, var_list)])
    termloss = U.function([ob, option, term_adv],
                          [U.flatgrad(term_loss, var_list)
                           ])  # Since we will use a different step size.
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)

    U.initialize()
    adam.sync()

    saver = tf.train.Saver(max_to_keep=10000)
    saver_best = tf.train.Saver(max_to_keep=1)

    ### More book-keeping
    results = []
    if saves:
        results = open(
            version_name + '_' + gamename + '_' + str(num_options) + 'opts_' +
            '_results.csv', 'w')
        results_best_model = open(
            dirname + version_name + '_' + gamename + '_' + str(num_options) +
            'opts_' + '_bestmodel.csv', 'w')

        out = 'epoch,avg_reward'

        for opt in range(num_options):
            out += ',option {} dur'.format(opt)
        for opt in range(num_options):
            out += ',option {} std'.format(opt)
        for opt in range(num_options):
            out += ',option {} term'.format(opt)
        for opt in range(num_options):
            out += ',option {} adv'.format(opt)
        out += '\n'
        results.write(out)
        # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n')
        results.flush()

    if epoch >= 0:

        dirname = '{}_{}opts_saves/'.format(gamename, num_options)
        print("Loading weights from iteration: " + str(epoch))

        filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch)
        saver.restore(U.get_session(), filename)
    ###

    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    des_pol_op_ent = 0.1
    max_val = -100000
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     num_options=num_options,
                                     saves=saves,
                                     results=results,
                                     rewbuffer=rewbuffer,
                                     dc=dc)

    datas = [0 for _ in range(num_options)]

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        opt_d = []
        for i in range(num_options):
            dur = np.mean(
                seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)

        std = []
        for i in range(num_options):
            logstd = np.mean(
                seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0.
            std.append(np.exp(logstd))
        print("mean opt dur:", opt_d)
        print("mean op pol:", np.mean(np.array(seg['optpol_p']), axis=0))
        print("mean term p:", np.mean(np.array(seg['term_p']), axis=0))
        print("mean value val:", np.mean(np.array(seg['value_val']), axis=0))

        ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg[
            "adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        if hasattr(pi, "ob_rms_only"):
            pi.ob_rms_only.update(ob[:, :-ac_space.shape[0]]
                                  )  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values

        if (iters_so_far + 1) % 1000 == 0:
            des_pol_op_ent = des_pol_op_ent / 10

        if iters_so_far % 50 == 0 and wsaves:
            print("weights are saved...")
            filename = dirname + '{}_epoch_{}.ckpt'.format(
                gamename, iters_so_far)
            save_path = saver.save(U.get_session(), filename)

        # adaptively save best run:
        if (np.mean(rewbuffer) > max_val) and wsaves:
            max_val = np.mean(rewbuffer)
            results_best_model.write('epoch: ' + str(iters_so_far) + ' rew: ' +
                                     str(np.mean(rewbuffer)) + '\n')
            results_best_model.flush()
            filename = dirname + 'best.ckpt'
            save_path = saver_best.save(U.get_session(), filename)

        min_batch = 160  # Arbitrary
        t_advs = [[] for _ in range(num_options)]
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("batch size:", indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                t_advs[opt].append(0.)
                continue

            ### This part is only necessary when we use options. We run these checks so as not to discard any collected trajectories.
            if datas[opt] != 0:
                if (indices.size < min_batch and datas[opt].n > min_batch):
                    datas[opt] = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif indices.size + datas[opt].n < min_batch:
                    # pdb.set_trace()
                    oldmap = datas[opt].data_map

                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = Dataset(dict(ob=cat_ob,
                                              ac=cat_ac,
                                              atarg=cat_atarg,
                                              vtarg=cat_vtarg),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif (indices.size + datas[opt].n > min_batch and datas[opt].n
                      < min_batch) or (indices.size > min_batch
                                       and datas[opt].n < min_batch):

                    oldmap = datas[opt].data_map
                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = d = Dataset(dict(ob=cat_ob,
                                                  ac=cat_ac,
                                                  atarg=cat_atarg,
                                                  vtarg=cat_vtarg),
                                             shuffle=not pi.recurrent)

                if (indices.size > min_batch and datas[opt].n > min_batch):
                    datas[opt] = d = Dataset(dict(ob=ob[indices],
                                                  ac=ac[indices],
                                                  atarg=atarg[indices],
                                                  vtarg=tdlamret[indices]),
                                             shuffle=not pi.recurrent)

            elif datas[opt] == 0:
                datas[opt] = d = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)
            ###

            optim_batchsize = optim_batchsize or ob.shape[0]
            optim_epochs = np.clip(
                int(10 * (indices.size /
                          (timesteps_per_batch / num_options))), 10,
                10) if num_options > 1 else optim_epochs
            # note: np.clip(x, 10, 10) pins optim_epochs to 10 whenever num_options > 1
            print("optim epochs:", optim_epochs)
            logger.log("Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):

                    #tadv,nodc_adv = pi.get_term_adv(batch["ob"],[opt])
                    tadv, nodc_adv = pi.get_opt_adv(batch["ob"], [opt])
                    tadv = tadv if num_options > 1 else np.zeros_like(tadv)
                    t_advs[opt].append(nodc_adv)

                    #if (opt==1):
                    #    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv)
                    #else:
                    #    *newlosses, grads = lossandgrad0(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv)
                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"],
                                                    batch["atarg"],
                                                    batch["vtarg"], cur_lrmult,
                                                    [opt], tadv,
                                                    des_pol_op_ent)
                    #*newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv)
                    #termg = termloss(batch["ob"], [opt], tadv)
                    #adam.update(termg[0], 5e-7 * cur_lrmult)
                    adam.update(grads, optim_stepsize * cur_lrmult)
                    losses.append(newlosses)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        ### Bookkeeping
        if saves:
            out = "{},{}"
            for _ in range(num_options):
                out += ",{},{},{},{}"
            out += "\n"

            info = [iters_so_far, np.mean(rewbuffer)]
            for i in range(num_options):
                info.append(opt_d[i])
            for i in range(num_options):
                info.append(std[i])
            for i in range(num_options):
                info.append(np.mean(np.array(seg['term_p']), axis=0)[i])
            for i in range(num_options):
                info.append(np.mean(t_advs[i]))

            results.write(out.format(*info))
            results.flush()
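
The bookkeeping block above emits one CSV row per iteration: iteration index,
mean reward, then four per-option statistics grouped field-by-field. A minimal
sketch of how the format string expands for num_options=2 (all values invented
for illustration):

out = "{},{}" + ",{},{},{},{}" * 2 + "\n"
info = [3, 150.2,       # iters_so_far, mean reward
        0.6, 0.4,       # opt_d per option
        0.1, 0.1,       # std per option
        0.05, 0.07,     # mean termination probability per option
        0.01, -0.02]    # mean option advantage per option
print(out.format(*info), end='')
# -> 3,150.2,0.6,0.4,0.1,0.1,0.05,0.07,0.01,-0.02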
Exemplo n.º 21
0
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        identifier,
        save_result=True,
        save_interval=100,
        reward_list=[],
        cont=False,
        play=False,
        iter,
        action_repeat=1):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    mirror = hasattr(env, 'mirror_id')
    mirror_id = env.mirror_id if mirror else None
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    if mirror:
        mirror_ob = U.get_placeholder(name="mirror_ob",
                                      dtype=tf.float32,
                                      shape=[None] + list(ob_space.shape))
        mirror_ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    sym_loss = 4 * tf.reduce_mean(tf.square(ac - mirror_ac)) if mirror else 0
    total_loss = pol_surr + pol_entpen + vf_loss + sym_loss

    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]
    if mirror:
        losses.append(sym_loss)
        loss_names.append("sym_loss")

    var_list = pi.get_trainable_variables()
    inputs = [ob, ac, atarg, ret, lrmult]
    if mirror:
        inputs += [mirror_ob, mirror_ac]
    lossandgrad = U.function(inputs,
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function(inputs, losses)

    if play:
        return pi

    if cont:
        load_state(identifier, iter)
    else:
        U.initialize()
        iter = 0
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True,
                                     mirror_id=mirror_id,
                                     action_repeat=action_repeat)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = int(iter)
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    rewbuffer_ori = deque(maxlen=100)

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        if mirror:
            mirror_ob, mirror_ac = seg["mirror_ob"], seg["mirror_ac"]

        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d_dict = dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret)
        if mirror:
            d_dict["mirror_ob"] = mirror_ob
            d_dict["mirror_ac"] = mirror_ac
        d = Dataset(d_dict, shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                batches = [
                    batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"],
                    cur_lrmult
                ]
                if mirror:
                    batches += [batch["mirror_ob"], batch["mirror_ac"]]
                *newlosses, g = lossandgrad(*batches)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)

        losses = []
        for batch in d.iterate_once(optim_batchsize):
            batches = [
                batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"],
                cur_lrmult
            ]
            if mirror:
                batches += [batch["mirror_ob"], batch["mirror_ac"]]
            newlosses = compute_losses(*batches)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)

        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_rets_ori"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, rews_ori = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        rewbuffer_ori.extend(rews_ori)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpRewOriMean", np.mean(rewbuffer_ori))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

            reward_list.append(np.mean(rewbuffer_ori))
            if save_result and iters_so_far % save_interval == 0:
                save_state(identifier, iters_so_far)
                save_rewards(reward_list, identifier, iters_so_far)
                logger.log('Model and reward saved')

    return pi
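
A hypothetical invocation of this learn variant; the environment id, the
MlpPolicy class, and all hyperparameter values are illustrative assumptions,
not taken from the source:

import gym

def policy_fn(name, ob_space, ac_space):
    # MlpPolicy stands in for whichever policy class is paired with this learn
    return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                     hid_size=64, num_hid_layers=2)

env = gym.make("Walker2d-v2")
pi = learn(env, policy_fn,
           timesteps_per_actorbatch=2048,
           clip_param=0.2, entcoeff=0.0,
           optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
           gamma=0.99, lam=0.95,
           max_timesteps=1_000_000,
           schedule='linear',
           identifier='walker_ppo',
           iter=0)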
Exemplo n.º 22
0
def learn(
        args,
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        writer=None):
    print("\nBeginning learning...\n")

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.compat.v1.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.compat.v1.placeholder(dtype=tf.float32,
                                   shape=[None])  # Empirical return

    lrmult = tf.compat.v1.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = {}
    ob['adj'] = U.get_placeholder_cached(name="adj")
    ob['node'] = U.get_placeholder_cached(name="node")

    ob_gen = {}
    ob_gen['adj'] = U.get_placeholder(
        shape=[None, ob_space['adj'].shape[0], None, None],
        dtype=tf.float32,
        name='adj_gen')
    ob_gen['node'] = U.get_placeholder(
        shape=[None, 1, None, ob_space['node'].shape[2]],
        dtype=tf.float32,
        name='node_gen')

    ob_real = {}
    ob_real['adj'] = U.get_placeholder(
        shape=[None, ob_space['adj'].shape[0], None, None],
        dtype=tf.float32,
        name='adj_real')
    ob_real['node'] = U.get_placeholder(
        shape=[None, 1, None, ob_space['node'].shape[2]],
        dtype=tf.float32,
        name='node_real')

    ac = tf.compat.v1.placeholder(dtype=tf.int64,
                                  shape=[None, 4],
                                  name='ac_real')

    ## PPO loss
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    pi_logp = pi.pd.logp(ac)
    oldpi_logp = oldpi.pd.logp(ac)
    ratio_log = pi.pd.logp(ac) - oldpi.pd.logp(ac)

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    ## Expert loss
    loss_expert = -tf.reduce_mean(pi_logp)

    ## Discriminator loss
    step_pred_real, step_logit_real = discriminator_net(ob_real,
                                                        args,
                                                        name='d_step')
    step_pred_gen, step_logit_gen = discriminator_net(ob_gen,
                                                      args,
                                                      name='d_step')
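    # The real-sample targets below are 0.9 instead of 1.0: one-sided label
    # smoothing, a common trick for stabilizing GAN discriminator training.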
    loss_d_step_real = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=step_logit_real,
            labels=tf.ones_like(step_logit_real) * 0.9))
    loss_d_step_gen = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=step_logit_gen, labels=tf.zeros_like(step_logit_gen)))
    loss_d_step = loss_d_step_real + loss_d_step_gen
    if args.gan_type == 'normal':
        loss_g_step_gen = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=step_logit_gen, labels=tf.zeros_like(step_logit_gen)))
    elif args.gan_type == 'recommend':
        loss_g_step_gen = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=step_logit_gen,
                labels=tf.ones_like(step_logit_gen) * 0.9))
    elif args.gan_type == 'wgan':
        loss_d_step, _, _ = discriminator(ob_real, ob_gen, args, name='d_step')
        loss_d_step = loss_d_step * -1
        loss_g_step_gen, _ = discriminator_net(ob_gen, args, name='d_step')

    final_pred_real, final_logit_real = discriminator_net(ob_real,
                                                          args,
                                                          name='d_final')
    final_pred_gen, final_logit_gen = discriminator_net(ob_gen,
                                                        args,
                                                        name='d_final')
    loss_d_final_real = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=final_logit_real,
            labels=tf.ones_like(final_logit_real) * 0.9))
    loss_d_final_gen = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=final_logit_gen, labels=tf.zeros_like(final_logit_gen)))
    loss_d_final = loss_d_final_real + loss_d_final_gen
    if args.gan_type == 'normal':
        loss_g_final_gen = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=final_logit_gen, labels=tf.zeros_like(final_logit_gen)))
    elif args.gan_type == 'recommend':
        loss_g_final_gen = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=final_logit_gen,
                labels=tf.ones_like(final_logit_gen) * 0.9))
    elif args.gan_type == 'wgan':
        loss_d_final, _, _ = discriminator(ob_real,
                                           ob_gen,
                                           args,
                                           name='d_final')
        loss_d_final = loss_d_final * -1
        loss_g_final_gen, _ = discriminator_net(ob_gen, args, name='d_final')

    var_list_pi = pi.get_trainable_variables()
    var_list_pi_stop = [
        var for var in var_list_pi
        if ('emb' in var.name) or ('gcn' in var.name) or ('stop' in var.name)
    ]
    var_list_d_step = [
        var for var in tf.compat.v1.global_variables() if 'd_step' in var.name
    ]
    var_list_d_final = [
        var for var in tf.compat.v1.global_variables() if 'd_final' in var.name
    ]

    ## debug
    debug = {}

    ## loss update function
    lossandgrad_ppo = U.function([
        ob['adj'], ob['node'], ac, pi.ac_real, oldpi.ac_real, atarg, ret,
        lrmult
    ], losses + [U.flatgrad(total_loss, var_list_pi)])
    lossandgrad_expert = U.function(
        [ob['adj'], ob['node'], ac, pi.ac_real],
        [loss_expert, U.flatgrad(loss_expert, var_list_pi)])
    lossandgrad_expert_stop = U.function(
        [ob['adj'], ob['node'], ac, pi.ac_real],
        [loss_expert, U.flatgrad(loss_expert, var_list_pi_stop)])
    lossandgrad_d_step = U.function(
        [ob_real['adj'], ob_real['node'], ob_gen['adj'], ob_gen['node']],
        [loss_d_step, U.flatgrad(loss_d_step, var_list_d_step)])
    lossandgrad_d_final = U.function(
        [ob_real['adj'], ob_real['node'], ob_gen['adj'], ob_gen['node']],
        [loss_d_final,
         U.flatgrad(loss_d_final, var_list_d_final)])
    loss_g_gen_step_func = U.function([ob_gen['adj'], ob_gen['node']],
                                      loss_g_step_gen)
    loss_g_gen_final_func = U.function([ob_gen['adj'], ob_gen['node']],
                                       loss_g_final_gen)

    adam_pi = MpiAdam(var_list_pi, epsilon=adam_epsilon)
    adam_pi_stop = MpiAdam(var_list_pi_stop, epsilon=adam_epsilon)
    adam_d_step = MpiAdam(var_list_d_step, epsilon=adam_epsilon)
    adam_d_final = MpiAdam(var_list_d_final, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.compat.v1.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    compute_losses = U.function([
        ob['adj'], ob['node'], ac, pi.ac_real, oldpi.ac_real, atarg, ret,
        lrmult
    ], losses)

    # Prepare for rollouts
    # ----------------------------------------
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    lenbuffer_valid = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    rewbuffer_env = deque(maxlen=100)  # rolling buffer for episode rewards
    rewbuffer_d_step = deque(maxlen=100)  # rolling buffer for episode rewards
    rewbuffer_d_final = deque(maxlen=100)  # rolling buffer for episode rewards
    rewbuffer_final = deque(maxlen=100)  # rolling buffer for episode rewards
    rewbuffer_final_stat = deque(
        maxlen=100)  # rolling buffer for episode rewards

    seg_gen = traj_segment_generator(args, pi, env, timesteps_per_actorbatch,
                                     True, loss_g_gen_step_func,
                                     loss_g_gen_final_func)

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"
    if args.load == 1:
        try:
            fname = './ckpt/' + args.name_full_load
            sess = tf.compat.v1.get_default_session()
            # sess.run(tf.compat.v1.global_variables_initializer())
            saver = tf.compat.v1.train.Saver(var_list_pi)
            saver.restore(sess, fname)
            iters_so_far = int(fname.split('_')[-1]) + 1
            print('model restored!', fname, 'iters_so_far:', iters_so_far)
        except Exception:
            print(fname, 'ckpt not found, start with iters 0')

    U.initialize()
    adam_pi.sync()
    adam_pi_stop.sync()
    adam_d_step.sync()
    adam_d_final.sync()

    counter = 0
    level = 0
    ## start training
    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # logger.log("********** Iteration %i ************"%iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)
        ob_adj, ob_node, ac, atarg, tdlamret = seg["ob_adj"], seg[
            "ob_node"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob_adj=ob_adj,
                         ob_node=ob_node,
                         ac=ac,
                         atarg=atarg,
                         vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob_adj.shape[0]

        # inner training loop, train policy
        for i_optim in range(optim_epochs):

            loss_expert = 0
            loss_expert_stop = 0
            g_expert = 0
            g_expert_stop = 0

            loss_d_step = 0
            loss_d_final = 0
            g_ppo = 0
            g_d_step = 0
            g_d_final = 0

            pretrain_shift = 5
            ## Expert
            if iters_so_far >= args.expert_start and iters_so_far <= args.expert_end + pretrain_shift:
                ## Expert train
                # learn how to stop
                ob_expert, ac_expert = env.get_expert(optim_batchsize)
                loss_expert, g_expert = lossandgrad_expert(
                    ob_expert['adj'], ob_expert['node'], ac_expert, ac_expert)
                loss_expert = np.mean(loss_expert)

            ## PPO
            if iters_so_far >= args.rl_start and iters_so_far <= args.rl_end:
                assign_old_eq_new(
                )  # set old parameter values to new parameter values
                batch = d.next_batch(optim_batchsize)
                # ppo
                if iters_so_far >= args.rl_start + pretrain_shift:  # start the generator after the discriminator has trained for a while
                    *newlosses, g_ppo = lossandgrad_ppo(
                        batch["ob_adj"], batch["ob_node"], batch["ac"],
                        batch["ac"], batch["ac"], batch["atarg"],
                        batch["vtarg"], cur_lrmult)
                    losses_ppo = newlosses

                if args.has_d_step == 1 and i_optim >= optim_epochs // 2:
                    # update step discriminator
                    ob_expert, _ = env.get_expert(
                        optim_batchsize,
                        curriculum=args.curriculum,
                        level_total=args.curriculum_num,
                        level=level)
                    loss_d_step, g_d_step = lossandgrad_d_step(
                        ob_expert["adj"], ob_expert["node"], batch["ob_adj"],
                        batch["ob_node"])
                    adam_d_step.update(g_d_step, optim_stepsize * cur_lrmult)
                    loss_d_step = np.mean(loss_d_step)

                if args.has_d_final == 1 and i_optim >= optim_epochs // 4 * 3:
                    # update final discriminator
                    ob_expert, _ = env.get_expert(
                        optim_batchsize,
                        is_final=True,
                        curriculum=args.curriculum,
                        level_total=args.curriculum_num,
                        level=level)
                    seg_final_adj, seg_final_node = traj_final_generator(
                        pi, copy.deepcopy(env), optim_batchsize, True)
                    # update final discriminator
                    loss_d_final, g_d_final = lossandgrad_d_final(
                        ob_expert["adj"], ob_expert["node"], seg_final_adj,
                        seg_final_node)
                    adam_d_final.update(g_d_final, optim_stepsize * cur_lrmult)

            # update generator
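            # g_ppo and g_expert are flat gradient vectors; mixing them with
            # fixed weights (0.2 RL, 0.05 imitation) lets a single Adam step
            # balance the two objectives via hand-tuned coefficients.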
            adam_pi.update(0.2 * g_ppo + 0.05 * g_expert,
                           optim_stepsize * cur_lrmult)

        # WGAN
        # if args.has_d_step == 1:
        #     clip_D = [p.assign(tf.clip_by_value(p, -0.01, 0.01)) for p in var_list_d_step]
        # if args.has_d_final == 1:
        #     clip_D = [p.assign(tf.clip_by_value(p, -0.01, 0.01)) for p in var_list_d_final]
        #

        ## PPO val
        # if iters_so_far >= args.rl_start and iters_so_far <= args.rl_end:
        # logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob_adj"], batch["ob_node"],
                                       batch["ac"], batch["ac"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        # logger.log(fmt_row(13, meanlosses))

        if writer is not None:
            writer.add_scalar("loss_expert", loss_expert, iters_so_far)
            writer.add_scalar("loss_expert_stop", loss_expert_stop,
                              iters_so_far)
            writer.add_scalar("loss_d_step", loss_d_step, iters_so_far)
            writer.add_scalar("loss_d_final", loss_d_final, iters_so_far)
            writer.add_scalar('grad_expert_min', np.amin(g_expert),
                              iters_so_far)
            writer.add_scalar('grad_expert_max', np.amax(g_expert),
                              iters_so_far)
            writer.add_scalar('grad_expert_norm', np.linalg.norm(g_expert),
                              iters_so_far)
            writer.add_scalar('grad_expert_stop_min', np.amin(g_expert_stop),
                              iters_so_far)
            writer.add_scalar('grad_expert_stop_max', np.amax(g_expert_stop),
                              iters_so_far)
            writer.add_scalar('grad_expert_stop_norm',
                              np.linalg.norm(g_expert_stop), iters_so_far)
            writer.add_scalar('grad_rl_min', np.amin(g_ppo), iters_so_far)
            writer.add_scalar('grad_rl_max', np.amax(g_ppo), iters_so_far)
            writer.add_scalar('grad_rl_norm', np.linalg.norm(g_ppo),
                              iters_so_far)
            writer.add_scalar('g_d_step_min', np.amin(g_d_step), iters_so_far)
            writer.add_scalar('g_d_step_max', np.amax(g_d_step), iters_so_far)
            writer.add_scalar('g_d_step_norm', np.linalg.norm(g_d_step),
                              iters_so_far)
            writer.add_scalar('g_d_final_min', np.amin(g_d_final),
                              iters_so_far)
            writer.add_scalar('g_d_final_max', np.amax(g_d_final),
                              iters_so_far)
            writer.add_scalar('g_d_final_norm', np.linalg.norm(g_d_final),
                              iters_so_far)
            writer.add_scalar('learning_rate', optim_stepsize * cur_lrmult,
                              iters_so_far)

        for (lossval, name) in zipsame(meanlosses, loss_names):
            # logger.record_tabular("loss_"+name, lossval)
            if writer is not None:
                writer.add_scalar("loss_" + name, lossval, iters_so_far)
        # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        if writer is not None:
            writer.add_scalar("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret),
                              iters_so_far)
        lrlocal = (seg["ep_lens"], seg["ep_lens_valid"], seg["ep_rets"],
                   seg["ep_rets_env"], seg["ep_rets_d_step"],
                   seg["ep_rets_d_final"], seg["ep_final_rew"],
                   seg["ep_final_rew_stat"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, lens_valid, rews, rews_env, rews_d_step, rews_d_final, rews_final, rews_final_stat = map(
            flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        lenbuffer_valid.extend(lens_valid)
        rewbuffer.extend(rews)
        rewbuffer_d_step.extend(rews_d_step)
        rewbuffer_d_final.extend(rews_d_final)
        rewbuffer_env.extend(rews_env)
        rewbuffer_final.extend(rews_final)
        rewbuffer_final_stat.extend(rews_final_stat)
        # logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        # logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        # logger.record_tabular("EpThisIter", len(lens))
        if writer is not None:
            writer.add_scalar("EpLenMean", np.mean(lenbuffer), iters_so_far)
            writer.add_scalar("EpLenValidMean", np.mean(lenbuffer_valid),
                              iters_so_far)
            writer.add_scalar("EpRewMean", np.mean(rewbuffer), iters_so_far)
            writer.add_scalar("EpRewDStepMean", np.mean(rewbuffer_d_step),
                              iters_so_far)
            writer.add_scalar("EpRewDFinalMean", np.mean(rewbuffer_d_final),
                              iters_so_far)
            writer.add_scalar("EpRewEnvMean", np.mean(rewbuffer_env),
                              iters_so_far)
            writer.add_scalar("EpRewFinalMean", np.mean(rewbuffer_final),
                              iters_so_far)
            writer.add_scalar("EpRewFinalStatMean",
                              np.mean(rewbuffer_final_stat), iters_so_far)
            writer.add_scalar("EpThisIter", len(lens), iters_so_far)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        # logger.record_tabular("EpisodesSoFar", episodes_so_far)
        # logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        # logger.record_tabular("TimeElapsed", time.time() - tstart)
        if writer is not None:
            writer.add_scalar("EpisodesSoFar", episodes_so_far, iters_so_far)
            writer.add_scalar("TimestepsSoFar", timesteps_so_far, iters_so_far)
            writer.add_scalar("TimeElapsed",
                              time.time() - tstart, iters_so_far)

        if MPI.COMM_WORLD.Get_rank() == 0:
            with open('molecule_gen/' + args.name_full + '.csv', 'a') as f:
                f.write('***** Iteration {} *****\n'.format(iters_so_far))
            # save
            if iters_so_far % args.save_every == 0:
                fname = './ckpt/' + args.name_full + '_' + str(iters_so_far)
                saver = tf.compat.v1.train.Saver(var_list_pi)
                saver.save(tf.compat.v1.get_default_session(), fname)
                print('model saved!', fname)
                # fname = os.path.join(ckpt_dir, task_name)
                # os.makedirs(os.path.dirname(fname), exist_ok=True)
                # saver = tf.train.Saver()
                # saver.save(tf.get_default_session(), fname)
            # if iters_so_far==args.load_step:
        iters_so_far += 1
        counter += 1
        # advance the curriculum one level every curriculum_step iterations
        if counter % args.curriculum_step == 0 and counter // args.curriculum_step < args.curriculum_num:
            level += 1
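
Both learn variants above minimize the same pessimistic clipped surrogate. A
minimal NumPy sketch of that objective outside the TensorFlow graph (function
name and toy inputs are illustrative):

import numpy as np

def ppo_clip_loss(logp_new, logp_old, adv, clip_param=0.2):
    """Pessimistic PPO surrogate: -mean(min(r*A, clip(r)*A))."""
    ratio = np.exp(logp_new - logp_old)  # pnew / pold
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return -np.mean(np.minimum(surr1, surr2))

# With a positive advantage, a ratio of 2.0 is clipped back to 1.2:
print(ppo_clip_loss(np.log([2.0]), np.log([1.0]), np.array([1.0])))  # -1.2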
Exemplo n.º 23
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None
        feature_funcs = []

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        self.std = tf.constant(1.0)
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):

            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            # Hand-crafted feature expansions (kept for reference):
            # for i in range(0, ob_space.shape[0]):
            #     # Polynomial
            #     # feature_funcs.append(lambda s, i=i: tf.pow(s, i))
            #     # Fourier
            #     # feature_funcs.append(lambda s, i=i: tf.cos(i*np.pi*s))
            #     # RBF
            #     feature_funcs.append(lambda s, i=i: tf.exp(-tf.pow(s - self.ob_rms.mean, 2)/(2*self.ob_rms.std**2)))
            # obz = tf.concat([func(ob) for func in feature_funcs], axis=1)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(0.1))[:, 0]

        with tf.variable_scope('pol'):
            last_out = obz
            # for i in range(num_hid_layers):
            #     last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name = 'fc%i' % (i + 1), kernel_initializer = U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.multiply(
                    tf.ones(shape=[1, pdtype.param_shape()[0] // 2]),
                    tf.constant(0.05))
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        pdparam = tf.clip_by_value(pdparam, -10.0, 10.0)
        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
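
The commented-out block above sketches hand-crafted feature expansions for the
value-function input. A standalone NumPy version of the Fourier and RBF
families it mentions (purely illustrative, not part of the source):

import numpy as np

def fourier_features(s, order):
    """cos(i*pi*s) for i = 0..order-1, concatenated per input dimension."""
    return np.concatenate([np.cos(i * np.pi * s) for i in range(order)], axis=-1)

def rbf_features(s, mean, std):
    """Gaussian bump centred on a running observation mean."""
    return np.exp(-(s - mean) ** 2 / (2.0 * std ** 2))

s = np.array([[0.5, -0.25]])
print(fourier_features(s, order=3).shape)  # (1, 6)
print(rbf_features(s, mean=0.0, std=1.0))  # values in (0, 1]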
Exemplo n.º 24
0
    def _init(self, ob_space, ac_space, hid_layers=[],
              deterministic=True, diagonal=True, trainable_std=True,
              use_bias=True, use_critic=False,
              seed=None, verbose=True,
              hidden_W_init=U.normc_initializer(1.0),
              higher_mean_init=None,
              higher_logstd_init=tf.constant_initializer(np.log(0.11)),
              const_std_init=False):
        """Params:
            ob_space: task observation space
            ac_space : task action space
            hid__layers: list with width of each hidden layer
            deterministic: whether the actor is deterministic
            diagonal: whether the higher order policy has a diagonal covariance
            matrix
            use_bias: whether to include bias in neurons
            use_critic: whether to include a critic network
            seed: optional random seed
        """
        # Check environment's shapes
        assert isinstance(ob_space, gym.spaces.Box)
        assert len(ac_space.shape) == 1
        # Set seed
        if seed is not None:
            set_global_seeds(seed)
        # Set some attributes
        self.diagonal = diagonal
        self.use_bias = use_bias
        batch_length = None  # Accepts a sequence of episodes of arbitrary length
        self.ac_dim = ac_space.shape[0]
        self.ob_dim = ob_space.shape[0]
        self.linear = not hid_layers
        self.verbose = verbose
        self._ob = ob = U.get_placeholder(
            name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))

        # Actor (N.B.: weight initialization is irrelevant)
        with tf.variable_scope('actor'):
            last_out = ob
            for i, hid_size in enumerate(hid_layers):
                # Mlp feature extraction
                last_out = tf.nn.tanh(
                    tf.layers.dense(last_out, hid_size,
                                    name='fc%i' % (i+1),
                                    kernel_initializer=hidden_W_init,
                                    use_bias=use_bias))
            if deterministic and isinstance(ac_space, gym.spaces.Box):
                # Deterministic action selection
                self.actor_mean = actor_mean = \
                    tf.layers.dense(last_out, ac_space.shape[0],
                                    name='action',
                                    kernel_initializer=hidden_W_init,
                                    use_bias=use_bias)
            else:
                raise NotImplementedError

        # Get actor flatten weights
        with tf.variable_scope('actor') as scope:
            self.actor_weights = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                scope=scope.name)
            # flatten weights
            self.flat_actor_weights = tf.concat(
                [tf.reshape(w, [-1]) for w in self.actor_weights], axis=0)
            self._n_actor_weights = n_actor_weights = \
                self.flat_actor_weights.shape[0]

        # Higher order policy (Gaussian)
        with tf.variable_scope('higher'):
            if higher_mean_init is None:
                # Initial means sampled from a normal distribution N(0,1)
                higher_mean_init = tf.where(
                    tf.not_equal(self.flat_actor_weights,
                                 tf.constant(0, dtype=tf.float32)),
                    tf.random_normal(shape=[n_actor_weights.value],
                                     stddev=0.01),
                    tf.zeros(shape=[n_actor_weights]))  # bias init always zero
            self.higher_mean = tf.get_variable(
                name='higher_mean',
                initializer=higher_mean_init,
                shape=self.flat_actor_weights.get_shape())
            # Keep the weights' domain compact
            # self.higher_mean = higher_mean = tf.clip_by_value(
            #     self.higher_mean, -1, 1, 'higher_mean_clipped')
            higher_mean = self.higher_mean
            if diagonal:
                if const_std_init:
                    self.higher_logstd = higher_logstd = \
                        tf.get_variable(
                            name='higher_logstd',
                            initializer=higher_logstd_init,
                            trainable=trainable_std)
                else:
                    self.higher_logstd = higher_logstd = \
                        tf.get_variable(
                            name='higher_logstd',
                            shape=[n_actor_weights],
                            initializer=higher_logstd_init,
                            trainable=trainable_std)
                pdparam = tf.concat([higher_mean,
                                     higher_mean * 0. + higher_logstd],
                                    axis=0)
                self.pdtype = pdtype = \
                    DiagGaussianPdType(n_actor_weights.value)
            else:
                # Cholesky covariance matrix
                self.higher_logstd = higher_logstd = tf.get_variable(
                    name='higher_logstd',
                    shape=[n_actor_weights*(n_actor_weights + 1)//2],
                    initializer=tf.initializers.constant(0.))
                pdparam = tf.concat([higher_mean, higher_logstd],
                                    axis=0)
                self.pdtype = pdtype = CholeskyGaussianPdType(
                    n_actor_weights.value)

        # Sample actor weights
        self.pd = pdtype.pdfromflat(pdparam)
        sampled_actor_params = self.pd.sample()
        symm_sampled_actor_params = self.pd.sample_symmetric()
        self._sample_actor_params = U.function([], [sampled_actor_params])
        self._sample_symm_actor_params = U.function(
            [], list(symm_sampled_actor_params))

        # Assign actor weights
        with tf.variable_scope('actor') as scope:
            actor_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                             scope=scope.name)
            self._use_sampled_actor_params = \
                U.assignFromFlat(actor_params, sampled_actor_params)
            self._get_actor_params = U.GetFlat(actor_params)
            self._set_actor_params = U.SetFromFlat(actor_params)

        # Act
        self._action = action = actor_mean
        self._act = U.function([ob], [action])

        # Manage higher policy weights
        with tf.variable_scope('higher') as scope:
            self._higher_params = higher_params = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
            self.flat_higher_params = tf.concat([tf.reshape(w, [-1]) for w in
                                                 self._higher_params], axis=0)
            self._n_higher_params = self.flat_higher_params.shape[0]
            self._get_flat_higher_params = U.GetFlat(higher_params)
            self._set_higher_params = U.SetFromFlat(self._higher_params)

        # Evaluating
        self._actor_params_in = actor_params_in = \
            U.get_placeholder(name='actor_params_in',
                              dtype=tf.float32,
                              shape=[batch_length] + [n_actor_weights])
        self._rets_in = rets_in = \
            U.get_placeholder(name='returns_in',
                              dtype=tf.float32,
                              shape=[batch_length])
        # NB: tf.nn.moments returns (mean, variance), so "ret_std" holds a variance
        ret_mean, ret_std = tf.nn.moments(rets_in, axes=[0])
        self._get_ret_mean = U.function([self._rets_in], [ret_mean])
        self._get_ret_std = U.function([self._rets_in], [ret_std])
        self._logprobs = logprobs = self.pd.logp(actor_params_in)
        pgpe_times_n = U.flatgrad(logprobs*rets_in, higher_params)
        self._get_pgpe_times_n = U.function([actor_params_in, rets_in],
                                            [pgpe_times_n])
        self._get_actor_mean = U.function([ob], [self.actor_mean])
        self._get_higher_mean = U.function([ob], [self.higher_mean])
        self._get_higher_std = U.function([], tf.exp([self.higher_logstd]))

        # Batch off-policy PGPE
        self._probs = tf.exp(logprobs)
        self._behavioral = None
        self._renyi_other = None

        # Renyi computation
        self._det_sigma = tf.exp(tf.reduce_sum(self.higher_logstd))

        # Fisher computation (diagonal case)
        mean_fisher_diag = tf.exp(-2*self.higher_logstd)
        if trainable_std:
            cov_fisher_diag = mean_fisher_diag*0 + 2
            self._fisher_diag = tf.concat(
                [mean_fisher_diag, cov_fisher_diag], axis=0)
        else:
            self._fisher_diag = mean_fisher_diag
        self._get_fisher_diag = U.function([], [self._fisher_diag])
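
For a diagonal Gaussian parameterized by (mean, log-std), the diagonal of the
Fisher information is exp(-2*logstd) for the mean block and the constant 2 for
the log-std block, which is exactly what the code above concatenates. A small
numeric check (illustrative):

import numpy as np

logstd = np.array([-0.5, 0.0, 0.5])
fisher_mean = np.exp(-2.0 * logstd)          # 1 / sigma^2 per dimension
fisher_cov = np.full_like(fisher_mean, 2.0)  # constant for log-std parameters
print(np.concatenate([fisher_mean, fisher_cov]))
# -> [2.718... 1.0 0.367... 2.0 2.0 2.0]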
Exemplo n.º 25
0
    def _build(self):
        ac_space = self._ac_space
        num_hid_layers = self._num_hid_layers
        hid_size = self._hid_size
        gaussian_fixed_var = self._gaussian_fixed_var
        if not isinstance(hid_size, list):
            hid_size = [hid_size]
        if len(hid_size) != num_hid_layers:
            hid_size += [hid_size[-1]] * (num_hid_layers - len(hid_size))

        self.obs = []
        self.pds = []

        for j in range(self._config.num_contexts):
            # obs
            _ob = {}
            for ob_name, ob_shape in self._ob_shape.items():
                _ob[ob_name] = U.get_placeholder(
                    name="ob_{}/from_{}".format(ob_name, j),
                    dtype=tf.float32,
                    shape=[None] + self._ob_shape[ob_name])

            # obs normalization
            if self._config.obs_norm == 'learn':
                obz = [(_ob[ob_name] - self.ob_rms[ob_name].mean) /
                       self.ob_rms[ob_name].std for ob_name in self.ob_type]
            else:
                obz = [_ob[ob_name] for ob_name in self.ob_type]

            obz = [tf.clip_by_value(ob, -5.0, 5.0) for ob in obz]
            obz = tf.concat(obz, -1)

            # value function
            with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
                last_out = obz
                for i in range(num_hid_layers):
                    last_out = self._activation(
                        tf.layers.dense(
                            last_out,
                            hid_size[i],
                            name="fc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))
                vpred = tf.layers.dense(
                    last_out,
                    1,
                    name="final",
                    kernel_initializer=U.normc_initializer(1.0))[:, 0]
                if j == self._id:
                    self.vpred = vpred

            # policy
            pdtype = make_pdtype(ac_space)
            if j == self._id:
                self.pdtype = pdtype
            with tf.variable_scope('pol', reuse=tf.AUTO_REUSE):
                last_out = obz
                for i in range(num_hid_layers):
                    last_out = self._activation(
                        tf.layers.dense(
                            last_out,
                            hid_size[i],
                            name="fc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))

                if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                    mean = tf.layers.dense(
                        last_out,
                        pdtype.param_shape()[0] // 2,
                        name="final",
                        kernel_initializer=U.normc_initializer(0.01))
                    logstd = tf.get_variable(
                        name="logstd",
                        shape=[1, pdtype.param_shape()[0] // 2],
                        initializer=tf.zeros_initializer())
                    pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
                else:
                    pdparam = tf.layers.dense(
                        last_out,
                        pdtype.param_shape()[0],
                        name="final",
                        kernel_initializer=U.normc_initializer(0.01))

            self.obs.append([_ob[ob_name] for ob_name in self.ob_type])
            self.pds.append(pdtype.pdfromflat(pdparam))

        self.ob = self.obs[self._id]
        self.pd = self.pds[self._id]

        # sample action
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic] + self.ob, [ac, self.vpred])
        self._value = U.function([stochastic] + self.ob, self.vpred)
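
The per-context loop above relies on tf.AUTO_REUSE so all contexts share a
single set of 'vf' and 'pol' weights. A minimal TF1-style sketch of that reuse
mechanism (scope and layer names invented):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

def head(x):
    with tf.variable_scope('pol', reuse=tf.AUTO_REUSE):
        return tf.layers.dense(x, 4, name='fc1')

a = head(tf.zeros([1, 8]))
b = head(tf.ones([1, 8]))  # second call reuses the same 'pol/fc1' weights
print([v.name for v in
       tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='pol')])
# -> ['pol/fc1/kernel:0', 'pol/fc1/bias:0']  (one copy, not two)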
Exemplo n.º 26
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0,
              w_intfc=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.w_intfc = w_intfc
        self.state_in = []
        self.state_out = []
        self.dc = dc
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = dense3D2(last_out,
                              1,
                              "vffinal",
                              option,
                              num_options=num_options,
                              weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "termfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        termination_sample = tf.greater(
            self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),
                                          maxval=1.))

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(pdparam)
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        # self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OP", weight_init=U.normc_initializer(1.0)))
        # pdb.set_trace()
        # self.op_pi = tf.constant(1./num_options)

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "intfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.intfc = tf.sigmoid(
            U.dense(last_out,
                    num_options,
                    "intfcfinal",
                    weight_init=U.normc_initializer(1.0)))

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "OP%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.op_pi = tf.nn.softmax(
            U.dense(last_out,
                    num_options,
                    "OPfinal",
                    weight_init=U.normc_initializer(1.0)))

        self._act = U.function([stochastic, ob, option], [ac])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
        self._get_intfc = U.function([ob], [self.intfc])
        self._get_op = U.function([ob], [self.op_pi])
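
The termination head samples end-of-option events by comparing the sigmoid
output against uniform noise, i.e. a Bernoulli draw with probability tpred.
The same operation in NumPy (illustrative):

import numpy as np

rng = np.random.default_rng(0)
tpred = np.array([0.1, 0.5, 0.9])                  # termination probabilities
terminate = tpred > rng.uniform(size=tpred.shape)  # Bernoulli(tpred) samples
print(terminate)  # [False  True  True] with this seed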
Exemplo n.º 27
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = ob  #tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name='fc%i' % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        # Since the action space is a Box, the distribution used here
        # is DiagGaussianPd
        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        # if stochastic is true, sample from the distribution;
        # otherwise just use the mean (mode)
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
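
For context, here is a minimal rollout sketch (not part of the original snippet; `pi`, `env`, and `horizon` are assumed names) showing how a policy built this way is typically driven through its compiled `_act` function:

# hypothetical usage sketch; assumes `pi` is an instance of the class
# above and `env` is a Gym environment
ob = env.reset()
for _ in range(horizon):
    # _act expects a batch, hence ob[None]; it returns batched outputs
    ac, vpred = pi._act(True, ob[None])
    ob, rew, done, _ = env.step(ac[0])
    if done:
        ob = env.reset()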
Example #28
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        next_ob = U.get_placeholder(name="next_ob",
                                    dtype=tf.float32,
                                    shape=[sequence_length] +
                                    list(ob_space.shape))

        act = U.get_placeholder(name="act",
                                dtype=tf.float32,
                                shape=[sequence_length] + list(ac_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('qf'):
            obz = tf.clip_by_value(
                (next_ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = obz

            for i in range(num_hid_layers):
                if i == num_hid_layers - 1:
                    last_out = tf.concat([last_out, act], axis=-1)
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.qpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            # note: obz here reuses the clipped observation built in the
            # 'vf' scope above
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name='fc%i' % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
                import numpy as np
                # note: np.random.randn is evaluated once at graph-construction
                # time, so logstd is scaled by a fixed random vector rather
                # than fresh per-step noise
                pdparam = tf.concat([
                    mean,
                    mean * 0.0 +
                    np.random.randn(pdtype.param_shape()[0] // 2) * logstd
                ], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
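
Note that the snippet above builds a Q-head over (next_ob, act) but only exposes `_act`. A hypothetical pair of accessors in the same U.function style (assumed, not present in the original) could be appended to `_init` like this:

        # hypothetical accessors, not part of the original snippet
        self._qvalue = U.function([next_ob, act], [self.qpred])  # Q(s', a)
        self._value = U.function([ob], [self.vpred])             # V(s)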
Example #29

    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0):
        assert isinstance(ob_space, gym.spaces.Box)

        self.ac_space_dim = ac_space.shape[0]
        self.ob_space_dim = ob_space.shape[0]
        self.dc = dc
        self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        # build a filter for the pure observation shape, i.e. excluding the
        # previous action u[k-1] appended to the observation
        obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope("obfilter_pure"):
            self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        obz_pure = tf.clip_by_value(
            (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) /
            self.ob_rms_only.std, -5.0, 5.0)

        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(
                U.dense(last_out0,
                        hid_size,
                        "vffc0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(
                U.dense(last_out1,
                        hid_size,
                        "vffc1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "vfff0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "vfff1",
                            weight_init=U.normc_initializer(1.0))

        #self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:,0]
        #last_out0 = tf.Print(last_out0,[tf.size(last_out0[:,0])])
        self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

        #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(
                U.dense(last_out0,
                        hid_size,
                        "oppi0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(
                U.dense(last_out1,
                        hid_size,
                        "oppi1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "oppif0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "oppif1",
                            weight_init=U.normc_initializer(1.0))
        last_out = tf.concat([last_out0, last_out1], 1)
        self.op_pi = tf.nn.softmax(last_out)

        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        #termination_sample = tf.greater(self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),maxval=1.))
        # termination is hard-coded to True (the option ends every step);
        # the stochastic rule above is left commented out
        termination_sample = tf.constant([True])

        # define the angle
        #ctrl_in = tf.reshape([(tf.math.atan2(ob[:,1],ob[:,0])),(ob[:,2])], [-1,2])
        #last_out = ctrl_in
        last_out = obz_pure
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01),
                            bias=False)
            mean = tf.nn.tanh(mean)
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        #ac = tf.Print (ac, [ac,option,ob], "action and option before selecting: ")
        ac = U.switch(option[0], ac,
                      tf.stop_gradient(ob[:, -self.ac_space_dim:]))
        ac = tf.clip_by_value(ac, -1.0, 1.0)
        #ac = U.switch(option[0], tf.constant(1.0), tf.constant(0.0))
        #ac = tf.Print (ac, [ac], "action after selection: ")
        self.last_action = tf.stop_gradient(ac)
        self._act = U.function([stochastic, ob, option],
                               [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op = U.function([ob], [self.op_pi])
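
A hedged sketch of how this two-option policy might be driven (assumed names `pi`, `env`, `horizon`; assumes numpy imported as `np`): option 1 runs the learned controller on the action-free observation, while option 0 replays the previous action stored in the observation's trailing slots:

# hypothetical driver loop for the two-option policy above
ob = env.reset()
for _ in range(horizon):
    op_probs = pi._get_op(ob[None])[0][0]  # softmax over the two options
    option = np.argmax(op_probs)           # greedy; sampling also works
    ac, vpred, _, _ = pi._act(True, ob[None], [option])
    ob, rew, done, _ = env.step(ac[0])
    if done:
        ob = env.reset()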
Example #30
    def _init(self, ob_space, ac_space, kind, atom_type_num, args):
        self.pdtype = MultiCatCategoricalPdType
        ### 0 Get input
        ob = {
            'adj':
            U.get_placeholder(
                name="adj",
                dtype=tf.float32,
                shape=[None, ob_space['adj'].shape[0], None, None]),
            'node':
            U.get_placeholder(name="node",
                              dtype=tf.float32,
                              shape=[None, 1, None, ob_space['node'].shape[2]])
        }
        # only used when evaluating a given action at training time
        self.ac_real = U.get_placeholder(name='ac_real',
                                         dtype=tf.int64,
                                         shape=[None,
                                                4])  # feed ground-truth action
        ob_node = tf.compat.v1.layers.dense(ob['node'],
                                            8,
                                            activation=None,
                                            use_bias=False,
                                            name='emb')  # embedding layer
        if args.bn == 1:
            ob_node = tf.compat.v1.layers.batch_normalization(ob_node, axis=-1)
        if args.has_concat == 1:
            emb_node = tf.concat(
                (GCN_batch(ob['adj'],
                           ob_node,
                           args.emb_size,
                           name='gcn1',
                           aggregate=args.gcn_aggregate), ob_node),
                axis=-1)
        else:
            emb_node = GCN_batch(ob['adj'],
                                 ob_node,
                                 args.emb_size,
                                 name='gcn1',
                                 aggregate=args.gcn_aggregate)
        if args.bn == 1:
            emb_node = tf.compat.v1.layers.batch_normalization(emb_node,
                                                               axis=-1)
        for i in range(args.layer_num_g - 2):
            if args.has_residual == 1:
                # residual connection to the previous layer's output
                # (fixed: the original referenced an undefined self.emb_node1)
                emb_node = GCN_batch(
                    ob['adj'],
                    emb_node,
                    args.emb_size,
                    name='gcn1_' + str(i + 1),
                    aggregate=args.gcn_aggregate) + emb_node
            elif args.has_concat == 1:
                # concatenate with the previous layer's output
                # (fixed: the original referenced an undefined self.emb_node1)
                emb_node = tf.concat(
                    (GCN_batch(ob['adj'],
                               emb_node,
                               args.emb_size,
                               name='gcn1_' + str(i + 1),
                               aggregate=args.gcn_aggregate), emb_node),
                    axis=-1)
            else:
                emb_node = GCN_batch(ob['adj'],
                                     emb_node,
                                     args.emb_size,
                                     name='gcn1_' + str(i + 1),
                                     aggregate=args.gcn_aggregate)
            if args.bn == 1:
                emb_node = tf.compat.v1.layers.batch_normalization(emb_node,
                                                                   axis=-1)
        emb_node = GCN_batch(ob['adj'],
                             emb_node,
                             args.emb_size,
                             is_act=False,
                             is_normalize=(args.bn == 0),
                             name='gcn2',
                             aggregate=args.gcn_aggregate)
        emb_node = tf.squeeze(emb_node, axis=1)  # B*n*f

        ### 1 only keep effective nodes
        # ob_mask = tf.cast(tf.transpose(tf.reduce_sum(ob['node'],axis=-1),[0,2,1]),dtype=tf.bool) # B*n*1
        # count the valid (non-zero) nodes per graph: shape B
        node_mask = tf.cast(tf.reduce_sum(ob['node'], axis=-1), dtype=tf.bool)
        ob_len = tf.reduce_sum(
            tf.squeeze(tf.cast(node_mask, dtype=tf.float32), axis=-2),
            axis=-1)  # B
        ob_len_first = ob_len - atom_type_num
        logits_mask = tf.sequence_mask(
            ob_len, maxlen=tf.shape(ob['node'])[2])  # mask of all valid nodes
        logits_first_mask = tf.sequence_mask(
            ob_len_first,
            maxlen=tf.shape(ob['node'])[2]
        )  # valid nodes minus atom_type_num (excludes isolated candidate nodes)

        if args.mask_null == 1:
            emb_node_null = tf.zeros(tf.shape(emb_node))
            emb_node = tf.where(condition=tf.tile(
                tf.expand_dims(logits_mask, axis=-1),
                (1, 1, emb_node.get_shape()[-1])),
                                x=emb_node,
                                y=emb_node_null)

        ## get graph embedding
        emb_graph = tf.reduce_sum(emb_node, axis=1, keepdims=True)
        if args.graph_emb == 1:
            emb_graph = tf.tile(emb_graph, [1, tf.shape(emb_node)[1], 1])
            emb_node = tf.concat([emb_node, emb_graph], axis=2)

        ### 2 predict stop
        emb_stop = tf.compat.v1.layers.dense(emb_node,
                                             args.emb_size,
                                             activation=tf.nn.relu,
                                             use_bias=False,
                                             name='linear_stop1')
        if args.bn == 1:
            emb_stop = tf.compat.v1.layers.batch_normalization(emb_stop,
                                                               axis=-1)
        self.logits_stop = tf.reduce_sum(emb_stop, axis=1)
        self.logits_stop = tf.compat.v1.layers.dense(
            self.logits_stop, 2, activation=None, name='linear_stop2_1')  # B*2
        # explicitly show node num
        # self.logits_stop = tf.concat((tf.reduce_mean(tf.compat.v1.layers.dense(emb_node, 32, activation=tf.nn.relu, name='linear_stop1'),axis=1),tf.reshape(ob_len_first/5,[-1,1])),axis=1)
        # self.logits_stop = tf.compat.v1.layers.dense(self.logits_stop, 2, activation=None, name='linear_stop2')  # B*2

        stop_shift = tf.constant([[0, args.stop_shift]], dtype=tf.float32)
        pd_stop = CategoricalPdType(-1).pdfromflat(flat=self.logits_stop +
                                                   stop_shift)
        ac_stop = pd_stop.sample()

        ### 3.1: select first (active) node
        # rules: only select effective nodes
        self.logits_first = tf.compat.v1.layers.dense(emb_node,
                                                      args.emb_size,
                                                      activation=tf.nn.relu,
                                                      name='linear_select1')
        self.logits_first = tf.squeeze(tf.compat.v1.layers.dense(
            self.logits_first, 1, activation=None, name='linear_select2'),
                                       axis=-1)  # B*n
        logits_first_null = tf.ones(tf.shape(self.logits_first)) * -1000
        self.logits_first = tf.where(condition=logits_first_mask,
                                     x=self.logits_first,
                                     y=logits_first_null)
        # using own prediction
        pd_first = CategoricalPdType(-1).pdfromflat(flat=self.logits_first)
        ac_first = pd_first.sample()
        mask = tf.one_hot(ac_first,
                          depth=tf.shape(emb_node)[1],
                          dtype=tf.bool,
                          on_value=True,
                          off_value=False)
        emb_first = tf.boolean_mask(emb_node, mask)
        emb_first = tf.expand_dims(emb_first, axis=1)
        # using ground-truth action
        ac_first_real = self.ac_real[:, 0]
        mask_real = tf.one_hot(ac_first_real,
                               depth=tf.shape(emb_node)[1],
                               dtype=tf.bool,
                               on_value=True,
                               off_value=False)
        emb_first_real = tf.boolean_mask(emb_node, mask_real)
        emb_first_real = tf.expand_dims(emb_first_real, axis=1)

        ### 3.2: select second node
        # rules: do not select first node
        # using own prediction

        # mlp
        emb_cat = tf.concat(
            [tf.tile(emb_first, [1, tf.shape(emb_node)[1], 1]), emb_node],
            axis=2)
        self.logits_second = tf.compat.v1.layers.dense(emb_cat,
                                                       args.emb_size,
                                                       activation=tf.nn.relu,
                                                       name='logits_second1')
        self.logits_second = tf.compat.v1.layers.dense(self.logits_second,
                                                       1,
                                                       activation=None,
                                                       name='logits_second2')
        # # bilinear
        # self.logits_second = tf.transpose(bilinear(emb_first, emb_node, name='logits_second'), [0, 2, 1])

        self.logits_second = tf.squeeze(self.logits_second, axis=-1)
        ac_first_mask = tf.one_hot(ac_first,
                                   depth=tf.shape(emb_node)[1],
                                   dtype=tf.bool,
                                   on_value=False,
                                   off_value=True)
        logits_second_mask = tf.logical_and(logits_mask, ac_first_mask)
        logits_second_null = tf.ones(tf.shape(self.logits_second)) * -1000
        self.logits_second = tf.where(condition=logits_second_mask,
                                      x=self.logits_second,
                                      y=logits_second_null)

        pd_second = CategoricalPdType(-1).pdfromflat(flat=self.logits_second)
        ac_second = pd_second.sample()
        mask = tf.one_hot(ac_second,
                          depth=tf.shape(emb_node)[1],
                          dtype=tf.bool,
                          on_value=True,
                          off_value=False)
        emb_second = tf.boolean_mask(emb_node, mask)
        emb_second = tf.expand_dims(emb_second, axis=1)

        # using ground truth
        # mlp
        emb_cat = tf.concat(
            [tf.tile(emb_first_real, [1, tf.shape(emb_node)[1], 1]), emb_node],
            axis=2)
        self.logits_second_real = tf.compat.v1.layers.dense(
            emb_cat,
            args.emb_size,
            activation=tf.nn.relu,
            name='logits_second1',
            reuse=True)
        self.logits_second_real = tf.compat.v1.layers.dense(
            self.logits_second_real,
            1,
            activation=None,
            name='logits_second2',
            reuse=True)
        # # bilinear
        # self.logits_second_real = tf.transpose(bilinear(emb_first_real, emb_node, name='logits_second'), [0, 2, 1])

        self.logits_second_real = tf.squeeze(self.logits_second_real, axis=-1)
        ac_first_mask_real = tf.one_hot(ac_first_real,
                                        depth=tf.shape(emb_node)[1],
                                        dtype=tf.bool,
                                        on_value=False,
                                        off_value=True)
        logits_second_mask_real = tf.logical_and(logits_mask,
                                                 ac_first_mask_real)
        self.logits_second_real = tf.where(condition=logits_second_mask_real,
                                           x=self.logits_second_real,
                                           y=logits_second_null)

        ac_second_real = self.ac_real[:, 1]
        mask_real = tf.one_hot(ac_second_real,
                               depth=tf.shape(emb_node)[1],
                               dtype=tf.bool,
                               on_value=True,
                               off_value=False)
        emb_second_real = tf.boolean_mask(emb_node, mask_real)
        emb_second_real = tf.expand_dims(emb_second_real, axis=1)

        ### 3.3 predict edge type
        # using own prediction
        # MLP
        emb_cat = tf.concat([emb_first, emb_second], axis=-1)
        self.logits_edge = tf.compat.v1.layers.dense(emb_cat,
                                                     args.emb_size,
                                                     activation=tf.nn.relu,
                                                     name='logits_edge1')
        self.logits_edge = tf.compat.v1.layers.dense(self.logits_edge,
                                                     ob['adj'].get_shape()[1],
                                                     activation=None,
                                                     name='logits_edge2')
        self.logits_edge = tf.squeeze(self.logits_edge, axis=1)
        # # bilinear
        # self.logits_edge = tf.reshape(bilinear_multi(emb_first,emb_second,out_dim=ob['adj'].get_shape()[1]),[-1,ob['adj'].get_shape()[1]])
        pd_edge = CategoricalPdType(-1).pdfromflat(self.logits_edge)
        ac_edge = pd_edge.sample()

        # using ground truth
        # MLP
        emb_cat = tf.concat([emb_first_real, emb_second_real], axis=-1)
        self.logits_edge_real = tf.compat.v1.layers.dense(
            emb_cat,
            args.emb_size,
            activation=tf.nn.relu,
            name='logits_edge1',
            reuse=True)
        self.logits_edge_real = tf.compat.v1.layers.dense(
            self.logits_edge_real,
            ob['adj'].get_shape()[1],
            activation=None,
            name='logits_edge2',
            reuse=True)
        self.logits_edge_real = tf.squeeze(self.logits_edge_real, axis=1)
        # # bilinear
        # self.logits_edge_real = tf.reshape(bilinear_multi(emb_first_real, emb_second_real, out_dim=ob['adj'].get_shape()[1]),
        #                               [-1, ob['adj'].get_shape()[1]])

        # ncat_list = [tf.shape(logits_first),ob_space['adj'].shape[-1],ob_space['adj'].shape[0]]
        self.pd = self.pdtype(-1).pdfromflat([
            self.logits_first, self.logits_second_real, self.logits_edge_real,
            self.logits_stop
        ])
        self.vpred = tf.compat.v1.layers.dense(emb_node,
                                               args.emb_size,
                                               use_bias=False,
                                               activation=tf.nn.relu,
                                               name='value1')
        if args.bn == 1:
            self.vpred = tf.compat.v1.layers.batch_normalization(self.vpred,
                                                                 axis=-1)
        self.vpred = tf.reduce_max(self.vpred, axis=1)
        self.vpred = tf.compat.v1.layers.dense(self.vpred,
                                               1,
                                               activation=None,
                                               name='value2')

        self.state_in = []
        self.state_out = []

        self.ac = tf.concat(
            (tf.expand_dims(ac_first, axis=1), tf.expand_dims(
                ac_second, axis=1), tf.expand_dims(
                    ac_edge, axis=1), tf.expand_dims(ac_stop, axis=1)),
            axis=1)

        debug = {}
        debug['ob_node'] = tf.shape(ob['node'])
        debug['ob_adj'] = tf.shape(ob['adj'])
        debug['emb_node'] = emb_node
        debug['logits_stop'] = self.logits_stop
        debug['logits_second'] = self.logits_second
        debug['ob_len'] = ob_len
        debug['logits_first_mask'] = logits_first_mask
        debug['logits_second_mask'] = logits_second_mask
        # debug['pd'] = self.pd.logp(self.ac)
        debug['ac'] = self.ac

        stochastic = tf.compat.v1.placeholder(dtype=tf.bool, shape=())
        self._act = U.function(
            [stochastic, ob['adj'], ob['node']],
            [self.ac, self.vpred, debug])  # add debug in second arg if needed
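
The composite action this graph policy emits packs four sub-decisions. A minimal sampling sketch (assumed names; `pi` is an instance, `ob` an observation dict with batched 'adj' and 'node' arrays):

# hypothetical sampling step for the graph policy above
ac, vpred, dbg = pi._act(True, ob['adj'], ob['node'])
first_node, second_node, edge_type, stop = ac[0]  # one row per batch item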
Example #31
    def __init__(self, scope, *, ob_space, ac_space, hiddens, reuse=False, normalize=False):
        self.recurrent = True
        # honor the constructor argument (the original hard-coded False,
        # silently ignoring `normalize`)
        self.normalized = normalize
        with tf.variable_scope(scope, reuse=reuse):
            self.scope = tf.get_variable_scope().name

            self.pdtype = pdtype = make_pdtype(ac_space)

            assert isinstance(ob_space, gym.spaces.Box)

            #self.observation_ph = tf.placeholder(tf.float32, [None, None] + list(ob_space.shape), name="observation")
            self.observation_ph = U.get_placeholder(name="observation", dtype=tf.float32, shape=[None,None] + list(ob_space.shape))

            # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
            self.stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
            self.taken_action_ph = tf.placeholder(dtype=tf.float32, shape=[None, None, ac_space.shape[0]], name="taken_action")

            if self.normalized:
                if self.normalized != 'ob':
                    self.ret_rms = RunningMeanStd(scope="retfilter")
                self.ob_rms = RunningMeanStd(shape=ob_space.shape, scope="obsfilter")

            obz = self.observation_ph
            if self.normalized:
                obz = tf.clip_by_value((self.observation_ph - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

            last_out = obz
            for hidden in hiddens[:-1]:
                last_out = tf.contrib.layers.fully_connected(last_out, hidden)
            self.zero_state = []
            self.state_in_ph = []
            self.state_out = []
            cell = tf.contrib.rnn.BasicLSTMCell(hiddens[-1], reuse=reuse)
            size = cell.state_size
            self.zero_state.append(np.zeros(size.c, dtype=np.float32))
            self.zero_state.append(np.zeros(size.h, dtype=np.float32))
            self.state_in_ph.append(tf.placeholder(tf.float32, [None, size.c], name="lstmv_c"))
            self.state_in_ph.append(tf.placeholder(tf.float32, [None, size.h], name="lstmv_h"))
            initial_state = tf.contrib.rnn.LSTMStateTuple(self.state_in_ph[-2], self.state_in_ph[-1])

            last_out, state_out = tf.nn.dynamic_rnn(cell, last_out, initial_state=initial_state, scope="lstmv")
            self.state_out.append(state_out)

            self.vpredz = tf.contrib.layers.fully_connected(last_out, 1, activation_fn=None)[:, :, 0]
            self.vpred = self.vpredz
            if self.normalized and self.normalized != 'ob':
                self.vpred = self.vpredz * self.ret_rms.std + self.ret_rms.mean  # raw = not standardized

            last_out = obz
            for hidden in hiddens[:-1]:
                last_out = tf.contrib.layers.fully_connected(last_out, hidden)
            cell = tf.contrib.rnn.BasicLSTMCell(hiddens[-1], reuse=reuse)
            size = cell.state_size
            print(" SIZE ")
            print(size)
            self.zero_state.append(np.zeros(size.c, dtype=np.float32))
            self.zero_state.append(np.zeros(size.h, dtype=np.float32))
            self.state_in_ph.append(tf.placeholder(tf.float32, [None, size.c], name="lstmp_c"))
            self.state_in_ph.append(tf.placeholder(tf.float32, [None, size.h], name="lstmp_h"))
            initial_state = tf.contrib.rnn.LSTMStateTuple(self.state_in_ph[-2], self.state_in_ph[-1])
            last_out, state_out = tf.nn.dynamic_rnn(cell, last_out, initial_state=initial_state, scope="lstmp")
            self.state_out.append(state_out)

            mean = tf.contrib.layers.fully_connected(last_out, ac_space.shape[0], activation_fn=None)
            logstd = tf.get_variable(name="logstd", shape=[1, ac_space.shape[0]], initializer=tf.zeros_initializer())

            self.pd = DiagonalGaussian(mean, logstd)
            self.sampled_action = switch(self.stochastic_ph, self.pd.sample(), self.pd.mode())

            self.zero_state = np.array(self.zero_state)
            self.state_in_ph = tuple(self.state_in_ph)
            self.state = self.zero_state

            for p in self.get_trainable_variables():
                tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, tf.reduce_sum(tf.square(p)))
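
A hypothetical `step` helper for driving this recurrent policy (not part of the original; assumes a default TF session, numpy imported as `np`, and that `self.state` is reset to `self.zero_state` at episode boundaries):

    def step(self, ob):
        # run one timestep, threading the LSTM state through the graph
        sess = tf.get_default_session()
        feeds = {self.observation_ph: ob[None, None],  # [batch, time, obs]
                 self.stochastic_ph: True}
        for ph, val in zip(self.state_in_ph, self.state):
            feeds[ph] = val[None]                      # add the batch dim
        ac, vstate, pstate = sess.run(
            [self.sampled_action] + self.state_out, feeds)
        self.state = np.array([vstate.c[0], vstate.h[0],
                               pstate.c[0], pstate.h[0]])
        return ac[0, 0]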
Example #32
    def _build(self):
        ac_space = self._ac_space
        num_hid_layers = self._num_hid_layers
        hid_size = self._hid_size
        gaussian_fixed_var = self._gaussian_fixed_var

        # obs
        self._obs = {}
        for ob_name, ob_shape in self._ob_shape.items():
            self._obs[ob_name] = U.get_placeholder(
                name="ob_{}_primitive".format(ob_name),
                dtype=tf.float32,
                shape=[None] + self._ob_shape[ob_name])

        # obs normalization
        self.ob_rms = {}
        for ob_name in self.ob_type:
            with tf.variable_scope("ob_rms_{}".format(ob_name)):
                self.ob_rms[ob_name] = RunningMeanStd(
                    shape=self._ob_shape[ob_name])
        obz = [(self._obs[ob_name] - self.ob_rms[ob_name].mean) /
               self.ob_rms[ob_name].std for ob_name in self.ob_type]
        obz = [tf.clip_by_value(ob, -5.0, 5.0) for ob in obz]
        obz = tf.concat(obz, -1)

        # value function
        with tf.variable_scope("vf"):
            last_out = obz
            for i in range(num_hid_layers):
                last_out = self._activation(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name="final",
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        # primitive policy
        self.pdtype = pdtype = make_pdtype(ac_space)
        with tf.variable_scope("pol"):
            last_out = obz
            for i in range(num_hid_layers):
                last_out = self._activation(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))

            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name="final",
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name="final",
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        # sample action
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.obs = [self._obs[ob_name] for ob_name in self.ob_type]
        self._act = U.function([stochastic] + self.obs, [ac, self.vpred])
        self._value = U.function(self.obs, self.vpred)
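
Since the observation is a dict keyed by `ob_type`, calls pass each component as a batch in that order; a hedged sketch (assumed `ob_dict` mapping, as if appended inside `_build`):

        # hypothetical call sketch; ob_dict is an assumed {name: array} map
        obs = [ob_dict[name][None] for name in self.ob_type]
        ac, vpred = self._act(True, *obs)
        v = self._value(*obs)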
Example #33
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, lstm_hid_size, kind):
        print("This is lstm policy for only sensors.")
        assert isinstance(ob_space, tuple)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None
        
        ob_p = U.get_placeholder(name="ob_physics", dtype=tf.float32, shape=[sequence_length] + list(ob_space[0].shape))
        ob_f = U.get_placeholder(name="ob_frames", dtype=tf.float32, shape=[sequence_length] + list(ob_space[1].shape))

        # process ob_p
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space[0].shape)
        obpz = tf.clip_by_value((ob_p - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            

        # process ob_f
        x = ob_f / 255.0

        if kind == 'small': # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large': # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        # LSTM layer for memory
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_hid_size, state_is_tuple=True, name="rnn")
        c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
        h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
        self.state_init = (c_init, h_init)
        c_in = U.get_placeholder(name="state_c", dtype=tf.float32, shape=(None, lstm_cell.state_size.c))
        h_in = U.get_placeholder(name="state_h", dtype=tf.float32, shape=(None, lstm_cell.state_size.h))
        self.state_in = (c_in, h_in)

        state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
        lstm_outputs, lstm_states = lstm_cell(x, state_in)
        lstm_c, lstm_h = lstm_states
        self.state_out = (lstm_c, lstm_h)

        rnn_out = tf.reshape(lstm_outputs, (-1, lstm_hid_size))
        
        # concatenate frame features and physics features
        ob_last = tf.concat((rnn_out, obpz), axis=-1)

        # value network
        with tf.variable_scope("vf"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.relu(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]
 
        with tf.variable_scope("pol"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            logits = tf.layers.dense(last_out, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob_p, ob_f, c_in, h_in], [ac, self.vpred, lstm_c, lstm_h])
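
A hedged rollout sketch for this sensor-LSTM policy (assumed names; assumes the environment yields a (physics, frames) observation pair): the returned `lstm_c`/`lstm_h` must be fed back on the next step, and the state reset to `state_init` at episode boundaries:

        # hypothetical rollout sketch, as if written next to this class
        c, h = self.state_init                 # zero LSTM state, batch of 1
        ob_p, ob_f = env.reset()               # assumed split observation
        for _ in range(horizon):
            ac, vpred, c, h = self._act(True, ob_p[None], ob_f[None], c, h)
            (ob_p, ob_f), rew, done, _ = env.step(ac[0])
            if done:
                c, h = self.state_init
                ob_p, ob_f = env.reset()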