# NOTE: helpers referenced below (disc_types, Scheduler, fc, find_trainable_variables,
# kfac, relu_layer, tanh_layer, linear, multionehot, tf, np, joblib) are assumed to be
# imported from the surrounding repo's utility modules; they are not defined in this excerpt.
class Discriminator(object):
    def __init__(self, sess, ob_spaces, ac_spaces, nstack, index,
                 disc_type='decentralized', hidden_size=128, lr_rate=0.01,
                 total_steps=50000, scope="discriminator", kfac_clip=0.001,
                 max_grad_norm=0.5):
        self.lr = Scheduler(v=lr_rate, nvalues=total_steps, schedule='linear')
        self.disc_type = disc_type
        if disc_type not in disc_types:
            assert False
        self.scope = scope
        self.index = index
        self.sess = sess
        ob_space = ob_spaces[index]
        ac_space = ac_spaces[index]
        self.ob_shape = ob_space.shape[0] * nstack
        nact = ac_space.n
        self.ac_shape = nact * nstack
        self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack
        self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack
        self.hidden_size = hidden_size

        if disc_type == 'decentralized':
            input_shape = self.ob_shape + self.ac_shape
        elif disc_type == 'centralized':
            input_shape = self.all_ob_shape + self.all_ac_shape
        elif disc_type == 'single':
            input_shape = self.all_ob_shape + self.all_ac_shape
        else:
            assert False

        self.g = tf.placeholder(tf.float32, (None, input_shape))
        self.e = tf.placeholder(tf.float32, (None, input_shape))
        self.lr_rate = tf.placeholder(tf.float32, ())
        self.adv = tf.placeholder(tf.float32, ())

        num_outputs = len(ob_spaces) if disc_type == 'centralized' else 1

        logits = self.build_graph(tf.concat([self.g, self.e], axis=0),
                                  num_outputs, reuse=False)
        labels = tf.concat([
            tf.ones([tf.shape(self.g)[0], 1]),
            -tf.ones([tf.shape(self.e)[0], 1])
        ], axis=0)

        g_logits = self.build_graph(self.g, num_outputs, reuse=True)
        e_logits = self.build_graph(self.e, num_outputs, reuse=True)
        # WGAN-style critic losses: push generator logits down, expert logits up.
        self.g_loss = tf.reduce_mean(g_logits)
        self.e_loss = tf.reduce_mean(-e_logits)
        # self.g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        #     logits=g_logits, labels=tf.zeros_like(g_logits)))
        # self.e_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        #     logits=e_logits, labels=tf.ones_like(e_logits)))
        self.total_loss = logits * labels
        # self.total_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        #     logits=logits, labels=labels))

        # Gradient penalty on random interpolates between generator and expert batches.
        epsilon = tf.random_uniform([], 0.0, 1.0)
        ge = self.g * epsilon + self.e * (1 - epsilon)
        gel = self.build_graph(ge, num_outputs, reuse=True)
        ddd = tf.gradients(gel, [ge])
        # tf.gradients returns a list; index it so the norm is the per-sample norm over
        # the feature axis (the original passed the list itself to tf.norm).
        ddd = tf.norm(ddd[0], axis=1)
        self.ddd = tf.reduce_mean(tf.square(ddd - 1.)) * 5

        sample_net = logits + tf.random_normal(tf.shape(logits))
        fisher_loss = -tf.reduce_mean(
            tf.pow(logits - tf.stop_gradient(sample_net), 2))

        self.reward_op = tf.sigmoid(g_logits)
        # self.reward_op = tf.nn.sigmoid_cross_entropy_with_logits(
        #     logits=g_logits, labels=tf.zeros_like(g_logits))

        self.var_list = self.get_trainable_variables()
        params = find_trainable_variables(self.scope)
        grads = tf.gradients(self.total_loss, params)
        # self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(
        #     self.total_loss, var_list=self.var_list)
        with tf.variable_scope(self.scope + '/d_optim'):
            # `async` is a reserved keyword from Python 3.7 on; the repo's kfac
            # optimizer takes `async_var` (see the Model class below).
            d_optim = kfac.KfacOptimizer(learning_rate=self.lr_rate,
                                         clip_kl=kfac_clip,
                                         momentum=0.9, kfac_update=1,
                                         epsilon=0.01, stats_decay=0.99,
                                         async_var=0, cold_iter=10,
                                         max_grad_norm=max_grad_norm)
            update_stats_op = d_optim.compute_and_apply_stats(fisher_loss,
                                                              var_list=params)
            train_op, q_runner = d_optim.apply_gradients(list(zip(grads, params)))
        self.q_runner = q_runner
        self.g_optim = tf.train.AdamOptimizer(learning_rate=0.0005).minimize(self.ddd)
        self.d_optim = train_op

        self.saver = tf.train.Saver(self.get_variables())
        self.params_flat = self.get_trainable_variables()

        # self.clip = [tf.assign(v, tf.clip_by_value(v, -0.05, 0.05))
        #              for v in self.get_trainable_variables()]
        # self.clip = tf.group(*self.clip)

    def build_graph(self, x, num_outputs=1, reuse=False):
        with tf.variable_scope(self.scope):
            if reuse:
                tf.get_variable_scope().reuse_variables()
            p_h1 = fc(x, 'fc1', nh=self.hidden_size)
            p_h2 = fc(p_h1, 'fc2', nh=self.hidden_size)
            logits = fc(p_h2, 'out', nh=num_outputs, act=lambda x: x)
        return logits

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def get_reward(self, obs, acs):
        if len(obs.shape) == 1:
            obs = np.expand_dims(obs, 0)
        if len(acs.shape) == 1:
            acs = np.expand_dims(acs, 0)
        feed_dict = {self.g: np.concatenate([obs, acs], axis=1)}
        return self.sess.run(self.reward_op, feed_dict)

    def train(self, g_obs, g_acs, e_obs, e_acs):
        feed_dict = {
            self.g: np.concatenate([g_obs, g_acs], axis=1),
            self.e: np.concatenate([e_obs, e_acs], axis=1),
            self.lr_rate: self.lr.value()
        }
        loss, _ = self.sess.run([self.total_loss, self.d_optim], feed_dict)
        # Extra gradient-penalty steps on the same batch.
        for _ in range(5):
            self.sess.run(self.g_optim, feed_dict)
        g_loss, e_loss = self.sess.run([self.g_loss, self.e_loss], feed_dict)
        return g_loss, e_loss, None, None

    def restore(self, path):
        print('restoring from:' + path)
        self.saver.restore(self.sess, path)

    def save(self, save_path):
        ps = self.sess.run(self.params_flat)
        joblib.dump(ps, save_path)

    def load(self, load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(self.params_flat, loaded_params):
            restores.append(p.assign(loaded_p))
        self.sess.run(restores)
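# A minimal, standalone sketch (numpy only) of how inputs to the decentralized
# discriminator above are laid out: each row is the agent's stacked observation
# concatenated with a one-hot encoding of its discrete action, giving width
# ob_shape + ac_shape. The `one_hot` helper here is a stand-in for the repo's
# `multionehot`; the names and sizes are illustrative assumptions, not the repo's API.
import numpy as np

def one_hot(actions, n_actions):
    # actions: (batch,) int array -> (batch, n_actions) float array
    out = np.zeros((actions.shape[0], n_actions), dtype=np.float32)
    out[np.arange(actions.shape[0]), actions] = 1.0
    return out

if __name__ == '__main__':
    ob_dim, n_act, batch = 10, 5, 4          # toy sizes, nstack assumed to be 1
    obs = np.random.randn(batch, ob_dim).astype(np.float32)
    acs = one_hot(np.random.randint(n_act, size=batch), n_act)
    disc_input = np.concatenate([obs, acs], axis=1)
    assert disc_input.shape == (batch, ob_dim + n_act)  # matches input_shape above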
class Discriminator(object):
    def __init__(self, sess, ob_spaces, ac_spaces, state_only, discount,
                 nstack, index, disc_type='decentralized', hidden_size=128,
                 lr_rate=0.01, total_steps=50000, scope="discriminator",
                 kfac_clip=0.001, max_grad_norm=0.5, l2_loss_ratio=0.01):
        self.lr = Scheduler(v=lr_rate, nvalues=total_steps, schedule='linear')
        self.disc_type = disc_type
        self.l2_loss_ratio = l2_loss_ratio
        if disc_type not in disc_types:
            assert False
        self.state_only = state_only
        self.gamma = discount
        self.scope = scope
        self.index = index
        self.sess = sess
        ob_space = ob_spaces[index]
        ac_space = ac_spaces[index]
        self.ob_shape = ob_space.shape[0] * nstack
        self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack
        # Discrete spaces expose `.n`; continuous (Box) spaces expose `.shape`.
        try:
            nact = ac_space.n
        except AttributeError:
            nact = ac_space.shape[0]
        self.ac_shape = nact * nstack
        try:
            self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack
        except AttributeError:
            self.all_ac_shape = sum([ac.shape[0] for ac in ac_spaces]) * nstack
        self.hidden_size = hidden_size

        if disc_type == 'decentralized':
            self.obs = tf.placeholder(tf.float32, (None, self.ob_shape))
            self.nobs = tf.placeholder(tf.float32, (None, self.ob_shape))
            self.act = tf.placeholder(tf.float32, (None, self.ac_shape))
            self.labels = tf.placeholder(tf.float32, (None, 1))
            self.lprobs = tf.placeholder(tf.float32, (None, 1))
        elif disc_type == 'decentralized-all':
            self.obs = tf.placeholder(tf.float32, (None, self.all_ob_shape))
            self.nobs = tf.placeholder(tf.float32, (None, self.all_ob_shape))
            self.act = tf.placeholder(tf.float32, (None, self.all_ac_shape))
            self.labels = tf.placeholder(tf.float32, (None, 1))
            self.lprobs = tf.placeholder(tf.float32, (None, 1))
        else:
            assert False

        self.lr_rate = tf.placeholder(tf.float32, ())

        with tf.variable_scope(self.scope):
            rew_input = self.obs
            if not self.state_only:
                rew_input = tf.concat([self.obs, self.act], axis=1)

            with tf.variable_scope('reward'):
                self.reward = self.relu_net(rew_input, dout=1)
                # self.reward = self.tanh_net(rew_input, dout=1)

            with tf.variable_scope('vfn'):
                self.value_fn_n = self.relu_net(self.nobs, dout=1)
                # self.value_fn_n = self.tanh_net(self.nobs, dout=1)
            with tf.variable_scope('vfn', reuse=True):
                self.value_fn = self.relu_net(self.obs, dout=1)
                # self.value_fn = self.tanh_net(self.obs, dout=1)

            # AIRL discriminator: D = exp(f) / (exp(f) + pi(a|s)), with
            # f = r(s,a) + gamma * V(s') - V(s) and lprobs = log pi(a|s).
            log_q_tau = self.lprobs
            log_p_tau = self.reward + self.gamma * self.value_fn_n - self.value_fn
            log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0)
            self.discrim_output = tf.exp(log_p_tau - log_pq)

        self.total_loss = -tf.reduce_mean(self.labels * (log_p_tau - log_pq) +
                                          (1 - self.labels) * (log_q_tau - log_pq))
        self.var_list = self.get_trainable_variables()
        params = find_trainable_variables(self.scope)
        self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in params]) * self.l2_loss_ratio
        self.total_loss += self.l2_loss

        grads = tf.gradients(self.total_loss, params)
        # fisher_loss = -self.total_loss
        # self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(
        #     self.total_loss, var_list=self.var_list)
        with tf.variable_scope(self.scope + '/d_optim'):
            # d_optim = kfac.KfacOptimizer(
            #     learning_rate=self.lr_rate, clip_kl=kfac_clip,
            #     momentum=0.9, kfac_update=1, epsilon=0.01,
            #     stats_decay=0.99, async=0, cold_iter=10,
            #     max_grad_norm=max_grad_norm)
            # update_stats_op = d_optim.compute_and_apply_stats(fisher_loss, var_list=params)
            # train_op, q_runner = d_optim.apply_gradients(list(zip(grads, params)))
            # self.q_runner = q_runner
            d_optim = tf.train.AdamOptimizer(learning_rate=self.lr_rate)
            train_op = d_optim.apply_gradients(list(zip(grads, params)))
        self.d_optim = train_op
        self.saver = tf.train.Saver(self.get_variables())
        self.params_flat = self.get_trainable_variables()

    def relu_net(self, x, layers=2, dout=1, hidden_size=128):
        out = x
        for i in range(layers):
            out = relu_layer(out, dout=hidden_size, name='l%d' % i)
        out = linear(out, dout=dout, name='lfinal')
        return out

    def tanh_net(self, x, layers=2, dout=1, hidden_size=128):
        out = x
        for i in range(layers):
            out = tanh_layer(out, dout=hidden_size, name='l%d' % i)
        out = linear(out, dout=dout, name='lfinal')
        return out

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def get_reward(self, obs, acs, obs_next, path_probs, discrim_score=False):
        if len(obs.shape) == 1:
            obs = np.expand_dims(obs, 0)
        if len(acs.shape) == 1:
            acs = np.expand_dims(acs, 0)
        if discrim_score:
            feed_dict = {self.obs: obs,
                         self.act: acs,
                         self.nobs: obs_next,
                         self.lprobs: path_probs}
            scores = self.sess.run(self.discrim_output, feed_dict)
            score = np.log(scores + 1e-20) - np.log(1 - scores + 1e-20)
        else:
            feed_dict = {self.obs: obs, self.act: acs}
            score = self.sess.run(self.reward, feed_dict)
        return score

    def train(self, g_obs, g_acs, g_nobs, g_probs, e_obs, e_acs, e_nobs, e_probs):
        labels = np.concatenate((np.zeros([g_obs.shape[0], 1]),
                                 np.ones([e_obs.shape[0], 1])), axis=0)
        feed_dict = {self.obs: np.concatenate([g_obs, e_obs], axis=0),
                     self.act: np.concatenate([g_acs, e_acs], axis=0),
                     self.nobs: np.concatenate([g_nobs, e_nobs], axis=0),
                     self.lprobs: np.concatenate([g_probs, e_probs], axis=0),
                     self.labels: labels,
                     self.lr_rate: self.lr.value()}
        loss, _ = self.sess.run([self.total_loss, self.d_optim], feed_dict)
        return loss

    def restore(self, path):
        print('restoring from:' + path)
        self.saver.restore(self.sess, path)

    def save(self, save_path):
        ps = self.sess.run(self.params_flat)
        joblib.dump(ps, save_path)

    def load(self, load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(self.params_flat, loaded_params):
            restores.append(p.assign(loaded_p))
        self.sess.run(restores)
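# A small numpy sketch (toy values only, no TF) of the identity used by
# get_reward(..., discrim_score=True) above: with D = exp(f) / (exp(f) + q) computed
# via logsumexp, log D - log(1 - D) recovers f - log q, i.e. the AIRL score
# r(s,a) + gamma*V(s') - V(s) - log pi(a|s).
import numpy as np

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    log_p_tau = rng.randn(6, 1)              # stands in for r + gamma*V(s') - V(s)
    log_q_tau = rng.randn(6, 1)              # stands in for log pi(a|s)
    log_pq = np.logaddexp(log_p_tau, log_q_tau)
    discrim_output = np.exp(log_p_tau - log_pq)
    score = np.log(discrim_output + 1e-20) - np.log(1 - discrim_output + 1e-20)
    assert np.allclose(score, log_p_tau - log_q_tau, atol=1e-5)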
# __init__ of a further Discriminator variant (GAIL-style sigmoid cross-entropy
# loss with an Adam update); the rest of this class is not shown in this excerpt.
def __init__(self, sess, ob_spaces, ac_spaces, nstack, index,
             disc_type='decentralized', hidden_size=128, lr_rate=0.01,
             total_steps=50000, scope="discriminator", kfac_clip=0.001,
             max_grad_norm=0.5):
    self.lr = Scheduler(v=lr_rate, nvalues=total_steps, schedule='linear')
    self.disc_type = disc_type
    if disc_type not in disc_types:
        assert False
    self.scope = scope
    self.index = index
    self.sess = sess
    ob_space = ob_spaces[index]
    ac_space = ac_spaces[index]
    self.ob_shape = ob_space.shape[0] * nstack
    try:
        nact = ac_space.n
    except AttributeError:
        nact = ac_space.shape[0]
    self.ac_shape = nact * nstack
    self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack
    try:
        self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack
    except AttributeError:
        self.all_ac_shape = sum([ac.shape[0] for ac in ac_spaces]) * nstack
    self.hidden_size = hidden_size

    if disc_type == 'decentralized':
        input_shape = self.ob_shape + self.all_ac_shape
    elif disc_type == 'decentralized-all':
        input_shape = self.all_ob_shape + self.all_ac_shape
    else:
        assert False

    self.g = tf.placeholder(tf.float32, (None, input_shape))
    self.e = tf.placeholder(tf.float32, (None, input_shape))
    self.lr_rate = tf.placeholder(tf.float32, ())
    self.adv = tf.placeholder(tf.float32, ())

    num_outputs = 1

    logits = self.build_graph(tf.concat([self.g, self.e], axis=0),
                              num_outputs, reuse=False)
    labels = tf.concat([
        tf.zeros([tf.shape(self.g)[0], 1]),
        tf.ones([tf.shape(self.e)[0], 1])
    ], axis=0)

    g_logits = self.build_graph(self.g, num_outputs, reuse=True)
    e_logits = self.build_graph(self.e, num_outputs, reuse=True)
    self.g_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=g_logits, labels=tf.zeros_like(g_logits)))
    self.e_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=e_logits, labels=tf.ones_like(e_logits)))
    self.total_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))
    fisher_loss = -self.total_loss

    # self.reward_op = tf.sigmoid(g_logits) * 2.0 - 1
    # self.reward_op = tf.log(tf.sigmoid(g_logits) + 1e-10)
    # self.reward_op = tf.nn.sigmoid_cross_entropy_with_logits(
    #     logits=g_logits, labels=tf.zeros_like(g_logits))
    # log D - log(1 - D); up to the 1e-10 clamp this equals the raw logit.
    self.reward_op = tf.log(tf.sigmoid(g_logits) + 1e-10) - \
        tf.log(1 - tf.sigmoid(g_logits) + 1e-10)

    self.var_list = self.get_trainable_variables()
    params = find_trainable_variables(self.scope)
    grads = tf.gradients(self.total_loss, params)
    # self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(
    #     self.total_loss, var_list=self.var_list)
    with tf.variable_scope(self.scope + '/d_optim'):
        # d_optim = kfac.KfacOptimizer(
        #     learning_rate=self.lr_rate, clip_kl=kfac_clip,
        #     momentum=0.9, kfac_update=1, epsilon=0.01,
        #     stats_decay=0.99, async=0, cold_iter=10,
        #     max_grad_norm=max_grad_norm)
        # update_stats_op = d_optim.compute_and_apply_stats(fisher_loss, var_list=params)
        # train_op, q_runner = d_optim.apply_gradients(list(zip(grads, params)))
        # self.q_runner = q_runner
        d_optim = tf.train.AdamOptimizer(learning_rate=self.lr_rate)
        train_op = d_optim.apply_gradients(list(zip(grads, params)))
    self.d_optim = train_op
    self.saver = tf.train.Saver(self.get_variables())
    self.params_flat = self.get_trainable_variables()
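# A quick numpy check (toy values, no TF) of the reward used above:
# log(sigmoid(l)) - log(1 - sigmoid(l)) collapses to the raw logit l, so this
# reward is simply the discriminator logit up to the 1e-10 clamp.
import numpy as np

if __name__ == '__main__':
    logits = np.linspace(-5.0, 5.0, 11)
    sig = 1.0 / (1.0 + np.exp(-logits))
    reward = np.log(sig + 1e-10) - np.log(1.0 - sig + 1e-10)
    assert np.allclose(reward, logits, atol=1e-6)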
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps,
             nprocs=2, nsteps=200, nstack=1, ent_coef=0.00, vf_coef=0.5,
             vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
             kfac_clip=0.001, lrschedule='linear', identical=None):
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=nprocs,
                            inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    nbatch = nenvs * nsteps
    self.num_agents = num_agents = len(ob_space)
    self.n_actions = [ac_space[k].n for k in range(self.num_agents)]
    if identical is None:
        identical = [False for _ in range(self.num_agents)]

    scale = [1 for _ in range(num_agents)]
    pointer = [i for i in range(num_agents)]
    h = 0
    for k in range(num_agents):
        if identical[k]:
            scale[h] += 1
        else:
            pointer[h] = k
            h = k
    pointer[h] = num_agents

    A, ADV, R, PG_LR = [], [], [], []
    for k in range(num_agents):
        if identical[k]:
            A.append(A[-1])
            ADV.append(ADV[-1])
            R.append(R[-1])
            PG_LR.append(PG_LR[-1])
        else:
            A.append(tf.placeholder(tf.int32, [nbatch * scale[k]]))
            ADV.append(tf.placeholder(tf.float32, [nbatch * scale[k]]))
            R.append(tf.placeholder(tf.float32, [nbatch * scale[k]]))
            PG_LR.append(tf.placeholder(tf.float32, []))

    pg_loss, entropy, vf_loss, train_loss = [], [], [], []
    self.model = step_model = []
    self.model2 = train_model = []
    self.pg_fisher = pg_fisher_loss = []
    self.logits = logits = []
    sample_net = []
    self.vf_fisher = vf_fisher_loss = []
    self.joint_fisher = joint_fisher_loss = []
    self.lld = lld = []
    self.log_pac = []

    for k in range(num_agents):
        if identical[k]:
            step_model.append(step_model[-1])
            train_model.append(train_model[-1])
        else:
            step_model.append(policy(sess, ob_space[k], ac_space[k], ob_space,
                                     ac_space, nenvs, 1, nstack, reuse=False,
                                     name='%d' % k))
            train_model.append(policy(sess, ob_space[k], ac_space[k], ob_space,
                                      ac_space, nenvs * scale[k], nsteps, nstack,
                                      reuse=True, name='%d' % k))

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model[k].pi, labels=A[k])
        self.log_pac.append(-logpac)

        lld.append(tf.reduce_mean(logpac))
        logits.append(train_model[k].pi)

        pg_loss.append(tf.reduce_mean(ADV[k] * logpac))
        entropy.append(tf.reduce_mean(cat_entropy(train_model[k].pi)))
        pg_loss[k] = pg_loss[k] - ent_coef * entropy[k]
        vf_loss.append(tf.reduce_mean(mse(tf.squeeze(train_model[k].vf), R[k])))
        train_loss.append(pg_loss[k] + vf_coef * vf_loss[k])

        pg_fisher_loss.append(-tf.reduce_mean(logpac))
        sample_net.append(train_model[k].vf +
                          tf.random_normal(tf.shape(train_model[k].vf)))
        vf_fisher_loss.append(-vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model[k].vf - tf.stop_gradient(sample_net[k]), 2)))
        joint_fisher_loss.append(pg_fisher_loss[k] + vf_fisher_loss[k])

    self.policy_params = []
    self.value_params = []

    for k in range(num_agents):
        if identical[k]:
            self.policy_params.append(self.policy_params[-1])
            self.value_params.append(self.value_params[-1])
        else:
            self.policy_params.append(find_trainable_variables("policy_%d" % k))
            self.value_params.append(find_trainable_variables("value_%d" % k))
    self.params = params = [
        a + b for a, b in zip(self.policy_params, self.value_params)
    ]
    params_flat = []
    for k in range(num_agents):
        params_flat.extend(params[k])

    self.grads_check = grads = [
        tf.gradients(train_loss[k], params[k]) for k in range(num_agents)
    ]
    clone_grads = [
        tf.gradients(lld[k], params[k]) for k in range(num_agents)
    ]

    self.optim = optim = []
    self.clones = clones = []
    update_stats_op = []
    train_op, clone_op, q_runner = [], [], []

    for k in range(num_agents):
        if identical[k]:
            optim.append(optim[-1])
            train_op.append(train_op[-1])
            q_runner.append(q_runner[-1])
            clones.append(clones[-1])
            clone_op.append(clone_op[-1])
        else:
            with tf.variable_scope('optim_%d' % k):
                optim.append(kfac.KfacOptimizer(
                    learning_rate=PG_LR[k], clip_kl=kfac_clip,
                    momentum=0.9, kfac_update=1, epsilon=0.01,
                    stats_decay=0.99, async_var=0, cold_iter=10,
                    max_grad_norm=max_grad_norm))
                update_stats_op.append(optim[k].compute_and_apply_stats(
                    joint_fisher_loss, var_list=params[k]))
                train_op_, q_runner_ = optim[k].apply_gradients(
                    list(zip(grads[k], params[k])))
                train_op.append(train_op_)
                q_runner.append(q_runner_)

            with tf.variable_scope('clone_%d' % k):
                clones.append(kfac.KfacOptimizer(
                    learning_rate=PG_LR[k], clip_kl=kfac_clip,
                    momentum=0.9, kfac_update=1, epsilon=0.01,
                    stats_decay=0.99, async_var=0, cold_iter=10,
                    max_grad_norm=max_grad_norm))
                update_stats_op.append(clones[k].compute_and_apply_stats(
                    pg_fisher_loss[k], var_list=self.policy_params[k]))
                clone_op_, q_runner_ = clones[k].apply_gradients(
                    list(zip(clone_grads[k], self.policy_params[k])))
                clone_op.append(clone_op_)

    update_stats_op = tf.group(*update_stats_op)
    train_ops = train_op
    clone_ops = clone_op
    train_op = tf.group(*train_op)
    clone_op = tf.group(*clone_op)

    self.q_runner = q_runner
    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
    self.clone_lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        advs = [rewards[k] - values[k] for k in range(num_agents)]
        for step in range(len(obs)):
            cur_lr = self.lr.value()

        ob = np.concatenate(obs, axis=1)

        td_map = {}
        for k in range(num_agents):
            if identical[k]:
                continue
            new_map = {}
            if num_agents > 1:
                action_v = []
                for j in range(k, pointer[k]):
                    action_v.append(np.concatenate([
                        multionehot(actions[i], self.n_actions[i])
                        for i in range(num_agents) if i != k
                    ], axis=1))
                action_v = np.concatenate(action_v, axis=0)
                new_map.update({train_model[k].A_v: action_v})
                td_map.update({train_model[k].A_v: action_v})

            new_map.update({
                train_model[k].X: np.concatenate(
                    [obs[j] for j in range(k, pointer[k])], axis=0),
                train_model[k].X_v: np.concatenate(
                    [ob.copy() for j in range(k, pointer[k])], axis=0),
                A[k]: np.concatenate(
                    [actions[j] for j in range(k, pointer[k])], axis=0),
                ADV[k]: np.concatenate(
                    [advs[j] for j in range(k, pointer[k])], axis=0),
                R[k]: np.concatenate(
                    [rewards[j] for j in range(k, pointer[k])], axis=0),
                PG_LR[k]: cur_lr / float(scale[k])
            })
            sess.run(train_ops[k], feed_dict=new_map)
            td_map.update(new_map)

            if states[k] != []:
                td_map[train_model[k].S] = states
                td_map[train_model[k].M] = masks

        policy_loss, value_loss, policy_entropy = sess.run(
            [pg_loss, vf_loss, entropy], td_map)
        return policy_loss, value_loss, policy_entropy

    def clone(obs, actions):
        td_map = {}
        cur_lr = self.clone_lr.value()
        for k in range(num_agents):
            if identical[k]:
                continue
            new_map = {}
            new_map.update({
                train_model[k].X: np.concatenate(
                    [obs[j] for j in range(k, pointer[k])], axis=0),
                A[k]: np.concatenate(
                    [actions[j] for j in range(k, pointer[k])], axis=0),
                PG_LR[k]: cur_lr / float(scale[k])
            })
            sess.run(clone_ops[k], feed_dict=new_map)
            td_map.update(new_map)
        lld_loss = sess.run([lld], td_map)
        return lld_loss

    def get_log_action_prob(obs, actions):
        action_prob = []
        for k in range(num_agents):
            if identical[k]:
                continue
            new_map = {
                train_model[k].X: np.concatenate(
                    [obs[j] for j in range(k, pointer[k])], axis=0),
                A[k]: np.concatenate(
                    [actions[j] for j in range(k, pointer[k])], axis=0)
            }
            log_pac = sess.run(self.log_pac[k], feed_dict=new_map)
            if scale[k] == 1:
                action_prob.append(log_pac)
            else:
                log_pac = np.split(log_pac, scale[k], axis=0)
                action_prob += log_pac
        return action_prob

    self.get_log_action_prob = get_log_action_prob

    def get_log_action_prob_step(obs, actions):
        action_prob = []
        for k in range(num_agents):
            action_prob.append(step_model[k].step_log_prob(obs[k], actions[k]))
        return action_prob

    self.get_log_action_prob_step = get_log_action_prob_step

    def save(save_path):
        ps = sess.run(params_flat)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params_flat, loaded_params):
            restores.append(p.assign(loaded_p))
        sess.run(restores)

    self.train = train
    self.clone = clone
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model

    def step(ob, av, *_args, **_kwargs):
        a, v, s = [], [], []
        obs = np.concatenate(ob, axis=1)
        for k in range(num_agents):
            a_v = np.concatenate([
                multionehot(av[i], self.n_actions[i])
                for i in range(num_agents) if i != k
            ], axis=1)
            a_, v_, s_ = step_model[k].step(ob[k], obs, a_v)
            a.append(a_)
            v.append(v_)
            s.append(s_)
        return a, v, s

    self.step = step

    def value(obs, av):
        v = []
        ob = np.concatenate(obs, axis=1)
        for k in range(num_agents):
            a_v = np.concatenate([
                multionehot(av[i], self.n_actions[i])
                for i in range(num_agents) if i != k
            ], axis=1)
            v_ = step_model[k].value(ob, a_v)
            v.append(v_)
        return v

    self.value = value
    self.initial_state = [step_model[k].initial_state for k in range(num_agents)]
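# A standalone sketch (plain Python, illustrative values) of the `identical` /
# `scale` / `pointer` bookkeeping used in the model __init__ above: agents marked
# identical to their predecessor share placeholders and parameters, the lead agent's
# `scale` counts its group size, and `pointer[k]` marks one past the last agent in
# k's group, so the training loops slice with range(k, pointer[k]).
if __name__ == '__main__':
    identical = [False, True, True, False]   # toy flags for 4 agents
    num_agents = len(identical)
    scale = [1 for _ in range(num_agents)]
    pointer = [i for i in range(num_agents)]
    h = 0
    for k in range(num_agents):
        if identical[k]:
            scale[h] += 1
        else:
            pointer[h] = k
            h = k
    pointer[h] = num_agents
    assert scale == [3, 1, 1, 1]
    assert pointer == [3, 1, 2, 4]
    # agent 0 leads agents 0..2 (range(0, pointer[0])); agent 3 leads only itself.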
class Discriminator(object):
    def __init__(self, sess, ob_spaces, ac_spaces, nstack, index,
                 disc_type='decentralized', hidden_size=128, gp_coef=5,
                 lr_rate=5e-4, total_steps=50000, scope="discriminator"):
        self.lr = Scheduler(v=lr_rate, nvalues=total_steps * 20, schedule='linear')
        self.disc_type = disc_type
        if disc_type not in disc_types:
            assert False
        self.scope = scope
        self.index = index
        self.sess = sess
        ob_space = ob_spaces[index]
        ac_space = ac_spaces[index]
        self.ob_shape = ob_space.shape[0] * nstack
        nact = ac_space.n
        self.ac_shape = nact * nstack
        self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack
        self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack
        self.hidden_size = hidden_size

        if disc_type == 'decentralized':
            input_shape = self.all_ob_shape + self.ac_shape
        elif disc_type == 'centralized':
            input_shape = self.all_ob_shape + self.all_ac_shape
        elif disc_type == 'single':
            input_shape = self.all_ob_shape + self.all_ac_shape
        else:
            assert False

        self.g = tf.placeholder(tf.float32, (None, input_shape))
        self.e = tf.placeholder(tf.float32, (None, input_shape))
        self.lr_rate = tf.placeholder(tf.float32, ())

        num_outputs = len(ob_spaces) if disc_type == 'centralized' else 1

        self.bias = tf.get_variable(name=scope + '_bias', shape=(num_outputs,),
                                    initializer=tf.zeros_initializer,
                                    trainable=False)
        self.bias_ph = tf.placeholder(tf.float32, (num_outputs,))
        self.update_bias = tf.assign(self.bias, self.bias_ph * 0.01 + self.bias * 0.99)

        generator_logits = self.build_graph(self.g, num_outputs, reuse=False)
        expert_logits = self.build_graph(self.e, num_outputs, reuse=True)

        self.generator_loss = tf.reduce_mean(generator_logits, axis=0)
        self.expert_loss = tf.reduce_mean(expert_logits, axis=0)

        ddg = tf.gradients(generator_logits, [self.g])
        ddg = tf.sqrt(tf.reduce_sum(tf.square(ddg[0]), axis=1))
        self.ddg = tf.reduce_mean(tf.square(ddg - 1.))

        dde = tf.gradients(expert_logits, [self.e])
        dde = tf.sqrt(tf.reduce_sum(tf.square(dde[0]), axis=1))
        self.dde = tf.reduce_mean(tf.square(dde - 1.))

        # Gradient penalty on random interpolates between generator and expert batches.
        epsilon = tf.random_uniform([], 0.0, 1.0)
        ge = self.g * epsilon + self.e * (1 - epsilon)
        gel = self.build_graph(ge, num_outputs, reuse=True)
        ddd = tf.gradients(gel, [ge])
        # Index the gradient list so the per-sample norm is over the feature axis,
        # matching the ddg/dde computations above.
        ddd = tf.norm(ddd[0], axis=1)
        self.ddd = tf.reduce_mean(tf.square(ddd - 1.))

        self.total_loss = self.generator_loss - self.expert_loss + \
            gp_coef * self.ddd  # (self.ddg + self.dde)
        self.reward_op = generator_logits

        self.var_list = self.get_trainable_variables()
        self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(
            self.total_loss, var_list=self.var_list)
        self.saver = tf.train.Saver(self.get_variables())

    def build_graph(self, x, num_outputs=1, reuse=False):
        with tf.variable_scope(self.scope):
            if reuse:
                tf.get_variable_scope().reuse_variables()
            p_h1 = fc(x, 'fc1', nh=self.hidden_size)
            p_h2 = fc(p_h1, 'fc2', nh=self.hidden_size)
            p_h3 = fc(p_h2, 'fc3', nh=self.hidden_size)
            logits = fc(p_h3, 'out', nh=num_outputs, act=lambda x: x)
            logits -= self.bias
        return logits

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def get_reward(self, all_obs, acs):
        if len(all_obs.shape) == 1:
            all_obs = np.expand_dims(all_obs, 0)
        if len(acs.shape) == 1:
            acs = np.expand_dims(acs, 0)
        feed_dict = {self.g: np.concatenate([all_obs, acs], axis=1)}
        return self.sess.run(self.reward_op, feed_dict)

    def train(self, g_all_obs, g_acs, e_all_obs, e_acs):
        feed_dict = {
            self.g: np.concatenate([g_all_obs, g_acs], axis=1),
            self.e: np.concatenate([e_all_obs, e_acs], axis=1),
            self.lr_rate: self.lr.value()
        }
        gl, el, _ = self.sess.run(
            [self.generator_loss, self.expert_loss, self.d_optim], feed_dict)
        # self.sess.run(self.update_bias, feed_dict={self.bias_ph: (gl + el) / 2.0})
        return self.sess.run(
            [self.generator_loss, self.expert_loss, self.ddg, self.dde],
            feed_dict)

    def restore(self, path):
        print('restoring from:' + path)
        self.saver.restore(self.sess, path)
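# A standalone numpy sketch (toy linear critic, illustrative only) of the gradient
# penalty term built above: points are interpolated between the generator and expert
# batches and the critic's input-gradient norm is pushed toward 1. For a linear
# critic f(x) = x @ w the gradient is w everywhere, so the penalty is (||w|| - 1)^2.
import numpy as np

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    batch, dim = 8, 6
    g = rng.randn(batch, dim)                # generator batch
    e = rng.randn(batch, dim)                # expert batch
    w = rng.randn(dim)                       # weights of a toy linear critic
    epsilon = rng.uniform(0.0, 1.0)          # single shared mixing coefficient
    ge = g * epsilon + e * (1 - epsilon)     # interpolated batch, as in the graph above
    gel = ge @ w                             # critic output on the interpolates
    assert gel.shape == (batch,)
    grad = np.tile(w, (batch, 1))            # d(gel)/d(ge) = w for every row
    per_sample_norm = np.linalg.norm(grad, axis=1)
    penalty = np.mean(np.square(per_sample_norm - 1.0))
    assert np.isclose(penalty, (np.linalg.norm(w) - 1.0) ** 2)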