# PPO agent constructor. Assumes the enclosing module imports numpy as np,
# tensorflow as tf, MPI from mpi4py, time, and baselines-style helpers
# (logger, MpiAdamOptimizer, tf_util, sync_from_root).
def __init__(
        self,
        *,
        scope,
        ob_space,
        ac_space,
        stochpol_fn,
        nsteps,
        nepochs=4,
        nminibatches=1,
        gamma=0.99,
        gamma_ext=0.99,
        lam=0.95,
        ent_coef=0,
        cliprange=0.2,
        max_grad_norm=1.0,
        vf_coef=1.0,
        lr=30e-5,
        adam_hps=None,
        testing=False,
        comm=None,
        comm_train=None,
        use_news=False,
        update_ob_stats_every_step=True,
        int_coeff=None,
        ext_coeff=None,
    ):
        self.lr = lr
        self.ext_coeff = ext_coeff
        self.int_coeff = int_coeff
        self.use_news = use_news
        self.update_ob_stats_every_step = update_ob_stats_every_step
        self.abs_scope = (tf.get_variable_scope().name + '/' +
                          scope).lstrip('/')
        self.testing = testing
        self.comm_log = MPI.COMM_SELF
        if comm is not None and comm.Get_size() > 1:
            self.comm_log = comm
            assert not testing or comm.Get_rank() != 0, \
                "Worker number zero can't be testing"
        if comm_train is not None:
            self.comm_train = comm_train
            self.comm_train_size = comm_train.Get_size()
        else:
            self.comm_train = self.comm_log
            self.comm_train_size = self.comm_log.Get_size()
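        # Only rank 0 of each communicator acts as the log / train leader.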
        self.is_log_leader = self.comm_log.Get_rank() == 0
        self.is_train_leader = self.comm_train.Get_rank() == 0
        with tf.variable_scope(scope):
            self.best_ret = -np.inf
            self.local_best_ret = -np.inf
            self.rooms = []
            self.local_rooms = []
            self.scores = []
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.stochpol = stochpol_fn()
            self.nepochs = nepochs
            self.cliprange = cliprange
            self.nsteps = nsteps
            self.nminibatches = nminibatches
            self.gamma = gamma
            self.gamma_ext = gamma_ext
            self.lam = lam
            self.adam_hps = adam_hps or dict()
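            # Placeholders for per-minibatch training data (advantages, returns,
            # old negative log-probs, old value predictions); the two None
            # dimensions are left flexible (typically batch and time).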
            self.ph_adv = tf.placeholder(tf.float32, [None, None])
            self.ph_ret_int = tf.placeholder(tf.float32, [None, None])
            self.ph_ret_ext = tf.placeholder(tf.float32, [None, None])
            self.ph_oldnlp = tf.placeholder(tf.float32, [None, None])
            self.ph_oldvpred = tf.placeholder(tf.float32, [None, None])
            self.ph_lr = tf.placeholder(tf.float32, [])
            self.ph_lr_pred = tf.placeholder(tf.float32, [])
            self.ph_cliprange = tf.placeholder(tf.float32, [])

            # Define loss.
            neglogpac = self.stochpol.pd_opt.neglogp(self.stochpol.ph_ac)
            entropy = tf.reduce_mean(self.stochpol.pd_opt.entropy())
            vf_loss_int = (0.5 * vf_coef) * tf.reduce_mean(
                tf.square(self.stochpol.vpred_int_opt - self.ph_ret_int))
            vf_loss_ext = (0.5 * vf_coef) * tf.reduce_mean(
                tf.square(self.stochpol.vpred_ext_opt - self.ph_ret_ext))
            vf_loss = vf_loss_int + vf_loss_ext
            ratio = tf.exp(self.ph_oldnlp - neglogpac)  # p_new / p_old
            negadv = -self.ph_adv
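            # PPO clipped surrogate objective, written as a loss: the advantage
            # is negated so that minimizing maximizes the clipped,
            # ratio-weighted advantage.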
            pg_losses1 = negadv * ratio
            pg_losses2 = negadv * tf.clip_by_value(
                ratio, 1.0 - self.ph_cliprange, 1.0 + self.ph_cliprange)
            pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2))
            ent_loss = (-ent_coef) * entropy
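            # Diagnostics: half the mean/max squared change in log-prob (an
            # approximation to the KL divergence) and the fraction of samples
            # whose ratio landed outside the clip range.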
            approxkl = .5 * tf.reduce_mean(
                tf.square(neglogpac - self.ph_oldnlp))
            maxkl = .5 * tf.reduce_max(tf.square(neglogpac - self.ph_oldnlp))
            clipfrac = tf.reduce_mean(
                tf.to_float(tf.greater(tf.abs(ratio - 1.0),
                                       self.ph_cliprange)))
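            # Total loss: clipped policy loss, entropy bonus (as a negative
            # term), both value-function losses, and the policy's auxiliary
            # loss.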
            loss = pg_loss + ent_loss + vf_loss + self.stochpol.aux_loss

            # Create optimizer.
            params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope=self.abs_scope)
            logger.info("PPO: using MpiAdamOptimizer connected to %i peers" %
                        self.comm_train_size)
            trainer = MpiAdamOptimizer(self.comm_train,
                                       learning_rate=self.ph_lr,
                                       **self.adam_hps)
            grads_and_vars = trainer.compute_gradients(loss, params)
            grads, vars = zip(*grads_and_vars)
            if max_grad_norm:
                # Keep clipped grads; the second return is the pre-clip norm.
                grads, global_grad_norm = tf.clip_by_global_norm(
                    grads, max_grad_norm)
            else:
                global_grad_norm = tf.global_norm(grads)
            grads_and_vars = list(zip(grads, vars))
            self._train = trainer.apply_gradients(grads_and_vars)

        # Quantities for reporting.
        self._losses = [
            loss, pg_loss, vf_loss, entropy, clipfrac, approxkl, maxkl,
            self.stochpol.aux_loss, self.stochpol.feat_var,
            self.stochpol.max_feat, global_grad_norm
        ]
        self.loss_names = [
            'tot', 'pg', 'vf', 'ent', 'clipfrac', 'approxkl', 'maxkl',
            "auxloss", "featvar", "maxfeat", "gradnorm"
        ]
        self.I = None
        self.disable_policy_update = None
        allvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                    scope=self.abs_scope)
        if self.is_log_leader:
            tf_util.display_var_info(allvars)
        tf.get_default_session().run(tf.variables_initializer(allvars))
        sync_from_root(tf.get_default_session(),
                       allvars)  # Syncs initialization across mpi workers.
        self.t0 = time.time()
        self.global_tcount = 0
Example #2: a variant of the same constructor with noise options, TensorBoard config logging, a reward-counter schedule, and checkpoint loading.
    def __init__(self, *, scope,
                 ob_space, ac_space,
                 stochpol_fn,
                 nsteps, nepochs=4, nminibatches=1,
                 gamma=0.99,
                 gamma_ext=0.99,
                 lam=0.95,
                 ent_coef=0,
                 cliprange=0.2,
                 max_grad_norm=1.0,
                 vf_coef=1.0,
                 lr=30e-5,
                 adam_hps=None,
                 testing=False,
                 kl_testing=False,
                 test_set=None,
                 test_set_clean=None,
                 comm=None, comm_train=None, use_news=False,
                 update_ob_stats_every_step=True,
                 int_coeff=None,
                 ext_coeff=None,
                 noise_type='box',
                 noise_p=0.01,
                 use_sched=0,
                 num_env=16,
                 model_to_load=None,
                 exp_name='none',
                 config=None
                 ):
        self.exp_name = exp_name
        self.noise_type = noise_type
        self.noise_p = noise_p
        self.lr = lr
        self.ext_coeff = ext_coeff
        self.int_coeff = int_coeff
        self.use_news = use_news
        self.update_ob_stats_every_step = update_ob_stats_every_step
        self.abs_scope = (tf.get_variable_scope().name + '/' + scope).lstrip('/')
        self.testing = testing
        self.kl_testing = kl_testing
        self.test_set = test_set
        self.test_set_clean = test_set_clean
        self.comm_log = MPI.COMM_SELF
        self.ep_rews = []
        self.model_to_load = model_to_load

        sess = tf.get_default_session()
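        # TensorBoard writer for this experiment; if a config dict is given,
        # it is logged once as a text summary.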
        self.summary_writer = tf.summary.FileWriter(os.path.join('checkpoints', self.exp_name, 'tb'), sess.graph)
        if config is not None:
            with sess.graph.as_default():
                config_text = tf.summary.text('Experiment_Config', tf.convert_to_tensor(dict_to_array(config)))
            self.summary_writer.add_summary(config_text.eval(session=sess))

        if comm is not None and comm.Get_size() > 1:
            self.comm_log = comm
            assert not testing or comm.Get_rank() != 0, "Worker number zero can't be testing"
        if comm_train is not None:
            self.comm_train, self.comm_train_size = comm_train, comm_train.Get_size()
        else:
            self.comm_train, self.comm_train_size = self.comm_log, self.comm_log.Get_size()

        # make sure only one thread writes log
        self.is_log_leader = self.comm_log.Get_rank()==0
        self.is_train_leader = self.comm_train.Get_rank()==0

        with tf.variable_scope(scope):
            self.best_ret = -np.inf
            self.local_best_ret = - np.inf
            self.rooms = []
            self.local_rooms = []
            self.scores = []
            self.ob_space = ob_space
            self.ac_space = ac_space

            # define reward counter
            self.rew_counter = tf.get_variable('rew_counter', [], initializer=tf.constant_initializer(0.))
            self.inc_rew_counter = tf.assign_add(self.rew_counter, 1./num_env)

            if use_sched:
                # With use_sched, the policy is built with access to the reward
                # counter (presumably so it can schedule quantities on it).
                self.stochpol = stochpol_fn(rew_counter=self.rew_counter)
            else:
                self.stochpol = stochpol_fn()

            self.nepochs = nepochs
            self.cliprange = cliprange
            self.nsteps = nsteps
            self.nminibatches = nminibatches
            self.gamma = gamma
            self.gamma_ext = gamma_ext
            self.lam = lam
            self.adam_hps = adam_hps or dict()
            self.ph_adv = tf.placeholder(tf.float32, [None, None])
            self.ph_ret_int = tf.placeholder(tf.float32, [None, None])
            self.ph_ret_ext = tf.placeholder(tf.float32, [None, None])
            self.ph_oldnlp = tf.placeholder(tf.float32, [None, None])
            self.ph_oldvpred = tf.placeholder(tf.float32, [None, None])
            self.ph_lr = tf.placeholder(tf.float32, [])
            self.ph_lr_pred = tf.placeholder(tf.float32, [])
            self.ph_cliprange = tf.placeholder(tf.float32, [])


            # Define loss.
            neglogpac = self.stochpol.pd_opt.neglogp(self.stochpol.ph_ac)
            entropy = tf.reduce_mean(self.stochpol.pd_opt.entropy())
            vf_loss_int = (0.5 * vf_coef) * tf.reduce_mean(tf.square(self.stochpol.vpred_int_opt - self.ph_ret_int))
            vf_loss_ext = (0.5 * vf_coef) * tf.reduce_mean(tf.square(self.stochpol.vpred_ext_opt - self.ph_ret_ext))
            vf_loss = vf_loss_int + vf_loss_ext
            ratio = tf.exp(self.ph_oldnlp - neglogpac) # p_new / p_old
            negadv = - self.ph_adv
            pg_losses1 = negadv * ratio
            pg_losses2 = negadv * tf.clip_by_value(ratio, 1.0 - self.ph_cliprange, 1.0 + self.ph_cliprange)
            pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2))
            ent_loss =  (- ent_coef) * entropy
            approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - self.ph_oldnlp))
            maxkl    = .5 * tf.reduce_max(tf.square(neglogpac - self.ph_oldnlp))
            clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), self.ph_cliprange)))

            loss = pg_loss + ent_loss + vf_loss + self.stochpol.aux_loss

            # Create optimizer.
            params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.abs_scope)
            logger.info("PPO: using MpiAdamOptimizer connected to %i peers" % self.comm_train_size)
            trainer = MpiAdamOptimizer(self.comm_train, learning_rate=self.ph_lr, **self.adam_hps)
            grads_and_vars = trainer.compute_gradients(loss, params)
            grads, vars = zip(*grads_and_vars)
            if max_grad_norm:
                # Keep the clipped gradients; the second return value is the pre-clip global norm.
                grads, global_grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
            else:
                global_grad_norm = tf.global_norm(grads)
            grads_and_vars = list(zip(grads, vars))
            self._train = trainer.apply_gradients(grads_and_vars)

        # Quantities for reporting.
        self._losses = [loss, pg_loss, vf_loss, entropy, clipfrac, approxkl, maxkl, self.stochpol.aux_loss,
                        self.stochpol.feat_var, self.stochpol.max_feat, global_grad_norm]
        self.loss_names = ['tot', 'pg', 'vf', 'ent', 'clipfrac', 'approxkl', 'maxkl', "auxloss", "featvar",
                           "maxfeat", "gradnorm"]
        self.I = None
        self.disable_policy_update = None
        allvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.abs_scope)
        if self.is_log_leader:
            tf_util.display_var_info(allvars)

        tf.get_default_session().run(tf.variables_initializer(allvars))

        # Load saved checkpoint
        if model_to_load is not None:
            latest_ckpt = tf.train.latest_checkpoint(os.path.join('checkpoints', model_to_load))
            if latest_ckpt:
                saver = tf.train.Saver()
                saver.restore(tf.get_default_session(), latest_ckpt)
                print("@@@@@@@@@@@@@@@@@ Loaded checkpoint %s" % latest_ckpt)

        sync_from_root(tf.get_default_session(), allvars) # Syncs initialization across mpi workers.
        self.t0 = time.time()
        self.global_tcount = 0