def __init__(self, *, scope,
             ob_space, ac_space,
             stochpol_fn,
             nsteps, nepochs=4, nminibatches=1,
             gamma=0.99,
             gamma_ext=0.99,
             lam=0.95,
             ent_coef=0,
             cliprange=0.2,
             max_grad_norm=1.0,
             vf_coef=1.0,
             lr=30e-5,
             adam_hps=None,
             testing=False,
             comm=None, comm_train=None, use_news=False,
             update_ob_stats_every_step=True,
             int_coeff=None,
             ext_coeff=None,
             ):
    self.lr = lr
    self.ext_coeff = ext_coeff
    self.int_coeff = int_coeff
    self.use_news = use_news
    self.update_ob_stats_every_step = update_ob_stats_every_step
    self.abs_scope = (tf.get_variable_scope().name + '/' + scope).lstrip('/')
    self.testing = testing
    self.comm_log = MPI.COMM_SELF
    if comm is not None and comm.Get_size() > 1:
        self.comm_log = comm
        assert not testing or comm.Get_rank() != 0, "Worker number zero can't be testing"
    if comm_train is not None:
        self.comm_train, self.comm_train_size = comm_train, comm_train.Get_size()
    else:
        self.comm_train, self.comm_train_size = self.comm_log, self.comm_log.Get_size()
    self.is_log_leader = self.comm_log.Get_rank() == 0
    self.is_train_leader = self.comm_train.Get_rank() == 0
    with tf.variable_scope(scope):
        self.best_ret = -np.inf
        self.local_best_ret = -np.inf
        self.rooms = []
        self.local_rooms = []
        self.scores = []
        self.ob_space = ob_space
        self.ac_space = ac_space
        self.stochpol = stochpol_fn()
        self.nepochs = nepochs
        self.cliprange = cliprange
        self.nsteps = nsteps
        self.nminibatches = nminibatches
        self.gamma = gamma
        self.gamma_ext = gamma_ext
        self.lam = lam
        self.adam_hps = adam_hps or dict()
        self.ph_adv = tf.placeholder(tf.float32, [None, None])
        self.ph_ret_int = tf.placeholder(tf.float32, [None, None])
        self.ph_ret_ext = tf.placeholder(tf.float32, [None, None])
        self.ph_oldnlp = tf.placeholder(tf.float32, [None, None])
        self.ph_oldvpred = tf.placeholder(tf.float32, [None, None])
        self.ph_lr = tf.placeholder(tf.float32, [])
        self.ph_lr_pred = tf.placeholder(tf.float32, [])
        self.ph_cliprange = tf.placeholder(tf.float32, [])

        # Define loss: separate value losses for the intrinsic and extrinsic heads,
        # plus the clipped PPO surrogate and an entropy bonus.
        neglogpac = self.stochpol.pd_opt.neglogp(self.stochpol.ph_ac)
        entropy = tf.reduce_mean(self.stochpol.pd_opt.entropy())
        vf_loss_int = (0.5 * vf_coef) * tf.reduce_mean(tf.square(self.stochpol.vpred_int_opt - self.ph_ret_int))
        vf_loss_ext = (0.5 * vf_coef) * tf.reduce_mean(tf.square(self.stochpol.vpred_ext_opt - self.ph_ret_ext))
        vf_loss = vf_loss_int + vf_loss_ext
        ratio = tf.exp(self.ph_oldnlp - neglogpac)  # p_new / p_old
        negadv = -self.ph_adv
        pg_losses1 = negadv * ratio
        pg_losses2 = negadv * tf.clip_by_value(ratio, 1.0 - self.ph_cliprange, 1.0 + self.ph_cliprange)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2))
        ent_loss = (-ent_coef) * entropy
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - self.ph_oldnlp))
        maxkl = .5 * tf.reduce_max(tf.square(neglogpac - self.ph_oldnlp))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), self.ph_cliprange)))
        loss = pg_loss + ent_loss + vf_loss + self.stochpol.aux_loss

        # Create optimizer.
        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.abs_scope)
        logger.info("PPO: using MpiAdamOptimizer connected to %i peers" % self.comm_train_size)
        trainer = MpiAdamOptimizer(self.comm_train, learning_rate=self.ph_lr, **self.adam_hps)
        grads_and_vars = trainer.compute_gradients(loss, params)
        grads, vars = zip(*grads_and_vars)
        if max_grad_norm:
            # Clip gradients by global norm and use the clipped gradients in the update.
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        global_grad_norm = tf.global_norm(grads)
        grads_and_vars = list(zip(grads, vars))
        self._train = trainer.apply_gradients(grads_and_vars)

    # Quantities for reporting.
    self._losses = [loss, pg_loss, vf_loss, entropy, clipfrac, approxkl, maxkl,
                    self.stochpol.aux_loss, self.stochpol.feat_var, self.stochpol.max_feat,
                    global_grad_norm]
    self.loss_names = ['tot', 'pg', 'vf', 'ent', 'clipfrac', 'approxkl', 'maxkl',
                       "auxloss", "featvar", "maxfeat", "gradnorm"]
    self.I = None
    self.disable_policy_update = None
    allvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.abs_scope)
    if self.is_log_leader:
        tf_util.display_var_info(allvars)
    tf.get_default_session().run(tf.variables_initializer(allvars))
    sync_from_root(tf.get_default_session(), allvars)  # Syncs initialization across mpi workers.
    self.t0 = time.time()
    self.global_tcount = 0
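
# --- Illustration (not part of the agent): a NumPy mirror of the clipped PPO surrogate
# --- built above. A minimal sketch; the function and array names here are hypothetical,
# --- and the inputs stand in for flattened values of ph_adv / ph_oldnlp / the new neglogpac.
import numpy as np


def clipped_pg_loss(adv, oldnlp, newnlp, cliprange):
    """Clipped surrogate: ratio from negative log-probs, clip, take the pessimistic branch."""
    ratio = np.exp(oldnlp - newnlp)  # p_new / p_old, as in the graph above
    negadv = -adv
    pg_losses1 = negadv * ratio
    pg_losses2 = negadv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    return np.mean(np.maximum(pg_losses1, pg_losses2))


# Example: a positive-advantage sample whose ratio exceeds 1 + cliprange gets clipped,
# so the contribution is -(1 + cliprange) * adv rather than -ratio * adv.
# clipped_pg_loss(np.array([1.0]), np.array([2.0]), np.array([1.0]), 0.2)  # -> -1.2
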
def __init__(self, *, scope,
             ob_space, ac_space,
             stochpol_fn,
             nsteps, nepochs=4, nminibatches=1,
             gamma=0.99,
             gamma_ext=0.99,
             lam=0.95,
             ent_coef=0,
             cliprange=0.2,
             max_grad_norm=1.0,
             vf_coef=1.0,
             lr=30e-5,
             adam_hps=None,
             testing=False,
             kl_testing=False,
             test_set=None,
             test_set_clean=None,
             comm=None, comm_train=None, use_news=False,
             update_ob_stats_every_step=True,
             int_coeff=None,
             ext_coeff=None,
             noise_type='box',
             noise_p=0.01,
             use_sched=0,
             num_env=16,
             model_to_load=None,
             exp_name='none',
             config=None,
             ):
    self.exp_name = exp_name
    self.noise_type = noise_type
    self.noise_p = noise_p
    self.lr = lr
    self.ext_coeff = ext_coeff
    self.int_coeff = int_coeff
    self.use_news = use_news
    self.update_ob_stats_every_step = update_ob_stats_every_step
    self.abs_scope = (tf.get_variable_scope().name + '/' + scope).lstrip('/')
    self.testing = testing
    self.kl_testing = kl_testing
    self.test_set = test_set
    self.test_set_clean = test_set_clean
    self.comm_log = MPI.COMM_SELF
    self.ep_rews = []
    self.model_to_load = model_to_load
    sess = tf.get_default_session()
    # TensorBoard writer; if a config dict is given, log it once as a text summary.
    self.summary_writer = tf.summary.FileWriter(os.path.join('checkpoints', self.exp_name, 'tb'), sess.graph)
    if config is not None:
        with sess.graph.as_default():
            config_text = tf.summary.text('Experiment_Config', tf.convert_to_tensor(dict_to_array(config)))
            self.summary_writer.add_summary(config_text.eval(session=sess))
    if comm is not None and comm.Get_size() > 1:
        self.comm_log = comm
        assert not testing or comm.Get_rank() != 0, "Worker number zero can't be testing"
    if comm_train is not None:
        self.comm_train, self.comm_train_size = comm_train, comm_train.Get_size()
    else:
        self.comm_train, self.comm_train_size = self.comm_log, self.comm_log.Get_size()
    # Make sure only one worker writes the log.
    self.is_log_leader = self.comm_log.Get_rank() == 0
    self.is_train_leader = self.comm_train.Get_rank() == 0
    with tf.variable_scope(scope):
        self.best_ret = -np.inf
        self.local_best_ret = -np.inf
        self.rooms = []
        self.local_rooms = []
        self.scores = []
        self.ob_space = ob_space
        self.ac_space = ac_space
        # Define reward counter; each increment adds 1/num_env.
        self.rew_counter = tf.get_variable('rew_counter', [], initializer=tf.constant_initializer(0.))
        self.inc_rew_counter = tf.assign_add(self.rew_counter, 1. / num_env)
        if use_sched:
            self.stochpol = stochpol_fn(rew_counter=self.rew_counter)
        else:
            self.stochpol = stochpol_fn()
        self.nepochs = nepochs
        self.cliprange = cliprange
        self.nsteps = nsteps
        self.nminibatches = nminibatches
        self.gamma = gamma
        self.gamma_ext = gamma_ext
        self.lam = lam
        self.adam_hps = adam_hps or dict()
        self.ph_adv = tf.placeholder(tf.float32, [None, None])
        self.ph_ret_int = tf.placeholder(tf.float32, [None, None])
        self.ph_ret_ext = tf.placeholder(tf.float32, [None, None])
        self.ph_oldnlp = tf.placeholder(tf.float32, [None, None])
        self.ph_oldvpred = tf.placeholder(tf.float32, [None, None])
        self.ph_lr = tf.placeholder(tf.float32, [])
        self.ph_lr_pred = tf.placeholder(tf.float32, [])
        self.ph_cliprange = tf.placeholder(tf.float32, [])

        # Define loss.
        neglogpac = self.stochpol.pd_opt.neglogp(self.stochpol.ph_ac)
        entropy = tf.reduce_mean(self.stochpol.pd_opt.entropy())
        vf_loss_int = (0.5 * vf_coef) * tf.reduce_mean(tf.square(self.stochpol.vpred_int_opt - self.ph_ret_int))
        vf_loss_ext = (0.5 * vf_coef) * tf.reduce_mean(tf.square(self.stochpol.vpred_ext_opt - self.ph_ret_ext))
        vf_loss = vf_loss_int + vf_loss_ext
        ratio = tf.exp(self.ph_oldnlp - neglogpac)  # p_new / p_old
        negadv = -self.ph_adv
        pg_losses1 = negadv * ratio
        pg_losses2 = negadv * tf.clip_by_value(ratio, 1.0 - self.ph_cliprange, 1.0 + self.ph_cliprange)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2))
        ent_loss = (-ent_coef) * entropy
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - self.ph_oldnlp))
        maxkl = .5 * tf.reduce_max(tf.square(neglogpac - self.ph_oldnlp))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), self.ph_cliprange)))
        loss = pg_loss + ent_loss + vf_loss + self.stochpol.aux_loss

        # Create optimizer.
        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.abs_scope)
        logger.info("PPO: using MpiAdamOptimizer connected to %i peers" % self.comm_train_size)
        trainer = MpiAdamOptimizer(self.comm_train, learning_rate=self.ph_lr, **self.adam_hps)
        grads_and_vars = trainer.compute_gradients(loss, params)
        grads, vars = zip(*grads_and_vars)
        if max_grad_norm:
            # Clip gradients by global norm and use the clipped gradients in the update.
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        global_grad_norm = tf.global_norm(grads)
        grads_and_vars = list(zip(grads, vars))
        self._train = trainer.apply_gradients(grads_and_vars)

    # Quantities for reporting.
    self._losses = [loss, pg_loss, vf_loss, entropy, clipfrac, approxkl, maxkl,
                    self.stochpol.aux_loss, self.stochpol.feat_var, self.stochpol.max_feat,
                    global_grad_norm]
    self.loss_names = ['tot', 'pg', 'vf', 'ent', 'clipfrac', 'approxkl', 'maxkl',
                       "auxloss", "featvar", "maxfeat", "gradnorm"]
    self.I = None
    self.disable_policy_update = None
    allvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.abs_scope)
    if self.is_log_leader:
        tf_util.display_var_info(allvars)
    tf.get_default_session().run(tf.variables_initializer(allvars))
    # Load saved checkpoint, if one was requested and exists.
    if model_to_load is not None:
        latest_ckpt = tf.train.latest_checkpoint(os.path.join('checkpoints', model_to_load))
        if latest_ckpt:
            saver = tf.train.Saver()
            saver.restore(tf.get_default_session(), latest_ckpt)
            logger.info("Loaded checkpoint %s" % latest_ckpt)
    sync_from_root(tf.get_default_session(), allvars)  # Syncs initialization across mpi workers.
    self.t0 = time.time()
    self.global_tcount = 0
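
# --- Illustration (not part of the agent): a standard GAE(lambda) sketch showing how the
# --- ph_adv / ph_ret_int / ph_ret_ext placeholders above could be filled. The rollout and
# --- update code is not part of this listing, so shapes and names here are assumptions;
# --- gamma would be used for the intrinsic stream and gamma_ext for the extrinsic one.
import numpy as np


def gae_returns(rews, vpreds, news, gamma, lam):
    """Generalized advantage estimation over a [nenv, nsteps] batch of rewards.

    vpreds and news carry one extra timestep (bootstrap value / next-step episode-start flag).
    Returns (advantages, returns) with returns = advantages + vpreds[:, :-1].
    """
    nenv, nsteps = rews.shape
    advs = np.zeros((nenv, nsteps), dtype=np.float32)
    lastgaelam = np.zeros(nenv, dtype=np.float32)
    for t in reversed(range(nsteps)):
        nonterminal = 1.0 - news[:, t + 1]
        delta = rews[:, t] + gamma * vpreds[:, t + 1] * nonterminal - vpreds[:, t]
        lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        advs[:, t] = lastgaelam
    return advs, advs + vpreds[:, :-1]


# A plausible (assumed) way to form the single advantage fed to ph_adv from the two streams,
# consistent with the int_coeff / ext_coeff constructor arguments:
#     adv = int_coeff * adv_int + ext_coeff * adv_ext
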
import time