def __init__(self, *, ac_space, policy_network, value_network=None, ent_coef, vf_coef, max_grad_norm): super(Model, self).__init__(name='PPO2Model') self.train_model = PolicyWithValue(ac_space, policy_network, value_network, estimate_q=False) if MPI is not None: self.optimizer = MpiAdamOptimizer( MPI.COMM_WORLD, self.train_model.trainable_variables) else: self.optimizer = tf.keras.optimizers.Adam() self.ent_coef = ent_coef self.vf_coef = vf_coef self.max_grad_norm = max_grad_norm self.step = self.train_model.step self.mode = self.train_model.mode self.value = self.train_model.value self.initial_state = self.train_model.initial_state self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] if MPI is not None: sync_from_root(self.variables)
def switch_training_model(update, is_mpi_root, model_train, _run, iter_loss, session, comm, save=True): if is_mpi_root and save: save_model(model_train, "model", update, _run) # Copy train -> Old, overwriting burnin "burnin" parameters vars_train = tf.get_collection(tf.GraphKeys.VARIABLES, scope="ppo_iter_train") vars_burnin = tf.get_collection(tf.GraphKeys.VARIABLES, scope="ppo_iter_burnin") if not iter_loss["dont_switch_just_reset_burnin"]: # Copy variables over from burnin to train print("Switching variables") for train_var in vars_train: # Get var name: Remove the first part of the name: var_name = "/".join(train_var.name.split("/")[1:]) # Construct burnin var name by prepending the name with "ppo_iter_burnin" burnin_var_name = "/".join(["ppo_iter_burnin", var_name]) # Find the burnin var burnin_var = [v for v in tf.global_variables() if v.name == burnin_var_name][0] # Assign it the "train" value session.run(tf.assign(train_var, burnin_var)) else: print("NOT switching variables") print("Re-initialize burnin variables") # Reinitialize variables in "burnin". Should make them random again. re_init_train_op = tf.initialize_variables(vars_burnin) session.run(re_init_train_op) global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(session, global_variables, comm=comm) #pylint: disable=E1101
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, agent_index, microbatch_size=None): self.sess = sess = get_session() with tf.variable_scope('ppo2_model_%i_act_and_train' % agent_index, reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) with tf.variable_scope('ppo2_model_%i_e' % agent_index, reuse=tf.AUTO_REUSE): e_act_model = policy(nbatch_act, 1, sess) # Model for 'e'xtrinsic and 'c'uriosity rewards if microbatch_size is None: e_train_model = policy(nbatch_train, nsteps, sess) else: e_train_model = policy(microbatch_size, nsteps, sess) with tf.variable_scope('ppo2_model_%i_c' % agent_index, reuse=tf.AUTO_REUSE): c_act_model = policy(nbatch_act, 1, sess) # Model for 'e'xtrinsic and 'c'uriosity rewards if microbatch_size is None: c_train_model = policy(nbatch_train, nsteps, sess) else: c_train_model = policy(microbatch_size, nsteps, sess) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model_%i_act_and_train' % agent_index) # 2. Build our trainer if MPI is not None: self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state # self.save = functools.partial(save_variables, sess=sess) # self.load = functools.partial(load_variables, sess=sess) # END OF TRAIN MODEL # BEGIN OF E_MODEL self.e_A = e_A = e_train_model.pdtype.sample_placeholder([None]) self.e_ADV = e_ADV = tf.placeholder(tf.float32, [None]) self.e_R = e_R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.e_OLDNEGLOGPAC = e_OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.e_OLDVPRED = e_OLDVPRED = tf.placeholder(tf.float32, [None]) self.e_LR = e_LR = tf.placeholder(tf.float32, []) # Cliprange self.e_CLIPRANGE = e_CLIPRANGE = tf.placeholder(tf.float32, []) e_neglogpac = e_train_model.pd.neglogp(e_A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. e_entropy = tf.reduce_mean(e_train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value e_vpred = e_train_model.vf e_vpredclipped = e_OLDVPRED + tf.clip_by_value(e_train_model.vf - e_OLDVPRED, - e_CLIPRANGE, e_CLIPRANGE) # Unclipped value e_vf_losses1 = tf.square(e_vpred - e_R) # Clipped value e_vf_losses2 = tf.square(e_vpredclipped - e_R) e_vf_loss = .5 * tf.reduce_mean(tf.maximum(e_vf_losses1, e_vf_losses2)) # Calculate ratio (pi current policy / pi old policy) e_ratio = tf.exp(e_OLDNEGLOGPAC - e_neglogpac) # Defining Loss = - J is equivalent to max J e_pg_losses = -e_ADV * e_ratio e_pg_losses2 = -e_ADV * tf.clip_by_value(e_ratio, 1.0 - e_CLIPRANGE, 1.0 + e_CLIPRANGE) # Final PG loss e_pg_loss = tf.reduce_mean(tf.maximum(e_pg_losses, e_pg_losses2)) e_approxkl = .5 * tf.reduce_mean(tf.square(e_neglogpac - e_OLDNEGLOGPAC)) e_clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(e_ratio - 1.0), e_CLIPRANGE))) # Total loss e_loss = e_vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters e_params = tf.trainable_variables('ppo2_model_%i_e' % agent_index) # 2. Build our trainer if MPI is not None: self.e_trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=e_LR, epsilon=1e-5) else: self.e_trainer = tf.train.AdamOptimizer(learning_rate=e_LR, epsilon=1e-5) # 3. 
Calculate the gradients e_grads_and_var = self.e_trainer.compute_gradients(e_loss, e_params) e_grads, e_var = zip(*e_grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) e_grads, _e_grad_norm = tf.clip_by_global_norm(e_grads, max_grad_norm) e_grads_and_var = list(zip(e_grads, e_var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.e_grads = e_grads self.e_var = e_var self._e_train_op = self.e_trainer.apply_gradients(e_grads_and_var) self.e_loss_names = ['e_policy_loss', 'e_value_loss', 'e_policy_entropy', 'e_approxkl', 'e_clipfrac'] self.e_stats_list = [e_pg_loss, e_vf_loss, e_entropy, e_approxkl, e_clipfrac] self.e_train_model = e_train_model self.e_act_model = e_act_model self.e_value = e_act_model.value self.e_initial_state = e_act_model.initial_state self.e_step = e_act_model.step # END OF E_MODEL # BEGIN OF C_MODEL self.c_A = c_A = c_train_model.pdtype.sample_placeholder([None]) self.c_ADV = c_ADV = tf.placeholder(tf.float32, [None]) self.c_R = c_R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.c_OLDNEGLOGPAC = c_OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.c_OLDVPRED = c_OLDVPRED = tf.placeholder(tf.float32, [None]) self.c_LR = c_LR = tf.placeholder(tf.float32, []) # Cliprange self.c_CLIPRANGE = c_CLIPRANGE = tf.placeholder(tf.float32, []) c_neglogpac = c_train_model.pd.neglogp(c_A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. c_entropy = tf.reduce_mean(c_train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value c_vpred = c_train_model.vf c_vpredclipped = c_OLDVPRED + tf.clip_by_value(c_train_model.vf - c_OLDVPRED, - c_CLIPRANGE, c_CLIPRANGE) # Unclipped value c_vf_losses1 = tf.square(c_vpred - c_R) # Clipped value c_vf_losses2 = tf.square(c_vpredclipped - c_R) c_vf_loss = .5 * tf.reduce_mean(tf.maximum(c_vf_losses1, c_vf_losses2)) # Calculate ratio (pi current policy / pi old policy) c_ratio = tf.exp(c_OLDNEGLOGPAC - c_neglogpac) # Defining Loss = - J is equivalent to max J c_pg_losses = -c_ADV * c_ratio c_pg_losses2 = -c_ADV * tf.clip_by_value(c_ratio, 1.0 - c_CLIPRANGE, 1.0 + c_CLIPRANGE) # Final PG loss c_pg_loss = tf.reduce_mean(tf.maximum(c_pg_losses, c_pg_losses2)) c_approxkl = .5 * tf.reduce_mean(tf.square(c_neglogpac - c_OLDNEGLOGPAC)) c_clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(c_ratio - 1.0), c_CLIPRANGE))) # Total loss c_loss = c_vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters c_params = tf.trainable_variables('ppo2_model_%i_c' % agent_index) # 2. Build our trainer if MPI is not None: self.c_trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=c_LR, epsilon=1e-5) else: self.c_trainer = tf.train.AdamOptimizer(learning_rate=c_LR, epsilon=1e-5) # 3. 
Calculate the gradients c_grads_and_var = self.c_trainer.compute_gradients(c_loss, c_params) c_grads, c_var = zip(*c_grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) c_grads, _c_grad_norm = tf.clip_by_global_norm(c_grads, max_grad_norm) c_grads_and_var = list(zip(c_grads, c_var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.c_grads = c_grads self.c_var = c_var self._c_train_op = self.c_trainer.apply_gradients(c_grads_and_var) self.c_loss_names = ['c_policy_loss', 'c_valuc_loss', 'c_policy_entropy', 'c_approxkl', 'c_clipfrac'] self.c_stats_list = [c_pg_loss, c_vf_loss, c_entropy, c_approxkl, c_clipfrac] self.c_train_model = c_train_model self.c_act_model = c_act_model self.c_value = c_act_model.value self.c_initial_state = c_act_model.initial_state self.c_step = c_act_model.step self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) # END OF C_MODEL initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) # pylint: disable=E1101
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): self.max_grad_norm = max_grad_norm self.head_idx_current_batch = 0 self.critic_idx_current_batch = 0 sess = tf.compat.v1.get_default_session() self.running_stats_s = RunningStats() self.running_stats_s_ = RunningStats() self.running_stats_r = RunningStats() self.running_stats_r_i = RunningStats() train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm) act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, max_grad_norm) self.train_model = train_model # in case we don't use rep loss rep_loss = None # HEAD_IDX = tf.compat.v1.placeholder(tf.int32, [None]) A = train_model.pdtype.sample_placeholder([None], name='A') A_i = train_model.A_i LATENT_FACTORS = train_model.pdtype.sample_placeholder( [ Config.REP_LOSS_M, Config.POLICY_NHEADS, None, count_latent_factors(Config.ENVIRONMENT) ], name='LATENT_FACTORS') ADV = tf.compat.v1.placeholder(tf.float32, [None], name='ADV') R = tf.compat.v1.placeholder(tf.float32, [None], name='R') R_NCE = tf.compat.v1.placeholder(tf.float32, [Config.REP_LOSS_M, 1, None], name='R_NCE') OLDNEGLOGPAC = tf.compat.v1.placeholder(tf.float32, [None], name='OLDNEGLOGPAC') OLDNEGLOGPAC_i = tf.compat.v1.placeholder(tf.float32, [None], name='OLDNEGLOGPAC_i') LR = tf.compat.v1.placeholder(tf.float32, [], name='LR') CLIPRANGE = tf.compat.v1.placeholder(tf.float32, [], name='CLIPRANGE') # TD loss for critic # VF loss OLDVPRED = tf.compat.v1.placeholder(tf.float32, [None], name='OLDVPRED') vpred = train_model.vf_train # Same as vf_run for SNI and default, but noisy for SNI2 while the boostrap is not if Config.CUSTOM_REP_LOSS and Config.POLICY_NHEADS > 1: vpred = vpred[self.critic_idx_current_batch] vpredclipped = OLDVPRED + tf.clip_by_value(vpred - OLDVPRED, -CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean( input_tensor=tf.maximum(vf_losses1, vf_losses2)) neglogpac_train = train_model.pd_train[0].neglogp(A) ratio_train = tf.exp(OLDNEGLOGPAC - neglogpac_train) pg_losses_train = -ADV * ratio_train pg_losses2_train = -ADV * tf.clip_by_value( ratio_train, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean( input_tensor=tf.maximum(pg_losses_train, pg_losses2_train)) approxkl_train = .5 * tf.reduce_mean( input_tensor=tf.square(neglogpac_train - OLDNEGLOGPAC)) clipfrac_train = tf.reduce_mean(input_tensor=tf.cast( tf.greater(tf.abs(ratio_train - 1.0), CLIPRANGE), dtype=tf.float32)) if Config.BETA >= 0: entropy = tf.reduce_mean(input_tensor=train_model.pd_train[0]. _components_distribution.entropy()) else: entropy = tf.reduce_mean( input_tensor=train_model.pd_train[0].entropy()) # Add entropy and policy loss for the samples as well if Config.SNI or Config.SNI2: neglogpac_run = train_model.pd_run.neglogp(A) ratio_run = tf.exp(OLDNEGLOGPAC - neglogpac_run) pg_losses_run = -ADV * ratio_run pg_losses2_run = -ADV * tf.clip_by_value( ratio_run, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss += tf.reduce_mean( input_tensor=tf.maximum(pg_losses_run, pg_losses2_run)) pg_loss /= 2. entropy += tf.reduce_mean( input_tensor=train_model.pd_run.entropy()) entropy /= 2. approxkl_run = .5 * tf.reduce_mean( input_tensor=tf.square(neglogpac_run - OLDNEGLOGPAC)) clipfrac_run = tf.reduce_mean( input_tensor=tf.cast(tf.greater(tf.abs(ratio_run - 1.0), CLIPRANGE), dtype=tf.float32)) else: approxkl_run = tf.constant(0.) clipfrac_run = tf.constant(0.) 
params = tf.compat.v1.trainable_variables() weight_params = [v for v in params if '/b' not in v.name] total_num_params = 0 for p in params: shape = p.get_shape().as_list() num_params = np.prod(shape) mpi_print('param', p, num_params) total_num_params += num_params mpi_print('total num params:', total_num_params) l2_loss = tf.reduce_sum( input_tensor=[tf.nn.l2_loss(v) for v in weight_params]) # The first occurance should be in the train_model if Config.BETA >= 0: info_loss = tf.compat.v1.get_collection(key="INFO_LOSS", scope="model/info_loss") beta = Config.BETA elif Config.BETA_L2A >= 0: info_loss = tf.compat.v1.get_collection(key="INFO_LOSS_L2A", scope="model/info_loss") beta = Config.BETA_L2A else: info_loss = [tf.constant(0.)] beta = 0 # print(info_loss) assert len(info_loss) == 1 info_loss = info_loss[0] loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + l2_loss * Config.L2_WEIGHT + beta * info_loss + tf.reduce_mean( train_model.curl_loss) aux_loss = tf.reduce_mean(train_model.curl_loss) if Config.SYNC_FROM_ROOT: trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) trainer_aux = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=3e-3, epsilon=1e-5) else: trainer = tf.compat.v1.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) self.opt = trainer grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) tot_norm = tf.zeros((1, )) for g, v in grads_and_var: tot_norm += tf.norm(g) tot_norm = tf.reshape(tot_norm, []) _train = trainer.apply_gradients(grads_and_var) grads_and_var_aux = trainer_aux.compute_gradients(aux_loss, params) grads_aux, var_aux = zip(*grads_and_var_aux) if max_grad_norm is not None: grads_aux, _grad_norm_aux = tf.clip_by_global_norm( grads_aux, max_grad_norm) grads_and_var_aux = list(zip(grads_aux, var_aux)) _train_aux = trainer_aux.apply_gradients(grads_and_var_aux) def train(lr, cliprange, obs, returns, masks, actions, infos, values, neglogpacs, values_i, returns_i, states_nce, anchors_nce, labels_nce, actions_nce, neglogps_nce, rewards_nce, infos_nce, target, states=None): values = values[:, self. 
critic_idx_current_batch] if Config.CUSTOM_REP_LOSS else values advs = returns - values adv_mean = np.mean(advs, axis=0, keepdims=True) adv_std = np.std(advs, axis=0, keepdims=True) advs = (advs - adv_mean) / (adv_std + 1e-8) if Config.CUSTOM_REP_LOSS: td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr, CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values, train_model.STATE_NCE: states_nce.transpose(1, 2, 0, 3, 4, 5), train_model.ANCH_NCE: anchors_nce, train_model.LAB_NCE: labels_nce.transpose(1, 0), R_NCE: rewards_nce.transpose(1, 2, 0), train_model.STATE: anchors_nce, train_model.A_i: actions_nce, OLDNEGLOGPAC_i: neglogps_nce[:, 0, self.head_idx_current_batch], ADV_i: advs_i, OLDVPRED_i: values_i, R_i: returns_i } else: td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr, CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks # import ipdb;ipdb.set_trace() if target == 'CURL': return sess.run([aux_loss, _train_aux], td_map)[:-1] else: return sess.run([ pg_loss, vf_loss, entropy, approxkl_train, clipfrac_train, approxkl_run, clipfrac_run, l2_loss, info_loss, _train ], td_map)[:-1] self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl_train', 'clipfrac_train', 'approxkl_run', 'clipfrac_run', 'l2_loss', 'info_loss_cv', 'rep_loss', 'value_i_loss', 'policy_loss_i', 'gradient_norm' ] def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = save self.load = load self.rep_vec = act_model.rep_vec self.custom_train = train_model.custom_train if Config.SYNC_FROM_ROOT: if MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.compat.v1.get_collection( tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="") sess.run(tf.compat.v1.global_variables_initializer()) sync_from_root(sess, global_variables) #pylint: disable=E1101 else: initialize()
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): act_model = policy(nbatch_act, 1, sess) train_model = policy(nbatch_train, nsteps, sess) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) entropy = tf.reduce_mean(train_model.pd.entropy()) vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = tf.trainable_variables('ppo2_model') trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): advs = returns - values advs = (advs - advs.mean()) / (advs.std() + 1e-8) td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr, CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run( [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map)[:-1] self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) if MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") sync_from_root(sess, global_variables) #pylint: disable=E1101
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): sess = tf.get_default_session() train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps) norm_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) act_model = policy(sess, ob_space, ac_space, nbatch_act, 1) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) entropy = tf.reduce_mean(train_model.pd.entropy()) vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) params = tf.trainable_variables() weight_params = [v for v in params if '/b' not in v.name] total_num_params = 0 for p in params: shape = p.get_shape().as_list() num_params = np.prod(shape) mpi_print('param', p, num_params) total_num_params += num_params mpi_print('total num params:', total_num_params) l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params]) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + l2_loss * Config.L2_WEIGHT if Config.SYNC_FROM_ROOT: trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): advs = returns - values adv_mean = np.mean(advs, axis=0, keepdims=True) adv_std = np.std(advs, axis=0, keepdims=True) advs = (advs - adv_mean) / (adv_std + 1e-8) td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr, CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run([ pg_loss, vf_loss, entropy, approxkl, clipfrac, l2_loss, _train ], td_map)[:-1] self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'l2_loss' ] def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = save self.load = load if Config.SYNC_FROM_ROOT: if MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") sync_from_root(sess, 
global_variables) #pylint: disable=E1101 else: initialize()
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): sess = tf.get_default_session() train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps) norm_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) act_model = policy(sess, ob_space, ac_space, nbatch_act, 1) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) CLIPRANGE = tf.placeholder(tf.float32, []) # VF loss vpred = train_model.vf_train # Same as vf_run for SNI and default, but noisy for SNI2 while the boostrap is not vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf_train - OLDVPRED, - CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) neglogpac_train = train_model.pd_train.neglogp(A) ratio_train = tf.exp(OLDNEGLOGPAC - neglogpac_train) pg_losses_train = -ADV * ratio_train pg_losses2_train = -ADV * tf.clip_by_value(ratio_train, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean(tf.maximum(pg_losses_train, pg_losses2_train)) approxkl_train = .5 * tf.reduce_mean(tf.square(neglogpac_train - OLDNEGLOGPAC)) clipfrac_train = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio_train - 1.0), CLIPRANGE))) if Config.BETA >= 0: entropy = tf.reduce_mean(train_model.pd_train._components_distribution.entropy()) else: entropy = tf.reduce_mean(train_model.pd_train.entropy()) # Add entropy and policy loss for the samples as well if Config.SNI or Config.SNI2: neglogpac_run = train_model.pd_run.neglogp(A) ratio_run = tf.exp(OLDNEGLOGPAC - neglogpac_run) pg_losses_run = -ADV * ratio_run pg_losses2_run = -ADV * tf.clip_by_value(ratio_run, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss += tf.reduce_mean(tf.maximum(pg_losses_run, pg_losses2_run)) pg_loss /= 2. entropy += tf.reduce_mean(train_model.pd_run.entropy()) entropy /= 2. approxkl_run = .5 * tf.reduce_mean(tf.square(neglogpac_run - OLDNEGLOGPAC)) clipfrac_run = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio_run - 1.0), CLIPRANGE))) else: approxkl_run = tf.constant(0.) clipfrac_run = tf.constant(0.) 
params = tf.trainable_variables() weight_params = [v for v in params if '/b' not in v.name] total_num_params = 0 for p in params: shape = p.get_shape().as_list() num_params = np.prod(shape) mpi_print('param', p, num_params) total_num_params += num_params mpi_print('total num params:', total_num_params) l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params]) # The first occurance should be in the train_model if Config.BETA >= 0: info_loss = tf.get_collection( key="INFO_LOSS", scope="model/info_loss" ) beta = Config.BETA elif Config.BETA_L2A >= 0: info_loss = tf.get_collection( key="INFO_LOSS_L2A", scope="model/info_loss" ) beta = Config.BETA_L2A else: info_loss = [tf.constant(0.)] beta = 0 print(info_loss) assert len(info_loss) == 1 info_loss = info_loss[0] loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + l2_loss * Config.L2_WEIGHT + beta * info_loss if Config.SYNC_FROM_ROOT: trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): advs = returns - values adv_mean = np.mean(advs, axis=0, keepdims=True) adv_std = np.std(advs, axis=0, keepdims=True) advs = (advs - adv_mean) / (adv_std + 1e-8) td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr, CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run( [pg_loss, vf_loss, entropy, approxkl_train, clipfrac_train, approxkl_run, clipfrac_run, l2_loss, info_loss, _train], td_map )[:-1] self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl_train', 'clipfrac_train', 'approxkl_run', 'clipfrac_run', 'l2_loss', 'info_loss_cv'] def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = save self.load = load if Config.SYNC_FROM_ROOT: if MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") sync_from_root(sess, global_variables) #pylint: disable=E1101 else: initialize()
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None, microbatch_size=None, fm_coeff=0.002): self.sess = sess = get_session() if MPI is not None and comm is None: comm = MPI.COMM_WORLD with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) act_model_clean = policy(nbatch_act, 1, sess, randomization=False) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) train_model_clean = policy(nbatch_train, nsteps, sess, randomization=False) else: train_model = policy(microbatch_size, nsteps, sess) train_model_clean = policy(microbatch_size, nsteps, sess, randomization=False) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) # Normalizing advantage ADV = (ADV - tf.reduce_mean(ADV)) / (reduce_std(ADV) + 1e-8) ############ Training with Randomized Obs ############ # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Calculate the entropy entropy = tf.reduce_mean(train_model.pd.entropy()) # Calculate value loss vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate policy gradient loss neglogpac = train_model.pd.neglogp(A) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) # Record some information approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) ############################################ ############ Training with Clean Obs ############ # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Calculate the entropy entropy_clean = tf.reduce_mean(train_model_clean.pd.entropy()) # Calculate value loss vpred_clean = train_model_clean.vf vpredclipped_clean = OLDVPRED + tf.clip_by_value( train_model_clean.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) vf_losses1_clean = tf.square(vpred_clean - R) vf_losses2_clean = tf.square(vpredclipped_clean - R) vf_loss_clean = .5 * tf.reduce_mean( tf.maximum(vf_losses1_clean, vf_losses2_clean)) # Calculate policy gradient loss neglogpac_clean = train_model_clean.pd.neglogp(A) ratio_clean = tf.exp(OLDNEGLOGPAC - neglogpac_clean) pg_losses_clean = -ADV * ratio_clean pg_losses2_clean = -ADV * tf.clip_by_value( ratio_clean, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss_clean = tf.reduce_mean( tf.maximum(pg_losses_clean, pg_losses2_clean)) # Record some information approxkl_clean = .5 * tf.reduce_mean( tf.square(neglogpac_clean - self.OLDNEGLOGPAC)) clipfrac_clean = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio_clean - 1.0), self.CLIPRANGE))) 
############################################ ############ Calculate the total loss ############ fm_loss = tf.losses.mean_squared_error( labels=tf.stop_gradient(train_model_clean.latent_fts), predictions=train_model.latent_fts) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + fm_loss * fm_coeff loss_clean = pg_loss_clean - entropy_clean * ent_coef + vf_loss_clean * vf_coef + fm_loss * fm_coeff self.stats_list = [ loss, fm_loss, pg_loss, vf_loss, entropy, approxkl, clipfrac ] self.stats_list_clean = [ loss_clean, fm_loss, pg_loss_clean, vf_loss_clean, entropy_clean, approxkl_clean, clipfrac_clean ] ################################################## ############ UPDATE THE PARAMETERS ############ # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer if comm is not None and comm.Get_size() > 1: self.trainer = mpi_adam.MpiAdamOptimizer( comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads_and_var_clean = self.trainer.compute_gradients( loss_clean, params) grads, var = zip(*grads_and_var) grads_clean, var_clean = zip(*grads_and_var_clean) # 4. Clip the gradient if required if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_clean, _grad_norm = tf.clip_by_global_norm( grads_clean, max_grad_norm) grads_and_var = list(zip(grads, var)) grads_and_var_clean = list(zip(grads_clean, var_clean)) ############################################### self.loss_names = [ 'total_loss', 'fm_loss', 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self._train_clean_op = self.trainer.apply_gradients( grads_and_var_clean) self.fm_coeff = fm_coeff self.clean_flag = False self._init_randcnn = tf.variables_initializer(act_model.randcnn_param) self.train_model = train_model self.train_model_clean = train_model_clean self.act_model = act_model self.act_model_clean = act_model_clean self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables, comm=comm) #pylint: disable=E1101
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None): self.sess = sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer if MPI is not None: self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
def initialize(self): if MPI is not None: sync_from_root(self.actor.trainable_variables + self.critic.trainable_variables) self.target_actor.set_weights(self.actor.get_weights()) self.target_critic.set_weights(self.critic.get_weights())
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): act_model = policy(nbatch_act, 1, sess) train_model = policy(nbatch_train, nsteps, sess) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) entropy = tf.reduce_mean(train_model.pd.entropy()) vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = tf.trainable_variables('ppo2_model') trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): advs = returns - values advs = (advs - advs.mean()) / (advs.std() + 1e-8) td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr, CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run( [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map)[:-1] self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state # def save(file_name): # save_path = "/media/rustam/88E4BD3EE4BD2EF6/thesis/modeling/python/training/ppo_model_backups/" # ps = sess.run(params) # joblib.dump(ps, save_path+file_name) # print("\n------------\nModel with name '{}' saved successfully!\n------------\n".format(file_name)) # # def load(path_to_file): # load_path = "/media/rustam/88E4BD3EE4BD2EF6/thesis/modeling/python/training/ppo_model_backups/promising_ones/" # file_name = LOAD_FILENAME # if path_to_file is None: # path_to_file = load_path + file_name # loaded_params = joblib.load(path_to_file) # restores = [] # for p, loaded_p in zip(params, loaded_params): # restores.append(p.assign(loaded_p)) # sess.run(restores) # print("Model with name '{}' was successfully loaded!".format(file_name)) # was uncommented self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) # uncommented # self.save = save # functools.partial(save_variables, sess=sess) # self.load = load # functools.partial(load_variables, sess=sess) if 
MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") sync_from_root(sess, global_variables) #pylint: disable=E1101
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None, unsupType='action'): self.sess = sess = get_session() # icm parameters self.unsup = unsupType is not None predictor = None self.numaction = ac_space.n designHead = 'universe' with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) if self.unsup: with tf.variable_scope("predictor", reuse=tf.AUTO_REUSE): if 'state' in unsupType: self.local_ap_network = predictor = StatePredictor( ob_space, ac_space, designHead, unsupType) else: self.local_ap_network = predictor = StateActionPredictor( ob_space, ac_space, designHead) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # computing predictor loss predloss = None if self.unsup: if 'state' in unsupType: predloss = constants[ 'PREDICTION_LR_SCALE'] * predictor.forwardloss else: predloss = constants['PREDICTION_LR_SCALE'] * ( predictor.invloss * (1 - constants['FORWARD_LOSS_WT']) + predictor.forwardloss * constants['FORWARD_LOSS_WT']) # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer if MPI is not None: self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) if self.unsup: predgrads_and_var = self.trainer.compute_gradients( predloss * 20.0, predictor.var_list) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # clip predictor gradients if self.unsup: predgrads, _ = zip(*predgrads_and_var) predgrads, _ = tf.clip_by_global_norm(predgrads, constants['GRAD_NORM_CLIP']) predgrads_and_var = list(zip(predgrads, predictor.var_list)) # combine the policy and predictor grads and vars grads_and_var = grads_and_var + predgrads_and_var # unzip the grads and var after adding predictor grads/vars grads, var = zip(*grads_and_var) # normalize gradients for logging predgrad_global_norm = tf.global_norm(predgrads) # normalize gradients for logging grad_global_norm = tf.global_norm(grads) self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'grad_global_norm' ] self.stats_list = [ pg_loss, vf_loss, entropy, approxkl, clipfrac, grad_global_norm ] if self.unsup: self.loss_names += [ 'predloss', 'pred_forwardloss', 'pred_invloss', 'predgrad_global_norm' ] self.stats_list += [ predloss, predictor.forwardloss, predictor.invloss, predgrad_global_norm ] self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state # prediction bonus function for icm self.pred_bonus = predictor.pred_bonus self.pred_bonuses = predictor.pred_bonuses self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, proportion_of_exp_used_for_predictor_update, microbatch_size=None): self.sess = sess = get_session() with tf.variable_scope('rnd_ppo_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) # Create our RND model that will generate our intrinsic rewards rnd_model = RND(ob_space, proportion_of_exp_used_for_predictor_update) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.INT_R = INT_R = tf.placeholder(tf.float32, [None]) self.EXT_R = EXT_R = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss vf_loss_int = (0.5 * vf_coef) * tf.reduce_mean( tf.square(train_model.vf_int - self.INT_R)) vf_loss_ext = (0.5 * vf_coef) * tf.reduce_mean( tf.square(train_model.vf_ext - self.EXT_R)) vf_loss = vf_loss_int + vf_loss_ext # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss + rnd_model.rnd_loss # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('rnd_ppo_model') # 2. Build our trainer if MPI is not None: self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = [ 'policy_loss', 'value_loss', 'rnd_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.stats_list = [ pg_loss, vf_loss, rnd_model.rnd_loss, entropy, approxkl, clipfrac ] self.train_model = train_model self.act_model = act_model self.rnd_model = rnd_model self.step = act_model.step self.values = act_model.values self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None, microbatch_size=None, disc_coeff=None, num_levels=200): self.sess = sess = get_session() self.num_levels = num_levels if disc_coeff is not None: self.disc_coeff = disc_coeff else: self.disc_coeff = tf.placeholder(tf.float32, []) if MPI is not None and comm is None: comm = MPI.COMM_WORLD with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) with tf.variable_scope('vae'): reconstruction = build_reconstructor(train_model.z) with tf.variable_scope('discriminator_model', reuse=tf.AUTO_REUSE): # CREATE DISCRIMINTATOR MODEL discriminator_inputs = train_model.z predicted_logits = build_discriminator(discriminator_inputs, num_levels) self.predicted_labels = tf.nn.softmax(predicted_logits) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) self.TRAIN_GEN = tf.placeholder(tf.float32, []) # Seed labels for the discriminator self.LABELS = LABELS = tf.placeholder(tf.int32, [None]) self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state # VAE-related reconstruction_loss = tf.reduce_mean(tf.square(tf.cast(self.train_model.X, tf.float32) - reconstruction * 255.), (1, 2, 3)) latent_loss = -0.5 * tf.reduce_sum(1. + self.train_model.z_log_std_sq - tf.square(self.train_model.z_mean) - tf.exp(self.train_model.z_log_std_sq), 1) vae_loss = tf.reduce_mean(reconstruction_loss + latent_loss) discriminator_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.LABELS, logits=predicted_logits)) discriminator_accuracy = tf.reduce_mean(tf.cast(tf.equal(self.LABELS, tf.argmax(predicted_logits, axis=-1, output_type=tf.int32)), tf.float32)) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. 
entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = 1. * (pg_loss - entropy * ent_coef + vf_loss * vf_coef) pd_loss = tf.reduce_mean(-1. * tf.reduce_sum((1. / float(num_levels) * (tf.nn.log_softmax(predicted_logits, axis=-1))), axis=-1)) self.update_discriminator_params(comm, discriminator_loss, mpi_rank_weight, LR, max_grad_norm) self.update_vae_params(comm, vae_loss, mpi_rank_weight, LR, max_grad_norm=None) self.update_policy_params(comm, loss, mpi_rank_weight, LR, max_grad_norm) # self.update_all_params(comm, loss + (self.disc_coeff * pd_loss), discriminator_loss, mpi_rank_weight, LR, max_grad_norm) self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'discriminator_loss', 'discriminator_accuracy', 'pd_loss', 'softmax_min', 'softmax_max', 'vae_loss', 'reconstruction_loss', 'latent_loss'] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac, discriminator_loss, discriminator_accuracy, pd_loss, tf.reduce_min(self.predicted_labels), tf.reduce_max(self.predicted_labels), vae_loss, tf.reduce_mean(reconstruction_loss), tf.reduce_mean(latent_loss)] if isinstance(self.disc_coeff, tf.Tensor): self.loss_names.append("disc_coeff") self.stats_list.append(self.disc_coeff) self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables, comm=comm) #pylint: disable=E1101 self.training_i = 0
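# The model above adds a VAE objective next to the PPO loss: a pixel-space reconstruction
# term plus the analytic KL between the encoder's diagonal Gaussian and a standard normal.
# A NumPy sketch of those two terms, illustrative only (x, x_hat, mu, log_var are
# placeholder arrays, not the tensors built above):
import numpy as np

def vae_loss_terms(x, x_hat, mu, log_var):
    # Per-sample squared reconstruction error, averaged over pixels and channels.
    reconstruction = np.mean(np.square(x - x_hat), axis=(1, 2, 3))
    # KL(N(mu, sigma^2) || N(0, 1)) for a diagonal Gaussian, summed over latent dims.
    latent = -0.5 * np.sum(1.0 + log_var - np.square(mu) - np.exp(log_var), axis=1)
    return reconstruction, latent

x = np.random.rand(2, 64, 64, 3)        # "observations"
x_hat = np.random.rand(2, 64, 64, 3)    # decoder output
mu, log_var = np.zeros((2, 16)), np.zeros((2, 16))
rec, kl = vae_loss_terms(x, x_hat, mu, log_var)
print(rec.shape, kl)                    # (2,), KL is 0 when the posterior equals the prior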
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): sess = tf.get_default_session() train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps) norm_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) act_model = policy(sess, ob_space, ac_space, nbatch_act, 1) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) CLIPRANGE = tf.placeholder(tf.float32, []) # Dipam: Add placeholder for discriminator labels and hyperparameters #DISC_LR = tf.placeholder(tf.float32, []) DISC_LAM = tf.placeholder(tf.float32, []) DISC_LABELS = tf.placeholder(tf.int64, [None]) neglogpac = train_model.pd.neglogp(A) entropy = tf.reduce_mean(train_model.pd.entropy()) #Dipam: Add loss for domain discriminator here disc_logits = train_model.disc_logits domain_onehot = tf.one_hot(DISC_LABELS, 2) disc_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=disc_logits, labels = domain_onehot)) #disc_trainer = tf.train.AdamOptimizer(learning_rate = DISC_LR, epsilon=1e-5) vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) params = tf.trainable_variables() weight_params = [v for v in params if '/b' not in v.name] total_num_params = 0 for p in params: shape = p.get_shape().as_list() num_params = np.prod(shape) mpi_print('param', p, num_params) total_num_params += num_params mpi_print('total num params:', total_num_params) l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params]) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + l2_loss * Config.L2_WEIGHT #if Config.SYNC_FROM_ROOT: # trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) #else: orig_trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) feat_trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) disc_trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) polc_trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) feat_params = tf.trainable_variables("model/features") disc_params = tf.trainable_variables("model/discriminator") polc_params = tf.trainable_variables("model/policy") feat_loss = loss - tf.multiply(DISC_LAM,disc_loss) # Flip gradients from discriminator feat_grad_var = feat_trainer.compute_gradients(feat_loss, feat_params) polc_grad_var = polc_trainer.compute_gradients(loss, polc_params) disc_grad_var = disc_trainer.compute_gradients(disc_loss, disc_params) grads_and_var = orig_trainer.compute_gradients(loss, params) # Dipam: Compute discriminator gradients and apply here along with policy gradients grads, var = zip(*grads_and_var) # Dipam: Add discriminator gradients to policy gradients if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # def 
apply_max_grad_norm(grads_and_var): # grads, var = zip(*grads_and_var) # # if max_grad_norm is not None: # grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) # return list(zip(grads, var)) # Dipam : TODO: This separate grad norm clipping is not correct, # correct method: ppend all the grads and vars -> clip by global norm-> separate-> apply individually # feat_grad_var = apply_max_grad_norm(feat_grad_var) # polc_grad_var = apply_max_grad_norm(polc_grad_var) # disc_grad_var = apply_max_grad_norm(disc_grad_var) _train = orig_trainer.apply_gradients(grads_and_var) _train_feat = feat_trainer.apply_gradients(feat_grad_var) _train_polc = polc_trainer.apply_gradients(polc_grad_var) _train_disc = disc_trainer.apply_gradients(disc_grad_var) def train(lr, cliprange, disc_lam, obs, returns, masks, actions, values, neglogpacs, levelids, states=None): advs = returns - values adv_mean = np.mean(advs, axis=0, keepdims=True) adv_std = np.std(advs, axis=0, keepdims=True) advs = (advs - adv_mean) / (adv_std + 1e-8) domain_labels = levelids % 2 td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr, CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values, DISC_LABELS: domain_labels, DISC_LAM: disc_lam} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks if disc_lam == 0: return sess.run( [pg_loss, vf_loss, entropy, approxkl, clipfrac, l2_loss, loss,_train], td_map)[:-1] else: return sess.run( [pg_loss, vf_loss, entropy, approxkl, clipfrac, l2_loss, loss , feat_loss, disc_loss, _train_feat, _train_polc, _train_disc], td_map)[:-3] self.loss_names = ['policy_grad_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'l2_loss', 'total_loss'] self.disc_loss_names = ['policy_grad_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'l2_loss', 'total_loss', 'feat_loss', 'disc_loss'] def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = save self.load = load if Config.SYNC_FROM_ROOT: if MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") sync_from_root(sess, global_variables) #pylint: disable=E1101 else: initialize()
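# Above, the feature extractor is updated on feat_loss = loss - DISC_LAM * disc_loss, so the
# discriminator term enters its gradient with a flipped sign: the features are pushed to
# *increase* the domain discriminator's loss, while the discriminator's own parameters
# (trained on disc_loss directly) decrease it. Toy 1-D finite-difference illustration of that
# sign flip; every name here is ours, not from the code above:
def policy_loss(w):
    return (w - 1.0) ** 2

def disc_loss(w):
    return (w - 2.0) ** 2

def feat_loss(w, lam):
    return policy_loss(w) - lam * disc_loss(w)

def grad(f, w, eps=1e-6):
    return (f(w + eps) - f(w - eps)) / (2 * eps)

w, lam = 0.0, 0.5
print(grad(policy_loss, w))                    # ~ -2.0
print(grad(disc_loss, w))                      # ~ -4.0
print(grad(lambda x: feat_loss(x, lam), w))    # ~ -2.0 - 0.5 * (-4.0) = 0.0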
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): self.max_grad_norm = max_grad_norm self.running_stats_s = RunningStats() self.running_stats_s_ = RunningStats() self.running_stats_r = RunningStats() self.running_stats_r_i = RunningStats() sess = tf.compat.v1.get_default_session() train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm) act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, max_grad_norm) # in case we don't use rep loss rep_loss = 0 SKILLS = tf.compat.v1.placeholder( tf.float32, shape=[nbatch_train, Config.N_SKILLS], name='mb_skill') A = train_model.pdtype.sample_placeholder([None]) ADV = tf.compat.v1.placeholder(tf.float32, [None]) ADV_2 = tf.compat.v1.placeholder(tf.float32, [None]) ADV = ADV + ADV_2 R = tf.compat.v1.placeholder(tf.float32, [None]) R_i = tf.compat.v1.placeholder(tf.float32, [None]) OLDNEGLOGPAC = tf.compat.v1.placeholder(tf.float32, [None]) OLDVPRED = tf.compat.v1.placeholder(tf.float32, [None]) OLDVPRED_i = tf.compat.v1.placeholder(tf.float32, [None]) LR = tf.compat.v1.placeholder(tf.float32, []) CLIPRANGE = tf.compat.v1.placeholder(tf.float32, []) # VF loss vpred = train_model.vf_train # Same as vf_run for SNI and default, but noisy for SNI2 while the boostrap is not vpredclipped = OLDVPRED + tf.clip_by_value( train_model.vf_train - OLDVPRED, -CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean( input_tensor=tf.maximum(vf_losses1, vf_losses2)) vpred_i = train_model.vf_i_train # Same as vf_run for SNI and default, but noisy for SNI2 while the boostrap is not vpredclipped_i = OLDVPRED_i + tf.clip_by_value(vpred_i - OLDVPRED_i, -CLIPRANGE, CLIPRANGE) vf_losses1_i = tf.square(vpred_i - R_i) vf_losses2_i = tf.square(vpredclipped_i - R_i) vf_loss_i = .5 * tf.reduce_mean( input_tensor=tf.maximum(vf_losses1_i, vf_losses2_i)) neglogpac_train = train_model.pd_train[0].neglogp(A) ratio_train = tf.exp(OLDNEGLOGPAC - neglogpac_train) pg_losses_train = -ADV * ratio_train pg_losses2_train = -ADV * tf.clip_by_value( ratio_train, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean( input_tensor=tf.maximum(pg_losses_train, pg_losses2_train)) approxkl_train = .5 * tf.reduce_mean( input_tensor=tf.square(neglogpac_train - OLDNEGLOGPAC)) clipfrac_train = tf.reduce_mean(input_tensor=tf.cast( tf.greater(tf.abs(ratio_train - 1.0), CLIPRANGE), dtype=tf.float32)) if Config.BETA >= 0: entropy = tf.reduce_mean(input_tensor=train_model.pd_train[0]. _components_distribution.entropy()) else: entropy = tf.reduce_mean( input_tensor=train_model.pd_train[0].entropy()) # Add entropy and policy loss for the samples as well if Config.SNI or Config.SNI2: neglogpac_run = train_model.pd_run.neglogp(A) ratio_run = tf.exp(OLDNEGLOGPAC - neglogpac_run) pg_losses_run = -ADV * ratio_run pg_losses2_run = -ADV * tf.clip_by_value( ratio_run, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss += tf.reduce_mean( input_tensor=tf.maximum(pg_losses_run, pg_losses2_run)) pg_loss /= 2. entropy += tf.reduce_mean( input_tensor=train_model.pd_run.entropy()) entropy /= 2. approxkl_run = .5 * tf.reduce_mean( input_tensor=tf.square(neglogpac_run - OLDNEGLOGPAC)) clipfrac_run = tf.reduce_mean( input_tensor=tf.cast(tf.greater(tf.abs(ratio_run - 1.0), CLIPRANGE), dtype=tf.float32)) else: approxkl_run = tf.constant(0.) clipfrac_run = tf.constant(0.) 
params = tf.compat.v1.trainable_variables() weight_params = [v for v in params if '/b' not in v.name] total_num_params = 0 for p in params: shape = p.get_shape().as_list() num_params = np.prod(shape) mpi_print('param', p, num_params) total_num_params += num_params mpi_print('total num params:', total_num_params) l2_loss = tf.reduce_sum( input_tensor=[tf.nn.l2_loss(v) for v in weight_params]) # The first occurance should be in the train_model if Config.BETA >= 0: info_loss = tf.compat.v1.get_collection(key="INFO_LOSS", scope="model/info_loss") beta = Config.BETA elif Config.BETA_L2A >= 0: info_loss = tf.compat.v1.get_collection(key="INFO_LOSS_L2A", scope="model/info_loss") beta = Config.BETA_L2A else: info_loss = [tf.constant(0.)] beta = 0 # print(info_loss) assert len(info_loss) == 1 info_loss = info_loss[0] rep_loss = tf.reduce_mean( tf.compat.v1.losses.softmax_cross_entropy( onehot_labels=SKILLS, logits=train_model.discriminator_logits)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + l2_loss * Config.L2_WEIGHT + beta * info_loss + ( rep_loss * Config.REP_LOSS_WEIGHT + vf_loss_i * vf_coef) if Config.SYNC_FROM_ROOT: trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: trainer = tf.compat.v1.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) self.opt = trainer grads_and_var = trainer.compute_gradients(loss, params) # idx 40 = v_i/w_0 grads, var = zip(*grads_and_var) if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) tot_norm = tf.zeros((1, )) for g, v in grads_and_var: tot_norm += tf.norm(g) tot_norm = tf.reshape(tot_norm, []) _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, states_nce, anchors_nce, labels_nce, obs, returns, returns_i, masks, actions, values, values_i, skills, neglogpacs, states=None): advs = returns - values adv_mean = np.mean(advs, axis=0, keepdims=True) adv_std = np.std(advs, axis=0, keepdims=True) advs = (advs - adv_mean) / (adv_std + 1e-8) advs_i = returns_i - values_i adv_mean_i = np.mean(advs_i, axis=0, keepdims=True) adv_std_i = np.std(advs_i, axis=0, keepdims=True) advs_i = (advs_i - adv_mean_i) / (adv_std_i + 1e-8) td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr, CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values, train_model.STATE: obs, ADV_2: advs_i, OLDVPRED_i: values_i, R_i: returns_i, SKILLS: skills, train_model.Z: skills } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run([ pg_loss, vf_loss, entropy, approxkl_train, clipfrac_train, approxkl_run, clipfrac_run, l2_loss, info_loss, rep_loss, vf_loss_i, _train ], td_map)[:-1] self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl_train', 'clipfrac_train', 'approxkl_run', 'clipfrac_run', 'l2_loss', 'info_loss_cv', 'discriminator_loss', 'gradient_norm' ] def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.value_i = act_model.value_i self.initial_state = act_model.initial_state self.save = save self.load = load self.rep_vec = act_model.rep_vec self.custom_train = train_model.custom_train if Config.SYNC_FROM_ROOT: if 
MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.compat.v1.get_collection( tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="") sess.run(tf.compat.v1.global_variables_initializer()) sync_from_root(sess, global_variables) #pylint: disable=E1101 else: initialize()
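# The train() above normalizes the extrinsic and intrinsic advantages separately before
# feeding them to a graph that simply adds them (ADV + ADV_2). NumPy sketch of that
# preprocessing step, illustrative only:
import numpy as np

def normalize(adv, eps=1e-8):
    return (adv - adv.mean(axis=0, keepdims=True)) / (adv.std(axis=0, keepdims=True) + eps)

returns, values = np.array([1.0, 2.0, 3.0]), np.array([0.5, 2.5, 2.0])
returns_i, values_i = np.array([0.2, 0.1, 0.4]), np.array([0.1, 0.1, 0.1])

advs = normalize(returns - values)          # extrinsic advantages
advs_i = normalize(returns_i - values_i)    # intrinsic advantages
print(advs + advs_i)                        # what the policy-gradient term sees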
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None, l1regpi, l2regpi, l1regvf, l2regvf, wclippi, wclipvf, todropoutpi, dropoutpi_keep_prob, dropoutpi_keep_prob_value, todropoutvf, dropoutvf_keep_prob, dropoutvf_keep_prob_value, isbnpitrainmode, isbnvftrainmode): self.sess = sess = get_session() #REGULARIZATION self.toregularizepi = l1regpi > 0 or l2regpi > 0 self.toregularizevf = l1regvf > 0 or l2regvf > 0 self.todropoutpi = todropoutpi self.todropoutvf = todropoutvf self.dropoutpi_keep_prob = dropoutpi_keep_prob #TENSOR self.dropoutpi_keep_prob_value = dropoutpi_keep_prob_value self.dropoutvf_keep_prob = dropoutvf_keep_prob self.dropoutvf_keep_prob_value = dropoutvf_keep_prob_value self.isbnpitrainmode = isbnpitrainmode self.isbnvftrainmode = isbnvftrainmode self.toweightclippi = wclippi > 0 self.toweightclipvf = wclipvf > 0 with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. 
entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef if self.toregularizepi: print("regularizing policy network: L1 = {}, L2 = {}".format( l1regpi, l2regpi)) regularizerpi = tf.contrib.layers.l1_l2_regularizer( scale_l1=l1regpi, scale_l2=l2regpi, scope='ppo2_model/pi') all_trainable_weights_pi = tf.trainable_variables('ppo2_model/pi') regularization_penalty_pi = tf.contrib.layers.apply_regularization( regularizerpi, all_trainable_weights_pi) loss = loss + regularization_penalty_pi if self.toregularizevf: print("regularizing value network: L1 = {}, L2 = {}".format( l1regvf, l2regvf)) regularizervf = tf.contrib.layers.l1_l2_regularizer( scale_l1=l1regvf, scale_l2=l2regvf, scope='ppo2_model/vf') all_trainable_weights_vf = tf.trainable_variables('ppo2_model/vf') regularization_penalty_vf = tf.contrib.layers.apply_regularization( regularizervf, all_trainable_weights_vf) loss = loss + regularization_penalty_vf # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer if MPI is not None: self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients #self._update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS) #with tf.control_dependencies(self._update_op): grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) if self.toweightclippi: print("clipping policy network = {}".format(wclippi)) policyparams = tf.trainable_variables('ppo2_model/pi') self._wclip_ops_pi = [] for toclipvar in policyparams: if 'logstd' in toclipvar.name: continue self._wclip_ops_pi.append( tf.assign(toclipvar, tf.clip_by_value(toclipvar, -wclippi, wclippi))) self._wclip_op_pi = tf.group(*self._wclip_ops_pi) if self.toweightclipvf: print("clipping value network = {}".format(wclipvf)) valueparams = tf.trainable_variables('ppo2_model/vf') self._wclip_ops_vf = [] for toclipvar in valueparams: self._wclip_ops_vf.append( tf.assign(toclipvar, tf.clip_by_value(toclipvar, -wclipvf, wclipvf))) self._wclip_op_vf = tf.group(*self._wclip_ops_vf) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] if self.toregularizepi: self.loss_names.append('regularization_pi') self.stats_list.append(regularization_penalty_pi) if self.toregularizevf: self.loss_names.append('regularization_vf') self.stats_list.append(regularization_penalty_vf) self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
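# The model above optionally adds an L1/L2 penalty over the policy/value weights and, after
# each optimizer step, clips weights elementwise into [-wclip, wclip]. NumPy sketch of both
# operations; this illustrates the idea rather than the exact tf.contrib regularizer
# (the 0.5 factor follows the tf.nn.l2_loss convention):
import numpy as np

def l1_l2_penalty(weights, scale_l1, scale_l2):
    l1 = sum(np.sum(np.abs(w)) for w in weights)
    l2 = sum(0.5 * np.sum(np.square(w)) for w in weights)
    return scale_l1 * l1 + scale_l2 * l2

def weight_clip(weights, wclip):
    return [np.clip(w, -wclip, wclip) for w in weights]

weights = [np.array([[0.3, -1.5], [2.0, 0.1]])]
print(l1_l2_penalty(weights, scale_l1=1e-4, scale_l2=1e-4))
print(weight_clip(weights, wclip=1.0))      # entries outside [-1, 1] are saturated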
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, adaptive_kl): sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training train_model = policy(None, nsteps, sess) # CREATE THE PLACEHOLDERS A = train_model.pdtype.sample_placeholder([None]) MEANNOW = train_model.pdtype.sample_placeholder([None]) LOGSTDNOW = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) # Keep track of old actor OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) NEGLOGPACNOW = tf.placeholder(tf.float32, [None]) RHO_NOW = tf.placeholder(tf.float32, [None]) # Keep track of old critic OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) # Cliprange CLIPRANGE = tf.placeholder(tf.float32, []) KLCONST = tf.placeholder(tf.float32, []) KL_REST = tf.placeholder(tf.float32, [None]) neglogpac = train_model.pd.neglogp(A) mean = train_model.pd.mean logstd = train_model.pd.logstd # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(-0.5 * tf.square((A - mean) / tf.exp(logstd)) - logstd + 0.5 * tf.square((A - MEANNOW) / tf.exp(LOGSTDNOW)) + LOGSTDNOW) sgn = tf.ones_like(ratio) * tf.expand_dims(tf.sign(ADV), 1) ratio_clip = tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Defining Loss = - J is equivalent to max J r = tf.reduce_prod(sgn * tf.minimum(ratio * sgn, ratio_clip * sgn), axis=-1) pg_losses = -r * ADV / tf.stop_gradient( tf.reduce_mean(r)) # * tf.minimum(1.0,RHO_NOW) # Final PG loss # pg_loss = tf.reduce_mean(tf.stop_gradient(tf.maximum(pg_losses, pg_losses2))*(-neglogpac)) + .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) approxkl = .5 * tf.reduce_mean( tf.square(neglogpac - OLDNEGLOGPAC) * KL_REST) approxklold = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) approxklnow = .5 * tf.reduce_mean( tf.square(neglogpac - NEGLOGPACNOW) * tf.minimum(1.0, RHO_NOW)) kloldnew = tf.reduce_mean( tf.reduce_sum( logstd - LOGSTDNOW + 0.5 * (tf.square(tf.exp(LOGSTDNOW)) + tf.square(mean - MEANNOW)) / tf.square(tf.exp(logstd)) - 0.5, axis=1) * KL_REST) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) pg_loss = tf.reduce_mean(pg_losses) # * tf.minimum(1.0,RHO_NOW)) # Total loss# * tf.minimum(1.0,RHO_NOW)) if adaptive_kl: pg_loss = pg_loss + KLCONST * kloldnew # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') print(params) # 2. 
Build our trainer if MPI is not None: trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) _grad_norm = tf.sqrt( tf.reduce_sum([tf.norm(grad)**2 for grad in grads])) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, klconst, rgae, trunc_rho, obs, returns, advs, masks, actions, values, neglogpacs, mean_now, logstd_now, kl_rest, rho_now, neglogpnow, states=None): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # Returns = R + yV(s') # Normalize the advantages if rgae: r = np.minimum(trunc_rho, rho_now) radvs = r * advs advs = (advs - radvs.mean() / r.mean()) / (radvs.std() + 1e-8) else: advs = (advs - advs.mean()) / (advs.std() + 1e-8) td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr, CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values, MEANNOW: mean_now, LOGSTDNOW: logstd_now, KLCONST: klconst, KL_REST: kl_rest, RHO_NOW: rho_now, NEGLOGPACNOW: neglogpnow } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run([ pg_loss, vf_loss, entropy, approxkl, clipfrac, kloldnew, approxklold, approxklnow, _grad_norm, _train ], td_map)[:-1] self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'kloldnew', 'approxklold', 'approxklnow', 'gradnorm' ] self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.meanlogstd = act_model.meanlogstd self.value = act_model.value self.values = train_model.value self.meanlogstds = train_model.meanlogstd self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
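# The adaptive-KL penalty above (kloldnew) is the closed-form KL between two diagonal
# Gaussian policies, KL(pi_now || pi_new), summed over action dimensions. NumPy sketch
# of that formula; the function and variable names here are ours:
import numpy as np

def diag_gauss_kl(mean_now, logstd_now, mean_new, logstd_new):
    # KL(N(mean_now, exp(logstd_now)^2) || N(mean_new, exp(logstd_new)^2)) per sample.
    var_new = np.exp(2.0 * logstd_new)
    per_dim = (logstd_new - logstd_now
               + 0.5 * (np.exp(2.0 * logstd_now) + np.square(mean_new - mean_now)) / var_new
               - 0.5)
    return np.sum(per_dim, axis=-1)

mean_now, logstd_now = np.zeros((4, 2)), np.zeros((4, 2))
print(diag_gauss_kl(mean_now, logstd_now, mean_now, logstd_now))        # zeros: identical policies
print(diag_gauss_kl(mean_now, logstd_now, mean_now + 1.0, logstd_now))  # 0.5 nats per dim -> 1.0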
def __init__(self, agent, network, nsteps, rho, ent_coef, vf_coef, max_grad_norm, seed, load_path, **network_kwargs): super(AgentModel, self).__init__(name='MAPPO2Model') set_global_seeds(seed) # Get state_space and action_space ob_space = agent.observation_space ac_space = agent.action_space if isinstance(network, str): network_type = network policy_network_fn = get_network_builder(network_type)( **network_kwargs) network = policy_network_fn(ob_space.shape) self.train_model = PolicyWithValue(ac_space, network) if MPI is not None: self.optimizer = MpiAdamOptimizer( MPI.COMM_WORLD, self.train_model.trainable_variables) else: self.optimizer = tf.keras.optimizers.Adam() # if isinstance(network, str): # network = get_network_builder(network)(**network_kwargs) # policy_network = network(ob_space.shape) # value_network = network(ob_space.shape) # self.train_model = pi = PolicyWithValue(ac_space, policy_network, value_network) # self.pi_var_list = policy_network.trainable_variables + list(pi.pdtype.trainable_variables) # self.vf_var_list = value_network.trainable_variables + pi.value_fc.trainable_variables # if MPI is not None: # self.pi_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.pi_var_list) # self.vf_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.vf_var_list) # else: # self.pi_optimizer = tf.keras.optimizers.Adam() # self.vf_optimizer = tf.keras.optimizers.Adam() self.agent = agent self.nsteps = nsteps self.rho = rho self.ent_coef = ent_coef self.vf_coef = vf_coef self.max_grad_norm = max_grad_norm self.step = self.train_model.step self.value = self.train_model.value self.initial_state = self.train_model.initial_state self.loss_names = [ 'Lagrange_loss', 'sync_loss', 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] if MPI is not None: sync_from_root(self.variables) self.comm_matrix = agent.comm_matrix.copy() self.estimates = np.ones([agent.nmates, nsteps], dtype=np.float32) self.multipliers = np.zeros([agent.nmates, nsteps], dtype=np.float32) for i, comm_i in enumerate(self.comm_matrix): self.estimates[i] = comm_i[self.agent.id] * self.estimates[i] if load_path is not None: load_path = osp.expanduser(load_path) ckpt = tf.train.Checkpoint(model=self.train_model) manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None) ckpt.restore(manager.latest_checkpoint)
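# The block above restores weights via tf.train.Checkpoint / CheckpointManager when a
# load_path is given. A standalone sketch of that pattern, assuming TF2 eager mode; the
# Sequential model and the /tmp directory below are placeholders, not objects from the
# class above:
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(4)])
model.build(input_shape=(None, 8))

ckpt = tf.train.Checkpoint(model=model)
manager = tf.train.CheckpointManager(ckpt, directory="/tmp/ckpt_demo", max_to_keep=None)

manager.save()                                      # write a checkpoint
status = ckpt.restore(manager.latest_checkpoint)    # restore the newest one
status.assert_existing_objects_matched()            # sanity check that variables matched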
def __init__(self, ob_space, ac_space, max_grad_norm, beta, icm_lr_scale): sess = get_session() #TODO find a better way input_shape = [ob_space.shape[0], ob_space.shape[1], ob_space.shape[2]] self.action_shape = 36 # Placeholders self.state_ = phi_state = tf.placeholder(tf.float32, [None, *input_shape], name="icm_state") self.next_state_ = phi_next_state = tf.placeholder( tf.float32, [None, *input_shape], name="icm_next_state") self.action_ = action = tf.placeholder(tf.float32, [None], name="icm_action") with tf.variable_scope('icm_model'): # Feature encoding # Aka pass state and next_state to create phi(state), phi(next_state) # state --> phi(state) phi_state = self.feature_encoding(self.state_) with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE): # next_state to phi(next_state) phi_next_state = self.feature_encoding(self.next_state_) # INVERSE MODEL pred_actions_logits, pred_actions_prob = self.inverse_model( phi_state, phi_next_state) # FORWARD MODEL pred_phi_next_state = self.forward_model(action, phi_state) # CALCULATE THE ICM LOSS # Inverse Loss LI # We calculate the cross entropy between our ât and at # Squeeze the labels (required) labels = tf.cast(action, tf.int32) self.inv_loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pred_actions_logits, labels=labels), name="inverse_loss") # Foward Loss # LF = 1/2 || pred_phi_next_state - phi_next_state || # TODO 0.5 * ? self.forw_loss = tf.reduce_mean(tf.square( tf.subtract(pred_phi_next_state, phi_next_state)), name="forward_loss") # Todo predictor lr scale ? # ICM_LOSS = [(1 - beta) * LI + beta * LF ] * Predictor_Lr_scale self.icm_loss = ( (1 - beta) * self.inv_loss + beta * self.forw_loss) * icm_lr_scale # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters icm_params = tf.trainable_variables('icm_model') # 2. Build our trainer icm_trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=1e-4, epsilon=1e-5) # 3. Calculate the gradients icm_grads_and_var = icm_trainer.compute_gradients( self.icm_loss, icm_params) icm_grads, icm_var = zip(*icm_grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) icm_grads, icm__grad_norm = tf.clip_by_global_norm( icm_grads, max_grad_norm) icm_grads_and_var = list(zip(icm_grads, icm_var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da _icm_train = icm_trainer.apply_gradients(icm_grads_and_var) if MPI.COMM_WORLD.Get_rank() == 0: print("Initialize") initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") print("GLOBAL VARIABLES", global_variables) sync_from_root(sess, global_variables) #pylint: disable=E1101
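# The ICM module above mixes an inverse-model cross-entropy with a forward-model prediction
# error:  icm_loss = ((1 - beta) * inverse + beta * forward) * icm_lr_scale.
# NumPy sketch of that weighting, illustrative only (not the TF graph built above):
import numpy as np

def icm_loss(pred_action_logits, actions, pred_phi_next, phi_next, beta, icm_lr_scale):
    # Inverse loss: softmax cross-entropy of predicted action logits vs. the taken actions.
    shifted = pred_action_logits - pred_action_logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    inverse = -np.mean(log_probs[np.arange(len(actions)), actions])
    # Forward loss: mean squared error between predicted and actual next-state features.
    forward = np.mean(np.square(pred_phi_next - phi_next))
    return ((1.0 - beta) * inverse + beta * forward) * icm_lr_scale

logits = np.array([[2.0, 0.5, -1.0], [0.1, 0.2, 3.0]])
actions = np.array([0, 2])
phi_pred, phi_true = np.zeros((2, 8)), np.ones((2, 8))
print(icm_loss(logits, actions, phi_pred, phi_true, beta=0.2, icm_lr_scale=10.0))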
def learn(*, network, env, total_timesteps, iter_loss, arch, _run, seed=None, nsteps=2048, ent_coef=0.0, learning_rate=3e-4, lr_schedule=None, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, load_path=None, mpi_rank_weight=1, comm=None, eval=None, **network_kwargs):
    '''
    Learn a policy using the PPO algorithm (https://arxiv.org/abs/1707.06347).

    Parameters:
    ----------
    network: the network model. Will only work with the one in this repo because of IBAC.
    env: baselines.common.vec_env.VecEnv
    total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment)
    iter_loss: dict, the config dict as specified in default.yaml and/or overwritten by
        command line arguments; see sacred for further documentation
    arch: dict, config dict similar to iter_loss
    eval: dict, config dict similar to iter_loss
    _run: sacred Experiment._run object, used for logging
    ent_coef: float, policy entropy coefficient in the optimization objective
    seed: int, random seed
    nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is
        nsteps * nenv where nenv is the number of environment copies simulated in parallel)
    learning_rate: float, learning rate
    lr_schedule: None or str; if None, use a constant learning rate; if a string, only "linear"
        is implemented at the moment
    vf_coef: float, value function loss coefficient in the optimization objective
    max_grad_norm: float, max gradient norm before it is clipped
    gamma: float, discount factor
    lam: float, lambda for GAE
    log_interval: int, number of updates between logging events
    nminibatches: int, number of training minibatches per update. For recurrent policies,
        should be smaller than or equal to the number of environments run in parallel.
    noptepochs: int, number of training epochs per update
    cliprange: float or function, clipping range; either a constant or a schedule function
        [0,1] -> R+ where 1 is the beginning of training and 0 is the end of training
    load_path: str, path to load the model from
    **network_kwargs: keyword arguments to the policy / network builder. See
        baselines.common/policies.py/build_policy and the arguments to a particular type of
        network. For instance, the 'mlp' network architecture has arguments num_hidden and
        num_layers.
''' # Set learning rate schedule lr = get_lr_fn(lr_schedule, start_learning_rate=learning_rate) set_global_seeds(seed) session = get_session() # if isinstance(lr, float): lr = constfn(lr) # else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) model_fn = Model policy = build_policy(env, network, arch, **network_kwargs) # Instantiate the model object (that creates act_model and train_model) def create_model(scope_name, **kwargs): return model_fn(scope_name=scope_name, policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight, iter_loss=iter_loss, arch=arch, **kwargs) # model_train is the teacher and always executed # model_burnin is trained. If teacher and student are swapped, the parameters from burnin are # copied into the teacher and burnin is re-initialized model_train = create_model("ppo_iter_train") model_burnin = create_model( "ppo_iter_burnin", target_vf=model_train.train_model.vf_run, target_dist_param=model_train.train_model.pi_run) get_session().run(tf.variables_initializer(tf.global_variables())) global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(session, global_variables, comm=comm) # pylint: disable=E1101 if load_path is not None: print("Load model...") if eval["load_id"]: # Only works with mongodb as backend, not with tinydb raise NotImplementedError("Requires MongoDB backend to work") docs = get_docs(db_uri, db_name, "runs") projection = {'config': True} projection.update({'artifacts': True}) doc = docs.find_one({'_id': eval["load_id"]}, projection) print("Loading model from db to disc") file_id = get_file_id(doc, eval["file_name"]) load_path = os.path.join(logger.get_dir(), "loadmodel_{}".format(_run._id)) save_file_from_db(file_id, load_path, db_uri, db_name) model_train.load(load_path) if eval["switch_after_load"]: switch_training_model(0, is_mpi_root, model_train, _run, iter_loss, session, comm, save=False) # Instantiate the runner object runner = Runner(env=env, model=model_train, model_burnin=model_burnin, nsteps=nsteps, gamma=gamma, lam=lam, iter_loss=iter_loss, eval=eval) epinfobuf = deque(maxlen=100) burnin_data_idx = 0 all_burnin_data = None assert iter_loss["timesteps_anneal"] > iter_loss["v2_buffer_size"] * env.num_envs * nsteps, \ "{}, {}".format(iter_loss["timesteps_anneal"], iter_loss["v2_buffer_size"] * env.num_envs * nsteps) # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps // nbatch current_cycle_count = 0 for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 num_timesteps = update * nbatch # Start timer frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) # 'Burnin_phase' tells us whether we need regularization cycle_count, alpha_reg, burnin_phase = scheduling( num_timesteps, iter_loss, "alpha_reg") if cycle_count != current_cycle_count: current_cycle_count = cycle_count if iter_loss["v2"]: 
logger.info("Training student") train_student( teacher=model_train, student=model_burnin, data=all_burnin_data, iter_loss=iter_loss, lr=lrnow, cliprange=cliprangenow, nminibatches=nminibatches, session=session, max_idx=burnin_data_idx, nenvs=env.num_envs, nsteps=nsteps, id=_run._id, ) switch_training_model(update, is_mpi_root, model_train, _run, iter_loss, session, comm) # Resetting all_burnin_data = None burnin_data_idx = 0 logger.info("Switched training model") tstart = time.perf_counter() if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # Get minibatch obs, returns, b_returns, masks, actions, values, b_values, neglogpacs, states, b_states, epinfos, burnin_data= \ runner.run(burnin_phase) #pylint: disable=E0632 if burnin_phase and (iter_loss["v2"] or eval["save_latent"]): print("Saving data") if iter_loss["v2_use_files"] or eval["save_latent"]: # Burnin_data_idx is incremented by nsteps, which is nr. of files save_data(burnin_data, burnin_data_idx, _run._id, nsteps) else: if all_burnin_data is None: all_burnin_data = get_all_burnin_data_dict( env, iter_loss, nsteps, comm) for key, value in burnin_data.items(): all_burnin_data[key][burnin_data_idx:burnin_data_idx + nsteps] = value burnin_data_idx += nsteps if update % log_interval == 0 and is_mpi_root: logger.info('Done.') epinfobuf.extend(epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. mblossvals = [] mblossvals_burnin = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices_train = (arr[mbinds] for arr in (obs, returns, actions, values, neglogpacs)) slices_burnin = (arr[mbinds] for arr in (obs, b_returns, actions, b_values, neglogpacs)) stats_train, train_op_train, feed = model_train.train( lrnow, cliprangenow, *slices_train, ) stats_burnin, train_op_burnin, feed_burnin = model_burnin.train( lrnow, cliprangenow, *slices_burnin, alpha=alpha_reg, ) feed.update(feed_burnin) # Needs both! 
fetches = {} if eval["eval_only"]: pass session_outputs = {} elif not burnin_phase or iter_loss["v2"]: # For v2, normal PPO training is only the old policy, # The student policy is trained differently fetches.update({ "stats_train": stats_train, }) fetches.update({"train_op": train_op_train}) session_outputs = session.run(fetches, feed) elif (iter_loss["update_old_policy"] or (iter_loss["update_old_policy_in_initial"] and cycle_count == 0)): fetches.update({"stats_burnin": stats_burnin}) fetches.update({"train_op": train_op_burnin}) session_outputs_burnin = session.run(fetches, feed) fetches.update({ "stats_train": stats_train, }) fetches.update({"train_op": train_op_train}) session_outputs = session.run(fetches, feed) session_outputs.update(session_outputs_burnin) else: fetches.update({"stats_burnin": stats_burnin}) fetches.update({"train_op": train_op_burnin}) session_outputs = session.run(fetches, feed) if "stats_train" in session_outputs.keys(): mblossvals.append(session_outputs["stats_train"]) else: mblossvals.append( [0 for loss in model_train.loss_names]) if "stats_burnin" in session_outputs.keys(): mblossvals_burnin.append( session_outputs["stats_burnin"]) else: mblossvals_burnin.append( [0 for loss in model_burnin.loss_names]) else: # recurrent version raise NotImplementedError("Recurrent version not implemented") # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) lossvals_burnin = np.mean(mblossvals_burnin, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("misc/serial_timesteps", update * nsteps) logger.logkv("misc/nupdates", update) logger.logkv("misc/total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("misc/explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model_train.loss_names): logger.logkv('loss/' + lossname, lossval) for (lossval, lossname) in zip(lossvals_burnin, model_burnin.loss_names): logger.logkv('loss_burnin/' + lossname, lossval) logger.logkv("schedule/alpha_reg", alpha_reg) logger.logkv("schedule/current_cycle_count", current_cycle_count) logger.logkv("schedule/burnin_phase", burnin_phase) logger.dumpkvs() if is_mpi_root: save_model(model_train, "model", update, _run) return model_train
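# The logging block above reports explained_variance(values, returns), i.e. how much of the
# return variance the value function accounts for: 1 means a perfect predictor, <= 0 means no
# better than predicting a constant. NumPy sketch of that metric, illustrative only:
import numpy as np

def explained_variance_np(ypred, y):
    var_y = np.var(y)
    return np.nan if var_y == 0 else 1.0 - np.var(y - ypred) / var_y

returns = np.array([1.0, 2.0, 3.0, 4.0])
print(explained_variance_np(returns, returns))                  # 1.0: perfect value function
print(explained_variance_np(np.zeros_like(returns), returns))   # 0.0: constant prediction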
def __init__(self, ob_space, ac_space, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None, normalize_observations=True, normalize_returns=True, use_tensorboard=False, tb_log_dir=None): self.sess = sess = get_session() self.use_tensorboard = use_tensorboard if MPI is not None and comm is None: comm = MPI.COMM_WORLD # CREATE OUR TWO MODELS network_spec = [ { 'layer_type': 'dense', 'units': int (256), 'activation': 'relu', 'nodes_in': ['observation_self'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] } ] vnetwork_spec = [ { 'layer_type': 'dense', 'units': int (256), 'activation': 'relu', 'nodes_in': ['observation_self'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] } ] # Act model that is used for both sampling act_model = PpoPolicy(scope='ppo', ob_space=ob_space, ac_space=ac_space, network_spec=network_spec, v_network_spec=vnetwork_spec, stochastic=True, reuse=False, build_act=True, trainable_vars=None, not_trainable_vars=None, gaussian_fixed_var=True, weight_decay=0.0, ema_beta=0.99999, normalize_observations=normalize_observations, normalize_returns=normalize_returns) # Train model for training train_model = PpoPolicy(scope='ppo', ob_space=ob_space, ac_space=ac_space, network_spec=network_spec, v_network_spec=vnetwork_spec, stochastic=True, reuse=True, build_act=True, trainable_vars=None, not_trainable_vars=None, gaussian_fixed_var=True, weight_decay=0.0, ema_beta=0.99999, normalize_observations=normalize_observations, normalize_returns=normalize_returns) # CREATE THE PLACEHOLDERS self.A = A = {k: v.sample_placeholder([None]) for k, v in train_model.pdtypes.items()} self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = sum([train_model.pds[k].neglogp(A[k]) for k in train_model.pdtypes.keys()]) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. 
#entropy = tf.reduce_mean(train_model.entropy) entropy = tf.reduce_mean(sum([train_model.pds[k].entropy() for k in train_model.pdtypes.keys()])) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.scaled_value_tensor vpredclipped = OLDVPRED + tf.clip_by_value(vpred - OLDVPRED, - CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables(scope="ppo") # 2. Build our trainer if comm is not None and comm.Get_size() > 1: self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] self.train_model = train_model self.act_model = act_model self.step = act_model.act self.value = act_model.value self.initial_state = act_model.zero_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) if MPI is not None: sync_from_root(sess, global_variables, comm=comm) #pylint: disable=E1101 if self.use_tensorboard: self.attach_tensorboard(tb_log_dir) self.tb_step = 0
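# Above, the total negative log-probability is the sum of the per-head terms (one
# distribution per action key), which treats the heads as independent:
# -log p(a) = -sum_k log p(a_k). Tiny NumPy check of that identity for two independent
# categorical heads, illustrative only:
import numpy as np

p_head1 = np.array([0.7, 0.3])          # distribution over head-1 actions
p_head2 = np.array([0.1, 0.6, 0.3])     # distribution over head-2 actions
a1, a2 = 0, 1                           # a sampled joint action

neglogp_sum = -np.log(p_head1[a1]) - np.log(p_head2[a2])
neglogp_joint = -np.log(p_head1[a1] * p_head2[a2])   # joint probability under independence
print(np.isclose(neglogp_sum, neglogp_joint))        # True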
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None, microbatch_size=None, mix_mode='nomix', mix_alpha=0.2, mix_beta=0.2, fix_representation=False, use_l2reg=False, l2reg_coeff=1e-4): self.sess = sess = get_session() if MPI is not None and comm is None: comm = MPI.COMM_WORLD with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess, mix_mode=mix_mode) else: train_model = policy(microbatch_size, nsteps, sess, mix_mode=mix_mode) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) # Interpolating the supervision if mix_mode == 'mixreg': # get coeff and indices coeff = train_model.coeff indices = train_model.indices other_indices = train_model.other_indices # mixup OLDNEGLOGPAC = coeff * tf.gather(OLDNEGLOGPAC, indices, axis=0) \ + (1 - coeff) * tf.gather( OLDNEGLOGPAC, other_indices, axis=0) OLDVPRED = coeff * tf.gather(OLDVPRED, indices, axis=0) \ + (1 - coeff) * tf.gather(OLDVPRED, other_indices, axis=0) R = coeff * tf.gather(R, indices, axis=0) \ + (1 - coeff) * tf.gather(R, other_indices, axis=0) ADV = coeff * tf.gather(ADV, indices, axis=0) \ + (1 - coeff) * tf.gather(ADV, other_indices, axis=0) A = tf.gather(A, indices, axis=0) elif mix_mode == 'mixobs': # get indices indices = train_model.indices # gather OLDNEGLOGPAC = tf.gather(OLDNEGLOGPAC, train_model.indices, axis=0) OLDVPRED = tf.gather(OLDVPRED, train_model.indices, axis=0) R = tf.gather(R, train_model.indices, axis=0) ADV = tf.gather(ADV, train_model.indices, axis=0) A = tf.gather(A, train_model.indices, axis=0) elif mix_mode == 'nomix': pass else: raise ValueError(f"Unknown mixing mode: {mix_mode} !") # Store the nodes to be recorded self.loss_names = [] self.stats_list = [] ############ CALCULATE LOSS ############ # Total loss = Policy gradient loss - entropy * entropy coefficient # + Value coefficient * value loss # Normalizing advantage ADV = (ADV - tf.reduce_mean(ADV)) / (reduce_std(ADV) + 1e-8) # Calculate the entropy entropy = tf.reduce_mean(train_model.pd.entropy()) # Calculate value loss vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate policy gradient loss neglogpac = train_model.pd.neglogp(A) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # Record some information approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) self.loss_names.extend([ 'total_loss', 
'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', ]) self.stats_list.extend([ loss, pg_loss, vf_loss, entropy, approxkl, clipfrac, ]) ############################################ ############ UPDATE THE PARAMETERS ############ # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') if use_l2reg: weight_params = [v for v in params if '/b' not in v.name] l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params]) self.loss_names.append('l2_loss') self.stats_list.append(l2_loss) loss = loss + l2_loss * l2reg_coeff if fix_representation: params = params[-4:] # 2. Build our trainer if comm is not None and comm.Get_size() > 1: self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) # 4. Clip the gradient if required if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) ############################################### self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self._init_op = tf.variables_initializer(params) self._sync_param = lambda: sync_from_root(sess, params, comm=comm) self.mix_mode = mix_mode self.mix_alpha = mix_alpha # JAG: Add beta parameter self.mix_beta = mix_beta self.fix_representation = fix_representation self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.adv_gradient = act_model.adv_gradient self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") # Exclude the random convolution layer from syncing global_variables = [ v for v in global_variables if 'randcnn' not in v.name ] if MPI is not None: sync_from_root(sess, global_variables, comm=comm)
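# mixreg above draws a Beta-distributed coefficient and interpolates the supervision targets
# of two minibatch index sets:
#   x_mix = coeff * x[indices] + (1 - coeff) * x[other_indices].
# NumPy sketch of that interpolation; the real coeff/indices come from train_model in the
# code above, so everything here is a stand-in:
import numpy as np

rng = np.random.default_rng(0)

def mix_targets(x, indices, other_indices, coeff):
    return coeff * x[indices] + (1.0 - coeff) * x[other_indices]

returns = np.array([1.0, 2.0, 3.0, 4.0])
indices = np.arange(4)
other_indices = rng.permutation(4)
coeff = rng.beta(0.2, 0.2, size=4)          # per-sample mixing weights in (0, 1)
print(mix_targets(returns, indices, other_indices, coeff))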
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, lf_coef, max_grad_norm, init_labda=1., microbatch_size=None, threshold=1.): self.sess = sess = get_session() with tf.variable_scope('ppo2_lyapunov_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.l_ADV = l_ADV = tf.placeholder(tf.float32, [None]) # Both of these returns (R and v_l) are discounted self.R = R = tf.placeholder(tf.float32, [None]) self.v_l = v_l = tf.placeholder(tf.float32, [None]) log_labda = tf.get_variable('ppo2_lyapunov_model/Labda', None, tf.float32, initializer=tf.log(init_labda)) self.labda = tf.exp(log_labda) self.safety_threshold = tf.placeholder(tf.float32, None, 'threshold') self.threshold = threshold # self.log_labda = tf.placeholder(tf.float32, None, 'Labda') # self.labda = tf.constant(10.) # self.Lam=10. # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.OLDLPRED = OLDLPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Get the predicted value lpred = train_model.lf lpredclipped = OLDLPRED + tf.clip_by_value(train_model.lf - OLDLPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value lf_losses1 = tf.square(lpred - v_l) # Clipped value lf_losses2 = tf.square(lpredclipped - v_l) lf_loss = .5 * tf.reduce_mean(tf.maximum(lf_losses1, lf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining safety loss lpred = train_model.lf lpred_ = train_model.lf_ # self.l_lambda = tf.reduce_mean(ratio * tf.stop_gradient(lpred_) - tf.stop_gradient(lpred)) l_lambda1 = tf.reduce_mean(ratio * l_ADV + v_l - self.safety_threshold) l_lambda2 = tf.reduce_mean( tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) * l_ADV + v_l - self.safety_threshold) l_lambda = tf.maximum(l_lambda1, l_lambda2) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))+ l_lambda*tf.stop_gradient(self.labda) - \ tf.stop_gradient(l_lambda) * log_labda # pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)+ self.l_lambda * self.labda) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac -
OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + lf_loss * lf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_lyapunov_model') # 2. Build our trainer if MPI is not None: self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = [ 'policy_loss', 'value_loss', 'safety_value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'lagrangian' ] self.stats_list = [ pg_loss, vf_loss, lf_loss, entropy, approxkl, clipfrac, self.labda ] self.train_model = train_model self.act_model = act_model self.step = act_model.step self.eval_step = act_model.eval_step self.value = act_model.value self.l_value = act_model.l_value self.l_value_ = act_model.l_value_ self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
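A hedged sketch of the Lagrange-multiplier mechanics in the Lyapunov model above: parameterising the multiplier as exp(log_labda) keeps it positive, and the loss term -stop_gradient(l_lambda) * log_labda gives log_labda a gradient of -l_lambda, so plain gradient descent raises the multiplier whenever the safety-constraint term l_lambda is positive. The learning rate and the loop below are illustrative:

import numpy as np

def update_multiplier(log_labda, constraint_violation, lr=1e-3):
    # d/d(log_labda) of -constraint_violation * log_labda is -constraint_violation,
    # so a gradient-descent step increases log_labda when the constraint is violated.
    grad = -constraint_violation
    return log_labda - lr * grad

log_labda = np.log(1.0)                    # matches init_labda=1.
for violation in [0.5, 0.2, -0.1]:         # positive => safety constraint violated
    log_labda = update_multiplier(log_labda, violation)
    print(np.exp(log_labda))               # the effective multiplier, always positive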
def __init__(self, *, network, env, lr=3e-4, cliprange=0.2, nsteps=128, nminibatches=4, noptepochs=4, ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, mpi_rank_weight=1, comm=None, microbatch_size=None, load_path=None, **network_kwargs): """ Parameters: ---------- network: policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for the full list) specifying a standard network architecture, or a function that takes a tensorflow tensor as input and returns a tuple (output_tensor, extra_feed), where output_tensor is the last network layer output, extra_feed is None for feed-forward nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent nets. See common/models.py/lstm for more details on using recurrent nets in policies. env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using the baselines.common.vec_env.DummyVecEnv class. lr: float or function learning rate, either a constant or a schedule function [0,1] -> R+ where 1 is the beginning of training and 0 is the end (see the schedule sketch after this constructor). cliprange: float or function clipping range, either a constant or a schedule function [0,1] -> R+ where 1 is the beginning of training and 0 is the end. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is the number of environment copies simulated in parallel) nminibatches: int number of training minibatches per update. For recurrent policies, this should be less than or equal to the number of environments run in parallel. noptepochs: int number of training epochs per update ent_coef: float policy entropy coefficient in the optimization objective vf_coef: float value function loss coefficient in the optimization objective gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and the arguments to a particular type of network. For instance, the 'mlp' network architecture has arguments num_hidden and num_layers. 
""" self.sess = sess = get_session() if MPI is not None and comm is None: comm = MPI.COMM_WORLD policy = build_policy(env, network, **network_kwargs) self.env = env if isinstance(lr, float): self.lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): self.cliprange = constfn(cliprange) else: assert callable(cliprange) self.nminibatches = nminibatches # if eval_env is not None: # eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) # Calculate the batch_size self.nenvs = self.env.num_envs self.nsteps = nsteps self.nbatch = self.nenvs * self.nsteps self.nbatch_train = self.nbatch // nminibatches self.noptepochs = noptepochs with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(self.nenvs, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(self.nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder( [None]) # action placeholder self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # ratio 裁剪量 # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer if comm is not None and comm.Get_size() > 1: self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.def_path_pre = os.path.dirname( os.path.abspath(__file__)) + '/tmp/' initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables, comm=comm) # pylint: disable=E1101 if load_path is not None: self.load_newest(load_path) # Instantiate the runner object self.runner = Runner(env=self.env, model=self, nsteps=nsteps, gamma=gamma, lam=lam)
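As the docstring above notes, lr and cliprange can be either plain floats or schedules over the remaining-training fraction. The sketch below shows that contract; the commented-out constructor call is hypothetical and only marks where the schedules would be passed:

def linear_lr(frac):
    # frac runs from 1.0 at the start of training down to 0.0 at the end
    return 3e-4 * frac

def constant_clip(frac):
    return 0.2                     # same effect as passing cliprange=0.2

# Hypothetical call showing where the schedules are passed:
# model = Model(network='mlp', env=venv, lr=linear_lr, cliprange=constant_clip)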
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training train_model = policy(nbatch_train, nsteps, sess) # CREATE THE PLACEHOLDERS A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) # Keep track of old actor OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) # Cliprange CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # Returns = R + yV(s') advs = returns - values # Normalize the advantages advs = (advs - advs.mean()) / (advs.std() + 1e-8) td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr, CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run( [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map)[:-1] self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) if MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") sync_from_root(sess, global_variables) #pylint: disable=E1101
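A NumPy sketch of what this train() function feeds into the graph: normalised advantages, the clipped surrogate objective, and the clipped value loss, mirroring the symbolic definitions above. All arrays are illustrative stand-ins:

import numpy as np

def ppo_losses(advs, returns, values_old, neglogp_old, neglogp_new, vpred_new, cliprange=0.2):
    # Same advantage normalisation as train() above
    advs = (advs - advs.mean()) / (advs.std() + 1e-8)
    ratio = np.exp(neglogp_old - neglogp_new)                # pi_new / pi_old
    pg_loss = np.mean(np.maximum(-advs * ratio,
                                 -advs * np.clip(ratio, 1 - cliprange, 1 + cliprange)))
    vpred_clipped = values_old + np.clip(vpred_new - values_old, -cliprange, cliprange)
    vf_loss = 0.5 * np.mean(np.maximum(np.square(vpred_new - returns),
                                       np.square(vpred_clipped - returns)))
    return pg_loss, vf_loss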
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, load_path, skip_layers=[], frozen_weights=[], transfer_weights=False, microbatch_size=None): self.sess = sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS def print_weights(params): variables_names = [v.name for v in params] values = sess.run(variables_names) for k, v in zip(variables_names, values): if str(k) == 'ppo2_model/vf/w:0': print("Variable: " + str(k)) print("Shape: " + str(v.shape)) print(v) # Initialise the already_initialised array already_inits = [] # Transfer weights from an already trained model # TODO: this is if we are going to use transfer learning if transfer_weights: # Get all variables from the model. variables_to_restore = { v.name.split(":")[0]: v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) } # Skip some variables during restore. 
skip_pretrained_var = skip_layers variables_to_restore = { v: variables_to_restore[v] for v in variables_to_restore if not any(x in v for x in skip_pretrained_var) } already_inits = variables_to_restore # Restore the remaining variables if variables_to_restore: saver_pre_trained = tf.train.Saver( var_list=variables_to_restore) saver_pre_trained.restore( sess, tf.train.latest_checkpoint(load_path)) # Collect all trainable variables params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) # Freeze certain variables params = tf.contrib.framework.filter_variables( params, include_patterns=['model'], exclude_patterns=frozen_weights) # Initialise all the other variables ''' """Initialize all the uninitialized variables in the global scope.""" new_variables = set(tf.global_variables()) new_variables = tf.contrib.framework.filter_variables( new_variables, include_patterns=[], exclude_patterns= variables_to_restore) tf.get_default_session().run(tf.variables_initializer(new_variables)) ''' else: # If we are not using transfer learning # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer if MPI is not None: self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state #self.save = functools.partial(save_variables, sess=sess) #self.load = functools.partial(load_variables, sess=sess) initialize(already_inits) global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
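A plain-Python sketch of the name filtering that drives the transfer-learning branch above: variables whose names contain any skip_layers pattern are excluded from the restore set, and variables matching frozen_weights patterns are dropped from the trainable set. The variable names below are illustrative:

def split_for_transfer(var_names, skip_layers, frozen_weights):
    restore = [v for v in var_names
               if not any(pattern in v for pattern in skip_layers)]
    trainable = [v for v in var_names
                 if not any(pattern in v for pattern in frozen_weights)]
    return restore, trainable

names = ['ppo2_model/pi/w:0', 'ppo2_model/pi/b:0', 'ppo2_model/vf/w:0']
print(split_for_transfer(names, skip_layers=['vf'], frozen_weights=['pi/w']))
# (['ppo2_model/pi/w:0', 'ppo2_model/pi/b:0'], ['ppo2_model/pi/b:0', 'ppo2_model/vf/w:0'])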
def __init__(self, ob_space, ac_space, max_grad_norm, beta, icm_lr_scale, idf): sess = get_session() #TODO find a better way input_shape = [ob_space.shape[0], ob_space.shape[1], ob_space.shape[2]] # input_shape = ob_space print("ICM state Input shape ", np.shape(input_shape), " ", input_shape) self.action_shape = 36 self.idf = idf # Placeholders self.state_ = phi_state = tf.placeholder(tf.float32, [None, *input_shape], name="icm_state") self.next_state_ = phi_next_state = tf.placeholder( tf.float32, [None, *input_shape], name="icm_next_state") self.action_ = action = tf.placeholder(tf.float32, [None], name="icm_action") # self.R = rewards = tf.placeholder(tf.float32, shape=[None], name="maxR") with tf.variable_scope('icm_model'): # Feature encoding # Aka pass state and next_state to create phi(state), phi(next_state) # state --> phi(state) print("Feature Encoding of phi state with shape :: ", self.state_) phi_state = self.feature_encoding(self.state_) with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE): # next_state to phi(next_state) phi_next_state = self.feature_encoding(self.next_state_) # INVERSE MODEL if self.idf: pred_actions_logits, pred_actions_prob = self.inverse_model( phi_state, phi_next_state) # FORWARD MODEL pred_phi_next_state = self.forward_model(action, phi_state) # CALCULATE THE ICM LOSS # Inverse Loss LI # We calculate the cross entropy between our ât and at # Squeeze the labels (required) labels = tf.cast(action, tf.int32) print("prediction pred_actions_logits") if self.idf: self.inv_loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pred_actions_logits, labels=labels), name="inverse_loss") # Forward Loss # LF = 1/2 || pred_phi_next_state - phi_next_state || # TODO 0.5 * ? self.forw_loss_axis = tf.reduce_mean(tf.square( tf.subtract(pred_phi_next_state, phi_next_state)), axis=-1, name="forward_loss_axis") self.forw_loss = tf.reduce_mean(tf.square( tf.subtract(pred_phi_next_state, phi_next_state)), name="forward_loss") # Todo predictor lr scale ? # ICM_LOSS = [(1 - beta) * LI + beta * LF ] * Predictor_Lr_scale if self.idf: self.icm_loss = ((1 - beta) * self.inv_loss + beta * self.forw_loss ) #* icm_lr_scale else: self.icm_loss = self.forw_loss #### # self.icm_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) # print("ICM var list ::: " , self.icm_var_list) #### # # if max_grad_norm is not None : # t_icm_grads , _ = tf.clip_by_global_norm(self.icm_loss, constants['GRAD_NORM_CLIP'] ) # t_icm_grads_and_vars = list(zip(self.icm_loss , self.icm_var_list)) # print("\n\n\nit works \n\n\n") # # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters self.icm_params = tf.trainable_variables( 'icm_model') ## var_list same as ## testing phase self.predgrads = tf.gradients(self.icm_loss, self.icm_params) self.predgrads, _ = tf.clip_by_global_norm(self.predgrads, max_grad_norm) self.pred_grads_and_vars = list(zip(self.predgrads, self.icm_params)) ## testing phase # print("\n\nTrainable variables \n ",icm_params) # # 2. Build our trainer self.icm_trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=1e-3, epsilon=1e-5) # # 3. 
Calculate the gradients icm_grads_and_var = self.icm_trainer.compute_gradients( self.icm_loss, self.icm_params) # # t_grads_and_var = tf.gradients() icm_grads, icm_var = zip(*icm_grads_and_var) if max_grad_norm is not None: # # Clip the gradients (normalize) icm_grads, icm__grad_norm = tf.clip_by_global_norm( icm_grads, max_grad_norm) icm_grads_and_var = list(zip(icm_grads, icm_var)) # # zip aggregate each gradient with parameters associated # # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self._icm_train = self.icm_trainer.apply_gradients(icm_grads_and_var) if MPI.COMM_WORLD.Get_rank() == 0: print("Initialize") initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") # print("GLOBAL VARIABLES", global_variables) sync_from_root(sess, global_variables) #pylint: disable=E1101
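A NumPy sketch of the curiosity signal behind the ICM module above: the forward model's per-sample squared prediction error on phi(next_state) gives the forward loss LF (and is typically scaled into an intrinsic reward), while the total ICM loss mixes the inverse loss LI and LF with the beta coefficient, as in the code. The shapes and the reward scale below are illustrative:

import numpy as np

def icm_losses(phi_next, phi_next_pred, inv_loss, beta=0.2, reward_scale=0.01):
    forward_loss = np.mean(np.square(phi_next_pred - phi_next), axis=-1)  # per-sample LF
    intrinsic_reward = reward_scale * forward_loss                        # curiosity bonus
    icm_loss = (1.0 - beta) * inv_loss + beta * np.mean(forward_loss)     # LI/LF mix
    return intrinsic_reward, icm_loss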
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): self.max_grad_norm = max_grad_norm self.head_idx_current_batch = 0 sess = tf.compat.v1.get_default_session() train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm) act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, max_grad_norm) # in case we don't use rep loss rep_loss = None # HEAD_IDX = tf.compat.v1.placeholder(tf.int32, [None]) LATENT_FACTORS = train_model.pdtype.sample_placeholder( [ Config.REP_LOSS_M, Config.POLICY_NHEADS, Config.NUM_ENVS, count_latent_factors(Config.ENVIRONMENT) ], name='LATENT_FACTORS') ADV = tf.compat.v1.placeholder(tf.float32, [None], name='ADV') U_T = tf.compat.v1.placeholder(tf.float32, [None, 256, 128]) Z_T_1 = tf.compat.v1.placeholder(tf.float32, [None, 256, 128]) R = tf.compat.v1.placeholder(tf.float32, [None], name='R') R_NCE = tf.compat.v1.placeholder( tf.float32, [Config.REP_LOSS_M, Config.POLICY_NHEADS, Config.NUM_ENVS], name='R_NCE') OLDNEGLOGPAC = tf.compat.v1.placeholder(tf.float32, [None], name='OLDNEGLOGPAC') LR = tf.compat.v1.placeholder(tf.float32, [], name='LR') CLIPRANGE = tf.compat.v1.placeholder(tf.float32, [], name='CLIPRANGE') STEP = tf.compat.v1.placeholder(tf.float32, [], name='STEP') # TD loss for critic # VF loss OLDVPRED = tf.compat.v1.placeholder(tf.float32, [None], name='OLDVPRED') vpred = train_model.vf_train # Same as vf_run for SNI and default, but noisy for SNI2 while the boostrap is not if Config.CUSTOM_REP_LOSS and Config.POLICY_NHEADS > 1: vpred = vpred[self.head_idx_current_batch] vpredclipped = OLDVPRED + tf.clip_by_value(vpred - OLDVPRED, -CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean( input_tensor=tf.maximum(vf_losses1, vf_losses2)) neglogpac_train = train_model.pd_train[0].neglogp(train_model.A) ratio_train = tf.exp(OLDNEGLOGPAC - neglogpac_train) pg_losses_train = -ADV * ratio_train pg_losses2_train = -ADV * tf.clip_by_value( ratio_train, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean( input_tensor=tf.maximum(pg_losses_train, pg_losses2_train)) approxkl_train = .5 * tf.reduce_mean( input_tensor=tf.square(neglogpac_train - OLDNEGLOGPAC)) clipfrac_train = tf.reduce_mean(input_tensor=tf.cast( tf.greater(tf.abs(ratio_train - 1.0), CLIPRANGE), dtype=tf.float32)) if Config.BETA >= 0: entropy = tf.reduce_mean(input_tensor=train_model.pd_train[0]. _components_distribution.entropy()) else: entropy = tf.reduce_mean( input_tensor=train_model.pd_train[0].entropy()) # Add entropy and policy loss for the samples as well if Config.SNI or Config.SNI2: neglogpac_run = train_model.pd_run.neglogp(train_model.A) ratio_run = tf.exp(OLDNEGLOGPAC - neglogpac_run) pg_losses_run = -ADV * ratio_run pg_losses2_run = -ADV * tf.clip_by_value( ratio_run, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss += tf.reduce_mean( input_tensor=tf.maximum(pg_losses_run, pg_losses2_run)) pg_loss /= 2. entropy += tf.reduce_mean( input_tensor=train_model.pd_run.entropy()) entropy /= 2. approxkl_run = .5 * tf.reduce_mean( input_tensor=tf.square(neglogpac_run - OLDNEGLOGPAC)) clipfrac_run = tf.reduce_mean( input_tensor=tf.cast(tf.greater(tf.abs(ratio_run - 1.0), CLIPRANGE), dtype=tf.float32)) else: approxkl_run = tf.constant(0.) clipfrac_run = tf.constant(0.) 
adv_pred = tf.reduce_mean( input_tensor=tf.square(tf.stop_gradient(ADV) - train_model.adv_pi)) # v_pred = tf.reduce_mean(input_tensor=tf.square(tf.stop_gradient(vpred) - train_model.v_pi)) # bc = tf.reduce_mean(input_tensor=(tf.stop_gradient(OLDNEGLOGPAC)-neglogpac_train)) params = tf.compat.v1.trainable_variables() weight_params = [v for v in params if '/b' not in v.name] total_num_params = 0 for p in params: shape = p.get_shape().as_list() num_params = np.prod(shape) mpi_print('param', p, num_params) total_num_params += num_params mpi_print('total num params:', total_num_params) l2_loss = tf.reduce_sum( input_tensor=[tf.nn.l2_loss(v) for v in weight_params]) # The first occurence should be in the train_model if Config.BETA >= 0: info_loss = tf.compat.v1.get_collection(key="INFO_LOSS", scope="model/info_loss") beta = Config.BETA elif Config.BETA_L2A >= 0: info_loss = tf.compat.v1.get_collection(key="INFO_LOSS_L2A", scope="model/info_loss") beta = Config.BETA_L2A else: info_loss = [tf.constant(0.)] beta = 0 # print(info_loss) assert len(info_loss) == 1 info_loss = info_loss[0] """" Sinkhorn clustering of state sequences """ p_t = tf.nn.log_softmax( tf.linalg.matmul(train_model.u_t, train_model.protos) / 0.1, axis=1) cluster_loss = -tf.compat.v1.reduce_mean( tf.compat.v1.reduce_sum(tf.stop_gradient(train_model.codes) * p_t, axis=1)) #+ 0.25 * adv_pred pi_loss = pg_loss - entropy * ent_coef + Config.REP_LOSS_WEIGHT * train_model.rep_loss + Config.REP_LOSS_WEIGHT * cluster_loss #+ vf_coef*vf_loss v_loss = vf_loss * vf_coef aux_loss = ( (1 - 0.0368)**STEP ) * Config.REP_LOSS_WEIGHT * train_model.rep_loss + Config.REP_LOSS_WEIGHT * cluster_loss #0.5 * v_pred + bc if Config.SYNC_FROM_ROOT: trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) trainer_v = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) trainer_aux = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: trainer = tf.compat.v1.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) trainer_v = tf.compat.v1.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) self.opt = trainer # import ipdb;ipdb.set_trace() pi_params = [p for p in params if 'pi_branch' in p.name] grads_and_var_pi = trainer.compute_gradients(pi_loss, pi_params) grads_pi, var_pi = zip(*grads_and_var_pi) if max_grad_norm is not None: grads_pi, _grad_norm_pi = tf.clip_by_global_norm( grads_pi, max_grad_norm) grads_and_var_pi = list(zip(grads_pi, var_pi)) tot_norm = tf.zeros((1, )) for g, v in grads_and_var_pi: tot_norm += tf.norm(g) tot_norm = tf.reshape(tot_norm, []) _train_pi = trainer.apply_gradients(grads_and_var_pi) v_params = [p for p in params if 'model_0' in p.name] grads_and_var_v = trainer_v.compute_gradients(v_loss, v_params) grads_v, var_v = zip(*grads_and_var_v) if max_grad_norm is not None: grads_v, _grad_norm_v = tf.clip_by_global_norm( grads_v, max_grad_norm) grads_and_var_v = list(zip(grads_v, var_v)) _train_v = trainer_v.apply_gradients(grads_and_var_v) grads_and_var_aux = trainer_aux.compute_gradients(aux_loss, pi_params) grads_aux, var_aux = zip(*grads_and_var_aux) if max_grad_norm is not None: grads_aux, _grad_norm_aux = tf.clip_by_global_norm( grads_aux, max_grad_norm) grads_and_var_aux = list(zip(grads_aux, var_aux)) _train_aux = trainer_aux.apply_gradients(grads_and_var_aux) def train(lr, cliprange, states_nce, anchors_nce, labels_nce, rewards_nce, infos_nce, obs, returns, masks, actions, infos, values, neglogpacs, step, states=None, train_target='pi'): values = values[:, self. 
head_idx_current_batch] if Config.CUSTOM_REP_LOSS else values advs = returns - values adv_mean = np.mean(advs, axis=0, keepdims=True) adv_std = np.std(advs, axis=0, keepdims=True) advs = (advs - adv_mean) / (adv_std + 1e-8) # import ipdb;ipdb.set_trace() td_map = { train_model.X: obs, train_model.A: actions, ADV: advs, R: returns, LR: lr, train_model.X_pi: obs, CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values, STEP: step, train_model.REP_PROC: states_nce } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks if train_target == 'pi': pi_res = sess.run([ pi_loss, entropy, train_model.rep_loss, cluster_loss, _train_pi ], td_map)[:-1] return pi_res elif train_target == 'value': v_res = sess.run([v_loss, _train_v], td_map)[:-1] return v_res[0] elif train_target == 'aux': aux_res = sess.run( [train_model.rep_loss, cluster_loss, _train_aux], td_map)[:-1] return aux_res self.loss_names = ['policy_loss', 'rep_loss', 'value_loss'] def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = save self.load = load self.rep_vec = act_model.rep_vec self.custom_train = train_model.custom_train if Config.SYNC_FROM_ROOT: if MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.compat.v1.get_collection( tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="") sess.run(tf.compat.v1.global_variables_initializer()) sync_from_root(sess, global_variables) #pylint: disable=E1101 else: initialize()
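A minimal sketch of the joblib-based checkpointing used by save()/load() above: parameters are dumped as a flat list of arrays and restored by position, so the parameter list must be rebuilt in the same order before loading. The file path is illustrative:

import joblib
import numpy as np

params = [np.random.randn(4, 2), np.zeros(2)]      # stand-ins for sess.run(params)
joblib.dump(params, '/tmp/model_params.joblib')

loaded = joblib.load('/tmp/model_params.joblib')
for p, loaded_p in zip(params, loaded):
    p[...] = loaded_p                              # analogue of p.assign(loaded_p)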
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None, microbatch_size=None, model_index=0): self.sess = sess = get_session() self.model_index = model_index if MPI is not None and comm is None: comm = MPI.COMM_WORLD with tf.variable_scope('ppo2_model%s' % model_index, reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model%s' % model_index) # print("para",model_index,params) # 2. Build our trainer if comm is not None and comm.Get_size() > 1: self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_trainable_variables, scope="ppo2_model%s" % model_index, sess=sess) self.load = functools.partial(load_trainable_variables, scope="ppo2_model%s" % model_index, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") # print("global_variables",model_index,global_variables) if MPI is not None: sync_from_root(sess, global_variables, comm=comm) #pylint: disable=E1101
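Every variant above clips gradients with tf.clip_by_global_norm before apply_gradients. The NumPy sketch below shows the semantics of that step: all gradients are rescaled by a single factor when their joint L2 norm exceeds max_grad_norm. It is an approximation of the TF op, not a drop-in replacement:

import numpy as np

def clip_by_global_norm(grads, max_grad_norm):
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    scale = min(1.0, max_grad_norm / (global_norm + 1e-8))
    return [g * scale for g in grads], global_norm

grads = [np.ones((3, 3)), 2.0 * np.ones(3)]
clipped, norm = clip_by_global_norm(grads, max_grad_norm=0.5)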
def main(): """Run DQN until the environment throws an exception.""" # Hyperparameters learning_rate = 2.5e-4 gamma = 0.99 nstep_return = 3 timesteps_per_proc = 50_000_000 train_interval = 4 target_interval = 8192 batch_size = 512 min_buffer_size = 20000 # Parse arguments parser = argparse.ArgumentParser( description='Process procgen training arguments.') parser.add_argument('--env_name', type=str, default='starpilot') parser.add_argument( '--distribution_mode', type=str, default='easy', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=1) parser.add_argument('--start_level', type=int, default=0) parser.add_argument('--test_worker_interval', type=int, default=0) parser.add_argument('--run_id', type=int, default=1) parser.add_argument('--gpus_id', type=str, default='') parser.add_argument('--level_setup', type=str, default='procgen', choices=["procgen", "oracle"]) parser.add_argument('--mix_mode', type=str, default='nomix', choices=['nomix', 'mixreg']) parser.add_argument('--mix_alpha', type=float, default=0.2) parser.add_argument('--use_l2reg', action='store_true') parser.add_argument('--data_aug', type=str, default='no_aug', choices=['no_aug', 'cutout_color', 'crop']) parser.add_argument('--PER', type=lambda x: bool(strtobool(x)), default=True, help='Whether to use PER') parser.add_argument('--num_envs', type=int, default=64) args = parser.parse_args() # Setup test worker comm = MPI.COMM_WORLD rank = comm.Get_rank() test_worker_interval = args.test_worker_interval is_test_worker = False if test_worker_interval > 0: is_test_worker = comm.Get_rank() % test_worker_interval == ( test_worker_interval - 1) mpi_rank_weight = 0 if is_test_worker else 1 num_envs = args.num_envs # Setup env specs if args.level_setup == "procgen": env_name = args.env_name num_levels = 0 if is_test_worker else args.num_levels start_level = args.start_level elif args.level_setup == "oracle": env_name = args.env_name num_levels = 0 start_level = args.start_level # Setup logger log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else [] logger.configure( dir=LOG_DIR + f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}', format_strs=format_strs) # Create env logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor(venv=venv, filename=None, keep_buf=100) # Setup Tensorflow logger.info("creating tf session") if args.gpus_id: gpus_id = [x.strip() for x in args.gpus_id.split(',')] os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank % len(gpus_id)] setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() # Setup Rainbow models logger.info("building models") online_net, target_net = rainbow_models( sess, venv.action_space.n, gym_space_vectorizer(venv.observation_space), min_val=REWARD_RANGE_FOR_C51[env_name][0], max_val=REWARD_RANGE_FOR_C51[env_name][1]) dqn = MpiDQN(online_net, target_net, discount=gamma, comm=comm, mpi_rank_weight=mpi_rank_weight, mix_mode=args.mix_mode, mix_alpha=args.mix_alpha, use_l2reg=args.use_l2reg, data_aug=args.data_aug) player = NStepPlayer(VecPlayer(venv, dqn.online_net), nstep_return) optimize = dqn.optimize(learning_rate=learning_rate) # Initialize and sync variables 
sess.run(tf.global_variables_initializer()) global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if comm.Get_size() > 1: sync_from_root(sess, global_variables, comm=comm) #pylint: disable=E110 # Training logger.info("training") if args.PER: dqn.train(num_steps=timesteps_per_proc, player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=train_interval, target_interval=target_interval, batch_size=batch_size, min_buffer_size=min_buffer_size) else: #set alpha and beta equal to 0 for uniform prioritization and no importance sampling dqn.train(num_steps=timesteps_per_proc, player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0, 0, epsilon=0.1), optimize_op=optimize, train_interval=train_interval, target_interval=target_interval, batch_size=batch_size, min_buffer_size=min_buffer_size)
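The PER branch above constructs PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), while the uniform branch passes alpha = beta = 0. The sketch below illustrates the standard proportional-prioritisation formulas these exponents control; it is an assumption about the usual PER scheme, not the library's exact internals:

import numpy as np

def per_probs_and_weights(priorities, alpha, beta, epsilon=0.1):
    # Sampling probability is proportional to (p_i + epsilon)^alpha; alpha=0 is uniform.
    scaled = (np.asarray(priorities) + epsilon) ** alpha
    probs = scaled / scaled.sum()
    # Importance-sampling weights (N * P(i))^-beta, normalised by their max; beta=0 disables them.
    weights = (len(priorities) * probs) ** (-beta)
    return probs, weights / weights.max()

print(per_probs_and_weights([2.0, 0.5, 0.1], alpha=0.5, beta=0.4))
print(per_probs_and_weights([2.0, 0.5, 0.1], alpha=0.0, beta=0.0))  # uniform, unit weights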