def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev, verbose=0):
    """
    get the actor update, with noise.

    :param actor: (str) the actor
    :param perturbed_actor: (str) the perturbed actor
    :param param_noise_stddev: (float) the std of the parameter noise
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :return: (TensorFlow Operation) the update function
    """
    # TODO: simplify this to this:
    # assert len(actor.vars) == len(perturbed_actor.vars)
    # assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars)

    assert len(tf_util.get_globals_vars(actor)) == len(tf_util.get_globals_vars(perturbed_actor))
    assert len([var for var in tf_util.get_trainable_vars(actor) if 'LayerNorm' not in var.name]) == \
        len([var for var in tf_util.get_trainable_vars(perturbed_actor) if 'LayerNorm' not in var.name])

    updates = []
    for var, perturbed_var in zip(tf_util.get_globals_vars(actor), tf_util.get_globals_vars(perturbed_actor)):
        if var in [var for var in tf_util.get_trainable_vars(actor) if 'LayerNorm' not in var.name]:
            if verbose >= 2:
                logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var,
                                     var + tf.random_normal(tf.shape(var), mean=0.,
                                                            stddev=param_noise_stddev)))
        else:
            if verbose >= 2:
                logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var, var))
    assert len(updates) == len(tf_util.get_globals_vars(actor))
    return tf.group(*updates)
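# Not shown above: how `param_noise_stddev` itself is usually chosen. In parameter-noise
# DDPG it is typically adapted so that the action-space distance between the perturbed and
# unperturbed actor stays near a target value. A minimal, self-contained sketch follows;
# the function and parameter names (`adapt_param_noise_stddev`, `desired_action_stddev`,
# `adoption_coefficient`) are illustrative and not taken from the code above.
import numpy as np

def adapt_param_noise_stddev(current_stddev, actions, perturbed_actions,
                             desired_action_stddev=0.2, adoption_coefficient=1.01):
    """Return an updated parameter-noise stddev given a batch of actions."""
    # Mean Euclidean distance between unperturbed and perturbed actions.
    distance = np.sqrt(np.mean(np.square(np.asarray(actions) - np.asarray(perturbed_actions))))
    if distance > desired_action_stddev:
        # Perturbation is too strong in action space: shrink the parameter noise.
        return current_stddev / adoption_coefficient
    # Perturbation is too weak: grow the parameter noise.
    return current_stddev * adoption_coefficient

# Example: distance 0.5 exceeds the 0.2 target, so the stddev shrinks slightly.
print(adapt_param_noise_stddev(0.1, np.zeros((4, 2)), 0.5 * np.ones((4, 2))))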
def _setup_critic_optimizer(self):
    """
    setup the optimizer for the critic
    """
    if self.verbose >= 2:
        logger.info('setting up critic optimizer')
    normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                   self.return_range[0], self.return_range[1])
    self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
    if self.critic_l2_reg > 0.:
        critic_reg_vars = [var for var in tf_util.get_trainable_vars('model/qf/')
                           if 'bias' not in var.name and 'output' not in var.name and 'b' not in var.name]
        if self.verbose >= 2:
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
        critic_reg = tc.layers.apply_regularization(
            tc.layers.l2_regularizer(self.critic_l2_reg),
            weights_list=critic_reg_vars
        )
        self.critic_loss += critic_reg

    critic_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/qf/')]
    critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
    if self.verbose >= 2:
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
    self.critic_grads = tf_util.flatgrad(self.critic_loss, tf_util.get_trainable_vars('model/qf/'),
                                         clip_norm=self.clip_norm)
    self.critic_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/qf/'),
                                    beta1=0.9, beta2=0.999, epsilon=1e-08)
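# The `normalize` helper used above is defined elsewhere. For readability of the critic
# loss, here is a hedged sketch of the usual running-mean-std whitening it performs;
# the real helper may differ slightly.
def normalize_sketch(tensor, stats):
    """Whiten `tensor` with a running-statistics object exposing .mean and .std."""
    if stats is None:
        return tensor
    return (tensor - stats.mean) / stats.std

def denormalize_sketch(tensor, stats):
    """Inverse of normalize_sketch: map a whitened value back to the original scale."""
    if stats is None:
        return tensor
    return tensor * stats.std + stats.mean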
def _setup_critic_optimizer(self):
    """
    setup the optimizer for the critic
    """
    if self.verbose >= 2:
        logger.info('setting up critic optimizer')

    ### BSS LOSS ###
    all_vars = [v for v in tf.global_variables()]
    self.l2_loss = 0.0
    for var in all_vars:
        if 'qf' in var.name:
            self.l2_loss += tf.losses.mean_squared_error(tf.zeros(var.shape), var)
    _, qf_features = self.policy_tf.feature_matrices()
    singular_qf = tf.linalg.svd(qf_features, compute_uv=False)
    self.bss_loss = tf.reduce_sum(tf.square(singular_qf[-1]))
    ### BSS LOSS ###

    normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                   self.return_range[0], self.return_range[1])
    self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) + \
        self.bss_coef * self.bss_loss + self.l2_coef * self.l2_loss
    if self.critic_l2_reg > 0.:
        critic_reg_vars = [var for var in tf_util.get_trainable_vars('model/qf/')
                           if 'bias' not in var.name and 'qf_output' not in var.name and 'b' not in var.name]
        if self.verbose >= 2:
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
        critic_reg = tc.layers.apply_regularization(
            tc.layers.l2_regularizer(self.critic_l2_reg),
            weights_list=critic_reg_vars)
        self.critic_loss += critic_reg

    critic_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/qf/')]
    critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
    if self.verbose >= 2:
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
    self.critic_grads = tf_util.flatgrad(self.critic_loss, tf_util.get_trainable_vars('model/qf/'),
                                         clip_norm=self.clip_norm)
    self.critic_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/qf/'),
                                    beta1=0.9, beta2=0.999, epsilon=1e-08)
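# The BSS term above penalizes the smallest singular value of the critic's feature matrix
# (Batch Spectral Shrinkage). A small standalone numpy sketch of what that term computes,
# independent of the class above; the `k` parameter is illustrative (the TF code uses k = 1).
import numpy as np

def bss_penalty(features, k=1):
    """Sum of squares of the k smallest singular values of a (batch, feature_dim) matrix."""
    singular_values = np.linalg.svd(features, compute_uv=False)  # sorted in descending order
    return float(np.sum(singular_values[-k:] ** 2))

rng = np.random.default_rng(0)
feats = rng.normal(size=(64, 16))
print(bss_penalty(feats))  # penalizes only the smallest singular value, as in the TF code above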
def _setup_target_network_updates(self):
    """
    set the target update operations
    """
    init_updates, soft_updates = get_target_updates(tf_util.get_trainable_vars('model/'),
                                                    tf_util.get_trainable_vars('target/'),
                                                    self.tau, self.verbose)
    self.target_init_updates = init_updates
    self.target_soft_updates = soft_updates
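# `get_target_updates` is defined elsewhere. As a hedged sketch of what such a helper
# typically builds (a hard copy for initialization and Polyak averaging for soft updates,
# assuming the two variable lists are aligned); the real helper may differ in details.
import tensorflow as tf

def get_target_updates_sketch(_vars, target_vars, tau, verbose=0):
    init_updates = []   # hard copy: target <- source (run once at initialization)
    soft_updates = []   # Polyak averaging: target <- (1 - tau) * target + tau * source
    assert len(_vars) == len(target_vars)
    for var, target_var in zip(_vars, target_vars):
        if verbose >= 2:
            print('  {} <- {}'.format(target_var.name, var.name))
        init_updates.append(tf.assign(target_var, var))
        soft_updates.append(tf.assign(target_var, (1. - tau) * target_var + tau * var))
    return tf.group(*init_updates), tf.group(*soft_updates)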
def _setup_actor_optimizer(self):
    """
    setup the optimizer for the actor
    """
    if self.verbose >= 2:
        logger.info('setting up actor optimizer')
    self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
    actor_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/pi/')]
    actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
    if self.verbose >= 2:
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
    self.actor_grads = tf_util.flatgrad(self.actor_loss, tf_util.get_trainable_vars('model/pi/'),
                                        clip_norm=self.clip_norm)
    self.actor_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/pi/'),
                                   beta1=0.9, beta2=0.999, epsilon=1e-08)
def get_vars(scope):
    """
    Alias for get_trainable_vars

    :param scope: (str)
    :return: [tf Variable]
    """
    return tf_util.get_trainable_vars(scope)
def setup_model(self):
    with SetVerbosity(self.verbose):
        for i in range(self.num_agents):
            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling)
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            # print(test_policy.type)
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            self.params = []
            print("AC SPC", self.action_space)
            for i in range(self.num_agents):
                with tf.variable_scope("agent" + str(i)):
                    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
                    act, _train_step, update_target, step_model = build_train(
                        q_func=partial(self.policy, **self.policy_kwargs),
                        ob_space=self.observation_space,
                        ac_space=self.action_space,
                        optimizer=optimizer,
                        gamma=self.gamma,
                        grad_norm_clipping=10,
                        param_noise=self.param_noise,
                        sess=self.sess,
                        full_tensorboard_log=False,  # self.full_tensorboard_log,
                        double_q=self.double_q)
                    self.act.append(act)
                    self._train_step.append(_train_step)
                    self.step_model.append(step_model)
                    self.proba_step.append(step_model.proba_step)
                    self.update_target.append(update_target)
                    self.params.extend(tf_util.get_trainable_vars("agent" + str(i) + "/deepq"))
            print(self.params)

            # Initialize the parameters and copy them to the target network.
            tf_util.initialize(self.sess)  # TODO: copy this file, make two versions of the algorithm.
            for i in range(self.num_agents):
                # TODO: Not sure, seems like the best thing to do is try using each agent's own target first.
                self.update_target[i](sess=self.sess)
def get_vars(scope):
    """
    Alias for get_trainable_vars

    :param scope: (str)
    :return: [tf Variable]
    """
    # prefix = tf.get_variable_scope().name.split('/')[0] + '/'
    # return tf_util.get_trainable_vars(prefix + scope)
    return tf_util.get_trainable_vars(scope)
def setup_model(self):
    with SetVerbosity(self.verbose):
        assert not isinstance(self.action_space, gym.spaces.Box), \
            "Error: DQN cannot output a gym.spaces.Box action space."

        # If the policy is wrapped in functools.partial (e.g. to disable dueling)
        # unwrap it to check the class type
        if isinstance(self.policy, partial):
            test_policy = self.policy.func
        else:
            test_policy = self.policy
        assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                   "an instance of DQNPolicy."

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.make_session(graph=self.graph)
            self._setup_learn(self.seed)

            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            # optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate, momentum=0.95, epsilon=0.01)

            self.act, self._train_step, self.update_target, self._train_phi_step, self.step_model, _ = \
                deepq_kpi.build_train(
                    q_func=partial(self.policy, **self.policy_kwargs),
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    kappa=self.kappa,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess
                )
            self.proba_step = self.step_model.proba_step
            self.params = tf_util.get_trainable_vars("deepq")

            @contextmanager
            def timed(msg):
                if self.verbose >= 1:
                    print(colorize(msg, color='magenta'))
                    start_time = time.time()
                    yield
                    print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                                   color='magenta'))
                else:
                    yield

            # Initialize the parameters and copy them to the target network.
            tf_util.initialize(self.sess)
            self.update_target(sess=self.sess)
            self.timed = timed

            self.summary = tf.summary.merge_all()
def setup_model(self):
    with SetVerbosity(self.verbose):
        self.num_action_streams = self.action_space.shape[0]
        # total number of network outputs for action branching, with one action dimension per branch
        self.num_actions = self.num_actions_pad * self.num_action_streams
        self.low = self.action_space.low
        self.high = self.action_space.high
        self.actions_range = np.subtract(self.high, self.low)

        if issubclass(self.policy, ActionBranching):
            self.bdq = True  # BDQ allows continuous output

        assert isinstance(self.action_space, gym.spaces.Box), \
            "Error: BDQ cannot output a gym.spaces.Discrete action space."

        # If the policy is wrapped in functools.partial (e.g. to disable dueling)
        # unwrap it to check the class type
        if isinstance(self.policy, partial):
            test_policy = self.policy.func
        else:
            test_policy = self.policy
        assert issubclass(test_policy, BDQPolicy), "Error: the input policy for the BDQ model must be " \
                                                   "an instance of BDQPolicy."

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            # optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

            self.act, self._train_step, self.update_target, self.step_model = build_train(
                q_func=partial(self.policy, **self.policy_kwargs),
                ob_space=self.observation_space,
                ac_space=self.action_space,
                num_actions=self.num_actions,
                num_action_streams=self.num_action_streams,
                batch_size=self.batch_size,
                gamma=self.gamma,
                grad_norm_clipping=self.grad_norm_clipping,
                optimizer_name="Adam",
                learning_rate=self.learning_rate,
                sess=self.sess,
                full_tensorboard_log=self.full_tensorboard_log,
                double_q=self.double_q
            )
            self.proba_step = self.step_model.proba_step
            self.params = tf_util.get_trainable_vars("bdq")

            # Initialize the parameters and copy them to the target network.
            tf_util.initialize(self.sess)
            self.update_target(sess=self.sess)

            self.summary = tf.summary.merge_all()
def _setup_actor_optimizer(self):
    """
    setup the optimizer for the actor
    """
    if self.verbose >= 2:
        logger.info('setting up actor optimizer')

    ### BSS LOSS ###
    all_vars = [v for v in tf.global_variables()]
    self.l2_loss = 0.0
    for var in all_vars:
        if 'pi' in var.name:
            self.l2_loss += tf.losses.mean_squared_error(tf.zeros(var.shape), var)
    pi_features, _ = self.policy_tf.feature_matrices()
    singular_pi = tf.linalg.svd(pi_features, compute_uv=False)
    self.bss_loss = tf.reduce_sum(tf.square(singular_pi[-1]))
    ### BSS LOSS ###

    self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) + \
        self.bss_coef * self.bss_loss + self.l2_coef * self.l2_loss

    actor_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/pi/')]
    actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
    if self.verbose >= 2:
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
    self.actor_grads = tf_util.flatgrad(self.actor_loss, tf_util.get_trainable_vars('model/pi/'),
                                        clip_norm=self.clip_norm)
    self.actor_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/pi/'),
                                   beta1=0.9, beta2=0.999, epsilon=1e-08)
def _setup_popart(self):
    """
    setup pop-art normalization of the critic output

    See https://arxiv.org/pdf/1602.07714.pdf for details.
    Pop-Art: "Preserving Outputs Precisely, while Adaptively Rescaling Targets".
    """
    self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
    new_std = self.ret_rms.std
    self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
    new_mean = self.ret_rms.mean

    self.renormalize_q_outputs_op = []
    for out_vars in [[var for var in tf_util.get_trainable_vars('model/qf/') if 'output' in var.name],
                     [var for var in tf_util.get_trainable_vars('target/qf/') if 'output' in var.name]]:
        assert len(out_vars) == 2  # weight and bias of the last layer
        weight, bias = out_vars
        assert 'kernel' in weight.name
        assert 'bias' in bias.name
        assert weight.get_shape()[-1] == 1
        assert bias.get_shape()[-1] == 1
        self.renormalize_q_outputs_op += [weight.assign(weight * self.old_std / new_std)]
        self.renormalize_q_outputs_op += [bias.assign((bias * self.old_std + self.old_mean - new_mean) / new_std)]
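# Illustrative Pop-Art invariance check (plain numpy, not part of the class above): rescaling
# the last layer with  w <- w * old_std / new_std  and  b <- (b * old_std + old_mean - new_mean) / new_std
# leaves the de-normalized critic output  q = normalized_output * std + mean  unchanged,
# which is exactly what the renormalization ops above rely on.
import numpy as np

features = np.array([0.3, -1.2, 2.0])
w, b = np.array([0.5, -0.1, 0.2]), 0.05
old_mean, old_std = 1.0, 2.0
new_mean, new_std = 1.5, 3.0

q_before = (features @ w + b) * old_std + old_mean

w2 = w * old_std / new_std
b2 = (b * old_std + old_mean - new_mean) / new_std
q_after = (features @ w2 + b2) * new_std + new_mean

assert np.isclose(q_before, q_after)  # identical de-normalized Q-value before and after rescaling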
def setup_model(self):
    with SetVerbosity(self.verbose):
        assert not isinstance(self.action_space, gym.spaces.Box), \
            "Error: DQN cannot output a gym.spaces.Box action space."

        # If the policy is wrapped in functools.partial (e.g. to disable dueling)
        # unwrap it to check the class type
        if isinstance(self.policy, partial):
            test_policy = self.policy.func
        else:
            test_policy = self.policy
        assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                   "an instance of DQNPolicy."

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            if self.use_rmsprop:
                optimizer = tf.train.RMSPropOptimizer(
                    learning_rate=self.learning_rate,
                    decay=self.rmsprop_alpha,
                    epsilon=self.rmsprop_epsilon,
                    centered=True
                )
            else:
                optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

            self.act, self._train_step, self.update_target, self.step_model = build_train(
                q_func=partial(self.policy, **self.policy_kwargs),
                ob_space=self.observation_space,
                ac_space=self.action_space,
                optimizer=optimizer,
                gamma=self.gamma,
                grad_norm_clipping=10,
                param_noise=self.param_noise,
                sess=self.sess,
                full_tensorboard_log=self.full_tensorboard_log,
                double_q=self.double_q
            )
            self.proba_step = self.step_model.proba_step
            self.params = tf_util.get_trainable_vars("deepq")

            # Initialize the parameters and copy them to the target network.
            tf_util.initialize(self.sess)
            self.update_target(sess=self.sess)

            self.summary = tf.summary.merge_all()
def init_network_continuous(self, input, name):
    with tf.variable_scope(name):
        model = tf.layers.dense(input, 8, activation=tf.nn.relu)
        model = tf.layers.dense(model, self.action_space.shape[0], activation=tf.nn.sigmoid)

        self._proba_distribution, _, _ = \
            self._pdtype.proba_distribution_from_latent(model, model, init_scale=0.01)
        self.action_ph = self._pdtype.sample_placeholder([None], name='action_ph')
        self._policy_proba = [self._proba_distribution.mean, self._proba_distribution.std]
        self.params = tf_util.get_trainable_vars('net')
        self.pg_loss = tf.gradients(self._proba_distribution.neglogp(self.action_ph), self.params)
    return model
def setup_model(self): with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO2 model must be " \ "an instance of common.policies.ActorCriticPolicy." self.n_batch = self.n_envs * self.n_steps n_cpu = multiprocessing.cpu_count() if sys.platform == 'darwin': n_cpu //= 2 self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(num_cpu=n_cpu, graph=self.graph) n_batch_step = None n_batch_train = None if self.retrain_victim: # assert is mlp policy if self.env_name in ['multicomp/YouShallNotPassHumans-v0']: act_model = MlpPolicyValue( scope="victim_policy", reuse=False, ob_space=self.observation_space, ac_space=self.action_space, sess=self.sess, hiddens=[64, 64], normalize=self.norm_victim) with tf.variable_scope( "train_model", reuse=True, custom_getter=tf_util.outer_scope_getter( "train_model")): train_model = MlpPolicyValue( scope="victim_policy", reuse=True, ob_space=self.observation_space, ac_space=self.action_space, sess=self.sess, hiddens=[64, 64], normalize=self.norm_victim) else: if issubclass(self.policy, RecurrentActorCriticPolicy): assert self.n_envs % self.nminibatches == 0, "For recurrent policies, " \ "the number of environments run in parallel should be a multiple of nminibatches." n_batch_step = self.n_envs n_batch_train = self.n_batch // self.nminibatches act_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False, **self.policy_kwargs) with tf.variable_scope( "train_model", reuse=True, custom_getter=tf_util.outer_scope_getter( "train_model")): train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs // self.nminibatches, self.n_steps, n_batch_train, reuse=True, **self.policy_kwargs) if self.black_box_att: with tf.variable_scope("mimic_model", reuse=False): self.mimic_model = RL_model(input_shape=self.observation_space.shape, \ out_shape=self.action_space.shape) self.mimic_model.load(self.mimic_model_path) with tf.variable_scope("loss", reuse=False): if self.retrain_victim: self.action_ph = tf.placeholder( shape=[None, self.action_space.shape[0]], dtype=tf.float32, name="action_ph") else: self.action_ph = train_model.pdtype.sample_placeholder( [None], name="action_ph") self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph") self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph") self.old_neglog_pac_ph = tf.placeholder( tf.float32, [None], name="old_neglog_pac_ph") self.old_vpred_ph = tf.placeholder(tf.float32, [None], name="old_vpred_ph") self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") self.clip_range_ph = tf.placeholder(tf.float32, [], name="clip_range_ph") # Xian added self.action_opp_next_ph = tf.placeholder( dtype=tf.float32, shape=self.action_ph.shape, name="action_opp_next_ph") self.obs_opp_next_ph = tf.placeholder( dtype=tf.float32, shape=train_model.obs_ph.shape, name="obs_opp_next_ph") self.stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") self.ratio_ph = tf.placeholder( tf.float32, [], name="change_action_state_ratio_ph") action_ph_noise = train_model.deterministic_action with tf.variable_scope("statem", reuse=True): obs_oppo_predict, obs_oppo_noise_predict = modeling_state( self.action_ph, action_ph_noise, train_model.obs_ph) if not self.masking_attention: self.attention = tf.placeholder(dtype=tf.float32, shape=[None], name="attention_ph") else: self.attention = tf.placeholder( dtype=tf.float32, shape=[None, 
train_model.obs_ph.shape[1]], name="attention_ph") obs_oppo_noise_predict = tf.multiply( obs_oppo_noise_predict, self.attention) if not self.black_box_att: with tf.variable_scope("victim_param", reuse=tf.AUTO_REUSE): action_opp_mal_noise, _ = mlp_policy(obs_oppo_noise_predict, self.stochastic_ph, self.env.observation_space, \ self.env.action_space, [64, 64], True) else: # load the pretrained victim model with tf.variable_scope("victim_param", reuse=tf.AUTO_REUSE): victim_model = RL_func( self.observation_space.shape[0], self.action_space.shape[0]) action_opp_mal_noise = victim_model( obs_oppo_noise_predict) if not self.masking_attention: # update 2019/07/19, if not making, attention is only weighting loss # on action along with time # oppo's action change # change into L infinity norm self.change_opp_action_mse = tf.reduce_mean( tf.abs( tf.multiply( action_opp_mal_noise - self.action_opp_next_ph, tf.expand_dims(self.attention, axis=-1)))) else: self.change_opp_action_mse = tf.reduce_mean( tf.abs(action_opp_mal_noise - self.action_opp_next_ph)) # add change_state_mse self.change_state_mse = self.ratio_ph * tf.reduce_mean( tf.abs(obs_oppo_noise_predict - self.obs_opp_next_ph)) self.change_mse = self.change_opp_action_mse - self.change_state_mse # Prediction error on oppo's next observation # change into the L infinity norm # L(infinity) = max(0, ||l1 -l2|| - c)^2 self.state_modeling_mse = tf.reduce_mean( tf.square( tf.math.maximum( tf.abs(obs_oppo_predict - self.obs_opp_next_ph) - 1, 0))) neglogpac = train_model.proba_distribution.neglogp( self.action_ph) self.entropy = tf.reduce_mean( train_model.proba_distribution.entropy()) if self.retrain_victim: vpred = tf.reshape(train_model.value_flat, [-1]) else: vpred = train_model.value_flat vpredclipped = self.old_vpred_ph + tf.clip_by_value( train_model.value_flat - self.old_vpred_ph, -self.clip_range_ph, self.clip_range_ph) vf_losses1 = tf.square(vpred - self.rewards_ph) vf_losses2 = tf.square(vpredclipped - self.rewards_ph) self.vf_loss = .5 * tf.reduce_mean( tf.maximum(vf_losses1, vf_losses2)) ratio = tf.exp(self.old_neglog_pac_ph - neglogpac) pg_losses = -self.advs_ph * ratio pg_losses2 = -self.advs_ph * tf.clip_by_value( ratio, 1.0 - self.clip_range_ph, 1.0 + self.clip_range_ph) self.pg_loss = tf.reduce_mean( tf.maximum(pg_losses, pg_losses2)) self.approxkl = .5 * tf.reduce_mean( tf.square(neglogpac - self.old_neglog_pac_ph)) self.clipfrac = tf.reduce_mean( tf.cast( tf.greater(tf.abs(ratio - 1.0), self.clip_range_ph), tf.float32)) loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef + \ self.hyper_weights[1] * self.change_mse if self.black_box_att: if not self.pretrained_mimic: loss_mimic = self.hyper_weights[ 3] * self.state_modeling_mse else: loss_mimic = self.hyper_weights[ 3] * self.state_modeling_mse else: # if its' white box attack, then do not model the action output loss_mimic = self.hyper_weights[ 3] * self.state_modeling_mse tf.summary.scalar('entropy_loss', self.entropy) tf.summary.scalar('policy_gradient_loss', self.pg_loss) tf.summary.scalar('value_function_loss', self.vf_loss) tf.summary.scalar('approximate_kullback-leibler', self.approxkl) tf.summary.scalar('clip_factor', self.clipfrac) tf.summary.scalar('loss', loss) tf.summary.scalar('loss_mimic', loss_mimic) tf.summary.scalar( '_change oppo action mse', self.hyper_weights[1] * self.change_opp_action_mse) tf.summary.scalar('_predict state mse', self.state_modeling_mse) # add ppo loss tf.summary.scalar( '_PPO loss', loss - self.hyper_weights[1] * 
self.change_opp_action_mse) if self.retrain_victim: params = tf_util.get_trainable_vars("victim_policy") else: params = tf_util.get_trainable_vars("model") if self.full_tensorboard_log: for var in params: tf.summary.histogram(var.name, var) self.params = [ params, tf_util.get_trainable_vars("loss/statem") ] grads = tf.gradients(loss, self.params[0]) if self.max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm( grads, self.max_grad_norm) grads = list(zip(grads, self.params[0])) grads_mimic = tf.gradients(loss_mimic, self.params[1]) if self.max_grad_norm is not None: grads_mimic, _grad_norm_mimic = tf.clip_by_global_norm( grads_mimic, self.max_grad_norm) grads_mimic = list(zip(grads_mimic, self.params[1])) trainer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph, epsilon=1e-5) self._train = trainer.apply_gradients(grads) trainer_mimic = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph, epsilon=1e-5) self._train_mimic = trainer_mimic.apply_gradients(grads_mimic) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', '_change_opp_a_loss', '_s_modeling_loss', '_a_modeling_loss' ] with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) tf.summary.scalar('clip_range', tf.reduce_mean(self.clip_range_ph)) tf.summary.scalar('old_neglog_action_probabilty', tf.reduce_mean(self.old_neglog_pac_ph)) tf.summary.scalar('old_value_pred', tf.reduce_mean(self.old_vpred_ph)) # add attention onto the final results tf.summary.scalar( 'att_hyp', self.hyper_weights[1] * tf.reduce_mean(self.attention)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.histogram('learning_rate', self.learning_rate_ph) tf.summary.histogram('advantage', self.advs_ph) tf.summary.histogram('clip_range', self.clip_range_ph) tf.summary.histogram('old_neglog_action_probabilty', self.old_neglog_pac_ph) tf.summary.histogram('old_value_pred', self.old_vpred_ph) if tf_util.is_image(self.observation_space): tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) self.train_model = train_model self.act_model = act_model self.step = act_model.step self.proba_step = act_model.proba_step self.value = act_model.value self.initial_state = act_model.initial_state tf.global_variables_initializer().run(session=self.sess) # pylint: disable=E1101 # load the pretrained_value if self.retrain_victim: env_path = get_zoo_path(self.env_name, tag=2) param = load_from_file(param_pkl_path=env_path) ret_variable = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope="victim_policy/retfilter") obs_variable = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope="victim_policy/obsfilter") variables = ret_variable + obs_variable + tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope="victim_policy") setFromFlat(variables, param, self.sess) if True: victim_variable = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, "loss/victim_param") param = load_from_file(param_pkl_path=self.env_path) setFromFlat(victim_variable, param, sess=self.sess) self.summary = tf.summary.merge_all()
def setup_model(self): with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACER model must be " \ "an instance of common.policies.ActorCriticPolicy." if isinstance(self.action_space, Discrete): self.n_act = self.action_space.n continuous = False elif isinstance(self.action_space, Box): # self.n_act = self.action_space.shape[-1] # continuous = True raise NotImplementedError("WIP: Acer does not support Continuous actions yet.") else: raise ValueError("Error: ACER does not work with {} actions space.".format(self.action_space)) self.n_batch = self.n_envs * self.n_steps self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.set_random_seed(self.seed) n_batch_step = None if issubclass(self.policy, RecurrentActorCriticPolicy): n_batch_step = self.n_envs n_batch_train = self.n_envs * (self.n_steps + 1) step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False, **self.policy_kwargs) self.params = tf_util.get_trainable_vars("model") with tf.variable_scope("train_model", reuse=True, custom_getter=tf_util.outer_scope_getter("train_model")): train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, self.n_steps + 1, n_batch_train, reuse=True, **self.policy_kwargs) with tf.variable_scope("moving_average"): # create averaged model ema = tf.train.ExponentialMovingAverage(self.alpha) ema_apply_op = ema.apply(self.params) def custom_getter(getter, name, *args, **kwargs): name = name.replace("polyak_model/", "") val = ema.average(getter(name, *args, **kwargs)) return val with tf.variable_scope("polyak_model", reuse=True, custom_getter=custom_getter): self.polyak_model = polyak_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, self.n_steps + 1, self.n_envs * (self.n_steps + 1), reuse=True, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): self.done_ph = tf.placeholder(tf.float32, [self.n_batch]) # dones self.reward_ph = tf.placeholder(tf.float32, [self.n_batch]) # rewards, not returns self.mu_ph = tf.placeholder(tf.float32, [self.n_batch, self.n_act]) # mu's self.action_ph = train_model.pdtype.sample_placeholder([self.n_batch]) self.learning_rate_ph = tf.placeholder(tf.float32, []) eps = 1e-6 # Notation: (var) = batch variable, (var)s = sequence variable, # (var)_i = variable index by action at step i # shape is [n_envs * (n_steps + 1)] if continuous: value = train_model.value_flat else: value = tf.reduce_sum(train_model.policy_proba * train_model.q_value, axis=-1) rho, rho_i_ = None, None if continuous: action_ = strip(train_model.proba_distribution.sample(), self.n_envs, self.n_steps) distribution_f = tf.contrib.distributions.MultivariateNormalDiag( loc=strip(train_model.proba_distribution.mean, self.n_envs, self.n_steps), scale_diag=strip(train_model.proba_distribution.logstd, self.n_envs, self.n_steps)) f_polyak = tf.contrib.distributions.MultivariateNormalDiag( loc=strip(polyak_model.proba_distribution.mean, self.n_envs, self.n_steps), scale_diag=strip(polyak_model.proba_distribution.logstd, self.n_envs, self.n_steps)) f_i = distribution_f.prob(self.action_ph) f_i_ = distribution_f.prob(action_) f_polyak_i = f_polyak.prob(self.action_ph) phi_i = strip(train_model.proba_distribution.mean, self.n_envs, self.n_steps) q_value = strip(train_model.value_fn, self.n_envs, self.n_steps) q_i = q_value[:, 0] rho_i = 
tf.reshape(f_i, [-1, 1]) / (self.mu_ph + eps) rho_i_ = tf.reshape(f_i_, [-1, 1]) / (self.mu_ph + eps) qret = q_retrace(self.reward_ph, self.done_ph, q_i, value, tf.pow(rho_i, 1 / self.n_act), self.n_envs, self.n_steps, self.gamma) else: # strip off last step # f is a distribution, chosen to be Gaussian distributions # with fixed diagonal covariance and mean \phi(x) # in the paper distribution_f, f_polyak, q_value = \ map(lambda variables: strip(variables, self.n_envs, self.n_steps), [train_model.policy_proba, polyak_model.policy_proba, train_model.q_value]) # Get pi and q values for actions taken f_i = get_by_index(distribution_f, self.action_ph) f_i_ = distribution_f phi_i = distribution_f f_polyak_i = f_polyak q_i = get_by_index(q_value, self.action_ph) # Compute ratios for importance truncation rho = distribution_f / (self.mu_ph + eps) rho_i = get_by_index(rho, self.action_ph) # Calculate Q_retrace targets qret = q_retrace(self.reward_ph, self.done_ph, q_i, value, rho_i, self.n_envs, self.n_steps, self.gamma) # Calculate losses # Entropy entropy = tf.reduce_sum(train_model.proba_distribution.entropy()) # Policy Gradient loss, with truncated importance sampling & bias correction value = strip(value, self.n_envs, self.n_steps, True) # check_shape([qret, value, rho_i, f_i], [[self.n_envs * self.n_steps]] * 4) # check_shape([rho, distribution_f, q_value], [[self.n_envs * self.n_steps, self.n_act]] * 2) # Truncated importance sampling adv = qret - value log_f = tf.log(f_i + eps) # [n_envs * n_steps] gain_f = log_f * tf.stop_gradient(adv * tf.minimum(self.correction_term, rho_i)) loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = (q_value - tf.reshape(value, [self.n_envs * self.n_steps, 1])) # [n_envs * n_steps, n_act] # check_shape([adv_bc, log_f_bc], [[self.n_envs * self.n_steps, self.n_act]] * 2) if continuous: gain_bc = tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (self.correction_term / (rho_i_ + eps))) * f_i_) else: log_f_bc = tf.log(f_i_ + eps) # / (f_old + eps) gain_bc = tf.reduce_sum(log_f_bc * tf.stop_gradient( adv_bc * tf.nn.relu(1.0 - (self.correction_term / (rho + eps))) * f_i_), axis=1) # IMP: This is sum, as expectation wrt f loss_bc = -tf.reduce_mean(gain_bc) loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[self.n_envs * self.n_steps]] * 2) explained_variance = q_explained_variance(tf.reshape(q_i, [self.n_envs, self.n_steps]), tf.reshape(qret, [self.n_envs, self.n_steps])) loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) loss = loss_policy + self.q_coef * loss_q - self.ent_coef * entropy tf.summary.scalar('entropy_loss', entropy) tf.summary.scalar('policy_gradient_loss', loss_policy) tf.summary.scalar('value_function_loss', loss_q) tf.summary.scalar('loss', loss) norm_grads_q, norm_grads_policy, avg_norm_grads_f = None, None, None avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj = None, None, None, None if self.trust_region: # [n_envs * n_steps, n_act] grad = tf.gradients(- (loss_policy - self.ent_coef * entropy) * self.n_steps * self.n_envs, phi_i) # [n_envs * n_steps, n_act] # Directly computed gradient of KL divergence wrt f kl_grad = - f_polyak_i / (f_i_ + eps) k_dot_g = tf.reduce_sum(kl_grad * grad, axis=-1) adj = tf.maximum(0.0, (tf.reduce_sum(kl_grad * grad, axis=-1) - self.delta) / ( tf.reduce_sum(tf.square(kl_grad), axis=-1) + eps)) # [n_envs * n_steps] # Calculate stats (before doing 
adjustment) for logging. avg_norm_k = avg_norm(kl_grad) avg_norm_g = avg_norm(grad) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) grad = grad - tf.reshape(adj, [self.n_envs * self.n_steps, 1]) * kl_grad # These are turst region adjusted gradients wrt f ie statistics of policy pi grads_f = -grad / (self.n_envs * self.n_steps) grads_policy = tf.gradients(f_i_, self.params, grads_f) grads_q = tf.gradients(loss_q * self.q_coef, self.params) grads = [gradient_add(g1, g2, param, verbose=self.verbose) for (g1, g2, param) in zip(grads_policy, grads_q, self.params)] avg_norm_grads_f = avg_norm(grads_f) * (self.n_steps * self.n_envs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, self.params) norm_grads = None if self.max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm(grads, self.max_grad_norm) grads = list(zip(grads, self.params)) with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('rewards', tf.reduce_mean(self.reward_ph)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate)) tf.summary.scalar('advantage', tf.reduce_mean(adv)) tf.summary.scalar('action_probability', tf.reduce_mean(self.mu_ph)) if self.full_tensorboard_log: tf.summary.histogram('rewards', self.reward_ph) tf.summary.histogram('learning_rate', self.learning_rate) tf.summary.histogram('advantage', adv) tf.summary.histogram('action_probability', self.mu_ph) if tf_util.is_image(self.observation_space): tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.rprop_alpha, epsilon=self.rprop_epsilon) _opt_op = trainer.apply_gradients(grads) # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_opt_op]): _train = tf.group(ema_apply_op) # Ops/Summaries to run, and their names for logging assert norm_grads is not None run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, explained_variance, norm_grads] names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads'] if self.trust_region: self.run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj] self.names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj'] self.train_model = train_model self.step_model = step_model self.step = step_model.step self.proba_step = step_model.proba_step self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=self.sess) self.summary = tf.summary.merge_all()
def setup_model(self): # prevent import loops from stable_baselines.gail.adversary import TransitionClassifier with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \ "an instance of common.policies.ActorCriticPolicy." self.nworkers = MPI.COMM_WORLD.Get_size() self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) if self.using_gail: self.reward_giver = TransitionClassifier(self.observation_space, self.action_space, self.hidden_size_adversary, entcoeff=self.adversary_entcoeff) # Penalty related variable with tf.variable_scope('penalty'): cur_cost_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # episodic cost param_init = np.log(max(np.exp(self.penalty_init) - 1, 1e-8)) penalty_param = tf.get_variable('penalty_param', initializer=float(param_init), trainable=True, dtype=tf.float32) penalty = tf.nn.softplus(penalty_param) penalty_loss = tf.reduce_mean(-penalty_param * (cur_cost_ph - self.cost_lim)) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.variable_scope("oldpi", reuse=False): old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # # Network for safety value function # with tf.variable_Scope("vc",reuse=False): # self.cost_value = MLPValue(self.sess, self.observation_spacem, self.n_envs, 1, None) with tf.variable_scope("loss", reuse=False): atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return catarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target cost advantage function cret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical cost observation = self.policy_pi.obs_ph action = self.policy_pi.pdtype.sample_placeholder([None]) kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = self.entcoeff * meanent vferr = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret)) vcerr = tf.reduce_mean(tf.square(self.policy_pi.vcf_flat - cret)) # advantage * pnew / pold ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) - old_policy.proba_distribution.logp(action)) surrgain = tf.reduce_mean(ratio * atarg) # Surrogate for cost function surrcost = tf.reduce_mean(ratio * catarg) optimgain = surrgain + entbonus # Include surr_cost in pi_objective optimgain -= penalty * surrcost optimgain /= (1 + penalty) # # Loss function for pi is negative of pi_objective # optimgain = -optimgain # Should we?? 
losses = [optimgain, meankl, entbonus, surrgain, meanent, surrcost] self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy", "surrcost"] dist = meankl all_var_list = tf_util.get_trainable_vars("model") var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name and "/vcf" not in v.name] # policy parameters vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vcf" not in v.name] # value parameters vcf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vf" not in v.name] # cost value parameters self.get_flat = tf_util.GetFlat(var_list, sess=self.sess) self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: var_size = tf_util.intprod(shape) tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape)) start += var_size gvp = tf.add_n([tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 # Fisher vector products fvp = tf_util.flatgrad(gvp, var_list) tf.summary.scalar('penalty_loss', penalty_loss) tf.summary.scalar('entropy_loss', meanent) tf.summary.scalar('policy_gradient_loss', optimgain) tf.summary.scalar('value_function_loss', surrgain) tf.summary.scalar('constraint_cost_function_loss', surrcost) tf.summary.scalar('approximate_kullback-leibler', meankl) tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent + surrcost + penalty_loss) self.assign_old_eq_new = \ tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model"))]) self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg], losses) self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg, catarg], fvp) # Why need all inputs? Might for implementation easiness # self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret], # tf_util.flatgrad(vferr, vf_var_list)) # Why need old_policy.obs_ph? 
Doesn't seem to be used # self.compute_vcflossandgrad = tf_util.function([observation, old_policy.obs_ph, cret], # tf_util.flatgrad(vcerr, vcf_var_list)) self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret, cret], [tf_util.flatgrad(vferr, vf_var_list), tf_util.flatgrad(vcerr, vcf_var_list)]) self.compute_lagrangiangrad = tf_util.function([cur_cost_ph], tf_util.flatgrad(penalty_loss, [penalty_param])) @contextmanager def timed(msg): if self.rank == 0 and self.verbose >= 1: print(colorize(msg, color='magenta')) start_time = time.time() yield print(colorize("done in {:.3f} seconds".format((time.time() - start_time)), color='magenta')) else: yield def allmean(arr): assert isinstance(arr, np.ndarray) out = np.empty_like(arr) MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) out /= self.nworkers return out tf_util.initialize(sess=self.sess) th_init = self.get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) self.set_from_flat(th_init) with tf.variable_scope("Adam_mpi", reuse=False): self.vfadam = MpiAdam(vf_var_list, sess=self.sess) if self.using_gail: self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess) self.d_adam.sync() self.vfadam.sync() # optimizer for constraint costs value function self.vcadam = MpiAdam(vcf_var_list, sess=self.sess) self.vcadam.sync() # optimizer for lagragian value of safe RL self.penaltyadam = MpiAdam([penalty_param], sess=self.sess) self.penaltyadam.sync() with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) tf.summary.scalar('discounted_costs', tf.reduce_mean(cret)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize)) tf.summary.scalar('advantage', tf.reduce_mean(atarg)) tf.summary.scalar('cost_advantage', tf.reduce_mean(catarg)) tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', ret) tf.summary.histogram('discounted_rewards', cret) tf.summary.histogram('learning_rate', self.vf_stepsize) tf.summary.histogram('penalty_learning_rate', self.penalty_lr) tf.summary.histogram('advantage', atarg) tf.summary.histogram('cost_advantage', catarg) tf.summary.histogram('kl_clip_range', self.max_kl) if tf_util.is_image(self.observation_space): tf.summary.image('observation', observation) else: tf.summary.histogram('observation', observation) self.timed = timed self.allmean = allmean self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi") if self.using_gail: self.params.extend(self.reward_giver.get_trainable_variables()) self.summary = tf.summary.merge_all() self.compute_lossandgrad = \ tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg, ret, cret, cur_cost_ph], [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
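# Hedged sketch of the Lagrange-multiplier update built in the safety-constrained setup above:
# the penalty is parameterized as softplus(penalty_param) so it stays positive, and minimizing
# penalty_loss = -penalty_param * (episode_cost - cost_lim) raises the penalty when the observed
# cost exceeds the limit and lowers it otherwise. Plain SGD is used here for illustration,
# whereas the code above uses MpiAdam; the learning rate and the numbers are illustrative.
import numpy as np

def softplus(x):
    return np.log1p(np.exp(x))

def update_penalty_param(penalty_param, episode_cost, cost_lim, penalty_lr=5e-2):
    grad = -(episode_cost - cost_lim)          # d(penalty_loss) / d(penalty_param)
    return penalty_param - penalty_lr * grad   # gradient descent on penalty_loss

param = 0.0
for cost in [30.0, 28.0, 26.0]:                # costs above a limit of 25 -> penalty grows
    param = update_penalty_param(param, cost, cost_lim=25.0)
print(softplus(param))                         # effective penalty coefficient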
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.support = tf.constant(np.arange(self.v_min, self.v_max + 1e-6, self.delta), dtype=tf.float32) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = self.target_policy.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") self.projection_ph = tf.placeholder(tf.float32, (None, self.n_spt), name="v_projection") self.q_projection_ph = tf.placeholder(tf.float32, (None, self.n_spt), name="q_projection") with tf.variable_scope("model", reuse=False): # Create the policy # first return value corresponds to deterministic actions # policy_out corresponds to stochastic actions, used for training # logp_pi is the log probability of actions taken by the policy self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. 
qf1_distr, qf2_distr, value_fn_distr = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True) qf1_pi_distr, qf2_pi_distr, _ = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True) # Target entropy is used when learning the entropy coefficient if self.target_entropy == 'auto': # automatically set target entropy if needed self.target_entropy = -np.prod(self.action_space.shape).astype(np.float32) else: # Force conversion # this will also throw an error for unexpected string self.target_entropy = float(self.target_entropy) # The entropy coefficient or entropy can be learned automatically # see Automating Entropy Adjustment for Maximum Entropy RL section # of https://arxiv.org/abs/1812.05905 if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'): # Default initial value of ent_coef when learned init_value = 1.0 if '_' in self.ent_coef: init_value = float(self.ent_coef.split('_')[1]) assert init_value > 0., "The initial value of ent_coef must be greater than 0" self.log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32, initializer=np.log(init_value).astype(np.float32)) self.ent_coef = tf.exp(self.log_ent_coef) else: # Force conversion to float # this will throw an error if a malformed string (different from 'auto') # is passed self.ent_coef = float(self.ent_coef) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target_distr = self.target_policy.make_critics(self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target_distr = value_target_distr with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) # compute qf_pi, qf2_pi with pdf min_qf_pi_distr = tf.where(tf.less(tf.reduce_mean(qf1_pi_distr * self.support), tf.reduce_mean(qf2_pi_distr * self.support)), qf1_pi_distr, qf2_pi_distr) min_qf_pi = tf.reduce_mean(tf.reduce_sum(min_qf_pi_distr * self.support, axis=-1)) self.min_qf_pi = min_qf_pi q_backup_op = tf.stop_gradient( self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.support ) q_backup_op = tf.clip_by_value(q_backup_op, self.v_min, self.v_max) self.q_backup_op = q_backup_op qf1_loss = -tf.reduce_mean(tf.log(qf1_distr + 1e-12) * tf.stop_gradient(self.projection_ph)) qf2_loss = -tf.reduce_mean(tf.log(qf2_distr + 1e-12) * tf.stop_gradient(self.projection_ph)) # Compute the entropy temperature loss # it is used when the entropy coefficient is learned ent_coef_loss, entropy_optimizer = None, None if not isinstance(self.ent_coef, float): ent_coef_loss = -tf.reduce_mean( self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy)) entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) # to clip policy loss qf_pi = tf.reduce_mean(self.support * min_qf_pi_distr, axis=-1, keepdims=True) policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf_pi) # NOTE: in the original implementation, they have an additional # regularization loss for the Gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # Target for value fn regression # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. 
value_loss = -tf.reduce_mean(tf.log(value_fn_distr + 1e-12) * tf.stop_gradient(min_qf_pi_distr)) \ - tf.stop_gradient(tf.reduce_mean(self.ent_coef * logp_pi)) value_fn = tf.reduce_sum(value_fn_distr * self.support, axis=-1) # value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup) ** 2) values_losses = qf1_loss + qf2_loss + value_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize(policy_loss, var_list=tf_util.get_trainable_vars('model/pi')) # Value train op value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) values_params = tf_util.get_trainable_vars('model/values_fn') source_params = tf_util.get_trainable_vars("model/values_fn") target_params = tf_util.get_trainable_vars("target/values_fn") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses qf1, qf2 = tf.reduce_mean(tf.reduce_sum(self.support * qf1_distr, axis=-1)), tf.reduce_mean(tf.reduce_sum(self.support * qf2_distr, axis=-1)) with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize(values_losses, var_list=values_params) self.infos_names = ['policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy'] # All ops to call during one training step self.step_ops = [policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op] # Add entropy coefficient optimization operation if needed if ent_coef_loss is not None: with tf.control_dependencies([train_values_op]): ent_coef_op = entropy_optimizer.minimize(ent_coef_loss, var_list=self.log_ent_coef) self.infos_names += ['ent_coef_loss', 'ent_coef'] self.step_ops += [ent_coef_op, ent_coef_loss, self.ent_coef] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('value_loss', value_loss) tf.summary.scalar('entropy', self.entropy) if ent_coef_loss is not None: tf.summary.scalar('ent_coef_loss', ent_coef_loss) tf.summary.scalar('ent_coef', self.ent_coef) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target/values_fn") # Initialize Variables and target network self.projection_op = Projection(self.sess, self.graph, self.n_spt, self.v_min, self.v_max, self.delta, self.batch_size) with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all()
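# Hedged sketch of the categorical projection that the `Projection` helper above presumably
# implements (C51-style): redistribute the probability mass of the clipped Bellman-updated atoms
# Tz = r + (1 - done) * gamma * z onto the fixed support that feeds `projection_ph`. The helper's
# real interface and shapes may differ; this is a standalone numpy reference.
import numpy as np

def project_distribution(rewards, dones, target_probs, support, gamma):
    batch_size, n_atoms = target_probs.shape
    v_min, v_max = support[0], support[-1]
    delta = support[1] - support[0]
    projected = np.zeros_like(target_probs)
    for i in range(batch_size):
        tz = np.clip(rewards[i] + (1.0 - dones[i]) * gamma * support, v_min, v_max)
        pos = (tz - v_min) / delta                    # fractional index of each shifted atom
        low, high = np.floor(pos).astype(int), np.ceil(pos).astype(int)
        for j in range(n_atoms):
            if low[j] == high[j]:                     # atom lands exactly on a support point
                projected[i, low[j]] += target_probs[i, j]
            else:                                     # split mass between the two neighbours
                projected[i, low[j]] += target_probs[i, j] * (high[j] - pos[j])
                projected[i, high[j]] += target_probs[i, j] * (pos[j] - low[j])
    return projected

support = np.linspace(-10.0, 10.0, 21)
probs = np.full((1, 21), 1.0 / 21)
print(project_distribution(np.array([1.0]), np.array([0.0]), probs, support, gamma=0.99).sum())  # ~1.0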
def setup_model(self): # prevent import loops from stable_baselines.gail.adversary import TransitionClassifier with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \ "an instance of common.policies.ActorCriticPolicy." self.nworkers = MPI.COMM_WORLD.Get_size() self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.single_threaded_session(graph=self.graph) if self.using_gail: self.reward_giver = TransitionClassifier( self.observation_space, self.action_space, self.hidden_size_adversary, entcoeff=self.adversary_entcoeff) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.variable_scope("oldpi", reuse=False): old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): atarg = tf.placeholder(dtype=tf.float32, shape=[ None ]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return observation = self.policy_pi.obs_ph action = self.policy_pi.pdtype.sample_placeholder([None]) kloldnew = old_policy.proba_distribution.kl( self.policy_pi.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = self.entcoeff * meanent vferr = tf.reduce_mean( tf.square(self.policy_pi.value_fn[:, 0] - ret)) # advantage * pnew / pold ratio = tf.exp( self.policy_pi.proba_distribution.logp(action) - old_policy.proba_distribution.logp(action)) surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] self.loss_names = [ "optimgain", "meankl", "entloss", "surrgain", "entropy" ] dist = meankl all_var_list = tf_util.get_trainable_vars("model") var_list = [ v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name ] vf_var_list = [ v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name ] self.get_flat = tf_util.GetFlat(var_list, sess=self.sess) self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: var_size = tf_util.intprod(shape) tangents.append( tf.reshape(flat_tangent[start:start + var_size], shape)) start += var_size gvp = tf.add_n([ tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = tf_util.flatgrad(gvp, var_list) tf.summary.scalar('entropy_loss', meanent) tf.summary.scalar('policy_gradient_loss', optimgain) tf.summary.scalar('value_function_loss', surrgain) tf.summary.scalar('approximate_kullback-leiber', meankl) tf.summary.scalar( 'loss', optimgain + meankl + entbonus + surrgain + meanent) self.assign_old_eq_new = \ tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model"))]) self.compute_losses = tf_util.function( [observation, old_policy.obs_ph, action, atarg], losses) self.compute_fvp = tf_util.function([ flat_tangent, observation, 
old_policy.obs_ph, action, atarg ], fvp) self.compute_vflossandgrad = tf_util.function( [observation, old_policy.obs_ph, ret], tf_util.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if self.rank == 0 and self.verbose >= 1: print(colorize(msg, color='magenta')) start_time = time.time() yield print( colorize("done in {:.3f} seconds".format( (time.time() - start_time)), color='magenta')) else: yield def allmean(arr): assert isinstance(arr, np.ndarray) out = np.empty_like(arr) MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) out /= self.nworkers return out tf_util.initialize(sess=self.sess) th_init = self.get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) self.set_from_flat(th_init) with tf.variable_scope("Adam_mpi", reuse=False): self.vfadam = MpiAdam(vf_var_list, sess=self.sess) if self.using_gail: self.d_adam = MpiAdam( self.reward_giver.get_trainable_variables(), sess=self.sess) self.d_adam.sync() self.vfadam.sync() with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize)) tf.summary.scalar('advantage', tf.reduce_mean(atarg)) tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', ret) tf.summary.histogram('learning_rate', self.vf_stepsize) tf.summary.histogram('advantage', atarg) tf.summary.histogram('kl_clip_range', self.max_kl) if tf_util.is_image(self.observation_space): tf.summary.image('observation', observation) else: tf.summary.histogram('observation', observation) self.timed = timed self.allmean = allmean self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state self.params = find_trainable_variables("model") if self.using_gail: self.params.extend( self.reward_giver.get_trainable_variables()) self.summary = tf.summary.merge_all() self.compute_lossandgrad = \ tf_util.function([observation, old_policy.obs_ph, action, atarg, ret], [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
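compute_fvp above returns the Fisher-vector product F·v for a flattened tangent vector; TRPO then solves F x = g with conjugate gradient to obtain the natural-gradient step direction. A self-contained numpy sketch of that solver, assuming only that fvp is a callable returning F @ v (the toy 2x2 matrix stands in for the real Fisher):

import numpy as np

def conjugate_gradient(fvp, g, iters=10, tol=1e-10):
    """Solve F x = g using only Fisher-vector products."""
    x = np.zeros_like(g)
    r = g.copy()          # residual g - F x, with x = 0 initially
    p = r.copy()
    r_dot = r.dot(r)
    for _ in range(iters):
        fvp_p = fvp(p)
        alpha = r_dot / (p.dot(fvp_p) + 1e-8)
        x += alpha * p
        r -= alpha * fvp_p
        new_r_dot = r.dot(r)
        if new_r_dot < tol:
            break
        p = r + (new_r_dot / r_dot) * p
        r_dot = new_r_dot
    return x

# Toy usage with a dense SPD matrix standing in for the Fisher
fisher = np.array([[2.0, 0.3], [0.3, 1.0]])
grad = np.array([1.0, -1.0])
step_dir = conjugate_gradient(lambda v: fisher @ v, grad)
print(np.allclose(fisher @ step_dir, grad, atol=1e-6))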
def _setup_actor_optimizer(self): """ setup the optimizer for the actor """ if self.verbose >= 2: logger.info('setting up actor optimizer') if self.ro: split_group_action_raw = tf.split(self.augmented_action_raw, self.batch_size, axis=0) split_group_action = tf.split(self.augmented_action, self.batch_size, axis=0) split_group_q = tf.split(self.augmented_critic_with_actor_tf, self.batch_size, axis=0) self.actor_loss = 0 q_stds = [] for idx in range(self.batch_size): # softmax = tf.nn.softmax(split_group_q[idx] - # tf.reduce_max(split_group_q[idx], axis=0, keepdims=True), axis=0) # self.actor_loss = self.actor_loss + tf.reduce_sum( # tf.reduce_sum(tf.square(split_group_action_raw[idx] - # tf.stop_gradient(split_group_action[idx])), # axis=1) # * tf.stop_gradient(softmax)) max_index = tf.argmax(split_group_q[idx], axis=0) q_std = tf.math.reduce_std(split_group_q[idx]) * 20 target_action = split_group_action[idx][max_index, :] if self.adjust_lr: self.actor_loss = self.actor_loss + \ tf.reduce_mean(tf.square(self.actor_tf[idx, :] - tf.stop_gradient(target_action))) \ / tf.stop_gradient(q_std) else: self.actor_loss = self.actor_loss + \ tf.reduce_mean(tf.square(self.actor_tf[idx, :] - tf.stop_gradient(target_action))) q_stds.append(q_std) # tf.summary.histogram("q_std", tf.stack(q_stds, axis=0)) else: self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [ var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/pi/') ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) if self.verbose >= 2: logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) # self.actor_grads = tf_util.flatgrad(self.actor_loss, tf_util.get_trainable_vars('model/pi/'), # clip_norm=self.clip_norm) # self.actor_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/pi/'), beta1=0.9, beta2=0.999, # epsilon=1e-08) self.actor_optimizer = tf.train.AdamOptimizer( learning_rate=self.actor_lr) self.actor_gradients = self.actor_optimizer.compute_gradients( self.actor_loss, var_list=tf_util.get_trainable_vars("model/pi/")) hist_summary = [] for gradient, variable in self.actor_gradients: if gradient is not None: hist_summary.append( tf.summary.histogram("gradients/" + variable.name, gradient)) hist_summary.append( tf.summary.histogram("variables/" + variable.name, variable)) self.actor_gradient_summary = tf.summary.merge(hist_summary) self.actor_optimize_op = self.actor_optimizer.apply_gradients( self.actor_gradients)
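When self.ro is set, the loop above picks, for each state, the perturbed candidate action with the highest critic value and regresses the actor's output toward it, optionally rescaling by the (stop-gradient) spread of the candidate Q values. A toy numpy sketch of that target construction; all shapes and numbers are illustrative:

import numpy as np

batch_size, sample_number, action_dim = 3, 5, 2
group_q = np.random.randn(batch_size, sample_number)                 # critic values of candidates
group_actions = np.random.uniform(-1, 1, (batch_size, sample_number, action_dim))
actor_actions = np.random.uniform(-1, 1, (batch_size, action_dim))   # current actor output

best = group_q.argmax(axis=1)                                        # best candidate per state
target_actions = group_actions[np.arange(batch_size), best]
q_std = group_q.std(axis=1) * 20                                     # spread used to rescale

# Squared-error regression of the actor toward the best candidate,
# divided by the (stop-gradient) spread as in the adjust_lr branch
per_state = ((actor_actions - target_actions) ** 2).mean(axis=1)
loss = (per_state / q_std).sum()
print(loss)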
def setup_model(self): with SetVerbosity(self.verbose): assert isinstance(self.action_space, gym.spaces.Box), \ "Error: DDPG cannot output a {} action space, only spaces.Box is supported.".format(self.action_space) assert issubclass(self.policy, DDPGPolicy), "Error: the input policy for the DDPG model must be " \ "an instance of DDPGPolicy." self.graph = tf.Graph() with self.graph.as_default(): self._setup_learn(self.seed) # self.sess = tf_util.single_threaded_session(graph=self.graph) self.sess = tf_util.make_session() self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Observation normalization. # if self.normalize_observations: # with tf.variable_scope('obs_rms'): # self.obs_rms = RunningMeanStd(shape=self.observation_space.shape) # else: # self.obs_rms = None # Return normalization. # if self.normalize_returns: # with tf.variable_scope('ret_rms'): # self.ret_rms = RunningMeanStd() # else: # self.ret_rms = None self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None, **self.policy_kwargs) # Create target networks. self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, 1, 1, None, **self.policy_kwargs) self.obs_target = self.target_policy.obs_ph self.action_target = self.target_policy.action_ph # normalized_obs0 = tf.clip_by_value(normalize(self.policy_tf.processed_obs, self.obs_rms), # self.observation_range[0], self.observation_range[1]) # normalized_obs1 = tf.clip_by_value(normalize(self.target_policy.processed_obs, self.obs_rms), # self.observation_range[0], self.observation_range[1]) # Inputs. self.obs_train_ph = self.policy_tf.obs_ph self.action_train_ph = self.policy_tf.action_ph self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') # Create networks and core TF parts that are shared across setup parts. 
with tf.variable_scope("model", reuse=False): self.actor_tf = self.policy_tf.make_actor( self.policy_tf.processed_obs) self.critic_tf = self.policy_tf.make_critic( self.policy_tf.processed_obs, self.action_train_ph) self.critic_with_actor_tf = self.policy_tf.make_critic( self.policy_tf.processed_obs, self.actor_tf, reuse=True) if self.ro: def tf_repeat(tensor_to_repeat, repeat_num): tiled = tf.tile(tensor_to_repeat, [1, repeat_num]) repeated = tf.reshape( tiled, shape=[ self.batch_size * repeat_num, tensor_to_repeat.shape[1] ]) return repeated self.augmented_obs0 = tf_repeat( self.policy_tf.processed_obs, self.sample_number) self.augmented_action_raw = tf_repeat( self.actor_tf, self.sample_number) noises = [] for b_index in range(self.batch_size): noises.append( tf.random_uniform((self.sample_number - 1, ) + self.action_space.shape, -0.1, 0.1)) noises.append( tf.zeros((1, ) + self.action_space.shape)) noises = tf.concat(noises, axis=0) self.augmented_action = self.augmented_action_raw + noises self.augmented_action = tf.clip_by_value( self.augmented_action, -1, 1) self.augmented_critic_with_actor_tf = self.policy_tf.make_critic( self.augmented_obs0, self.augmented_action, reuse=True)[:, 0] with tf.variable_scope("target", reuse=False): critic_target = \ self.target_policy.make_critic(self.target_policy.processed_obs, self.target_policy.make_actor(self.target_policy.processed_obs)) with tf.variable_scope("loss", reuse=False): # self.critic_tf = denormalize( # tf.clip_by_value(self.critic_tf, self.return_range[0], self.return_range[1]), # self.ret_rms) # # self.critic_with_actor_tf = denormalize( # tf.clip_by_value(self.critic_with_actor_tf, # self.return_range[0], self.return_range[1]), # self.ret_rms) # # q_obs1 = denormalize(critic_target, self.ret_rms) self.target_q = self.rewards + ( 1. - self.terminals1) * self.gamma * critic_target # tf.summary.scalar('critic_target', tf.reduce_mean(self.critic_target)) if self.full_tensorboard_log: tf.summary.histogram('critic_target', self.critic_target) # Set up parts. self._setup_stats() self._setup_target_network_updates() with tf.variable_scope("input_info", reuse=False): self.reward_summary = tf.summary.scalar( 'rewards', tf.reduce_mean(self.rewards)) self.obs_summary = tf.summary.scalar( 'obs', tf.reduce_mean(self.obs_train_ph)) if self.full_tensorboard_log: tf.summary.histogram('rewards', self.rewards) if len(self.observation_space.shape ) == 3 and self.observation_space.shape[0] in [ 1, 3, 4 ]: tf.summary.image('observation', self.obs_train_ph) else: tf.summary.histogram('observation', self.obs_train_ph) with tf.variable_scope("Adam_mpi", reuse=False): self._setup_actor_optimizer() self._setup_critic_optimizer() self.actor_loss_summary = tf.summary.scalar( 'actor_loss', self.actor_loss) self.critic_loss_summary = tf.summary.scalar( 'critic_loss', self.critic_loss) self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target") self.obs_rms_params = [ var for var in tf.global_variables() if "obs_rms" in var.name ] self.ret_rms_params = [ var for var in tf.global_variables() if "ret_rms" in var.name ] with self.sess.as_default(): self._initialize(self.sess)
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) #self.replay_buffer = DiscrepancyReplayBuffer(self.buffer_size, scorer=self.policy_tf.get_q_discrepancy) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects if self.recurrent_policy: import inspect policy_tf_args = inspect.signature(self.policy).parameters policy_tf_kwargs = {} if "my_size" in policy_tf_args: policy_tf_kwargs["my_size"] = len(self._get_env_parameters()) if "goal_size" in policy_tf_args: policy_tf_kwargs["goal_size"] = self.env.goal_dim # TODO: need to get this some other way or save it if self.buffer_kwargs is not None: sequence_length = self.buffer_kwargs.get("sequence_length", 1) else: sequence_length = 1 self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, n_batch=self.batch_size, n_steps=sequence_length, **policy_tf_kwargs, **self.policy_kwargs) self.policy_tf_act = self.policy(self.sess, self.observation_space, self.action_space, n_batch=1, **policy_tf_kwargs, **self.policy_kwargs) self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space, n_batch=self.batch_size, n_steps=sequence_length, **policy_tf_kwargs, **self.policy_kwargs) self.dones_ph = self.policy_tf.dones_ph else: self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) if hasattr(self.policy_tf, "extra_phs"): for ph_name in self.policy_tf.extra_phs: if "target_" in ph_name: self.train_extra_phs[ph_name] = getattr(self.target_policy_tf, ph_name.replace("target_", "") + "_ph") else: self.train_extra_phs[ph_name] = getattr(self.policy_tf, ph_name + "_ph") # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy_tf.obs_ph self.processed_next_obs_ph = self.target_policy_tf.processed_obs self.action_target = self.target_policy_tf.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") self.buffer_is_prioritized = self.buffer_type.__name__ in ["PrioritizedReplayBuffer", "RankPrioritizedReplayBuffer"] if self.replay_buffer is None: if self.buffer_is_prioritized: if self.num_timesteps is not None and self.prioritization_starts > self.num_timesteps or self.prioritization_starts > 0: self.replay_buffer = ReplayBuffer(self.buffer_size) else: buffer_kw = {"size": self.buffer_size, "alpha": 0.7} if self.buffer_type.__name__ == "RankPrioritizedReplayBuffer": buffer_kw.update( {"learning_starts": self.prioritization_starts, "batch_size": self.batch_size}) self.replay_buffer = self.buffer_type(**buffer_kw) else: replay_buffer_kw = {"size": self.buffer_size} if self.buffer_kwargs is not None: replay_buffer_kw.update(self.buffer_kwargs) if self.recurrent_policy: replay_buffer_kw["rnn_inputs"] = self.policy_tf.rnn_inputs if hasattr(self.policy_tf, "extra_data_names"): replay_buffer_kw["extra_data_names"] = 
self.policy_tf.extra_data_names self.replay_buffer = self.buffer_type(**replay_buffer_kw) if self.recurrent_policy: self.sequence_length = self.replay_buffer.sequence_length self.scan_length = self.replay_buffer.scan_length assert self.scan_length % self.sequence_length == 0 with tf.variable_scope("model", reuse=False): # Create the policy if self.recurrent_policy: actor_args = inspect.signature(self.policy_tf.make_actor).parameters critic_args = inspect.signature(self.policy_tf.make_critics).parameters actor_kws = {k: v for k, v in self.train_extra_phs.items() if k in actor_args} critic_kws = {k: v for k, v in self.train_extra_phs.items() if k in critic_args} self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph, **actor_kws) self.policy_act = policy_act = self.policy_tf_act.make_actor(reuse=True) # Use two Q-functions to improve performance by reducing overestimation bias qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph, **critic_kws) _, _ = self.policy_tf_act.make_critics(None, self.actions_ph, reuse=True) # Q value when following the current policy qf1_pi, qf2_pi = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, **critic_kws, reuse=True) train_params = [var for var in tf_util.get_trainable_vars("model/pi") if "act" not in var.name] act_params = [var for var in tf_util.get_trainable_vars("model/pi") if "act" in var.name] self.act_ops = [ tf.assign(act, train) for act, train in zip(act_params, train_params) ] else: self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph) # Use two Q-functions to improve performance by reducing overestimation bias qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph) # Q value when following the current policy qf1_pi, qf2_pi = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, reuse=True) with tf.variable_scope("target", reuse=False): if self.recurrent_policy: # Create target networks target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph, **actor_kws, dones=self.dones_ph) # Target policy smoothing, by adding clipped noise to target actions target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh) noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1) # Q values when following the target policy qf1_target, qf2_target = self.target_policy_tf.make_critics(self.processed_next_obs_ph, noisy_target_action, dones=self.dones_ph, **critic_kws) else: # Create target networks target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph) # Target policy smoothing, by adding clipped noise to target actions target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh) noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1) # Q values when following the target policy qf1_target, qf2_target = self.target_policy_tf.make_critics(self.processed_next_obs_ph, noisy_target_action) policy_pre_activation = self.policy_tf.policy_pre_activation if self.full_tensorboard_log: for var in tf_util.get_trainable_vars("model"): tf.summary.histogram(var.name, var) if 
self.recurrent_policy and self.policy_tf.keras_reuse: tf.summary.histogram("rnn/PI state", self.policy_tf.pi_state) tf.summary.histogram("rnn/QF1 state", self.policy_tf.qf1_state) tf.summary.histogram("rnn/QF2 state", self.policy_tf.qf2_state) # TODO: introduce somwehere here the placeholder for history which updates internal state? with tf.variable_scope("loss", reuse=False): # Take the min of the two target Q-Values (clipped Double-Q Learning) min_qf_target = tf.minimum(qf1_target, qf2_target) # Targets for Q value regression q_backup = tf.stop_gradient( self.rewards_ph + (1 - self.terminals_ph) * self.gamma * min_qf_target ) if self.clip_q_target is not None: q_backup = tf.clip_by_value(q_backup, self.clip_q_target[0], self.clip_q_target[1], name="q_backup_clipped") # Compute Q-Function loss if self.buffer_is_prioritized: self.train_extra_phs["is_weights"] = tf.placeholder(tf.float32, shape=(None, 1), name="is_weights") qf1_loss = tf.reduce_mean(self.is_weights_ph * (q_backup - qf1) ** 2) qf2_loss = tf.reduce_mean(self.is_weights_ph * (q_backup - qf2) ** 2) else: qf1_loss = tf.reduce_mean((q_backup - qf1) ** 2) qf2_loss = tf.reduce_mean((q_backup - qf2) ** 2) qvalues_losses = qf1_loss + qf2_loss rew_loss = tf.reduce_mean(qf1_pi) action_loss = self.action_l2_scale * tf.nn.l2_loss(policy_pre_activation) self.policy_loss = policy_loss = -rew_loss + action_loss # Policy loss: maximise q value if hasattr(self.policy_tf, "policy_loss"): tf.summary.scalar("custom_policy_loss", self.policy_tf.policy_loss) self.policy_loss += self.policy_tf.policy_loss policy_loss = self.policy_loss # Policy train op # will be called only every n training steps, # where n is the policy delay policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) policy_vars = tf_util.get_trainable_vars("model/pi") + tf_util.get_trainable_vars("model/shared") policy_train_op = policy_optimizer.minimize(policy_loss, var_list=policy_vars) self.policy_train_op = policy_train_op # Q Values optimizer qvalues_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) qvalues_params = tf_util.get_trainable_vars('model/values_fn/') + tf_util.get_trainable_vars("model/shared/") # Q Values and policy target params source_params = tf_util.get_trainable_vars("model/") target_params = tf_util.get_trainable_vars("target/") if self.recurrent_policy: source_params = [var for var in tf_util.get_trainable_vars("model/") if "act" not in var.name] # Polyak averaging for target variables self.target_ops = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] train_values_op = qvalues_optimizer.minimize(qvalues_losses, var_list=qvalues_params) self.infos_names = ['qf1_loss', 'qf2_loss'] # All ops to call during one training step self.step_ops = [qf1_loss, qf2_loss, qf1, qf2, train_values_op] if hasattr(self.policy_tf, "step_ops"): self.step_ops.extend(self.policy_tf.step_ops) self.policy_step_ops = [self.policy_train_op, self.target_ops, self.policy_loss] if hasattr(self.policy_tf, "policy_step_ops"): self.policy_step_ops.extend(self.policy_tf.policy_step_ops) if self.recurrent_policy and self.policy_tf.save_state: if self.policy_tf.share_lstm: state_objects = [self.policy_tf.state] if self.target_policy_tf.save_target_state: state_objects.append(self.target_policy_tf.state) else: state_objects = 
[self.policy_tf.pi_state, self.policy_tf.qf1_state, self.policy_tf.qf2_state] if self.target_policy_tf.save_target_state: state_objects.extend([self.target_policy_tf.pi_state, self.target_policy_tf.qf1_state, self.target_policy_tf.qf2_state]) self.step_ops.extend(state_objects) # Monitor losses and entropy in tensorboard tf.summary.scalar("rew_loss", rew_loss) tf.summary.scalar("action_loss", action_loss) tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target/") if self.full_tensorboard_log: policy_grads = policy_optimizer.compute_gradients(policy_loss) for g in policy_grads: if g[0] is not None and g[1] in policy_vars: tf.summary.histogram("grad-policy/{}".format(g[1].name), g[0]) qf_grads = qvalues_optimizer.compute_gradients(qvalues_losses) for g in qf_grads: if g[0] is not None and g[1] in qvalues_params: tf.summary.histogram("grad-qf/{}".format(g[1].name), g[0]) # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all()
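The target scope in this block applies TD3's target policy smoothing (clipped Gaussian noise added to the target action, then clipped back to the tanh range) before taking the minimum of the two target critics. A minimal numpy sketch of those two steps with illustrative shapes and noise scales:

import numpy as np

target_policy_noise, target_noise_clip = 0.2, 0.5
next_actions = np.random.uniform(-1, 1, (4, 2))     # mu'(s') from the target actor

# Target policy smoothing: clipped Gaussian noise, then clip to the tanh bounds
noise = np.clip(np.random.normal(0.0, target_policy_noise, next_actions.shape),
                -target_noise_clip, target_noise_clip)
noisy_next_actions = np.clip(next_actions + noise, -1, 1)

# Clipped double-Q: bootstrap from the smaller of the two target critics
qf1_target = np.random.randn(4, 1)
qf2_target = np.random.randn(4, 1)
min_qf_target = np.minimum(qf1_target, qf2_target)
print(noisy_next_actions.shape, min_qf_target.shape)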
def setup_model(self): with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACKTR model must be " \ "an instance of common.policies.ActorCriticPolicy." # Enable continuous actions tricks (normalized advantage) self.continuous_actions = isinstance(self.action_space, Box) self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(num_cpu=self.nprocs, graph=self.graph) n_batch_step = None n_batch_train = None if issubclass(self.policy, RecurrentActorCriticPolicy): n_batch_step = self.n_envs n_batch_train = self.n_envs * self.n_steps step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False, **self.policy_kwargs) self.params = params = tf_util.get_trainable_vars("model") with tf.variable_scope( "train_model", reuse=True, custom_getter=tf_util.outer_scope_getter( "train_model")): train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, self.n_steps, n_batch_train, reuse=True, **self.policy_kwargs) with tf.variable_scope( "loss", reuse=False, custom_getter=tf_util.outer_scope_getter("loss")): self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None]) self.rewards_ph = rewards_ph = tf.placeholder( tf.float32, [None]) self.learning_rate_ph = learning_rate_ph = tf.placeholder( tf.float32, []) self.actions_ph = train_model.pdtype.sample_placeholder( [None]) neg_log_prob = train_model.proba_distribution.neglogp( self.actions_ph) # training loss pg_loss = tf.reduce_mean(advs_ph * neg_log_prob) self.entropy = entropy = tf.reduce_mean( train_model.proba_distribution.entropy()) self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy self.vf_loss = vf_loss = mse( tf.squeeze(train_model.value_fn), rewards_ph) train_loss = pg_loss + self.vf_coef * vf_loss # Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean( neg_log_prob) sample_net = train_model.value_fn + tf.random_normal( tf.shape(train_model.value_fn)) self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean( tf.pow( train_model.value_fn - tf.stop_gradient(sample_net), 2)) self.joint_fisher = pg_fisher_loss + vf_fisher_loss tf.summary.scalar('entropy_loss', self.entropy) tf.summary.scalar('policy_gradient_loss', pg_loss) tf.summary.scalar('policy_gradient_fisher_loss', pg_fisher_loss) tf.summary.scalar('value_function_loss', self.vf_loss) tf.summary.scalar('value_function_fisher_loss', vf_fisher_loss) tf.summary.scalar('loss', train_loss) self.grads_check = tf.gradients(train_loss, params) with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.histogram('learning_rate', self.learning_rate_ph) tf.summary.histogram('advantage', self.advs_ph) if tf_util.is_image(self.observation_space): tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) with tf.variable_scope( "kfac", reuse=False, custom_getter=tf_util.outer_scope_getter("kfac")): with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer( learning_rate=learning_rate_ph, clip_kl=self.kfac_clip, momentum=0.9, kfac_update=self.kfac_update, epsilon=0.01, stats_decay=0.99, 
async_eigen_decomp=self.async_eigen_decomp, cold_iter=10, max_grad_norm=self.max_grad_norm, verbose=self.verbose) optim.compute_and_apply_stats(self.joint_fisher, var_list=params) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.proba_step = step_model.proba_step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=self.sess) self.summary = tf.summary.merge_all()
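The Fisher losses built above treat the policy head as the sampled action's log-probability and the value head as a unit-variance Gaussian regressed toward a fixed noisy sample of itself; KFAC uses the gradients of their sum to estimate curvature. A numpy mirror of those two surrogate losses, with illustrative batch values:

import numpy as np

vf_fisher_coef = 1.0
values = np.array([0.5, -0.2, 1.3])        # V_theta(s) for a small batch
log_prob = np.array([-0.9, -1.4, -0.7])    # log pi_theta(a|s) for the taken actions

# pg_fisher_loss: mean log-probability; its per-sample gradients define
# the policy block of the Fisher that KFAC estimates
pg_fisher_loss = log_prob.mean()

# vf_fisher_loss: regress V toward a fixed noisy sample of itself
# (the sample is under stop_gradient in the TF graph)
sample_net = values + np.random.normal(size=values.shape)
vf_fisher_loss = -vf_fisher_coef * ((values - sample_net) ** 2).mean()

joint_fisher = pg_fisher_loss + vf_fisher_loss
print(joint_fisher)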
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.compat.v1.variable_scope("oldpi", reuse=False): old_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) with tf.compat.v1.variable_scope("loss", reuse=False): # Target advantage function (if applicable) atarg = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ret = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None]) # learning rate multiplier, updated with schedule lrmult = tf.compat.v1.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # Annealed cliping parameter epislon clip_param = self.clip_param * lrmult obs_ph = self.policy_pi.obs_ph action_ph = self.policy_pi.pdtype.sample_placeholder( [None]) kloldnew = old_pi.proba_distribution.kl( self.policy_pi.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(input_tensor=kloldnew) meanent = tf.reduce_mean(input_tensor=ent) pol_entpen = (-self.entcoeff) * meanent # pnew / pold ratio = tf.exp( self.policy_pi.proba_distribution.logp(action_ph) - old_pi.proba_distribution.logp(action_ph)) # surrogate from conservative policy iteration surr1 = ratio * atarg surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # PPO's pessimistic surrogate (L^CLIP) pol_surr = -tf.reduce_mean( input_tensor=tf.minimum(surr1, surr2)) vf_loss = tf.reduce_mean( input_tensor=tf.square(self.policy_pi.value_flat - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] self.loss_names = [ "pol_surr", "pol_entpen", "vf_loss", "kl", "ent" ] tf.compat.v1.summary.scalar('entropy_loss', pol_entpen) tf.compat.v1.summary.scalar('policy_gradient_loss', pol_surr) tf.compat.v1.summary.scalar('value_function_loss', vf_loss) tf.compat.v1.summary.scalar('approximate_kullback-leibler', meankl) tf.compat.v1.summary.scalar('clip_factor', clip_param) tf.compat.v1.summary.scalar('loss', total_loss) self.params = tf_util.get_trainable_vars("model") self.assign_old_eq_new = tf_util.function( [], [], updates=[ tf.compat.v1.assign(oldv, newv) for (oldv, newv) in zipsame( tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model")) ]) with tf.compat.v1.variable_scope("Adam_mpi", reuse=False): self.adam = MpiAdam(self.params, epsilon=self.adam_epsilon, sess=self.sess) with tf.compat.v1.variable_scope("input_info", reuse=False): tf.compat.v1.summary.scalar( 'discounted_rewards', tf.reduce_mean(input_tensor=ret)) tf.compat.v1.summary.scalar( 'learning_rate', tf.reduce_mean(input_tensor=self.optim_stepsize)) tf.compat.v1.summary.scalar( 'advantage', tf.reduce_mean(input_tensor=atarg)) tf.compat.v1.summary.scalar( 'clip_range', tf.reduce_mean(input_tensor=self.clip_param)) if self.full_tensorboard_log: tf.compat.v1.summary.histogram('discounted_rewards', ret) tf.compat.v1.summary.histogram('learning_rate', self.optim_stepsize) tf.compat.v1.summary.histogram('advantage', atarg) tf.compat.v1.summary.histogram('clip_range', self.clip_param) if tf_util.is_image(self.observation_space): tf.compat.v1.summary.image('observation', obs_ph) else: 
tf.compat.v1.summary.histogram( 'observation', obs_ph) self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state tf_util.initialize(sess=self.sess) self.summary = tf.compat.v1.summary.merge_all() self.lossandgrad = tf_util.function( [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], [self.summary, tf_util.flatgrad(total_loss, self.params)] + losses) self.compute_losses = tf_util.function( [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], losses)
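The loss scope above is PPO's pessimistic clipped surrogate: the minimum of the unclipped and clipped importance-weighted advantages, negated for minimization. A short numpy sketch with illustrative ratios and advantages:

import numpy as np

clip_param = 0.2
ratio = np.array([0.8, 1.0, 1.3])        # pi_new(a|s) / pi_old(a|s)
atarg = np.array([1.0, -0.5, 2.0])       # advantage estimates

surr1 = ratio * atarg
surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
pol_surr = -np.minimum(surr1, surr2).mean()   # L^CLIP, negated for minimization
print(pol_surr)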
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) if self.replay_buffer and len(self.replay_buffer) > 0: # TODO: maybe substitute with a prioritized buffer to give preference to the transitions added # during continual learning pass else: self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = self.target_policy.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name="terminals") self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name="rewards") self.actions_ph = tf.placeholder( tf.float32, shape=(None, ) + self.action_space.shape, name="actions", ) self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy # first return value corresponds to deterministic actions # policy_out corresponds to stochastic actions, used for training # logp_pi is the log probability of actions taken by the policy ( self.deterministic_action, policy_out, logp_pi, ) = self.policy_tf.make_actor(self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. 
qf1, qf2, value_fn = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True, ) qf1_pi, qf2_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True, ) # Target entropy is used when learning the entropy coefficient if self.target_entropy == "auto": # automatically set target entropy if needed self.target_entropy = -np.prod( self.action_space.shape).astype(np.float32) else: # Force conversion # this will also throw an error for unexpected string self.target_entropy = float(self.target_entropy) # The entropy coefficient or entropy can be learned automatically # see Automating Entropy Adjustment for Maximum Entropy RL section # of https://arxiv.org/abs/1812.05905 if isinstance(self.ent_coef, str) and self.ent_coef.startswith("auto"): # Default initial value of ent_coef when learned init_value = 1.0 if "_" in self.ent_coef: init_value = float(self.ent_coef.split("_")[1]) assert init_value > 0.0, "The initial value of ent_coef must be greater than 0" self.log_ent_coef = tf.get_variable( "log_ent_coef", dtype=tf.float32, initializer=np.log(init_value).astype(np.float32), ) self.ent_coef = tf.exp(self.log_ent_coef) else: # Force conversion to float # this will throw an error if a malformed string (different from 'auto') # is passed self.ent_coef = float(self.ent_coef) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target = self.target_policy.make_critics( self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target = value_target with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) min_qf_pi = tf.minimum(qf1_pi, qf2_pi) # Target for Q value regression q_backup = tf.stop_gradient(self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.value_target) # Compute Q-Function loss # TODO: test with huber loss (it would avoid too high values) qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1)**2) qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2)**2) # Compute the entropy temperature loss # it is used when the entropy coefficient is learned ent_coef_loss, entropy_optimizer = None, None if not isinstance(self.ent_coef, float): ent_coef_loss = -tf.reduce_mean( self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy)) entropy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi) # NOTE: in the original implementation, they have an additional # regularization loss for the Gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # Target for value fn regression # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. 
v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi) value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup)**2) values_losses = qf1_loss + qf2_loss + value_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize( policy_loss, var_list=tf_util.get_trainable_vars("model/pi")) # Value train op value_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) values_params = tf_util.get_trainable_vars( "model/values_fn") source_params = tf_util.get_trainable_vars( "model/values_fn/vf") target_params = tf_util.get_trainable_vars( "target/values_fn/vf") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize( values_losses, var_list=values_params) self.infos_names = [ "policy_loss", "qf1_loss", "qf2_loss", "value_loss", "entropy", ] # All ops to call during one training step self.step_ops = [ policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op, ] # Add entropy coefficient optimization operation if needed if ent_coef_loss is not None: with tf.control_dependencies([train_values_op]): ent_coef_op = entropy_optimizer.minimize( ent_coef_loss, var_list=self.log_ent_coef) self.infos_names += [ "ent_coef_loss", "ent_coef" ] self.step_ops += [ ent_coef_op, ent_coef_loss, self.ent_coef, ] # Monitor losses and entropy in tensorboard tf.summary.scalar("policy_loss", policy_loss) tf.summary.scalar("qf1_loss", qf1_loss) tf.summary.scalar("qf2_loss", qf2_loss) tf.summary.scalar("value_loss", value_loss) tf.summary.scalar("entropy", self.entropy) if ent_coef_loss is not None: tf.summary.scalar("ent_coef_loss", ent_coef_loss) tf.summary.scalar("ent_coef", self.ent_coef) tf.summary.scalar("learning_rate", tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars( "target/values_fn/vf") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all()
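The SAC targets built above reduce to three expressions: the value backup min(Q1, Q2) - alpha * log pi, the Q backup r + gamma * (1 - done) * V'(s'), and the policy loss E[alpha * log pi - Q1]. A small numpy sketch of all three with illustrative numbers:

import numpy as np

gamma, ent_coef = 0.99, 0.2
logp_pi = np.array([[-1.2], [-0.3]])           # log pi(a|s) for the sampled actions
qf1_pi = np.array([[4.0], [2.0]])
qf2_pi = np.array([[3.5], [2.5]])
value_target_next = np.array([[3.0], [1.0]])   # V'(s') from the target network
rewards = np.array([[1.0], [0.0]])
terminals = np.array([[0.0], [1.0]])

min_qf_pi = np.minimum(qf1_pi, qf2_pi)
v_backup = min_qf_pi - ent_coef * logp_pi                          # value-function target
q_backup = rewards + (1.0 - terminals) * gamma * value_target_next  # Q-function target
policy_loss = (ent_coef * logp_pi - qf1_pi).mean()                  # maximise Q minus entropy cost
print(v_backup.ravel(), q_backup.ravel(), policy_loss)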
def setup_model(self): with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \ "instance of common.policies.ActorCriticPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.n_batch = self.n_envs * self.n_steps n_batch_step = None n_batch_train = None if issubclass(self.policy, RecurrentActorCriticPolicy): n_batch_step = self.n_envs n_batch_train = self.n_envs * self.n_steps step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False, **self.policy_kwargs) with tf.variable_scope( "train_model", reuse=True, custom_getter=tf_util.outer_scope_getter( "train_model")): train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, self.n_steps, n_batch_train, reuse=True, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): self.actions_ph = train_model.pdtype.sample_placeholder( [None], name="action_ph") self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph") self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph") self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") neglogpac = train_model.proba_distribution.neglogp( self.actions_ph) self.entropy = tf.reduce_mean( train_model.proba_distribution.entropy()) self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac) self.vf_loss = mse(tf.squeeze(train_model.value_flat), self.rewards_ph) # https://arxiv.org/pdf/1708.04782.pdf#page=9, https://arxiv.org/pdf/1602.01783.pdf#page=4 # and https://github.com/dennybritz/reinforcement-learning/issues/34 # suggest to add an entropy component in order to improve exploration. loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef tf.summary.scalar('entropy_loss', self.entropy) tf.summary.scalar('policy_gradient_loss', self.pg_loss) tf.summary.scalar('value_function_loss', self.vf_loss) tf.summary.scalar('loss', loss) self.params = tf_util.get_trainable_vars("model") grads = tf.gradients(loss, self.params) if self.max_grad_norm is not None: grads, _ = tf.clip_by_global_norm( grads, self.max_grad_norm) grads = list(zip(grads, self.params)) with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.histogram('learning_rate', self.learning_rate_ph) tf.summary.histogram('advantage', self.advs_ph) if tf_util.is_image(self.observation_space): tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) trainer = tf.train.RMSPropOptimizer( learning_rate=self.learning_rate_ph, decay=self.alpha, epsilon=self.epsilon) self.apply_backprop = trainer.apply_gradients(grads) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.proba_step = step_model.proba_step self.value = step_model.value self.initial_state = step_model.initial_state self.attention = step_model.attention tf.global_variables_initializer().run(session=self.sess) self.summary = tf.summary.merge_all()
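The A2C objective assembled above combines the policy-gradient term, an entropy bonus, and the value-function error into a single scalar: pg_loss - ent_coef * entropy + vf_coef * vf_loss. A minimal numpy sketch with illustrative batch values:

import numpy as np

ent_coef, vf_coef = 0.01, 0.5
advs = np.array([1.0, -0.5, 2.0])
neglogpac = np.array([0.7, 1.2, 0.3])     # -log pi(a|s) for the taken actions
entropy = np.array([1.1, 0.9, 1.0])
values = np.array([0.5, 0.2, 1.5])
returns = np.array([1.0, 0.0, 2.0])

pg_loss = (advs * neglogpac).mean()
vf_loss = ((values - returns) ** 2).mean()          # mse on the value head
loss = pg_loss - entropy.mean() * ent_coef + vf_loss * vf_coef
print(loss)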
def setup_model(self): # prevent import loops with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \ "an instance of common.policies.ActorCriticPolicy." self.nworkers = MPI.COMM_WORLD.Get_size() print("number of workers are", self.nworkers) self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.single_threaded_session(graph=self.graph) self._setup_learn(self.seed) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.variable_scope("oldpi", reuse=False): old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for phi with tf.variable_scope("phi", reuse=False): self.policy_phi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for phi old with tf.variable_scope("oldphi", reuse=False): self.policy_phi_old = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): atarg = tf.placeholder(dtype=tf.float32, shape=[ None ]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return observation = self.policy_pi.obs_ph action = self.policy_pi.pdtype.sample_placeholder([None]) kloldnew = old_policy.proba_distribution.kl( self.policy_pi.proba_distribution) #kloldnew = self.policy_pi.proba_distribution.kl(old_policy.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = self.entcoeff * meanent vferr = tf.reduce_mean( tf.square(self.policy_pi.value_flat - ret)) vf_phi_err = tf.reduce_mean( tf.square(self.policy_phi.value_flat - ret)) vf_phi_old_err = tf.reduce_mean( tf.square(self.policy_phi_old.value_flat)) # advantage * pnew / pold ratio = tf.exp( self.policy_pi.proba_distribution.logp(action) - old_policy.proba_distribution.logp(action)) surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] self.loss_names = [ "optimgain", "meankl", "entloss", "surrgain", "entropy" ] dist = meankl all_var_list = tf_util.get_trainable_vars("model") var_list = [ v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name ] vf_var_list = [ v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name ] all_var_oldpi_list = tf_util.get_trainable_vars("oldpi") var_oldpi_list = [ v for v in all_var_oldpi_list if "/vf" not in v.name and "/q/" not in v.name ] all_var_phi_list = tf_util.get_trainable_vars("phi") vf_phi_var_list = [ v for v in all_var_phi_list if "/pi" not in v.name and "/logstd" not in v.name and "/q" not in v.name ] all_var_phi_old_list = tf_util.get_trainable_vars("oldphi") vf_phi_old_var_list = [ v for v in all_var_phi_old_list if "/pi" not in v.name and "/logstd" not in v.name and "/q" not in v.name ] #print("vars", vf_var_list) self.policy_vars = all_var_list self.oldpolicy_vars = all_var_oldpi_list print("all var list", all_var_list) print("phi vars", vf_phi_var_list) print("phi old vars", vf_phi_old_var_list) self.get_flat = tf_util.GetFlat(var_list, 
sess=self.sess) self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: var_size = tf_util.intprod(shape) tangents.append( tf.reshape(flat_tangent[start:start + var_size], shape)) start += var_size gvp = tf.add_n([ tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = tf_util.flatgrad(gvp, var_list) tf.summary.scalar('entropy_loss', meanent) tf.summary.scalar('policy_gradient_loss', optimgain) tf.summary.scalar('value_function_loss', surrgain) tf.summary.scalar('approximate_kullback-leibler', meankl) tf.summary.scalar( 'loss', optimgain + meankl + entbonus + surrgain + meanent) self.assign_old_eq_new = \ tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model"))]) self.compute_losses = tf_util.function( [observation, old_policy.obs_ph, action, atarg], losses) self.compute_fvp = tf_util.function([ flat_tangent, observation, old_policy.obs_ph, action, atarg ], fvp) self.compute_vflossandgrad = tf_util.function( [observation, old_policy.obs_ph, ret], tf_util.flatgrad(vferr, vf_var_list)) self.compute_vf_phi_lossandgrad = tf_util.function( [observation, self.policy_phi.obs_ph, ret], tf_util.flatgrad(vf_phi_err, vf_phi_var_list)) self.compute_vf_loss = tf_util.function( [observation, old_policy.obs_ph, ret], vferr) self.compute_vf_phi_loss = tf_util.function( [observation, self.policy_phi.obs_ph, ret], vf_phi_err) #self.compute_vf_phi_old_loss = tf_util.function([self.policy_phi_old.obs_ph], vf_phi_old_err) #self.phi_old_obs = np.array([-0.012815 , -0.02076313, 0.07524705, 0.09407324, 0.0901745 , -0.09339058, 0.03544853, -0.03297224]) #self.phi_old_obs = self.phi_old_obs.reshape((1, 8)) update_phi_old_expr = [] for var, var_target in zip( sorted(vf_phi_var_list, key=lambda v: v.name), sorted(vf_phi_old_var_list, key=lambda v: v.name)): update_phi_old_expr.append(var_target.assign(var)) update_phi_old_expr = tf.group(*update_phi_old_expr) self.update_phi_old = tf_util.function( [], [], updates=[update_phi_old_expr]) @contextmanager def timed(msg): if self.rank == 0 and self.verbose >= 1: print(colorize(msg, color='magenta')) start_time = time.time() yield print( colorize("done in {:.3f} seconds".format( (time.time() - start_time)), color='magenta')) else: yield @contextmanager def temp_seed(seed): state = np.random.get_state() np.random.seed(seed) try: yield finally: np.random.set_state(state) def allmean(arr): assert isinstance(arr, np.ndarray) out = np.empty_like(arr) MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) out /= self.nworkers return out tf_util.initialize(sess=self.sess) th_init = self.get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) self.set_from_flat(th_init) with tf.variable_scope("Adam_mpi", reuse=False): self.vfadam = MpiAdam(vf_var_list, sess=self.sess) self.vf_phi_adam = MpiAdam(vf_phi_var_list, sess=self.sess) self.vfadam.sync() self.vf_phi_adam.sync() with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize)) tf.summary.scalar('advantage', tf.reduce_mean(atarg)) tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl)) self.timed = timed self.allmean = allmean 
self.temp_seed = temp_seed self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state self.params = tf_util.get_trainable_vars( "model") + tf_util.get_trainable_vars("oldpi") self.summary = tf.summary.merge_all() self.compute_lossandgrad = \ tf_util.function([observation, old_policy.obs_ph, action, atarg, ret], [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy_tf = self.policy( self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy_tf.obs_ph self.processed_next_obs_ph = self.target_policy_tf.processed_obs self.action_target = self.target_policy_tf.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None, ) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy self.policy_out = policy_out = self.policy_tf.make_actor( self.processed_obs_ph) # Use two Q-functions to improve performance by reducing overestimation bias qf1, qf2 = self.policy_tf.make_critics( self.processed_obs_ph, self.actions_ph) # Q value when following the current policy qf1_pi, _ = self.policy_tf.make_critics( self.processed_obs_ph, policy_out, reuse=True) with tf.variable_scope("target", reuse=False): # Create target networks target_policy_out = self.target_policy_tf.make_actor( self.processed_next_obs_ph) # Target policy smoothing, by adding clipped noise to target actions target_noise = tf.random_normal( tf.shape(target_policy_out), stddev=self.target_policy_noise) target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip) # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh) noisy_target_action = tf.clip_by_value( target_policy_out + target_noise, -1, 1) # Q values when following the target policy qf1_target, qf2_target = self.target_policy_tf.make_critics( self.processed_next_obs_ph, noisy_target_action) with tf.variable_scope("loss", reuse=False): # Take the min of the two target Q-Values (clipped Double-Q Learning) min_qf_target = tf.minimum(qf1_target, qf2_target) # Targets for Q value regression q_backup = tf.stop_gradient(self.rewards_ph + (1 - self.terminals_ph) * self.gamma * min_qf_target) # Compute Q-Function loss qf1_loss = tf.reduce_mean((q_backup - qf1)**2) qf2_loss = tf.reduce_mean((q_backup - qf2)**2) qvalues_losses = qf1_loss + qf2_loss # Policy loss: maximise q value self.policy_loss = policy_loss = -tf.reduce_mean(qf1_pi) # Policy train op # will be called only every n training steps, # where n is the policy delay policy_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize( policy_loss, var_list=tf_util.get_trainable_vars('model/pi')) self.policy_train_op = policy_train_op # Q Values optimizer qvalues_optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph) qvalues_params = tf_util.get_trainable_vars( 'model/values_fn/') # Q Values and policy target params source_params = tf_util.get_trainable_vars("model/") target_params = 
tf_util.get_trainable_vars("target/") # Polyak averaging for target variables self.target_ops = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] train_values_op = qvalues_optimizer.minimize( qvalues_losses, var_list=qvalues_params) self.infos_names = ['qf1_loss', 'qf2_loss'] # All ops to call during one training step self.step_ops = [ qf1_loss, qf2_loss, qf1, qf2, train_values_op ] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target/") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all()
def setup_model(self):
    with SetVerbosity(self.verbose):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            self.replay_buffer = ReplayBuffer(self.buffer_size)

            with tf.variable_scope("input", reuse=False):
                # Create policy and target TF objects
                self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                             **self.policy_kwargs)
                self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                    **self.policy_kwargs)

                # Initialize Placeholders
                self.observations_ph = self.policy_tf.obs_ph
                # Normalized observation for pixels
                self.processed_obs_ph = self.policy_tf.processed_obs
                self.next_observations_ph = self.target_policy_tf.obs_ph
                self.processed_next_obs_ph = self.target_policy_tf.processed_obs
                self.action_target = self.target_policy_tf.action_ph
                self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals')
                self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
                self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape,
                                                 name='actions')
                self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")
                self.risk_factor_ph = tf.placeholder(tf.float32, [], name='risk_factor_ph')

            with tf.variable_scope("model", reuse=False):
                # Create the policy
                self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph)
                #double_policy = self.policy_tf.make_actor(self.processed_next_obs_ph, reuse=True)

                # Use two Q-functions to improve performance by reducing overestimation bias
                if self.model_type == "QR":
                    self.qrtau = tf.tile(
                        tf.reshape(tf.range(0.5 / self.n_support, 1, 1 / self.n_support),
                                   [1, self.n_support]),
                        [tf.shape(self.processed_obs_ph)[0], 1])
                    qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph,
                                                           n_support=self.n_support)
                    # Q value when following the current policy
                    qrtau_pi = self.qrtau
                    qf1_pi, _ = self.policy_tf.make_critics(self.processed_obs_ph, policy_out,
                                                            reuse=True, n_support=self.n_support)
                elif self.model_type == "IQN":
                    self.qrtau = tf.random_uniform([tf.shape(self.processed_obs_ph)[0], self.n_support],
                                                   minval=self.tau_clamp, maxval=1.0 - self.tau_clamp)
                    qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph,
                                                           model_type=self.model_type, iqn_tau=self.qrtau,
                                                           n_support=self.n_support)
                    # Q value when following the current policy
                    qrtau_pi = tf.random_uniform([tf.shape(self.processed_obs_ph)[0], self.n_support],
                                                 minval=self.tau_clamp, maxval=1.0 - self.tau_clamp)
                    qf1_pi, _ = self.policy_tf.make_critics(self.processed_obs_ph, policy_out,
                                                            model_type=self.model_type, iqn_tau=qrtau_pi,
                                                            reuse=True, n_support=self.n_support)

            with tf.variable_scope("target", reuse=False):
                # Create target networks
                target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph)
                # Target policy smoothing, by adding clipped noise to target actions
                target_noise = tf.random_normal(tf.shape(target_policy_out),
                                                stddev=self.target_policy_noise)
                target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip,
                                                self.target_noise_clip)
                # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh)
                noisy_target_action = tf.clip_by_value(target_policy_out + target_noise,
                                                       -1 + 1e-2, 1 - 1e-2)

                # Q values when following the target policy
                if self.model_type == "QR":
                    target_qrtau = self.qrtau
                    qf1_target, qf2_target = self.target_policy_tf.make_critics(
                        self.processed_next_obs_ph, noisy_target_action, n_support=self.n_support)
                elif self.model_type == "IQN":
                    target_qrtau = tf.random_uniform(
                        [tf.shape(self.processed_next_obs_ph)[0], self.n_support],
                        minval=self.tau_clamp, maxval=1.0 - self.tau_clamp)
                    qf1_target, qf2_target = self.target_policy_tf.make_critics(
                        self.processed_next_obs_ph, noisy_target_action, model_type=self.model_type,
                        iqn_tau=target_qrtau, n_support=self.n_support)

            with tf.variable_scope("loss", reuse=False):
                quantile_weight = 1.0 - self.risk_factor_ph * (2.0 * qrtau_pi - 1.0)
                min_quantile = tf.reduce_mean(qf1_pi[:, 0])
                max_quantile = tf.reduce_mean(qf1_pi[:, -1])

                #min_qf_target = tf.minimum(qf1_target, qf2_target)
                #max_arg = tf.argmax(target_qrtau, axis=-1)
                qf1_t_flag = qf1_target[:, -1]
                qf2_t_flag = qf2_target[:, -1]
                #qf1_t_flag = qf1_target[:, max_arg]
                #qf2_t_flag = qf2_target[:, max_arg]
                #qf1_t_flag = tf.reduce_max(qf1_target, axis=-1)
                #qf2_t_flag = tf.reduce_max(qf2_target, axis=-1)
                #min_flag = qf1_t_flag > qf2_t_flag
                min_flag = qf1_t_flag < qf2_t_flag
                min_qf_target = tf.where(min_flag, qf1_target, qf2_target)

                # Targets for Q value regression
                q_backup = tf.stop_gradient(self.rewards_ph +
                                            (1.0 - self.terminals_ph) * self.gamma * min_qf_target)

                # Compute Q-Function loss
                qrtau = tf.tile(tf.expand_dims(self.qrtau, axis=2), [1, 1, self.n_support])
                #qrtau = tf.tile(tf.expand_dims(self.qrtau, axis=1), [1, self.n_support, 1])
                #mulmax = 2.0
                logit_valid_tile = tf.tile(tf.expand_dims(q_backup, axis=1), [1, self.n_support, 1])

                theta_loss_tile = tf.tile(tf.expand_dims(qf1, axis=2), [1, 1, self.n_support])
                Huber_loss = tf.compat.v1.losses.huber_loss(logit_valid_tile, theta_loss_tile,
                                                            reduction=tf.losses.Reduction.NONE,
                                                            delta=self.kappa) / self.kappa
                bellman_errors = logit_valid_tile - theta_loss_tile
                Loss = tf.abs(qrtau - tf.stop_gradient(tf.to_float(bellman_errors < 0))) * Huber_loss
                qf1_losses = tf.reduce_mean(tf.reduce_sum(Loss, axis=1), axis=1)
                #qf1_gmul = qf1_losses - tf.reduce_min(qf1_losses)
                #qf1_gmul = 1.0 + mulmax*qf1_gmul/tf.reduce_max(qf1_gmul)  #(1.0 - mulmax) + 2*mulmax*qf1_gmul/tf.reduce_max(qf1_gmul)
                qf1_loss = tf.reduce_mean(qf1_losses)

                theta_loss_tile = tf.tile(tf.expand_dims(qf2, axis=2), [1, 1, self.n_support])
                Huber_loss = tf.compat.v1.losses.huber_loss(logit_valid_tile, theta_loss_tile,
                                                            reduction=tf.losses.Reduction.NONE,
                                                            delta=self.kappa) / self.kappa
                bellman_errors = logit_valid_tile - theta_loss_tile
                Loss = tf.abs(qrtau - tf.stop_gradient(tf.to_float(bellman_errors < 0))) * Huber_loss
                qf2_losses = tf.reduce_mean(tf.reduce_sum(Loss, axis=1), axis=1)
                #qf2_gmul = qf2_losses - tf.reduce_min(qf2_losses)
                #qf2_gmul = 1.0 + mulmax*qf2_gmul/tf.reduce_max(qf2_gmul)  #(1.0 - mulmax) + 2*mulmax*qf2_gmul/tf.reduce_max(qf2_gmul)
                qf2_loss = tf.reduce_mean(qf2_losses)

                qvalues_losses = qf1_loss + qf2_loss

                # Policy loss: maximise q value
                self.policy_loss = policy_loss = -tf.reduce_mean(
                    tf.multiply(qf1_pi, quantile_weight))  # + policy_update_ratio

                # Q Values optimizer
                #qvalues_optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph)
                qvalues_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                #qvalues_optimizer = tf.contrib.opt.NadamOptimizer(learning_rate=self.learning_rate_ph)
                qvalues_params = tf_util.get_trainable_vars('model/values_fn/')

                # Q Values and policy target params
                source_params = tf_util.get_trainable_vars("model/")
                target_params = tf_util.get_trainable_vars("target/")

                # Policy train op
                # will be called only every n training steps,
                # where n is the policy delay
                #policy_optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph)
                policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                #policy_optimizer = tf.contrib.opt.NadamOptimizer(learning_rate=self.learning_rate_ph)

                # Initializing target to match source variables
                self.target_init_op = tf.group([
                    tf.assign(target, source)
                    for target, source in zip(target_params, source_params)
                ])

                train_values_op = qvalues_optimizer.minimize(qvalues_losses, var_list=qvalues_params)
                #grad_values = tf.gradients(qvalues_losses, qvalues_params)
                #grad_values = list(zip(grad_values, qvalues_params))

                with tf.control_dependencies([train_values_op]):
                    policy_train_op = policy_optimizer.minimize(
                        policy_loss, var_list=tf_util.get_trainable_vars('model/pi'))
                    #grad_policy = tf.gradients(policy_loss, tf_util.get_trainable_vars('model/pi'))
                    #grad_policy = list(zip(grad_policy, tf_util.get_trainable_vars('model/pi')))
                    self.policy_train_op = policy_train_op

                    with tf.control_dependencies([self.policy_train_op]):
                        # Polyak averaging for target variables
                        self.target_ops = tf.group([
                            tf.assign(target, (1.0 - self.tau) * target + self.tau * source)
                            for target, source in zip(target_params, source_params)
                        ])

                self.infos_names = ['qf1_loss', 'qf2_loss']

                # All ops to call during one training step
                self.step_ops = [qf1_loss, qf2_loss, qf1, qf2, train_values_op]

                # Monitor losses and entropy in tensorboard
                tf.summary.scalar('policy_loss', policy_loss)
                tf.summary.scalar('min_quantile', min_quantile)
                tf.summary.scalar('max_quantile', max_quantile)
                tf.summary.scalar('qf1_loss', qf1_loss)
                tf.summary.scalar('qf2_loss', qf2_loss)
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))

                '''
                for grad, var in grad_values + grad_policy:
                    tf.summary.histogram(var.name, var)
                    tf.summary.histogram(var.name + '/gradient', grad)
                '''

            # Retrieve parameters that must be saved
            self.params = tf_util.get_trainable_vars("model")
            self.target_params = tf_util.get_trainable_vars("target/")

            # Initialize Variables and target network
            with self.sess.as_default():
                self.sess.run(tf.global_variables_initializer())
                self.sess.run(self.target_init_op)

            self.summary = tf.summary.merge_all()
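# --- Illustration (not part of the model) ----------------------------------
# A minimal NumPy sketch of the quantile Huber loss assembled in the "loss"
# scope above: predicted quantiles (theta) are tiled along one axis, the
# backup targets along the other, and the asymmetric weight
# |tau - 1(target - theta < 0)| scales an elementwise Huber term divided by
# kappa, summed over predicted quantiles and averaged over target samples.
# Also shown: the risk-distortion weight 1 - eta * (2 * tau - 1) applied to
# qf1_pi in the policy loss. Function names and shapes are illustrative only.
import numpy as np


def _quantile_huber_loss_sketch(theta, target, tau, kappa=1.0):
    # theta, target: (batch, n_support); tau: (batch, n_support)
    u = target[:, None, :] - theta[:, :, None]          # Bellman errors, (batch, n, n)
    huber = np.where(np.abs(u) <= kappa,
                     0.5 * u ** 2,
                     kappa * (np.abs(u) - 0.5 * kappa)) / kappa
    weight = np.abs(tau[:, :, None] - (u < 0).astype(np.float64))
    # sum over predicted quantiles, mean over target samples, mean over batch
    return np.mean(np.mean(np.sum(weight * huber, axis=1), axis=1))


def _risk_weight_sketch(tau, risk_factor):
    # risk_factor > 0 down-weights upper quantiles (risk-averse policy loss)
    return 1.0 - risk_factor * (2.0 * tau - 1.0)

# Usage (commented to avoid side effects at import time):
# theta = np.random.randn(4, 8); target = np.random.randn(4, 8)
# tau = np.tile((np.arange(8) + 0.5) / 8, (4, 1))
# _quantile_huber_loss_sketch(theta, target, tau, kappa=1.0)
# _risk_weight_sketch(tau, risk_factor=0.5)
# ----------------------------------------------------------------------------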