def validate_probtype(probtype, pdparam):
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdfromflat(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = tf.get_default_session().run(pd.sample(), feed_dict={M: Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = -logliks.mean()  # pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
    entval = calcent(Mval).mean()  # pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # Check to see if kldiv[p, q] = -ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdfromflat(M2)
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean()  # pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = -entval - logliks.mean()  # pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
    print('ok on', probtype, pdparam)
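For reference, the identity the first check relies on, E_p[-log p(x)] = H(p), can be verified in isolation with plain NumPy; the Gaussian parameters below are arbitrary example values and not taken from the code above.

import numpy as np

# Standalone check (assumed example): the Monte Carlo mean of -log p(x) over
# samples from N(mu, sigma^2) matches the closed-form differential entropy.
rng = np.random.default_rng(0)
n, mu, sigma = 100_000, 1.3, 0.7

samples = mu + sigma * rng.standard_normal(n)
neg_logp = 0.5 * np.log(2 * np.pi * sigma**2) + 0.5 * ((samples - mu) / sigma) ** 2

mc_entropy = neg_logp.mean()                        # Monte Carlo estimate of H(p)
stderr = neg_logp.std() / np.sqrt(n)                # standard error of that estimate
closed_form = 0.5 * np.log(2 * np.pi * np.e * sigma**2)

assert abs(mc_entropy - closed_form) < 3 * stderr   # within 3 sigmas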
def test_mpi_adam():
    """
    tests the MpiAdam object's functionality
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    a_var = tf.Variable(np.random.randn(3).astype('float32'))
    b_var = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a_var)) + tf.reduce_sum(tf.sin(b_var))

    learning_rate = 1e-2
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    do_update = tf_utils.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    for step in range(10):
        print(step, do_update())

    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a_var, b_var]
    lossandgrad = tf_utils.function(
        [], [loss, tf_utils.flatgrad(loss, var_list)], updates=[update_op])
    adam = MpiAdam(var_list)

    for step in range(10):
        loss, grad = lossandgrad()
        adam.update(grad, learning_rate)
        print(step, loss)
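MpiAdam synchronizes workers by averaging flat gradients over MPI before the local Adam step. A minimal sketch of that averaging step using mpi4py directly; the helper name is ours, not part of the library.

import numpy as np
from mpi4py import MPI

def allmean_flat_grad(local_grad):
    """Average a flat gradient vector across MPI workers (the core idea behind MpiAdam).

    `local_grad` is assumed to be a 1-D numpy array of flattened gradients
    computed on this worker.
    """
    comm = MPI.COMM_WORLD
    buf = np.zeros_like(local_grad)
    comm.Allreduce(local_grad, buf, op=MPI.SUM)   # sum over all workers
    return buf / comm.Get_size()                  # turn the sum into a mean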
def __init__(self, env, hidden_size, entcoeff=0.001, scope="adversary"): """ reward regression from observations and transitions :param env: (Gym Environment) :param hidden_size: ([int]) the hidden dimension for the MLP :param entcoeff: (float) the entropy loss weight :param scope: (str) tensorflow variable scope """ self.scope = scope self.observation_shape = env.observation_space.shape self.actions_shape = env.action_space.shape self.input_shape = tuple([ o + a for o, a in zip(self.observation_shape, self.actions_shape) ]) self.num_actions = env.action_space.shape[0] self.hidden_size = hidden_size self.build_ph() # Build grpah generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False) expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True) # Build accuracy generator_acc = tf.reduce_mean( tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5)) expert_acc = tf.reduce_mean( tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5)) # Build regression loss # let x = logits, z = targets. # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) generator_loss = tf.nn.sigmoid_cross_entropy_with_logits( logits=generator_logits, labels=tf.zeros_like(generator_logits)) generator_loss = tf.reduce_mean(generator_loss) expert_loss = tf.nn.sigmoid_cross_entropy_with_logits( logits=expert_logits, labels=tf.ones_like(expert_logits)) expert_loss = tf.reduce_mean(expert_loss) # Build entropy loss logits = tf.concat([generator_logits, expert_logits], 0) entropy = tf.reduce_mean(logit_bernoulli_entropy(logits)) entropy_loss = -entcoeff * entropy # Loss + Accuracy terms self.losses = [ generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc ] self.loss_name = [ "generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc" ] self.total_loss = generator_loss + expert_loss + entropy_loss # Build Reward for policy self.reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8) var_list = self.get_trainable_variables() self.lossandgrad = tf_util.function([ self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph ], self.losses + [tf_util.flatgrad(self.total_loss, var_list)])
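The reward_op built above turns discriminator logits into a GAIL-style reward, -log(1 - sigmoid(logits) + 1e-8). A small standalone NumPy illustration of how that mapping behaves (not library code):

import numpy as np

def gail_reward(logits, eps=1e-8):
    """Reward of the form -log(1 - sigmoid(logits) + eps).

    Large positive logits (discriminator thinks 'expert') give large rewards;
    large negative logits (clearly 'generated') give rewards near zero.
    """
    p_expert = 1.0 / (1.0 + np.exp(-np.asarray(logits, dtype=np.float64)))
    return -np.log(1.0 - p_expert + eps)

print(gail_reward([-5.0, 0.0, 5.0]))  # roughly [0.0067, 0.693, 5.0]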
def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None): """Creates the act function: :param make_obs_ph: (function (str): TensorFlow Tensor) a function that take a name and creates a placeholder of input with that name :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor) the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. :param num_actions: (int) number of actions. :param scope: (str or VariableScope) optional scope for variable_scope. :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given. :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) act function to select and action given observation. See the top of the file for details. """ with tf.variable_scope(scope, reuse=reuse): observations_ph = make_obs_ph("observation") stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) q_values = q_func(observations_ph.get(), num_actions, scope="q_func") deterministic_actions = tf.argmax(q_values, axis=1) batch_size = tf.shape(observations_ph.get())[0] random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64) chose_random = tf.random_uniform( tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) update_eps_expr = eps.assign( tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) _act = tf_utils.function( inputs=[observations_ph, stochastic_ph, update_eps_ph], outputs=output_actions, givens={ update_eps_ph: -1.0, stochastic_ph: True }, updates=[update_eps_expr]) def act(obs, stochastic=True, update_eps=-1): return _act(obs, stochastic, update_eps) return act
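The graph above implements epsilon-greedy selection. A NumPy analogue of the same selection rule, for intuition (values below are made-up examples):

import numpy as np

# With probability eps each row takes a uniformly random action,
# otherwise the greedy argmax of its Q-values.
rng = np.random.default_rng(0)
q_values = rng.standard_normal((5, 4))                 # (batch_size, num_actions)
eps = 0.1
greedy = q_values.argmax(axis=1)
random_actions = rng.integers(0, q_values.shape[1], size=len(q_values))
chose_random = rng.uniform(size=len(q_values)) < eps
actions = np.where(chose_random, random_actions, greedy)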
def build_act(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess, obs_phs=None): """ Creates the act function: :param q_func: (DQNPolicy) the policy :param ob_space: (Gym Space) The observation space of the environment :param ac_space: (Gym Space) The action space of the environment :param stochastic_ph: (TensorFlow Tensor) the stochastic placeholder :param update_eps_ph: (TensorFlow Tensor) the update_eps placeholder :param sess: (TensorFlow session) The current TensorFlow session :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor, (TensorFlow Tensor, TensorFlow Tensor) act function to select and action given observation (See the top of the file for details), A tuple containing the observation placeholder and the processed observation placeholder respectivly. """ eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) policy = q_func(sess, ob_space, ac_space, 1, 1, None, obs_phs=obs_phs) obs_phs = (policy.obs_ph, policy.processed_obs) #mask = tf.one_hot(mask_idx, depth=541, dtype=tf.float32) mask = tf.placeholder(dtype=tf.float32) feasible_q_values = tf.math.multiply(policy.q_values, mask) deterministic_actions = tf.argmax(feasible_q_values, axis=1) batch_size = tf.shape(policy.obs_ph)[0] n_actions = ac_space.nvec if isinstance(ac_space, MultiDiscrete) else ac_space.n random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=n_actions, dtype=tf.int64) chose_random = tf.random_uniform( tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) update_eps_expr = eps.assign( tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) _act = tf_util.function( inputs=[policy.obs_ph, stochastic_ph, update_eps_ph, mask], outputs=output_actions, givens={ update_eps_ph: -1.0, stochastic_ph: True }, updates=[update_eps_expr]) def act(obs, stochastic=False, update_eps=-1, mask=None): return _act(obs, stochastic, update_eps, mask) return act, obs_phs
def validate_probtype(probtype, pdparam): """ validate probability distribution types :param probtype: (ProbabilityDistributionType) the type to validate :param pdparam: ([float]) the flat probabilities to test """ number_samples = 100000 # Check to see if mean negative log likelihood == differential entropy mval = np.repeat(pdparam[None, :], number_samples, axis=0) mval_ph = probtype.param_placeholder([number_samples]) action_mask_ph = probtype.param_placeholder([number_samples]) action_mask_ph = tf.placeholder_with_default( tf.zeros_like(action_mask_ph), shape=np.shape(action_mask_ph)) xval_ph = probtype.sample_placeholder([number_samples]) proba_distribution = probtype.proba_distribution_from_flat(mval_ph) calcloglik = tf_util.function([xval_ph, mval_ph], proba_distribution.logp(xval_ph)) calcent = tf_util.function([mval_ph], proba_distribution.entropy()) xval = tf.get_default_session().run(proba_distribution.sample(), feed_dict={mval_ph: mval}) logliks = calcloglik(xval, mval) entval_ll = -logliks.mean() entval_ll_stderr = logliks.std() / np.sqrt(number_samples) entval = calcent(mval).mean() assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] mval2_ph = probtype.param_placeholder([number_samples]) action_mask_ph2 = probtype.param_placeholder([number_samples]) action_mask_ph2 = tf.placeholder_with_default( tf.zeros_like(action_mask_ph2), shape=np.shape(action_mask_ph2)) pd2 = probtype.proba_distribution_from_flat(mval2_ph) tmp = pdparam + np.random.randn(pdparam.size) * 0.1 mval2 = np.repeat(tmp[None, :], number_samples, axis=0) calckl = tf_util.function([mval_ph, mval2_ph], proba_distribution.kl(pd2)) klval = calckl(mval, mval2).mean() logliks = calcloglik(xval, mval2) klval_ll = -entval - logliks.mean() klval_ll_stderr = logliks.std() / np.sqrt(number_samples) assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas print('ok on', probtype, pdparam)
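The second check uses the identity KL(p‖q) = -H(p) - E_p[log q]. A standalone NumPy sketch of that identity for two univariate Gaussians (parameters are arbitrary example values):

import numpy as np

rng = np.random.default_rng(0)
n = 100_000
mu_p, sig_p = 0.0, 1.0
mu_q, sig_q = 0.3, 1.2

x = mu_p + sig_p * rng.standard_normal(n)                       # samples from p
logq = -0.5 * np.log(2 * np.pi * sig_q**2) - 0.5 * ((x - mu_q) / sig_q) ** 2

entropy_p = 0.5 * np.log(2 * np.pi * np.e * sig_p**2)           # closed-form H(p)
kl_mc = -entropy_p - logq.mean()                                # the identity above
kl_closed = (np.log(sig_q / sig_p)
             + (sig_p**2 + (mu_p - mu_q) ** 2) / (2 * sig_q**2) - 0.5)

assert abs(kl_mc - kl_closed) < 3 * logq.std() / np.sqrt(n)     # within 3 sigmas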
def __init__(self, epsilon=1e-2, shape=()):
    """
    Calculates the running mean and std of a data stream
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm

    :param epsilon: (float) helps with arithmetic issues
    :param shape: (tuple) the shape of the data stream's output
    """
    self._sum = tf.compat.v1.get_variable(
        dtype=tf.float64,
        shape=shape,
        initializer=tf.compat.v1.constant_initializer(0.0),
        name="runningsum",
        trainable=False)
    self._sumsq = tf.compat.v1.get_variable(
        dtype=tf.float64,
        shape=shape,
        initializer=tf.compat.v1.constant_initializer(epsilon),
        name="runningsumsq",
        trainable=False)
    self._count = tf.compat.v1.get_variable(
        dtype=tf.float64,
        shape=(),
        initializer=tf.compat.v1.constant_initializer(epsilon),
        name="count",
        trainable=False)
    self.shape = shape

    self.mean = tf.cast(self._sum / self._count, tf.float32)
    self.std = tf.sqrt(
        tf.maximum(
            tf.cast(self._sumsq / self._count, tf.float32) - tf.square(self.mean),
            1e-2))

    newsum = tf.compat.v1.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    newsumsq = tf.compat.v1.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    newcount = tf.compat.v1.placeholder(shape=[], dtype=tf.float64, name='count')
    self.incfiltparams = tf_util.function(
        [newsum, newsumsq, newcount], [],
        updates=[
            tf.compat.v1.assign_add(self._sum, newsum),
            tf.compat.v1.assign_add(self._sumsq, newsumsq),
            tf.compat.v1.assign_add(self._count, newcount)
        ])
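A minimal NumPy sketch of the same bookkeeping, assuming the filter is fed batches of data; names are illustrative, not the library's:

import numpy as np

class RunningMeanStdNp:
    """Track sum, sum of squares and count; derive mean and std from them."""

    def __init__(self, shape=(), epsilon=1e-2):
        self._sum = np.zeros(shape, dtype=np.float64)
        self._sumsq = np.full(shape, epsilon, dtype=np.float64)
        self._count = epsilon

    def update(self, batch):
        batch = np.asarray(batch, dtype=np.float64)
        self._sum += batch.sum(axis=0)
        self._sumsq += np.square(batch).sum(axis=0)
        self._count += batch.shape[0]

    @property
    def mean(self):
        return self._sum / self._count

    @property
    def std(self):
        # same clamp as the TF graph above: std is never below sqrt(1e-2)
        return np.sqrt(np.maximum(self._sumsq / self._count - np.square(self.mean), 1e-2))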
def test_function():
    """
    test the function function in tf_util
    """
    with tf.Graph().as_default():
        x_ph = tf.placeholder(tf.int32, (), name="x")
        y_ph = tf.placeholder(tf.int32, (), name="y")
        z_ph = 3 * x_ph + 2 * y_ph
        linear_fn = function([x_ph, y_ph], z_ph, givens={y_ph: 0})

        with single_threaded_session():
            initialize()

            assert linear_fn(2) == 6
            assert linear_fn(2, 2) == 10
def test_multikwargs():
    """
    test the function function in tf_util
    """
    with tf.Graph().as_default():
        x_ph = tf.placeholder(tf.int32, (), name="x")
        with tf.variable_scope("other"):
            x2_ph = tf.placeholder(tf.int32, (), name="x")
        z_ph = 3 * x_ph + 2 * x2_ph
        linear_fn = function([x_ph, x2_ph], z_ph, givens={x2_ph: 0})

        with single_threaded_session():
            initialize()

            assert linear_fn(2) == 6
            assert linear_fn(2, 2) == 10
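Both tests exercise a `function(inputs, outputs, givens)` helper. A rough sketch of what such a helper does, assuming TF1 sessions; this is an illustration, not the tf_util implementation:

import tensorflow as tf

def make_function(inputs, outputs, givens=None):
    """Close over placeholders and return a callable that builds a feed_dict.

    `givens` provides default values that positional arguments can override,
    which is exactly the behavior the two asserts above rely on.
    """
    givens = dict(givens or {})

    def call(*args):
        feed = dict(givens)
        feed.update(dict(zip(inputs, args)))   # later args override the defaults
        return tf.get_default_session().run(outputs, feed_dict=feed)

    return call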
def build_act(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess): """ Creates the act function: :param q_func: (DQNPolicy) the policy :param ob_space: (Gym Space) The observation space of the environment :param ac_space: (Gym Space) The action space of the environment :param stochastic_ph: (TensorFlow Tensor) the stochastic placeholder :param update_eps_ph: (TensorFlow Tensor) the update_eps placeholder :param sess: (TensorFlow session) The current TensorFlow session :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor, (TensorFlow Tensor, TensorFlow Tensor) act function to select and action given observation (See the top of the file for details), A tuple containing the observation placeholder and the processed observation placeholder respectivly. """ eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) policy = q_func(sess, ob_space, ac_space, 1, 1, None) obs_phs = (policy.obs_ph, policy.processed_obs) deterministic_actions = tf.argmax(tf.add(policy.q_values, policy.action_mask_ph), axis=1) batch_size = tf.shape(policy.obs_ph)[0] random_actions = tf.distributions.Categorical(probs=policy.action_mask_probs_ph, dtype=tf.int64).sample() chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) _act = tf_util.function(inputs=[policy.obs_ph, stochastic_ph, update_eps_ph, policy.action_mask_ph, policy.action_mask_probs_ph], outputs=output_actions, givens={update_eps_ph: -1.0, stochastic_ph: True}, updates=[update_eps_expr]) def act(obs, stochastic=True, update_eps=-1, action_mask=None): if action_mask is not None: return _act(obs, stochastic, update_eps, policy.prepare_action_mask(action_mask), action_mask) else: return _act(obs, stochastic, update_eps) return act, obs_phs
def setup_model(self): # prevent import loops from stable_baselines.gail.adversary import TransitionClassifier with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \ "an instance of common.policies.ActorCriticPolicy." self.nworkers = MPI.COMM_WORLD.Get_size() self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) if self.using_gail: self.reward_giver = TransitionClassifier(self.observation_space, self.action_space, self.hidden_size_adversary, entcoeff=self.adversary_entcoeff) # Penalty related variable with tf.variable_scope('penalty'): cur_cost_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # episodic cost param_init = np.log(max(np.exp(self.penalty_init) - 1, 1e-8)) penalty_param = tf.get_variable('penalty_param', initializer=float(param_init), trainable=True, dtype=tf.float32) penalty = tf.nn.softplus(penalty_param) penalty_loss = tf.reduce_mean(-penalty_param * (cur_cost_ph - self.cost_lim)) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.variable_scope("oldpi", reuse=False): old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # # Network for safety value function # with tf.variable_Scope("vc",reuse=False): # self.cost_value = MLPValue(self.sess, self.observation_spacem, self.n_envs, 1, None) with tf.variable_scope("loss", reuse=False): atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return catarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target cost advantage function cret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical cost observation = self.policy_pi.obs_ph action = self.policy_pi.pdtype.sample_placeholder([None]) kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = self.entcoeff * meanent vferr = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret)) vcerr = tf.reduce_mean(tf.square(self.policy_pi.vcf_flat - cret)) # advantage * pnew / pold ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) - old_policy.proba_distribution.logp(action)) surrgain = tf.reduce_mean(ratio * atarg) # Surrogate for cost function surrcost = tf.reduce_mean(ratio * catarg) optimgain = surrgain + entbonus # Include surr_cost in pi_objective optimgain -= penalty * surrcost optimgain /= (1 + penalty) # # Loss function for pi is negative of pi_objective # optimgain = -optimgain # Should we?? 
losses = [optimgain, meankl, entbonus, surrgain, meanent, surrcost] self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy", "surrcost"] dist = meankl all_var_list = tf_util.get_trainable_vars("model") var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name and "/vcf" not in v.name] # policy parameters vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vcf" not in v.name] # value parameters vcf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vf" not in v.name] # cost value parameters self.get_flat = tf_util.GetFlat(var_list, sess=self.sess) self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: var_size = tf_util.intprod(shape) tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape)) start += var_size gvp = tf.add_n([tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 # Fisher vector products fvp = tf_util.flatgrad(gvp, var_list) tf.summary.scalar('penalty_loss', penalty_loss) tf.summary.scalar('entropy_loss', meanent) tf.summary.scalar('policy_gradient_loss', optimgain) tf.summary.scalar('value_function_loss', surrgain) tf.summary.scalar('constraint_cost_function_loss', surrcost) tf.summary.scalar('approximate_kullback-leibler', meankl) tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent + surrcost + penalty_loss) self.assign_old_eq_new = \ tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model"))]) self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg], losses) self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg, catarg], fvp) # Why need all inputs? Might for implementation easiness # self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret], # tf_util.flatgrad(vferr, vf_var_list)) # Why need old_policy.obs_ph? 
Doesn't seem to be used # self.compute_vcflossandgrad = tf_util.function([observation, old_policy.obs_ph, cret], # tf_util.flatgrad(vcerr, vcf_var_list)) self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret, cret], [tf_util.flatgrad(vferr, vf_var_list), tf_util.flatgrad(vcerr, vcf_var_list)]) self.compute_lagrangiangrad = tf_util.function([cur_cost_ph], tf_util.flatgrad(penalty_loss, [penalty_param])) @contextmanager def timed(msg): if self.rank == 0 and self.verbose >= 1: print(colorize(msg, color='magenta')) start_time = time.time() yield print(colorize("done in {:.3f} seconds".format((time.time() - start_time)), color='magenta')) else: yield def allmean(arr): assert isinstance(arr, np.ndarray) out = np.empty_like(arr) MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) out /= self.nworkers return out tf_util.initialize(sess=self.sess) th_init = self.get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) self.set_from_flat(th_init) with tf.variable_scope("Adam_mpi", reuse=False): self.vfadam = MpiAdam(vf_var_list, sess=self.sess) if self.using_gail: self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess) self.d_adam.sync() self.vfadam.sync() # optimizer for constraint costs value function self.vcadam = MpiAdam(vcf_var_list, sess=self.sess) self.vcadam.sync() # optimizer for lagragian value of safe RL self.penaltyadam = MpiAdam([penalty_param], sess=self.sess) self.penaltyadam.sync() with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) tf.summary.scalar('discounted_costs', tf.reduce_mean(cret)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize)) tf.summary.scalar('advantage', tf.reduce_mean(atarg)) tf.summary.scalar('cost_advantage', tf.reduce_mean(catarg)) tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', ret) tf.summary.histogram('discounted_rewards', cret) tf.summary.histogram('learning_rate', self.vf_stepsize) tf.summary.histogram('penalty_learning_rate', self.penalty_lr) tf.summary.histogram('advantage', atarg) tf.summary.histogram('cost_advantage', catarg) tf.summary.histogram('kl_clip_range', self.max_kl) if tf_util.is_image(self.observation_space): tf.summary.image('observation', observation) else: tf.summary.histogram('observation', observation) self.timed = timed self.allmean = allmean self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi") if self.using_gail: self.params.extend(self.reward_giver.get_trainable_variables()) self.summary = tf.summary.merge_all() self.compute_lossandgrad = \ tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg, ret, cret, cur_cost_ph], [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
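The penalty variables wired up in this model implement a Lagrangian-style constraint: penalty = softplus(penalty_param) and penalty_loss = -penalty_param * (cost - cost_lim), so descending penalty_loss raises the multiplier whenever measured cost exceeds the limit and lowers it otherwise. A toy numeric sketch of that dynamic; the step size and episode costs are made up:

import numpy as np

param, lr, cost_lim = 0.0, 0.05, 25.0
for episode_cost in [40.0, 35.0, 20.0, 18.0]:
    grad = -(episode_cost - cost_lim)      # d(penalty_loss)/d(param)
    param -= lr * grad                     # gradient descent on penalty_loss
    penalty = np.log1p(np.exp(param))      # softplus keeps the multiplier positive
    print(round(param, 3), round(penalty, 3))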
def learn(env, policy, value_fn, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002): """ Trains an ACKTR model. :param env: (Gym environment) The environment to learn from :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...) :param value_fn: (Object) The value function model to use (MLP, CNN, LSTM, ...) :param gamma: (float) The discount value :param lam: (float) the tradeoff between exploration and exploitation :param timesteps_per_batch: (int) the number of timesteps for each batch :param num_timesteps: (int) the total number of timesteps to run :param animate: (bool) if render env :param callback: (function) called every step, used for logging and saving :param desired_kl: (float) the Kullback leibler weight for the loss """ obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize * (1 - 0.9), momentum=0.9, kfac_update=2, epsilon=1e-2, stats_decay=0.99, async_eigen_decomp=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = tf_util.function(inputs, update_op) tf_util.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for queue_runner in [q_runner, value_fn.q_runner]: assert queue_runner is not None enqueue_threads.extend( queue_runner.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) timesteps_this_batch += path["reward"].shape[0] timesteps_so_far += path["reward"].shape[0] if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = value_fn.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function value_fn.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl_loss = policy.compute_kl(ob_no, oldac_dist) if kl_loss > desired_kl * 2: logger.log("kl too high") tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() elif kl_loss < desired_kl / 2: logger.log("kl too low") tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() else: logger.log("kl just right!") logger.record_tabular( "EpRewMean", 
np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular( "EpLenMean", np.mean([path["reward"].shape[0] for path in paths])) logger.record_tabular("KL", kl_loss) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) == False: break logger.dump_tabular() i += 1 coord.request_stop() coord.join(enqueue_threads)
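The advantage computation inside the loop above is generalized advantage estimation: one-step TD errors discounted by gamma * lam. A compact NumPy sketch with made-up rewards and value predictions:

import numpy as np

def discount(x, rate):
    """Reverse cumulative sum with discounting, matching common.discount's role."""
    out, running = np.zeros_like(x, dtype=np.float64), 0.0
    for t in reversed(range(len(x))):
        running = x[t] + rate * running
        out[t] = running
    return out

gamma, lam = 0.99, 0.97
rew = np.array([1.0, 0.0, 1.0])
vpred = np.array([0.5, 0.4, 0.6, 0.0])          # bootstrap value appended, 0 if terminal
delta = rew + gamma * vpred[1:] - vpred[:-1]    # one-step TD errors
adv = discount(delta, gamma * lam)              # GAE advantages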
def __init__(self, ob_dim, ac_dim): """ Create a gaussian MLP policy :param ob_dim: (int) Observation dimention :param ac_dim: (int) action dimention """ # Here we'll construct a bunch of expressions, which will be used in two places: # (1) When sampling actions # (2) When computing loss functions, for the policy update # Variables specific to (1) have the word "sampled" in them, # whereas variables specific to (2) have the word "old" in them ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2], name="ob") # batch of observations oldac_na = tf.placeholder( tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions # batch of actions previous action distributions oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim * 2], name="oldac_dist") adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate wd_dict = {} layer_1 = tf.nn.tanh( dense(ob_no, 64, "h1", weight_init=tf_util.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) layer_2 = tf.nn.tanh( dense(layer_1, 64, "h2", weight_init=tf_util.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) mean_na = dense(layer_2, ac_dim, "mean", weight_init=tf_util.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output self.wd_dict = wd_dict # Variance on outputs self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) logstd_1a = tf.expand_dims(logstd_1a, 0) std_1a = tf.exp(logstd_1a) std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1]) ac_dist = tf.concat([ tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim]) ], 1) # This is the sampled action we'll perform. sampled_ac_na = tf.random_normal(tf.shape( ac_dist[:, ac_dim:])) * ac_dist[:, ac_dim:] + ac_dist[:, :ac_dim] logprobsampled_n = -tf.reduce_sum(tf.log( ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log( 2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum( tf.square(ac_dist[:, :ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:, ac_dim:])), axis=1) # Logprob of sampled action logprob_n = -tf.reduce_sum( tf.log(ac_dist[:, ac_dim:]), axis=1 ) - 0.5 * tf.log(2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum( tf.square(ac_dist[:, :ac_dim] - oldac_na) / (tf.square(ac_dist[:, ac_dim:])), axis=1 ) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy) kl_loss = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim)) # kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, # and new policy used to compute logprob_n surr = -tf.reduce_mean( adv_n * logprob_n ) # Loss function that we'll differentiate to get the policy gradient surr_sampled = -tf.reduce_mean(logprob_n) # Sampled loss of the policy # Generate a new action and its logprob self._act = tf_util.function( [ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy self.compute_kl = tf_util.function([ob_no, oldac_dist], kl_loss) # Input and output variables needed for computing loss self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) tf_util.initialize() # Initialize uninitialized TF variables
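logprob_n and logprobsampled_n above are the log-density of a diagonal Gaussian. The closed form they implement, written out as a standalone NumPy function (illustrative, not the class API):

import numpy as np

def diag_gaussian_logp(action, mean, std):
    """log N(a | mu, diag(std^2))
       = -sum(log std) - 0.5 * k * log(2*pi) - 0.5 * sum(((a - mu) / std)^2)"""
    action, mean, std = map(np.asarray, (action, mean, std))
    k = action.shape[-1]
    return (-np.sum(np.log(std), axis=-1)
            - 0.5 * k * np.log(2.0 * np.pi)
            - 0.5 * np.sum(((action - mean) / std) ** 2, axis=-1))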
def build_train(q_func, ob_space, ac_space, sess, num_actions, num_action_streams, batch_size, learning_rate=5e-4, aggregator='reduceLocalMean', optimizer_name="Adam", grad_norm_clipping=None, gamma=0.99, double_q=True, scope="bdq", reuse=None, losses_version=2, independent=False, dueling=True, target_version="mean", loss_type="L2", full_tensorboard_log=False): """Creates the act function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int total number of sub-actions to be represented at the output num_action_streams: int specifies the number of action branches in action value (or advantage) function representation batch_size: int size of the sampled mini-batch from the replay buffer reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for deep Q-learning grad_norm_clipping: float or None clip graident norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q-Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. BDQ uses it. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. losses_version: int specifies the version number for merging of losses across the branches version 2 is the best-performing loss used for BDQ. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select an action given an observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" assert independent and losses_version == 4 or not independent, 'independent needs to be used along with loss v4' assert independent and target_version == "indep" or not independent, 'independent needs to be used along with independent TD targets' with tf.variable_scope("input", reuse=reuse): stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") with tf.variable_scope(scope, reuse=reuse): act_f, obs_phs = build_act(q_func, ob_space, ac_space, sess, num_actions, num_action_streams, stochastic_ph, update_eps_ph) # Q-network evaluation with tf.variable_scope( "step_model", reuse=True, custom_getter=tf_util.outer_scope_getter("step_model")): step_model = q_func(sess, ob_space, ac_space, 1, 1, None, num_actions, reuse=True, obs_phs=obs_phs) # reuse parameters from act q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/model") # Target Q-network evalution with tf.variable_scope("target_q_func", reuse=False): target_policy = q_func(sess, ob_space, ac_space, 1, 1, None, num_actions, reuse=False) target_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # compute estimate of best possible value starting from state at t + 1 double_obs_ph = target_policy.obs_ph if double_q: with tf.variable_scope( "q_func", reuse=True, custom_getter=tf_util.outer_scope_getter("q_func")): selection_q_tp1 = q_func(sess, ob_space, ac_space, 1, 1, None, num_actions, reuse=True) double_obs_ph = selection_q_tp1.obs_ph else: selection_q_tp1 = target_policy num_actions_pad = num_actions // num_action_streams with tf.variable_scope("loss", reuse=reuse): act_t_ph = tf.placeholder(tf.int32, [None, num_action_streams], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") q_values = [] for dim in range(num_action_streams): selected_a = tf.squeeze( tf.slice(act_t_ph, [0, dim], [batch_size, 1])) # TODO better? q_values.append( tf.reduce_sum(tf.one_hot(selected_a, num_actions_pad) * step_model.q_values[dim], axis=1)) if target_version == "indep": target_q_values = [] for dim in range(num_action_streams): selected_a = tf.argmax(selection_q_tp1.q_values[dim], axis=1) selected_q = tf.reduce_sum( tf.one_hot(selected_a, num_actions_pad) * target_policy.q_values[dim], axis=1) masked_selected_q = (1.0 - done_mask_ph) * selected_q target_q = rew_t_ph + gamma * masked_selected_q target_q_values.append(target_q) elif target_version == "max": for dim in range(num_action_streams): selected_a = tf.argmax(selection_q_tp1.q_values[dim], axis=1) selected_q = tf.reduce_sum( tf.one_hot(selected_a, num_actions_pad) * target_policy.q_values[dim], axis=1) masked_selected_q = (1.0 - done_mask_ph) * selected_q if dim == 0: max_next_q_values = masked_selected_q else: max_next_q_values = tf.maximum(max_next_q_values, masked_selected_q) target_q_values = [rew_t_ph + gamma * max_next_q_values ] * num_action_streams # TODO better? 
elif target_version == "mean": for dim in range(num_action_streams): selected_a = tf.argmax(selection_q_tp1.q_values[dim], axis=1) selected_q = tf.reduce_sum( tf.one_hot(selected_a, num_actions_pad) * target_policy.q_values[dim], axis=1) masked_selected_q = (1.0 - done_mask_ph) * selected_q if dim == 0: mean_next_q_values = masked_selected_q else: mean_next_q_values += masked_selected_q mean_next_q_values /= num_action_streams target_q_values = [rew_t_ph + gamma * mean_next_q_values ] * num_action_streams # TODO better? else: assert False, 'unsupported target version ' + str(target_version) if optimizer_name == "Adam": optimizer = tf.train.AdamOptimizer(learning_rate) else: assert False, 'unsupported optimizer ' + str(optimizer_name) if loss_type == "L2": loss_function = tf.square elif loss_type == "Huber": loss_function = tf_util.huber_loss else: assert False, 'unsupported loss type ' + str(loss_type) if losses_version == 1: mean_q_value = sum(q_values) / num_action_streams mean_target_q_value = sum(target_q_values) / num_action_streams td_error = mean_q_value - tf.stop_gradient(mean_target_q_value) loss = loss_function(td_error) weighted_mean_loss = tf.reduce_mean(importance_weights_ph * loss) optimize_expr = minimize_and_clip( optimizer, weighted_mean_loss, var_list=q_func_vars, total_n_streams=(num_action_streams + (1 if dueling else 0)), clip_val=grad_norm_clipping) optimize_expr = [optimize_expr] tf.summary.scalar("loss", weighted_mean_loss) else: stream_losses = [] for dim in range(num_action_streams): dim_td_error = q_values[dim] - tf.stop_gradient( target_q_values[dim]) dim_loss = loss_function(dim_td_error) # Scaling of learning based on importance sampling weights is optional, either way works stream_losses.append( tf.reduce_mean(dim_loss * importance_weights_ph)) # with scaling #stream_losses.append(tf.reduce_mean(dim_loss)) # without scaling if dim == 0: td_error = tf.abs(dim_td_error) else: td_error += tf.abs(dim_td_error) #td_error /= num_action_streams mean_loss = sum(stream_losses) / num_action_streams optimize_expr = minimize_and_clip( optimizer, mean_loss, var_list=q_func_vars, total_n_streams=(num_action_streams + (1 if dueling else 0)), clip_val=grad_norm_clipping) optimize_expr = [optimize_expr] tf.summary.scalar("loss", mean_loss) ## FIXME tf summary scalars are wrong. They are not matching the original code. 
tf.summary.scalar("td_error", tf.reduce_mean(td_error)) if full_tensorboard_log: tf.summary.histogram("td_error", td_error) # Target Q-network parameters are periodically updated with the Q-network's update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('rewards', tf.reduce_mean(rew_t_ph)) tf.summary.scalar('importance_weights', tf.reduce_mean(importance_weights_ph)) if full_tensorboard_log: tf.summary.histogram('rewards', rew_t_ph) tf.summary.histogram('importance_weights', importance_weights_ph) if tf_util.is_image(obs_phs[0]): tf.summary.image('observation', obs_phs[0]) elif len(obs_phs[0].shape) == 1: tf.summary.histogram('observation', obs_phs[0]) # optimize_expr = optimizer.apply_gradients(gradients) summary = tf.summary.merge_all() train = tf_util.function( inputs=[ obs_phs[0], act_t_ph, rew_t_ph, target_policy.obs_ph, #obs_tp1_input, double_obs_ph, done_mask_ph, importance_weights_ph ], outputs=[summary, td_error], updates=[optimize_expr]) update_target = tf_util.function([], [], updates=[update_target_expr]) # q_values = tf_util.function([obs_phs[0]], step_model) return act_f, train, update_target, step_model #{'q_values': q_values}
def __init__(self, observation_space, action_space, hidden_size, entcoeff=0.001, scope="adversary", normalize=True): """ Reward regression from observations and transitions :param observation_space: (gym.spaces) :param action_space: (gym.spaces) :param hidden_size: ([int]) the hidden dimension for the MLP :param entcoeff: (float) the entropy loss weight :param scope: (str) tensorflow variable scope :param normalize: (bool) Whether to normalize the reward or not """ # TODO: support images properly (using a CNN) self.scope = scope self.observation_shape = observation_space.shape self.actions_shape = action_space.shape if isinstance(action_space, gym.spaces.Box): # Continuous action space self.discrete_actions = False self.n_actions = action_space.shape[0] elif isinstance(action_space, gym.spaces.Discrete): self.n_actions = action_space.n self.discrete_actions = True else: raise ValueError( 'Action space not supported: {}'.format(action_space)) self.hidden_size = hidden_size self.normalize = normalize self.obs_rms = None # Placeholders self.generator_obs_ph = tf.compat.v1.placeholder( observation_space.dtype, (None, ) + self.observation_shape, name="observations_ph") self.generator_acs_ph = tf.compat.v1.placeholder( action_space.dtype, (None, ) + self.actions_shape, name="actions_ph") self.expert_obs_ph = tf.compat.v1.placeholder( observation_space.dtype, (None, ) + self.observation_shape, name="expert_observations_ph") self.expert_acs_ph = tf.compat.v1.placeholder( action_space.dtype, (None, ) + self.actions_shape, name="expert_actions_ph") # Build graph generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False) expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True) # Build accuracy generator_acc = tf.reduce_mean(input_tensor=tf.cast( tf.nn.sigmoid(generator_logits) < 0.5, tf.float32)) expert_acc = tf.reduce_mean(input_tensor=tf.cast( tf.nn.sigmoid(expert_logits) > 0.5, tf.float32)) # Build regression loss # let x = logits, z = targets. # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) generator_loss = tf.nn.sigmoid_cross_entropy_with_logits( logits=generator_logits, labels=tf.zeros_like(generator_logits)) generator_loss = tf.reduce_mean(input_tensor=generator_loss) expert_loss = tf.nn.sigmoid_cross_entropy_with_logits( logits=expert_logits, labels=tf.ones_like(expert_logits)) expert_loss = tf.reduce_mean(input_tensor=expert_loss) # Build entropy loss logits = tf.concat([generator_logits, expert_logits], 0) entropy = tf.reduce_mean(input_tensor=logit_bernoulli_entropy(logits)) entropy_loss = -entcoeff * entropy # Loss + Accuracy terms self.losses = [ generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc ] self.loss_name = [ "generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc" ] self.total_loss = generator_loss + expert_loss + entropy_loss # Build Reward for policy self.reward_op = -tf.math.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8) var_list = self.get_trainable_variables() self.lossandgrad = tf_util.function([ self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph ], self.losses + [tf_util.flatgrad(self.total_loss, var_list)])
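The entropy bonus uses logit_bernoulli_entropy, which is typically the Bernoulli entropy expressed in logit space for numerical stability. A NumPy check, our own and not library code, that one common logit-space form matches the textbook definition:

import numpy as np

# Claimed identity (assumption about the usual formulation):
#   H(x) = (1 - sigmoid(x)) * x + softplus(-x)
# equals -p*log(p) - (1-p)*log(1-p) with p = sigmoid(x).
x = np.linspace(-6, 6, 13)
p = 1.0 / (1.0 + np.exp(-x))
softplus_neg_x = np.log1p(np.exp(-np.abs(x))) + np.maximum(-x, 0)   # stable softplus(-x)
h_logit = (1.0 - p) * x + softplus_neg_x
h_naive = -p * np.log(p) - (1.0 - p) * np.log(1.0 - p)
assert np.allclose(h_logit, h_naive)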
def setup_model(self): # prevent import loops with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \ "an instance of common.policies.ActorCriticPolicy." self.nworkers = MPI.COMM_WORLD.Get_size() print("number of workers are", self.nworkers) self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.single_threaded_session(graph=self.graph) self._setup_learn(self.seed) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.variable_scope("oldpi", reuse=False): old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for phi with tf.variable_scope("phi", reuse=False): self.policy_phi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for phi old with tf.variable_scope("oldphi", reuse=False): self.policy_phi_old = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): atarg = tf.placeholder(dtype=tf.float32, shape=[ None ]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return observation = self.policy_pi.obs_ph action = self.policy_pi.pdtype.sample_placeholder([None]) kloldnew = old_policy.proba_distribution.kl( self.policy_pi.proba_distribution) #kloldnew = self.policy_pi.proba_distribution.kl(old_policy.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = self.entcoeff * meanent vferr = tf.reduce_mean( tf.square(self.policy_pi.value_flat - ret)) vf_phi_err = tf.reduce_mean( tf.square(self.policy_phi.value_flat - ret)) vf_phi_old_err = tf.reduce_mean( tf.square(self.policy_phi_old.value_flat)) # advantage * pnew / pold ratio = tf.exp( self.policy_pi.proba_distribution.logp(action) - old_policy.proba_distribution.logp(action)) surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] self.loss_names = [ "optimgain", "meankl", "entloss", "surrgain", "entropy" ] dist = meankl all_var_list = tf_util.get_trainable_vars("model") var_list = [ v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name ] vf_var_list = [ v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name ] all_var_oldpi_list = tf_util.get_trainable_vars("oldpi") var_oldpi_list = [ v for v in all_var_oldpi_list if "/vf" not in v.name and "/q/" not in v.name ] all_var_phi_list = tf_util.get_trainable_vars("phi") vf_phi_var_list = [ v for v in all_var_phi_list if "/pi" not in v.name and "/logstd" not in v.name and "/q" not in v.name ] all_var_phi_old_list = tf_util.get_trainable_vars("oldphi") vf_phi_old_var_list = [ v for v in all_var_phi_old_list if "/pi" not in v.name and "/logstd" not in v.name and "/q" not in v.name ] #print("vars", vf_var_list) self.policy_vars = all_var_list self.oldpolicy_vars = all_var_oldpi_list print("all var list", all_var_list) print("phi vars", vf_phi_var_list) print("phi old vars", vf_phi_old_var_list) self.get_flat = tf_util.GetFlat(var_list, 
sess=self.sess) self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: var_size = tf_util.intprod(shape) tangents.append( tf.reshape(flat_tangent[start:start + var_size], shape)) start += var_size gvp = tf.add_n([ tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = tf_util.flatgrad(gvp, var_list) tf.summary.scalar('entropy_loss', meanent) tf.summary.scalar('policy_gradient_loss', optimgain) tf.summary.scalar('value_function_loss', surrgain) tf.summary.scalar('approximate_kullback-leibler', meankl) tf.summary.scalar( 'loss', optimgain + meankl + entbonus + surrgain + meanent) self.assign_old_eq_new = \ tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model"))]) self.compute_losses = tf_util.function( [observation, old_policy.obs_ph, action, atarg], losses) self.compute_fvp = tf_util.function([ flat_tangent, observation, old_policy.obs_ph, action, atarg ], fvp) self.compute_vflossandgrad = tf_util.function( [observation, old_policy.obs_ph, ret], tf_util.flatgrad(vferr, vf_var_list)) self.compute_vf_phi_lossandgrad = tf_util.function( [observation, self.policy_phi.obs_ph, ret], tf_util.flatgrad(vf_phi_err, vf_phi_var_list)) self.compute_vf_loss = tf_util.function( [observation, old_policy.obs_ph, ret], vferr) self.compute_vf_phi_loss = tf_util.function( [observation, self.policy_phi.obs_ph, ret], vf_phi_err) #self.compute_vf_phi_old_loss = tf_util.function([self.policy_phi_old.obs_ph], vf_phi_old_err) #self.phi_old_obs = np.array([-0.012815 , -0.02076313, 0.07524705, 0.09407324, 0.0901745 , -0.09339058, 0.03544853, -0.03297224]) #self.phi_old_obs = self.phi_old_obs.reshape((1, 8)) update_phi_old_expr = [] for var, var_target in zip( sorted(vf_phi_var_list, key=lambda v: v.name), sorted(vf_phi_old_var_list, key=lambda v: v.name)): update_phi_old_expr.append(var_target.assign(var)) update_phi_old_expr = tf.group(*update_phi_old_expr) self.update_phi_old = tf_util.function( [], [], updates=[update_phi_old_expr]) @contextmanager def timed(msg): if self.rank == 0 and self.verbose >= 1: print(colorize(msg, color='magenta')) start_time = time.time() yield print( colorize("done in {:.3f} seconds".format( (time.time() - start_time)), color='magenta')) else: yield @contextmanager def temp_seed(seed): state = np.random.get_state() np.random.seed(seed) try: yield finally: np.random.set_state(state) def allmean(arr): assert isinstance(arr, np.ndarray) out = np.empty_like(arr) MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) out /= self.nworkers return out tf_util.initialize(sess=self.sess) th_init = self.get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) self.set_from_flat(th_init) with tf.variable_scope("Adam_mpi", reuse=False): self.vfadam = MpiAdam(vf_var_list, sess=self.sess) self.vf_phi_adam = MpiAdam(vf_phi_var_list, sess=self.sess) self.vfadam.sync() self.vf_phi_adam.sync() with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize)) tf.summary.scalar('advantage', tf.reduce_mean(atarg)) tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl)) self.timed = timed self.allmean = allmean 
self.temp_seed = temp_seed self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state self.params = tf_util.get_trainable_vars( "model") + tf_util.get_trainable_vars("oldpi") self.summary = tf.summary.merge_all() self.compute_lossandgrad = \ tf_util.function([observation, old_policy.obs_ph, action, atarg, ret], [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
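compute_fvp above returns Fisher-vector products, which TRPO-style methods consume inside conjugate gradient to solve F x = g without ever materializing F. A self-contained sketch of that consumer (generic CG, not this code's exact routine):

import numpy as np

def conjugate_gradient(fvp, g, iters=10, tol=1e-10):
    """Solve F x = g given only a function computing F @ v."""
    x = np.zeros_like(g)
    r = g.copy()                 # residual g - F x with x = 0
    p = g.copy()
    rr = r @ r
    for _ in range(iters):
        z = fvp(p)
        alpha = rr / (p @ z)
        x += alpha * p
        r -= alpha * z
        new_rr = r @ r
        if new_rr < tol:
            break
        p = r + (new_rr / rr) * p
        rr = new_rr
    return x

# toy check with an SPD matrix standing in for the Fisher matrix
F = np.array([[2.0, 0.3], [0.3, 1.0]])
g = np.array([1.0, -0.5])
x = conjugate_gradient(lambda v: F @ v, g)
assert np.allclose(F @ x, g, atol=1e-6)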
def build_act(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess, action_filter=None, OnExpo=False, threshold=None): # NKAM """ Creates the act function: :param q_func: (DQNPolicy) the policy :param ob_space: (Gym Space) The observation space of the environment :param ac_space: (Gym Space) The action space of the environment :param stochastic_ph: (TensorFlow Tensor) the stochastic placeholder :param update_eps_ph: (TensorFlow Tensor) the update_eps placeholder :param sess: (TensorFlow session) The current TensorFlow session :param action_filter: (function (TensorFlow Tensor): TensorFlow Tensor) filters actions according to a criterion :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor, (TensorFlow Tensor, TensorFlow Tensor) act function to select and action given observation (See the top of the file for details), A tuple containing the observation placeholder and the processed observation placeholder respectivly. """ eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) policy = q_func(sess, ob_space, ac_space, 1, 1, None) obs_phs = (policy.obs_ph, policy.processed_obs) batch_size = tf.shape(policy.obs_ph)[0] n_actions = ac_space.nvec if isinstance(ac_space, MultiDiscrete) else ac_space.n if action_filter != None: actions_mask = tf.py_func(func=action_filter, inp=[tf.constant(n_actions)], Tout=[tf.bool for _ in range(0, n_actions)]) total_possible_actions = tf.reduce_sum(tf.cast(actions_mask, dtype=tf.int64)) actions_mask = tf.reshape(tf.tile(actions_mask, [batch_size]), [batch_size, tf.shape(actions_mask)[0]]) if threshold != None: legal_q_values = tf.boolean_mask(policy.q_values, actions_mask) max_legal_q_value = tf.math.reduce_max(legal_q_values) max_legal_q_value = tf.math.scalar_mul(threshold, max_legal_q_value) threshold_tensor = tf.tile(tf.reshape(max_legal_q_value, (1,)), tf.constant(n_actions, dtype=tf.int64, shape=(1,))) threshold_tensor = tf.reshape(tf.tile(threshold_tensor, [batch_size]), [batch_size, tf.shape(threshold_tensor)[0]]) threshold_mask = tf.math.greater(policy.q_values, threshold_tensor) actions_mask = tf.math.logical_or(actions_mask, threshold_mask) # TODO: how to avoid getting stuck??? 
# if actions_mask == "empty": # actions_mask = None else: actions_mask = tf.constant([True for _ in range(0, n_actions)], dtype=tf.bool) total_possible_actions = n_actions actions_mask = tf.reshape(tf.tile(actions_mask, [batch_size]), [batch_size, tf.shape(actions_mask)[0]]) q_values_mask = tf.constant([-float('inf') for _ in range(0, n_actions)]) q_values_mask = tf.reshape(tf.tile(q_values_mask, [batch_size]), [batch_size, tf.shape(q_values_mask)[0]]) masked_q_values = tf.where(actions_mask, policy.q_values, q_values_mask) deterministic_actions = tf.argmax(masked_q_values, axis=1) possible_actions = tf.boolean_mask(tf.constant([action for action in range(0, n_actions)], dtype=tf.int64), actions_mask[0]) masked_random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=total_possible_actions, dtype=tf.int64) if not OnExpo: random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=n_actions, dtype=tf.int64) else: random_actions = tf.gather(possible_actions, masked_random_actions) # random_actions = tf.gather(possible_actions, masked_random_actions, batch_dims=0) chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) _act = tf_util.function(inputs=[policy.obs_ph, stochastic_ph, update_eps_ph], outputs=output_actions, givens={update_eps_ph: -1.0, stochastic_ph: True}, updates=[update_eps_expr]) def act(obs, stochastic=True, update_eps=-1): return _act(obs, stochastic, update_eps) return act, obs_phs
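The masking above replaces infeasible actions with -inf before the argmax so they can never be chosen greedily. The same idea in NumPy, with made-up Q-values:

import numpy as np

q_values = np.array([0.4, -1.2, 0.7, 0.1])
mask = np.array([True, False, True, False])     # True = legal action
masked_q = np.where(mask, q_values, -np.inf)
greedy_action = int(np.argmax(masked_q))        # -> 2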
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    # observation_space and action_space
    #   ob_space = env.observation_space
    #   ac_space = env.action_space
    obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space)
    # returns:
    #   obs    :: placeholder
    #   pdtype :: object holding the distribution info of the action_space

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    # DeepMind-style observation normalization
    obz = tf.clip_by_value((obs - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # ===========================[Value function prediction model]===================================
    # dense() returns tf.nn.bias_add(tf.matmul(input_tensor, weight), bias).
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            dense(last_out, hid_size, "vffc%i" % (i + 1),
                  weight_init=tf_util.normc_initializer(1.0)))
    self.vpred = dense(last_out, 1, "vffinal",
                       weight_init=tf_util.normc_initializer(1.0))[:, 0]
    # dense(last_out, 1) returns shape [None, 1], so take [:, 0] to get [None].

    # ===========================[Policy function model]=============================================
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            dense(last_out, hid_size, "polfc%i" % (i + 1),
                  weight_init=tf_util.normc_initializer(1.0)))

    # If the action space of the chosen gym environment is a Box, the concatenation of the
    # action distribution's mean and std is used as the action output, as below.
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                     tf_util.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal",
                        tf_util.normc_initializer(0.01))

    # The action output of this model is stored in the following proba_distribution.
    self.proba_distribution = pdtype.proba_distribution_from_flat(pdparam)
    self.state_in = []
    self.state_out = []

    # Since this class inherits from ppo1/mlp_policy, we redefine the act function defined
    # there: once _act is defined, act() returns the action / value.
    self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=(), name="stochastic")
    action = tf_util.switch(self.stochastic_ph,
                            self.proba_distribution.sample(),
                            self.proba_distribution.mode())
    self.action = action
    self._act = tf_util.function([self.stochastic_ph, obs], [action, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, num_options=2, dc=0): assert isinstance(ob_space, gym.spaces.Box) self.dc = dc self.num_options = num_options self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None]) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) # normalization obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense(last_out, hid_size, kernel_initializer=U.normc_initializer(1.0), name="vffc%i" % (i + 1))) self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:, 0] self.tpred = tf.nn.sigmoid( dense3D2(tf.stop_gradient(last_out), 1, "termhead", option, num_options=num_options, weight_init=U.normc_initializer(1.0)))[:, 0] termination_sample = tf.greater( self.tpred, tf.random_uniform(shape=tf.shape(self.tpred), maxval=1.)) # input to policy last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense(last_out, hid_size, name="polfc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option, num_options=num_options, weight_init=U.normc_initializer(0.01)) logstd = tf.get_variable( name="logstd", shape=[num_options, 1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd[option[0]]], axis=1) else: pdparam = tf.layers.dense( last_out, pdtype.param_shape()[0], name="polfinal", kernel_initializer=U.normc_initializer(0.01)) # select option self.op_pi = tf.nn.softmax( tf.layers.dense(tf.stop_gradient(last_out), num_options, name="OPfc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self._act = U.function([stochastic, ob, option], [ac, self.vpred, last_out, logstd]) self._get_v = U.function([ob, option], [self.vpred]) self.get_term = U.function([ob, option], [termination_sample]) self.get_tpred = U.function([ob, option], [self.tpred]) self.get_vpred = U.function([ob, option], [self.vpred]) self._get_op = U.function([ob], [self.op_pi])
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4, adam_epsilon=1e-5, optim_stepsize=3e-4, ckpt_dir=None, task_name=None, verbose=False): """ Learn a behavior clone policy, and return the save location :param env: (Gym Environment) the environment :param policy_func: (function (str, Gym Space, Gym Space): TensorFlow Tensor) creates the policy :param dataset: (Dset or MujocoDset) the dataset manager :param optim_batch_size: (int) the batch size :param max_iters: (int) the maximum number of iterations :param adam_epsilon: (float) the epsilon value for the adam optimizer :param optim_stepsize: (float) the optimizer stepsize :param ckpt_dir: (str) the save directory, can be None for temporary directory :param task_name: (str) the save name, can be None for saving directly to the directory name :param verbose: (bool) :return: (str) the save location for the TensorFlow model """ val_per_iter = int(max_iters / 10) ob_space = env.observation_space ac_space = env.action_space policy = policy_func("pi", ob_space, ac_space) # Construct network for new policy # placeholder obs_ph = policy.obs_ph action_ph = policy.pdtype.sample_placeholder([None]) stochastic_ph = policy.stochastic_ph loss = tf.reduce_mean(tf.square(action_ph - policy.ac)) var_list = policy.get_trainable_variables() adam = MpiAdam(var_list, epsilon=adam_epsilon) lossandgrad = tf_util.function([obs_ph, action_ph, stochastic_ph], [loss] + [tf_util.flatgrad(loss, var_list)]) tf_util.initialize() adam.sync() logger.log("Pretraining with Behavior Cloning...") for iter_so_far in tqdm(range(int(max_iters))): ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train') train_loss, grad = lossandgrad(ob_expert, ac_expert, True) adam.update(grad, optim_stepsize) if verbose and iter_so_far % val_per_iter == 0: ob_expert, ac_expert = dataset.get_next_batch(-1, 'val') val_loss, _ = lossandgrad(ob_expert, ac_expert, True) logger.log("Training loss: {}, Validation loss: {}".format( train_loss, val_loss)) if ckpt_dir is None: savedir_fname = tempfile.TemporaryDirectory().name else: savedir_fname = os.path.join(ckpt_dir, task_name) tf_util.save_state(savedir_fname, var_list=policy.get_variables()) return savedir_fname
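# --- Illustrative sketch (not part of the original source): the pretraining above is plain
# supervised regression of the policy's action onto the expert's action; the per-batch
# objective reduces to the mean squared error computed by the hypothetical helper below.
import numpy as np

def behavior_cloning_loss(policy_actions, expert_actions):
    """Mean squared error between the policy's actions and the expert's actions."""
    return np.mean(np.square(policy_actions - expert_actions))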
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.compat.v1.variable_scope("oldpi", reuse=False): old_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) with tf.compat.v1.variable_scope("loss", reuse=False): # Target advantage function (if applicable) atarg = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ret = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None]) # learning rate multiplier, updated with schedule lrmult = tf.compat.v1.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # Annealed cliping parameter epislon clip_param = self.clip_param * lrmult obs_ph = self.policy_pi.obs_ph action_ph = self.policy_pi.pdtype.sample_placeholder( [None]) kloldnew = old_pi.proba_distribution.kl( self.policy_pi.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(input_tensor=kloldnew) meanent = tf.reduce_mean(input_tensor=ent) pol_entpen = (-self.entcoeff) * meanent # pnew / pold ratio = tf.exp( self.policy_pi.proba_distribution.logp(action_ph) - old_pi.proba_distribution.logp(action_ph)) # surrogate from conservative policy iteration surr1 = ratio * atarg surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # PPO's pessimistic surrogate (L^CLIP) pol_surr = -tf.reduce_mean( input_tensor=tf.minimum(surr1, surr2)) vf_loss = tf.reduce_mean( input_tensor=tf.square(self.policy_pi.value_flat - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] self.loss_names = [ "pol_surr", "pol_entpen", "vf_loss", "kl", "ent" ] tf.compat.v1.summary.scalar('entropy_loss', pol_entpen) tf.compat.v1.summary.scalar('policy_gradient_loss', pol_surr) tf.compat.v1.summary.scalar('value_function_loss', vf_loss) tf.compat.v1.summary.scalar('approximate_kullback-leibler', meankl) tf.compat.v1.summary.scalar('clip_factor', clip_param) tf.compat.v1.summary.scalar('loss', total_loss) self.params = tf_util.get_trainable_vars("model") self.assign_old_eq_new = tf_util.function( [], [], updates=[ tf.compat.v1.assign(oldv, newv) for (oldv, newv) in zipsame( tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model")) ]) with tf.compat.v1.variable_scope("Adam_mpi", reuse=False): self.adam = MpiAdam(self.params, epsilon=self.adam_epsilon, sess=self.sess) with tf.compat.v1.variable_scope("input_info", reuse=False): tf.compat.v1.summary.scalar( 'discounted_rewards', tf.reduce_mean(input_tensor=ret)) tf.compat.v1.summary.scalar( 'learning_rate', tf.reduce_mean(input_tensor=self.optim_stepsize)) tf.compat.v1.summary.scalar( 'advantage', tf.reduce_mean(input_tensor=atarg)) tf.compat.v1.summary.scalar( 'clip_range', tf.reduce_mean(input_tensor=self.clip_param)) if self.full_tensorboard_log: tf.compat.v1.summary.histogram('discounted_rewards', ret) tf.compat.v1.summary.histogram('learning_rate', self.optim_stepsize) tf.compat.v1.summary.histogram('advantage', atarg) tf.compat.v1.summary.histogram('clip_range', self.clip_param) if tf_util.is_image(self.observation_space): tf.compat.v1.summary.image('observation', obs_ph) else: 
tf.compat.v1.summary.histogram( 'observation', obs_ph) self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state tf_util.initialize(sess=self.sess) self.summary = tf.compat.v1.summary.merge_all() self.lossandgrad = tf_util.function( [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], [self.summary, tf_util.flatgrad(total_loss, self.params)] + losses) self.compute_losses = tf_util.function( [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], losses)
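# --- Illustrative sketch (not part of the original source): the pessimistic surrogate built
# in setup_model() above, written out with NumPy. `ratio` is pi_new(a|s) / pi_old(a|s) and
# `atarg` is the advantage estimate; the function name is hypothetical.
import numpy as np

def ppo_clip_loss(ratio, atarg, clip_param):
    surr1 = ratio * atarg
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    return -np.mean(np.minimum(surr1, surr2))  # loss to be minimized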
def build_act_with_param_noise(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess, param_noise_filter_func=None): """ Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905): :param q_func: (DQNPolicy) the policy :param ob_space: (Gym Space) The observation space of the environment :param ac_space: (Gym Space) The action space of the environment :param stochastic_ph: (TensorFlow Tensor) the stochastic placeholder :param update_eps_ph: (TensorFlow Tensor) the update_eps placeholder :param sess: (TensorFlow session) The current TensorFlow session :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor, (TensorFlow Tensor, TensorFlow Tensor) act function to select and action given observation (See the top of the file for details), A tuple containing the observation placeholder and the processed observation placeholder respectivly. """ if param_noise_filter_func is None: param_noise_filter_func = default_param_noise_filter update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold") update_param_noise_scale_ph = tf.placeholder(tf.bool, (), name="update_param_noise_scale") reset_ph = tf.placeholder(tf.bool, (), name="reset") eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) param_noise_scale = tf.get_variable("param_noise_scale", (), initializer=tf.constant_initializer(0.01), trainable=False) param_noise_threshold = tf.get_variable("param_noise_threshold", (), initializer=tf.constant_initializer(0.05), trainable=False) # Unmodified Q. policy = q_func(sess, ob_space, ac_space, 1, 1, None) obs_phs = (policy.obs_ph, policy.processed_obs) # Perturbable Q used for the actual rollout. with tf.variable_scope("perturbed_model", reuse=False): perturbable_policy = q_func(sess, ob_space, ac_space, 1, 1, None, obs_phs=obs_phs) def perturb_vars(original_scope, perturbed_scope): """ We have to wrap this code into a function due to the way tf.cond() works. See https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for a more detailed discussion. :param original_scope: (str or VariableScope) the original scope. :param perturbed_scope: (str or VariableScope) the perturbed scope. :return: (TensorFlow Operation) """ all_vars = scope_vars(absolute_scope_name(original_scope)) all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope)) assert len(all_vars) == len(all_perturbed_vars) perturb_ops = [] for var, perturbed_var in zip(all_vars, all_perturbed_vars): if param_noise_filter_func(perturbed_var): # Perturb this variable. operation = tf.assign(perturbed_var, var + tf.random_normal(shape=tf.shape(var), mean=0., stddev=param_noise_scale)) else: # Do not perturb, just assign. operation = tf.assign(perturbed_var, var) perturb_ops.append(operation) assert len(perturb_ops) == len(all_vars) return tf.group(*perturb_ops) # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy # of the network and measures the effect of that perturbation in action space. If the perturbation # is too big, reduce scale of perturbation, otherwise increase. 
with tf.variable_scope("adaptive_model", reuse=False): adaptive_policy = q_func(sess, ob_space, ac_space, 1, 1, None, obs_phs=obs_phs) perturb_for_adaption = perturb_vars(original_scope="model", perturbed_scope="adaptive_model/model") kl_loss = tf.reduce_sum( tf.nn.softmax(policy.q_values) * (tf.log(tf.nn.softmax(policy.q_values)) - tf.log(tf.nn.softmax(adaptive_policy.q_values))), axis=-1) mean_kl = tf.reduce_mean(kl_loss) def update_scale(): """ update the scale expression :return: (TensorFlow Tensor) the updated scale expression """ with tf.control_dependencies([perturb_for_adaption]): update_scale_expr = tf.cond(mean_kl < param_noise_threshold, lambda: param_noise_scale.assign(param_noise_scale * 1.01), lambda: param_noise_scale.assign(param_noise_scale / 1.01), ) return update_scale_expr # Functionality to update the threshold for parameter space noise. update_param_noise_thres_expr = param_noise_threshold.assign( tf.cond(update_param_noise_threshold_ph >= 0, lambda: update_param_noise_threshold_ph, lambda: param_noise_threshold)) # Put everything together. perturbed_deterministic_actions = tf.argmax(perturbable_policy.q_values, axis=1) deterministic_actions = tf.argmax(policy.q_values, axis=1) batch_size = tf.shape(policy.obs_ph)[0] n_actions = ac_space.nvec if isinstance(ac_space, MultiDiscrete) else ac_space.n random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=n_actions, dtype=tf.int64) chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps perturbed_stochastic_actions = tf.where(chose_random, random_actions, perturbed_deterministic_actions) stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) perturbed_output_actions = tf.cond(stochastic_ph, lambda: perturbed_stochastic_actions, lambda: deterministic_actions) output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) updates = [ update_eps_expr, tf.cond(reset_ph, lambda: perturb_vars(original_scope="model", perturbed_scope="perturbed_model/model"), lambda: tf.group(*[])), tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)), update_param_noise_thres_expr, ] _act = tf_util.function(inputs=[policy.obs_ph, stochastic_ph, update_eps_ph], outputs=output_actions, givens={update_eps_ph: -1.0, stochastic_ph: True}, updates=[update_eps_expr]) _perturbed_act = tf_util.function( inputs=[policy.obs_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph], outputs=perturbed_output_actions, givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, update_param_noise_scale_ph: False}, updates=updates) def act(obs, reset=None, update_param_noise_threshold=None, update_param_noise_scale=None, stochastic=True, update_eps=-1): """ get the action from the current observation :param obs: (Any) Observation that can be feed into the output of make_obs_ph :param reset: (bool) reset the perturbed policy by sampling a new perturbation :param update_param_noise_threshold: (float) the desired threshold for the difference between non-perturbed and perturbed policy :param update_param_noise_scale: (bool) whether or not to update the scale of the noise for the next time it is re-perturbed :param stochastic: (bool) if set to False all the actions are always deterministic (default False) 
        :param update_eps: (float) update epsilon to a new value; if negative, no update happens (default: no update)
        :return: (TensorFlow Tensor) tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed
            for every element of the batch.
        """
        if reset is None or update_param_noise_threshold is None or update_param_noise_scale is None:
            return _act(obs, stochastic, update_eps)
        else:
            return _perturbed_act(obs, stochastic, update_eps, reset, update_param_noise_threshold,
                                  update_param_noise_scale)

    return act, obs_phs
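# --- Illustrative sketch (not part of the original source): the adaptive scheme above measures
# the KL divergence between the clean and the perturbed policy and nudges the noise scale so
# the perturbation stays close to a target threshold. A hypothetical scalar version:
def adapt_param_noise_scale(scale, mean_kl, threshold, factor=1.01):
    """Grow the scale while the perturbed policy is still close, shrink it otherwise."""
    return scale * factor if mean_kl < threshold else scale / factor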
def build_act_mod(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess): """ Original from build_graph.py from stable_baselines.deepq. This modified version returns the full sorted action array instead of the single best action. Creates the act function: :param q_func: (DQNPolicy) the policy :param ob_space: (Gym Space) The observation space of the environment :param ac_space: (Gym Space) The action space of the environment :param stochastic_ph: (TensorFlow Tensor) the stochastic placeholder :param update_eps_ph: (TensorFlow Tensor) the update_eps placeholder :param sess: (TensorFlow session) The current TensorFlow session :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor, (TensorFlow Tensor, TensorFlow Tensor) act function to select and action given observation (See the top of the file for details), A tuple containing the observation placeholder and the processed observation placeholder respectively. """ eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) policy = q_func(sess, ob_space, ac_space, 1, 1, None) obs_phs = (policy.obs_ph, policy.processed_obs) #################################################################### # MODIFICATION: # Get all sorted q_values instead of just the best one. Have to # cast to int64, since it comes back as int32 which is incompatible # with the random_actions, which is int64. deterministic_actions = tf.cast( tf.argsort(policy.q_values, axis=1, direction='DESCENDING'), tf.int64) #################################################################### # ORIGINAL: # Note that the original comes out as int64 since that's the default # from argmax. # deterministic_actions = tf.argmax(policy.q_values, axis=1) batch_size = tf.shape(policy.obs_ph)[0] n_actions = ac_space.nvec if isinstance(ac_space, MultiDiscrete) else ac_space.n #################################################################### # MODIFICATION: # TODO: It feels wrong not to use "batch_size" like was done in the # original, but this seems to be working. random_actions = tf.random_uniform((1, n_actions), minval=0, maxval=n_actions, dtype=tf.int64) #################################################################### # ORIGINAL: # random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=n_actions, dtype=tf.int64) chose_random = tf.random_uniform( tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) update_eps_expr = eps.assign( tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) _act = tf_util.function( inputs=[policy.obs_ph, stochastic_ph, update_eps_ph], outputs=output_actions, givens={ update_eps_ph: -1.0, stochastic_ph: True }, updates=[update_eps_expr]) def act(obs, stochastic=True, update_eps=-1): return _act(obs, stochastic, update_eps) return act, obs_phs
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): """ Creates the train function: :param make_obs_ph: (function (str): TensorFlow Tensor) a function that takes a name and creates a placeholder of input with that name :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor) the model that takes the following inputs: - observation_in: (Any) the output of observation placeholder - num_actions: int number of actions - scope: (str) - reuse: (bool) should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. :param num_actions: (int) number of actions :param reuse: (bool) whether or not to reuse the graph variables :param optimizer: (tf.train.Optimizer) optimizer to use for the Q-learning objective. :param grad_norm_clipping: (float) clip gradient norms to this value. If None no clipping is performed. :param gamma: (float) discount rate. :param double_q: (bool) if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. :param scope: (str or VariableScope) optional scope for variable_scope. :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given. :param param_noise: (bool) whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. :return: (tuple) act: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) function to select and action given observation. See the top of the file for details. train: (function (Any, numpy float, numpy float, Any, numpy bool, numpy float): numpy float) optimize the error in Bellman's equation. See the top of the file for details. update_target: (function) copy the parameters from optimized Q function to the target Q function. See the top of the file for details. debug: ({str: function}) a bunch of functions to print debug data like q_values. """ if param_noise: act_f = build_act_with_param_noise( make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = tf_utils.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_expr = optimizer.apply_gradients(gradients) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = tf_utils.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=td_error, updates=[optimize_expr]) update_target = tf_utils.function([], [], updates=[update_target_expr]) q_values = tf_utils.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values}
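# --- Illustrative sketch (not part of the original source): the Double-Q target above selects
# the argmax action with the online network but evaluates it with the target network. A
# hypothetical NumPy version of the same computation:
import numpy as np

def double_q_target(rewards, dones, q_tp1_online, q_tp1_target, gamma):
    best_actions = q_tp1_online.argmax(axis=1)
    q_tp1_best = q_tp1_target[np.arange(len(best_actions)), best_actions]
    return rewards + gamma * (1.0 - dones) * q_tp1_best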
def learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) num_options=2, app='', saves=False, wsaves=False, epoch=-1, seed=1, dc=0): optim_batchsize_ideal = optim_batchsize np.random.seed(seed) tf.set_random_seed(seed) # env._seed(seed) gamename = env.spec.id[:-3].lower() gamename += 'seed' + str(seed) gamename += app dirname = '{}_{}opts_saves/'.format(gamename, num_options) if wsaves: first = True if not os.path.exists(dirname): os.makedirs(dirname) first = False # while os.path.exists(dirname) and first: # dirname += '0' files = ['pposgd_simple.py', 'mlp_policy.py', 'run_main.py'] for i in range(len(files)): src = os.path.expanduser('~/baselines/baselines/ppo1/') + files[i] dest = os.path.expanduser('~/baselines/baselines/ppo1/') + dirname shutil.copy2(src, dest) # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return # option = tf.placeholder(dtype=tf.int32, shape=[None]) lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon # pdb.set_trace() ob = U.get_placeholder_cached(name="ob") option = U.get_placeholder_cached(name="option") term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None]) ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] term_loss = pi.tpred * term_adv log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0)) entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1) op_loss = -tf.reduce_sum(log_pi[0][option[0]] * atarg + entropy * 0.1) total_loss += op_loss var_list = pi.get_trainable_variables() term_list = var_list[6:8] lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option, term_adv], losses + [U.flatgrad(total_loss, var_list)]) termloss = U.function([ob, option, term_adv], [U.flatgrad(term_loss, var_list) ]) # Since we will use a different step size. 
adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses) U.initialize() adam.sync() saver = tf.train.Saver(max_to_keep=10000) results = [] if saves: results = open( gamename + '_' + str(num_options) + 'opts_' + '_results.csv', 'w') out = 'epoch,avg_reward' for opt in range(num_options): out += ',option {} dur'.format(opt) for opt in range(num_options): out += ',option {} std'.format(opt) for opt in range(num_options): out += ',option {} term'.format(opt) for opt in range(num_options): out += ',option {} adv'.format(opt) out += '\n' results.write(out) # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n') results.flush() if epoch >= 0: dirname = '{}_{}opts_saves/'.format(gamename, num_options) print("Loading weights from iteration: " + str(epoch)) filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch) saver.restore(U.get_session(), filename) episodes_so_far = 0 timesteps_so_far = 0 global iters_so_far iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, num_options=num_options, saves=saves, results=results, rewbuffer=rewbuffer, dc=dc) datas = [0 for _ in range(num_options)] while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) opt_d = [] for i in range(num_options): dur = np.mean( seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0. opt_d.append(dur) std = [] for i in range(num_options): logstd = np.mean( seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0. 
std.append(np.exp(logstd)) print("mean opt dur:", opt_d) print("mean op pol:", np.mean(np.array(seg['optpol_p']), axis=0)) print("mean term p:", np.mean(np.array(seg['term_p']), axis=0)) print("mean value val:", np.mean(np.array(seg['value_val']), axis=0)) ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg[ "adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values if iters_so_far % 5 == 0 and wsaves: print("weights are saved...") filename = dirname + '{}_epoch_{}.ckpt'.format( gamename, iters_so_far) save_path = saver.save(U.get_session(), filename) min_batch = 160 t_advs = [[] for _ in range(num_options)] for opt in range(num_options): indices = np.where(opts == opt)[0] print("batch size:", indices.size) opt_d[opt] = indices.size if not indices.size: t_advs[opt].append(0.) continue # This part is only necessasry when we use options. # We proceed to these verifications in order not to discard any collected trajectories. if datas[opt] != 0: if (indices.size < min_batch and datas[opt].n > min_batch): datas[opt] = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) t_advs[opt].append(0.) continue elif indices.size + datas[opt].n < min_batch: # pdb.set_trace() oldmap = datas[opt].data_map cat_ob = np.concatenate((oldmap['ob'], ob[indices])) cat_ac = np.concatenate((oldmap['ac'], ac[indices])) cat_atarg = np.concatenate( (oldmap['atarg'], atarg[indices])) cat_vtarg = np.concatenate( (oldmap['vtarg'], tdlamret[indices])) datas[opt] = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent) t_advs[opt].append(0.) 
continue elif (indices.size + datas[opt].n > min_batch and datas[opt].n < min_batch) or (indices.size > min_batch and datas[opt].n < min_batch): oldmap = datas[opt].data_map cat_ob = np.concatenate((oldmap['ob'], ob[indices])) cat_ac = np.concatenate((oldmap['ac'], ac[indices])) cat_atarg = np.concatenate( (oldmap['atarg'], atarg[indices])) cat_vtarg = np.concatenate( (oldmap['vtarg'], tdlamret[indices])) datas[opt] = d = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent) if (indices.size > min_batch and datas[opt].n > min_batch): datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) elif datas[opt] == 0: datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] optim_epochs = np.clip( np.int(10 * (indices.size / (timesteps_per_batch / num_options))), 10, 10) if num_options > 1 else optim_epochs print("optim epochs:", optim_epochs) logger.log("Optimizing...") # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): tadv, nodc_adv = pi.get_term_adv(batch["ob"], [opt]) tadv = tadv if num_options > 1 else np.zeros_like(tadv) t_advs[opt].append(nodc_adv) *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv) termg = termloss(batch["ob"], [opt], tadv) adam.update(termg[0], 5e-7 * cur_lrmult) adam.update(grads, optim_stepsize * cur_lrmult) losses.append(newlosses) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() if saves: out = "{},{}" for _ in range(num_options): out += ",{},{},{},{}" out += "\n" info = [iters_so_far, np.mean(rewbuffer)] for i in range(num_options): info.append(opt_d[i]) for i in range(num_options): info.append(std[i]) for i in range(num_options): info.append(np.mean(np.array(seg['term_p']), axis=0)[i]) for i in range(num_options): info.append(np.mean(t_advs[i])) results.write(out.format(*info)) results.flush()
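# --- Illustrative sketch (not part of the original source): add_vtarg_and_adv() used in the
# loop above typically fills in GAE(gamma, lam) advantages and TD(lambda) value targets. The
# hypothetical NumPy recursion below shows the usual form; the exact episode-boundary
# convention in the source may differ.
import numpy as np

def gae_advantages(rewards, values, dones, gamma, lam):
    """rewards, dones: length T; values: length T + 1 (bootstrap value appended)."""
    horizon = len(rewards)
    advantages = np.zeros(horizon)
    lastgaelam = 0.0
    for t in reversed(range(horizon)):
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * values[t + 1] * nonterminal - values[t]
        advantages[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    return advantages, advantages + values[:-1]  # advantages, value targets (tdlamret)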
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) obz = tf.clip_by_value((obs - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=tf_util.normc_initializer(1.0))) self.vpred = dense(last_out, 1, "vffinal", weight_init=tf_util.normc_initializer(1.0))[:, 0] last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=tf_util.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", tf_util.normc_initializer(0.01)) logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", tf_util.normc_initializer(0.01)) self.proba_distribution = pdtype.proba_distribution_from_flat(pdparam) self.state_in = [] self.state_out = [] # change for BC self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=(), name="stochastic") action = tf_util.switch(self.stochastic_ph, self.proba_distribution.sample(), self.proba_distribution.mode()) self.action = action self._act = tf_util.function([self.stochastic_ph, obs], [action, self.vpred])
def setup_model(self): # prevent import loops from stable_baselines.gail.adversary import TransitionClassifier with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \ "an instance of common.policies.ActorCriticPolicy." self.nworkers = MPI.COMM_WORLD.Get_size() self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.single_threaded_session(graph=self.graph) if self.using_gail: self.reward_giver = TransitionClassifier( self.observation_space, self.action_space, self.hidden_size_adversary, entcoeff=self.adversary_entcoeff) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.variable_scope("oldpi", reuse=False): old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): atarg = tf.placeholder(dtype=tf.float32, shape=[ None ]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return observation = self.policy_pi.obs_ph action = self.policy_pi.pdtype.sample_placeholder([None]) kloldnew = old_policy.proba_distribution.kl( self.policy_pi.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = self.entcoeff * meanent vferr = tf.reduce_mean( tf.square(self.policy_pi.value_fn[:, 0] - ret)) # advantage * pnew / pold ratio = tf.exp( self.policy_pi.proba_distribution.logp(action) - old_policy.proba_distribution.logp(action)) surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] self.loss_names = [ "optimgain", "meankl", "entloss", "surrgain", "entropy" ] dist = meankl all_var_list = tf_util.get_trainable_vars("model") var_list = [ v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name ] vf_var_list = [ v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name ] self.get_flat = tf_util.GetFlat(var_list, sess=self.sess) self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: var_size = tf_util.intprod(shape) tangents.append( tf.reshape(flat_tangent[start:start + var_size], shape)) start += var_size gvp = tf.add_n([ tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = tf_util.flatgrad(gvp, var_list) tf.summary.scalar('entropy_loss', meanent) tf.summary.scalar('policy_gradient_loss', optimgain) tf.summary.scalar('value_function_loss', surrgain) tf.summary.scalar('approximate_kullback-leiber', meankl) tf.summary.scalar( 'loss', optimgain + meankl + entbonus + surrgain + meanent) self.assign_old_eq_new = \ tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model"))]) self.compute_losses = tf_util.function( [observation, old_policy.obs_ph, action, atarg], losses) self.compute_fvp = tf_util.function([ flat_tangent, observation, 
old_policy.obs_ph, action, atarg ], fvp) self.compute_vflossandgrad = tf_util.function( [observation, old_policy.obs_ph, ret], tf_util.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if self.rank == 0 and self.verbose >= 1: print(colorize(msg, color='magenta')) start_time = time.time() yield print( colorize("done in {:.3f} seconds".format( (time.time() - start_time)), color='magenta')) else: yield def allmean(arr): assert isinstance(arr, np.ndarray) out = np.empty_like(arr) MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) out /= self.nworkers return out tf_util.initialize(sess=self.sess) th_init = self.get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) self.set_from_flat(th_init) with tf.variable_scope("Adam_mpi", reuse=False): self.vfadam = MpiAdam(vf_var_list, sess=self.sess) if self.using_gail: self.d_adam = MpiAdam( self.reward_giver.get_trainable_variables(), sess=self.sess) self.d_adam.sync() self.vfadam.sync() with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize)) tf.summary.scalar('advantage', tf.reduce_mean(atarg)) tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', ret) tf.summary.histogram('learning_rate', self.vf_stepsize) tf.summary.histogram('advantage', atarg) tf.summary.histogram('kl_clip_range', self.max_kl) if tf_util.is_image(self.observation_space): tf.summary.image('observation', observation) else: tf.summary.histogram('observation', observation) self.timed = timed self.allmean = allmean self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state self.params = find_trainable_variables("model") if self.using_gail: self.params.extend( self.reward_giver.get_trainable_variables()) self.summary = tf.summary.merge_all() self.compute_lossandgrad = \ tf_util.function([observation, old_policy.obs_ph, action, atarg, ret], [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
def __init__(self, ob_dim, ac_dim, verbose=1):
    """
    Create an MLP policy for a value function

    :param ob_dim: (int) Observation dimension
    :param ac_dim: (int) action dimension
    :param verbose: (int) verbosity level
    """
    obs_ph = tf.placeholder(tf.float32, shape=[None, ob_dim * 2 + ac_dim * 2 + 2])  # batch of observations
    vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
    wd_dict = {}
    layer_1 = tf.nn.elu(
        dense(obs_ph, 64, "h1", weight_init=tf_util.normc_initializer(1.0), bias_init=0,
              weight_loss_dict=wd_dict))
    layer_2 = tf.nn.elu(
        dense(layer_1, 64, "h2", weight_init=tf_util.normc_initializer(1.0), bias_init=0,
              weight_loss_dict=wd_dict))
    vpred_n = dense(layer_2, 1, "hfinal", weight_init=tf_util.normc_initializer(1.0), bias_init=0,
                    weight_loss_dict=wd_dict)[:, 0]
    sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
    wd_loss = tf.get_collection("vf_losses", None)
    loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
    loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
    self._predict = tf_util.function([obs_ph], vpred_n)
    # `async` is a reserved word in Python 3.7+; stable-baselines' KfacOptimizer exposes this
    # flag as `async_eigen_decomp`, so it is used here instead of the original `async=1`.
    optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001 * (1 - 0.9), momentum=0.9,
                               clip_kl=0.3, epsilon=0.1, stats_decay=0.95, async_eigen_decomp=True,
                               kfac_update=2, cold_iter=50, weight_decay_dict=wd_dict,
                               max_grad_norm=None, verbose=verbose)
    vf_var_list = []
    for var in tf.trainable_variables():
        if "vf" in var.name:
            vf_var_list.append(var)
    update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
    self.do_update = tf_util.function([obs_ph, vtarg_n], update_op)  # pylint: disable=E1101
    tf_util.initialize()  # Initialize uninitialized TF variables
def build_train(q_func, ob_space, ac_space, optimizer, sess, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): """ Creates the train function: :param q_func: (DQNPolicy) the policy :param ob_space: (Gym Space) The observation space of the environment :param ac_space: (Gym Space) The action space of the environment :param reuse: (bool) whether or not to reuse the graph variables :param optimizer: (tf.train.Optimizer) optimizer to use for the Q-learning objective. :param sess: (TensorFlow session) The current TensorFlow session :param grad_norm_clipping: (float) clip gradient norms to this value. If None no clipping is performed. :param gamma: (float) discount rate. :param double_q: (bool) if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. :param scope: (str or VariableScope) optional scope for variable_scope. :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given. :param param_noise: (bool) whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. :return: (tuple) act: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) function to select and action given observation. See the top of the file for details. train: (function (Any, numpy float, numpy float, Any, numpy bool, numpy float): numpy float) optimize the error in Bellman's equation. See the top of the file for details. update_target: (function) copy the parameters from optimized Q function to the target Q function. See the top of the file for details. 
step_model: (DQNPolicy) Policy for evaluation """ n_actions = ac_space.nvec if isinstance(ac_space, MultiDiscrete) else ac_space.n with tf.variable_scope("input", reuse=reuse): stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") with tf.variable_scope(scope, reuse=reuse): if param_noise: act_f, obs_phs = build_act_with_param_noise(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess, param_noise_filter_func=param_noise_filter_func) else: act_f, obs_phs = build_act(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess) # q network evaluation with tf.variable_scope("step_model", reuse=True, custom_getter=tf_util.outer_scope_getter("step_model")): step_model = q_func(sess, ob_space, ac_space, 1, 1, None, reuse=True, obs_phs=obs_phs) q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/model") # target q network evaluation with tf.variable_scope("target_q_func", reuse=False): target_policy = q_func(sess, ob_space, ac_space, 1, 1, None, reuse=False) target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # compute estimate of best possible value starting from state at t + 1 double_q_values = None double_obs_ph = target_policy.obs_ph if double_q: with tf.variable_scope("double_q", reuse=True, custom_getter=tf_util.outer_scope_getter("double_q")): double_policy = q_func(sess, ob_space, ac_space, 1, 1, None, reuse=True) double_q_values = double_policy.q_values double_obs_ph = double_policy.obs_ph with tf.variable_scope("loss", reuse=reuse): # set up placeholders act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum(step_model.q_values * tf.one_hot(act_t_ph, n_actions), axis=1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_best_using_online_net = tf.argmax(double_q_values, axis=1) q_tp1_best = tf.reduce_sum(target_policy.q_values * tf.one_hot(q_tp1_best_using_online_net, n_actions), axis=1) else: q_tp1_best = tf.reduce_max(target_policy.q_values, axis=1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = tf_util.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) tf.summary.scalar("td_error", tf.reduce_mean(td_error)) tf.summary.histogram("td_error", td_error) tf.summary.scalar("loss", weighted_error) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # compute optimization op (potentially with gradient clipping) gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) if grad_norm_clipping is not None: for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) with tf.variable_scope("input_info", reuse=False): # Edit: Add action tf.summary.scalar('actions', tf.reduce_mean(act_t_ph)) tf.summary.histogram('actions', act_t_ph) tf.summary.scalar('rewards', tf.reduce_mean(rew_t_ph)) tf.summary.histogram('rewards', rew_t_ph) tf.summary.scalar('importance_weights', tf.reduce_mean(importance_weights_ph)) tf.summary.histogram('importance_weights', importance_weights_ph) # Valid image: RGB, RGBD, GrayScale is_image = len(obs_phs[0].shape) == 3 and obs_phs[0].shape[-1] in [1, 3, 4] if is_image: tf.summary.image('observation', obs_phs[0]) elif len(obs_phs[0].shape) == 1: tf.summary.histogram('observation', obs_phs[0]) optimize_expr = optimizer.apply_gradients(gradients) summary = tf.summary.merge_all() # Create callable functions train = tf_util.function( inputs=[ obs_phs[0], act_t_ph, rew_t_ph, target_policy.obs_ph, double_obs_ph, done_mask_ph, importance_weights_ph ], outputs=[summary, td_error], updates=[optimize_expr] ) update_target = tf_util.function([], [], updates=[update_target_expr]) return act_f, train, update_target, step_model
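# --- Illustrative sketch (not part of the original source): huber_loss() applied to the TD
# error above is the standard Huber penalty - quadratic near zero, linear in the tails - which
# keeps occasional large TD errors from dominating the update. A hypothetical NumPy version:
import numpy as np

def huber(x, delta=1.0):
    return np.where(np.abs(x) <= delta,
                    0.5 * np.square(x),
                    delta * (np.abs(x) - 0.5 * delta))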
def build_act(q_func, ob_space, ac_space, sess, num_actions, num_action_streams, stochastic_ph, update_eps_ph):
    """
    Creates the act function for a branching (multi-stream) Q-network:

    :param q_func: (DQNPolicy) the policy
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param sess: (TensorFlow session) The current TensorFlow session
    :param num_actions: (int) total number of sub-actions to be represented at the output
    :param num_action_streams: (int) the number of action branches in the action value (or advantage)
        function representation
    :param stochastic_ph: (TensorFlow Tensor) the stochastic placeholder
    :param update_eps_ph: (TensorFlow Tensor) the update_eps placeholder
    :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor, (TensorFlow Tensor, TensorFlow Tensor))
        act function to select an action given an observation (see the top of the file for details), and a tuple
        containing the observation placeholder and the processed observation placeholder.
    """
    eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

    policy = q_func(sess, ob_space, ac_space, 1, 1, None, num_actions)
    obs_phs = (policy.obs_ph, policy.processed_obs)

    assert num_action_streams >= 1, "number of action branches is not acceptable, has to be >=1"

    # TODO better: enable non-uniform number of sub-actions per joint
    num_actions_pad = num_actions // num_action_streams  # number of sub-actions per action dimension

    output_actions = []
    for dim in range(num_action_streams):
        q_values_batch = policy.q_values[dim][0]  # TODO better: does not allow evaluating actions over a whole batch
        deterministic_action = tf.argmax(q_values_batch)
        random_action = tf.random_uniform([], minval=0, maxval=num_actions // num_action_streams, dtype=tf.int64)
        chose_random = tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_action = tf.cond(chose_random, lambda: random_action, lambda: deterministic_action)
        output_action = tf.cond(stochastic_ph, lambda: stochastic_action, lambda: deterministic_action)
        output_actions.append(output_action)

    update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))

    _act = tf_util.function(
        inputs=[policy.obs_ph, stochastic_ph, update_eps_ph],
        outputs=output_actions,
        givens={update_eps_ph: -1.0, stochastic_ph: True},
        updates=[update_eps_expr])

    def act(obs, stochastic=True, update_eps=-1):
        return _act(obs, stochastic, update_eps)

    return act, obs_phs
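# --- Illustrative sketch (not part of the original source): with branching action streams,
# exploration above is applied per stream - each action dimension makes its own epsilon-greedy
# choice over its sub-actions. A hypothetical NumPy version for a single observation:
import numpy as np

def branching_epsilon_greedy(q_per_stream, eps, rng=np.random):
    """q_per_stream: list of (num_sub_actions,) arrays, one per action stream."""
    actions = []
    for q_values in q_per_stream:
        if rng.uniform() < eps:
            actions.append(int(rng.randint(len(q_values))))
        else:
            actions.append(int(q_values.argmax()))
    return actions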