class Model(object):
    def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps,
                 ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha,
                 rprop_epsilon, total_timesteps, lrschedule, c, trust_region,
                 alpha, delta, scope, goal_shape):
        self.sess = sess
        self.nenv = nenvs
        self.goal_shape = goal_shape
        nact = ac_space.n
        nbatch = nenvs * nsteps
        eps = 1e-6
        self.scope = scope

        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            self.A = tf.placeholder(tf.int32, [nbatch], name="action")  # actions
            self.D = tf.placeholder(tf.float32, [nbatch], name="dones")  # dones
            self.R = tf.placeholder(tf.float32, [nbatch], name="rewards")  # rewards, not returns
            self.MU = tf.placeholder(tf.float32, [nbatch, nact], name="mus")  # mu's
            self.LR = tf.placeholder(tf.float32, [], name="lr")

            step_ob_placeholder = tf.placeholder(ob_space.dtype, (nenvs,) + ob_space.shape, "step_ob")
            step_goal_placeholder = tf.placeholder(tf.float32, (nenvs,) + goal_shape, "step_goal")
            step_goal_encoded = step_goal_placeholder

            train_ob_placeholder = tf.placeholder(
                ob_space.dtype, (nenvs * (nsteps + 1),) + ob_space.shape, "train_ob")
            train_goal_placeholder = tf.placeholder(
                tf.float32, (nenvs * (nsteps + 1),) + goal_shape, "train_goal")
            train_goal_encoded = train_goal_placeholder
            concat_on_latent = False

            self.step_model = policy(nbatch=nenvs,
                                     nsteps=1,
                                     observ_placeholder=step_ob_placeholder,
                                     sess=self.sess,
                                     goal_placeholder=step_goal_placeholder,
                                     concat_on_latent=concat_on_latent,
                                     goal_encoded=step_goal_encoded)
            self.train_model = policy(nbatch=nbatch,
                                      nsteps=nsteps,
                                      observ_placeholder=train_ob_placeholder,
                                      sess=self.sess,
                                      goal_placeholder=train_goal_placeholder,
                                      concat_on_latent=concat_on_latent,
                                      goal_encoded=train_goal_encoded)

        variables = find_trainable_variables
        self.params = params = variables(scope)
        logger.info("========================== {} =============================".format(scope))
        for var in params:
            logger.info(var)
        logger.info("========================== {} =============================\n".format(scope))

        # Create the polyak averaged model.
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        def custom_getter(getter, *args, **kwargs):
            v = ema.average(getter(*args, **kwargs))
            return v

        with tf.variable_scope(scope, custom_getter=custom_getter, reuse=True):
            self.polyak_model = policy(nbatch=nbatch,
                                       nsteps=nsteps,
                                       observ_placeholder=train_ob_placeholder,
                                       goal_placeholder=train_goal_placeholder,
                                       sess=self.sess,
                                       concat_on_latent=concat_on_latent,
                                       goal_encoded=train_goal_encoded)

        # Notation: (var) = batch variable, (var)s = sequence variable,
        # (var)_i = variable indexed by the action taken at step i.
        # Action probability distributions according to self.train_model,
        # self.polyak_model and self.step_model. policy.pi holds the
        # distribution parameters (logits); take a softmax to obtain a
        # distribution that sums to 1.
        train_model_p = tf.nn.softmax(self.train_model.pi)
        polyak_model_p = tf.nn.softmax(self.polyak_model.pi)
        self.step_model_p = tf.nn.softmax(self.step_model.pi)
        self.v = v = tf.reduce_sum(train_model_p * self.train_model.q,
                                   axis=-1)  # shape is [nenvs * (nsteps + 1)]

        # Strip off the last step.
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps),
                          [train_model_p, polyak_model_p, self.train_model.q])
        # Get pi and q values for the actions taken.
        f_i = get_by_index(f, self.A)
        q_i = get_by_index(q, self.A)

        # Compute ratios for importance truncation.
        rho = f / (self.MU + eps)
        rho_i = get_by_index(rho, self.A)

        # Calculate Q_retrace targets.
        self.qret = qret = q_retrace(self.R, self.D, q_i, v, rho_i, nenvs, nsteps, gamma)

        # Calculate losses.
        # Entropy
        # entropy = tf.reduce_mean(strip(self.train_model.pd.entropy(), nenvs, nsteps))
        entropy = tf.reduce_mean(cat_entropy_softmax(f))

        # Policy Gradient loss, with truncated importance sampling & bias correction.
        v = strip(v, nenvs, nsteps, True)
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 3)  # three tensors, three expected shapes

        # Truncated importance sampling.
        adv = qret - v
        logf = tf.log(f_i + eps)
        gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(gain_f)

        # Bias correction for the truncation.
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]))  # [nenvs * nsteps, nact]
        logf_bc = tf.log(f + eps)  # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
        gain_bc = tf.reduce_sum(
            logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f),
            axis=1)  # IMP: this is a sum, since it is an expectation wrt f
        loss_bc = -tf.reduce_mean(gain_bc)
        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance.
        check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]),
                                  tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

        # Net loss.
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)
        loss = loss_policy + q_coef * loss_q - ent_coef * entropy

        if trust_region:
            g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs,
                             f)  # [nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = -f_pol / (f + eps)  # [nenvs * nsteps, nact], directly computed gradient of the KL divergence wrt f
            k_dot_g = tf.reduce_sum(k * g, axis=-1)
            adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) /
                             (tf.reduce_sum(tf.square(k), axis=-1) + eps))  # [nenvs * nsteps]

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            # These are the trust-region-adjusted gradients wrt f, i.e. the statistics of policy pi.
            grads_f = -g / (nenvs * nsteps)
            grads_policy = tf.gradients(f, params, grads_f)
            grads_q = tf.gradients(loss_q * q_coef, params)
            grads = [gradient_add(g1, g2, param)
                     for (g1, g2, param) in zip(grads_policy, grads_q, params)]

            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.global_norm(grads_q)
            norm_grads_policy = tf.global_norm(grads_policy)
        else:
            grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
        else:
            norm_grads = tf.global_norm(grads)  # norm_grads is logged below, so define it even without clipping
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=self.LR,
                                            decay=rprop_alpha,
                                            epsilon=rprop_epsilon)
        _policy_opt_op = trainer.apply_gradients(grads)

        # When you call _train, you first take the gradient step, then apply the EMA.
        with tf.control_dependencies([_policy_opt_op]):
            _train_policy = tf.group(ema_apply_op)

        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging.
        self.run_ops_policy = [_train_policy, loss, loss_q, entropy, loss_policy,
                               loss_f, loss_bc, ev, norm_grads]
        self.names_ops_policy = ['loss', 'loss_q', 'entropy', 'loss_policy',
                                 'loss_f', 'loss_bc', 'explained_variance',
                                 'norm_grads']
        if trust_region:
            self.run_ops_policy = self.run_ops_policy + [
                norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k,
                avg_norm_g, avg_norm_k_dot_g, avg_norm_adj]
            self.names_ops_policy = self.names_ops_policy + [
                'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f',
                'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj']
        self.names_ops_policy = [scope + "_" + x for x in self.names_ops_policy]  # scope as prefix

        self.save = functools.partial(save_variables, sess=self.sess, variables=params)
        self.initial_state = self.step_model.initial_state
        tf.global_variables_initializer().run(session=self.sess)

    def train_policy(self, obs, actions, rewards, dones, mus, states, masks,
                     steps, goal_obs, verbose=False):
        cur_lr = self.lr.value_steps(steps)
        td_map = {
            self.train_model.X: obs,
            self.polyak_model.X: obs,
            self.A: actions,
            self.R: rewards,
            self.D: dones,
            self.MU: mus,
            self.LR: cur_lr
        }
        assert hasattr(self.train_model, "goals")
        assert hasattr(self.polyak_model, "goals")
        if hasattr(self, "goal_rms"):
            self.goal_rms.update(goal_obs)

        # Debug-only sanity check on fabricated inputs; aborts the run on purpose.
        debug = False
        if debug:
            _obs, _actions, _dones, _goals, _mus, _rewards = self.generate_fake(
                obs, actions, dones, goal_obs, mus, rewards)
            td_map[self.train_model.goals] = _goals
            td_map[self.train_model.X] = _obs
            v = self.sess.run(self.v, feed_dict=td_map)
            print("v", v)
            td_map[self.A] = _actions
            td_map[self.R] = _rewards
            td_map[self.MU] = _mus
            td_map[self.D] = _dones
            qret = self.sess.run(self.qret, feed_dict=td_map)
            print("q_ret", qret)
            assert 0

        td_map[self.train_model.goals] = goal_obs
        td_map[self.polyak_model.goals] = goal_obs
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks
            td_map[self.polyak_model.S] = states
            td_map[self.polyak_model.M] = masks
        if verbose:
            names_ops_policy = self.names_ops_policy.copy()
            values_ops_policy = self.sess.run(self.run_ops_policy, td_map)[1:]  # strip off _train
        else:
            names_ops_policy = self.names_ops_policy.copy()[:8]  # not including trust region stats
            values_ops_policy = self.sess.run(self.run_ops_policy, td_map)[1:][:8]
        return names_ops_policy, values_ops_policy

    def step(self, observation, **kwargs):
        return self.step_model.evaluate(
            [self.step_model.action, self.step_model_p, self.step_model.state],
            observation, **kwargs)

    def generate_fake(self, obs, actions, dones, goals, mus, rewards):
        # Fabricate a fixed batch for debugging the graph. Note the random
        # `_mus` are normalized to sum to 1 but may contain negative entries,
        # so they are not a proper probability distribution.
        _obs = np.ones_like(obs)
        _actions = np.ones_like(actions)
        _dones = dones
        _goals = np.zeros_like(goals)
        _mus = np.random.randn(*mus.shape)
        _mus = _mus / np.sum(_mus, axis=-1, keepdims=True)
        print(self.sess.run(self.params))
        print("obs", obs)
        print("_mus", _mus)
        print("_dones", _dones)
        _rewards = np.ones_like(rewards)
        return _obs, _actions, _dones, _goals, _mus, _rewards
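
# For reference: q_retrace above is imported from the surrounding codebase. A
# minimal NumPy sketch of the Retrace(lambda=1) recursion it is assumed to
# implement (the function and argument names here are illustrative, not the
# actual helper):
#
# import numpy as np
#
# def q_retrace_np(rewards, dones, q_i, values, rho_i, gamma):
#     """rewards, dones, q_i, rho_i: [nenvs, nsteps]; values: [nenvs, nsteps + 1]."""
#     nenvs, nsteps = rewards.shape
#     rho_bar = np.minimum(1.0, rho_i)  # truncated importance weights
#     qret = values[:, -1]              # bootstrap from the value of the final state
#     qrets = np.zeros((nenvs, nsteps))
#     for t in range(nsteps - 1, -1, -1):
#         qret = rewards[:, t] + gamma * qret * (1.0 - dones[:, t])
#         qrets[:, t] = qret
#         # Retrace correction: shrink toward the current estimates before recursing.
#         qret = rho_bar[:, t] * (qret - q_i[:, t]) + values[:, t]
#     return qrets.reshape(-1)          # flattened to [nenvs * nsteps], like self.qret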
class Model(object):
    def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps,
                 ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha,
                 rprop_epsilon, total_timesteps, lrschedule, c, trust_region,
                 alpha, delta, scope, load_path, debug, policy_inputs):
        self.sess = sess
        self.nenv = nenvs
        self.policy_inputs = policy_inputs.copy()
        nact = ac_space.n
        nbatch = nenvs * nsteps
        eps = 1e-6
        self.scope = scope

        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            self.A = tf.placeholder(tf.int32, [nbatch], name="action")  # actions
            self.D = tf.placeholder(tf.float32, [nbatch], name="dones")  # dones
            self.R = tf.placeholder(tf.float32, [nbatch], name="rewards")  # rewards, not returns
            self.MU = tf.placeholder(tf.float32, [nbatch, nact], name="mus")  # mu's
            self.LR = tf.placeholder(tf.float32, [], name="lr")
            self.V_NEXT = tf.placeholder(tf.float32, [nbatch], name="value_next")  # (by lzn: the goal-conditioned next value)

            if isinstance(ob_space, gym.spaces.Dict):
                self.obs_shape = ob_space.spaces['observation'].shape
                self.obs_dtype = ob_space.spaces['observation'].dtype
            else:
                self.obs_shape = ob_space.shape
                self.obs_dtype = ob_space.dtype
            self.achieved_goal_sh = achieved_goal_sh = ACHIEVED_GOAL_SHAPE
            self.desired_goal_sh = desired_goal_sh = DESIRED_GOAL_SHAPE
            self.desired_goal_state_sh = desired_goal_state_sh = self.obs_shape

            self.step_obs_tf = tf.placeholder(self.obs_dtype, (nenvs,) + self.obs_shape, 'step_obs')
            self.step_achieved_goal_tf = tf.placeholder(tf.float32, (nenvs,) + achieved_goal_sh, 'step_achieved_goal')
            self.step_desired_goal_tf = tf.placeholder(tf.float32, (nenvs,) + desired_goal_sh, 'step_desired_goal')
            self.step_desired_goal_state_tf = tf.placeholder(self.obs_dtype, (nenvs,) + desired_goal_state_sh, 'step_desired_goal_state')

            self.train_obs_tf = tf.placeholder(self.obs_dtype, (nenvs * nsteps,) + self.obs_shape, 'train_obs')
            self.train_achieved_goal_tf = tf.placeholder(tf.float32, (nenvs * nsteps,) + achieved_goal_sh, 'train_achieved_goal')
            self.train_desired_goal_tf = tf.placeholder(tf.float32, (nenvs * nsteps,) + desired_goal_sh, 'train_desired_goal')
            self.train_desired_goal_state_tf = tf.placeholder(self.obs_dtype, (nenvs * nsteps,) + desired_goal_state_sh, 'train_desired_goal_state')

            # Normalize the goal embeddings.
            normalizer = 2500
            step_achieved_goal_tf = self.step_achieved_goal_tf / normalizer
            step_desired_goal_tf = self.step_desired_goal_tf / normalizer
            train_achieved_goal_tf = self.train_achieved_goal_tf / normalizer
            train_desired_goal_tf = self.train_desired_goal_tf / normalizer
            step_obs_tf = self.step_obs_tf
            step_desired_goal_state_tf = self.step_desired_goal_state_tf
            train_obs_tf = self.train_obs_tf
            train_desired_goal_state_tf = self.train_desired_goal_state_tf

            assert 'obs' in policy_inputs
            logger.info('policy_inputs:{}'.format(policy_inputs))
            logger.info('achieved_goal_sh:{}'.format(self.achieved_goal_sh))
            logger.info('desired_goal_sh:{}'.format(self.desired_goal_sh))
            logger.info('normalizer:{}'.format(normalizer))
            policy_inputs.remove('obs')

            if 'desired_goal_state' in policy_inputs:
                policy_inputs.remove('desired_goal_state')
                step_state_tf = tf.concat([step_obs_tf, step_desired_goal_state_tf],
                                          axis=-1, name='step_state')
                train_state_tf = tf.concat([train_obs_tf, train_desired_goal_state_tf],
                                           axis=-1, name='train_state')
            else:
                step_state_tf = step_obs_tf
                train_state_tf = train_obs_tf

            if 'achieved_goal' in policy_inputs and 'desired_goal' not in policy_inputs:
                policy_inputs.remove('achieved_goal')
                step_goal_tf = step_achieved_goal_tf
                train_goal_tf = train_achieved_goal_tf
            elif 'achieved_goal' not in policy_inputs and 'desired_goal' in policy_inputs:
                policy_inputs.remove('desired_goal')
                step_goal_tf = step_desired_goal_tf
                train_goal_tf = train_desired_goal_tf
            elif 'achieved_goal' in policy_inputs and 'desired_goal' in policy_inputs:
                policy_inputs.remove('achieved_goal')
                policy_inputs.remove('desired_goal')
                step_goal_tf = tf.concat([step_achieved_goal_tf, step_desired_goal_tf],
                                         axis=-1, name='step_goal')
                train_goal_tf = tf.concat([train_achieved_goal_tf, train_desired_goal_tf],
                                          axis=-1, name='train_goal')
            else:
                step_goal_tf, train_goal_tf = None, None

            if len(policy_inputs) > 0:
                raise ValueError("Unused policy inputs:{}".format(policy_inputs))

            self.step_model = policy(nbatch=nenvs,
                                     nsteps=1,
                                     state_placeholder=step_state_tf,
                                     sess=self.sess,
                                     goal_placeholder=step_goal_tf)
            self.train_model = policy(nbatch=nbatch,
                                      nsteps=nsteps,
                                      state_placeholder=train_state_tf,
                                      sess=self.sess,
                                      goal_placeholder=train_goal_tf,
                                      summary_stats=True)

        variables = find_trainable_variables
        self.params = params = variables(scope)
        logger.info("========================== {} =============================".format(scope))
        for var in params:
            logger.info(var)
        logger.info("========================== {} =============================\n".format(scope))

        # Create the polyak averaged model.
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        def custom_getter(getter, *args, **kwargs):
            v = ema.average(getter(*args, **kwargs))
            return v

        with tf.variable_scope(scope, custom_getter=custom_getter, reuse=True):
            self.polyak_model = policy(nbatch=nbatch,
                                       nsteps=nsteps,
                                       state_placeholder=train_state_tf,
                                       goal_placeholder=train_goal_tf,
                                       sess=self.sess)

        # Notation: (var) = batch variable, (var)s = sequence variable,
        # (var)_i = variable indexed by the action taken at step i.
        # Action probability distributions according to self.train_model,
        # self.polyak_model and self.step_model. policy.pi holds the
        # distribution parameters (logits); take a softmax to obtain a
        # distribution that sums to 1.
        train_model_p = tf.nn.softmax(self.train_model.pi)
        polyak_model_p = tf.nn.softmax(self.polyak_model.pi)
        self.step_model_p = tf.nn.softmax(self.step_model.pi)
        # (TODO by lizn: use this to calculate the next value)
        v = self.v = tf.reduce_sum(train_model_p * self.train_model.q,
                                   axis=-1)  # shape is [nenvs * nsteps]

        # Strip off the last step. (TODO by lizn: we don't need the strip)
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps),
                          [train_model_p, polyak_model_p, self.train_model.q])
        # f, f_pol, q = map(lambda x: x, [train_model_p, polyak_model_p, self.train_model.q])
        # Get pi and q values for the actions taken.
        f_i = get_by_index(f, self.A)
        q_i = get_by_index(q, self.A)

        # Compute ratios for importance truncation.
        rho = f / (self.MU + eps)
        rho_i = get_by_index(rho, self.A)

        # Calculate Q_retrace targets. (TODO by lizn: use the new next-state value)
        qret = q_retrace(self.R, self.D, q_i, self.V_NEXT, rho_i, nenvs, nsteps, gamma)

        # Calculate losses.
        # Entropy
        # entropy = tf.reduce_mean(strip(self.train_model.pd.entropy(), nenvs, nsteps))
        entropy = tf.reduce_mean(cat_entropy_softmax(f))

        # Policy Gradient loss, with truncated importance sampling & bias correction.
        v = strip(v, nenvs, nsteps, True)  # (TODO by lzn: we do not need to strip the last one)
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 3)  # three tensors, three expected shapes

        # Truncated importance sampling.
        adv = qret - v
        logf = tf.log(f_i + eps)
        gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(gain_f)

        # Bias correction for the truncation.
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]))  # [nenvs * nsteps, nact]
        logf_bc = tf.log(f + eps)  # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
        gain_bc = tf.reduce_sum(
            logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f),
            axis=1)  # IMP: this is a sum, since it is an expectation wrt f
        loss_bc = -tf.reduce_mean(gain_bc)
        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance.
        check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]),
                                  tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

        # Net loss.
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)
        loss = loss_policy + q_coef * loss_q - ent_coef * entropy

        if trust_region:
            g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs,
                             f)  # [nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = -f_pol / (f + eps)  # [nenvs * nsteps, nact], directly computed gradient of the KL divergence wrt f
            k_dot_g = tf.reduce_sum(k * g, axis=-1)
            adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) /
                             (tf.reduce_sum(tf.square(k), axis=-1) + eps))  # [nenvs * nsteps]

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            # These are the trust-region-adjusted gradients wrt f, i.e. the statistics of policy pi.
            grads_f = -g / (nenvs * nsteps)
            grads_policy = tf.gradients(f, params, grads_f)
            grads_q = tf.gradients(loss_q * q_coef, params)
            grads = [gradient_add(g1, g2, param)
                     for (g1, g2, param) in zip(grads_policy, grads_q, params)]

            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.global_norm(grads_q)
            norm_grads_policy = tf.global_norm(grads_policy)
        else:
            grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
        else:
            norm_grads = tf.global_norm(grads)  # norm_grads is logged below, so define it even without clipping
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=self.LR,
                                            decay=rprop_alpha,
                                            epsilon=rprop_epsilon)
        _policy_opt_op = trainer.apply_gradients(grads)

        # When you call _train, you first take the gradient step, then apply the EMA.
        with tf.control_dependencies([_policy_opt_op]):
            _train_policy = tf.group(ema_apply_op)

        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging.
        self.run_ops_policy = [_train_policy, loss, loss_q, entropy, loss_policy,
                               loss_f, loss_bc, ev, norm_grads]
        self.names_ops_policy = ['loss', 'loss_q', 'entropy', 'loss_policy',
                                 'loss_f', 'loss_bc', 'explained_variance',
                                 'norm_grads']
        if trust_region:
            self.run_ops_policy = self.run_ops_policy + [
                norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k,
                avg_norm_g, avg_norm_k_dot_g, avg_norm_adj]
            self.names_ops_policy = self.names_ops_policy + [
                'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f',
                'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj']
        self.names_ops_policy = [scope + "_" + x for x in self.names_ops_policy]  # scope as prefix

        self.save = functools.partial(save_variables, sess=self.sess, variables=params)
        self.initial_state = self.step_model.initial_state
        # with tf.variable_scope('stats'):
        #     with tf.variable_scope('achieved_goal'):
        #         self.ag_stats = Normalizer(size=self.achieved_goal_sh[0], sess=self.sess)
        #     with tf.variable_scope('desired_goal'):
        #         self.g_stats = Normalizer(size=self.desired_goal_sh[0], sess=self.sess)
        tf.global_variables_initializer().run(session=self.sess)
        if debug:
            load_variables(load_path, self.params, self.sess)

    def train_policy(self, obs, next_obs, achieved_goal, next_achieved_goal,
                     desired_goal, desired_goal_state, actions, rewards, mus,
                     dones, steps):
        verbose = False
        cur_lr = self.lr.value_steps(steps)
        # 1. Calculate v_{t+1} using obs_{t+1} and g_t.
        td_map = self._feed_train_policy_inputs(next_obs, next_achieved_goal,
                                                desired_goal, desired_goal_state)
        v_next = self.sess.run(self.v, feed_dict=td_map)
        # 2. Use obs_t, goal_t and v_{t+1} to train the policy.
        td_map.update({
            self.train_obs_tf: obs,
            self.train_achieved_goal_tf: achieved_goal,
            self.A: actions,
            self.R: rewards,
            self.D: dones,
            self.MU: mus,
            self.LR: cur_lr,
            self.V_NEXT: v_next
        })
        if verbose:
            names_ops_policy = self.names_ops_policy.copy()
            values_ops_policy = self.sess.run(self.run_ops_policy, td_map)[1:]  # strip off _train
        else:
            names_ops_policy = self.names_ops_policy.copy()[:8]  # not including trust region stats
            values_ops_policy = self.sess.run(self.run_ops_policy, td_map)[1:][:8]
        return names_ops_policy, values_ops_policy

    def step(self, inputs):
        td_map = self._feed_step_policy_inputs(**inputs)
        return self.sess.run([self.step_model.action, self.step_model_p],
                             feed_dict=td_map)

    def _feed_train_policy_inputs(self, obs, achieved_goal, desired_goal,
                                  desired_goal_state):
        td_map = dict()
        assert 'obs' in self.policy_inputs
        td_map[self.train_obs_tf] = obs
        if 'achieved_goal' in self.policy_inputs:
            td_map[self.train_achieved_goal_tf] = achieved_goal
        if 'desired_goal' in self.policy_inputs:
            td_map[self.train_desired_goal_tf] = desired_goal
        if 'desired_goal_state' in self.policy_inputs:
            td_map[self.train_desired_goal_state_tf] = desired_goal_state
        return td_map

    def _feed_step_policy_inputs(self, obs, achieved_goal=None, desired_goal=None,
                                 desired_goal_state=None):
        td_map = dict()
        assert 'obs' in self.policy_inputs
        td_map[self.step_obs_tf] = obs
        if 'achieved_goal' in self.policy_inputs:
            td_map[self.step_achieved_goal_tf] = achieved_goal
        if 'desired_goal' in self.policy_inputs:
            td_map[self.step_desired_goal_tf] = desired_goal
        if 'desired_goal_state' in self.policy_inputs:
            td_map[self.step_desired_goal_state_tf] = desired_goal_state
        return td_map
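
# A minimal usage sketch of the two-pass V_NEXT training scheme above
# (illustrative only: `make_policy`, `env`, the hyperparameter values, and the
# flattened batch arrays obs, next_obs, ... are assumed to come from a runner
# and follow the [nenvs * nsteps, ...] shapes declared in __init__):
#
# model = Model(sess=sess, policy=make_policy, ob_space=env.observation_space,
#               ac_space=env.action_space, nenvs=8, nsteps=20, ent_coef=0.01,
#               q_coef=0.5, gamma=0.99, max_grad_norm=10.0, lr=7e-4,
#               rprop_alpha=0.99, rprop_epsilon=1e-5, total_timesteps=int(1e6),
#               lrschedule='linear', c=10.0, trust_region=True, alpha=0.99,
#               delta=1.0, scope='acer', load_path=None, debug=False,
#               policy_inputs=['obs', 'desired_goal'])
#
# # Acting: feed only the inputs named in policy_inputs.
# actions, mus = model.step({'obs': step_obs, 'desired_goal': step_desired_goal})
#
# # Training: v_{t+1} is computed from (obs_{t+1}, g_t) in a first session run,
# # then fed back through self.V_NEXT to form the Retrace targets.
# names, values = model.train_policy(obs, next_obs, achieved_goal,
#                                    next_achieved_goal, desired_goal,
#                                    desired_goal_state, actions, rewards,
#                                    mus, dones, steps)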
class Model(object):
    def __init__(self, sess, policy, dynamics, ob_space, ac_space, nenvs,
                 nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr,
                 rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c,
                 trust_region, alpha, delta, scope, goal_shape, residual):
        self.sess = sess
        self.nenv = nenvs
        self.residual = residual
        self.goal_shape = goal_shape
        self.goal_as_image = goal_as_image = len(goal_shape) == 3
        if self.goal_as_image:
            assert self.goal_shape == ob_space.shape
        else:
            logger.info("normalize goal using RunningMeanStd")
            with tf.variable_scope("RunningMeanStd", reuse=tf.AUTO_REUSE):
                self.goal_rms = RunningMeanStd(epsilon=1e-4, shape=self.goal_shape)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        eps = 1e-6
        self.dynamics = dynamics
        self.scope = scope

        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            self.A = tf.placeholder(tf.int32, [nbatch], name="action")  # actions
            self.D = tf.placeholder(tf.float32, [nbatch], name="dones")  # dones
            self.R = tf.placeholder(tf.float32, [nbatch], name="rewards")  # rewards, not returns
            self.MU = tf.placeholder(tf.float32, [nbatch, nact], name="mus")  # mu's
            self.LR = tf.placeholder(tf.float32, [], name="lr")
            self.V_NEXT = tf.placeholder(tf.float32, [nbatch], name="v_next")

            step_ob_placeholder = tf.placeholder(ob_space.dtype, (nenvs,) + ob_space.shape, "step_ob")
            if self.dynamics.dummy:
                step_goal_placeholder, concat_on_latent, step_goal_encoded = None, None, None
            else:
                if goal_as_image:
                    step_goal_placeholder = tf.placeholder(
                        ob_space.dtype, (nenvs,) + ob_space.shape, "step_goal")
                    concat_on_latent, train_goal_encoded, step_goal_encoded = False, None, None
                else:
                    step_goal_placeholder = tf.placeholder(
                        tf.float32, (nenvs,) + goal_shape, "step_goal")
                    step_goal_encoded = tf.clip_by_value(
                        (step_goal_placeholder - self.goal_rms.mean) / self.goal_rms.std,
                        -5., 5.)

            train_ob_placeholder = tf.placeholder(
                ob_space.dtype, (nenvs * nsteps,) + ob_space.shape, "train_ob")
            if self.dynamics.dummy:
                train_goal_placeholder, concat_on_latent, train_goal_encoded = None, None, None
            else:
                if goal_as_image:
                    train_goal_placeholder = tf.placeholder(
                        ob_space.dtype, (nenvs * nsteps,) + ob_space.shape, "train_goal")
                    concat_on_latent, train_goal_encoded = False, None
                else:
                    train_goal_placeholder = tf.placeholder(
                        tf.float32, (nenvs * nsteps,) + goal_shape, "train_goal")
                    concat_on_latent = True
                    train_goal_encoded = tf.clip_by_value(
                        (train_goal_placeholder - self.goal_rms.mean) / self.goal_rms.std,
                        -5., 5.)

            self.step_model = policy(nbatch=nenvs,
                                     nsteps=1,
                                     observ_placeholder=step_ob_placeholder,
                                     sess=self.sess,
                                     goal_placeholder=step_goal_placeholder,
                                     concat_on_latent=concat_on_latent,
                                     goal_encoded=step_goal_encoded)
            self.train_model = policy(nbatch=nbatch,
                                      nsteps=nsteps,
                                      observ_placeholder=train_ob_placeholder,
                                      sess=self.sess,
                                      goal_placeholder=train_goal_placeholder,
                                      concat_on_latent=concat_on_latent,
                                      goal_encoded=train_goal_encoded)

        variables = find_trainable_variables
        self.params = params = variables(scope)
        logger.info("========================== {} =============================".format(scope))
        for var in params:
            logger.info(var)
        logger.info("========================== {} =============================\n".format(scope))
        logger.info("======================={}: Aux & Dyna =========================".format(scope))
        for var in self.dynamics.params:
            logger.info(var)
        logger.info("======================={}: Aux & Dyna =========================\n".format(scope))

        # Create the polyak averaged model.
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        def custom_getter(getter, *args, **kwargs):
            v = ema.average(getter(*args, **kwargs))
            return v

        with tf.variable_scope(scope, custom_getter=custom_getter, reuse=True):
            self.polyak_model = policy(nbatch=nbatch,
                                       nsteps=nsteps,
                                       observ_placeholder=train_ob_placeholder,
                                       goal_placeholder=train_goal_placeholder,
                                       sess=self.sess,
                                       concat_on_latent=concat_on_latent,
                                       goal_encoded=train_goal_encoded)

        # Notation: (var) = batch variable, (var)s = sequence variable,
        # (var)_i = variable indexed by the action taken at step i.
        # Action probability distributions according to self.train_model,
        # self.polyak_model and self.step_model. policy.pi holds the
        # distribution parameters (logits); take a softmax to obtain a
        # distribution that sums to 1.
        train_model_p = tf.nn.softmax(self.train_model.pi)
        polyak_model_p = tf.nn.softmax(self.polyak_model.pi)
        self.step_model_p = tf.nn.softmax(self.step_model.pi)
        v = self.v = tf.reduce_sum(train_model_p * self.train_model.q,
                                   axis=-1)  # shape is [nenvs * nsteps]

        # Strip off the last step.
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps),
                          [train_model_p, polyak_model_p, self.train_model.q])
        # Get pi and q values for the actions taken.
        f_i = get_by_index(f, self.A)
        q_i = get_by_index(q, self.A)

        # Compute ratios for importance truncation.
        rho = f / (self.MU + eps)
        rho_i = get_by_index(rho, self.A)

        # Calculate Q_retrace targets.
        qret = q_retrace(self.R, self.D, q_i, self.V_NEXT, rho_i, nenvs, nsteps, gamma)

        # Calculate losses.
        # Entropy
        # entropy = tf.reduce_mean(strip(self.train_model.pd.entropy(), nenvs, nsteps))
        entropy = tf.reduce_mean(cat_entropy_softmax(f))

        # Policy Gradient loss, with truncated importance sampling & bias correction.
        v = strip(v, nenvs, nsteps, True)
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 3)  # three tensors, three expected shapes

        # Truncated importance sampling.
        adv = qret - v
        logf = tf.log(f_i + eps)
        gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(gain_f)

        # Bias correction for the truncation.
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]))  # [nenvs * nsteps, nact]
        logf_bc = tf.log(f + eps)  # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
        gain_bc = tf.reduce_sum(
            logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f),
            axis=1)  # IMP: this is a sum, since it is an expectation wrt f
        loss_bc = -tf.reduce_mean(gain_bc)
        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance.
        check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]),
                                  tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

        # Net loss.
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)
        loss = loss_policy + q_coef * loss_q - ent_coef * entropy

        if trust_region:
            g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs,
                             f)  # [nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = -f_pol / (f + eps)  # [nenvs * nsteps, nact], directly computed gradient of the KL divergence wrt f
            k_dot_g = tf.reduce_sum(k * g, axis=-1)
            adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) /
                             (tf.reduce_sum(tf.square(k), axis=-1) + eps))  # [nenvs * nsteps]

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            # These are the trust-region-adjusted gradients wrt f, i.e. the statistics of policy pi.
            grads_f = -g / (nenvs * nsteps)
            grads_policy = tf.gradients(f, params, grads_f)
            grads_q = tf.gradients(loss_q * q_coef, params)
            grads = [gradient_add(g1, g2, param)
                     for (g1, g2, param) in zip(grads_policy, grads_q, params)]

            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.global_norm(grads_q)
            norm_grads_policy = tf.global_norm(grads_policy)
        else:
            grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
        else:
            norm_grads = tf.global_norm(grads)  # norm_grads is logged below, so define it even without clipping
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=self.LR,
                                            decay=rprop_alpha,
                                            epsilon=rprop_epsilon)
        _policy_opt_op = trainer.apply_gradients(grads)

        if not self.dynamics.dummy:
            _train_dynamics = trainer.minimize(self.dynamics.loss)
            self.run_ops_dynamics = [
                _train_dynamics,
                self.dynamics.aux_loss,
                self.dynamics.dyna_loss,
            ]
            self.name_ops_dynamics = ["aux_loss", "dyna_loss"]

        # When you call _train, you first take the gradient step, then apply the EMA.
        with tf.control_dependencies([_policy_opt_op]):
            _train_policy = tf.group(ema_apply_op)

        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging.
        self.run_ops_policy = [_train_policy, loss, loss_q, entropy, loss_policy,
                               loss_f, loss_bc, ev, norm_grads]
        self.names_ops_policy = ['loss', 'loss_q', 'entropy', 'loss_policy',
                                 'loss_f', 'loss_bc', 'explained_variance',
                                 'norm_grads']
        if trust_region:
            self.run_ops_policy = self.run_ops_policy + [
                norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k,
                avg_norm_g, avg_norm_k_dot_g, avg_norm_adj]
            self.names_ops_policy = self.names_ops_policy + [
                'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f',
                'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj']
        self.names_ops_policy = [scope + "_" + x for x in self.names_ops_policy]  # scope as prefix

        self.save = functools.partial(save_variables, sess=self.sess, variables=params)
        self.initial_state = self.step_model.initial_state
        tf.global_variables_initializer().run(session=self.sess)

    def train_policy(self, obs, next_obs, actions, rewards, dones, mus, states,
                     masks, steps, goal_obs, verbose=False):
        cur_lr = self.lr.value_steps(steps)
        # 1. Calculate v_{t+1} using obs_{t+1} and g_t.
        td_map = {self.train_model.X: next_obs}
        if not self.dynamics.dummy:
            assert hasattr(self.train_model, "goals")
            if self.residual:
                td_map[self.train_model.goals] = goal_obs - next_obs
            else:
                td_map[self.train_model.goals] = goal_obs
        v_next = self.sess.run(self.v, feed_dict=td_map)
        # 2. Use obs_t, goal_t and v_{t+1} to train the policy.
        td_map = {
            self.train_model.X: obs,
            self.polyak_model.X: obs,
            self.A: actions,
            self.R: rewards,
            self.D: dones,
            self.MU: mus,
            self.LR: cur_lr,
            self.V_NEXT: v_next
        }
        if not self.dynamics.dummy:
            assert hasattr(self.train_model, "goals")
            assert hasattr(self.polyak_model, "goals")
            if hasattr(self, "goal_rms"):
                self.goal_rms.update(goal_obs)
            if self.residual:
                td_map[self.train_model.goals] = goal_obs - obs
                td_map[self.polyak_model.goals] = goal_obs - obs
            else:
                td_map[self.train_model.goals] = goal_obs
                td_map[self.polyak_model.goals] = goal_obs
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks
            td_map[self.polyak_model.S] = states
            td_map[self.polyak_model.M] = masks
        if verbose:
            names_ops_policy = self.names_ops_policy.copy()
            values_ops_policy = self.sess.run(self.run_ops_policy, td_map)[1:]  # strip off _train
        else:
            names_ops_policy = self.names_ops_policy.copy()[:8]  # not including trust region stats
            values_ops_policy = self.sess.run(self.run_ops_policy, td_map)[1:][:8]
        # Drop rarely inspected entries from the logged ops.
        unimportant_key = ["loss_f", "loss_bc"]
        for name in names_ops_policy.copy():
            for suffix in unimportant_key:
                if name.endswith(suffix):
                    index = names_ops_policy.index(name)
                    names_ops_policy.pop(index)
                    values_ops_policy.pop(index)
                    break
        return names_ops_policy, values_ops_policy

    def train_dynamics(self, obs, actions, next_obs, steps, nb_epoch=1):
        value_ops_dynamics = []
        for epoch in range(nb_epoch):
            cur_lr = self.lr.value_steps(steps)
            td_map = {
                self.dynamics.obs: obs,
                self.dynamics.next_obs: next_obs,
                self.dynamics.ac: actions,
                self.LR: cur_lr
            }
            value = self.sess.run(self.run_ops_dynamics, td_map)[1:]
            value_ops_dynamics.append(value)
        value_ops_dynamics = np.asarray(value_ops_dynamics)
        value_ops_dynamics = list(np.mean(value_ops_dynamics, axis=0))
        return self.name_ops_dynamics.copy(), value_ops_dynamics

    def step(self, observation, **kwargs):
        if self.residual and not self.dynamics.dummy:
            kwargs["goals"] = kwargs["goals"] - observation
        return self.step_model.evaluate(
            [self.step_model.action, self.step_model_p, self.step_model.state],
            observation, **kwargs)
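
# For intuition: the trust-region update used by all three Model variants above
# solves, in closed form,
#     minimize_z 0.5 * ||g - z||^2   s.t.   k^T z <= delta,
# giving z* = g - max(0, (k^T g - delta) / ||k||^2) * k, which is exactly the
# `adj` term in the code (cf. the ACER paper, Wang et al.). A tiny NumPy check
# of that projection (the vectors here are arbitrary example values):
#
# import numpy as np
#
# g = np.array([0.5, -1.2, 0.3])   # gradient wrt the policy statistics f
# k = np.array([1.0, 0.2, -0.4])   # gradient of KL(f_polyak || f) wrt f
# delta = 0.1                      # trust-region radius
#
# adj = max(0.0, (k @ g - delta) / (k @ k))
# z = g - adj * k
# assert k @ z <= delta + 1e-8     # the adjusted direction respects the constraint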