def _create_network(self, reuse=False): logger.info("Creating a SAC agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() self._create_normalizer(reuse) batch_tf = self._get_batch_tf() # networks self._create_target_main(SAC_ActorCritic, reuse, batch_tf) # loss functions clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = self._clip_target(batch_tf, clip_range, self.target.v_tf) q_backup_tf = tf.stop_gradient(target_tf) v_backup_tf = tf.stop_gradient(self.main.min_q_pi_tf - self.sac_alpha * self.main.logp_pi_tf) q1_loss_tf = 0.5 * tf.reduce_mean((q_backup_tf - self.main.q1_tf) ** 2) q2_loss_tf = 0.5 * tf.reduce_mean((q_backup_tf - self.main.q2_tf) ** 2) v_loss_tf = 0.5 * tf.reduce_mean((v_backup_tf - self.main.v_tf) ** 2) self.abs_tf_error_tf = tf.reduce_mean(tf.abs(q_backup_tf - self.main.q1_tf) + tf.abs(q_backup_tf -self.main.q2_tf)) self.value_loss_tf = q1_loss_tf + q2_loss_tf + v_loss_tf self.pi_loss_tf = tf.reduce_mean(self.sac_alpha * self.main.logp_pi_tf - self.main.q1_pi_tf) # virables value_params = get_var(self._name_variable('q')) + get_var(self._name_variable('v')) pi_params = get_var(self._name_variable('pi')) # gradients V_grads_tf = tf.gradients(self.value_loss_tf, value_params) pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_params) self.V_grad_tf = flatten_grads(grads=V_grads_tf, var_list=value_params) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_params) # optimizers self.V_adam = MpiAdam(value_params, scale_grad_by_procs=False) self.pi_adam = MpiAdam(pi_params, scale_grad_by_procs=False) # polyak averaging self.main_vars = get_var(self._name_variable('pi')) + get_var(self._name_variable('q1')) + get_var(self._name_variable('q2')) + get_var(self._name_variable('v')) self.target_vars = get_var(self._name_variable('pi', main=False)) + get_var(self._name_variable('q1', main=False)) + get_var(self._name_variable('q2', main=False)) + get_var(self._name_variable('v', main=False)) self.init_target_net_op = list(map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list(map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), \ zip(self.target_vars, self.main_vars))) # initialize all variables self.global_vars = get_var(self.scope, key='global') tf.variables_initializer(self.global_vars).run() self._sync_optimizers() self._init_target_net()
def __init__(self, obs0, action, obs1, clip_norm, hidden, layers):
    logger.info("Using Random Network Distillation")
    rep_size = hidden
    with tf.variable_scope('random_network_distillation'):
        self.rnd_scope = tf.get_variable_scope().name

        # fixed random target network
        with tf.variable_scope('target_network'):
            xr = nn(obs1, [hidden] * layers + [rep_size])
        # predictor network, trained to match the target features
        with tf.variable_scope('predictor_network'):
            self.predictor_scope = tf.get_variable_scope().name
            xr_hat = nn(obs1, [hidden] * layers + [rep_size])

        # count predictor parameters (the target has the same architecture)
        total_parameters = 0
        for variable in _vars(self.predictor_scope):
            # shape is a list of tf.Dimension
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        logger.info(
            "params in RND predictor network: {}".format(total_parameters))

        self.feat_var = tf.reduce_mean(tf.nn.moments(xr, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(xr))

        # loss functions: per-sample squared error between target and predictor features
        self.per_sample_loss_tf = tf.reduce_mean(
            tf.square(tf.stop_gradient(xr) - xr_hat), axis=-1, keepdims=True)
        self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)
        self.dynamics_grads = U.flatgrad(self.mean_loss_tf,
                                         _vars(self.predictor_scope),
                                         clip_norm=clip_norm)

        # optimizers
        self.dynamics_adam = MpiAdam(_vars(self.predictor_scope),
                                     scale_grad_by_procs=False)
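# A minimal NumPy sketch of the RND idea implemented above: a fixed, randomly
# initialized target network embeds each observation, a predictor network is
# trained to match that embedding, and the per-sample prediction error serves
# as a novelty bonus. The names below (rnd_bonus, W_target, W_pred) are
# illustrative and are not part of this codebase.
import numpy as np


def rnd_bonus(obs, W_target, W_pred):
    """Per-sample squared prediction error of a linear predictor against a
    fixed random linear target; stands in for the nn(...) networks above."""
    target_feat = obs @ W_target   # fixed random embedding, never trained
    pred_feat = obs @ W_pred       # trained predictor (untrained stand-in here)
    return np.mean((target_feat - pred_feat) ** 2, axis=-1)

# Usage (illustrative): observations the predictor has not fit yet yield a
# large bonus, which is what per_sample_loss_tf provides in the TF graph:
#   rng = np.random.RandomState(0)
#   bonus = rnd_bonus(rng.randn(5, 8), rng.randn(8, 16), np.zeros((8, 16)))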
class SAC(Algorithm):
    @store_args
    def __init__(self, buffer, input_dims, hidden, layers, polyak, Q_lr, pi_lr,
                 norm_eps, norm_clip, max_u, action_l2, clip_obs, scope,
                 subtract_goals, relative_goals, clip_pos_returns, clip_return,
                 gamma, vloss_type='normal', priority=False, sac_alpha=0.03,
                 reuse=False, **kwargs):
        """Implementation of SAC that is used in combination with Hindsight
        Experience Replay (HER).

        Args:
            sac_alpha: entropy-regularization (temperature) coefficient in SAC
        """
        super(SAC, self).__init__(**self.__dict__)

    def _name_variable(self, name, main=True):
        if main:
            return self.scope + '/main/' + name
        else:
            return self.scope + '/target/' + name

    def _create_network(self, reuse=False):
        logger.info("Creating a SAC agent with action space %d x %s..." %
                    (self.dimu, self.max_u))
        self.sess = tf_util.get_session()

        # normalizer for input
        self._create_normalizer(reuse)
        batch_tf = self._get_batch_tf()

        # networks
        self._create_target_main(SAC_ActorCritic, reuse, batch_tf)

        # loss functions
        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else np.inf)
        target_tf = self._clip_target(batch_tf, clip_range, self.target.v_tf)
        q_backup_tf = tf.stop_gradient(target_tf)
        v_backup_tf = tf.stop_gradient(self.main.min_q_pi_tf -
                                       self.sac_alpha * self.main.logp_pi_tf)
        q1_loss_tf = 0.5 * tf.reduce_mean((q_backup_tf - self.main.q1_tf) ** 2)
        q2_loss_tf = 0.5 * tf.reduce_mean((q_backup_tf - self.main.q2_tf) ** 2)
        v_loss_tf = 0.5 * tf.reduce_mean((v_backup_tf - self.main.v_tf) ** 2)
        self.abs_tf_error_tf = tf.reduce_mean(
            tf.abs(q_backup_tf - self.main.q1_tf) +
            tf.abs(q_backup_tf - self.main.q2_tf))
        self.value_loss_tf = q1_loss_tf + q2_loss_tf + v_loss_tf
        self.pi_loss_tf = tf.reduce_mean(
            self.sac_alpha * self.main.logp_pi_tf - self.main.q1_pi_tf)

        # variables
        value_params = get_var(self._name_variable('q')) + get_var(self._name_variable('v'))
        pi_params = get_var(self._name_variable('pi'))

        # gradients
        V_grads_tf = tf.gradients(self.value_loss_tf, value_params)
        pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_params)
        self.V_grad_tf = flatten_grads(grads=V_grads_tf, var_list=value_params)
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_params)

        # optimizers
        self.V_adam = MpiAdam(value_params, scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(pi_params, scale_grad_by_procs=False)

        # polyak averaging
        self.main_vars = (get_var(self._name_variable('pi')) +
                          get_var(self._name_variable('q1')) +
                          get_var(self._name_variable('q2')) +
                          get_var(self._name_variable('v')))
        self.target_vars = (get_var(self._name_variable('pi', main=False)) +
                            get_var(self._name_variable('q1', main=False)) +
                            get_var(self._name_variable('q2', main=False)) +
                            get_var(self._name_variable('v', main=False)))
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # initialize all variables
        self.global_vars = get_var(self.scope, key='global')
        tf.variables_initializer(self.global_vars).run()
        self._sync_optimizers()
        self._init_target_net()

    def _sync_optimizers(self):
        self.V_adam.sync()
        self.pi_adam.sync()

    def _grads(self):
        critic_loss, actor_loss, V_grad, pi_grad, abs_td_error = self.sess.run([
            self.value_loss_tf, self.pi_loss_tf, self.V_grad_tf,
            self.pi_grad_tf, self.abs_tf_error_tf
        ])
        return critic_loss, actor_loss, V_grad, pi_grad, abs_td_error

    def _update(self, V_grad, pi_grad):
        self.V_adam.update(V_grad, self.Q_lr)
        self.pi_adam.update(pi_grad, self.pi_lr)

    # SAC does not need external action noise; exploration comes from the stochastic policy
    def get_actions(self, o, ag, g, noise_eps=0., random_eps=0.,
                    use_target_net=False, compute_Q=False):
        o, g = self._preprocess_og(o=o, g=g, ag=ag)
        if not noise_eps and not random_eps:
            u = self.simple_get_action(o, g, use_target_net, deterministic=True)
        else:
            u = self.simple_get_action(o, g, use_target_net, deterministic=False)
        if compute_Q:
            Q_pi = self.get_Q_fun(o, g)

        u = np.clip(u, -self.max_u, self.max_u)
        if u.shape[0] == 1:
            u = u[0]
        if compute_Q:
            return [u, Q_pi]
        else:
            return u

    def simple_get_action(self, o, g, use_target_net=False, deterministic=False):
        o, g = self._preprocess_og(o=o, g=g)
        # with n-step returns, self.target tends to perform better here
        policy = self.target if use_target_net else self.main
        act_tf = policy.mu_tf if deterministic else policy.pi_tf
        action, logp_pi, min_q_pi, q1_pi, q2_pi, log_std = self.sess.run(
            [act_tf, policy.logp_pi_tf, policy.min_q_pi_tf,
             policy.q1_pi_tf, policy.q2_pi_tf, policy.log_std],
            feed_dict={
                policy.o_tf: o.reshape(-1, self.dimo),
                policy.g_tf: g.reshape(-1, self.dimg)
            })
        return action
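# A minimal NumPy sketch of the two backup targets built in _create_network
# above, assuming _clip_target forms r + gamma * v_target_next and clips it to
# clip_range. alpha and gamma stand in for self.sac_alpha and self.gamma; the
# array and function names are illustrative, not the class API.
import numpy as np


def sac_backups(r, v_target_next, q1_pi, q2_pi, logp_pi,
                alpha=0.03, gamma=0.98, clip_return=50.):
    q_backup = np.clip(r + gamma * v_target_next, -clip_return, 0.)   # target for q1/q2
    v_backup = np.minimum(q1_pi, q2_pi) - alpha * logp_pi             # target for v
    return q_backup, v_backup


def polyak_update(target_params, main_params, polyak=0.95):
    # soft target update mirroring update_target_net_op
    return [polyak * t + (1. - polyak) * m
            for t, m in zip(target_params, main_params)]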
class ForwardDynamics:
    @store_args
    def __init__(self, dimo, dimu, o_stats, u_stats, clip_norm=5,
                 norm_eps=1e-4, hidden=400, layers=4, learning_rate=1e-3):
        self.sess = U.get_session()
        with tf.variable_scope('forward_dynamics'):
            self.obs0 = tf.placeholder(tf.float32, shape=(None, self.dimo), name='obs0')
            self.obs1 = tf.placeholder(tf.float32, shape=(None, self.dimo), name='obs1')
            self.actions = tf.placeholder(tf.float32, shape=(None, self.dimu), name='actions')
            self.dynamics_scope = tf.get_variable_scope().name

            obs0_norm = self.o_stats.normalize(self.obs0)
            obs1_norm = self.o_stats.normalize(self.obs1)
            actions_norm = self.u_stats.normalize(self.actions)

            # predict the normalized state change from (obs0, action)
            input = tf.concat(values=[obs0_norm, actions_norm], axis=-1)
            self.next_state_diff_tf = nn(input, [hidden] * layers + [self.dimo])
            self.next_state_denorm = self.o_stats.denormalize(
                self.next_state_diff_tf + obs0_norm)

            # unnormalized variant, kept for reference:
            # input = tf.concat(values=[self.obs0, self.actions], axis=-1)
            # self.next_state_diff_tf = nn(input, [hidden] * layers + [self.dimo])
            # self.next_state_tf = self.next_state_diff_tf + self.obs0
            # self.next_state_denorm = self.next_state_tf

            # loss functions: per-sample L1 error between predicted and actual normalized change
            self.per_sample_loss_tf = tf.reduce_mean(
                tf.abs(self.next_state_diff_tf - obs1_norm + obs0_norm), axis=1)
            # self.per_sample_loss_tf = tf.reduce_mean(tf.abs(self.next_state_tf - self.obs1), axis=1)
            self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)
            self.test_loss_tf = tf.reduce_mean(
                tf.abs(self.next_state_denorm - self.obs1))
            # self.test_loss_tf = tf.reduce_mean(tf.abs(self.next_state_tf - self.obs1))
            self.dynamics_grads = U.flatgrad(self.mean_loss_tf,
                                             _vars(self.dynamics_scope),
                                             clip_norm=clip_norm)

            # optimizers
            self.dynamics_adam = MpiAdam(_vars(self.dynamics_scope),
                                         scale_grad_by_procs=False)

            # initialization
            tf.variables_initializer(_vars(self.dynamics_scope)).run()
            self.dynamics_adam.sync()

    def predict_next_state(self, obs0, actions):
        obs1 = self.sess.run(self.next_state_denorm,
                             feed_dict={
                                 self.obs0: obs0,
                                 self.actions: actions
                             })
        return obs1

    def _get_intrinsic_rewards(self, obs0, actions, obs1):
        intrinsic_rewards = self.sess.run(self.per_sample_loss_tf,
                                          feed_dict={
                                              self.obs0: obs0,
                                              self.actions: actions,
                                              self.obs1: obs1
                                          })
        return intrinsic_rewards

    def update(self, obs0, actions, obs1):
        dynamics_grads, dynamics_loss, dynamics_per_sample_loss, test_loss = self.sess.run(
            [
                self.dynamics_grads, self.mean_loss_tf,
                self.per_sample_loss_tf, self.test_loss_tf
            ],
            feed_dict={
                self.obs0: obs0,
                self.actions: actions,
                self.obs1: obs1
            })
        self.dynamics_adam.update(dynamics_grads, stepsize=self.learning_rate)
        return dynamics_loss, test_loss

    def get_intrinsic_rewards(self, obs0, actions, obs1, update=True):
        # note: with update=True this returns (mean loss, test loss) from
        # update(); with update=False it returns the per-sample losses
        if update:
            return self.update(obs0, actions, obs1)
        else:
            return self._get_intrinsic_rewards(obs0, actions, obs1)
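# A minimal NumPy sketch of the curiosity signal that ForwardDynamics computes
# in TensorFlow: the network predicts the normalized state change, and the
# per-sample L1 error against the actual normalized change becomes the
# intrinsic reward. predicted_delta stands in for the nn(...) output and is
# purely illustrative.
import numpy as np


def intrinsic_reward(obs0_norm, obs1_norm, predicted_delta):
    actual_delta = obs1_norm - obs0_norm
    # matches per_sample_loss_tf: mean absolute error per sample
    return np.mean(np.abs(predicted_delta - actual_delta), axis=1)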
class ForwardDynamicsNumpy:
    @store_args
    def __init__(self, dimo, dimu, clip_norm=5, norm_eps=1e-4, hidden=256,
                 layers=8, learning_rate=1e-3):
        self.obs_normalizer = NormalizerNumpy(size=dimo, eps=norm_eps)
        self.action_normalizer = NormalizerNumpy(size=dimu, eps=norm_eps)
        self.sess = U.get_session()
        with tf.variable_scope('forward_dynamics_numpy'):
            self.obs0_norm = tf.placeholder(tf.float32, shape=(None, self.dimo), name='obs0')
            self.obs1_norm = tf.placeholder(tf.float32, shape=(None, self.dimo), name='obs1')
            self.actions_norm = tf.placeholder(tf.float32, shape=(None, self.dimu), name='actions')
            self.dynamics_scope = tf.get_variable_scope().name

            # inputs are already normalized on the NumPy side
            input = tf.concat(values=[self.obs0_norm, self.actions_norm], axis=-1)
            self.next_state_diff_tf = nn(input, [hidden] * layers + [self.dimo])
            self.next_state_norm_tf = self.next_state_diff_tf + self.obs0_norm

            # loss functions
            self.per_sample_loss_tf = tf.reduce_mean(
                tf.abs(self.next_state_diff_tf - self.obs1_norm + self.obs0_norm), axis=1)
            self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)
            self.dynamics_grads = U.flatgrad(self.mean_loss_tf,
                                             _vars(self.dynamics_scope),
                                             clip_norm=clip_norm)

            # optimizers
            self.dynamics_adam = MpiAdam(_vars(self.dynamics_scope),
                                         scale_grad_by_procs=False)

            # initialization
            tf.variables_initializer(_vars(self.dynamics_scope)).run()
            self.dynamics_adam.sync()

    def predict_next_state(self, obs0, actions):
        obs0_norm = self.obs_normalizer.normalize(obs0)
        action_norm = self.action_normalizer.normalize(actions)
        obs1_norm = self.sess.run(self.next_state_norm_tf,
                                  feed_dict={
                                      self.obs0_norm: obs0_norm,
                                      self.actions_norm: action_norm
                                  })
        obs1 = self.obs_normalizer.denormalize(obs1_norm)
        return obs1

    def clip_gauss_noise(self, size):
        # small clipped Gaussian noise used to regularize the dynamics inputs
        clip_range = 0.002
        std = 0.001
        return np.clip(np.random.normal(0, std, size), -clip_range, clip_range)
        # return 0

    def update(self, obs0, actions, obs1, times=1):
        self.obs_normalizer.update(obs0)
        self.obs_normalizer.update(obs1)
        self.action_normalizer.update(actions)
        for _ in range(times):
            obs0_norm = self.obs_normalizer.normalize(
                obs0) + self.clip_gauss_noise(size=self.dimo)
            action_norm = self.action_normalizer.normalize(
                actions) + self.clip_gauss_noise(size=self.dimu)
            obs1_norm = self.obs_normalizer.normalize(
                obs1)  # no noise on the target
            dynamics_grads, dynamics_loss, dynamics_per_sample_loss = self.sess.run(
                [
                    self.dynamics_grads, self.mean_loss_tf,
                    self.per_sample_loss_tf
                ],
                feed_dict={
                    self.obs0_norm: obs0_norm,
                    self.actions_norm: action_norm,
                    self.obs1_norm: obs1_norm
                })
            self.dynamics_adam.update(dynamics_grads,
                                      stepsize=self.learning_rate)
        return dynamics_loss
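# NormalizerNumpy is assumed to track running mean/std on the CPU; a minimal
# stand-in with the same normalize/denormalize/update surface might look like
# the sketch below (eps mirrors norm_eps above). Illustrative only, not the
# actual NormalizerNumpy implementation.
import numpy as np


class RunningNormalizer:
    def __init__(self, size, eps=1e-4):
        self.sum = np.zeros(size)
        self.sumsq = np.zeros(size)
        self.count = eps   # avoids division by zero before the first update
        self.eps = eps

    def update(self, x):
        x = np.asarray(x).reshape(-1, self.sum.shape[0])
        self.sum += x.sum(axis=0)
        self.sumsq += np.square(x).sum(axis=0)
        self.count += x.shape[0]

    def _mean_std(self):
        mean = self.sum / self.count
        var = np.maximum(self.sumsq / self.count - np.square(mean),
                         np.square(self.eps))
        return mean, np.sqrt(var)

    def normalize(self, x):
        mean, std = self._mean_std()
        return (x - mean) / std

    def denormalize(self, x_norm):
        mean, std = self._mean_std()
        return x_norm * std + mean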
class DDPG(Algorithm):
    @store_args
    def __init__(self, buffer, input_dims, hidden, layers, polyak, Q_lr, pi_lr,
                 norm_eps, norm_clip, max_u, action_l2, clip_obs, scope,
                 subtract_goals, relative_goals, clip_pos_returns, clip_return,
                 gamma, vloss_type='normal', priority=False, reuse=False,
                 **kwargs):
        """See the Algorithm base class for argument documentation."""
        super(DDPG, self).__init__(**self.__dict__)

    def _create_network(self, reuse=False):
        logger.info("Creating a DDPG agent with action space %d x %s..." %
                    (self.dimu, self.max_u))
        self.sess = tf_util.get_session()

        # normalizer for input
        self._create_normalizer(reuse)
        batch_tf = self._get_batch_tf()

        # networks
        self._create_target_main(ActorCritic, reuse, batch_tf)

        # loss functions
        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else np.inf)
        target_tf = self._clip_target(batch_tf, clip_range, target_Q_pi_tf)
        self.abs_td_error_tf = tf.abs(
            tf.stop_gradient(target_tf) - self.main.Q_tf)
        self.Q_loss = tf.square(self.abs_td_error_tf)
        if self.priority:
            # importance-sampling weights from the prioritized replay buffer
            self.Q_loss_tf = tf.reduce_mean(batch_tf['w'] * self.Q_loss)
        else:
            self.Q_loss_tf = tf.reduce_mean(self.Q_loss)
        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))

        # variables
        self.main_Q_var = get_var(self.scope + '/main/Q')
        self.main_pi_var = get_var(self.scope + '/main/pi')
        self.target_Q_var = get_var(self.scope + '/target/Q')
        self.target_pi_var = get_var(self.scope + '/target/pi')

        # gradients
        Q_grads_tf = tf.gradients(self.Q_loss_tf, self.main_Q_var)
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self.main_pi_var)
        assert len(self.main_Q_var) == len(Q_grads_tf)
        assert len(self.main_pi_var) == len(pi_grads_tf)
        self.Q_grads_vars_tf = zip(Q_grads_tf, self.main_Q_var)
        self.pi_grads_vars_tf = zip(pi_grads_tf, self.main_pi_var)
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf,
                                       var_list=self.main_Q_var)
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf,
                                        var_list=self.main_pi_var)

        # optimizers
        self.Q_adam = MpiAdam(self.main_Q_var, scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self.main_pi_var, scale_grad_by_procs=False)

        # polyak averaging
        self.main_vars = self.main_Q_var + self.main_pi_var
        self.target_vars = self.target_Q_var + self.target_pi_var
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # initialize all variables
        self.global_vars = get_var(self.scope, key='global')
        tf.variables_initializer(self.global_vars).run()
        self._sync_optimizers()
        self._init_target_net()

    def _sync_optimizers(self):
        self.Q_adam.sync()
        self.pi_adam.sync()

    def _grads(self):
        # Avoid feed_dict here for performance!
        # Note: the mean of Q_pi_tf is reported as the "actor loss" statistic.
        critic_loss, actor_loss, Q_grad, pi_grad, abs_td_error = self.sess.run(
            [
                self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf,
                self.pi_grad_tf, self.abs_td_error_tf
            ])
        return critic_loss, actor_loss, Q_grad, pi_grad, abs_td_error

    def _update(self, Q_grad, pi_grad):
        self.Q_adam.update(Q_grad, self.Q_lr)
        self.pi_adam.update(pi_grad, self.pi_lr)
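# A minimal NumPy sketch of the clipped TD target and critic loss built in
# _create_network above, assuming _clip_target forms r + gamma * Q_pi(target)
# and clips it to clip_range. Array and function names are illustrative.
import numpy as np


def ddpg_critic_loss(r, q_pi_target_next, q_main, gamma=0.98,
                     clip_return=50., clip_pos_returns=True, weights=None):
    clip_range = (-clip_return, 0. if clip_pos_returns else np.inf)
    target = np.clip(r + gamma * q_pi_target_next, *clip_range)
    abs_td_error = np.abs(target - q_main)   # mirrors abs_td_error_tf
    sq = np.square(abs_td_error)
    if weights is not None:                  # prioritized-replay case (batch_tf['w'])
        return np.mean(weights * sq), abs_td_error
    return np.mean(sq), abs_td_error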
def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() # normalizer for input self._create_normalizer(reuse) batch_tf = self._get_batch_tf() # networks self._create_target_main(ActorCritic, reuse, batch_tf) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = self._clip_target(batch_tf, clip_range, target_Q_pi_tf) self.abs_td_error_tf = tf.abs( tf.stop_gradient(target_tf) - self.main.Q_tf) self.Q_loss = tf.square(self.abs_td_error_tf) if self.priority: self.Q_loss_tf = tf.reduce_mean(batch_tf['w'] * self.Q_loss) else: self.Q_loss_tf = tf.reduce_mean(self.Q_loss) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) # varibles self.main_Q_var = get_var(self.scope + '/main/Q') self.main_pi_var = get_var(self.scope + '/main/pi') self.target_Q_var = get_var(self.scope + '/target/Q') self.target_pi_var = get_var(self.scope + '/target/pi') Q_grads_tf = tf.gradients(self.Q_loss_tf, self.main_Q_var) pi_grads_tf = tf.gradients(self.pi_loss_tf, self.main_pi_var) assert len(self.main_Q_var) == len(Q_grads_tf) assert len(self.main_pi_var) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self.main_Q_var) self.pi_grads_vars_tf = zip(pi_grads_tf, self.main_pi_var) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self.main_Q_var) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self.main_pi_var) # optimizers self.Q_adam = MpiAdam(self.main_Q_var, scale_grad_by_procs=False) self.pi_adam = MpiAdam(self.main_pi_var, scale_grad_by_procs=False) self.main_vars = self.main_Q_var + self.main_pi_var self.target_vars = self.target_Q_var + self.target_pi_var self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables self.global_vars = get_var(self.scope, key='global') tf.variables_initializer(self.global_vars).run() self._sync_optimizers() self._init_target_net()