class PGAgent(TFAgent):
    NAME = 'PG'

    ACTOR_NET_KEY = 'ActorNet'
    ACTOR_STEPSIZE_KEY = 'ActorStepsize'
    ACTOR_MOMENTUM_KEY = 'ActorMomentum'
    ACTOR_WEIGHT_DECAY_KEY = 'ActorWeightDecay'
    ACTOR_INIT_OUTPUT_SCALE_KEY = 'ActorInitOutputScale'

    CRITIC_NET_KEY = 'CriticNet'
    CRITIC_STEPSIZE_KEY = 'CriticStepsize'
    CRITIC_MOMENTUM_KEY = 'CriticMomentum'
    CRITIC_WEIGHT_DECAY_KEY = 'CriticWeightDecay'

    EXP_ACTION_FLAG = 1 << 0

    def __init__(self, world, id, json_data):
        self._exp_action = False
        super().__init__(world, id, json_data)
        return

    def reset(self):
        super().reset()
        self._exp_action = False
        return

    def _check_action_space(self):
        action_space = self.get_action_space()
        return action_space == ActionSpace.Continuous

    def _load_params(self, json_data):
        super()._load_params(json_data)
        self.val_min, self.val_max = self._calc_val_bounds(self.discount)
        self.val_fail, self.val_succ = self._calc_term_vals(self.discount)
        return

    def _build_nets(self, json_data):
        assert self.ACTOR_NET_KEY in json_data
        assert self.CRITIC_NET_KEY in json_data

        actor_net_name = json_data[self.ACTOR_NET_KEY]
        critic_net_name = json_data[self.CRITIC_NET_KEY]
        actor_init_output_scale = 1 if (self.ACTOR_INIT_OUTPUT_SCALE_KEY not in json_data) else json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]

        s_size = self.get_state_size()
        g_size = self.get_goal_size()
        a_size = self.get_action_size()

        # setup input tensors
        self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size], name="s")  # observations
        self.tar_val_tf = tf.placeholder(tf.float32, shape=[None], name="tar_val")  # target values
        self.adv_tf = tf.placeholder(tf.float32, shape=[None], name="adv")  # advantages
        self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size], name="a")  # target actions
        self.g_tf = tf.placeholder(tf.float32, shape=([None, g_size] if self.has_goal() else None), name="g")  # goals

        with tf.variable_scope('main'):
            with tf.variable_scope('actor'):
                self.actor_tf = self._build_net_actor(actor_net_name, actor_init_output_scale)
            with tf.variable_scope('critic'):
                self.critic_tf = self._build_net_critic(critic_net_name)

        if (self.actor_tf != None):
            Logger.print('Built actor net: ' + actor_net_name)

        if (self.critic_tf != None):
            Logger.print('Built critic net: ' + critic_net_name)

        return

    def _build_normalizers(self):
        super()._build_normalizers()
        with self.sess.as_default(), self.graph.as_default(), tf.variable_scope(self.tf_scope):
            with tf.variable_scope(self.RESOURCE_SCOPE):
                val_offset, val_scale = self._calc_val_offset_scale(self.discount)
                self.val_norm = TFNormalizer(self.sess, 'val_norm', 1)
                self.val_norm.set_mean_std(-val_offset, 1.0 / val_scale)
        return

    def _init_normalizers(self):
        super()._init_normalizers()
        with self.sess.as_default(), self.graph.as_default():
            self.val_norm.update()
        return

    def _load_normalizers(self):
        super()._load_normalizers()
        self.val_norm.load()
        return

    def _build_losses(self, json_data):
        actor_weight_decay = 0 if (self.ACTOR_WEIGHT_DECAY_KEY not in json_data) else json_data[self.ACTOR_WEIGHT_DECAY_KEY]
        critic_weight_decay = 0 if (self.CRITIC_WEIGHT_DECAY_KEY not in json_data) else json_data[self.CRITIC_WEIGHT_DECAY_KEY]

        norm_val_diff = self.val_norm.normalize_tf(self.tar_val_tf) - self.val_norm.normalize_tf(self.critic_tf)
        self.critic_loss_tf = 0.5 * tf.reduce_mean(tf.square(norm_val_diff))

        if (critic_weight_decay != 0):
            self.critic_loss_tf += critic_weight_decay * self._weight_decay_loss('main/critic')

        norm_a_mean_tf = self.a_norm.normalize_tf(self.actor_tf)
        norm_a_diff = self.a_norm.normalize_tf(self.a_tf) - norm_a_mean_tf

        self.actor_loss_tf = tf.reduce_sum(tf.square(norm_a_diff), axis=-1)
        self.actor_loss_tf *= self.adv_tf
        self.actor_loss_tf = 0.5 * tf.reduce_mean(self.actor_loss_tf)

        norm_a_bound_min = self.a_norm.normalize(self.a_bound_min)
        norm_a_bound_max = self.a_norm.normalize(self.a_bound_max)
        a_bound_loss = TFUtil.calc_bound_loss(norm_a_mean_tf, norm_a_bound_min, norm_a_bound_max)
        a_bound_loss /= self.exp_params_curr.noise
        self.actor_loss_tf += a_bound_loss

        if (actor_weight_decay != 0):
            self.actor_loss_tf += actor_weight_decay * self._weight_decay_loss('main/actor')

        return

    def _build_solvers(self, json_data):
        actor_stepsize = 0.001 if (self.ACTOR_STEPSIZE_KEY not in json_data) else json_data[self.ACTOR_STEPSIZE_KEY]
        actor_momentum = 0.9 if (self.ACTOR_MOMENTUM_KEY not in json_data) else json_data[self.ACTOR_MOMENTUM_KEY]
        critic_stepsize = 0.01 if (self.CRITIC_STEPSIZE_KEY not in json_data) else json_data[self.CRITIC_STEPSIZE_KEY]
        critic_momentum = 0.9 if (self.CRITIC_MOMENTUM_KEY not in json_data) else json_data[self.CRITIC_MOMENTUM_KEY]

        critic_vars = self._tf_vars('main/critic')
        critic_opt = tf.train.MomentumOptimizer(learning_rate=critic_stepsize, momentum=critic_momentum)
        self.critic_grad_tf = tf.gradients(self.critic_loss_tf, critic_vars)
        self.critic_solver = MPISolver(self.sess, critic_opt, critic_vars)

        actor_vars = self._tf_vars('main/actor')
        actor_opt = tf.train.MomentumOptimizer(learning_rate=actor_stepsize, momentum=actor_momentum)
        self.actor_grad_tf = tf.gradients(self.actor_loss_tf, actor_vars)
        self.actor_solver = MPISolver(self.sess, actor_opt, actor_vars)

        return

    def _build_net_actor(self, net_name, init_output_scale):
        norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
        input_tfs = [norm_s_tf]
        if (self.has_goal()):
            norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
            input_tfs += [norm_g_tf]

        h = NetBuilder.build_net(net_name, input_tfs)
        norm_a_tf = tf.layers.dense(inputs=h, units=self.get_action_size(), activation=None,
                                    kernel_initializer=tf.random_uniform_initializer(minval=-init_output_scale,
                                                                                     maxval=init_output_scale))

        a_tf = self.a_norm.unnormalize_tf(norm_a_tf)
        return a_tf

    def _build_net_critic(self, net_name):
        norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
        input_tfs = [norm_s_tf]
        if (self.has_goal()):
            norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
            input_tfs += [norm_g_tf]

        h = NetBuilder.build_net(net_name, input_tfs)
        norm_val_tf = tf.layers.dense(inputs=h, units=1, activation=None,
                                      kernel_initializer=TFUtil.xavier_initializer)

        norm_val_tf = tf.reshape(norm_val_tf, [-1])
        val_tf = self.val_norm.unnormalize_tf(norm_val_tf)
        return val_tf

    def _initialize_vars(self):
        super()._initialize_vars()
        self._sync_solvers()
        return

    def _sync_solvers(self):
        self.actor_solver.sync()
        self.critic_solver.sync()
        return

    def _decide_action(self, s, g):
        with self.sess.as_default(), self.graph.as_default():
            self._exp_action = False

            a = self._eval_actor(s, g)[0]
            logp = 0

            if self._enable_stoch_policy():
                # epsilon-greedy
                rand_action = MathUtil.flip_coin(self.exp_params_curr.rate)
                if rand_action:
                    norm_exp_noise = np.random.randn(*a.shape)
                    norm_exp_noise *= self.exp_params_curr.noise
                    exp_noise = norm_exp_noise * self.a_norm.std
                    a += exp_noise

                    logp = self._calc_action_logp(norm_exp_noise)
                    self._exp_action = True

        return a, logp

    def _enable_stoch_policy(self):
        return self.enable_training and (self._mode == self.Mode.TRAIN or self._mode == self.Mode.TRAIN_END)

    def _eval_actor(self, s, g):
        s = np.reshape(s, [-1, self.get_state_size()])
        g = np.reshape(g, [-1, self.get_goal_size()]) if self.has_goal() else None

        feed = {
            self.s_tf: s,
            self.g_tf: g
        }

        a = self.actor_tf.eval(feed)
        return a

    def _eval_critic(self, s, g):
        with self.sess.as_default(), self.graph.as_default():
            s = np.reshape(s, [-1, self.get_state_size()])
            g = np.reshape(g, [-1, self.get_goal_size()]) if self.has_goal() else None

            feed = {
                self.s_tf: s,
                self.g_tf: g
            }

            val = self.critic_tf.eval(feed)
        return val

    def _record_flags(self):
        flags = int(0)
        if (self._exp_action):
            flags = flags | self.EXP_ACTION_FLAG
        return flags

    def _train_step(self):
        super()._train_step()

        critic_loss = self._update_critic()
        actor_loss = self._update_actor()
        critic_loss = MPIUtil.reduce_avg(critic_loss)
        actor_loss = MPIUtil.reduce_avg(actor_loss)

        critic_stepsize = self.critic_solver.get_stepsize()
        actor_stepsize = self.actor_solver.get_stepsize()

        self.logger.log_tabular('Critic_Loss', critic_loss)
        self.logger.log_tabular('Critic_Stepsize', critic_stepsize)
        self.logger.log_tabular('Actor_Loss', actor_loss)
        self.logger.log_tabular('Actor_Stepsize', actor_stepsize)

        return

    def _update_critic(self):
        idx = self.replay_buffer.sample(self._local_mini_batch_size)
        s = self.replay_buffer.get('states', idx)
        g = self.replay_buffer.get('goals', idx) if self.has_goal() else None

        tar_V = self._calc_updated_vals(idx)
        tar_V = np.clip(tar_V, self.val_min, self.val_max)

        feed = {
            self.s_tf: s,
            self.g_tf: g,
            self.tar_val_tf: tar_V
        }

        loss, grads = self.sess.run([self.critic_loss_tf, self.critic_grad_tf], feed)
        self.critic_solver.update(grads)
        return loss

    def _update_actor(self):
        key = self.EXP_ACTION_FLAG
        idx = self.replay_buffer.sample_filtered(self._local_mini_batch_size, key)
        has_goal = self.has_goal()

        s = self.replay_buffer.get('states', idx)
        g = self.replay_buffer.get('goals', idx) if has_goal else None
        a = self.replay_buffer.get('actions', idx)

        V_new = self._calc_updated_vals(idx)
        V_old = self._eval_critic(s, g)
        adv = V_new - V_old

        feed = {
            self.s_tf: s,
            self.g_tf: g,
            self.a_tf: a,
            self.adv_tf: adv
        }

        loss, grads = self.sess.run([self.actor_loss_tf, self.actor_grad_tf], feed)
        self.actor_solver.update(grads)

        return loss

    def _calc_updated_vals(self, idx):
        r = self.replay_buffer.get('rewards', idx)

        if self.discount == 0:
            new_V = r
        else:
            next_idx = self.replay_buffer.get_next_idx(idx)
            s_next = self.replay_buffer.get('states', next_idx)
            g_next = self.replay_buffer.get('goals', next_idx) if self.has_goal() else None

            is_end = self.replay_buffer.is_path_end(idx)
            is_fail = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Fail)
            is_succ = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Succ)
            is_fail = np.logical_and(is_end, is_fail)
            is_succ = np.logical_and(is_end, is_succ)

            V_next = self._eval_critic(s_next, g_next)
            V_next[is_fail] = self.val_fail
            V_next[is_succ] = self.val_succ

            new_V = r + self.discount * V_next
        return new_V

    def _calc_action_logp(self, norm_action_deltas):
        # norm action deltas are for the normalized actions (scaled by self.a_norm.std)
        stdev = self.exp_params_curr.noise
        assert stdev > 0

        a_size = self.get_action_size()
        logp = -0.5 / (stdev * stdev) * np.sum(np.square(norm_action_deltas), axis=-1)
        logp += -0.5 * a_size * np.log(2 * np.pi)
        logp += -a_size * np.log(stdev)
        return logp

    def _log_val(self, s, g):
        val = self._eval_critic(s, g)
        norm_val = self.val_norm.normalize(val)
        self.world.env.log_val(self.id, norm_val[0])
        return

    def _build_replay_buffer(self, buffer_size):
        super()._build_replay_buffer(buffer_size)
        self.replay_buffer.add_filter_key(self.EXP_ACTION_FLAG)
        return
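# --- Illustrative sketch (not part of PGAgent above) ---
# _calc_action_logp evaluates the log-density of the isotropic Gaussian
# exploration noise N(0, stdev^2 I) applied in normalized action space.
# The standalone NumPy check below restates that formula under that
# assumption; the function and variable names here are made up for the example.
import numpy as np

def gaussian_logp(norm_action_deltas, stdev):
    # log N(delta | 0, stdev^2 I), summed over the action dimensions
    a_size = norm_action_deltas.shape[-1]
    logp = -0.5 / (stdev * stdev) * np.sum(np.square(norm_action_deltas), axis=-1)
    logp += -0.5 * a_size * np.log(2.0 * np.pi)
    logp += -a_size * np.log(stdev)
    return logp

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    stdev = 0.2
    deltas = stdev * rng.standard_normal((4, 6))  # 4 samples of a 6-dim action
    # Direct per-dimension Gaussian log-density, summed over dimensions.
    direct = np.sum(-0.5 * np.square(deltas / stdev)
                    - np.log(stdev) - 0.5 * np.log(2.0 * np.pi), axis=-1)
    assert np.allclose(gaussian_logp(deltas, stdev), direct)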
class AMPAgent(PPOAgent):
    NAME = "AMP"

    TASK_REWARD_LERP_KEY = "TaskRewardLerp"

    DISC_NET_KEY = "DiscNet"
    DISC_INIT_OUTPUT_SCALE_KEY = "DiscInitOutputScale"
    DISC_WEIGHT_DECAY_KEY = "DiscWeightDecay"
    DISC_LOGIT_REG_WEIGHT_KEY = "DiscLogitRegWeight"
    DISC_STEPSIZE_KEY = "DiscStepSize"
    DISC_MOMENTUM_KEY = "DiscMomentum"
    DISC_BATCH_SIZE_KEY = "DiscBatchSize"
    DISC_STEPS_PER_BATCH_KEY = "DiscStepsPerBatch"
    DISC_EXPERT_BUFFER_SIZE_KEY = "DiscExpertBufferSize"
    DISC_AGENT_BUFFER_SIZE_KEY = "DiscAgentBufferSize"

    REWARD_SCALE_KEY = "RewardScale"
    DISC_GRAD_PENALTY_KEY = "DiscGradPenalty"

    DISC_LOGIT_NAME = "disc_logits"
    DISC_SCOPE = "disc"

    def __init__(self, id, world, json_data):
        super().__init__(id, world, json_data)

        self._disc_reward_mean = 0.0
        self._disc_reward_std = 0.0
        self._reward_min = np.inf
        self._reward_max = -np.inf

        self._build_disc_replay_buffer()
        return

    def __str__(self):
        info_str = super().__str__()
        info_str = info_str[:-2] + ',\n "AMPObsDim": "{:d}"'.format(self._get_amp_obs_size()) + info_str[-2:]
        return info_str

    def _load_params(self, json_data):
        super()._load_params(json_data)

        self._task_reward_lerp = 0.5 if (self.TASK_REWARD_LERP_KEY not in json_data) else json_data[self.TASK_REWARD_LERP_KEY]

        self._disc_batchsize = int(256) if (self.DISC_BATCH_SIZE_KEY not in json_data) else int(json_data[self.DISC_BATCH_SIZE_KEY])
        self._disc_steps_per_batch = int(1) if (self.DISC_STEPS_PER_BATCH_KEY not in json_data) else int(json_data[self.DISC_STEPS_PER_BATCH_KEY])
        self._disc_expert_buffer_size = int(100000) if (self.DISC_EXPERT_BUFFER_SIZE_KEY not in json_data) else int(json_data[self.DISC_EXPERT_BUFFER_SIZE_KEY])
        self._disc_agent_buffer_size = int(100000) if (self.DISC_AGENT_BUFFER_SIZE_KEY not in json_data) else int(json_data[self.DISC_AGENT_BUFFER_SIZE_KEY])
        return

    def _build_disc_replay_buffer(self):
        num_procs = mpi_util.get_num_procs()

        local_disc_expert_buffer_size = int(np.ceil(self._disc_expert_buffer_size / num_procs))
        self._disc_expert_buffer = ReplayBufferRandStorage(local_disc_expert_buffer_size)

        local_disc_agent_buffer_size = int(np.ceil(self._disc_agent_buffer_size / num_procs))
        self._disc_agent_buffer = ReplayBufferRandStorage(local_disc_agent_buffer_size)
        return

    def _build_normalizers(self):
        super()._build_normalizers()

        with self.sess.as_default(), self.graph.as_default(), tf.variable_scope(self.tf_scope):
            with tf.variable_scope(self.RESOURCE_SCOPE):
                self._amp_obs_norm = TFNormalizer(self.sess, "amp_obs_norm", self._get_amp_obs_size(),
                                                  self._get_amp_obs_norm_group())
                self._amp_obs_norm.set_mean_std(-self._get_amp_obs_offset(), 1 / self._get_amp_obs_scale())
        return

    def _load_normalizers(self):
        super()._load_normalizers()
        self._amp_obs_norm.load()
        return

    def _update_normalizers(self):
        super()._update_normalizers()
        self._amp_obs_norm.update()
        return

    def _sync_solvers(self):
        super()._sync_solvers()
        self._disc_solver.sync()
        return

    def _build_nets(self, json_data):
        super()._build_nets(json_data)

        assert self.DISC_NET_KEY in json_data

        disc_net_name = json_data[self.DISC_NET_KEY]
        disc_init_output_scale = 1 if (self.DISC_INIT_OUTPUT_SCALE_KEY not in json_data) else json_data[self.DISC_INIT_OUTPUT_SCALE_KEY]
        self._reward_scale = 1.0 if (self.REWARD_SCALE_KEY not in json_data) else json_data[self.REWARD_SCALE_KEY]

        amp_obs_size = self._get_amp_obs_size()

        # setup input tensors
        self._amp_obs_expert_ph = tf.placeholder(tf.float32, shape=[None, amp_obs_size], name="amp_obs_expert")
        self._amp_obs_agent_ph = tf.placeholder(tf.float32, shape=[None, amp_obs_size], name="amp_obs_agent")

        self._disc_expert_inputs = self._get_disc_expert_inputs()
        self._disc_agent_inputs = self._get_disc_agent_inputs()

        with tf.variable_scope(self.MAIN_SCOPE):
            with tf.variable_scope(self.DISC_SCOPE):
                self._disc_logits_expert_tf = self._build_disc_net(disc_net_name, self._disc_expert_inputs, disc_init_output_scale)
                self._disc_logits_agent_tf = self._build_disc_net(disc_net_name, self._disc_agent_inputs, disc_init_output_scale, reuse=True)

        if (self._disc_logits_expert_tf != None):
            Logger.print("Built discriminator net: " + disc_net_name)

        self._disc_prob_agent_tf = tf.sigmoid(self._disc_logits_agent_tf)
        self._abs_logit_agent_tf = tf.reduce_mean(tf.abs(self._disc_logits_agent_tf))
        self._avg_prob_agent_tf = tf.reduce_mean(self._disc_prob_agent_tf)

        return

    def _build_losses(self, json_data):
        super()._build_losses(json_data)

        disc_weight_decay = 0 if (self.DISC_WEIGHT_DECAY_KEY not in json_data) else json_data[self.DISC_WEIGHT_DECAY_KEY]
        disc_logit_reg_weight = 0 if (self.DISC_LOGIT_REG_WEIGHT_KEY not in json_data) else json_data[self.DISC_LOGIT_REG_WEIGHT_KEY]
        disc_grad_penalty = 0.0 if (self.DISC_GRAD_PENALTY_KEY not in json_data) else json_data[self.DISC_GRAD_PENALTY_KEY]

        disc_loss_expert_tf = self.build_disc_loss_pos(self._disc_logits_expert_tf)
        disc_loss_agent_tf = self.build_disc_loss_neg(self._disc_logits_agent_tf)

        disc_loss_expert_tf = tf.reduce_mean(disc_loss_expert_tf)
        disc_loss_agent_tf = tf.reduce_mean(disc_loss_agent_tf)

        self._disc_loss_tf = 0.5 * (disc_loss_agent_tf + disc_loss_expert_tf)

        self._acc_expert_tf = tf.reduce_mean(tf.cast(tf.greater(self._disc_logits_expert_tf, 0), tf.float32))
        self._acc_agent_tf = tf.reduce_mean(tf.cast(tf.less(self._disc_logits_agent_tf, 0), tf.float32))

        if (disc_weight_decay != 0):
            self._disc_loss_tf += disc_weight_decay * self._weight_decay_loss(self.MAIN_SCOPE + "/" + self.DISC_SCOPE)

        if (disc_logit_reg_weight != 0):
            self._disc_loss_tf += disc_logit_reg_weight * self._disc_logit_reg_loss()

        if (disc_grad_penalty != 0):
            self._grad_penalty_loss_tf = self._disc_grad_penalty_loss(in_tfs=self._disc_expert_inputs,
                                                                      out_tf=self._disc_logits_expert_tf)
            self._disc_loss_tf += disc_grad_penalty * self._grad_penalty_loss_tf
        else:
            self._grad_penalty_loss_tf = tf.constant(0.0, dtype=tf.float32)

        return

    def _build_solvers(self, json_data):
        super()._build_solvers(json_data)

        disc_stepsize = 0.001 if (self.DISC_STEPSIZE_KEY not in json_data) else json_data[self.DISC_STEPSIZE_KEY]
        disc_momentum = 0.9 if (self.DISC_MOMENTUM_KEY not in json_data) else json_data[self.DISC_MOMENTUM_KEY]

        disc_vars = self._tf_vars(self.MAIN_SCOPE + "/" + self.DISC_SCOPE)
        disc_opt = tf.train.MomentumOptimizer(learning_rate=disc_stepsize, momentum=disc_momentum)
        self._disc_grad_tf = tf.gradients(self._disc_loss_tf, disc_vars)
        self._disc_solver = mpi_solver.MPISolver(self.sess, disc_opt, disc_vars)

        return

    def _build_disc_net(self, net_name, input_tfs, init_output_scale, reuse=False):
        out_size = 1
        h = net_builder.build_net(net_name, input_tfs, reuse)
        logits_tf = tf.layers.dense(inputs=h, units=out_size, activation=None, reuse=reuse,
                                    kernel_initializer=tf.random_uniform_initializer(minval=-init_output_scale,
                                                                                     maxval=init_output_scale),
                                    name=self.DISC_LOGIT_NAME)
        return logits_tf

    def _get_disc_expert_inputs(self):
        norm_obs_tf = self._amp_obs_norm.normalize_tf(self._amp_obs_expert_ph)
        input_tfs = [norm_obs_tf]
        return input_tfs

    def _get_disc_agent_inputs(self):
        norm_obs_tf = self._amp_obs_norm.normalize_tf(self._amp_obs_agent_ph)
        input_tfs = [norm_obs_tf]
        return input_tfs

    def _disc_logit_reg_loss(self):
        vars = self._tf_vars(self.MAIN_SCOPE + "/" + self.DISC_SCOPE)
        logit_vars = [v for v in vars if (self.DISC_LOGIT_NAME in v.name and "bias" not in v.name)]
        loss_tf = tf.add_n([tf.nn.l2_loss(v) for v in logit_vars])
        return loss_tf

    def _disc_grad_penalty_loss(self, in_tfs, out_tf):
        grad_tfs = tf.gradients(ys=out_tf, xs=in_tfs)
        grad_tf = tf.concat(grad_tfs, axis=-1)
        norm_tf = tf.reduce_sum(tf.square(grad_tf), axis=-1)
        loss_tf = 0.5 * tf.reduce_mean(norm_tf)
        return loss_tf

    def reset(self):
        super().reset()
        self.path.amp_obs_expert = []
        self.path.amp_obs_agent = []
        return

    def _store_path(self, path):
        path_id = super()._store_path(path)

        valid_path = (path_id != MathUtil.INVALID_IDX)
        if (valid_path):
            disc_expert_path_id = self._disc_expert_buffer.store(path.amp_obs_expert)
            assert (disc_expert_path_id != MathUtil.INVALID_IDX)

            disc_agent_path_id = self._disc_agent_buffer.store(path.amp_obs_agent)
            assert (disc_agent_path_id != MathUtil.INVALID_IDX)

        return path_id

    def _update_new_action(self):
        first_step = self._is_first_step()

        super()._update_new_action()

        if (not first_step):
            self._record_amp_obs()
        return

    def _end_path(self):
        super()._end_path()
        self._record_amp_obs()
        return

    def _record_amp_obs(self):
        obs_expert = self._record_amp_obs_expert()
        obs_agent = self._record_amp_obs_agent()
        self.path.amp_obs_expert.append(obs_expert)
        self.path.amp_obs_agent.append(obs_agent)
        return

    def build_disc_loss_pos(self, logits_tf):
        loss_tf = 0.5 * tf.reduce_sum(tf.square(logits_tf - 1), axis=-1)
        return loss_tf

    def build_disc_loss_neg(self, logits_tf):
        loss_tf = 0.5 * tf.reduce_sum(tf.square(logits_tf + 1), axis=-1)
        return loss_tf

    def _enable_amp_task_reward(self):
        enable = self.world.env.enable_amp_task_reward()
        return enable

    def _get_amp_obs_size(self):
        amp_obs_size = self.world.env.get_amp_obs_size()
        return amp_obs_size

    def _get_amp_obs_offset(self):
        offset = np.array(self.world.env.get_amp_obs_offset())
        return offset

    def _get_amp_obs_scale(self):
        scale = np.array(self.world.env.get_amp_obs_scale())
        return scale

    def _get_amp_obs_norm_group(self):
        norm_group = np.array(self.world.env.get_amp_obs_norm_group(), dtype=np.int32)
        return norm_group

    def _record_amp_obs_expert(self):
        obs_expert = np.array(self.world.env.record_amp_obs_expert(self.id))
        return obs_expert

    def _record_amp_obs_agent(self):
        obs_agent = np.array(self.world.env.record_amp_obs_agent(self.id))
        return obs_agent

    def _record_normalizers(self, path):
        super()._record_normalizers(path)
        self._amp_obs_norm.record(np.array(path.amp_obs_expert))
        self._amp_obs_norm.record(np.array(path.amp_obs_agent))
        return

    def _logits_to_reward(self, logits):
        r = 1.0 - 0.25 * np.square(1.0 - logits)
        r = np.maximum(r, 0.0)
        return r

    def _train_step(self):
        disc_info = self._update_disc()
        disc_info["reward_mean"] = self._disc_reward_mean
        disc_info["reward_std"] = self._disc_reward_std
        disc_info = mpi_util.reduce_dict_mean(disc_info)

        super()._train_step()

        self.logger.log_tabular("Disc_Loss", disc_info["loss"])
        self.logger.log_tabular("Disc_Acc_Expert", disc_info["acc_expert"])
        self.logger.log_tabular("Disc_Acc_Agent", disc_info["acc_agent"])
        self.logger.log_tabular("Disc_Stepsize", self.get_disc_stepsize())
        self.logger.log_tabular("Disc_Steps", self.get_disc_steps())
        self.logger.log_tabular("Disc_Prob", disc_info["prob_agent"])
        self.logger.log_tabular("Disc_Abs_Logit", disc_info["abs_logit"])
        self.logger.log_tabular("Disc_Reward_Mean", disc_info["reward_mean"])
        self.logger.log_tabular("Disc_Reward_Std", disc_info["reward_std"])

        if (self._enable_grad_penalty()):
            self.logger.log_tabular("Grad_Penalty", disc_info["grad_penalty"])

        return

    def _update_disc(self):
        info = None

        num_procs = mpi_util.get_num_procs()
        local_expert_batch_size = int(np.ceil(self._disc_batchsize / num_procs))
        local_agent_batch_size = local_expert_batch_size

        steps_per_batch = self._disc_steps_per_batch
        local_sample_count = self.replay_buffer.get_current_size()
        global_sample_count = int(mpi_util.reduce_sum(local_sample_count))
        num_steps = int(np.ceil(steps_per_batch * global_sample_count / (num_procs * local_expert_batch_size)))

        for b in range(num_steps):
            disc_expert_batch = self._disc_expert_buffer.sample(local_expert_batch_size)
            obs_expert = self._disc_expert_buffer.get(disc_expert_batch)

            disc_agent_batch = self._disc_agent_buffer.sample(local_agent_batch_size)
            obs_agent = self._disc_agent_buffer.get(disc_agent_batch)

            curr_info = self._step_disc(obs_expert=obs_expert, obs_agent=obs_agent)

            if (info is None):
                info = curr_info
            else:
                for k, v in curr_info.items():
                    info[k] += v

        for k in info.keys():
            info[k] /= num_steps

        return info

    def _step_disc(self, obs_expert, obs_agent):
        feed = {
            self._amp_obs_expert_ph: obs_expert,
            self._amp_obs_agent_ph: obs_agent,
        }

        run_tfs = [self._disc_grad_tf, self._disc_loss_tf, self._acc_expert_tf, self._acc_agent_tf,
                   self._avg_prob_agent_tf, self._abs_logit_agent_tf, self._grad_penalty_loss_tf]
        results = self.sess.run(run_tfs, feed)

        grads = results[0]
        self._disc_solver.update(grads)

        info = {
            "loss": results[1],
            "acc_expert": results[2],
            "acc_agent": results[3],
            "prob_agent": results[4],
            "abs_logit": results[5],
            "grad_penalty": results[6],
        }
        return info

    def get_disc_stepsize(self):
        return self._disc_solver.get_stepsize()

    def get_disc_steps(self):
        return self._disc_solver.iter

    def _enable_grad_penalty(self):
        return self._grad_penalty_loss_tf.op.type != "Const"

    def _fetch_batch_rewards(self, start_idx, end_idx):
        idx = np.array(list(range(start_idx, end_idx)))
        rewards = self._batch_calc_reward(idx)
        return rewards

    def _batch_calc_reward(self, idx):
        obs_agent = self.replay_buffer.get("amp_obs_agent", idx)
        disc_r, _ = self._calc_disc_reward(obs_agent)

        end_mask = self.replay_buffer.is_path_end(idx)
        valid_mask = np.logical_not(end_mask)

        disc_r *= self._reward_scale
        valid_disc_r = disc_r[valid_mask]
        self._disc_reward_mean = np.mean(valid_disc_r)
        self._disc_reward_std = np.std(valid_disc_r)

        if (self._enable_amp_task_reward()):
            task_r = self.replay_buffer.get("rewards", idx)
            r = self._lerp_reward(disc_r, task_r)
        else:
            r = disc_r

        curr_reward_min = np.amin(r)
        curr_reward_max = np.amax(r)
        self._reward_min = np.minimum(self._reward_min, curr_reward_min)
        self._reward_max = np.maximum(self._reward_max, curr_reward_max)

        reward_data = np.array([self._reward_min, -self._reward_max])
        reward_data = mpi_util.reduce_min(reward_data)
        self._reward_min = reward_data[0]
        self._reward_max = -reward_data[1]

        return r

    def _lerp_reward(self, disc_r, task_r):
        r = (1.0 - self._task_reward_lerp) * disc_r + self._task_reward_lerp * task_r
        return r

    def _calc_disc_reward(self, amp_obs):
        feed = {
            self._amp_obs_agent_ph: amp_obs,
        }
        logits = self.sess.run(self._disc_logits_agent_tf, feed_dict=feed)
        r = self._logits_to_reward(logits)
        r = r[:, 0]
        return r, logits

    def _compute_batch_vals(self, start_idx, end_idx):
        states = self.replay_buffer.get_all("states")[start_idx:end_idx]
        goals = self.replay_buffer.get_all("goals")[start_idx:end_idx] if self.has_goal() else None

        vals = self._eval_critic(states, goals)

        val_min = self._reward_min / (1.0 - self.discount)
        val_max = self._reward_max / (1.0 - self.discount)
        vals = np.clip(vals, val_min, val_max)

        idx = np.array(list(range(start_idx, end_idx)))
        is_end = self.replay_buffer.is_path_end(idx)
        is_fail = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Fail)
        is_succ = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Succ)
        is_fail = np.logical_and(is_end, is_fail)
        is_succ = np.logical_and(is_end, is_succ)

        vals[is_fail] = self.val_fail
        vals[is_succ] = self.val_succ

        return vals
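# --- Illustrative sketch (not part of AMPAgent above) ---
# The style reward used by AMPAgent maps discriminator logits to
# r = max(0, 1 - 0.25 * (1 - logit)^2), scales it by RewardScale, and
# optionally blends it with the task reward via TaskRewardLerp
# (see _logits_to_reward, _lerp_reward, and _batch_calc_reward).
# The plain-NumPy restatement below is a sketch under those definitions;
# the function and argument names are made up for the example.
import numpy as np

def amp_style_reward(disc_logits, task_reward=None, reward_scale=1.0, task_reward_lerp=0.5):
    # Least-squares-GAN style reward: close to 1 when the discriminator scores
    # the sample as expert-like (logit near 1), clipped at 0 far from it.
    disc_r = np.maximum(1.0 - 0.25 * np.square(1.0 - disc_logits), 0.0)
    disc_r *= reward_scale
    if task_reward is None:
        return disc_r
    # Linear interpolation between style (discriminator) reward and task reward.
    return (1.0 - task_reward_lerp) * disc_r + task_reward_lerp * task_reward

if __name__ == "__main__":
    logits = np.array([-1.0, 0.0, 1.0, 3.0])
    print(amp_style_reward(logits))                          # [0.    0.75  1.    0.  ]
    print(amp_style_reward(logits, task_reward=np.ones(4)))  # [0.5   0.875 1.    0.5 ]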