def __init__(
    self,
    brain,
    lr=1e-4,
    lr_schedule=LearningRateSchedule.LINEAR,
    h_size=128,
    epsilon=0.2,
    beta=1e-3,
    max_step=5e6,
    normalize=False,
    use_recurrent=False,
    num_layers=2,
    m_size=None,
    seed=0,
    stream_names=None,
    vis_encode_type=EncoderType.SIMPLE,
):
    """
    Takes a Unity environment and model-specific hyper-parameters and returns the
    appropriate PPO agent model for the environment.
    :param brain: BrainInfo used to generate specific network graph.
    :param lr: Learning rate.
    :param lr_schedule: Learning rate decay schedule.
    :param h_size: Size of hidden layers.
    :param epsilon: Value for policy-divergence threshold.
    :param beta: Strength of entropy regularization.
    :param max_step: Total number of training steps.
    :param normalize: Whether to normalize vector observation input.
    :param use_recurrent: Whether to use an LSTM layer in the network.
    :param num_layers: Number of hidden layers between encoded input and policy & value layers.
    :param m_size: Size of brain memory.
    :param seed: Seed to use for initialization of model.
    :param stream_names: List of names of value streams. Usually, a list of the Reward Signals being used.
    :param vis_encode_type: Type of encoder to use for visual observations.
    :return: a sub-class of PPOAgent tailored to the environment.
    """
    LearningModel.__init__(
        self, m_size, normalize, use_recurrent, brain, seed, stream_names
    )
    if num_layers < 1:
        num_layers = 1
    if brain.vector_action_space_type == "continuous":
        self.create_cc_actor_critic(h_size, num_layers, vis_encode_type)
        self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy
    else:
        self.create_dc_actor_critic(h_size, num_layers, vis_encode_type)
    self.learning_rate = self.create_learning_rate(
        lr_schedule, lr, self.global_step, max_step
    )
    self.create_losses(
        self.log_probs,
        self.old_log_probs,
        self.value_heads,
        self.entropy,
        beta,
        epsilon,
        lr,
        max_step,
    )
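# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the model above): `epsilon` is the clipping
# threshold of PPO's clipped surrogate objective, which create_losses() builds
# as TensorFlow ops. The plain-NumPy function below shows the same objective;
# the function name, argument names, and toy values are assumptions for the
# example, not the trainer's actual implementation.
import numpy as np


def _example_ppo_clipped_surrogate(advantages, log_probs, old_log_probs, epsilon=0.2):
    ratio = np.exp(log_probs - old_log_probs)  # r_t(theta) = pi_new / pi_old
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
    # PPO maximizes the element-wise minimum; returned here as a loss (negated mean).
    return -np.mean(np.minimum(unclipped, clipped))


# Example usage with toy values:
# _example_ppo_clipped_surrogate(np.array([1.0, -0.5]), np.array([-0.9, -1.2]), np.array([-1.0, -1.0]))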
def __init__(
    self,
    brain,
    m_size=None,
    h_size=128,
    normalize=False,
    use_recurrent=False,
    num_layers=2,
    stream_names=None,
    seed=0,
    vis_encode_type=EncoderType.SIMPLE,
):
    LearningModel.__init__(
        self, m_size, normalize, use_recurrent, brain, seed, stream_names
    )
    self.normalize = normalize
    self.use_recurrent = use_recurrent
    self.num_layers = num_layers
    self.stream_names = stream_names
    self.h_size = h_size
    self.activ_fn = self.swish

    self.policy_memory_in: Optional[tf.Tensor] = None
    self.policy_memory_out: Optional[tf.Tensor] = None
    self.value_memory_in: Optional[tf.Tensor] = None
    self.value_memory_out: Optional[tf.Tensor] = None
    self.q1: Optional[tf.Tensor] = None
    self.q2: Optional[tf.Tensor] = None
    self.q1_p: Optional[tf.Tensor] = None
    self.q2_p: Optional[tf.Tensor] = None
    self.q1_memory_in: Optional[tf.Tensor] = None
    self.q2_memory_in: Optional[tf.Tensor] = None
    self.q1_memory_out: Optional[tf.Tensor] = None
    self.q2_memory_out: Optional[tf.Tensor] = None
    self.action_holder: Optional[tf.Tensor] = None
    self.prev_action: Optional[tf.Tensor] = None
    self.action_masks: Optional[tf.Tensor] = None
    self.external_action_in: Optional[tf.Tensor] = None
    self.log_sigma_sq: Optional[tf.Tensor] = None
    self.entropy: Optional[tf.Tensor] = None
    self.deterministic_output: Optional[tf.Tensor] = None
    self.all_log_probs: Optional[tf.Tensor] = None
    self.normalized_logprobs: Optional[tf.Tensor] = None
    self.action_probs: Optional[tf.Tensor] = None
    self.selected_actions: Optional[tf.Tensor] = None
    self.output: Optional[tf.Tensor] = None
    self.output_oh: Optional[tf.Tensor] = None
    self.output_pre: Optional[tf.Tensor] = None

    self.value_vars = None
    self.q_vars = None
    self.critic_vars = None
    self.policy_vars = None

    self.q1_heads: Optional[Dict[str, tf.Tensor]] = None
    self.q2_heads: Optional[Dict[str, tf.Tensor]] = None
    self.q1_pheads: Optional[Dict[str, tf.Tensor]] = None
    self.q2_pheads: Optional[Dict[str, tf.Tensor]] = None
def __init__(self, brain, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6,
             normalize=False, use_recurrent=False, num_layers=2, m_size=None,
             use_curiosity=False, curiosity_strength=0.01, curiosity_enc_size=128,
             scope='Model', seed=0):
    """
    Takes a Unity environment and model-specific hyper-parameters and returns the
    appropriate PPO agent model for the environment.
    :param brain: BrainInfo used to generate specific network graph.
    :param lr: Learning rate.
    :param h_size: Size of hidden layers.
    :param epsilon: Value for policy-divergence threshold.
    :param beta: Strength of entropy regularization.
    :param max_step: Total number of training steps.
    :param normalize: Whether to normalize vector observation input.
    :param use_recurrent: Whether to use an LSTM layer in the network.
    :param num_layers: Number of hidden layers between encoded input and policy & value layers.
    :param m_size: Size of brain memory.
    :param use_curiosity: Whether to use the curiosity intrinsic reward module.
    :param curiosity_strength: Scaling factor for the curiosity intrinsic reward.
    :param curiosity_enc_size: Size of the curiosity state encodings.
    :param scope: Variable scope in which the model graph is created.
    :param seed: Seed to use for initialization of model.
    :return: a sub-class of PPOAgent tailored to the environment.
    """
    with tf.variable_scope(scope):
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)
        self.use_curiosity = use_curiosity
        if num_layers < 1:
            num_layers = 1
        self.last_reward, self.new_reward, self.update_reward = self.create_reward_encoder()
        if brain.vector_action_space_type == "continuous":
            self.create_cc_actor_critic(h_size, num_layers)
            self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy
        else:
            self.create_dc_actor_critic(h_size, num_layers)
        if self.use_curiosity:
            self.curiosity_enc_size = curiosity_enc_size
            self.curiosity_strength = curiosity_strength
            encoded_state, encoded_next_state = self.create_curiosity_encoders()
            self.create_inverse_model(encoded_state, encoded_next_state)
            self.create_forward_model(encoded_state, encoded_next_state)
        self.create_ppo_optimizer(self.log_probs, self.old_log_probs, self.value,
                                  self.entropy, beta, epsilon, lr, max_step)
def test_min_visual_size():
    # Make sure each EncoderType has an entry in MIN_RESOLUTION_FOR_ENCODER
    assert set(LearningModel.MIN_RESOLUTION_FOR_ENCODER.keys()) == set(EncoderType)

    for encoder_type in EncoderType:
        with tf.Graph().as_default():
            good_size = LearningModel.MIN_RESOLUTION_FOR_ENCODER[encoder_type]
            good_res = CameraResolution(
                width=good_size, height=good_size, num_channels=3
            )
            LearningModel._check_resolution_for_encoder(good_res, encoder_type)
            vis_input = LearningModel.create_visual_input(
                good_res, "test_min_visual_size"
            )
            enc_func = LearningModel.get_encoder_for_type(encoder_type)
            enc_func(vis_input, 32, LearningModel.swish, 1, "test", False)

        # Anything under the min size should raise an exception. If not, decrease the min size!
        with pytest.raises(Exception):
            with tf.Graph().as_default():
                bad_size = LearningModel.MIN_RESOLUTION_FOR_ENCODER[encoder_type] - 1
                bad_res = CameraResolution(
                    width=bad_size, height=bad_size, num_channels=3
                )

                with pytest.raises(UnityTrainerException):
                    # Make sure we'd hit a friendly error during model setup time.
                    LearningModel._check_resolution_for_encoder(bad_res, encoder_type)

                vis_input = LearningModel.create_visual_input(
                    bad_res, "test_min_visual_size"
                )
                enc_func = LearningModel.get_encoder_for_type(encoder_type)
                enc_func(vis_input, 32, LearningModel.swish, 1, "test", False)
def __init__(self, brain, h_size=128, lr=1e-4, n_layers=2, m_size=128,
             normalize=False, use_recurrent=False, seed=0):
    LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)
    num_streams = 1
    hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers)
    hidden = hidden_streams[0]
    self.dropout_rate = tf.placeholder(dtype=tf.float32, shape=[], name="dropout_rate")
    hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
    if self.use_recurrent:
        tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
        self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32,
                                        name='recurrent_in')
        hidden_reg, self.memory_out = self.create_recurrent_encoder(
            hidden_reg, self.memory_in, self.sequence_length)
        self.memory_out = tf.identity(self.memory_out, name='recurrent_out')

    if brain.vector_action_space_type == "discrete":
        policy_branches = []
        for size in self.act_size:
            policy_branches.append(
                tf.layers.dense(
                    hidden_reg,
                    size,
                    activation=None,
                    use_bias=False,
                    kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01)))
        self.action_probs = tf.concat(
            [tf.nn.softmax(branch) for branch in policy_branches],
            axis=1, name="action_probs")
        self.action_masks = tf.placeholder(shape=[None, sum(self.act_size)],
                                           dtype=tf.float32, name="action_masks")
        self.sample_action_float, normalized_logits = self.create_discrete_action_masking_layer(
            tf.concat(policy_branches, axis=1), self.action_masks, self.act_size)
        tf.identity(normalized_logits, name='action')
        self.sample_action = tf.cast(self.sample_action_float, tf.int32)
        self.true_action = tf.placeholder(shape=[None, len(policy_branches)],
                                          dtype=tf.int32, name="teacher_action")
        self.action_oh = tf.concat([
            tf.one_hot(self.true_action[:, i], self.act_size[i])
            for i in range(len(self.act_size))], axis=1)
        self.loss = tf.reduce_sum(-tf.log(self.action_probs + 1e-10) * self.action_oh)
        self.action_percent = tf.reduce_mean(tf.cast(
            tf.equal(tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32),
                     self.sample_action), tf.float32))
    else:
        self.policy = tf.layers.dense(
            hidden_reg, self.act_size[0], activation=None, use_bias=False,
            name='pre_action',
            kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))
        self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
        self.sample_action = tf.identity(self.clipped_sample_action, name="action")
        self.true_action = tf.placeholder(shape=[None, self.act_size[0]],
                                          dtype=tf.float32, name="teacher_action")
        self.clipped_true_action = tf.clip_by_value(self.true_action, -1, 1)
        self.loss = tf.reduce_sum(
            tf.squared_difference(self.clipped_true_action, self.sample_action))

    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    self.update = optimizer.minimize(self.loss)
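# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the model above): for the discrete branch,
# the loss -sum(log(p) * one_hot(teacher_action)) is ordinary cross-entropy
# between the teacher's actions and the learner's action probabilities. The
# function name and toy numbers below are assumptions for the example.
import numpy as np


def _example_bc_discrete_loss():
    probs = np.array([[0.7, 0.2, 0.1], [0.1, 0.6, 0.3]])  # learner action probabilities
    teacher = np.array([0, 2])                             # teacher action indices
    onehot = np.eye(3)[teacher]
    loss = np.sum(-np.log(probs + 1e-10) * onehot)         # same form as the graph's loss
    reference = -(np.log(0.7) + np.log(0.3))               # per-sample cross-entropy, summed
    return loss, reference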
def create_encoder(
    self, state_in: tf.Tensor, action_in: tf.Tensor, done_in: tf.Tensor, reuse: bool
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    """
    Creates the encoder for the discriminator
    :param state_in: The encoded observation input
    :param action_in: The action input
    :param done_in: The done flags input
    :param reuse: If true, the weights will be shared with the previous encoder created
    """
    with tf.variable_scope("GAIL_model"):
        if self.use_actions:
            concat_input = tf.concat([state_in, action_in, done_in], axis=1)
        else:
            concat_input = state_in

        hidden_1 = tf.layers.dense(
            concat_input,
            self.h_size,
            activation=LearningModel.swish,
            name="gail_d_hidden_1",
            reuse=reuse,
        )

        hidden_2 = tf.layers.dense(
            hidden_1,
            self.h_size,
            activation=LearningModel.swish,
            name="gail_d_hidden_2",
            reuse=reuse,
        )

        z_mean = None
        if self.use_vail:
            # Latent representation
            z_mean = tf.layers.dense(
                hidden_2,
                self.z_size,
                reuse=reuse,
                name="gail_z_mean",
                kernel_initializer=LearningModel.scaled_init(0.01),
            )

            self.noise = tf.random_normal(tf.shape(z_mean), dtype=tf.float32)

            # Sampled latent code
            self.z = z_mean + self.z_sigma * self.noise * self.use_noise
            estimate_input = self.z
        else:
            estimate_input = hidden_2

        estimate = tf.layers.dense(
            estimate_input,
            1,
            activation=tf.nn.sigmoid,
            name="gail_d_estimate",
            reuse=reuse,
        )
        return estimate, z_mean, concat_input
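# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the discriminator above): the sigmoid
# `estimate` is converted into an intrinsic reward elsewhere in the trainer.
# A common GAIL-style choice is -log(1 - D), so transitions the discriminator
# scores as "demonstration-like" receive higher reward; the function name and
# the constant below are assumptions for the example, not the repo's exact code.
import numpy as np

_EXAMPLE_EPSILON = 1e-7  # assumed small constant to avoid log(0)


def _example_gail_reward(estimate: np.ndarray) -> np.ndarray:
    return -np.log(1.0 - estimate + _EXAMPLE_EPSILON)


# _example_gail_reward(np.array([0.1, 0.5, 0.9])) -> strictly increasing rewards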
def __init__(
    self,
    brain,
    lr=1e-4,
    lr_schedule=LearningRateSchedule.CONSTANT,
    h_size=128,
    init_entcoef=0.1,
    max_step=5e6,
    normalize=False,
    use_recurrent=False,
    num_layers=2,
    m_size=None,
    seed=0,
    stream_names=None,
    tau=0.005,
    gammas=None,
    vis_encode_type=EncoderType.SIMPLE,
):
    """
    Takes a Unity environment and model-specific hyper-parameters and returns the
    appropriate SAC agent model for the environment.
    :param brain: BrainInfo used to generate specific network graph.
    :param lr: Learning rate.
    :param lr_schedule: Learning rate decay schedule.
    :param h_size: Size of hidden layers.
    :param init_entcoef: Initial value for entropy coefficient. Set lower to learn faster,
        set higher to explore more.
    :param max_step: Total number of training steps.
    :param normalize: Whether to normalize vector observation input.
    :param use_recurrent: Whether to use an LSTM layer in the network.
    :param num_layers: Number of hidden layers between encoded input and policy & value layers.
    :param m_size: Size of brain memory.
    :param seed: Seed to use for initialization of model.
    :param stream_names: List of names of value streams. Usually, a list of the Reward Signals being used.
    :param tau: Strength of soft-Q update.
    :param gammas: Discount factors, one per value stream.
    :param vis_encode_type: Type of encoder to use for visual observations.
    :return: an SAC agent model tailored to the environment.
    """
    self.tau = tau
    self.gammas = gammas
    self.brain = brain
    self.init_entcoef = init_entcoef
    if stream_names is None:
        stream_names = []
    # Use to reduce "survivor bonus" when using Curiosity or GAIL.
    self.use_dones_in_backup = {name: tf.Variable(1.0) for name in stream_names}
    self.disable_use_dones = {
        name: self.use_dones_in_backup[name].assign(0.0) for name in stream_names
    }
    LearningModel.__init__(
        self, m_size, normalize, use_recurrent, brain, seed, stream_names
    )
    if num_layers < 1:
        num_layers = 1

    self.target_init_op: List[tf.Tensor] = []
    self.target_update_op: List[tf.Tensor] = []
    self.update_batch_policy: Optional[tf.Operation] = None
    self.update_batch_value: Optional[tf.Operation] = None
    self.update_batch_entropy: Optional[tf.Operation] = None

    self.policy_network = SACPolicyNetwork(
        brain=brain,
        m_size=m_size,
        h_size=h_size,
        normalize=normalize,
        use_recurrent=use_recurrent,
        num_layers=num_layers,
        seed=seed,
        stream_names=stream_names,
        vis_encode_type=vis_encode_type,
    )
    self.target_network = SACTargetNetwork(
        brain=brain,
        m_size=m_size // 4 if m_size else None,
        h_size=h_size,
        normalize=normalize,
        use_recurrent=use_recurrent,
        num_layers=num_layers,
        seed=seed,
        stream_names=stream_names,
        vis_encode_type=vis_encode_type,
    )
    self.create_inputs_and_outputs()
    self.learning_rate = self.create_learning_rate(
        lr_schedule, lr, self.global_step, max_step
    )
    self.create_losses(
        self.policy_network.q1_heads,
        self.policy_network.q2_heads,
        lr,
        max_step,
        stream_names,
        discrete=self.brain.vector_action_space_type == "discrete",
    )

    self.selected_actions = (
        self.policy_network.selected_actions
    )  # For GAIL and other reward signals
    if normalize:
        target_update_norm = self.target_network.copy_normalization(
            self.policy_network.running_mean,
            self.policy_network.running_variance,
            self.policy_network.normalization_steps,
        )
        self.update_normalization = tf.group(
            [self.policy_network.update_normalization, target_update_norm]
        )
        self.running_mean = self.policy_network.running_mean
        self.running_variance = self.policy_network.running_variance
        self.normalization_steps = self.policy_network.normalization_steps
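# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the model above): `tau` controls the soft
# (Polyak-averaged) update that the loss-building code applies to the target
# network's variables. In plain NumPy the update looks like the following;
# the function and variable names are assumptions for the example.
import numpy as np


def _example_soft_update(target_params, source_params, tau=0.005):
    # target <- tau * source + (1 - tau) * target, applied per parameter tensor.
    return [tau * s + (1.0 - tau) * t for t, s in zip(target_params, source_params)]


# Repeated calls move the target parameters slowly toward the source parameters:
# target = _example_soft_update(target, source, tau=0.005)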
def create_cc_actor(self, hidden_policy, scope):
    """
    Creates Continuous control actor for SAC.
    :param hidden_policy: Output of feature extractor (i.e. the input for vector obs,
        output of CNN for visual obs).
    :param scope: TF scope to assign whatever is created in this block.
    """
    # Create action input (continuous)
    self.action_holder = tf.placeholder(
        shape=[None, self.act_size[0]], dtype=tf.float32, name="action_holder"
    )
    self.external_action_in = self.action_holder

    scope = self.join_scopes(scope, "policy")

    with tf.variable_scope(scope):
        hidden_policy = self.create_vector_observation_encoder(
            hidden_policy,
            self.h_size,
            self.activ_fn,
            self.num_layers,
            "encoder",
            False,
        )
    if self.use_recurrent:
        hidden_policy, memory_out = self.create_recurrent_encoder(
            hidden_policy,
            self.policy_memory_in,
            self.sequence_length,
            name="lstm_policy",
        )
        self.policy_memory_out = memory_out
    with tf.variable_scope(scope):
        mu = tf.layers.dense(
            hidden_policy,
            self.act_size[0],
            activation=None,
            name="mu",
            kernel_initializer=LearningModel.scaled_init(0.01),
        )

        # Policy-dependent log_sigma_sq
        log_sigma_sq = tf.layers.dense(
            hidden_policy,
            self.act_size[0],
            activation=None,
            name="log_std",
            kernel_initializer=LearningModel.scaled_init(0.01),
        )

        self.log_sigma_sq = tf.clip_by_value(log_sigma_sq, LOG_STD_MIN, LOG_STD_MAX)

        sigma_sq = tf.exp(self.log_sigma_sq)

        # Do the reparameterization trick
        policy_ = mu + tf.random_normal(tf.shape(mu)) * sigma_sq

        _gauss_pre = -0.5 * (
            ((policy_ - mu) / (tf.exp(self.log_sigma_sq) + EPSILON)) ** 2
            + 2 * self.log_sigma_sq
            + np.log(2 * np.pi)
        )

        all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)

        self.entropy = tf.reduce_sum(
            self.log_sigma_sq + 0.5 * np.log(2.0 * np.pi * np.e), axis=-1
        )

        # Squash probabilities
        # Keep deterministic around in case we want to use it.
        self.deterministic_output = tf.tanh(mu)

        # Note that this is just for symmetry with PPO.
        self.output_pre = tf.tanh(policy_)

        # Squash correction
        all_probs -= tf.reduce_sum(
            tf.log(1 - self.output_pre ** 2 + EPSILON), axis=1, keepdims=True
        )

        self.all_log_probs = all_probs
        self.selected_actions = tf.stop_gradient(self.output_pre)

        self.action_probs = all_probs

    # Extract output for Barracuda
    self.output = tf.identity(self.output_pre, name="action")

    # Get all policy vars
    self.policy_vars = self.get_vars(scope)
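# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the actor above): the "squash correction" is
# the change-of-variables term for a = tanh(u): log p(a) = log p(u)
# - sum log(1 - tanh(u)^2). The NumPy function below mirrors that computation,
# treating log_sigma_sq as a log standard deviation the way the graph above
# effectively does; the function name and toy values are assumptions.
import numpy as np


def _example_squashed_gaussian_log_prob(mu=0.3, log_std=-0.5, eps=1e-6, seed=0):
    rng = np.random.default_rng(seed)
    std = np.exp(log_std)
    u = mu + std * rng.standard_normal()   # pre-squash (reparameterized) sample
    a = np.tanh(u)                          # squashed action in [-1, 1]
    # Gaussian log-density of the pre-squash sample.
    log_p_u = -0.5 * (((u - mu) / (std + eps)) ** 2 + 2 * log_std + np.log(2 * np.pi))
    # Subtract log|da/du| = log(1 - tanh(u)^2) to get the density of the squashed action.
    log_p_a = log_p_u - np.log(1 - a ** 2 + eps)
    return a, log_p_a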
def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
    """
    Creates state encoders for current and future observations.
    Used for implementation of Curiosity-driven Exploration by Self-supervised Prediction
    See https://arxiv.org/abs/1705.05363 for more details.
    :return: current and future state encoder tensors.
    """
    encoded_state_list = []
    encoded_next_state_list = []

    if self.policy_model.vis_obs_size > 0:
        self.next_visual_in = []
        visual_encoders = []
        next_visual_encoders = []
        for i in range(self.policy_model.vis_obs_size):
            # Create input ops for next (t+1) visual observations.
            next_visual_input = LearningModel.create_visual_input(
                self.policy_model.brain.camera_resolutions[i],
                name="curiosity_next_visual_observation_" + str(i),
            )
            self.next_visual_in.append(next_visual_input)

            # Create the encoder ops for current and next visual input.
            # Note that these encoders are siamese.
            encoded_visual = self.policy_model.create_visual_observation_encoder(
                self.policy_model.visual_in[i],
                self.encoding_size,
                LearningModel.swish,
                1,
                "curiosity_stream_{}_visual_obs_encoder".format(i),
                False,
            )
            encoded_next_visual = self.policy_model.create_visual_observation_encoder(
                self.next_visual_in[i],
                self.encoding_size,
                LearningModel.swish,
                1,
                "curiosity_stream_{}_visual_obs_encoder".format(i),
                True,
            )
            visual_encoders.append(encoded_visual)
            next_visual_encoders.append(encoded_next_visual)

        hidden_visual = tf.concat(visual_encoders, axis=1)
        hidden_next_visual = tf.concat(next_visual_encoders, axis=1)
        encoded_state_list.append(hidden_visual)
        encoded_next_state_list.append(hidden_next_visual)

    if self.policy_model.vec_obs_size > 0:
        # Create the encoder ops for current and next vector input.
        # Note that these encoders are siamese.
        # Create input op for next (t+1) vector observation.
        self.next_vector_in = tf.placeholder(
            shape=[None, self.policy_model.vec_obs_size],
            dtype=tf.float32,
            name="curiosity_next_vector_observation",
        )

        encoded_vector_obs = self.policy_model.create_vector_observation_encoder(
            self.policy_model.vector_in,
            self.encoding_size,
            LearningModel.swish,
            2,
            "curiosity_vector_obs_encoder",
            False,
        )
        encoded_next_vector_obs = self.policy_model.create_vector_observation_encoder(
            self.next_vector_in,
            self.encoding_size,
            LearningModel.swish,
            2,
            "curiosity_vector_obs_encoder",
            True,
        )
        encoded_state_list.append(encoded_vector_obs)
        encoded_next_state_list.append(encoded_next_vector_obs)

    encoded_state = tf.concat(encoded_state_list, axis=1)
    encoded_next_state = tf.concat(encoded_next_state_list, axis=1)
    return encoded_state, encoded_next_state
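# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the encoders above): the sibling forward
# model (create_forward_model, not shown here) predicts the next-state
# encoding, and the intrinsic reward is typically the scaled prediction error
# against encoded_next_state. The NumPy version below, including the `strength`
# scaling and the function name, is an assumption for the example.
import numpy as np


def _example_curiosity_reward(encoded_next_state, predicted_next_state, strength=0.01):
    # Mean squared error per sample between actual and predicted next-state encodings.
    squared_error = np.mean((encoded_next_state - predicted_next_state) ** 2, axis=1)
    return strength * squared_error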
def create_cc_actor_critic(
    self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
    """
    Creates Continuous control actor-critic model.
    :param h_size: Size of hidden linear layers.
    :param num_layers: Number of hidden linear layers.
    :param vis_encode_type: Type of encoder to use for visual observations.
    """
    hidden_streams = self.create_observation_streams(
        2, h_size, num_layers, vis_encode_type
    )

    if self.use_recurrent:
        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        _half_point = int(self.m_size / 2)
        hidden_policy, memory_policy_out = self.create_recurrent_encoder(
            hidden_streams[0],
            self.memory_in[:, :_half_point],
            self.sequence_length,
            name="lstm_policy",
        )

        hidden_value, memory_value_out = self.create_recurrent_encoder(
            hidden_streams[1],
            self.memory_in[:, _half_point:],
            self.sequence_length,
            name="lstm_value",
        )
        self.memory_out = tf.concat(
            [memory_policy_out, memory_value_out], axis=1, name="recurrent_out"
        )
    else:
        hidden_policy = hidden_streams[0]
        hidden_value = hidden_streams[1]

    mu = tf.layers.dense(
        hidden_policy,
        self.act_size[0],
        activation=None,
        kernel_initializer=LearningModel.scaled_init(0.01),
        reuse=tf.AUTO_REUSE,
    )

    self.log_sigma_sq = tf.get_variable(
        "log_sigma_squared",
        [self.act_size[0]],
        dtype=tf.float32,
        initializer=tf.zeros_initializer(),
    )

    sigma_sq = tf.exp(self.log_sigma_sq)

    self.epsilon = tf.placeholder(
        shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
    )
    # Clip and scale output to ensure actions are always within [-1, 1] range.
    self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
    output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
    self.output = tf.identity(output_post, name="action")
    self.selected_actions = tf.stop_gradient(output_post)

    # Compute probability of model output.
    all_probs = (
        -0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) / sigma_sq
        - 0.5 * tf.log(2.0 * np.pi)
        - 0.5 * self.log_sigma_sq
    )

    self.all_log_probs = tf.identity(all_probs, name="action_probs")

    self.entropy = 0.5 * tf.reduce_mean(
        tf.log(2 * np.pi * np.e) + self.log_sigma_sq
    )

    self.create_value_heads(self.stream_names, hidden_value)

    self.all_old_log_probs = tf.placeholder(
        shape=[None, self.act_size[0]], dtype=tf.float32, name="old_probabilities"
    )

    # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
    self.log_probs = tf.reduce_sum(
        (tf.identity(self.all_log_probs)), axis=1, keepdims=True
    )
    self.old_log_probs = tf.reduce_sum(
        (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
    )
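# ---------------------------------------------------------------------------
# Illustrative check (not part of the model above): the `all_probs` expression
# is the per-dimension log-density of a diagonal Gaussian, since
# -0.5*log(2*pi) - 0.5*log(sigma^2) == -0.5*log(2*pi*sigma^2). The function
# name and toy values below are assumptions for the example.
import numpy as np


def _example_gaussian_log_prob(x=0.5, mu=0.2, log_sigma_sq=-1.0):
    sigma_sq = np.exp(log_sigma_sq)
    # Same algebraic form as the graph above.
    code_form = (
        -0.5 * (x - mu) ** 2 / sigma_sq
        - 0.5 * np.log(2.0 * np.pi)
        - 0.5 * log_sigma_sq
    )
    # Standard Gaussian log-density N(x; mu, sigma^2) for comparison.
    reference = -0.5 * ((x - mu) ** 2 / sigma_sq + np.log(2.0 * np.pi * sigma_sq))
    return code_form, reference  # equal up to floating-point error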
def create_dc_actor_critic(
    self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
    """
    Creates Discrete control actor-critic model.
    :param h_size: Size of hidden linear layers.
    :param num_layers: Number of hidden linear layers.
    """
    hidden_streams = self.create_observation_streams(
        1, h_size, num_layers, vis_encode_type
    )
    hidden = hidden_streams[0]

    if self.use_recurrent:
        self.prev_action = tf.placeholder(
            shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
        )
        prev_action_oh = tf.concat(
            [
                tf.one_hot(self.prev_action[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        hidden = tf.concat([hidden, prev_action_oh], axis=1)

        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        hidden, memory_out = self.create_recurrent_encoder(
            hidden, self.memory_in, self.sequence_length
        )
        self.memory_out = tf.identity(memory_out, name="recurrent_out")

    policy_branches = []
    for size in self.act_size:
        policy_branches.append(
            tf.layers.dense(
                hidden,
                size,
                activation=None,
                use_bias=False,
                kernel_initializer=LearningModel.scaled_init(0.01),
            )
        )

    self.all_log_probs = tf.concat(policy_branches, axis=1, name="action_probs")

    self.action_masks = tf.placeholder(
        shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
    )
    output, _, normalized_logits = self.create_discrete_action_masking_layer(
        self.all_log_probs, self.action_masks, self.act_size
    )

    self.output = tf.identity(output)
    self.normalized_logits = tf.identity(normalized_logits, name="action")

    self.create_value_heads(self.stream_names, hidden)

    self.action_holder = tf.placeholder(
        shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
    )
    self.action_oh = tf.concat(
        [
            tf.one_hot(self.action_holder[:, i], self.act_size[i])
            for i in range(len(self.act_size))
        ],
        axis=1,
    )
    self.selected_actions = tf.stop_gradient(self.action_oh)

    self.all_old_log_probs = tf.placeholder(
        shape=[None, sum(self.act_size)], dtype=tf.float32, name="old_probabilities"
    )
    _, _, old_normalized_logits = self.create_discrete_action_masking_layer(
        self.all_old_log_probs, self.action_masks, self.act_size
    )

    action_idx = [0] + list(np.cumsum(self.act_size))

    self.entropy = tf.reduce_sum(
        (
            tf.stack(
                [
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=tf.nn.softmax(
                            self.all_log_probs[:, action_idx[i] : action_idx[i + 1]]
                        ),
                        logits=self.all_log_probs[:, action_idx[i] : action_idx[i + 1]],
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
        ),
        axis=1,
    )

    self.log_probs = tf.reduce_sum(
        (
            tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
                        logits=normalized_logits[:, action_idx[i] : action_idx[i + 1]],
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
        ),
        axis=1,
        keepdims=True,
    )
    self.old_log_probs = tf.reduce_sum(
        (
            tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
                        logits=old_normalized_logits[
                            :, action_idx[i] : action_idx[i + 1]
                        ],
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
        ),
        axis=1,
        keepdims=True,
    )
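# ---------------------------------------------------------------------------
# Illustrative check (not part of the model above): per action branch, the
# entropy term uses softmax_cross_entropy_with_logits_v2 with
# labels = softmax(logits), and the cross-entropy of a distribution with itself
# is its Shannon entropy. The function name and toy logits are assumptions for
# the example.
import numpy as np


def _example_branch_entropy(logits=None):
    logits = np.array([2.0, 0.5, -1.0]) if logits is None else logits
    log_p = logits - np.log(np.sum(np.exp(logits)))  # log-softmax
    p = np.exp(log_p)
    cross_entropy = -np.sum(p * log_p)  # what the TF op computes with labels = p
    entropy = -np.sum(p * np.log(p))    # Shannon entropy of the branch distribution
    return cross_entropy, entropy       # identical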