Example #1
0
    def __init__(
        self,
        brain,
        m_size=None,
        h_size=128,
        normalize=False,
        use_recurrent=False,
        num_layers=2,
        stream_names=None,
        seed=0,
        vis_encode_type=EncoderType.SIMPLE,
    ):
        LearningModel.__init__(
            self, m_size, normalize, use_recurrent, brain, seed, stream_names
        )
        self.normalize = normalize
        self.use_recurrent = use_recurrent
        self.num_layers = num_layers
        self.stream_names = stream_names
        self.h_size = h_size
        self.activ_fn = self.swish

        self.policy_memory_in: Optional[tf.Tensor] = None
        self.policy_memory_out: Optional[tf.Tensor] = None
        self.value_memory_in: Optional[tf.Tensor] = None
        self.value_memory_out: Optional[tf.Tensor] = None
        self.q1: Optional[tf.Tensor] = None
        self.q2: Optional[tf.Tensor] = None
        self.q1_p: Optional[tf.Tensor] = None
        self.q2_p: Optional[tf.Tensor] = None
        self.q1_memory_in: Optional[tf.Tensor] = None
        self.q2_memory_in: Optional[tf.Tensor] = None
        self.q1_memory_out: Optional[tf.Tensor] = None
        self.q2_memory_out: Optional[tf.Tensor] = None
        self.action_holder: Optional[tf.Tensor] = None
        self.prev_action: Optional[tf.Tensor] = None
        self.action_masks: Optional[tf.Tensor] = None
        self.external_action_in: Optional[tf.Tensor] = None
        self.log_sigma_sq: Optional[tf.Tensor] = None
        self.entropy: Optional[tf.Tensor] = None
        self.deterministic_output: Optional[tf.Tensor] = None
        self.all_log_probs: Optional[tf.Tensor] = None
        self.normalized_logprobs: Optional[tf.Tensor] = None
        self.action_probs: Optional[tf.Tensor] = None
        self.selected_actions: Optional[tf.Tensor] = None
        self.output: Optional[tf.Tensor] = None
        self.output_oh: Optional[tf.Tensor] = None
        self.output_pre: Optional[tf.Tensor] = None

        self.value_vars = None
        self.q_vars = None
        self.critic_vars = None
        self.policy_vars = None

        self.q1_heads: Optional[Dict[str, tf.Tensor]] = None
        self.q2_heads: Optional[Dict[str, tf.Tensor]] = None
        self.q1_pheads: Optional[Dict[str, tf.Tensor]] = None
        self.q2_pheads: Optional[Dict[str, tf.Tensor]] = None
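Example #1 sets self.activ_fn = self.swish but does not show the helper itself. A minimal sketch of such a swish activation, assuming the standard definition x * sigmoid(x) and that tf is TensorFlow 1.x as in the surrounding code (the actual method body is not part of this example):

    # Sketch only: assumes swish(x) = x * sigmoid(x); not shown in this example.
    @staticmethod
    def swish(input_activation: tf.Tensor) -> tf.Tensor:
        """Swish/SiLU activation: x * sigmoid(x)."""
        return tf.multiply(input_activation, tf.nn.sigmoid(input_activation))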
Example #2
0
 def __init__(
     self,
     brain,
     lr=1e-4,
     lr_schedule=LearningRateSchedule.LINEAR,
     h_size=128,
     epsilon=0.2,
     beta=1e-3,
     max_step=5e6,
     normalize=False,
     use_recurrent=False,
     num_layers=2,
     m_size=None,
     seed=0,
     stream_names=None,
     vis_encode_type=EncoderType.SIMPLE,
 ):
     """
     Takes a Unity environment and model-specific hyper-parameters and returns the
     appropriate PPO agent model for the environment.
     :param brain: BrainInfo used to generate specific network graph.
     :param lr: Learning rate.
     :param lr_schedule: Learning rate decay schedule.
     :param h_size: Size of hidden layers
     :param epsilon: Value for policy-divergence threshold.
     :param beta: Strength of entropy regularization.
     :param max_step: Total number of training steps.
     :param normalize: Whether to normalize vector observation input.
     :param use_recurrent: Whether to use an LSTM layer in the network.
     :param num_layers: Number of hidden layers between encoded input and policy & value layers.
     :param m_size: Size of brain memory.
     :param seed: Seed to use for initialization of model.
     :param stream_names: List of names of value streams. Usually, a list of the Reward Signals being used.
     :param vis_encode_type: Type of encoder to use for visual observations.
     :return: a sub-class of PPOAgent tailored to the environment.
     """
     LearningModel.__init__(self, m_size, normalize, use_recurrent, brain,
                            seed, stream_names)
     if num_layers < 1:
         num_layers = 1
     if brain.vector_action_space_type == "continuous":
         self.create_cc_actor_critic(h_size, num_layers, vis_encode_type)
         self.entropy = tf.ones_like(tf.reshape(self.value,
                                                [-1])) * self.entropy
     else:
         self.create_dc_actor_critic(h_size, num_layers, vis_encode_type)
     self.learning_rate = self.create_learning_rate(lr_schedule, lr,
                                                    self.global_step,
                                                    max_step)
     self.create_losses(
         self.log_probs,
         self.old_log_probs,
         self.value_heads,
         self.entropy,
         beta,
         epsilon,
         lr,
         max_step,
     )
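The epsilon and beta arguments above are passed into create_losses. As a hedged illustration of what they parameterize, here is a sketch of the standard PPO clipped surrogate objective (the function name and the advantages argument are assumptions for illustration, not this repository's create_losses):

# Illustrative sketch of a PPO clipped surrogate; names are assumptions,
# not taken from this file.
def ppo_policy_loss(log_probs, old_log_probs, advantages, entropy, epsilon, beta):
    ratio = tf.exp(log_probs - old_log_probs)
    unclipped = ratio * advantages
    # epsilon bounds how far the new policy may move from the old one
    clipped = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
    policy_loss = -tf.reduce_mean(tf.minimum(unclipped, clipped))
    # beta weights an entropy bonus that discourages premature convergence
    return policy_loss - beta * tf.reduce_mean(entropy)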
Example #3
0
 def __init__(self,
              brain,
              lr=1e-4,
              h_size=128,
              epsilon=0.2,
              beta=1e-3,
              max_step=5e6,
              normalize=False,
              use_recurrent=False,
              num_layers=2,
              m_size=None,
              use_curiosity=False,
              curiosity_strength=0.01,
              curiosity_enc_size=128,
              scope='Model',
              seed=0):
     """
     Takes a Unity environment and model-specific hyper-parameters and returns the
     appropriate PPO agent model for the environment.
     :param brain: BrainInfo used to generate specific network graph.
     :param lr: Learning rate.
     :param h_size: Size of hidden layers
     :param epsilon: Value for policy-divergence threshold.
     :param beta: Strength of entropy regularization.
     :param max_step: Total number of training steps.
     :param normalize: Whether to normalize vector observation input.
     :param use_recurrent: Whether to use an LSTM layer in the network.
     :param num_layers: Number of hidden layers between encoded input and policy & value layers.
     :param m_size: Size of brain memory.
     :param use_curiosity: Whether to use the curiosity (intrinsic reward) module.
     :param curiosity_strength: Scale factor applied to the curiosity reward.
     :param curiosity_enc_size: Size of the curiosity state encoding.
     :param scope: Variable scope in which to build the model graph.
     :param seed: Seed to use for initialization of model.
     :return: a sub-class of PPOAgent tailored to the environment.
     """
     with tf.variable_scope(scope):
         LearningModel.__init__(self, m_size, normalize, use_recurrent,
                                brain, seed)
         self.use_curiosity = use_curiosity
         if num_layers < 1:
             num_layers = 1
         self.last_reward, self.new_reward, self.update_reward = self.create_reward_encoder(
         )
         if brain.vector_action_space_type == "continuous":
             self.create_cc_actor_critic(h_size, num_layers)
             self.entropy = tf.ones_like(tf.reshape(self.value,
                                                    [-1])) * self.entropy
         else:
             self.create_dc_actor_critic(h_size, num_layers)
         if self.use_curiosity:
             self.curiosity_enc_size = curiosity_enc_size
             self.curiosity_strength = curiosity_strength
             encoded_state, encoded_next_state = self.create_curiosity_encoders(
             )
             self.create_inverse_model(encoded_state, encoded_next_state)
             self.create_forward_model(encoded_state, encoded_next_state)
         self.create_ppo_optimizer(self.log_probs, self.old_log_probs,
                                   self.value, self.entropy, beta, epsilon,
                                   lr, max_step)
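For context on curiosity_strength and the forward model built above: in ICM-style curiosity, the intrinsic reward is typically the forward model's prediction error on the encoded next state, scaled by the strength. A minimal sketch under that assumption (names are illustrative, not this repository's API):

# Sketch of an ICM-style intrinsic reward; assumes the forward model predicts
# the encoding of the next state and the reward is its scaled squared error.
def intrinsic_reward(encoded_next_state, predicted_next_state, strength=0.01):
    squared_error = 0.5 * tf.reduce_sum(
        tf.squared_difference(predicted_next_state, encoded_next_state), axis=1
    )
    return strength * squared_error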
Example #4
0
    def __init__(self, brain, h_size=128, lr=1e-4, n_layers=2, m_size=128,
                 normalize=False, use_recurrent=False, seed=0):
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)
        num_streams = 1
        hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers)
        hidden = hidden_streams[0]
        self.dropout_rate = tf.placeholder(dtype=tf.float32, shape=[], name="dropout_rate")
        hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
        if self.use_recurrent:
            tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
            self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32, name='recurrent_in')
            hidden_reg, self.memory_out = self.create_recurrent_encoder(
                hidden_reg, self.memory_in, self.sequence_length)
            self.memory_out = tf.identity(self.memory_out, name='recurrent_out')

        if brain.vector_action_space_type == "discrete":
            policy_branches = []
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01)))
            self.action_probs = tf.concat(
                [tf.nn.softmax(branch) for branch in policy_branches], axis=1, name="action_probs")
            self.action_masks = tf.placeholder(shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks")
            self.sample_action_float, normalized_logits = self.create_discrete_action_masking_layer(
                tf.concat(policy_branches, axis=1), self.action_masks, self.act_size)
            tf.identity(normalized_logits, name='action')
            self.sample_action = tf.cast(self.sample_action_float, tf.int32)
            self.true_action = tf.placeholder(shape=[None, len(policy_branches)], dtype=tf.int32, name="teacher_action")
            self.action_oh = tf.concat([
                tf.one_hot(self.true_action[:, i], self.act_size[i]) for i in range(len(self.act_size))], axis=1)
            self.loss = tf.reduce_sum(-tf.log(self.action_probs + 1e-10) * self.action_oh)
            self.action_percent = tf.reduce_mean(tf.cast(
                tf.equal(tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32), self.sample_action), tf.float32))
        else:
            self.policy = tf.layers.dense(
                hidden_reg, self.act_size[0], activation=None, use_bias=False, name='pre_action',
                kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))
            self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
            self.sample_action = tf.identity(self.clipped_sample_action, name="action")
            self.true_action = tf.placeholder(shape=[None, self.act_size[0]], dtype=tf.float32, name="teacher_action")
            self.clipped_true_action = tf.clip_by_value(self.true_action, -1, 1)
            self.loss = tf.reduce_sum(tf.squared_difference(self.clipped_true_action, self.sample_action))

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)
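Example #4 calls create_discrete_action_masking_layer but does not show it. A hedged sketch of what such a masking layer typically does for a single action branch (an assumption about its behavior, not the repository's exact implementation): zero out disallowed actions, renormalize the branch's probabilities, and sample.

# Sketch only: masks and renormalizes one branch's softmax, then samples.
# Function and argument names are illustrative, not from this file.
def masked_branch_probs(branch_logits, branch_mask, eps=1e-10):
    probs = tf.nn.softmax(branch_logits) * branch_mask
    normalized = probs / (tf.reduce_sum(probs, axis=1, keepdims=True) + eps)
    sampled_action = tf.multinomial(tf.log(normalized + eps), 1)
    return normalized, sampled_action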
Example #5
0
 def __init__(
     self,
     brain,
     m_size=None,
     h_size=128,
     normalize=False,
     use_recurrent=False,
     num_layers=2,
     stream_names=None,
     seed=0,
     vis_encode_type=EncoderType.SIMPLE,
 ):
     LearningModel.__init__(self, m_size, normalize, use_recurrent, brain,
                            seed, stream_names)
     self.normalize = normalize
     self.use_recurrent = use_recurrent
     self.num_layers = num_layers
     self.stream_names = stream_names
     self.h_size = h_size
     self.activ_fn = self.swish
Example #6
0
    def __init__(
        self,
        brain,
        lr=1e-4,
        lr_schedule=LearningRateSchedule.CONSTANT,
        h_size=128,
        init_entcoef=0.1,
        max_step=5e6,
        normalize=False,
        use_recurrent=False,
        num_layers=2,
        m_size=None,
        seed=0,
        stream_names=None,
        tau=0.005,
        gammas=None,
        vis_encode_type=EncoderType.SIMPLE,
    ):
        """
        Takes a Unity environment and model-specific hyper-parameters and returns the
        appropriate SAC model for the environment.
        :param brain: BrainInfo used to generate specific network graph.
        :param lr: Learning rate.
        :param lr_schedule: Learning rate decay schedule.
        :param h_size: Size of hidden layers.
        :param init_entcoef: Initial value for entropy coefficient. Set lower to learn faster,
            set higher to explore more.
        :param max_step: Total number of training steps.
        :param normalize: Whether to normalize vector observation input.
        :param use_recurrent: Whether to use an LSTM layer in the network.
        :param num_layers: Number of hidden layers between encoded input and policy & value layers.
        :param m_size: Size of brain memory.
        :param seed: Seed to use for initialization of model.
        :param stream_names: List of names of value streams. Usually, a list of the Reward Signals being used.
        :param tau: Strength of the soft update applied to the target Q network.
        :param gammas: List of discount factors, one per reward signal stream.
        :param vis_encode_type: Type of encoder to use for visual observations.
        :return: a SAC model tailored to the environment.
        """
        self.tau = tau
        self.gammas = gammas
        self.brain = brain
        self.init_entcoef = init_entcoef
        if stream_names is None:
            stream_names = []
        # Used to reduce the "survivor bonus" when using Curiosity or GAIL.
        self.use_dones_in_backup = {
            name: tf.Variable(1.0)
            for name in stream_names
        }
        self.disable_use_dones = {
            name: self.use_dones_in_backup[name].assign(0.0)
            for name in stream_names
        }
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain,
                               seed, stream_names)
        if num_layers < 1:
            num_layers = 1

        self.target_init_op: List[tf.Tensor] = []
        self.target_update_op: List[tf.Tensor] = []
        self.update_batch_policy: Optional[tf.Operation] = None
        self.update_batch_value: Optional[tf.Operation] = None
        self.update_batch_entropy: Optional[tf.Operation] = None

        self.policy_network = SACPolicyNetwork(
            brain=brain,
            m_size=m_size,
            h_size=h_size,
            normalize=normalize,
            use_recurrent=use_recurrent,
            num_layers=num_layers,
            seed=seed,
            stream_names=stream_names,
            vis_encode_type=vis_encode_type,
        )
        self.target_network = SACTargetNetwork(
            brain=brain,
            m_size=m_size // 4 if m_size else None,
            h_size=h_size,
            normalize=normalize,
            use_recurrent=use_recurrent,
            num_layers=num_layers,
            seed=seed,
            stream_names=stream_names,
            vis_encode_type=vis_encode_type,
        )
        self.create_inputs_and_outputs()
        self.learning_rate = self.create_learning_rate(lr_schedule, lr,
                                                       self.global_step,
                                                       max_step)
        self.create_losses(
            self.policy_network.q1_heads,
            self.policy_network.q2_heads,
            lr,
            max_step,
            stream_names,
            discrete=self.brain.vector_action_space_type == "discrete",
        )

        self.selected_actions = self.policy_network.selected_actions  # For GAIL and other reward signals
        if normalize:
            target_update_norm = self.target_network.copy_normalization(
                self.policy_network.running_mean,
                self.policy_network.running_variance,
                self.policy_network.normalization_steps,
            )
            self.update_normalization = tf.group(
                [self.policy_network.update_normalization, target_update_norm])
            self.running_mean = self.policy_network.running_mean
            self.running_variance = self.policy_network.running_variance
            self.normalization_steps = self.policy_network.normalization_steps
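The tau argument and the target_init_op / target_update_op lists above follow the usual SAC convention: target-network variables are first hard-copied from the source network, then blended toward it by tau on each update (Polyak averaging). A minimal sketch under that assumption (the helper name is illustrative, not this repository's API):

# Sketch of soft target updates: hard copy at init, then blend by tau.
def make_target_update_ops(source_vars, target_vars, tau=0.005):
    init_ops = [target.assign(source) for source, target in zip(source_vars, target_vars)]
    update_ops = [
        target.assign(tau * source + (1.0 - tau) * target)
        for source, target in zip(source_vars, target_vars)
    ]
    return init_ops, update_ops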