Example #1
    def init_load_weights(self):
        """Build a placeholder and a tf.assign op for every global variable so
        that weights can later be pushed into the graph via these assign ops."""
        with self.graph.as_default():
            _vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            # The evaluated values are only used here to get each variable's shape.
            values = [v.eval(session=self.sess) for v in _vars]
            for var, value in zip(_vars, values):
                assign_ph = tf.placeholder(var.dtype, shape=value.shape)
                self.assign_phs.append(assign_ph)
                self.assign_ops.append(tf.assign(var, assign_ph))
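A matching load_weights helper would then feed saved values through these placeholders and run the assign ops. A minimal sketch, assuming `values` is a list of numpy arrays in the same order as the assign ops (the real implementation in the surrounding codebase may differ):

    def load_weights(self, values):
        with self.graph.as_default():
            feed_dict = {}
            # Pair each placeholder with the corresponding numpy array.
            for assign_ph, value in zip(self.assign_phs, values):
                feed_dict[assign_ph] = value
            # One session call copies every value into its graph variable.
            self.sess.run(self.assign_ops, feed_dict=feed_dict)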
Example #2
    def _create_sac_optimizer_ops(self) -> None:
        """
        Creates the Adam optimizers and update ops for SAC, including
        the policy, value, and entropy updates, as well as the target network update.
        """
        policy_optimizer = self.create_optimizer_op(
            learning_rate=self.learning_rate, name="sac_policy_opt")
        entropy_optimizer = self.create_optimizer_op(
            learning_rate=self.learning_rate, name="sac_entropy_opt")
        value_optimizer = self.create_optimizer_op(
            learning_rate=self.learning_rate, name="sac_value_opt")

        self.target_update_op = [
            tf.assign(target, (1 - self.tau) * target + self.tau * source)
            for target, source in zip(self.target_network.value_vars,
                                      self.policy_network.value_vars)
        ]
        logger.debug("value_vars")
        self.print_all_vars(self.policy_network.value_vars)
        logger.debug("targvalue_vars")
        self.print_all_vars(self.target_network.value_vars)
        logger.debug("critic_vars")
        self.print_all_vars(self.policy_network.critic_vars)
        logger.debug("q_vars")
        self.print_all_vars(self.policy_network.q_vars)
        logger.debug("policy_vars")
        policy_vars = self.policy.get_trainable_variables()
        self.print_all_vars(policy_vars)

        self.target_init_op = [
            tf.assign(target, source) for target, source in zip(
                self.target_network.value_vars, self.policy_network.value_vars)
        ]

        self.update_batch_policy = policy_optimizer.minimize(
            self.policy_loss, var_list=policy_vars)

        # Make sure policy is updated first, then value, then entropy.
        with tf.control_dependencies([self.update_batch_policy]):
            self.update_batch_value = value_optimizer.minimize(
                self.total_value_loss,
                var_list=self.policy_network.critic_vars)
            # Add entropy coefficient optimization operation
            with tf.control_dependencies([self.update_batch_value]):
                self.update_batch_entropy = entropy_optimizer.minimize(
                    self.entropy_loss, var_list=self.log_ent_coef)
Example #3
    def create_normalizer_update(
        vector_input: tf.Tensor,
        steps: tf.Tensor,
        running_mean: tf.Tensor,
        running_variance: tf.Tensor,
    ) -> Tuple[tf.Operation, tf.Operation]:
        """
        Creates the update operation for the normalizer.
        :param vector_input: Vector observation to use for updating the running mean and variance.
        :param running_mean: Tensorflow tensor representing the current running mean.
        :param running_variance: Tensorflow tensor representing the current running variance.
        :param steps: Tensorflow tensor representing the current number of steps that have been normalized.
        :return: A tuple of two TF operations: the first initializes the running
            statistics from the first batch, the second applies the incremental
            update based on vector_input.
        """
        # Based on Welford's algorithm for running mean and standard deviation, for batch updates. Discussion here:
        # https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates
        steps_increment = tf.shape(vector_input)[0]
        total_new_steps = tf.add(steps, tf.cast(steps_increment,
                                                dtype=tf.int64))

        # Compute the incremental update and divide by the number of new steps.
        input_to_old_mean = tf.subtract(vector_input, running_mean)
        new_mean = running_mean + tf.reduce_sum(
            input_to_old_mean / tf.cast(total_new_steps, dtype=tf.float32),
            axis=0)
        # Compute difference of input to the new mean for Welford update
        input_to_new_mean = tf.subtract(vector_input, new_mean)
        new_variance = running_variance + tf.reduce_sum(
            input_to_new_mean * input_to_old_mean, axis=0)
        update_mean = tf.assign(running_mean, new_mean)
        update_variance = tf.assign(running_variance, new_variance)
        update_norm_step = tf.assign(steps, total_new_steps)
        # First mean and variance calculated normally
        initial_mean, initial_variance = tf.nn.moments(vector_input, axes=[0])
        initialize_mean = tf.assign(running_mean, initial_mean)
        # Multiplied by total_new_step because it is divided by total_new_step in the normalization
        initialize_variance = tf.assign(
            running_variance,
            (initial_variance + EPSILON) *
            tf.cast(total_new_steps, dtype=tf.float32),
        )
        return (
            tf.group([initialize_mean, initialize_variance, update_norm_step]),
            tf.group([update_mean, update_variance, update_norm_step]),
        )
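The first op in the returned tuple seeds the statistics from the very first batch; the second applies the incremental Welford update on later batches. A minimal usage sketch (the enclosing class is omitted, and `sess`, `vector_obs`, `steps_var`, `mean_var`, `variance_var`, and `batch_obs` are assumed names, not part of the snippet above):

    init_norm_op, update_norm_op = create_normalizer_update(
        vector_obs, steps_var, mean_var, variance_var
    )
    # Seed the statistics on the first batch, then switch to incremental updates.
    if sess.run(steps_var) == 0:
        sess.run(init_norm_op, feed_dict={vector_obs: batch_obs})
    else:
        sess.run(update_norm_op, feed_dict={vector_obs: batch_obs})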
Example #4
    def make_beta_update(self) -> None:
        """
        Creates the beta parameter and its updater for GAIL
        """

        new_beta = tf.maximum(
            self.beta + self.alpha * (self.kl_loss - self.mutual_information), EPSILON
        )
        with tf.control_dependencies([self.update_batch]):
            self.update_beta = tf.assign(self.beta, new_beta)
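In effect this is a dual-style update, beta <- max(beta + alpha * (D_KL - I), EPSILON): beta grows whenever the KL term exceeds the mutual-information target and shrinks otherwise, but is clamped so it never drops below EPSILON. The control dependency ensures beta is only adjusted after the main update op (`self.update_batch`) has run.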
Example #5
def create_global_steps():
    """Creates TF ops to track and increment global training step."""
    global_step = tf.Variable(
        0, name="global_step", trainable=False, dtype=tf.int32
    )
    steps_to_increment = tf.placeholder(
        shape=[], dtype=tf.int32, name="steps_to_increment"
    )
    increment_step = tf.assign(global_step, tf.add(global_step, steps_to_increment))
    return global_step, increment_step, steps_to_increment
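A minimal usage sketch (the session setup here is an assumption): after the variables are initialized, feed the number of environment steps collected since the last update into the placeholder and run the increment op.

    global_step, increment_step, steps_to_increment = create_global_steps()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Advance the counter by the 64 steps collected this iteration.
        sess.run(increment_step, feed_dict={steps_to_increment: 64})
        print(sess.run(global_step))  # -> 64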
Example #6
    def create_normalizer_update(self, vector_input):
        # Based on Welford's algorithm for running mean and standard deviation, for batch updates. Discussion here:
        # https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates
        steps_increment = tf.shape(vector_input)[0]
        total_new_steps = tf.add(self.normalization_steps, steps_increment)

        # Compute the incremental update and divide by the number of new steps.
        input_to_old_mean = tf.subtract(vector_input, self.running_mean)
        new_mean = self.running_mean + tf.reduce_sum(
            input_to_old_mean / tf.cast(total_new_steps, dtype=tf.float32),
            axis=0)
        # Compute difference of input to the new mean for Welford update
        input_to_new_mean = tf.subtract(vector_input, new_mean)
        new_variance = self.running_variance + tf.reduce_sum(
            input_to_new_mean * input_to_old_mean, axis=0)
        update_mean = tf.assign(self.running_mean, new_mean)
        update_variance = tf.assign(self.running_variance, new_variance)
        update_norm_step = tf.assign(self.normalization_steps, total_new_steps)
        return tf.group([update_mean, update_variance, update_norm_step])

    def start_learning(self, env_manager: EnvManager, inital_weights):
        self._create_output_path(self.output_path)
        # tf.reset_default_graph()
        global_step = 0
        last_brain_behavior_ids: Set[str] = set()
        try:
            # Initial reset
            self._reset_env(env_manager)
            first_step = True
            while self._not_done_training():
                external_brain_behavior_ids = set(
                    env_manager.external_brains.keys())
                new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
                self._create_trainers_and_managers(env_manager,
                                                   new_behavior_ids)
                # Load initial weights on the first step, if any were provided.
                if inital_weights is not None and first_step:
                    print("Loading initial weights!")
                    # Set weights
                    with self.trainers['Brain'].get_policy(
                            0).graph.as_default():
                        _vars = tf.get_collection(
                            tf.GraphKeys.GLOBAL_VARIABLES)
                        values = [
                            v.eval(session=self.trainers['Brain'].get_policy(
                                0).sess) for v in _vars
                        ]
                        self.trainers['Brain'].get_policy(0).assign_phs = []
                        self.trainers['Brain'].get_policy(0).assign_ops = []
                        for var, value in zip(_vars, values):
                            assign_ph = tf.placeholder(var.dtype,
                                                       shape=value.shape)
                            self.trainers['Brain'].get_policy(
                                0).assign_phs.append(assign_ph)
                            self.trainers['Brain'].get_policy(
                                0).assign_ops.append(tf.assign(var, assign_ph))
                        # print(self.trainers['Brain'].get_policy(0).assign_ops)
                        # print(self.trainers['Brain'].get_policy(0).assign_phs)
                    self.trainers['Brain'].get_policy(0).load_weights(
                        inital_weights)
                    print("Inital weights loaded succesfully!")

                last_brain_behavior_ids = external_brain_behavior_ids
                n_steps = self.advance(env_manager)
                # print("Current weights: " + str(self.trainers['Brain'].get_policy(0).get_weights()[8]))

                for _ in range(n_steps):
                    global_step += 1
                    self.reset_env_if_ready(env_manager, global_step)
                first_step = False

            # Stop advancing trainers and join the trainer threads.
            self.step = self.trainers['Brain'].step
            self.join_threads()
        except (
                KeyboardInterrupt,
                UnityCommunicationException,
                UnityEnvironmentException,
                UnityCommunicatorStoppedException,
        ) as ex:
            self.join_threads()
            self.logger.info(
                "Learning was interrupted. Please wait while the graph is generated."
            )
            if isinstance(ex, KeyboardInterrupt) or isinstance(
                    ex, UnityCommunicatorStoppedException):
                pass
            else:
                # If the environment failed, we want to make sure to raise
                # the exception so we exit the process with a return code of 1.
                raise ex
        finally:
            # print("Weights after train: " + str(self.trainers['Brain'].get_policy(0).get_weights()[8]))
            # self.weights = self.trainers['Brain'].get_policy(0).get_weights()
            if self.train_model:
                self._save_model()
                self._export_graph()
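            # Note: because this return is inside "finally", it also runs after an
            # interrupt and suppresses any exception re-raised in the except block.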
            return self.trainers['Brain'].get_policy(0).get_weights()
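Because the final weights are returned from start_learning, they can be passed back in as inital_weights to chain successive training runs. A minimal sketch (`controller` and `num_runs` are assumed names for the trainer controller instance and the number of runs):

    weights = None
    for _ in range(num_runs):
        # The first run starts from scratch; later runs resume from the returned weights.
        weights = controller.start_learning(env_manager, weights)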