Пример #1
0
    def make_actor(self, obs, reuse=False, scope="pi"):
        """Create the actor tensors of a Gaussian policy.

        The mean of the policy is produced by `create_fcnet`, preceded by
        `create_conv` when ``model_params["model_type"]`` is "conv". The log
        standard deviation is a separate variable named "logstd" with shape
        ``[1, action_dim]`` that is initialized to zero and does not take the
        observation as input.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the actor

        Returns
        -------
        tf.Tensor
            an action sampled as ``mean + exp(log_std) * noise``, where the
            noise is drawn from `tf.random.normal`
        tf.Tensor
            the mean of the policy, as returned by `create_fcnet`
        tf.Variable
            the log standard deviation of the policy
        """
        # Initial image pre-processing (for convolutional policies).
        if self.model_params["model_type"] == "conv":
            pi_h = create_conv(
                obs=obs,
                image_height=self.model_params["image_height"],
                image_width=self.model_params["image_width"],
                image_channels=self.model_params["image_channels"],
                ignore_flat_channels=self.model_params["ignore_flat_channels"],
                ignore_image=self.model_params["ignore_image"],
                filters=self.model_params["filters"],
                kernel_sizes=self.model_params["kernel_sizes"],
                strides=self.model_params["strides"],
                act_fun=self.model_params["act_fun"],
                layer_norm=self.model_params["layer_norm"],
                scope=scope,
                reuse=reuse,
            )
        else:
            pi_h = obs

        # Create the output mean.
        policy_mean = create_fcnet(
            obs=pi_h,
            layers=self.model_params["layers"],
            num_output=self.ac_space.shape[0],
            stochastic=False,
            act_fun=self.model_params["act_fun"],
            layer_norm=self.model_params["layer_norm"],
            scope=scope,
            reuse=reuse,
        )

        # Create the output log_std. The compat.v1 endpoint is used (instead
        # of the TF1-only top-level alias) so this also resolves under TF2.
        # NOTE(review): this variable is created in whichever variable scope
        # is active at call time; `scope` and `reuse` are only forwarded to
        # create_conv/create_fcnet. Confirm callers wrap this method in a
        # variable scope when parameters are meant to be reused.
        log_std = tf.compat.v1.get_variable(
            name='logstd',
            shape=[1, self.ac_space.shape[0]],
            initializer=tf.zeros_initializer())

        # Create a method to sample from the distribution. The [1, action_dim]
        # standard deviation broadcasts against the noise, which takes the
        # dynamic shape of the mean.
        std = tf.exp(log_std)
        action = policy_mean + std * tf.random.normal(
            shape=tf.shape(policy_mean), dtype=tf.float32)

        return action, policy_mean, log_std
Пример #2
0
    def make_critic(self, obs, action, reuse=False, scope="qf"):
        """Build the Q-function output for an observation/action pair.

        The observation and action are joined along the last axis. When
        ``model_params["model_type"]`` is "conv", the joined tensor is first
        passed through `create_conv`; the result is then passed through
        `create_fcnet` with a single output.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        action : tf.compat.v1.placeholder
            the input action placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the critic

        Returns
        -------
        tf.Variable
            the output from the critic
        """
        params = self.model_params

        # Join observations and actions into a single critic input.
        critic_in = tf.concat([obs, action], axis=-1)

        use_conv = params["model_type"] == "conv"

        # Keyword arguments shared by the convolutional and dense stages.
        common = dict(
            act_fun=params["act_fun"],
            layer_norm=params["layer_norm"],
            batch_norm=params["batch_norm"],
            phase=self.phase_ph,
            dropout=params["dropout"],
            rate=self.rate_ph,
            scope=scope,
            reuse=reuse,
        )

        # Image pre-processing, only for convolutional models.
        if use_conv:
            critic_in = create_conv(
                obs=critic_in,
                image_height=params["image_height"],
                image_width=params["image_width"],
                image_channels=params["image_channels"],
                ignore_flat_channels=params["ignore_flat_channels"],
                ignore_image=params["ignore_image"],
                filters=params["filters"],
                kernel_sizes=params["kernel_sizes"],
                strides=params["strides"],
                **common
            )

        # Dense layers ending in one output value.
        return create_fcnet(
            obs=critic_in,
            layers=params["layers"],
            num_output=1,
            stochastic=False,
            output_pre="qf_",
            **common
        )
Пример #3
0
    def make_actor(self, obs, ac_space, reuse=False, scope="pi"):
        """Build the bounded actor output for one agent.

        The network output (from `create_fcnet`, preceded by `create_conv`
        when ``model_params["model_type"]`` is "conv") is passed through a
        tanh and then shifted and scaled using the bounds of `ac_space`.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder of the individual agent
        ac_space : gym.space.*
            the action space of the individual agent
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the actor

        Returns
        -------
        tf.Variable
            the output from the actor
        """
        params = self.model_params
        use_conv = params["model_type"] == "conv"

        # Keyword arguments shared by the convolutional and dense stages.
        common = dict(
            act_fun=params["act_fun"],
            layer_norm=params["layer_norm"],
            scope=scope,
            reuse=reuse,
        )

        # Convolutional models pre-process the observation; every other model
        # type hands it to the dense layers untouched.
        hidden = obs
        if use_conv:
            hidden = create_conv(
                obs=obs,
                image_height=params["image_height"],
                image_width=params["image_width"],
                image_channels=params["image_channels"],
                ignore_flat_channels=params["ignore_flat_channels"],
                ignore_image=params["ignore_image"],
                filters=params["filters"],
                kernel_sizes=params["kernel_sizes"],
                strides=params["strides"],
                **common
            )

        # One output per action dimension.
        raw_output = create_fcnet(
            obs=hidden,
            layers=params["layers"],
            num_output=ac_space.shape[0],
            stochastic=False,
            **common
        )

        # Midpoint and half-width of the action bounds.
        center = (ac_space.high + ac_space.low) / 2.
        half_range = (ac_space.high - ac_space.low) / 2.

        # tanh limits the raw output to (-1, 1); the affine map then places
        # it between the low and high bounds of the action space.
        squashed = tf.nn.tanh(raw_output)
        return center + half_range * squashed
Пример #4
0
    def make_critic(self, obs, reuse=False, scope="qf"):
        """Build the critic output for an observation.

        The observation is passed through `create_conv` when
        ``model_params["model_type"]`` is "conv", and then through
        `create_fcnet` with a single output.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the critic

        Returns
        -------
        tf.Variable
            the output from the critic
        """
        params = self.model_params
        use_conv = params["model_type"] == "conv"

        # Keyword arguments shared by the convolutional and dense stages.
        common = dict(
            act_fun=params["act_fun"],
            layer_norm=params["layer_norm"],
            scope=scope,
            reuse=reuse,
        )

        # Image pre-processing, only for convolutional models.
        hidden = obs
        if use_conv:
            hidden = create_conv(
                obs=obs,
                image_height=params["image_height"],
                image_width=params["image_width"],
                image_channels=params["image_channels"],
                ignore_flat_channels=params["ignore_flat_channels"],
                ignore_image=params["ignore_image"],
                filters=params["filters"],
                kernel_sizes=params["kernel_sizes"],
                strides=params["strides"],
                **common
            )

        # Dense layers ending in one output value.
        return create_fcnet(
            obs=hidden,
            layers=params["layers"],
            num_output=1,
            stochastic=False,
            **common
        )
Пример #5
0
    def make_critic(self,
                    obs,
                    action=None,
                    reuse=False,
                    scope="value_fns",
                    create_qf=True,
                    create_vf=True):
        """Create the critic variables.

        Up to three networks are built under the variable scope `scope`: a
        value function (sub-scope "vf") that takes the observation, and two
        Q-functions (sub-scopes "qf1" and "qf2") that take the observation
        concatenated with the action along the last axis.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        action : tf.compat.v1.placeholder
            the input action placeholder. Required when `create_qf` is True.
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the critic
        create_qf : bool
            whether to create the Q-functions
        create_vf : bool
            whether to create the value function

        Returns
        -------
        tf.Variable
            the output from the first Q-function. Set to None if `create_qf` is
            False.
        tf.Variable
            the output from the second Q-function. Set to None if `create_qf`
            is False.
        tf.Variable
            the output from the value function. Set to None if `create_vf` is
            False.

        Raises
        ------
        ValueError
            if `create_qf` is True but no `action` is provided
        """
        # Fail fast, before anything is added to the graph: the Q-functions
        # concatenate `obs` with `action`, so the default `action=None` is
        # only valid together with `create_qf=False`.
        if create_qf and action is None:
            raise ValueError(
                "`action` must be provided when `create_qf` is True.")

        # Arguments for create_conv. These are read from model_params up
        # front, regardless of the model type.
        conv_params = dict(
            image_height=self.model_params["image_height"],
            image_width=self.model_params["image_width"],
            image_channels=self.model_params["image_channels"],
            ignore_flat_channels=self.model_params["ignore_flat_channels"],
            ignore_image=self.model_params["ignore_image"],
            filters=self.model_params["filters"],
            kernel_sizes=self.model_params["kernel_sizes"],
            strides=self.model_params["strides"],
            act_fun=self.model_params["act_fun"],
            layer_norm=self.model_params["layer_norm"],
            reuse=reuse,
        )

        # Arguments for create_fcnet; every network here has one output.
        fcnet_params = dict(
            layers=self.model_params["layers"],
            num_output=1,
            stochastic=False,
            act_fun=self.model_params["act_fun"],
            layer_norm=self.model_params["layer_norm"],
            reuse=reuse,
        )

        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            # Value function
            if create_vf:
                if self.model_params["model_type"] == "conv":
                    vf_h = create_conv(obs=obs, scope="vf", **conv_params)
                else:
                    vf_h = obs

                value_fn = create_fcnet(obs=vf_h,
                                        scope="vf",
                                        output_pre="vf_",
                                        **fcnet_params)
            else:
                value_fn = None

            # Double Q values to reduce overestimation
            if create_qf:
                # Concatenate the observations and actions.
                qf_h = tf.concat([obs, action], axis=-1)

                # Each Q-function gets its own pre-processing layers, created
                # under its own sub-scope.
                if self.model_params["model_type"] == "conv":
                    qf1_h = create_conv(obs=qf_h, scope="qf1", **conv_params)
                    qf2_h = create_conv(obs=qf_h, scope="qf2", **conv_params)
                else:
                    qf1_h = qf_h
                    qf2_h = qf_h

                qf1 = create_fcnet(obs=qf1_h,
                                   scope="qf1",
                                   output_pre="qf_",
                                   **fcnet_params)
                qf2 = create_fcnet(obs=qf2_h,
                                   scope="qf2",
                                   output_pre="qf_",
                                   **fcnet_params)
            else:
                qf1, qf2 = None, None

        return qf1, qf2, value_fn
Пример #6
0
    def make_actor(self, obs, action, reuse=False, scope="pi"):
        """Build the stochastic actor and its log-probability terms.

        `create_fcnet` is called in stochastic mode and yields a mean and a
        log standard deviation. The log standard deviation is clipped to
        ``[LOG_STD_MIN, LOG_STD_MAX]``, an action is sampled as
        ``mean + noise * std``, and `gaussian_likelihood` followed by
        `apply_squashing_func` is applied both to that sample and to the
        externally provided `action`.

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        action : tf.compat.v1.placeholder
            the input action placeholder
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the actor

        Returns
        -------
        tf.Variable
            the output from the deterministic actor
        tf.Variable
            the output from the stochastic actor
        tf.Variable
            the log-probability term computed for the action sampled from the
            policy
        tf.Variable
            the log-probability term computed for the fixed input `action`
        """
        params = self.model_params
        use_conv = params["model_type"] == "conv"

        # Keyword arguments shared by the convolutional and dense stages.
        common = dict(
            act_fun=params["act_fun"],
            layer_norm=params["layer_norm"],
            scope=scope,
            reuse=reuse,
        )

        # Image pre-processing, only for convolutional models.
        features = obs
        if use_conv:
            features = create_conv(
                obs=obs,
                image_height=params["image_height"],
                image_width=params["image_width"],
                image_channels=params["image_channels"],
                ignore_flat_channels=params["ignore_flat_channels"],
                ignore_image=params["ignore_image"],
                filters=params["filters"],
                kernel_sizes=params["kernel_sizes"],
                strides=params["strides"],
                **common
            )

        # In stochastic mode the network returns a (mean, log_std) pair.
        mean, log_std = create_fcnet(
            obs=features,
            layers=params["layers"],
            num_output=self.ac_space.shape[0],
            stochastic=True,
            **common
        )

        # Cap the log standard deviation before exponentiating it.
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)
        std = tf.exp(log_std)

        # Reparameterized sample: noise with the dynamic shape of the mean,
        # scaled by the standard deviation and shifted by the mean.
        noise = tf.random.normal(tf.shape(mean))
        sample = mean + noise * std

        # Likelihood terms before squashing, first for the sample and then
        # for the provided action.
        logp_sample = gaussian_likelihood(sample, mean, log_std)
        logp_given = gaussian_likelihood(action, mean, log_std)

        # Squash and correct the likelihood terms: the provided action first
        # (only its corrected term is kept), then the sample.
        _, _, logp_given = apply_squashing_func(mean, action, logp_given)
        deterministic_policy, sample, logp_sample = apply_squashing_func(
            mean, sample, logp_sample)

        return deterministic_policy, sample, logp_sample, logp_given
Пример #7
0
    def make_critic(self,
                    obs,
                    action=None,
                    reuse=False,
                    scope="value_fns",
                    create_qf=True,
                    create_vf=True):
        """Create the critic variables.

        Up to three networks are built under the variable scope `scope`: a
        value function (sub-scope "vf") that takes the observation, and two
        Q-functions (sub-scopes "qf1" and "qf2") that take the observation
        concatenated with the action along the last axis.

        For non-convolutional models with ``self.includes_image`` set, the
        value function input is additionally split into an image part and a
        flat part and processed inline (see the comments in the body).

        Parameters
        ----------
        obs : tf.compat.v1.placeholder
            the input observation placeholder
        action : tf.compat.v1.placeholder
            the input action placeholder. Required when `create_qf` is True.
        reuse : bool
            whether or not to reuse parameters
        scope : str
            the scope name of the critic
        create_qf : bool
            whether to create the Q-functions
        create_vf : bool
            whether to create the value function

        Returns
        -------
        tf.Variable
            the output from the first Q-function. Set to None if `create_qf` is
            False.
        tf.Variable
            the output from the second Q-function. Set to None if `create_qf`
            is False.
        tf.Variable
            the output from the value function. Set to None if `create_vf` is
            False.

        Raises
        ------
        ValueError
            if `create_qf` is True but no `action` is provided
        """
        # Fail fast, before anything is added to the graph: the Q-functions
        # concatenate `obs` with `action`, so the default `action=None` is
        # only valid together with `create_qf=False`.
        if create_qf and action is None:
            raise ValueError(
                "`action` must be provided when `create_qf` is True.")

        # Arguments for create_conv. These are read from model_params up
        # front, regardless of the model type.
        conv_params = dict(
            image_height=self.model_params["image_height"],
            image_width=self.model_params["image_width"],
            image_channels=self.model_params["image_channels"],
            ignore_flat_channels=self.model_params["ignore_flat_channels"],
            ignore_image=self.model_params["ignore_image"],
            filters=self.model_params["filters"],
            kernel_sizes=self.model_params["kernel_sizes"],
            strides=self.model_params["strides"],
            act_fun=self.model_params["act_fun"],
            layer_norm=self.model_params["layer_norm"],
            reuse=reuse,
        )

        # Arguments for create_fcnet; every network here has one output.
        fcnet_params = dict(
            layers=self.model_params["layers"],
            num_output=1,
            stochastic=False,
            act_fun=self.model_params["act_fun"],
            layer_norm=self.model_params["layer_norm"],
            reuse=reuse,
        )

        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            # Value function
            if create_vf:
                if self.model_params["model_type"] == "conv":
                    vf_h = create_conv(obs=obs, scope="vf", **conv_params)
                else:
                    vf_h = obs

                    # if an image is present in the observation
                    # extra processing steps are needed
                    # NOTE(review): this branch reads its settings from
                    # instance attributes (self.image_height, self.filters,
                    # ...) rather than from self.model_params - confirm both
                    # sources are kept in sync.
                    if self.includes_image:

                        batch_size = tf.shape(vf_h)[0]
                        # Number of leading columns that hold the flattened
                        # image; the remaining columns are the flat features.
                        image_size = (self.image_height * self.image_width *
                                      self.image_channels)

                        original_vf_h = vf_h
                        vf_h = original_vf_h[:, image_size:]

                        # Drop the flat-feature columns listed in
                        # ignore_flat_channels (indices are relative to the
                        # non-image part of the observation).
                        vf_h = tf.gather(
                            vf_h,
                            [
                                i for i in range(vf_h.shape[1])
                                if i not in self.ignore_flat_channels
                            ],
                            axis=1)

                        # ignoring the image is useful for the lower level
                        # for creating an abstraction barrier
                        if not self.ignore_image:

                            # Restore the image layout expected by the
                            # convolutional layers.
                            vf_h_image = tf.reshape(
                                original_vf_h[:, :image_size], [
                                    batch_size, self.image_height,
                                    self.image_width, self.image_channels
                                ])

                            # create the hidden convolutional layers
                            for i, (filters, kernel_size,
                                    strides) in enumerate(
                                        zip(self.filters, self.kernel_sizes,
                                            self.strides)):

                                vf_h_image = self._conv_layer(
                                    vf_h_image,
                                    filters,
                                    kernel_size,
                                    strides,
                                    'conv{}'.format(i),
                                    act_fun=self.act_fun,
                                    layer_norm=self.layer_norm)

                            # Flatten the convolutional features, divide them
                            # by their element count, and place them in front
                            # of the remaining flat features.
                            # NOTE(review): the division presumably keeps the
                            # image features on a scale comparable to the
                            # flat features - confirm intent.
                            h = vf_h_image.shape[1]
                            w = vf_h_image.shape[2]
                            c = vf_h_image.shape[3]
                            vf_h = tf.concat([
                                tf.reshape(vf_h_image, [batch_size, h * w * c])
                                / tf.cast(h * w * c, tf.float32), vf_h
                            ], 1)

                value_fn = create_fcnet(obs=vf_h,
                                        scope="vf",
                                        output_pre="vf_",
                                        **fcnet_params)
            else:
                value_fn = None

            # Double Q values to reduce overestimation
            if create_qf:
                # Concatenate the observations and actions.
                qf_h = tf.concat([obs, action], axis=-1)

                # NOTE(review): for non-conv models the Q-functions receive
                # the raw concatenated input; the inline image processing
                # above is applied to the value function only - confirm this
                # asymmetry is intended.
                if self.model_params["model_type"] == "conv":
                    qf1_h = create_conv(obs=qf_h, scope="qf1", **conv_params)
                    qf2_h = create_conv(obs=qf_h, scope="qf2", **conv_params)
                else:
                    qf1_h = qf_h
                    qf2_h = qf_h

                qf1 = create_fcnet(obs=qf1_h,
                                   scope="qf1",
                                   output_pre="qf_",
                                   **fcnet_params)
                qf2 = create_fcnet(obs=qf2_h,
                                   scope="qf2",
                                   output_pre="qf_",
                                   **fcnet_params)
            else:
                qf1, qf2 = None, None

        return qf1, qf2, value_fn