Example #1
    def test_hparams(self):
        """Tests the priority of the encoder architecture parameter.
        """

        inputs = tf.placeholder(dtype=tf.int32, shape=[None, None])

        # case 1: set "pretrained_model_name" by constructor argument
        encoder = XLNetEncoder(pretrained_model_name="xlnet-large-cased",
                               hparams={})
        encoder(inputs)
        self.assertEqual(len(encoder.attn_layers), 24)
        self.assertEqual(len(encoder.ff_layers), 24)

        # case 2: set "pretrained_model_name" by hparams
        hparams = {"pretrained_model_name": "xlnet-base-cased"}
        encoder = XLNetEncoder(hparams=hparams)
        encoder(inputs)
        self.assertEqual(len(encoder.attn_layers), 12)
        self.assertEqual(len(encoder.ff_layers), 12)

        # case 3: "pretrained_model_name" set to None in both hparams and
        # constructor argument; no pre-trained model is loaded
        hparams = {"pretrained_model_name": None, "num_layers": 16}
        encoder = XLNetEncoder(hparams=hparams)
        encoder(inputs)
        self.assertEqual(len(encoder.attn_layers), 16)
        self.assertEqual(len(encoder.ff_layers), 16)

        # case 4: using default hparams
        encoder = XLNetEncoder()
        encoder(inputs)
        self.assertEqual(len(encoder.attn_layers), 12)
        self.assertEqual(len(encoder.ff_layers), 12)
Example #2
    def test_model_loading(self):
        r"""Tests model loading functionality."""

        inputs = tf.placeholder(dtype=tf.int32, shape=[None, None])

        for pretrained_model_name in XLNetEncoder.available_checkpoints():
            encoder = XLNetEncoder(pretrained_model_name=pretrained_model_name)
            _ = encoder(inputs)
Example #3
    def __init__(self,
                 pretrained_model_name=None,
                 cache_dir=None,
                 hparams=None):
        super(XLNetClassifier, self).__init__(hparams=hparams)

        with tf.variable_scope(self.variable_scope):
            tf.get_variable_scope().set_initializer(
                get_initializer(self._hparams.initializer))
            # Creates the underlying encoder
            encoder_hparams = dict_fetch(hparams,
                                         XLNetEncoder.default_hparams())
            if encoder_hparams is not None:
                encoder_hparams['name'] = "encoder"
            self._encoder = XLNetEncoder(
                pretrained_model_name=pretrained_model_name,
                cache_dir=cache_dir,
                hparams=encoder_hparams)
            if self._hparams.use_projection:
                self.projection = get_layer(
                    hparams={
                        "type": "Dense",
                        "kwargs": {
                            "units": self._encoder.output_size
                        }
                    })

            # Creates a dropout layer
            drop_kwargs = {"rate": self._hparams.dropout}
            layer_hparams = {"type": "Dropout", "kwargs": drop_kwargs}
            self._dropout_layer = get_layer(hparams=layer_hparams)

            # Creates an additional classification layer if needed
            self._num_classes = self._hparams.num_classes
            if self._num_classes <= 0:
                self._logit_layer = None
            else:
                logit_kwargs = self._hparams.logit_layer_kwargs
                if logit_kwargs is None:
                    logit_kwargs = {}
                elif not isinstance(logit_kwargs, HParams):
                    raise ValueError(
                        "hparams['logit_layer_kwargs'] must be a dict.")
                else:
                    logit_kwargs = logit_kwargs.todict()
                logit_kwargs.update({"units": self._num_classes})
                if 'name' not in logit_kwargs:
                    logit_kwargs['name'] = "logit_layer"

                layer_hparams = {"type": "Dense", "kwargs": logit_kwargs}
                self._logit_layer = get_layer(hparams=layer_hparams)
Example #4
    def default_hparams():
        r"""Returns a dictionary of hyperparameters with default values.

        .. code-block:: python

            {
                # (1) Same hyperparameters as in XLNetEncoder
                ...
                # (2) Additional hyperparameters
                "regr_strategy": "cls_time",
                "use_projection": True,
                "logit_layer_kwargs": None,
                "name": "xlnet_regressor",
            }

        Here:

        1. Same hyperparameters as in
           :class:`~texar.tf.modules.XLNetEncoder`.
           See the :meth:`~texar.tf.modules.XLNetEncoder.default_hparams`.
           An instance of XLNetEncoder is created for feature extraction.

        2. Additional hyperparameters:

            `"regr_strategy"`: str
                The regression strategy, one of:

                - **cls_time**: Sequence-level regression based on the
                  output of the last time step (which is the `CLS` token).
                  Each sequence has a prediction.
                - **all_time**: Sequence-level regression based on
                  the output of all time steps. Each sequence has a prediction.
                - **time_wise**: Step-wise regression, i.e., make
                  regression for each time step based on its output.

            `"logit_layer_kwargs"` : dict
                Keyword arguments for the logit Dense layer constructor,
                except for argument "units", which is set to 1.
                Ignored if no extra logit layer is appended.

            `"use_projection"`: bool
                If `True`, an additional dense layer is added after
                the summary step.

            `"name"`: str
                Name of the regressor.
        """
        hparams = XLNetEncoder.default_hparams()
        hparams.update({
            "logit_layer_kwargs": None,
            "regr_strategy": "cls_time",
            "dropout": 0.1,
            "use_projection": True,
            "name": "xlnet_regressor"
        })
        return hparams
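
A hedged usage sketch for the hyperparameters documented above (assumes
XLNetRegressor is importable from texar.tf.modules; the override values
are illustrative, not defaults):

    # Build a step-wise regressor from scratch, overriding only the
    # additional hyperparameters; everything else falls back to the
    # XLNetEncoder defaults.
    regressor = XLNetRegressor(hparams={
        "pretrained_model_name": None,
        "regr_strategy": "time_wise",
        "use_projection": False,
    })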
Example #5
    def test_encode(self):
        """Tests encoding.
        """
        # case 1: no pre-trained model, untie_r=False
        hparams = {"pretrained_model_name": None, "untie_r": False}
        encoder = XLNetEncoder(hparams=hparams)

        max_time = 8
        batch_size = 128
        inputs = tf.random_uniform([batch_size, max_time],
                                   maxval=30521,
                                   dtype=tf.int32)
        outputs, _ = encoder(inputs)

        outputs_dim = encoder.hparams.hidden_dim
        with self.session() as sess:
            sess.run(tf.global_variables_initializer())
            outputs_ = sess.run(outputs)
            self.assertEqual(outputs_.shape,
                             (batch_size, max_time, outputs_dim))

        # case 2: no pre-trained model, untie_r=True
        hparams = {"pretrained_model_name": None, "untie_r": True}

        encoder = XLNetEncoder(hparams=hparams)
        outputs, _ = encoder(inputs)
        with self.session() as sess:
            sess.run(tf.global_variables_initializer())
            outputs_ = sess.run(outputs)
            self.assertEqual(outputs_.shape,
                             (batch_size, max_time, outputs_dim))

        # case 3: no pre-trained model, all other hparams at defaults
        hparams = {"pretrained_model_name": None}
        encoder = XLNetEncoder(hparams=hparams)
        outputs_dim = encoder.hparams.hidden_dim
        outputs, _ = encoder(inputs)
        with self.session() as sess:
            sess.run(tf.global_variables_initializer())
            outputs_ = sess.run(outputs)
            self.assertEqual(outputs_.shape,
                             (batch_size, max_time, outputs_dim))
Example #6
    def test_trainable_variables(self):
        """Tests the functionality of automatically collecting trainable
        variables.
        """

        inputs = tf.placeholder(dtype=tf.int32, shape=[None, None])

        # case 1: XLNet with no pre-trained model
        encoder = XLNetEncoder(hparams={
            "pretrained_model_name": None,
            "untie_r": False
        })
        encoder(inputs)

        n_word_embed_vars = 1
        n_mask_embed_vars = 1
        n_bias_vars = 3  # r_r_bias, r_w_bias, r_s_bias
        n_pos_wise_ff_vars = 6  # 2 kernels + 2 bias + beta + gamma
        n_rel_multi_head_vars = 7  # q,k,v,r,o + beta + gamma
        n_segment_embed_vars = 1
        n_layers = encoder.hparams.num_layers
        n_trainable_variables = \
            n_word_embed_vars + n_segment_embed_vars + n_mask_embed_vars + \
            n_layers * (n_rel_multi_head_vars + n_pos_wise_ff_vars) + \
            n_bias_vars
        self.assertEqual(len(encoder.trainable_variables),
                         n_trainable_variables)

        # case 2: XLNet with pre-trained model
        hparams = {"pretrained_model_name": "xlnet-large-cased"}
        encoder = XLNetEncoder(hparams=hparams)
        encoder(inputs)
        n_segment_embed_vars = 1
        n_layers = encoder.hparams.num_layers
        n_trainable_variables = \
            n_word_embed_vars + n_segment_embed_vars + n_mask_embed_vars + \
            n_layers * (n_rel_multi_head_vars + n_pos_wise_ff_vars) \
            + n_bias_vars
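        # For "xlnet-large-cased" (num_layers = 24) this evaluates to
        # 3 + 24 * (7 + 6) + 3 = 318 trainable variables.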
        self.assertEqual(len(encoder.trainable_variables),
                         n_trainable_variables)
Example #7
class XLNetClassifier(ClassifierBase, PretrainedXLNetMixin):
    """Classifier based on XLNet modules. Please see
    :class:`~texar.tf.modules.PretrainedXLNetMixin` for a brief description
    of XLNet.

    This is a combination of the :class:`~texar.tf.modules.XLNetEncoder` with a
    classification layer. Both step-wise classification and sequence-level
    classification are supported, specified in :attr:`hparams`.

    Arguments are the same as in :class:`~texar.tf.modules.XLNetEncoder`.

    Args:
        pretrained_model_name (optional): a `str`, the name
            of a pre-trained model (e.g., ``xlnet-base-cased``). Please refer to
            :class:`~texar.tf.modules.PretrainedXLNetMixin` for
            all supported models.
            If `None`, the model name in :attr:`hparams` is used.
        cache_dir (optional): the path to a folder in which the
            pre-trained models will be cached. If `None` (default),
            a default directory (``texar_data`` folder under user's home
            directory) will be used.
        hparams (dict or HParams, optional): Hyperparameters. Missing
            hyperparameters will be set to default values. See
            :meth:`default_hparams` for the hyperparameter structure
            and default values.

    .. document private functions
    .. automethod:: _build
    """

    def __init__(self,
                 pretrained_model_name=None,
                 cache_dir=None,
                 hparams=None):
        super(XLNetClassifier, self).__init__(hparams=hparams)

        with tf.variable_scope(self.variable_scope):
            tf.get_variable_scope().set_initializer(
                get_initializer(self._hparams.initializer))
            # Creates the underlying encoder
            encoder_hparams = dict_fetch(
                hparams, XLNetEncoder.default_hparams())
            if encoder_hparams is not None:
                encoder_hparams['name'] = "encoder"
            self._encoder = XLNetEncoder(
                pretrained_model_name=pretrained_model_name,
                cache_dir=cache_dir,
                hparams=encoder_hparams)
            if self._hparams.use_projection:
                self.projection = get_layer(hparams={
                    "type": "Dense",
                    "kwargs": {
                        "units": self._encoder.output_size
                    }
                })

            # Creates a dropout layer
            drop_kwargs = {"rate": self._hparams.dropout}
            layer_hparams = {"type": "Dropout", "kwargs": drop_kwargs}
            self._dropout_layer = get_layer(hparams=layer_hparams)

            # Creates an additional classification layer if needed
            self._num_classes = self._hparams.num_classes
            if self._num_classes <= 0:
                self._logit_layer = None
            else:
                logit_kwargs = self._hparams.logit_layer_kwargs
                if logit_kwargs is None:
                    logit_kwargs = {}
                elif not isinstance(logit_kwargs, HParams):
                    raise ValueError(
                        "hparams['logit_layer_kwargs'] must be a dict.")
                else:
                    logit_kwargs = logit_kwargs.todict()
                logit_kwargs.update({"units": self._num_classes})
                if 'name' not in logit_kwargs:
                    logit_kwargs['name'] = "logit_layer"

                layer_hparams = {"type": "Dense", "kwargs": logit_kwargs}
                self._logit_layer = get_layer(hparams=layer_hparams)

    @staticmethod
    def default_hparams():
        r"""Returns a dictionary of hyperparameters with default values.

        .. code-block:: python

            {
                # (1) Same hyperparameters as in XLNetEncoder
                ...
                # (2) Additional hyperparameters
                "clas_strategy": "cls_time",
                "use_projection": True,
                "num_classes": 2,
                "logit_layer_kwargs": None,
                "name": "xlnet_classifier",
            }

        Here:

        1. Same hyperparameters as in
           :class:`~texar.tf.modules.XLNetEncoder`.
           See the :meth:`~texar.tf.modules.XLNetEncoder.default_hparams`.
           An instance of XLNetEncoder is created for feature extraction.

        2. Additional hyperparameters:

            `"clas_strategy"`: str
                The classification strategy, one of:

                - **cls_time**: Sequence-level classification based on the
                  output of the last time step (which is the `CLS` token).
                  Each sequence has a class.
                - **all_time**: Sequence-level classification based on
                  the output of all time steps. Each sequence has a class.
                - **time_wise**: Step-wise classification, i.e., make
                  classification for each time step based on its output.

            `"use_projection"`: bool
                If `True`, an additional `Dense` layer is added after the
                summary step.

            `"num_classes"`: int
                Number of classes:

                - If **> 0**, an additional dense layer is appended to the
                  encoder to compute the logits over classes.
                - If **<= 0**, no dense layer is appended. The number of
                  classes is assumed to be the final dense layer size of the
                  encoder.

            `"logit_layer_kwargs"` : dict
                Keyword arguments for the logit Dense layer constructor,
                except for argument "units" which is set to "num_classes".
                Ignored if no extra logit layer is appended.

            `"name"`: str
                Name of the classifier.
        """
        hparams = XLNetEncoder.default_hparams()
        hparams.update({
            "num_classes": 2,
            "logit_layer_kwargs": None,
            "clas_strategy": "cls_time",
            "dropout": 0.1,
            "use_projection": True,
            "name": "xlnet_classifier"
        })
        return hparams
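
    # A minimal override sketch for the hyperparameters documented above
    # (illustrative values, not defaults): a 5-way step-wise classifier
    # trained from scratch could be built as
    #
    #     clf = XLNetClassifier(hparams={
    #         "pretrained_model_name": None,
    #         "clas_strategy": "time_wise",
    #         "num_classes": 5,
    #     })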

    def param_groups(self, lr=None, lr_layer_scale=1.0,
                     decay_base_params=False):
        r"""Create parameter groups for optimizers. When
        :attr:`lr_layer_scale` is not 1.0, parameters from each layer form
        separate groups with different base learning rates.

        This method should be called before applying gradients to the variables
        through the optimizer. Particularly, after calling the optimizer's
        `compute_gradients` method, the user can call this method to get
        variable-specific learning rates for the network. The gradients for each
        variables can then be scaled accordingly. These scaled gradients are
        finally applied by calling optimizer's `apply_gradients` method.

        Args:
            lr (float): The learning rate. Can be omitted if
                :attr:`lr_layer_scale` is 1.0.
            lr_layer_scale (float): Per-layer LR scaling rate. The `i`-th layer
                will be scaled by `lr_layer_scale ^ (num_layers - i - 1)`.
            decay_base_params (bool): If `True`, treat non-layer parameters
                (e.g. embeddings) as if they're in layer 0. If `False`, these
                parameters are not scaled.

        Returns: A dict mapping TensorFlow variables to their learning rates.
        """
        vars_to_learning_rates = {}
        if lr_layer_scale != 1.0:
            if lr is None:
                raise ValueError(
                    "lr must be specified when lr_layer_decay_rate is not 1.0")

            scope = self.variable_scope.name
            projection_vars = tf.trainable_variables(scope=scope + "/dense")
            logits_vars = tf.trainable_variables(
                scope=self.variable_scope.name + "/logit_layer")
            finetune_vars = projection_vars + logits_vars
            for var in finetune_vars:
                vars_to_learning_rates[var] = lr

            vars_to_learning_rates.update(
                self._encoder.param_groups(lr=lr,
                                           lr_layer_scale=lr_layer_scale,
                                           decay_base_params=decay_base_params))
        else:
            for variable in self.trainable_variables:
                vars_to_learning_rates[variable] = lr

        return vars_to_learning_rates
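
    # A hedged sketch of the workflow described in the docstring above
    # (hypothetical names `clf` and `loss`; plain SGD is used so that
    # scaling a gradient is exactly equivalent to scaling its learning
    # rate):
    #
    #     optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
    #     var_to_lr = clf.param_groups(lr=2e-5, lr_layer_scale=0.75)
    #     grads_and_vars = optimizer.compute_gradients(
    #         loss, var_list=list(var_to_lr))
    #     scaled = [(grad * var_to_lr[var], var)
    #               for grad, var in grads_and_vars if grad is not None]
    #     train_op = optimizer.apply_gradients(scaled)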

    def _build(self, token_ids, segment_ids=None, input_mask=None, mode=None):
        r"""Feeds the inputs through the network and makes classification.

        Args:
            token_ids: Shape `[batch_size, max_time]`.
            segment_ids: Shape `[batch_size, max_time]`.
            input_mask: Float tensor of shape `[batch_size, max_time]`. Note
                that positions with value 1 are masked out.
            mode (optional): A tensor taking value in
                :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`,
                including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle
                dropout.
                If `None` (default), :func:`texar.tf.global_mode` is used.

        Returns:
            A tuple `(logits, pred)`, containing the logits over classes and
            the predictions, respectively.

            - If ``clas_strategy`` is ``cls_time`` or ``all_time``:

                - If ``num_classes`` == 1, ``logits`` and ``pred`` are both of
                  shape ``[batch_size]``.
                - If ``num_classes`` > 1, ``logits`` is of shape
                  ``[batch_size, num_classes]`` and ``pred`` is of shape
                  ``[batch_size]``.

            - If ``clas_strategy`` is ``time_wise``:

                - If ``num_classes`` == 1, ``logits`` and ``pred`` are both of
                  shape ``[batch_size, max_time]``.
                - If ``num_classes`` > 1, ``logits`` is of shape
                  ``[batch_size, max_time, num_classes]`` and ``pred`` is of
                  shape ``[batch_size, max_time]``.
        """
        is_training = is_train_mode(mode)
        output, _ = self._encoder(token_ids, segment_ids, input_mask=input_mask,
                                  mode=mode)
        strategy = self._hparams.clas_strategy
        if strategy == "time_wise":
            summary = output
        elif strategy == "cls_time":
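            # XLNet appends the special `CLS` token at the end of each
            # sequence, so the sequence-level summary is the output of the
            # last time step.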
            summary = output[:, -1]
        elif strategy == "all_time":
            length_diff = self._hparams.max_seq_len - tf.shape(token_ids)[1]
            summary_input = tf.pad(output,
                                   paddings=[[0, 0], [0, length_diff], [0, 0]])
            summary_input_dim = \
                self._encoder.output_size * self._hparams.max_seq_len
            summary = tf.reshape(summary_input, shape=[-1, summary_input_dim])
        else:
            raise ValueError("Unknown classification strategy: {}"
                             .format(strategy))

        if self._hparams.use_projection:
            summary = tf.tanh(self.projection(summary))
        # summary: (batch_size, hidden_dim)
        summary = self._dropout_layer(summary, training=is_training)

        logits = (self._logit_layer(summary) if self._logit_layer is not None
                  else summary)

        # Compute predictions
        num_classes = self._hparams.num_classes
        is_binary = num_classes == 1 or (num_classes <= 0
                                         and logits.shape[-1] == 1)

        if strategy == "time_wise":
            if is_binary:
                pred = tf.squeeze(tf.greater(logits, 0), -1)
                logits = tf.squeeze(logits, -1)
            else:
                pred = tf.argmax(logits, axis=-1)
        else:
            if is_binary:
                pred = tf.greater(logits, 0)
                logits = tf.reshape(logits, [-1])
            else:
                pred = tf.argmax(logits, axis=-1)
            pred = tf.reshape(pred, [-1])

        pred = tf.to_int64(pred)

        if not self._built:
            self._add_internal_trainable_variables()
            if self._logit_layer:
                self._add_trainable_variable(
                    self._logit_layer.trainable_variables)
            self._built = True

        return logits, pred
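
A hedged end-to-end sketch combining the classifier pieces above (same
TF1 graph-mode setup as the tests; names are illustrative):

    # Binary sequence-level classification with a pre-trained checkpoint.
    clf = XLNetClassifier(pretrained_model_name="xlnet-base-cased",
                          hparams={"num_classes": 1})
    token_ids = tf.placeholder(dtype=tf.int32, shape=[None, None])
    segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None])
    logits, pred = clf(token_ids, segment_ids)
    # Per the `_build` docstring: with num_classes == 1 and the default
    # "cls_time" strategy, `logits` and `pred` both have shape
    # [batch_size].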