    def default_hparams():
        """Returns a dictionary of hyperparameters with default values.

        See :class:`~texar.modules.TransformerEncoder` and
        :class:`~texar.modules.TransformerDecoder` for details on the shared
        hyperparameters.
        """
        return {
            'num_blocks': 6,
            'dim': 512,
            'use_bert_config': False,
            'embedding_dropout': 0.1,
            'residual_dropout': 0.1,
            'poswise_feedforward': default_transformer_poswise_net_hparams(),
            'graph_multihead_attention': {
                'name': 'graph_multihead_attention',
                'num_units': 512,
                'num_heads': 8,
                'dropout_rate': 0.1,
                'output_dim': 512,
                'use_bias': False,
            },
            'initializer': None,
            'name': 'cross_graph_transformer_fixed_length_decoder',
            'embedding_tie': True,
            'output_layer_bias': False,
            'max_decoding_length': int(1e10),
        }
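
# A minimal usage sketch for the defaults above: Texar's ``HParams`` container
# (assuming it is importable as ``texar.HParams``) recursively merges a partial
# user dict with the defaults, so unspecified fields keep their default values
# and nested dicts such as "graph_multihead_attention" are merged field by
# field. Only an excerpt of the defaults is repeated here for brevity.
import texar as tx

defaults = {
    "num_blocks": 6,
    "dim": 512,
    "graph_multihead_attention": {"num_heads": 8, "num_units": 512},
}
hparams = tx.HParams(
    {"num_blocks": 4, "graph_multihead_attention": {"num_heads": 4}}, defaults)
print(hparams.num_blocks)                           # 4   (overridden)
print(hparams.graph_multihead_attention.num_heads)  # 4   (overridden)
print(hparams.dim)                                  # 512 (default kept)
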
    def default_hparams():
        """Returns a dictionary of hyperparameters with default values.

        See :class:`~texar.modules.TransformerDecoder` for details.
        """
        return {
            "num_blocks": 6,
            "dim": 512,
            "embedding_tie": True,
            "output_layer_bias": False,
            "max_decoding_length": int(1e10),
            "embedding_dropout": 0.1,
            "residual_dropout": 0.1,
            "poswise_feedforward": default_transformer_poswise_net_hparams(),
            "graph_multihead_attention": {
                "name": "graph_multihead_attention",
                "num_units": 512,
                "num_heads": 8,
                "dropout_rate": 0.1,
                "output_dim": 512,
                "use_bias": False,
            },
            "initializer": None,
            "name": "cross_graph_transformer_sequential_decoder",
        }
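
# Illustration (not Texar's implementation) of how the "graph_multihead_attention"
# sub-hyperparameters relate to each other and to "dim": "num_units" is split
# evenly across "num_heads", and "output_dim" must equal "dim" so the residual
# connection can add the attention output back to its input. NumPy stands in
# for the actual tensor library.
import numpy as np

batch, length = 2, 10
dim, num_units, num_heads, output_dim = 512, 512, 8, 512
assert num_units % num_heads == 0
head_dim = num_units // num_heads                 # 64 units per head

x = np.random.randn(batch, length, dim)
q = x @ np.random.randn(dim, num_units)           # query projection
# Split the projection into heads: [batch, heads, length, head_dim].
q_heads = q.reshape(batch, length, num_heads, head_dim).transpose(0, 2, 1, 3)

# After attention the heads are merged back and projected to "output_dim",
# which must equal "dim" for the residual x + attention(x) to be well formed.
merged = q_heads.transpose(0, 2, 1, 3).reshape(batch, length, num_units)
out = merged @ np.random.randn(num_units, output_dim)
assert out.shape == x.shape
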
    def default_hparams():
        """Returns a dictionary of hyperparameters with default values.

        .. code-block:: python

            {
                # Same as in TransformerEncoder
                "num_blocks": 6,
                "dim": 512,
                "position_embedder_hparams": None,
                "embedding_dropout": 0.1,
                "residual_dropout": 0.1,
                "poswise_feedforward": default_transformer_poswise_net_hparams,
                "multihead_attention": {
                    "num_units": 512,
                    "num_heads": 8,
                },
                "initializer": None,
                # Additional for TransformerDecoder
                "embedding_tie": True,
                "output_layer_bias": False,
                "max_decoding_length": 1e10,
                "name": "transformer_decoder"
            }

        Here:

        "num_blocks" : int
            Number of stacked blocks.

        "dim" : int
            Hidden dimension of the decoder.

        "position_embedder_hparams" : dict, optional
            Hyperparameters of a
            :class:`~texar.modules.SinusoidsPositionEmbedder` as position
            embedder. If `None`, the
            :meth:`~texar.modules.SinusoidsPositionEmbedder.default_hparams`
            is used.

        "embedding_dropout": float
            Dropout rate of the input word and position embeddings.

        "residual_dropout" :  float
            Dropout rate of the residual connections.

        "poswise_feedforward" : dict,
            Hyperparameters for a feed-forward network used in residual
            connections.
            Make sure the dimension of the output tensor is equal to `dim`.

            See :func:`~texar.modules.default_transformer_poswise_net_hparams`
            for details.

        "multihead_attention": dict,
            Hyperparameters for the multihead attention strategy.
            Make sure the `output_dim` in this module is equal to `dim`.

            See :func:
                `~texar.modules.encoder.MultiheadAttentionEncoder.
                default_harams` for details.
            `
        "initializer" : dict, optional
            Hyperparameters of the default initializer that initializes
            variables created in this module.
            See :func:`~texar.core.get_initializer` for details.

        "embedding_tie" : bool
            Whether to use the word embedding matrix as the output layer
            that computes logits. If `False`, an additional dense layer
            is created.

        "output_layer_bias" : bool
            Whether to add a bias term to the output layer.

        "max_decoding_length" : int
            The maximum allowed number of decoding steps.
            Set to a very large number of avoid the length constraint.
            Ignored if provided in :meth:`_build` or
            "train_greedy" decoding is used.

            Length penalty coefficient. Refer to
            https://arxiv.org/abs/1609.08144 for more details.

        "name" : str
            Name of the module.
        """
        return {
            "num_blocks": 6,
            "initializer": None,
            "position_embedder_hparams": None,
            "embedding_tie": True,
            "output_layer_bias": False,
            "max_decoding_length": 1e10,
            "embedding_dropout": 0.1,
            "residual_dropout": 0.1,
            "poswise_feedforward": default_transformer_poswise_net_hparams(),
            "multihead_attention": {
                "num_units": 512,
                "dropout_rate": 0.1,
                "output_dim": 512,
                "num_heads": 8,
            },
            "dim": 512,
            "name": "transformer_decoder",
        }
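
# A minimal sketch (not Texar's code) of what "embedding_tie" means: with tying
# enabled the decoder reuses the word-embedding matrix as the output projection,
# so logits come from a matmul with its transpose (optionally plus a bias when
# "output_layer_bias" is True); otherwise a separate dense layer with its own
# weights is created.
import numpy as np

vocab_size, dim = 10000, 512
embedding = np.random.randn(vocab_size, dim)       # [vocab_size, dim]
hidden = np.random.randn(2, 7, dim)                # decoder states [batch, time, dim]

# embedding_tie = True: reuse the embedding weights for the output layer.
logits_tied = hidden @ embedding.T                 # [batch, time, vocab_size]

# embedding_tie = False: an independent output projection is created instead.
output_layer = np.random.randn(dim, vocab_size)
logits_untied = hidden @ output_layer

assert logits_tied.shape == logits_untied.shape == (2, 7, vocab_size)
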
    def default_hparams():
        """Returns a dictionary of hyperparameters with default values.

        .. code-block:: python

            {
                # Same as in TransformerEncoder
                "scale_embeds": True,
                "num_blocks": 6,
                "dim": 512,
                "position_embedder_type": "sinusoids",
                "position_size": None,
                "position_embedder_hparams": None,
                "embedding_dropout": 0.1,
                "residual_dropout": 0.1,
                "poswise_feedforward": default_transformer_poswise_net_hparams,
                "multihead_attention": {
                    'name': 'multihead_attention',
                    'num_units': 512,
                    'output_dim': 512,
                    'num_heads': 8,
                    'dropout_rate': 0.1,
                    'output_dim': 512,
                    'use_bias': False,
                },
                "initializer": None,
                "name": "transformer_decoder"

                # Additional for TransformerDecoder
                "embedding_tie": True,
                "output_layer_bias": False,
                "max_decoding_length": int(1e10),
            }

        Here:

        "scale_embeds" : bool
            Whether to scale the word embeddings by the square root of their
            dimension. `True` by default. This should be `False` when loading
            the pretrained GPT-2 model.

        "num_blocks" : int
            Number of stacked blocks.

        "dim" : int
            Hidden dimension of the decoder.

        "position_embedder_type":
            Choose from "sinusoids" or "variables".

            "sinusoids":
                create the position embedding as sinusoids, which is fixed.
            "variables":
                create the position embedding as trainable variables.
        "position_size": int
            The size of position embeddings.
            Only be used when "position_embedder_type"is "variables".

        "position_embedder_hparams" : dict, optional
            Hyperparameters of a
            :class:`~texar.modules.SinusoidsPositionEmbedder` as position
            embedder. If `None`, the
            :meth:`~texar.modules.SinusoidsPositionEmbedder.default_hparams`
            is used.

        "embedding_dropout": float
            Dropout rate of the input word and position embeddings.

        "residual_dropout" :  float
            Dropout rate of the residual connections.

        "poswise_feedforward" : dict
            Hyperparameters for a feed-forward network used in residual
            connections.
            Make sure the dimension of the output tensor is equal to `dim`.

            See :func:`~texar.modules.default_transformer_poswise_net_hparams`
            for details.

        "multihead_attention" : dict
            Hyperparameters for the multihead attention strategy.
            Make sure the `output_dim` in this module is equal to `dim`.

            See :func:`~texar.modules.MultiheadAttentionEncoder.default_hparams`
            for details.

        "initializer" : dict, optional
            Hyperparameters of the default initializer that initializes
            variables created in this module.
            See :func:`~texar.core.get_initializer` for details.

        "embedding_tie" : bool
            Whether to use the word embedding matrix as the output layer
            that computes logits. If `False`, a new dense layer
            is created.

        "output_layer_bias" : bool
            Whether to add a bias term to the output layer.

        "max_decoding_length" : int
            The maximum allowed number of decoding steps.
            Set to a very large number of avoid the length constraint.
            Ignored if provided in :meth:`_build` or
            "train_greedy" decoding is used.

            Length penalty coefficient. Refer to
            https://arxiv.org/abs/1609.08144 for more details.

        "name" : str
            Name of the module.
        """
        return {
            "scale_embeds": True,
            "num_blocks": 6,
            "dim": 512,
            "position_embedder_type": "sinusoids",
            "position_size": None,
            "position_embedder_hparams": None,
            "embedding_tie": True,
            "output_layer_bias": False,
            "max_decoding_length": int(1e10),
            "embedding_dropout": 0.1,
            "residual_dropout": 0.1,
            "poswise_feedforward": default_transformer_poswise_net_hparams(),
            "multihead_attention": {
                "name": "multihead_attention",
                "num_units": 512,
                "num_heads": 8,
                "dropout_rate": 0.1,
                "output_dim": 512,
                "use_bias": False,
            },
            "initializer": None,
            "name": "transformer_decoder",
        }
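
# Illustrative sketch (not Texar's implementation) of two of the options above:
# "scale_embeds" multiplies the word embeddings by sqrt(dim) before they enter
# the decoder, and "position_embedder_type" chooses between fixed sinusoidal
# position embeddings and a trainable [position_size, dim] lookup table.
import numpy as np

dim, vocab_size = 512, 10000
word_table = np.random.randn(vocab_size, dim)
token_ids = np.array([[5, 42, 7, 9]])              # [batch=1, length=4]
length = token_ids.shape[1]

word_embeds = word_table[token_ids]
word_embeds = word_embeds * np.sqrt(dim)           # scale_embeds = True

# "sinusoids": fixed, not trained.
positions = np.arange(length)[:, None]
rates = 1.0 / np.power(10000.0, np.arange(0, dim, 2) / dim)
sinusoid = np.zeros((length, dim))
sinusoid[:, 0::2] = np.sin(positions * rates)
sinusoid[:, 1::2] = np.cos(positions * rates)
inputs_sin = word_embeds + sinusoid

# "variables": a trainable table with "position_size" rows.
position_size = 1024
trainable_table = np.random.randn(position_size, dim) * 0.02
inputs_var = word_embeds + trainable_table[:length]

assert inputs_sin.shape == inputs_var.shape == (1, length, dim)
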