def default_hparams(): """Returns a dictionary of hyperparameters with default values. See 'Texar.modules.encoders.transformer_encoders.TransformerEncoder' for details """ return { 'num_blocks': 6, 'dim': 512, 'use_bert_config': False, 'embedding_dropout': 0.1, 'residual_dropout': 0.1, 'poswise_feedforward': default_transformer_poswise_net_hparams(), 'graph_multihead_attention': { 'name': 'graph_multihead_attention', 'num_units': 512, 'num_heads': 8, 'dropout_rate': 0.1, 'output_dim': 512, 'use_bias': False, }, 'initializer': None, 'name': 'cross_graph_transformer_fixed_length_decoder', 'embedding_tie': True, 'output_layer_bias': False, 'max_decoding_length': int(1e10), }
def default_hparams(): """Returns a dictionary of hyperparameters with default values. See 'Texar.modules.decoders.transformer_decoders.TransformerDecoder' for details """ return { "num_blocks": 6, "dim": 512, "embedding_tie": True, "output_layer_bias": False, "max_decoding_length": int(1e10), "embedding_dropout": 0.1, "residual_dropout": 0.1, "poswise_feedforward": default_transformer_poswise_net_hparams(), 'graph_multihead_attention': { 'name': 'graph_multihead_attention', 'num_units': 512, 'num_heads': 8, 'dropout_rate': 0.1, 'output_dim': 512, 'use_bias': False, }, "initializer": None, "name": "cross_graph_transformer_sequential_decoder", }
def default_hparams(): """Returns a dictionary of hyperparameters with default values. .. code-block:: python { # Same as in TransformerEncoder "num_blocks": 6, "dim": 512, "position_embedder_hparams": None, "embedding_dropout": 0.1, "residual_dropout": 0.1, "poswise_feedforward": default_transformer_poswise_net_hparams, "multihead_attention": { "num_units": 512, "num_heads": 8, }, "initializer": None, # Additional for TransformerDecoder "embedding_tie": True, "output_layer_bias": False, "max_decoding_length": 1e10, "name": "transformer_decoder" } Here: "num_blocks" : int Number of stacked blocks. "dim" : int Hidden dimension of the encoder. "position_embedder_hparams" : dict, optional Hyperparameters of a :class:`~texar.modules.SinusoidsPositionEmbedder` as position embedder. If `None`, the :meth:`~texar.modules.SinusoidsPositionEmbedder.default_hparams` is used. "embedding_dropout": float Dropout rate of the input word and position embeddings. "residual_dropout" : float Dropout rate of the residual connections. "poswise_feedforward" : dict, Hyperparameters for a feed-forward network used in residual connections. Make sure the dimension of the output tensor is equal to `dim`. See :func:`~texar.modules.default_transformer_poswise_net_hparams` for details. "multihead_attention": dict, Hyperparameters for the multihead attention strategy. Make sure the `output_dim` in this module is equal to `dim`. See :func: `~texar.modules.encoder.MultiheadAttentionEncoder. default_harams` for details. ` "initializer" : dict, optional Hyperparameters of the default initializer that initializes variables created in this module. See :func:`~texar.core.get_initializer` for details. "embedding_tie" : bool Whether to use the word embedding matrix as the output layer that computes logits. If `False`, an additional dense layer is created. "output_layer_bias" : bool Whether to use bias to the output layer. "max_decoding_length" : int The maximum allowed number of decoding steps. Set to a very large number of avoid the length constraint. Ignored if provided in :meth:`_build` or "train_greedy" decoding is used. Length penalty coefficient. Refer to https://arxiv.org/abs/1609.08144 for more details. "name" : str Name of the module. """ return { "num_blocks": 6, "initializer": None, "position_embedder_hparams": None, "embedding_tie": True, "output_layer_bias": False, "max_decoding_length": 1e10, "embedding_dropout": 0.1, "residual_dropout": 0.1, "poswise_feedforward": default_transformer_poswise_net_hparams(), 'multihead_attention': { 'num_units': 512, 'dropout_rate': 0.1, 'output_dim': 512, 'num_heads': 8, }, "dim": 512, "name": "transformer_decoder", }
def default_hparams(): """Returns a dictionary of hyperparameters with default values. .. code-block:: python { # Same as in TransformerEncoder "scale_embeds": True, "num_blocks": 6, "dim": 512, 'position_embedder_type': 'sinusoids', 'position_size': None, "position_embedder_hparams": None, "embedding_dropout": 0.1, "residual_dropout": 0.1, "poswise_feedforward": default_transformer_poswise_net_hparams, "multihead_attention": { 'name': 'multihead_attention', 'num_units': 512, 'output_dim': 512, 'num_heads': 8, 'dropout_rate': 0.1, 'output_dim': 512, 'use_bias': False, }, "initializer": None, "name": "transformer_decoder" # Additional for TransformerDecoder "embedding_tie": True, "output_layer_bias": False, "max_decoding_length": int(1e10), } Here: "scale_embeds": bool Scale the word embedding with the square root of its dimension. True by default. This should be False when loading the pretrained GPT-2 Model. "num_blocks" : int Number of stacked blocks. "dim" : int Hidden dimension of the encoder. "position_embedder_type": Choose from "sinusoids" or "variables". "sinusoids": create the position embedding as sinusoids, which is fixed. "variables": create the position embedding as trainable variables. "position_size": int The size of position embeddings. Only be used when "position_embedder_type"is "variables". "position_embedder_hparams" : dict, optional Hyperparameters of a :class:`~texar.modules.SinusoidsPositionEmbedder` as position embedder. If `None`, the :meth:`~texar.modules.SinusoidsPositionEmbedder.default_hparams` is used. "embedding_dropout": float Dropout rate of the input word and position embeddings. "residual_dropout" : float Dropout rate of the residual connections. "poswise_feedforward" : dict Hyperparameters for a feed-forward network used in residual connections. Make sure the dimension of the output tensor is equal to `dim`. See :func:`~texar.modules.default_transformer_poswise_net_hparams` for details. "multihead_attention" : dict Hyperparameters for the multihead attention strategy. Make sure the `output_dim` in this module is equal to `dim`. See :func:`~texar.modules.MultiheadAttentionEncoder.default_hparams` for details. "initializer" : dict, optional Hyperparameters of the default initializer that initializes variables created in this module. See :func:`~texar.core.get_initializer` for details. "embedding_tie" : bool Whether to use the word embedding matrix as the output layer that computes logits. If `False`, a new dense layer is created. "output_layer_bias" : bool Whether to use bias to the output layer. "max_decoding_length" : int The maximum allowed number of decoding steps. Set to a very large number of avoid the length constraint. Ignored if provided in :meth:`_build` or "train_greedy" decoding is used. Length penalty coefficient. Refer to https://arxiv.org/abs/1609.08144 for more details. "name" : str Name of the module. """ return { "scale_embeds": True, "num_blocks": 6, "dim": 512, 'position_embedder_type': 'sinusoids', 'position_size': None, "position_embedder_hparams": None, "embedding_tie": True, "output_layer_bias": False, "max_decoding_length": int(1e10), "embedding_dropout": 0.1, "residual_dropout": 0.1, "poswise_feedforward": default_transformer_poswise_net_hparams(), 'multihead_attention': { 'name': 'multihead_attention', 'num_units': 512, 'num_heads': 8, 'dropout_rate': 0.1, 'output_dim': 512, 'use_bias': False, 'num_units': 512, }, "initializer": None, "name": "transformer_decoder", }