def add_args(parser): """Add model-specific arguments to the parser.""" # Arguments related to dropout parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for' ' attention weights') parser.add_argument('--act-dropout', type=float, metavar='D', help='dropout probability after' ' activation in FFN') # Arguments related to hidden states and self-attention parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', help='encoder embedding dimension for FFN') parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers') parser.add_argument('--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads') parser.add_argument('--bias-kv', action='store_true', help='if set, adding a learnable bias kv') parser.add_argument('--zero-attn', action='store_true', help='if set, pads attn with zero') # Arguments related to input and output embeddings parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension') parser.add_argument('--share-encoder-input-output-embed', action='store_true', help='share encoder input' ' and output embeddings') parser.add_argument('--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder') parser.add_argument('--no-token-positional-embeddings', action='store_true', help='if set, disables positional embeddings' ' (outside self attention)') parser.add_argument('--num-segment', type=int, metavar='N', help='num segment in the input') # Arguments related to sentence level prediction parser.add_argument('--sentence-class-num', type=int, metavar='N', help='number of classes for sentence task') parser.add_argument('--sent-loss', action='store_true', help='if set,' ' calculate sentence level predictions') # Arguments related to parameter initialization parser.add_argument('--apply-bert-init', action='store_true', help='use custom param initialization for BERT') # misc params parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use') parser.add_argument('--pooler-activation-fn', choices=utils.get_available_activation_fns(), help='Which activation function to use for pooler layer.') parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block')
def add_args(parser):
    # Inherit the base model's arguments, then add pooler-specific options.
    super(BARTModel, BARTModel).add_args(parser)
    parser.add_argument('--pooler-dropout', type=float, metavar='D',
                        help='dropout probability in the masked_lm pooler layers')
    parser.add_argument('--pooler-activation-fn',
                        choices=utils.get_available_activation_fns(),
                        help='activation function to use for pooler layer')
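# Hedged sketch (not fairseq code): the pattern above, isolated. A subclass's
# add_args() first delegates to the parent via super(Sub, Sub).add_args(parser)
# -- the explicit two-argument form is needed because add_args is a staticmethod
# called without an instance, so zero-argument super() has nothing to bind to --
# and then registers its own flags. The class names below are hypothetical.
import argparse


class BaseModel:
    @staticmethod
    def add_args(parser):
        parser.add_argument('--dropout', type=float, default=0.1,
                            help='dropout probability')


class PooledModel(BaseModel):
    @staticmethod
    def add_args(parser):
        # inherit the parent's arguments, then add pooler-specific ones
        super(PooledModel, PooledModel).add_args(parser)
        parser.add_argument('--pooler-dropout', type=float, default=0.0,
                            help='dropout probability in the pooler layers')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    PooledModel.add_args(parser)
    args = parser.parse_args(['--dropout', '0.2', '--pooler-dropout', '0.1'])
    print(args.dropout, args.pooler_dropout)  # -> 0.2 0.1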
def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use') parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument( '--activation-dropout', '--relu-dropout', type=float, metavar='D', help='dropout probability after activation in FFN.') parser.add_argument('--encoder-embed-path', type=str, metavar='STR', help='path to pre-trained encoder embedding') parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension') parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', help='encoder embedding dimension for FFN') parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers') parser.add_argument('--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads') parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block') parser.add_argument( '--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder') parser.add_argument('--decoder-embed-path', type=str, metavar='STR', help='path to pre-trained decoder embedding') parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN') parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers') parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads') parser.add_argument( '--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder') parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block') parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings') parser.add_argument('--share-all-embeddings', action='store_true', help='share encoder, decoder and output embeddings' ' (requires shared dictionary and embed dim)') parser.add_argument( '--no-token-positional-embeddings', default=False, action='store_true', help= 'if set, disables positional embeddings (outside self attention)') parser.add_argument( '--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. 
' 'Must be used with adaptive_loss criterion'), parser.add_argument( '--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections') # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019) parser.add_argument('--no-cross-attention', default=False, action='store_true', help='do not perform cross-attention') parser.add_argument('--cross-self-attention', default=False, action='store_true', help='perform cross+self-attention') parser.add_argument( '--layer-wise-attention', default=False, action='store_true', help= 'perform layer-wise attention (cross-attention or cross+self-attention)' ) # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for encoder') parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for decoder') parser.add_argument( '--encoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list' ) parser.add_argument( '--decoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list' ) parser.add_argument('--layernorm-embedding', action='store_true', help='add layernorm to embedding') parser.add_argument('--no-scale-embedding', action='store_true', help='if True, dont scale embeddings')
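# Hedged sketch (a hypothetical helper, not fairseq's implementation): one
# plausible way to interpret the --encoder-layers-to-keep /
# --decoder-layers-to-keep flags above, which take a comma-separated list of
# layer indices to *keep* when pruning.
def parse_layers_to_keep(spec, num_layers):
    """Return the sorted list of layer indices to keep.

    spec -- e.g. "0,2,4" (None means keep every layer)
    num_layers -- total number of layers in the stack
    """
    if spec is None:
        return list(range(num_layers))
    keep = sorted({int(i) for i in spec.split(',') if i.strip()})
    if any(i < 0 or i >= num_layers for i in keep):
        raise ValueError('layer index out of range in: %s' % spec)
    return keep


# e.g. parse_layers_to_keep("0,2,4", 6) -> [0, 2, 4]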
def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument('--encoder-layers', type=int, metavar='L', help='num encoder layers') parser.add_argument('--encoder-embed-dim', type=int, metavar='H', help='encoder embedding dimension') parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='F', help='encoder embedding dimension for FFN') parser.add_argument('--encoder-attention-heads', type=int, metavar='A', help='num encoder attention heads') parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use') parser.add_argument('--pooler-activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use for pooler layer') parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block') parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument('--activation-dropout', type=float, metavar='D', help='dropout probability after activation in FFN') parser.add_argument( '--pooler-dropout', type=float, metavar='D', help='dropout probability in the masked_lm pooler layers') parser.add_argument('--max-positions', type=int, help='number of positional embeddings to learn') parser.add_argument( '--load-checkpoint-heads', action='store_true', help='(re-)register and load heads when loading checkpoints') # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for encoder') parser.add_argument( '--encoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list' )
def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use') parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D', help='dropout probability after activation in FFN.') parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-output-dim', type=int, metavar='N', help='decoder output dimension') parser.add_argument('--decoder-input-dim', type=int, metavar='N', help='decoder input dimension') parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN') parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers') parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads') parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block') parser.add_argument('--no-decoder-final-norm', action='store_true', help='don\'t add an extra layernorm after the last decoder block') parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. ' 'Must be used with adaptive_loss criterion') parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections') parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N', help='adaptive input factor') parser.add_argument('--no-token-positional-embeddings', action='store_true', help='if set, disables positional embeddings (outside self attention)') parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings') parser.add_argument('--character-embeddings', action='store_true', help='if set, uses character embedding convolutions to produce token embeddings') parser.add_argument('--character-filters', type=str, metavar='LIST', default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]', help='size of character embeddings') parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N', help='size of character embeddings') parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N', help='number of highway layers for character token embeddder') parser.add_argument('--adaptive-input', action='store_true', help='if set, uses adaptive input') parser.add_argument('--adaptive-input-factor', type=float, metavar='N', help='adaptive input factor') parser.add_argument('--adaptive-input-cutoff', metavar='EXPR', help='comma separated list of adaptive input cutoff points.') parser.add_argument('--tie-adaptive-weights', action='store_true', help='if set, ties the weights of adaptive softmax and adaptive input') parser.add_argument('--tie-adaptive-proj', action='store_true', help='if set, ties the projection weights of adaptive softmax and adaptive input') parser.add_argument('--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder') # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) 
parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0, help='LayerDrop probability for decoder') parser.add_argument('--decoder-layers-to-keep', default=None, help='which layers to *keep* when pruning as a comma-separated list') parser.add_argument('--layernorm-embedding', action='store_true', help='add layernorm to embedding') parser.add_argument('--no-scale-embedding', action='store_true', help='if True, dont scale embeddings')
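# Hedged usage sketch (assumptions, not the library's actual wiring): each
# add_args() above only *registers* flags on an argparse parser; most flags
# deliberately omit default= so that unspecified options come back as None
# and can be filled in later, e.g. by a per-architecture defaults function.
# The tiny add_args() and apply_defaults() below are stand-ins for illustration.
import argparse


def add_args(parser):
    """Tiny stand-in for one of the add_args() methods above."""
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                        help='decoder embedding dimension')
    parser.add_argument('--decoder-layers', type=int, metavar='N',
                        help='num decoder layers')
    parser.add_argument('--no-scale-embedding', action='store_true',
                        help='if True, don\'t scale embeddings')


def apply_defaults(args):
    """Fill in anything the command line left unset (hypothetical defaults)."""
    if args.decoder_embed_dim is None:
        args.decoder_embed_dim = 512
    if args.decoder_layers is None:
        args.decoder_layers = 6
    return args


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    add_args(parser)
    args = apply_defaults(parser.parse_args(['--decoder-layers', '12']))
    print(args.decoder_embed_dim, args.decoder_layers, args.no_scale_embedding)
    # -> 512 12 False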