Example #1
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # Arguments related to dropout
        parser.add_argument('--dropout', type=float, metavar='D',
                            help='dropout probability')
        parser.add_argument('--attention-dropout', type=float,
                            metavar='D', help='dropout probability for'
                            ' attention weights')
        parser.add_argument('--act-dropout', type=float,
                            metavar='D', help='dropout probability after'
                            ' activation in FFN')

        # Arguments related to hidden states and self-attention
        parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                            help='encoder embedding dimension for FFN')
        parser.add_argument('--encoder-layers', type=int, metavar='N',
                            help='num encoder layers')
        parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                            help='num encoder attention heads')
        parser.add_argument('--bias-kv', action='store_true',
                            help='if set, add a learnable bias to the key/value')
        parser.add_argument('--zero-attn', action='store_true',
                            help='if set, pad attention with zeros')

        # Arguments related to input and output embeddings
        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                            help='encoder embedding dimension')
        parser.add_argument('--share-encoder-input-output-embed',
                            action='store_true', help='share encoder input'
                            ' and output embeddings')
        parser.add_argument('--encoder-learned-pos', action='store_true',
                            help='use learned positional embeddings in the encoder')
        parser.add_argument('--no-token-positional-embeddings',
                            action='store_true',
                            help='if set, disables positional embeddings'
                            ' (outside self attention)')
        parser.add_argument('--num-segment', type=int, metavar='N',
                            help='number of segments in the input')

        # Arguments related to sentence level prediction
        parser.add_argument('--sentence-class-num', type=int, metavar='N',
                            help='number of classes for sentence task')
        parser.add_argument('--sent-loss', action='store_true', help='if set,'
                            ' calculate sentence level predictions')

        # Arguments related to parameter initialization
        parser.add_argument('--apply-bert-init', action='store_true',
                            help='use custom param initialization for BERT')

        # misc params
        parser.add_argument('--activation-fn',
                            choices=utils.get_available_activation_fns(),
                            help='activation function to use')
        parser.add_argument('--pooler-activation-fn',
                            choices=utils.get_available_activation_fns(),
                            help='activation function to use for pooler layer')
        parser.add_argument('--encoder-normalize-before', action='store_true',
                            help='apply layernorm before each encoder block')
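
In fairseq, a method like the one above is normally declared as a @staticmethod on the model class and receives an argparse parser from the framework's command-line machinery. A minimal sketch of the call pattern, using a hypothetical MyMaskedLMModel class and a hand-built parser (neither is from the source):

import argparse

class MyMaskedLMModel:
    # Hypothetical stand-in for the model class that owns the add_args above.
    @staticmethod
    def add_args(parser):
        parser.add_argument('--dropout', type=float, metavar='D',
                            help='dropout probability')
        parser.add_argument('--encoder-layers', type=int, metavar='N',
                            help='num encoder layers')
        # ... remaining arguments as in the example above ...

parser = argparse.ArgumentParser()
MyMaskedLMModel.add_args(parser)
args = parser.parse_args(['--dropout', '0.1', '--encoder-layers', '6'])
print(args.dropout, args.encoder_layers)  # 0.1 6
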
Example #2
 def add_args(parser):
     super(BARTModel, BARTModel).add_args(parser)
     parser.add_argument(
         '--pooler-dropout',
         type=float,
         metavar='D',
         help='dropout probability in the masked_lm pooler layers')
     parser.add_argument('--pooler-activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use for pooler layer')
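
Example #2 relies on the parent model already having registered the shared arguments: super(BARTModel, BARTModel).add_args(parser) is the two-argument form of super() that works for a @staticmethod, after which the subclass adds only its own options. A hedged sketch of that extension pattern with hypothetical class names:

import argparse

class BaseModel:
    @staticmethod
    def add_args(parser):
        parser.add_argument('--dropout', type=float, default=0.1)

class FineTunedModel(BaseModel):
    @staticmethod
    def add_args(parser):
        # Register the parent's arguments first, then the task-specific ones.
        super(FineTunedModel, FineTunedModel).add_args(parser)
        parser.add_argument('--pooler-dropout', type=float, default=0.0)

parser = argparse.ArgumentParser()
FineTunedModel.add_args(parser)
print(parser.parse_args([]))  # Namespace(dropout=0.1, pooler_dropout=0.0)
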
Example #3
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     # fmt: off
     parser.add_argument('--activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use')
     parser.add_argument('--dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument(
         '--activation-dropout',
         '--relu-dropout',
         type=float,
         metavar='D',
         help='dropout probability after activation in FFN.')
     parser.add_argument('--encoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained encoder embedding')
     parser.add_argument('--encoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='encoder embedding dimension')
     parser.add_argument('--encoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='encoder embedding dimension for FFN')
     parser.add_argument('--encoder-layers',
                         type=int,
                         metavar='N',
                         help='num encoder layers')
     parser.add_argument('--encoder-attention-heads',
                         type=int,
                         metavar='N',
                         help='num encoder attention heads')
     parser.add_argument('--encoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each encoder block')
     parser.add_argument(
         '--encoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the encoder')
     parser.add_argument('--decoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained decoder embedding')
     parser.add_argument('--decoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument('--decoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension for FFN')
     parser.add_argument('--decoder-layers',
                         type=int,
                         metavar='N',
                         help='num decoder layers')
     parser.add_argument('--decoder-attention-heads',
                         type=int,
                         metavar='N',
                         help='num decoder attention heads')
     parser.add_argument(
         '--decoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the decoder')
     parser.add_argument('--decoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument('--share-decoder-input-output-embed',
                         action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument('--share-all-embeddings',
                         action='store_true',
                         help='share encoder, decoder and output embeddings'
                         ' (requires shared dictionary and embed dim)')
     parser.add_argument(
         '--no-token-positional-embeddings',
         default=False,
         action='store_true',
         help='if set, disables positional embeddings (outside self attention)')
     parser.add_argument(
         '--adaptive-softmax-cutoff',
         metavar='EXPR',
         help='comma separated list of adaptive softmax cutoff points. '
         'Must be used with adaptive_loss criterion')
     parser.add_argument(
         '--adaptive-softmax-dropout',
         type=float,
         metavar='D',
         help='sets adaptive softmax dropout for the tail projections')
     # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
     parser.add_argument('--no-cross-attention',
                         default=False,
                         action='store_true',
                         help='do not perform cross-attention')
     parser.add_argument('--cross-self-attention',
                         default=False,
                         action='store_true',
                         help='perform cross+self-attention')
     parser.add_argument(
         '--layer-wise-attention',
         default=False,
         action='store_true',
         help='perform layer-wise attention (cross-attention or cross+self-attention)')
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument('--encoder-layerdrop',
                         type=float,
                         metavar='D',
                         default=0,
                         help='LayerDrop probability for encoder')
     parser.add_argument('--decoder-layerdrop',
                         type=float,
                         metavar='D',
                         default=0,
                         help='LayerDrop probability for decoder')
     parser.add_argument(
         '--encoder-layers-to-keep',
         default=None,
         help='which layers to *keep* when pruning as a comma-separated list'
     )
     parser.add_argument(
         '--decoder-layers-to-keep',
         default=None,
         help='which layers to *keep* when pruning as a comma-separated list'
     )
     parser.add_argument('--layernorm-embedding',
                         action='store_true',
                         help='add layernorm to embedding')
     parser.add_argument('--no-scale-embedding',
                         action='store_true',
                         help='if True, don\'t scale embeddings')
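
Several options in Example #3 take comma-separated strings rather than typed values (--encoder-layers-to-keep, --decoder-layers-to-keep, --adaptive-softmax-cutoff); argparse stores them as plain strings and the model-building code splits them later. A small, hypothetical helper illustrating the idea (parse_layers_to_keep is not a name from the source):

def parse_layers_to_keep(value):
    """Turn a comma-separated string such as '0,2,4' into a list of layer indices."""
    if value is None:
        return None
    return [int(idx) for idx in value.split(',')]

print(parse_layers_to_keep('0,2,4'))  # [0, 2, 4]
print(parse_layers_to_keep(None))     # None
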
Example #4
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument('--encoder-layers',
                         type=int,
                         metavar='L',
                         help='num encoder layers')
     parser.add_argument('--encoder-embed-dim',
                         type=int,
                         metavar='H',
                         help='encoder embedding dimension')
     parser.add_argument('--encoder-ffn-embed-dim',
                         type=int,
                         metavar='F',
                         help='encoder embedding dimension for FFN')
     parser.add_argument('--encoder-attention-heads',
                         type=int,
                         metavar='A',
                         help='num encoder attention heads')
     parser.add_argument('--activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use')
     parser.add_argument('--pooler-activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use for pooler layer')
     parser.add_argument('--encoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each encoder block')
     parser.add_argument('--dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument('--activation-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability after activation in FFN')
     parser.add_argument(
         '--pooler-dropout',
         type=float,
         metavar='D',
         help='dropout probability in the masked_lm pooler layers')
     parser.add_argument('--max-positions',
                         type=int,
                         help='number of positional embeddings to learn')
     parser.add_argument(
         '--load-checkpoint-heads',
         action='store_true',
         help='(re-)register and load heads when loading checkpoints')
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument('--encoder-layerdrop',
                         type=float,
                         metavar='D',
                         default=0,
                         help='LayerDrop probability for encoder')
     parser.add_argument(
         '--encoder-layers-to-keep',
         default=None,
         help='which layers to *keep* when pruning as a comma-separated list'
     )
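
Most arguments in Example #4 are declared without defaults, so they come back as None unless the user passes them; in fairseq the missing values are then typically filled in by a named architecture function before the model is built. The sketch below only illustrates that general idea with made-up default values; it is not the library's actual default-filling code:

import argparse

def base_architecture(args):
    # Illustrative only: fill in options the user did not set on the command line.
    if getattr(args, 'encoder_layers', None) is None:
        args.encoder_layers = 12
    if getattr(args, 'encoder_embed_dim', None) is None:
        args.encoder_embed_dim = 768
    if getattr(args, 'encoder_attention_heads', None) is None:
        args.encoder_attention_heads = 12

parser = argparse.ArgumentParser()
parser.add_argument('--encoder-layers', type=int, metavar='L')
parser.add_argument('--encoder-embed-dim', type=int, metavar='H')
parser.add_argument('--encoder-attention-heads', type=int, metavar='A')
args = parser.parse_args(['--encoder-layers', '24'])
base_architecture(args)
print(args.encoder_layers, args.encoder_embed_dim)  # 24 768
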
Example #5
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     # fmt: off
     parser.add_argument('--activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use')
     parser.add_argument('--dropout', type=float, metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout', type=float, metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                         help='dropout probability after activation in FFN.')
     parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                         help='decoder output dimension')
     parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                         help='decoder input dimension')
     parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                         help='decoder embedding dimension for FFN')
     parser.add_argument('--decoder-layers', type=int, metavar='N',
                         help='num decoder layers')
     parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                         help='num decoder attention heads')
     parser.add_argument('--decoder-normalize-before', action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument('--no-decoder-final-norm', action='store_true',
                         help='don\'t add an extra layernorm after the last decoder block')
     parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                         help='comma separated list of adaptive softmax cutoff points. '
                              'Must be used with adaptive_loss criterion')
     parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                         help='sets adaptive softmax dropout for the tail projections')
     parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                         help='adaptive softmax factor')
     parser.add_argument('--no-token-positional-embeddings', action='store_true',
                         help='if set, disables positional embeddings (outside self attention)')
     parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument('--character-embeddings', action='store_true',
                         help='if set, uses character embedding convolutions to produce token embeddings')
     parser.add_argument('--character-filters', type=str, metavar='LIST',
                         default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                         help='character embedding convolution filters, as a list of (width, num_filters) tuples')
     parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
                         help='size of character embeddings')
     parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
                         help='number of highway layers for character token embedder')
     parser.add_argument('--adaptive-input', action='store_true',
                         help='if set, uses adaptive input')
     parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                         help='adaptive input factor')
     parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
                         help='comma separated list of adaptive input cutoff points.')
     parser.add_argument('--tie-adaptive-weights', action='store_true',
                         help='if set, ties the weights of adaptive softmax and adaptive input')
     parser.add_argument('--tie-adaptive-proj', action='store_true',
                         help='if set, ties the projection weights of adaptive softmax and adaptive input')
     parser.add_argument('--decoder-learned-pos', action='store_true',
                         help='use learned positional embeddings in the decoder')
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                         help='LayerDrop probability for decoder')
     parser.add_argument('--decoder-layers-to-keep', default=None,
                         help='which layers to *keep* when pruning as a comma-separated list')
     parser.add_argument('--layernorm-embedding', action='store_true',
                         help='add layernorm to embedding')
     parser.add_argument('--no-scale-embedding', action='store_true',
                         help='if True, don\'t scale embeddings')
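
The EXPR- and LIST-valued options in Example #5 (--adaptive-softmax-cutoff, --adaptive-input-cutoff, --character-filters) likewise stay as strings until the model is constructed: the cutoffs are comma-separated integers and the character filter spec is a Python literal. A hedged sketch of how such strings could be interpreted (the helper names are illustrative, not the library's API):

import ast

def eval_int_list(expr):
    """Parse a cutoff string such as '20000,60000' into [20000, 60000]."""
    if expr is None:
        return None
    return [int(x) for x in expr.split(',')]

def eval_filter_spec(expr):
    """Parse a filter spec such as '[(1, 64), (2, 128)]' into a list of tuples."""
    return ast.literal_eval(expr)

print(eval_int_list('20000,60000'))             # [20000, 60000]
print(eval_filter_spec('[(1, 64), (2, 128)]'))  # [(1, 64), (2, 128)]
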