def add_args(parser):
    """Add model-specific arguments to the parser."""
    parser.add_argument(
        "--dropout", type=float, metavar="D", help="dropout probability"
    )
    parser.add_argument(
        "--attention-dropout", type=float, metavar="D",
        help="dropout probability for attention weights",
    )
    parser.add_argument(
        "--relu-dropout", type=float, metavar="D",
        help="dropout probability after ReLU in FFN",
    )
    parser.add_argument(
        "--encoder-pretrained-embed", type=str, metavar="STR",
        help="path to pre-trained encoder embedding",
    )
    parser.add_argument(
        "--encoder-embed-dim", type=int, metavar="N",
        help="encoder embedding dimension",
    )
    parser.add_argument(
        "--encoder-ffn-embed-dim", type=int, metavar="N",
        help="encoder embedding dimension for FFN",
    )
    parser.add_argument(
        "--encoder-freeze-embed", default=False, action="store_true",
        help=("whether to freeze the encoder embedding or allow it to be "
              "updated during training"),
    )
    parser.add_argument(
        "--encoder-layers", type=int, metavar="N", help="num encoder layers"
    )
    parser.add_argument(
        "--encoder-attention-heads", type=int, metavar="N",
        help="num encoder attention heads",
    )
    parser.add_argument(
        "--encoder-normalize-before", default=False, action="store_true",
        help="apply layernorm before each encoder block",
    )
    parser.add_argument(
        "--encoder-learned-pos", default=False, action="store_true",
        help="use learned positional embeddings in the encoder",
    )
    parser.add_argument(
        "--decoder-pretrained-embed", type=str, metavar="STR",
        help="path to pre-trained decoder embedding",
    )
    parser.add_argument(
        "--decoder-embed-dim", type=int, metavar="N",
        help="decoder embedding dimension",
    )
    parser.add_argument(
        "--decoder-freeze-embed", default=False, action="store_true",
        help=("whether to freeze the decoder embedding or allow it to be "
              "updated during training"),
    )
    parser.add_argument(
        "--decoder-layers", type=int, metavar="N", help="num decoder layers"
    )
    parser.add_argument(
        "--decoder-attention-heads", type=int, metavar="N",
        help="num decoder attention heads",
    )
    parser.add_argument(
        "--decoder-reduced-attention-dim", type=int, default=None, metavar="N",
        help=("if specified, computes attention with this dimensionality "
              "(instead of using encoder output dims)"),
    )
    parser.add_argument(
        "--decoder-lstm-units", type=int, metavar="N",
        help="num LSTM units for each decoder layer",
    )
    parser.add_argument(
        "--decoder-out-embed-dim", default=None, type=int, metavar="N",
        help="decoder output embedding dimension",
    )

    # Args for vocab reduction
    vocab_reduction.add_args(parser)

def add_args(parser):
    """Add model-specific arguments to the parser."""
    parser.add_argument(
        "--dropout", type=float, metavar="D", help="dropout probability"
    )
    parser.add_argument(
        "--attention-dropout", type=float, metavar="D",
        help="dropout probability for attention weights",
    )
    parser.add_argument(
        "--relu-dropout", type=float, metavar="D",
        help="dropout probability after ReLU in FFN",
    )
    parser.add_argument(
        "--encoder-pretrained-embed", type=str, metavar="STR",
        help="path to pre-trained encoder embedding",
    )
    parser.add_argument(
        "--encoder-embed-dim", type=int, metavar="N",
        help="encoder embedding dimension",
    )
    parser.add_argument(
        "--encoder-ffn-embed-dim", type=int, metavar="N",
        help="encoder embedding dimension for FFN",
    )
    parser.add_argument(
        "--encoder-freeze-embed", default=False, action="store_true",
        help=("whether to freeze the encoder embedding or allow it to be "
              "updated during training"),
    )
    parser.add_argument(
        "--encoder-layers", type=int, metavar="N", help="num encoder layers"
    )
    parser.add_argument(
        "--encoder-attention-heads", type=int, metavar="N",
        help="num encoder attention heads",
    )
    parser.add_argument(
        "--encoder-normalize-before", default=False, action="store_true",
        help="apply layernorm before each encoder block",
    )
    parser.add_argument(
        "--encoder-learned-pos", default=False, action="store_true",
        help="use learned positional embeddings in the encoder",
    )
    parser.add_argument(
        "--decoder-pretrained-embed", type=str, metavar="STR",
        help="path to pre-trained decoder embedding",
    )
    parser.add_argument(
        "--decoder-embed-dim", type=int, metavar="N",
        help="decoder embedding dimension",
    )
    parser.add_argument(
        "--decoder-ffn-embed-dim", type=int, metavar="N",
        help="decoder embedding dimension for FFN",
    )
    parser.add_argument(
        "--decoder-freeze-embed", default=False, action="store_true",
        help=("whether to freeze the decoder embedding or allow it to be "
              "updated during training"),
    )
    parser.add_argument(
        "--decoder-layers", type=int, metavar="N", help="num decoder layers"
    )
    parser.add_argument(
        "--decoder-attention-heads", type=int, metavar="N",
        help="num decoder attention heads",
    )
    parser.add_argument(
        "--decoder-learned-pos", default=False, action="store_true",
        help="use learned positional embeddings in the decoder",
    )
    parser.add_argument(
        "--decoder-normalize-before", default=False, action="store_true",
        help="apply layernorm before each decoder block",
    )
    parser.add_argument(
        "--share-decoder-input-output-embed", default=False,
        action="store_true",
        help="share decoder input and output embeddings",
    )
    parser.add_argument(
        "--share-all-embeddings", default=False, action="store_true",
        help=("share encoder, decoder and output embeddings "
              "(requires shared dictionary and embed dim)"),
    )
    parser.add_argument(
        "--adaptive-softmax-cutoff", default=None, metavar="EXPR",
        help=("comma separated list of adaptive softmax cutoff points. "
              "Must be used with adaptive_loss criterion"),
    )

    # Args for vocab reduction
    vocab_reduction.add_args(parser)

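# A minimal usage sketch (not from the original source): each add_args in
# this file hangs its flags off a standard argparse parser. The helper below
# is hypothetical; the flag values are illustrative rather than project
# defaults, and calling it assumes the module's own imports (e.g.
# vocab_reduction) resolve at call time.
import argparse


def _demo_parse_transformer_args():
    parser = argparse.ArgumentParser()
    add_args(parser)  # the variant defined directly above
    args = parser.parse_args(
        ["--encoder-embed-dim", "512", "--share-all-embeddings"]
    )
    assert args.encoder_embed_dim == 512         # dashes map to underscores
    assert args.share_all_embeddings is True     # store_true flag was set
    assert args.adaptive_softmax_cutoff is None  # default=None carried through
    return args
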
def add_args(parser):
    parser.add_argument(
        "--dropout", default=0.1, type=float, metavar="D",
        help="dropout probability",
    )
    parser.add_argument(
        "--encoder-embed-dim", default=0, type=int, metavar="N",
        help="encoder embedding dimension",
    )
    parser.add_argument(
        "--encoder-pretrained-embed", default=None, metavar="FILE",
        help="path to pre-trained encoder embedding",
    )
    parser.add_argument(
        "--encoder-freeze-embed", default=False, action="store_true",
        help=("whether to freeze the encoder embedding or allow it to be "
              "updated during training"),
    )
    parser.add_argument(
        "--encoder-hidden-dim", type=int, metavar="N",
        help="encoder cell num units",
    )
    parser.add_argument(
        "--encoder-layers", type=int, metavar="N",
        help="number of encoder layers",
    )
    parser.add_argument(
        "--encoder-bidirectional", action="store_true",
        help="whether the first layer is bidirectional or not",
    )
    parser.add_argument(
        "--averaging-encoder", default=False, action="store_true",
        help=("whether to use mean encoder hidden states as decoder initial "
              "states or not"),
    )
    parser.add_argument(
        "--decoder-embed-dim", default=0, type=int, metavar="N",
        help="decoder embedding dimension",
    )
    parser.add_argument(
        "--decoder-pretrained-embed", default=None, metavar="FILE",
        help="path to pre-trained decoder embedding",
    )
    parser.add_argument(
        "--decoder-freeze-embed", default=False, action="store_true",
        help=("whether to freeze the decoder embedding or allow it to be "
              "updated during training"),
    )
    parser.add_argument(
        "--decoder-hidden-dim", type=int, metavar="N",
        help="decoder cell num units",
    )
    parser.add_argument(
        "--decoder-layers", type=int, metavar="N",
        help="number of decoder layers",
    )
    parser.add_argument(
        "--decoder-out-embed-dim", type=int, metavar="N",
        help="decoder output embedding dimension",
    )
    parser.add_argument(
        "--decoder-out-pretrained-embed", default=None, metavar="FILE",
        help="path to pre-trained decoder output embedding",
    )
    parser.add_argument(
        "--decoder-tie-embeddings", default=False, action="store_true",
        help=("tie the decoder word embeddings with the output projection "
              "weights (requires that the embedding dims be of the same "
              "size)"),
    )
    parser.add_argument(
        "--attention-type", type=str, metavar="EXPR",
        help="decoder attention, defaults to dot",
    )
    parser.add_argument(
        "--residual-level", default=None, type=int,
        help=("First layer at which to apply a residual connection. The "
              "value should be greater than 0 and smaller than the number "
              "of layers."),
    )
    parser.add_argument(
        "--cell-type", default="lstm", type=str, metavar="EXPR",
        help="cell type, defaults to lstm, values: lstm, milstm, "
             "layer_norm_lstm",
    )

    # Granular dropout settings (if not specified these default to --dropout)
    parser.add_argument(
        "--encoder-dropout-in", type=float, metavar="D",
        help="dropout probability for encoder input embedding",
    )
    parser.add_argument(
        "--encoder-dropout-out", type=float, metavar="D",
        help="dropout probability for encoder output",
    )
    parser.add_argument(
        "--decoder-dropout-in", type=float, metavar="D",
        help="dropout probability for decoder input embedding",
    )
    parser.add_argument(
        "--decoder-dropout-out", type=float, metavar="D",
        help="dropout probability for decoder output",
    )
    parser.add_argument(
        "--sequence-lstm", action="store_true",
        help="use nn.LSTM implementation for encoder",
    )
    parser.add_argument(
        "--ngram-decoder", default=None, type=int, nargs="+",
        help=("A single integer, or a list of integers. If positive, the "
              "decoder is not recurrent but a feedforward network with "
              "target-side n-gram history as input. The decoder is still "
              "conditioned on the source side via attention. If this "
              "parameter is a list of integers, the n-th entry applies to "
              "the n-th decoder (for multilingual models and "
              "multi-decoders)"),
    )
    parser.add_argument(
        "--ngram-activation-type", default="relu", type=str, metavar="EXPR",
        help=("Activation in FF layers of the ngram decoder, defaults to "
              "relu, values: relu, tanh"),
    )
    parser.add_argument(
        "--multi-encoder", default=None, type=int,
        help=("If this is positive, train n encoder networks rather than "
              "only one. The outputs of the encoders are concatenated "
              "before passing them through to the decoder."),
    )
    parser.add_argument(
        "--multi-decoder", default=None, type=int,
        help=("If this is positive, train n decoder networks rather than "
              "only one. The predictions are combined via the method in "
              "--multi-decoder-combination-strategy."),
    )
    parser.add_argument(
        "--multi-decoder-combination-strategy", default="bottleneck",
        type=str, metavar="EXPR",
        help=("Only used if --multi-decoder is positive. Controls how the "
              "decoders are combined with each other.\n"
              "- uniform: Separate projection layers, average predictions\n"
              "- uniform-probspace: Separate projection layers, average "
              "in probability space.\n"
              "- uniform-logprobspace: Separate projection layers, average "
              "in log-probability space.\n"
              "- unprojected: Shared projection layer, unprojected "
              "decoder outputs are averaged.\n"
              "- deepfusion: cf. https://arxiv.org/pdf/1503.03535.pdf\n"
              "- coldfusion: cf. https://arxiv.org/pdf/1708.06426.pdf\n"
              "- weighted: Separate projection layers, weighted average "
              "of logits. Weights are learned from unprojected decoder "
              "outputs.\n"
              "- weighted-probspace: Like 'weighted', but average in "
              "probability space.\n"
              "- weighted-logprobspace: Like 'weighted', but average in "
              "log-probability space.\n"
              "- weighted-unprojected: Shared projection layer, weighted "
              "average of decoder outputs. Weights are learned from "
              "unprojected decoder outputs.\n"
              "- concat: Shared projection layer, decoder outputs are "
              "concatenated.\n"
              "- bottleneck: Like 'concat' but with an additional "
              "bottleneck layer to reduce the size of the output embedding "
              "matrix.\n"
              "- deep_bottleneck: Like 'bottleneck' but with an additional "
              "non-linear layer.\n"
              "- multiplicative-unprojected: Shared projection layer, "
              "element-wise product of decoder outputs after ReLU.\n"
              "- max-unprojected: Shared projection layer, element-wise "
              "max of decoder outputs.\n"),
    )
    parser.add_argument(
        "--multi-model-fixed-weights", default=None, type=float, nargs="+",
        help=("Used for weighted* combination strategies. If specified, use "
              "these fixed model weights rather than a gating network."),
    )
    parser.add_argument(
        "--multi-model-training-schedule", default="complete", type=str,
        metavar="EXPR",
        help=("Only used if --multi-decoder is positive.\n"
              "- 'complete': Jointly train entire network on all batches.\n"
              "- 'unfreeze_single': Freeze all submodels except one for "
              "each training batch.\n"
              "- 'unfreeze_single_encoder': Freeze all encoders except one "
              "for each training batch.\n"
              "- 'unfreeze_single_decoder': Freeze all decoders except one "
              "for each training batch.\n"
              "- 'unfreeze_enc_N': Freeze N-th encoder.\n"
              "- 'unfreeze_dec_N': Freeze N-th decoder.\n"
              "- 'unfreeze_encdec_N': Freeze N-th encoder and N-th "
              "decoder.\n"
              "- 'freeze_all': Freeze all submodels, only train combination "
              "strategy.\n"
              "- 'freeze_all_encoders': Freeze all encoders.\n"
              "- 'freeze_all_decoders': Freeze all decoders.\n"
              "- 'separate': Each training batch is used for only one of "
              "the following: Train the n-th submodel, or train combination "
              "strategy."),
    )
    parser.add_argument(
        "--multi-decoder-is-lm", default=None, type=int, nargs="+",
        help=("If specified, sets --attention-type=no and "
              "--encoder-hidden-dim=0 for the n-th decoder in an adaptive "
              "ensemble."),
    )

    # Args for vocab reduction
    vocab_reduction.add_args(parser)

    # Args for word dropout
    word_dropout.add_args(parser)

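# Sketch of what the *-logprobspace strategies above compute (illustrative,
# not the project's implementation): a uniform average of n decoders in
# log-probability space is log((1/n) * sum_i p_i), evaluated stably as
# logsumexp(log p_1, ..., log p_n) - log(n).
import math

import torch


def _uniform_logprobspace_combine(log_probs: torch.Tensor) -> torch.Tensor:
    """Combine per-decoder log-probs of shape (n_decoders, batch, vocab)."""
    n_decoders = log_probs.size(0)
    # logsumexp over the decoder axis, then subtract log(n) to average.
    return torch.logsumexp(log_probs, dim=0) - math.log(n_decoders)
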
def add_args(parser):
    parser.add_argument(
        "--dropout", default=0.1, type=float, metavar="D",
        help="dropout probability",
    )
    parser.add_argument(
        "--encoder-embed-dim", type=int, metavar="N",
        help="encoder embedding dimension",
    )
    parser.add_argument(
        "--encoder-freeze-embed", default=False, action="store_true",
        help=("whether to freeze the encoder embedding or allow it to be "
              "updated during training"),
    )
    parser.add_argument(
        "--encoder-hidden-dim", type=int, metavar="N",
        help="encoder cell num units",
    )
    parser.add_argument(
        "--encoder-layers", type=int, metavar="N",
        help="number of encoder layers",
    )
    parser.add_argument(
        "--encoder-bidirectional", action="store_true",
        help="whether the first layer is bidirectional or not",
    )
    parser.add_argument(
        "--averaging-encoder", default=False, action="store_true",
        help=("whether to use mean encoder hidden states as decoder initial "
              "states or not"),
    )
    parser.add_argument(
        "--decoder-embed-dim", type=int, metavar="N",
        help="decoder embedding dimension",
    )
    parser.add_argument(
        "--decoder-freeze-embed", default=False, action="store_true",
        help=("whether to freeze the decoder embedding or allow it to be "
              "updated during training"),
    )
    parser.add_argument(
        "--decoder-hidden-dim", type=int, metavar="N",
        help="decoder cell num units",
    )
    parser.add_argument(
        "--decoder-layers", type=int, metavar="N",
        help="number of decoder layers",
    )
    parser.add_argument(
        "--decoder-out-embed-dim", type=int, metavar="N",
        help="decoder output embedding dimension",
    )
    parser.add_argument(
        "--attention-type", type=str, metavar="EXPR",
        help="decoder attention, defaults to dot",
    )
    parser.add_argument(
        "--residual-level", default=None, type=int,
        help=("First layer at which to apply a residual connection. The "
              "value should be greater than 0 and smaller than the number "
              "of layers."),
    )
    parser.add_argument(
        "--cell-type", default="lstm", type=str, metavar="EXPR",
        help="cell type, defaults to lstm, values: lstm, milstm, "
             "layer_norm_lstm",
    )

    # Granular dropout settings (if not specified these default to --dropout)
    parser.add_argument(
        "--encoder-dropout-in", type=float, metavar="D",
        help="dropout probability for encoder input embedding",
    )
    parser.add_argument(
        "--encoder-dropout-out", type=float, metavar="D",
        help="dropout probability for encoder output",
    )
    parser.add_argument(
        "--decoder-dropout-in", type=float, metavar="D",
        help="dropout probability for decoder input embedding",
    )
    parser.add_argument(
        "--decoder-dropout-out", type=float, metavar="D",
        help="dropout probability for decoder output",
    )
    parser.add_argument(
        "--sequence-lstm", action="store_true",
        help="use nn.LSTM implementation for encoder",
    )
    parser.add_argument(
        "--ngram-decoder", default=None, type=int,
        help=("If this is positive, we use an n-gram based feedforward "
              "network in the decoder rather than recurrence. The decoder "
              "is still conditioned on the source side via attention."),
    )
    parser.add_argument(
        "--ngram-activation-type", default="relu", type=str, metavar="EXPR",
        help=("Activation in FF layers of the ngram decoder, defaults to "
              "relu, values: relu, tanh"),
    )

    # Args for vocab reduction
    vocab_reduction.add_args(parser)

    # Args for word dropout
    word_dropout.add_args(parser)

def add_args(parser):
    """Add model-specific arguments to the parser."""
    parser.add_argument(
        "--dropout", type=float, metavar="D", help="dropout probability"
    )
    parser.add_argument(
        "--attention-dropout", type=float, metavar="D",
        help="dropout probability for attention weights",
    )
    parser.add_argument(
        "--relu-dropout", type=float, metavar="D",
        help="dropout probability after ReLU in FFN",
    )
    parser.add_argument(
        "--encoder-embed-path", type=str, metavar="STR",
        help="path to pre-trained encoder embedding",
    )
    parser.add_argument(
        "--encoder-embed-dim", type=int, metavar="N",
        help="encoder embedding dimension",
    )
    parser.add_argument(
        "--encoder-ffn-embed-dim", type=int, metavar="N",
        help="encoder embedding dimension for FFN",
    )
    parser.add_argument(
        "--encoder-layers", type=int, metavar="N", help="num encoder layers"
    )
    parser.add_argument(
        "--encoder-attention-heads", type=int, metavar="N",
        help="num encoder attention heads",
    )
    parser.add_argument(
        "--encoder-normalize-before", action="store_true",
        help="apply layernorm before each encoder block",
    )
    parser.add_argument(
        "--encoder-learned-pos", action="store_true",
        help="use learned positional embeddings in the encoder",
    )
    parser.add_argument(
        "--decoder-embed-path", type=str, metavar="STR",
        help="path to pre-trained decoder embedding",
    )
    parser.add_argument(
        "--decoder-embed-dim", type=int, metavar="N",
        help="decoder embedding dimension",
    )
    parser.add_argument(
        "--decoder-ffn-embed-dim", type=int, metavar="N",
        help="decoder embedding dimension for FFN",
    )
    parser.add_argument(
        "--decoder-layers", type=int, metavar="N", help="num decoder layers"
    )
    parser.add_argument(
        "--decoder-attention-heads", type=int, metavar="N",
        help="num decoder attention heads",
    )
    parser.add_argument(
        "--decoder-learned-pos", action="store_true",
        help="use learned positional embeddings in the decoder",
    )
    parser.add_argument(
        "--decoder-normalize-before", action="store_true",
        help="apply layernorm before each decoder block",
    )
    parser.add_argument(
        "--share-decoder-input-output-embed", action="store_true",
        help="share decoder input and output embeddings",
    )
    parser.add_argument(
        "--share-all-embeddings", action="store_true",
        help=("share encoder, decoder and output embeddings "
              "(requires shared dictionary and embed dim)"),
    )
    parser.add_argument(
        "--adaptive-softmax-cutoff", metavar="EXPR",
        help=("comma separated list of adaptive softmax cutoff points. "
              "Must be used with adaptive_loss criterion"),
    )
    parser.add_argument(
        "--adaptive-softmax-dropout", type=float, metavar="D",
        help="sets adaptive softmax dropout for the tail projections",
    )

    # AAN only
    parser.add_argument(
        "--decoder-attn-window-size", default=0, type=int,
        help="attention window size of the decoder (default: 0 (unlimited))",
    )
    parser.add_argument(
        "--no-decoder-aan-ffn", default=False, action="store_true",
        help="no FFN in the AAN block",
    )
    parser.add_argument(
        "--no-decoder-aan-gating", default=False, action="store_true",
        help="no Gating in the AAN block",
    )
    parser.add_argument(
        "--decoder-aan-ffn-use-embed-dim", default=False, action="store_true",
        help=("use decoder_embed_dim instead of decoder_ffn_embed_dim as "
              "the hidden size of the FFN in AAN"),
    )
    parser.add_argument(
        "--decoder-aan-more-dropouts", type=lambda x: set(x.split(",")),
        help=("places to add more dropout in AAN, accepting multiple values "
              "in [residual/after_avg/after_aan] separated by commas"),
    )
    parser.add_argument(
        "--decoder-out-embed-dim", default=None, type=int, metavar="N",
        help=("decoder output embedding dimension (bottleneck layer before "
              "output layer if specified.)"),
    )

    # Args for vocab reduction
    vocab_reduction.add_args(parser)

def add_args(parser):
    parser.add_argument(
        '--dropout', default=0.1, type=float, metavar='D',
        help='dropout probability',
    )
    parser.add_argument(
        '--encoder-embed-dim', type=int, metavar='N',
        help='encoder embedding dimension',
    )
    parser.add_argument(
        '--encoder-freeze-embed', default=False, action='store_true',
        help=('whether to freeze the encoder embedding or allow it to be '
              'updated during training'),
    )
    parser.add_argument(
        '--encoder-hidden-dim', type=int, metavar='N',
        help='encoder cell num units',
    )
    parser.add_argument(
        '--encoder-layers', type=int, metavar='N',
        help='number of encoder layers',
    )
    parser.add_argument(
        '--encoder-bidirectional', action='store_true',
        help='whether the first layer is bidirectional or not',
    )
    parser.add_argument(
        '--averaging-encoder', default=False, action='store_true',
        help=('whether to use mean encoder hidden states as decoder initial '
              'states or not'),
    )
    parser.add_argument(
        '--add-encoder-outputs-as-decoder-input', default=False,
        action='store_true',
        help=('whether to use max encoder hidden states as constant decoder '
              'input'),
    )
    parser.add_argument(
        '--decoder-embed-dim', type=int, metavar='N',
        help='decoder embedding dimension',
    )
    parser.add_argument(
        '--decoder-freeze-embed', default=False, action='store_true',
        help=('whether to freeze the decoder embedding or allow it to be '
              'updated during training'),
    )
    parser.add_argument(
        '--decoder-hidden-dim', type=int, metavar='N',
        help='decoder cell num units',
    )
    parser.add_argument(
        '--decoder-layers', type=int, metavar='N',
        help='number of decoder layers',
    )
    parser.add_argument(
        '--decoder-out-embed-dim', type=int, metavar='N',
        help='decoder output embedding dimension',
    )
    parser.add_argument(
        '--attention-type', type=str, metavar='EXPR',
        help='decoder attention, defaults to dot',
    )
    parser.add_argument(
        '--residual-level', default=None, type=int,
        help=('First layer at which to apply a residual connection. The '
              'value should be greater than 0 and smaller than the number '
              'of layers.'),
    )
    parser.add_argument(
        '--cell-type', default='lstm', type=str, metavar='EXPR',
        help='cell type, defaults to lstm, values: lstm, milstm, '
             'layer_norm_lstm',
    )

    # Granular dropout settings (if not specified these default to --dropout)
    parser.add_argument(
        '--encoder-dropout-in', type=float, metavar='D',
        help='dropout probability for encoder input embedding',
    )
    parser.add_argument(
        '--encoder-dropout-out', type=float, metavar='D',
        help='dropout probability for encoder output',
    )
    parser.add_argument(
        '--decoder-dropout-in', type=float, metavar='D',
        help='dropout probability for decoder input embedding',
    )
    parser.add_argument(
        '--decoder-dropout-out', type=float, metavar='D',
        help='dropout probability for decoder output',
    )
    parser.add_argument(
        '--sequence-lstm', action='store_true',
        help='use nn.LSTM implementation for encoder',
    )

    # Args for vocab reduction
    vocab_reduction.add_args(parser)

    # Args for word dropout
    word_dropout.add_args(parser)

def add_args(parser):
    parser.add_argument(
        "--dropout", default=0.1, type=float, metavar="D",
        help="dropout probability",
    )
    parser.add_argument(
        "--encoder-embed-dim", type=int, metavar="N",
        help="encoder embedding dimension",
    )
    parser.add_argument(
        "--encoder-freeze-embed", default=False, action="store_true",
        help=("whether to freeze the encoder embedding or allow it to be "
              "updated during training"),
    )
    parser.add_argument(
        "--encoder-hidden-dim", type=int, metavar="N",
        help="encoder cell num units",
    )
    parser.add_argument(
        "--encoder-layers", type=int, metavar="N",
        help="number of encoder layers",
    )
    parser.add_argument(
        "--encoder-bidirectional", action="store_true",
        help="whether the first layer is bidirectional or not",
    )
    parser.add_argument(
        "--averaging-encoder", default=False, action="store_true",
        help=("whether to use mean encoder hidden states as decoder initial "
              "states or not"),
    )
    parser.add_argument(
        "--add-encoder-outputs-as-decoder-input", default=False,
        action="store_true",
        help=("whether to use max encoder hidden states as constant decoder "
              "input"),
    )
    parser.add_argument(
        "--decoder-embed-dim", type=int, metavar="N",
        help="decoder embedding dimension",
    )
    parser.add_argument(
        "--decoder-freeze-embed", default=False, action="store_true",
        help=("whether to freeze the decoder embedding or allow it to be "
              "updated during training"),
    )
    parser.add_argument(
        "--decoder-hidden-dim", type=int, metavar="N",
        help="decoder cell num units",
    )
    parser.add_argument(
        "--decoder-layers", type=int, metavar="N",
        help="number of decoder layers",
    )
    parser.add_argument(
        "--decoder-out-embed-dim", type=int, metavar="N",
        help="decoder output embedding dimension",
    )
    parser.add_argument(
        "--attention-type", type=str, metavar="EXPR",
        help="decoder attention, defaults to dot",
    )
    parser.add_argument(
        "--residual-level", default=None, type=int,
        help=("First layer at which to apply a residual connection. The "
              "value should be greater than 0 and smaller than the number "
              "of layers."),
    )
    parser.add_argument(
        "--cell-type", default="lstm", type=str, metavar="EXPR",
        help="cell type, defaults to lstm, values: lstm, milstm, "
             "layer_norm_lstm",
    )

    # Granular dropout settings (if not specified these default to --dropout)
    parser.add_argument(
        "--encoder-dropout-in", type=float, metavar="D",
        help="dropout probability for encoder input embedding",
    )
    parser.add_argument(
        "--encoder-dropout-out", type=float, metavar="D",
        help="dropout probability for encoder output",
    )
    parser.add_argument(
        "--decoder-dropout-in", type=float, metavar="D",
        help="dropout probability for decoder input embedding",
    )
    parser.add_argument(
        "--decoder-dropout-out", type=float, metavar="D",
        help="dropout probability for decoder output",
    )
    parser.add_argument(
        "--sequence-lstm", action="store_true",
        help="use nn.LSTM implementation for encoder",
    )

    # Args for vocab reduction
    vocab_reduction.add_args(parser)

    # Args for word dropout
    word_dropout.add_args(parser)

    # Args for character RNN encoder
    char_rnn_encoder.add_args(parser)

def add_args(parser):
    parser.add_argument(
        "--dropout", default=0.1, type=float, metavar="D",
        help="dropout probability",
    )
    parser.add_argument(
        "--encoder-embed-dim", type=int, metavar="N",
        help="encoder embedding dimension",
    )
    parser.add_argument(
        "--encoder-freeze-embed", default=False, action="store_true",
        help=("whether to freeze the encoder embedding or allow it to be "
              "updated during training"),
    )
    parser.add_argument(
        "--encoder-hidden-dim", type=int, metavar="N",
        help="encoder cell num units",
    )
    parser.add_argument(
        "--encoder-layers", type=int, metavar="N",
        help="number of encoder layers",
    )
    parser.add_argument(
        "--encoder-bidirectional", action="store_true",
        help="whether the first layer is bidirectional or not",
    )
    parser.add_argument(
        "--averaging-encoder", default=False, action="store_true",
        help=("whether to use mean encoder hidden states as decoder initial "
              "states or not"),
    )
    parser.add_argument(
        "--decoder-embed-dim", type=int, metavar="N",
        help="decoder embedding dimension",
    )
    parser.add_argument(
        "--decoder-freeze-embed", default=False, action="store_true",
        help=("whether to freeze the decoder embedding or allow it to be "
              "updated during training"),
    )
    parser.add_argument(
        "--decoder-hidden-dim", type=int, metavar="N",
        help="decoder cell num units",
    )
    parser.add_argument(
        "--decoder-layers", type=int, metavar="N",
        help="number of decoder layers",
    )
    parser.add_argument(
        "--decoder-out-embed-dim", type=int, metavar="N",
        help="decoder output embedding dimension",
    )
    parser.add_argument(
        "--attention-type", type=str, metavar="EXPR",
        help="decoder attention, defaults to dot",
    )
    parser.add_argument(
        "--residual-level", default=None, type=int,
        help=("First layer at which to apply a residual connection. The "
              "value should be greater than 0 and smaller than the number "
              "of layers."),
    )
    parser.add_argument(
        "--cell-type", default="lstm", type=str, metavar="EXPR",
        help="cell type, defaults to lstm, values: lstm, milstm, "
             "layer_norm_lstm",
    )

    # Granular dropout settings (if not specified these default to --dropout)
    parser.add_argument(
        "--encoder-dropout-in", type=float, metavar="D",
        help="dropout probability for encoder input embedding",
    )
    parser.add_argument(
        "--encoder-dropout-out", type=float, metavar="D",
        help="dropout probability for encoder output",
    )
    parser.add_argument(
        "--decoder-dropout-in", type=float, metavar="D",
        help="dropout probability for decoder input embedding",
    )
    parser.add_argument(
        "--decoder-dropout-out", type=float, metavar="D",
        help="dropout probability for decoder output",
    )
    parser.add_argument(
        "--sequence-lstm", action="store_true",
        help="use nn.LSTM implementation for encoder",
    )
    parser.add_argument(
        "--ngram-decoder", default=None, type=int, nargs="+",
        help=("A single integer, or a list of integers. If positive, the "
              "decoder is not recurrent but a feedforward network with "
              "target-side n-gram history as input. The decoder is still "
              "conditioned on the source side via attention. If this "
              "parameter is a list of integers, the n-th entry applies to "
              "the n-th decoder (for multilingual models and "
              "multi-decoders)"),
    )
    parser.add_argument(
        "--ngram-activation-type", default="relu", type=str, metavar="EXPR",
        help=("Activation in FF layers of the ngram decoder, defaults to "
              "relu, values: relu, tanh"),
    )
    parser.add_argument(
        "--multi-encoder", default=None, type=int,
        help=("If this is positive, train n encoder networks rather than "
              "only one. The outputs of the encoders are concatenated "
              "before passing them through to the decoder."),
    )
    parser.add_argument(
        "--multi-decoder", default=None, type=int,
        help=("If this is positive, train n decoder networks rather than "
              "only one. The predictions are combined via the method in "
              "--multi-decoder-combination-strategy."),
    )
    parser.add_argument(
        "--multi-decoder-combination-strategy", default="bottleneck",
        type=str, metavar="EXPR",
        help=("Only used if --multi-decoder is positive. Controls how the "
              "decoders are combined with each other.\n"
              "- uniform: Separate projection layers, average predictions\n"
              "- uniform-probspace: Separate projection layers, average "
              "in probability space.\n"
              "- unprojected: Shared projection layer, unprojected "
              "decoder outputs are averaged.\n"
              "- weighted: Separate projection layers, weighted average "
              "of logits. Weights are learned from unprojected decoder "
              "outputs.\n"
              "- weighted-probspace: Like 'weighted', but average in "
              "probability space.\n"
              "- weighted-unprojected: Shared projection layer, weighted "
              "average of decoder outputs. Weights are learned from "
              "unprojected decoder outputs.\n"
              "- concat: Shared projection layer, decoder outputs are "
              "concatenated.\n"
              "- bottleneck: Like 'concat' but with an additional "
              "bottleneck layer to reduce the size of the output embedding "
              "matrix.\n"
              "- multiplicative-unprojected: Shared projection layer, "
              "element-wise product of decoder outputs after ReLU.\n"),
    )

    # Args for vocab reduction
    vocab_reduction.add_args(parser)

    # Args for word dropout
    word_dropout.add_args(parser)

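# --ngram-decoder above uses nargs="+", so argparse always yields a list of
# ints: one entry for a single decoder, or one per decoder. A standalone
# illustration with a throwaway parser (hypothetical, for demonstration only):
import argparse


def _demo_ngram_decoder_flag():
    parser = argparse.ArgumentParser()
    parser.add_argument("--ngram-decoder", default=None, type=int, nargs="+")
    assert parser.parse_args(["--ngram-decoder", "4"]).ngram_decoder == [4]
    assert parser.parse_args(
        ["--ngram-decoder", "4", "5"]
    ).ngram_decoder == [4, 5]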