Example #1
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, left_pad=False, final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim

        self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False,
                                     uniform=False) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions, embed_dim, padding_idx,
            left_pad=left_pad,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False,
                                      uniform=False) if embed_dim != output_embed_dim else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary), output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)
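
A hedged sketch, not part of the example above: the output projection a forward pass would typically apply given the attributes set up in this __init__; `features` is assumed to be the decoder output already projected to output_embed_dim.

import torch.nn.functional as F

def output_layer(self, features):
    """Hedged sketch: map decoder features to vocabulary logits, mirroring the
    adaptive-softmax / shared-embedding branching configured in __init__."""
    if self.adaptive_softmax is not None:
        # the adaptive softmax criterion consumes the raw features directly
        return features
    if self.share_input_output_embed:
        return F.linear(features, self.embed_tokens.weight)
    return F.linear(features, self.embed_out)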
Example #2
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure that all args are properly defaulted (in case there are any new ones)
        base_architecture(args)

        if args.encoder_layers != args.decoder_layers:
            raise ValueError('--encoder-layers must match --decoder-layers')

        def load_pretrained_embedding_from_file(embed_path, dictionary,
                                                embed_dim):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(embed_path)
            utils.print_embed_overlap(embed_dict, dictionary)
            return utils.load_embedding(embed_dict, dictionary, embed_tokens)

        if args.encoder_embed_path:
            pretrained_encoder_embed = load_pretrained_embedding_from_file(
                args.encoder_embed_path, task.source_dictionary,
                args.encoder_embed_dim)
        else:
            num_embeddings = len(task.source_dictionary)
            pretrained_encoder_embed = Embedding(num_embeddings,
                                                 args.encoder_embed_dim,
                                                 task.source_dictionary.pad())

        if args.share_all_embeddings:
            # double check that all parameter combinations are valid
            if task.source_dictionary != task.target_dictionary:
                raise ValueError(
                    '--share-all-embeddings requires a joint dictionary')
            if args.decoder_embed_path and (args.decoder_embed_path !=
                                            args.encoder_embed_path):
                raise ValueError(
                    '--share-all-embeddings not compatible with --decoder-embed-path'
                )
            if args.encoder_embed_dim != args.decoder_embed_dim:
                raise ValueError(
                    '--share-all-embeddings requires --encoder-embed-dim to '
                    'match --decoder-embed-dim')
            pretrained_decoder_embed = pretrained_encoder_embed
            args.share_decoder_input_output_embed = True
        else:
            # separate decoder input embeddings
            pretrained_decoder_embed = None
            if args.decoder_embed_path:
                pretrained_decoder_embed = load_pretrained_embedding_from_file(
                    args.decoder_embed_path, task.target_dictionary,
                    args.decoder_embed_dim)
        # one last double check of parameter combinations
        if args.share_decoder_input_output_embed and (
                args.decoder_embed_dim != args.decoder_out_embed_dim):
            raise ValueError(
                '--share-decoder-input-output-embed requires '
                '--decoder-embed-dim to match --decoder-out-embed-dim')

        encoder = LSTMEncoder(
            dictionary=task.source_dictionary,
            embed_dim=args.encoder_embed_dim,
            hidden_size=args.encoder_hidden_size,
            num_layers=args.encoder_layers,
            dropout_in=args.encoder_dropout_in,
            dropout_out=args.encoder_dropout_out,
            bidirectional=args.encoder_bidirectional,
            pretrained_embed=pretrained_encoder_embed,
        )
        decoder = TGDecoder(
            dictionary=task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            hidden_size=args.decoder_hidden_size,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            attention=options.eval_bool(args.decoder_attention),
            encoder_embed_dim=args.encoder_embed_dim,
            encoder_output_units=encoder.output_units,
            pretrained_embed=pretrained_decoder_embed,
            share_input_output_embed=args.share_decoder_input_output_embed,
            adaptive_softmax_cutoff=(options.eval_str_list(
                args.adaptive_softmax_cutoff, type=int) if args.criterion
                                     == 'adaptive_loss' else None),
        )
        return cls(encoder, decoder)
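
For reference, a small illustration of how options.eval_str_list (used for adaptive_softmax_cutoff above) turns the CLI string into a list of ints; the cutoff values are illustrative only.

from fairseq import options

# the string is evaluated and each element cast to int
cutoffs = options.eval_str_list("10000,50000,200000", type=int)
assert cutoffs == [10000, 50000, 200000]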
Example #3
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument('--dropout', type=float, metavar='D',
                            help='dropout probability')
        parser.add_argument('--attention-dropout', type=float, metavar='D',
                            help='dropout probability for attention weights')
        parser.add_argument('--relu-dropout', type=float, metavar='D',
                            help='dropout probability after ReLU in FFN')
        parser.add_argument('--input-dropout', type=float, metavar='D',
                            help='dropout probability of the inputs')
        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                            help='path to pre-trained encoder embedding')
        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                            help='encoder embedding dimension')
        parser.add_argument('--encoder-conv-dim', type=int, metavar='N',
                            help='encoder convolution dimension')
        parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                            help='encoder embedding dimension for FFN')
        parser.add_argument('--encoder-layers', type=int, metavar='N',
                            help='num encoder layers')
        parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                            help='num encoder attention heads or LightConv/DynamicConv heads')
        parser.add_argument('--encoder-normalize-before', action='store_true',
                            help='apply layernorm before each encoder block')
        parser.add_argument('--encoder-learned-pos', action='store_true',
                            help='use learned positional embeddings in the encoder')
        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                            help='path to pre-trained decoder embedding')
        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-conv-dim', type=int, metavar='N',
                            help='decoder convolution dimension')
        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension for FFN')
        parser.add_argument('--decoder-layers', type=int, metavar='N',
                            help='num decoder layers')
        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                            help='num decoder attention heads or LightConv/DynamicConv heads')
        parser.add_argument('--decoder-learned-pos', action='store_true',
                            help='use learned positional embeddings in the decoder')
        parser.add_argument('--decoder-normalize-before', action='store_true',
                            help='apply layernorm before each decoder block')
        parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                            help='share decoder input and output embeddings')
        parser.add_argument('--share-all-embeddings', action='store_true',
                            help='share encoder, decoder and output embeddings'
                                 ' (requires shared dictionary and embed dim)')
        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                            help='comma separated list of adaptive softmax cutoff points. '
                                 'Must be used with adaptive_loss criterion')
        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                            help='sets adaptive softmax dropout for the tail projections')

        """LightConv and DynamicConv arguments"""
        parser.add_argument('--encoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int),
                            help='list of kernel sizes (default: "[3,7,15,31,31,31,31]")')
        parser.add_argument('--decoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int),
                            help='list of kernel sizes (default: "[3,7,15,31,31,31]")')
        parser.add_argument('--encoder-glu', type=options.eval_bool,
                            help='glu after in proj')
        parser.add_argument('--decoder-glu', type=options.eval_bool,
                            help='glu after in proj')
        parser.add_argument('--encoder-conv-type', default='dynamic', type=str,
                            choices=['dynamic', 'lightweight'],
                            help='type of convolution')
        parser.add_argument('--decoder-conv-type', default='dynamic', type=str,
                            choices=['dynamic', 'lightweight'],
                            help='type of convolution')
        parser.add_argument('--weight-softmax', default=True, type=options.eval_bool)
        parser.add_argument('--weight-dropout', type=float, metavar='D',
                            help='dropout probability for conv weights')
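
A hedged usage sketch of the flags defined above; LightConvLikeModel is a hypothetical class exposing this add_args as a static method, and the flag values are illustrative.

import argparse

parser = argparse.ArgumentParser()
LightConvLikeModel.add_args(parser)  # hypothetical wrapper around the add_args above
args = parser.parse_args([
    '--encoder-kernel-size-list', '[3,7,15,31,31,31,31]',
    '--encoder-glu', 'True',
    '--weight-dropout', '0.1',
])
# the type= converters evaluate the strings into Python values
assert args.encoder_kernel_size_list == [3, 7, 15, 31, 31, 31, 31]
assert args.encoder_glu is True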
Example #4
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     # fmt: off  # TODO
     parser.add_argument('--activation-fn',
                         choices=utils.get_available_activation_fns(),
                         help='activation function to use')
     parser.add_argument('--dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument(
         '--activation-dropout',
         '--relu-dropout',
         type=float,
         metavar='D',
         help='dropout probability after activation in FFN.')
     parser.add_argument('--encoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained encoder embedding')
     parser.add_argument('--encoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='encoder embedding dimension')
     parser.add_argument('--encoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='encoder embedding dimension for FFN')
     parser.add_argument('--encoder-layers',
                         type=int,
                         metavar='N',
                         help='num encoder layers')
     parser.add_argument('--encoder-attention-heads',
                         type=int,
                         metavar='N',
                         help='num encoder attention heads')
     parser.add_argument('--encoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each encoder block')
     parser.add_argument(
         '--encoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the encoder')
     parser.add_argument('--decoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained decoder embedding')
     parser.add_argument('--decoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument('--decoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension for FFN')
     parser.add_argument('--decoder-layers',
                         type=int,
                         metavar='N',
                         help='num decoder layers')
     parser.add_argument('--decoder-attention-heads',
                         type=int,
                         metavar='N',
                         help='num decoder attention heads')
     parser.add_argument(
         '--decoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the decoder')
     parser.add_argument('--decoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument(
         '--decoder-output-dim',
         type=int,
         metavar='N',
         help='decoder output dimension (extra linear layer '
          'if different from decoder embed dim)')
     parser.add_argument('--share-decoder-input-output-embed',
                         action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument('--share-all-embeddings',
                         action='store_true',
                         help='share encoder, decoder and output embeddings'
                         ' (requires shared dictionary and embed dim)')
     parser.add_argument(
         '--no-token-positional-embeddings',
         default=False,
         action='store_true',
         help=
         'if set, disables positional embeddings (outside self attention)')
     parser.add_argument(
         '--adaptive-softmax-cutoff',
         metavar='EXPR',
         help='comma separated list of adaptive softmax cutoff points. '
          'Must be used with adaptive_loss criterion')
     parser.add_argument(
         '--adaptive-softmax-dropout',
         type=float,
         metavar='D',
         help='sets adaptive softmax dropout for the tail projections')
     parser.add_argument('--layernorm-embedding',
                         action='store_true',
                         help='add layernorm to embedding')
     parser.add_argument('--no-scale-embedding',
                         action='store_true',
                         help='if True, dont scale embeddings')
     parser.add_argument(
         '--checkpoint-activations',
         action='store_true',
         help='checkpoint activations at each layer, which saves GPU '
         'memory usage at the cost of some additional compute')
     # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
     parser.add_argument('--no-cross-attention',
                         default=False,
                         action='store_true',
                         help='do not perform cross-attention')
     parser.add_argument('--cross-self-attention',
                         default=False,
                         action='store_true',
                         help='perform cross+self-attention')
     # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
     parser.add_argument('--encoder-layerdrop',
                         type=float,
                         metavar='D',
                         default=0,
                         help='LayerDrop probability for encoder')
     parser.add_argument('--decoder-layerdrop',
                         type=float,
                         metavar='D',
                         default=0,
                         help='LayerDrop probability for decoder')
     parser.add_argument(
         '--encoder-layers-to-keep',
         default=None,
         help='which layers to *keep* when pruning as a comma-separated list'
     )
     parser.add_argument(
         '--decoder-layers-to-keep',
         default=None,
         help='which layers to *keep* when pruning as a comma-separated list'
     )
     # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
     parser.add_argument(
         '--quant-noise-pq',
         type=float,
         metavar='D',
         default=0,
         help='iterative PQ quantization noise at training time')
     parser.add_argument(
         '--quant-noise-pq-block-size',
         type=int,
         metavar='D',
         default=8,
         help='block size of quantization noise at training time')
     parser.add_argument(
         '--quant-noise-scalar',
         type=float,
         metavar='D',
         default=0,
         help=
         'scalar quantization noise and scalar quantization at training time'
     )
     # for prime
     parser.add_argument('--use_att',
                         type=str,
                         nargs='+',
                         default=[
                             'es',
                             'ds',
                             'dc',
                         ],
                         help='')
     parser.add_argument('--kernel_size',
                         type=int,
                         default=0,
                          help='static kernel size (0: no static kernel)')
     parser.add_argument(
         '--attn_dynamic_type',
         type=int,
         default=0,
          help='0: disabled; 1: use static kernel (k>0) or depth kernel (k==0); '
          '2: use dynamic kernel'
     )
     parser.add_argument('--attn_cat_relu', type=int, default=0)
     parser.add_argument(
         '--attn_wide_kernels',
         type=lambda x: options.eval_str_list(x, int),
          help='list of kernel sizes (default: "[3,15]") for wide and gate')
     parser.add_argument('--weight-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for conv weights')
     parser.add_argument('--dynamic_gate', type=int, default=1, help='0,1')
     parser.add_argument(
         '--dynamic_depth_kernels',
         type=lambda x: options.eval_str_list(x, int),
          help='list of kernel sizes (default: "[3,3,3,7,7,7,7,7,7,15,15,15]") '
          'for FFN or attention'
     )
     parser.add_argument('--dynamic_padding',
                         type=int,
                         default=0,
                         help='padding before dynamic conv')
     parser.add_argument('--attn_dynamic_cat', type=int, default=1)
     parser.add_argument('--input_dropout', type=float, default=0, help='')
     parser.add_argument('--init_method',
                         type=str,
                         default='km',
                         help='xavier,km,xi,fixup')
     parser.add_argument('--lnv',
                         type=str,
                         default='origin',
                          help='layer norm variant: origin, layernorm, adanorm')
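
One detail worth noting in the example above: '--activation-dropout' and '--relu-dropout' are registered as aliases of a single option, so argparse stores either spelling under the dest derived from the first option string. A minimal standalone illustration:

import argparse

p = argparse.ArgumentParser()
p.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D')
ns = p.parse_args(['--relu-dropout', '0.1'])
assert ns.activation_dropout == 0.1  # dest comes from the first option string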
Example #5
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument('--dropout',
                         default=0.1,
                         type=float,
                         metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout',
                         default=0.,
                         type=float,
                         metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument('--relu-dropout',
                         default=0.,
                         type=float,
                         metavar='D',
                         help='dropout probability after ReLU in FFN')
     parser.add_argument('--input-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability of the inputs')
     parser.add_argument('--decoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument('--decoder-output-dim',
                         type=int,
                         metavar='N',
                         help='decoder output dimension')
     parser.add_argument('--decoder-input-dim',
                         type=int,
                         metavar='N',
                         help='decoder input dimension')
     parser.add_argument('--decoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension for FFN')
     parser.add_argument('--decoder-layers',
                         type=int,
                         metavar='N',
                         help='num decoder layers')
     parser.add_argument(
         '--decoder-attention-heads',
         type=int,
         metavar='N',
         help='num decoder attention heads or LightConv/DynamicConv heads')
     parser.add_argument('--decoder-normalize-before',
                         default=False,
                         action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument(
         '--adaptive-softmax-cutoff',
         metavar='EXPR',
         help='comma separated list of adaptive softmax cutoff points. '
         'Must be used with adaptive_loss criterion')
     parser.add_argument(
         '--adaptive-softmax-dropout',
         type=float,
         metavar='D',
         help='sets adaptive softmax dropout for the tail projections')
     parser.add_argument('--adaptive-softmax-factor',
                         type=float,
                         metavar='N',
                          help='adaptive softmax factor')
     parser.add_argument(
         '--no-token-positional-embeddings',
         default=False,
         action='store_true',
         help=
         'if set, disables positional embeddings (outside self attention)')
     parser.add_argument('--share-decoder-input-output-embed',
                         default=False,
                         action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument(
         '--character-embeddings',
         default=False,
         action='store_true',
         help=
         'if set, uses character embedding convolutions to produce token embeddings'
     )
     parser.add_argument(
         '--character-filters',
         type=str,
         metavar='LIST',
         default=
         '[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
          help='character CNN filters as a list of (kernel width, num filters) tuples')
     parser.add_argument('--character-embedding-dim',
                         type=int,
                         metavar='N',
                         default=4,
                         help='size of character embeddings')
     parser.add_argument(
         '--char-embedder-highway-layers',
         type=int,
         metavar='N',
         default=2,
          help='number of highway layers for character token embedder')
     parser.add_argument('--adaptive-input',
                         default=False,
                         action='store_true',
                         help='if set, uses adaptive input')
     parser.add_argument('--adaptive-input-factor',
                         type=float,
                         metavar='N',
                         help='adaptive input factor')
     parser.add_argument(
         '--adaptive-input-cutoff',
         metavar='EXPR',
         help='comma separated list of adaptive input cutoff points.')
     parser.add_argument(
         '--tie-adaptive-weights',
         action='store_true',
         help=
         'if set, ties the weights of adaptive softmax and adaptive input')
     parser.add_argument(
         '--tie-adaptive-proj',
         action='store_true',
         help=
         'if set, ties the projection weights of adaptive softmax and adaptive input'
     )
     parser.add_argument(
         '--decoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the decoder')
     """LightConv and DynamicConv arguments"""
     parser.add_argument(
         '--decoder-kernel-size-list',
         type=lambda x: options.eval_str_list(x, int),
          help='list of kernel sizes (default: "[3,7,15,31,31,31]")')
     parser.add_argument('--decoder-glu',
                         type=options.eval_bool,
                         help='glu after in proj')
     parser.add_argument('--decoder-conv-type',
                         default='dynamic',
                         type=str,
                         choices=['dynamic', 'lightweight'],
                         help='type of convolution')
     parser.add_argument('--weight-softmax',
                         default=True,
                         type=options.eval_bool)
     parser.add_argument('--weight-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for conv weights')
Example #6
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present in older models
        base_lm_architecture(args)

        if getattr(args, 'max_target_positions', None) is None:
            args.max_target_positions = getattr(args, 'tokens_per_sample',
                                                DEFAULT_MAX_TARGET_POSITIONS)

        if args.character_embeddings:
            embed_tokens = CharacterTokenEmbedder(
                task.source_dictionary,
                eval(args.character_filters),
                args.character_embedding_dim,
                args.decoder_embed_dim,
                args.char_embedder_highway_layers,
            )
        elif args.adaptive_input:
            embed_tokens = AdaptiveInput(
                len(task.source_dictionary),
                task.source_dictionary.pad(),
                args.decoder_input_dim,
                args.adaptive_input_factor,
                args.decoder_embed_dim,
                options.eval_str_list(args.adaptive_input_cutoff, type=int),
            )
        else:
            if hasattr(task, 'vqvae_model'):
                vocab_size = args.codebook_size
                assert args.decoder_input_dim == task.vqvae_model.bottom_quantizer.dim
                code_embed_init = task.vqvae_model.bottom_quantizer.embed.data.transpose(
                    0, 1)
                embed_tokens = Embedding(vocab_size + 1,
                                         args.decoder_input_dim,
                                         padding_idx=None,
                                         weight=None)
            else:
                embed_tokens = Embedding(len(task.source_dictionary),
                                         args.decoder_input_dim,
                                         task.source_dictionary.pad())

        if hasattr(task, 'vqvae_model'):
            decoder = TransformerDecoder(args,
                                         task.target_dictionary,
                                         embed_tokens,
                                         no_encoder_attn=True,
                                         pad_idx=task.padding_idx)
        else:
            if args.tie_adaptive_weights:
                assert args.adaptive_input
                assert args.adaptive_input_factor == args.adaptive_softmax_factor
                assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format(
                    args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
                assert args.decoder_input_dim == args.decoder_output_dim

            decoder = TransformerDecoder(
                args,
                task.target_dictionary,
                embed_tokens,
                no_encoder_attn=True,
            )
        return TransformerLanguageModel(decoder)
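
For reference, the eval(args.character_filters) call above turns the default filter string from the --character-filters flag into a list of (kernel width, num filters) tuples; a minimal illustration:

filters = eval('[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]')
assert len(filters) == 7 and filters[0] == (1, 64)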
Example #7
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument('--encoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained encoder embedding')
     parser.add_argument('--encoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='encoder embedding dimension')
     parser.add_argument(
         '--encoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the encoder')
     parser.add_argument('--decoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained decoder embedding')
     parser.add_argument('--decoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument(
         '--decoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the decoder')
     parser.add_argument('--decoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument('--share-decoder-input-output-embed',
                         action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument('--share-all-embeddings',
                         action='store_true',
                         help='share encoder, decoder and output embeddings'
                         ' (requires shared dictionary and embed dim)')
     parser.add_argument('--dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument('--relu-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability after ReLU in FFN')
     parser.add_argument('--decoder-layers',
                         type=int,
                         metavar='N',
                         help='num layers')
     parser.add_argument('--decoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='embedding dimension for FFN')
     parser.add_argument('--decoder-attention-heads',
                         type=int,
                         metavar='N',
                         help='num attention heads')
     parser.add_argument('--kernel-size-list',
                         type=lambda x: options.eval_str_list(x, int),
                          help='list of kernel sizes (default: None)')
     parser.add_argument('--language-embeddings',
                         action='store_true',
                         help='use language embeddings')
      # for the Transformer-XL integration; I still believe these numbers cannot really adapt
      # Christine (7-2-2020)
     parser.add_argument('--d_head',
                         type=int,
                         default=50,
                         help='head dimension')
     parser.add_argument('--d_inner',
                         type=int,
                         default=1000,
                         help='inner dimension in FF')
     parser.add_argument(
         '--pre_lnorm',
         action='store_true',
         help='apply LayerNorm to the input instead of the output')
Example #8
def _fairseq_opt_wrapper(opt, skip_pretrained_embedding_loading=False):
    """
    Marshals a dict into an argparse.Namespace object for API compatibility.

    Also does some post-processing needed by fairseq. Optionally, pretrained
    embedding options can be overridden, which is useful if we're just loading
    a model from a checkpoint.

    :param opt: dict. ParlAI options passed around from everywhere.
    :param skip_pretrained_embedding_loading: bool. Don't preload word embeddings.
    :return: an argparse.Namespace object for use in fairseq-py.
    """
    args = argparse.Namespace()

    # first set args according to ParlAI options
    for key in opt:
        if opt[key] is not None:
            setattr(args, key, opt[key])

    # at this point the user *must* have specified an arch
    if not hasattr(args, "arch"):
        raise ValueError("--arch/-a must be specified")
    # fill in default options from the model
    models.ARCH_CONFIG_REGISTRY[args.arch](args)

    # post processing of args. See
    # https://github.com/pytorch/fairseq/blob/v0.5.0/fairseq/options.py#L95
    if hasattr(args, "lr"):
        args.lr = options.eval_str_list(args.lr, type=float)
    if hasattr(args, "update_freq"):
        args.update_freq = options.eval_str_list(args.update_freq, int)
    if hasattr(args, "max_sentences_valid"):
        args.max_sentences_valid = args.max_sentences
    if getattr(args, "truncate") == -1:
        # some torch agents use positional embeddings, which must have a max length
        setattr(args, "truncate", 1024)
    if not hasattr(args, "max_source_positions"):
        # fairseq uses a different name for this CLI parameter
        # Sometimes it's set in model defaults, but not for all models
        setattr(args, "max_source_positions", getattr(args, "truncate"))
        # if we don't have source lengths, we don't have target lengths
        setattr(args, "max_target_positions", getattr(args, "truncate"))

    # handle modelzoo if possible
    for k in ("encoder_embed_path", "decoder_embed_path"):
        if getattr(args, k, None) is None:
            # not an argument for this model, pretrained embeddings don't matter
            continue
        elif skip_pretrained_embedding_loading:
            # if we want to skip pretrained, then hide the option from fairseq
            setattr(args, k, None)
        else:
            # otherwise we may need to adjust the modelzoo path for fairseq
            setattr(args, k,
                    modelzoo_path(opt.get("datapath"), getattr(args, k)))

    # Here we hardcode a few options that we currently do not support
    # turn off distributed training
    args.distributed_world_size = 1
    args.distributed_rank = 0

    return args, vars(args)
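
A hedged usage sketch of _fairseq_opt_wrapper; the option keys and the arch name are illustrative, not prescribed by the code above.

# ParlAI-style option dict; "arch" is mandatory, everything else is optional
opt = {
    "arch": "transformer_iwslt_de_en",
    "dropout": 0.3,
    "lr": "0.25",        # eval_str_list converts this to [0.25]
    "truncate": -1,      # rewritten to 1024, then reused for max positions
    "datapath": "/tmp/data",
}
args, args_dict = _fairseq_opt_wrapper(opt, skip_pretrained_embedding_loading=True)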
Example #9
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure that all args are properly defaulted (in case there are any new ones)
        base_architecture(args)

        max_source_positions = getattr(args, "max_source_positions",
                                       DEFAULT_MAX_SOURCE_POSITIONS)
        max_target_positions = getattr(args, "max_target_positions",
                                       DEFAULT_MAX_TARGET_POSITIONS)

        def load_pretrained_embedding_from_file(embed_path, dictionary,
                                                embed_dim):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(embed_path)
            utils.print_embed_overlap(embed_dict, dictionary)
            return utils.load_embedding(embed_dict, dictionary, embed_tokens)

        # separate decoder input embeddings
        pretrained_decoder_embed = None
        if args.decoder_embed_path:
            pretrained_decoder_embed = load_pretrained_embedding_from_file(
                args.decoder_embed_path, task.target_dictionary,
                args.decoder_embed_dim)
        # one last double check of parameter combinations
        if args.share_decoder_input_output_embed and (
                args.decoder_embed_dim != args.decoder_out_embed_dim):
            raise ValueError(
                "--share-decoder-input-output-embed requires "
                "--decoder-embed-dim to match --decoder-out-embed-dim")

        if args.decoder_freeze_embed:
            pretrained_decoder_embed.weight.requires_grad = False

        out_channels = speech_utils.eval_str_nested_list_or_tuple(
            args.encoder_conv_channels, type=int)
        kernel_sizes = speech_utils.eval_str_nested_list_or_tuple(
            args.encoder_conv_kernel_sizes, type=int)
        strides = speech_utils.eval_str_nested_list_or_tuple(
            args.encoder_conv_strides, type=int)
        logger.info("input feature dimension: {}, channels: {}".format(
            task.feat_dim, task.feat_in_channels))
        assert task.feat_dim % task.feat_in_channels == 0
        conv_layers = ConvBNReLU(
            out_channels,
            kernel_sizes,
            strides,
            in_channels=task.feat_in_channels,
        ) if out_channels is not None else None

        rnn_encoder_input_size = task.feat_dim // task.feat_in_channels
        if conv_layers is not None:
            for stride in strides:
                if isinstance(stride, (list, tuple)):
                    assert len(stride) > 0
                    s = stride[1] if len(stride) > 1 else stride[0]
                else:
                    assert isinstance(stride, int)
                    s = stride
                rnn_encoder_input_size = (rnn_encoder_input_size + s - 1) // s
            rnn_encoder_input_size *= out_channels[-1]
        else:
            rnn_encoder_input_size = task.feat_dim

        scheduled_sampling_rate_scheduler = ScheduledSamplingRateScheduler(
            args.scheduled_sampling_probs,
            args.start_scheduled_sampling_epoch,
        )

        encoder = SpeechLSTMEncoder(
            conv_layers_before=conv_layers,
            input_size=rnn_encoder_input_size,
            hidden_size=args.encoder_rnn_hidden_size,
            num_layers=args.encoder_rnn_layers,
            dropout_in=args.encoder_rnn_dropout_in,
            dropout_out=args.encoder_rnn_dropout_out,
            bidirectional=args.encoder_rnn_bidirectional,
            residual=args.encoder_rnn_residual,
            max_source_positions=max_source_positions,
        )
        decoder = SpeechLSTMDecoder(
            dictionary=task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            hidden_size=args.decoder_hidden_size,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            encoder_output_units=encoder.output_units,
            attn_type=args.attention_type,
            attn_dim=args.attention_dim,
            need_attn=args.need_attention,
            residual=args.decoder_rnn_residual,
            pretrained_embed=pretrained_decoder_embed,
            share_input_output_embed=args.share_decoder_input_output_embed,
            adaptive_softmax_cutoff=(options.eval_str_list(
                args.adaptive_softmax_cutoff, type=int) if args.criterion
                                     == "adaptive_loss" else None),
            max_target_positions=max_target_positions,
            scheduled_sampling_rate_scheduler=scheduled_sampling_rate_scheduler,
        )
        pretrained_lm = None
        if args.pretrained_lm_checkpoint:
            logger.info("loading pretrained LM from {}".format(
                args.pretrained_lm_checkpoint))
            pretrained_lm = checkpoint_utils.load_model_ensemble(
                args.pretrained_lm_checkpoint, task=task)[0][0]
            pretrained_lm.make_generation_fast_()
            # freeze pretrained model
            for param in pretrained_lm.parameters():
                param.requires_grad = False
        return cls(encoder, decoder, pretrained_lm)
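
A small worked example of the encoder input-size arithmetic above, under assumed values: feat_dim=83, feat_in_channels=1, two conv layers with stride 2 along the feature axis, out_channels=[64, 64].

size = 83 // 1                  # task.feat_dim // task.feat_in_channels
for s in (2, 2):                # feature-axis component of each conv stride
    size = (size + s - 1) // s  # ceil-divide: 83 -> 42 -> 21
size *= 64                      # out_channels[-1]; rnn_encoder_input_size == 1344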
Example #10
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        self.args = args
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        self._future_mask = torch.empty(0)

        self.dropout = args.dropout
        self.decoder_layerdrop = args.decoder_layerdrop
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.embed_dim = embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(
            embed_dim)

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)

        self.embed_positions = (PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            self.padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None)

        self.cross_self_attention = getattr(args, "cross_self_attention",
                                            False)
        self.layer_wise_attention = getattr(args, "layer_wise_attention",
                                            False)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_decoder_layer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])
        self.num_layers = len(self.layers)

        self.adaptive_softmax = None

        self.project_out_dim = (Linear(
            embed_dim, self.output_embed_dim, bias=False)
                                if embed_dim != self.output_embed_dim
                                and not args.tie_adaptive_weights else None)

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), self.output_embed_dim))
            nn.init.normal_(self.embed_out,
                            mean=0,
                            std=self.output_embed_dim**-0.5)

        if args.decoder_normalize_before and not getattr(
                args, "no_decoder_final_norm", False):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        self.n_experts = 1
        self.nhidlast = self.embed_dim
        self.ninp = self.embed_dim
        self.ntoken = 9744  # hard-coded vocabulary size
        self.prior = nn.Linear(self.nhidlast, self.n_experts, bias=False)
        self.latent = nn.Sequential(
            nn.Linear(self.nhidlast, self.n_experts * self.ninp), nn.Tanh())
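
The prior and latent modules above follow a mixture-of-softmaxes readout. A hedged sketch of how they would typically be combined, assuming hidden has shape (batch, tgt_len, embed_dim) and output_weight is an (ntoken, ninp) matrix; this step is not shown in the snippet itself.

import torch.nn.functional as F

def mos_log_probs(self, hidden, output_weight):
    """Hedged sketch of a mixture-of-softmaxes readout; with n_experts == 1
    this reduces to an ordinary softmax over the vocabulary."""
    latent = self.latent(hidden)                       # (B, T, n_experts * ninp)
    latent = latent.view(-1, self.ninp)                # (B*T*n_experts, ninp)
    logits = F.linear(latent, output_weight)           # (B*T*n_experts, ntoken)
    prob = F.softmax(logits, dim=-1).view(-1, self.n_experts, self.ntoken)
    prior = F.softmax(self.prior(hidden), dim=-1).view(-1, self.n_experts, 1)
    return (prob * prior).sum(dim=1).clamp(min=1e-8).log()  # (B*T, ntoken)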
Example #11
    def build_single_decoder(args,
                             src_dict,
                             dst_dict,
                             ngram_decoder=None,
                             project_output=True,
                             is_lm=False):
        if args.adaptive_softmax_cutoff is not None:
            project_output = False
        attention_type = args.attention_type
        encoder_hidden_dim = args.encoder_hidden_dim
        if is_lm:
            attention_type = "no"
            encoder_hidden_dim = 0
        if ngram_decoder:
            if args.ngram_activation_type == "relu":
                activation_fn = nn.ReLU
            elif args.ngram_activation_type == "tanh":
                activation_fn = nn.Tanh
            else:
                raise Exception("ngram_activation_type '%s' not implemented" %
                                args.ngram_activation_type)
            decoder = NGramDecoder(
                src_dict=src_dict,
                dst_dict=dst_dict,
                n=ngram_decoder,
                encoder_hidden_dim=encoder_hidden_dim,
                embed_dim=args.decoder_embed_dim,
                freeze_embed=args.decoder_freeze_embed,
                out_embed_dim=args.decoder_out_embed_dim,
                num_layers=args.decoder_layers,
                hidden_dim=args.decoder_hidden_dim,
                attention_type=attention_type,
                dropout_in=args.decoder_dropout_in,
                dropout_out=args.decoder_dropout_out,
                residual_level=args.residual_level,
                activation_fn=activation_fn,
                project_output=project_output,
                pretrained_embed=args.decoder_pretrained_embed,
                projection_pretrained_embed=args.decoder_out_pretrained_embed,
            )
        else:
            decoder = RNNDecoder(
                src_dict=src_dict,
                dst_dict=dst_dict,
                vocab_reduction_params=args.vocab_reduction_params,
                encoder_hidden_dim=encoder_hidden_dim,
                embed_dim=args.decoder_embed_dim,
                freeze_embed=args.decoder_freeze_embed,
                out_embed_dim=args.decoder_out_embed_dim,
                cell_type=args.cell_type,
                num_layers=args.decoder_layers,
                hidden_dim=args.decoder_hidden_dim,
                attention_type=attention_type,
                dropout_in=args.decoder_dropout_in,
                dropout_out=args.decoder_dropout_out,
                residual_level=args.residual_level,
                averaging_encoder=args.averaging_encoder,
                project_output=project_output,
                pretrained_embed=args.decoder_pretrained_embed,
                projection_pretrained_embed=args.decoder_out_pretrained_embed,
                tie_embeddings=args.decoder_tie_embeddings,
                att_weighted_src_embeds=args.att_weighted_src_embeds,
                src_embed_dim=args.encoder_embed_dim,
                att_weighted_activation_type=args.att_weighted_activation_type,
            )

        # Allow adaptive softmax to be used with the RNN decoder
        decoder.adaptive_softmax = None

        if args.adaptive_softmax_cutoff is not None:
            decoder.adaptive_softmax = AdaptiveSoftmax(
                len(dst_dict),
                args.decoder_out_embed_dim or args.decoder_hidden_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.dropout,
            )
        return decoder
Example #12
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument("--dropout",
                         type=float,
                         metavar="D",
                         help="dropout probability")
     parser.add_argument(
         "--attention-dropout",
         type=float,
         metavar="D",
         help="dropout probability for attention weights",
     )
     parser.add_argument(
         "--relu-dropout",
         type=float,
         metavar="D",
         help="dropout probability after ReLU in FFN",
     )
     parser.add_argument(
         "--input-dropout",
         type=float,
         metavar="D",
         help="dropout probability of the inputs",
     )
     parser.add_argument(
         "--encoder-embed-path",
         type=str,
         metavar="STR",
         help="path to pre-trained encoder embedding",
     )
     parser.add_argument(
         "--encoder-embed-dim",
         type=int,
         metavar="N",
         help="encoder embedding dimension",
     )
     parser.add_argument(
         "--encoder-conv-dim",
         type=int,
         metavar="N",
         help="encoder embedding dimension",
     )
     parser.add_argument(
         "--encoder-ffn-embed-dim",
         type=int,
         metavar="N",
         help="encoder embedding dimension for FFN",
     )
     parser.add_argument("--encoder-layers",
                         type=int,
                         metavar="N",
                         help="num encoder layers")
     parser.add_argument(
         "--encoder-attention-heads",
         type=int,
         metavar="N",
         help="num encoder attention heads or LightConv/DynamicConv heads",
     )
     parser.add_argument(
         "--encoder-normalize-before",
         action="store_true",
         help="apply layernorm before each encoder block",
     )
     parser.add_argument(
         "--encoder-learned-pos",
         action="store_true",
         help="use learned positional embeddings in the encoder",
     )
     parser.add_argument(
         "--decoder-embed-path",
         type=str,
         metavar="STR",
         help="path to pre-trained decoder embedding",
     )
     parser.add_argument(
         "--decoder-embed-dim",
         type=int,
         metavar="N",
         help="decoder embedding dimension",
     )
     parser.add_argument(
         "--decoder-conv-dim",
         type=int,
         metavar="N",
         help="decoder embedding dimension",
     )
     parser.add_argument(
         "--decoder-ffn-embed-dim",
         type=int,
         metavar="N",
         help="decoder embedding dimension for FFN",
     )
     parser.add_argument("--decoder-layers",
                         type=int,
                         metavar="N",
                         help="num decoder layers")
     parser.add_argument(
         "--decoder-attention-heads",
         type=int,
         metavar="N",
         help="num decoder attention heads or LightConv/DynamicConv heads",
     )
     parser.add_argument(
         "--decoder-learned-pos",
         action="store_true",
         help="use learned positional embeddings in the decoder",
     )
     parser.add_argument(
         "--decoder-normalize-before",
         action="store_true",
         help="apply layernorm before each decoder block",
     )
     parser.add_argument(
         "--share-decoder-input-output-embed",
         action="store_true",
         help="share decoder input and output embeddings",
     )
     parser.add_argument(
         "--share-all-embeddings",
         action="store_true",
         help="share encoder, decoder and output embeddings"
         " (requires shared dictionary and embed dim)",
     )
     parser.add_argument(
         "--adaptive-softmax-cutoff",
         metavar="EXPR",
         help="comma separated list of adaptive softmax cutoff points. "
         "Must be used with adaptive_loss criterion",
      )
     parser.add_argument(
         "--adaptive-softmax-dropout",
         type=float,
         metavar="D",
         help="sets adaptive softmax dropout for the tail projections",
     )
     """LightConv and DynamicConv arguments"""
     parser.add_argument(
         "--encoder-kernel-size-list",
         type=lambda x: options.eval_str_list(x, int),
          help='list of kernel sizes (default: "[3,7,15,31,31,31,31]")',
     )
     parser.add_argument(
         "--decoder-kernel-size-list",
         type=lambda x: options.eval_str_list(x, int),
          help='list of kernel sizes (default: "[3,7,15,31,31,31]")',
     )
     parser.add_argument("--encoder-glu",
                         type=options.eval_bool,
                         help="glu after in proj")
     parser.add_argument("--decoder-glu",
                         type=options.eval_bool,
                         help="glu after in proj")
     parser.add_argument(
         "--encoder-conv-type",
         default="dynamic",
         type=str,
         choices=["dynamic", "lightweight"],
         help="type of convolution",
     )
     parser.add_argument(
         "--decoder-conv-type",
         default="dynamic",
         type=str,
         choices=["dynamic", "lightweight"],
         help="type of convolution",
     )
     parser.add_argument("--weight-softmax",
                         default=True,
                         type=options.eval_bool)
     parser.add_argument(
         "--weight-dropout",
         type=float,
         metavar="D",
         help="dropout probability for conv weights",
     )
Example #13
    def __init__(self, **kwargs):

        """Build a new model instance."""
        super().__init__()
        # make sure that all args are properly defaulted (in case there are any new ones)
        args = Parameters()
        args.update(**kwargs)
        args.criterion = ''

        lstm_luong_wmt_en_de(args)

        if args.encoder_layers != args.decoder_layers:
            raise ValueError('--encoder-layers must match --decoder-layers')

        max_source_positions = getattr(args, 'max_source_positions',
                                       DEFAULT_MAX_SOURCE_POSITIONS)
        max_target_positions = getattr(args, 'max_target_positions',
                                       DEFAULT_MAX_TARGET_POSITIONS)

        src_dict, tgt_dict = kwargs["vocab_src"], kwargs["vocab_tgt"]

        pretrained_encoder_embed = None
        pretrained_decoder_embed = None

        # one last double check of parameter combinations
        if args.share_decoder_input_output_embed and (
                args.decoder_embed_dim != args.decoder_out_embed_dim):
            raise ValueError(
                '--share-decoder-input-output-embed requires '
                '--decoder-embed-dim to match --decoder-out-embed-dim')

        if args.encoder_freeze_embed and pretrained_encoder_embed is not None:
            pretrained_encoder_embed.weight.requires_grad = False
        if args.decoder_freeze_embed and pretrained_decoder_embed is not None:
            pretrained_decoder_embed.weight.requires_grad = False

        self.encoder = LSTMEncoder(dictionary=src_dict,
                                   embed_dim=args.encoder_embed_dim,
                                   hidden_size=args.encoder_hidden_size,
                                   num_layers=args.encoder_layers,
                                   dropout_in=args.encoder_dropout_in,
                                   dropout_out=args.encoder_dropout_out,
                                   bidirectional=args.encoder_bidirectional,
                                   pretrained_embed=pretrained_encoder_embed,
                                   max_source_positions=max_source_positions)
        self.decoder = LSTMDecoder(
            dictionary=tgt_dict,
            embed_dim=args.decoder_embed_dim,
            hidden_size=args.decoder_hidden_size,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            attention=options.eval_bool(args.decoder_attention),
            encoder_output_units=self.encoder.output_units,
            pretrained_embed=pretrained_decoder_embed,
            share_input_output_embed=args.share_decoder_input_output_embed,
            adaptive_softmax_cutoff=(options.eval_str_list(
                args.adaptive_softmax_cutoff, type=int) if args.criterion
                                     == 'adaptive_loss' else None),
            max_target_positions=max_target_positions,
            residuals=False)
示例#14
0
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, left_pad=False, final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim
        self.ordinary_sinpos = args.ordinary_sinpos
        self.represent_length_by_lrpe = args.represent_length_by_lrpe
        self.represent_length_by_ldpe = args.represent_length_by_ldpe

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)

        self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False,
                                     uniform=False) if embed_dim != input_embed_dim else None

        self.embed_positions_original = PositionalEmbedding(
            args.max_target_positions, embed_dim, padding_idx,
            left_pad=left_pad,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings and self.ordinary_sinpos else None

        self.embed_positions_lrpe = PositionalEmbedding(
            args.max_target_positions, embed_dim, padding_idx,
            left_pad=left_pad,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings and self.represent_length_by_lrpe else None

        self.embed_positions_ldpe = PositionalEmbedding(
            args.max_target_positions, embed_dim, padding_idx,
            left_pad=left_pad,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings and self.represent_length_by_ldpe else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, output_embed_dim,
                              bias=False, uniform=False) if embed_dim != output_embed_dim else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary), output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
           self.layer_norm = LayerNorm(embed_dim)
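A minimal, self-contained sketch of the gating logic above: each of the three positional-embedding tables is built only when token positional embeddings are enabled and its corresponding length-representation flag is set. The flag names mirror the snippet; the stub factory is an illustrative assumption, not fairseq's API.

# Sketch only: stand-in flags and a stub embedding factory, not the actual module.
from dataclasses import dataclass

@dataclass
class Flags:
    no_token_positional_embeddings: bool = False
    ordinary_sinpos: bool = True
    represent_length_by_lrpe: bool = False
    represent_length_by_ldpe: bool = False

def build_positional_tables(flags, make_embedding):
    """Return (original, lrpe, ldpe); an entry is None when its flag is off."""
    use_pos = not flags.no_token_positional_embeddings
    original = make_embedding() if use_pos and flags.ordinary_sinpos else None
    lrpe = make_embedding() if use_pos and flags.represent_length_by_lrpe else None
    ldpe = make_embedding() if use_pos and flags.represent_length_by_ldpe else None
    return original, lrpe, ldpe

# With the defaults above only the ordinary sinusoidal table is built.
print(build_positional_tables(Flags(), make_embedding=lambda: "table"))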
示例#15
0
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--dropout',
                            type=float,
                            metavar='D',
                            help='dropout probability')
        parser.add_argument('--encoder-conv-channels',
                            type=str,
                            metavar='EXPR',
                            help='list of encoder convolution\'s out channels')
        parser.add_argument('--encoder-conv-kernel-sizes',
                            type=str,
                            metavar='EXPR',
                            help='list of encoder convolution\'s kernel sizes')
        parser.add_argument('--encoder-conv-strides',
                            type=str,
                            metavar='EXPR',
                            help='list of encoder convolution\'s strides')
        parser.add_argument('--encoder-rnn-hidden-size',
                            type=int,
                            metavar='N',
                            help='encoder rnn\'s hidden size')
        parser.add_argument('--encoder-rnn-layers',
                            type=int,
                            metavar='N',
                            help='number of rnn encoder layers')
        parser.add_argument(
            '--encoder-rnn-bidirectional',
            type=lambda x: options.eval_bool(x),
            help='make all rnn layers of encoder bidirectional')
        parser.add_argument(
            '--encoder-rnn-residual',
            type=lambda x: options.eval_bool(x),
            help='create residual connections for rnn encoder '
            'layers (starting from the 2nd layer), i.e., the actual '
            'output of such layer is the sum of its input and output')
        parser.add_argument('--decoder-embed-dim',
                            type=int,
                            metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-embed-path',
                            type=str,
                            metavar='STR',
                            help='path to pre-trained decoder embedding')
        parser.add_argument('--decoder-freeze-embed',
                            action='store_true',
                            help='freeze decoder embeddings')
        parser.add_argument('--decoder-hidden-size',
                            type=int,
                            metavar='N',
                            help='decoder hidden size')
        parser.add_argument('--decoder-layers',
                            type=int,
                            metavar='N',
                            help='number of decoder layers')
        parser.add_argument('--decoder-out-embed-dim',
                            type=int,
                            metavar='N',
                            help='decoder output embedding dimension')
        parser.add_argument(
            '--decoder-rnn-residual',
            type=lambda x: options.eval_bool(x),
            help='create residual connections for rnn decoder '
            'layers (starting from the 2nd layer), i.e., the actual '
            'output of such layer is the sum of its input and output')
        parser.add_argument('--attention-type',
                            type=str,
                            metavar='STR',
                            choices=['bahdanau', 'luong'],
                            help='attention type')
        parser.add_argument('--attention-dim',
                            type=int,
                            metavar='N',
                            help='attention dimension')
        parser.add_argument(
            '--need-attention',
            action='store_true',
            help='need to return attention tensor for the caller')
        parser.add_argument(
            '--adaptive-softmax-cutoff',
            metavar='EXPR',
            help='comma separated list of adaptive softmax cutoff points. '
            'Must be used with adaptive_loss criterion')
        parser.add_argument('--share-decoder-input-output-embed',
                            type=lambda x: options.eval_bool(x),
                            help='share decoder input and output embeddings')
        parser.add_argument(
            '--pretrained-lm-checkpoint',
            type=str,
            metavar='STR',
            help='path to load checkpoint from pretrained language model (LM), '
            'which will be loaded and kept fixed during training.')

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument(
            '--encoder-rnn-dropout-in',
            type=float,
            metavar='D',
            help='dropout probability for encoder rnn\'s input')
        parser.add_argument(
            '--encoder-rnn-dropout-out',
            type=float,
            metavar='D',
            help='dropout probability for encoder rnn\'s output')
        parser.add_argument(
            '--decoder-dropout-in',
            type=float,
            metavar='D',
            help='dropout probability for decoder input embedding')
        parser.add_argument('--decoder-dropout-out',
                            type=float,
                            metavar='D',
                            help='dropout probability for decoder output')

        # Scheduled sampling options
        parser.add_argument(
            '--scheduled-sampling-probs',
            type=lambda p: options.eval_str_list(p),
            metavar='P_1,P_2,...,P_N',
            default=[1.0],
            help='scheduled sampling probabilities of sampling the truth '
            'labels for N epochs starting from --start-scheduled-sampling-epoch; '
            'all later epochs use P_N')
        parser.add_argument(
            '--start-scheduled-sampling-epoch',
            type=int,
            metavar='N',
            default=1,
            help='start scheduled sampling from the specified epoch')
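A short, hedged illustration of the converter pattern these options rely on, using plain argparse; eval_bool_ and eval_str_list_ below are local stand-ins for fairseq's options helpers, written to show the intended behavior rather than to reproduce fairseq's implementation.

# Sketch only: local stand-ins for options.eval_bool / options.eval_str_list.
import argparse
import ast

def eval_bool_(x):
    return bool(ast.literal_eval(str(x).capitalize()))

def eval_str_list_(x, type=float):
    return [type(v) for v in ast.literal_eval("[" + str(x) + "]")]

parser = argparse.ArgumentParser()
parser.add_argument("--encoder-rnn-bidirectional", type=eval_bool_, default=False)
parser.add_argument("--scheduled-sampling-probs", type=eval_str_list_, default=[1.0])

args = parser.parse_args(["--encoder-rnn-bidirectional", "True",
                          "--scheduled-sampling-probs", "0.9,0.8,0.7"])
print(args.encoder_rnn_bidirectional, args.scheduled_sampling_probs)
# -> True [0.9, 0.8, 0.7]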
示例#16
0
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        self.args = args
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        self._future_mask = torch.empty(0)

        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__)
        self.decoder_layerdrop = args.decoder_layerdrop
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.embed_dim = embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(
            embed_dim)

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)

        self.embed_positions = (PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            self.padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None)

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        self.cross_self_attention = getattr(args, "cross_self_attention",
                                            False)

        if self.decoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_decoder_layer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])
        self.num_layers = len(self.layers)

        if args.decoder_normalize_before and not getattr(
                args, "no_decoder_final_norm", False):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None

        self.project_out_dim = (Linear(
            embed_dim, self.output_embed_dim, bias=False)
                                if embed_dim != self.output_embed_dim
                                and not args.tie_adaptive_weights else None)

        self.adaptive_softmax = None
        self.output_projection = None
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif self.share_input_output_embed:
            self.output_projection = nn.Linear(
                self.embed_tokens.weight.shape[1],
                self.embed_tokens.weight.shape[0],
                bias=False,
            )
            self.output_projection.weight = self.embed_tokens.weight
        else:
            self.output_projection = nn.Linear(self.output_embed_dim,
                                               len(dictionary),
                                               bias=False)
            nn.init.normal_(self.output_projection.weight,
                            mean=0,
                            std=self.output_embed_dim**-0.5)
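The three-way branch at the end of this constructor (adaptive softmax, tied output projection, or a freshly initialized linear layer) can be distilled into a stripped-down sketch using plain torch; the adaptive-softmax branch is omitted here and the helper name is illustrative.

# Sketch only: the tied vs. untied output-projection choice, without adaptive softmax.
import torch
import torch.nn as nn

def build_output_projection(embed_tokens, output_embed_dim, share_input_output_embed):
    vocab_size, input_dim = embed_tokens.weight.shape
    if share_input_output_embed:
        # Tie the output projection to the input embedding matrix.
        proj = nn.Linear(input_dim, vocab_size, bias=False)
        proj.weight = embed_tokens.weight
    else:
        proj = nn.Linear(output_embed_dim, vocab_size, bias=False)
        nn.init.normal_(proj.weight, mean=0, std=output_embed_dim ** -0.5)
    return proj

embed = nn.Embedding(1000, 512, padding_idx=1)
tied = build_output_projection(embed, 512, share_input_output_embed=True)
assert tied.weight.data_ptr() == embed.weight.data_ptr()  # same underlying storage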
示例#17
0
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument("--dropout",
                            type=float,
                            metavar="D",
                            help="dropout probability")
        parser.add_argument("--encoder-conv-channels",
                            type=str,
                            metavar="EXPR",
                            help="list of encoder convolution\'s out channels")
        parser.add_argument("--encoder-conv-kernel-sizes",
                            type=str,
                            metavar="EXPR",
                            help="list of encoder convolution\'s kernel sizes")
        parser.add_argument("--encoder-conv-strides",
                            type=str,
                            metavar="EXPR",
                            help="list of encoder convolution\'s strides")
        parser.add_argument("--encoder-rnn-hidden-size",
                            type=int,
                            metavar="N",
                            help="encoder rnn\'s hidden size")
        parser.add_argument("--encoder-rnn-layers",
                            type=int,
                            metavar="N",
                            help="number of rnn encoder layers")
        parser.add_argument(
            "--encoder-rnn-bidirectional",
            type=lambda x: options.eval_bool(x),
            help="make all rnn layers of encoder bidirectional")
        parser.add_argument(
            "--encoder-rnn-residual",
            type=lambda x: options.eval_bool(x),
            help="create residual connections for rnn encoder "
            "layers (starting from the 2nd layer), i.e., the actual "
            "output of such layer is the sum of its input and output")
        parser.add_argument("--decoder-embed-dim",
                            type=int,
                            metavar="N",
                            help="decoder embedding dimension")
        parser.add_argument("--decoder-embed-path",
                            type=str,
                            metavar="STR",
                            help="path to pre-trained decoder embedding")
        parser.add_argument("--decoder-freeze-embed",
                            action="store_true",
                            help="freeze decoder embeddings")
        parser.add_argument("--decoder-hidden-size",
                            type=int,
                            metavar="N",
                            help="decoder hidden size")
        parser.add_argument("--decoder-layers",
                            type=int,
                            metavar="N",
                            help="number of decoder layers")
        parser.add_argument("--decoder-out-embed-dim",
                            type=int,
                            metavar="N",
                            help="decoder output embedding dimension")
        parser.add_argument(
            "--decoder-rnn-residual",
            type=lambda x: options.eval_bool(x),
            help="create residual connections for rnn decoder "
            "layers (starting from the 2nd layer), i.e., the actual "
            "output of such layer is the sum of its input and output")
        parser.add_argument("--attention-type",
                            type=str,
                            metavar="STR",
                            choices=["bahdanau", "luong"],
                            help="attention type")
        parser.add_argument("--attention-dim",
                            type=int,
                            metavar="N",
                            help="attention dimension")
        parser.add_argument(
            "--need-attention",
            action="store_true",
            help="need to return attention tensor for the caller")
        parser.add_argument(
            "--adaptive-softmax-cutoff",
            metavar="EXPR",
            help="comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion")
        parser.add_argument("--share-decoder-input-output-embed",
                            type=lambda x: options.eval_bool(x),
                            help="share decoder input and output embeddings")
        parser.add_argument(
            "--pretrained-lm-checkpoint",
            type=str,
            metavar="STR",
            help="path to load checkpoint from pretrained language model(LM), "
            "which will be present and kept fixed during training.")

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument(
            "--encoder-rnn-dropout-in",
            type=float,
            metavar="D",
            help="dropout probability for encoder rnn\'s input")
        parser.add_argument(
            "--encoder-rnn-dropout-out",
            type=float,
            metavar="D",
            help="dropout probability for encoder rnn\'s output")
        parser.add_argument(
            "--decoder-dropout-in",
            type=float,
            metavar="D",
            help="dropout probability for decoder input embedding")
        parser.add_argument("--decoder-dropout-out",
                            type=float,
                            metavar="D",
                            help="dropout probability for decoder output")

        # Scheduled sampling options
        parser.add_argument(
            "--scheduled-sampling-probs",
            type=lambda p: options.eval_str_list(p),
            metavar="P_1,P_2,...,P_N",
            default=[1.0],
            help="scheduled sampling probabilities of sampling the truth "
            "labels for N epochs starting from --start-schedule-sampling-epoch; "
            "all later epochs using P_N")
        parser.add_argument(
            "--start-scheduled-sampling-epoch",
            type=int,
            metavar="N",
            default=1,
            help="start scheduled sampling from the specified epoch")
示例#18
0
    def __init__(self, args, src_dict, dst_dict, embed_tokens):
        super().__init__(dst_dict)
        self.dropout = args.dropout
        self.decoder_layerdrop = 0
        if hasattr(args, "decoder_layerdrop") and args.decoder_layerdrop > 0:
            self.decoder_layerdrop = args.decoder_layerdrop

        self.share_input_output_embed = args.share_decoder_input_output_embed

        embed_dim = embed_tokens.embedding_dim
        padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

        self.aan = args.aan
        decoder_layer_class = (AANDecoderLayer if self.aan else
                               fairseq_transformer.TransformerDecoderLayer)

        self.layers = nn.ModuleList([])
        self.layers.extend(
            [decoder_layer_class(args) for i in range(args.decoder_layers)])
        if hasattr(args,
                   "decoder_layers_to_keep") and args.decoder_layers_to_keep:
            layers_to_keep = sorted(
                int(x) for x in args.decoder_layers_to_keep.split(","))
            self.decoder_layers_to_keep = {
                layer_id: layer_idx
                for layer_idx, layer_id in enumerate(layers_to_keep)
            }

        self.adaptive_softmax = None

        self.bottleneck_layer = None
        out_embed_dim = embed_dim
        if args.decoder_out_embed_dim is not None:
            assert (
                not args.share_all_embeddings
                and not args.share_decoder_input_output_embed
            ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
            self.bottleneck_layer = fairseq_transformer.Linear(
                embed_dim, args.decoder_out_embed_dim)
            out_embed_dim = args.decoder_out_embed_dim

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dst_dict),
                out_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.dropout,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dst_dict), out_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)

        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            assert (
                self.adaptive_softmax is None
            ), "vocabulary reduction not compatible with adaptive softmax!"
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict,
                dst_dict,
                args.vocab_reduction_params,
                fp16=args.fp16)

        self.onnx_trace = False

        # Use quantizable nn.Linear for output projection instead of F.linear
        self.output_projection = None
        if self.vocab_reduction_module is None:
            if self.share_input_output_embed:
                self.output_projection = nn.Linear(
                    self.embed_tokens.weight.shape[1],
                    self.embed_tokens.weight.shape[0])
                self.output_projection.weight = self.embed_tokens.weight
            else:
                self.output_projection = nn.Linear(self.embed_out.shape[1],
                                                   self.embed_out.shape[0])
                self.output_projection.weight = self.embed_out
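The decoder_layers_to_keep handling above builds a remap from kept layer ids to compact indices; a self-contained sketch of that mapping follows (the helper name is illustrative).

# Sketch only: keep the listed layer ids and map each to its new, compact index.
def build_layer_keep_map(decoder_layers_to_keep):
    layers_to_keep = sorted(int(x) for x in decoder_layers_to_keep.split(","))
    return {layer_id: layer_idx for layer_idx, layer_id in enumerate(layers_to_keep)}

print(build_layer_keep_map("0,2,5"))  # -> {0: 0, 2: 1, 5: 2}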
示例#19
0
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--activation-fn',
                            choices=utils.get_available_activation_fns(),
                            help='activation function to use')
        parser.add_argument('--dropout',
                            type=float,
                            metavar='D',
                            help='dropout probability')
        parser.add_argument('--attention-dropout',
                            type=float,
                            metavar='D',
                            help='dropout probability for attention weights')
        parser.add_argument(
            '--activation-dropout',
            '--relu-dropout',
            type=float,
            metavar='D',
            help='dropout probability after activation in FFN.')
        parser.add_argument('--encoder-embed-path',
                            type=str,
                            metavar='STR',
                            help='path to pre-trained encoder embedding')
        parser.add_argument('--encoder-embed-dim',
                            type=int,
                            metavar='N',
                            help='encoder embedding dimension')
        parser.add_argument('--encoder-ffn-embed-dim',
                            type=int,
                            metavar='N',
                            help='encoder embedding dimension for FFN')
        parser.add_argument('--encoder-layers',
                            type=int,
                            metavar='N',
                            help='num encoder layers')
        parser.add_argument('--encoder-attention-heads',
                            type=int,
                            metavar='N',
                            help='num encoder attention heads')
        parser.add_argument('--encoder-normalize-before',
                            action='store_true',
                            help='apply layernorm before each encoder block')
        parser.add_argument('--decoder-final-norm',
                            default=False,
                            action='store_true',
                            help='apply a final layernorm after the last decoder block')
        parser.add_argument(
            '--encoder-learned-pos',
            action='store_true',
            help='use learned positional embeddings in the encoder')
        parser.add_argument('--decoder-embed-path',
                            type=str,
                            metavar='STR',
                            help='path to pre-trained decoder embedding')
        parser.add_argument('--decoder-embed-dim',
                            type=int,
                            metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-ffn-embed-dim',
                            type=int,
                            metavar='N',
                            help='decoder embedding dimension for FFN')
        parser.add_argument('--decoder-layers',
                            type=int,
                            metavar='N',
                            help='num decoder layers')
        parser.add_argument('--decoder-attention-heads',
                            type=int,
                            metavar='N',
                            help='num decoder attention heads')
        parser.add_argument(
            '--decoder-learned-pos',
            action='store_true',
            help='use learned positional embeddings in the decoder')
        parser.add_argument('--decoder-normalize-before',
                            action='store_true',
                            help='apply layernorm before each decoder block')
        parser.add_argument('--share-decoder-input-output-embed',
                            action='store_true',
                            help='share decoder input and output embeddings')
        parser.add_argument('--share-all-embeddings',
                            action='store_true',
                            help='share encoder, decoder and output embeddings'
                            ' (requires shared dictionary and embed dim)')
        parser.add_argument(
            '--no-token-positional-embeddings',
            default=False,
            action='store_true',
            help=
            'if set, disables positional embeddings (outside self attention)')
        parser.add_argument(
            '--adaptive-softmax-cutoff',
            metavar='EXPR',
            help='comma separated list of adaptive softmax cutoff points. '
            'Must be used with adaptive_loss criterion')
        parser.add_argument(
            '--adaptive-softmax-dropout',
            type=float,
            metavar='D',
            help='sets adaptive softmax dropout for the tail projections')

        parser.add_argument('--use_att',
                            type=str,
                            nargs='+',
                            default=[
                                'es',
                                'ds',
                                'dc',
                            ],
                            help='')
        parser.add_argument('--combine',
                            type=int,
                            default=0,
                            help='0 as usual  1 combine residual')
        parser.add_argument('--kernel_size',
                            type=int,
                            default=0,
                            help='do not set static kernel')
        parser.add_argument(
            '--attn_dynamic_type',
            type=int,
            default=0,
            help='0: disabled; 1: use static kernel (k>0) or depth kernel (k==0); '
            '2: use dynamic kernel')
        parser.add_argument('--attn_cat_relu', type=int, default=0)
        parser.add_argument(
            '--attn_wide_kernels',
            type=lambda x: options.eval_str_list(x, int),
            help='list of kernel size (default: "[3,15]") for wide and gate')
        parser.add_argument('--weight-dropout',
                            type=float,
                            metavar='D',
                            help='dropout probability for conv weights')
        parser.add_argument('--dynamic_gate', type=int, default=1, help='0,1')
        parser.add_argument(
            '--dynamic_depth_kernels',
            type=lambda x: options.eval_str_list(x, int),
            help='list of kernel sizes (default: "[3,3,3,7,7,7,7,7,7,15,15,15]") '
            'for ffn or attn')
        parser.add_argument('--dynamic_padding',
                            type=int,
                            default=0,
                            help='padding before dynamic conv')
        parser.add_argument('--attn_dynamic_cat', type=int, default=1)
        parser.add_argument('--bm',
                            type=int,
                            default=0,
                            help='whether to use transformer_bm')
        parser.add_argument('--bm_in_a',
                            type=float,
                            default=3,
                            help='sqrt(6/(1+a)),-1 for xavier')
        parser.add_argument('--bm_out_a',
                            type=float,
                            default=0,
                            help='sqrt(6/(1+a)), -1 for xavier')
        parser.add_argument('--bm_fc3', type=float, default=1, help='')
        parser.add_argument('--bm_fc4', type=float, default=1, help='')
        parser.add_argument('--input_dropout', type=float, default=0, help='')
        parser.add_argument('--init_method',
                            type=str,
                            default='km',
                            help='xavier,km,xi,fixup')
        parser.add_argument('--lnv',
                            type=str,
                            default='origin',
                            help='layernorm,adanorm')
示例#20
0
	def build_model(cls, args, task):
		"""Build a new model instance."""
		# make sure that all args are properly defaulted (in case there are any new ones)
		base_architecture(args)
		if args.encoder_layers_to_keep:
			args.encoder_layers = len(args.encoder_layers_to_keep.split(","))

	
		max_source_positions = getattr(args, 'max_source_positions', DEFAULT_MAX_SOURCE_POSITIONS)
		max_target_positions = getattr(args, 'max_target_positions', DEFAULT_MAX_TARGET_POSITIONS)

		def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
			num_embeddings = len(dictionary)
			padding_idx = dictionary.pad()
			embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
			embed_dict = utils.parse_embedding(embed_path)
			utils.print_embed_overlap(embed_dict, dictionary)
			return utils.load_embedding(embed_dict, dictionary, embed_tokens)
		'''

		if args.encoder_embed_path:
			pretrained_encoder_embed = load_pretrained_embedding_from_file(
				args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim)
		else:
			num_embeddings = len(task.source_dictionary)
			pretrained_encoder_embed = Embedding(
				num_embeddings, args.encoder_embed_dim, task.source_dictionary.pad()
			)

		if args.share_all_embeddings:
			# double check all parameters combinations are valid
			if task.source_dictionary != task.target_dictionary:
				raise ValueError('--share-all-embeddings requires a joint dictionary')
			if args.decoder_embed_path and (
					args.decoder_embed_path != args.encoder_embed_path):
				raise ValueError(
					'--share-all-embed not compatible with --decoder-embed-path'
				)
			if args.encoder_embed_dim != args.decoder_embed_dim:
				raise ValueError(
					'--share-all-embeddings requires --encoder-embed-dim to '
					'match --decoder-embed-dim'
				)
			pretrained_decoder_embed = pretrained_encoder_embed
			args.share_decoder_input_output_embed = True
		else:
			# separate decoder input embeddings
			pretrained_decoder_embed = None
			if args.decoder_embed_path:
				pretrained_decoder_embed = load_pretrained_embedding_from_file(
					args.decoder_embed_path,
					task.target_dictionary,
					args.decoder_embed_dim
				)
		# one last double check of parameter combinations
		if args.share_decoder_input_output_embed and (
				args.decoder_embed_dim != args.decoder_out_embed_dim):
			raise ValueError(
				'--share-decoder-input-output-embeddings requires '
				'--decoder-embed-dim to match --decoder-out-embed-dim'
			)

		if args.encoder_freeze_embed:
			pretrained_encoder_embed.weight.requires_grad = False
		if args.decoder_freeze_embed:
			pretrained_decoder_embed.weight.requires_grad = False
		'''
		encoder = TransformerEncoder(args, task.source_dictionary, args.word_encoder_embed_dim, args.encoder_embed_dim)
		decoder = LSTMDecoder(
			dictionary=task.target_dictionary,
			embed_dim=args.decoder_embed_dim,
			hidden_size=args.decoder_hidden_size,
			out_embed_dim=args.decoder_out_embed_dim,
			num_layers=args.decoder_layers,
			dropout_in=args.decoder_dropout_in,
			dropout_out=args.decoder_dropout_out,
			attention=options.eval_bool(args.decoder_attention),
			encoder_output_units=encoder.output_units,
			pretrained_embed=None,
			share_input_output_embed=args.share_decoder_input_output_embed,
			adaptive_softmax_cutoff=(
				options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
				if args.criterion == 'adaptive_loss' else None
			),
			max_target_positions=max_target_positions
		)
		return cls(args, encoder, decoder)
示例#21
0
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 no_encoder_attn=False,
                 final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = Linear(
            input_embed_dim, embed_dim,
            bias=False) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            LightConvDecoderLayer(args,
                                  no_encoder_attn,
                                  kernel_size=args.decoder_kernel_size_list[i])
            for i in range(args.decoder_layers)
        ])
        self.decoder_dynamic_combination = args.decoder_dynamic_combination
        self.decoder_linear_combination = args.decoder_linear_combination
        assert not (self.decoder_dynamic_combination
                    and self.decoder_linear_combination)
        if self.decoder_linear_combination or self.decoder_dynamic_combination:
            self.weight_ffn = nn.Sequential(
                nn.Linear(embed_dim, args.decoder_ffn_embed_dim),
                nn.ReLU(),
                nn.Linear(args.decoder_ffn_embed_dim, embed_dim),
            )
        if self.decoder_dynamic_combination:
            self.proj = nn.ModuleList([
                nn.Sequential(
                    nn.Linear(embed_dim * args.decoder_layers, embed_dim * 2),
                    nn.ReLU(), nn.Linear(embed_dim * 2, embed_dim))
                for _ in range(args.decoder_layers)
            ])
        if self.decoder_linear_combination:
            self.weights = nn.ParameterList([
                nn.Parameter(torch.randn(1, 1, embed_dim), requires_grad=True)
                for _ in range(args.decoder_layers)
            ])
        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \
            if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)
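The decoder_linear_combination branch above registers one learned weight vector per layer; the forward pass is not shown in the snippet, so the combination rule below (scale each layer's output and sum) is an assumption sketched with plain torch, and the class name is illustrative.

# Sketch only: one plausible use of per-layer weights to combine decoder layer outputs.
import torch
import torch.nn as nn

class LinearLayerCombination(nn.Module):
    def __init__(self, num_layers, embed_dim):
        super().__init__()
        self.weights = nn.ParameterList([
            nn.Parameter(torch.randn(1, 1, embed_dim)) for _ in range(num_layers)
        ])

    def forward(self, layer_outputs):
        # layer_outputs: list of tensors shaped (seq_len, batch, embed_dim)
        return sum(w * x for w, x in zip(self.weights, layer_outputs))

combiner = LinearLayerCombination(num_layers=3, embed_dim=8)
outs = [torch.randn(5, 2, 8) for _ in range(3)]
print(combiner(outs).shape)  # torch.Size([5, 2, 8])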
示例#22
0
    def __init__(self, args, dictionary, embed_tokens, lang2idx2idx, M, N, no_encoder_attn=False, final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout


        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.output_embed_dim = args.decoder_output_dim

        # define a dict of lang vocab id to its index in syntactic matrix
        self.lang2idx2idx = torch.LongTensor(lang2idx2idx)

        # define semantic and syntactic matrices
        no_langs = len([i for i in self.lang2idx2idx if i > -1])

        self.M = M
        self.N = N

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions, embed_dim, padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \
            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)
示例#23
0
def parse_args_and_adversary(parser, input_args=None):
    """This does the same thing as fairseq.options.parse_args_and_arch
    but for the criterion and adversary only"""
    # The parser doesn't know about adversary/criterion-specific args, so
    # we parse twice. First we parse the adversary/criterion, then we
    # parse a second time after adding the *-specific arguments.
    # If input_args is given, we will parse those args instead of sys.argv.
    args, _ = parser.parse_known_args(input_args)

    # Add model-specific args to parser.
    if hasattr(args, "arch"):
        model_specific_group = parser.add_argument_group(
            "Model-specific configuration",
            # Only include attributes which are explicitly given as command-line
            # arguments or which have default values.
            argument_default=argparse.SUPPRESS,
        )
        ARCH_MODEL_REGISTRY[args.arch].add_args(model_specific_group)

    # Add adversary-specific args to parser.
    adversary_specific_group = parser.add_argument_group(
        f'Arguments for adversary "{args.adversary}"',
        # Only include attributes which are explicitly given as command-line
        # arguments or which have default values.
        argument_default=argparse.SUPPRESS,
    )
    ADVERSARY_REGISTRY[args.adversary].add_args(adversary_specific_group)

    # Add adversarial criterion-specific args to parser.
    adv_criterion_specific_group = parser.add_argument_group(
        f'Arguments for criterion "{args.adv_criterion}"',
        # Only include attributes which are explicitly given as command-line
        # arguments or which have default values.
        argument_default=argparse.SUPPRESS,
    )
    CRITERION_REGISTRY[args.adv_criterion].add_args(
        adv_criterion_specific_group)

    if hasattr(args, "criterion"):
        # Add criterion-specific args to parser.
        criterion_specific_group = parser.add_argument_group(
            f'Arguments for criterion "{args.criterion}"',
            # Only include attributes which are explicitly given as command-line
            # arguments or which have default values.
            argument_default=argparse.SUPPRESS,
        )
        CRITERION_REGISTRY[args.criterion].add_args(criterion_specific_group)

    # Add other *-specific args to parser.
    if hasattr(args, "optimizer"):
        OPTIMIZER_REGISTRY[args.optimizer].add_args(parser)
    if hasattr(args, "lr_scheduler"):
        LR_SCHEDULER_REGISTRY[args.lr_scheduler].add_args(parser)
    if hasattr(args, "task"):
        TASK_REGISTRY[args.task].add_args(parser)

    # Parse a second time.
    args = parser.parse_args(input_args)

    # Post-process args.
    if hasattr(args, "lr"):
        args.lr = eval_str_list(args.lr, type=float)
    if hasattr(args, "update_freq"):
        args.update_freq = eval_str_list(args.update_freq, type=int)
    if hasattr(args,
               "max_sentences_valid") and args.max_sentences_valid is None:
        args.max_sentences_valid = args.max_sentences
    # The following line is a hack to be able to use the cross_entropy
    # criterion without polluting the command line with unnecessary arguments
    if not hasattr(args, "sentence_avg"):
        args.sentence_avg = False
    # this is another hack to ignore the multilingual case
    if not hasattr(args, "multiling_source_lang"):
        args.multiling_source_lang = None

    # Apply architecture configuration.
    if hasattr(args, "arch"):
        ARCH_CONFIG_REGISTRY[args.arch](args)

    return args
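The function above relies on argparse's two-pass pattern: parse_known_args first to discover which components were requested, then add each component's own argument group (with argument_default=argparse.SUPPRESS so unset options do not shadow downstream defaults) and parse again. A self-contained sketch with a toy registry, not fairseq's actual registries:

# Sketch only: toy adversary registry and component; names here are illustrative.
import argparse

class Fgsm:
    @staticmethod
    def add_args(group):
        group.add_argument("--epsilon", type=float, help="perturbation size")

ADVERSARY_REGISTRY_ = {"fgsm": Fgsm}

cli = ["--adversary", "fgsm", "--epsilon", "0.5"]
parser = argparse.ArgumentParser()
parser.add_argument("--adversary", default="fgsm")

# First pass: only discover which adversary was requested; --epsilon is unknown here.
args, _ = parser.parse_known_args(cli)

# Second pass: add that adversary's own arguments, then parse everything.
group = parser.add_argument_group(
    f'Arguments for adversary "{args.adversary}"',
    argument_default=argparse.SUPPRESS,
)
ADVERSARY_REGISTRY_[args.adversary].add_args(group)
args = parser.parse_args(cli)
print(args.epsilon)  # 0.5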
示例#24
0
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present in older models
        base_lm_architecture(args)

        if args.decoder_layers_to_keep:
            args.decoder_layers = len(args.decoder_layers_to_keep.split(","))

        if getattr(args, "max_target_positions", None) is None:
            args.max_target_positions = getattr(args, "tokens_per_sample",
                                                DEFAULT_MAX_TARGET_POSITIONS)

        if args.character_embeddings:
            embed_tokens = CharacterTokenEmbedder(
                task.source_dictionary,
                eval(args.character_filters),
                args.character_embedding_dim,
                args.decoder_embed_dim,
                args.char_embedder_highway_layers,
            )
        elif args.adaptive_input:
            embed_tokens = AdaptiveInput(
                len(task.source_dictionary),
                task.source_dictionary.pad(),
                args.decoder_input_dim,
                args.adaptive_input_factor,
                args.decoder_embed_dim,
                options.eval_str_list(args.adaptive_input_cutoff, type=int),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            embed_tokens = cls.build_embedding(args, task.source_dictionary,
                                               args.decoder_input_dim)

        if args.tie_adaptive_weights:
            assert args.adaptive_input
            assert args.adaptive_input_factor == args.adaptive_softmax_factor
            assert (args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
                    ), "{} != {}".format(args.adaptive_softmax_cutoff,
                                         args.adaptive_input_cutoff)
            assert args.decoder_input_dim == args.decoder_output_dim

        decoder = TransformerDecoder(args,
                                     task.target_dictionary,
                                     embed_tokens,
                                     no_encoder_attn=True)

        if getattr(args, "lm_path", None):
            print('load Transformer_LM from {}'.format(args.lm_path))
            state = checkpoint_utils.load_checkpoint_to_cpu(args.lm_path)
            lm_args = state["args"]
            lm_args.data = args.data
            assert getattr(lm_args, "lm_path", None) is None

            task = tasks.setup_task(lm_args)
            decoder = task.build_model(lm_args)
            print('restore Transformer_LM from {}'.format(args.lm_path))
            decoder.load_state_dict(state["model"], strict=True)
        decoder.dim_output = len(task.dictionary)

        return cls(decoder)
示例#25
0
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, left_pad=False, final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed
        self.dictionary = dictionary

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim

        self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False,
                                     uniform=False) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions, embed_dim, padding_idx,
            left_pad=left_pad,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, output_embed_dim,
                              bias=False, uniform=False) if embed_dim != output_embed_dim else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary), output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
           self.layer_norm = LayerNorm(embed_dim)

        self.head_nums = args.decoder_attention_heads
        # complementary
        self.head_dim = embed_dim // args.decoder_attention_heads
        # self.head_dim = embed_dim
        self.attn_out = Linear(self.head_dim, self.head_dim)
        self.re_fc_1 = Linear(self.head_dim, self.head_dim)
        self.re_fc_2 = Linear(self.head_dim, self.head_dim)
        # self.re_fc_1 = Linear(self.head_dim, 512)
        # self.re_fc_2 = Linear(512, self.head_dim)
        self.re_layer_norm_1 = LayerNorm(self.head_dim)
        self.re_layer_norm_2 = LayerNorm(self.head_dim)
        self.re_embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.head_dim))
        nn.init.normal_(self.re_embed_out, mean=0, std=self.head_dim ** -0.5)
示例#26
0
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 no_encoder_attn=False,
                 final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = Linear(
            input_embed_dim, embed_dim,
            bias=False) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            transformer_with_copyDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.copy_attention = MultiheadOnlyAttention(
            embed_dim,
            1,
            dropout=args.attention_dropout,
        )
        self.copy_or_generate = nn.Sequential(nn.Linear(embed_dim, 1),
                                              nn.Sigmoid())

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \
            if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)
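A gate like copy_or_generate above is typically used for pointer-generator style mixing of a copy distribution (attention projected onto the vocabulary) with the generation distribution. The snippet does not show its forward pass, so the mixing rule below is an assumption sketched with plain torch, not the author's exact method.

# Sketch only: mix a copy distribution with a generation distribution via a sigmoid gate.
import torch

def mix_copy_and_generate(gen_probs, copy_attn, src_tokens, copy_gate):
    """gen_probs:  (batch, tgt_len, vocab)   softmax over the vocabulary
       copy_attn:  (batch, tgt_len, src_len) attention over source positions
       src_tokens: (batch, src_len)          source token ids
       copy_gate:  (batch, tgt_len, 1)       sigmoid gate in [0, 1]"""
    copy_probs = torch.zeros_like(gen_probs)
    index = src_tokens.unsqueeze(1).expand(-1, gen_probs.size(1), -1)
    copy_probs.scatter_add_(2, index, copy_attn)  # project attention onto the vocab
    return copy_gate * copy_probs + (1.0 - copy_gate) * gen_probs

gen = torch.softmax(torch.randn(2, 3, 10), dim=-1)
attn = torch.softmax(torch.randn(2, 3, 4), dim=-1)
src = torch.randint(0, 10, (2, 4))
gate = torch.sigmoid(torch.randn(2, 3, 1))
print(mix_copy_and_generate(gen, attn, src, gate).sum(-1))  # each entry ~1.0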
示例#27
0
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 embed_scale=None,
                 no_encoder_attn=False,
                 left_pad=False,
                 final_norm=True,
                 remove_head=False):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        self.embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            self.embed_dim) if embed_scale is None else embed_scale

        self.project_in_dim = nn.Linear(
            input_embed_dim, self.embed_dim,
            bias=False) if self.embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            self.embed_dim,
            self.padding_idx,
            # learned=args.decoder_learned_pos,
        ) if not args.no_dec_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = nn.Linear(self.embed_dim, output_embed_dim, bias=False) \
            if self.embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

        # self.load_softmax = not getattr(args, 'remove_head', False)
        self.load_softmax = not remove_head

        if self.load_softmax:
            if args.adaptive_softmax_cutoff is not None:
                self.adaptive_softmax = AdaptiveSoftmax(
                    len(dictionary),
                    output_embed_dim,
                    options.eval_str_list(args.adaptive_softmax_cutoff,
                                          type=int),
                    dropout=args.adaptive_softmax_dropout,
                    adaptive_inputs=embed_tokens
                    if args.tie_adaptive_weights else None,
                    factor=args.adaptive_softmax_factor,
                    tie_proj=args.tie_adaptive_proj,
                )
            elif not self.share_input_output_embed:
                self.embed_out = nn.Parameter(
                    torch.Tensor(len(dictionary), output_embed_dim))
                # nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = BertLayerNorm(self.embed_dim)
示例#28
0
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 problinkinput,
                 problinkweight,
                 no_encoder_attn=False,
                 left_pad=False,
                 final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        self.problinkinput = problinkinput
        self.problinkweight = problinkweight

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = Linear(
            input_embed_dim, embed_dim,
            bias=False) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            padding_idx,
            left_pad=left_pad,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerBayesDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.layers_cls = nn.ModuleList([])
        self.layers_cls.extend([
            DACls.build_bayesclassifier(args, args.decoder_embed_dim)
            for i in range(args.decoder_layers)
        ])
        self.encoder_out_cls = DACls.build_bayesclassifier(
            args, args.encoder_embed_dim)

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \
            if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)
Example #29
    def __init__(
        self,
        args,
        src_dict,
        dst_dict,
        embed_tokens,
        no_encoder_attn=False,
        left_pad=False,
        final_norm=True,
    ):
        super().__init__(dst_dict)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)

        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerAANDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.bottleneck_layer = None
        out_embed_dim = embed_dim
        if args.decoder_out_embed_dim is not None:
            assert (
                not args.share_all_embeddings
                and not args.share_decoder_input_output_embed
            ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
            self.bottleneck_layer = Linear(embed_dim,
                                           args.decoder_out_embed_dim)
            out_embed_dim = args.decoder_out_embed_dim

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dst_dict),
                out_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dst_dict), out_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)
        self.register_buffer("version", torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)

        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            assert (
                self.adaptive_softmax is None
            ), "vocabulary reduction not compatible with adaptive softmax!"
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict,
                dst_dict,
                args.vocab_reduction_params,
                fp16=args.fp16)

        self.onnx_trace = False
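Example #29 is notable for its optional bottleneck: when --decoder-out-embed-dim is given, decoder states are first projected down to that size, so the output embedding (and softmax) matrix shrinks from |V| x embed_dim to |V| x out_embed_dim. A self-contained sketch of that projection path, with illustrative names and sizes:

import torch
import torch.nn as nn
import torch.nn.functional as F

class BottleneckOutput(nn.Module):
    """Optional down-projection before the output embedding, as in Example #29."""
    def __init__(self, embed_dim, vocab_size, out_embed_dim=None):
        super().__init__()
        self.bottleneck = nn.Linear(embed_dim, out_embed_dim) if out_embed_dim else None
        dim = out_embed_dim or embed_dim
        self.embed_out = nn.Parameter(torch.empty(vocab_size, dim))
        nn.init.normal_(self.embed_out, mean=0, std=dim ** -0.5)

    def forward(self, x):
        if self.bottleneck is not None:
            x = self.bottleneck(x)  # (B, T, embed_dim) -> (B, T, out_embed_dim)
        return F.linear(x, self.embed_out)  # (B, T, vocab_size)

logits = BottleneckOutput(512, vocab_size=32000, out_embed_dim=128)(torch.randn(2, 7, 512))
print(logits.shape)  # torch.Size([2, 7, 32000])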
Example #30
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([3]))

        self.dropout = args.dropout
        self.decoder_layerdrop = args.decoder_layerdrop
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(
            embed_dim)

        self.project_in_dim = Linear(
            input_embed_dim, embed_dim,
            bias=False) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            self.padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.cross_self_attention = getattr(args, 'cross_self_attention',
                                            False)
        self.layer_wise_attention = getattr(args, 'layer_wise_attention',
                                            False)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \
            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), self.output_embed_dim))
            nn.init.normal_(self.embed_out,
                            mean=0,
                            std=self.output_embed_dim**-0.5)

        if args.decoder_normalize_before and not getattr(
                args, 'no_decoder_final_norm', False):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
        if getattr(args, 'layernorm_embedding', False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None
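Example #30 stores args.decoder_layerdrop, i.e. the probability of skipping an entire decoder layer during training (LayerDrop). The snippet only records the value; a minimal, self-contained sketch of what applying it looks like is below (the LayerDropList name and the probability are assumptions for illustration):

import torch
import torch.nn as nn

class LayerDropList(nn.Module):
    """Run a stack of layers, randomly skipping each one while training."""
    def __init__(self, layers, p=0.2):
        super().__init__()
        self.layers = nn.ModuleList(layers)
        self.p = p

    def forward(self, x):
        for layer in self.layers:
            # skip this layer with probability p during training; always run it at eval time
            if self.training and torch.rand(1).item() < self.p:
                continue
            x = layer(x)
        return x

stack = LayerDropList([nn.Linear(16, 16) for _ in range(6)], p=0.3)
out = stack(torch.randn(2, 16))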
Example #31
 def add_args(parser):
     """Add model-specific arguments to the parser."""
     parser.add_argument('--encoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained encoder embedding')
     parser.add_argument('--encoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='encoder embedding dimension')
     parser.add_argument(
         '--encoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the encoder')
     parser.add_argument('--decoder-embed-path',
                         type=str,
                         metavar='STR',
                         help='path to pre-trained decoder embedding')
     parser.add_argument('--decoder-embed-dim',
                         type=int,
                         metavar='N',
                         help='decoder embedding dimension')
     parser.add_argument(
         '--decoder-learned-pos',
         action='store_true',
         help='use learned positional embeddings in the decoder')
     parser.add_argument('--decoder-normalize-before',
                         action='store_true',
                         help='apply layernorm before each decoder block')
     parser.add_argument('--share-decoder-input-output-embed',
                         action='store_true',
                         help='share decoder input and output embeddings')
     parser.add_argument('--share-all-embeddings',
                         action='store_true',
                         help='share encoder, decoder and output embeddings'
                         ' (requires shared dictionary and embed dim)')
     parser.add_argument('--dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability')
     parser.add_argument('--attention-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability for attention weights')
     parser.add_argument('--relu-dropout',
                         type=float,
                         metavar='D',
                         help='dropout probability after ReLU in FFN')
     parser.add_argument('--decoder-layers',
                         type=int,
                         metavar='N',
                         help='num layers')
     parser.add_argument('--decoder-ffn-embed-dim',
                         type=int,
                         metavar='N',
                         help='embedding dimension for FFN')
     parser.add_argument('--decoder-attention-heads',
                         type=int,
                         metavar='N',
                         help='num attention heads')
     parser.add_argument('--kernel-size-list',
                         type=lambda x: options.eval_str_list(x, int),
                         help='list of kernel size (default: None)')
     parser.add_argument('--language-embeddings',
                         action='store_true',
                         help='use language embeddings')
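Example #31 follows the fairseq convention that each model class exposes its command-line flags through an add_args(parser) hook, which the framework calls on a shared argparse parser. A minimal sketch of that wiring with plain argparse (the DummyModel name and the two flags kept here are illustrative):

import argparse

class DummyModel:
    @staticmethod
    def add_args(parser):
        # same pattern as the snippet above, trimmed to two flags
        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-layers', type=int, metavar='N',
                            help='num layers')

parser = argparse.ArgumentParser()
DummyModel.add_args(parser)
args = parser.parse_args(['--decoder-embed-dim', '512', '--decoder-layers', '6'])
print(args.decoder_embed_dim, args.decoder_layers)  # 512 6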