Example No. 1
    def __init__(self, args, dictionary, embed_tokens, left_pad=True):
        super().__init__(dictionary)
        self.dropout = args.dropout

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024,
            embed_dim,
            self.padding_idx,
            left_pad=left_pad,
            learned=args.encoder_learned_pos,
        )

        self.layers = nn.ModuleList([])
        self.layers.extend([
            fairseq_transformer.TransformerEncoderLayer(args)
            for i in range(args.encoder_layers)
        ])

        # Variable tracker
        self.tracker = VariableTracker()

        # Initialize adversarial mode
        self.set_gradient_tracking_mode(False)
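
All of these encoder constructors set up the same embedding path: token embeddings scaled by sqrt(embed_dim), plus positional embeddings for up to 1024 positions, with dropout presumably applied in forward() (not shown in the snippet). The standalone sketch below reproduces that pattern in plain PyTorch; ToyEmbedder and all of its dimensions are invented for illustration and are not part of the API the examples use.

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyEmbedder(nn.Module):
    # Hypothetical stand-in for the embedding path built above:
    # scaled token embeddings + learned positional embeddings + dropout.
    def __init__(self, vocab_size=1000, embed_dim=16, max_positions=1024,
                 dropout=0.1, padding_idx=1):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.embed_positions = nn.Embedding(max_positions, embed_dim)
        self.embed_scale = math.sqrt(embed_dim)
        self.dropout = dropout

    def forward(self, tokens):  # tokens: (batch, seq_len) LongTensor
        positions = torch.arange(tokens.size(1), device=tokens.device)
        x = self.embed_scale * self.embed_tokens(tokens)
        x = x + self.embed_positions(positions)  # broadcasts over the batch
        return F.dropout(x, p=self.dropout, training=self.training)

print(ToyEmbedder()(torch.randint(2, 1000, (2, 7))).shape)  # torch.Size([2, 7, 16])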
Example No. 2
    def __init__(self, args, dictionary, embed_tokens, left_pad=False):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        embed_dim = embed_tokens.embedding_dim
        padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024,
            embed_dim,
            padding_idx,
            left_pad=left_pad,
            learned=args.decoder_learned_pos,
        )

        self.layers = nn.ModuleList([])
        self.layers.extend(
            [
                fairseq_transformer.TransformerDecoderLayer(args)
                for i in range(args.decoder_layers)
            ]
        )

        if not self.share_input_output_embed:
            self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=embed_dim ** -0.5)
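
When the decoder does not share its input and output embeddings, the example above allocates a separate embed_out matrix initialized with std = embed_dim ** -0.5; decoder states are later projected onto the vocabulary with that matrix (the forward pass is not shown here). The rough standalone sketch below contrasts the tied and untied projections, using F.linear as a stand-in for whatever projection the real forward applies; all sizes are invented.

import torch
import torch.nn as nn
import torch.nn.functional as F

vocab, dim = 100, 16
embed_tokens = nn.Embedding(vocab, dim)

# Untied case: a separate output matrix, initialized as in the example above.
embed_out = nn.Parameter(torch.empty(vocab, dim))
nn.init.normal_(embed_out, mean=0, std=dim ** -0.5)

hidden = torch.randn(2, 5, dim)                        # decoder output states
logits_tied = F.linear(hidden, embed_tokens.weight)    # share_input_output_embed=True
logits_untied = F.linear(hidden, embed_out)            # share_input_output_embed=False
print(logits_tied.shape, logits_untied.shape)          # both (2, 5, 100)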
Example No. 3
    def __init__(self, args, src_dict, dst_dict, embed_tokens):
        super().__init__(dst_dict)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        embed_dim = embed_tokens.embedding_dim
        padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            fairseq_transformer.TransformerDecoderLayer(args)
            for i in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.bottleneck_layer = None
        out_embed_dim = embed_dim
        if args.decoder_out_embed_dim is not None:
            assert (
                not args.share_all_embeddings
                and not args.share_decoder_input_output_embed
            ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
            self.bottleneck_layer = fairseq_transformer.Linear(
                embed_dim, args.decoder_out_embed_dim)
            out_embed_dim = args.decoder_out_embed_dim

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dst_dict),
                out_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.dropout,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dst_dict), out_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)

        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            assert (
                self.adaptive_softmax is None
            ), "vocabulary reduction not compatible with adaptive softmax!"
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict,
                dst_dict,
                args.vocab_reduction_params,
                fp16=args.fp16)

        self.onnx_trace = False
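
Example 3 adds two mutually exclusive output options: an AdaptiveSoftmax built from the cutoff list parsed out of --adaptive-softmax-cutoff, or the plain embed_out projection. The sketch below illustrates the adaptive-softmax idea with PyTorch's built-in nn.AdaptiveLogSoftmaxWithLoss, which is a related but different implementation from the AdaptiveSoftmax module used above; the vocabulary size and cutoffs are invented.

import torch
import torch.nn as nn

vocab, dim = 10000, 64
# Frequent words go to the full-size head, rarer words to smaller tail clusters,
# split at the cutoff indices (analogous to the parsed cutoff list above).
asm = nn.AdaptiveLogSoftmaxWithLoss(dim, vocab, cutoffs=[1000, 5000])

hidden = torch.randn(8, dim)               # one decoder state per target token
targets = torch.randint(0, vocab, (8,))
result = asm(hidden, targets)
print(result.output.shape, result.loss)    # per-token target log-probs, mean NLL loss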
Example No. 4
    def __init__(self, args, embed_tokens):
        super().__init__()
        self.dropout = args.dropout
        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024, embed_dim, self.padding_idx, learned=args.encoder_learned_pos
        )
Example No. 5
    def __init__(
        self,
        args,
        dictionary,
        embed_tokens,
        num_chars=50,
        embed_dim=32,
        char_cnn_params="[(128, 3), (128, 5)]",
        char_cnn_nonlinear_fn="tanh",
        char_cnn_pool_type="max",
        char_cnn_num_highway_layers=0,
        char_cnn_output_dim=-1,
        use_pretrained_weights=False,
        finetune_pretrained_weights=False,
        weights_file=None,
    ):
        super().__init__(dictionary)

        convolutions_params = literal_eval(char_cnn_params)
        self.char_cnn_encoder = char_encoder.CharCNNModel(
            dictionary,
            num_chars,
            embed_dim,
            convolutions_params,
            char_cnn_nonlinear_fn,
            char_cnn_pool_type,
            char_cnn_num_highway_layers,
            char_cnn_output_dim,
            use_pretrained_weights,
            finetune_pretrained_weights,
            weights_file,
        )

        self.embed_tokens = embed_tokens
        token_embed_dim = embed_tokens.embedding_dim
        self.word_layer_norm = nn.LayerNorm(token_embed_dim)

        char_embed_dim = (
            char_cnn_output_dim
            if char_cnn_output_dim != -1
            else sum(out_dim for (out_dim, _) in convolutions_params)
        )
        self.char_layer_norm = nn.LayerNorm(char_embed_dim)
        self.word_dim = char_embed_dim + token_embed_dim
        self.char_scale = math.sqrt(char_embed_dim / self.word_dim)
        self.word_scale = math.sqrt(token_embed_dim / self.word_dim)
        if self.word_dim != args.encoder_embed_dim:
            self.word_to_transformer_embed = fairseq_transformer.Linear(
                self.word_dim, args.encoder_embed_dim
            )

        self.dropout = args.dropout

        self.padding_idx = dictionary.pad()
        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024,
            args.encoder_embed_dim,
            self.padding_idx,
            learned=args.encoder_learned_pos,
        )

        self.transformer_encoder_given_embeddings = TransformerEncoderGivenEmbeddings(
            args=args, proj_to_decoder=True
        )

        # Variable tracker
        self.tracker = VariableTracker()
        # Initialize adversarial mode
        self.set_gradient_tracking_mode(False)
        self.set_embed_noising_mode(False)

        # disables sorting and word-length thresholding if True
        # (enables ONNX tracing of length-sorted input with batch_size = 1)
        self.onnx_export_model = False
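
Example 5 combines a character-CNN representation with ordinary token embeddings: both get their own LayerNorm, the combined width is word_dim = char_embed_dim + token_embed_dim, and each part is scaled by sqrt(part_dim / word_dim). The forward pass is not shown, so the sketch below is only a plausible reading of that setup (layer-norm, scale, concatenate, then project to the encoder width); every dimension here is made up.

import math
import torch
import torch.nn as nn

char_dim, tok_dim, encoder_dim = 256, 128, 512
total_dim = char_dim + tok_dim
char_ln, word_ln = nn.LayerNorm(char_dim), nn.LayerNorm(tok_dim)
char_scale = math.sqrt(char_dim / total_dim)
word_scale = math.sqrt(tok_dim / total_dim)
to_transformer = nn.Linear(total_dim, encoder_dim)   # only needed if the widths differ

char_feats = torch.randn(2, 7, char_dim)   # per-word output of the character CNN
word_embs = torch.randn(2, 7, tok_dim)     # ordinary token embeddings
combined = torch.cat(
    [char_scale * char_ln(char_feats), word_scale * word_ln(word_embs)], dim=-1
)
print(to_transformer(combined).shape)      # torch.Size([2, 7, 512])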
Example No. 6
    def __init__(self, args, src_dict, dst_dict, embed_tokens):
        super().__init__(dst_dict)
        self.dropout = args.dropout
        self.decoder_layerdrop = 0
        if hasattr(args, "decoder_layerdrop") and args.decoder_layerdrop > 0:
            self.decoder_layerdrop = args.decoder_layerdrop

        self.share_input_output_embed = args.share_decoder_input_output_embed

        embed_dim = embed_tokens.embedding_dim
        padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

        self.aan = args.aan
        decoder_layer_class = (AANDecoderLayer if self.aan else
                               fairseq_transformer.TransformerDecoderLayer)

        self.layers = nn.ModuleList([])
        self.layers.extend(
            [decoder_layer_class(args) for i in range(args.decoder_layers)])
        if hasattr(args,
                   "decoder_layers_to_keep") and args.decoder_layers_to_keep:
            layers_to_keep = sorted(
                int(x) for x in args.decoder_layers_to_keep.split(","))
            self.decoder_layers_to_keep = {
                layer_id: layer_idx
                for layer_idx, layer_id in enumerate(layers_to_keep)
            }

        self.adaptive_softmax = None

        self.bottleneck_layer = None
        out_embed_dim = embed_dim
        if args.decoder_out_embed_dim is not None:
            assert (
                not args.share_all_embeddings
                and not args.share_decoder_input_output_embed
            ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
            self.bottleneck_layer = fairseq_transformer.Linear(
                embed_dim, args.decoder_out_embed_dim)
            out_embed_dim = args.decoder_out_embed_dim

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dst_dict),
                out_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.dropout,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dst_dict), out_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)

        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            assert (
                self.adaptive_softmax is None
            ), "vocabulary reduction not compatible with adaptive softmax!"
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict,
                dst_dict,
                args.vocab_reduction_params,
                fp16=args.fp16)

        self.onnx_trace = False

        # Use quantizable nn.Linear for output projection instead of F.linear
        self.output_projection = None
        if self.vocab_reduction_module is None:
            if self.share_input_output_embed:
                self.output_projection = nn.Linear(
                    self.embed_tokens.weight.shape[1],
                    self.embed_tokens.weight.shape[0])
                self.output_projection.weight = self.embed_tokens.weight
            else:
                self.output_projection = nn.Linear(self.embed_out.shape[1],
                                                   self.embed_out.shape[0])
                self.output_projection.weight = self.embed_out
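
The output_projection built at the end of Example 6 is an nn.Linear whose weight tensor is replaced by the shared embedding matrix (or embed_out), so it produces the same logits as an F.linear call against that matrix while remaining a quantizable module. A small standalone check of that weight tying, with invented sizes:

import torch
import torch.nn as nn
import torch.nn.functional as F

vocab, dim = 100, 16
embed_tokens = nn.Embedding(vocab, dim)

proj = nn.Linear(dim, vocab)          # quantizable module instead of a bare F.linear
proj.weight = embed_tokens.weight     # tie the parameter, as in the example

x = torch.randn(3, dim)
print(torch.allclose(proj(x), F.linear(x, embed_tokens.weight, proj.bias)))  # True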
Example No. 7
    def __init__(
        self,
        args,
        src_dict,
        dst_dict,
        embed_tokens,
        no_encoder_attn=False,
        left_pad=False,
        final_norm=True,
    ):
        super().__init__(dst_dict)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(
            embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)

        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerAANDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.bottleneck_layer = None
        out_embed_dim = embed_dim
        if args.decoder_out_embed_dim is not None:
            assert (
                not args.share_all_embeddings
                and not args.share_decoder_input_output_embed
            ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
            self.bottleneck_layer = Linear(embed_dim,
                                           args.decoder_out_embed_dim)
            out_embed_dim = args.decoder_out_embed_dim

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dst_dict),
                out_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens
                if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dst_dict), out_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5)
        self.register_buffer("version", torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)

        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            assert (
                self.adaptive_softmax is None
            ), "vocabulary reduction not compatible with adaptive softmax!"
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict,
                dst_dict,
                args.vocab_reduction_params,
                fp16=args.fp16)

        self.onnx_trace = False
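
Example 7 adds two optional width adapters around the decoder stack: project_in_dim lifts token embeddings to decoder_embed_dim when the two widths differ, and the bottleneck layer reduces decoder states to decoder_out_embed_dim before the output projection. The toy sketch below only traces tensor shapes through those two linear layers (the decoder layers that sit between them are omitted, and the dimensions are invented):

import torch
import torch.nn as nn

input_embed_dim, embed_dim, out_embed_dim = 128, 256, 64
project_in = nn.Linear(input_embed_dim, embed_dim, bias=False)  # only created when widths differ
bottleneck = nn.Linear(embed_dim, out_embed_dim)

tok_embs = torch.randn(2, 5, input_embed_dim)   # (batch, tgt_len, input_embed_dim)
hidden = project_in(tok_embs)                   # (2, 5, 256): matches the decoder width
# ... in the real model the transformer decoder layers run here ...
out = bottleneck(hidden)                        # (2, 5, 64): reduced before the vocab projection
print(out.shape)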