Example #1
    def __init__(self, args):
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        self.multihead_attn_impl = args.multihead_attn_impl
        if args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd':
            self.self_attn = SelfMultiheadAttn(
                self.embed_dim,
                args.encoder_attention_heads,
                dropout=args.attention_dropout,
                bias=False,
                include_norm_add=True,
                impl='fast',
            )
        elif args.multihead_attn_impl == 'fast':
            self.self_attn = SelfMultiheadAttn(
                self.embed_dim,
                args.encoder_attention_heads,
                dropout=args.attention_dropout,
                bias=False,
                include_norm_add=False,
                impl='fast',
            )
        else:
            self.self_attn = SelfMultiheadAttn(
                self.embed_dim,
                args.encoder_attention_heads,
                dropout=args.attention_dropout,
                bias=False,
                include_norm_add=False,
                impl='default',
            )

        # in_proj_weight has shape [3 * hidden, hidden] but it should be
        # initialized like a [hidden, hidden] matrix.
        # sqrt(6 / (hidden + hidden)) / sqrt(6 / (3 * hidden + hidden)) = sqrt(2)
        # therefore xavier_uniform gain should be set to sqrt(2).
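        # (For a 2D weight, xavier_uniform_ samples U(-a, a) with
        #  a = gain * sqrt(6 / (fan_in + fan_out)); here fan_in = hidden and
        #  fan_out = 3 * hidden, hence the sqrt(2) correction.)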
        torch.nn.init.xavier_uniform_(self.self_attn.in_proj_weight,
                                      gain=math.sqrt(2))

        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.normalize_before = args.encoder_normalize_before
        self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
        self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
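        # When include_norm_add=True the attention's layer norm, residual add and
        # dropout are fused into SelfMultiheadAttn itself, so only the FFN layer
        # norm is kept here; the other variants need both layer norms.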
        if args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd':
            self.layer_norms = nn.ModuleList(
                [FusedLayerNorm(self.embed_dim) for _ in range(1)])
        else:
            self.layer_norms = nn.ModuleList(
                [FusedLayerNorm(self.embed_dim) for _ in range(2)])
    def setUp(self, seed=1234):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        self.seq_length = 80
        self.sequences = 10
        self.hidden_dim = 1024
        self.heads = 16
        self.dropout_prob = 0.0

        self.ref_layer = SelfMultiheadAttn(self.hidden_dim,
                                           self.heads,
                                           dropout=self.dropout_prob,
                                           bias=True,
                                           include_norm_add=False,
                                           separate_qkv_params=True,
                                           mask_additive=True,
                                           impl='default')
        self.ref_layer.cuda().half()
        self.ref_layer.reset_parameters()
        self.ref_inputs = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda")).requires_grad_(True)
        # Reset seed so parameters are identical
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        self.tst_layer = SelfMultiheadAttn(self.hidden_dim,
                                           self.heads,
                                           dropout=self.dropout_prob,
                                           bias=True,
                                           include_norm_add=False,
                                           separate_qkv_params=True,
                                           mask_additive=True,
                                           impl='fast')
        self.tst_layer.cuda().half()
        self.tst_layer.reset_parameters()

        self.tst_inputs = torch.randn(
            self.seq_length,
            self.sequences,
            self.hidden_dim,
            dtype=torch.float16,
            device=torch.device("cuda")).requires_grad_(True)
Example #3
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = SelfMultiheadAttn(d_model,
                                           nhead,
                                           dropout=dropout,
                                           impl='fast')
        self.feed_forward = MLP([d_model, dim_feedforward, d_model])
        self.d_model = d_model
        self.norm1 = layer_norm(d_model)
        self.norm2 = layer_norm(d_model)

        self.activation = F.gelu
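
The example above only shows the constructor. Below is a minimal sketch of a matching forward pass, assuming the SelfMultiheadAttn call signature used in the tests further down and a post-norm residual layout; the residual/norm ordering and where self.activation is applied are assumptions, not taken from the original.

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        # Self-attention block: SelfMultiheadAttn takes (query, key, value, ...)
        # and returns (output, attention_weights); the weights are skipped here.
        attn_out, _ = self.self_attn(src, src, src,
                                     key_padding_mask=src_key_padding_mask,
                                     need_weights=False,
                                     attn_mask=src_mask,
                                     is_training=self.training)
        src = self.norm1(src + attn_out)

        # Feed-forward block with the second residual connection. The original
        # may apply self.activation inside MLP; that detail is not shown above.
        src = self.norm2(src + self.feed_forward(src))
        return src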
class SelfMultiheadAttnNormAddTest(unittest.TestCase):
    def setUp(self, seed=1234):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        self.seq_length   = 80
        self.sequences    = 10
        self.hidden_dim   = 1024
        self.heads        = 16
        self.dropout_prob = 0.0

        self.ref_layer = SelfMultiheadAttn(self.hidden_dim, 
                                           self.heads, 
                                           dropout=self.dropout_prob, 
                                           bias=False, 
                                           include_norm_add=True, 
                                           impl='default')
        self.ref_layer.cuda().half()
        self.ref_layer.reset_parameters()
        self.ref_inputs = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 
                                      dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True)

        # Reset seed so parameters are identical
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        
        self.tst_layer = SelfMultiheadAttn(self.hidden_dim, 
                                           self.heads, 
                                           dropout=self.dropout_prob, 
                                           bias=False, 
                                           include_norm_add=True, 
                                           impl='fast')
        self.tst_layer.cuda().half()
        self.tst_layer.reset_parameters()
        
        self.tst_inputs = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 
                                      dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True)

    def test_self_multihead_attn_norm_add(self):
        grads = torch.randn_like(self.tst_inputs)

        for _ in range(5):
            ref_outputs,_ = self.ref_layer.forward(self.ref_inputs, 
                                                   self.ref_inputs, 
                                                   self.ref_inputs,
                                                   key_padding_mask=None, 
                                                   need_weights=False, 
                                                   attn_mask=None,
                                                   is_training=True)
         
            tst_outputs,_ = self.tst_layer.forward(self.tst_inputs, 
                                                   self.tst_inputs, 
                                                   self.tst_inputs,
                                                   key_padding_mask=None, 
                                                   need_weights=False, 
                                                   attn_mask=None,
                                                   is_training=True)
            
            # Backpropagate through each layer so the input gradients can be compared.
            ref_outputs.backward(grads)
            tst_outputs.backward(grads)

        self.assertTrue(torch.allclose(self.ref_inputs,  self.tst_inputs,  atol=1e-5, rtol=1e-5))
        self.assertTrue(torch.allclose(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3))
        self.assertTrue(torch.allclose(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3))
                                    dropout=0.1,
                                    bias=args.biases,
                                    include_norm_add=args.norm_add,
                                    impl='fast'))
    else:
        if args.native:
            attn_layers.append(
                torch.nn.MultiheadAttention(args.hidden_dim,
                                            args.heads,
                                            dropout=0.1,
                                            bias=args.biases))
        elif args.ref:
            attn_layers.append(
                SelfMultiheadAttn(args.hidden_dim,
                                  args.heads,
                                  dropout=0.1,
                                  bias=args.biases,
                                  include_norm_add=args.norm_add,
                                  impl='default'))
        else:
            attn_layers.append(
                SelfMultiheadAttn(args.hidden_dim,
                                  args.heads,
                                  dropout=0.1,
                                  bias=args.biases,
                                  include_norm_add=args.norm_add,
                                  impl='fast'))
    attn_layers[idx].cuda()
    attn_layers[idx].half()
    if not args.native:
        attn_layers[idx].reset_parameters()
for seed in range(args.seed_start, args.seed_end + 1):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    ref_layer = None
    if args.encdec_attn:
        ref_layer = EncdecMultiheadAttn(args.hidden_dim,
                                        args.heads,
                                        dropout=dropout_prob,
                                        bias=False,
                                        include_norm_add=args.norm_add,
                                        impl='default')
    else:
        ref_layer = SelfMultiheadAttn(args.hidden_dim,
                                      args.heads,
                                      dropout=dropout_prob,
                                      bias=False,
                                      include_norm_add=args.norm_add,
                                      impl='default')
    ref_layer.cuda()
    ref_layer.half()
    ref_layer.reset_parameters()

    ref_inputs = torch.randn(args.seq_length,
                             args.num_seqs_start,
                             args.hidden_dim,
                             dtype=torch.float16,
                             device=torch.device("cuda")).requires_grad_(True)
    ref_inputs_kv = None
    if args.encdec_attn:
        ref_inputs_kv = torch.randn(
            args.seq_length,
Example #7
    def __init__(self, args, no_encoder_attn=False):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.multihead_attn_impl = args.multihead_attn_impl
        if args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd':
            self.self_attn = SelfMultiheadAttn(
                self.embed_dim,
                args.decoder_attention_heads,
                dropout=args.attention_dropout,
                bias=False,
                include_norm_add=True,
                impl='fast',
            )
        elif args.multihead_attn_impl == 'fast':
            self.self_attn = SelfMultiheadAttn(
                self.embed_dim,
                args.decoder_attention_heads,
                dropout=args.attention_dropout,
                bias=False,
                include_norm_add=False,
                impl='fast',
            )
        else:
            self.self_attn = SelfMultiheadAttn(
                self.embed_dim,
                args.decoder_attention_heads,
                dropout=args.attention_dropout,
                bias=False,
                include_norm_add=False,
                impl='default',
            )

        # in_proj_weight has shape [3 * hidden, hidden] but it should be
        # initialized like a [hidden, hidden] matrix.
        # sqrt(6 / (hidden + hidden)) / sqrt(6 / (3 * hidden + hidden)) = sqrt(2)
        # therefore xavier_uniform gain should be set to sqrt(2).
        torch.nn.init.xavier_uniform_(self.self_attn.in_proj_weight,
                                      gain=math.sqrt(2))

        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.normalize_before = args.decoder_normalize_before

        if not (args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd'):
            self.self_attn_layer_norm = FusedLayerNorm(self.embed_dim)

        if no_encoder_attn:
            self.encoder_attn = None
            if not (args.multihead_attn_impl
                    == 'fast_with_lyrnrm_and_dropoutadd'):
                self.encoder_attn_layer_norm = None
        else:
            if args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd':
                self.encoder_attn = EncdecMultiheadAttn(
                    self.embed_dim,
                    args.decoder_attention_heads,
                    dropout=args.attention_dropout,
                    bias=False,
                    include_norm_add=True,
                    impl='fast',
                )
            elif args.multihead_attn_impl == 'fast':
                self.encoder_attn = EncdecMultiheadAttn(
                    self.embed_dim,
                    args.decoder_attention_heads,
                    dropout=args.attention_dropout,
                    bias=False,
                    include_norm_add=False,
                    impl='fast',
                )
            else:
                self.encoder_attn = EncdecMultiheadAttn(
                    self.embed_dim,
                    args.decoder_attention_heads,
                    dropout=args.attention_dropout,
                    bias=False,
                    include_norm_add=False,
                    impl='default',
                )

            # in_proj_weight_kv has shape [2 * hidden, hidden] but it should be
            # initialized like a [hidden, hidden] matrix.
            # sqrt(6 / (hidden + hidden)) / sqrt(6 / (2 * hidden + hidden)) = sqrt(1.5)
            # therefore xavier_uniform gain should be set to sqrt(1.5).
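            # (Here fan_in = hidden and fan_out = 2 * hidden, so the bound is
            #  gain * sqrt(6 / (3 * hidden)); gain = sqrt(1.5) restores the
            #  sqrt(6 / (2 * hidden)) bound of a square [hidden, hidden] matrix.)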
            torch.nn.init.xavier_uniform_(self.encoder_attn.in_proj_weight_kv,
                                          gain=math.sqrt(1.5))

            if not (args.multihead_attn_impl
                    == 'fast_with_lyrnrm_and_dropoutadd'):
                self.encoder_attn_layer_norm = FusedLayerNorm(self.embed_dim)

        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

        self.final_layer_norm = FusedLayerNorm(self.embed_dim)
        self.need_attn = True