Example #1
File: wav2vec2.py Project: ishine/fairseq
 def build_encoder_layer(self, args: Wav2Vec2Config):
     if args.layer_type == "transformer":
         layer = TransformerSentenceEncoderLayer(
             embedding_dim=self.embedding_dim,
             ffn_embedding_dim=args.encoder_ffn_embed_dim,
             num_attention_heads=args.encoder_attention_heads,
             dropout=self.dropout,
             attention_dropout=args.attention_dropout,
             activation_dropout=args.activation_dropout,
             activation_fn=args.activation_fn,
             layer_norm_first=args.layer_norm_first,
         )
     elif args.layer_type == "conformer":
         layer = ConformerWav2Vec2EncoderLayer(
             embed_dim=self.embedding_dim,
             ffn_embed_dim=args.encoder_ffn_embed_dim,
             attention_heads=args.encoder_attention_heads,
             dropout=args.dropout,
             depthwise_conv_kernel_size=args.depthwise_conv_kernel_size,
             activation_fn="swish",
             attn_type=args.attn_type,
             use_fp16=args.fp16,
             pos_enc_type="abs",
         )
     else:
         raise NotImplementedError(f"unknown layer_type: {args.layer_type}")
     layer = fsdp_wrap(layer)
     if args.checkpoint_activations:
         layer = checkpoint_wrapper(layer)
     return layer
Example #2
    def build_decoder_layer(self, cfg, no_encoder_attn=False):
        if self.cfg.shared_layer_qkv_conv == 1 and self.compress_layer is None:
            target_dim = cfg.compressed_dim
            compress_layer = nn.Linear(
                self.cfg.max_positions,
                target_dim,
            )

            nn.init.xavier_uniform_(compress_layer.weight,
                                    gain=1 / math.sqrt(2))
            if self.cfg.freeze_conv == 1:
                compress_layer.weight.requires_grad = False
            self.compress_layer = compress_layer

        #return PrimerTransformerEncoderLayer(cfg, self.compress_layer)
        layer = primer_layer.PrimerDecoderLayerBase(cfg, self.compress_layer,
                                                    no_encoder_attn)
        checkpoint = cfg.checkpoint_activations
        if checkpoint:
            offload_to_cpu = cfg.offload_activations
            layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
        # if we are checkpointing, enforce that FSDP always wraps the
        # checkpointed layer, regardless of layer size
        min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0
        layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
        return layer
Example #3
    def __init__(self, cfg, task):
        try:
            from transformers.models.transfo_xl import (
                TransfoXLConfig,
                TransfoXLLMHeadModel,
            )
        except ImportError:
            from transformers.configuration_transfo_xl import TransfoXLConfig
            from transformers.modeling_transfo_xl import TransfoXLLMHeadModel

        super().__init__(task.target_dictionary)
        self.cfg = cfg

        # remove any cutoffs larger than the vocab size
        cutoffs = [
            cutoff for cutoff in cfg.cutoffs
            if cutoff < len(task.target_dictionary)
        ]

        config = TransfoXLConfig(
            vocab_size=len(task.target_dictionary),
            cutoffs=cutoffs,
            d_model=cfg.d_model,
            d_embed=cfg.d_model,
            n_head=cfg.n_head,
            d_head=cfg.d_head,
            d_inner=cfg.d_inner,
            div_val=cfg.div_val,
            n_layer=cfg.n_layer,
            mem_len=cfg.mem_len,
            clamp_len=cfg.clamp_len,
            same_length=cfg.same_length,
            dropout=cfg.dropout,
            dropatt=cfg.dropatt,
        )
        logger.info(config)
        self.model = TransfoXLLMHeadModel(config)

        # Workaround a bug in huggingface's ``ProjectedAdaptiveLogSoftmax``
        # which adds ``None`` values to an ``nn.ParameterList``, which is not
        # supported in PyTorch. Instead we can replace this with an
        # ``nn.ModuleList``, which does support ``None`` values.
        try:
            if all(p is None
                   for p in self.model.crit.out_projs._parameters.values()):
                self.model.crit.out_projs = torch.nn.ModuleList(
                    [None] * len(self.model.crit.out_projs._parameters))
        except Exception:
            pass

        if cfg.checkpoint_activations or cfg.offload_activations:
            for i in range(len(self.model.transformer.layers)):
                self.model.transformer.layers[i] = checkpoint_wrapper(
                    self.model.transformer.layers[i],
                    offload_to_cpu=cfg.offload_activations,
                )
                # TODO: may save mem to wrap(layer.pos_ff.CoreNet[3])

        self._mems = None
Example #4
 def build_decoder_layer(self, args, no_encoder_attn=False):
     layer = DecoderLayer(args.encoder_embed_dim,
                          args.encoder_ffn_embed_dim,
                          args.encoder_attention_heads, args.kdim,
                          args.vdim, args.dropout)
     if getattr(args, "checkpoint_activations", False):
         offload_to_cpu = getattr(args, "offload_activations", False)
         layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
     return layer
Example #5
 def build_encoder_layer(self, cfg):
     layer = transformer_layer.TransformerEncoderLayerBase(cfg)
     checkpoint = cfg.checkpoint_activations
     if checkpoint:
         offload_to_cpu = cfg.offload_activations
         layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
     # if we are checkpointing, enforce that FSDP always wraps the
     # checkpointed layer, regardless of layer size
     min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0
     layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
     return layer
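
The wrapping order here (checkpoint_wrapper first, then fsdp_wrap with the size threshold forced to 0) is the same one used in examples #6, #12 and #13. Below is a minimal, self-contained sketch of that pattern, assuming fairseq is installed; the ToyCfg dataclass and the toy feed-forward block are illustrative stand-ins, not fairseq APIs.

from dataclasses import dataclass

import torch.nn as nn

from fairseq.distributed import fsdp_wrap
from fairseq.modules.checkpoint_activations import checkpoint_wrapper


@dataclass
class ToyCfg:
    # illustrative stand-in for the fairseq config fields used above
    checkpoint_activations: bool = True
    offload_activations: bool = False
    min_params_to_wrap: int = 100_000_000


def build_toy_layer(cfg: ToyCfg) -> nn.Module:
    layer: nn.Module = nn.Sequential(
        nn.Linear(32, 128), nn.GELU(), nn.Linear(128, 32)
    )
    if cfg.checkpoint_activations:
        # recompute this block's activations in the backward pass
        # (optionally offloading the saved inputs to CPU)
        layer = checkpoint_wrapper(layer, offload_to_cpu=cfg.offload_activations)
    # a threshold of 0 forces FSDP to wrap a checkpointed layer regardless of its
    # parameter count; outside an FSDP enable-wrap context, fsdp_wrap is expected
    # to return the module unchanged, so this sketch runs on a single process
    min_params_to_wrap = cfg.min_params_to_wrap if not cfg.checkpoint_activations else 0
    return fsdp_wrap(layer, min_num_params=min_params_to_wrap)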
Example #6
 def build_decoder_layer(self, cfg, no_encoder_attn=False):
     layer = UniLMDecoderLayer(cfg, no_encoder_attn)
     checkpoint = cfg.checkpoint_activations
     if checkpoint:
         offload_to_cpu = cfg.offload_activations
         layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
     # if we are checkpointing, enforce that FSDP always wraps the
     # checkpointed layer, regardless of layer size
     min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0
     layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
     return layer
Example #7
 def build_encoder_layer(self, args):
     layer = TransformerEncoderLayer(args)
     checkpoint = getattr(args, "checkpoint_activations", False)
     if checkpoint:
         offload_to_cpu = getattr(args, "offload_activations", False)
         layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
     # if we are checkpointing, enforce that FSDP always wraps the
     # checkpointed layer, regardless of layer size
     min_params_to_wrap = (getattr(args, "min_params_to_wrap",
                                   DEFAULT_MIN_PARAMS_TO_WRAP)
                           if not checkpoint else 0)
     layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
     return layer
Example #8
 def __init__(self,
              use_pytorch_checkpoint=False,
              use_fairseq_checkpoint=False):
     super().__init__()
     torch.manual_seed(0)
     self.use_pytorch_checkpoint = use_pytorch_checkpoint
     self.ffn = nn.Sequential(
         nn.Linear(32, 128),
         # add a Dropout layer to test RNG save/restore
         nn.Dropout(p=0.5),
         nn.Linear(128, 32),
     )
     if use_fairseq_checkpoint:
         self.ffn = checkpoint_wrapper(self.ffn)
     self.out = nn.Linear(32, 1)
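
The constructor above stores a use_pytorch_checkpoint flag, but the matching forward pass is not shown. As a rough sketch of what it would look like (an assumption, not code from the repository; it presumes torch is imported), the flag routes the FFN through torch.utils.checkpoint so both checkpointing paths can be compared:

 def forward(self, x):
     # hypothetical companion to the constructor above; assumes the same
     # attributes (use_pytorch_checkpoint, ffn, out) defined in __init__
     if self.use_pytorch_checkpoint:
         # recompute the FFN activations during the backward pass
         x = torch.utils.checkpoint.checkpoint(self.ffn, x)
     else:
         x = self.ffn(x)
     return self.out(x)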
Example #9
    def __init__(self, args):
        super().__init__()

        self.dropout = args.dropout
        self.embedding_dim = args.encoder_embed_dim

        self.pos_conv = nn.Conv1d(
            self.embedding_dim,
            self.embedding_dim,
            kernel_size=args.conv_pos,
            padding=args.conv_pos // 2,
            groups=args.conv_pos_groups,
        )
        dropout = 0
        std = math.sqrt(
            (4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
        nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
        nn.init.constant_(self.pos_conv.bias, 0)

        self.pos_conv = nn.utils.weight_norm(self.pos_conv,
                                             name="weight",
                                             dim=2)
        self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos),
                                      nn.GELU())

        layers = []
        for _ in range(args.encoder_layers):
            layer = TransformerSentenceEncoderLayer(
                embedding_dim=self.embedding_dim,
                ffn_embedding_dim=args.encoder_ffn_embed_dim,
                num_attention_heads=args.encoder_attention_heads,
                dropout=self.dropout,
                attention_dropout=args.attention_dropout,
                activation_dropout=args.activation_dropout,
                activation_fn=args.activation_fn,
                layer_norm_first=args.layer_norm_first,
            )
            if args.checkpoint_activations:
                layer = fsdp_wrap(layer)
                layer = checkpoint_wrapper(layer)
            layers.append(layer)
        self.layers = nn.ModuleList(layers)

        self.layer_norm_first = args.layer_norm_first
        self.layer_norm = LayerNorm(self.embedding_dim)
        self.layerdrop = args.encoder_layerdrop

        self.apply(init_bert_params)
Example #10
File: wav2vec2.py Project: ishine/fairseq
 def build_encoder_layer(self, args):
     layer = ConformerWav2Vec2EncoderLayer(
         embed_dim=self.embedding_dim,
         ffn_embed_dim=args.encoder_ffn_embed_dim,
         attention_heads=args.encoder_attention_heads,
         dropout=args.dropout,
         depthwise_conv_kernel_size=args.depthwise_conv_kernel_size,
         activation_fn="swish",
         attn_type=args.attn_type,
         pos_enc_type=args.pos_enc_type,
         use_fp16=args.fp16,  # only used for rope
     )
     layer = fsdp_wrap(layer)
     if args.checkpoint_activations:
         layer = checkpoint_wrapper(layer)
     return layer
Example #11
    def __init__(self, cfg, task):
        try:
            from transformers.models.transfo_xl import (
                TransfoXLConfig,
                TransfoXLLMHeadModel,
            )
        except ImportError:
            from transformers.configuration_transfo_xl import TransfoXLConfig
            from transformers.modeling_transfo_xl import TransfoXLLMHeadModel

        super().__init__(task.target_dictionary)
        self.cfg = cfg

        # remove any cutoffs larger than the vocab size
        cutoffs = [
            cutoff for cutoff in cfg.cutoffs
            if cutoff < len(task.target_dictionary)
        ]

        config = TransfoXLConfig(
            vocab_size=len(task.target_dictionary),
            cutoffs=cutoffs,
            d_model=cfg.d_model,
            d_embed=cfg.d_model,
            n_head=cfg.n_head,
            d_head=cfg.d_head,
            d_inner=cfg.d_inner,
            div_val=cfg.div_val,
            n_layer=cfg.n_layer,
            mem_len=cfg.mem_len,
            clamp_len=cfg.clamp_len,
            same_length=cfg.same_length,
            dropout=cfg.dropout,
            dropatt=cfg.dropatt,
        )
        logger.info(config)
        self.model = TransfoXLLMHeadModel(config)

        if cfg.checkpoint_activations or cfg.offload_activations:
            for i in range(len(self.model.transformer.layers)):
                self.model.transformer.layers[i] = checkpoint_wrapper(
                    self.model.transformer.layers[i],
                    offload_to_cpu=cfg.offload_activations,
                )
                # TODO: may save mem to wrap(layer.pos_ff.CoreNet[3])

        self._mems = None
Example #12
 def build_decoder_layer(
     self,
     cfg,
     no_encoder_attn=False,
     positional_embedding: Optional[RelativePositionalEmbedding] = None,
 ):
     layer = TransformerWithRelativePositionalEmbeddingDecoderLayerBase(
         cfg,
         no_encoder_attn=no_encoder_attn,
         positional_embedding=positional_embedding,
     )
     checkpoint = cfg.checkpoint_activations
     if checkpoint:
         offload_to_cpu = cfg.offload_activations
         layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
     # if we are checkpointing, enforce that FSDP always wraps the
     # checkpointed layer, regardless of layer size
     min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0
     layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
     return layer
Example #13
 def build_encoder_layer(
     self, cfg, positional_embedding: Optional[RelativePositionalEmbedding] = None
 ):
     if cfg.encoder.layer_type == "transformer":
         layer_cls = TransformerWithRelativePositionalEmbeddingEncoderLayerBase
     elif cfg.encoder.layer_type == "conformer":
         layer_cls = ConformerWithRelativePositionalEmbeddingEncoderLayerBase
     else:
         raise NotImplementedError
     layer = layer_cls(
         cfg, return_fc=self.return_fc, positional_embedding=positional_embedding
     )
     checkpoint = cfg.checkpoint_activations
     if checkpoint:
         offload_to_cpu = cfg.offload_activations
         layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
     # if we are checkpointing, enforce that FSDP always wraps the
     # checkpointed layer, regardless of layer size
     min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0
     layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
     return layer
Example #14
 def build_decoder_layer(self, args, no_encoder_attn=False):
     layer = TransformerDecoderLayer(args, no_encoder_attn)
     if getattr(args, "checkpoint_activations", False):
         layer = checkpoint_wrapper(layer)
     return layer
Example #15
 def build_encoder_layer(self, args):
     layer = TransformerEncoderLayer(args)
     if getattr(args, "checkpoint_activations", False):
         offload_to_cpu = getattr(args, "offload_activations", False)
         layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
     return layer
Example #16
 def build_encoder_layer(self, args):
     layer = ConvTransformerEncoderLayer(args)
     if getattr(args, "checkpoint_activations", False):
         layer = checkpoint_wrapper(layer)
     return layer