Example #1
 def __init__(self, args, dictionary, embed_tokens):
     self.args = args
     super().__init__(
         TransformerConfig.from_namespace(args),
         dictionary,
         embed_tokens,
     )
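
Examples #1-#14 all share one pattern: a legacy class keeps accepting an argparse-style args namespace, stores it as self.args for backward compatibility, and converts it with TransformerConfig.from_namespace(args) before delegating to the dataclass-configured parent. A minimal standalone sketch of that conversion (the attribute names and values below are illustrative, not taken from the examples above):

 from argparse import Namespace
 from fairseq.models.transformer import TransformerConfig

 # Hypothetical legacy namespace; fields it omits are expected to keep
 # the TransformerConfig defaults (assumption about from_namespace).
 args = Namespace(encoder_embed_dim=256, dropout=0.1)
 cfg = TransformerConfig.from_namespace(args)
 print(cfg.encoder.embed_dim, cfg.dropout)  # expected: 256 0.1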
Example #2
 @classmethod
 def add_args(cls, parser):
     """Add model-specific arguments to the parser."""
     # we want to build the args recursively in this case.
     gen_parser_from_dataclass(parser,
                               TransformerConfig(),
                               delete_default=False,
                               with_prefix="")
Example #3
 def build_self_attention(
     self, embed_dim, args, add_bias_kv=False, add_zero_attn=False
 ):
     return super().build_self_attention(
         embed_dim,
         TransformerConfig.from_namespace(args),
         add_bias_kv=add_bias_kv,
         add_zero_attn=add_zero_attn,
     )
Example #4
 def __init__(
     self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
 ):
     super().__init__(
         TransformerConfig.from_namespace(args),
         no_encoder_attn=no_encoder_attn,
         add_bias_kv=add_bias_kv,
         add_zero_attn=add_zero_attn,
     )
     self.args = args
Example #5
 def __init__(
         self,
         args,
         positional_embedding: Optional[RelativePositionalEmbedding] = None
 ):
     super().__init__(
         TransformerConfig.from_namespace(args),
         positional_embedding=positional_embedding,
     )
     self.args = args
Example #6
 def __init__(
     self,
     args,
     dictionary,
     embed_tokens,
     no_encoder_attn=False,
     output_projection=None,
 ):
     self.args = args
     super().__init__(
         TransformerConfig.from_namespace(args),
         dictionary,
         embed_tokens,
         no_encoder_attn=no_encoder_attn,
         output_projection=output_projection,
     )
Example #7
 def __init__(
     self,
     args,
     no_encoder_attn=False,
     add_bias_kv=False,
     add_zero_attn=False,
     positional_embedding: Optional[RelativePositionalEmbedding] = None,
 ):
     super().__init__(
         TransformerConfig.from_namespace(args),
         no_encoder_attn=no_encoder_attn,
         add_bias_kv=add_bias_kv,
         add_zero_attn=add_zero_attn,
         positional_embedding=positional_embedding,
     )
     self.args = args
Example #8
@dataclass
class Data2VecTextConfig(FairseqDataclass):
    max_positions: int = II("task.tokens_per_sample")

    head_layers: int = 1

    transformer: TransformerConfig = TransformerConfig()

    load_checkpoint_heads: bool = field(
        default=False,
        metadata={
            "help": "(re-)register and load heads when loading checkpoints"
        },
    )

    loss_beta: float = field(
        default=0,
        metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"})
    loss_scale: Optional[float] = field(
        default=None,
        metadata={
            "help":
            "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)"
        },
    )
    average_top_k_layers: int = field(
        default=8, metadata={"help": "how many layers to average"})

    layer_norm_target_layer: bool = False
    instance_norm_target_layer: bool = False
    batch_norm_target_layer: bool = False
    instance_norm_targets: bool = False
    layer_norm_targets: bool = False

    ema_decay: float = field(default=0.999,
                             metadata={"help": "initial ema decay rate"})
    ema_end_decay: float = field(default=0.9999,
                                 metadata={"help": "final ema decay rate"})

    # when to finish annealing ema decay rate
    ema_anneal_end_step: int = II("optimization.max_update")

    ema_transformer_layers_only: bool = field(
        default=True,
        metadata={
            "help": "whether to momentum update only the transformer layers"
        },
    )
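
Example #8 mixes plain defaults, dataclasses.field metadata, and II(...) placeholders. II comes from omegaconf and simply expands to an interpolation string ("${task.tokens_per_sample}") that is resolved against the full config tree at runtime, so max_positions ends up mirroring task.tokens_per_sample. A tiny standalone illustration (Task and Model below are made-up demo classes):

 from dataclasses import dataclass

 from omegaconf import II, OmegaConf

 @dataclass
 class Task:
     tokens_per_sample: int = 512

 @dataclass
 class Model:
     # II("task.tokens_per_sample") is just the string "${task.tokens_per_sample}"
     max_positions: int = II("task.tokens_per_sample")

 cfg = OmegaConf.create({"task": Task(), "model": Model()})
 print(cfg.model.max_positions)  # 512, resolved through the interpolation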
Example #9
 def build_encoder_attention(self, embed_dim, args):
     return super().build_encoder_attention(
         embed_dim,
         TransformerConfig.from_namespace(args),
     )
Example #10
 def __init__(self, args):
     super().__init__(TransformerConfig.from_namespace(args))
     self.args = args
Example #11
 def build_encoder_layer(self, args):
     return super().build_encoder_layer(
         TransformerConfig.from_namespace(args)
     )
Example #12
 def build_decoder_layer(self, args, no_encoder_attn=False):
     return super().build_decoder_layer(
         TransformerConfig.from_namespace(args),
         no_encoder_attn=no_encoder_attn)
Example #13
 def build_output_projection(self, args, dictionary, embed_tokens):
     super().build_output_projection(TransformerConfig.from_namespace(args),
                                     dictionary, embed_tokens)
Example #14
 def build_self_attention(self, embed_dim, args, positional_embedding=None):
     return super().build_self_attention(
         embed_dim,
         TransformerConfig.from_namespace(args),
         positional_embedding=positional_embedding,
     )
Example #15
        def __init__(
            self,
            embed_dim,
            attention_heads,
            ffn_embed_dim,
            num_layers,
            embedding_layer,  # torch.nn.Embedding. Must have a padding_idx field
            dropout=0,
            normalize_before=False,
            torch_encoder=None,  # torch encoder that you can map weights from
            activation="relu",
        ):
            super().__init__()

            cfg = FairseqTransformerConfig()
            cfg.encoder.embed_dim = embed_dim
            cfg.encoder.attention_heads = attention_heads
            cfg.encoder.ffn_embed_dim = ffn_embed_dim
            cfg.dropout = dropout
            cfg.encoder.normalize_before = normalize_before
            cfg.encoder.layers = num_layers
            # make embedding behavior same as other encoders
            cfg.no_token_positional_embeddings = True
            cfg.no_scale_embedding = True
            cfg.activation_fn = activation
            dictionary = {}  # TODO: verify what this is

            self.encoder = FairseqTransformerEncoder(
                cfg, dictionary, embedding_layer, return_fc=False
            )

            if torch_encoder is not None:
                for src_layer, dst_layer in zip(
                    torch_encoder.layers, self.encoder.layers
                ):
                    w_q, w_k, w_v = src_layer.self_attn.in_proj_weight.chunk(3, dim=0)
                    b_q, b_k, b_v = src_layer.self_attn.in_proj_bias.chunk(3, dim=0)

                    dst_layer.self_attn.q_proj.weight = torch.nn.Parameter(w_q)
                    dst_layer.self_attn.q_proj.bias = torch.nn.Parameter(b_q)
                    dst_layer.self_attn.k_proj.weight = torch.nn.Parameter(w_k)
                    dst_layer.self_attn.k_proj.bias = torch.nn.Parameter(b_k)
                    dst_layer.self_attn.v_proj.weight = torch.nn.Parameter(w_v)
                    dst_layer.self_attn.v_proj.bias = torch.nn.Parameter(b_v)

                    dst_layer.self_attn.out_proj.weight = (
                        src_layer.self_attn.out_proj.weight
                    )
                    dst_layer.self_attn.out_proj.bias = (
                        src_layer.self_attn.out_proj.bias
                    )

                    dst_layer.fc1.weight = src_layer.linear1.weight
                    dst_layer.fc1.bias = src_layer.linear1.bias

                    # fairseq may use fusedlayernorm from nvidia apex - diff properties
                    dst_layer.self_attn_layer_norm.load_state_dict(
                        src_layer.norm1.state_dict()
                    )

                    dst_layer.fc2.weight = src_layer.linear2.weight
                    dst_layer.fc2.bias = src_layer.linear2.bias

                    dst_layer.final_layer_norm.load_state_dict(
                        src_layer.norm2.state_dict()
                    )
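
Example #15 ports weights from a stock torch.nn.TransformerEncoder into the fairseq encoder. The only non-obvious step is chunk(3, dim=0): torch.nn.MultiheadAttention stores the query/key/value projections stacked row-wise in a single in_proj_weight of shape (3 * embed_dim, embed_dim), while fairseq keeps separate q_proj/k_proj/v_proj modules. A small standalone check of that layout:

 import torch

 embed_dim, num_heads = 8, 2
 mha = torch.nn.MultiheadAttention(embed_dim, num_heads)

 # in_proj_weight stacks the q, k and v projection matrices along dim 0
 assert mha.in_proj_weight.shape == (3 * embed_dim, embed_dim)
 w_q, w_k, w_v = mha.in_proj_weight.chunk(3, dim=0)
 assert w_q.shape == w_k.shape == w_v.shape == (embed_dim, embed_dim)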