def __init__(self, args, dictionary):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.embed_dim = args.decoder_embed_dim
    self.output_embed_dim = args.decoder_embed_dim
    self.padding_idx = dictionary.pad_idx
    self.max_tgt_positions = args.max_tgt_positions

    # Token embedding table for target tokens
    self.embedding = generate_embedding(len(dictionary), self.embed_dim,
                                        dictionary.pad_idx)
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(self.embed_dim)

    # Positional embeddings indexed by target position
    self.embed_positions = PositionalEmbedding(
        self.embed_dim,
        padding_idx=self.padding_idx,
        init_size=self.max_tgt_positions + self.padding_idx + 1)

    # Generate N identical Decoder Layers
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(args) for _ in range(args.decoder_layers)
    ])

    # Output projection from decoder hidden states to vocabulary logits
    self.embed_out = nn.Linear(self.output_embed_dim, len(dictionary))
    nn.init.normal_(self.embed_out.weight, mean=0,
                    std=self.output_embed_dim ** -0.5)
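# `generate_embedding` is a helper defined elsewhere in this project. The sketch
# below is an assumption about what such a helper typically does (not the
# project's exact code): create an nn.Embedding, initialise its weights with a
# small normal, and zero out the padding row.
import torch.nn as nn

def generate_embedding_sketch(num_embeddings, embedding_dim, padding_idx):
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
    nn.init.constant_(m.weight[padding_idx], 0)
    return m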
def __init__(self, args, dictionary):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.embed_dim = args.encoder_embed_dim
    self.padding_idx = dictionary.pad_idx
    self.max_src_positions = args.max_src_positions

    # Token embedding table for source tokens
    self.embedding = generate_embedding(len(dictionary), self.embed_dim,
                                        dictionary.pad_idx)
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(self.embed_dim)

    # Positional embeddings indexed by source position
    self.embed_positions = PositionalEmbedding(
        self.embed_dim,
        padding_idx=self.padding_idx,
        init_size=self.max_src_positions + self.padding_idx + 1)

    # Generate N identical Encoder Layers
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args) for _ in range(args.encoder_layers)
    ])
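# `PositionalEmbedding` is likewise defined elsewhere in this project. Below is a
# self-contained sketch of the standard sinusoidal table such a module is commonly
# built from ("Attention Is All You Need"); the real module may differ, e.g. by
# reserving the first `padding_idx + 1` slots, as the `init_size` arguments above
# suggest. The sketch assumes an even `embed_dim`.
import math
import torch

def sinusoidal_table_sketch(num_positions, embed_dim):
    position = torch.arange(num_positions, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, embed_dim, 2, dtype=torch.float)
                         * (-math.log(10000.0) / embed_dim))
    table = torch.zeros(num_positions, embed_dim)
    table[:, 0::2] = torch.sin(position * div_term)   # even dimensions
    table[:, 1::2] = torch.cos(position * div_term)   # odd dimensions
    return table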