def __init__(self, args, dictionary, embed_tokens, left_pad=False):
    """Assemble the decoder's embeddings, layer stack and output weights.

    Args:
        args: Configuration namespace. This method reads ``dropout``,
            ``share_decoder_input_output_embed``, ``decoder_learned_pos``
            and ``decoder_layers``; the whole object is also handed to
            every ``TransformerDecoderLayer``.
        dictionary: Target dictionary. Passed to the parent constructor;
            its ``len()`` sizes the output matrix when one is created.
        embed_tokens: Token embedding module. Its ``embedding_dim`` and
            ``padding_idx`` configure the rest of the decoder.
        left_pad: Forwarded unchanged to the positional embedding.
    """
    super().__init__(dictionary)

    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed

    # Everything below is sized from the token embedding table.
    model_dim = embed_tokens.embedding_dim
    pad_index = embed_tokens.padding_idx

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(model_dim)

    # NOTE(review): 1024 is passed as the first positional argument;
    # presumably the number of supported positions -- confirm against
    # fairseq's PositionalEmbedding.
    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024,
        model_dim,
        pad_index,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    )

    decoder_stack = [
        fairseq_transformer.TransformerDecoderLayer(args)
        for _ in range(args.decoder_layers)
    ]
    self.layers = nn.ModuleList(decoder_stack)

    # With shared input/output embeddings no separate output matrix is made.
    if self.share_input_output_embed:
        return

    output_weight = nn.Parameter(torch.Tensor(len(dictionary), model_dim))
    nn.init.normal_(output_weight, mean=0, std=model_dim ** -0.5)
    self.embed_out = output_weight
def __init__(self, args, src_dict, dst_dict, embed_tokens):
    """Assemble the decoder, including its optional output-side components.

    On top of the embeddings and the decoder layer stack, this sets up
    (depending on ``args``) a bottleneck projection, an adaptive softmax
    or a plain output matrix, and a vocabulary reduction module.

    Args:
        args: Configuration namespace. This method reads ``dropout``,
            ``share_decoder_input_output_embed``, ``share_all_embeddings``,
            ``decoder_learned_pos``, ``decoder_layers``,
            ``decoder_out_embed_dim``, ``adaptive_softmax_cutoff``,
            ``vocab_reduction_params`` and ``fp16``; the whole object is
            also handed to every ``TransformerDecoderLayer``.
        src_dict: Source dictionary; only passed on to ``VocabReduction``.
        dst_dict: Target dictionary. Passed to the parent constructor; its
            ``len()`` sizes the output projection.
        embed_tokens: Token embedding module. Its ``embedding_dim`` and
            ``padding_idx`` configure the rest of the decoder.

    Raises:
        AssertionError: If ``decoder_out_embed_dim`` is set together with
            output-embedding sharing, or if vocabulary reduction is
            requested together with adaptive softmax.
    """
    super().__init__(dst_dict)

    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed

    # Everything below is sized from the token embedding table.
    hidden_dim = embed_tokens.embedding_dim
    pad_index = embed_tokens.padding_idx

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(hidden_dim)

    # NOTE(review): 1024 is passed as the first positional argument;
    # presumably the number of supported positions -- confirm against
    # fairseq's PositionalEmbedding.
    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024,
        hidden_dim,
        pad_index,
        learned=args.decoder_learned_pos,
    )

    decoder_stack = [
        fairseq_transformer.TransformerDecoderLayer(args)
        for _ in range(args.decoder_layers)
    ]
    self.layers = nn.ModuleList(decoder_stack)

    # Optional components default to "absent".
    self.adaptive_softmax = None
    self.bottleneck_layer = None

    # Optional bottleneck: project decoder states to a separate output size.
    output_dim = hidden_dim
    bottleneck_dim = args.decoder_out_embed_dim
    if bottleneck_dim is not None:
        shares_output_embedding = (
            args.share_all_embeddings or args.share_decoder_input_output_embed
        )
        assert (
            not shares_output_embedding
        ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
        self.bottleneck_layer = fairseq_transformer.Linear(
            hidden_dim, bottleneck_dim
        )
        output_dim = bottleneck_dim

    # Output layer: adaptive softmax takes precedence; otherwise a dedicated
    # output matrix is created unless the input embedding is shared.
    cutoff_spec = args.adaptive_softmax_cutoff
    if cutoff_spec is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dst_dict),
            output_dim,
            options.eval_str_list(cutoff_spec, type=int),
            dropout=args.dropout,
        )
    elif not self.share_input_output_embed:
        output_weight = nn.Parameter(torch.Tensor(len(dst_dict), output_dim))
        nn.init.normal_(output_weight, mean=0, std=output_dim ** -0.5)
        self.embed_out = output_weight

    # Vocabulary reduction is only allowed without adaptive softmax.
    self.vocab_reduction_module = None
    if args.vocab_reduction_params:
        assert (
            self.adaptive_softmax is None
        ), "vocabulary reduction not compatible with adaptive softmax!"
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict,
            dst_dict,
            args.vocab_reduction_params,
            fp16=args.fp16,
        )

    self.onnx_trace = False