def __init__(self, args, dictionary, embed_tokens, left_pad=True):
    super().__init__(dictionary)
    self.dropout = args.dropout

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)

    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024,
        embed_dim,
        self.padding_idx,
        left_pad=left_pad,
        learned=args.encoder_learned_pos,
    )

    self.layers = nn.ModuleList([])
    self.layers.extend([
        fairseq_transformer.TransformerEncoderLayer(args)
        for _ in range(args.encoder_layers)
    ])

    # Variable tracker
    self.tracker = VariableTracker()

    # Initialize adversarial mode
    self.set_gradient_tracking_mode(False)

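
# Illustrative sketch, not part of the original encoder: a guess at how the
# scaled token embeddings and positional embeddings set up above are combined
# in the forward pass. All names and sizes below (vocab_size, embed_dim,
# tokens, and the plain nn.Embedding standing in for PositionalEmbedding) are
# made up for the example.
def _sketch_encoder_embedding():
    import math

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    vocab_size, embed_dim, padding_idx = 100, 16, 1
    embed_tokens = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
    embed_positions = nn.Embedding(1024, embed_dim)  # stand-in for PositionalEmbedding
    embed_scale = math.sqrt(embed_dim)

    tokens = torch.randint(2, vocab_size, (2, 7))  # (batch, src_len)
    positions = torch.arange(tokens.size(1)).expand_as(tokens)

    x = embed_scale * embed_tokens(tokens) + embed_positions(positions)
    x = F.dropout(x, p=0.1, training=True)
    return x.transpose(0, 1)  # (src_len, batch, embed_dim), the layout fairseq encoders use
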
def __init__(self, args, dictionary, embed_tokens, left_pad=False):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed

    embed_dim = embed_tokens.embedding_dim
    padding_idx = embed_tokens.padding_idx

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)

    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024,
        embed_dim,
        padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    )

    self.layers = nn.ModuleList([])
    self.layers.extend([
        fairseq_transformer.TransformerDecoderLayer(args)
        for _ in range(args.decoder_layers)
    ])

    if not self.share_input_output_embed:
        self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=embed_dim ** -0.5)

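
# Illustrative sketch, not part of the original decoder: the two output
# projection paths implied by share_input_output_embed. With sharing, logits
# are computed against the input embedding matrix; otherwise against the
# separately initialized embed_out parameter. All sizes below are made up.
def _sketch_output_projection():
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    vocab_size, embed_dim = 100, 16
    embed_tokens = nn.Embedding(vocab_size, embed_dim)
    embed_out = nn.Parameter(torch.Tensor(vocab_size, embed_dim))
    nn.init.normal_(embed_out, mean=0, std=embed_dim ** -0.5)

    x = torch.randn(2, 7, embed_dim)  # decoder features: (batch, tgt_len, embed_dim)
    logits_shared = F.linear(x, embed_tokens.weight)  # share_input_output_embed=True
    logits_separate = F.linear(x, embed_out)          # share_input_output_embed=False
    return logits_shared, logits_separate
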
def __init__(self, args, src_dict, dst_dict, embed_tokens):
    super().__init__(dst_dict)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed

    embed_dim = embed_tokens.embedding_dim
    padding_idx = embed_tokens.padding_idx

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)

    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024,
        embed_dim,
        padding_idx,
        learned=args.decoder_learned_pos,
    )

    self.layers = nn.ModuleList([])
    self.layers.extend([
        fairseq_transformer.TransformerDecoderLayer(args)
        for _ in range(args.decoder_layers)
    ])

    # Optionally project decoder output down to a smaller dimension before the
    # output layer.
    self.adaptive_softmax = None
    self.bottleneck_layer = None
    out_embed_dim = embed_dim
    if args.decoder_out_embed_dim is not None:
        assert (
            not args.share_all_embeddings
            and not args.share_decoder_input_output_embed
        ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
        self.bottleneck_layer = fairseq_transformer.Linear(
            embed_dim, args.decoder_out_embed_dim
        )
        out_embed_dim = args.decoder_out_embed_dim

    # Either an adaptive softmax over the target vocabulary or a dedicated
    # output embedding matrix.
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dst_dict),
            out_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.dropout,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(torch.Tensor(len(dst_dict), out_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim ** -0.5)

    # Optional vocabulary reduction restricts the softmax to a reduced target
    # vocabulary.
    self.vocab_reduction_module = None
    if args.vocab_reduction_params:
        assert (
            self.adaptive_softmax is None
        ), "vocabulary reduction not compatible with adaptive softmax!"
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16
        )

    self.onnx_trace = False

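
# Illustrative sketch, not part of the original decoder: the role of the
# bottleneck_layer configured above. Decoder features of size embed_dim are
# projected down to decoder_out_embed_dim before the (smaller) output
# embedding is applied. All sizes below are made up; nn.Linear stands in for
# fairseq_transformer.Linear.
def _sketch_bottleneck_projection():
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    embed_dim, out_embed_dim, vocab_size = 512, 128, 100
    bottleneck_layer = nn.Linear(embed_dim, out_embed_dim)
    embed_out = torch.randn(vocab_size, out_embed_dim)

    x = torch.randn(2, 7, embed_dim)   # decoder features: (batch, tgt_len, embed_dim)
    x = bottleneck_layer(x)            # -> (batch, tgt_len, out_embed_dim)
    return F.linear(x, embed_out)      # -> (batch, tgt_len, vocab_size)
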
def __init__(self, args, embed_tokens):
    super().__init__()
    self.dropout = args.dropout

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)

    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024,
        embed_dim,
        self.padding_idx,
        learned=args.encoder_learned_pos,
    )

def __init__(
    self,
    args,
    dictionary,
    embed_tokens,
    num_chars=50,
    embed_dim=32,
    char_cnn_params="[(128, 3), (128, 5)]",
    char_cnn_nonlinear_fn="tanh",
    char_cnn_pool_type="max",
    char_cnn_num_highway_layers=0,
    char_cnn_output_dim=-1,
    use_pretrained_weights=False,
    finetune_pretrained_weights=False,
    weights_file=None,
):
    super().__init__(dictionary)

    convolutions_params = literal_eval(char_cnn_params)
    self.char_cnn_encoder = char_encoder.CharCNNModel(
        dictionary,
        num_chars,
        embed_dim,
        convolutions_params,
        char_cnn_nonlinear_fn,
        char_cnn_pool_type,
        char_cnn_num_highway_layers,
        char_cnn_output_dim,
        use_pretrained_weights,
        finetune_pretrained_weights,
        weights_file,
    )

    self.embed_tokens = embed_tokens
    token_embed_dim = embed_tokens.embedding_dim
    self.word_layer_norm = nn.LayerNorm(token_embed_dim)

    char_embed_dim = (
        char_cnn_output_dim
        if char_cnn_output_dim != -1
        else sum(out_dim for (out_dim, _) in convolutions_params)
    )
    self.char_layer_norm = nn.LayerNorm(char_embed_dim)
    self.word_dim = char_embed_dim + token_embed_dim
    self.char_scale = math.sqrt(char_embed_dim / self.word_dim)
    self.word_scale = math.sqrt(token_embed_dim / self.word_dim)

    if self.word_dim != args.encoder_embed_dim:
        self.word_to_transformer_embed = fairseq_transformer.Linear(
            self.word_dim, args.encoder_embed_dim
        )

    self.dropout = args.dropout
    self.padding_idx = dictionary.pad()
    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024,
        args.encoder_embed_dim,
        self.padding_idx,
        learned=args.encoder_learned_pos,
    )

    self.transformer_encoder_given_embeddings = TransformerEncoderGivenEmbeddings(
        args=args, proj_to_decoder=True
    )

    # Variable tracker
    self.tracker = VariableTracker()

    # Initialize adversarial mode
    self.set_gradient_tracking_mode(False)
    self.set_embed_noising_mode(False)

    # disables sorting and word-length thresholding if True
    # (enables ONNX tracing of length-sorted input with batch_size = 1)
    self.onnx_export_model = False

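
# Illustrative sketch, not part of the original encoder: a guess at how the
# char_scale / word_scale factors computed above might be applied when the
# character-CNN output and the word embedding are concatenated into a single
# word representation of size self.word_dim. All sizes below are made up.
def _sketch_char_word_concat():
    import math

    import torch

    token_embed_dim, char_embed_dim = 16, 256
    word_dim = token_embed_dim + char_embed_dim
    char_scale = math.sqrt(char_embed_dim / word_dim)
    word_scale = math.sqrt(token_embed_dim / word_dim)

    word_emb = torch.randn(2, 7, token_embed_dim)  # (batch, src_len, token_embed_dim)
    char_emb = torch.randn(2, 7, char_embed_dim)   # stand-in for the char CNN output
    combined = torch.cat([word_scale * word_emb, char_scale * char_emb], dim=-1)
    assert combined.shape[-1] == word_dim
    return combined
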
def __init__(self, args, src_dict, dst_dict, embed_tokens):
    super().__init__(dst_dict)
    self.dropout = args.dropout

    # LayerDrop probability: chance of skipping an entire decoder layer during
    # training.
    self.decoder_layerdrop = 0
    if hasattr(args, "decoder_layerdrop") and args.decoder_layerdrop > 0:
        self.decoder_layerdrop = args.decoder_layerdrop

    self.share_input_output_embed = args.share_decoder_input_output_embed

    embed_dim = embed_tokens.embedding_dim
    padding_idx = embed_tokens.padding_idx

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)

    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024,
        embed_dim,
        padding_idx,
        learned=args.decoder_learned_pos,
    )

    self.aan = args.aan
    decoder_layer_class = (
        AANDecoderLayer if self.aan else fairseq_transformer.TransformerDecoderLayer
    )
    self.layers = nn.ModuleList([])
    self.layers.extend(
        [decoder_layer_class(args) for _ in range(args.decoder_layers)]
    )

    # Map retained decoder layer ids to their new indices when only a subset
    # of layers is kept.
    if hasattr(args, "decoder_layers_to_keep") and args.decoder_layers_to_keep:
        layers_to_keep = sorted(
            int(x) for x in args.decoder_layers_to_keep.split(",")
        )
        self.decoder_layers_to_keep = {
            layer_id: layer_idx for layer_idx, layer_id in enumerate(layers_to_keep)
        }

    # Optionally project decoder output down to a smaller dimension before the
    # output layer.
    self.adaptive_softmax = None
    self.bottleneck_layer = None
    out_embed_dim = embed_dim
    if args.decoder_out_embed_dim is not None:
        assert (
            not args.share_all_embeddings
            and not args.share_decoder_input_output_embed
        ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
        self.bottleneck_layer = fairseq_transformer.Linear(
            embed_dim, args.decoder_out_embed_dim
        )
        out_embed_dim = args.decoder_out_embed_dim

    # Either an adaptive softmax over the target vocabulary or a dedicated
    # output embedding matrix.
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dst_dict),
            out_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.dropout,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(torch.Tensor(len(dst_dict), out_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim ** -0.5)

    # Optional vocabulary reduction restricts the softmax to a reduced target
    # vocabulary.
    self.vocab_reduction_module = None
    if args.vocab_reduction_params:
        assert (
            self.adaptive_softmax is None
        ), "vocabulary reduction not compatible with adaptive softmax!"
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16
        )

    self.onnx_trace = False

    # Use quantizable nn.Linear for output projection instead of F.linear.
    self.output_projection = None
    if self.vocab_reduction_module is None:
        if self.share_input_output_embed:
            self.output_projection = nn.Linear(
                self.embed_tokens.weight.shape[1],
                self.embed_tokens.weight.shape[0],
            )
            self.output_projection.weight = self.embed_tokens.weight
        else:
            self.output_projection = nn.Linear(
                self.embed_out.shape[1], self.embed_out.shape[0]
            )
            self.output_projection.weight = self.embed_out

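
# Illustrative sketch, not part of the original decoder: how a decoder_layerdrop
# probability like the one stored above is typically applied during training --
# each layer is skipped independently with probability p, and no layers are
# skipped at inference time. The layer list and input below are stand-ins.
def _sketch_layerdrop(p=0.2, training=True):
    import torch
    import torch.nn as nn

    layers = nn.ModuleList([nn.Linear(16, 16) for _ in range(6)])
    x = torch.randn(2, 16)
    for layer in layers:
        if training and torch.rand(1).item() < p:
            continue  # drop this layer for the current forward pass
        x = layer(x)
    return x
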
def __init__(
    self,
    args,
    src_dict,
    dst_dict,
    embed_tokens,
    no_encoder_attn=False,
    left_pad=False,
    final_norm=True,
):
    super().__init__(dst_dict)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # TODO: try with input_embed_dim

    # Project input embeddings to the decoder's model dimension if the two differ.
    self.project_in_dim = (
        Linear(input_embed_dim, embed_dim, bias=False)
        if embed_dim != input_embed_dim
        else None
    )

    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024,
        embed_dim,
        padding_idx,
        learned=args.decoder_learned_pos,
    )

    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerAANDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])

    # Optionally project decoder output down to a smaller dimension before the
    # output layer.
    self.adaptive_softmax = None
    self.bottleneck_layer = None
    out_embed_dim = embed_dim
    if args.decoder_out_embed_dim is not None:
        assert (
            not args.share_all_embeddings
            and not args.share_decoder_input_output_embed
        ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
        self.bottleneck_layer = Linear(embed_dim, args.decoder_out_embed_dim)
        out_embed_dim = args.decoder_out_embed_dim

    # Either an adaptive softmax over the target vocabulary or a dedicated
    # output embedding matrix.
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dst_dict),
            out_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(torch.Tensor(len(dst_dict), out_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim ** -0.5)

    self.register_buffer("version", torch.Tensor([2]))

    # Optional final layer norm when normalizing before each sublayer.
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)

    # Optional vocabulary reduction restricts the softmax to a reduced target
    # vocabulary.
    self.vocab_reduction_module = None
    if args.vocab_reduction_params:
        assert (
            self.adaptive_softmax is None
        ), "vocabulary reduction not compatible with adaptive softmax!"
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16
        )

    self.onnx_trace = False