def __init__(self, layer_id, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.self_attn = MultiheadAttention820(
        embed_dim=self.embed_dim,
        num_heads=args.decoder_attention_heads,
        layer_id=layer_id,
        args=args,
        dropout=args.attention_dropout,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
        cur_attn_type='ds')
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, 'activation_fn', 'relu'))
    self.activation_dropout = getattr(args, 'activation_dropout', 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, 'relu_dropout', 0)
    self.normalize_before = args.decoder_normalize_before
    # use LayerNorm rather than FusedLayerNorm for exporting.
    # char_inputs can be used to determine this.
    # TODO remove this once we update apex with the fix
    export = getattr(args, 'char_inputs', False)
    self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export, args=args)
    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = MultiheadAttention820(
            self.embed_dim,
            args.decoder_attention_heads,
            layer_id=layer_id,
            args=args,
            dropout=args.attention_dropout,
            cur_attn_type='dc',
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export, args=args)
    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim,
                      layer_id=layer_id, args=args, cur_linear='fc1')
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim,
                      layer_id=layer_id, args=args, cur_linear='fc2')
    self.need_attn = True
    self.onnx_trace = False
    self.input_dropout = args.input_dropout if 'input_dropout' in args else 0
def __init__( self, padding_idx: int, vocab_size: int, num_encoder_layers: int = 6, embedding_dim: int = 768, ffn_embedding_dim: int = 3072, num_attention_heads: int = 8, dropout: float = 0.1, attention_dropout: float = 0.1, activation_dropout: float = 0.1, layerdrop: float = 0.0, max_seq_len: int = 256, num_segments: int = 2, use_position_embeddings: bool = True, offset_positions_by_padding: bool = True, encoder_normalize_before: bool = False, apply_bert_init: bool = False, activation_fn: str = "relu", learned_pos_embedding: bool = True, add_bias_kv: bool = False, add_zero_attn: bool = False, embed_scale: float = None, freeze_embeddings: bool = False, n_trans_layers_to_freeze: int = 0, export: bool = False, ) -> None: super().__init__() self.padding_idx = padding_idx self.vocab_size = vocab_size self.dropout = dropout self.layerdrop = layerdrop self.max_seq_len = max_seq_len self.embedding_dim = embedding_dim self.num_segments = num_segments self.use_position_embeddings = use_position_embeddings self.apply_bert_init = apply_bert_init self.learned_pos_embedding = learned_pos_embedding self.embed_tokens = nn.Embedding(self.vocab_size, self.embedding_dim, self.padding_idx) self.embed_scale = embed_scale self.segment_embeddings = (nn.Embedding( self.num_segments, self.embedding_dim, padding_idx=None) if self.num_segments > 0 else None) self.embed_positions = (PositionalEmbedding( self.max_seq_len, self.embedding_dim, padding_idx=( self.padding_idx if offset_positions_by_padding else None), learned=self.learned_pos_embedding, ) if self.use_position_embeddings else None) self.layers = nn.ModuleList([ TransformerSentenceEncoderLayer( embedding_dim=self.embedding_dim, ffn_embedding_dim=ffn_embedding_dim, num_attention_heads=num_attention_heads, dropout=self.dropout, attention_dropout=attention_dropout, activation_dropout=activation_dropout, activation_fn=activation_fn, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, export=export, ) for _ in range(num_encoder_layers) ]) if encoder_normalize_before: self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export) else: self.emb_layer_norm = None # Apply initialization of model params after building the model if self.apply_bert_init: self.apply(init_bert_params) def freeze_module_params(m): if m is not None: for p in m.parameters(): p.requires_grad = False if freeze_embeddings: freeze_module_params(self.embed_tokens) freeze_module_params(self.segment_embeddings) freeze_module_params(self.embed_positions) freeze_module_params(self.emb_layer_norm) for layer in range(n_trans_layers_to_freeze): freeze_module_params(self.layers[layer])
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): self.args = args super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self._future_mask = torch.empty(0) self.dropout = args.dropout self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.embed_dim = embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt( embed_dim) if not args.adaptive_input and args.quant_noise_pq > 0: self.quant_noise = apply_quant_noise_( nn.Linear(embed_dim, embed_dim, bias=False), args.quant_noise_pq, args.quant_noise_pq_block_size, ) else: self.quant_noise = None self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None) self.embed_positions = (PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None) if getattr(args, "layernorm_embedding", False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None self.cross_self_attention = getattr(args, "cross_self_attention", False) if self.decoder_layerdrop > 0.0: self.layers = LayerDropModuleList(p=self.decoder_layerdrop) else: self.layers = nn.ModuleList([]) self.layers.extend([ self.build_decoder_layer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.num_layers = len(self.layers) if args.decoder_normalize_before and not getattr( args, "no_decoder_final_norm", False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None self.project_out_dim = (Linear( embed_dim, self.output_embed_dim, bias=False) if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None) self.adaptive_softmax = None self.output_projection = None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif self.share_input_output_embed: self.output_projection = nn.Linear( self.embed_tokens.weight.shape[1], self.embed_tokens.weight.shape[0], bias=False, ) self.output_projection.weight = self.embed_tokens.weight else: self.output_projection = nn.Linear(self.output_embed_dim, len(dictionary), bias=False) nn.init.normal_(self.output_projection.weight, mean=0, std=self.output_embed_dim**-0.5)
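# --- Added illustration (not part of the original module) ---------------------
# When share_input_output_embed is set, the decoder above reuses the token
# embedding matrix as its output projection by assigning the same Parameter to
# both modules. A minimal standalone sketch of that tying:
import torch.nn as nn

emb = nn.Embedding(1000, 512)
proj = nn.Linear(512, 1000, bias=False)
proj.weight = emb.weight                      # both modules now share one Parameter
assert proj.weight.data_ptr() == emb.weight.data_ptr()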
def __init__(self, args, no_encoder_attn=False, add_bias_kv=False,
             add_zero_attn=False, LayerNum=None):
    super().__init__()
    global tmp_file
    self.args = args
    if not hasattr(self.args, 'mixed_precision'):
        self.args.mixed_precision = False
    if not hasattr(self.args, 'plot_variance'):
        self.args.plot_variance = False
    if not hasattr(self.args, 'plot_gradient'):
        self.args.plot_gradient = False
    self.normalize_before = args.decoder_normalize_before
    self.embed_dim = args.decoder_embed_dim
    self.cross_self_attention = getattr(args, 'cross_self_attention', False)
    self.layer_num = LayerNum
    if 'adaptive' in args.init_type:
        assert not self.normalize_before
        self.self_attn = MultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=args.decoder_attention_heads,
            dropout=args.attention_dropout,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            self_attention=not self.cross_self_attention)
        assert not no_encoder_attn
        self.encoder_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            kdim=getattr(args, 'encoder_embed_dim', None),
            vdim=getattr(args, 'encoder_embed_dim', None),
            dropout=args.attention_dropout,
            encoder_decoder_attention=True)
        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
        if 'adaptive-profiling' == args.init_type:
            # profiling run: write ratios, start from all-ones parameters
            if not tmp_file:
                tmp_file = open('profile.ratio.init', 'w')
            self.self_ratio_change = nn.Parameter(torch.ones(self.embed_dim))
            self.encoder_ratio_change = nn.Parameter(torch.ones(self.embed_dim))
            self.fc_ratio_change = nn.Parameter(torch.ones(self.embed_dim))
        else:
            # adaptive init: read back the ratios recorded during profiling
            if not tmp_file:
                tmp_file = open('profile.ratio.init', 'r')
            layer_iter, next_value = [float(tup) for tup in tmp_file.readline().split()]
            print('layer_num: {}, layer_iter: {}'.format(self.layer_num, layer_iter))
            assert layer_iter == 3 * self.layer_num + 1
            print('decoder self ratio: {}'.format(next_value))
            self.self_ratio_change = nn.Parameter(torch.ones(self.embed_dim))
            self.self_ratio_change.data.fill_(next_value)
            layer_iter, next_value = [float(tup) for tup in tmp_file.readline().split()]
            print('layer_num: {}, layer_iter: {}'.format(self.layer_num, layer_iter))
            assert layer_iter == 3 * self.layer_num + 2
            print('decoder en ratio: {}'.format(next_value))
            self.encoder_ratio_change = nn.Parameter(torch.ones(self.embed_dim))
            self.encoder_ratio_change.data.fill_(next_value)
            layer_iter, next_value = [float(tup) for tup in tmp_file.readline().split()]
            print('layer_num: {}, layer_iter: {}'.format(self.layer_num, layer_iter))
            assert layer_iter == 3 * self.layer_num + 3
            print('decoder ffn ratio: {}'.format(next_value))
            self.fc_ratio_change = nn.Parameter(torch.ones(self.embed_dim))
            self.fc_ratio_change.data.fill_(next_value)
        export = getattr(args, 'char_inputs', False)
        self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
        self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
    else:
        self.self_attn = MultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=args.decoder_attention_heads,
            dropout=args.attention_dropout,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            self_attention=not self.cross_self_attention)
        assert not no_encoder_attn
        self.encoder_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            kdim=getattr(args, 'encoder_embed_dim', None),
            vdim=getattr(args, 'encoder_embed_dim', None),
            dropout=args.attention_dropout,
            encoder_decoder_attention=True)
        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
        if args.init_type == 'looklinear':
            # mirror the first half of the FFN weights into the second half with flipped sign
            self.fc1.weight.data[int(args.decoder_ffn_embed_dim / 2):, :] = \
                -self.fc1.weight.data[0:int(args.decoder_ffn_embed_dim / 2), :]
            self.fc2.weight.data[:, int(args.decoder_ffn_embed_dim / 2):] = \
                -self.fc2.weight.data[:, 0:int(args.decoder_ffn_embed_dim / 2)]
        export = getattr(args, 'char_inputs', False)
        if args.init_type != 'rezero':
            self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
            if no_encoder_attn:
                self.encoder_attn = None
                self.encoder_attn_layer_norm = None
            else:
                self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
            self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
        else:
            self.self_attn_layer_norm = None
            self.encoder_attn_layer_norm = None
            self.final_layer_norm = None
        if 'rezero' in args.init_type:
            self.rezero_weight = nn.Parameter(torch.Tensor([0]))
        else:
            assert args.init_type == 'default'
            self.rezero_weight = None
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, 'activation_fn', 'relu'))
    self.activation_dropout = getattr(args, 'activation_dropout', 0)
    if self.activation_dropout == 0:
        self.activation_dropout = getattr(args, 'relu_dropout', 0)
    self.need_attn = True
    self.onnx_trace = False
    if args.fp16:
        self.in_type = torch.half
    else:
        self.in_type = torch.float
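# --- Added illustration (not part of the original module) ---------------------
# The 'adaptive' branch above reads profile.ratio.init line by line, expecting two
# whitespace-separated floats per line: a running counter (checked against
# 3 * layer_num + k, k in {1, 2, 3}, for the self-attn, encoder-attn and FFN
# ratios) followed by the ratio recorded during an 'adaptive-profiling' run.
# The layer count and ratio values below are placeholders for illustration only.
with open('profile.ratio.init', 'w') as f:
    for layer_num in range(6):          # assumed number of decoder layers
        for k in (1, 2, 3):
            f.write('{} {}\n'.format(3 * layer_num + k, 1.0))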
def __init__(self, args, no_encoder_attn=False, kernel_size=0):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.conv_dim = args.decoder_conv_dim
    if args.decoder_glu:
        self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim)
        self.act = None
    if args.decoder_conv_type == "lightweight":
        self.conv = LightweightConv(
            self.conv_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    elif args.decoder_conv_type == "dynamic":
        self.conv = DynamicConv(
            self.conv_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    else:
        raise NotImplementedError
    self.linear2 = Linear(self.conv_dim, self.embed_dim)
    self.dropout_module = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__
    )
    self.relu_dropout_module = FairseqDropout(
        args.relu_dropout, module_name=self.__class__.__name__
    )
    self.input_dropout_module = FairseqDropout(
        args.input_dropout, module_name=self.__class__.__name__
    )
    self.normalize_before = args.decoder_normalize_before
    self.conv_layer_norm = LayerNorm(self.embed_dim)
    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
            encoder_decoder_attention=True,
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
    self.need_attn = True
def __init__(self, args, dictionary): super().__init__(dictionary) self.padding_idx = dictionary.pad() self.vocab_size = dictionary.__len__() self.max_positions = args.max_positions self.sentence_encoder = TransformerSentenceEncoder( padding_idx=self.padding_idx, vocab_size=self.vocab_size, num_encoder_layers=args.encoder_layers, embedding_dim=args.encoder_embed_dim, ffn_embedding_dim=args.encoder_ffn_embed_dim, num_attention_heads=args.encoder_attention_heads, dropout=args.dropout, attention_dropout=args.attention_dropout, activation_dropout=args.act_dropout, max_seq_len=self.max_positions, num_segments=args.num_segment, use_position_embeddings=not args.no_token_positional_embeddings, encoder_normalize_before=args.encoder_normalize_before, apply_bert_init=args.apply_bert_init, activation_fn=args.activation_fn, learned_pos_embedding=args.encoder_learned_pos, add_bias_kv=args.bias_kv, add_zero_attn=args.zero_attn, ) self.share_input_output_embed = args.share_encoder_input_output_embed self.embed_out = None self.sentence_projection_layer = None self.sentence_out_dim = args.sentence_class_num self.lm_output_learned_bias = None # Remove head is set to true during fine-tuning self.load_softmax = not getattr(args, 'remove_head', False) self.masked_lm_pooler = nn.Linear( args.encoder_embed_dim, args.encoder_embed_dim ) self.pooler_activation = utils.get_activation_fn(args.pooler_activation_fn) self.lm_head_transform_weight = nn.Linear(args.encoder_embed_dim, args.encoder_embed_dim) self.activation_fn = utils.get_activation_fn(args.activation_fn) self.layer_norm = LayerNorm(args.encoder_embed_dim) self.lm_output_learned_bias = None if self.load_softmax: self.lm_output_learned_bias = nn.Parameter(torch.zeros(self.vocab_size)) if not self.share_input_output_embed: self.embed_out = nn.Linear( args.encoder_embed_dim, self.vocab_size, bias=False ) if args.sent_loss: self.sentence_projection_layer = nn.Linear( args.encoder_embed_dim, self.sentence_out_dim, bias=False )
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    self.dropout = args.dropout
    self.dropout_structured_attention = getattr(args, "dropout_structured_attention", False)
    self.encoder_layerdrop = args.encoder_layerdrop
    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
    self.embed_positions = (
        PositionalEmbedding(
            args.max_source_positions,
            embed_dim,
            self.padding_idx,
            learned=args.encoder_learned_pos,
        )
        if not args.no_token_positional_embeddings
        else None
    )
    self.layer_wise_attention = getattr(args, "layer_wise_attention", False)
    self.layers = nn.ModuleList([])
    self.layers.extend(
        [self.build_encoder_layer(args) for i in range(args.encoder_layers)]
    )
    self.num_layers = len(self.layers)
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
    self.use_structured_attention = args.use_structured_attention
    self.explicit_str_att = args.explicit_str_att
    self.detach_bart_encoder = args.detach_bart_encoder
    self.use_identity_init = args.identity_init
    print('Using Identity init : ', self.use_identity_init)
    self.fp16 = args.fp16
    str_out_size = 0
    if self.use_structured_attention:
        print("Using Latent Structured Attention")
        self.structure_att = StructuredAttention(
            sent_hiddent_size=args.encoder_embed_dim,
            bidirectional=False,
            py_version='nightly',
            identity_init=self.use_identity_init)
        str_out_size += args.encoder_embed_dim // 2
    else:
        print("NOT Using Latent Structured Attention")
        self.structure_att = None
    if self.explicit_str_att:
        print("Using Explicit Structured Attention")
        self.tp_linear = nn.Linear(args.encoder_embed_dim, args.encoder_embed_dim // 2, bias=True)
        self.fzlinear = nn.Linear(args.encoder_embed_dim // 2, args.encoder_embed_dim // 2, bias=True)
        if self.use_identity_init:
            nn.init.eye_(self.tp_linear.weight)
            nn.init.eye_(self.fzlinear.weight)
        str_out_size += args.encoder_embed_dim // 2
def __init__(self, args): super().__init__() self.args = args feature_enc_layers = eval(args.conv_feature_layers) self.embed = feature_enc_layers[-1][0] self.feature_extractor = ConvFeatureExtractionModel( conv_layers=feature_enc_layers, dropout=0.0, mode=args.extractor_mode, conv_bias=args.conv_bias, ) self.post_extract_proj = (nn.Linear(self.embed, args.encoder_embed_dim) if self.embed != args.encoder_embed_dim and not args.quantize_input else None) self.mask_prob = args.mask_prob self.mask_selection = args.mask_selection self.mask_other = args.mask_other self.mask_length = args.mask_length self.no_mask_overlap = args.no_mask_overlap self.mask_min_space = args.mask_min_space self.mask_channel_prob = args.mask_channel_prob self.mask_channel_selection = args.mask_channel_selection self.mask_channel_other = args.mask_channel_other self.mask_channel_length = args.mask_channel_length self.no_mask_channel_overlap = args.no_mask_channel_overlap self.mask_channel_min_space = args.mask_channel_min_space self.dropout_input = nn.Dropout(args.dropout_input) self.dropout_features = nn.Dropout(args.dropout_features) self.feature_grad_mult = args.feature_grad_mult self.quantizer = None self.input_quantizer = None self.n_negatives = args.num_negatives self.cross_sample_negatives = args.cross_sample_negatives self.codebook_negatives = args.codebook_negatives self.negatives_from_everywhere = args.negatives_from_everywhere self.logit_temp = args.logit_temp final_dim = args.final_dim if args.final_dim > 0 else args.encoder_embed_dim if args.quantize_targets: vq_dim = args.latent_dim if args.latent_dim > 0 else final_dim self.quantizer = GumbelVectorQuantizer( dim=self.embed, num_vars=args.latent_vars, temp=eval(args.latent_temp), groups=args.latent_groups, combine_groups=False, vq_dim=vq_dim, time_first=True, ) self.project_q = nn.Linear(vq_dim, final_dim) else: self.project_q = nn.Linear(self.embed, final_dim) if args.quantize_input: if args.same_quantizer and self.quantizer is not None: vq_dim = final_dim self.input_quantizer = self.quantizer else: vq_dim = (args.latent_dim if args.latent_dim > 0 else args.encoder_embed_dim) self.input_quantizer = GumbelVectorQuantizer( dim=self.embed, num_vars=args.latent_vars, temp=eval(args.latent_temp), groups=args.latent_groups, combine_groups=False, vq_dim=vq_dim, time_first=True, ) self.project_inp = nn.Linear(vq_dim, args.encoder_embed_dim) self.mask_emb = nn.Parameter( torch.FloatTensor(args.encoder_embed_dim).uniform_()) self.encoder = TransformerEncoder(args) self.layer_norm = LayerNorm(self.embed) self.target_glu = None if args.target_glu: self.target_glu = nn.Sequential( nn.Linear(final_dim, final_dim * 2), nn.GLU()) self.final_proj = nn.Linear(args.encoder_embed_dim, final_dim)
def __init__(self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.self_attn = MultiheadAttention(
        embed_dim=self.embed_dim,
        num_heads=args.decoder_attention_heads,
        dropout=args.attention_dropout,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
        self_attention=True)
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, 'activation_fn', 'relu'))
    self.activation_dropout = getattr(args, 'activation_dropout', 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, 'relu_dropout', 0)
    self.normalize_before = args.decoder_normalize_before
    # use LayerNorm rather than FusedLayerNorm for exporting.
    # char_inputs can be used to determine this.
    # TODO remove this once we update apex with the fix
    export = getattr(args, 'char_inputs', False)
    self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
    self.positional_attention = getattr(args, 'positional_attention', True)
    if self.positional_attention:
        self.position_attn = MultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=args.decoder_attention_heads,
            dropout=args.attention_dropout,
            positional_attention=True)
        self.position_layer_norm = LayerNorm(self.embed_dim, export=export)
    else:
        self.position_attn = None
        self.position_layer_norm = None
    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            kdim=getattr(args, 'encoder_embed_dim', None),
            vdim=getattr(args, 'encoder_embed_dim', None),
            dropout=args.attention_dropout,
            encoder_decoder_attention=True)
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
    self.need_attn = True
    self.onnx_trace = False
def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self.dropout = args.dropout self.encoder_layerdrop = args.encoder_layerdrop embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions # self.embed_tokens = embed_tokens self.output_embed_dim = args.decoder_output_dim self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) # self.embed_positions = ( # PositionalEmbedding( # args.max_source_positions, # embed_dim, # self.padding_idx, # learned=args.encoder_learned_pos, # ) # if not args.no_token_positional_embeddings # else None # ) if not args.adaptive_input and args.quant_noise_pq > 0: self.quant_noise = apply_quant_noise_( nn.Linear(embed_dim, embed_dim, bias=False), args.quant_noise_pq, args.quant_noise_pq_block_size, ) else: self.quant_noise = None if self.encoder_layerdrop > 0.0: self.layers = LayerDropModuleList(p=self.encoder_layerdrop) else: self.layers = nn.ModuleList([]) self.layers.extend([ self.build_encoder_layer(args) for i in range(args.encoder_layers) ]) self.num_layers = len(self.layers) if args.encoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, "layernorm_embedding", False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None self.project_out_dim = ( Linear(embed_dim, self.output_embed_dim, bias=False) if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None ) self.output_projection = nn.Linear( embed_tokens.weight.shape[1], embed_tokens.weight.shape[0], bias=False, ) self.output_projection.weight = embed_tokens.weight
def __init__(self, args, embed_dim):
    super().__init__()
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
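# --- Added sketch (an assumption, not the original forward) -------------------
# A forward() consistent with the optional final LayerNorm built above: the norm
# is applied only when encoder_normalize_before created one.
def forward(self, x):
    if self.layer_norm is not None:
        x = self.layer_norm(x)
    return x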
def __init__(self, num_features, num_layers=8, kernel_size=3):
    super().__init__()
    self.residual_blocks = nn.ModuleList([])
    for _ in range(num_layers):
        self.residual_blocks.append(_ResLayer(num_features, kernel_size))
    self.final_ln = LayerNorm(num_features, elementwise_affine=False)
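# --- Added note (standalone check, not part of the original module) -----------
# elementwise_affine=False above makes the final LayerNorm a pure normalization
# with no learned scale or shift, which the torch built-in confirms:
import torch.nn as nn

assert sum(p.numel() for p in nn.LayerNorm(256).parameters()) == 2 * 256
assert sum(p.numel() for p in nn.LayerNorm(256, elementwise_affine=False).parameters()) == 0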
def __init__(
    self,
    padding_idx: int,
    vocab_size: int,
    num_encoder_layers: int = 24,
    embedding_dim: int = 1024,
    ffn_embedding_dim: int = 4096,
    num_attention_heads: int = 16,
    dropout: float = 0.1,
    attention_dropout: float = 0.1,
    activation_dropout: float = 0.0,
    layerdrop: float = 0.0,
    max_seq_len: int = 512,
    num_segments: int = 0,
    use_position_embeddings: bool = True,
    offset_positions_by_padding: bool = True,
    encoder_normalize_before: bool = True,
    apply_bert_init: bool = True,
    activation_fn: str = "gelu",
    learned_pos_embedding: bool = True,
    add_bias_kv: bool = False,
    add_zero_attn: bool = False,
    embed_scale: float = None,
    freeze_embeddings: bool = False,
    n_trans_layers_to_freeze: int = 0,
    export: bool = False,
    traceable: bool = False,
    q_noise: float = 0.0,
    qn_block_size: int = 8,
):
    super().__init__()
    self.padding_idx = padding_idx
    self.vocab_size = vocab_size
    self.dropout = dropout
    self.layerdrop = layerdrop
    self.max_seq_len = max_seq_len
    self.embedding_dim = embedding_dim
    self.num_segments = num_segments
    self.use_position_embeddings = use_position_embeddings
    self.apply_bert_init = apply_bert_init
    self.learned_pos_embedding = learned_pos_embedding
    self.traceable = traceable
    self.num_encoder_layers = num_encoder_layers
    self.num_attention_heads = num_attention_heads
    self.embed_tokens = nn.Embedding(self.vocab_size, self.embedding_dim, self.padding_idx)
    self.embed_scale = embed_scale
    if q_noise > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(self.embedding_dim, self.embedding_dim, bias=False),
            q_noise,
            qn_block_size,
        )
    else:
        self.quant_noise = None
    self.segment_embeddings = (
        nn.Embedding(self.num_segments, self.embedding_dim, padding_idx=None)
        if self.num_segments > 0
        else None)
    self.embed_positions = (
        PositionalEmbedding(
            self.max_seq_len,
            self.embedding_dim,
            padding_idx=(self.padding_idx if offset_positions_by_padding else None),
            learned=self.learned_pos_embedding,
        )
        if self.use_position_embeddings
        else None)
    self.layers = nn.ModuleList([
        TransformerSentenceEncoderLayer(
            embedding_dim=self.embedding_dim,
            ffn_embedding_dim=ffn_embedding_dim,
            num_attention_heads=num_attention_heads,
            dropout=self.dropout,
            attention_dropout=attention_dropout,
            activation_dropout=activation_dropout,
            activation_fn=activation_fn,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
            export=export,
        ) for _ in range(self.num_encoder_layers)
    ])
    # small MLP scoring head over a 2 * embedding_dim input
    self.score2 = nn.Sequential(
        nn.Linear(embedding_dim * 2, 200, bias=True), nn.Tanh())
    self.score3 = nn.Linear(200, 1, bias=True)
    if encoder_normalize_before:
        self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export)
    else:
        self.emb_layer_norm = None
    if self.apply_bert_init:
        self.apply(init_bert_params)

    def freeze_module_params(m):
        if m is not None:
            for p in m.parameters():
                p.requires_grad = False

    if freeze_embeddings:
        freeze_module_params(self.embed_tokens)
        freeze_module_params(self.segment_embeddings)
        freeze_module_params(self.embed_positions)
        freeze_module_params(self.emb_layer_norm)
    for layer in range(n_trans_layers_to_freeze):
        freeze_module_params(self.layers[layer])
def __init__(self, args, dictionary, embed_tokens, lang2idx2idx, M, N, no_encoder_attn=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim # define a dict of lang vocab id to its index in syntactic matrix self.lang2idx2idx = torch.LongTensor(lang2idx2idx) # define semantic and syntactic matrices no_langs = len([i for i in self.lang2idx2idx if i>-1]) self.M = M self.N = N padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim)
def __init__( self, padding_idx: int, vocab_size: int, num_encoder_layers: int = 6, embedding_dim: int = 768, ffn_embedding_dim: int = 3072, num_attention_heads: int = 8, dropout: float = 0.1, attention_dropout: float = 0.1, activation_dropout: float = 0.1, max_seq_len: int = 256, encoder_normalize_before: bool = False, embedding_normalize: bool = False, apply_bert_init: bool = False, activation_fn: str = "relu", embed_scale: float = None, rel_pos: bool = False, rel_pos_bins: int = 32, max_rel_pos: int = 128, export: bool = False, ) -> None: super().__init__() self.padding_idx = padding_idx self.vocab_size = vocab_size self.dropout = dropout self.max_seq_len = max_seq_len self.embedding_dim = embedding_dim self.apply_bert_init = apply_bert_init self.embed_tokens = nn.Embedding(self.vocab_size, self.embedding_dim, self.padding_idx) self.embed_scale = embed_scale self.attn_scale_factor = 2 self.num_attention_heads = num_attention_heads self.pos = nn.Embedding(self.max_seq_len + 1, self.embedding_dim) self.pos_q_linear = nn.Linear(self.embedding_dim, self.embedding_dim) self.pos_k_linear = nn.Linear(self.embedding_dim, self.embedding_dim) self.pos_scaling = float(self.embedding_dim / num_attention_heads * self.attn_scale_factor)**-0.5 self.pos_ln = LayerNorm(self.embedding_dim, export=export) self.layers = nn.ModuleList([ TransformerSentenceEncoderLayer( embedding_dim=self.embedding_dim, ffn_embedding_dim=ffn_embedding_dim, num_attention_heads=num_attention_heads, dropout=self.dropout, attention_dropout=attention_dropout, activation_dropout=activation_dropout, activation_fn=activation_fn, attn_scale_factor=self.attn_scale_factor, export=export, encoder_normalize_before=encoder_normalize_before, ) for _ in range(num_encoder_layers) ]) if embedding_normalize: self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export) else: self.emb_layer_norm = None if encoder_normalize_before: self.emb_out_layer_norm = LayerNorm(self.embedding_dim, export=export) else: self.emb_out_layer_norm = None # Apply initialization of model params after building the model if self.apply_bert_init: self.apply(init_bert_params) self.rel_pos = rel_pos if self.rel_pos: assert rel_pos_bins % 2 == 0 self.rel_pos_bins = rel_pos_bins self.max_rel_pos = max_rel_pos self.relative_attention_bias = nn.Embedding( self.rel_pos_bins + 1, self.num_attention_heads) seq_len = self.max_seq_len context_position = torch.arange(seq_len, dtype=torch.long)[:, None] memory_position = torch.arange(seq_len, dtype=torch.long)[None, :] relative_position = memory_position - context_position self.rp_bucket = relative_position_bucket( relative_position, num_buckets=self.rel_pos_bins, max_distance=self.max_rel_pos) # others to [CLS] self.rp_bucket[:, 0] = self.rel_pos_bins # [CLS] to others, Note: self.rel_pos_bins // 2 is not used in relative_position_bucket self.rp_bucket[0, :] = self.rel_pos_bins // 2
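# --- Added sketch (an assumption, not taken from the original forward) --------
# The T5-style buckets built above are typically turned into a per-head additive
# attention bias roughly like this; the real forward() may slice differently.
def get_rel_pos_bias(self, seq_len):
    rp_bucket = self.rp_bucket[:seq_len, :seq_len]        # (T, T) bucket ids
    values = self.relative_attention_bias(rp_bucket)      # (T, T, num_heads)
    return values.permute(2, 0, 1).unsqueeze(0)           # (1, heads, T, T)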
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    self._future_mask = torch.empty(0)
    # per-width dropout values, one entry per slimmable width configured below
    self.dropout = [0, 0, 0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3]
    self.index = None
    self.decoder_layerdrop = args.decoder_layerdrop
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.embed_dim = embed_dim
    self.output_embed_dim = args.decoder_output_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
    # map the full embedding width down to 4/16 ... 16/16 of embed_dim and back
    slim_widths = [int(embed_dim * k / 16) for k in range(4, 16)] + [embed_dim]
    self.embedding_hidden_mapping_in = SlimmableLinear(
        [embed_dim] * len(slim_widths), slim_widths)
    self.embedding_hidden_mapping_out = SlimmableLinear(
        slim_widths, [embed_dim] * len(slim_widths))
    self.project_in_dim = (
        Linear(input_embed_dim, embed_dim, bias=False)
        if embed_dim != input_embed_dim
        else None
    )
    self.embed_positions = (
        PositionalEmbedding(
            args.max_target_positions,
            embed_dim,
            self.padding_idx,
            learned=args.decoder_learned_pos,
        )
        if not args.no_token_positional_embeddings
        else None
    )
    self.cross_self_attention = getattr(args, "cross_self_attention", False)
    self.layer_wise_attention = getattr(args, "layer_wise_attention", False)
    self.layers = nn.ModuleList([])
    self.layers.extend(
        [TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers)]
    )
    self.num_layers = len(self.layers)
    self.adaptive_softmax = None
    self.project_out_dim = (
        Linear(embed_dim, self.output_embed_dim, bias=False)
        if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
        else None
    )
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), self.output_embed_dim)
        )
        nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)
    if args.decoder_normalize_before and not getattr(
            args, "no_decoder_final_norm", False
    ):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
def __init__( self, dictionary, embed_dim=512, out_embed_dim=256, max_positions=1024, convolutions=((512, 3),) * 8, attention=True, dropout=0.1, selfattention=False, attention_nheads=1, selfattention_nheads=1, project_input=False, gated_attention=False, downsample=False, pretrained=False, trained_decoder=None, left_pad=False, ): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([2])) self.pretrained = pretrained self.pretrained_decoder = trained_decoder self.dropout = dropout self.left_pad = left_pad self.need_attn = True in_channels = convolutions[0][0] def expand_bool_array(val): if isinstance(val, bool): # expand True into [True, True, ...] and do the same with False return [val] * len(convolutions) return val attention = expand_bool_array(attention) selfattention = expand_bool_array(selfattention) if not isinstance(attention, list) or len(attention) != len(convolutions): raise ValueError('Attention is expected to be a list of booleans of ' 'length equal to the number of layers.') num_embeddings = len(dictionary) padding_idx = dictionary.pad() self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) self.embed_positions = PositionalEmbedding( max_positions, embed_dim, padding_idx, left_pad=self.left_pad, ) self.fc1 = Linear(embed_dim, in_channels, dropout=dropout) self.projections = nn.ModuleList() self.convolutions = nn.ModuleList() self.attention = nn.ModuleList() self.selfattention = nn.ModuleList() self.attproj = nn.ModuleList() for i, (out_channels, kernel_size) in enumerate(convolutions): self.projections.append( Linear(in_channels, out_channels) if in_channels != out_channels else None ) self.convolutions.append( LinearizedConv1d( in_channels, out_channels * 2, kernel_size, padding=(kernel_size - 1), dropout=dropout, ) ) self.attention.append( DownsampledMultiHeadAttention( out_channels, embed_dim, attention_nheads, project_input=project_input, gated=False, downsample=False, ) if attention[i] else None ) self.attproj.append( Linear(out_channels, embed_dim, dropout=dropout) if attention[i] else None ) self.selfattention.append( SelfAttention( out_channels, embed_dim, selfattention_nheads, project_input=project_input, gated=gated_attention, downsample=downsample, ) if selfattention[i] else None ) in_channels = out_channels self.fc2 = Linear(in_channels, out_embed_dim) self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout) # model fusion if self.pretrained: # independent gates are learned from the concatenated input self.gate1 = nn.Sequential(Linear(out_embed_dim*2, out_embed_dim), nn.Sigmoid()) self.gate2 = nn.Sequential(Linear(out_embed_dim*2, out_embed_dim), nn.Sigmoid()) # pretrained and trained models are joined self.joining = nn.Sequential( Linear(out_embed_dim*2, out_embed_dim*2), LayerNorm(out_embed_dim*2), nn.GLU(), Linear(out_embed_dim, out_embed_dim*2), LayerNorm(out_embed_dim*2), nn.GLU(), Linear(out_embed_dim, out_embed_dim), LayerNorm(out_embed_dim) ) # pretrained model contains an output layer that is nhid -> vocab size # but the models are combined in their hidden state # the hook stores the output of the pretrained model forward self.pretrained_outputs = {} def save_output(): def hook(a, b, output): self.pretrained_outputs["out"] = output return hook self.pretrained_decoder.fc2.register_forward_hook(save_output())
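# --- Added illustration (standalone, not part of the original module) ---------
# The fusion code above relies on register_forward_hook to capture the pretrained
# decoder's fc2 output; the same pattern in isolation:
import torch
import torch.nn as nn

captured = {}
fc2 = nn.Linear(4, 4)
fc2.register_forward_hook(lambda module, inputs, output: captured.update(out=output))
_ = fc2(torch.randn(2, 4))
assert captured['out'].shape == (2, 4)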
def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) self.embed_positions = PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ TransformerEncoderLayer(args) for i in range(args.encoder_layers) ]) if args.encoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None #image section self.img_dim = 2048 self.text_dim = embed_dim self.L2norm = args.L2norm self.total_num_img = args.total_num_img self.per_num_img = args.per_num_img cap2image_file = args.cap2image_file image_embedding_file = args.image_embedding_file self.cap2image = pickle.load(open(cap2image_file, "rb")) #cap_id to image_id #print("image embedding processing...") embeding_weights = np.load(image_embedding_file) img_vocab, img_dim = embeding_weights.shape embeddings_matrix = np.zeros((img_vocab + 1, img_dim)) embeddings_matrix[1:] = embeding_weights self.img_embeddings = nn.Embedding.from_pretrained( torch.FloatTensor(embeddings_matrix), freeze=args.image_emb_fix) # update embedding # self.img_embeddings.load_state_dict({'weight': embeddings_matrix}) # if args.image_emb_fix: # self.img_embeddings.weight.requires_grad = False self.merge_option = args.merge_option self.dense = nn.Linear(self.img_dim, self.text_dim) self.mergeImage = nn.Linear(self.total_num_img, 1) if self.merge_option == "att-mul-concat": self.proj_attention = SCAttention(self.text_dim, 128) self.dense2 = nn.Linear(self.text_dim, 384) elif self.merge_option == "att-concat": self.dense2 = nn.Linear(2 * self.text_dim, self.text_dim) elif self.merge_option == "att-gate": self.gate_type = args.gate_type self.proj_attention = SCAttention(self.text_dim, self.text_dim) if self.gate_type == "neural-gate": self.sigmoid = nn.Sigmoid() self.gate_dense = nn.Linear(2 * self.text_dim, self.text_dim) elif self.gate_type == "scalar-gate": self.sigmoid = nn.Sigmoid() self.gate_dense = nn.Linear(2 * self.text_dim, 1) else: self.image_weight = args.image_weight else: self.proj_attention = SCAttention(self.text_dim, self.text_dim)
def __init__( self, cfg, dictionary, embed_tokens, no_encoder_attn=False, output_projection=None, ): self.cfg = cfg super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self._future_mask = torch.empty(0) self.dropout_module = FairseqDropout( cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__)) self.decoder_layerdrop = cfg.decoder.layerdrop self.share_input_output_embed = cfg.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = cfg.decoder.embed_dim self.embed_dim = embed_dim self.output_embed_dim = cfg.decoder.output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = cfg.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt( embed_dim) if not cfg.adaptive_input and cfg.quant_noise.pq > 0: self.quant_noise = apply_quant_noise_( nn.Linear(embed_dim, embed_dim, bias=False), cfg.quant_noise.pq, cfg.quant_noise.pq_block_size, ) else: self.quant_noise = None self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None) self.embed_positions = (PositionalEmbedding( self.max_target_positions, embed_dim, self.padding_idx, learned=cfg.decoder.learned_pos, ) if not cfg.no_token_positional_embeddings else None) if cfg.layernorm_embedding: self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export) else: self.layernorm_embedding = None self.cross_self_attention = cfg.cross_self_attention if self.decoder_layerdrop > 0.0: self.layers = LayerDropModuleList(p=self.decoder_layerdrop) else: self.layers = nn.ModuleList([]) self.layers.extend([ self.build_decoder_layer(cfg, no_encoder_attn) for _ in range(cfg.decoder.layers) ]) self.num_layers = len(self.layers) if cfg.decoder.normalize_before and not cfg.no_decoder_final_norm: self.layer_norm = LayerNorm(embed_dim, export=cfg.export) else: self.layer_norm = None self.project_out_dim = (Linear( embed_dim, self.output_embed_dim, bias=False) if embed_dim != self.output_embed_dim and not cfg.tie_adaptive_weights else None) self.adaptive_softmax = None self.output_projection = output_projection if self.output_projection is None: self.build_output_projection(cfg, dictionary, embed_tokens)
def __init__(self, args): super().__init__() self.args = args feature_enc_layers = eval(args.conv_feature_layers) self.embed = feature_enc_layers[-1][0] self.feature_extractor = ConvFeatureExtractionModel( conv_layers=feature_enc_layers, dropout=0.0, mode=args.extractor_mode, conv_bias=args.conv_bias, ) self.post_extract_proj = (nn.Linear(self.embed, args.encoder_embed_dim) if self.embed != args.encoder_embed_dim and not args.quantize_input else None) self.mask_prob = args.mask_prob self.mask_selection = args.mask_selection self.mask_other = args.mask_other self.mask_length = args.mask_length self.no_mask_overlap = args.no_mask_overlap self.mask_min_space = args.mask_min_space self.mask_channel_prob = args.mask_channel_prob self.mask_channel_selection = args.mask_channel_selection self.mask_channel_other = args.mask_channel_other self.mask_channel_length = args.mask_channel_length self.no_mask_channel_overlap = args.no_mask_channel_overlap self.mask_channel_min_space = args.mask_channel_min_space self.dropout_input = nn.Dropout(args.dropout_input) self.dropout_features = nn.Dropout(args.dropout_features) self.feature_grad_mult = args.feature_grad_mult self.quantizer = None self.input_quantizer = None self.n_negatives = args.num_negatives self.cross_sample_negatives = args.cross_sample_negatives self.codebook_negatives = args.codebook_negatives self.negatives_from_everywhere = args.negatives_from_everywhere self.logit_temp = args.logit_temp final_dim = args.final_dim if args.final_dim > 0 else args.encoder_embed_dim if args.quantize_targets: vq_dim = args.latent_dim if args.latent_dim > 0 else final_dim # 256 self.quantizer = GumbelVectorQuantizer( dim=self.embed, # 512 num_vars=args.latent_vars, # 320 temp=eval(args.latent_temp), # (2,0.5,0.999995) groups=args.latent_groups, # 2 combine_groups=False, vq_dim=vq_dim, # 256 time_first=True, ) self.project_q = nn.Linear(vq_dim, final_dim) else: self.project_q = nn.Linear(self.embed, final_dim) if args.quantize_input: if args.same_quantizer and self.quantizer is not None: vq_dim = final_dim self.input_quantizer = self.quantizer else: vq_dim = (args.latent_dim if args.latent_dim > 0 else args.encoder_embed_dim) self.input_quantizer = GumbelVectorQuantizer( dim=self.embed, num_vars=args.latent_vars, temp=eval(args.latent_temp), groups=args.latent_groups, combine_groups=False, vq_dim=vq_dim, time_first=True, ) self.project_inp = nn.Linear(vq_dim, args.encoder_embed_dim) self.mask_emb = nn.Parameter( torch.FloatTensor(args.encoder_embed_dim).uniform_()) self.encoder = TransformerEncoder(args) self.layer_norm = LayerNorm(self.embed) self.target_glu = None if args.target_glu: self.target_glu = nn.Sequential( nn.Linear(final_dim, final_dim * 2), nn.GLU()) self.final_proj = nn.Linear(args.encoder_embed_dim, final_dim) if getattr(args, "w2v_path", None): print('load Wav2VecEncoder from {}'.format(args.w2v_path)) state = checkpoint_utils.load_checkpoint_to_cpu(args.w2v_path) for i in list(state['model'].keys()): if 'quantizer' in i: state['model'].pop(i) print(self.load_state_dict(state["model"], strict=False))
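# --- Added illustration (values assumed, not taken from this file) ------------
# conv_feature_layers is eval()'d into a list of (dim, kernel_size, stride) tuples
# describing the wav2vec feature extractor; self.embed picks up the channel width
# of the last block.
conv_feature_layers = "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2"
feature_enc_layers = eval(conv_feature_layers)
assert feature_enc_layers[-1][0] == 512    # what self.embed would be for this config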
def __init__(self, args, LayerNum=None): super().__init__() global tmp_file self.args = args if not hasattr(self.args, 'mixed_precision'): self.args.mixed_precision = False if not hasattr(self.args, 'plot_variance'): self.args.plot_variance = False if not hasattr(self.args, 'plot_gradient'): self.args.plot_gradient = False if not hasattr(self.args, 'plot_stability'): self.args.plot_stability = False self.normalize_before = args.encoder_normalize_before self.embed_dim = args.encoder_embed_dim self.layer_num = LayerNum # if LayerNum is not None and not self.normalize_before: if 'adaptive' in args.init_type: assert not self.normalize_before self.self_attn = MultiheadAttention(self.embed_dim, args.encoder_attention_heads, dropout=args.attention_dropout, self_attention=True) self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim) self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim) if 'adaptive-profiling' == args.init_type: if not tmp_file: tmp_file = open('profile.ratio.init', 'w') self.attention_ratio_change = nn.Parameter( torch.ones(self.embed_dim)) self.fc_ratio_change = nn.Parameter(torch.ones(self.embed_dim)) else: if not tmp_file: tmp_file = open('profile.ratio.init', 'r') layer_iter, next_value = [ float(tup) for tup in tmp_file.readline().split() ] print('layer_num: {}, layer_iter: {}'.format( self.layer_num, layer_iter)) assert layer_iter == 2 * self.layer_num + 1 print('encoder attn ratio: {}'.format(next_value)) self.attention_ratio_change = nn.Parameter( torch.ones(self.embed_dim)) self.attention_ratio_change.data.fill_(next_value) layer_iter, next_value = [ float(tup) for tup in tmp_file.readline().split() ] print('layer_num: {}, layer_iter: {}'.format( self.layer_num, layer_iter)) assert layer_iter == 2 * self.layer_num + 2 print('encoder ffn ratio: {}'.format(next_value)) self.fc_ratio_change = nn.Parameter(torch.ones(self.embed_dim)) self.fc_ratio_change.data.fill_(next_value) self.self_attn_layer_norm = LayerNorm(self.embed_dim) self.final_layer_norm = LayerNorm(self.embed_dim) else: self.self_attn = MultiheadAttention(self.embed_dim, args.encoder_attention_heads, dropout=args.attention_dropout, self_attention=True) self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim) self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim) if args.init_type == 'looklinear': self.fc1.weight.data[int(args.encoder_ffn_embed_dim / 2):, :] = -self.fc1.weight.data[ 0:int(args.encoder_ffn_embed_dim / 2), :] self.fc2.weight.data[:, int(args.encoder_ffn_embed_dim / 2):] = -self.fc2.weight.data[:, 0:int( args.encoder_ffn_embed_dim / 2)] if args.init_type != 'rezero': self.self_attn_layer_norm = LayerNorm(self.embed_dim) self.final_layer_norm = LayerNorm(self.embed_dim) else: self.self_attn_layer_norm = None self.final_layer_norm = None if 'rezero' in args.init_type: self.rezero_weight = nn.Parameter(torch.Tensor([0])) else: assert args.init_type == 'default' self.rezero_weight = None if self.args.plot_stability: self.x0_hat = None self.x1_hat = None if self.layer_num == self.args.encoder_layers - 1: self.x_final = None self.dropout = args.dropout self.activation_fn = utils.get_activation_fn( activation=getattr(args, 'activation_fn', 'relu')) self.activation_dropout = getattr(args, 'activation_dropout', 0) if self.activation_dropout == 0: self.activation_dropout = getattr(args, 'relu_dropout', 0) if args.fp16: self.in_type = torch.half else: self.in_type = torch.float
def __init__(self, args, embed_dim, block_num, block_id, stride, should_compress_query): super().__init__() self.quant_noise = getattr(args, 'quant_noise_pq', 0) self.quant_noise_block_size = getattr( args, 'quant_noise_pq_block_size', 8) or 8 # Funnel Args self.stride = stride self.embed_dim = embed_dim self.ffn_embed_dim = self.embed_dim * args.encoder_ffn_embed_factor self.block_id = block_id self.block_num = block_num self.should_compress_query = should_compress_query if self.should_compress_query: self.should_compress_feature = args.feature_compress if self.should_compress_feature: self.feature_compress_type = getattr( args, 'feature_compress_type', 'mean') if self.feature_compress_type == "mean": self.feature_compress_query = nn.AvgPool1d( stride, stride=stride, ceil_mode=True) elif self.feature_compress_type == "linear": self.feature_compress_query = nn.Linear( embed_dim * stride, embed_dim) elif self.feature_compress_type == "max": self.feature_compress_query = nn.MaxPool1d( stride, stride=stride, ceil_mode=True) elif self.feature_compress_type == "min": self.feature_compress_query = - \ nn.MaxPool1d(stride, stride=stride, ceil_mode=True) self.should_compress_time = args.time_compress if self.should_compress_time: self.time_compress_type = getattr( args, 'time_compress_type', 'mean') if self.time_compress_type == "mean": self.time_compress_query_fn = nn.AvgPool1d( stride, stride=stride, ceil_mode=True) # elif self.time_compress_type == "linear": # self.time_compress_query = nn.Linear( # embed_dim * stride, embed_dim) elif self.time_compress_type == "max": self.time_compress_query_fn = nn.MaxPool1d( stride, stride=stride, ceil_mode=True) elif self.time_compress_type == "min": self.time_compress_query_fn = - \ nn.MaxPool1d(stride, stride=stride, ceil_mode=True) self.kv_dim = embed_dim * ( self.stride if should_compress_query and self.should_compress_feature else 1) # self.pooling_size = getattr(args, 'pooling_size', True) self.separate_cls = getattr(args, 'separate_cls', False) self.self_attn = self.build_self_attention( self.embed_dim, self.kv_dim, args) self.self_attn_layer_norm = LayerNorm(self.embed_dim) self.dropout_module = FairseqDropout( args.dropout, module_name=self.__class__.__name__ ) self.activation_fn = utils.get_activation_fn( activation=getattr(args, 'activation_fn', 'relu') or "relu" ) activation_dropout_p = getattr(args, "activation_dropout", 0) or 0 if activation_dropout_p == 0: # for backwards compatibility with models that use args.relu_dropout activation_dropout_p = getattr(args, "relu_dropout", 0) or 0 self.activation_dropout_module = FairseqDropout( float(activation_dropout_p), module_name=self.__class__.__name__ ) self.normalize_before = args.encoder_normalize_before self.fc1 = self.build_fc1( self.embed_dim, self.ffn_embed_dim, self.quant_noise, self.quant_noise_block_size, ) self.fc2 = self.build_fc2( self.ffn_embed_dim, self.embed_dim, self.quant_noise, self.quant_noise_block_size, ) self.final_layer_norm = LayerNorm(self.embed_dim)
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt( embed_dim) self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.cross_self_attention = getattr(args, 'cross_self_attention', False) self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim else None if not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) if args.decoder_normalize_before and not getattr( args, 'no_decoder_final_norm', False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, 'layernorm_embedding', False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None
def __init__(
    self,
    padding_idx: int,
    vocab_size: int,
    num_encoder_layers: int = 6,
    embedding_dim: int = 768,
    ffn_embedding_dim: int = 3072,
    num_attention_heads: int = 8,
    dropout: float = 0.1,
    attention_dropout: float = 0.1,
    activation_dropout: float = 0.1,
    layerdrop: float = 0.0,
    max_seq_len: int = 256,
    num_segments: int = 2,
    use_position_embeddings: bool = True,
    offset_positions_by_padding: bool = True,
    encoder_normalize_before: bool = False,
    apply_bert_init: bool = False,
    activation_fn: str = "relu",
    learned_pos_embedding: bool = True,
    embed_scale: float = None,
    freeze_embeddings: bool = False,
    n_trans_layers_to_freeze: int = 0,
    export: bool = False,
    traceable: bool = False,
    q_noise: float = 0.0,
    qn_block_size: int = 8,
) -> None:
    super().__init__()
    self.padding_idx = padding_idx
    self.vocab_size = vocab_size
    self.dropout_module = FairseqDropout(
        dropout, module_name=self.__class__.__name__)
    self.layerdrop = layerdrop
    self.max_seq_len = max_seq_len
    self.embedding_dim = embedding_dim
    self.num_segments = num_segments
    self.use_position_embeddings = use_position_embeddings
    self.apply_bert_init = apply_bert_init
    self.learned_pos_embedding = learned_pos_embedding
    self.traceable = traceable

    self.embed_tokens = self.build_embedding(
        self.vocab_size, self.embedding_dim, self.padding_idx)
    self.embed_scale = embed_scale

    if q_noise > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(self.embedding_dim, self.embedding_dim, bias=False),
            q_noise,
            qn_block_size,
        )
    else:
        self.quant_noise = None

    self.segment_embeddings = (nn.Embedding(
        self.num_segments, self.embedding_dim, padding_idx=None)
        if self.num_segments > 0 else None)

    self.embed_positions = (PositionalEmbedding(
        self.max_seq_len,
        self.embedding_dim,
        padding_idx=(self.padding_idx if offset_positions_by_padding else None),
        learned=self.learned_pos_embedding,
    ) if self.use_position_embeddings else None)

    if encoder_normalize_before:
        self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export)
    else:
        self.emb_layer_norm = None

    if self.layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([
        self.build_transformer_sentence_encoder_layer(
            embedding_dim=self.embedding_dim,
            ffn_embedding_dim=ffn_embedding_dim,
            num_attention_heads=num_attention_heads,
            dropout=self.dropout_module.p,
            attention_dropout=attention_dropout,
            activation_dropout=activation_dropout,
            activation_fn=activation_fn,
            export=export,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
        ) for _ in range(num_encoder_layers)
    ])

    # Apply initialization of model params after building the model
    if self.apply_bert_init:
        self.apply(init_bert_params)

    def freeze_module_params(m):
        if m is not None:
            for p in m.parameters():
                p.requires_grad = False

    if freeze_embeddings:
        freeze_module_params(self.embed_tokens)
        freeze_module_params(self.segment_embeddings)
        freeze_module_params(self.embed_positions)
        freeze_module_params(self.emb_layer_norm)

    for layer in range(n_trans_layers_to_freeze):
        freeze_module_params(self.layers[layer])
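# Note (added): a minimal sketch of the LayerDrop behaviour behind the LayerDropModuleList
# used above (an assumed re-implementation, not fairseq's actual class): while training,
# each layer is skipped with probability p as the list is iterated (Fan et al., 2019).
import torch
import torch.nn as nn

class SimpleLayerDropList(nn.ModuleList):
    def __init__(self, p, modules=None):
        super().__init__(modules)
        self.p = p  # per-layer drop probability, only applied in training mode

    def __iter__(self):
        dropout_probs = torch.empty(len(self)).uniform_()
        for i, layer in enumerate(super().__iter__()):
            # keep the layer at eval time, or when it survives the coin flip
            if not self.training or dropout_probs[i] > self.p:
                yield layer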
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    self._future_mask = torch.empty(0)

    self.dropout = args.dropout
    self.decoder_layerdrop = args.decoder_layerdrop
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.embed_dim = embed_dim
    self.output_embed_dim = args.decoder_output_dim

    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

    self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                           if embed_dim != input_embed_dim else None)

    self.embed_positions = (PositionalEmbedding(
        args.max_target_positions,
        embed_dim,
        self.padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None)

    self.cross_self_attention = getattr(args, "cross_self_attention", False)
    self.layer_wise_attention = getattr(args, "layer_wise_attention", False)

    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.num_layers = len(self.layers)

    self.adaptive_softmax = None

    self.project_out_dim = (Linear(embed_dim, self.output_embed_dim, bias=False)
                            if embed_dim != self.output_embed_dim
                            and not args.tie_adaptive_weights else None)

    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), self.output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)

    if args.decoder_normalize_before and not getattr(args, "no_decoder_final_norm", False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None

    self.tgt_drop = args.tgt_drop
    self.drop_method = args.drop_method
    if self.drop_method == 'drop_tag':
        self.mask = dictionary.indices['<dropped>']
    elif self.drop_method == 'unk_tag':
        self.mask = dictionary.indices['<unk>']
def __init__(self, args, no_encoder_attn=False, add_bias_kv=False,
             add_zero_attn=False, layer_id=-1):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.cross_self_attention = getattr(args, 'cross_self_attention', False)

    # beg 20191115 multi-hop attention configuration in layer
    self.layer_id = layer_id
    self.self_attn_type = args.decoder_attn_type
    self.self_spec_layers = [
        int(i) for i in args.decoder_spec_attn_layers.split(',') if i != ''
    ]
    if self.self_attn_type == 'MHDA' and self.layer_id in self.self_spec_layers:
        self.self_attn = MultiHopDependentAttention(
            embed_dim=self.embed_dim,
            num_heads=args.decoder_attention_heads,
            dropout=args.attention_dropout,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            self_attention=True)
        print('Self Attention [@Decoder Layer-{}] is MHDA.'.format(self.layer_id))
    else:
        self.self_attn = MultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=args.decoder_attention_heads,
            dropout=args.attention_dropout,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            self_attention=not self.cross_self_attention,
        )
        print('Self Attention [@Decoder Layer-{}] is vanilla multi-head attention.'.format(
            self.layer_id))
    # end 20191115

    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, 'activation_fn', 'relu'))
    self.activation_dropout = getattr(args, 'activation_dropout', 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, 'relu_dropout', 0)
    self.normalize_before = args.decoder_normalize_before

    # use LayerNorm rather than FusedLayerNorm for exporting.
    # char_inputs can be used to determine this.
    # TODO remove this once we update apex with the fix
    export = getattr(args, 'char_inputs', False)
    self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        # beg 20191115 multi-hop attention configuration in layer
        self.encdec_attn_type = args.encdec_attn_type
        self.encdec_spec_layers = [
            int(i) for i in args.encdec_spec_attn_layers.split(',') if i != ''
        ]
        if self.encdec_attn_type == 'MHDA' and self.layer_id in self.encdec_spec_layers:
            self.encoder_attn = MultiHopDependentAttention(
                self.embed_dim,
                args.decoder_attention_heads,
                kdim=getattr(args, 'encoder_embed_dim', None),
                vdim=getattr(args, 'encoder_embed_dim', None),
                dropout=args.attention_dropout,
                encoder_decoder_attention=True,
            )
            print('Encoder-Decoder Attention [@Decoder Layer-{}] is MHDA.'.format(
                self.layer_id))
        else:
            self.encoder_attn = MultiheadAttention(
                self.embed_dim,
                args.decoder_attention_heads,
                kdim=getattr(args, 'encoder_embed_dim', None),
                vdim=getattr(args, 'encoder_embed_dim', None),
                dropout=args.attention_dropout,
                encoder_decoder_attention=True,
            )
            print('Encoder-Decoder Attention [@Decoder Layer-{}] is vanilla multi-head attention.'.format(
                self.layer_id))
        # end 20191115
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
    self.need_attn = True
    self.onnx_trace = False
def __init__(
    self,
    cfg: Wav2Vec2Seq2SeqConfig,
    dictionary,
    embed_tokens,
    no_encoder_attn=False,
):
    super().__init__(dictionary)

    self.dropout = cfg.decoder_dropout
    self.share_input_output_embed = cfg.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = cfg.decoder_embed_dim
    self.output_embed_dim = cfg.decoder_embed_dim

    self.layerdrop = cfg.decoder_layerdrop

    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = cfg.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

    self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                           if embed_dim != input_embed_dim else None)

    self.embed_positions = (PositionalEmbedding(
        cfg.max_target_positions,
        embed_dim,
        padding_idx,
        learned=cfg.decoder_learned_pos,
    ) if not cfg.no_token_positional_embeddings else None)

    # TODO: update this when transformer gets converted to dataclass configs
    transformer_cfg = copy.deepcopy(cfg)
    with open_dict(transformer_cfg):
        transformer_cfg.dropout = transformer_cfg.decoder_dropout
        transformer_cfg.attention_dropout = transformer_cfg.decoder_attention_dropout
        transformer_cfg.activation_dropout = transformer_cfg.decoder_activation_dropout

    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(transformer_cfg, no_encoder_attn)
        for _ in range(transformer_cfg.decoder_layers)
    ])

    if not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), self.output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)

    if transformer_cfg.decoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
def __init__(self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.dropout_module = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__)
    self.quant_noise = getattr(args, "quant_noise_pq", 0)
    self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8)

    self.cross_self_attention = getattr(args, "cross_self_attention", False)

    self.self_attn = self.build_self_attention(
        self.embed_dim,
        args,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )

    self.activation_fn = utils.get_activation_fn(
        activation=str(args.activation_fn)
        if getattr(args, "activation_fn", None) is not None else "relu")
    activation_dropout_p = getattr(args, "activation_dropout", 0) or 0
    if activation_dropout_p == 0:
        # for backwards compatibility with models that use args.relu_dropout
        activation_dropout_p = getattr(args, "relu_dropout", 0) or 0
    self.activation_dropout_module = FairseqDropout(
        float(activation_dropout_p), module_name=self.__class__.__name__)
    self.normalize_before = args.decoder_normalize_before

    # use LayerNorm rather than FusedLayerNorm for exporting.
    # char_inputs can be used to determine this.
    # TODO remove this once we update apex with the fix
    export = getattr(args, "char_inputs", False)
    self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = self.build_encoder_attention(self.embed_dim, args)
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

    self.fc1 = self.build_fc1(
        self.embed_dim,
        args.decoder_ffn_embed_dim,
        self.quant_noise,
        self.quant_noise_block_size,
    )
    self.fc2 = self.build_fc2(
        args.decoder_ffn_embed_dim,
        self.embed_dim,
        self.quant_noise,
        self.quant_noise_block_size,
    )

    self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
    self.need_attn = True
    self.onnx_trace = False
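# Note (added): a minimal sketch of what build_fc1/build_fc2 typically look like in
# fairseq-style layers (an assumption here, since the builders are defined outside this
# snippet): plain nn.Linear projections optionally wrapped with quantization noise.
import torch.nn as nn
from fairseq.modules.quant_noise import quant_noise

def build_fc1(input_dim, output_dim, q_noise, qn_block_size):
    # quant_noise() returns the module unchanged when q_noise == 0
    return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)

def build_fc2(input_dim, output_dim, q_noise, qn_block_size):
    return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)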
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    output_embed_dim = args.decoder_output_dim

    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

    self.project_in_dim = Linear(
        input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None

    self.embed_positions = PositionalEmbedding(
        args.max_target_positions,
        embed_dim,
        padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.layers = nn.ModuleList([])
    self.layers.extend([
        LightConvDecoderLayer(args, no_encoder_attn,
                              kernel_size=args.decoder_kernel_size_list[i])
        for i in range(args.decoder_layers)
    ])

    self.adaptive_softmax = None

    self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \
        if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)

    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
def __init__(
    self,
    embed_dim,
    ffn_embed_dim,
    nhead,
    encoder_embed_dim,
    dropout,
    attn_dropout,
    activation_dropout,
    normalize_before=True,
    activation_fn="relu",
    quant_noise=0,
    quant_noise_block_size=8,
    cross_self_attention=False,
    no_encoder_attn=False,
    add_bias_kv=False,
    add_zero_attn=False,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.dropout_module = FairseqDropout(
        dropout, module_name=self.__class__.__name__)
    self.quant_noise = quant_noise
    self.quant_noise_block_size = quant_noise_block_size

    self.cross_self_attention = cross_self_attention

    self.self_attn = self.build_self_attention(
        self.embed_dim,
        nhead,
        attn_dropout,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )

    self.activation_fn = utils.get_activation_fn(activation=activation_fn)
    activation_dropout_p = activation_dropout
    self.activation_dropout_module = FairseqDropout(
        float(activation_dropout_p), module_name=self.__class__.__name__)
    self.normalize_before = normalize_before

    export = False
    self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

    if no_encoder_attn:
        self.encodec_attn = None
        self.encodec_attn_layer_norm = None
    else:
        self.encodec_attn = self.build_encoder_attention(
            self.embed_dim, encoder_embed_dim, attn_dropout, nhead)
        self.encodec_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

    self.fc1 = self.build_fc1(
        self.embed_dim,
        ffn_embed_dim,
        self.quant_noise,
        self.quant_noise_block_size,
    )
    self.fc2 = self.build_fc2(
        ffn_embed_dim,
        self.embed_dim,
        self.quant_noise,
        self.quant_noise_block_size,
    )

    self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
    self.need_attn = True
    self.onnx_trace = False