def __init__(self, args):
    super().__init__()
    self.embed_dim = args['model']['encoder_embed_dim']
    if args['model']['multihead_attention_version'] == 'pytorch':
        from ncc.modules.attention.pytorch_multihead_attention import PytorchMultiheadAttention
        self.self_attn = PytorchMultiheadAttention(
            self.embed_dim,
            args['model']['encoder_attention_heads'],
            dropout=args['model']['attention_dropout'],
        )
    elif args['model']['multihead_attention_version'] == 'ncc':
        from ncc.modules.attention.ncc_multihead_attention import NccMultiheadAttention
        self.self_attn = NccMultiheadAttention(
            self.embed_dim,
            args['model']['encoder_attention_heads'],
            dropout=args['model']['attention_dropout'],
        )
    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    self.dropout = args['model']['dropout']
    self.activation_fn = get_activation(
        activation_string=args['model'].get('activation_fn', 'relu'))
    self.activation_dropout = args['model']['activation_dropout']
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = args['model']['relu_dropout']
    self.normalize_before = args['model']['encoder_normalize_before']
    self.fc1 = Linear(self.embed_dim, args['model']['encoder_ffn_embed_dim'])
    self.fc2 = Linear(args['model']['encoder_ffn_embed_dim'], self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    max_positions=None,
    dropout=0.0,
):
    super(MultiheadAttention, self).__init__()
    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
    self.num_heads = num_heads
    self.dropout = dropout
    self.head_dim = embed_dim // num_heads
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5
    # lower-triangular (causal) mask; note that max_positions must be an int here,
    # despite its None default
    self.bias = nn.Parameter(
        torch.tril(torch.ones(max_positions, max_positions)).view(
            1, 1, max_positions, max_positions))
    self.k_proj = Linear(self.kdim, embed_dim)
    self.v_proj = Linear(self.vdim, embed_dim)
    self.q_proj = Linear(embed_dim, embed_dim)
    self.out_proj = Linear(embed_dim, embed_dim)
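# Minimal, self-contained sketch (not taken from the module above) of how a
# lower-triangular buffer like `self.bias` is typically used: positions j > i are
# filled with -inf before the softmax so each token attends only to itself and to
# earlier tokens. Tensor names and shapes here are illustrative assumptions.
import torch

def apply_causal_mask_sketch(attn_scores, tril_bias):
    # attn_scores: [bsz, num_heads, tgt_len, src_len]
    # tril_bias:   [1, 1, max_positions, max_positions], ones on/below the diagonal
    tgt_len, src_len = attn_scores.shape[-2:]
    mask = tril_bias[:, :, :tgt_len, :src_len]
    return attn_scores.masked_fill(mask == 0, float('-inf'))

max_positions = 8
tril = torch.tril(torch.ones(max_positions, max_positions)).view(1, 1, max_positions, max_positions)
probs = torch.softmax(apply_causal_mask_sketch(torch.randn(2, 4, 5, 5), tril), dim=-1)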
def __init__(self, input_embed_dim, source_embed_dim, output_embed_dim, bias=False):
    super().__init__()
    self.input_proj = Linear(input_embed_dim, source_embed_dim, bias=bias)
    self.output_proj = Linear(input_embed_dim + source_embed_dim, output_embed_dim, bias=bias)
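# Hedged sketch of how a Luong-style attention layer with these two projections is
# commonly applied (this forward is an assumption, not taken from the module above):
# project the decoder state into the source space, score it against the encoder
# states, then combine the context with the original input through output_proj.
import torch
import torch.nn as nn
import torch.nn.functional as F

class AttentionLayerSketch(nn.Module):
    def __init__(self, input_embed_dim, source_embed_dim, output_embed_dim, bias=False):
        super().__init__()
        self.input_proj = nn.Linear(input_embed_dim, source_embed_dim, bias=bias)
        self.output_proj = nn.Linear(input_embed_dim + source_embed_dim, output_embed_dim, bias=bias)

    def forward(self, input, source_hids):
        # input: [bsz, input_embed_dim]; source_hids: [srclen, bsz, source_embed_dim]
        x = self.input_proj(input)                                      # [bsz, source_embed_dim]
        attn_scores = F.softmax((source_hids * x.unsqueeze(0)).sum(dim=2), dim=0)  # [srclen, bsz]
        context = (attn_scores.unsqueeze(2) * source_hids).sum(dim=0)   # [bsz, source_embed_dim]
        return torch.tanh(self.output_proj(torch.cat((context, input), dim=1))), attn_scores

# usage: AttentionLayerSketch(512, 512, 512)(torch.randn(4, 512), torch.randn(7, 4, 512))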
def __init__(
    self,
    dictionary,
    src_modalities=['code'],
    embed_dim=512,
    hidden_size=512,
    out_embed_dim=512,
    num_layers=1,
    dropout_in=0.1,
    dropout_out=0.1,
    attention=True,
    encoder_output_units=512,
    pretrained_embed=None,
    share_input_output_embed=False,
    adaptive_softmax_cutoff=None,
    max_target_positions=DEFAULT_MAX_TARGET_POSITIONS,
):
    super().__init__(dictionary)
    self.src_modalities = src_modalities
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.need_attn = True
    self.max_target_positions = max_target_positions
    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed
    self.encoder_output_units = encoder_output_units
    if encoder_output_units != hidden_size and encoder_output_units != 0:
        self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size)
        self.encoder_cell_proj = Linear(encoder_output_units, hidden_size)
    else:
        self.encoder_hidden_proj = self.encoder_cell_proj = None
    # disable input feeding if there is no encoder
    # input feeding is described in arxiv.org/abs/1508.04025
    input_feed_size = 0 if encoder_output_units == 0 else hidden_size
    self.layers = nn.ModuleList([
        LSTMCell(
            input_size=input_feed_size + embed_dim if layer == 0 else hidden_size,
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])
    if attention:
        # TODO make bias configurable
        # self.attention = AttentionLayer(hidden_size, encoder_output_units, hidden_size, bias=False)
        self.attention = None
    else:
        self.attention = None
    if hidden_size != out_embed_dim:
        self.additional_fc = Linear(hidden_size, out_embed_dim)
    # if adaptive_softmax_cutoff is not None:
    #     # setting adaptive_softmax dropout to dropout_out for now but can be redefined
    #     self.adaptive_softmax = AdaptiveSoftmax(num_embeddings, hidden_size, adaptive_softmax_cutoff,
    #                                             dropout=dropout_out)
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
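# Hedged sketch of the input-feeding step the comment above refers to
# (Luong et al., arxiv.org/abs/1508.04025): at every target position the previous
# attentional output is concatenated with the token embedding before the first
# LSTMCell. Names and sizes below are assumptions for illustration only.
import torch
import torch.nn as nn

bsz, embed_dim, hidden_size = 4, 512, 512
cell = nn.LSTMCell(embed_dim + hidden_size, hidden_size)   # layer 0: input_feed_size + embed_dim
token_embedding = torch.randn(bsz, embed_dim)
input_feed = torch.zeros(bsz, hidden_size)                 # previous attentional output
hidden = (torch.zeros(bsz, hidden_size), torch.zeros(bsz, hidden_size))
hidden = cell(torch.cat([token_embedding, input_feed], dim=1), hidden)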
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
    self.num_heads = num_heads
    self.dropout = dropout
    self.head_dim = embed_dim // num_heads
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5
    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention
    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and "
        "value to be of the same size"
    )
    self.k_proj = Linear(
        self.kdim, embed_dim, bias=bias,
        weight_initializer=trunc_normal(mean=.0, std=.02),
    )
    self.v_proj = Linear(
        self.vdim, embed_dim, bias=bias,
        weight_initializer=trunc_normal(mean=.0, std=.02),
    )
    self.q_proj = Linear(
        embed_dim, embed_dim, bias=bias,
        weight_initializer=trunc_normal(mean=.0, std=.02),
    )
    self.out_proj = Linear(
        embed_dim, embed_dim, bias=bias,
        weight_initializer=trunc_normal(mean=.0, std=.02),
    )
    self.add_zero_attn = add_zero_attn
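# Illustrative sketch (an assumption, not the module's actual forward) of the standard
# computation these projections feed: split embed_dim into num_heads heads of size
# head_dim, scale the queries by head_dim ** -0.5, and softmax the scaled dot products.
import torch

def multihead_attn_weights_sketch(q, k, num_heads):
    # q: [bsz, tgt_len, embed_dim]; k: [bsz, src_len, embed_dim]
    bsz, tgt_len, embed_dim = q.shape
    head_dim = embed_dim // num_heads
    q = (q * head_dim ** -0.5).view(bsz, tgt_len, num_heads, head_dim).transpose(1, 2)
    k = k.view(bsz, -1, num_heads, head_dim).transpose(1, 2)
    return torch.softmax(q @ k.transpose(-2, -1), dim=-1)  # [bsz, num_heads, tgt_len, src_len]

# e.g. multihead_attn_weights_sketch(torch.randn(2, 5, 512), torch.randn(2, 7, 512), num_heads=8)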
def __init__(self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False):
    super().__init__()
    self.embed_dim = args['model']['decoder_embed_dim']
    self.cross_self_attention = args['model']['cross_self_attention']
    self.self_attn = NccMultiheadAttention(
        embed_dim=self.embed_dim,
        num_heads=args['model']['decoder_attention_heads'],
        dropout=args['model']['attention_dropout'],
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
        self_attention=not self.cross_self_attention,
        # maximum_relative_position=args['model']['decoder_max_relative_len'],
    )
    self.dropout = args['model']['dropout']
    self.activation_fn = get_activation(args['model']['activation_fn'])
    self.activation_dropout = args['model']['activation_dropout']
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = args['model']['relu_dropout']
    self.normalize_before = args['model']['decoder_normalize_before']
    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = NccMultiheadAttention(
            self.embed_dim,
            args['model']['decoder_attention_heads'],
            kdim=args['model']['encoder_embed_dim'],
            vdim=args['model']['encoder_embed_dim'],
            dropout=args['model']['attention_dropout'],
            encoder_decoder_attention=True,
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
    self.fc1 = Linear(self.embed_dim, args['model']['decoder_ffn_embed_dim'])
    self.fc2 = Linear(args['model']['decoder_ffn_embed_dim'], self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
    self.need_attn = True
def __init__(
    self,
    dictionary,
    embed_dim=512,
    hidden_size=512,
    num_layers=1,
    bidirectional=False,
    dropout=0.5,
    pretrained_embed=None,
    shared_embedding=False,
):
    super(LSTMDecoder, self).__init__(dictionary)
    if pretrained_embed is None:
        self.embed_tokens = Embedding(len(dictionary), embed_dim, padding_idx=dictionary.pad())
    else:
        self.embed_tokens = pretrained_embed
    self.rnn = LSTM(
        embed_dim,
        hidden_size,
        num_layers=num_layers,
        dropout=dropout,
        batch_first=True,
        bidirectional=False,  # in the prediction task, bidirectional cannot be set to True
    )
    # self.dropout = dropout
    # self.bidirectional = bidirectional
    # if bidirectional:
    #     self.proj = Linear(hidden_size * 2, hidden_size)
    self.fc_out = Linear(hidden_size, len(dictionary))
    if shared_embedding:
        self.fc_out.weight = self.embed_tokens.weight
def __init__(self, embed_dim, attention_heads, dropout, ffn_embed_dim, activation_fn):
    super().__init__()
    self.dropout = dropout
    self.self_attn = MultiheadAttention(
        embed_dim=embed_dim,
        num_heads=attention_heads,
        dropout=dropout,
    )
    self.self_attn_layer_norm = LayerNorm(embed_dim)
    self.fc1 = Linear(
        embed_dim, ffn_embed_dim,
        weight_initializer=trunc_normal(mean=.0, std=.02),
    )
    self.fc2 = Linear(
        ffn_embed_dim, embed_dim,
        weight_initializer=trunc_normal(mean=.0, std=.02),
    )
    self.ff_layer_norm = LayerNorm(embed_dim)
    self.activation_fn = get_activation(activation_fn)
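# Hedged sketch of the usual post-norm flow these sub-modules imply (the real forward
# may apply pre-norm or extra dropout): residual + LayerNorm around self-attention,
# then residual + LayerNorm around the fc1 -> activation -> fc2 feed-forward block.
# The modules below are plain torch stand-ins chosen for a runnable example.
import torch
import torch.nn as nn

embed_dim, ffn_dim, seq_len, bsz = 64, 256, 5, 2
x = torch.randn(seq_len, bsz, embed_dim)
attn = nn.MultiheadAttention(embed_dim, num_heads=4)
ln1, ln2 = nn.LayerNorm(embed_dim), nn.LayerNorm(embed_dim)
fc1, fc2 = nn.Linear(embed_dim, ffn_dim), nn.Linear(ffn_dim, embed_dim)

residual = x
x, _ = attn(x, x, x)
x = ln1(residual + x)
residual = x
x = ln2(residual + fc2(torch.relu(fc1(x))))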
def __init__(self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False):
    super().__init__()
    self.embed_dim = args['model']['decoder_embed_dim']
    self.cross_self_attention = args['model']['cross_self_attention']
    self.self_attn = RelativeMultiheadAttention(
        embed_dim=self.embed_dim,
        num_heads=args['model']['decoder_attention_heads'],
        dropout=args['model']['attention_dropout'],
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
        self_attention=not self.cross_self_attention,
        maximum_relative_position=args['model']['decoder_max_relative_len'],
    )
    self.dropout = args['model']['dropout']
    self.activation_fn = get_activation(args['model']['activation_fn'])
    self.activation_dropout = args['model']['activation_dropout']
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = args['model']['relu_dropout']
    # use LayerNorm rather than FusedLayerNorm for exporting;
    # char_inputs can be used to determine this.
    # TODO: remove this once we update apex with the fix
    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = RelativeMultiheadAttention(
            self.embed_dim,
            args['model']['decoder_attention_heads'],
            kdim=args['model']['encoder_embed_dim'],
            vdim=args['model']['encoder_embed_dim'],
            dropout=args['model']['attention_dropout'],
            encoder_decoder_attention=True,
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
    self.fc1 = Linear(self.embed_dim, args['model']['decoder_ffn_embed_dim'])
    self.fc2 = Linear(args['model']['decoder_ffn_embed_dim'], self.embed_dim)
    self.ff_layer_norm = LayerNorm(self.embed_dim)
    self.need_attn = True
def __init__(self,
             dictionary,
             embed_dim=400,
             pos_len=100,
             pos_dim=50,
             hidden_size=400,
             out_embed_dim=400,
             num_layers=1,
             dropout_in=0.5,
             dropout_out=0.5,
             encoder_output_units=400,
             pretrained_embed=None,
             share_input_output_embed=False,
             max_target_positions=DEFAULT_MAX_TARGET_POSITIONS):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.max_target_positions = max_target_positions
    num_embeddings = len(dictionary)
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx=dictionary.pad())
    else:
        self.embed_tokens = pretrained_embed
    self.pos_len = pos_len + 1
    self.pos_dim = pos_dim
    self.pos_embed = Embedding(self.pos_len, pos_dim)
    # disable input feeding if there is no encoder
    # input feeding is described in arxiv.org/abs/1508.04025
    # self.layers = nn.ModuleList([
    #     LSTMCell(
    #         # input_size=encoder_output_units + pos_dim if layer == 0 else hidden_size,
    #         input_size=encoder_output_units if layer == 0 else hidden_size,
    #         hidden_size=hidden_size,
    #     )
    #     for layer in range(num_layers)
    # ])
    self.layers = nn.ModuleList([
        LSTM(
            in_dim=encoder_output_units + pos_dim if layer == 0 else hidden_size,
            # in_dim=encoder_output_units if layer == 0 else hidden_size,
            out_dim=hidden_size,
        )
        for layer in range(num_layers)
    ])
    # W_H(h) + W_T(t) => fc_out
    self.W_H = nn.Linear(self.hidden_size, self.hidden_size)
    self.W_T = nn.Linear(self.hidden_size, self.hidden_size)
    if not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings)
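# Minimal sketch of the combination named in the "W_H(h) + W_T(t) => fc_out" comment
# above; how h (the decoder hidden state) and t (the attended/target-side feature)
# are produced is an assumption here, only the projection pattern is illustrated.
import torch
import torch.nn as nn

bsz, hidden_size, vocab_size = 4, 400, 1000
W_H, W_T = nn.Linear(hidden_size, hidden_size), nn.Linear(hidden_size, hidden_size)
fc_out = nn.Linear(hidden_size, vocab_size)
h, t = torch.randn(bsz, hidden_size), torch.randn(bsz, hidden_size)
logits = fc_out(W_H(h) + W_T(t))  # [bsz, vocab_size]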
def __init__(self, dictionary, embed_dim, out_channels, kernel_size, **kwargs):
    super().__init__(dictionary)
    # word embedding + positional embedding
    self.embed = Embedding(len(dictionary), embed_dim)  # , padding_idx=self.dictionary.pad())
    self.position_encoding = kwargs.get('position_encoding', None)
    if self.position_encoding == 'learned':
        self.position_embed = Parameter(1, kwargs['max_tokens'], embed_dim,
                                        initializer=trunc_normal(mean=0., std=0.02))
    else:
        self.position_embed = None
    # pooling
    pooling = kwargs.get('pooling', None)
    self.pooling = pooling1d(pooling)
    if pooling and 'weighted' in pooling:  # guard against pooling=None
        self.weight_layer = Linear(embed_dim, 1, bias=False)
    else:
        self.weight_layer = None
    # conv1d
    self.out_channels = out_channels
    self.kernel_size = kernel_size
    # padding mode = ['valid'(default), 'same']
    self.padding = kwargs.get('padding', 'valid')
    if self.padding == 'same':
        self.padding_size = []
        for kernel_sz in self.kernel_size:
            padding_right = (kernel_sz - 1) // 2
            padding_left = kernel_sz - 1 - padding_right
            self.padding_size.append((0, 0, padding_left, padding_right,))
    self.conv_layers = nn.ModuleList([])
    # input: [bsz, 1, seq_len, embed_dim]
    # filters = 1 -> embed_dim
    # kernel_size = (kernel_width, embed_dim)
    # => output: [bsz, embed_dim, seq_len - kernel_width + 1]
    for idx, kernel_sz in enumerate(self.kernel_size):
        self.conv_layers.append(
            Conv2d(in_channels=1, out_channels=embed_dim, kernel_size=(kernel_sz, embed_dim)))
    self.residual = kwargs.get('residual', False)  # residual
    self.dropout = kwargs.get('dropout', None)
    activation_fn = kwargs.get('activation_fn', None)
    self.activation_fn = get_activation(activation_fn) if activation_fn else None
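# Quick shape check (illustrative only) for the Conv2d comment above: a kernel of
# (kernel_width, embed_dim) applied to [bsz, 1, seq_len, embed_dim] collapses the
# embedding axis, yielding [bsz, out_channels, seq_len - kernel_width + 1, 1],
# which squeezes to [bsz, out_channels, seq_len - kernel_width + 1].
import torch
import torch.nn as nn

bsz, seq_len, embed_dim, kernel_width = 2, 10, 64, 3
conv = nn.Conv2d(in_channels=1, out_channels=embed_dim, kernel_size=(kernel_width, embed_dim))
x = torch.randn(bsz, 1, seq_len, embed_dim)
y = conv(x).squeeze(-1)  # [bsz, embed_dim, seq_len - kernel_width + 1]
assert y.shape == (bsz, embed_dim, seq_len - kernel_width + 1)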
def __init__(self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False):
    super().__init__()
    self.embed_dim = args['model']['decoder_embed_dim']
    self.dropout = args['model']['dropout']
    self.cross_self_attention = args['model'].get('cross_self_attention', False)
    self.self_attn = self.build_self_attention(
        self.embed_dim,
        args,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )
    self.activation_fn = get_activation(args['model'].get('activation_fn', 'relu'))
    self.activation_dropout = args['model'].get('activation_dropout', 0.)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = args['model'].get('relu_dropout', 0.)
    self.normalize_before = args['model']['decoder_normalize_before']
    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = self.build_encoder_attention(self.embed_dim, args)
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
    self.fc1 = Linear(self.embed_dim, args['model']['decoder_ffn_embed_dim'])
    self.fc2 = Linear(args['model']['decoder_ffn_embed_dim'], self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
    self.need_attn = True
def __init__(
    self,
    dictionary,
    embed_dim,
    # rnn config
    rnn_cell,
    rnn_hidden_dim,
    rnn_dropout=None,
    rnn_num_layers=2,
    rnn_bidirectional=False,
    # auxiliary input
    aux_dim=2,
    inner_dim=32,
    out_dim=2,
):
    super(DeepTuneEncoder, self).__init__(dictionary)
    self.embed = Embedding(len(dictionary), embed_dim)
    # LSTM (note: rnn_dropout must be a float; nn.LSTM/nn.GRU reject None)
    self.rnn_dropout = rnn_dropout
    self.rnn = getattr(nn, str.upper(rnn_cell))(
        embed_dim,
        rnn_hidden_dim,
        num_layers=rnn_num_layers,
        dropout=self.rnn_dropout,  # rnn inner dropout between layers
        bidirectional=rnn_bidirectional,
        batch_first=True,
    )
    self.src_out_proj = nn.Sequential(
        Linear(rnn_hidden_dim, out_dim),
        nn.Sigmoid(),
    )
    # auxiliary inputs: wgsize and dsize
    self.bn = BatchNorm1d(rnn_hidden_dim + aux_dim)
    self.hybrid_out_proj = nn.Sequential(
        Linear(rnn_hidden_dim + aux_dim, inner_dim),
        nn.ReLU(),
        Linear(inner_dim, out_dim),
        nn.Sigmoid(),
    )
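# Hedged sketch (an assumption, not the encoder's actual forward) of how the pieces
# above combine in DeepTune-style models: the RNN's last hidden state drives
# src_out_proj on its own, and is concatenated with the auxiliary features
# (wgsize, dsize), batch-normalized, then fed to hybrid_out_proj.
import torch
import torch.nn as nn

bsz, rnn_hidden_dim, aux_dim = 8, 64, 2
rnn_state = torch.randn(bsz, rnn_hidden_dim)        # final RNN hidden state
aux = torch.randn(bsz, aux_dim)                     # auxiliary inputs
bn = nn.BatchNorm1d(rnn_hidden_dim + aux_dim)
hybrid_in = bn(torch.cat([rnn_state, aux], dim=1))  # [bsz, rnn_hidden_dim + aux_dim]
# hybrid_in then goes through Linear -> ReLU -> Linear -> Sigmoid (hybrid_out_proj above)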
def __init__(self, args):
    super().__init__()
    self.embed_dim = args['model']['encoder_embed_dim']
    self.self_attn = RelativeMultiheadAttention(
        self.embed_dim,
        args['model']['encoder_attention_heads'],
        dropout=args['model']['attention_dropout'],
        self_attention=True,
        maximum_relative_position=args['model']['encoder_max_relative_len'],
    )
    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    self.dropout = args['model']['dropout']
    self.activation_fn = get_activation(args['model']['activation_fn'])
    self.activation_dropout = args['model']['activation_dropout']
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = args['model']['relu_dropout']
    self.fc1 = Linear(self.embed_dim, args['model']['encoder_ffn_embed_dim'])
    self.fc2 = Linear(args['model']['encoder_ffn_embed_dim'], self.embed_dim)
    self.ff_layer_norm = LayerNorm(self.embed_dim)
def __init__(
    self,
    dictionary,
    embed_dim,
    token_types,
    max_positions,
    self_attn_layers,
    attention_heads,
    ffn_embed_dim,
    activation_fn,
    dropout,
    **kwargs,
):
    super(SelfAttnEncoder, self).__init__(dictionary)
    # word embedding
    self.embed = Embedding(
        len(dictionary),
        embed_dim,
        padding_idx=self.dictionary.pad(),
        initializer=trunc_normal(mean=.0, std=.02),
    )
    # type embedding
    if token_types is not None:
        self.type_embed = Embedding(
            token_types,
            embed_dim,
            initializer=trunc_normal(mean=.0, std=.02),
        )
    else:
        self.type_embed = None
    # positional embedding
    if max_positions is not None:
        self.positional_embed = Parameter(
            1,
            max_positions,
            embed_dim,
            initializer=trunc_normal(mean=.0, std=.02),
        )
    else:
        self.positional_embed = None
    # layer norm for embedding
    self.embed_layer_norm = LayerNorm(embed_dim)
    self.dropout = dropout
    # self-attention stack
    self.num_layers = self_attn_layers
    self.layers = nn.ModuleList([
        TransformerEncoderLayer(embed_dim, attention_heads, dropout, ffn_embed_dim, activation_fn)
        for _ in range(self_attn_layers)
    ])
    # pooling
    pooling = kwargs.get('pooling', None)
    self.pooling = pooling1d(pooling)
    if pooling and 'weighted' in pooling:  # guard against pooling=None
        self.weight_layer = Linear(embed_dim, 1, bias=False, weight_initializer=xavier_uniform())
    else:
        self.weight_layer = None
def __init__(
    self,
    dictionary,
    embed_dim,
    pooling='weighted_mean',
    dropout=0.1,
    **kwargs,
):
    super().__init__(dictionary)
    self.padding_idx = self.dictionary.pad()
    self.embed = Embedding(len(dictionary), embed_dim,
                           padding_idx=self.padding_idx,
                           initializer=xavier_uniform())
    self.dropout = dropout
    self.pooling = pooling1d(pooling)
    if self.pooling:
        self.weight_layer = (
            Linear(embed_dim, 1, bias=False, weight_initializer=xavier_uniform())
            if 'weighted' in pooling else None
        )
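# Illustrative sketch (assumed, not the encoder's actual forward) of 'weighted_mean'
# pooling with an [embed_dim -> 1] weight layer: score each position, softmax over
# the sequence dimension, then take the weighted average of the token embeddings.
import torch
import torch.nn as nn

bsz, seq_len, embed_dim = 2, 6, 32
tokens = torch.randn(bsz, seq_len, embed_dim)
weight_layer = nn.Linear(embed_dim, 1, bias=False)
weights = torch.softmax(weight_layer(tokens), dim=1)  # [bsz, seq_len, 1]
pooled = (weights * tokens).sum(dim=1)                # [bsz, embed_dim]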
def __init__(
    self,
    dictionary,
    embed_dim,
    embed_out,
    dropout,
    edge_types,
    # scoring/transform MLPs
    out_dropout,
    dim_inner,
    dim_out,
):
    super(PoemEncoder, self).__init__(dictionary)
    # embedding block
    if dictionary is not None:
        self.embed = Embedding(len(dictionary), embed_dim)
    else:
        self.embed = None
    self.embed_modules = nn.Sequential(
        Linear(embed_dim, embed_out, bias=False),
        nn.ReLU(),
        nn.Dropout(dropout),
    )
    # MLP-GNN
    self.gnn_modules = GNNEncoder(edge_types, dim_in=embed_out, dim_inner=dim_out,
                                  dim_out=embed_out, dropout=dropout)

    # scoring MLP
    def get_mlp():
        return nn.Sequential(
            nn.Dropout(out_dropout),
            nn.Linear(embed_dim + embed_out, dim_inner, bias=False),
            nn.ReLU(),
            nn.Linear(dim_inner, dim_out, bias=False),
            nn.ReLU(),
        )

    self.score_mlp = get_mlp()
    self.transform_mlp = get_mlp()
    self.out_linear = nn.Sequential(
        nn.Linear(dim_out, 2),
        nn.Sigmoid(),
    )
def __init__(
    self,
    dictionary,
    embed_dim,
    dropout,
    # rnn config
    rnn_cell,
    rnn_hidden_dim,
    rnn_dropout,
    rnn_num_layers=1,
    rnn_bidirectional=False,
    **kwargs,
):
    super().__init__(dictionary)
    # word embedding + positional embedding
    self.embed = Embedding(len(dictionary), embed_dim, initializer=xavier_uniform())
    self.dropout = dropout
    # pooling
    pooling = kwargs.get('pooling', None)
    self.pooling = pooling1d(pooling)
    if pooling and 'weighted' in pooling:  # guard against pooling=None
        self.weight_layer = Linear(embed_dim, 1, bias=False, weight_initializer=xavier_uniform())
    else:
        self.weight_layer = None
    # rnn
    self.rnn_dropout = rnn_dropout
    self.rnn_num_layers = rnn_num_layers
    self.rnn_bidirectional = rnn_bidirectional
    self.rnn = getattr(nn, str.upper(rnn_cell))(
        embed_dim,
        rnn_hidden_dim,
        num_layers=rnn_num_layers,
        dropout=self.rnn_dropout,  # rnn inner dropout between layers
        bidirectional=rnn_bidirectional,
        batch_first=True,
    )
def __init__(self,
             dictionary,
             embed_dim=512,
             hidden_size=512,
             out_embed_dim=512,
             num_layers=1,
             dropout_in=0.1,
             dropout_out=0.1,
             attention=True,
             encoder_output_units=512,
             pretrained_embed=None,
             share_input_output_embed=False,
             adaptive_softmax_cutoff=None,
             max_target_positions=DEFAULT_MAX_TARGET_POSITIONS):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.need_attn = True
    self.max_target_positions = max_target_positions
    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed
    self.encoder_output_units = encoder_output_units
    self.lstm = LSTM(hidden_size, hidden_size, dropout=dropout_in, batch_first=True)
    self.fc_out = Linear(out_embed_dim, num_embeddings, bias=False)
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    self._future_mask = torch.empty(0)
    self.dropout = args['model']['dropout']
    self.decoder_layerdrop = args['model']['decoder_layerdrop']
    self.share_input_output_embed = args['model']['share_decoder_input_output_embed']
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args['model']['decoder_embed_dim']
    self.embed_dim = embed_dim
    self.output_embed_dim = args['model']['decoder_output_dim']
    self.padding_idx = dictionary.pad()  # embed_tokens.padding_idx TODO
    self.max_target_positions = args['task']['max_target_positions']
    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args['model']['no_scale_embedding'] else math.sqrt(embed_dim)
    self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                           if embed_dim != input_embed_dim else None)
    offset_positions_by_padding = args['model'].get('offset_positions_by_padding', True)
    if args['model']['decoder_positional_embeddings']:
        self.embed_positions = None
    else:
        # Option 1
        if args['model']['decoder_position_encoding_version'] == 'ncc_sinusoidal':
            self.embed_positions = SinusoidalPositionalEmbedding(
                self.embed_dim,
                padding_idx=self.padding_idx if offset_positions_by_padding else None,
                init_size=args['model']['max_target_positions'] + self.padding_idx + 1
                if offset_positions_by_padding else args['model']['max_target_positions'],
            )
        # Option 2
        elif args['model']['decoder_position_encoding_version'] == 'ncc_learned':
            num_embeddings = args['model']['max_target_positions']
            if offset_positions_by_padding:
                num_embeddings += self.padding_idx + 1
            m = LearnedPositionalEmbedding(
                num_embeddings,
                self.embed_dim,
                padding_idx=self.padding_idx if offset_positions_by_padding else None)
            nn.init.normal_(m.weight, mean=0, std=self.embed_dim ** -0.5)
            if self.padding_idx is not None:
                nn.init.constant_(m.weight[self.padding_idx], 0)
            self.embed_positions = m
    self.cross_self_attention = args['model']['cross_self_attention']
    self.layer_wise_attention = args['model']['layer_wise_attention']
    self.layers = nn.ModuleList([
        NccTransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args['model']['decoder_layers'])
    ])
    self.num_layers = len(self.layers)
    self.project_out_dim = (
        Linear(embed_dim, self.output_embed_dim, bias=False)
        if embed_dim != self.output_embed_dim and not args['model']['tie_adaptive_weights']
        else None)
    self.out_generator = Linear(
        embed_dim,
        len(dictionary),
        bias=args['model']['decoder_out_embed_bias'])
    if self.share_input_output_embed:
        self.out_generator.weight = self.embed_tokens.weight
    if args['model']['decoder_normalize_before'] and not args['model']['no_decoder_final_norm']:
        self.layer_norm = nn.LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    if args['model']['layernorm_embedding']:
        self.layernorm_embedding = nn.LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
def get_mlp():
    return nn.Sequential(
        Linear(dim_in, dim_inner, bias=False),
        nn.ReLU(),
        Linear(dim_inner, dim_out, bias=False),
    )
def __init__(self, input_size: int, hidden_size: int) -> None:
    super(NaryTreeLSTMCell, self).__init__()
    self.W_iou = Linear(input_size, 3 * hidden_size, bias=False)
    self.U_iou = Linear(2 * hidden_size, 3 * hidden_size, bias=False)
    self.b_iou = nn.Parameter(torch.zeros(1, 3 * hidden_size))
    self.U_f = Linear(2 * hidden_size, 2 * hidden_size)
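# Hedged sketch of the binary N-ary Tree-LSTM update (Tai et al., 2015) that the
# parameter shapes above suggest; the cell's real forward may differ, and the tiny
# cell below uses plain nn.Linear as a stand-in for the project's Linear helper.
import torch
import torch.nn as nn

class _BinaryTreeLSTMCellSketch(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.W_iou = nn.Linear(input_size, 3 * hidden_size, bias=False)
        self.U_iou = nn.Linear(2 * hidden_size, 3 * hidden_size, bias=False)
        self.b_iou = nn.Parameter(torch.zeros(1, 3 * hidden_size))
        self.U_f = nn.Linear(2 * hidden_size, 2 * hidden_size)

    def node_forward(self, x, h_children, c_children):
        # x: [bsz, input_size]; h_children, c_children: [bsz, 2, hidden_size]
        bsz, _, hidden_size = h_children.shape
        h_cat = h_children.reshape(bsz, 2 * hidden_size)
        iou = self.W_iou(x) + self.U_iou(h_cat) + self.b_iou
        i, o, u = torch.chunk(iou, 3, dim=1)
        i, o, u = torch.sigmoid(i), torch.sigmoid(o), torch.tanh(u)
        f = torch.sigmoid(self.U_f(h_cat)).view(bsz, 2, hidden_size)  # per-child forget gates
        c = i * u + (f * c_children).sum(dim=1)
        h = o * torch.tanh(c)
        return h, c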
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    self.args = args
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    self._future_mask = torch.empty(0)
    self.dropout = args['model']['dropout']
    self.decoder_layerdrop = args['model']['decoder_layerdrop']
    self.share_input_output_embed = args['model']['share_decoder_input_output_embed']
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args['model']['decoder_embed_dim']
    self.embed_dim = embed_dim
    self.output_embed_dim = args['model']['decoder_output_dim']
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args['model']['max_target_positions']
    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args['model']['no_scale_embedding'] else math.sqrt(embed_dim)
    self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                           if embed_dim != input_embed_dim else None)
    self.embed_positions = (PositionalEmbedding(
        args['model']['max_target_positions'],
        embed_dim,
        self.padding_idx,
        learned=args['model']['decoder_learned_pos'],
    ) if not args['model']['no_token_positional_embeddings'] else None)
    if args['model']['layernorm_embedding']:
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
    self.cross_self_attention = args['model'].get('cross_self_attention', False)
    if self.decoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([
        self.build_decoder_layer(args, no_encoder_attn)
        for _ in range(args['model']['decoder_layers'])
    ])
    self.num_layers = len(self.layers)
    # args['model'] is a dict, so use .get() rather than getattr() here
    if args['model']['decoder_normalize_before'] and not args['model'].get(
            "no_decoder_final_norm", False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    self.project_out_dim = (Linear(embed_dim, self.output_embed_dim, bias=False)
                            if embed_dim != self.output_embed_dim else None)
    self.adaptive_softmax = None
    self.output_projection = None
    if args['model']['adaptive_softmax_cutoff'] is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.output_embed_dim,
            eval(args['model']['adaptive_softmax_cutoff']),
            dropout=args['model']['adaptive_softmax_dropout'],
            adaptive_inputs=embed_tokens if args['model']['tie_adaptive_weights'] else None,
            factor=args['model']['adaptive_softmax_factor'],
            tie_proj=args['model']['tie_adaptive_proj'],
        )
    elif self.share_input_output_embed:
        self.output_projection = Linear(
            self.embed_tokens.weight.shape[1],
            self.embed_tokens.weight.shape[0],
            bias=False,
        )
        self.output_projection.weight = self.embed_tokens.weight
    else:
        self.output_projection = Linear(self.output_embed_dim, len(dictionary), bias=False)