def __init__(self, out_channels, embed_dim, num_heads, project_input=False, gated=False, downsample=False): super().__init__() self.attention = DownsampledMultiHeadAttention( out_channels, embed_dim, num_heads, dropout=0, bias=True, project_input=project_input, gated=gated, downsample=downsample, ) self.in_proj_q = Linear(out_channels, embed_dim) self.in_proj_k = Linear(out_channels, embed_dim) self.in_proj_v = Linear(out_channels, embed_dim) self.ln = nn.LayerNorm(out_channels)
def __init__( self, dictionary, embed_dim=512, out_embed_dim=256, max_positions=1024, convolutions=((512, 3),) * 8, attention=True, dropout=0.1, selfattention=False, attention_nheads=1, selfattention_nheads=1, project_input=False, gated_attention=False, downsample=False, pretrained=False, trained_decoder=None, left_pad=False, ): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([2])) self.pretrained = pretrained self.pretrained_decoder = trained_decoder self.dropout = dropout self.left_pad = left_pad self.need_attn = True in_channels = convolutions[0][0] def expand_bool_array(val): if isinstance(val, bool): # expand True into [True, True, ...] and do the same with False return [val] * len(convolutions) return val attention = expand_bool_array(attention) selfattention = expand_bool_array(selfattention) if not isinstance(attention, list) or len(attention) != len(convolutions): raise ValueError('Attention is expected to be a list of booleans of ' 'length equal to the number of layers.') num_embeddings = len(dictionary) padding_idx = dictionary.pad() self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) self.embed_positions = PositionalEmbedding( max_positions, embed_dim, padding_idx, left_pad=self.left_pad, ) self.fc1 = Linear(embed_dim, in_channels, dropout=dropout) self.projections = nn.ModuleList() self.convolutions = nn.ModuleList() self.attention = nn.ModuleList() self.selfattention = nn.ModuleList() self.attproj = nn.ModuleList() for i, (out_channels, kernel_size) in enumerate(convolutions): self.projections.append( Linear(in_channels, out_channels) if in_channels != out_channels else None ) self.convolutions.append( LinearizedConv1d( in_channels, out_channels * 2, kernel_size, padding=(kernel_size - 1), dropout=dropout, ) ) self.attention.append( DownsampledMultiHeadAttention( out_channels, embed_dim, attention_nheads, project_input=project_input, gated=False, downsample=False, ) if attention[i] else None ) self.attproj.append( Linear(out_channels, embed_dim, dropout=dropout) if attention[i] else None ) self.selfattention.append( SelfAttention( out_channels, embed_dim, selfattention_nheads, project_input=project_input, gated=gated_attention, downsample=downsample, ) if selfattention[i] else None ) in_channels = out_channels self.fc2 = Linear(in_channels, out_embed_dim) self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout) # model fusion if self.pretrained: # independent gates are learned from the concatenated input self.gate1 = nn.Sequential(Linear(out_embed_dim*2, out_embed_dim), nn.Sigmoid()) self.gate2 = nn.Sequential(Linear(out_embed_dim*2, out_embed_dim), nn.Sigmoid()) # pretrained and trained models are joined self.joining = nn.Sequential( Linear(out_embed_dim*2, out_embed_dim*2), nn.LayerNorm(out_embed_dim*2), nn.GLU(), Linear(out_embed_dim, out_embed_dim*2), nn.LayerNorm(out_embed_dim*2), nn.GLU(), Linear(out_embed_dim, out_embed_dim), nn.LayerNorm(out_embed_dim) ) # pretrained model contains an output layer that is nhid -> vocab size # but the models are combined in their hidden state # the hook stores the output of the pretrained model forward self.pretrained_outputs = {} def save_output(): def hook(a, b, output): self.pretrained_outputs["out"] = output return hook self.pretrained_decoder.fc2.register_forward_hook(save_output())