def __init__(
    self,
    vocab_size,
    input_dim,
    cutoff,
    dropout,
    factor=4.0,
    adaptive_inputs=None,
    tie_proj=False,
    q_noise=0,
    qn_block_size=8,
):
    super().__init__()

    if vocab_size > cutoff[-1]:
        cutoff = cutoff + [vocab_size]
    else:
        assert vocab_size == cutoff[-1], "cannot specify cutoff larger than vocab size"

    output_dim = cutoff[0] + len(cutoff) - 1

    self.vocab_size = vocab_size
    self.cutoff = cutoff
    self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)
    self.input_dim = input_dim
    self.factor = factor
    self.q_noise = q_noise
    self.qn_block_size = qn_block_size

    self.lsm = nn.LogSoftmax(dim=1)

    if adaptive_inputs is not None:
        self.head = TiedHeadModule(
            adaptive_inputs.weights_for_band(0),
            input_dim,
            len(cutoff) - 1,
            self.q_noise,
            self.qn_block_size,
        )
    else:
        self.head = quant_noise(
            nn.Linear(input_dim, output_dim, bias=False),
            self.q_noise,
            self.qn_block_size,
        )

    self._make_tail(adaptive_inputs, tie_proj)

    def init_weights(m):
        if (
            hasattr(m, "weight")
            and not isinstance(m, TiedLinear)
            and not isinstance(m, TiedHeadModule)
        ):
            nn.init.xavier_uniform_(m.weight)

    self.apply(init_weights)

    self.register_buffer("version", torch.LongTensor([1]))
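# --- Editor's usage sketch (not part of the original source) ---
# Illustrates the cutoff bookkeeping above: a cutoff list that does not
# already end at vocab_size is extended, and the head then predicts the
# head-band words plus one "class" slot per tail band. Values are made
# up for illustration.
def _demo_adaptive_softmax_cutoff():
    vocab_size, cutoff = 10000, [2000, 6000]
    if vocab_size > cutoff[-1]:
        cutoff = cutoff + [vocab_size]          # -> [2000, 6000, 10000]
    output_dim = cutoff[0] + len(cutoff) - 1    # 2000 head words + 2 tail classes
    assert output_dim == 2002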
def _make_tail(self, adaptive_inputs=None, tie_proj=False):
    self.tail = nn.ModuleList()
    for i in range(len(self.cutoff) - 1):
        dim = int(self.input_dim // self.factor ** (i + 1))

        tied_emb, tied_proj = (
            adaptive_inputs.weights_for_band(i + 1)
            if adaptive_inputs is not None
            else (None, None)
        )

        if tied_proj is not None:
            if tie_proj:
                proj = quant_noise(
                    TiedLinear(tied_proj, transpose=True),
                    self.q_noise,
                    self.qn_block_size,
                )
            else:
                proj = quant_noise(
                    nn.Linear(tied_proj.size(0), tied_proj.size(1), bias=False),
                    self.q_noise,
                    self.qn_block_size,
                )
        else:
            proj = quant_noise(
                nn.Linear(self.input_dim, dim, bias=False),
                self.q_noise,
                self.qn_block_size,
            )

        if tied_emb is None:
            out_proj = nn.Linear(dim, self.cutoff[i + 1] - self.cutoff[i], bias=False)
        else:
            out_proj = TiedLinear(tied_emb, transpose=False)

        m = nn.Sequential(
            proj,
            nn.Dropout(self.dropout_module.p),
            quant_noise(out_proj, self.q_noise, self.qn_block_size),
        )
        self.tail.append(m)
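# --- Editor's worked example (not part of the original source) ---
# Each tail band projects to a geometrically smaller dimension,
# dim_i = input_dim // factor**(i + 1). With the illustrative values
# input_dim=512 and factor=4.0, two tail bands get dims 128 and 32.
def _demo_tail_dims(input_dim=512, factor=4.0, num_tail_bands=2):
    dims = [int(input_dim // factor ** (i + 1)) for i in range(num_tail_bands)]
    assert dims == [128, 32]
    return dims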
def __init__(
    self,
    embedding_dim: int = 768,
    ffn_embedding_dim: int = 3072,
    num_attention_heads: int = 8,
    dropout: float = 0.1,
    attention_dropout: float = 0.1,
    activation_dropout: float = 0.1,
    activation_fn: str = 'relu',
    export: bool = False,
    q_noise: float = 0.0,
    qn_block_size: int = 8,
) -> None:
    super().__init__()

    # Initialize parameters
    self.embedding_dim = embedding_dim
    self.dropout = dropout
    self.activation_dropout = activation_dropout

    # Initialize blocks
    self.activation_fn = utils.get_activation_fn(activation_fn)
    self.self_attn = MultiheadAttention(
        self.embedding_dim,
        num_attention_heads,
        dropout=attention_dropout,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=True,
        q_noise=q_noise,
        qn_block_size=qn_block_size,
    )

    # layer norm associated with the self-attention layer
    self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export)

    self.fc1 = quant_noise(
        nn.Linear(self.embedding_dim, ffn_embedding_dim), q_noise, qn_block_size
    )
    self.fc2 = quant_noise(
        nn.Linear(ffn_embedding_dim, self.embedding_dim), q_noise, qn_block_size
    )

    # layer norm associated with the position-wise feed-forward network
    self.final_layer_norm = LayerNorm(self.embedding_dim, export=export)
def _prune_fc_layer(self, remove_index: List[int]):
    # Prune hidden units of the FFN: drop the listed rows of fc1 and the
    # matching columns of fc2.
    new_fc1_weight = []
    new_fc1_bias = []
    for i in range(self.fc1.out_features):
        if i not in remove_index:
            new_fc1_weight.append(self.fc1.weight[i])
            new_fc1_bias.append(self.fc1.bias[i])

    new_fc1_weight = torch.stack(new_fc1_weight).detach()
    new_fc1_weight.requires_grad = True

    new_fc1_bias = torch.stack(new_fc1_bias).detach()
    new_fc1_bias.requires_grad = True

    self.fc1 = quant_noise(
        nn.Linear(self.fc1.in_features, self.fc1.out_features - len(remove_index)),
        p=self.quant_noise,
        block_size=self.quant_noise_block_size,
    )
    self.fc1.weight = torch.nn.Parameter(new_fc1_weight)
    self.fc1.bias = torch.nn.Parameter(new_fc1_bias)

    new_fc2_weight = []
    for i in range(self.fc2.in_features):
        if i not in remove_index:
            new_fc2_weight.append(self.fc2.weight[:, i])

    new_fc2_weight = torch.stack(new_fc2_weight, dim=-1).detach()
    new_fc2_weight.requires_grad = True

    # fc2's output features are unchanged, so its bias is kept as-is.
    new_fc2_bias = self.fc2.bias.detach()
    new_fc2_bias.requires_grad = True

    self.fc2 = quant_noise(
        nn.Linear(self.fc2.in_features - len(remove_index), self.fc2.out_features),
        p=self.quant_noise,
        block_size=self.quant_noise_block_size,
    )
    self.fc2.weight = torch.nn.Parameter(new_fc2_weight)
    self.fc2.bias = torch.nn.Parameter(new_fc2_bias)
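# --- Editor's usage sketch (not part of the original source) ---
# Shows the shape effect of the pruning above on a plain pair of linear
# layers: dropping hidden unit i removes row i of fc1 and column i of
# fc2. All sizes are illustrative.
def _demo_prune_shapes():
    import torch
    import torch.nn as nn

    fc1 = nn.Linear(16, 8)
    fc2 = nn.Linear(8, 16)
    keep = [i for i in range(8) if i not in {2, 5}]  # prune units 2 and 5

    w1 = fc1.weight[keep].detach()       # (6, 16)
    w2 = fc2.weight[:, keep].detach()    # (16, 6)
    assert w1.shape == (6, 16) and w2.shape == (16, 6)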
def __init__(self, weights, input_dim, num_classes, q_noise, qn_block_size):
    super().__init__()
    tied_emb, _ = weights
    self.num_words, emb_dim = tied_emb.size()

    self.word_proj = quant_noise(
        TiedLinear(tied_emb, transpose=False), q_noise, qn_block_size
    )
    if input_dim != emb_dim:
        self.word_proj = nn.Sequential(
            quant_noise(
                nn.Linear(input_dim, emb_dim, bias=False), q_noise, qn_block_size
            ),
            self.word_proj,
        )

    self.class_proj = quant_noise(
        nn.Linear(input_dim, num_classes, bias=False), q_noise, qn_block_size
    )
    self.out_dim = self.num_words + num_classes

    self.register_buffer('_float_tensor', torch.FloatTensor(1))
def __init__(
    self,
    vocab_size: int,
    padding_idx: int,
    initial_dim: int,
    factor: float,
    output_dim: int,
    cutoff: List[int],
    q_noise: float = 0,
    qn_block_size: int = 8,
):
    super().__init__()

    if vocab_size > cutoff[-1]:
        cutoff = cutoff + [vocab_size]
    else:
        assert vocab_size == cutoff[-1], "cannot specify cutoff larger than vocab size"

    self.cutoff = cutoff
    self.embedding_dim = output_dim
    self.padding_idx = padding_idx

    self.embeddings = nn.ModuleList()
    for i in range(len(self.cutoff)):
        prev = self.cutoff[i - 1] if i > 0 else 0
        size = self.cutoff[i] - prev
        dim = int(initial_dim // (factor ** i))
        seq = nn.Sequential(
            nn.Embedding(size, dim, self.padding_idx),
            quant_noise(
                nn.Linear(dim, output_dim, bias=False), q_noise, qn_block_size
            ),
        )
        self.embeddings.append(seq)
        # Only the first band contains the padding index; clear it so the
        # remaining bands' nn.Embedding modules are built without one.
        self.padding_idx = None
    self.padding_idx = padding_idx

    def init_weights(m):
        if isinstance(m, nn.Embedding):
            nn.init.normal_(m.weight, mean=0, std=m.weight.shape[1] ** -0.5)
            nn.init.constant_(m.weight[padding_idx], 0)
        elif hasattr(m, "weight"):
            nn.init.xavier_uniform_(m.weight)

    self.apply(init_weights)

    self.register_buffer("_float_tensor", torch.FloatTensor(1))
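# --- Editor's usage sketch (not part of the original source) ---
# Shows how the cutoff list above partitions token ids into bands; each
# band is embedded by its own (smaller) table, with ids offset by the
# band's start. Values are illustrative.
def _demo_band_masks():
    import torch

    cutoff = [2000, 6000, 10000]
    tokens = torch.tensor([3, 1999, 2000, 5999, 6000, 9999])
    prev = 0
    sizes = []
    for c in cutoff:
        mask = (tokens >= prev) & (tokens < c)
        band_ids = tokens[mask] - prev   # indices into this band's table
        sizes.append(int(mask.sum()))
        prev = c
    assert sizes == [2, 2, 2]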
def __init__(
    self,
    embed_dim,
    num_heads,
    dropout=0.0,
    bias=True,
    tie_kv=True,
    q_noise=0.0,
    qn_block_size=8,
    parallel=True,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.num_heads = num_heads
    self.parallel = parallel
    self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)

    self.head_dim = embed_dim // num_heads
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.pq_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.pc_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    if tie_kv:
        # a single "content" projection is shared by keys and values
        self.c_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        self.k_proj = self.v_proj = None
    else:
        self.k_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        self.v_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        self.c_proj = None

    self.out_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    self.reset_parameters()

    self.onnx_trace = False
    self.tpu = False
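# --- Editor's sketch (not part of the original source) ---
# With tie_kv=True above, one projection output serves as both key and
# value, halving the k/v parameter count. A minimal illustration with
# plain tensors; names are hypothetical.
def _demo_tied_kv():
    import torch
    import torch.nn as nn

    embed_dim = 8
    c_proj = nn.Linear(embed_dim, embed_dim)
    x = torch.randn(5, embed_dim)   # (seq_len, embed_dim)
    c = c_proj(x)
    k = v = c                       # keys and values share one projection
    assert k.data_ptr() == v.data_ptr()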
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    q_noise=0.0,
    qn_block_size=8,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
    self.num_heads = num_heads
    self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)

    self.head_dim = embed_dim // num_heads
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention

    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and value to be of the same size"
    )

    self.k_proj = quant_noise(
        nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    # was nn.Linear(2 * self.vdim, ...), which would reject value inputs
    # of width vdim; the projection takes vdim features.
    self.v_proj = quant_noise(
        nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.out_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    if add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
    else:
        self.bias_k = self.bias_v = None

    self.add_zero_attn = add_zero_attn

    self.reset_parameters()

    self.onnx_trace = False
    self.tpu = False
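# --- Editor's sketch (not part of the original source) ---
# Illustrates the head bookkeeping above: projections map to embed_dim,
# which is viewed as (num_heads, head_dim), and queries are scaled by
# head_dim**-0.5 before the dot product. Sizes are illustrative.
def _demo_head_reshape():
    import torch

    embed_dim, num_heads = 16, 4
    head_dim = embed_dim // num_heads
    q = torch.randn(2, 7, embed_dim)                        # (batch, seq, embed)
    q = q.view(2, 7, num_heads, head_dim).transpose(1, 2)   # (batch, heads, seq, head_dim)
    q = q * head_dim ** -0.5
    assert q.shape == (2, 4, 7, 4)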
def build_domain_projection(self, input_dim, output_dim, q_noise, qn_block_size):
    return quant_noise(
        nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size
    )
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    q_noise=0.0,
    qn_block_size=8,
    compressed=1,
    max_seq_len=256,
    shared_kv_compressed=0,
    shared_compress_layer=None,
    freeze_compress=0,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
    self.num_heads = num_heads
    self.dropout = dropout

    self.head_dim = embed_dim // num_heads
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention

    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and value to be of the same size"
    )

    self.k_proj = quant_noise(
        nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.v_proj = quant_noise(
        nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    # used to compress the key/value sequence to a shorter subsequence
    if shared_compress_layer is None:
        self.compress_seq_len = max_seq_len // compressed
        self.compress_k = nn.Linear(max_seq_len, self.compress_seq_len, bias=False)
        if shared_kv_compressed == 0:
            self.compress_v = nn.Linear(
                max_seq_len, self.compress_seq_len, bias=False
            )
        self.layerwise_sharing = False
    else:
        self.compress_k = shared_compress_layer
        if shared_kv_compressed == 0:
            self.compress_v = shared_compress_layer
        self.layerwise_sharing = True
    self.shared_kv_compressed = shared_kv_compressed

    self.out_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    if add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
    else:
        self.bias_k = self.bias_v = None

    self.add_zero_attn = add_zero_attn

    self.reset_parameters()

    if freeze_compress == 1:
        self.compress_k.weight.requires_grad = False
        if shared_kv_compressed == 0:
            self.compress_v.weight.requires_grad = False

    self.onnx_trace = False
    self.tpu = False
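# --- Editor's sketch (not part of the original source) ---
# Illustrates the Linformer-style length compression above: a learned
# Linear over the time axis maps max_seq_len key positions down to
# max_seq_len // compressed positions. Sizes are illustrative.
def _demo_compress_k():
    import torch
    import torch.nn as nn

    max_seq_len, compressed, embed_dim = 256, 4, 16
    compress_k = nn.Linear(max_seq_len, max_seq_len // compressed, bias=False)
    k = torch.randn(max_seq_len, 2, embed_dim)   # (time, batch, channels)
    k_c = compress_k(k.permute(1, 2, 0))         # compress over the time axis
    assert k_c.shape == (2, 16, 64)              # 256 -> 64 positions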
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    q_noise=0.0,
    qn_block_size=8,
    nblocks=1,
    top_k_ratio=None,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.nblocks = nblocks
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
    self.num_heads = num_heads
    self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)

    print('heads', num_heads)
    print('num blocks', nblocks)
    if top_k_ratio is None:
        self.sa = None
        print('no topk')
    else:
        top_k = int(top_k_ratio * nblocks * num_heads)
        self.sa = SparseAttention(top_k=top_k)
        print('using topk', top_k)

    # head_dim is fixed at 128 instead of embed_dim // num_heads, so the
    # usual divisibility assert is intentionally disabled.
    self.head_dim = 128
    # assert (
    #     self.head_dim * num_heads == self.embed_dim
    # ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention

    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and value to be of the same size"
    )

    self.k_proj = quant_noise(
        nn.Linear(self.kdim, self.head_dim * num_heads, bias=bias),
        q_noise,
        qn_block_size,
    )
    self.v_proj = quant_noise(
        nn.Linear(self.vdim, self.head_dim * num_heads, bias=bias),
        q_noise,
        qn_block_size,
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, self.head_dim * num_heads, bias=bias),
        q_noise,
        qn_block_size,
    )
    self.out_proj = quant_noise(
        nn.Linear(self.head_dim * num_heads, embed_dim, bias=bias),
        q_noise,
        qn_block_size,
    )

    if add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
    else:
        self.bias_k = self.bias_v = None

    self.add_zero_attn = add_zero_attn

    self.reset_parameters()

    self.onnx_trace = False
    self.tpu = False
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    q_noise=0.0,
    qn_block_size=8,
    relative_pos_type=None,
    max_relative_pos=None,
    heads_share_embeddings=False,
    add_pos_embeddings_to_values=False,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
    self.num_heads = num_heads
    self.dropout = dropout

    self.head_dim = embed_dim // num_heads
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention

    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and value to be of the same size"
    )

    self.k_proj = quant_noise(
        nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.v_proj = quant_noise(
        nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.out_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    if add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
    else:
        self.bias_k = self.bias_v = None

    self.add_zero_attn = add_zero_attn

    self.reset_parameters()

    self.onnx_trace = False
    self.tpu = False

    self.positional_embedding_layer = None
    self.unmasked_attention = None
    self.add_pos_embeddings_to_values = add_pos_embeddings_to_values
    if relative_pos_type is not None:
        if add_pos_embeddings_to_values:
            self.enable_torch_version = False
        self.unmasked_attention = relative_pos_type == "unmasked"
        self.heads_share_embeddings = heads_share_embeddings
        self.positional_embedding_layer = RelativePositionalEmbedding(
            max_relative_pos=max_relative_pos,
            num_heads=num_heads,
            embedding_dim=self.head_dim,
            unmasked=self.unmasked_attention,
            heads_share_embeddings=heads_share_embeddings,
            add_to_values=add_pos_embeddings_to_values,
        )
def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
    return quant_noise(
        GroupLinearLayer(input_dim // self.nb, output_dim // self.nb, self.nb),
        q_noise,
        qn_block_size,
    )
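# --- Editor's sketch (not part of the original source) ---
# A grouped linear layer splits the feature dimension into nb groups and
# applies an independent weight matrix per group; a minimal version
# using torch.bmm, assuming GroupLinearLayer behaves along these lines.
def _demo_group_linear():
    import torch

    nb, din, dout, batch = 4, 32, 32, 3
    w = torch.randn(nb, din // nb, dout // nb)
    x = torch.randn(batch, din)
    xg = x.view(batch, nb, din // nb).transpose(0, 1)   # (nb, batch, din/nb)
    yg = torch.bmm(xg, w)                               # (nb, batch, dout/nb)
    y = yg.transpose(0, 1).reshape(batch, dout)
    assert y.shape == (3, 32)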
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    relaxed_attention_weight=0.0,
    q_noise=0.0,
    qn_block_size=8,
    # TODO: pass in config rather than string.
    # config defined in xformers.components.attention.AttentionConfig
    xformers_att_config: Optional[str] = None,
    xformers_blocksparse_layout: Optional[torch.Tensor] = None,  # This should be part of the config
    xformers_blocksparse_blocksize: Optional[int] = 16,  # This should be part of the config
    positional_embedding=None,
):
    super().__init__()

    xformers_att_config = utils.eval_str_dict(xformers_att_config)
    self.use_xformers = xformers_att_config is not None
    if self.use_xformers and not _xformers_available:
        raise ImportError("\n\n Please install xFormers.")

    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
    self.num_heads = num_heads
    self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)

    self.head_dim = embed_dim // num_heads
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention
    self.relaxed_attention_weight = relaxed_attention_weight

    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and value to be of the same size"
    )

    self.k_proj = quant_noise(
        nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.v_proj = quant_noise(
        nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.out_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    if add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
    else:
        self.bias_k = self.bias_v = None

    self.add_zero_attn = add_zero_attn
    self.beam_size = 1

    self.positional_embedding = positional_embedding
    if (self.positional_embedding is not None
            and not self.positional_embedding.learnable):
        self.pos_bias_u = nn.Parameter(torch.Tensor(embed_dim))
        self.pos_bias_v = nn.Parameter(torch.Tensor(embed_dim))
        self.pos_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=False), q_noise, qn_block_size
        )
    else:
        self.pos_bias_u = self.pos_bias_v = self.pos_proj = None

    self.reset_parameters()

    if self.use_xformers:
        xformers_att_config["dropout"] = xformers_att_config.get("dropout", dropout)
        xformers_att_config["num_heads"] = xformers_att_config.get(
            "num_heads", num_heads
        )

        if xformers_blocksparse_layout is not None:
            # Could be part of a single config passed only once
            xformers_att_config["block_size"] = xformers_blocksparse_blocksize
            xformers_att_config["layout"] = xformers_blocksparse_layout
            xformers_att_config["name"] = "blocksparse"

        self.attention = build_attention(xformers_att_config)

    self.onnx_trace = False
    self.skip_embed_dim_check = False
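# --- Editor's sketch (not part of the original source) ---
# Shows the shape of the xformers_att_config string handled above: it is
# evaluated into a dict, dropout/num_heads defaults are filled in, and
# for blocksparse attention, layout/block_size are attached. The config
# values are illustrative; ast.literal_eval stands in for
# utils.eval_str_dict here.
def _demo_xformers_config():
    import ast

    xformers_att_config = '{"name": "scaled_dot_product"}'
    cfg = ast.literal_eval(xformers_att_config)
    cfg["dropout"] = cfg.get("dropout", 0.1)
    cfg["num_heads"] = cfg.get("num_heads", 8)
    assert cfg["name"] == "scaled_dot_product"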
def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
    return quant_noise(
        nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size
    )
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    q_noise=0.0,
    qn_block_size=8,
    sigsoftmax=False,
    mix_softmax=False,
    mix_type=-1,
    temperature=1.0,
    pre_drop_mix=False,
    pre_mix=False,
    fix_head_dim=-1,
    use_div_reg=False,
    synth_attn_type='vanilla',
    synth_hidden_dim=-1,
    synth_factor_dim=-1,
    synth_trainable_random=True,
    synth_max_len_seq=-1,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
    self.num_heads = num_heads
    self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)

    # fix head dim
    self.fix_head_dim = fix_head_dim
    self.embed_dim2 = embed_dim
    if self.fix_head_dim != -1:
        self.head_dim = fix_head_dim
        print('fix head dim:', self.head_dim)
        self.embed_dim2 = self.head_dim * self.num_heads
    else:
        self.head_dim = embed_dim // num_heads
        assert (
            self.head_dim * num_heads == self.embed_dim
        ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention

    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and value to be of the same size"
    )

    self.k_proj = quant_noise(
        nn.Linear(self.kdim, self.embed_dim2, bias=bias), q_noise, qn_block_size
    )
    self.v_proj = quant_noise(
        nn.Linear(self.vdim, self.embed_dim2, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(self.embed_dim, self.embed_dim2, bias=bias), q_noise, qn_block_size
    )
    self.out_proj = quant_noise(
        nn.Linear(self.embed_dim2, self.embed_dim, bias=bias), q_noise, qn_block_size
    )

    if add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
    else:
        self.bias_k = self.bias_v = None

    self.add_zero_attn = add_zero_attn

    self.reset_parameters()

    self.onnx_trace = False
    self.tpu = False

    # custom code
    # diversity regularization
    self.use_div_reg = use_div_reg
    if self.use_div_reg:
        self.div_reg = 0
        print('using div regularization')
    else:
        self.div_reg = None

    # mix-softmax parameters
    self.sigsoftmax = sigsoftmax
    self.mix_softmax = mix_softmax
    self.mix_type = mix_type
    self.temperature = temperature
    self.pre_drop_mix = pre_drop_mix
    self.pre_mix = pre_mix
    for attr in ['sigsoftmax', 'pre_drop_mix', 'pre_mix']:
        if getattr(self, attr) is True:
            print('using {}.'.format(attr))
    if self.mix_softmax:
        print('using mix type: {}, pre drop mix: {}, pre mix: {}.'.format(
            self.mix_type, self.pre_drop_mix, self.pre_mix))

    self.bias = None
    self.stochastic_w = None
    if self.mix_type == 9:
        # fixed randn projection (allows negative values)
        self.stochastic_w = torch.randn(self.num_heads, self.num_heads)
        self.stochastic_w = nn.Parameter(
            self.stochastic_w
            / (self.stochastic_w.sum(dim=0, keepdim=True)
               + self.stochastic_w.sum(dim=0, keepdim=True).sign() * 1e-12),
            requires_grad=False,
        )
    elif self.mix_type == 10:
        # fixed rand projection
        self.stochastic_w = torch.rand(self.num_heads, self.num_heads)
        self.stochastic_w = nn.Parameter(
            self.stochastic_w / self.stochastic_w.sum(dim=0, keepdim=True),
            requires_grad=False,
        )
    elif self.mix_type in [11, 12, 13]:
        # data-dependent parameter
        self.stochastic_w = nn.Parameter(
            torch.randn(self.head_dim, self.num_heads) / self.head_dim)
    elif self.mix_type in [14, 15, 16]:
        self.bias = nn.Parameter(torch.eye(self.num_heads, self.num_heads))
        self.stochastic_w = nn.Parameter(
            torch.randn(self.head_dim, self.num_heads) / self.head_dim)
    elif self.mix_type in [17, 18, 19]:
        self.bias = nn.Parameter(torch.eye(self.num_heads, self.num_heads))
        self.stochastic_w = nn.Parameter(
            torch.zeros(self.head_dim, self.num_heads))
    else:
        self.stochastic_w = nn.Parameter(
            torch.eye(self.num_heads, self.num_heads))
    self.r_temperature = 1.0 / self.temperature

    # Synthesizer initialization starts here
    self.synth_attn_type = synth_attn_type
    self.synth_hidden_dim = synth_hidden_dim  # for the dense variant
    self.synth_factor_dim = synth_factor_dim  # 0 for not using; any other value enables factorization
    self.synth_trainable_random = synth_trainable_random  # True for the trainable random variant
    self.synth_max_len_seq = synth_max_len_seq
    if self.synth_attn_type == 'vanilla':
        pass
    else:
        print('Synthesizer attn_type:', self.synth_attn_type,
              'hidden dim:', self.synth_hidden_dim,
              'factor dim:', self.synth_factor_dim,
              'trainable_random:', self.synth_trainable_random,
              'max_len_seq:', self.synth_max_len_seq)
        if self.synth_attn_type == 'dense':
            self.synth_attn = DenseAttention(
                self.synth_max_len_seq, self.head_dim, self.synth_hidden_dim)
        elif self.synth_attn_type == 'random':
            self.synth_attn = RandomAttention(
                num_heads, self.synth_max_len_seq, self.synth_trainable_random)
        elif self.synth_attn_type == 'dense_factorized':
            self.synth_attn = FactorizedDenseAttention(
                self.synth_max_len_seq, self.head_dim, self.synth_factor_dim)
        elif self.synth_attn_type == 'random_factorized':
            self.synth_attn = FactorizedRandomAttention(
                num_heads, self.synth_factor_dim, self.synth_max_len_seq,
                self.synth_trainable_random)
        else:
            # was: print('unknown attn_type: ', self.attn_type); exit(1) --
            # self.attn_type does not exist on this module, so raise instead.
            raise ValueError(
                'unknown attn_type: {}'.format(self.synth_attn_type))
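# --- Editor's sketch (not part of the original source) ---
# The stochastic_w matrix above mixes attention distributions across
# heads. A minimal post-softmax mixing step, assuming attention weights
# of shape (batch, heads, tgt_len, src_len); names are hypothetical.
def _demo_head_mixing():
    import torch

    batch, heads, tgt, src = 2, 4, 5, 5
    attn = torch.softmax(torch.randn(batch, heads, tgt, src), dim=-1)
    w = torch.rand(heads, heads)
    w = w / w.sum(dim=0, keepdim=True)           # columns sum to 1
    mixed = torch.einsum('bhts,hg->bgts', attn, w)
    # column-stochastic mixing keeps each distribution normalized
    assert torch.allclose(mixed.sum(-1), torch.ones(batch, heads, tgt))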
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    q_noise=0.0,
    qn_block_size=8,
    biased_attn_weight=True,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
    self.num_heads = num_heads
    self.dropout = dropout

    self.head_dim = embed_dim // num_heads
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention

    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and value to be of the same size"
    )

    self.k_proj = quant_noise(
        nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.v_proj = quant_noise(
        nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.out_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    if add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
    else:
        self.bias_k = self.bias_v = None

    # one learned scalar bias per head; frozen unless biased_attn_weight
    self.kk_bias_r = nn.Parameter(
        torch.zeros(num_heads), requires_grad=biased_attn_weight
    )

    self.add_zero_attn = add_zero_attn

    self.reset_parameters()

    self.onnx_trace = False
    # use the fused PyTorch path when available
    self.enable_torch_version = hasattr(F, "multi_head_attention_forward")
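# --- Editor's sketch (not part of the original source) ---
# kk_bias_r above is one learned scalar per head; a minimal way such a
# bias could be folded into attention logits of shape
# (batch, heads, tgt, src). Shapes are illustrative.
def _demo_per_head_bias():
    import torch

    batch, heads, tgt, src = 2, 4, 5, 5
    logits = torch.randn(batch, heads, tgt, src)
    kk_bias_r = torch.zeros(heads)
    logits = logits + kk_bias_r.view(1, heads, 1, 1)   # broadcast over positions
    assert logits.shape == (2, 4, 5, 5)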
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    q_noise=0.0,
    qn_block_size=8,
    normalized_attention=False,
    normalized_attention_logsoftmax=False,
    normalized_attention_by_entropy=False,
    positional_embeddings_in_attention=False,
    symmetric_kv_context_params=False,
    symmetric_kv_positional_params=False,
    # normalized_attention_by_positional_score=False,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
    self.num_heads = num_heads
    self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)
    self.positional_embeddings_in_attention = positional_embeddings_in_attention
    self.symmetric_kv_context_params = symmetric_kv_context_params
    self.symmetric_kv_positional_params = symmetric_kv_positional_params
    # self.normalized_attention_by_positional_score = normalized_attention_by_positional_score

    self.head_dim = embed_dim // num_heads
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention

    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and value to be of the same size"
    )

    self.k_proj = quant_noise(
        nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.v_proj = quant_noise(
        nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    if self.symmetric_kv_context_params:
        assert self.kdim == embed_dim, (
            "Symmetric context attention requires kdim == embed_dim"
        )
        self.q_proj.weight = self.k_proj.weight
    self.out_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    if add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
    else:
        self.bias_k = self.bias_v = None

    self.add_zero_attn = add_zero_attn

    self.normalized_attention = normalized_attention
    self.normalized_attention_logsoftmax = normalized_attention_logsoftmax
    self.normalized_attention_by_entropy = normalized_attention_by_entropy
    if self.normalized_attention:
        self.attention_gain = quant_noise(
            nn.Linear(embed_dim, num_heads, bias=True), q_noise, qn_block_size
        )

    if self.positional_embeddings_in_attention:
        self.pos_k_proj = quant_noise(
            nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        self.pos_q_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        if self.symmetric_kv_positional_params:
            assert self.kdim == embed_dim, (
                "Symmetric positional attention requires kdim == embed_dim"
            )
            self.pos_q_proj.weight = self.pos_k_proj.weight
        self.pos_embeddings = SinusoidalPositionalEmbedding(embed_dim, None)

    self.reset_parameters()

    self.onnx_trace = False
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    q_noise=0,
    qn_block_size=8,
):
    super().__init__(
        embed_dim,
        num_heads,
        kdim=kdim,
        vdim=vdim,
        dropout=dropout,
        bias=bias,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
        self_attention=self_attention,
        encoder_decoder_attention=encoder_decoder_attention,
        q_noise=q_noise,
        qn_block_size=qn_block_size,
    )
    # Rebuild k_proj with a bias and initialize that bias to zero. Note
    # that nn.Parameter defaults to requires_grad=True, so the
    # requires_grad=False passed to zeros_like does not freeze it.
    self.k_proj = quant_noise(
        nn.Linear(self.kdim, embed_dim, bias=True), q_noise, qn_block_size
    )
    self.k_proj.bias = nn.Parameter(
        torch.zeros_like(self.k_proj.bias, requires_grad=False)
    )
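# --- Editor's note as code (not part of the original source) ---
# Caveat on the bias above: requires_grad=False applies to the tensor
# returned by torch.zeros_like, but nn.Parameter defaults to
# requires_grad=True, so the rebuilt bias remains trainable.
def _demo_parameter_grad():
    import torch
    import torch.nn as nn

    t = torch.zeros(4, requires_grad=False)
    p = nn.Parameter(t)
    assert p.requires_grad  # Parameter re-enables grad despite the flag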
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    q_noise=0.0,
    qn_block_size=8,
    capsule_proj_weight=None,
    capsule_proj_bias=None,
    dynamic_routing_weights=None,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
    self.num_heads = num_heads
    self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)

    self.head_dim = embed_dim // num_heads
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention

    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and value to be of the same size"
    )

    self.k_proj = quant_noise(
        nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.v_proj = quant_noise(
        nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.out_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    # Added capsule weights
    # (an earlier inline initialization is kept for reference:
    #  [nn.Parameter(torch.ones(self.head_dim, self.num_heads, self.num_heads,
    #                           device='cuda', dtype=torch.half,
    #                           requires_grad=True))
    #   for _ in range(0, self.num_heads)])
    self.dynamic_routing_weights = dynamic_routing_weights

    if add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
    else:
        self.bias_k = self.bias_v = None

    self.add_zero_attn = add_zero_attn

    self.reset_parameters()

    self.onnx_trace = False